diff --git a/lib/cuda/Makefile b/lib/cuda/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..844906ba89b31442a912318c49110fbc3dbfc130
--- /dev/null
+++ b/lib/cuda/Makefile
@@ -0,0 +1,4 @@
+#Makefile for liblammpscuda.a
+#No need to modify anything here! The CUDA path is inserted into Makefile.common
+
+include Makefile.cudalib
\ No newline at end of file
diff --git a/lib/cuda/Makefile.common b/lib/cuda/Makefile.common
new file mode 100644
index 0000000000000000000000000000000000000000..7c918a23bc8770912580fc72d39520339c91a477
--- /dev/null
+++ b/lib/cuda/Makefile.common
@@ -0,0 +1,126 @@
+#Common command-line argument interpreter for compilation with lammpscuda (USER-CUDA) installed
+
+# make options:
+# emu=1       switch to CUDA emulation mode (otherwise: use the GPU)
+# dbg=1       print a lot of debugging output at runtime
+# verbose=1   print the nvcc command line during compilation
+# keep=1      do not delete temporary compilation files (.ii, .cubin, ...)
+# cufft=1     use CUDA's fast Fourier transform library "cufft" where possible (otherwise: use the CPU FFTW library)
+# binning=1   create a virtual particle grid instead of neighbor lists; currently not supported
+# precision=1 single precision (global setting)
+# precision=2 double precision (global setting)
+# precision=3 mixed precision: double-precision positions, single precision elsewhere
+# precision=4 mixed precision: double-precision positions and velocities
+
+SHELL = /bin/sh
+
+# System-specific settings
+
+#CUDA_INSTALL_PATH = /usr/local/cuda
+CUDA_INSTALL_PATH = /home/crtrott/lib/cuda
+# e.g. in Gentoo
+# CUDA_INSTALL_PATH = /opt/cuda
+
+
+#//////////////////////////////////////////////////////////////////////////////////////////////
+# no need to change anything below this line
+#//////////////////////////////////////////////////////////////////////////////////////////////
+
+#fall back to the CPU FFT if cufft=0 is requested
+FALLBACK_FFT = 1
+
+#default settings for compiler switches
+ifdef COMPILELIB
+include Makefile.defaults
+else
+include ../../lib/cuda/Makefile.defaults
+endif
+
+#shell echo "Compiling with precision = " ${precision} ", arch = " ${arch} ", cufft = " ${cufft} ", dbg = " ${dbg} ", prec_timer = " ${prec_timer}
+
+CUDA_FLAGS := -I${CUDA_INSTALL_PATH}/include -DUNIX
+CUDA_USRLIB_CONDITIONAL := -L${CUDA_INSTALL_PATH}/lib -L${CUDA_INSTALL_PATH}/lib64
+
+# debug setting
+ifeq ($(strip $(dbg)), 1)
+  CUDA_FLAGS += -D_DEBUG -g
+  NVCC_FLAGS += -g -G
+else
+  NVCC_FLAGS += --compiler-options -fno-strict-aliasing -O3
+endif
+
+# skip precision timing on Mac and Windows manually
+ifeq ($(strip $(prec_timer)), 0)
+  CUDA_FLAGS += -DNO_PREC_TIMING
+endif
+
+# set fft routine
+ifeq ($(strip $(cufft)), 0)
+  ifneq ($(FALLBACK_FFT), 1)
+    FFT_INC = -DFFT_NONE
+    FFT_PATH =
+    FFT_LIB =
+    CUDA_FLAGS += -DFFT_NONE
+  endif
+else
+  CUDA_FLAGS += -DFFT_CUFFT
+  CUDA_USRLIB_CONDITIONAL += -lcufft
+endif
+
+# make global precision setting
+
+ifeq ($(strip $(precision)), 1)
+  CUDA_FLAGS += -DCUDA_PRECISION=1
+else
+  ifeq ($(strip $(precision)), 3)
+    CUDA_FLAGS += -DCUDA_PRECISION=1 -DX_PRECISION=2
+  else
+    ifeq ($(strip $(precision)), 4)
+      CUDA_FLAGS += -DCUDA_PRECISION=1 -DX_PRECISION=2 -DV_PRECISION=2
+    else
+      CUDA_FLAGS += -DCUDA_PRECISION=2
+    endif
+  endif
+endif
+
+# make architecture settings
+ifeq ($(strip $(arch)), 13)
+  CUDA_FLAGS += -DCUDA_ARCH=13
+  SMVERSIONFLAGS := -arch sm_13
+else
+  ifeq ($(strip $(arch)), 20)
+    CUDA_FLAGS += -DCUDA_ARCH=20
+    #NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
+    NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
+    SMVERSIONFLAGS := -arch sm_20
+  else
+    ifeq ($(strip $(arch)), 21)
+      CUDA_FLAGS += -DCUDA_ARCH=20
+      #NVCC_FLAGS += -ftz=false -prec-div=true
-prec-sqrt=true + NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false + SMVERSIONFLAGS := -arch sm_21 + else + ifeq ($(strip $(arch)), 30) + CUDA_FLAGS += -DCUDA_ARCH=20 + #NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true + NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false + SMVERSIONFLAGS := -arch sm_30 + else + ifeq ($(strip $(arch)), 35) + CUDA_FLAGS += -DCUDA_ARCH=20 + #NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true + NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false + SMVERSIONFLAGS := -arch sm_35 + else + CUDA_FLAGS += -DCUDA_ARCH=99 + SMVERSIONFLAGS := -arch sm_13 + endif + endif + endif + endif +endif + + + +CCFLAGS := $(CCFLAGS) $(CUDA_FLAGS) \ + -I$(CUDA_INSTALL_PATH)/include diff --git a/lib/cuda/Makefile.cudalib b/lib/cuda/Makefile.cudalib new file mode 100644 index 0000000000000000000000000000000000000000..f21e95e6868d1367219898dc52cd27bdae049d6c --- /dev/null +++ b/lib/cuda/Makefile.cudalib @@ -0,0 +1,87 @@ +#Makefile for liblammpscuda.a +#No need to modify anything here! The CUDA path is inserted into Makefile.common + +.DEFAULT: lib + +COMPILELIB := 1 + +SHELL = /bin/sh + +CUDA_SRC_DIR = ../cuda +CUDA_TEMP = $(CUDA_SRC_DIR)/.lastmake +CUDA_TEMP_DUMMY := $(shell touch $(CUDA_TEMP) ) +include $(CUDA_TEMP) +CUDA_CU = $(wildcard $(CUDA_SRC_DIR)/*_kernel.cu) +CUDA_CUO = $(CUDA_CU:_kernel.cu=_cu.o) +CUDA_OBJ = $(subst $(CUDA_SRC_DIR)/,,$(CUDA_CUO)) +CUDA_DEP = $(CUDA_OBJ:.o=.d) + +NVCC_FLAGS := + +VPATH = $(CUDA_SRC_DIR) + +#rewriting default settings if new ones are specified + + +ifdef precision +tmp := $(shell sed -i 's|precision ?= [0-9]|precision ?= '${precision}'|g' Makefile.defaults) +endif + +ifdef arch +tmp := $(shell sed -i 's|arch ?= [0-9][0-9]|arch ?= '${arch}'|g' Makefile.defaults) +endif + +ifdef cufft +tmp := $(shell sed -i 's|cufft ?= [0-9]|cufft ?= '${cufft}'|g' Makefile.defaults) +endif + +ifdef dbg +tmp := $(shell sed -i 's|dbg ?= [0-9]|dbg ?= '${dbg}'|g' Makefile.defaults) +endif + +ifdef prec_timer +tmp := $(shell sed -i 's|prec_timer ?= [0-9]|prec_timer ?= '${prec_timer}'|g' Makefile.defaults) +endif + +include Makefile.common + +tmp := $(shell sed -i '2 d' Makefile.lammps) +tmp := $(shell sed -i '2 d' Makefile.lammps) +tmp := $(shell sed -i '1a CUDA_FLAGS := ${CUDA_FLAGS}' Makefile.lammps) +tmp := $(shell sed -i '2a CUDA_USRLIB_CONDITIONAL := ${CUDA_USRLIB_CONDITIONAL}' Makefile.lammps) + +# verbose nvcc output during compilation +ifeq ($(verbose), 1) + VERBOSE := + NVCC_FLAGS += --ptxas-options=-v +else + VERBOSE := @ +endif + +# keep temporary compilation files of nvcc +ifeq ($(keep), 1) + NVCC_FLAGS += -keep -Xptxas="--verbose" +endif + + +NVCC := $(CUDA_INSTALL_PATH)/bin/nvcc +CUDA_INCLUDES = -I./ -I$(CUDA_INSTALL_PATH)/include -I../../src/USER-CUDA +CUDA_USRLIB = + +# Link target + +lib: $(CUDA_OBJ) + $(NVCC) -lib $(CUDA_OBJ) $(CUDA_FLAGS) $(CUDA_USRLIB_CONDITIONAL) -o liblammpscuda.a + +clean: + rm $(CUDA_SRC_DIR)/*.o + rm liblammpscuda.a + +# Library target + + +# Cuda compilation rules + +%_cu.o: %.cu %_kernel.cu %_cu.h cuda_shared.h + $(VERBOSE)$(NVCC) $(NVCC_FLAGS) $(CUDA_FLAGS) $(CUDA_INCLUDES) $(CUDA_USRLIB) $(SMVERSIONFLAGS) -o $@ -c $< + diff --git a/lib/cuda/Makefile.defaults b/lib/cuda/Makefile.defaults new file mode 100644 index 0000000000000000000000000000000000000000..590435446c23babb55649c13e7ec53e13e06446b --- /dev/null +++ b/lib/cuda/Makefile.defaults @@ -0,0 +1,19 @@ + +#precision setting: 1 single, 2 double, 4 mixed +precision ?= 2 + +#verbose setting: 0 no, 1 yes +verbose ?= 1 + +#GPU architecture 
(compute capability): 13, 20, 21, 30, 35
+arch ?= 20
+
+#Use the cufft library (should not be changed)
+cufft ?= 1
+
+#Use debug mode
+dbg ?= 0
+
+#On Mac machines set this to 0 to avoid use of the Linux-specific precision timer
+prec_timer ?= 1
+
diff --git a/lib/cuda/Makefile.lammps b/lib/cuda/Makefile.lammps
new file mode 100644
index 0000000000000000000000000000000000000000..75dd5a26bc49622587c5311fe96dbc2e3bd77c09
--- /dev/null
+++ b/lib/cuda/Makefile.lammps
@@ -0,0 +1,8 @@
+# Settings that the LAMMPS build will import when this package library is used
+CUDA_FLAGS := -I/home/crtrott/lib/cuda/include -DUNIX -DFFT_CUFFT -DCUDA_PRECISION=2 -DCUDA_ARCH=20
+CUDA_USRLIB_CONDITIONAL := -L/home/crtrott/lib/cuda/lib -L/home/crtrott/lib/cuda/lib64 -lcufft
+
+ user-cuda_SYSINC = ${CUDA_FLAGS}
+ user-cuda_SYSLIB = -lcuda -lcudart -lrt
+ user-cuda_SYSPATH = $(CUDA_USRLIB_CONDITIONAL)
+
diff --git a/lib/cuda/README b/lib/cuda/README
new file mode 100644
index 0000000000000000000000000000000000000000..ce0dedcffe91da8956c2f48dd8b2d0f211aa073d
--- /dev/null
+++ b/lib/cuda/README
@@ -0,0 +1,26 @@
+This directory has source files to build a library that LAMMPS
+links against when using the USER-CUDA package.
+
+When you are done building this library, two files should
+exist in this directory:
+
+liblammpscuda.a    the library LAMMPS will link against
+Makefile.lammps    settings the LAMMPS Makefile will import
+
+The latter file will have settings like this (can be omitted if blank):
+
+user-cuda_SYSINC = -I$(CUDA_INSTALL_PATH)/include
+user-cuda_SYSLIB = -lcuda -lcudart -lrt
+user-cuda_SYSPATH = -L$(CUDA_INSTALL_PATH)/lib64 -L$(CUDA_INSTALL_PATH)/lib $(CUDA_USRLIB_CONDITIONAL)
+
+SYSINC is for settings needed to compile LAMMPS source files
+SYSLIB is for additional system libraries needed by this package
+SYSPATH is the path(s) to where those libraries are
+
+You must ensure these settings are correct for your system, else
+the LAMMPS build will likely fail.
+
+-------------------------------------------------------------------------
+
+Christian - there needs to be additional info here about how
+to build the lammpscuda lib.
diff --git a/lib/cuda/atom_vec_angle_cuda.cu b/lib/cuda/atom_vec_angle_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..29bf2d65f00884deb6c253237f3ae313c610b6aa
--- /dev/null
+++ b/lib/cuda/atom_vec_angle_cuda.cu
@@ -0,0 +1,85 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */ + +const unsigned int ANGLE_DATA_MASK = X_MASK | V_MASK | F_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | MOLECULE_MASK; + +#include "atom_vec_angle_cuda_cu.h" + +void Cuda_AtomVecAngleCuda_Init(cuda_shared_data* sdata) +{ + return Cuda_AtomVecCuda_Init<ANGLE_DATA_MASK>(sdata); +} + +int Cuda_AtomVecAngleCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | MOLECULE_MASK; + return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata, n, dim, buf_send); +} + +int Cuda_AtomVecAngleCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | MOLECULE_MASK; + return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata, nsend, buf_send, copylist); +} + +int Cuda_AtomVecAngleCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | MOLECULE_MASK; + return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata, nsend, buf_send, copylist); +} + +int Cuda_AtomVecAngleCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag) +{ + const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag); +} + +int Cuda_AtomVecAngleCuda_PackBorderVel(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag); +} + +int Cuda_AtomVecAngleCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag) +{ + const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag); +} + +int Cuda_AtomVecAngleCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag); +} + +int Cuda_AtomVecAngleCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv) +{ + const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK; + return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv); +} + +int Cuda_AtomVecAngleCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK; + return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv); +} diff --git a/lib/cuda/atom_vec_angle_cuda_cu.h b/lib/cuda/atom_vec_angle_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..17e8e116878df713661b1ba8439d6a22be4cc7ae --- /dev/null +++ b/lib/cuda/atom_vec_angle_cuda_cu.h @@ -0,0 +1,15 @@ +#ifndef ATOM_VEC_ANGLE_CUDA_CU_H_ +#define ATOM_VEC_ANGLE_CUDA_CU_H_ + +extern "C" void 
Cuda_AtomVecAngleCuda_Init(cuda_shared_data* sdata); +extern "C" int Cuda_AtomVecAngleCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send); +extern "C" int Cuda_AtomVecAngleCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist); +extern "C" int Cuda_AtomVecAngleCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist); +extern "C" int Cuda_AtomVecAngleCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag); +extern "C" int Cuda_AtomVecAngleCuda_PackBorderVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag); +extern "C" int Cuda_AtomVecAngleCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag); +extern "C" int Cuda_AtomVecAngleCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag); +extern "C" int Cuda_AtomVecAngleCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv); +extern "C" int Cuda_AtomVecAngleCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv); + +#endif /*ATOM_VEC_ANGLE2_CUDA_CU_H_*/ diff --git a/lib/cuda/atom_vec_atomic_cuda.cu b/lib/cuda/atom_vec_atomic_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..e53efe9f74803a19ec08c812af928699038690f3 --- /dev/null +++ b/lib/cuda/atom_vec_atomic_cuda.cu @@ -0,0 +1,85 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +const unsigned int ATOMIC_DATA_MASK = X_MASK | V_MASK | F_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK; + +#include "atom_vec_atomic_cuda_cu.h" + +void Cuda_AtomVecAtomicCuda_Init(cuda_shared_data* sdata) +{ + return Cuda_AtomVecCuda_Init<ATOMIC_DATA_MASK>(sdata); +} + +int Cuda_AtomVecAtomicCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK; + return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata, n, dim, buf_send); +} + +int Cuda_AtomVecAtomicCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK; + return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata, nsend, buf_send, copylist); +} + +int Cuda_AtomVecAtomicCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK; + return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata, nsend, buf_send, copylist); +} + +int Cuda_AtomVecAtomicCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag) +{ + const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK; + return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag); +} + +int Cuda_AtomVecAtomicCuda_PackBorderVel(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK; + return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag); +} + +int Cuda_AtomVecAtomicCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag) +{ + const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK; + return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag); +} + +int Cuda_AtomVecAtomicCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK; + return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag); +} + +int Cuda_AtomVecAtomicCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv) +{ + const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK; + return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv); +} + +int Cuda_AtomVecAtomicCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK; + return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv); +} diff --git a/lib/cuda/atom_vec_atomic_cuda_cu.h b/lib/cuda/atom_vec_atomic_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..88b18f311def5964d78259f399792d47e82b0332 --- /dev/null +++ b/lib/cuda/atom_vec_atomic_cuda_cu.h @@ -0,0 +1,15 @@ +#ifndef ATOM_VEC_ATOMIC_CUDA_CU_H_ +#define ATOM_VEC_ATOMIC_CUDA_CU_H_ + +extern "C" void Cuda_AtomVecAtomicCuda_Init(cuda_shared_data* sdata); +extern "C" int Cuda_AtomVecAtomicCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send); +extern "C" 
int Cuda_AtomVecAtomicCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist); +extern "C" int Cuda_AtomVecAtomicCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist); +extern "C" int Cuda_AtomVecAtomicCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag); +extern "C" int Cuda_AtomVecAtomicCuda_PackBorderVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag); +extern "C" int Cuda_AtomVecAtomicCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag); +extern "C" int Cuda_AtomVecAtomicCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag); +extern "C" int Cuda_AtomVecAtomicCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv); +extern "C" int Cuda_AtomVecAtomicCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv); + +#endif /*ATOM_VEC_ATOMIC2_CUDA_CU_H_*/ diff --git a/lib/cuda/atom_vec_charge_cuda.cu b/lib/cuda/atom_vec_charge_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..b2441d770cd5b34526c866601389d0fb97882de1 --- /dev/null +++ b/lib/cuda/atom_vec_charge_cuda.cu @@ -0,0 +1,85 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +const unsigned int CHARGE_DATA_MASK = X_MASK | V_MASK | F_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK; + +#include "atom_vec_charge_cuda_cu.h" + +void Cuda_AtomVecChargeCuda_Init(cuda_shared_data* sdata) +{ + return Cuda_AtomVecCuda_Init<CHARGE_DATA_MASK>(sdata); +} + +int Cuda_AtomVecChargeCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK; + return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata, n, dim, buf_send); +} + +int Cuda_AtomVecChargeCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK; + return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata, nsend, buf_send, copylist); +} + +int Cuda_AtomVecChargeCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK; + return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata, nsend, buf_send, copylist); +} + +int Cuda_AtomVecChargeCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag) +{ + const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK; + return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag); +} + +int Cuda_AtomVecChargeCuda_PackBorderVel(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK; + return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag); +} + +int Cuda_AtomVecChargeCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag) +{ + const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK; + return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag); +} + +int Cuda_AtomVecChargeCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK; + return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag); +} + +int Cuda_AtomVecChargeCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv) +{ + const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK; + return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv); +} + +int Cuda_AtomVecChargeCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK; + return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv); +} diff --git a/lib/cuda/atom_vec_charge_cuda_cu.h b/lib/cuda/atom_vec_charge_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..ebae713142bf6b6ffa3ec2277aeb81cf1dc90632 --- /dev/null +++ b/lib/cuda/atom_vec_charge_cuda_cu.h @@ -0,0 +1,15 @@ +#ifndef ATOM_VEC_CHARGE_CUDA_CU_H_ +#define ATOM_VEC_CHARGE_CUDA_CU_H_ + +extern "C" void Cuda_AtomVecChargeCuda_Init(cuda_shared_data* sdata); +extern "C" int 
Cuda_AtomVecChargeCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send); +extern "C" int Cuda_AtomVecChargeCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist); +extern "C" int Cuda_AtomVecChargeCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist); +extern "C" int Cuda_AtomVecChargeCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag); +extern "C" int Cuda_AtomVecChargeCuda_PackBorderVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag); +extern "C" int Cuda_AtomVecChargeCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag); +extern "C" int Cuda_AtomVecChargeCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag); +extern "C" int Cuda_AtomVecChargeCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv); +extern "C" int Cuda_AtomVecChargeCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv); + +#endif /*ATOM_VEC_CHARGE2_CUDA_CU_H_*/ diff --git a/lib/cuda/atom_vec_cuda.cu b/lib/cuda/atom_vec_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..091fb7dbf17a8b171f220c1fc64bf616efe2cba4 --- /dev/null +++ b/lib/cuda/atom_vec_cuda.cu @@ -0,0 +1,628 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX atom_vec_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "cuda_wrapper_cu.h"
+#include "crm_cuda_utils.cu"
+
+#include "atom_vec_cuda_kernel.cu"
+
+int AtomVecCuda_CountDataItems(unsigned int data_mask)
+{
+  int n = 0;
+
+  if(data_mask & X_MASK) n += 3;
+
+  if(data_mask & V_MASK) n += 3;
+
+  if(data_mask & F_MASK) n += 3;
+
+  if(data_mask & TAG_MASK) n++;
+
+  if(data_mask & TYPE_MASK) n++;
+
+  if(data_mask & MASK_MASK) n++;
+
+  if(data_mask & IMAGE_MASK) n++;
+
+  if(data_mask & Q_MASK) n++;
+
+  if(data_mask & MOLECULE_MASK) n++;
+
+  if(data_mask & RMASS_MASK) n++;
+
+  if(data_mask & RADIUS_MASK) n++;
+
+  if(data_mask & DENSITY_MASK) n++;
+
+  if(data_mask & OMEGA_MASK) n += 3;
+
+  if(data_mask & TORQUE_MASK) n += 3;  // torque is a 3-vector, like omega
+
+  //if(data_mask & NSPECIAL_MASK) n+=3;
+  return n;
+}
+
+void Cuda_AtomVecCuda_UpdateBuffer(cuda_shared_data* sdata, int size)
+{
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_AtomVecCuda Resizing Buffer at %p with %i bytes to\n", sdata->buffer, sdata->buffersize);)
+    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+    sdata->buffer = CudaWrapper_AllocCudaData(size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i bytes\n", sdata->buffer, sdata->buffersize);)
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
+}
+
+template <const unsigned int data_mask>
+void Cuda_AtomVecCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(image) , & sdata->atom.image.dev_data, sizeof(int*));
+
+  if(data_mask & Q_MASK) cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*));
+
+  if(data_mask & MOLECULE_MASK) cudaMemcpyToSymbol(MY_AP(molecule) , & sdata->atom.molecule.dev_data, sizeof(int*));
+
+  if(data_mask & RADIUS_MASK) cudaMemcpyToSymbol(MY_AP(radius) , & sdata->atom.radius.dev_data, sizeof(int*));
+
+  if(data_mask & DENSITY_MASK) cudaMemcpyToSymbol(MY_AP(density) , & sdata->atom.density.dev_data, sizeof(int*));
+
+  if(data_mask & RMASS_MASK) cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(int*));
+
+  if(data_mask & OMEGA_MASK) cudaMemcpyToSymbol(MY_AP(omega) , & sdata->atom.omega.dev_data, sizeof(int*));
+
+  //if(data_mask & NSPECIAL_MASK) cudaMemcpyToSymbol(MY_AP(nspecial) , & sdata->atom.nspecial.dev_data, sizeof(int*) );
+  cudaMemcpyToSymbol(MY_AP(flag) , & sdata->flag, sizeof(int*));
+}
+
+template <const unsigned int data_mask>
+void Cuda_AtomVecCuda_Init(cuda_shared_data* sdata)
+{
+  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... start\n");)
+
+  if(sdata->atom.update_nmax)
+    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+
+  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... 
post Nmax\n");) + cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd, 3 * sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(sublo) , & sdata->domain.sublo, 3 * sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(subhi) , & sdata->domain.subhi, 3 * sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(flag) , & sdata->flag, sizeof(int*)); + cudaThreadSynchronize(); + MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... end\n");) +} + + +template <const unsigned int data_mask> +int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag) +{ + + timespec time1, time2; + + if(sdata->atom.update_nmax) + Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + int n_data_items = AtomVecCuda_CountDataItems(data_mask); + int size = (n * n_data_items) * sizeof(X_FLOAT); + + if(sdata->buffer_new or (size > sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata, size); + + X_FLOAT dx = 0.0; + X_FLOAT dy = 0.0; + X_FLOAT dz = 0.0; + + if(pbc_flag != 0) { + if(sdata->domain.triclinic == 0) { + dx = pbc[0] * sdata->domain.prd[0]; + dy = pbc[1] * sdata->domain.prd[1]; + dz = pbc[2] * sdata->domain.prd[2]; + } else { + dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz; + dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz; + dz = pbc[2] * sdata->domain.prd[2]; + } + } + + int3 layout = getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal > 0) { + cudaMemset(sdata->flag, 0, sizeof(int)); + + clock_gettime(CLOCK_REALTIME, &time1); + + void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer; + Cuda_AtomVecCuda_PackComm_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n + , sdata->comm.maxlistlength, iswap, dx, dy, dz, buf); + cudaThreadSynchronize(); + + clock_gettime(CLOCK_REALTIME, &time2); + sdata->cuda_timings.comm_forward_kernel_pack += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; + + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed"); + + if(not sdata->overlap_comm) + cudaMemcpy(buf_send, sdata->buffer, n* n_data_items* sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + + clock_gettime(CLOCK_REALTIME, &time1); + sdata->cuda_timings.comm_forward_download += + time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000; + + int aflag; + cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + if(aflag != 0) printf("aflag PackComm: %i\n", aflag); + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed"); + + } + + return n_data_items * n; +} + + +template <const unsigned int data_mask> +int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag) +{ + MYDBG(printf(" # CUDA: AtomVecCuda_PackComm_Self\n");) + timespec time1, time2; + + if(sdata->atom.update_nmax) + Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + int n_data_items = AtomVecCuda_CountDataItems(data_mask); + int size = (n * n_data_items) * sizeof(X_FLOAT); + + if(sdata->buffer_new or (size > sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata, size); + + static int count = -1; + count++; + 
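+  // dx/dy/dz are the periodic-image shifts applied to positions communicated
+  // across a box boundary: whole box lengths for orthogonal boxes, plus the
+  // tilt factors xy, xz and yz for triclinic boxes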
X_FLOAT dx = 0.0; + X_FLOAT dy = 0.0; + X_FLOAT dz = 0.0; + + if(pbc_flag != 0) { + if(sdata->domain.triclinic == 0) { + dx = pbc[0] * sdata->domain.prd[0]; + dy = pbc[1] * sdata->domain.prd[1]; + dz = pbc[2] * sdata->domain.prd[2]; + } else { + dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz; + dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz; + dz = pbc[2] * sdata->domain.prd[2]; + } + } + + + + int3 layout = getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal > 0) { + + clock_gettime(CLOCK_REALTIME, &time1); + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self:Pre Kernel execution failed"); + + Cuda_AtomVecCuda_PackComm_Self_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first); + cudaThreadSynchronize(); + + clock_gettime(CLOCK_REALTIME, &time2); + sdata->cuda_timings.comm_forward_kernel_self += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; + + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self: Kernel execution failed"); + } + + return n_data_items * n; +} + + +template <const unsigned int data_mask> +void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap) +{ + timespec time1, time2; + + if(sdata->atom.update_nmax) + Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + int n_data_items = AtomVecCuda_CountDataItems(data_mask); + int size = (n * n_data_items) * sizeof(X_FLOAT); + + if(sdata->buffer_new or (size > sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata, size); + + int3 layout = getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal > 0) { + clock_gettime(CLOCK_REALTIME, &time1); + + if(not sdata->overlap_comm || iswap < 0) + cudaMemcpy(sdata->buffer, (void*)buf_recv, n_data_items * n * sizeof(X_FLOAT), cudaMemcpyHostToDevice); + + clock_gettime(CLOCK_REALTIME, &time2); + sdata->cuda_timings.comm_forward_upload += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; + void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer; + Cuda_AtomVecCuda_UnpackComm_Kernel<data_mask> <<< grid, threads, 0>>>(n, first, buf); + cudaThreadSynchronize(); + + clock_gettime(CLOCK_REALTIME, &time1); + sdata->cuda_timings.comm_forward_kernel_unpack += + time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000; + + CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackComm: Kernel execution failed"); + + } +} + +template <const unsigned int data_mask> +int Cuda_AtomVecCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send) +{ + MYDBG(printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... 
start dim %i \n", dim);) + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: pre Kernel execution failed"); + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + Cuda_AtomVecCuda_Init<data_mask>(sdata); + int size = n * sizeof(double); + + if(sdata->buffer_new or (size > sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata, size); + + cudaMemset((int*)(sdata->buffer), 0, sizeof(int)); + + int3 layout = getgrid(sdata->atom.nlocal, sizeof(int), 256, true); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + timespec time1, time2; + clock_gettime(CLOCK_REALTIME, &time1); + + Cuda_AtomVecCuda_PackExchangeList_Kernel <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (n - 1, dim); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: Kernel execution failed"); + + clock_gettime(CLOCK_REALTIME, &time2); + sdata->cuda_timings.comm_exchange_kernel_pack += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; + + cudaMemcpy(buf_send, sdata->buffer, sizeof(double), cudaMemcpyDeviceToHost); + int return_value = ((int*) buf_send)[0]; + + if(n > 1 + return_value) + cudaMemcpy(buf_send, sdata->buffer, (1 + return_value)*sizeof(double), cudaMemcpyDeviceToHost); + + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: return copy failed"); + + clock_gettime(CLOCK_REALTIME, &time1); + sdata->cuda_timings.comm_exchange_download += + time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000; + + MYDBG(printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... done\n");) + return return_value; +} + +template <const unsigned int data_mask> +int Cuda_AtomVecCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist) +{ + MYDBG(printf("# CUDA: Cuda_AtomVecCuda_PackExchange ... start \n");) + + if(sdata->atom.update_nmax) + Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata); + + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + int n_data_items = AtomVecCuda_CountDataItems(data_mask) + 1; + int size = (nsend * n_data_items + 1) * sizeof(double); + + if(sdata->buffer_new or (size > sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata, size); + + cudaMemset((int*)(sdata->buffer), 0, sizeof(int)); + + int3 layout = getgrid(nsend, 0); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + timespec time1, time2; + clock_gettime(CLOCK_REALTIME, &time1); + + Cuda_AtomVecCuda_PackExchange_Kernel<data_mask> <<< grid, threads, 0>>>(nsend, (int*) copylist); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchange: Kernel execution failed"); + + clock_gettime(CLOCK_REALTIME, &time2); + sdata->cuda_timings.comm_exchange_kernel_pack += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; + + cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost); + + clock_gettime(CLOCK_REALTIME, &time1); + sdata->cuda_timings.comm_exchange_download += + time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000; + + MYDBG(printf("# CUDA: Cuda_AtomVecCuda_PackExchange ... 
done\n");) + return nsend * n_data_items + 1; +} + + +template <const unsigned int data_mask> +int Cuda_AtomVecCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist) +{ + Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata); + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + int n_data_items = AtomVecCuda_CountDataItems(data_mask) + 1; + + int size = (nsend * n_data_items + 1) * sizeof(double); + + if(sdata->buffer_new or (size > sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata, size); + + cudaMemcpyToSymbol(MY_AP(flag) , & sdata->flag, sizeof(int*)); + + cudaMemset((int*)(sdata->flag), 0, sizeof(int)); + + if(nsend) { + int3 layout = getgrid(nsend, 0); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal > 0) { + timespec time1, time2; + clock_gettime(CLOCK_REALTIME, &time1); + + cudaMemcpy(sdata->buffer, buf_send , size, cudaMemcpyHostToDevice); + + clock_gettime(CLOCK_REALTIME, &time2); + sdata->cuda_timings.comm_exchange_upload += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; + + Cuda_AtomVecCuda_UnpackExchange_Kernel<data_mask> <<< grid, threads, 0>>>(sdata->exchange_dim, nsend, (int*) copylist); + cudaThreadSynchronize(); + + clock_gettime(CLOCK_REALTIME, &time1); + sdata->cuda_timings.comm_exchange_kernel_unpack += + time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000; + + CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackExchange: Kernel execution failed"); + } + } + + int naccept; + cudaMemcpy((void*)&naccept, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + + return naccept; +} + +template <const unsigned int data_mask> +int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag) +{ + timespec atime1, atime2; + clock_gettime(CLOCK_REALTIME, &atime1); + + if(sdata->atom.update_nmax) + Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + clock_gettime(CLOCK_REALTIME, &atime2); + sdata->cuda_timings.test1 += + atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000; + + int n_data_items = AtomVecCuda_CountDataItems(data_mask); + + int size = nsend * n_data_items * sizeof(X_FLOAT); + + if(sdata->buffer_new or (size > sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata, size); + + X_FLOAT dx = 0.0; + X_FLOAT dy = 0.0; + X_FLOAT dz = 0.0; + + if(pbc_flag != 0) { + if(sdata->domain.triclinic == 0) { + dx = pbc[0] * sdata->domain.prd[0]; + dy = pbc[1] * sdata->domain.prd[1]; + dz = pbc[2] * sdata->domain.prd[2]; + } else { + dx = pbc[0]; + dy = pbc[1]; + dz = pbc[2]; + } + } + + int3 layout = getgrid(nsend); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal > 0) { + timespec time1, time2; + clock_gettime(CLOCK_REALTIME, &time1); + + Cuda_AtomVecCuda_PackBorder_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, nsend, sdata->comm.maxlistlength, iswap, dx, dy, dz); + cudaThreadSynchronize(); + + clock_gettime(CLOCK_REALTIME, &time2); + sdata->cuda_timings.comm_border_kernel_pack += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; + + cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost); + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder: Kernel execution failed"); + + clock_gettime(CLOCK_REALTIME, &time1); + 
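+    // time2 was taken right after the pack kernel and time1 right after the
+    // device-to-host copy above, so their difference is the border download
+    // time in seconds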
sdata->cuda_timings.comm_border_download += + time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000; + + } + + return nsend * n_data_items; +} + +template <const unsigned int data_mask> +int Cuda_AtomVecCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag) +{ + if(sdata->atom.update_nmax) + Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + int n_data_items = AtomVecCuda_CountDataItems(data_mask); + + int size = n * n_data_items * sizeof(X_FLOAT); + + if(sdata->buffer_new or (size > sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata, size); + + X_FLOAT dx = 0.0; + X_FLOAT dy = 0.0; + X_FLOAT dz = 0.0; + + if(pbc_flag != 0) { + if(sdata->domain.triclinic == 0) { + dx = pbc[0] * sdata->domain.prd[0]; + dy = pbc[1] * sdata->domain.prd[1]; + dz = pbc[2] * sdata->domain.prd[2]; + } else { + dx = pbc[0]; + dy = pbc[1]; + dz = pbc[2]; + } + } + + int3 layout = getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal > 0) { + timespec time1, time2; + clock_gettime(CLOCK_REALTIME, &time1); + + Cuda_AtomVecCuda_PackBorder_Self_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first); + cudaThreadSynchronize(); + + clock_gettime(CLOCK_REALTIME, &time2); + sdata->cuda_timings.comm_border_kernel_self += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; + + CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder_Self: Kernel execution failed"); + + } + + return n * n_data_items; +} + + +template <const unsigned int data_mask> +int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv) +{ + timespec atime1, atime2; + clock_gettime(CLOCK_REALTIME, &atime1); + + if(sdata->atom.update_nmax) + Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + clock_gettime(CLOCK_REALTIME, &atime2); + sdata->cuda_timings.test1 += + atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000; + + int n_data_items = AtomVecCuda_CountDataItems(data_mask); + + int size = n * n_data_items * sizeof(X_FLOAT); + + if(sdata->buffer_new or (size > sdata->buffersize)) + Cuda_AtomVecCuda_UpdateBuffer(sdata, size); + + int3 layout = getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal > 0) { + timespec time1, time2; + clock_gettime(CLOCK_REALTIME, &time1); + + cudaMemset((int*)(sdata->flag), 0, sizeof(int)); + cudaMemcpy(sdata->buffer, (void*)buf_recv, size, cudaMemcpyHostToDevice); + + clock_gettime(CLOCK_REALTIME, &time2); + sdata->cuda_timings.comm_border_upload += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; + + Cuda_AtomVecCuda_UnpackBorder_Kernel<data_mask> <<< grid, threads, 0>>>(n, first); + cudaThreadSynchronize(); + + clock_gettime(CLOCK_REALTIME, &time1); + sdata->cuda_timings.comm_border_kernel_unpack += + time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000; + + cudaMemcpy(&sdata->comm.grow_flag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + + CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackBorder: Kernel execution failed"); + + } + + return sdata->comm.grow_flag; +} + + +#include "atom_vec_angle_cuda.cu" 
+#include "atom_vec_atomic_cuda.cu" +#include "atom_vec_charge_cuda.cu" +#include "atom_vec_full_cuda.cu" +//#include "atom_vec_granular_cuda.cu" diff --git a/lib/cuda/atom_vec_cuda_cu.h b/lib/cuda/atom_vec_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lib/cuda/atom_vec_cuda_kernel.cu b/lib/cuda/atom_vec_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..5e2f6a974fc74ddf0e787c7ad24b97d17c6999c3 --- /dev/null +++ b/lib/cuda/atom_vec_cuda_kernel.cu @@ -0,0 +1,512 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ +#define RIMLARGER 1.000001 +#define RIMSMALLER 0.999999 +#define SMALL 1e-5 + +extern __shared__ int shared[]; + +template <const unsigned int data_mask> +__global__ void Cuda_AtomVecCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, void* buffer) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + int* list = sendlist + iswap * maxlistlength; + + if(i < n) { + int j = list[i]; + + if(j > _nmax) _flag[0] = 1; + + int k = 0; + + if(data_mask & X_MASK) { + ((X_FLOAT*) buffer)[i + k * n] = _x[j] + dx; + k++; + ((X_FLOAT*) buffer)[i + k * n] = _x[j + _nmax] + dy; + k++; + ((X_FLOAT*) buffer)[i + k * n] = _x[j + 2 * _nmax] + dz; + k++; + } + + if(data_mask & V_MASK) { + ((X_FLOAT*) buffer)[i + k * n] = _v[j]; + k++; + ((X_FLOAT*) buffer)[i + k * n] = _v[j + _nmax]; + k++; + ((X_FLOAT*) buffer)[i + k * n] = _v[j + 2 * _nmax]; + k++; + } + + if(data_mask & OMEGA_MASK) { + ((X_FLOAT*) buffer)[i + k * n] = _omega[j]; + k++; + ((X_FLOAT*) buffer)[i + k * n] = _omega[j + _nmax]; + k++; + ((X_FLOAT*) buffer)[i + k * n] = _omega[j + 2 * _nmax]; + k++; + } + + if(data_mask & RADIUS_MASK)((X_FLOAT*) buffer)[i + k * n] = _radius[j]; + + k++; + + if(data_mask & RMASS_MASK)((X_FLOAT*) buffer)[i + k * n] = _rmass[j]; + + k++; + } +} + +template <const unsigned int data_mask> +__global__ void Cuda_AtomVecCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + int* list = sendlist + iswap * maxlistlength; + + if(i < n) { + int j = i; + j = list[i]; + + if(data_mask & X_MASK) { + _x[i + first] = _x[j] + dx; + _x[i + first + _nmax] = _x[j + _nmax] + dy; + _x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz; + } + + if(data_mask & V_MASK) { + _v[i + first] = _v[j]; + _v[i + first + _nmax] = _v[j + _nmax]; + _v[i + first + 2 * _nmax] = _v[j + 2 * _nmax]; + } + + if(data_mask & OMEGA_MASK) { + _omega[i + first] = _omega[j]; + _omega[i + first + _nmax] = 
_omega[j + _nmax]; + _omega[i + first + 2 * _nmax] = _omega[j + 2 * _nmax]; + } + + if(data_mask & RADIUS_MASK) _radius[i + first] = _radius[j]; + + if(data_mask & RMASS_MASK) _rmass[i + first] = _rmass[j]; + } +} + + +template <const unsigned int data_mask> +__global__ void Cuda_AtomVecCuda_UnpackComm_Kernel(int n, int first, void* buffer) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < n) { + int k = 0; + + if(data_mask & X_MASK) { + _x[i + first] = ((X_FLOAT*) buffer)[i + k * n]; + k++; + _x[i + first + _nmax] = ((X_FLOAT*) buffer)[i + k * n]; + k++; + _x[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + k * n]; + k++; + } + + if(data_mask & V_MASK) { + _v[i + first] = ((X_FLOAT*) buffer)[i + k * n]; + k++; + _v[i + first + _nmax] = ((X_FLOAT*) buffer)[i + k * n]; + k++; + _v[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + k * n]; + k++; + } + + if(data_mask & OMEGA_MASK) { + _omega[i + first] = ((X_FLOAT*) buffer)[i + k * n]; + k++; + _omega[i + first + _nmax] = ((X_FLOAT*) buffer)[i + k * n]; + k++; + _omega[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + k * n]; + k++; + } + + if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_FLOAT*) buffer)[i + k * n]; + + k++; + + if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_FLOAT*) buffer)[i + k * n]; + + k++; + } +} + + +__global__ void Cuda_AtomVecCuda_PackExchangeList_Kernel(int n, int dim) +{ + double* buf = (double*) _buffer; + buf = &buf[1]; + + //X_FLOAT lo=slablo[iswap]; + //X_FLOAT hi=slabhi[iswap]; + + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + bool add = false; + + if(i < _nlocal) { + double xdim_tmp = static_cast <double>(_x[i + dim * _nmax]); + + if(xdim_tmp < _sublo[dim] || xdim_tmp >= _subhi[dim]) { + add = true; + } + } + + shared[threadIdx.x] = add ? 
1 : 0; + __syncthreads(); + int nsend = 0; + + if(threadIdx.x == 0) { + for(int k = 0; k < blockDim.x; k++) { + if(shared[k]) { + nsend++; + shared[k] = nsend; + } + } + + shared[blockDim.x] = atomicAdd((int*) _buffer, nsend); + } + + __syncthreads(); + + nsend = shared[blockDim.x] + shared[threadIdx.x] - 1; + + if(add && nsend + 1 < n) + buf[nsend] = i; +} + +template <const unsigned int data_mask> +__global__ void Cuda_AtomVecCuda_PackExchange_Kernel(int nsend, int* copylist) +{ + double* buf = (double*) _buffer; + int k = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(k >= nsend) return; + + buf = &buf[1 + k]; + + int i = static_cast <int>(buf[0]); + int j = copylist[k]; + + int m = 1; + + if(data_mask & X_MASK) { + buf[(m++)*nsend] = static_cast <double>(_x[i]); + buf[(m++)*nsend] = static_cast <double>(_x[i + _nmax]); + buf[(m++)*nsend] = static_cast <double>(_x[i + 2 * _nmax]); + } + + if(data_mask & V_MASK) { + buf[(m++)*nsend] = _v[i]; + buf[(m++)*nsend] = _v[i + _nmax]; + buf[(m++)*nsend] = _v[i + 2 * _nmax]; + } + + if(data_mask & TAG_MASK) buf[(m++)*nsend] = _tag[i]; + + if(data_mask & TYPE_MASK) buf[(m++)*nsend] = _type[i]; + + if(data_mask & MASK_MASK) buf[(m++)*nsend] = _mask[i]; + + if(data_mask & IMAGE_MASK) buf[(m++)*nsend] = _image[i]; + + if(data_mask & Q_MASK) buf[(m++)*nsend] = _q[i]; + + if(data_mask & MOLECULE_MASK) buf[(m++)*nsend] = _molecule[i]; + + if(data_mask & RADIUS_MASK) buf[(m++)*nsend] = _radius[i]; + + if(data_mask & DENSITY_MASK) buf[(m++)*nsend] = _density[i]; + + if(data_mask & RMASS_MASK) buf[(m++)*nsend] = _rmass[i]; + + if(data_mask & OMEGA_MASK) { + buf[(m++)*nsend] = _omega[i]; + buf[(m++)*nsend] = _omega[i + _nmax]; + buf[(m++)*nsend] = _omega[i + 2 * _nmax]; + } + + /* if(data_mask & NSPECIAL_MASK) + { + buf[(m++)*nsend] = _nspecial[i]; + buf[(m++)*nsend] = _nspecial[i+_nmax]; + buf[(m++)*nsend] = _nspecial[i+2* _nmax]; + }*/ + + if(i >= _nlocal) return; + + if(data_mask & X_MASK) { + _x[i] = _x[j]; + _x[i + _nmax] = _x[j + _nmax]; + _x[i + 2 * _nmax] = _x[j + 2 * _nmax]; + } + + if(data_mask & V_MASK) { + _v[i] = _v[j]; + _v[i + _nmax] = _v[j + _nmax]; + _v[i + 2 * _nmax] = _v[j + 2 * _nmax]; + } + + if(data_mask & TAG_MASK) _tag[i] = _tag[j]; + + if(data_mask & TYPE_MASK) _type[i] = _type[j]; + + if(data_mask & MASK_MASK) _mask[i] = _mask[j]; + + if(data_mask & IMAGE_MASK) _image[i] = _image[j]; + + if(data_mask & Q_MASK) _q[i] = _q[j]; + + if(data_mask & MOLECULE_MASK) _molecule[i] = _molecule[j]; + + if(data_mask & RADIUS_MASK) _radius[i] = _radius[j]; + + if(data_mask & DENSITY_MASK) _density[i] = _density[j]; + + if(data_mask & RMASS_MASK) _rmass[i] = _rmass[j]; + + if(data_mask & OMEGA_MASK) { + _omega[i] = _omega[j]; + _omega[i + _nmax] = _omega[j + _nmax]; + _omega[i + 2 * _nmax] = _omega[j + 2 * _nmax]; + } + + /* if(data_mask & NSPECIAL_MASK) + { + _nspecial[i] = _nspecial[j]; + _nspecial[i+_nmax] = _nspecial[j+_nmax]; + _nspecial[i+2* _nmax] = _nspecial[j+2* _nmax]; + }*/ +} + +template <const unsigned int data_mask> +__global__ void Cuda_AtomVecCuda_UnpackExchange_Kernel(int dim, int nsend, int* copylist) +{ + double* buf = (double*) _buffer; + int k = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(k >= nsend) return; + + buf = &buf[1 + k]; + int i = -1; + double xdim_tmp = buf[(1 + dim) * nsend]; + + if(xdim_tmp >= _sublo[dim] - SMALL && xdim_tmp < _subhi[dim] + SMALL) { + i = atomicAdd(_flag, 1) + _nlocal; + + int m = 1; + + if(data_mask & X_MASK) { + _x[i] = buf[(m++) * nsend]; 
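+      // struct-of-arrays buffer layout: after the &buf[1 + k] offset above,
+      // field m of incoming atom k is read from buf[m * nsend], i.e. each
+      // field is stored contiguously for all nsend atoms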
+      _x[i + _nmax] = buf[(m++) * nsend];
+      _x[i + 2 * _nmax] = buf[(m++) * nsend];
+    }
+
+    if(data_mask & V_MASK) {
+      _v[i] = buf[(m++) * nsend];
+      _v[i + _nmax] = buf[(m++) * nsend];
+      _v[i + 2 * _nmax] = buf[(m++) * nsend];
+    }
+
+    if(data_mask & TAG_MASK) _tag[i] = buf[(m++) * nsend];
+
+    if(data_mask & TYPE_MASK) _type[i] = buf[(m++) * nsend];
+
+    if(data_mask & MASK_MASK) _mask[i] = buf[(m++) * nsend];
+
+    if(data_mask & IMAGE_MASK) _image[i] = buf[(m++) * nsend];
+
+    if(data_mask & Q_MASK) _q[i] = buf[(m++) * nsend];
+
+    if(data_mask & MOLECULE_MASK) _molecule[i] = buf[(m++) * nsend];
+
+    if(data_mask & RADIUS_MASK) _radius[i] = buf[(m++) * nsend];
+
+    if(data_mask & DENSITY_MASK) _density[i] = buf[(m++) * nsend];
+
+    if(data_mask & RMASS_MASK) _rmass[i] = buf[(m++) * nsend];
+
+    if(data_mask & OMEGA_MASK) {
+      _omega[i] = buf[(m++) * nsend];
+      _omega[i + _nmax] = buf[(m++) * nsend];
+      _omega[i + 2 * _nmax] = buf[(m++) * nsend];
+    }
+
+    /*  if(data_mask & NSPECIAL_MASK) {
+          _nspecial[i] = buf[(m++)*nsend];
+          _nspecial[i+_nmax] = buf[(m++)*nsend];
+          _nspecial[i+2*_nmax] = buf[(m++)*nsend];
+        }*/
+  }
+
+  copylist[k] = i;
+}
+
+template <const unsigned int data_mask>
+__global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  int* list = sendlist + iswap * maxlistlength;
+
+  if(i < n) {
+    int j = list[i];
+    int m = 0;
+
+    if(data_mask & X_MASK) {
+      ((X_FLOAT*) _buffer)[i + (m++)*n] = _x[j] + dx;
+      ((X_FLOAT*) _buffer)[i + (m++)*n] = _x[j + _nmax] + dy;
+      ((X_FLOAT*) _buffer)[i + (m++)*n] = _x[j + 2 * _nmax] + dz;
+    }
+
+    if(data_mask & V_MASK) {
+      ((X_FLOAT*) _buffer)[i + (m++)*n] = _v[j];
+      ((X_FLOAT*) _buffer)[i + (m++)*n] = _v[j + _nmax];
+      ((X_FLOAT*) _buffer)[i + (m++)*n] = _v[j + 2 * _nmax];
+    }
+
+    if(data_mask & TAG_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _tag[j];
+
+    if(data_mask & TYPE_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _type[j];
+
+    if(data_mask & MASK_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _mask[j];
+
+    if(data_mask & Q_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _q[j];
+
+    if(data_mask & MOLECULE_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _molecule[j];
+
+    // every field must be read from the listed atom j, not the thread index i
+    if(data_mask & RADIUS_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _radius[j];
+
+    if(data_mask & DENSITY_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _density[j];
+
+    if(data_mask & RMASS_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _rmass[j];
+
+    if(data_mask & OMEGA_MASK) {
+      ((X_FLOAT*) _buffer)[i + (m++)*n] = _omega[j];
+      ((X_FLOAT*) _buffer)[i + (m++)*n] = _omega[j + _nmax];
+      ((X_FLOAT*) _buffer)[i + (m++)*n] = _omega[j + 2 * _nmax];
+    }
+  }
+}
+
+
+template <const unsigned int data_mask>
+__global__ void Cuda_AtomVecCuda_PackBorder_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  int* list = sendlist + iswap * maxlistlength;
+
+  if(i < n) {
+    int j = list[i];
+
+    if(data_mask & X_MASK) {
+      _x[i + first] = _x[j] + dx;
+      _x[i + first + _nmax] = _x[j + _nmax] + dy;
+      _x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz;
+    }
+
+    if(data_mask & V_MASK) {
+      _v[i + first] = _v[j];
+      _v[i + first + _nmax] = _v[j + _nmax];
+      _v[i + first + 2 * _nmax] = _v[j + 2 * _nmax];
+    }
+
+    if(data_mask & TAG_MASK) _tag[i + first] = _tag[j];
+
+    if(data_mask & TYPE_MASK) _type[i + first] = _type[j];
+
+    if(data_mask & MASK_MASK) _mask[i + first] = 
+template <const unsigned int data_mask>
+__global__ void Cuda_AtomVecCuda_PackBorder_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  int* list = sendlist + iswap * maxlistlength;
+
+  if(i < n) {
+    int j = list[i];
+
+    if(data_mask & X_MASK) {
+      _x[i + first] = _x[j] + dx;
+      _x[i + first + _nmax] = _x[j + _nmax] + dy;
+      _x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz;
+    }
+
+    if(data_mask & V_MASK) {
+      _v[i + first] = _v[j];
+      _v[i + first + _nmax] = _v[j + _nmax];
+      _v[i + first + 2 * _nmax] = _v[j + 2 * _nmax];
+    }
+
+    if(data_mask & TAG_MASK) _tag[i + first] = _tag[j];
+
+    if(data_mask & TYPE_MASK) _type[i + first] = _type[j];
+
+    if(data_mask & MASK_MASK) _mask[i + first] = _mask[j];
+
+    if(data_mask & Q_MASK) _q[i + first] = _q[j];
+
+    if(data_mask & MOLECULE_MASK) _molecule[i + first] = _molecule[j];
+
+    if(data_mask & RADIUS_MASK) _radius[i + first] = _radius[j];
+
+    if(data_mask & DENSITY_MASK) _density[i + first] = _density[j];
+
+    if(data_mask & RMASS_MASK) _rmass[i + first] = _rmass[j];
+
+    if(data_mask & OMEGA_MASK) {
+      _omega[i + first] = _omega[j];
+      _omega[i + first + _nmax] = _omega[j + _nmax];
+      _omega[i + first + 2 * _nmax] = _omega[j + 2 * _nmax];
+    }
+  }
+}
+
+template <const unsigned int data_mask>
+__global__ void Cuda_AtomVecCuda_UnpackBorder_Kernel(int n, int first)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < n) {
+    if(i + first < _nmax) {
+      int m = 0;
+
+      if(data_mask & X_MASK) {
+        _x[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+        _x[i + first + _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+        _x[i + first + 2 * _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+      }
+
+      if(data_mask & V_MASK) {
+        _v[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+        _v[i + first + _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+        _v[i + first + 2 * _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+      }
+
+      if(data_mask & TAG_MASK) _tag[i + first] = static_cast<int>(((X_FLOAT*) _buffer)[i + (m++) * n]);
+
+      if(data_mask & TYPE_MASK) _type[i + first] = static_cast<int>(((X_FLOAT*) _buffer)[i + (m++) * n]);
+
+      if(data_mask & MASK_MASK) _mask[i + first] = static_cast<int>(((X_FLOAT*) _buffer)[i + (m++) * n]);
+
+      if(data_mask & Q_MASK) _q[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+
+      if(data_mask & MOLECULE_MASK) _molecule[i + first] = static_cast<int>(((X_FLOAT*) _buffer)[i + (m++) * n]);
+
+      if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+
+      if(data_mask & DENSITY_MASK) _density[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+
+      if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+
+      if(data_mask & OMEGA_MASK) {
+        _omega[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+        _omega[i + first + _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+        _omega[i + first + 2 * _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+      }
+    } else {
+      _flag[0] = 1;
+    }
+  }
+}
diff --git a/lib/cuda/atom_vec_full_cuda.cu b/lib/cuda/atom_vec_full_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3128f2cf7d91f0fa60117a04afa34ad8da7f2548
--- /dev/null
+++ b/lib/cuda/atom_vec_full_cuda.cu
@@ -0,0 +1,85 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */ + +const unsigned int FULL_DATA_MASK = X_MASK | V_MASK | F_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK; + +#include "atom_vec_full_cuda_cu.h" + +void Cuda_AtomVecFullCuda_Init(cuda_shared_data* sdata) +{ + return Cuda_AtomVecCuda_Init<FULL_DATA_MASK>(sdata); +} + +int Cuda_AtomVecFullCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK; + return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata, n, dim, buf_send); +} + +int Cuda_AtomVecFullCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK; + return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata, nsend, buf_send, copylist); +} + +int Cuda_AtomVecFullCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK; + return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata, nsend, buf_send, copylist); +} + +int Cuda_AtomVecFullCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag) +{ + const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag); +} + +int Cuda_AtomVecFullCuda_PackBorderVel(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag); +} + +int Cuda_AtomVecFullCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag) +{ + const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag); +} + +int Cuda_AtomVecFullCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK; + return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag); +} + +int Cuda_AtomVecFullCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv) +{ + const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK; + return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv); +} + +int Cuda_AtomVecFullCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv) +{ + const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK; + return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv); +} diff --git a/lib/cuda/atom_vec_full_cuda_cu.h b/lib/cuda/atom_vec_full_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..811d2df624dd78eb1fcd5551a4c556d3859ed515 --- /dev/null +++ b/lib/cuda/atom_vec_full_cuda_cu.h @@ -0,0 +1,15 @@ +#ifndef ATOM_VEC_FULL_CUDA_CU_H_ +#define 
ATOM_VEC_FULL_CUDA_CU_H_
+
+extern "C" void Cuda_AtomVecFullCuda_Init(cuda_shared_data* sdata);
+extern "C" int Cuda_AtomVecFullCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send);
+extern "C" int Cuda_AtomVecFullCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
+extern "C" int Cuda_AtomVecFullCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
+extern "C" int Cuda_AtomVecFullCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag);
+extern "C" int Cuda_AtomVecFullCuda_PackBorderVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
+extern "C" int Cuda_AtomVecFullCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
+extern "C" int Cuda_AtomVecFullCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
+extern "C" int Cuda_AtomVecFullCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv);
+extern "C" int Cuda_AtomVecFullCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv);
+
+#endif /*ATOM_VEC_FULL_CUDA_CU_H_*/
diff --git a/lib/cuda/binning_kernel.cu b/lib/cuda/binning_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bd40e3f0b940934d9f413da1b545016519d02ad0
--- /dev/null
+++ b/lib/cuda/binning_kernel.cu
@@ -0,0 +1,189 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+// load some variables from shared cuda data into device's constant memory:
+__device__ __constant__ X_FLOAT rez_bin_size[3];
+__device__ __constant__ unsigned* bin_error_count;
+
+__device__ __constant__ int cuda_dummy_type;
+__device__ __constant__ unsigned binned_size_all;
+__device__ __constant__ X_FLOAT outside[3];
+
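+// note on the layout assumed by the binning kernels below: each of the
+// _bin_dim.x * _bin_dim.y * _bin_dim.z cells owns _bin_nmax slots; the three
+// position components of slot k in bin j are stored _bin_nmax apart starting
+// at 3 * _bin_nmax * j + k, while scalar per-atom data (type, tag, q, rmass)
+// lives at _bin_nmax * j + k. PreBinning fills every slot with a dummy atom
+// placed outside the subdomain so untouched slots stay inert; it assumes it
+// is launched with blockDim.x == _bin_nmax.
+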
+__global__ void PreBinning_Kernel()
+{
+  const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
+
+  if(bin < gridDim.x * gridDim.y) { // TODO: suspected always to be true
+    _binned_type[blockDim.x * bin + threadIdx.x] = cuda_dummy_type;
+
+    const int i = 3 * blockDim.x * bin + threadIdx.x;
+    X_FLOAT* binned_x = _binned_x + i;
+    *binned_x = _subhi[0] + outside[0] * (1 + i);
+    binned_x += blockDim.x;
+    *binned_x = _subhi[1] + outside[1] * (1 + i);
+    binned_x += blockDim.x;
+    *binned_x = _subhi[2] + outside[2] * (1 + i);
+    // bugfix: the tag array is indexed per bin slot (stride blockDim.x),
+    // not per coordinate triple; indexing it with i wrote past the slot range
+    _binned_tag[blockDim.x * bin + threadIdx.x] = -1;
+  }
+}
+
+__global__ void Binning_Kernel(X_FLOAT* x, X_FLOAT* binned_x, int q_flag, int offset, int rmass_flag)
+{
+  const unsigned i = blockDim.x * blockIdx.x + threadIdx.x + offset;
+
+  int binatoms = _natoms;
+
+  if(offset == 0) binatoms = _nlocal;
+
+  if(i < binatoms) {
+    // copy the atom position from global device memory to local registers
+    // in these three steps to get as much coalesced access as possible
+    X_FLOAT my_xX, my_xY, my_xZ;
+    x += i;
+    my_xX = *x;
+    x += _nmax;
+    my_xY = *x;
+    x += _nmax;
+    my_xZ = *x;
+
+    // calculate flat bin index
+    int bx = __float2int_rd(rez_bin_size[0] * (my_xX - _sublo[0])) + 2;
+    int by = __float2int_rd(rez_bin_size[1] * (my_xY - _sublo[1])) + 2;
+    int bz = __float2int_rd(rez_bin_size[2] * (my_xZ - _sublo[2])) + 2;
+
+    // clamp the bin index into [0, _bin_dim - 1] in each dimension
+    bx -= bx * negativCUDA(1.0f * bx);
+    bx -= (bx - _bin_dim.x + 1) * negativCUDA(1.0f * _bin_dim.x - 1.0f - 1.0f * bx);
+    by -= by * negativCUDA(1.0f * by);
+    by -= (by - _bin_dim.y + 1) * negativCUDA(1.0f * _bin_dim.y - 1.0f - 1.0f * by);
+    bz -= bz * negativCUDA(1.0f * bz);
+    bz -= (bz - _bin_dim.z + 1) * negativCUDA(1.0f * _bin_dim.z - 1.0f - 1.0f * bz);
+
+    const unsigned j = _bin_dim.z * (_bin_dim.y * bx + by) + bz;
+
+    // add new atom to bin, get bin-array position
+    const unsigned k = atomicAdd(& _bin_count_all[j], 1);
+
+    if(offset == 0) atomicAdd(& _bin_count_local[j], 1);
+
+    if(k < _bin_nmax) {
+      // copy register values back to global device memory
+      unsigned pos = 3 * _bin_nmax * j + k;
+      _binpos[i] = pos;
+      binned_x += pos;
+      *binned_x = my_xX;
+      binned_x += _bin_nmax;
+      *binned_x = my_xY;
+      binned_x += _bin_nmax;
+      *binned_x = my_xZ;
+
+      // also copy velocity and force accordingly
+      binned_x = _binned_v + pos;
+      x = _v + i;
+      *binned_x = *x;
+      binned_x += _bin_nmax;
+      x += _nmax;
+      *binned_x = *x;
+      binned_x += _bin_nmax;
+      x += _nmax;
+      *binned_x = *x;
+
+      binned_x = _binned_f + pos;
+      x = _f + i;
+      *binned_x = *x;
+      binned_x += _bin_nmax;
+      x += _nmax;
+      *binned_x = *x;
+      binned_x += _bin_nmax;
+      x += _nmax;
+      *binned_x = *x;
+
+      pos = _bin_nmax * j + k;
+      _binned_type [pos] = _type[i];
+      _binned_tag  [pos] = _tag[i];
+
+      if(rmass_flag)
+        _binned_rmass[pos] = _rmass[i];
+
+      if(q_flag)
+        _binned_q    [pos] = _q[i];
+    } else {
+      // normally, this should not happen: the bin overflowed
+      atomicAdd(bin_error_count, 1);
+      MYEMUDBG(printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", i, j);)
+    }
+  }
+}
+
+__global__ void ReverseBinning_Kernel(X_FLOAT* x, X_FLOAT* binned_x, int q_flag)
+{
+  const unsigned
i = blockDim.x * blockIdx.x + threadIdx.x; + + if(i < _nlocal) { + unsigned bin_pos3 = _binpos[i]; + unsigned bin_pos = bin_pos3 / (3 * _bin_nmax); + bin_pos *= _bin_nmax; + bin_pos += bin_pos3 - bin_pos * 3; + + binned_x = _binned_x + bin_pos3; + x = x + i; + *x = *binned_x; + binned_x += _bin_nmax; + x += _nmax; + *x = *binned_x; + binned_x += _bin_nmax; + x += _nmax; + *x = *binned_x; + + binned_x = _binned_v + bin_pos3; + x = _v + i; + *x = *binned_x; + binned_x += _bin_nmax; + x += _nmax; + *x = *binned_x; + binned_x += _bin_nmax; + x += _nmax; + *x = *binned_x; + + binned_x = _binned_f + bin_pos3; + x = _f + i; + *x = *binned_x; + binned_x += _bin_nmax; + x += _nmax; + *x = *binned_x; + binned_x += _bin_nmax; + x += _nmax; + *x = *binned_x; + + + _type[i] = _binned_type[bin_pos]; + _tag[i] = _binned_tag[bin_pos]; + + if(q_flag) _q[i] = _binned_q[bin_pos]; + } +} diff --git a/lib/cuda/comm_cuda.cu b/lib/cuda/comm_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..8ca2d63cacf82c4ad1c6c5f5a8acca6f7668398c --- /dev/null +++ b/lib/cuda/comm_cuda.cu @@ -0,0 +1,539 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX comm_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+
+#include "crm_cuda_utils.cu"
+
+#include "comm_cuda_cu.h"
+#include "comm_cuda_kernel.cu"
+#include <ctime>
+
+void Cuda_CommCuda_UpdateBuffer(cuda_shared_data* sdata, int n)
+{
+  int size = n * 3 * sizeof(X_FLOAT);
+
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_CommCuda: resizing buffer at %p with %i bytes\n", sdata->buffer, sdata->buffersize);)
+    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+    sdata->buffer = CudaWrapper_AllocCudaData(size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i bytes\n", sdata->buffer, sdata->buffersize);)
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
+}
+
+void Cuda_CommCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax), & sdata->atom.nmax, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(x), & sdata->atom.x.dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(v), & sdata->atom.v.dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(f), & sdata->atom.f.dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(type), & sdata->atom.type.dev_data, sizeof(int*));
+}
+
+void Cuda_CommCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_CommCuda_UpdateNmax(sdata);
+  int ntypesp = sdata->atom.ntypes + 1;
+  cudaMemcpyToSymbol(MY_AP(cuda_ntypes), &ntypesp, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(prd), sdata->domain.prd, 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(flag), &sdata->flag, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(debugdata), &sdata->debugdata, sizeof(int*));
+}
+
+int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
+{
+  timespec time1, time2;
+
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));
+
+  int size = n * 3 * sizeof(X_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata, n);
+
+  X_FLOAT dx = 0.0;
+  X_FLOAT dy = 0.0;
+  X_FLOAT dz = 0.0;
+
+  if(pbc_flag != 0) {
+    if(sdata->domain.triclinic == 0) {
+      dx = pbc[0] * sdata->domain.prd[0];
+      dy = pbc[1] * sdata->domain.prd[1];
+      dz = pbc[2] * sdata->domain.prd[2];
+    } else {
+      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
+      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
+      dz = pbc[2] * sdata->domain.prd[2];
+    }
+  }
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    cudaMemset(sdata->flag, 0, sizeof(int));
+
+    clock_gettime(CLOCK_REALTIME, &time1);
+
+    void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer;
+    Cuda_CommCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n
+        , sdata->comm.maxlistlength, iswap, dx, dy, dz, buf);
+    cudaThreadSynchronize();
+
+    clock_gettime(CLOCK_REALTIME, &time2);
+    sdata->cuda_timings.comm_forward_kernel_pack +=
+      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+    CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
+
+    if(not sdata->overlap_comm)
+      cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
+
+    clock_gettime(CLOCK_REALTIME, &time1);
+    sdata->cuda_timings.comm_forward_download +=
+      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
+
+    int aflag;
+    cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
+
+    if(aflag != 0) printf("aflag PackComm: %i\n", aflag);
+
+    CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
+  }
+
+  return 3 * n;
+}
+
+int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
+{
+  timespec time1, time2;
+
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));
+
+  int size = n * 6 * sizeof(X_FLOAT);
+
+  // bugfix: request room for 6 X_FLOATs per atom; UpdateBuffer(sdata, n)
+  // only allocates the 3-component size used by the non-Vel path
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata, 2 * n);
+
+  X_FLOAT dx = 0.0;
+  X_FLOAT dy = 0.0;
+  X_FLOAT dz = 0.0;
+
+  if(pbc_flag != 0) {
+    if(sdata->domain.triclinic == 0) {
+      dx = pbc[0] * sdata->domain.prd[0];
+      dy = pbc[1] * sdata->domain.prd[1];
+      dz = pbc[2] * sdata->domain.prd[2];
+    } else {
+      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
+      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
+      dz = pbc[2] * sdata->domain.prd[2];
+    }
+  }
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    cudaMemset(sdata->flag, 0, sizeof(int));
+
+    clock_gettime(CLOCK_REALTIME, &time1);
+
+    void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer;
+    // bugfix: the Vel variant must launch the 6-component kernel; it
+    // previously called the 3-component Cuda_CommCuda_PackComm_Kernel
+    Cuda_CommCuda_PackCommVel_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n
+        , sdata->comm.maxlistlength, iswap, dx, dy, dz, buf);
+    cudaThreadSynchronize();
+
+    clock_gettime(CLOCK_REALTIME, &time2);
+    sdata->cuda_timings.comm_forward_kernel_pack +=
+      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+    CUT_CHECK_ERROR("Cuda_CommCuda_PackCommVel: Kernel execution failed");
+
+    if(not sdata->overlap_comm)
+      cudaMemcpy(buf_send, sdata->buffer, n * 6 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
+
+    clock_gettime(CLOCK_REALTIME, &time1);
+    sdata->cuda_timings.comm_forward_download +=
+      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
+
+    int aflag;
+    cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
+
+    if(aflag != 0) printf("aflag PackCommVel: %i\n", aflag);
+
+    CUT_CHECK_ERROR("Cuda_CommCuda_PackCommVel: Kernel execution failed");
+  }
+
+  return 6 * n;
+}
+
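+// note: the *_Self variants below handle swaps where sender and receiver are
+// the same MPI rank. Instead of staging through a host buffer, atoms from the
+// sendlist are copied device-to-device into the ghost region starting at
+// index 'first', with the periodic shift (dx, dy, dz) applied on the fly.
+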
+int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
+{
+  MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");)
+  timespec time1, time2;
+
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));
+
+  int size = n * 3 * sizeof(X_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata, n);
+
+  X_FLOAT dx = 0.0;
+  X_FLOAT dy = 0.0;
+  X_FLOAT dz = 0.0;
+
+  if(pbc_flag != 0) {
+    if(sdata->domain.triclinic == 0) {
+      dx = pbc[0] * sdata->domain.prd[0];
+      dy = pbc[1] * sdata->domain.prd[1];
+      dz = pbc[2] * sdata->domain.prd[2];
+    } else {
+      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
+      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
+      dz = pbc[2] * sdata->domain.prd[2];
+    }
+  }
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    clock_gettime(CLOCK_REALTIME, &time1);
+
+    Cuda_CommCuda_PackComm_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
+    cudaThreadSynchronize();
+
+    clock_gettime(CLOCK_REALTIME, &time2);
+    sdata->cuda_timings.comm_forward_kernel_self +=
+      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+    CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed");
+  }
+
+  return 3 * n;
+}
+
+int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
+{
+  MYDBG(printf(" # CUDA: CommCuda_PackCommVel_Self\n");)
+  timespec time1, time2;
+
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));
+
+  int size = n * 6 * sizeof(X_FLOAT);
+
+  // bugfix: the 6-component path needs twice the 3-component buffer size
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata, 2 * n);
+
+  X_FLOAT dx = 0.0;
+  X_FLOAT dy = 0.0;
+  X_FLOAT dz = 0.0;
+
+  if(pbc_flag != 0) {
+    if(sdata->domain.triclinic == 0) {
+      dx = pbc[0] * sdata->domain.prd[0];
+      dy = pbc[1] * sdata->domain.prd[1];
+      dz = pbc[2] * sdata->domain.prd[2];
+    } else {
+      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
+      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
+      dz = pbc[2] * sdata->domain.prd[2];
+    }
+  }
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    clock_gettime(CLOCK_REALTIME, &time1);
+
+    // bugfix: launch the Vel self-copy kernel; this previously called
+    // Cuda_CommCuda_PackComm_Self_Kernel and never copied the velocities
+    Cuda_CommCuda_PackCommVel_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
+    cudaThreadSynchronize();
+
+    clock_gettime(CLOCK_REALTIME, &time2);
+    sdata->cuda_timings.comm_forward_kernel_self +=
+      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+    CUT_CHECK_ERROR("Cuda_CommCuda_PackCommVel_Self: Kernel execution failed");
+  }
+
+  return 6 * n;
+}
+
+void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap)
+{
+  timespec time1, time2;
+
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));
+
+  int size = n * 3 * sizeof(X_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata, n);
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    clock_gettime(CLOCK_REALTIME, &time1);
+
+    if(not sdata->overlap_comm || iswap < 0)
+      cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_FLOAT), cudaMemcpyHostToDevice);
+
+    clock_gettime(CLOCK_REALTIME, &time2);
+    sdata->cuda_timings.comm_forward_upload +=
+      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+    void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer;
+    Cuda_CommCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first, buf);
+    cudaThreadSynchronize();
+
+    clock_gettime(CLOCK_REALTIME, &time1);
+    sdata->cuda_timings.comm_forward_kernel_unpack +=
+      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
+
+    CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed");
+  }
+}
+
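+// note: when overlap_comm is active and a valid swap index is passed, the
+// unpack functions read straight from the per-swap device receive buffer and
+// skip the host-to-device upload; iswap < 0 forces the conventional path
+// through sdata->buffer.
+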
+void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap)
+{
+  timespec time1, time2;
+
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));
+
+  int size = n * 6 * sizeof(X_FLOAT);
+
+  // bugfix: as in the pack path, the 6-component unpack needs twice the
+  // 3-component buffer size
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata, 2 * n);
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    clock_gettime(CLOCK_REALTIME, &time1);
+
+    if(not sdata->overlap_comm || iswap < 0)
+      cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 6 * sizeof(X_FLOAT), cudaMemcpyHostToDevice);
+
+    clock_gettime(CLOCK_REALTIME, &time2);
+    sdata->cuda_timings.comm_forward_upload +=
+      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+    void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer;
+    // bugfix: launch the 6-component unpack kernel; this previously called
+    // Cuda_CommCuda_UnpackComm_Kernel and silently dropped the velocities
+    Cuda_CommCuda_UnpackCommVel_Kernel <<< grid, threads, 0>>>(n, first, buf);
+    cudaThreadSynchronize();
+
+    clock_gettime(CLOCK_REALTIME, &time1);
+    sdata->cuda_timings.comm_forward_kernel_unpack +=
+      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
+
+    CUT_CHECK_ERROR("Cuda_CommCuda_UnpackCommVel: Kernel execution failed");
+  }
+}
+
+int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata, int n, int first, void* buf_send)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));
+
+  int size = n * 3 * sizeof(F_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata, n);
+
+  F_FLOAT* buf = (F_FLOAT*)buf_send;
+  F_FLOAT* f_dev = (F_FLOAT*)sdata->atom.f.dev_data;
+  f_dev += first;
+  cudaMemcpy(buf, f_dev, n * sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
+  buf += n;
+  f_dev += sdata->atom.nmax;
+  cudaMemcpy(buf, f_dev, n * sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
+  buf += n;
+  f_dev += sdata->atom.nmax;
+  cudaMemcpy(buf, f_dev, n * sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
+  return n * 3;
+}
+
+void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata, int n, int iswap, void* buf_recv)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));
+
+  int size = n * 3 * sizeof(F_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata, n);
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    cudaMemcpy(sdata->buffer, buf_recv, size, cudaMemcpyHostToDevice);
+    Cuda_CommCuda_UnpackReverse_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_CommCuda_UnpackReverse: Kernel execution failed");
+  }
+}
+
+void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata, int n, int iswap, int first)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));
+
+  int size = n * 3 * sizeof(X_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata, n);
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    Cuda_CommCuda_UnpackReverse_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, first);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_CommCuda_UnpackReverse_Self: Kernel execution failed");
+  }
+}
+
+int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int ineed, int style, int atom_nfirst, int nfirst, int nlast, int dim, int iswap)
+{
+  MYDBG(printf(" # CUDA: CommCuda_BuildSendlist\n");)
+  timespec time1, time2;
+
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));
+
+  if(sdata->buffer_new or (80 > sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata, 10);
+
+  int n;
+
+  if(!bordergroup || ineed >= 2)
+    n = nlast - nfirst + 1;
+  else {
+    n = atom_nfirst;
+
+    if(nlast - sdata->atom.nlocal + 1 >
n) n = nlast - sdata->atom.nlocal + 1; + } + + int3 layout = getgrid(n, 0, 512, true); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x + 1, layout.y, 1); + + + cudaMemset((int*)(sdata->buffer), 0, sizeof(int)); + + clock_gettime(CLOCK_REALTIME, &time1); + + if(style == 1) + Cuda_CommCuda_BuildSendlist_Single <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_FLOAT*) sdata->comm.slablo.dev_data, (X_FLOAT*) sdata->comm.slabhi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength); + else + Cuda_CommCuda_BuildSendlist_Multi <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_FLOAT*) sdata->comm.multilo.dev_data, (X_FLOAT*) sdata->comm.multihi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength); + + cudaThreadSynchronize(); + clock_gettime(CLOCK_REALTIME, &time2); + sdata->cuda_timings.comm_border_kernel_buildlist += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; + + CUT_CHECK_ERROR("Cuda_CommCuda_BuildSendlist: Kernel execution failed"); + int nsend; + cudaMemcpy(&nsend, sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost); + return nsend; + + +} + diff --git a/lib/cuda/comm_cuda_cu.h b/lib/cuda/comm_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..d5ac1560ca69d4e4fc3d9e02c2a3568f6ac79048 --- /dev/null +++ b/lib/cuda/comm_cuda_cu.h @@ -0,0 +1,35 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbcflag);
+extern "C" int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbcflag);
+extern "C" int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbcflag);
+extern "C" int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbcflag);
+extern "C" void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap = -1);
+extern "C" void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap = -1);
+extern "C" int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata, int n, int first, void* buf_send);
+extern "C" void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata, int n, int iswap, void* buf_recv);
+extern "C" void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata, int n, int iswap, int first);
+extern "C" int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int ineed, int style, int atom_nfirst, int nfirst, int nlast, int dim, int iswap);
diff --git a/lib/cuda/comm_cuda_kernel.cu b/lib/cuda/comm_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f87b3af5406d10bfb91eb2dc7ff54e34f6068dd4
--- /dev/null
+++ b/lib/cuda/comm_cuda_kernel.cu
@@ -0,0 +1,394 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
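+// note on the buffer layout used by all forward-communication kernels below:
+// for n packed atoms, component c of buffer entry i is stored at i + c * n,
+// so pack/unpack pairs must agree on the component count (3 X_FLOAT blocks
+// for positions, 6 when velocities are appended).
+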
+__global__ void Cuda_CommCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, void* buffer)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  int* list = sendlist + iswap * maxlistlength;
+
+  if(i < n) {
+    int j = list[i];
+
+    // bugfix: valid atom indices are 0 .. _nmax-1, so flag j == _nmax as well
+    if(j >= _nmax) _flag[0] = 1;
+
+    ((X_FLOAT*) buffer)[i] = _x[j] + dx;
+    ((X_FLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy;
+    ((X_FLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz;
+  }
+}
+
+__global__ void Cuda_CommCuda_PackCommVel_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, void* buffer)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  int* list = sendlist + iswap * maxlistlength;
+
+  if(i < n) {
+    int j = list[i];
+
+    if(j >= _nmax) _flag[0] = 1;
+
+    ((X_FLOAT*) buffer)[i] = _x[j] + dx;
+    ((X_FLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy;
+    ((X_FLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz;
+    ((X_FLOAT*) buffer)[i + 3 * n] = _v[j];
+    ((X_FLOAT*) buffer)[i + 4 * n] = _v[j + _nmax];
+    ((X_FLOAT*) buffer)[i + 5 * n] = _v[j + 2 * _nmax];
+  }
+}
+
+__global__ void Cuda_CommCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  int* list = sendlist + iswap * maxlistlength;
+
+  if(i < n) {
+    int j = list[i];
+
+    _x[i + first] = _x[j] + dx;
+    _x[i + first + _nmax] = _x[j + _nmax] + dy;
+    _x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz;
+  }
+}
+
+__global__ void Cuda_CommCuda_PackCommVel_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  int* list = sendlist + iswap * maxlistlength;
+
+  if(i < n) {
+    int j = list[i];
+
+    _x[i + first] = _x[j] + dx;
+    _x[i + first + _nmax] = _x[j + _nmax] + dy;
+    _x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz;
+    _v[i + first] = _v[j];
+    _v[i + first + _nmax] = _v[j + _nmax];
+    _v[i + first + 2 * _nmax] = _v[j + 2 * _nmax];
+  }
+}
+
+__global__ void Cuda_CommCuda_UnpackComm_Kernel(int n, int first, void* buffer)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < n) {
+    _x[i + first] = ((X_FLOAT*) buffer)[i];
+    _x[i + first + _nmax] = ((X_FLOAT*) buffer)[i + 1 * n];
+    _x[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + 2 * n];
+  }
+}
+
+__global__ void Cuda_CommCuda_UnpackCommVel_Kernel(int n, int first, void* buffer)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < n) {
+    _x[i + first] = ((X_FLOAT*) buffer)[i];
+    _x[i + first + _nmax] = ((X_FLOAT*) buffer)[i + 1 * n];
+    _x[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + 2 * n];
+    _v[i + first] = ((X_FLOAT*) buffer)[i + 3 * n];
+    _v[i + first + _nmax] = ((X_FLOAT*) buffer)[i + 4 * n];
+    _v[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + 5 * n];
+  }
+}
+
+__global__ void Cuda_CommCuda_PackReverse_Kernel(int n, int first)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < n) {
+    ((F_FLOAT*) _buffer)[i] = _f[i + first];
+    ((F_FLOAT*) _buffer)[i + n] = _f[i + first + _nmax];
+    ((F_FLOAT*) _buffer)[i + 2 * n] = _f[i + first + 2 * _nmax];
+  }
+}
+
+__global__ void Cuda_CommCuda_UnpackReverse_Kernel(int*
sendlist, int n, int maxlistlength, int iswap) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + int* list = sendlist + iswap * maxlistlength; + + if(i < n) { + int j = list[i]; + _f[j] += ((F_FLOAT*)_buffer)[i]; + _f[j + _nmax] += ((F_FLOAT*) _buffer)[i + n]; + _f[j + 2 * _nmax] += ((F_FLOAT*) _buffer)[i + 2 * n]; + } + +} + +__global__ void Cuda_CommCuda_UnpackReverse_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, int first) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + int* list = sendlist + iswap * maxlistlength; + + if(i < n) { + int j = list[i]; + + _f[j] += _f[i + first]; + _f[j + _nmax] += _f[i + first + _nmax]; + _f[j + 2 * _nmax] += _f[i + first + 2 * _nmax]; + } + +} + +extern __shared__ int shared[]; + +__global__ void Cuda_CommCuda_BuildSendlist_Single(int bordergroup, int ineed, int atom_nfirst, + int nfirst, int nlast, int dim, int iswap, X_FLOAT* slablo, X_FLOAT* slabhi, int* sendlist, int maxlistlength) +{ + int* list = sendlist + iswap * maxlistlength; + X_FLOAT lo = slablo[iswap]; + X_FLOAT hi = slabhi[iswap]; + bool add = false; + + if(!bordergroup || ineed >= 2) { + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x + nfirst; + + if(i < nlast) + if(_x[i + dim * _nmax] >= lo && _x[i + dim * _nmax] <= hi) { + add = true; + } + + shared[threadIdx.x] = add ? 1 : 0; + + __syncthreads(); + + int nsend = 0; + + if(threadIdx.x == 0) { + for(int k = 0; k < blockDim.x; k++) { + if(shared[k]) { + nsend++; + shared[k] = nsend; + } + } + + shared[blockDim.x] = atomicAdd((int*) _buffer, nsend); + } + + __syncthreads(); + + nsend = shared[blockDim.x] + shared[threadIdx.x] - 1; + + if(add && nsend < maxlistlength) + list[nsend] = i; + + + } else { + + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < atom_nfirst) + if(_x[i + dim * _nmax] >= lo && _x[i + dim * _nmax] <= hi) { + add = true; + } + + shared[threadIdx.x] = add ? 1 : 0; + + __syncthreads(); + + int nsend = 0; + + if(threadIdx.x == 0) { + for(int k = 0; k < blockDim.x; k++) { + if(shared[k]) { + nsend++; + shared[k] = nsend; + } + } + + shared[blockDim.x] = atomicAdd((int*) _buffer, nsend); + } + + __syncthreads(); + + nsend = shared[blockDim.x] + shared[threadIdx.x] - 1; + + if(add && nsend < maxlistlength) + list[nsend] = i; + + __syncthreads(); + + add = false; + i += _nlocal; + + if(i < nlast) + if(_x[i + dim * _nmax] >= lo && _x[i + dim * _nmax] <= hi) { + add = true; + } + + shared[threadIdx.x] = add ? 
1 : 0; + + __syncthreads(); + + nsend = 0; + + if(threadIdx.x == 0) { + for(int k = 0; k < blockDim.x; k++) { + if(shared[k]) { + nsend++; + shared[k] = nsend; + } + } + + shared[blockDim.x] = atomicAdd((int*) _buffer, nsend); + } + + __syncthreads(); + + nsend = shared[blockDim.x] + shared[threadIdx.x] - 1; + + if(add && nsend < maxlistlength) + list[nsend] = i; + + } +} + + +__global__ void Cuda_CommCuda_BuildSendlist_Multi(int bordergroup, int ineed, int atom_nfirst + , int nfirst, int nlast, int dim, int iswap, X_FLOAT* multilo, X_FLOAT* multihi, int* sendlist, int maxlistlength) +{ + int* list = sendlist + iswap * maxlistlength; + X_FLOAT* mlo = &multilo[iswap * _cuda_ntypes]; + X_FLOAT* mhi = &multihi[iswap * _cuda_ntypes]; + int itype = 0; + bool add = false; + + if(!bordergroup || ineed >= 2) { + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x + nfirst; + + if(i < nlast) { + itype = _type[i]; + + if(_x[i + dim * _nmax] >= mlo[itype] && _x[i + dim * _nmax] <= mhi[itype]) { + add = true; + } + } + + shared[threadIdx.x] = add ? 1 : 0; + + __syncthreads(); + + int nsend = 0; + + if(threadIdx.x == 0) { + for(int k = 0; k < blockDim.x; k++) { + if(shared[k]) { + nsend++; + shared[k] = nsend; + } + } + + shared[blockDim.x] = atomicAdd((int*) _buffer, nsend); + } + + __syncthreads(); + + nsend = shared[blockDim.x] + shared[threadIdx.x] - 1; + + if(add && nsend < maxlistlength) + list[nsend] = i; + + + } else { + + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < atom_nfirst) { + itype = _type[i]; + + if(_x[i + dim * _nmax] >= mlo[itype] && _x[i + dim * _nmax] <= mhi[itype]) { + add = true; + } + } + + shared[threadIdx.x] = add ? 1 : 0; + + __syncthreads(); + + int nsend = 0; + + if(threadIdx.x == 0) { + for(int k = 0; k < blockDim.x; k++) { + if(shared[k]) { + nsend++; + shared[k] = nsend; + } + } + + shared[blockDim.x] = atomicAdd((int*) _buffer, nsend); + } + + __syncthreads(); + + nsend = shared[blockDim.x] + shared[threadIdx.x] - 1; + + if(add && nsend < maxlistlength) + list[nsend] = i; + + __syncthreads(); + + add = false; + i += _nlocal; + + if(i < nlast) { + itype = _type[i]; + + if(_x[i + dim * _nmax] >= mlo[itype] && _x[i + dim * _nmax] <= mhi[itype]) { + add = true; + } + } + + shared[threadIdx.x] = add ? 1 : 0; + + __syncthreads(); + + nsend = 0; + + if(threadIdx.x == 0) { + for(int k = 0; k < blockDim.x; k++) { + if(shared[k]) { + nsend++; + shared[k] = nsend; + } + } + + shared[blockDim.x] = atomicAdd((int*) _buffer, nsend); + } + + __syncthreads(); + + nsend = shared[blockDim.x] + shared[threadIdx.x] - 1; + + if(add && nsend < maxlistlength) + list[nsend] = i; + + } +} diff --git a/lib/cuda/compute_temp_cuda.cu b/lib/cuda/compute_temp_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..ece4cf93a9fed1f2a12d023f0f9bb7119df179da --- /dev/null +++ b/lib/cuda/compute_temp_cuda.cu @@ -0,0 +1,126 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> +#define MY_PREFIX compute_temp_cuda +#include "cuda_shared.h" +#include "cuda_common.h" + +#include "crm_cuda_utils.cu" + +#include "compute_temp_cuda_cu.h" +#include "compute_temp_cuda_kernel.cu" + +void Cuda_ComputeTempCuda_UpdateBuffer(cuda_shared_data* sdata) +{ + int size = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_FLOAT); + + if(sdata->buffersize < size) { + MYDBG(printf("Cuda_ComputeTempCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize = size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);) + } + + cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*)); +} + +void Cuda_ComputeTempCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*)); + + if(sdata->atom.rmass_flag) + cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*)); + + cudaMemcpyToSymbol(MY_AP(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); +} + +void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata) +{ + Cuda_ComputeTempCuda_UpdateNmax(sdata); +} + + +void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t) +{ + //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary + Cuda_ComputeTempCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + //if(sdata->buffer_new) + Cuda_ComputeTempCuda_UpdateBuffer(sdata); + + int3 layout = getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal > 0) { + Cuda_ComputeTempCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_FLOAT)>>> (groupbit); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: compute_vector Kernel execution failed"); + + int oldgrid = grid.x * grid.y; + grid.x = 6; + grid.y = 1; + threads.x = 512; + Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: reduce_vector Kernel execution failed"); + } +} + +void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t) +{ + //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants 
is necessary + Cuda_ComputeTempCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + //if(sdata->buffer_new) + Cuda_ComputeTempCuda_UpdateBuffer(sdata); + MYDBG(printf("#CUDA ComputeTempCuda_Scalar: %i\n", sdata->atom.nlocal);) + int3 layout = getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal > 0) { + CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: pre compute_scalar Kernel"); + Cuda_ComputeTempCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (groupbit); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: compute_scalar Kernel execution failed"); + + int oldgrid = grid.x * grid.y; + grid.x = 1; + grid.y = 1; + threads.x = 512; + Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: reduce_scalar Kernel execution failed"); + } +} diff --git a/lib/cuda/compute_temp_cuda_cu.h b/lib/cuda/compute_temp_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..9ab43d727adb6b5928879a0884109bb4835c5500 --- /dev/null +++ b/lib/cuda/compute_temp_cuda_cu.h @@ -0,0 +1,28 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t); +extern "C" void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t); diff --git a/lib/cuda/compute_temp_cuda_kernel.cu b/lib/cuda/compute_temp_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..79562a0e28fdb7636477be3f0e238f22bf6f92cf --- /dev/null +++ b/lib/cuda/compute_temp_cuda_kernel.cu @@ -0,0 +1,118 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +extern __shared__ ENERGY_FLOAT sharedmem[]; + + +__global__ void Cuda_ComputeTempCuda_Scalar_Kernel(int groupbit) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + sharedmem[threadIdx.x] = 0; + + if(i < _nlocal) { + if(_rmass_flag) { + if(_mask[i] & groupbit) + sharedmem[threadIdx.x] = (_v[i] * _v[i] + _v[i + _nmax] * _v[i + _nmax] + _v[i + 2 * _nmax] * _v[i + 2 * _nmax]) * _rmass[i]; + } else { + if(_mask[i] & groupbit) + sharedmem[threadIdx.x] = (_v[i] * _v[i] + _v[i + _nmax] * _v[i + _nmax] + _v[i + 2 * _nmax] * _v[i + 2 * _nmax]) * (_mass[_type[i]]); + } + } + + reduceBlock(sharedmem); + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + + if(threadIdx.x == 0) { + buffer[(blockIdx.x * gridDim.y + blockIdx.y)] = sharedmem[0]; + } +} + +__global__ void Cuda_ComputeTempCuda_Vector_Kernel(int groupbit) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + sharedmem[threadIdx.x] = 0; + sharedmem[threadIdx.x + blockDim.x] = 0; + sharedmem[threadIdx.x + 2 * blockDim.x] = 0; + sharedmem[threadIdx.x + 3 * blockDim.x] = 0; + sharedmem[threadIdx.x + 4 * blockDim.x] = 0; + sharedmem[threadIdx.x + 5 * blockDim.x] = 0; + + if(i < _nlocal) + if(_mask[i] & groupbit) { + V_FLOAT massone; + + if(_rmass_flag) massone = _rmass[i]; + else massone = _mass[_type[i]]; + + sharedmem[threadIdx.x] = massone * _v[i] * _v[i]; + sharedmem[threadIdx.x + blockDim.x] = massone * _v[i + _nmax] * _v[i + _nmax]; + sharedmem[threadIdx.x + 2 * blockDim.x] = massone * _v[i + 2 * _nmax] * _v[i + 2 * _nmax]; + sharedmem[threadIdx.x + 3 * blockDim.x] = massone * _v[i] * _v[i + _nmax]; + sharedmem[threadIdx.x + 4 * blockDim.x] = massone * _v[i] * _v[i + 2 * _nmax]; + sharedmem[threadIdx.x + 5 * blockDim.x] = massone * _v[i + _nmax] * _v[i + 2 * _nmax]; + } + + reduceBlock(sharedmem); + reduceBlock(&sharedmem[blockDim.x]); + reduceBlock(&sharedmem[2 * blockDim.x]); + reduceBlock(&sharedmem[3 * blockDim.x]); + reduceBlock(&sharedmem[4 * blockDim.x]); + reduceBlock(&sharedmem[5 * blockDim.x]); + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + + if(threadIdx.x == 0) { + buffer[(blockIdx.x * gridDim.y + blockIdx.y)] = sharedmem[0]; + buffer[(blockIdx.x * gridDim.y + blockIdx.y) + gridDim.x * gridDim.y] = sharedmem[blockDim.x]; + buffer[(blockIdx.x * gridDim.y + blockIdx.y) + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x]; + buffer[(blockIdx.x * gridDim.y + blockIdx.y) + 3 * gridDim.x * gridDim.y] = sharedmem[3 * blockDim.x]; + buffer[(blockIdx.x * gridDim.y + blockIdx.y) + 4 * gridDim.x * gridDim.y] = sharedmem[4 * blockDim.x]; + buffer[(blockIdx.x * gridDim.y + blockIdx.y) + 5 * gridDim.x * gridDim.y] = sharedmem[5 * blockDim.x]; + } +} + + +__global__ void Cuda_ComputeTempCuda_Reduce_Kernel(int n, ENERGY_FLOAT* t) +{ + int i = 0; + sharedmem[threadIdx.x] = 0; + ENERGY_FLOAT myforig = 0.0; + ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer; + buf = &buf[blockIdx.x * n]; + + while(i < n) { + sharedmem[threadIdx.x] = 0; + + if(i + threadIdx.x < n) + sharedmem[threadIdx.x] = buf[i + threadIdx.x]; + + __syncthreads(); + reduceBlock(sharedmem); + i += blockDim.x; + + if(threadIdx.x == 0) + myforig += sharedmem[0]; + } + + if(threadIdx.x == 0) + t[blockIdx.x] = myforig; +} diff --git a/lib/cuda/compute_temp_partial_cuda.cu b/lib/cuda/compute_temp_partial_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..bc78592640d2086c6877edaefa3928592e0dc468 --- /dev/null +++ 
b/lib/cuda/compute_temp_partial_cuda.cu @@ -0,0 +1,164 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> +#define MY_PREFIX compute_temp_partial_cuda +#include "cuda_shared.h" +#include "cuda_common.h" + +#include "crm_cuda_utils.cu" + +#include "compute_temp_partial_cuda_cu.h" +#include "compute_temp_partial_cuda_kernel.cu" + +void Cuda_ComputeTempPartialCuda_UpdateBuffer(cuda_shared_data* sdata) +{ + int size = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_FLOAT); + + if(sdata->buffersize < size) { + MYDBG(printf("Cuda_ComputeTempPartialCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize = size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);) + } + + cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*)); +} + +void Cuda_ComputeTempPartialCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*)); + + if(sdata->atom.rmass_flag) + cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*)); + + cudaMemcpyToSymbol(MY_AP(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); +} + +void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata) +{ + Cuda_ComputeTempPartialCuda_UpdateNmax(sdata); +} + + +void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag) +{ + //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary + Cuda_ComputeTempPartialCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + //if(sdata->buffer_new) + Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata); + + int3 layout = getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal > 0) { + Cuda_ComputeTempPartialCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_FLOAT)>>> (groupbit, xflag, yflag, zflag); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: compute_vector Kernel 
execution failed"); + + int oldgrid = grid.x * grid.y; + grid.x = 6; + threads.x = 512; + Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: reduce_vector Kernel execution failed"); + } +} + +void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag) +{ + //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary + Cuda_ComputeTempPartialCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + //if(sdata->buffer_new) + Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata); + MYDBG(printf("#CUDA ComputeTempPartialCuda_Scalar: %i\n", sdata->atom.nlocal);) + int3 layout = getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal > 0) { + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: pre compute_scalar Kernel"); + Cuda_ComputeTempPartialCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (groupbit, xflag, yflag, zflag); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: compute_scalar Kernel execution failed"); + + int oldgrid = grid.x * grid.y; + grid.x = 1; + threads.x = 512; + Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: reduce_scalar Kernel execution failed"); + } +} + +void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall) +{ + //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary + Cuda_ComputeTempPartialCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + //if(sdata->buffer_new) + Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata); + + int3 layout = getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal > 0) { + Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_FLOAT*) vbiasall); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed"); + } +} + +void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall) +{ + //if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary + Cuda_ComputeTempPartialCuda_UpdateNmax(sdata); + //if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + //if(sdata->buffer_new) + Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata); + + int3 layout = getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal > 0) { + Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_FLOAT*) vbiasall); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RestoreBiasAll: compute_vector Kernel execution failed"); + } +} diff --git 
a/lib/cuda/compute_temp_partial_cuda_cu.h b/lib/cuda/compute_temp_partial_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..00fc8a7c3699d43b2cdca76f0e6ec415e45839d4 --- /dev/null +++ b/lib/cuda/compute_temp_partial_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag); +extern "C" void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag); +extern "C" void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall); +extern "C" void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall); diff --git a/lib/cuda/compute_temp_partial_cuda_kernel.cu b/lib/cuda/compute_temp_partial_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..ec0fff334f8d1b4c2e8c048b0f880548d84ac77e --- /dev/null +++ b/lib/cuda/compute_temp_partial_cuda_kernel.cu @@ -0,0 +1,161 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +extern __shared__ ENERGY_FLOAT sharedmem[]; + + +__global__ void Cuda_ComputeTempPartialCuda_Scalar_Kernel(int groupbit, int xflag, int yflag, int zflag) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + sharedmem[threadIdx.x] = 0; + + if(i < _nlocal) { + if(_rmass_flag) { + if(_mask[i] & groupbit) + sharedmem[threadIdx.x] = (_v[i] * _v[i] * xflag + _v[i + _nmax] * _v[i + _nmax] * yflag + _v[i + 2 * _nmax] * _v[i + 2 * _nmax] * zflag) * _rmass[i]; + } else { + if(_mask[i] & groupbit) + sharedmem[threadIdx.x] = (_v[i] * _v[i] * xflag + _v[i + _nmax] * _v[i + _nmax] * yflag + _v[i + 2 * _nmax] * _v[i + 2 * _nmax] * zflag) * (_mass[_type[i]]); + } + } + + reduceBlock(sharedmem); + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + + if(threadIdx.x == 0) { + buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0]; + } +} + +__global__ void Cuda_ComputeTempPartialCuda_Vector_Kernel(int groupbit, int xflag, int yflag, int zflag) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + sharedmem[threadIdx.x] = 0; + sharedmem[threadIdx.x + blockDim.x] = 0; + sharedmem[threadIdx.x + 2 * blockDim.x] = 0; + sharedmem[threadIdx.x + 3 * blockDim.x] = 0; + sharedmem[threadIdx.x + 4 * blockDim.x] = 0; + sharedmem[threadIdx.x + 5 * blockDim.x] = 0; + + if(i < _nlocal) + if(_mask[i] & groupbit) { + V_FLOAT massone; + + if(_rmass_flag) massone = _rmass[i]; + else massone = _mass[_type[i]]; + + sharedmem[threadIdx.x] = massone * _v[i] * _v[i] * xflag; + sharedmem[threadIdx.x + blockDim.x] = massone * _v[i + _nmax] * _v[i + _nmax] * yflag; + sharedmem[threadIdx.x + 2 * blockDim.x] = massone * _v[i + 2 * _nmax] * _v[i + 2 * _nmax] * zflag; + sharedmem[threadIdx.x + 3 * blockDim.x] = massone * _v[i] * _v[i + _nmax] * xflag * yflag; + sharedmem[threadIdx.x + 4 * blockDim.x] = massone * _v[i] * _v[i + 2 * _nmax] * xflag * zflag; + sharedmem[threadIdx.x + 5 * blockDim.x] = massone * _v[i + _nmax] * _v[i + 2 * _nmax] * yflag * zflag; + } + + reduceBlock(sharedmem); + reduceBlock(&sharedmem[blockDim.x]); + reduceBlock(&sharedmem[2 * blockDim.x]); + reduceBlock(&sharedmem[3 * blockDim.x]); + reduceBlock(&sharedmem[4 * blockDim.x]); + reduceBlock(&sharedmem[5 * blockDim.x]); + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + + if(threadIdx.x == 0) { + buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = sharedmem[3 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = sharedmem[4 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = sharedmem[5 * blockDim.x]; + } +} + + +__global__ void Cuda_ComputeTempPartialCuda_Reduce_Kernel(int n, ENERGY_FLOAT* t) +{ + int i = 0; + sharedmem[threadIdx.x] = 0; + ENERGY_FLOAT myforig = 0.0; + ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer; + buf = &buf[blockIdx.x * n]; + + while(i < n) { + sharedmem[threadIdx.x] = 0; + + if(i + threadIdx.x < n) + sharedmem[threadIdx.x] = buf[i + threadIdx.x]; + + __syncthreads(); + reduceBlock(sharedmem); + i += blockDim.x; + + if(threadIdx.x == 0) + myforig += sharedmem[0]; + } + + if(threadIdx.x == 0) + t[blockIdx.x] = myforig; +} + +__global__ void 
Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_FLOAT* vbiasall) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < _nlocal) + if(_mask[i] & groupbit) { + if(!xflag) { + vbiasall[i] = _v[i]; + _v[i] = V_F(0.0); + } + + if(!yflag) { + vbiasall[i + _nmax] = _v[i + _nmax]; + _v[i + _nmax] = V_F(0.0); + } + + if(!zflag) { + vbiasall[i + 2 * _nmax] = _v[i + 2 * _nmax]; + _v[i + 2 * _nmax] = V_F(0.0); + } + } +} + +__global__ void Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_FLOAT* vbiasall) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < _nlocal) + if(_mask[i] & groupbit) { + if(!xflag) { + _v[i] += vbiasall[i]; + } + + if(!yflag) { + _v[i + _nmax] += vbiasall[i + _nmax]; + } + + if(!zflag) { + _v[i + 2 * _nmax] += vbiasall[i + 2 * _nmax]; + } + } +} diff --git a/lib/cuda/crm_cuda_utils.cu b/lib/cuda/crm_cuda_utils.cu new file mode 100644 index 0000000000000000000000000000000000000000..6337d0d01545b357b8daf0b471f4768618013b36 --- /dev/null +++ b/lib/cuda/crm_cuda_utils.cu @@ -0,0 +1,919 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifndef CRM_CUDA_UTILS +#define CRM_CUDA_UTILS + +//split n threads into 2 dimensional grid + threads, return values are grid.x grid.y and threads.x +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +inline int3 getgrid(int n, int shared_per_thread = 0, int threadsmax = 256, bool p2 = false) +{ + int3 gridparams; + int sharedsize = 16000; + + if(shared_per_thread > 0) threadsmax = sharedsize / shared_per_thread < threadsmax ? 
sharedsize / shared_per_thread : threadsmax; + + if((n < 60 * 32) || (threadsmax < 64)) + gridparams.z = 32; + else if((n < 60 * 64) || (threadsmax < 128)) + gridparams.z = 64; + else if((n < 60 * 128) || (threadsmax < 256)) + gridparams.z = 128; + else if((n < 60 * 256) || (threadsmax < 512)) + gridparams.z = 256; + else gridparams.z = 512; + + if(p2) { + gridparams.z = 16; + + while(gridparams.z * 2 <= threadsmax) gridparams.z *= 2; + } + + + int blocks = (n + gridparams.z - 1) / gridparams.z; + + if(blocks > 10000) + gridparams.x = gridparams.y = int(sqrt(blocks)); + else { + gridparams.x = blocks; + gridparams.y = 1; + } + + while(gridparams.x * gridparams.y * gridparams.z < n) gridparams.x++; + + if(gridparams.x == 0) gridparams.x = 1; + + return gridparams; +} + +//return value: 1 if f<0; else 0 +//take care when using values such as "blockIdx.x-n" for f: they might be interpreted as an unsigned int +static inline __device__ int negativCUDA(float f) +{ + return ((unsigned int)1 << 31 & (__float_as_int(f))) >> 31; +} + +//return value: -1 if f<0; else +1 +static inline __device__ float fsignCUDA(float f) +{ + return f < 0.0f ? -1.0f : 1.0f; +} + +//functions to copy data between global and shared memory (indeed you can copy data between two arbitrary memory regions on device - as long as you have the corresponding read and write access) +//blockDim.y and blockDim.z are assumed to be 1 +static inline __device__ void copySharedToGlob(int* shared, int* glob, const int &n) +{ + int i, k; + k = n - blockDim.x; + + for(i = 0; i < k; i += blockDim.x) { + glob[i + threadIdx.x] = shared[i + threadIdx.x]; + } + + if(threadIdx.x < n - i) { + glob[i + threadIdx.x] = shared[i + threadIdx.x]; + } + + __syncthreads(); +} + +static inline __device__ void copySharedToGlob(float* shared, float* glob, const int &n) +{ + int i, k; + k = n - blockDim.x; + + for(i = 0; i < k; i += blockDim.x) { + glob[i + threadIdx.x] = shared[i + threadIdx.x]; + } + + if(threadIdx.x < n - i) { + glob[i + threadIdx.x] = shared[i + threadIdx.x]; + } + + __syncthreads(); +} + +static inline __device__ void copySharedToGlob(double* shared, double* glob, const int &n) +{ + int i, k; + k = n - blockDim.x; + + for(i = 0; i < k; i += blockDim.x) { + glob[i + threadIdx.x] = shared[i + threadIdx.x]; + } + + if(threadIdx.x < n - i) { + glob[i + threadIdx.x] = shared[i + threadIdx.x]; + } + + __syncthreads(); +} + +static inline __device__ void copyGlobToShared(int* glob, int* shared, const int &n) +{ + int i, k; + k = n - blockDim.x; + + for(i = 0; i < k; i += blockDim.x) { + shared[i + threadIdx.x] = glob[i + threadIdx.x]; + } + + if(threadIdx.x < n - i) { + shared[i + threadIdx.x] = glob[i + threadIdx.x]; + } + + __syncthreads(); +} + +static __device__ inline void copyGlobToShared(float* glob, float* shared, const int &n) +{ + int i, k; + k = n - blockDim.x; + + for(i = 0; i < k; i += blockDim.x) { + shared[i + threadIdx.x] = glob[i + threadIdx.x]; + } + + if(threadIdx.x < n - i) { + shared[i + threadIdx.x] = glob[i + threadIdx.x]; + } + + __syncthreads(); +} + +static __device__ inline void copyGlobToShared(double* glob, double* shared, const int &n) +{ + int i; + + for(i = 0; i < n - blockDim.x; i += blockDim.x) { + shared[i + threadIdx.x] = glob[i + threadIdx.x]; + } + + if(threadIdx.x < n - i) { + shared[i + threadIdx.x] = glob[i + threadIdx.x]; + } + + __syncthreads(); +} + +//copy data between two memory areas on device, 3d BlockDims are allowed +static __device__ inline void copyData(double* source, double* target, const int &n) +{ +
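//note: this and the following copyData variants share one pattern - the (up to 3d) thread block is flattened to a single linear offset, the whole block then strides through the array in chunks of blockDim.x*blockDim.y*blockDim.z elements, and the remaining tail is handled by the final if +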
int i; + int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z; + + for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) { + target[i + offset] = source[i + offset]; + } + + if(offset < n - i) { + target[i + offset] = source[i + offset]; + } + + __syncthreads(); +} + +static __device__ inline void copyData(float* source, float* target, const int &n) +{ + int i; + int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z; + + for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) { + target[i + offset] = source[i + offset]; + } + + if(offset < n - i) { + target[i + offset] = source[i + offset]; + } + + __syncthreads(); +} + +static __device__ inline void copyData(int* source, int* target, const int &n) +{ + int i; + int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z; + + for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) { + target[i + offset] = source[i + offset]; + } + + if(offset < n - i) { + target[i + offset] = source[i + offset]; + } + + __syncthreads(); +} + +static __device__ inline void copyData(unsigned int* source, unsigned int* target, const int &n) +{ + int i; + int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z; + + for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) { + target[i + offset] = source[i + offset]; + } + + if(offset < n - i) { + target[i + offset] = source[i + offset]; + } + + __syncthreads(); +} + +//functions in order to sum over values of one block. P2 means blockdim MUST be a power of 2 otherwise the behaviour is not well defined +//in the end in data[0]=sum_i=0^blockDim.x data[i] +//for reduceBlockP2 and reduceBlock blockDim.y=1 and blockDim.z=1 +static __device__ inline void reduceBlockP2(int* data) +{ + __syncthreads(); + + for(int i = 2; i <= blockDim.x; i *= 2) { + if(threadIdx.x < blockDim.x / i) + data[threadIdx.x] += data[threadIdx.x + blockDim.x / i]; + + __syncthreads(); + } +} + +static __device__ inline void reduceBlockP2(unsigned int* data) +{ + __syncthreads(); + + for(int i = 2; i <= blockDim.x; i *= 2) { + if(threadIdx.x < blockDim.x / i) + data[threadIdx.x] += data[threadIdx.x + blockDim.x / i]; + + __syncthreads(); + } +} + +static __device__ inline void reduceBlockP2(float* data) +{ + __syncthreads(); + + for(int i = 2; i <= blockDim.x; i *= 2) { + if(threadIdx.x < blockDim.x / i) + data[threadIdx.x] += data[threadIdx.x + blockDim.x / i]; + + __syncthreads(); + } +} + +static __device__ inline void reduceBlockP2(double* data) +{ + __syncthreads(); + + for(int i = 2; i <= blockDim.x; i *= 2) { + if(threadIdx.x < blockDim.x / i) + data[threadIdx.x] += data[threadIdx.x + blockDim.x / i]; + + __syncthreads(); + } +} + +static __device__ inline void reduceBlock(float* data) +{ + __syncthreads(); + int p2 = 1; + + while(p2 * 2 < blockDim.x) p2 *= 2; + + if(threadIdx.x < blockDim.x - p2) + data[threadIdx.x] += data[threadIdx.x + p2]; + + __syncthreads(); + + for(int i = 2; i <= p2; i *= 2) { + if(threadIdx.x < p2 / i) + data[threadIdx.x] += data[threadIdx.x + p2 / i]; + + __syncthreads(); + } +} + +static __device__ inline void reduceBlock(int* data) +{ + __syncthreads(); + int p2 = 1; + + while(p2 * 2 < blockDim.x) p2 *= 2; + + if(threadIdx.x < blockDim.x - p2) + data[threadIdx.x] += data[threadIdx.x + p2]; + + 
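//the fold above added the tail [p2, blockDim.x) onto the front of the array, leaving a power-of-two range [0, p2) that the loop below halves until the block sum ends up in data[0] +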
__syncthreads(); + + for(int i = 2; i <= p2; i *= 2) { + if(threadIdx.x < p2 / i) + data[threadIdx.x] += data[threadIdx.x + p2 / i]; + + __syncthreads(); + } +} + +static __device__ inline void reduceBlock(unsigned int* data) +{ + __syncthreads(); + int p2 = 1; + + while(p2 * 2 < blockDim.x) p2 *= 2; + + if(threadIdx.x < blockDim.x - p2) + data[threadIdx.x] += data[threadIdx.x + p2]; + + __syncthreads(); + + for(int i = 2; i <= p2; i *= 2) { + if(threadIdx.x < p2 / i) + data[threadIdx.x] += data[threadIdx.x + p2 / i]; + + __syncthreads(); + } +} + +static __device__ inline void reduceBlock(double* data) +{ + __syncthreads(); + int p2 = 1; + + while(p2 * 2 < blockDim.x) p2 *= 2; + + if(threadIdx.x < blockDim.x - p2) + data[threadIdx.x] += data[threadIdx.x + p2]; + + __syncthreads(); + + for(int i = 2; i <= p2; i *= 2) { + if(threadIdx.x < p2 / i) + data[threadIdx.x] += data[threadIdx.x + p2 / i]; + + __syncthreads(); + } +} + +static __device__ inline void cudaFillBlockData_int(int* data, const int &n, const int &value) +{ + int i; + + for(i = 0; i < n - blockDim.x; i += blockDim.x) { + data[i + threadIdx.x] = value; + } + + if(threadIdx.x < n - i) data[i + threadIdx.x] = value; +} + +static __device__ inline void cudaFillBlockData_float(float* data, const int &n, const float &value) +{ + int i; + + for(i = 0; i < n - blockDim.x; i += blockDim.x) { + data[i + threadIdx.x] = value; + } + + if(threadIdx.x < n - i) data[i + threadIdx.x] = value; +} + +static __device__ inline void reduce(float* data, int n) //cautious not sure if working +{ + __syncthreads(); + int p2 = 1; + + while(p2 * 2 < n) p2 *= 2; + + int j = 0; + + while((threadIdx.x + blockDim.x * j) * 2 < n - p2) { + data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2]; + j++; + } + + __syncthreads(); + + for(int i = 2; i <= p2; i *= 2) { + while((threadIdx.x + blockDim.x * j) < p2 / i) { + data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i]; + j++; + } + + __syncthreads(); + } +} + +static __device__ inline void reduce(double* data, int n) //cautious not sure if working +{ + __syncthreads(); + int p2 = 1; + + while(p2 * 2 < n) p2 *= 2; + + int j = 0; + + while((threadIdx.x + blockDim.x * j) * 2 < n - p2) { + data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2]; + j++; + } + + __syncthreads(); + + for(int i = 2; i <= p2; i *= 2) { + while((threadIdx.x + blockDim.x * j) < p2 / i) { + data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i]; + j++; + } + + __syncthreads(); + } +} + +static __device__ inline void minOfBlock(float* data) +{ + __syncthreads(); + int p2 = 1; + + while(p2 * 2 < blockDim.x) p2 *= 2; + + if(threadIdx.x < blockDim.x - p2) + data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]); + + __syncthreads(); + + for(int i = 2; i <= p2; i *= 2) { + if(threadIdx.x < p2 / i) + data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]); + + __syncthreads(); + } +} + +static __device__ inline void maxOfBlock(float* data) +{ + __syncthreads(); + int p2 = 1; + + while(p2 * 2 < blockDim.x) p2 *= 2; + + if(threadIdx.x < blockDim.x - p2) + data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]); + + __syncthreads(); + + for(int i = 2; i <= p2; i *= 2) { + if(threadIdx.x < p2 / i) + data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]); + + __syncthreads(); + } +} + +static __device__ inline void minOfBlock(double* data) +{ + __syncthreads(); + int p2 = 1; + + while(p2 
* 2 < blockDim.x) p2 *= 2; + + if(threadIdx.x < blockDim.x - p2) + data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]); + + __syncthreads(); + + for(int i = 2; i <= p2; i *= 2) { + if(threadIdx.x < p2 / i) + data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]); + + __syncthreads(); + } +} + +static __device__ inline void maxOfBlock(double* data) +{ + __syncthreads(); + int p2 = 1; + + while(p2 * 2 < blockDim.x) p2 *= 2; + + if(threadIdx.x < blockDim.x - p2) + data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]); + + __syncthreads(); + + for(int i = 2; i <= p2; i *= 2) { + if(threadIdx.x < p2 / i) + data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]); + + __syncthreads(); + } +} + + +static __device__ inline void minOfData(double* data, int n) //cautious not sure if working +{ + __syncthreads(); + int p2 = 1; + + while(p2 * 2 < n) p2 *= 2; + + int j = 0; + + while((threadIdx.x + blockDim.x * j) < n - p2) { + data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]); + j++; + } + + __syncthreads(); + + for(int i = 2; i <= p2; i *= 2) { + while((threadIdx.x + blockDim.x * j) < p2 / i) { + data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]); + j++; + } + + __syncthreads(); + } +} + +static __device__ inline void maxOfData(double* data, int n) //cautious not sure if working +{ + __syncthreads(); + int p2 = 1; + + while(p2 * 2 < n) p2 *= 2; + + int j = 0; + + while((threadIdx.x + blockDim.x * j) < n - p2) { + data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]); + j++; + } + + __syncthreads(); + + for(int i = 2; i <= p2; i *= 2) { + while((threadIdx.x + blockDim.x * j) < p2 / i) { + data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]); + j++; + } + + __syncthreads(); + } +} + +static __device__ inline void minOfData(float* data, int n) //cautious not sure if working +{ + __syncthreads(); + int p2 = 1; + + while(p2 * 2 < n) p2 *= 2; + + int j = 0; + + while((threadIdx.x + blockDim.x * j) < n - p2) { + data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]); + j++; + } + + __syncthreads(); + + for(int i = 2; i <= p2; i *= 2) { + while((threadIdx.x + blockDim.x * j) < p2 / i) { + data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]); + j++; + } + + __syncthreads(); + } +} + +static __device__ inline void maxOfData(float* data, int n) //cautious not sure if working +{ + __syncthreads(); + int p2 = 1; + + while(p2 * 2 < n) p2 *= 2; + + int j = 0; + + while((threadIdx.x + blockDim.x * j) < n - p2) { + data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]); + j++; + } + + __syncthreads(); + + for(int i = 2; i <= p2; i *= 2) { + while((threadIdx.x + blockDim.x * j) < p2 / i) { + data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]); + j++; + } + + __syncthreads(); + } +} + +#if X_PRECISION == 2 +static __device__ inline double tex1Dfetch_double(texture<int2, 1> t, int i) +{ + int2 v = tex1Dfetch(t, i); + return __hiloint2double(v.y, v.x); +} + +static __device__ inline X_FLOAT4 
tex1Dfetch_double(texture<int4, 1> t, int i) +{ + int4 v = tex1Dfetch(t, 2 * i); + int4 u = tex1Dfetch(t, 2 * i + 1); + X_FLOAT4 w; + + w.x = __hiloint2double(v.y, v.x); + w.y = __hiloint2double(v.w, v.z); + w.z = __hiloint2double(u.y, u.x); + w.w = __hiloint2double(u.w, u.z); + return w; +} +#endif + +inline void BindXTypeTexture(cuda_shared_data* sdata) +{ +#ifdef CUDA_USE_TEXTURE + _x_type_tex.normalized = false; // access with unnormalized texture coordinates + _x_type_tex.filterMode = cudaFilterModePoint; // Point mode, so no filtering + _x_type_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* x_type_texture_ptr = &MY_AP(x_type_tex); + +#if X_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>(); + cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_FLOAT4)); +#else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>(); + cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int4)); +#endif +#endif +} + +static __device__ inline X_FLOAT4 fetchXType(int i) +{ +#ifdef CUDA_USE_TEXTURE +#if X_PRECISION == 1 + return tex1Dfetch(_x_type_tex, i); +#else + return tex1Dfetch_double(_x_type_tex, i); +#endif +#else + return _x_type[i]; +#endif +} + +#if V_PRECISION == 2 +static __device__ inline double tex1Dfetch_double_v(texture<int2, 1> t, int i) +{ + int2 v = tex1Dfetch(t, i); + return __hiloint2double(v.y, v.x); +} + +static __device__ inline V_FLOAT4 tex1Dfetch_double_v(texture<int4, 1> t, int i) +{ + int4 v = tex1Dfetch(t, 2 * i); + int4 u = tex1Dfetch(t, 2 * i + 1); + V_FLOAT4 w; + + w.x = __hiloint2double(v.y, v.x); + w.y = __hiloint2double(v.w, v.z); + w.z = __hiloint2double(u.y, u.x); + w.w = __hiloint2double(u.w, u.z); + return w; +} +#endif + +inline void BindVRadiusTexture(cuda_shared_data* sdata) +{ +#ifdef CUDA_USE_TEXTURE + _v_radius_tex.normalized = false; // access with unnormalized texture coordinates + _v_radius_tex.filterMode = cudaFilterModePoint; // Point mode, so no filtering + _v_radius_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* v_radius_texture_ptr = &MY_AP(v_radius_tex); + +#if V_PRECISION == 1 + cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<float4>(); + cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(V_FLOAT4)); +#else + cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<int4>(); + cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * 2 * sizeof(int4)); +#endif +#endif +} + +static __device__ inline V_FLOAT4 fetchVRadius(int i) +{ +#ifdef CUDA_USE_TEXTURE +#if V_PRECISION == 1 + return tex1Dfetch(_v_radius_tex, i); +#else + return tex1Dfetch_double_v(_v_radius_tex, i); +#endif +#else + return _v_radius[i]; +#endif +} + +inline void BindOmegaRmassTexture(cuda_shared_data* sdata) +{ +#ifdef CUDA_USE_TEXTURE + _omega_rmass_tex.normalized = false; // access with unnormalized texture coordinates + _omega_rmass_tex.filterMode = cudaFilterModePoint; // Point mode, so no filtering + _omega_rmass_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* omega_rmass_texture_ptr = &MY_AP(omega_rmass_tex); + +#if V_PRECISION == 1 + cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<float4>(); + cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(V_FLOAT4)); +#else + cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<int4>(); + cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * 2 * sizeof(int4)); +#endif +#endif +} + +static __device__ inline V_FLOAT4 fetchOmegaRmass(int i) +{ +#ifdef CUDA_USE_TEXTURE +#if V_PRECISION == 1 + return tex1Dfetch(_omega_rmass_tex, i); +#else + return tex1Dfetch_double_v(_omega_rmass_tex, i); +#endif +#else + return _omega_rmass[i]; +#endif +} + +#if F_PRECISION == 2 +static __device__ inline double tex1Dfetch_double_f(texture<int2, 1> t, int i) +{ + int2 v = tex1Dfetch(t, i); + return __hiloint2double(v.y, v.x); +} + +static __device__ inline F_FLOAT4 tex1Dfetch_double_f(texture<int4, 1> t, int i) +{ + int4 v = tex1Dfetch(t, 2 * i); + int4 u = tex1Dfetch(t, 2 * i + 1); + F_FLOAT4 w; + + w.x = __hiloint2double(v.y, v.x); + w.y = __hiloint2double(v.w, v.z); + w.z = __hiloint2double(u.y, u.x); + w.w = __hiloint2double(u.w, u.z); + return w; +} +#endif + +inline void BindQTexture(cuda_shared_data* sdata) +{ +#ifdef CUDA_USE_TEXTURE + _q_tex.normalized = false; // access with unnormalized texture coordinates + _q_tex.filterMode = cudaFilterModePoint; // Point mode, so no filtering + _q_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* q_texture_ptr = &MY_AP(q_tex); + +#if F_PRECISION == 1 + cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<float>(); + cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(F_FLOAT)); +#else + cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<int2>(); + cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(int2)); +#endif +#endif +} + +static __device__ inline F_FLOAT fetchQ(int i) +{ +#ifdef CUDA_USE_TEXTURE +#if F_PRECISION == 1 + return tex1Dfetch(_q_tex, i); +#else + return tex1Dfetch_double_f(_q_tex, i); +#endif +#else + return _q[i]; +#endif +} + +#endif + +/* + +inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex) +{ + #ifdef CUDA_USE_TEXTURE + _coeff_tex.normalized = false; // access with normalized texture coordinates + _coeff_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff_texture_ptr; + cudaGetTextureReference(&coeff_texture_ptr, &MY_AP(coeff_tex)); + + #if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>(); + cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_FLOAT4)); + #else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>(); + cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int4)); + #endif + #endif +} + +static __device__ inline X_FLOAT4 fetchXType(int i) +{ + #ifdef CUDA_USE_TEXTURE + #if X_PRECISION == 1 + return tex1Dfetch(_x_type_tex,i); + #else + return tex1Dfetch_double(_x_type_tex,i); + #endif + #else + return _x_type[i]; + #endif +} +*/ +#define SBBITS 30 + +static inline __device__ int sbmask(int j) +{ + return j >> SBBITS & 3; +} + +static inline __device__ void minimum_image(X_FLOAT4 &delta) +{ + if(_triclinic == 0) { + if(_periodicity[0]) { + delta.x += delta.x < -X_F(0.5) * _prd[0] ? 
_prd[0] : + (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0)); + } + + if(_periodicity[1]) { + delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] : + (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0)); + } + + if(_periodicity[2]) { + delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] : + (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0)); + } + + } else { + if(_periodicity[2]) { + delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] : + (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0)); + delta.y += delta.z < -X_F(0.5) * _prd[2] ? _h[3] : + (delta.z > X_F(0.5) * _prd[2] ? -_h[3] : X_F(0.0)); + delta.x += delta.z < -X_F(0.5) * _prd[2] ? _h[4] : + (delta.z > X_F(0.5) * _prd[2] ? -_h[4] : X_F(0.0)); + + } + + if(_periodicity[1]) { + delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] : + (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0)); + delta.x += delta.y < -X_F(0.5) * _prd[1] ? _h[5] : + (delta.y > X_F(0.5) * _prd[1] ? -_h[5] : X_F(0.0)); + + } + + if(_periodicity[0]) { + delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] : + (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0)); + } + } +} + +static inline __device__ void closest_image(X_FLOAT4 &x1, X_FLOAT4 &x2, X_FLOAT4 &ci) +{ + ci.x = x2.x - x1.x; + ci.y = x2.y - x1.y; + ci.z = x2.z - x1.z; + minimum_image(ci); + ci.x += x1.x; + ci.y += x1.y; + ci.z += x1.z; +} diff --git a/lib/cuda/cuda.cu b/lib/cuda/cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..b0c7a917762da994322d6aeeab4e2827af8c3229 --- /dev/null +++ b/lib/cuda/cuda.cu @@ -0,0 +1,22 @@ +#include "cuda_precision.h" +#include "cuda_shared.h" +#include "cuda_cu.h" + +void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata) +{ + sdata->compile_settings.prec_glob = sizeof(CUDA_FLOAT) / 4; + sdata->compile_settings.prec_x = sizeof(X_FLOAT) / 4; + sdata->compile_settings.prec_v = sizeof(V_FLOAT) / 4; + sdata->compile_settings.prec_f = sizeof(F_FLOAT) / 4; + sdata->compile_settings.prec_pppm = sizeof(PPPM_FLOAT) / 4; + sdata->compile_settings.prec_fft = sizeof(FFT_FLOAT) / 4; + +#ifdef FFT_CUFFT + sdata->compile_settings.cufft = 1; +#else + sdata->compile_settings.cufft = 0; +#endif + + sdata->compile_settings.arch = CUDA_ARCH; + +} diff --git a/lib/cuda/cuda_common.h b/lib/cuda/cuda_common.h new file mode 100644 index 0000000000000000000000000000000000000000..a6806bcfd854065d62f87a98b74aa9673fd5b090 --- /dev/null +++ b/lib/cuda/cuda_common.h @@ -0,0 +1,344 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#ifndef _CUDA_COMMON_H_ +#define _CUDA_COMMON_H_ + +//#include "cutil.h" +#include "cuda_precision.h" +#include "cuda_wrapper_cu.h" + +#define CUDA_MAX_TYPES_PLUS_ONE 12 //for pair styles which use constant space for parameters, this needs to be one larger than the number of atom types +//this cannot be arbitrarily large, since constant space is limited. +//in principle one could alter potentials to use global memory for parameters; some do that already, since the first examples I encountered had a high number (20+) of atom types +//Christian +#define CUDA_MAX_TYPES2 (CUDA_MAX_TYPES_PLUS_ONE * CUDA_MAX_TYPES_PLUS_ONE) +#define CUDA_MAX_NSPECIAL 25 + +// define some easy-to-use debug and emulation macros +#ifdef _DEBUG +#define MYDBG(a) a +#else +#define MYDBG(a) +#endif + +#if __DEVICE_EMULATION__ +#define MYEMU(a) a +#else +#define MYEMU(a) +#endif + +#define MYEMUDBG(a) MYEMU(MYDBG(a)) + +// Add Prefix (needed as a workaround: identical constant names in different files would cause conflicts) +#define MY_ADD_PREFIX(prefix, var) prefix##_##var +#define MY_ADD_PREFIX2(prefix, var) MY_ADD_PREFIX(prefix, var) +#define MY_AP(var) MY_ADD_PREFIX2(MY_PREFIX, var) + +#define MY_VAR_TO_STR(var) #var +#define MY_VAR_TO_STR2(var) MY_VAR_TO_STR(var) +//#define &MY_AP(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var)) +//#define &MY_AP(var) &(MY_AP(var)) +#define CUDA_USE_TEXTURE +#define CUDA_USE_FLOAT4 + +//constants used by many classes + +//domain +#define _boxhi MY_AP(boxhi) +#define _boxlo MY_AP(boxlo) +#define _subhi MY_AP(subhi) +#define _sublo MY_AP(sublo) +#define _box_size MY_AP(box_size) +#define _prd MY_AP(prd) +#define _periodicity MY_AP(periodicity) +#define _triclinic MY_AP(triclinic) +#define _boxhi_lamda MY_AP(boxhi_lamda) +#define _boxlo_lamda MY_AP(boxlo_lamda) +#define _prd_lamda MY_AP(prd_lamda) +#define _h MY_AP(h) +#define _h_inv MY_AP(h_inv) +#define _h_rate MY_AP(h_rate) +__device__ __constant__ X_FLOAT _boxhi[3]; +__device__ __constant__ X_FLOAT _boxlo[3]; +__device__ __constant__ X_FLOAT _subhi[3]; +__device__ __constant__ X_FLOAT _sublo[3]; +__device__ __constant__ X_FLOAT _box_size[3]; +__device__ __constant__ X_FLOAT _prd[3]; +__device__ __constant__ int _periodicity[3]; +__device__ __constant__ int _triclinic; +__device__ __constant__ X_FLOAT _boxhi_lamda[3]; +__device__ __constant__ X_FLOAT _boxlo_lamda[3]; +__device__ __constant__ X_FLOAT _prd_lamda[3]; +__device__ __constant__ X_FLOAT _h[6]; +__device__ __constant__ X_FLOAT _h_inv[6]; +__device__ __constant__ V_FLOAT _h_rate[6]; + + +//atom properties +#define _x MY_AP(x) +#define _v MY_AP(v) +#define _f MY_AP(f) +#define _tag MY_AP(tag) +#define _type MY_AP(type) +#define _mask MY_AP(mask) +#define _image MY_AP(image) +#define _q MY_AP(q) +#define _mass MY_AP(mass) +#define _rmass MY_AP(rmass) +#define _rmass_flag MY_AP(rmass_flag) +#define _eatom MY_AP(eatom) +#define _vatom MY_AP(vatom) +#define _x_type MY_AP(x_type) +#define _radius MY_AP(radius) +#define _density MY_AP(density) +#define _omega MY_AP(omega) +#define _torque MY_AP(torque) +#define _special MY_AP(special) +#define _maxspecial MY_AP(maxspecial) +#define _nspecial MY_AP(nspecial) +#define _special_flag MY_AP(special_flag) +#define _molecule MY_AP(molecule) +#define _v_radius MY_AP(v_radius) +#define _omega_rmass MY_AP(omega_rmass) +#define _freeze_group_bit MY_AP(freeze_group_bit) +#define _map_array MY_AP(map_array) +__device__ __constant__ X_FLOAT* _x; //holds pointer to 
positions +__device__ __constant__ V_FLOAT* _v; +__device__ __constant__ F_FLOAT* _f; +__device__ __constant__ int* _tag; +__device__ __constant__ int* _type; +__device__ __constant__ int* _mask; +__device__ __constant__ int* _image; +__device__ __constant__ V_FLOAT* _mass; +__device__ __constant__ F_FLOAT* _q; +__device__ __constant__ V_FLOAT* _rmass; +__device__ __constant__ int _rmass_flag; +__device__ __constant__ ENERGY_FLOAT* _eatom; +__device__ __constant__ ENERGY_FLOAT* _vatom; +__device__ __constant__ X_FLOAT4* _x_type; //holds pointer to packed position + type data +__device__ __constant__ X_FLOAT* _radius; +__device__ __constant__ F_FLOAT* _density; +__device__ __constant__ V_FLOAT* _omega; +__device__ __constant__ F_FLOAT* _torque; +__device__ __constant__ int* _special; +__device__ __constant__ int _maxspecial; +__device__ __constant__ int* _nspecial; +__device__ __constant__ int _special_flag[4]; +__device__ __constant__ int* _molecule; +__device__ __constant__ V_FLOAT4* _v_radius; //holds pointer to packed velocity + radius data +__device__ __constant__ V_FLOAT4* _omega_rmass; //holds pointer to packed omega + rmass data +__device__ __constant__ int _freeze_group_bit; +__device__ __constant__ int* _map_array; + +#ifdef CUDA_USE_TEXTURE + +#define _x_tex MY_AP(x_tex) +#if X_PRECISION == 1 +texture<float> _x_tex; +#else +texture<int2, 1> _x_tex; +#endif + +#define _type_tex MY_AP(type_tex) +texture<int> _type_tex; + +#define _x_type_tex MY_AP(x_type_tex) +#if X_PRECISION == 1 +texture<float4, 1> _x_type_tex; +#else +texture<int4, 1> _x_type_tex; +#endif + +#define _v_radius_tex MY_AP(v_radius_tex) +#if V_PRECISION == 1 +texture<float4, 1> _v_radius_tex; +#else +texture<int4, 1> _v_radius_tex; +#endif + +#define _omega_rmass_tex MY_AP(omega_rmass_tex) +#if V_PRECISION == 1 +texture<float4, 1> _omega_rmass_tex; +#else +texture<int4, 1> _omega_rmass_tex; +#endif + +#define _q_tex MY_AP(q_tex) +#if F_PRECISION == 1 +texture<float> _q_tex; +#else +texture<int2, 1> _q_tex; +#endif + +#endif + +//neighbor +#ifdef IncludeCommonNeigh +#define _inum MY_AP(inum) +#define _inum_border MY_AP(inum_border) +#define _ilist MY_AP(ilist) +#define _ilist_border MY_AP(ilist_border) +#define _numneigh MY_AP(numneigh) +#define _numneigh_border MY_AP(numneigh_border) +#define _numneigh_inner MY_AP(numneigh_inner) +#define _firstneigh MY_AP(firstneigh) +#define _neighbors MY_AP(neighbors) +#define _neighbors_border MY_AP(neighbors_border) +#define _neighbors_inner MY_AP(neighbors_inner) +#define _reneigh_flag MY_AP(reneigh_flag) +#define _triggerneighsq MY_AP(triggerneighsq) +#define _xhold MY_AP(xhold) +#define _maxhold MY_AP(maxhold) +#define _dist_check MY_AP(dist_check) +#define _neighbor_maxlocal MY_AP(neighbor_maxlocal) +#define _maxneighbors MY_AP(maxneighbors) +#define _overlap_comm MY_AP(overlap_comm) +__device__ __constant__ int _inum; +__device__ __constant__ int* _inum_border; +__device__ __constant__ int* _ilist; +__device__ __constant__ int* _ilist_border; +__device__ __constant__ int* _numneigh; +__device__ __constant__ int* _numneigh_border; +__device__ __constant__ int* _numneigh_inner; +__device__ __constant__ int** _firstneigh; +__device__ __constant__ int* _neighbors; +__device__ __constant__ int* _neighbors_border; +__device__ __constant__ int* _neighbors_inner; +__device__ __constant__ int* _reneigh_flag; +__device__ __constant__ X_FLOAT _triggerneighsq; +__device__ __constant__ X_FLOAT* _xhold; //holds pointer to positions from the last neighbor-list build +__device__ __constant__ int _maxhold; +__device__ __constant__ int _dist_check; +__device__ __constant__ int 
_neighbor_maxlocal; +__device__ __constant__ int _maxneighbors; +__device__ __constant__ int _overlap_comm; +#endif + +//system properties +#define _nall MY_AP(nall) +#define _nghost MY_AP(nghost) +#define _nlocal MY_AP(nlocal) +#define _nmax MY_AP(nmax) +#define _cuda_ntypes MY_AP(cuda_ntypes) +#define _dtf MY_AP(dtf) +#define _dtv MY_AP(dtv) +#define _factor MY_AP(factor) +#define _virial MY_AP(virial) +#define _eng_vdwl MY_AP(eng_vdwl) +#define _eng_coul MY_AP(eng_coul) +#define _molecular MY_AP(molecular) +__device__ __constant__ unsigned _nall; +__device__ __constant__ unsigned _nghost; +__device__ __constant__ unsigned _nlocal; +__device__ __constant__ unsigned _nmax; +__device__ __constant__ unsigned _cuda_ntypes; +__device__ __constant__ V_FLOAT _dtf; +__device__ __constant__ X_FLOAT _dtv; +__device__ __constant__ V_FLOAT _factor; +__device__ __constant__ ENERGY_FLOAT* _virial; +__device__ __constant__ ENERGY_FLOAT* _eng_vdwl; +__device__ __constant__ ENERGY_FLOAT* _eng_coul; +__device__ __constant__ int _molecular; + +//other general constants +#define _buffer MY_AP(buffer) +#define _flag MY_AP(flag) +#define _debugdata MY_AP(debugdata) +__device__ __constant__ void* _buffer; +__device__ __constant__ int* _flag; +__device__ __constant__ int* _debugdata; + +// pointers to data fields on GPU are hold in constant space +// -> reduces register usage and number of parameters for kernelcalls +// will be variables of file scope in cuda files + + + + +// maybe used to output cudaError_t +#define MY_OUTPUT_RESULT(result) \ + switch(result) \ + { \ + case cudaSuccess: printf(" => cudaSuccess\n"); break; \ + case cudaErrorInvalidValue: printf(" => cudaErrorInvalidValue\n"); break; \ + case cudaErrorInvalidSymbol: printf(" => cudaErrorInvalidSymbol\n"); break; \ + case cudaErrorInvalidDevicePointer: printf(" => cudaErrorInvalidDevicePointer\n"); break; \ + case cudaErrorInvalidMemcpyDirection: printf(" => cudaErrorInvalidMemcpyDirection\n"); break; \ + default: printf(" => unknown\n"); break; \ + } + +#ifdef _DEBUG +# define CUT_CHECK_ERROR(errorMessage) { \ + cudaError_t err = cudaGetLastError(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ + exit(EXIT_FAILURE); \ + } \ + err = cudaThreadSynchronize(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ + exit(EXIT_FAILURE); \ + } \ + } +#else +# define CUT_CHECK_ERROR(errorMessage) { \ + cudaError_t err = cudaGetLastError(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ + exit(EXIT_FAILURE); \ + } \ + } +#endif + +# define CUDA_SAFE_CALL_NO_SYNC( call) { \ + cudaError err = call; \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ + __FILE__, __LINE__, cudaGetErrorString( err) ); \ + exit(EXIT_FAILURE); \ + } } + +# define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NO_SYNC(call); + +#define X_MASK 1 +#define V_MASK 2 +#define F_MASK 4 +#define TAG_MASK 8 +#define TYPE_MASK 16 +#define MASK_MASK 32 +#define IMAGE_MASK 64 +#define Q_MASK 128 +#define MOLECULE_MASK 256 +#define RMASS_MASK 512 +#define RADIUS_MASK 1024 +#define DENSITY_MASK 2048 +#define OMEGA_MASK 4096 +#define TORQUE_MASK 8192 + + + +#endif // #ifdef _CUDA_COMMON_H_ diff --git 
a/lib/cuda/cuda_cu.h b/lib/cuda/cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..48498b8d0f353cc5cf7f99a4cdeb0403483322c3 --- /dev/null +++ b/lib/cuda/cuda_cu.h @@ -0,0 +1 @@ +extern "C" void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata); diff --git a/lib/cuda/cuda_data.cu b/lib/cuda/cuda_data.cu new file mode 100644 index 0000000000000000000000000000000000000000..6e6669ea1196024408d4e5ecd5d3db770da19f05 --- /dev/null +++ b/lib/cuda/cuda_data.cu @@ -0,0 +1,220 @@ +enum copy_mode {x, xx, xy, yx, xyz, xzy}; // yxz, yzx, zxy, zyx not yet implemented since they were not needed yet + +#include "cuda_data_cu.h" +#include "cuda_wrapper_cu.h" +#include "cuda_data_kernel.cu" +#include <cstdio> + +void CudaData_Upload_DoubleFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer) +{ + int size = n[0]; + + if(n[1] > 0) size *= n[1]; + + if(n[2] > 0) size *= n[2]; + + dim3 threads; + threads.x = 1; + threads.y = 1; + threads.z = 1; + dim3 grid; + grid.x = 1; + grid.y = 1; + grid.z = 1; + + if(size <= 128 * 30) + threads.x = 32; + else if(size <= 256 * 30) + threads.x = 64; + else if(size <= 512 * 30) + threads.x = 128; + else + threads.x = 256; + + grid.x = ((size - 1) + threads.x) / threads.x; + + if(grid.x > 32000) + grid.x = 32000; + + while(grid.x * grid.y * threads.x < size) grid.y++; + + float debugdata[size]; + //int* cu_debug=(int*) CudaWrapper_AllocCudaData(size*sizeof(FLOAT)); + size *= sizeof(double); + printf("size: %i (%i %i %i) (%i %i %i) %p\n", size, grid.x, grid.y, threads.x, n[0], n[1], n[2], buffer); + CudaWrapper_UploadCudaData(host_data, buffer, size); + CudaData_Upload_Kernel_DoubleFloat <<< grid, threads>>>((double*)buffer, (float*)dev_data, n[0], n[1], n[2], mode); + cudaThreadSynchronize(); + CudaWrapper_DownloadCudaData(debugdata, dev_data, size / 2); + double sum = 0; + printf("debugdata: "); + + for(int i = 0; i < size / sizeof(double); i++) sum += (debugdata[i] - ((double*) host_data)[i]) * (debugdata[i] - ((double*) host_data)[i]); + + printf("%lf \n", sum); + +} + +void CudaData_Upload_DoubleDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer) +{ + int size = n[0]; + + if(n[1] > 0) size *= n[1]; + + if(n[2] > 0) size *= n[2]; + + dim3 threads; + threads.x = 1; + threads.y = 1; + threads.z = 1; + dim3 grid; + grid.x = 1; + grid.y = 1; + grid.z = 1; + + if(size <= 128 * 30) + threads.x = 32; + else if(size <= 256 * 30) + threads.x = 64; + else if(size <= 512 * 30) + threads.x = 128; + else + threads.x = 256; + + grid.x = ((size - 1) + threads.x) / threads.x; + + if(grid.x > 32000) + grid.x = 32000; + + while(grid.x * grid.y * threads.x < size) grid.y++; + + size *= sizeof(double); + + CudaWrapper_UploadCudaData(host_data, buffer, size); + CudaData_Upload_Kernel_DoubleDouble <<< grid, threads>>>((double*)buffer, (double*)dev_data, n[0], n[1], n[2], mode); + cudaThreadSynchronize(); +} + +void CudaData_Upload_FloatDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer) +{ + int size = n[0]; + + if(n[1] > 0) size *= n[1]; + + if(n[2] > 0) size *= n[2]; + + dim3 threads; + threads.x = 1; + threads.y = 1; + threads.z = 1; + dim3 grid; + grid.x = 1; + grid.y = 1; + grid.z = 1; + + if(size <= 128 * 30) + threads.x = 32; + else if(size <= 256 * 30) + threads.x = 64; + else if(size <= 512 * 30) + threads.x = 128; + else + threads.x = 256; + + grid.x = ((size - 1) + threads.x) / threads.x; + + if(grid.x > 32000) + grid.x = 32000; + + while(grid.x * grid.y * 
threads.x < size) grid.y++; + + size *= sizeof(float); + + CudaWrapper_UploadCudaData(host_data, buffer, size); + CudaData_Upload_Kernel_FloatDouble <<< grid, threads>>>((float*)buffer, (double*)dev_data, n[0], n[1], n[2], mode); + cudaThreadSynchronize(); +} + +void CudaData_Upload_FloatFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer) +{ + int size = n[0]; + + if(n[1] > 0) size *= n[1]; + + if(n[2] > 0) size *= n[2]; + + dim3 threads; + threads.x = 1; + threads.y = 1; + threads.z = 1; + dim3 grid; + grid.x = 1; + grid.y = 1; + grid.z = 1; + + if(size <= 128 * 30) + threads.x = 32; + else if(size <= 256 * 30) + threads.x = 64; + else if(size <= 512 * 30) + threads.x = 128; + else + threads.x = 256; + + grid.x = ((size - 1) + threads.x) / threads.x; + + if(grid.x > 32000) + grid.x = 32000; + + while(grid.x * grid.y * threads.x < size) grid.y++; + + size *= sizeof(float); + + CudaWrapper_UploadCudaData(host_data, buffer, size); + CudaData_Upload_Kernel_FloatFloat <<< grid, threads>>>((float*)buffer, (float*)dev_data, n[0], n[1], n[2], mode); + cudaThreadSynchronize(); +} + +void CudaData_Upload_IntInt(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer) +{ + int size = n[0]; + + if(n[1] > 0) size *= n[1]; + + if(n[2] > 0) size *= n[2]; + + dim3 threads; + threads.x = 1; + threads.y = 1; + threads.z = 1; + dim3 grid; + grid.x = 1; + grid.y = 1; + grid.z = 1; + + if(size <= 128 * 30) + threads.x = 32; + else if(size <= 256 * 30) + threads.x = 64; + else if(size <= 512 * 30) + threads.x = 128; + else + threads.x = 256; + + grid.x = ((size - 1) + threads.x) / threads.x; + + if(grid.x > 32000) + grid.x = 32000; + + while(grid.x * grid.y * threads.x < size) grid.y++; + + size *= sizeof(int); + + CudaWrapper_UploadCudaData(host_data, buffer, size); + CudaData_Upload_Kernel_IntInt <<< grid, threads>>>((int*)buffer, (int*)dev_data, n[0], n[1], n[2], mode); + cudaThreadSynchronize(); +} + +void CudaData_Download(void* host_data, void* dev_data, int host_size, int dev_size, unsigned* n, copy_mode mode, void* buffer) +{ +} diff --git a/lib/cuda/cuda_data_cu.h b/lib/cuda/cuda_data_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..90dbd141b65682fb682855088020d0d3e650f33b --- /dev/null +++ b/lib/cuda/cuda_data_cu.h @@ -0,0 +1,13 @@ +#ifndef CUDA_DATA_CU_H_ +#define CUDA_DATA_CU_H_ + +extern "C" void CudaData_Upload_DoubleFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer); +extern "C" void CudaData_Upload_DoubleDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer); +extern "C" void CudaData_Upload_FloatDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer); +extern "C" void CudaData_Upload_FloatFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer); +extern "C" void CudaData_Upload_IntInt(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer); + +extern "C" void CudaData_Download(void* host_data, void* dev_data, int host_size, int dev_size, unsigned* n, copy_mode mode, void* buffer); + + +#endif /*CUDA_DATA_CU_H_*/ diff --git a/lib/cuda/cuda_data_kernel.cu b/lib/cuda/cuda_data_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..41eea01564fa3667937e431e8e3bf7bb932fa1a6 --- /dev/null +++ b/lib/cuda/cuda_data_kernel.cu @@ -0,0 +1,195 @@ +__global__ void CudaData_Upload_Kernel_DoubleFloat(double* buffer, float* dev_data, + unsigned nx, unsigned ny, unsigned nz, copy_mode 
mode) +{ + if(mode == x) mode = xx; + + unsigned length = nx; + + if(ny > 0) length *= ny; + + if(nz > 0) length *= nz; + + unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l; + + + if(i >= length) return; + + //the copy_mode layouts are mutually exclusive, so every case ends in a break + switch(mode) { + case xx: { + dev_data[i] = buffer[i]; + break; + } + + case xy: { + dev_data[i] = buffer[i]; + break; + } + + case yx: { + j = i / ny; + k = i % ny; + dev_data[k * nx + j] = buffer[j * ny + k]; + break; + } + + case xyz: { + dev_data[i] = buffer[i]; + break; + } + + case xzy: { + j = i / (ny * nz); + k = (i % (ny * nz)) / nz; + l = i % nz; + dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l]; + break; + } + } +} + +__global__ void CudaData_Upload_Kernel_DoubleDouble(double* buffer, double* dev_data, + unsigned nx, unsigned ny, unsigned nz, copy_mode mode) +{ + if(mode == x) mode = xx; + + unsigned length = nx; + + if(ny > 0) length *= ny; + + if(nz > 0) length *= nz; + + unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l; + + if(i >= length) return; + + switch(mode) { + case xx: + dev_data[i] = buffer[i]; + break; + + case xy: + dev_data[i] = buffer[i]; + break; + + case yx: + j = i / ny; + k = i % ny; + dev_data[k * nx + j] = buffer[j * ny + k]; + break; + + case xyz: + dev_data[i] = buffer[i]; + break; + + case xzy: + j = i / (ny * nz); + k = (i % (ny * nz)) / nz; + l = i % nz; + dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l]; + break; + } +} + +__global__ void CudaData_Upload_Kernel_FloatDouble(float* buffer, double* dev_data, + unsigned nx, unsigned ny, unsigned nz, copy_mode mode) +{ + if(mode == x) mode = xx; + + unsigned length = nx; + + if(ny > 0) length *= ny; + + if(nz > 0) length *= nz; + + unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l; + + if(i >= length) return; + + switch(mode) { + case xx: + dev_data[i] = buffer[i]; + break; + + case xy: + dev_data[i] = buffer[i]; + break; + + case yx: + j = i / ny; + k = i % ny; + dev_data[k * nx + j] = buffer[j * ny + k]; + break; + + case xyz: + dev_data[i] = buffer[i]; + break; + + case xzy: + j = i / (ny * nz); + k = (i % (ny * nz)) / nz; + l = i % nz; + dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l]; + break; + } +} + +__global__ void CudaData_Upload_Kernel_FloatFloat(float* buffer, float* dev_data, + unsigned nx, unsigned ny, unsigned nz, copy_mode mode) +{ + if(mode == x) mode = xx; + + unsigned length = nx; + + if(ny > 0) length *= ny; + + if(nz > 0) length *= nz; + + unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l; + + if(i >= length) return; + + switch(mode) { + case xx: + dev_data[i] = buffer[i]; + break; + + case xy: + dev_data[i] = buffer[i]; + break; + + case yx: + j = i / ny; + k = i % ny; + dev_data[k * nx + j] = buffer[j * ny + k]; + break; + + case xyz: + dev_data[i] = buffer[i]; + break; + + case xzy: + j = i / (ny * nz); + k = (i % (ny * nz)) / nz; + l = i % nz; + dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l]; + break; + } +} + +__global__ void CudaData_Upload_Kernel_IntInt(int* buffer, int* dev_data, + unsigned nx, unsigned ny, unsigned nz, copy_mode mode) +{ + if(mode == x) mode = xx; + + unsigned length = nx; + + if(ny > 0) length *= ny; + + if(nz > 0) length *= nz; + + unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l; + + if(i >= length) return; + + switch(mode) { + case xx: + dev_data[i] = buffer[i]; + break; + + case xy: + dev_data[i] = buffer[i]; + break; + + case yx: + j = i / ny; + k = i % ny; + dev_data[k * nx + j] = buffer[j * ny + k]; + break; + + case xyz: + dev_data[i] = buffer[i]; + break; + + case xzy: + j = i / (ny * 
nz); + k = (i % (ny * nz)) / nz; + l = i % nz; + dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l]; + } +} diff --git a/lib/cuda/cuda_kernel.cu b/lib/cuda/cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lib/cuda/cuda_pair.cu b/lib/cuda/cuda_pair.cu new file mode 100644 index 0000000000000000000000000000000000000000..9f9900a2d8da708b043bbb0c206094188d17f5c6 --- /dev/null +++ b/lib/cuda/cuda_pair.cu @@ -0,0 +1,1015 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +enum PAIR_FORCES {PAIR_NONE, PAIR_BORN, PAIR_BUCK, PAIR_CG_CMM, PAIR_LJ_CHARMM, PAIR_LJ_CLASS2, PAIR_LJ_CUT, PAIR_LJ_EXPAND, PAIR_LJ_GROMACS, PAIR_LJ_SMOOTH, PAIR_LJ96_CUT, PAIR_MORSE, PAIR_MORSE_R6}; +enum COUL_FORCES {COUL_NONE, COUL_CHARMM, COUL_CHARMM_IMPLICIT, COUL_CUT, COUL_LONG, COUL_DEBYE, COUL_GROMACS, COUL_SPECIAL}; +#define DATA_NONE 0 +#define DATA_V 1 +#define DATA_TAG 2 +#define DATA_RMASS 4 +#define DATA_MASS 8 +#define DATA_TORQUE 16 +#define DATA_OMEGA 32 +#define DATA_RADIUS 64 +#define DATA_DENSITY 128 +#define DATA_MASK 256 +#define DATA_V_RADIUS 512 +#define DATA_OMEGA_RMASS 1024 + +#define NEIGHMASK 0x3FFFFFFF + +#define MY_PREFIX cuda_pair +#define IncludeCommonNeigh +#include "cuda_shared.h" +#include "cuda_common.h" +#include "cuda_wrapper_cu.h" +#include "crm_cuda_utils.cu" + +//constants used by multiple forces + +//general +#define _cutsq MY_AP(cutsq) +#define _offset MY_AP(offset) +#define _special_lj MY_AP(special_lj) +#define _special_coul MY_AP(special_coul) +#define _cutsq_global MY_AP(cutsq_global) +#define _collect_forces_later MY_AP(collect_forces_later) + +__device__ __constant__ X_FLOAT _cutsq[CUDA_MAX_TYPES2]; +__device__ __constant__ ENERGY_FLOAT _offset[CUDA_MAX_TYPES2]; +__device__ __constant__ F_FLOAT _special_lj[4]; +__device__ __constant__ F_FLOAT _special_coul[4]; +__device__ __constant__ X_FLOAT _cutsq_global; +__device__ __constant__ int _collect_forces_later; + +__device__ __constant__ F_FLOAT MY_AP(coeff1)[CUDA_MAX_TYPES2]; //pair force coefficients in case ntypes < CUDA_MAX_TYPES (coeffs fit into constant space) +__device__ __constant__ F_FLOAT MY_AP(coeff2)[CUDA_MAX_TYPES2]; +__device__ __constant__ F_FLOAT MY_AP(coeff3)[CUDA_MAX_TYPES2]; +__device__ __constant__ F_FLOAT MY_AP(coeff4)[CUDA_MAX_TYPES2]; +__device__ __constant__ F_FLOAT MY_AP(coeff5)[CUDA_MAX_TYPES2]; + + +__device__ __constant__ F_FLOAT* MY_AP(coeff1_gm); //pair force coefficients in case ntypes > CUDA_MAX_TYPES (coeffs do not fit into constant space) +__device__ __constant__ F_FLOAT* MY_AP(coeff2_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff3_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff4_gm); 
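+
+// Note (illustrative sketch, not part of the original sources): with
+// F_PRECISION != 1 the global-memory coefficient tables are read through the
+// int2 textures declared below, since doubles cannot be fetched from textures
+// directly. The usual helper (the real fetch routines are expected to come
+// from the included crm_cuda_utils.cu; the name fetch_coeff1 here is
+// hypothetical) reassembles the double from its two 32-bit halves:
+//
+//   static __inline__ __device__ double fetch_coeff1(const int i)
+//   {
+//     int2 v = tex1Dfetch(_coeff1_gm_tex, i);  // fetch hi/lo 32-bit words
+//     return __hiloint2double(v.y, v.x);       // recombine them into a double
+//   }
+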
+__device__ __constant__ F_FLOAT* MY_AP(coeff5_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff6_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff7_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff8_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff9_gm); +__device__ __constant__ F_FLOAT* MY_AP(coeff10_gm); + +#define _coeff1_gm_tex MY_AP(coeff1_gm_tex) +#if F_PRECISION == 1 +texture<float> _coeff1_gm_tex; +#else +texture<int2, 1> _coeff1_gm_tex; +#endif + +#define _coeff2_gm_tex MY_AP(coeff2_gm_tex) +#if F_PRECISION == 1 +texture<float> _coeff2_gm_tex; +#else +texture<int2, 1> _coeff2_gm_tex; +#endif + +#define _coeff3_gm_tex MY_AP(coeff3_gm_tex) +#if F_PRECISION == 1 +texture<float> _coeff3_gm_tex; +#else +texture<int2, 1> _coeff3_gm_tex; +#endif + +#define _coeff4_gm_tex MY_AP(coeff4_gm_tex) +#if F_PRECISION == 1 +texture<float> _coeff4_gm_tex; +#else +texture<int2, 1> _coeff4_gm_tex; +#endif + +#define _coeff5_gm_tex MY_AP(coeff5_gm_tex) +#if F_PRECISION == 1 +texture<float> _coeff5_gm_tex; +#else +texture<int2, 1> _coeff5_gm_tex; +#endif + +#define _coeff6_gm_tex MY_AP(coeff6_gm_tex) +#if F_PRECISION == 1 +texture<float> _coeff6_gm_tex; +#else +texture<int2, 1> _coeff6_gm_tex; +#endif + +#define _coeff7_gm_tex MY_AP(coeff7_gm_tex) +#if F_PRECISION == 1 +texture<float> _coeff7_gm_tex; +#else +texture<int2, 1> _coeff7_gm_tex; +#endif + +#define _coeff8_gm_tex MY_AP(coeff8_gm_tex) +#if F_PRECISION == 1 +texture<float> _coeff8_gm_tex; +#else +texture<int2, 1> _coeff8_gm_tex; +#endif + +#define _coeff9_gm_tex MY_AP(coeff9_gm_tex) +#if F_PRECISION == 1 +texture<float> _coeff9_gm_tex; +#else +texture<int2, 1> _coeff9_gm_tex; +#endif + +#define _coeff10_gm_tex MY_AP(coeff10_gm_tex) +#if F_PRECISION == 1 +texture<float> _coeff10_gm_tex; +#else +texture<int2, 1> _coeff10_gm_tex; +#endif + +//if more than 5 coefficients are needed for a pair potential add them here + + +//coulomb +#define _cut_coulsq MY_AP(cut_coulsq) +#define _cut_coulsq_global MY_AP(cut_coulsq_global) +#define _g_ewald MY_AP(g_ewald) +#define _qqrd2e MY_AP(qqrd2e) +#define _kappa MY_AP(kappa) +__device__ __constant__ X_FLOAT _cut_coulsq[CUDA_MAX_TYPES2]; +__device__ __constant__ X_FLOAT _cut_coulsq_global; +__device__ __constant__ F_FLOAT _g_ewald; +__device__ __constant__ F_FLOAT _qqrd2e; +__device__ __constant__ F_FLOAT _kappa; + +//inner cutoff +#define _cut_innersq MY_AP(cut_innersq) +#define _cut_innersq_global MY_AP(cut_innersq_global) +__device__ __constant__ X_FLOAT _cut_innersq[CUDA_MAX_TYPES2]; +__device__ __constant__ X_FLOAT _cut_innersq_global; + + +template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data> +__global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_atom); + +template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data> +__global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_atom); + +template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data> +__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase); + +template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data> +__global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase); + +#include <stdio.h> +#include "cuda_pair_cu.h" +#include "cuda_pair_virial_kernel_nc.cu" + +//Functions which are shared by pair styles + +//Update Buffersize +void 
Cuda_UpdateBuffer(cuda_shared_data* sdata, int size)
+{
+  CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles: before updateBuffer failed");
+
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Resizing buffer at %p with %i bytes to\n", sdata->buffer, sdata->buffersize);)
+    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+    sdata->buffer = CudaWrapper_AllocCudaData(size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i bytes\n", sdata->buffer, sdata->buffersize);)
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
+  CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles failed");
+}
+
+void Cuda_Pair_UpdateNeighbor_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
+{
+  //Neighbor
+  cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned));
+  cudaMemcpyToSymbol(MY_AP(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(ilist) , & sneighlist->ilist .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(inum) , & sneighlist->inum , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(overlap_comm) , & sdata->overlap_comm, sizeof(int));
+
+  if(sdata->overlap_comm) {
+    cudaMemcpyToSymbol(MY_AP(numneigh_border) , & sneighlist->numneigh_border .dev_data, sizeof(int*));
+    cudaMemcpyToSymbol(MY_AP(numneigh_inner) , & sneighlist->numneigh_inner .dev_data, sizeof(int*));
+    cudaMemcpyToSymbol(MY_AP(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*));
+    cudaMemcpyToSymbol(MY_AP(neighbors_inner) , & sneighlist->neighbors_inner .dev_data, sizeof(int*));
+    cudaMemcpyToSymbol(MY_AP(ilist_border) , & sneighlist->ilist_border .dev_data, sizeof(int*));
+    cudaMemcpyToSymbol(MY_AP(inum_border) , & sneighlist->inum_border .dev_data, sizeof(int*));
+  }
+
+}
+//Update constants after nmax change which are generally needed by all pair styles
+void Cuda_Pair_UpdateNmax_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
+{
+  CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: Begin");
+
+  //System
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
+
+  //Atom
+  cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*));
+  cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*));
+
+
+  //Other
+  cudaMemcpyToSymbol(MY_AP(debugdata) , & sdata->debugdata , sizeof(int*));
+  CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: End");
+}
+
+//Initialisation of GPU Constants which rarely change
+void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = false, bool use_global_params = false, bool need_innercut = false, bool need_cut = true)
+{
+  unsigned cuda_ntypes = sdata->atom.ntypes + 1;
+  unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes;
+  unsigned n = sizeof(F_FLOAT) * cuda_ntypes2;
+  unsigned nx = sizeof(X_FLOAT) * cuda_ntypes2;
+
+  //check if enough constant memory is available
+  if((cuda_ntypes2 > CUDA_MAX_TYPES2) && !use_global_params)
+    printf("# CUDA: Cuda_Pair_Init: you need %u types. This is more than %u "
+           "(assumed at compile time). Re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 "
+           "or adjust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES_PLUS_ONE - 1);
+
+  if((cuda_ntypes2 > CUDA_MAX_TYPES2) && !use_global_params)
+    exit(1);
+
+  //type conversion of cutoffs and parameters
+  if(need_cut) {
+    X_FLOAT cutsq[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_global * sdata->pair.cut_global);
+      }
+    }
+
+    int cutsqdiffer = 0;
+    X_FLOAT cutsq_global;
+    cutsq_global = (X_FLOAT)(sdata->pair.cut_global * sdata->pair.cut_global);
+
+    if(sdata->pair.cut) {
+      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+        for(int j = i; j <= sdata->atom.ntypes; ++j) {
+          if(sdata->pair.cut[i][j] > 1e-6) {
+            cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]);
+            cutsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]);
+          }
+
+          if(i == 1 && j == 1) cutsq_global = cutsq[i * cuda_ntypes + j];
+
+          if((cutsq_global - cutsq[i * cuda_ntypes + j]) * (cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6)
+            cutsqdiffer++;
+        }
+      }
+    }
+
+    if(sdata->pair.cutsq) {
+      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+        for(int j = i; j <= sdata->atom.ntypes; ++j) {
+          if(sdata->pair.cut[i][j] > 1e-6) {
+            cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cutsq[i][j]);
+            cutsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cutsq[i][j]);
+          }
+
+          if(i == 1 && j == 1) cutsq_global = cutsq[i * cuda_ntypes + j];
+
+          if((cutsq_global - cutsq[i * cuda_ntypes + j]) * (cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6)
+            cutsqdiffer++;
+        }
+      }
+    }
+
+    //printf("CUTSQGLOB: %i %e\n",cutsqdiffer,cutsq_global);
+    if(cutsqdiffer) {
+
+      cutsq_global = -1.0;
+      cudaMemcpyToSymbol(MY_AP(cutsq) , cutsq , nx);
+    }
+
+    cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global , sizeof(X_FLOAT));
+  }
+
+  if(need_innercut) {
+    X_FLOAT cut_innersq[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        cut_innersq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global);
+      }
+    }
+
+    int cutsqdiffer = 0;
+    X_FLOAT cut_innersq_global;
+    cut_innersq_global = (X_FLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global);
+
+    if(sdata->pair.cut_inner) {
+      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+        for(int j = i; j <= sdata->atom.ntypes; ++j) {
+          if(sdata->pair.cut_inner[i][j] > 1e-6) {
+            cut_innersq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]);
+            cut_innersq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]);
+          }
+
+          if(i == 1 && j == 1) cut_innersq_global = cut_innersq[i * cuda_ntypes + j];
+
+          if((cut_innersq_global - cut_innersq[i * cuda_ntypes + j]) * (cut_innersq_global - cut_innersq[i * cuda_ntypes + j]) > 1e-6)
+            cutsqdiffer++;
+        }
+      }
+    }
+
+    if(cutsqdiffer) {
+      cut_innersq_global = -1.0;
+      
cudaMemcpyToSymbol(MY_AP(cut_innersq) , cut_innersq , nx); + } + + cudaMemcpyToSymbol(MY_AP(cut_innersq_global) , &cut_innersq_global , sizeof(X_FLOAT)); + } + + if(need_q) { + X_FLOAT cut_coulsq[cuda_ntypes2]; + + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global); + } + } + + int cutsqdiffer = 0; + X_FLOAT cut_coulsq_global; + cut_coulsq_global = (X_FLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global); + + if(sdata->pair.cut_coulsq_global > cut_coulsq_global) cut_coulsq_global = (X_FLOAT) sdata->pair.cut_coulsq_global; + + if(sdata->pair.cut_coul) { + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = i; j <= sdata->atom.ntypes; ++j) { + if(sdata->pair.cut_coul[i][j] > 1e-6) { + cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]); + cut_coulsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]); + } + + if(i == 1 && j == 1) cut_coulsq_global = cut_coulsq[i * cuda_ntypes + j]; + + if((cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j]) * (cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j]) > 1e-6) + cutsqdiffer++; + } + } + } + + if(cutsqdiffer) { + cut_coulsq_global = -1.0; + cudaMemcpyToSymbol(MY_AP(cut_coulsq) , cut_coulsq , nx); + } + + cudaMemcpyToSymbol(MY_AP(cut_coulsq_global), &cut_coulsq_global , sizeof(X_FLOAT)); + } + + CUT_CHECK_ERROR("Cuda_Pair: init pre Coeff failed"); + + if(ncoeff > 0) { + F_FLOAT coeff1[cuda_ntypes2]; + + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + coeff1[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff1[i][j]; + } + } + + if(use_global_params) { + cudaMemcpyToSymbol(MY_AP(coeff1_gm) , &sdata->pair.coeff1_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpy((sdata->pair.coeff1_gm.dev_data), coeff1, n, cudaMemcpyHostToDevice); + + _coeff1_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff1_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff1_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff1_gm_texture_ptr = &MY_AP(coeff1_gm_tex); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 a failed"); + +#if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>(); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b failed"); + cudaBindTexture(0, coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c failed"); +#else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>(); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b-d failed"); + cudaBindTexture(0, coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c-d failed"); +#endif + + } else + cudaMemcpyToSymbol(MY_AP(coeff1), coeff1 , n); + } + + CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 failed"); + + if(ncoeff > 1) { + F_FLOAT coeff2[cuda_ntypes2]; + + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + coeff2[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff2[i][j]; + } + } + + if(use_global_params) { + cudaMemcpyToSymbol(MY_AP(coeff2_gm) , &sdata->pair.coeff2_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpy(sdata->pair.coeff2_gm.dev_data, 
coeff2, n, cudaMemcpyHostToDevice); + + _coeff2_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff2_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff2_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff2_gm_texture_ptr = &MY_AP(coeff2_gm_tex); + +#if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>(); + cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); +#else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>(); + cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); +#endif + + } else + cudaMemcpyToSymbol(MY_AP(coeff2), coeff2 , n); + } + + CUT_CHECK_ERROR("Cuda_Pair: init Coeff1 failed"); + + if(ncoeff > 2) { + F_FLOAT coeff3[cuda_ntypes2]; + + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + coeff3[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff3[i][j]; + } + } + + if(use_global_params) { + cudaMemcpyToSymbol(MY_AP(coeff3_gm) , &sdata->pair.coeff3_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpy(sdata->pair.coeff3_gm.dev_data, coeff3, n, cudaMemcpyHostToDevice); + _coeff3_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff3_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff3_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff3_gm_texture_ptr = &MY_AP(coeff3_gm_tex); + +#if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>(); + cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); +#else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>(); + cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); +#endif + } else + cudaMemcpyToSymbol(MY_AP(coeff3), coeff3 , n); + } + + CUT_CHECK_ERROR("Cuda_Pair: init Coeff3 failed"); + + if(ncoeff > 3) { + F_FLOAT coeff4[cuda_ntypes2]; + + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + coeff4[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff4[i][j]; + } + } + + if(use_global_params) { + cudaMemcpyToSymbol(MY_AP(coeff4_gm) , &sdata->pair.coeff4_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpy(sdata->pair.coeff4_gm.dev_data, coeff4, n, cudaMemcpyHostToDevice); + _coeff4_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff4_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff4_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff4_gm_texture_ptr = &MY_AP(coeff4_gm_tex); + +#if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>(); + cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); +#else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>(); + cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); +#endif + } else + cudaMemcpyToSymbol(MY_AP(coeff4), coeff4 , n); + } + + CUT_CHECK_ERROR("Cuda_Pair: init Coeff4 failed"); + + 
if(ncoeff > 4) { + F_FLOAT coeff5[cuda_ntypes2]; + + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + coeff5[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff5[i][j]; + } + } + + if(use_global_params) { + cudaMemcpyToSymbol(MY_AP(coeff5_gm) , &sdata->pair.coeff5_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpy(sdata->pair.coeff5_gm.dev_data, coeff5, n, cudaMemcpyHostToDevice); + _coeff5_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff5_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff5_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff5_gm_texture_ptr = &MY_AP(coeff5_gm_tex); + +#if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>(); + cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); +#else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>(); + cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); +#endif + } else + cudaMemcpyToSymbol(MY_AP(coeff5), coeff5 , n); + } + + CUT_CHECK_ERROR("Cuda_Pair: init Coeff5 failed"); + + if(ncoeff > 5) { + F_FLOAT coeff6[cuda_ntypes2]; + + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + coeff6[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff6[i][j]; + } + } + + if(use_global_params) { + cudaMemcpyToSymbol(MY_AP(coeff6_gm) , &sdata->pair.coeff6_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpy(sdata->pair.coeff6_gm.dev_data, coeff6, n, cudaMemcpyHostToDevice); + _coeff6_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff6_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff6_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff6_gm_texture_ptr = &MY_AP(coeff6_gm_tex); + +#if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>(); + cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); +#else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>(); + cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); +#endif + } + } + + CUT_CHECK_ERROR("Cuda_Pair: init Coeff6 failed"); + + if(ncoeff > 6) { + F_FLOAT coeff7[cuda_ntypes2]; + + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + coeff7[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff7[i][j]; + } + } + + if(use_global_params) { + cudaMemcpyToSymbol(MY_AP(coeff7_gm) , &sdata->pair.coeff7_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpy(sdata->pair.coeff7_gm.dev_data, coeff7, n, cudaMemcpyHostToDevice); + _coeff7_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff7_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff7_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff7_gm_texture_ptr = &MY_AP(coeff7_gm_tex); + +#if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>(); + cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); +#else + 
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>(); + cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); +#endif + } + } + + CUT_CHECK_ERROR("Cuda_Pair: init Coeff7 failed"); + + if(ncoeff > 7) { + F_FLOAT coeff8[cuda_ntypes2]; + + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + coeff8[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff8[i][j]; + } + } + + if(use_global_params) { + cudaMemcpyToSymbol(MY_AP(coeff8_gm) , &sdata->pair.coeff8_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpy(sdata->pair.coeff8_gm.dev_data, coeff8, n, cudaMemcpyHostToDevice); + _coeff8_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff8_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff8_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff8_gm_texture_ptr = &MY_AP(coeff8_gm_tex); + +#if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>(); + cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); +#else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>(); + cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); +#endif + } + } + + CUT_CHECK_ERROR("Cuda_Pair: init Coeff8 failed"); + + if(ncoeff > 8) { + F_FLOAT coeff9[cuda_ntypes2]; + + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + coeff9[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff9[i][j]; + } + } + + if(use_global_params) { + cudaMemcpyToSymbol(MY_AP(coeff9_gm) , &sdata->pair.coeff9_gm.dev_data , sizeof(F_FLOAT*)); + cudaMemcpy(sdata->pair.coeff9_gm.dev_data, coeff9, n, cudaMemcpyHostToDevice); + _coeff9_gm_tex.normalized = false; // access with normalized texture coordinates + _coeff9_gm_tex.filterMode = cudaFilterModePoint; // Point mode, so no + _coeff9_gm_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates + const textureReference* coeff9_gm_texture_ptr = &MY_AP(coeff9_gm_tex); + +#if F_PRECISION == 1 + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>(); + cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT)); +#else + cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>(); + cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2)); +#endif + } + } + + CUT_CHECK_ERROR("Cuda_Pair: init Coeff9 failed"); + + F_FLOAT special_lj[4]; + special_lj[0] = sdata->pair.special_lj[0]; + special_lj[1] = sdata->pair.special_lj[1]; + special_lj[2] = sdata->pair.special_lj[2]; + special_lj[3] = sdata->pair.special_lj[3]; + + + X_FLOAT box_size[3] = { + sdata->domain.subhi[0] - sdata->domain.sublo[0], + sdata->domain.subhi[1] - sdata->domain.sublo[1], + sdata->domain.subhi[2] - sdata->domain.sublo[2] + }; + + cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3); + cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned)); + cudaMemcpyToSymbol(MY_AP(special_lj) , special_lj , sizeof(F_FLOAT) * 4); + cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(eng_vdwl) , 
&sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3);
+  cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
+
+  if(need_q) {
+    F_FLOAT qqrd2e_tmp = sdata->pppm.qqrd2e;
+    F_FLOAT special_coul[4];
+    special_coul[0] = sdata->pair.special_coul[0];
+    special_coul[1] = sdata->pair.special_coul[1];
+    special_coul[2] = sdata->pair.special_coul[2];
+    special_coul[3] = sdata->pair.special_coul[3];
+
+    cudaMemcpyToSymbol(MY_AP(special_coul) , special_coul , sizeof(F_FLOAT) * 4);
+    cudaMemcpyToSymbol(MY_AP(g_ewald) , &sdata->pair.g_ewald , sizeof(F_FLOAT));
+    cudaMemcpyToSymbol(MY_AP(qqrd2e) , &qqrd2e_tmp , sizeof(F_FLOAT));
+    cudaMemcpyToSymbol(MY_AP(kappa) , &sdata->pair.kappa , sizeof(F_FLOAT));
+    cudaMemcpyToSymbol(MY_AP(eng_coul) , &sdata->pair.eng_coul.dev_data , sizeof(ENERGY_FLOAT*));
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init failed");
+}
+timespec startpairtime, endpairtime;
+//Function which is called prior to kernel invocation; determines the launch grid, binds textures, and updates constant memory if necessary
+void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, dim3 &grid, dim3 &threads, int &sharedperproc, bool need_q = false, int maxthreads = 256)
+{
+  if(sdata->atom.nlocal == 0) return;
+
+  if(sdata->atom.update_neigh)
+    Cuda_Pair_UpdateNeighbor_AllStyles(sdata, sneighlist);
+
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax_AllStyles(sdata, sneighlist);
+
+  if(sdata->atom.update_nlocal) {
+    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+    cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+  }
+
+
+
+  BindXTypeTexture(sdata);
+
+  if(need_q) BindQTexture(sdata);
+
+
+  sharedperproc = 0;
+
+  if(sdata->pair.use_block_per_atom) sharedperproc += 3;
+
+  if(eflag) sharedperproc += 1;
+
+  if(need_q && eflag) sharedperproc += 1;
+
+  if(vflag) sharedperproc += 6;
+
+  int threadnum = sneighlist->inum;
+
+  if(sdata->comm.comm_phase == 2) threadnum = sneighlist->inum_border2;
+
+  if(sdata->pair.use_block_per_atom) {
+    threadnum *= 64;
+    maxthreads = 64;
+  }
+
+  int3 layout = getgrid(threadnum, sharedperproc * sizeof(ENERGY_FLOAT), maxthreads, true); //need to limit to 192 threads due to register limit
+  threads.x = layout.z;
+  threads.y = 1;
+  threads.z = 1;
+  grid.x = layout.x;
+  grid.y = layout.y;
+  grid.z = 1;
+
+  int size = (unsigned)(layout.y * layout.x) * sharedperproc * sizeof(ENERGY_FLOAT);
+
+  if(sdata->pair.collect_forces_later) size += (unsigned)(sdata->atom.nmax * 3 * sizeof(F_FLOAT));
+
+  Cuda_UpdateBuffer(sdata, size);
+
+  if(sdata->pair.use_block_per_atom)
+    cudaMemset(sdata->buffer, 0, size);
+
+  sdata->pair.lastgridsize = grid.x * grid.y;
+  sdata->pair.n_energy_virial = sharedperproc;
+
+  if(sdata->pair.use_block_per_atom) sdata->pair.n_energy_virial -= 3;
+
+  clock_gettime(CLOCK_REALTIME, &startpairtime);
+
+  MYDBG(printf("# CUDA: Cuda_Pair: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);)
+}
+
+//Function which is called after the kernel invocation, collects energy and virial
+void Cuda_Pair_PostKernel_AllStyles(cuda_shared_data* sdata, dim3 &grid, int &sharedperproc, int eflag, int vflag)
+{
+  if((not sdata->pair.collect_forces_later) && (eflag || vflag)) { //not sdata->comm.comm_phase==2))
+    cudaThreadSynchronize();
+    clock_gettime(CLOCK_REALTIME, 
&endpairtime); + sdata->cuda_timings.pair_kernel += + endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000; + CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed"); + + if(eflag || vflag) { + int n = grid.x * grid.y; + + if(sdata->pair.use_block_per_atom) + grid.x = sharedperproc - 3; + else + grid.x = sharedperproc; + + grid.y = 1; + dim3 threads(128, 1, 1); + MYDBG(printf("# CUDA: Cuda_Pair: virial compute kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);) + MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Pair: virial compute Kernel execution failed"); + } + + MYDBG(printf("# CUDA: Cuda_Pair: kernel done\n");) + } +} + + +#include "pair_born_coul_long_cuda.cu" +#include "pair_buck_coul_cut_cuda.cu" +#include "pair_buck_coul_long_cuda.cu" +#include "pair_buck_cuda.cu" +#include "pair_lj_sdk_cuda.cu" +#include "pair_lj_sdk_coul_cut_cuda.cu" +#include "pair_lj_sdk_coul_debye_cuda.cu" +#include "pair_lj_sdk_coul_long_cuda.cu" +#include "pair_gran_hooke_cuda.cu" +#include "pair_lj_charmm_coul_charmm_implicit_cuda.cu" +#include "pair_lj_charmm_coul_charmm_cuda.cu" +#include "pair_lj_charmm_coul_long_cuda.cu" +#include "pair_lj_class2_coul_cut_cuda.cu" +#include "pair_lj_class2_coul_long_cuda.cu" +#include "pair_lj_class2_cuda.cu" +#include "pair_lj_cut_coul_cut_cuda.cu" +#include "pair_lj_cut_coul_debye_cuda.cu" +#include "pair_lj_cut_coul_long_cuda.cu" +#include "pair_lj_cut_cuda.cu" +#include "pair_lj_cut_experimental_cuda.cu" +#include "pair_lj_expand_cuda.cu" +#include "pair_lj_gromacs_cuda.cu" +#include "pair_lj_gromacs_coul_gromacs_cuda.cu" +#include "pair_lj_smooth_cuda.cu" +#include "pair_lj96_cut_cuda.cu" +#include "pair_morse_coul_long_cuda.cu" +#include "pair_morse_cuda.cu" +#include "pair_eam_cuda.cu" + +#include "cuda_pair_kernel.cu" + +#include "pair_manybody_const.h" +#include "pair_tersoff_cuda.cu" +#include "pair_sw_cuda.cu" + +void Cuda_Pair_UpdateNmax(cuda_shared_data* sdata) +{ + CUT_CHECK_ERROR("Cuda_Pair: before updateNmax failed"); + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*)); + cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(radius) , & sdata->atom.radius .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_FLOAT4*)); + cudaMemcpyToSymbol(MY_AP(omega) , & sdata->atom.omega .dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(omega_rmass), & sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*)); + cudaMemcpyToSymbol(MY_AP(map_array), & sdata->atom.map_array .dev_data, sizeof(int*)); + CUT_CHECK_ERROR("Cuda_Pair: updateNmax failed"); +} + + +void Cuda_Pair_GenerateXType(cuda_shared_data* sdata) +{ + MYDBG(printf(" # CUDA: GenerateXType ... 
start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)
+
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal) {
+    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+    cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+  }
+
+  MYDBG(printf(" # CUDA: GenerateXType ... getgrid\n"); fflush(stdout);)
+
+  int3 layout = getgrid(sdata->atom.nall);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  MYDBG(printf(" # CUDA: GenerateXType ... kernel start test\n"); fflush(stdout);)
+  Pair_GenerateXType_Kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed");
+  MYDBG(printf(" # CUDA: GenerateXType ... end\n"); fflush(stdout);)
+}
+
+void Cuda_Pair_RevertXType(cuda_shared_data* sdata)
+{
+  MYDBG(printf(" # CUDA: RevertXType ... start\n");)
+
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax(sdata);
+
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+
+  int3 layout = getgrid(sdata->atom.nall);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Pair_RevertXType_Kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair RevertXType: Kernel failed");
+  MYDBG(printf(" # CUDA: RevertXType ... end\n");)
+}
+
+void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata)
+{
+  MYDBG(printf(" # CUDA: GenerateVRadius ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)
+
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax(sdata);
+
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+  MYDBG(printf(" # CUDA: GenerateVRadius ... getgrid\n"); fflush(stdout);)
+
+  int3 layout = getgrid(sdata->atom.nall);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  MYDBG(printf(" # CUDA: GenerateVRadius ... kernel start test\n"); fflush(stdout);)
+  Pair_GenerateVRadius_Kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair GenerateVRadius: Kernel failed");
+  MYDBG(printf(" # CUDA: GenerateVRadius ... end\n"); fflush(stdout);)
+}
+
+void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata)
+{
+  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)
+
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax(sdata);
+
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... getgrid\n"); fflush(stdout);)
+
+  int3 layout = getgrid(sdata->atom.nall);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... kernel start test\n"); fflush(stdout);)
+  Pair_GenerateOmegaRmass_Kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair GenerateOmegaRmass: Kernel failed");
+  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... end\n"); fflush(stdout);)
+}
+
+void Cuda_Pair_BuildXHold(cuda_shared_data* sdata)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax(sdata);
+
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int));
+
+  int3 layout = getgrid(sdata->atom.nall);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Pair_BuildXHold_Kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair BuildXHold: Kernel failed");
+}
+
+void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag)
+{
+  cudaThreadSynchronize();
+  clock_gettime(CLOCK_REALTIME, &endpairtime);
+  sdata->cuda_timings.pair_kernel +=
+    endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000;
+  CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed");
+  dim3 threads;
+  dim3 grid;
+
+  if(eflag || vflag) {
+    int n = sdata->pair.lastgridsize;
+    grid.x = sdata->pair.n_energy_virial;
+    grid.y = 1;
+    threads.x = 128;
+    //printf("A grid.x: %i\n",grid.x);
+    MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_Pair_CollectForces: virial compute Kernel execution failed");
+  }
+
+  int3 layout = getgrid(sdata->atom.nlocal);
+  threads.x = layout.z;
+  grid.x = layout.x;
+  grid.y = layout.y;
+  Pair_CollectForces_Kernel <<< grid, threads, 0>>>(sdata->pair.n_energy_virial, sdata->pair.lastgridsize);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair_CollectForces: Force Summation Kernel execution failed");
+
+}
diff --git a/lib/cuda/cuda_pair_cu.h b/lib/cuda/cuda_pair_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..bec7e82229d380b420c2e451f60097c480707971
--- /dev/null
+++ b/lib/cuda/cuda_pair_cu.h
@@ -0,0 +1,30 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */ +#include "cuda_shared.h" + +extern "C" void Cuda_Pair_GenerateXType(cuda_shared_data* sdata); +extern "C" void Cuda_Pair_RevertXType(cuda_shared_data* sdata); +extern "C" void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata); +extern "C" void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata); +extern "C" void Cuda_Pair_BuildXHold(cuda_shared_data* sdata); +extern "C" void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag); diff --git a/lib/cuda/cuda_pair_kernel.cu b/lib/cuda/cuda_pair_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..2c697f9c7ea278c03aa3de265c374fdf1e127cb2 --- /dev/null +++ b/lib/cuda/cuda_pair_kernel.cu @@ -0,0 +1,1437 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ +#define EWALD_F 1.12837917 +#define EWALD_P 0.3275911 +#define A1 0.254829592 +#define A2 -0.284496736 +#define A3 1.421413741 +#define A4 -1.453152027 +#define A5 1.061405429 + + +template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data> +__global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + ENERGY_FLOAT ecoul = ENERGY_F(0.0); + + ENERGY_FLOAT* sharedE; + ENERGY_FLOAT* sharedECoul; + ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + + if(eflag || eflag_atom) { + sharedE = &sharedmem[threadIdx.x]; + sharedE[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + + if(coul_type != COUL_NONE) { + sharedECoul = sharedE + blockDim.x; + sharedECoul[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + } + } + + if(vflag || vflag_atom) { + sharedV[0 * blockDim.x] = ENERGY_F(0.0); + sharedV[1 * blockDim.x] = ENERGY_F(0.0); + sharedV[2 * blockDim.x] = ENERGY_F(0.0); + sharedV[3 * blockDim.x] = ENERGY_F(0.0); + sharedV[4 * blockDim.x] = ENERGY_F(0.0); + sharedV[5 * blockDim.x] = ENERGY_F(0.0); + } + + int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + X_FLOAT xtmp, ytmp, ztmp; + X_FLOAT4 myxtype; + F_FLOAT fxtmp, fytmp, fztmp, fpair; + F_FLOAT delx, dely, delz; + F_FLOAT factor_lj, factor_coul; + F_FLOAT qtmp; + int itype, i, j; + int jnum = 0; + int* jlist; + + if(ii < _inum) { + i = _ilist[ii]; + + myxtype = fetchXType(i); + xtmp = myxtype.x; + ytmp = myxtype.y; + ztmp = myxtype.z; + itype = static_cast <int>(myxtype.w); + + + fxtmp = F_F(0.0); + fytmp = F_F(0.0); + fztmp = F_F(0.0); + + if(coul_type != COUL_NONE) + qtmp = fetchQ(i); + + jnum = _numneigh[i]; + jlist = &_neighbors[i]; + } + + __syncthreads(); + + for(int jj = 0; jj < jnum; jj++) { + if(ii < _inum) + if(jj < jnum) { + fpair = F_F(0.0); + j = jlist[jj * _nlocal]; + factor_lj = 
_special_lj[sbmask(j)]; + + if(coul_type != COUL_NONE) + factor_coul = _special_coul[sbmask(j)]; + + j &= NEIGHMASK; + + myxtype = fetchXType(j); + delx = xtmp - myxtype.x; + dely = ytmp - myxtype.y; + delz = ztmp - myxtype.z; + int jtype = static_cast <int>(myxtype.w); + + + const F_FLOAT rsq = delx * delx + dely * dely + delz * delz; + + bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]); + + if(in_cutoff) { + switch(pair_type) { + case PAIR_BORN: + fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_BUCK: + fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_CG_CMM: + fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_CHARMM: + fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_CLASS2: + fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_CUT: + fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_EXPAND: + fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_GROMACS: + fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_SMOOTH: + fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ96_CUT: + fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_MORSE_R6: + fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_MORSE: + fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + } + } + + if(coul_type != COUL_NONE) { + const F_FLOAT qiqj = qtmp * fetchQ(j); + + if(qiqj * qiqj > 1e-8) { + const bool in_coul_cutoff = + rsq < (_cut_coulsq_global > X_F(0.0) ? 
_cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]); + + if(in_coul_cutoff) { + switch(coul_type) { + case COUL_CHARMM: + fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj); + break; + + case COUL_CHARMM_IMPLICIT: + fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj); + break; + + case COUL_CUT: { + const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq); + + if(eflag) { + ecoul += forcecoul; + } + + fpair += forcecoul * (F_F(1.0) / rsq); + } + break; + + case COUL_DEBYE: { + const F_FLOAT r2inv = F_F(1.0) / rsq; + const X_FLOAT r = _RSQRT_(r2inv); + const X_FLOAT rinv = F_F(1.0) / r; + const F_FLOAT screening = _EXP_(-_kappa * r); + F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; + + if(eflag) { + ecoul += forcecoul * rinv; + } + + forcecoul *= (_kappa + rinv); + fpair += forcecoul * r2inv; + } + break; + + case COUL_GROMACS: + fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj); + break; + + case COUL_LONG: { + const F_FLOAT r2inv = F_F(1.0) / rsq; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT grij = _g_ewald * r; + const F_FLOAT expm2 = _EXP_(-grij * grij); + const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij); + const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2; + const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r); + F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); + + if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor; + + if(eflag) { + ecoul += prefactor * erfc; + + if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor; + } + + fpair += forcecoul * r2inv; + } + break; + } + } + + in_cutoff = in_cutoff || in_coul_cutoff; + } + } + + + if(in_cutoff) { + F_FLOAT dxfp, dyfp, dzfp; + fxtmp += dxfp = delx * fpair; + fytmp += dyfp = dely * fpair; + fztmp += dzfp = delz * fpair; + + if(vflag) { + sharedV[0 * blockDim.x] += delx * dxfp; + sharedV[1 * blockDim.x] += dely * dyfp; + sharedV[2 * blockDim.x] += delz * dzfp; + sharedV[3 * blockDim.x] += delx * dyfp; + sharedV[4 * blockDim.x] += delx * dzfp; + sharedV[5 * blockDim.x] += dely * dzfp; + } + } + } + } + + __syncthreads(); + + if(ii < _inum) { + F_FLOAT* my_f; + + if(_collect_forces_later) { + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + + if(eflag) { + buffer = &buffer[1 * gridDim.x * gridDim.y]; + + if(coul_type != COUL_NONE) + buffer = &buffer[1 * gridDim.x * gridDim.y]; + } + + if(vflag) { + buffer = &buffer[6 * gridDim.x * gridDim.y]; + } + + my_f = (F_FLOAT*) buffer; + my_f += i; + *my_f = fxtmp; + my_f += _nmax; + *my_f = fytmp; + my_f += _nmax; + *my_f = fztmp; + } else { + my_f = _f + i; + *my_f += fxtmp; + my_f += _nmax; + *my_f += fytmp; + my_f += _nmax; + *my_f += fztmp; + } + } + + __syncthreads(); + + if(eflag) { + sharedE[0] = evdwl; + + if(coul_type != COUL_NONE) + sharedECoul[0] = ecoul; + } + + if(eflag_atom && i < _nlocal) { + if(coul_type != COUL_NONE) + _eatom[i] += evdwl + ecoul; + else + _eatom[i] += evdwl; + } + + if(vflag_atom && i < _nlocal) { + _vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x]; + _vatom[i + _nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x]; + _vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x]; + _vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x]; + _vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x]; + _vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x]; + } + + if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, 
coul_type != COUL_NONE ? 1 : 0); +} + +template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data> +__global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + int ii = (blockIdx.x * gridDim.y + blockIdx.y); + + if(ii >= _inum) + return; + + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + ENERGY_FLOAT ecoul = ENERGY_F(0.0); + F_FLOAT3* sharedVirial1; + F_FLOAT3* sharedVirial2; + F_FLOAT* sharedEnergy; + F_FLOAT* sharedEnergyCoul; + + F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0]; + + if(vflag) { + sharedVirial1 = &sharedForce[64]; + sharedVirial2 = &sharedVirial1[64]; + } else { + sharedVirial1 = &sharedForce[0]; + sharedVirial2 = &sharedVirial1[0]; + } + + if(eflag) { + if(vflag || vflag_atom) + sharedEnergy = (F_FLOAT*) &sharedVirial2[64]; + else + sharedEnergy = (F_FLOAT*) &sharedForce[64]; + + if(coul_type != COUL_NONE) + sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64]; + + } + + F_FLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) }; + F_FLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) }; + F_FLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) }; + + X_FLOAT xtmp, ytmp, ztmp; + X_FLOAT4 myxtype; + F_FLOAT delx, dely, delz; + F_FLOAT factor_lj, factor_coul; + F_FLOAT fpair; + F_FLOAT qtmp; + int itype, jnum, i, j; + int* jlist; + + i = _ilist[ii]; + + myxtype = fetchXType(i); + + xtmp = myxtype.x; + ytmp = myxtype.y; + ztmp = myxtype.z; + itype = static_cast <int>(myxtype.w); + + if(coul_type != COUL_NONE) + qtmp = fetchQ(i); + + jnum = _numneigh[i]; + + jlist = &_neighbors[i * _maxneighbors]; + __syncthreads(); + + for(int jj = threadIdx.x; jj < jnum + blockDim.x; jj += blockDim.x) { + if(jj < jnum) { + fpair = F_F(0.0); + j = jlist[jj]; + factor_lj = _special_lj[sbmask(j)]; + + if(coul_type != COUL_NONE) + factor_coul = _special_coul[sbmask(j)]; + + j &= NEIGHMASK; + + myxtype = fetchXType(j); + + delx = xtmp - myxtype.x; + dely = ytmp - myxtype.y; + delz = ztmp - myxtype.z; + int jtype = static_cast <int>(myxtype.w); + + const F_FLOAT rsq = delx * delx + dely * dely + delz * delz; + + bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? 
_cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]); + bool in_coul_cutoff; + + if(in_cutoff) { + switch(pair_type) { + case PAIR_BORN: + fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_BUCK: + fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_CG_CMM: + fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_CHARMM: + fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_CLASS2: + fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_CUT: + fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_EXPAND: + fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_GROMACS: + fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_SMOOTH: + fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ96_CUT: + fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_MORSE_R6: + fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_MORSE: + fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + } + } + + if(coul_type != COUL_NONE) { + const F_FLOAT qiqj = qtmp * fetchQ(j); + + if(qiqj * qiqj > (1e-8f)) { + in_coul_cutoff = + rsq < (_cut_coulsq_global > X_F(0.0) ? 
_cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]); + + if(in_coul_cutoff) { + switch(coul_type) { + case COUL_CHARMM: + fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj); + break; + + case COUL_CHARMM_IMPLICIT: + fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj); + break; + + case COUL_GROMACS: + fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj); + break; + + case COUL_LONG: { + const F_FLOAT r2inv = F_F(1.0) / rsq; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT grij = _g_ewald * r; + const F_FLOAT expm2 = _EXP_(-grij * grij); + const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij); + const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2; + const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r); + F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); + + if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor; + + if(eflag) { + ecoul += prefactor * erfc; + + if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor; + } + + fpair += forcecoul * r2inv; + } + break; + + case COUL_DEBYE: { + const F_FLOAT r2inv = F_F(1.0) / rsq; + const X_FLOAT r = _RSQRT_(r2inv); + const X_FLOAT rinv = F_F(1.0) / r; + const F_FLOAT screening = _EXP_(-_kappa * r); + F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; + + if(eflag) { + ecoul += forcecoul * rinv; + } + + forcecoul *= (_kappa + rinv); + fpair += forcecoul * r2inv; + } + break; + + case COUL_CUT: { + const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq); + + if(eflag) { + ecoul += forcecoul; + } + + fpair += forcecoul * (F_F(1.0) / rsq); + } + break; + + + } + } + } + } + + + + if(in_cutoff || in_coul_cutoff) { + F_FLOAT dxfp, dyfp, dzfp; + partialForce.x += dxfp = delx * fpair; + partialForce.y += dyfp = dely * fpair; + partialForce.z += dzfp = delz * fpair; + + if(vflag) { + partialVirial1.x += delx * dxfp; + partialVirial1.y += dely * dyfp; + partialVirial1.z += delz * dzfp; + partialVirial2.x += delx * dyfp; + partialVirial2.y += delx * dzfp; + partialVirial2.z += dely * dzfp; + } + } + } + } + + if(eflag) { + sharedEnergy[threadIdx.x] = evdwl; + + if(coul_type != COUL_NONE) + sharedEnergyCoul[threadIdx.x] = ecoul; + } + + sharedForce[threadIdx.x] = partialForce; + + if(vflag) { + sharedVirial1[threadIdx.x] = partialVirial1; + sharedVirial2[threadIdx.x] = partialVirial2; + } + + __syncthreads(); + + + for(unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) { + + if(threadIdx.x < s) { + sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x; + sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y; + sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z; + + if(vflag) { + sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x; + sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y; + sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z; + + sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x; + sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y; + sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z; + } + + if(eflag) { + sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ]; + + if(coul_type != COUL_NONE) + sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ]; + } + } + + __syncthreads(); + } + + if(threadIdx.x == 0) { + + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + + if(eflag) { + ENERGY_FLOAT 
tmp_evdwl; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergy[0]; + + if(eflag_atom) + _eatom[i] = tmp_evdwl; + + buffer = &buffer[gridDim.x * gridDim.y]; + + if(coul_type != COUL_NONE) { + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergyCoul[0]; + + if(eflag_atom) + _eatom[i] += tmp_evdwl; + + buffer = &buffer[gridDim.x * gridDim.y]; + } + } + + if(vflag) { + ENERGY_FLOAT tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].x; + + if(vflag_atom) _vatom[i + 0 * _nmax] = tmp; + + buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].y; + + if(vflag_atom) _vatom[i + 1 * _nmax] = tmp; + + buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].z; + + if(vflag_atom) _vatom[i + 2 * _nmax] = tmp; + + buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].x; + + if(vflag_atom) _vatom[i + 3 * _nmax] = tmp; + + buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].y; + + if(vflag_atom) _vatom[i + 4 * _nmax] = tmp; + + buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].z; + + if(vflag_atom) _vatom[i + 5 * _nmax] = tmp; + + buffer = &buffer[6 * gridDim.x * gridDim.y]; + } + + F_FLOAT* my_f; + + if(_collect_forces_later) { + my_f = (F_FLOAT*) buffer; + my_f += i; + *my_f = sharedForce[0].x; + my_f += _nmax; + *my_f = sharedForce[0].y; + my_f += _nmax; + *my_f = sharedForce[0].z; + } else { + my_f = _f + i; + *my_f += sharedForce[0].x; + my_f += _nmax; + *my_f += sharedForce[0].y; + my_f += _nmax; + *my_f += sharedForce[0].z; + } + } +} + + +template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data> +__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase) +{ + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + ENERGY_FLOAT ecoul = ENERGY_F(0.0); + + ENERGY_FLOAT* sharedE; + ENERGY_FLOAT* sharedECoul; + ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + + if(eflag || eflag_atom) { + sharedE = &sharedmem[threadIdx.x]; + sharedE[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + + if(coul_type != COUL_NONE) { + sharedECoul = sharedE + blockDim.x; + sharedECoul[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + } + } + + if(vflag || vflag_atom) { + sharedV[0 * blockDim.x] = ENERGY_F(0.0); + sharedV[1 * blockDim.x] = ENERGY_F(0.0); + sharedV[2 * blockDim.x] = ENERGY_F(0.0); + sharedV[3 * blockDim.x] = ENERGY_F(0.0); + sharedV[4 * blockDim.x] = ENERGY_F(0.0); + sharedV[5 * blockDim.x] = ENERGY_F(0.0); + } + + int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + X_FLOAT xtmp, ytmp, ztmp; + X_FLOAT4 myxtype; + F_FLOAT fxtmp, fytmp, fztmp, fpair; + F_FLOAT delx, dely, delz; + F_FLOAT factor_lj, factor_coul; + F_FLOAT qtmp; + int itype, i, j; + int jnum = 0; + int* jlist; + + if(ii < (comm_phase < 2 ? _inum : _inum_border[0])) { + i = comm_phase < 2 ? 
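+/* Each block above writes one partial value per accumulated quantity into
+   _buffer, strided by gridDim.x * gridDim.y, and the ENERGY_F(0.5) factor
+   compensates for full neighbor lists visiting every i-j pair from both
+   atoms.  A sketch of the host-side buffer size this layout implies
+   (illustrative, assuming the names used in these kernels):
+
+   size_t nblocks = grid.x * grid.y;
+   int nquant = (eflag ? (coul ? 2 : 1) : 0) + (vflag ? 6 : 0);
+   size_t bytes = nquant * nblocks * sizeof(ENERGY_FLOAT);
+   if(collect_forces_later) bytes += 3 * nmax * sizeof(F_FLOAT);
+*/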
_ilist[ii] : _ilist_border[ii] ; + + myxtype = fetchXType(i); + myxtype = _x_type[i]; + xtmp = myxtype.x; + ytmp = myxtype.y; + ztmp = myxtype.z; + itype = static_cast <int>(myxtype.w); + + + fxtmp = F_F(0.0); + fytmp = F_F(0.0); + fztmp = F_F(0.0); + + if(coul_type != COUL_NONE) + qtmp = fetchQ(i); + + jnum = comm_phase == 0 ? _numneigh[i] : (comm_phase == 1 ? _numneigh_inner[i] : _numneigh_border[ii]); + + + jlist = comm_phase == 0 ? &_neighbors[i] : (comm_phase == 1 ? &_neighbors_inner[i] : &_neighbors_border[ii]); + } + + __syncthreads(); + + for(int jj = 0; jj < jnum; jj++) { + if(ii < (comm_phase < 2 ? _inum : _inum_border[0])) + if(jj < jnum) { + fpair = F_F(0.0); + j = jlist[jj * _nlocal]; + + factor_lj = j < _nall ? F_F(1.0) : _special_lj[j / _nall]; + + if(coul_type != COUL_NONE) + factor_coul = j < _nall ? F_F(1.0) : _special_coul[j / _nall]; + + j = j < _nall ? j : j % _nall; + + myxtype = fetchXType(j); + delx = xtmp - myxtype.x; + dely = ytmp - myxtype.y; + delz = ztmp - myxtype.z; + int jtype = static_cast <int>(myxtype.w); + + + const F_FLOAT rsq = delx * delx + dely * dely + delz * delz; + + bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]); + + if(in_cutoff) { + switch(pair_type) { + case PAIR_BORN: + fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_BUCK: + fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_CG_CMM: + fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_CHARMM: + fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_CLASS2: + fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_CUT: + fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_EXPAND: + fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_GROMACS: + fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_SMOOTH: + fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ96_CUT: + fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_MORSE_R6: + fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_MORSE: + fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + } + } + + if(coul_type != COUL_NONE) { + const F_FLOAT qiqj = qtmp * fetchQ(j); + + if(qiqj * qiqj > 1e-8) { + const bool in_coul_cutoff = + rsq < (_cut_coulsq_global > X_F(0.0) ? 
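+/* Neighbor indices j >= _nall encode the special-bond class: the list stores
+   j + n * _nall, where n = 1, 2, 3 selects the 1-2 / 1-3 / 1-4 scaling
+   factor.  The decode used above, written out (sketch):
+
+   int n     = j / _nall;                      // 0 for plain neighbors
+   F_FLOAT w = n ? _special_lj[n] : F_F(1.0);  // per-class LJ weight
+   j         = j % _nall;                      // recover the real atom index
+*/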
_cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]); + + if(in_coul_cutoff) { + switch(coul_type) { + case COUL_CHARMM: + fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj); + break; + + case COUL_CHARMM_IMPLICIT: + fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj); + break; + + case COUL_CUT: { + const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq); + + if(eflag) { + ecoul += forcecoul; + } + + fpair += forcecoul * (F_F(1.0) / rsq); + } + break; + + case COUL_DEBYE: { + const F_FLOAT r2inv = F_F(1.0) / rsq; + const X_FLOAT r = _RSQRT_(r2inv); + const X_FLOAT rinv = F_F(1.0) / r; + const F_FLOAT screening = _EXP_(-_kappa * r); + F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; + + if(eflag) { + ecoul += forcecoul * rinv; + } + + forcecoul *= (_kappa + rinv); + fpair += forcecoul * r2inv; + } + break; + + case COUL_GROMACS: + fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj); + break; + + case COUL_LONG: { + const F_FLOAT r2inv = F_F(1.0) / rsq; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT grij = _g_ewald * r; + const F_FLOAT expm2 = _EXP_(-grij * grij); + const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij); + const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2; + const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r); + F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); + + if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor; + + if(eflag) { + ecoul += prefactor * erfc; + + if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor; + } + + fpair += forcecoul * r2inv; + } + break; + + } + } + + in_cutoff = in_cutoff || in_coul_cutoff; + } + } + + + if(in_cutoff) { + F_FLOAT dxfp, dyfp, dzfp; + fxtmp += dxfp = delx * fpair; + fytmp += dyfp = dely * fpair; + fztmp += dzfp = delz * fpair; + + if(vflag) { + sharedV[0 * blockDim.x] += delx * dxfp; + sharedV[1 * blockDim.x] += dely * dyfp; + sharedV[2 * blockDim.x] += delz * dzfp; + sharedV[3 * blockDim.x] += delx * dyfp; + sharedV[4 * blockDim.x] += delx * dzfp; + sharedV[5 * blockDim.x] += dely * dzfp; + } + } + } + } + + __syncthreads(); + + if(ii < (comm_phase < 2 ? 
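+/* The COUL_LONG branch above evaluates erfc(g*r) with the Abramowitz & Stegun
+   7.1.26 rational fit (EWALD_P, A1..A5), accurate to roughly 1.5e-7, instead
+   of calling erfc directly.  A host-side sanity check of the fit might look
+   like this (sketch; check_erfc_fit is an illustrative name):
+
+   #include <math.h>
+   double check_erfc_fit(double x)
+   {
+     const double p = 0.3275911, a1 = 0.254829592, a2 = -0.284496736,
+                  a3 = 1.421413741, a4 = -1.453152027, a5 = 1.061405429;
+     double t   = 1.0 / (1.0 + p * x);
+     double fit = t * (a1 + t * (a2 + t * (a3 + t * (a4 + t * a5)))) * exp(-x * x);
+     return fit - erfc(x);   // |result| should stay below ~1.5e-7
+   }
+*/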
_inum : _inum_border[0])) { + F_FLOAT* my_f; + + if(_collect_forces_later) { + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + + if(eflag) { + buffer = &buffer[1 * gridDim.x * gridDim.y]; + + if(coul_type != COUL_NONE) + buffer = &buffer[1 * gridDim.x * gridDim.y]; + } + + if(vflag) { + buffer = &buffer[6 * gridDim.x * gridDim.y]; + } + + my_f = (F_FLOAT*) buffer; + my_f += i; + *my_f = fxtmp; + my_f += _nmax; + *my_f = fytmp; + my_f += _nmax; + *my_f = fztmp; + } else { + my_f = _f + i; + *my_f += fxtmp; + my_f += _nmax; + *my_f += fytmp; + my_f += _nmax; + *my_f += fztmp; + } + } + + __syncthreads(); + + if(eflag) { + sharedE[0] = evdwl; + + if(coul_type != COUL_NONE) + sharedECoul[0] = ecoul; + } + + if(eflag_atom && i < _nlocal) { + if(coul_type != COUL_NONE) + _eatom[i] += evdwl + ecoul; + else + _eatom[i] += evdwl; + } + + if(vflag_atom && i < _nlocal) { + _vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x]; + _vatom[i + _nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x]; + _vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x]; + _vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x]; + _vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x]; + _vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x]; + } + + if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, coul_type != COUL_NONE ? 1 : 0); +} + +template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data> +__global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase) +{ + int ii = (blockIdx.x * gridDim.y + blockIdx.y); + + if(ii >= (comm_phase < 2 ? _inum : _inum_border[0])) + return; + + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + ENERGY_FLOAT ecoul = ENERGY_F(0.0); + F_FLOAT3* sharedVirial1; + F_FLOAT3* sharedVirial2; + F_FLOAT* sharedEnergy; + F_FLOAT* sharedEnergyCoul; + + F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0]; + + if(vflag) { + sharedVirial1 = &sharedForce[64]; + sharedVirial2 = &sharedVirial1[64]; + } else { + sharedVirial1 = &sharedForce[0]; + sharedVirial2 = &sharedVirial1[0]; + } + + if(eflag) { + if(vflag || vflag_atom) + sharedEnergy = (F_FLOAT*) &sharedVirial2[64]; + else + sharedEnergy = (F_FLOAT*) &sharedForce[64]; + + if(coul_type != COUL_NONE) + sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64]; + + } + + F_FLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) }; + F_FLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) }; + F_FLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) }; + + X_FLOAT xtmp, ytmp, ztmp; + X_FLOAT4 myxtype; + F_FLOAT delx, dely, delz; + F_FLOAT factor_lj, factor_coul; + F_FLOAT fpair; + F_FLOAT qtmp; + int itype, jnum, i, j; + int* jlist; + + i = comm_phase < 2 ? _ilist[ii] : _ilist_border[ii]; + + myxtype = fetchXType(i); + + xtmp = myxtype.x; + ytmp = myxtype.y; + ztmp = myxtype.z; + itype = static_cast <int>(myxtype.w); + + if(coul_type != COUL_NONE) + qtmp = fetchQ(i); + + jnum = comm_phase == 0 ? _numneigh[i] : (comm_phase == 1 ? _numneigh_inner[i] : _numneigh_border[ii]); + + jlist = comm_phase == 0 ? &_neighbors[i * _maxneighbors] : (comm_phase == 1 ? &_neighbors_inner[i * _maxneighbors] : &_neighbors_border[ii * _maxneighbors]); + __syncthreads(); + + for(int jj = threadIdx.x; jj < jnum + blockDim.x; jj += blockDim.x) { + if(jj < jnum) { + fpair = F_F(0.0); + j = jlist[jj]; + factor_lj = j < _nall ? F_F(1.0) : _special_lj[j / _nall]; + + if(coul_type != COUL_NONE) + factor_coul = j < _nall ? F_F(1.0) : _special_coul[j / _nall]; + + j = j < _nall ? 
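+/* The shared-memory offsets above hard-wire 64 threads per block, so the host
+   launch has to pass a matching dynamic shared-memory size.  Roughly (sketch,
+   assuming the flag names used here):
+
+   size_t shared = 64 * sizeof(F_FLOAT3);                      // forces
+   if(vflag) shared += 2 * 64 * sizeof(F_FLOAT3);              // virial parts
+   if(eflag) shared += (coul ? 2 : 1) * 64 * sizeof(F_FLOAT);  // energies
+*/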
j : j % _nall; + + myxtype = fetchXType(j); + + delx = xtmp - myxtype.x; + dely = ytmp - myxtype.y; + delz = ztmp - myxtype.z; + int jtype = static_cast <int>(myxtype.w); + + const F_FLOAT rsq = delx * delx + dely * dely + delz * delz; + + bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]); + bool in_coul_cutoff; + + if(in_cutoff) { + switch(pair_type) { + case PAIR_BORN: + fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_BUCK: + fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_CG_CMM: + fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_CHARMM: + fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_CLASS2: + fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_CUT: + fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_EXPAND: + fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_GROMACS: + fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ_SMOOTH: + fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_LJ96_CUT: + fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_MORSE_R6: + fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + + case PAIR_MORSE: + fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl); + break; + } + } + + if(coul_type != COUL_NONE) { + const F_FLOAT qiqj = qtmp * fetchQ(j); + + if(qiqj * qiqj > (1e-8f)) { + in_coul_cutoff = + rsq < (_cut_coulsq_global > X_F(0.0) ? 
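+/* pair_type and coul_type are template parameters, so the switch above is
+   resolved at compile time: each instantiation keeps exactly one pair branch
+   and the dead cases are eliminated by the compiler.  An illustrative
+   instantiation (not a call site in this file):
+
+   Pair_Kernel_BpA_opt<PAIR_LJ_CUT, COUL_LONG, 0>
+     <<<grid, threads, shared>>>(eflag, vflag, eflag_atom, vflag_atom, 0);
+*/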
_cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]); + + if(in_coul_cutoff) { + switch(coul_type) { + case COUL_CHARMM: + fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj); + break; + + case COUL_CHARMM_IMPLICIT: + fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj); + break; + + case COUL_GROMACS: + fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj); + break; + + case COUL_LONG: { + const F_FLOAT r2inv = F_F(1.0) / rsq; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT grij = _g_ewald * r; + const F_FLOAT expm2 = _EXP_(-grij * grij); + const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij); + const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2; + const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r); + F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); + + if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor; + + if(eflag) { + ecoul += prefactor * erfc; + + if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor; + } + + fpair += forcecoul * r2inv; + } + break; + + case COUL_DEBYE: { + const F_FLOAT r2inv = F_F(1.0) / rsq; + const X_FLOAT r = _RSQRT_(r2inv); + const X_FLOAT rinv = F_F(1.0) / r; + const F_FLOAT screening = _EXP_(-_kappa * r); + F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ; + + if(eflag) { + ecoul += forcecoul * rinv; + } + + forcecoul *= (_kappa + rinv); + fpair += forcecoul * r2inv; + } + break; + + case COUL_CUT: { + const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq); + + if(eflag) { + ecoul += forcecoul; + } + + fpair += forcecoul * (F_F(1.0) / rsq); + } + break; + + + } + } + } + } + + + + if(in_cutoff || in_coul_cutoff) { + F_FLOAT dxfp, dyfp, dzfp; + partialForce.x += dxfp = delx * fpair; + partialForce.y += dyfp = dely * fpair; + partialForce.z += dzfp = delz * fpair; + + if(vflag) { + partialVirial1.x += delx * dxfp; + partialVirial1.y += dely * dyfp; + partialVirial1.z += delz * dzfp; + partialVirial2.x += delx * dyfp; + partialVirial2.y += delx * dzfp; + partialVirial2.z += dely * dzfp; + } + } + } + } + + if(eflag) { + sharedEnergy[threadIdx.x] = evdwl; + + if(coul_type != COUL_NONE) + sharedEnergyCoul[threadIdx.x] = ecoul; + } + + sharedForce[threadIdx.x] = partialForce; + + if(vflag) { + sharedVirial1[threadIdx.x] = partialVirial1; + sharedVirial2[threadIdx.x] = partialVirial2; + } + + __syncthreads(); + + + for(unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) { + + if(threadIdx.x < s) { + sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x; + sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y; + sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z; + + if(vflag) { + sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x; + sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y; + sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z; + + sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x; + sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y; + sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z; + } + + if(eflag) { + sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ]; + + if(coul_type != COUL_NONE) + sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ]; + } + } + + __syncthreads(); + } + + if(threadIdx.x == 0) { + + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + + if(eflag) { + ENERGY_FLOAT 
tmp_evdwl; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergy[0]; + + if(eflag_atom) + _eatom[i] = tmp_evdwl; + + buffer = &buffer[gridDim.x * gridDim.y]; + + if(coul_type != COUL_NONE) { + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergyCoul[0]; + + if(eflag_atom) + _eatom[i] += tmp_evdwl; + + buffer = &buffer[gridDim.x * gridDim.y]; + } + } + + if(vflag) { + ENERGY_FLOAT tmp; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].x; + + if(vflag_atom) _vatom[i + 0 * _nmax] = tmp; + + buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].y; + + if(vflag_atom) _vatom[i + 1 * _nmax] = tmp; + + buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].z; + + if(vflag_atom) _vatom[i + 2 * _nmax] = tmp; + + buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].x; + + if(vflag_atom) _vatom[i + 3 * _nmax] = tmp; + + buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].y; + + if(vflag_atom) _vatom[i + 4 * _nmax] = tmp; + + buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].z; + + if(vflag_atom) _vatom[i + 5 * _nmax] = tmp; + + buffer = &buffer[6 * gridDim.x * gridDim.y]; + } + + F_FLOAT* my_f; + + if(_collect_forces_later) { + my_f = (F_FLOAT*) buffer; + my_f += i; + *my_f = sharedForce[0].x; + my_f += _nmax; + *my_f = sharedForce[0].y; + my_f += _nmax; + *my_f = sharedForce[0].z; + } else { + my_f = _f + i; + *my_f += sharedForce[0].x; + my_f += _nmax; + *my_f += sharedForce[0].y; + my_f += _nmax; + *my_f += sharedForce[0].z; + } + } +} + +__global__ void Pair_GenerateXType_Kernel() +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < _nall) { + X_FLOAT4 xtype; + xtype.x = _x[i]; + xtype.y = _x[i + _nmax]; + xtype.z = _x[i + 2 * _nmax]; + xtype.w = _type[i]; + _x_type[i] = xtype; + } + +} + +__global__ void Pair_GenerateVRadius_Kernel() +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < _nall) { + V_FLOAT4 vradius; + vradius.x = _v[i]; + vradius.y = _v[i + _nmax]; + vradius.z = _v[i + 2 * _nmax]; + vradius.w = _radius[i]; + _v_radius[i] = vradius; + } +} + +__global__ void Pair_GenerateOmegaRmass_Kernel() +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < _nall) { + V_FLOAT4 omegarmass; + omegarmass.x = _omega[i]; + omegarmass.y = _omega[i + _nmax]; + omegarmass.z = _omega[i + 2 * _nmax]; + omegarmass.w = _rmass[i]; + _omega_rmass[i] = omegarmass; + } +} + +__global__ void Pair_RevertXType_Kernel() +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < _nall) { + X_FLOAT4 xtype = _x_type[i]; + _x[i] = xtype.x; + _x[i + _nmax] = xtype.y; + _x[i + 2 * _nmax] = xtype.z; + _type[i] = static_cast <int>(xtype.w); + } + +} + +__global__ void Pair_BuildXHold_Kernel() +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < _nall) { + X_FLOAT4 xtype = _x_type[i]; + _xhold[i] = xtype.x; + _xhold[i + _nmax] = xtype.y; + _xhold[i + 2 * _nmax] = xtype.z; + } + +} + +__global__ void Pair_CollectForces_Kernel(int nperblock, int n) +{ + int i = 
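+/* The Generate...Kernel functions above repack the separate coordinate/type
+   (and velocity/radius, omega/rmass) arrays into one 4-component struct per
+   atom, so a neighbor lookup costs a single coalesced vector fetch instead of
+   four strided reads.  A plausible launch shape for such an _nall-wide sweep
+   (sketch; the package itself derives its grids via getgrid()):
+
+   dim3 threads(256, 1, 1);
+   dim3 grid((nall + 255) / 256, 1, 1);
+   Pair_GenerateXType_Kernel<<<grid, threads>>>();
+*/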
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i >= _nlocal) return; + + ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer; + + F_FLOAT* buf_f = (F_FLOAT*) &buf[nperblock * n]; + F_FLOAT* my_f = _f + i; + buf_f += i; + *my_f += * buf_f; + my_f += _nmax; + buf_f += _nmax; + *my_f += * buf_f; + my_f += _nmax; + buf_f += _nmax; + *my_f += * buf_f; + my_f += _nmax; +} diff --git a/lib/cuda/cuda_pair_virial_kernel_nc.cu b/lib/cuda/cuda_pair_virial_kernel_nc.cu new file mode 100644 index 0000000000000000000000000000000000000000..3987bde43e3ccd9b1d2e8bc2343d8a1f9d6c01bc --- /dev/null +++ b/lib/cuda/cuda_pair_virial_kernel_nc.cu @@ -0,0 +1,126 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +extern __shared__ ENERGY_FLOAT sharedmem[]; + +static inline __device__ void PairVirialCompute_A_Kernel(int eflag, int vflag, int coulflag = 0) +{ + __syncthreads(); + ENERGY_FLOAT* shared = sharedmem; + + if(eflag) { + reduceBlock(shared); + shared += blockDim.x; + + if(coulflag) { + reduceBlock(shared); + shared += blockDim.x; + } + } + + if(vflag) { + reduceBlock(shared + 0 * blockDim.x); + reduceBlock(shared + 1 * blockDim.x); + reduceBlock(shared + 2 * blockDim.x); + reduceBlock(shared + 3 * blockDim.x); + reduceBlock(shared + 4 * blockDim.x); + reduceBlock(shared + 5 * blockDim.x); + } + + if(threadIdx.x == 0) { + shared = sharedmem; + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + + if(eflag) { + buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5) * shared[0]; + shared += blockDim.x; + buffer += gridDim.x * gridDim.y; + + if(coulflag) { + buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5) * shared[0]; + shared += blockDim.x; + buffer += gridDim.x * gridDim.y; + } + } + + if(vflag) { + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[0 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[1 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[2 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[3 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[4 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[5 * blockDim.x]; + } + } + + __syncthreads(); +} + +__global__ void MY_AP(PairVirialCompute_reduce)(int n) +{ + sharedmem[threadIdx.x] = ENERGY_F(0.0); + ENERGY_FLOAT sum = ENERGY_F(0.0); + ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer; + buf = &buf[blockIdx.x * n]; + //if(blockIdx.x==2) buf=&buf[n]; + + for(int i = 
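+/* The host encodes what is being reduced in gridDim.x: 1 block sums evdwl
+   only, 2 = evdwl+ecoul, 6 = virial, 7 = evdwl+virial, 8 = evdwl+ecoul+virial,
+   each over n per-block partials laid out by the pair kernels.  Illustrative
+   launch (names as used in this file):
+
+   int nquant = (eflag ? (coulflag ? 2 : 1) : 0) + (vflag ? 6 : 0);
+   MY_AP(PairVirialCompute_reduce)
+     <<<nquant, 128, 128 * sizeof(ENERGY_FLOAT)>>>(nblocks);
+*/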
0; i < n; i += blockDim.x) { + sharedmem[threadIdx.x] = (i + threadIdx.x < n) ? buf[i + threadIdx.x] : ENERGY_F(0.0); + __syncthreads(); + reduceBlock(sharedmem); + + if(threadIdx.x == 0) sum += sharedmem[0]; + } + + if(threadIdx.x == 0) { + if(gridDim.x == 1) { //evdwl + _eng_vdwl[0] += sum; + } + + if(gridDim.x == 2) { //evdwl + ecoul only + if(blockIdx.x == 0) + _eng_vdwl[0] += sum; + else + _eng_coul[0] += sum; + } + + if(gridDim.x == 6) { //virial + _virial[blockIdx.x] += sum; + } + + if(gridDim.x == 7) { //evdwl+virial + if(blockIdx.x == 0) + _eng_vdwl[0] += sum; + else _virial[blockIdx.x - 1] += sum; + } + + if(gridDim.x == 8) { //evdwl+ecoul+virial + if(blockIdx.x == 0) + _eng_vdwl[0] += sum; + else if(blockIdx.x == 1) + _eng_coul[0] += sum; + else + _virial[blockIdx.x - 2] += sum; + } + } +} diff --git a/lib/cuda/cuda_precision.h b/lib/cuda/cuda_precision.h new file mode 100644 index 0000000000000000000000000000000000000000..2dc4ab5607cc42ced1d3a1ae88005730a7b7dde3 --- /dev/null +++ b/lib/cuda/cuda_precision.h @@ -0,0 +1,274 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifndef CUDA_PRECISION_H_ +#define CUDA_PRECISION_H_ +/* This File gives Type definitions for mixed precision calculation in the cuda part of LAMMPS-CUDA. + * Predefined behaviour is given by global CUDA_PRECISION (can be overwritten during compilation). 
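+ *
+ * For example, with F_PRECISION == 1 the line
+ *    const F_FLOAT half = F_F(0.5);
+ * expands to "const float half = 0.5f;", while with F_PRECISION == 2 it
+ * becomes "const double half = 0.5;", so literals never force an unintended
+ * promotion to double-precision arithmetic.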
+ * ***_FLOAT: type definition of given property + * ***_F: constant extension in code (1.0 is interpreted as double while 1.0f is interpreted as float, now use: 1.0CUDA_F) + */ + +#ifdef CUDA_USE_BINNING +#define CUDA_IF_BINNING(a) a +#else +#define CUDA_IF_BINNING(a) +#endif + +//GLOBAL + +#ifdef CUDA_PRECISION +#if CUDA_PRECISION == 1 +#define CUDA_FLOAT float +#define CUDA_F(x) x##f +#endif +#if CUDA_PRECISION == 2 +#define CUDA_FLOAT double +#define CUDA_F(x) x +#endif +#endif + +#ifndef CUDA_PRECISION +#define CUDA_FLOAT double +#define CUDA_F(x) x +#define CUDA_PRECISION 2 +#endif +//-------------------------------- +//-----------FFT----------------- +//-------------------------------- + +#ifdef FFT_PRECISION_CU +#if FFT_PRECISION_CU == 1 +#define FFT_FLOAT float +#define FFT_F(x) x##f +#endif +#if FFT_PRECISION_CU == 2 +#define FFT_FLOAT double +#define FFT_F(x) x +#endif +#endif + +#ifndef FFT_PRECISION_CU +#define FFT_FLOAT CUDA_FLOAT +#define FFT_F(x) CUDA_F(x) +#define FFT_PRECISION_CU CUDA_PRECISION +#endif + +//-------------------------------- +//-----------PPPM----------------- +//-------------------------------- + +#ifndef PPPM_PRECISION +#define PPPM_PRECISION CUDA_PRECISION +#endif + +#ifdef PPPM_PRECISION +#if PPPM_PRECISION == 1 +#define PPPM_FLOAT float +#ifdef float3 +#define PPPM_FLOAT3 float3 +#else +struct PPPM_FLOAT3 { + PPPM_FLOAT x; + PPPM_FLOAT y; + PPPM_FLOAT z; +}; +#endif +#define PPPM_F(x) x##f +#endif +#if PPPM_PRECISION == 2 +#define PPPM_FLOAT double +struct PPPM_FLOAT3 { + PPPM_FLOAT x; + PPPM_FLOAT y; + PPPM_FLOAT z; +}; +#define PPPM_F(x) x +#endif +#endif + + +//-------------------------------- +//-----------FORCE----------------- +//-------------------------------- + + +#ifdef F_PRECISION +#if F_PRECISION == 1 +#define F_FLOAT float +#define F_F(x) x##f +#endif +#if F_PRECISION == 2 +#define F_FLOAT double +#define F_F(x) x +#endif +#endif + +#ifndef F_PRECISION +#define F_FLOAT CUDA_FLOAT +#define F_F(x) CUDA_F(x) +#define F_PRECISION CUDA_PRECISION +#endif + +#if F_PRECISION == 1 +#define _SQRT_ sqrtf +#define _RSQRT_ rsqrtf +#define _EXP_ expf +#else +#define _SQRT_ sqrt +#define _RSQRT_ rsqrt +#define _EXP_ exp +#endif + +#if F_PRECISION == 2 +struct F_FLOAT2 { + F_FLOAT x; + F_FLOAT y; +}; +struct F_FLOAT3 { + F_FLOAT x; + F_FLOAT y; + F_FLOAT z; +}; +struct F_FLOAT4 { + F_FLOAT x; + F_FLOAT y; + F_FLOAT z; + F_FLOAT w; +}; +#else +#define F_FLOAT2 float2 +#define F_FLOAT3 float3 +#define F_FLOAT4 float4 +#endif +//-------------------------------- +//-----------ENERGY----------------- +//-------------------------------- + +#ifndef ENERGY_PRECISION +#define ENERGY_FLOAT CUDA_FLOAT +#define ENERGY_F(x) CUDA_F(x) +#endif + +#ifdef ENERGY_PRECISION +#if ENERGY_PRECISION == 1 +#define ENERGY_FLOAT float +#define ENERGY_F(x) x##f +#endif +#if ENERGY_PRECISION == 2 +#define ENERGY_FLOAT double +#define ENERGY_F(x) x +#endif +#endif + +#ifndef ENERGY_PRECISION +#define ENERGY_FLOAT CUDA_FLOAT +#define ENERGY_F(x) CUDA_F(x) +#define ENERGY_PRECISION CUDA_PRECISION +#endif + +//-------------------------------- +//-----------POSITIONS------------ +//-------------------------------- + +#ifdef X_PRECISION +#if X_PRECISION == 1 +#define X_FLOAT float +#define X_F(x) x##f +#endif +#if X_PRECISION == 2 +#define X_FLOAT double +#define X_F(x) x +#endif +#endif + +#ifndef X_PRECISION +#define X_FLOAT CUDA_FLOAT +#define X_F(x) CUDA_F(x) +#define X_PRECISION CUDA_PRECISION +#endif + +#if X_PRECISION == 2 +struct X_FLOAT2 { + X_FLOAT x; + X_FLOAT y; +}; +struct 
X_FLOAT3 { + X_FLOAT x; + X_FLOAT y; + X_FLOAT z; +}; +struct X_FLOAT4 { + X_FLOAT x; + X_FLOAT y; + X_FLOAT z; + X_FLOAT w; +}; +#else +#define X_FLOAT2 float2 +#define X_FLOAT3 float3 +#define X_FLOAT4 float4 +#endif + +//-------------------------------- +//-----------velocities----------- +//-------------------------------- + +#ifdef V_PRECISION +#if V_PRECISION == 1 +#define V_FLOAT float +#define V_F(x) x##f +#endif +#if V_PRECISION == 2 +#define V_FLOAT double +#define V_F(x) x +#endif +#endif + +#ifndef V_PRECISION +#define V_FLOAT CUDA_FLOAT +#define V_F(x) CUDA_F(x) +#define V_PRECISION CUDA_PRECISION +#endif + +#if V_PRECISION == 2 +struct V_FLOAT4 { + V_FLOAT x; + V_FLOAT y; + V_FLOAT z; + V_FLOAT w; +}; +#else +#define V_FLOAT4 float4 +#endif + +#ifdef NO_PREC_TIMING +struct timespec_2 { + unsigned int tv_sec; + unsigned int tv_nsec; +}; + +#define timespec timespec_2 +#define clock_gettime(a,b) +#endif +#endif /*CUDA_PRECISION_H_*/ diff --git a/lib/cuda/cuda_shared.h b/lib/cuda/cuda_shared.h new file mode 100644 index 0000000000000000000000000000000000000000..1d29336b003665155a1e221217a2568158de9836 --- /dev/null +++ b/lib/cuda/cuda_shared.h @@ -0,0 +1,370 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+
+#ifndef _CUDA_SHARED_H_
+#define _CUDA_SHARED_H_
+#include "cuda_precision.h"
+
+#define CUDA_MAX_DEBUG_SIZE 1000 // size of the debugdata array (holds that many doubles, or twice as many ints)
+
+struct dev_array {
+  void* dev_data;   // pointer to memory address on cuda device
+  unsigned dim[3];  // array dimensions
+};
+
+struct cuda_shared_atom {  // relevant data from atom class
+  dev_array dx;      // accumulated displacement, used for the reneighboring distance check
+  dev_array x;       // position
+  dev_array v;       // velocity
+  dev_array f;       // force
+  dev_array tag;     // global ID number
+  dev_array type;    // per-atom type; the ghost type id is ntypes (ntypescuda=ntypes+1)
+  dev_array mask;
+  dev_array image;
+  dev_array q;       // charges
+  dev_array mass;    // per-type masses
+  dev_array rmass;   // per-atom masses
+  dev_array radius;  // per-atom radius
+  dev_array density;
+  dev_array omega;
+  dev_array torque;
+  dev_array molecule;
+
+  dev_array special;
+  int maxspecial;
+  dev_array nspecial;
+  int* special_flag;
+  int molecular;
+
+  dev_array eatom;   // per-atom energy
+  dev_array vatom;   // per-atom virial
+  int need_eatom;
+  int need_vatom;
+
+  dev_array x_type;      // position + type in X_FLOAT4 struct
+  dev_array v_radius;    // velocity + radius in V_FLOAT4 struct; currently only used for granular atom_style
+  dev_array omega_rmass; // omega + rmass in V_FLOAT4 struct; currently only used for granular atom_style
+
+  double* mass_host;     // host pointer to the per-type masses
+  //int natoms;          // total # of atoms in system, could be 0
+  int nghost;            // # of ghost atoms on this proc
+  int nlocal;            // # of owned atoms on this proc
+  int nall;              // total # of owned + ghost atoms on this proc
+  int nmax;              // max # of owned+ghost in arrays on this proc
+  int ntypes;
+  int q_flag;            // do we have charges?
+  int rmass_flag;        // do we have per-atom masses?
+  int firstgroup;
+  int nfirst;
+
+  int update_nlocal;
+  int update_nmax;
+  int update_neigh;
+
+  dev_array xhold;          // position at last reneighboring
+  X_FLOAT triggerneighsq;   // maximum squared movement before reneighboring
+  int reneigh_flag;         // is reneighboring necessary
+  int maxhold;              // size of xhold
+  int dist_check;           // perform distance check for reneighboring
+  dev_array binned_id;      // id of each binned atom (not tag!!)
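+  /* dev_array couples a device pointer with its logical shape; a host-side
+     allocation helper in this style would be (illustrative, not the
+     package's actual API):
+
+     void alloc_dev_array(dev_array* a, unsigned nx, unsigned ny, unsigned nz,
+                          size_t elem)
+     {
+       a->dim[0] = nx; a->dim[1] = ny; a->dim[2] = nz;
+       a->dev_data = CudaWrapper_AllocCudaData(nx * ny * nz * elem);
+     }
+  */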
+ dev_array binned_idnew; //new id of each binned atom for sorting basically setting atom[binned_id[k]] at atom[binned_newid[k]] + float bin_extraspace; + int bin_dim[3]; + int bin_nmax; + dev_array map_array; +}; + +struct cuda_shared_pair { // relevent data from pair class + char cudable_force; // check for (cudable_force!=0) + X_FLOAT cut_global; + X_FLOAT cut_inner_global; + X_FLOAT cut_coul_global; + double** cut; // type-type cutoff + double** cutsq; // type-type cutoff + double** cut_inner; // type-type cutoff for coul + double** cut_coul; // type-type cutoff for coul + double** coeff1; // tpye-type pair parameters + double** coeff2; + double** coeff3; + double** coeff4; + double** coeff5; + double** coeff6; + double** coeff7; + double** coeff8; + double** coeff9; + double** coeff10; + double** offset; + double* special_lj; + double* special_coul; + dev_array virial; // ENERGY_FLOAT + dev_array eng_vdwl; // ENERGY_FLOAT + dev_array eng_coul; // ENERGY_FLOAT + X_FLOAT cut_coulsq_global; + F_FLOAT g_ewald, kappa; + int freeze_group_bit; + + dev_array coeff1_gm; + dev_array coeff2_gm; + dev_array coeff3_gm; + dev_array coeff4_gm; + dev_array coeff5_gm; + dev_array coeff6_gm; + dev_array coeff7_gm; + dev_array coeff8_gm; + dev_array coeff9_gm; + dev_array coeff10_gm; + + int lastgridsize; + int n_energy_virial; + int collect_forces_later; + int use_block_per_atom; + int override_block_per_atom; + bool neighall; + +}; + +struct cuda_shared_domain { // relevent data from domain class + X_FLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc + X_FLOAT subhi[3]; + X_FLOAT boxlo[3]; + X_FLOAT boxhi[3]; + X_FLOAT prd[3]; + int periodicity[3]; // xyz periodicity as array + + int triclinic; + X_FLOAT xy; + X_FLOAT xz; + X_FLOAT yz; + X_FLOAT boxlo_lamda[3]; + X_FLOAT boxhi_lamda[3]; + X_FLOAT prd_lamda[3]; + X_FLOAT h[6]; + X_FLOAT h_inv[6]; + V_FLOAT h_rate[6]; + int update; +}; + +struct cuda_shared_pppm { + char cudable_force; +#ifdef FFT_CUFFT + FFT_FLOAT* work1; + FFT_FLOAT* work2; + FFT_FLOAT* work3; + PPPM_FLOAT* greensfn; + PPPM_FLOAT* fkx; + PPPM_FLOAT* fky; + PPPM_FLOAT* fkz; + PPPM_FLOAT* vg; +#endif + int* part2grid; + PPPM_FLOAT* density_brick; + int* density_brick_int; + PPPM_FLOAT density_intScale; + PPPM_FLOAT* vdx_brick; + PPPM_FLOAT* vdy_brick; + PPPM_FLOAT* vdz_brick; + PPPM_FLOAT* density_fft; + ENERGY_FLOAT* energy; + ENERGY_FLOAT* virial; + int nxlo_in; + int nxhi_in; + int nxlo_out; + int nxhi_out; + int nylo_in; + int nyhi_in; + int nylo_out; + int nyhi_out; + int nzlo_in; + int nzhi_in; + int nzlo_out; + int nzhi_out; + int nx_pppm; + int ny_pppm; + int nz_pppm; + PPPM_FLOAT qqrd2e; + int order; + // float3 sublo; + PPPM_FLOAT* rho_coeff; + int nmax; + int nlocal; + PPPM_FLOAT* debugdata; + PPPM_FLOAT delxinv; + PPPM_FLOAT delyinv; + PPPM_FLOAT delzinv; + int nlower; + int nupper; + PPPM_FLOAT shiftone; + PPPM_FLOAT3* fH; +}; + +struct cuda_shared_comm { + int maxswap; + int maxlistlength; + dev_array pbc; + dev_array slablo; + dev_array slabhi; + dev_array multilo; + dev_array multihi; + dev_array sendlist; + int grow_flag; + int comm_phase; + + int nsend; + int* nsend_swap; + int* send_size; + int* recv_size; + double** buf_send; + void** buf_send_dev; + double** buf_recv; + void** buf_recv_dev; + void* buffer; + int buffer_size; + double overlap_split_ratio; +}; + +struct cuda_shared_neighlist { // member of CudaNeighList, has no instance in cuda_shared_data + int maxlocal; + int inum; // # of I atoms neighbors are stored for local indices of I atoms + int 
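+  /* The inner/border split here pairs with the comm_phase argument of the
+     pair kernels: phase 0 appears to walk the full list, phase 1 only
+     neighbors of interior atoms (so the force pass can overlap with MPI
+     exchange), and phase 2 only border atoms once ghost data has arrived --
+     an inference from the kernel branches, not documented in this patch. */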
inum_border2;
+  dev_array inum_border;      // # of atoms which interact with border atoms
+  dev_array ilist;
+  dev_array ilist_border;
+  dev_array numneigh;
+  dev_array numneigh_inner;
+  dev_array numneigh_border;
+  dev_array firstneigh;
+  dev_array neighbors;
+  dev_array neighbors_border;
+  dev_array neighbors_inner;
+  int maxpage;
+  dev_array page_pointers;
+  dev_array* pages;
+  int maxneighbors;
+  int neigh_lists_per_page;
+  double** cutneighsq;
+  CUDA_FLOAT* cu_cutneighsq;
+  int* binned_id;
+  int* bin_dim;
+  int bin_nmax;
+  float bin_extraspace;
+  double maxcut;
+  dev_array ex_type;
+  int nex_type;
+  dev_array ex1_bit;
+  dev_array ex2_bit;
+  int nex_group;
+  dev_array ex_mol_bit;
+  int nex_mol;
+
+};
+
+struct cuda_compile_settings {  // used to compare the compile settings (i.e. precision) of the .cu files and the .cpp files
+  int prec_glob;
+  int prec_x;
+  int prec_v;
+  int prec_f;
+  int prec_pppm;
+  int prec_fft;
+  int cufft;
+  int arch;
+};
+
+struct cuda_timings_struct {
+  //Debug:
+  double test1;
+  double test2;
+  //transfers
+  double transfer_upload_tmp_constr;
+  double transfer_download_tmp_deconstr;
+
+  //communication
+  double comm_forward_total;
+  double comm_forward_mpi_upper;
+  double comm_forward_mpi_lower;
+  double comm_forward_kernel_pack;
+  double comm_forward_kernel_unpack;
+  double comm_forward_kernel_self;
+  double comm_forward_upload;
+  double comm_forward_download;
+
+  double comm_exchange_total;
+  double comm_exchange_mpi;
+  double comm_exchange_kernel_pack;
+  double comm_exchange_kernel_unpack;
+  double comm_exchange_kernel_fill;
+  double comm_exchange_cpu_pack;
+  double comm_exchange_upload;
+  double comm_exchange_download;
+
+  double comm_border_total;
+  double comm_border_mpi;
+  double comm_border_kernel_pack;
+  double comm_border_kernel_unpack;
+  double comm_border_kernel_self;
+  double comm_border_kernel_buildlist;
+  double comm_border_upload;
+  double comm_border_download;
+
+  //pair forces
+  double pair_xtype_conversion;
+  double pair_kernel;
+  double pair_virial;
+  double pair_force_collection;
+
+  //neighbor
+  double neigh_bin;
+  double neigh_build;
+  double neigh_special;
+
+  //PPPM
+  double pppm_particle_map;
+  double pppm_make_rho;
+  double pppm_brick2fft;
+  double pppm_poisson;
+  double pppm_fillbrick;
+  double pppm_fieldforce;
+  double pppm_compute;
+
+};
+
+struct cuda_shared_data {  // holds space for all relevant data from the different classes
+  void* buffer;     // holds temporary GPU data [data used in subroutines, which need not be consistent outside of that routine]
+  int buffersize;   // max size of buffer
+  int buffer_new;   // should be 1 if the pointer to buffer has changed
+  void* flag;
+  void* debugdata;  // array for easily collecting debug data from the device; class Cuda contains the corresponding cu_debugdata and host array
+  cuda_shared_atom atom;
+  cuda_shared_pair pair;
+  cuda_shared_domain domain;
+  cuda_shared_pppm pppm;
+  cuda_shared_comm comm;
+  cuda_compile_settings compile_settings;
+  cuda_timings_struct cuda_timings;
+  int exchange_dim;
+  int me;           // mpi rank
+  unsigned int datamask;
+  int overlap_comm;
+};
+
+
+#endif // #ifndef _CUDA_SHARED_H_
diff --git a/lib/cuda/cuda_wrapper.cu b/lib/cuda/cuda_wrapper.cu
new file mode 100644
index 0000000000000000000000000000000000000000..50366a87da1f9d739d3585a52935560d3ba1a247
--- /dev/null
+++ b/lib/cuda/cuda_wrapper.cu
@@ -0,0 +1,337 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+
http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> +#include "cuda_shared.h" +#include "cuda_common.h" +#include "cuda_wrapper_cu.h" +#include "cuda_wrapper_kernel.cu" + +static int CudaWrapper_total_gpu_mem = 0; +static double CudaWrapper_total_upload_time = 0; +static double CudaWrapper_total_download_time = 0; +static double CudaWrapper_cpubuffer_upload_time = 0; +static double CudaWrapper_cpubuffer_download_time = 0; +static cudaStream_t* streams; +static int nstreams = 0; + +void CudaWrapper_Init(int argc, char** argv, int me, int ppn, int* devicelist) +{ + MYDBG(printf("# CUDA: debug mode on\n");) + +#if __DEVICE_EMULATION__ + + printf("# CUDA: emulation mode on\n"); + +#else + + // modified from cutil.h + static int deviceCount = 0; + static bool sharedmode = false; + + if(deviceCount && !sharedmode) return; + + if(deviceCount && sharedmode) cudaThreadExit(); + + CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount)); + + if(deviceCount == 0) { + fprintf(stderr, "cutil error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + MYDBG(printf("# CUDA There are %i devices supporting CUDA in this system.\n", deviceCount);) + + cudaDeviceProp deviceProp[deviceCount]; + + for(int i = 0; i < deviceCount; i++) + CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&(deviceProp[i]), i)); + + + int dev_list[deviceCount]; + + for(int i = 0; i < deviceCount; i++) dev_list[i] = i; + + for(int i = 0; i < deviceCount; i++) { + for(int j = 0; j < deviceCount - 1 - i; j++) + if(deviceProp[dev_list[j]].multiProcessorCount < deviceProp[dev_list[j + 1]].multiProcessorCount) { + int k = dev_list[j]; + dev_list[j] = dev_list[j + 1]; + dev_list[j + 1] = k; + } + } + + for(int i = 0; i < deviceCount; i++) { + if((deviceProp[dev_list[i]].computeMode == 0)) sharedmode = true; + + cudaSetDevice(i); + cudaSetDeviceFlags(cudaDeviceMapHost); + } + + if(sharedmode) { + if(ppn && (me % ppn + 1) > deviceCount) { + printf("Asking for more GPUs per node when there are. Reduce gpu/node setting.\n"); + exit(0); + } + + int devicea = me % ppn; + + if(devicelist) devicea = devicelist[devicea]; + else + devicea = dev_list[devicea]; + + if(devicea >= deviceCount) { + printf("Asking for non existent GPU %i. 
Found only %i GPUs.\n", devicea, deviceCount); + exit(0); + } + + MYDBG( + printf(" # CUDA myid: %i take device: %i\n", me, devicea); + ) + CUDA_SAFE_CALL(cudaSetDevice(devicea)); + } else { + CUDA_SAFE_CALL(cudaSetValidDevices(dev_list, deviceCount)); + } + + cudaThreadSynchronize(); + + int dev; + CUDA_SAFE_CALL(cudaGetDevice(&dev)); + + if(deviceProp[dev].major < 1) { + fprintf(stderr, "CUDA error: device does not support CUDA.\n"); + exit(EXIT_FAILURE); + } else if((deviceProp[dev].major == 1) && (deviceProp[dev].minor != 3)) { + fprintf(stderr, "CUDA error: You need a device with compute capability 1.3 or higher (Device %i is a %s with CC %i.%i)\n", dev, deviceProp[dev].name, deviceProp[dev].major, deviceProp[dev].minor); + exit(EXIT_FAILURE); + } + + if((deviceProp[dev].major == 2) && (CUDA_ARCH < 20)) { + fprintf(stderr, "CUDA warning: You are using a compute %i.%i or higher GPU while LAMMPScuda has been compiled for architecture 1.3\n", deviceProp[dev].major, deviceProp[dev].minor); + } + + if((deviceProp[dev].major == 1) && (CUDA_ARCH >= 20)) { + fprintf(stderr, "CUDA error: You are using a compute 1.3 GPU while LAMMPScuda has been compiled for architecture %i\n", CUDA_ARCH); + exit(EXIT_FAILURE); + } + + + fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name); + MYDBG(fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name);) + + MYDBG + ( + printf("name = %s\n", deviceProp[dev].name); + printf("totalGlobalMem = %u\n", deviceProp[dev].totalGlobalMem); + printf("sharedMemPerBlock = %i\n", deviceProp[dev].sharedMemPerBlock); + printf("regsPerBlock = %i\n", deviceProp[dev].regsPerBlock); + printf("warpSize = %i\n", deviceProp[dev].warpSize); + printf("memPitch = %i\n", deviceProp[dev].memPitch); + printf("maxThreadsPerBlock = %i\n", deviceProp[dev].maxThreadsPerBlock); + printf("maxThreadsDim = [%i, %i, %i]\n", deviceProp[dev].maxThreadsDim[0], deviceProp[dev].maxThreadsDim[1], deviceProp[dev].maxThreadsDim[2]); + printf("maxGridSize = [%i, %i, %i]\n", deviceProp[dev].maxGridSize[0], deviceProp[dev].maxGridSize[1], deviceProp[dev].maxGridSize[2]); + printf("totalConstMem = %i\n", deviceProp[dev].totalConstMem); + printf("major . minor = %i . 
%i\n", deviceProp[dev].major, deviceProp[dev].minor); + printf("clockRate = %i\n", deviceProp[dev].clockRate); + printf("textureAlignment = %i\n", deviceProp[dev].textureAlignment); + printf("deviceOverlap = %i\n", deviceProp[dev].deviceOverlap); + printf("multiProcessorCount = %i\n", deviceProp[dev].multiProcessorCount); + printf("computeMode = %i\n", deviceProp[dev].computeMode); + ) + +#endif +} + +void* CudaWrapper_AllocCudaData(unsigned nbytes) +{ + void* dev_data; + CUDA_SAFE_CALL(cudaMalloc((void**)&dev_data, nbytes)); + MYDBG(printf("# CUDA: allocated %u bytes on device at dev%p\n", nbytes, dev_data);) + CudaWrapper_total_gpu_mem += nbytes; + return dev_data; +} + +void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes) +{ + MYDBG(printf("# CUDA: uploading %u bytes to device at dev%p from %p\n", nbytes, dev_data, host_data);) + cudaThreadSynchronize(); + timespec time1, time2; + clock_gettime(CLOCK_REALTIME, &time1); + CUDA_SAFE_CALL(cudaMemcpy(dev_data, host_data, nbytes, cudaMemcpyHostToDevice)); + clock_gettime(CLOCK_REALTIME, &time2); + CudaWrapper_total_upload_time += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; +} + +void CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream) +{ + MYDBG(printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data);) + cudaMemcpyAsync(dev_data, host_data, nbytes, cudaMemcpyHostToDevice, streams[stream]); +} + +void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes) +{ + MYDBG(printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data);) + cudaThreadSynchronize(); + timespec time1, time2; + clock_gettime(CLOCK_REALTIME, &time1); + CUDA_SAFE_CALL(cudaMemcpy(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost)); + clock_gettime(CLOCK_REALTIME, &time2); + CudaWrapper_total_download_time += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; +} + +void CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream) +{ + MYDBG(printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data);) + cudaMemcpyAsync(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost, streams[stream]); +} + +void CudaWrapper_FreeCudaData(void* dev_data, unsigned nbytes) +{ + MYDBG(printf("# CUDA: freeing memory at dev%p with %i bytes (last adress: %p)\n", dev_data, nbytes, (char*)dev_data + nbytes);) + CUDA_SAFE_CALL(cudaFree(dev_data)); + CudaWrapper_total_gpu_mem -= nbytes; +} + +void CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes) +{ + MYDBG(printf("# CUDA: setting %u bytes to %i at dev%p\n", nbytes, value, dev_data);) + CUDA_SAFE_CALL(cudaMemset(dev_data, value, nbytes)); +} + +void CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes) +{ + MYDBG(printf("# CUDA: copy %u bytes from dev%p to dev%p\n", nbytes, dev_source, dev_dest);) + CUDA_SAFE_CALL(cudaMemcpy(dev_dest, dev_source, nbytes, cudaMemcpyDeviceToDevice)); +} + +void* CudaWrapper_AllocPinnedHostData(unsigned nbytes, bool mapped, bool writeCombined) +{ + void* host_data; + int flags = 0; + + if(mapped) flags = flags | cudaHostAllocMapped; + + if(writeCombined) flags = flags | cudaHostAllocWriteCombined; + + CUDA_SAFE_CALL(cudaHostAlloc((void**)&host_data, nbytes, flags)); + // CUDA_SAFE_CALL( cudaMallocHost((void**)&host_data, nbytes) ); + MYDBG(printf("# CUDA: allocated %u bytes pinned memory on host at %p\n", nbytes, host_data);) + 
return host_data; +} + +void CudaWrapper_FreePinnedHostData(void* host_data) +{ + MYDBG(printf("# CUDA: freeing pinned host memory at %p \n", host_data);) + + if(host_data) + CUDA_SAFE_CALL(cudaFreeHost(host_data)); +} + +void cuda_check_error(char* comment) +{ + printf("ERROR-CUDA %s %s\n", comment, cudaGetErrorString(cudaGetLastError())); +} + +int CudaWrapper_CheckMemUseage() +{ + size_t free, total; + cudaMemGetInfo(&free, &total); + return total - free; //possible with cuda 3.0 ??? + //return CudaWrapper_total_gpu_mem; +} + +double CudaWrapper_CheckUploadTime(bool reset) +{ + if(reset) CudaWrapper_total_upload_time = 0.0; + + return CudaWrapper_total_upload_time; +} + +double CudaWrapper_CheckDownloadTime(bool reset) +{ + if(reset) CudaWrapper_total_download_time = 0.0; + + return CudaWrapper_total_download_time; +} + +double CudaWrapper_CheckCPUBufUploadTime(bool reset) +{ + if(reset) CudaWrapper_cpubuffer_upload_time = 0.0; + + return CudaWrapper_cpubuffer_upload_time; +} + +double CudaWrapper_CheckCPUBufDownloadTime(bool reset) +{ + if(reset) CudaWrapper_cpubuffer_download_time = 0.0; + + return CudaWrapper_cpubuffer_download_time; +} + +void CudaWrapper_AddCPUBufUploadTime(double dt) +{ + CudaWrapper_cpubuffer_upload_time += dt; +} + +void CudaWrapper_AddCPUBufDownloadTime(double dt) +{ + CudaWrapper_cpubuffer_download_time += dt; +} + +void CudaWrapper_Sync() +{ + cudaThreadSynchronize(); +} + +void CudaWrapper_SyncStream(int stream) +{ + cudaStreamSynchronize(streams[stream]); +} + +void CudaWrapper_AddStreams(int n) +{ + cudaStream_t* new_streams = new cudaStream_t[nstreams + n]; + + for(int i = 0; i < nstreams; i++) new_streams[i] = streams[i]; + + for(int i = nstreams; i < nstreams + n; i++) cudaStreamCreate(&new_streams[i]); + + if(nstreams > 0) + delete [] streams; + + streams = new_streams; + nstreams += n; +} + +void* CudaWrapper_returnStreams() +{ + return (void*) streams; +} + +int CudaWrapper_returnNStreams() +{ + return nstreams; +} + diff --git a/lib/cuda/cuda_wrapper_cu.h b/lib/cuda/cuda_wrapper_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..5bcfaffd4466f0dc6ddc4883e7d16f0c45785d3a --- /dev/null +++ b/lib/cuda/cuda_wrapper_cu.h @@ -0,0 +1,52 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#ifndef _CUDA_DATA_WRAPPER_H_ +#define _CUDA_DATA_WRAPPER_H_ + +extern "C" void CudaWrapper_Init(int argc, char** argv, int me = 0, int ppn = 2, int* devicelist = NULL); +extern "C" void* CudaWrapper_AllocCudaData(unsigned nbytes); +extern "C" void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes); +extern "C" void CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream_id); +extern "C" void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes); +extern "C" void CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream_id); +extern "C" void CudaWrapper_FreeCudaData(void* dev_data, unsigned nbytes = 0); +extern "C" void CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes); +extern "C" void CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes); +extern "C" void* CudaWrapper_AllocPinnedHostData(unsigned nbytes, bool mapped = false, bool writeCombind = false); +extern "C" void CudaWrapper_FreePinnedHostData(void* dev_data); +extern "C" void cuda_check_error(char* comment); +extern "C" int CudaWrapper_CheckMemUseage(); +extern "C" double CudaWrapper_CheckUploadTime(bool reset = false); +extern "C" double CudaWrapper_CheckDownloadTime(bool reset = false); +extern "C" double CudaWrapper_CheckCPUBufUploadTime(bool reset = false); +extern "C" double CudaWrapper_CheckCPUBufDownloadTime(bool reset = false); +extern "C" void CudaWrapper_AddCPUBufUploadTime(double dt); +extern "C" void CudaWrapper_AddCPUBufDownloadTime(double dt); +extern "C" void CudaWrapper_Sync(); +extern "C" void CudaWrapper_SyncStream(int n); +extern "C" void CudaWrapper_AddStreams(int n); +extern "C" void* CudaWrapper_returnStreams(); +extern "C" int CudaWrapper_returnNStreams(); + +#endif // _CUDA_DATA_WRAPPER_H_ diff --git a/lib/cuda/cuda_wrapper_kernel.cu b/lib/cuda/cuda_wrapper_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..9e0369856ba913c2d88af2fb81b6b73cca194db3 --- /dev/null +++ b/lib/cuda/cuda_wrapper_kernel.cu @@ -0,0 +1,24 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+
+// empty file to obey common make rule
diff --git a/lib/cuda/domain.cu b/lib/cuda/domain.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9dddbf65fd294b0a0c63fd668f82ed4637935af8
--- /dev/null
+++ b/lib/cuda/domain.cu
@@ -0,0 +1,202 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX domain
+#include "cuda_shared.h"
+#include "cuda_common.h"
+
+#include "crm_cuda_utils.cu"
+
+#include "domain_cu.h"
+#include "domain_kernel.cu"
+
+void Cuda_Domain_UpdateBuffer(cuda_shared_data* sdata, int size)
+{
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_Domain: resizing buffer at %p with %i bytes\n", sdata->buffer, sdata->buffersize);)
+    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+    sdata->buffer = CudaWrapper_AllocCudaData(size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i bytes\n", sdata->buffer, sdata->buffersize);)
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
+}
+
+void Cuda_Domain_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)   , & sdata->atom.nmax   , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(x)      , & sdata->atom.x .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(v)      , & sdata->atom.v .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(mask)   , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(tag)    , & sdata->atom.tag .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(image)  , & sdata->atom.image.dev_data, sizeof(int*));
+}
+
+void Cuda_Domain_UpdateDomain(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(boxlo)       , sdata->domain.boxlo       , 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(boxhi)       , sdata->domain.boxhi       , 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(sublo)       , sdata->domain.sublo       , 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(subhi)       , sdata->domain.subhi       , 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(prd)         , sdata->domain.prd         , 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , 3 * sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(triclinic)   , & sdata->domain.triclinic , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(boxlo_lamda) , sdata->domain.boxlo_lamda , 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(boxhi_lamda) , sdata->domain.boxhi_lamda , 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(prd_lamda)   , sdata->domain.prd_lamda   , 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(h)           , sdata->domain.h           , 6 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(h_inv)       , sdata->domain.h_inv       , 6
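+/* MY_AP pastes MY_PREFIX (here "domain") onto the symbol name, giving each
+   compilation unit its own set of __constant__ globals; these update
+   functions re-upload them whenever the host-side state changes.  The
+   equivalent plain-CUDA call for one symbol would be (sketch, assuming the
+   expanded name domain_nlocal):
+
+   __constant__ int domain_nlocal;
+   cudaMemcpyToSymbol(domain_nlocal, &sdata->atom.nlocal, sizeof(int));
+*/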
* sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(h_rate) , sdata->domain.h_rate , 6 * sizeof(V_FLOAT)); + cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag , sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(debugdata) , &sdata->debugdata , sizeof(int*)); +} + +void Cuda_Domain_Init(cuda_shared_data* sdata) +{ + Cuda_Domain_UpdateNmax(sdata); + Cuda_Domain_UpdateDomain(sdata); +} + +void Cuda_Domain_PBC(cuda_shared_data* sdata, int deform_remap, int deform_groupbit, double* extent) +{ + Cuda_Domain_UpdateNmax(sdata); + //if(sdata->domain.update) + Cuda_Domain_UpdateDomain(sdata); + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + int box_change = 0; + + if(extent) box_change = 1; + + int sharedmem = 0; + + if(box_change) sharedmem = 6 * sizeof(X_FLOAT); + + int3 layout = getgrid(sdata->atom.nlocal, sharedmem); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + sharedmem *= threads.x; + + if((box_change) && (sdata->buffer_new or (6 * sizeof(X_FLOAT)*grid.x * grid.y > sdata->buffersize))) + Cuda_Domain_UpdateBuffer(sdata, layout.x * layout.y * 6 * sizeof(X_FLOAT)); + + + Domain_PBC_Kernel <<< grid, threads, sharedmem>>>(deform_remap, deform_groupbit, box_change); + cudaThreadSynchronize(); + + CUT_CHECK_ERROR("Cuda_Domain_PBC: Kernel execution failed"); + + if(box_change) { + X_FLOAT buf2[6 * layout.x * layout.y]; + X_FLOAT* buf = buf2; + int flag; + cudaMemcpy(buf, sdata->buffer, 6 * layout.x * layout.y * sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + cudaMemcpy(&flag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + //printf("Flag: %i\n",flag); + X_FLOAT min, max; + min = 1.0 * BIG; + max = -1.0 * BIG; + + for(int i = 0; i < layout.x * layout.y; i++) { + if(buf[i] < min) min = buf[i]; + + if(buf[i + layout.x * layout.y] > max) max = buf[i + layout.x * layout.y]; + } + + extent[0] = min; + extent[1] = max; + + buf += 2 * layout.x * layout.y; + min = 1.0 * BIG; + max = -1.0 * BIG; + + for(int i = 0; i < layout.x * layout.y; i++) { + if(buf[i] < min) min = buf[i]; + + if(buf[i + layout.x * layout.y] > max) max = buf[i + layout.x * layout.y]; + } + + extent[2] = min; + extent[3] = max; + + buf += 2 * layout.x * layout.y; + min = 1.0 * BIG; + max = -1.0 * BIG; + + for(int i = 0; i < layout.x * layout.y; i++) { + if(buf[i] < min) min = buf[i]; + + if(buf[i + layout.x * layout.y] > max) max = buf[i + layout.x * layout.y]; + } + + extent[4] = min; + extent[5] = max; + //printf("Extent: %lf %lf %lf %lf %lf %lf\n",extent[0],extent[1],extent[2],extent[3],extent[4],extent[5]); + /* int n=grid.x*grid.y; + if(n<128) threads.x=32; + else if(n<256) threads.x=64; + else threads.x=128; + sharedmem=n*sizeof(X_FLOAT); + grid.x=6; + grid.y=1; + Domain_reduceBoxExtent<<<grid, threads,sharedmem>>>(extent,n); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Domain_reduceBoxExtent: Kernel execution failed");*/ + } +} + +void Cuda_Domain_lamda2x(cuda_shared_data* sdata, int n) +{ + Cuda_Domain_UpdateNmax(sdata); + //if(sdata->domain.update) + Cuda_Domain_UpdateDomain(sdata); + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + int3 layout = getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Domain_lamda2x_Kernel <<< grid, threads, 0>>>(n); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Domain_lamda2x: Kernel execution failed"); +} + +void Cuda_Domain_x2lamda(cuda_shared_data* sdata, int n) +{ + Cuda_Domain_UpdateNmax(sdata); + //if(sdata->domain.update) + Cuda_Domain_UpdateDomain(sdata); + 
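// x2lamda maps Cartesian coordinates into the fractional (lamda) space
+  // of the triclinic box: lamda = h_inv * (x - boxlo), with the inverse box
+  // tensor stored in 6-component order (0:xx 1:yy 2:zz 3:yz 4:xz 5:xy),
+  // i.e. per component (see Domain_x2lamda_Kernel in domain_kernel.cu):
+  //   lamda.x = h_inv[0]*dx + h_inv[5]*dy + h_inv[4]*dz
+  //   lamda.y =               h_inv[1]*dy + h_inv[3]*dz
+  //   lamda.z =                             h_inv[2]*dz
+  //
+  // A minimal usage sketch (hypothetical host-side driver, not part of this
+  // package's call path): convert to lamda space before communicating across
+  // triclinic boundaries, then convert back afterwards:
+  //   Cuda_Domain_x2lamda(sdata, sdata->atom.nlocal); // Cartesian -> fractional
+  //   ... exchange/borders in lamda space ...
+  //   Cuda_Domain_lamda2x(sdata, sdata->atom.nlocal); // fractional -> Cartesian
+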
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + int3 layout = getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Domain_x2lamda_Kernel <<< grid, threads, 0>>>(n); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Domain_x2lamda: Kernel execution failed"); +} diff --git a/lib/cuda/domain_cu.h b/lib/cuda/domain_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..38eb78e8568acf5e6451cd67924493735c20d533 --- /dev/null +++ b/lib/cuda/domain_cu.h @@ -0,0 +1,29 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_Domain_Init(cuda_shared_data* sdata); +extern "C" void Cuda_Domain_PBC(cuda_shared_data* sdata, int deform_remap, int deform_groupbit, double* extent = NULL); +extern "C" void Cuda_Domain_lamda2x(cuda_shared_data* sdata, int n); +extern "C" void Cuda_Domain_x2lamda(cuda_shared_data* sdata, int n); diff --git a/lib/cuda/domain_kernel.cu b/lib/cuda/domain_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..fedb7807a80ee659108c9a805f488b47ec4c3a6f --- /dev/null +++ b/lib/cuda/domain_kernel.cu @@ -0,0 +1,293 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +extern __shared__ X_FLOAT sharedmem[]; + +#define BIG 1e10 +__global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box_change) +{ + int idim, otherdims; + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + X_FLOAT lo[3]; + X_FLOAT hi[3]; + X_FLOAT* period; + + if(_triclinic == 0) { + lo[0] = _boxlo[0]; + lo[1] = _boxlo[1]; + lo[2] = _boxlo[2]; + + hi[0] = _boxhi[0]; + hi[1] = _boxhi[1]; + hi[2] = _boxhi[2]; + period = _prd; + } else { + lo[0] = _boxlo_lamda[0]; + lo[1] = _boxlo_lamda[1]; + lo[2] = _boxlo_lamda[2]; + + hi[0] = _boxhi_lamda[0]; + hi[1] = _boxhi_lamda[1]; + hi[2] = _boxhi_lamda[2]; + period = _prd_lamda; + } + + + X_FLOAT tmpx = X_F(0.5) * (hi[0] + lo[0]); + X_FLOAT tmpy = X_F(0.5) * (hi[1] + lo[1]); + X_FLOAT tmpz = X_F(0.5) * (hi[2] + lo[2]); + + X_FLOAT* buf = (X_FLOAT*) _buffer; + buf += blockIdx.x * gridDim.y + blockIdx.y; + buf[0] = tmpx; + buf += gridDim.x * gridDim.y; + buf[0] = tmpx; + buf += gridDim.x * gridDim.y; + buf[0] = tmpy; + buf += gridDim.x * gridDim.y; + buf[0] = tmpy; + buf += gridDim.x * gridDim.y; + buf[0] = tmpz; + buf += gridDim.x * gridDim.y; + buf[0] = tmpz; + + if(i < _nlocal) { + + if(_periodicity[0]) { + if(_x[i] < lo[0]) { + _x[i] += period[0]; + + if(deform_remap && _mask[i] & deform_groupbit) _v[i] += _h_rate[0]; + + idim = _image[i] & 1023; + otherdims = _image[i] ^ idim; + idim--; + idim &= 1023; + _image[i] = otherdims | idim; + } + + if(_x[i] >= hi[0]) { + _x[i] -= period[0]; + _x[i] = MAX(_x[i], lo[0]); + + if(deform_remap && _mask[i] & deform_groupbit) _v[i] -= _h_rate[0]; + + idim = _image[i] & 1023; + otherdims = _image[i] ^ idim; + idim++; + idim &= 1023; + _image[i] = otherdims | idim; + } + } + + if(_periodicity[1]) { + if(_x[i + _nmax] < lo[1]) { + _x[i + _nmax] += period[1]; + + if(deform_remap && _mask[i] & deform_groupbit) { + _v[i] += _h_rate[5]; + _v[i + _nmax] += _h_rate[1]; + } + + idim = (_image[i] >> 10) & 1023; + otherdims = _image[i] ^ (idim << 10); + idim--; + idim &= 1023; + _image[i] = otherdims | (idim << 10); + } + + if(_x[i + _nmax] >= hi[1]) { + _x[i + _nmax] -= period[1]; + _x[i + _nmax] = MAX(_x[i + _nmax], lo[1]); + + if(deform_remap && _mask[i] & deform_groupbit) { + _v[i] -= _h_rate[5]; + _v[i + _nmax] -= _h_rate[1]; + } + + idim = (_image[i] >> 10) & 1023; + otherdims = _image[i] ^ (idim << 10); + idim++; + idim &= 1023; + _image[i] = otherdims | (idim << 10); + } + } + + if(_periodicity[2]) { + if(_x[i + 2 * _nmax] < lo[2]) { + _x[i + 2 * _nmax] += period[2]; + + if(deform_remap && _mask[i] & deform_groupbit) { + _v[i] += _h_rate[4]; + _v[i + _nmax] += _h_rate[3]; + _v[i + 2 * _nmax] += _h_rate[2]; + } + + idim = _image[i] >> 20; + otherdims = _image[i] ^ (idim << 20); + idim--; + idim &= 1023; + _image[i] = otherdims | (idim << 20); + } + + if(_x[i + 2 * _nmax] >= hi[2]) { + _x[i + 2 * _nmax] -= period[2]; + _x[i + 2 * _nmax] = MAX(_x[i + 2 * _nmax], lo[2]); + + if(deform_remap && _mask[i] & deform_groupbit) { + _v[i] -= _h_rate[4]; + _v[i + _nmax] -= _h_rate[3]; + _v[i + 2 * _nmax] -= _h_rate[2]; + } + + idim = _image[i] >> 20; + otherdims = _image[i] ^ (idim << 20); + idim++; + idim &= 1023; + _image[i] = otherdims | (idim << 20); + } + } + + if(box_change) { + tmpx = _x[i]; + tmpy = _x[i + _nmax]; + tmpz = _x[i + 2 * _nmax]; + + + } + } + + __syncthreads(); + + if(box_change) { + X_FLOAT minx = BIG; + X_FLOAT maxx = -BIG; + X_FLOAT miny = BIG; + X_FLOAT maxy = -BIG; + X_FLOAT 
minz = BIG;
+    X_FLOAT maxz = -BIG;
+
+    // for non-periodic dimensions reduce the block-wide min/max of the
+    // coordinates gathered above; periodic dimensions use the box bounds
+    if(not _periodicity[0]) {
+      sharedmem[threadIdx.x] = tmpx;
+      minOfBlock(sharedmem);
+      minx = sharedmem[0];
+      __syncthreads();
+      sharedmem[threadIdx.x] = tmpx;
+      maxOfBlock(sharedmem);
+      maxx = sharedmem[0];
+      __syncthreads();
+    } else {
+      minx = lo[0];
+      maxx = hi[0];
+    }
+
+    if(not _periodicity[1]) {
+      sharedmem[threadIdx.x] = tmpy;
+      minOfBlock(sharedmem);
+      miny = sharedmem[0];
+      __syncthreads();
+      sharedmem[threadIdx.x] = tmpy;
+      maxOfBlock(sharedmem);
+      maxy = sharedmem[0];
+      __syncthreads();
+    } else {
+      miny = lo[1];
+      maxy = hi[1];
+    }
+
+    if(not _periodicity[2]) {
+      sharedmem[threadIdx.x] = tmpz;
+      minOfBlock(sharedmem);
+      minz = sharedmem[0];
+      __syncthreads();
+      sharedmem[threadIdx.x] = tmpz;
+      maxOfBlock(sharedmem);
+      maxz = sharedmem[0];
+      __syncthreads();
+    } else {
+      minz = lo[2];
+      maxz = hi[2];
+    }
+
+    if(threadIdx.x == 0) {
+      buf = (X_FLOAT*) _buffer;
+      buf += blockIdx.x * gridDim.y + blockIdx.y;
+      buf[0] = minx;
+      buf += gridDim.x * gridDim.y;
+      buf[0] = maxx;
+      buf += gridDim.x * gridDim.y;
+      buf[0] = miny;
+      buf += gridDim.x * gridDim.y;
+      buf[0] = maxy;
+      buf += gridDim.x * gridDim.y;
+      buf[0] = minz;
+      buf += gridDim.x * gridDim.y;
+      buf[0] = maxz;
+    }
+  }
+}
+
+__global__ void Domain_reduceBoxExtent(double* extent, int n)
+{
+  X_FLOAT* buf = (X_FLOAT*) _buffer;
+  buf += blockIdx.x * n;
+  copyGlobToShared(buf, sharedmem, n);
+
+  if(blockIdx.x % 2 == 0)
+    minOfData(sharedmem, n);
+  else
+    maxOfData(sharedmem, n);
+
+  extent[blockIdx.x] = sharedmem[0];
+}
+
+__global__ void Domain_lamda2x_Kernel(int n)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < n) {
+    X_FLOAT ytmp = _x[i + _nmax];
+    X_FLOAT ztmp = _x[i + 2 * _nmax];
+    _x[i] = _h[0] * _x[i] + _h[5] * ytmp + _h[4] * ztmp + _boxlo[0];
+    _x[i + _nmax] = _h[1] * ytmp + _h[3] * ztmp + _boxlo[1];
+    _x[i + 2 * _nmax] = _h[2] * ztmp + _boxlo[2];
+  }
+}
+
+__global__ void Domain_x2lamda_Kernel(int n)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  X_FLOAT delta[3];
+
+  if(i < n) {
+    delta[0] = _x[i] - _boxlo[0];
+    delta[1] = _x[i + _nmax] - _boxlo[1];
+    delta[2] = _x[i + 2 * _nmax] - _boxlo[2];
+
+    _x[i] = _h_inv[0] * delta[0] + _h_inv[5] * delta[1] + _h_inv[4] * delta[2];
+    _x[i + _nmax] = _h_inv[1] * delta[1] + _h_inv[3] * delta[2];
+    _x[i + 2 * _nmax] = _h_inv[2] * delta[2];
+  }
+}
diff --git a/lib/cuda/fft3d_cuda.cu b/lib/cuda/fft3d_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d5ac077f9d0784758b946c25b9dcf276cad6677b
--- /dev/null
+++ b/lib/cuda/fft3d_cuda.cu
@@ -0,0 +1,103 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+//#define CUDA_PRECISION 1
+#include "cuda_precision.h"
+#include "cuda_common.h"
+struct FFT_DATA {
+  FFT_FLOAT re;
+  FFT_FLOAT im;
+};
+
+#include "fft3d_cuda_cu.h"
+#include "fft3d_cuda_kernel.cu"
+#include <stdio.h>
+
+void initfftdata(double* in, FFT_FLOAT* out, int nfast, int nmid, int nslow)
+{
+  dim3 grid;
+  grid.x = nslow;
+  grid.y = nmid;
+  grid.z = 1;
+  dim3 threads;
+  threads.x = nfast;
+  threads.y = 1;
+  threads.z = 1;
+  cudaThreadSynchronize();
+  initfftdata_kernel <<< grid, threads, 0>>>(in, out);
+  cudaThreadSynchronize();
+  MYDBG(printf("ERROR-CUDA initfftdata_kernel: %s\n", cudaGetErrorString(cudaGetLastError())));
+}
+
+
+void permute(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow)
+{
+  dim3 grid;
+  grid.x = nslow;
+  grid.y = nmid;
+  grid.z = 1;
+  dim3 threads;
+  threads.x = nfast * 2;
+  threads.y = 1;
+  threads.z = 1;
+  permute_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out);
+  cudaThreadSynchronize();
+  MYDBG(printf("ERROR-CUDA permute_kernel: %s\n", cudaGetErrorString(cudaGetLastError())));
+}
+
+void permute_scale(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow)
+{
+  dim3 grid;
+  grid.x = nslow;
+  grid.y = nmid;
+  grid.z = 1;
+  dim3 threads;
+  threads.x = nfast * 2;
+  threads.y = 1;
+  threads.z = 1;
+  // scaled variant: permute_scale_kernel applies the normalization factor
+  // in addition to the permutation
+  permute_scale_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out);
+  cudaThreadSynchronize();
+}
+void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo)
+{
+  dim3 grid;
+  grid.x = (ihi - ilo + 1);
+  grid.y = (jhi - jlo + 1);
+  grid.z = 1;
+  dim3 threads;
+  threads.x = (khi - klo + 1) * 2;
+  threads.y = 1;
+  threads.z = 1;
+  permute_part_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out, nfast, nmid, nslow, ihi, ilo, jhi, jlo, khi, klo);
+  cudaThreadSynchronize();
+}
+
+void FFTsyncthreads()
+{
+  cudaThreadSynchronize();
+}
+
diff --git a/lib/cuda/fft3d_cuda_cu.h b/lib/cuda/fft3d_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..6447d8e125463da7a5d485bf3010434c9e4a1222
--- /dev/null
+++ b/lib/cuda/fft3d_cuda_cu.h
@@ -0,0 +1,30 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void initfftdata(double* in, FFT_FLOAT* out, int nfast, int nmid, int nslow); +extern "C" void permute(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow); +extern "C" void permute_scale(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow); +extern "C" void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo); +extern "C" void FFTsyncthreads(); diff --git a/lib/cuda/fft3d_cuda_kernel.cu b/lib/cuda/fft3d_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8a1be74bb1eff790c9f4eb1b654171493c0c70de --- /dev/null +++ b/lib/cuda/fft3d_cuda_kernel.cu @@ -0,0 +1,46 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +__global__ void initfftdata_kernel(double* in, FFT_FLOAT* out) +{ + out[2 * (((blockIdx.x * gridDim.y + blockIdx.y)*blockDim.x) + threadIdx.x)] = in[((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x]; + out[2 * (((blockIdx.x * gridDim.y + blockIdx.y)*blockDim.x) + threadIdx.x) + 1] = 0; +} + + +__global__ void permute_kernel(FFT_FLOAT* in, FFT_FLOAT* out) +{ + out[2 * (((threadIdx.x / 2)*gridDim.x + blockIdx.x)*gridDim.y + blockIdx.y) + threadIdx.x - 2 * (threadIdx.x / 2)] = in[((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x]; +} + +__global__ void permute_scale_kernel(FFT_FLOAT* in, FFT_FLOAT* out) +{ + out[2 * (((threadIdx.x / 2)*gridDim.x + blockIdx.x)*gridDim.y + blockIdx.y) + threadIdx.x - 2 * (threadIdx.x / 2)] = in[((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x] * gridDim.x * gridDim.y * blockDim.x * 0.5; +} + +__global__ void permute_part_kernel(FFT_FLOAT* in, FFT_FLOAT* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo) +{ + { + out[2 * ((threadIdx.x / 2) * (ihi - ilo + 1) * (jhi - jlo + 1) + (blockIdx.x) * (jhi - jlo + 1) + blockIdx.y - jlo) + threadIdx.x - 2 * (threadIdx.x / 2)] = in[2 * (blockIdx.x + ilo) * nmid * nslow + 2 * (blockIdx.y + jlo) * nmid + threadIdx.x + 2 * klo]; + } +} diff --git a/lib/cuda/fix_addforce_cuda.cu b/lib/cuda/fix_addforce_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..c1ecefba45f59d252ea60374a3cc622d3332e354 --- /dev/null +++ b/lib/cuda/fix_addforce_cuda.cu @@ -0,0 +1,93 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS 
directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> +#define MY_PREFIX fix_add_force_cuda +#include "cuda_shared.h" +#include "cuda_common.h" + +#include "crm_cuda_utils.cu" + +#include "fix_addforce_cuda_cu.h" +#include "fix_addforce_cuda_kernel.cu" + +void Cuda_FixAddForceCuda_UpdateBuffer(cuda_shared_data* sdata) +{ + int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_FLOAT); + + if(sdata->buffersize < size) { + MYDBG(printf("Cuda_FixAddForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize = size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);) + } + + cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*)); +} + +void Cuda_FixAddForceCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); +} + +void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixAddForceCuda_UpdateNmax(sdata); +} + +void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT axvalue, F_FLOAT ayvalue, F_FLOAT azvalue, F_FLOAT* aforiginal) +{ + if(sdata->atom.update_nmax) + Cuda_FixAddForceCuda_UpdateNmax(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + if(sdata->buffer_new) + Cuda_FixAddForceCuda_UpdateBuffer(sdata); + + int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Cuda_FixAddForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 4* sizeof(F_FLOAT)>>> (groupbit, axvalue, ayvalue, azvalue); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Compute Kernel execution failed"); + + int oldgrid = grid.x; + grid.x = 4; + threads.x = 512; + reduce_foriginal <<< grid, threads, threads.x* sizeof(F_FLOAT)>>> (oldgrid, aforiginal); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Reduce Kernel execution failed"); + +} diff --git a/lib/cuda/fix_addforce_cuda_cu.h b/lib/cuda/fix_addforce_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..1bf59300c928d0384709f89f257f0d15e00d2c1c --- /dev/null +++ b/lib/cuda/fix_addforce_cuda_cu.h @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale 
Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT axvalue, F_FLOAT ayvalue, F_FLOAT azvalue, F_FLOAT* aforiginal); diff --git a/lib/cuda/fix_addforce_cuda_kernel.cu b/lib/cuda/fix_addforce_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..e0265f3797570fe361c4b3ed87607f00f1a0eff8 --- /dev/null +++ b/lib/cuda/fix_addforce_cuda_kernel.cu @@ -0,0 +1,90 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +extern __shared__ F_FLOAT sharedmem[]; + + +__global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + sharedmem[threadIdx.x] = 0; + sharedmem[threadIdx.x + blockDim.x] = 0; + sharedmem[threadIdx.x + 2 * blockDim.x] = 0; + sharedmem[threadIdx.x + 3 * blockDim.x] = 0; + + if(i < _nlocal) + if(_mask[i] & groupbit) + //if (iregion >= 0 && + //match(x[i][0],x[i][1],x[i][2],iregion)) //currently not supported + { + sharedmem[threadIdx.x] = -xvalue * _x[i] - yvalue * _x[i + 1 * _nmax] - zvalue * _x[i + 2 * _nmax]; + sharedmem[threadIdx.x + blockDim.x] = _f[i]; + sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 1 * _nmax]; + sharedmem[threadIdx.x + 3 * blockDim.x] = _f[i + 2 * _nmax]; + _f[i] += xvalue; + _f[i + 1 * _nmax] += yvalue; + _f[i + 2 * _nmax] += zvalue; + } + + reduceBlock(sharedmem); + reduceBlock(&sharedmem[blockDim.x]); + reduceBlock(&sharedmem[2 * blockDim.x]); + reduceBlock(&sharedmem[3 * blockDim.x]); + F_FLOAT* buffer = (F_FLOAT*) _buffer; + + if(threadIdx.x == 0) { + buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = sharedmem[3 * blockDim.x]; + } + +} + + +__global__ void reduce_foriginal(int n, F_FLOAT* foriginal) +{ + int i = 0; + sharedmem[threadIdx.x] = 0; + F_FLOAT myforig = 0.0; + F_FLOAT* buf = (F_FLOAT*) _buffer; + buf = &buf[blockIdx.x * n]; + + while(i < n) { + sharedmem[threadIdx.x] = 0; + + if(i + threadIdx.x < n) + sharedmem[threadIdx.x] = buf[i + threadIdx.x]; + + __syncthreads(); + reduceBlock(sharedmem); + i += blockDim.x; + + if(threadIdx.x == 0) + myforig += sharedmem[0]; + } + + if(threadIdx.x == 0) + foriginal[blockIdx.x] = myforig; +} diff --git a/lib/cuda/fix_aveforce_cuda.cu b/lib/cuda/fix_aveforce_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..600f1d95e08e3856c34c1578604a065e1e82e455 --- /dev/null +++ b/lib/cuda/fix_aveforce_cuda.cu @@ -0,0 +1,107 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> +#define MY_PREFIX fix_ave_force_cuda +#include "cuda_shared.h" +#include "cuda_common.h" + +#include "crm_cuda_utils.cu" + +#include "fix_aveforce_cuda_cu.h" +#include "fix_aveforce_cuda_kernel.cu" + +void Cuda_FixAveForceCuda_UpdateBuffer(cuda_shared_data* sdata) +{ + int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_FLOAT); + + if(sdata->buffersize < size) { + MYDBG(printf("Cuda_FixAveForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize = size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);) + } + + cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*)); +} + +void Cuda_FixAveForceCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); +} + +void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixAveForceCuda_UpdateNmax(sdata); +} + +void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_FLOAT* aforiginal) +{ + if(sdata->atom.update_nmax) + Cuda_FixAveForceCuda_UpdateNmax(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + if(sdata->buffer_new) + Cuda_FixAveForceCuda_UpdateBuffer(sdata); + + int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + + Cuda_FixAveForceCuda_PostForce_FOrg_Kernel <<< grid, threads, threads.x* 4* sizeof(F_FLOAT)>>> (groupbit); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Compute Kernel execution failed"); + + int oldgrid = grid.x; + grid.x = 4; + threads.x = 512; + Cuda_FixAveForceCuda_reduce_foriginal <<< grid, threads, threads.x* sizeof(F_FLOAT)>>> (oldgrid, aforiginal); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Reduce Kernel execution failed"); + +} + +void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_FLOAT axvalue, F_FLOAT ayvalue, F_FLOAT azvalue) +{ + int3 layout = getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + + Cuda_FixAveForceCuda_PostForce_Set_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, axvalue, ayvalue, azvalue); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce_Set: fix ave_force post_force Compute Kernel execution failed"); + +} diff --git a/lib/cuda/fix_aveforce_cuda_cu.h b/lib/cuda/fix_aveforce_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..6d58a472e0c184638ffabc8c052d75aebc212083 --- /dev/null +++ b/lib/cuda/fix_aveforce_cuda_cu.h @@ -0,0 +1,28 @@ +/* 
---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_FLOAT* aforiginal); +extern "C" void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_FLOAT axvalue, F_FLOAT ayvalue, F_FLOAT azvalue); diff --git a/lib/cuda/fix_aveforce_cuda_kernel.cu b/lib/cuda/fix_aveforce_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..37d80d92e8616b043c56343afe165b6a3d682ff6 --- /dev/null +++ b/lib/cuda/fix_aveforce_cuda_kernel.cu @@ -0,0 +1,96 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +extern __shared__ F_FLOAT sharedmem[]; + + +__global__ void Cuda_FixAveForceCuda_PostForce_FOrg_Kernel(int groupbit) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + sharedmem[threadIdx.x] = 0; + sharedmem[threadIdx.x + blockDim.x] = 0; + sharedmem[threadIdx.x + 2 * blockDim.x] = 0; + sharedmem[threadIdx.x + 3 * blockDim.x] = 0; + + if(i < _nlocal) + if(_mask[i] & groupbit) { + sharedmem[threadIdx.x] = _f[i]; + sharedmem[threadIdx.x + blockDim.x] = _f[i + 1 * _nmax]; + sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 2 * _nmax]; + sharedmem[threadIdx.x + 3 * blockDim.x] = 1; + } + + reduceBlock(sharedmem); + reduceBlock(&sharedmem[blockDim.x]); + reduceBlock(&sharedmem[2 * blockDim.x]); + reduceBlock(&sharedmem[3 * blockDim.x]); + F_FLOAT* buffer = (F_FLOAT*) _buffer; + + if(threadIdx.x == 0) { + buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = sharedmem[3 * blockDim.x]; + } +} + + +__global__ void Cuda_FixAveForceCuda_reduce_foriginal(int n, F_FLOAT* foriginal) +{ + int i = 0; + sharedmem[threadIdx.x] = 0; + F_FLOAT myforig = 0.0; + F_FLOAT* buf = (F_FLOAT*) _buffer; + buf = &buf[blockIdx.x * n]; + + while(i < n) { + sharedmem[threadIdx.x] = 0; + + if(i + threadIdx.x < n) + sharedmem[threadIdx.x] = buf[i + threadIdx.x]; + + __syncthreads(); + reduceBlock(sharedmem); + i += blockDim.x; + + if(threadIdx.x == 0) + myforig += sharedmem[0]; + } + + if(threadIdx.x == 0) + foriginal[blockIdx.x] = myforig; +} + +__global__ void Cuda_FixAveForceCuda_PostForce_Set_Kernel(int groupbit, int xflag, int yflag, int zflag, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < _nlocal) + if(_mask[i] & groupbit) { + if(xflag) _f[i] = xvalue; + + if(yflag) _f[i + 1 * _nmax] = yvalue; + + if(zflag) _f[i + 2 * _nmax] = zvalue; + } +} diff --git a/lib/cuda/fix_enforce2d_cuda.cu b/lib/cuda/fix_enforce2d_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..cc48ed070d3ca07648ce3749c779aeb7aa02ecc4 --- /dev/null +++ b/lib/cuda/fix_enforce2d_cuda.cu @@ -0,0 +1,55 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> +#define MY_PREFIX fix_enforce2d_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" +#include "fix_enforce2d_cuda_cu.h" +#include "fix_enforce2d_cuda_kernel.cu" + +void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); +} + +void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit) +{ + if(sdata->atom.update_nmax) + Cuda_FixEnforce2dCuda_Init(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + int3 layout = getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + Cuda_FixEnforce2dCuda_PostForce_Kernel <<< grid, threads>>> (groupbit); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixEnforce2dCuda_PostForce: fix enforce2d post_force Kernel execution failed"); +} diff --git a/lib/cuda/fix_enforce2d_cuda_cu.h b/lib/cuda/fix_enforce2d_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..7a1cfadef402e7d10dcf3bc5bf2a0cb9feaafb48 --- /dev/null +++ b/lib/cuda/fix_enforce2d_cuda_cu.h @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit); diff --git a/lib/cuda/fix_enforce2d_cuda_kernel.cu b/lib/cuda/fix_enforce2d_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..5950df2e54e96ed9151cb41ad93d46a5016bdf04 --- /dev/null +++ b/lib/cuda/fix_enforce2d_cuda_kernel.cu @@ -0,0 +1,34 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + + +__global__ void Cuda_FixEnforce2dCuda_PostForce_Kernel(int groupbit) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < _nlocal) + if(_mask[i] & groupbit) { + _v[i + 2 * _nmax] = V_F(0.0); + _f[i + 2 * _nmax] = F_F(0.0); + } +} diff --git a/lib/cuda/fix_freeze_cuda.cu b/lib/cuda/fix_freeze_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..613c76bbde002a02ca2d50d589f782c21ce51f5e --- /dev/null +++ b/lib/cuda/fix_freeze_cuda.cu @@ -0,0 +1,98 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> +#define MY_PREFIX fix_freeze_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" + +#include "fix_freeze_cuda_cu.h" +#include "fix_freeze_cuda_kernel.cu" + +void Cuda_FixFreezeCuda_UpdateBuffer(cuda_shared_data* sdata) +{ + int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_FLOAT); + + if(sdata->buffersize < size) { + MYDBG(printf("Cuda_FixFreezeCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize = size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);) + + } + + cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*)); +} + +void Cuda_FixFreezeCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(torque) , & sdata->atom.torque .dev_data, sizeof(F_FLOAT*)); +} + + +void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixFreezeCuda_UpdateNmax(sdata); + +} + + +void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT* foriginal) +{ + if(sdata->atom.update_nmax) + Cuda_FixFreezeCuda_UpdateNmax(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + if(sdata->buffer_new) + Cuda_FixFreezeCuda_UpdateBuffer(sdata); + + + int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Cuda_FixFreezeCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_FLOAT)>>> (groupbit); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixFreezeCuda_PostForce: fix add_force post_force compute Kernel execution failed"); + + int oldgrid = grid.x; + grid.x = 3; + threads.x = 512; + Cuda_FixFreezeCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_FLOAT)>>> (oldgrid, foriginal); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixFreezeCuda_PostForce: fix add_force post_force reduce Kernel execution failed"); + +} diff --git a/lib/cuda/fix_freeze_cuda_cu.h b/lib/cuda/fix_freeze_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..e8b21a9558acf78324f676f6eee4889206740aa2 --- /dev/null +++ b/lib/cuda/fix_freeze_cuda_cu.h @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT* foriginal); diff --git a/lib/cuda/fix_freeze_cuda_kernel.cu b/lib/cuda/fix_freeze_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..5f5057c87d82d5c32ecf7630d62245bfa7c0e5c6 --- /dev/null +++ b/lib/cuda/fix_freeze_cuda_kernel.cu @@ -0,0 +1,87 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +extern __shared__ F_FLOAT sharedmem[]; + + +__global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + sharedmem[threadIdx.x] = 0; + sharedmem[threadIdx.x + blockDim.x] = 0; + sharedmem[threadIdx.x + 2 * blockDim.x] = 0; + + if(i < _nlocal) + if(_mask[i] & groupbit) { + sharedmem[threadIdx.x] = _f[i]; + sharedmem[threadIdx.x + blockDim.x] = _f[i + 1 * _nmax]; + sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 2 * _nmax]; + + _f[i] = F_F(0.0); + _f[i + 1 * _nmax] = F_F(0.0); + _f[i + 2 * _nmax] = F_F(0.0); + _torque[i] = F_F(0.0); + _torque[i + 1 * _nmax] = F_F(0.0); + _torque[i + 2 * _nmax] = F_F(0.0); + } + + + reduceBlock(sharedmem); + reduceBlock(&sharedmem[blockDim.x]); + reduceBlock(&sharedmem[2 * blockDim.x]); + F_FLOAT* buffer = (F_FLOAT*)_buffer; + + if(threadIdx.x == 0) { + buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x]; + } +} + + +__global__ void Cuda_FixFreezeCuda_Reduce_FOriginal(int n, F_FLOAT* foriginal) +{ + int i = 0; + sharedmem[threadIdx.x] = 0; + F_FLOAT myforig = 0.0; + F_FLOAT* buf = (F_FLOAT*)_buffer; + buf = &buf[blockIdx.x * n]; + + while(i < n) { + sharedmem[threadIdx.x] = 0; + + if(i + threadIdx.x < n) + sharedmem[threadIdx.x] = buf[i + threadIdx.x]; + + __syncthreads(); + reduceBlock(sharedmem); + i += blockDim.x; + + if(threadIdx.x == 0) + myforig += sharedmem[0]; + } + + if(threadIdx.x == 0) + foriginal[blockIdx.x] = myforig; +} + diff --git a/lib/cuda/fix_gravity_cuda.cu b/lib/cuda/fix_gravity_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..0fc7051b86556e280ea60be491651be76cb9821e --- /dev/null +++ b/lib/cuda/fix_gravity_cuda.cu @@ -0,0 +1,92 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> +#define MY_PREFIX fix_gravity_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" + +#include "fix_gravity_cuda_cu.h" +#include "fix_gravity_cuda_kernel.cu" + +void Cuda_FixGravityCuda_UpdateBuffer(cuda_shared_data* sdata) +{ + int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_FLOAT); + + if(sdata->buffersize < size) { + MYDBG(printf("Cuda_FixGravityCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize = size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);) + + } + + cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*)); +} + +void Cuda_FixGravityCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int)); + cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*)); +} + +void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixGravityCuda_UpdateNmax(sdata); + +} + + +void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT xacc, F_FLOAT yacc, F_FLOAT zacc) +{ + if(sdata->atom.update_nmax) + Cuda_FixGravityCuda_UpdateNmax(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + if(sdata->buffer_new) + Cuda_FixGravityCuda_UpdateBuffer(sdata); + + + int3 layout = getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Cuda_FixGravityCuda_PostForce_Kernel <<< grid, threads>>> (groupbit, xacc, yacc, zacc); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixGravityCuda_PostForce: fix add_force post_force compute Kernel execution failed"); +} diff --git a/lib/cuda/fix_gravity_cuda_cu.h b/lib/cuda/fix_gravity_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..014b71f0114131682c92314c2a285191a915fdb3 --- /dev/null +++ b/lib/cuda/fix_gravity_cuda_cu.h @@ -0,0 +1,27 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT xacc, F_FLOAT yacc, F_FLOAT zacc); diff --git a/lib/cuda/fix_gravity_cuda_kernel.cu b/lib/cuda/fix_gravity_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..ba58d39bc863e90192c56f815cd6414ee7ea68e5 --- /dev/null +++ b/lib/cuda/fix_gravity_cuda_kernel.cu @@ -0,0 +1,36 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +__global__ void Cuda_FixGravityCuda_PostForce_Kernel(int groupbit, F_FLOAT xacc, F_FLOAT yacc, F_FLOAT zacc) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < _nlocal) + if(_mask[i] & groupbit) { + F_FLOAT mass = _rmass_flag ? _rmass[i] : _mass[_type[i]]; + _f[i] += mass * xacc; + _f[i + 1 * _nmax] += mass * yacc; + _f[i + 2 * _nmax] += mass * zacc; + } +} + diff --git a/lib/cuda/fix_nh_cuda.cu b/lib/cuda/fix_nh_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..1a9d84061f05dc0aaa0fe32e8f1d0cf03af4be66 --- /dev/null +++ b/lib/cuda/fix_nh_cuda.cu @@ -0,0 +1,255 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> +#define MY_PREFIX fix_nh_cuda +#define IncludeCommonNeigh +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" +#include "fix_nh_cuda_cu.h" +#include "fix_nh_cuda_kernel.cu" + +void Cuda_FixNHCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(debugdata) , & sdata->debugdata, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_FLOAT*)); //might be moved to a neighbor record in sdata + cudaMemcpyToSymbol(MY_AP(maxhold) , & sdata->atom.maxhold, sizeof(int)); //might be moved to a neighbor record in sdata + cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata + cudaMemcpyToSymbol(MY_AP(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT)); //might be moved to a neighbor record in sdata +} + +void Cuda_FixNHCuda_UpdateBuffer(cuda_shared_data* sdata) +{ + int size = (unsigned)10 * sizeof(int); + + if(sdata->buffersize < size) { + MYDBG(printf("Cuda_FixNHCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize = size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);) + + } + + cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata +} + +void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf) +{ + cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(dtf) , & dtf , sizeof(V_FLOAT)); + cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(dist_check), & sdata->atom.dist_check , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag , sizeof(int)); // + Cuda_FixNHCuda_UpdateNmax(sdata); +} + + +void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic) //mynlocal can be nfirst if firstgroup==igroup see cpp +{ + timespec atime1, atime2; + clock_gettime(CLOCK_REALTIME, &atime1); + + if(sdata->atom.update_nmax) + Cuda_FixNHCuda_UpdateNmax(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + clock_gettime(CLOCK_REALTIME, &atime2); + sdata->cuda_timings.test1 += + atime2.tv_sec - atime1.tv_sec + 
1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000; + + if(sdata->buffer_new) + Cuda_FixNHCuda_UpdateBuffer(sdata); + + F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]}; + F_FLOAT3 factor2; + + if(p_triclinic) { + factor2.x = factor_h[3], factor2.y = factor_h[4]; + factor2.z = factor_h[5]; + } + + int3 layout = getgrid(mynlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + FixNHCuda_nh_v_press_Kernel <<< grid, threads>>> (groupbit, factor, p_triclinic, factor2); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("FixNHCuda: fix nh v_press Kernel execution failed"); + +} + +void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic) //mynlocal can be nfirst if firstgroup==igroup see cpp +{ + if(sdata->atom.update_nmax) + Cuda_FixNHCuda_UpdateNmax(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + if(sdata->buffer_new) + Cuda_FixNHCuda_UpdateBuffer(sdata); + + F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]}; + F_FLOAT3 factor2; + + if(p_triclinic) { + factor2.x = factor_h[3], factor2.y = factor_h[4]; + factor2.z = factor_h[5]; + } + + int3 layout = getgrid(mynlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + CUT_CHECK_ERROR("FixNHCuda: fix nh v_press pre Kernel execution failed"); + FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel <<< grid, threads>>> (groupbit, factor, p_triclinic, factor2); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("FixNHCuda: fix nh v_press Kernel execution failed"); + +} + +void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp +{ + timespec atime1, atime2; + clock_gettime(CLOCK_REALTIME, &atime1); + + if(sdata->atom.update_nmax) + Cuda_FixNHCuda_UpdateNmax(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + clock_gettime(CLOCK_REALTIME, &atime2); + sdata->cuda_timings.test1 += + atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000; + + if(sdata->buffer_new) + Cuda_FixNHCuda_UpdateBuffer(sdata); + + int3 layout = getgrid(mynlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + FixNHCuda_nh_v_temp_Kernel <<< grid, threads>>> (groupbit, factor_eta); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("FixNHCuda: fix nh v_temp Kernel execution failed"); + +} +void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp +{ + timespec atime1, atime2; + clock_gettime(CLOCK_REALTIME, &atime1); + + if(sdata->atom.update_nmax) + Cuda_FixNHCuda_UpdateNmax(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + clock_gettime(CLOCK_REALTIME, &atime2); + sdata->cuda_timings.test1 += + atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000; + + if(sdata->buffer_new) + Cuda_FixNHCuda_UpdateBuffer(sdata); + + int3 layout = getgrid(mynlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + FixNHCuda_nve_v_Kernel <<< grid, threads>>> (groupbit); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("FixNHCuda: nve_v Kernel execution failed"); +} + + +void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp +{ + 
timespec atime1, atime2; + clock_gettime(CLOCK_REALTIME, &atime1); + + if(sdata->atom.update_nmax) + Cuda_FixNHCuda_UpdateNmax(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + clock_gettime(CLOCK_REALTIME, &atime2); + sdata->cuda_timings.test1 += + atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000; + + if(sdata->buffer_new) + Cuda_FixNHCuda_UpdateBuffer(sdata); + + int3 layout = getgrid(mynlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + cudaMemset(sdata->buffer, 0, sizeof(int)); + FixNHCuda_nve_x_Kernel <<< grid, threads>>> (groupbit); + cudaThreadSynchronize(); + int reneigh_flag; + cudaMemcpy((void*)(&reneigh_flag), sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost); + sdata->atom.reneigh_flag += reneigh_flag; + CUT_CHECK_ERROR("FixNHCuda: nve_x Kernel execution failed"); +} + +void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic) //mynlocal can be nfirst if firstgroup==igroup see cpp +{ + if(sdata->atom.update_nmax) + Cuda_FixNHCuda_UpdateNmax(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + if(sdata->buffer_new) + Cuda_FixNHCuda_UpdateBuffer(sdata); + + F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]}; + F_FLOAT3 factor2; + + if(p_triclinic) { + factor2.x = factor_h[3], factor2.y = factor_h[4]; + factor2.z = factor_h[5]; + } + + int3 layout = getgrid(mynlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel <<< grid, threads>>> (groupbit, factor, p_triclinic, factor2); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("FixNHCuda__nve_v_and_nh_v_press_NoBias: Kernel execution failed"); +} + diff --git a/lib/cuda/fix_nh_cuda_cu.h b/lib/cuda/fix_nh_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..ba6203cfd077abb65602230f1287a022e187ae4e --- /dev/null +++ b/lib/cuda/fix_nh_cuda_cu.h @@ -0,0 +1,32 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
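Several wrappers in fix_nh_cuda.cu above accumulate their host-side setup cost into sdata->cuda_timings.test1 by differencing two CLOCK_REALTIME samples; the timespec arithmetic is repeated inline each time. The equivalent computation as a helper (a refactoring sketch, not code from the package):

    #include <time.h>

    // Elapsed wall-clock seconds between two CLOCK_REALTIME samples,
    // matching the inline expression used in the wrappers above.
    static inline double elapsed_seconds(const timespec &t1, const timespec &t2)
    {
      return (t2.tv_sec - t1.tv_sec) + 1.0e-9 * (t2.tv_nsec - t1.tv_nsec);
    }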
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf); +extern "C" void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup see cpp +extern "C" void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp +extern "C" void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup see cpp +extern "C" void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp +extern "C" void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp +extern "C" void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup see cpp diff --git a/lib/cuda/fix_nh_cuda_kernel.cu b/lib/cuda/fix_nh_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8e14fa7d8747226b62b2d2a43633da61f25bc5b7 --- /dev/null +++ b/lib/cuda/fix_nh_cuda_kernel.cu @@ -0,0 +1,205 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +static inline __device__ void check_distance(X_FLOAT &xtmp, X_FLOAT &ytmp, X_FLOAT &ztmp, int &i, int groupbit) +{ + if(_dist_check) { + + X_FLOAT d = X_F(0.0); + + if(i < _nlocal) { + X_FLOAT tmp = xtmp - _xhold[i]; + d = tmp * tmp; + tmp = ytmp - _xhold[i + _maxhold]; + d += tmp * tmp; + tmp = ztmp - _xhold[i + 2 * _maxhold]; + d += tmp * tmp; + + d = ((_mask[i] & groupbit)) ? 
d : X_F(0.0); + } + + if(not __all(d <= _triggerneighsq)) + _reneigh_flag[0] = 1; + } +} + +__global__ void FixNHCuda_nh_v_press_Kernel(int groupbit, F_FLOAT3 factor, int p_triclinic, F_FLOAT3 factor2) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < _nlocal && _mask[i] & groupbit) { + V_FLOAT* my_v = _v + i; + V_FLOAT vx = my_v[0]; + V_FLOAT vy = my_v[_nmax]; + V_FLOAT vz = my_v[2 * _nmax]; + vx *= factor.x; + vy *= factor.y; + vz *= factor.z; + + if(p_triclinic) { + vx += vy * factor2.z + vz * factor2.y; + vy += vz * factor2.x; + } + + vx *= factor.x; + vy *= factor.y; + vz *= factor.z; + my_v[0] = vx; + my_v[_nmax] = vy; + my_v[2 * _nmax] = vz; + } + +} + +__global__ void FixNHCuda_nh_v_temp_Kernel(int groupbit, F_FLOAT factor_eta) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < _nlocal && _mask[i] & groupbit) { + V_FLOAT* my_v = _v + i; + my_v[0] *= factor_eta; + my_v[_nmax] *= factor_eta; + my_v[2 * _nmax] *= factor_eta; + } + +} + +__global__ void FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel(int groupbit, F_FLOAT3 factor, int p_triclinic, F_FLOAT3 factor2) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < _nlocal && _mask[i] & groupbit) { + F_FLOAT* my_f = _f + i; + V_FLOAT* my_v = _v + i; + + V_FLOAT dtfm = _dtf; + + if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i]; + else dtfm *= V_F(1.0) / _mass[_type[i]]; + + V_FLOAT vx = my_v[0]; + V_FLOAT vy = my_v[_nmax]; + V_FLOAT vz = my_v[2 * _nmax]; + vx *= factor.x; + vy *= factor.y; + vz *= factor.z; + + if(p_triclinic) { + vx += vy * factor2.z + vz * factor2.y; + vy += vz * factor2.x; + } + + vx *= factor.x; + vy *= factor.y; + vz *= factor.z; + my_v[0] = vx + dtfm * my_f[0]; + my_v[_nmax] = vy + dtfm * my_f[_nmax]; + my_v[2 * _nmax] = vz + dtfm * my_f[_nmax * 2]; + } + +} + +__global__ void FixNHCuda_nve_v_Kernel(int groupbit) +{ + + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < _nlocal && _mask[i] & groupbit) { + F_FLOAT* my_f = _f + i; + V_FLOAT* my_v = _v + i; + + V_FLOAT dtfm = _dtf; + + if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i]; + else dtfm *= V_F(1.0) / _mass[_type[i]]; + + *my_v = (*my_v + dtfm * (*my_f)); + my_f += _nmax; + my_v += _nmax; + *my_v = (*my_v + dtfm * (*my_f)); + my_f += _nmax; + my_v += _nmax; + *my_v = (*my_v + dtfm * (*my_f)); + } +} + +__global__ void FixNHCuda_nve_x_Kernel(int groupbit) +{ + X_FLOAT xtmp, ytmp, ztmp; + + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < _nlocal && _mask[i] & groupbit) { + V_FLOAT* my_v = _v + i; + X_FLOAT* my_x = _x + i; + + xtmp = *my_x += _dtv * *my_v; + my_v += _nmax; + my_x += _nmax; + ytmp = *my_x += _dtv * *my_v; + my_v += _nmax; + my_x += _nmax; + ztmp = *my_x += _dtv * *my_v; + } + + check_distance(xtmp, ytmp, ztmp, i, groupbit); +} + + +__global__ void FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel(int groupbit, F_FLOAT3 factor, int p_triclinic, F_FLOAT3 factor2) +{ + + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < _nlocal && _mask[i] & groupbit) { + F_FLOAT* my_f = _f + i; + V_FLOAT* my_v = _v + i; + + V_FLOAT dtfm = _dtf; + + if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i]; + else dtfm *= V_F(1.0) / _mass[_type[i]]; + + V_FLOAT vx = my_v[0] + dtfm * my_f[0]; + V_FLOAT vy = my_v[_nmax] + dtfm * my_f[_nmax]; + V_FLOAT vz = my_v[2 * _nmax] + dtfm * my_f[2 * _nmax]; + + vx *= factor.x; + vy *= factor.y; + vz *= factor.z; + + if(p_triclinic) { + 
vx += vy * factor2.z + vz * factor2.y; + vy += vz * factor2.x; + } + + vx *= factor.x; + vy *= factor.y; + vz *= factor.z; + my_v[0] = vx; + my_v[_nmax] = vy; + my_v[2 * _nmax] = vz; + + } +} + diff --git a/lib/cuda/fix_nve_cuda.cu b/lib/cuda/fix_nve_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..8acddcd6f189af2df642b44676919ebeefce51a0 --- /dev/null +++ b/lib/cuda/fix_nve_cuda.cu @@ -0,0 +1,134 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> +#define MY_PREFIX fix_nve_cuda +#define IncludeCommonNeigh +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" +#include "fix_nve_cuda_cu.h" +#include "fix_nve_cuda_kernel.cu" + +void Cuda_FixNVECuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_FLOAT*)); //might be moved to a neighbor record in sdata + cudaMemcpyToSymbol(MY_AP(maxhold) , & sdata->atom.maxhold, sizeof(int)); //might be moved to a neighbor record in sdata + cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata + cudaMemcpyToSymbol(MY_AP(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT)); //might be moved to a neighbor record in sdata +} + +void Cuda_FixNVECuda_UpdateBuffer(cuda_shared_data* sdata) +{ + int size = (unsigned)10 * sizeof(int); + + if(sdata->buffersize < size) { + MYDBG(printf("Cuda_FixNVECuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize = size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);) + + } + + cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata +} + +void 
Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf) +{ + cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(dtf) , & dtf , sizeof(V_FLOAT)); + cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(dist_check), & sdata->atom.dist_check , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag , sizeof(int)); // + Cuda_FixNVECuda_UpdateNmax(sdata); +} + + +void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp +{ + if(sdata->atom.update_nmax) + Cuda_FixNVECuda_UpdateNmax(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + if(sdata->buffer_new) + Cuda_FixNVECuda_UpdateBuffer(sdata); + + int3 layout = getgrid(mynlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + cudaMemset(sdata->buffer, 0, sizeof(int)); + FixNVECuda_InitialIntegrate_Kernel <<< grid, threads>>> (groupbit); + cudaThreadSynchronize(); + int reneigh_flag; + cudaMemcpy((void*)(&reneigh_flag), sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost); + sdata->atom.reneigh_flag += reneigh_flag; + CUT_CHECK_ERROR("Cuda_FixNVECuda_InitialIntegrate_N: fix nve initial integrate Kernel execution failed"); + +} + +void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp +{ + if(sdata->atom.update_nmax) + Cuda_FixNVECuda_UpdateNmax(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + if(sdata->buffer_new) + Cuda_FixNVECuda_UpdateBuffer(sdata); + +#ifdef CUDA_USE_BINNING + + dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_neighbors[2], 1); + dim3 threads(sdata->domain.bin_nmax, 1, 1); + FixNVECuda_FinalIntegrate_Kernel <<< grid, threads>>> (groupbit); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixNVECuda_FinalIntegrate: fix nve final integrate (binning) Kernel execution failed"); + +#else + + int3 layout = getgrid(mynlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + FixNVECuda_FinalIntegrate_Kernel <<< grid, threads>>> (groupbit); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixNVECuda_FinalIntegrate: fix nve final integrate Kernel execution failed"); + +#endif +} + diff --git a/lib/cuda/fix_nve_cuda_cu.h b/lib/cuda/fix_nve_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..90b393c9ec48086b27d80bb89e256b3262faa396 --- /dev/null +++ b/lib/cuda/fix_nve_cuda_cu.h @@ -0,0 +1,28 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. 
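Cuda_FixNVECuda_InitialIntegrate above shows the package's device-to-host signalling pattern: zero one int at the front of the shared buffer, launch, synchronize, copy the int back, and fold it into atom.reneigh_flag so the host knows a reneighboring is due. The skeleton of that round trip as a hypothetical helper (error checking omitted, as in the original):

    // Clears a one-int device flag, runs a kernel that may set flag[0] = 1,
    // and returns the flag value to the host. 'launch' is any callable that
    // performs the kernel launch.
    template<class Launch>
    int roundtrip_flag(void* dev_flag, Launch launch)
    {
      cudaMemset(dev_flag, 0, sizeof(int));
      launch();
      cudaThreadSynchronize();   // legacy synchronization call used throughout this package
      int flag = 0;
      cudaMemcpy(&flag, dev_flag, sizeof(int), cudaMemcpyDeviceToHost);
      return flag;
    }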
+ + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf); +extern "C" void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal); +extern "C" void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal); diff --git a/lib/cuda/fix_nve_cuda_kernel.cu b/lib/cuda/fix_nve_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..c99439adb4b5605213affedfbeb153e99e0c6682 --- /dev/null +++ b/lib/cuda/fix_nve_cuda_kernel.cu @@ -0,0 +1,166 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +static inline __device__ void check_distance(X_FLOAT &xtmp, X_FLOAT &ytmp, X_FLOAT &ztmp, int &i, int groupbit) +{ + if(_dist_check) { + X_FLOAT tmp = xtmp - _xhold[i]; + X_FLOAT d = tmp * tmp; + tmp = ytmp - _xhold[i + _maxhold]; + d += tmp * tmp; + tmp = ztmp - _xhold[i + 2 * _maxhold]; + d += tmp * tmp; + + d = ((i < _nlocal) && (_mask[i] & groupbit)) ? 
d : X_F(0.0);
+
+    if(not __all(d <= _triggerneighsq))
+      _reneigh_flag[0] = 1;
+  }
+}
+
+
+__global__ void FixNVECuda_InitialIntegrate_Kernel(int groupbit)
+{
+  X_FLOAT xtmp, ytmp, ztmp;
+#ifdef CUDA_USE_BINNING
+
+  const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
+
+  if(threadIdx.x < _bin_count_local[bin]) {
+    const int i = 3 * blockDim.x * bin + threadIdx.x;
+
+    if(_mask[i] & groupbit) {
+      F_FLOAT* my_f = _binned_f + i;
+      V_FLOAT* my_v = _binned_v + i;
+      X_FLOAT* my_x = _binned_x + i;
+
+      V_FLOAT dtfm = _dtf;
+
+      if(_rmass_flag) dtfm *= V_F(1.0) / _binned_rmass[i];
+      else dtfm *= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];
+
+      V_FLOAT v_mem;
+      v_mem = *my_v += dtfm * (*my_f);
+      xtmp = *my_x += _dtv * v_mem;
+      my_f += blockDim.x;
+      my_v += blockDim.x;
+      my_x += blockDim.x;
+      v_mem = *my_v += dtfm * (*my_f);
+      ytmp = *my_x += _dtv * v_mem;
+      my_f += blockDim.x;
+      my_v += blockDim.x;
+      my_x += blockDim.x;
+      v_mem = *my_v += dtfm * (*my_f);
+      ztmp = *my_x += _dtv * v_mem;
+    }
+  }
+
+#else
+
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nlocal && _mask[i] & groupbit) {
+    F_FLOAT* my_f = _f + i;
+    V_FLOAT* my_v = _v + i;
+    X_FLOAT* my_x = _x + i;
+
+    V_FLOAT dtfm = _dtf;
+
+    if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
+    else dtfm *= V_F(1.0) / _mass[_type[i]];
+
+    V_FLOAT v_mem;
+    v_mem = *my_v += dtfm * (*my_f);
+    xtmp = *my_x += _dtv * v_mem;
+    my_f += _nmax;
+    my_v += _nmax;
+    my_x += _nmax;
+    v_mem = *my_v += dtfm * (*my_f);
+    ytmp = *my_x += _dtv * v_mem;
+    my_f += _nmax;
+    my_v += _nmax;
+    my_x += _nmax;
+    v_mem = *my_v += dtfm * (*my_f);
+    ztmp = *my_x += _dtv * v_mem;
+  }
+
+#endif
+
+  check_distance(xtmp, ytmp, ztmp, i, groupbit);
+}
+
+__global__ void FixNVECuda_FinalIntegrate_Kernel(int groupbit)
+{
+#ifdef CUDA_USE_BINNING
+
+  const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
+
+  if(threadIdx.x < _bin_count_local[bin]) {
+    const int i = 3 * blockDim.x * bin + threadIdx.x;
+
+    if(_mask[i] & groupbit) {
+      F_FLOAT* my_f = _binned_f + i;
+      V_FLOAT* my_v = _binned_v + i;
+
+      V_FLOAT dtfm = _dtf;
+
+      if(_rmass_flag) dtfm *= V_F(1.0) / _binned_rmass[i];
+      else dtfm *= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];
+
+      *my_v += dtfm * (*my_f);
+      my_f += blockDim.x;
+      my_v += blockDim.x;
+      *my_v += dtfm * (*my_f);
+      my_f += blockDim.x;
+      my_v += blockDim.x;
+      *my_v += dtfm * (*my_f);
+    }
+  }
+
+#else
+
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nlocal && _mask[i] & groupbit) {
+    F_FLOAT* my_f = _f + i;
+    V_FLOAT* my_v = _v + i;
+
+    V_FLOAT dtfm = _dtf;
+
+    if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
+    else dtfm *= V_F(1.0) / _mass[_type[i]];
+
+    *my_v += dtfm * (*my_f);
+    my_f += _nmax;
+    my_v += _nmax;
+    *my_v += dtfm * (*my_f);
+    my_f += _nmax;
+    my_v += _nmax;
+    *my_v += dtfm * (*my_f);
+  }
+
+#endif
+}
+
+
+
diff --git a/lib/cuda/fix_set_force_cuda.cu b/lib/cuda/fix_set_force_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..afa1a4789cd5bb311ec458facbd54f13aa9f7466
--- /dev/null
+++ b/lib/cuda/fix_set_force_cuda.cu
@@ -0,0 +1,96 @@
+/* ----------------------------------------------------------------------
+ LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+ Original Version:
+ http://lammps.sandia.gov, Sandia National Laboratories
+ Steve Plimpton, sjplimp@sandia.gov
+
+ See the README file in the top-level LAMMPS directory.
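A note on check_distance() above: it hinges on the warp vote __all(). The predicate d <= _triggerneighsq is true warp-wide only when no atom in the warp has moved past the trigger distance, so negating the vote raises the re-neighbor flag with one redundant but harmless store per offending warp. The idiom in isolation (hypothetical names; note that __all() is the pre-CUDA-9 intrinsic, and current code would use __all_sync() with an explicit lane mask):

    // Standalone form of the warp-vote trigger used by check_distance() above.
    __global__ void vote_trigger(const float* dsq, float trigger, int n, int* flag)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      float my_dsq = (i < n) ? dsq[i] : 0.0f;   // out-of-range lanes vote "inside"
      if(!__all(my_dsq <= trigger))
        flag[0] = 1;                            // any lane over the trigger flips the flag
    }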
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> +#define MY_PREFIX fix_set_force_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" + +#include "fix_set_force_cuda_cu.h" +#include "fix_set_force_cuda_kernel.cu" + +void Cuda_FixSetForceCuda_UpdateBuffer(cuda_shared_data* sdata) +{ + int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_FLOAT); + + if(sdata->buffersize < size) { + MYDBG(printf("Cuda_FixSetForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize = size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);) + + } + + cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*)); +} + +void Cuda_FixSetForceCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); +} + +void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata) +{ + Cuda_FixSetForceCuda_UpdateNmax(sdata); + +} + + +void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue, F_FLOAT* foriginal, int flagx, int flagy, int flagz) +{ + if(sdata->atom.update_nmax) + Cuda_FixSetForceCuda_UpdateNmax(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + if(sdata->buffer_new) + Cuda_FixSetForceCuda_UpdateBuffer(sdata); + + + int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + Cuda_FixSetForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_FLOAT)>>> (groupbit, xvalue, yvalue, zvalue, flagx, flagy, flagz); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force compute Kernel execution failed"); + + int oldgrid = grid.x; + grid.x = 3; + threads.x = 512; + Cuda_FixSetForceCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_FLOAT)>>> (oldgrid, foriginal); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force reduce Kernel execution failed"); + +} diff --git a/lib/cuda/fix_set_force_cuda_cu.h b/lib/cuda/fix_set_force_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..63e528acce1aa50c8601863115bcad21febc4bf3 --- /dev/null +++ b/lib/cuda/fix_set_force_cuda_cu.h @@ -0,0 +1,27 @@ +/* 
---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue, F_FLOAT* foriginal, int flagx, int flagy, int flagz); diff --git a/lib/cuda/fix_set_force_cuda_kernel.cu b/lib/cuda/fix_set_force_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..ee1a59062299831d49ff722b9256658186216680 --- /dev/null +++ b/lib/cuda/fix_set_force_cuda_kernel.cu @@ -0,0 +1,86 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
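The kernels in this file lean on reduceBlock() from crm_cuda_utils.cu, which is not reproduced in this diff. Judging from how the callers size their dynamic shared memory, it is assumed to be the standard shared-memory tree reduction; a sketch under that assumption:

    // Assumed behaviour of reduceBlock(): on return, shared[0] holds the sum of
    // shared[0..blockDim.x-1]. Requires blockDim.x to be a power of two.
    __device__ void reduceBlock_sketch(float* shared)
    {
      for(unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) {
        __syncthreads();                  // previous round (or the initial stores) must finish
        if(threadIdx.x < s)
          shared[threadIdx.x] += shared[threadIdx.x + s];
      }
      __syncthreads();
    }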
+------------------------------------------------------------------------- */ + +extern __shared__ F_FLOAT sharedmem[]; + + +__global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue, int flagx, int flagy, int flagz) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + sharedmem[threadIdx.x] = 0; + sharedmem[threadIdx.x + blockDim.x] = 0; + sharedmem[threadIdx.x + 2 * blockDim.x] = 0; + + if(i < _nlocal) + if(_mask[i] & groupbit) { + sharedmem[threadIdx.x] = _f[i]; + sharedmem[threadIdx.x + blockDim.x] = _f[i + 1 * _nmax]; + sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 2 * _nmax]; + + if(flagx) _f[i] = xvalue; + + if(flagy) _f[i + 1 * _nmax] = yvalue; + + if(flagz) _f[i + 2 * _nmax] = zvalue; + } + + + reduceBlock(sharedmem); + reduceBlock(&sharedmem[blockDim.x]); + reduceBlock(&sharedmem[2 * blockDim.x]); + F_FLOAT* buffer = (F_FLOAT*)_buffer; + + if(threadIdx.x == 0) { + buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x]; + } +} + + +__global__ void Cuda_FixSetForceCuda_Reduce_FOriginal(int n, F_FLOAT* foriginal) +{ + int i = 0; + sharedmem[threadIdx.x] = 0; + F_FLOAT myforig = 0.0; + F_FLOAT* buf = (F_FLOAT*)_buffer; + buf = &buf[blockIdx.x * n]; + + while(i < n) { + sharedmem[threadIdx.x] = 0; + + if(i + threadIdx.x < n) + sharedmem[threadIdx.x] = buf[i + threadIdx.x]; + + __syncthreads(); + reduceBlock(sharedmem); + i += blockDim.x; + + if(threadIdx.x == 0) + myforig += sharedmem[0]; + } + + if(threadIdx.x == 0) + foriginal[blockIdx.x] = myforig; +} + diff --git a/lib/cuda/fix_shake_cuda.cu b/lib/cuda/fix_shake_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..e27f54e9686517f01cc6b73c691ece40dc41169c --- /dev/null +++ b/lib/cuda/fix_shake_cuda.cu @@ -0,0 +1,297 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
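fix_shake_cuda.cu below extends the per-file constant-memory table with its own symbols via MY_AP(). The macro itself is defined in the common headers rather than in this diff; a plausible reading is plain token pasting, so every translation unit that sets MY_PREFIX gets a private set of __constant__ variables that the host side fills with cudaMemcpyToSymbol:

    // Assumed shape of the MY_AP() machinery (the real definition lives in
    // cuda_shared.h / cuda_common.h and may differ):
    #define MY_AP_CONCAT(a, b) a##_##b
    #define MY_AP_EVAL(a, b)   MY_AP_CONCAT(a, b)
    #define MY_AP(name)        MY_AP_EVAL(MY_PREFIX, name)
    // With  #define MY_PREFIX fix_shake_cuda  this gives, e.g.:
    //   MY_AP(dtfsq) -> fix_shake_cuda_dtfsq
    //   cudaMemcpyToSymbol(MY_AP(dtfsq), &dtfsq, sizeof(F_FLOAT));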
+------------------------------------------------------------------------- */ + +#include <stdio.h> +#define MY_PREFIX fix_shake_cuda +#include "cuda_shared.h" +#include "cuda_common.h" +#include "crm_cuda_utils.cu" +#include "fix_shake_cuda_cu.h" +#include "cuda_pair_virial_kernel_nc.cu" + +#define _shake_atom MY_AP(shake_atom) +#define _shake_type MY_AP(shake_type) +#define _shake_flag MY_AP(shake_flag) +#define _xshake MY_AP(xshake) +#define _dtfsq MY_AP(dtfsq) +#define _bond_distance MY_AP(bond_distance) +#define _angle_distance MY_AP(angle_distance) +#define _max_iter MY_AP(max_iter) +#define _tolerance MY_AP(tolerance) +__device__ __constant__ int* _shake_atom; +__device__ __constant__ int* _shake_type; +__device__ __constant__ int* _shake_flag; +__device__ __constant__ X_FLOAT3* _xshake; +__device__ __constant__ F_FLOAT _dtfsq; +__device__ __constant__ X_FLOAT* _bond_distance; +__device__ __constant__ X_FLOAT* _angle_distance; +__device__ __constant__ int _max_iter; +__device__ __constant__ X_FLOAT _tolerance; + +#include "fix_shake_cuda_kernel.cu" + +void Cuda_FixShakeCuda_UpdateNmax(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(map_array), & sdata->atom.map_array .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom.dev_data, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(debugdata), & sdata->debugdata , sizeof(int*)); +} + +void Cuda_FixShakeCuda_UpdateDomain(cuda_shared_data* sdata) +{ + cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity , sizeof(int) * 3); + cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd , sizeof(X_FLOAT) * 3); + cudaMemcpyToSymbol(MY_AP(triclinic) , &sdata->domain.triclinic , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(h) , sdata->domain.h , sizeof(X_FLOAT) * 6); +} + +void Cuda_FixShakeCuda_UpdateBuffer(cuda_shared_data* sdata, int size) +{ + if(sdata->buffersize < size) { + MYDBG(printf("Cuda_FixShakeCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize = size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);) + + } + + cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*)); +} + +void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, F_FLOAT dtfsq, + void* shake_flag, void* shake_atom, void* shake_type, void* xshake, + void* bond_distance, void* angle_distance, void* virial, + int max_iter, X_FLOAT tolerance) +{ + Cuda_FixShakeCuda_UpdateNmax(sdata); + Cuda_FixShakeCuda_UpdateDomain(sdata); + cudaMemcpyToSymbol(MY_AP(shake_atom) , & shake_atom , sizeof(void*)); + cudaMemcpyToSymbol(MY_AP(shake_type) , & shake_type , sizeof(void*)); + cudaMemcpyToSymbol(MY_AP(shake_flag) , & 
shake_flag , sizeof(void*)); + cudaMemcpyToSymbol(MY_AP(xshake) , & xshake , sizeof(void*)); + cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(dtfsq) , & dtfsq , sizeof(F_FLOAT)); + cudaMemcpyToSymbol(MY_AP(bond_distance) , & bond_distance , sizeof(void*)); + cudaMemcpyToSymbol(MY_AP(angle_distance) , & angle_distance , sizeof(void*)); + cudaMemcpyToSymbol(MY_AP(virial) , & virial , sizeof(void*)); + cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag , sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(max_iter) , &max_iter , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(tolerance) , &tolerance , sizeof(X_FLOAT)); + + if(sdata->atom.mass_host) + cudaMemcpyToSymbol(MY_AP(mass), & sdata->atom.mass.dev_data , sizeof(V_FLOAT*)); + + cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag , sizeof(int)); // + + cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag, sizeof(int*)); + +} + +void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata) +{ + if(sdata->atom.update_nmax) + Cuda_FixShakeCuda_UpdateNmax(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + if(sdata->buffer_new) + Cuda_FixShakeCuda_UpdateBuffer(sdata, 10 * sizeof(double)); + + int3 layout = getgrid(sdata->atom.nlocal); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + FixShakeCuda_UnconstrainedUpdate_Kernel <<< grid, threads>>> (); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("FixShakeCuda_UnconstrainedUpdate: Kernel execution failed"); +} + +void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata, int vflag, int vflag_atom, int* list, int nlist) +{ + if(sdata->atom.update_nmax) + Cuda_FixShakeCuda_UpdateNmax(sdata); + + if(sdata->domain.update) + Cuda_FixShakeCuda_UpdateDomain(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + int3 layout = getgrid(sdata->atom.nlocal, 6 * sizeof(ENERGY_FLOAT), 64); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->buffer_new) + Cuda_FixShakeCuda_UpdateBuffer(sdata, grid.x * grid.y * 6 * sizeof(ENERGY_FLOAT)); + + BindXTypeTexture(sdata); + + FixShakeCuda_Shake_Kernel <<< grid, threads, 6* threads.x* sizeof(ENERGY_FLOAT)>>> (vflag, vflag_atom, list, nlist); + cudaThreadSynchronize(); + + CUT_CHECK_ERROR("FixShakeCuda_Shake: Kernel execution failed"); + + if(vflag) { + int n = grid.x * grid.y; + grid.x = 6; + grid.y = 1; + threads.x = 256; + MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixShakeCuda: (no binning) virial compute Kernel execution failed"); + } + +} + +int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag) +{ + if(sdata->atom.update_nmax) + Cuda_FixShakeCuda_UpdateNmax(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + int size = n * 3 * sizeof(X_FLOAT); + + if(sdata->buffer_new or (size > sdata->buffersize)) + Cuda_FixShakeCuda_UpdateBuffer(sdata, size); + + X_FLOAT dx = 0.0; + X_FLOAT dy = 0.0; + X_FLOAT dz = 0.0; + + if(pbc_flag != 0) { + if(sdata->domain.triclinic == 0) { + dx = pbc[0] * sdata->domain.prd[0]; + dy = pbc[1] * sdata->domain.prd[1]; + dz = pbc[2] * sdata->domain.prd[2]; + } else { + dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz; + dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * 
sdata->domain.yz; + dz = pbc[2] * sdata->domain.prd[2]; + } + } + + int3 layout = getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal > 0) { + cudaMemset(sdata->flag, 0, sizeof(int)); + FixShakeCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz); + cudaThreadSynchronize(); + cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost); + int aflag; + cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost); + + if(aflag != 0) printf("aflag PackComm: %i\n", aflag); + CUT_CHECK_ERROR("Cuda_FixShakeCuda_PackComm: Kernel execution failed"); + + } + + return 3 * n; +} + +int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag) +{ + if(sdata->atom.update_nmax) + Cuda_FixShakeCuda_UpdateNmax(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + int size = n * 3 * sizeof(X_FLOAT); + + if(sdata->buffer_new or (size > sdata->buffersize)) + Cuda_FixShakeCuda_UpdateBuffer(sdata, size); + + static int count = -1; + count++; + X_FLOAT dx = 0.0; + X_FLOAT dy = 0.0; + X_FLOAT dz = 0.0; + + if(pbc_flag != 0) { + if(sdata->domain.triclinic == 0) { + dx = pbc[0] * sdata->domain.prd[0]; + dy = pbc[1] * sdata->domain.prd[1]; + dz = pbc[2] * sdata->domain.prd[2]; + } else { + dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz; + dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz; + dz = pbc[2] * sdata->domain.prd[2]; + } + } + + + + int3 layout = getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal > 0) { + FixShakeCuda_PackComm_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed"); + } + + return 3 * n; +} + +void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv) +{ + if(sdata->atom.update_nmax) + Cuda_FixShakeCuda_UpdateNmax(sdata); + + if(sdata->atom.update_nlocal) + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + + int size = n * 3 * sizeof(X_FLOAT); + + if(sdata->buffer_new or (size > sdata->buffersize)) + Cuda_FixShakeCuda_UpdateBuffer(sdata, size); + + int3 layout = getgrid(n); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + if(sdata->atom.nlocal > 0) { + cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_FLOAT), cudaMemcpyHostToDevice); + FixShakeCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_FixShakeCuda_UnpackComm: Kernel execution failed"); + + } +} diff --git a/lib/cuda/fix_shake_cuda_cu.h b/lib/cuda/fix_shake_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..9b808a7216e6d96bc3635bc24d86bcf2a69c9819 --- /dev/null +++ b/lib/cuda/fix_shake_cuda_cu.h @@ -0,0 +1,34 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
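The pack routines above precompute one displacement per swap from the image counts in pbc[]: whole box lengths for an orthogonal box, plus the xy/xz/yz tilt contributions for a triclinic one. The same computation as a standalone function (hypothetical signature, mirroring the expressions used above):

    // PBC displacement for a communication swap: pbc[0..2] are image counts
    // along x/y/z, pbc[3..5] select the yz/xz/xy tilt contributions.
    void pbc_shift(const int* pbc, const double* prd, double xy, double xz, double yz,
                   int triclinic, double &dx, double &dy, double &dz)
    {
      if(triclinic == 0) {
        dx = pbc[0] * prd[0];
        dy = pbc[1] * prd[1];
        dz = pbc[2] * prd[2];
      } else {
        dx = pbc[0] * prd[0] + pbc[5] * xy + pbc[4] * xz;
        dy = pbc[1] * prd[1] + pbc[3] * yz;
        dz = pbc[2] * prd[2];
      }
    }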
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ +#include "cuda_shared.h" + +extern "C" void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, F_FLOAT dtfsq, + void* shake_flag, void* shake_atom, void* shake_type, void* xshake, + void* bond_distance, void* angle_distance, void* virial, + int max_iter, X_FLOAT tolerance); +extern "C" void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata); +extern "C" void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata, int vflag, int vflag_atom, int* list, int nlist); +extern "C" int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag); +extern "C" int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag); +extern "C" void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv); + diff --git a/lib/cuda/fix_shake_cuda_kernel.cu b/lib/cuda/fix_shake_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..da176d0770af48241cafe074d147b0f49fd75479 --- /dev/null +++ b/lib/cuda/fix_shake_cuda_kernel.cu @@ -0,0 +1,1060 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
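minimum_image() below wraps each component with nested ternaries instead of an if/else chain, which keeps all threads of a warp on the same execution path. One orthogonal component in isolation (hypothetical standalone form):

    // Branchless minimum-image wrap for one orthogonal dimension: shift by a
    // full box length when the separation exceeds half the box, else leave it.
    __device__ double min_image_1d(double delta, double prd, int periodic)
    {
      if(periodic)
        delta += delta < -0.5 * prd ?  prd :
                (delta >  0.5 * prd ? -prd : 0.0);
      return delta;
    }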
+------------------------------------------------------------------------- */
+
+__device__ void v_tally(int &vflag_global, int &vflag_atom, int &n, int* list, ENERGY_FLOAT total, ENERGY_FLOAT* v)
+{
+  /*if(vflag_global)
+  {
+    ENERGY_FLOAT fraction = n/total;
+    ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
+    *shared += fraction*v[0]; shared+=blockDim.x;
+    *shared += fraction*v[1]; shared+=blockDim.x;
+    *shared += fraction*v[2]; shared+=blockDim.x;
+    *shared += fraction*v[3]; shared+=blockDim.x;
+    *shared += fraction*v[4]; shared+=blockDim.x;
+    *shared += fraction*v[5];
+  }*/
+  if(vflag_atom) {
+    ENERGY_FLOAT fraction = ENERGY_F(1.0) / total;
+
+    for(int i = 0; i < n; i++) {
+      int m = list[i];
+      ENERGY_FLOAT* myvatom = &_vatom[m];
+
+      *myvatom += fraction * v[0];
+      myvatom += _nmax;
+      *myvatom += fraction * v[1];
+      myvatom += _nmax;
+      *myvatom += fraction * v[2];
+      myvatom += _nmax;
+      *myvatom += fraction * v[3];
+      myvatom += _nmax;
+      *myvatom += fraction * v[4];
+      myvatom += _nmax;
+      *myvatom += fraction * v[5];
+    }
+  }
+}
+
+inline __device__ void minimum_image(X_FLOAT3 &delta)
+{
+  if(_triclinic == 0) {
+    if(_periodicity[0]) {
+      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
+                 (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
+    }
+
+    if(_periodicity[1]) {
+      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
+                 (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
+    }
+
+    if(_periodicity[2]) {
+      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
+                 (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
+    }
+
+  } else {
+    if(_periodicity[2]) {
+      // apply the yz/xz tilt corrections before wrapping delta.z itself,
+      // so that all three tests see the unwrapped z separation
+      delta.y += delta.z < -X_F(0.5) * _prd[2] ? _h[3] :
+                 (delta.z > X_F(0.5) * _prd[2] ? -_h[3] : X_F(0.0));
+      delta.x += delta.z < -X_F(0.5) * _prd[2] ? _h[4] :
+                 (delta.z > X_F(0.5) * _prd[2] ? -_h[4] : X_F(0.0));
+      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
+                 (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
+    }
+
+    if(_periodicity[1]) {
+      // likewise: xy tilt correction first, then the y wrap
+      delta.x += delta.y < -X_F(0.5) * _prd[1] ? _h[5] :
+                 (delta.y > X_F(0.5) * _prd[1] ? -_h[5] : X_F(0.0));
+      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
+                 (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
+    }
+
+    if(_periodicity[0]) {
+      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
+                 (delta.x > X_F(0.5) * _prd[0] ?
-_prd[0] : X_F(0.0)); + } + } +} + +__global__ void FixShakeCuda_UnconstrainedUpdate_Kernel() +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i >= _nlocal) return; + + X_FLOAT3 my_xshake = {X_F(0.0), X_F(0.0), X_F(0.0)}; + + if(_shake_flag[i]) { + F_FLOAT* my_f = _f + i; + V_FLOAT* my_v = _v + i; + X_FLOAT* my_x = _x + i; + + V_FLOAT dtfmsq = _dtfsq; + + if(_rmass_flag) dtfmsq *= V_F(1.0) / _rmass[i]; + else dtfmsq *= V_F(1.0) / _mass[_type[i]]; + + my_xshake.x = *my_x + _dtv* *my_v + dtfmsq* *my_f; + my_f += _nmax; + my_v += _nmax; + my_x += _nmax; + my_xshake.y = *my_x + _dtv* *my_v + dtfmsq* *my_f; + my_f += _nmax; + my_v += _nmax; + my_x += _nmax; + my_xshake.z = *my_x + _dtv* *my_v + dtfmsq* *my_f; + } + + _xshake[i] = my_xshake; +} + + + + +__device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m) +{ + int nlist, list[2]; + ENERGY_FLOAT v[6]; + X_FLOAT invmass0, invmass1; + + // local atom IDs and constraint distances + + int i0 = _map_array[_shake_atom[m]]; + int i1 = _map_array[_shake_atom[m + _nmax]]; + X_FLOAT bond1 = _bond_distance[_shake_type[m]]; + + // r01 = distance vec between atoms, with PBC + + X_FLOAT3 r01; + + X_FLOAT4 x_i0, x_i1; + x_i0 = fetchXType(i0); + x_i1 = fetchXType(i1); + + r01.x = x_i0.x - x_i1.x; + r01.y = x_i0.y - x_i1.y; + r01.z = x_i0.z - x_i1.z; + minimum_image(r01); + + // s01 = distance vec after unconstrained update, with PBC + + X_FLOAT3 s01; + X_FLOAT3 xs_i0 = _xshake[i0]; + X_FLOAT3 xs_i1 = _xshake[i1]; + + s01.x = xs_i0.x - xs_i1.x; + s01.y = xs_i0.y - xs_i1.y; + s01.z = xs_i0.z - xs_i1.z; + minimum_image(s01); + + // scalar distances between atoms + + X_FLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z; + X_FLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z; + + // a,b,c = coeffs in quadratic equation for lamda + + if(_rmass_flag) { + invmass0 = X_F(1.0) / _rmass[i0]; + invmass1 = X_F(1.0) / _rmass[i1]; + } else { + invmass0 = X_F(1.0) / _mass[static_cast <int>(x_i0.w)]; + invmass1 = X_F(1.0) / _mass[static_cast <int>(x_i1.w)]; + } + + X_FLOAT a = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq; + X_FLOAT b = X_F(2.0) * (invmass0 + invmass1) * + (s01.x * r01.x + s01.y * r01.y + s01.z * r01.z); + X_FLOAT c = s01sq - bond1 * bond1; + + // error check + + X_FLOAT determ = b * b - X_F(4.0) * a * c; + + if(determ < X_F(0.0)) { + _flag[0]++; + determ = X_F(0.0); + } + + // exact quadratic solution for lamda + + X_FLOAT lamda, lamda1, lamda2; + lamda1 = -b + _SQRT_(determ); + lamda2 = -lamda1 - X_F(2.0) * b; + lamda1 *= X_F(1.0) / (X_F(2.0) * a); + lamda2 *= X_F(1.0) / (X_F(2.0) * a); + + lamda = (fabs(lamda1) <= fabs(lamda2)) ? lamda1 : lamda2; + + // update forces if atom is owned by this processor + + lamda *= X_F(1.0) / _dtfsq; + + + //attenion: are shake clusters <-> atom unique? 
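+  // Constraint forces are applied only to atoms owned by this process
+  // (i < _nlocal); the ghost image of a cluster atom is corrected by its
+  // owning process. 'list' records which atoms were actually updated so
+  // v_tally() can split the virial contribution between them below.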
+ nlist = 0; + + if(i0 < _nlocal) { + _f[i0] += lamda * r01.x; + _f[i0 + _nmax] += lamda * r01.y; + _f[i0 + 2 * _nmax] += lamda * r01.z; + list[nlist++] = i0; + } + + if(i1 < _nlocal) { + _f[i1] -= lamda * r01.x; + _f[i1 + _nmax] -= lamda * r01.y; + _f[i1 + 2 * _nmax] -= lamda * r01.z; + list[nlist++] = i1; + } + + if(vflag || vflag_atom) { + ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; + X_FLOAT factor = nlist; + v[0] = lamda * r01.x * r01.x; + *shared = factor * v[0]; + shared += blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5 + v[1] = lamda * r01.y * r01.y; + *shared = factor * v[1]; + shared += blockDim.x; + v[2] = lamda * r01.z * r01.z; + *shared = factor * v[2]; + shared += blockDim.x; + v[3] = lamda * r01.x * r01.y; + *shared = factor * v[3]; + shared += blockDim.x; + v[4] = lamda * r01.x * r01.z; + *shared = factor * v[4]; + shared += blockDim.x; + v[5] = lamda * r01.y * r01.z; + *shared = factor * v[5]; + shared += blockDim.x; + + v_tally(vflag, vflag_atom, nlist, list, 2.0, v); + } +} + + +__device__ void FixShakeCuda_Shake3(int &vflag, int &vflag_atom, int &m) +{ + int nlist, list[3]; + ENERGY_FLOAT v[6]; + X_FLOAT invmass0, invmass1, invmass2; + + // local atom IDs and constraint distances + + int i0 = _map_array[_shake_atom[m]]; + int i1 = _map_array[_shake_atom[m + _nmax]]; + int i2 = _map_array[_shake_atom[m + 2 * _nmax]]; + X_FLOAT bond1 = _bond_distance[_shake_type[m]]; + X_FLOAT bond2 = _bond_distance[_shake_type[m + _nmax]]; + + // r01 = distance vec between atoms, with PBC + + X_FLOAT3 r01, r02; + + X_FLOAT4 x_i0, x_i1, x_i2; + x_i0 = fetchXType(i0); + x_i1 = fetchXType(i1); + x_i2 = fetchXType(i2); + + r01.x = x_i0.x - x_i1.x; + r01.y = x_i0.y - x_i1.y; + r01.z = x_i0.z - x_i1.z; + minimum_image(r01); + + r02.x = x_i0.x - x_i2.x; + r02.y = x_i0.y - x_i2.y; + r02.z = x_i0.z - x_i2.z; + minimum_image(r02); + + // s01 = distance vec after unconstrained update, with PBC + + X_FLOAT3 s01, s02; + X_FLOAT3 xs_i0 = _xshake[i0]; + X_FLOAT3 xs_i1 = _xshake[i1]; + X_FLOAT3 xs_i2 = _xshake[i2]; + + s01.x = xs_i0.x - xs_i1.x; + s01.y = xs_i0.y - xs_i1.y; + s01.z = xs_i0.z - xs_i1.z; + minimum_image(s01); + + s02.x = xs_i0.x - xs_i2.x; + s02.y = xs_i0.y - xs_i2.y; + s02.z = xs_i0.z - xs_i2.z; + minimum_image(s02); + + // scalar distances between atoms + + X_FLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z; + X_FLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z; + X_FLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z; + X_FLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z; + + // a,b,c = coeffs in quadratic equation for lamda + + if(_rmass_flag) { + invmass0 = X_F(1.0) / _rmass[i0]; + invmass1 = X_F(1.0) / _rmass[i1]; + invmass2 = X_F(1.0) / _rmass[i2]; + } else { + invmass0 = X_F(1.0) / _mass[static_cast <int>(x_i0.w)]; + invmass1 = X_F(1.0) / _mass[static_cast <int>(x_i1.w)]; + invmass2 = X_F(1.0) / _mass[static_cast <int>(x_i2.w)]; + } + + X_FLOAT a11 = X_F(2.0) * (invmass0 + invmass1) * + (s01.x * r01.x + s01.y * r01.y + s01.z * r01.z); + X_FLOAT a12 = X_F(2.0) * invmass0 * + (s01.x * r02.x + s01.y * r02.y + s01.z * r02.z); + X_FLOAT a21 = X_F(2.0) * invmass0 * + (s02.x * r01.x + s02.y * r01.y + s02.z * r01.z); + X_FLOAT a22 = X_F(2.0) * (invmass0 + invmass2) * + (s02.x * r02.x + s02.y * r02.y + s02.z * r02.z); + + // error check + + X_FLOAT determ = a11 * a22 - a12 * a21; + + if(determ == X_F(0.0)) _flag[0]++; + + X_FLOAT determinv = X_F(1.0) / determ; + + X_FLOAT a11inv = 
a22 * determinv;
+  X_FLOAT a12inv = -a12 * determinv;
+  X_FLOAT a21inv = -a21 * determinv;
+  X_FLOAT a22inv = a11 * determinv;
+
+  // quadratic correction coeffs
+
+  X_FLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z);
+
+  X_FLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
+  X_FLOAT quad1_0202 = invmass0 * invmass0 * r02sq;
+  X_FLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102;
+
+  X_FLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq;
+  X_FLOAT quad2_0101 = invmass0 * invmass0 * r01sq;
+  X_FLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102;
+
+  // iterate until converged
+
+  X_FLOAT lamda01 = X_F(0.0);
+  X_FLOAT lamda02 = X_F(0.0);
+  int niter = 0;
+  int done = 0;
+
+  X_FLOAT quad1, quad2, b1, b2, lamda01_new, lamda02_new;
+
+  // note: __any() keeps the whole warp looping until every lane has converged
+  // or _max_iter is reached
+  while(__any(!done) && niter < _max_iter) {
+    quad1 = quad1_0101 * lamda01 * lamda01 + quad1_0202 * lamda02 * lamda02 +
+            quad1_0102 * lamda01 * lamda02;
+    quad2 = quad2_0101 * lamda01 * lamda01 + quad2_0202 * lamda02 * lamda02 +
+            quad2_0102 * lamda01 * lamda02;
+
+    b1 = bond1 * bond1 - s01sq - quad1;
+    b2 = bond2 * bond2 - s02sq - quad2;
+
+    lamda01_new = a11inv * b1 + a12inv * b2;
+    lamda02_new = a21inv * b1 + a22inv * b2;
+
+    done++;
+    done = (fabs(lamda01_new - lamda01) > _tolerance) ? 0 : done;
+    done = (fabs(lamda02_new - lamda02) > _tolerance) ? 0 : done;
+
+    lamda01 = done < 2 ? lamda01_new : lamda01;
+    lamda02 = done < 2 ? lamda02_new : lamda02;
+    niter++;
+  }
+
+  // update forces if atom is owned by this processor
+
+  lamda01 *= X_F(1.0) / _dtfsq;
+  lamda02 *= X_F(1.0) / _dtfsq;
+
+
+  // attention: is the shake cluster <-> atom mapping unique?
+  nlist = 0;
+
+  if(i0 < _nlocal) {
+    _f[i0] += lamda01 * r01.x + lamda02 * r02.x;
+    _f[i0 + _nmax] += lamda01 * r01.y + lamda02 * r02.y;
+    _f[i0 + 2 * _nmax] += lamda01 * r01.z + lamda02 * r02.z;
+    list[nlist++] = i0;
+  }
+
+  if(i1 < _nlocal) {
+    _f[i1] -= lamda01 * r01.x;
+    _f[i1 + _nmax] -= lamda01 * r01.y;
+    _f[i1 + 2 * _nmax] -= lamda01 * r01.z;
+    list[nlist++] = i1;
+  }
+
+  if(i2 < _nlocal) {
+    _f[i2] -= lamda02 * r02.x;
+    _f[i2 + _nmax] -= lamda02 * r02.y;
+    _f[i2 + 2 * _nmax] -= lamda02 * r02.z;
+    list[nlist++] = i2;
+  }
+
+  if(vflag || vflag_atom) {
+    ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
+    X_FLOAT factor = X_F(2.0) / X_F(3.0) * nlist;
+    v[0] = lamda01 * r01.x * r01.x + lamda02 * r02.x * r02.x;
+    *shared = factor * v[0];
+    shared += blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5
+    v[1] = lamda01 * r01.y * r01.y + lamda02 * r02.y * r02.y;
+    *shared = factor * v[1];
+    shared += blockDim.x;
+    v[2] = lamda01 * r01.z * r01.z + lamda02 * r02.z * r02.z;
+    *shared = factor * v[2];
+    shared += blockDim.x;
+    v[3] = lamda01 * r01.x * r01.y + lamda02 * r02.x * r02.y;
+    *shared = factor * v[3];
+    shared += blockDim.x;
+    v[4] = lamda01 * r01.x * r01.z + lamda02 * r02.x * r02.z;
+    *shared = factor * v[4];
+    shared += blockDim.x;
+    v[5] = lamda01 * r01.y * r01.z + lamda02 * r02.y * r02.z;
+    *shared = factor * v[5];
+    shared += blockDim.x;
+
+    v_tally(vflag, vflag_atom, nlist, list, 3.0, v);
+  }
+}
+
+__device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m)
+{
+  int nlist, list[4];
+  ENERGY_FLOAT v[6];
+  X_FLOAT invmass0, invmass1, invmass2, invmass3;
+
+  // local atom IDs and constraint distances
+
+  int i0 = _map_array[_shake_atom[m]];
+  int i1 = _map_array[_shake_atom[m + _nmax]];
+  int i2 =
_map_array[_shake_atom[m + 2 * _nmax]]; + int i3 = _map_array[_shake_atom[m + 3 * _nmax]]; + X_FLOAT bond1 = _bond_distance[_shake_type[m]]; + X_FLOAT bond2 = _bond_distance[_shake_type[m + _nmax]]; + X_FLOAT bond3 = _bond_distance[_shake_type[m + 2 * _nmax]]; + + // r01 = distance vec between atoms, with PBC + + X_FLOAT3 r01, r02, r03; + + X_FLOAT4 x_i0, x_i1, x_i2, x_i3; + x_i0 = fetchXType(i0); + x_i1 = fetchXType(i1); + x_i2 = fetchXType(i2); + x_i3 = fetchXType(i3); + + r01.x = x_i0.x - x_i1.x; + r01.y = x_i0.y - x_i1.y; + r01.z = x_i0.z - x_i1.z; + minimum_image(r01); + + r02.x = x_i0.x - x_i2.x; + r02.y = x_i0.y - x_i2.y; + r02.z = x_i0.z - x_i2.z; + minimum_image(r02); + + r03.x = x_i0.x - x_i3.x; + r03.y = x_i0.y - x_i3.y; + r03.z = x_i0.z - x_i3.z; + minimum_image(r03); + + // s01 = distance vec after unconstrained update, with PBC + + X_FLOAT3 s01, s02, s03; + X_FLOAT3 xs_i0 = _xshake[i0]; + X_FLOAT3 xs_i1 = _xshake[i1]; + X_FLOAT3 xs_i2 = _xshake[i2]; + X_FLOAT3 xs_i3 = _xshake[i3]; + + s01.x = xs_i0.x - xs_i1.x; + s01.y = xs_i0.y - xs_i1.y; + s01.z = xs_i0.z - xs_i1.z; + minimum_image(s01); + + s02.x = xs_i0.x - xs_i2.x; + s02.y = xs_i0.y - xs_i2.y; + s02.z = xs_i0.z - xs_i2.z; + minimum_image(s02); + + s03.x = xs_i0.x - xs_i3.x; + s03.y = xs_i0.y - xs_i3.y; + s03.z = xs_i0.z - xs_i3.z; + minimum_image(s03); + + // scalar distances between atoms + + X_FLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z; + X_FLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z; + X_FLOAT r03sq = r03.x * r03.x + r03.y * r03.y + r03.z * r03.z; + X_FLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z; + X_FLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z; + X_FLOAT s03sq = s03.x * s03.x + s03.y * s03.y + s03.z * s03.z; + + // a,b,c = coeffs in quadratic equation for lamda + + if(_rmass_flag) { + invmass0 = X_F(1.0) / _rmass[i0]; + invmass1 = X_F(1.0) / _rmass[i1]; + invmass2 = X_F(1.0) / _rmass[i2]; + invmass3 = X_F(1.0) / _rmass[i3]; + } else { + invmass0 = X_F(1.0) / _mass[static_cast <int>(x_i0.w)]; + invmass1 = X_F(1.0) / _mass[static_cast <int>(x_i1.w)]; + invmass2 = X_F(1.0) / _mass[static_cast <int>(x_i2.w)]; + invmass3 = X_F(1.0) / _mass[static_cast <int>(x_i3.w)]; + } + + X_FLOAT a11 = X_F(2.0) * (invmass0 + invmass1) * + (s01.x * r01.x + s01.y * r01.y + s01.z * r01.z); + X_FLOAT a12 = X_F(2.0) * invmass0 * + (s01.x * r02.x + s01.y * r02.y + s01.z * r02.z); + X_FLOAT a13 = X_F(2.0) * invmass0 * + (s01.x * r03.x + s01.y * r03.y + s01.z * r03.z); + X_FLOAT a21 = X_F(2.0) * invmass0 * + (s02.x * r01.x + s02.y * r01.y + s02.z * r01.z); + X_FLOAT a22 = X_F(2.0) * (invmass0 + invmass2) * + (s02.x * r02.x + s02.y * r02.y + s02.z * r02.z); + X_FLOAT a23 = X_F(2.0) * (invmass0) * + (s02.x * r03.x + s02.y * r03.y + s02.z * r03.z); + X_FLOAT a31 = X_F(2.0) * (invmass0) * + (s03.x * r01.x + s03.y * r01.y + s03.z * r01.z); + X_FLOAT a32 = X_F(2.0) * (invmass0) * + (s03.x * r02.x + s03.y * r02.y + s03.z * r02.z); + X_FLOAT a33 = X_F(2.0) * (invmass0 + invmass3) * + (s03.x * r03.x + s03.y * r03.y + s03.z * r03.z); + + // error check + + X_FLOAT determ = a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 - + a11 * a23 * a32 - a12 * a21 * a33 - a13 * a22 * a31; + + if(determ == X_F(0.0)) _flag[0]++; + + X_FLOAT determinv = X_F(1.0) / determ; + + X_FLOAT a11inv = determinv * (a22 * a33 - a23 * a32); + X_FLOAT a12inv = -determinv * (a12 * a33 - a13 * a32); + X_FLOAT a13inv = determinv * (a12 * a23 - a13 * a22); + X_FLOAT a21inv = -determinv * (a21 * a33 - a23 * a31); + 
X_FLOAT a22inv = determinv * (a11 * a33 - a13 * a31);
+  X_FLOAT a23inv = -determinv * (a11 * a23 - a13 * a21);
+  X_FLOAT a31inv = determinv * (a21 * a32 - a22 * a31);
+  X_FLOAT a32inv = -determinv * (a11 * a32 - a12 * a31);
+  X_FLOAT a33inv = determinv * (a11 * a22 - a12 * a21);
+
+  // quadratic correction coeffs
+
+  X_FLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z);
+  X_FLOAT r0103 = (r01.x * r03.x + r01.y * r03.y + r01.z * r03.z);
+  X_FLOAT r0203 = (r02.x * r03.x + r02.y * r03.y + r02.z * r03.z);
+
+  X_FLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
+  X_FLOAT quad1_0202 = invmass0 * invmass0 * r02sq;
+  X_FLOAT quad1_0303 = invmass0 * invmass0 * r03sq;
+  X_FLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102;
+  X_FLOAT quad1_0103 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0103;
+  X_FLOAT quad1_0203 = X_F(2.0) * invmass0 * invmass0 * r0203;
+
+  X_FLOAT quad2_0101 = invmass0 * invmass0 * r01sq;
+  X_FLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq;
+  X_FLOAT quad2_0303 = invmass0 * invmass0 * r03sq;
+  X_FLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102;
+  X_FLOAT quad2_0103 = X_F(2.0) * invmass0 * invmass0 * r0103;
+  X_FLOAT quad2_0203 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0203;
+
+  X_FLOAT quad3_0101 = invmass0 * invmass0 * r01sq;
+  X_FLOAT quad3_0202 = invmass0 * invmass0 * r02sq;
+  X_FLOAT quad3_0303 = (invmass0 + invmass3) * (invmass0 + invmass3) * r03sq;
+  X_FLOAT quad3_0102 = X_F(2.0) * invmass0 * invmass0 * r0102;
+  X_FLOAT quad3_0103 = X_F(2.0) * (invmass0 + invmass3) * invmass0 * r0103;
+  X_FLOAT quad3_0203 = X_F(2.0) * (invmass0 + invmass3) * invmass0 * r0203;
+  // iterate until converged
+
+  X_FLOAT lamda01 = X_F(0.0);
+  X_FLOAT lamda02 = X_F(0.0);
+  X_FLOAT lamda03 = X_F(0.0);
+  int niter = 0;
+  int done = 0;
+
+  X_FLOAT quad1, quad2, quad3, b1, b2, b3, lamda01_new, lamda02_new, lamda03_new;
+
+  // note: __any() keeps the whole warp looping until every lane has converged
+  // or _max_iter is reached
+  while(__any(!done) && niter < _max_iter) {
+    quad1 = quad1_0101 * lamda01 * lamda01 +
+            quad1_0202 * lamda02 * lamda02 +
+            quad1_0303 * lamda03 * lamda03 +
+            quad1_0102 * lamda01 * lamda02 +
+            quad1_0103 * lamda01 * lamda03 +
+            quad1_0203 * lamda02 * lamda03;
+
+    quad2 = quad2_0101 * lamda01 * lamda01 +
+            quad2_0202 * lamda02 * lamda02 +
+            quad2_0303 * lamda03 * lamda03 +
+            quad2_0102 * lamda01 * lamda02 +
+            quad2_0103 * lamda01 * lamda03 +
+            quad2_0203 * lamda02 * lamda03;
+
+    quad3 = quad3_0101 * lamda01 * lamda01 +
+            quad3_0202 * lamda02 * lamda02 +
+            quad3_0303 * lamda03 * lamda03 +
+            quad3_0102 * lamda01 * lamda02 +
+            quad3_0103 * lamda01 * lamda03 +
+            quad3_0203 * lamda02 * lamda03;
+
+    b1 = bond1 * bond1 - s01sq - quad1;
+    b2 = bond2 * bond2 - s02sq - quad2;
+    b3 = bond3 * bond3 - s03sq - quad3;
+
+    lamda01_new = a11inv * b1 + a12inv * b2 + a13inv * b3;
+    lamda02_new = a21inv * b1 + a22inv * b2 + a23inv * b3;
+    lamda03_new = a31inv * b1 + a32inv * b2 + a33inv * b3;
+
+    done++;
+    done = (fabs(lamda01_new - lamda01) > _tolerance) ? 0 : done;
+    done = (fabs(lamda02_new - lamda02) > _tolerance) ? 0 : done;
+    done = (fabs(lamda03_new - lamda03) > _tolerance) ? 0 : done;
+
+    lamda01 = done < 2 ? lamda01_new : lamda01;
+    lamda02 = done < 2 ? lamda02_new : lamda02;
+    lamda03 = done < 2 ? lamda03_new : lamda03;
+    niter++;
+  }
+
+  // update forces if atom is owned by this processor
+
+  lamda01 *= X_F(1.0) / _dtfsq;
+  lamda02 *= X_F(1.0) / _dtfsq;
+  lamda03 *= X_F(1.0) / _dtfsq;
+
+
+  // attention: is the shake cluster <-> atom mapping unique?
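+  // note: the central atom i0 collects the constraint force of all three bonds
+  // (+lamda0k * r0k for k = 1,2,3), while each partner atom ik receives only
+  // the opposite reaction -lamda0k * r0k of its own bond, so the net
+  // constraint force on the cluster is zero.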
+ nlist = 0; + + if(i0 < _nlocal) { + _f[i0] += lamda01 * r01.x + lamda02 * r02.x + lamda03 * r03.x; + _f[i0 + _nmax] += lamda01 * r01.y + lamda02 * r02.y + lamda03 * r03.y; + _f[i0 + 2 * _nmax] += lamda01 * r01.z + lamda02 * r02.z + lamda03 * r03.z; + list[nlist++] = i0; + } + + if(i1 < _nlocal) { + _f[i1] -= lamda01 * r01.x; + _f[i1 + _nmax] -= lamda01 * r01.y; + _f[i1 + 2 * _nmax] -= lamda01 * r01.z; + list[nlist++] = i1; + } + + if(i2 < _nlocal) { + _f[i2] -= lamda02 * r02.x; + _f[i2 + _nmax] -= lamda02 * r02.y; + _f[i2 + 2 * _nmax] -= lamda02 * r02.z; + list[nlist++] = i2; + } + + if(i3 < _nlocal) { + _f[i3] -= lamda03 * r03.x; + _f[i3 + _nmax] -= lamda03 * r03.y; + _f[i3 + 2 * _nmax] -= lamda03 * r03.z; + list[nlist++] = i3; + } + + if(vflag || vflag_atom) { + ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; + X_FLOAT factor = X_F(2.0) / X_F(4.0) * nlist; + v[0] = lamda01 * r01.x * r01.x + lamda02 * r02.x * r02.x + lamda03 * r03.x * r03.x; + *shared = factor * v[0]; + shared += blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5 + v[1] = lamda01 * r01.y * r01.y + lamda02 * r02.y * r02.y + lamda03 * r03.y * r03.y; + *shared = factor * v[1]; + shared += blockDim.x; + v[2] = lamda01 * r01.z * r01.z + lamda02 * r02.z * r02.z + lamda03 * r03.z * r03.z; + *shared = factor * v[2]; + shared += blockDim.x; + v[3] = lamda01 * r01.x * r01.y + lamda02 * r02.x * r02.y + lamda03 * r03.x * r03.y; + *shared = factor * v[3]; + shared += blockDim.x; + v[4] = lamda01 * r01.x * r01.z + lamda02 * r02.x * r02.z + lamda03 * r03.x * r03.z; + *shared = factor * v[4]; + shared += blockDim.x; + v[5] = lamda01 * r01.y * r01.z + lamda02 * r02.y * r02.z + lamda03 * r03.y * r03.z; + *shared = factor * v[5]; + shared += blockDim.x; + + v_tally(vflag, vflag_atom, nlist, list, 4.0, v); + } +} + +__device__ void FixShakeCuda_Shake3Angle(int &vflag, int &vflag_atom, int &m) +{ + int nlist, list[3]; + ENERGY_FLOAT v[6]; + X_FLOAT invmass0, invmass1, invmass2; + + // local atom IDs and constraint distances + + int i0 = _map_array[_shake_atom[m]]; + int i1 = _map_array[_shake_atom[m + _nmax]]; + int i2 = _map_array[_shake_atom[m + 2 * _nmax]]; + X_FLOAT bond1 = _bond_distance[_shake_type[m]]; + X_FLOAT bond2 = _bond_distance[_shake_type[m + _nmax]]; + X_FLOAT bond12 = _angle_distance[_shake_type[m + 2 * _nmax]]; + + // r01 = distance vec between atoms, with PBC + + X_FLOAT3 r01, r02, r12; + + X_FLOAT4 x_i0, x_i1, x_i2; + x_i0 = fetchXType(i0); + x_i1 = fetchXType(i1); + x_i2 = fetchXType(i2); + + r01.x = x_i0.x - x_i1.x; + r01.y = x_i0.y - x_i1.y; + r01.z = x_i0.z - x_i1.z; + minimum_image(r01); + + r02.x = x_i0.x - x_i2.x; + r02.y = x_i0.y - x_i2.y; + r02.z = x_i0.z - x_i2.z; + minimum_image(r02); + + r12.x = x_i1.x - x_i2.x; + r12.y = x_i1.y - x_i2.y; + r12.z = x_i1.z - x_i2.z; + minimum_image(r12); + + // s01 = distance vec after unconstrained update, with PBC + + X_FLOAT3 s01, s02, s12; + X_FLOAT3 xs_i0 = _xshake[i0]; + X_FLOAT3 xs_i1 = _xshake[i1]; + X_FLOAT3 xs_i2 = _xshake[i2]; + + s01.x = xs_i0.x - xs_i1.x; + s01.y = xs_i0.y - xs_i1.y; + s01.z = xs_i0.z - xs_i1.z; + minimum_image(s01); + + s02.x = xs_i0.x - xs_i2.x; + s02.y = xs_i0.y - xs_i2.y; + s02.z = xs_i0.z - xs_i2.z; + minimum_image(s02); + + s12.x = xs_i1.x - xs_i2.x; + s12.y = xs_i1.y - xs_i2.y; + s12.z = xs_i1.z - xs_i2.z; + minimum_image(s12); + + // scalar distances between atoms + + X_FLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z; + X_FLOAT r02sq = r02.x * r02.x + r02.y * r02.y 
+ r02.z * r02.z; + X_FLOAT r12sq = r12.x * r12.x + r12.y * r12.y + r12.z * r12.z; + X_FLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z; + X_FLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z; + X_FLOAT s12sq = s12.x * s12.x + s12.y * s12.y + s12.z * s12.z; + + // a,b,c = coeffs in quadratic equation for lamda + + if(_rmass_flag) { + invmass0 = X_F(1.0) / _rmass[i0]; + invmass1 = X_F(1.0) / _rmass[i1]; + invmass2 = X_F(1.0) / _rmass[i2]; + } else { + invmass0 = X_F(1.0) / _mass[static_cast <int>(x_i0.w)]; + invmass1 = X_F(1.0) / _mass[static_cast <int>(x_i1.w)]; + invmass2 = X_F(1.0) / _mass[static_cast <int>(x_i2.w)]; + } + + X_FLOAT a11 = X_F(2.0) * (invmass0 + invmass1) * + (s01.x * r01.x + s01.y * r01.y + s01.z * r01.z); + X_FLOAT a12 = X_F(2.0) * invmass0 * + (s01.x * r02.x + s01.y * r02.y + s01.z * r02.z); + X_FLOAT a13 = - X_F(2.0) * invmass1 * + (s01.x * r12.x + s01.y * r12.y + s01.z * r12.z); + X_FLOAT a21 = X_F(2.0) * invmass0 * + (s02.x * r01.x + s02.y * r01.y + s02.z * r01.z); + X_FLOAT a22 = X_F(2.0) * (invmass0 + invmass2) * + (s02.x * r02.x + s02.y * r02.y + s02.z * r02.z); + X_FLOAT a23 = X_F(2.0) * invmass2 * + (s02.x * r12.x + s02.y * r12.y + s02.z * r12.z); + X_FLOAT a31 = - X_F(2.0) * invmass1 * + (s12.x * r01.x + s12.y * r01.y + s12.z * r01.z); + X_FLOAT a32 = X_F(2.0) * invmass2 * + (s12.x * r02.x + s12.y * r02.y + s12.z * r02.z); + X_FLOAT a33 = X_F(2.0) * (invmass1 + invmass2) * + (s12.x * r12.x + s12.y * r12.y + s12.z * r12.z); + + // inverse of matrix + + X_FLOAT determ = a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 - + a11 * a23 * a32 - a12 * a21 * a33 - a13 * a22 * a31; + + if(determ == X_F(0.0)) _flag[0]++; + + X_FLOAT determinv = X_F(1.0) / determ; + + X_FLOAT a11inv = determinv * (a22 * a33 - a23 * a32); + X_FLOAT a12inv = -determinv * (a12 * a33 - a13 * a32); + X_FLOAT a13inv = determinv * (a12 * a23 - a13 * a22); + X_FLOAT a21inv = -determinv * (a21 * a33 - a23 * a31); + X_FLOAT a22inv = determinv * (a11 * a33 - a13 * a31); + X_FLOAT a23inv = -determinv * (a11 * a23 - a13 * a21); + X_FLOAT a31inv = determinv * (a21 * a32 - a22 * a31); + X_FLOAT a32inv = -determinv * (a11 * a32 - a12 * a31); + X_FLOAT a33inv = determinv * (a11 * a22 - a12 * a21); + + // quadratic correction coeffs + + X_FLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z); + X_FLOAT r0112 = (r01.x * r12.x + r01.y * r12.y + r01.z * r12.z); + X_FLOAT r0212 = (r02.x * r12.x + r02.y * r12.y + r02.z * r12.z); + + X_FLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq; + X_FLOAT quad1_0202 = invmass0 * invmass0 * r02sq; + X_FLOAT quad1_1212 = invmass1 * invmass1 * r12sq; + X_FLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102; + X_FLOAT quad1_0112 = - X_F(2.0) * (invmass0 + invmass1) * invmass1 * r0112; + X_FLOAT quad1_0212 = - X_F(2.0) * invmass0 * invmass1 * r0212; + + X_FLOAT quad2_0101 = invmass0 * invmass0 * r01sq; + X_FLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq; + X_FLOAT quad2_1212 = invmass2 * invmass2 * r12sq; + X_FLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102; + X_FLOAT quad2_0112 = X_F(2.0) * invmass0 * invmass2 * r0112; + X_FLOAT quad2_0212 = X_F(2.0) * (invmass0 + invmass2) * invmass2 * r0212; + + X_FLOAT quad3_0101 = invmass1 * invmass1 * r01sq; + X_FLOAT quad3_0202 = invmass2 * invmass2 * r02sq; + X_FLOAT quad3_1212 = (invmass1 + invmass2) * (invmass1 + invmass2) * r12sq; + X_FLOAT quad3_0102 = - X_F(2.0) * invmass1 * invmass2 * r0102; + X_FLOAT quad3_0112 = 
- X_F(2.0) * (invmass1 + invmass2) * invmass1 * r0112;
+  X_FLOAT quad3_0212 = X_F(2.0) * (invmass1 + invmass2) * invmass2 * r0212;
+  // iterate until converged
+
+  X_FLOAT lamda01 = X_F(0.0);
+  X_FLOAT lamda02 = X_F(0.0);
+  X_FLOAT lamda12 = X_F(0.0);
+  int niter = 0;
+  int done = 0;
+
+  X_FLOAT quad1, quad2, quad3, b1, b2, b3, lamda01_new, lamda02_new, lamda12_new;
+
+  // note: __any() keeps the whole warp looping until every lane has converged
+  // or _max_iter is reached
+  while(__any(!done) && niter < _max_iter) {
+    quad1 = quad1_0101 * lamda01 * lamda01 +
+            quad1_0202 * lamda02 * lamda02 +
+            quad1_1212 * lamda12 * lamda12 +
+            quad1_0102 * lamda01 * lamda02 +
+            quad1_0112 * lamda01 * lamda12 +
+            quad1_0212 * lamda02 * lamda12;
+
+    quad2 = quad2_0101 * lamda01 * lamda01 +
+            quad2_0202 * lamda02 * lamda02 +
+            quad2_1212 * lamda12 * lamda12 +
+            quad2_0102 * lamda01 * lamda02 +
+            quad2_0112 * lamda01 * lamda12 +
+            quad2_0212 * lamda02 * lamda12;
+
+    quad3 = quad3_0101 * lamda01 * lamda01 +
+            quad3_0202 * lamda02 * lamda02 +
+            quad3_1212 * lamda12 * lamda12 +
+            quad3_0102 * lamda01 * lamda02 +
+            quad3_0112 * lamda01 * lamda12 +
+            quad3_0212 * lamda02 * lamda12;
+
+    b1 = bond1 * bond1 - s01sq - quad1;
+    b2 = bond2 * bond2 - s02sq - quad2;
+    b3 = bond12 * bond12 - s12sq - quad3;
+
+    lamda01_new = a11inv * b1 + a12inv * b2 + a13inv * b3;
+    lamda02_new = a21inv * b1 + a22inv * b2 + a23inv * b3;
+    lamda12_new = a31inv * b1 + a32inv * b2 + a33inv * b3;
+
+    done++;
+    done = (fabs(lamda01_new - lamda01) > _tolerance) ? 0 : done;
+    done = (fabs(lamda02_new - lamda02) > _tolerance) ? 0 : done;
+    done = (fabs(lamda12_new - lamda12) > _tolerance) ? 0 : done;
+
+    lamda01 = done < 2 ? lamda01_new : lamda01;
+    lamda02 = done < 2 ? lamda02_new : lamda02;
+    lamda12 = done < 2 ? lamda12_new : lamda12;
+    niter++;
+  }
+
+  // update forces if atom is owned by this processor
+
+  lamda01 *= X_F(1.0) / _dtfsq;
+  lamda02 *= X_F(1.0) / _dtfsq;
+  lamda12 *= X_F(1.0) / _dtfsq;
+
+
+  // attention: is the shake cluster <-> atom mapping unique?
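+  // note: unlike FixShakeCuda_Shake4, the third constraint here acts along r12
+  // between the two partner atoms (the fixed 1-2 distance of the angle), so i1
+  // receives +lamda12 * r12 and i2 receives -lamda12 * r12 below, while i0
+  // feels only the two bond constraints.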
+ nlist = 0; + + if(i0 < _nlocal) { + _f[i0] += lamda01 * r01.x + lamda02 * r02.x; + _f[i0 + _nmax] += lamda01 * r01.y + lamda02 * r02.y; + _f[i0 + 2 * _nmax] += lamda01 * r01.z + lamda02 * r02.z; + list[nlist++] = i0; + } + + if(i1 < _nlocal) { + _f[i1] -= lamda01 * r01.x - lamda12 * r12.x; + _f[i1 + _nmax] -= lamda01 * r01.y - lamda12 * r12.y; + _f[i1 + 2 * _nmax] -= lamda01 * r01.z - lamda12 * r12.z; + list[nlist++] = i1; + } + + if(i2 < _nlocal) { + _f[i2] -= lamda02 * r02.x + lamda12 * r12.x; + _f[i2 + _nmax] -= lamda02 * r02.y + lamda12 * r12.y; + _f[i2 + 2 * _nmax] -= lamda02 * r02.z + lamda12 * r12.z; + list[nlist++] = i2; + } + + if(vflag || vflag_atom) { + ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; + X_FLOAT factor = X_F(2.0) / X_F(3.0) * nlist; + v[0] = lamda01 * r01.x * r01.x + lamda02 * r02.x * r02.x + lamda12 * r12.x * r12.x; + *shared = factor * v[0]; + shared += blockDim.x; //times 2.0 since the reducing function is the same as in force calculations, which adds a factor 0.5 + v[1] = lamda01 * r01.y * r01.y + lamda02 * r02.y * r02.y + lamda12 * r12.y * r12.y; + *shared = factor * v[1]; + shared += blockDim.x; + v[2] = lamda01 * r01.z * r01.z + lamda02 * r02.z * r02.z + lamda12 * r12.z * r12.z; + *shared = factor * v[2]; + shared += blockDim.x; + v[3] = lamda01 * r01.x * r01.y + lamda02 * r02.x * r02.y + lamda12 * r12.x * r12.y; + *shared = factor * v[3]; + shared += blockDim.x; + v[4] = lamda01 * r01.x * r01.z + lamda02 * r02.x * r02.z + lamda12 * r12.x * r12.z; + *shared = factor * v[4]; + shared += blockDim.x; + v[5] = lamda01 * r01.y * r01.z + lamda02 * r02.y * r02.z + lamda12 * r12.y * r12.z; + *shared = factor * v[5]; + shared += blockDim.x; + + v_tally(vflag, vflag_atom, nlist, list, 3.0, v); + } +} + +__global__ void FixShakeCuda_Shake_Kernel(int vflag, int vflag_atom, int* list, int nlist) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < nlist) { + + int m = list[i]; + int sflag = _shake_flag[m]; + + if(sflag == 2) FixShakeCuda_Shake2(vflag, vflag_atom, m); + else if(sflag == 3) FixShakeCuda_Shake3(vflag, vflag_atom, m); + else if(sflag == 4) FixShakeCuda_Shake4(vflag, vflag_atom, m); + else FixShakeCuda_Shake3Angle(vflag, vflag_atom, m); + } else { + ENERGY_FLOAT* shared = &sharedmem[threadIdx.x]; + *shared = ENERGY_F(0.0); + shared += blockDim.x; + *shared = ENERGY_F(0.0); + shared += blockDim.x; + *shared = ENERGY_F(0.0); + shared += blockDim.x; + *shared = ENERGY_F(0.0); + shared += blockDim.x; + *shared = ENERGY_F(0.0); + shared += blockDim.x; + *shared = ENERGY_F(0.0); + } + + if(vflag) { + __syncthreads(); + int eflag = 0; + PairVirialCompute_A_Kernel(eflag, vflag); + } + +} + +__global__ void FixShakeCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + int* list = sendlist + iswap * maxlistlength; + + if(i < n) { + int j = list[i]; + + if(j > _nmax) _flag[0] = 1; + + X_FLOAT3 xs = _xshake[j]; + ((X_FLOAT*) _buffer)[i] = xs.x + dx; + ((X_FLOAT*) _buffer)[i + 1 * n] = xs.y + dy; + ((X_FLOAT*) _buffer)[i + 2 * n] = xs.z + dz; + } + +} + +__global__ void FixShakeCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + int* list = sendlist + iswap * maxlistlength; + + if(i < n) { + int j = list[i]; + + if(j > _nmax) _flag[0] = 1; + + 
X_FLOAT3 xs = _xshake[j];
+    xs.x += dx;
+    xs.y += dy;
+    xs.z += dz;
+    // note: a swap that stays on this process skips the staging through _buffer
+    // used by FixShakeCuda_PackComm_Kernel/UnpackComm_Kernel and writes the
+    // shifted values directly into the ghost slots starting at 'first'
+    _xshake[i + first] = xs;
+  }
+
+}
+
+__global__ void FixShakeCuda_UnpackComm_Kernel(int n, int first)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < n) {
+    X_FLOAT3 xs;
+    xs.x = ((X_FLOAT*) _buffer)[i];
+    xs.y = ((X_FLOAT*) _buffer)[i + 1 * n];
+    xs.z = ((X_FLOAT*) _buffer)[i + 2 * n];
+    _xshake[i + first] = xs;
+  }
+}
+
diff --git a/lib/cuda/fix_temp_berendsen_cuda.cu b/lib/cuda/fix_temp_berendsen_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b99608dda505ce530261b8be625afd58558707ed
--- /dev/null
+++ b/lib/cuda/fix_temp_berendsen_cuda.cu
@@ -0,0 +1,66 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX fix_temp_berendsen_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+
+#include "fix_temp_berendsen_cuda_cu.h"
+#include "fix_temp_berendsen_cuda_kernel.cu"
+
+
+void Cuda_FixTempBerendsenCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(mask)   , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)   , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(v)      , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*));
+}
+
+void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_FixTempBerendsenCuda_UpdateNmax(sdata);
+}
+
+
+void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor)
+{
+  V_FLOAT factor = afactor;
+
+  if(sdata->atom.update_nmax)
+    Cuda_FixTempBerendsenCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+
+  int3 layout = getgrid(sdata->atom.nlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Cuda_FixTempBerendsenCuda_EndOfStep_Kernel <<< grid, threads, 0>>> (groupbit, factor);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_FixTempBerendsenCuda_EndOfStep: kernel execution failed");
+}
diff --git a/lib/cuda/fix_temp_berendsen_cuda_cu.h b/lib/cuda/fix_temp_berendsen_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..1cffbc730d29b074df7d212280818fc68da79920
--- /dev/null
+++ b/lib/cuda/fix_temp_berendsen_cuda_cu.h
@@ -0,0 +1,27 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the
top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor); diff --git a/lib/cuda/fix_temp_berendsen_cuda_kernel.cu b/lib/cuda/fix_temp_berendsen_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..2d3b04ace5acb4a554ea35afb2eeaf5c0e59a84f --- /dev/null +++ b/lib/cuda/fix_temp_berendsen_cuda_kernel.cu @@ -0,0 +1,37 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + + + +__global__ void Cuda_FixTempBerendsenCuda_EndOfStep_Kernel(int groupbit, V_FLOAT factor) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < _nlocal) + if(_mask[i] & groupbit) { + _v[i] *= factor; + _v[i + _nmax] *= factor; + _v[i + 2 * _nmax] *= factor; + } +} + diff --git a/lib/cuda/fix_temp_rescale_cuda.cu b/lib/cuda/fix_temp_rescale_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..171156519b96d4204ccd9121914cfa655a213358 --- /dev/null +++ b/lib/cuda/fix_temp_rescale_cuda.cu @@ -0,0 +1,64 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX fix_temp_rescale_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+
+#include "fix_temp_rescale_cuda_cu.h"
+#include "fix_temp_rescale_cuda_kernel.cu"
+
+
+void Cuda_FixTempRescaleCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(mask)   , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)   , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(v)      , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*));
+}
+
+void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_FixTempRescaleCuda_UpdateNmax(sdata);
+}
+
+
+void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor)
+{
+  V_FLOAT factor = afactor;
+  //if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep, so it might miss an update step; update unconditionally instead
+  Cuda_FixTempRescaleCuda_UpdateNmax(sdata);
+  //if(sdata->atom.update_nlocal)
+  //cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int) );
+
+  int3 layout = getgrid(sdata->atom.nlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Cuda_FixTempRescaleCuda_EndOfStep_Kernel <<< grid, threads, 0>>> (groupbit, factor);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_FixTempRescaleCuda_EndOfStep: kernel execution failed");
+}
diff --git a/lib/cuda/fix_temp_rescale_cuda_cu.h b/lib/cuda/fix_temp_rescale_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ec9a3161fd64c2ea3350d3773654cd431d8e242
--- /dev/null
+++ b/lib/cuda/fix_temp_rescale_cuda_cu.h
@@ -0,0 +1,27 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata);
+extern "C" void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor);
diff --git a/lib/cuda/fix_temp_rescale_cuda_kernel.cu b/lib/cuda/fix_temp_rescale_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2e34ec592f5f8582a0cb63d84154f76dcf7d0d21
--- /dev/null
+++ b/lib/cuda/fix_temp_rescale_cuda_kernel.cu
@@ -0,0 +1,37 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + + + +__global__ void Cuda_FixTempRescaleCuda_EndOfStep_Kernel(int groupbit, V_FLOAT factor) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < _nlocal) + if(_mask[i] & groupbit) { + _v[i] *= factor; + _v[i + _nmax] *= factor; + _v[i + 2 * _nmax] *= factor; + } +} + diff --git a/lib/cuda/fix_temp_rescale_limit_cuda.cu b/lib/cuda/fix_temp_rescale_limit_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..72028a124ef00797812f9e1196db59ac1fbc21a9 --- /dev/null +++ b/lib/cuda/fix_temp_rescale_limit_cuda.cu @@ -0,0 +1,64 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX fix_temp_rescale_limit_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+
+#include "fix_temp_rescale_limit_cuda_cu.h"
+#include "fix_temp_rescale_limit_cuda_kernel.cu"
+
+
+void Cuda_FixTempRescaleLimitCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(mask)   , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)   , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(v)      , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*));
+}
+
+void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata);
+}
+
+
+void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor, double limit)
+{
+  V_FLOAT factor = afactor;
+  //if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep, so it might miss an update step; update unconditionally instead
+  Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata);
+  //if(sdata->atom.update_nlocal)
+  //cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int) );
+
+  int3 layout = getgrid(sdata->atom.nlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel <<< grid, threads, 0>>> (groupbit, factor, limit);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_FixTempRescaleLimitCuda_EndOfStep: kernel execution failed");
+}
diff --git a/lib/cuda/fix_temp_rescale_limit_cuda_cu.h b/lib/cuda/fix_temp_rescale_limit_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..44efa566beec8a17a1097e9196a0fd05711b8b1a
--- /dev/null
+++ b/lib/cuda/fix_temp_rescale_limit_cuda_cu.h
@@ -0,0 +1,27 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata); +extern "C" void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor, double limit); diff --git a/lib/cuda/fix_temp_rescale_limit_cuda_kernel.cu b/lib/cuda/fix_temp_rescale_limit_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..eda86ccdce850bebe622255efb39b28a6129032b --- /dev/null +++ b/lib/cuda/fix_temp_rescale_limit_cuda_kernel.cu @@ -0,0 +1,44 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + + + +__global__ void Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel(int groupbit, V_FLOAT factor, V_FLOAT limit) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < _nlocal) + if(_mask[i] & groupbit) { + V_FLOAT vx = _v[i]; + V_FLOAT vy = _v[i + _nmax]; + V_FLOAT vz = _v[i + 2 * _nmax]; + vx *= factor; + vy *= factor; + vz *= factor; + + _v[i] = vx > 0 ? min(vx, limit) : max(vx, -limit); + _v[i + _nmax] = vy > 0 ? min(vy, limit) : max(vy, -limit); + _v[i + 2 * _nmax] = vz > 0 ? min(vz, limit) : max(vz, -limit); + } +} + diff --git a/lib/cuda/fix_viscous_cuda.cu b/lib/cuda/fix_viscous_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..03a019bc9fb923fd8d095e081766c8921721731f --- /dev/null +++ b/lib/cuda/fix_viscous_cuda.cu @@ -0,0 +1,67 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX fix_viscous_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+
+#include "fix_viscous_cuda_cu.h"
+#include "fix_viscous_cuda_kernel.cu"
+
+void Cuda_FixViscousCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(mask)   , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)   , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(v)      , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(f)      , & sdata->atom.f    .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(type)   , & sdata->atom.type .dev_data, sizeof(int*));
+}
+
+void Cuda_FixViscousCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_FixViscousCuda_UpdateNmax(sdata);
+}
+
+
+void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit, void* gamma)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_FixViscousCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+
+
+  int3 layout = getgrid(sdata->atom.nlocal, 0);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Cuda_FixViscousCuda_PostForce_Kernel <<< grid, threads, 0>>> (groupbit, (F_FLOAT*) gamma);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_FixViscousCuda_PostForce: kernel execution failed");
+
+}
diff --git a/lib/cuda/fix_viscous_cuda_cu.h b/lib/cuda/fix_viscous_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..3727bc35651af9124cc6acf274d5d2561eb40c67
--- /dev/null
+++ b/lib/cuda/fix_viscous_cuda_cu.h
@@ -0,0 +1,27 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_FixViscousCuda_Init(cuda_shared_data* sdata);
+extern "C" void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit, void* gamma);
diff --git a/lib/cuda/fix_viscous_cuda_kernel.cu b/lib/cuda/fix_viscous_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2c3397715f324618234648325ee60339b86dd674
--- /dev/null
+++ b/lib/cuda/fix_viscous_cuda_kernel.cu
@@ -0,0 +1,35 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +__global__ void Cuda_FixViscousCuda_PostForce_Kernel(int groupbit, F_FLOAT* gamma) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < _nlocal) + if(_mask[i] & groupbit) { + F_FLOAT drag = gamma[_type[i]]; + _f[i] -= drag * _v[i]; + _f[i + 1 * _nmax] -= drag * _v[i + 1 * _nmax]; + _f[i + 2 * _nmax] -= drag * _v[i + 2 * _nmax]; + } +} diff --git a/lib/cuda/neighbor.cu b/lib/cuda/neighbor.cu new file mode 100644 index 0000000000000000000000000000000000000000..ddcf6ddc091cfd64bd0da941321143d00e3054a7 --- /dev/null +++ b/lib/cuda/neighbor.cu @@ -0,0 +1,364 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <time.h>
+#define MY_PREFIX neighbor
+#define IncludeCommonNeigh
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+#include "cuda_wrapper_cu.h"
+
+#define _cutneighsq MY_AP(cutneighsq)
+#define _ex_type MY_AP(ex_type)
+#define _nex_type MY_AP(nex_type)
+#define _ex1_bit MY_AP(ex1_bit)
+#define _ex2_bit MY_AP(ex2_bit)
+#define _nex_group MY_AP(nex_group)
+#define _ex_mol_bit MY_AP(ex_mol_bit)
+#define _nex_mol MY_AP(nex_mol)
+__device__ __constant__ CUDA_FLOAT* _cutneighsq;
+__device__ __constant__ int* _ex_type;
+__device__ __constant__ int _nex_type;
+__device__ __constant__ int* _ex1_bit;
+__device__ __constant__ int* _ex2_bit;
+__device__ __constant__ int _nex_group;
+__device__ __constant__ int* _ex_mol_bit;
+__device__ __constant__ int _nex_mol;
+
+#include "neighbor_cu.h"
+#include "neighbor_kernel.cu"
+
+void Cuda_Neighbor_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
+{
+  CUT_CHECK_ERROR("Cuda_Neighbor_UpdateBuffer: before updateBuffer failed");
+
+  int size = (unsigned)(sizeof(int) * 20 + sneighlist->bin_dim[0] * sneighlist->bin_dim[1] * sneighlist->bin_dim[2] * (sizeof(int) + sneighlist->bin_nmax * 3 * sizeof(CUDA_FLOAT)));
+
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_Neighbor Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
+
+    if(sdata->buffer != NULL) CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+
+    sdata->buffer = CudaWrapper_AllocCudaData(size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
+  CUT_CHECK_ERROR("Cuda_Neighbor_UpdateBuffer: updateBuffer failed");
+}
+
+int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
+{
+  if(sdata->buffer_new)
+    Cuda_Neighbor_UpdateBuffer(sdata, sneighlist);
+
+  // initialize only on first call
+  CUDA_FLOAT rez_bin_size[3] = {
+    (1.0 * sneighlist->bin_dim[0] - 4.0) / (sdata->domain.subhi[0] - sdata->domain.sublo[0]),
+    (1.0 * sneighlist->bin_dim[1] - 4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]),
+    (1.0 * sneighlist->bin_dim[2] - 4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2])
+  };
+
+  short init = 0;
+
+  if(!
init) { + init = 0; + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(unsigned)); + cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(unsigned)); + cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo , sizeof(X_FLOAT) * 3); + } + + + int3 layout = getgrid(sdata->atom.nall); // sneighlist->inum + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + timespec starttime, endtime; + clock_gettime(CLOCK_REALTIME, &starttime); + + cudaMemset((int*)(sdata->buffer), 0, sizeof(int) * (20 + (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2])) + 3 * sizeof(CUDA_FLOAT) * (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2]) * (sneighlist->bin_nmax)); + + Binning_Kernel <<< grid, threads>>> (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], sneighlist->bin_dim[2], rez_bin_size[0], rez_bin_size[1], rez_bin_size[2]); + cudaThreadSynchronize(); + + clock_gettime(CLOCK_REALTIME, &endtime); + sdata->cuda_timings.neigh_bin += + endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; + + + int binning_error; + cudaMemcpy((void*) &binning_error, (void*) sdata->buffer, 1 * sizeof(int), cudaMemcpyDeviceToHost); + + if(binning_error) { + sneighlist->bin_extraspace += 0.05; + } else { + MYDBG(printf("CUDA: binning successful\n");) + } + CUT_CHECK_ERROR("Cuda_Binning: binning Kernel execution failed"); + return binning_error; +} + +int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + //Cuda_Neighbor_UpdateBuffer(sdata,sneighlist); + CUDA_FLOAT globcutoff = -1.0; + + short init = 0; + + if(! init) { + init = 1; + + // !! LAMMPS indexes atom types starting with 1 !! 
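+    // note: allocating (ntypes + 1)^2 entries lets a 1-based LAMMPS type pair
+    // (ti, tj) index the flattened cutoff matrix directly as
+    // acutneighsq[ti * cuda_ntypes + tj]; row and column 0 stay unused.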
+ + unsigned cuda_ntypes = sdata->atom.ntypes + 1; + + unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes; + + CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx); + //printf("Allocate: %i\n",nx); + sneighlist->cu_cutneighsq = (CUDA_FLOAT*) CudaWrapper_AllocCudaData(nx); + + if(sneighlist->cutneighsq) { + int cutoffsdiffer = 0; + double cutoff0 = sneighlist->cutneighsq[1][1]; + + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT)(sneighlist->cutneighsq[i][j]); + + if((sneighlist->cutneighsq[i][j] - cutoff0) * (sneighlist->cutneighsq[i][j] - cutoff0) > 1e-6) cutoffsdiffer++; + } + } + + if(not cutoffsdiffer) globcutoff = (CUDA_FLOAT) cutoff0; + } else { + MYEMUDBG(printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n");) + return 0; + } + + int size = 100; + + if(sdata->buffersize < size) { + MYDBG(printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize = size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);) + } + + CudaWrapper_UploadCudaData(acutneighsq, sneighlist->cu_cutneighsq, nx); + cudaMemcpyToSymbol(MY_AP(cutneighsq) , &sneighlist->cu_cutneighsq , sizeof(CUDA_FLOAT*)); + + cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , & cuda_ntypes , sizeof(unsigned)); + cudaMemcpyToSymbol(MY_AP(special_flag) , sdata->atom.special_flag , 4 * sizeof(int)); + cudaMemcpyToSymbol(MY_AP(molecular) , & sdata->atom.molecular , sizeof(int)); + } + + cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal), & sneighlist->firstneigh.dim[0] , sizeof(unsigned)); + //cudaMemcpyToSymbol(MY_AP(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_AP(ilist) , & sneighlist->ilist .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(inum) , & sneighlist->inum , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(special) , & sdata->atom.special .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(maxspecial) , & sdata->atom.maxspecial , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nspecial) , & sdata->atom.nspecial .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(debugdata) , & sdata->debugdata , sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(overlap_comm) , & sdata->overlap_comm, sizeof(int)); + cudaMemcpyToSymbol(MY_AP(neighbors) , & sneighlist->neighbors.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(ex_type) , & sneighlist->ex_type.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(ex1_bit) , & sneighlist->ex1_bit.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(ex2_bit) , & sneighlist->ex2_bit.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(ex_mol_bit) , & sneighlist->ex_mol_bit.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(nex_type) , & sneighlist->nex_type, sizeof(int)); + 
cudaMemcpyToSymbol(MY_AP(nex_group) , & sneighlist->nex_group, sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nex_mol) , & sneighlist->nex_mol, sizeof(int)); + + if(sdata->overlap_comm) { + cudaMemcpyToSymbol(MY_AP(numneigh_border) , & sneighlist->numneigh_border .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(numneigh_inner) , & sneighlist->numneigh_inner .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(neighbors_inner) , & sneighlist->neighbors_inner .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(ilist_border) , & sneighlist->ilist_border .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(inum_border) , & sneighlist->inum_border .dev_data, sizeof(int*)); + } + + //dim3 threads(sneighlist->bin_nmax,1,1); + dim3 threads(MIN(128, sneighlist->bin_nmax), 1, 1); + dim3 grid(sneighlist->bin_dim[0]*sneighlist->bin_dim[1], sneighlist->bin_dim[2], 1); + + //printf("Configuration: %i %i %i %i %i\n",grid.x,grid.y,threads.x,(sizeof(int)+3*sizeof(X_FLOAT))*threads.x,sneighlist->bin_nmax); + int buffer[20]; + buffer[0] = 1; + buffer[1] = 0; + CudaWrapper_UploadCudaData(buffer, sdata->buffer, 2 * sizeof(int)); + CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel error"); + //cudaMemset(sdata->debugdata,0,100*sizeof(int)); + unsigned int shared_size = (sizeof(int) + 3 * sizeof(CUDA_FLOAT)) * threads.x; + MYDBG(printf("Configuration: %i %i %i %u %i\n", grid.x, grid.y, threads.x, shared_size, sneighlist->bin_nmax);) + //shared_size=2056; + timespec starttime, endtime; + clock_gettime(CLOCK_REALTIME, &starttime); + //for(int i=0;i<100;i++) + { + if(sdata->overlap_comm) + NeighborBuildFullBin_OverlapComm_Kernel <<< grid, threads, shared_size>>> + (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom); + else { + int exclude = sneighlist->nex_mol | sneighlist->nex_group | sneighlist->nex_type; + + if(exclude) + NeighborBuildFullBin_Kernel<1> <<< grid, threads, shared_size>>> + (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom, sdata->pair.neighall); + else + NeighborBuildFullBin_Kernel<0> <<< grid, threads, shared_size>>> + (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom, sdata->pair.neighall); + } + //NeighborBuildFullBin_Kernel_Restrict<<<grid,threads,(2*sizeof(int)+3*sizeof(X_FLOAT))*threads.x+sizeof(int)>>> + // (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff); + + cudaThreadSynchronize(); + CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed"); + clock_gettime(CLOCK_REALTIME, &endtime); + sdata->cuda_timings.neigh_build += + endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; + //dim3 threads,grid; + CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int)); + + if(buffer[0] >= 0 && true && sdata->atom.molecular) { + //printf("Find Special: %i %i\n",sneighlist->inum,sdata->atom.nall); + clock_gettime(CLOCK_REALTIME, &starttime); + int3 layout = getgrid(sdata->atom.nlocal, 0, 512); + threads.x = layout.z; + threads.y = 1; + threads.z = 1; + grid.x = layout.x; + grid.y = layout.y; + grid.z = 1; + FindSpecial <<< grid, threads>>>(sdata->pair.use_block_per_atom); + cudaThreadSynchronize(); + 
CUT_CHECK_ERROR("Cuda_NeighborBuild: FindSpecial kernel execution failed"); + clock_gettime(CLOCK_REALTIME, &endtime); + sdata->cuda_timings.neigh_special += + endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; + } + } + //printf("Neightime: %lf\n",sdata->cuda_timings.test1); + CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed"); + + //CudaWrapper_DownloadCudaData(buffer, sneighlist->numneigh_border .dev_data, sizeof(int)); + + MYDBG(printf("Cuda_NeighborBuildFullBin build neighbor list ... end\n");) + return buffer[0]; +} + +int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + MYDBG(printf("Cuda_NeighborBuildFullNsq build neighbor list ... start\n");) + // initialize only on first call + /*static*/ short init = 0; + + if(! init) { + init = 1; + + // !! LAMMPS indexes atom types starting with 1 !! + + unsigned cuda_ntypes = sdata->atom.ntypes + 1; + + if(cuda_ntypes * cuda_ntypes > CUDA_MAX_TYPES2) + printf("# CUDA: Cuda_PairLJCutCuda_Init: you need %u types. this is more than %u " + "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 " + "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2); + + unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes; + CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx); + + if(sneighlist->cutneighsq) { + for(int i = 1; i <= sdata->atom.ntypes; ++i) { + for(int j = 1; j <= sdata->atom.ntypes; ++j) { + acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT)(sneighlist->cutneighsq[i][j]); + //printf("CUTOFFS: %i %i %i %e\n",i,j,cuda_ntypes,acutneighsq[i * cuda_ntypes + j]); + } + } + } else { + MYEMUDBG(printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n");) + return 0; + } + + int size = 100; + + if(sdata->buffersize < size) { + MYDBG(printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) + CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize); + sdata->buffer = CudaWrapper_AllocCudaData(size); + sdata->buffersize = size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);) + } + + cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer , sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , & cuda_ntypes , sizeof(unsigned)); + cudaMemcpyToSymbol(MY_AP(cutneighsq) , acutneighsq , nx); + cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal), & sneighlist->firstneigh.dim[0] , sizeof(unsigned)); + cudaMemcpyToSymbol(MY_AP(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(ilist) , & sneighlist->ilist .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(inum) , & sneighlist->inum , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors , sizeof(int)); + + free(acutneighsq); + } + + int3 layout = getgrid(sdata->atom.nlocal); // sneighlist->inum + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + + int return_value = 1; + CudaWrapper_UploadCudaData(& return_value, sdata->buffer, 
sizeof(int));
+
+  CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel execution failed");
+  NeighborBuildFullNsq_Kernel <<< grid, threads>>> ();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
+
+  int buffer[20];
+  CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int) * 20);
+  MYDBG(printf("Cuda_NeighborBuildFullNsq build neighbor list ... end\n");)
+  return buffer[0];
+}
diff --git a/lib/cuda/neighbor_cu.h b/lib/cuda/neighbor_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..b3028e5400f35fe972958701596c318eee64e84c
--- /dev/null
+++ b/lib/cuda/neighbor_cu.h
@@ -0,0 +1,32 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifndef NEIGHBOR_CU_H_
+#define NEIGHBOR_CU_H_
+#include "cuda_shared.h"
+
+extern "C" int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
+extern "C" int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
+extern "C" int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
+
+#endif /*NEIGHBOR_CU_H_*/
diff --git a/lib/cuda/neighbor_kernel.cu b/lib/cuda/neighbor_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3892f5ec29e09f1a95e73508de828025ddb6a200
--- /dev/null
+++ b/lib/cuda/neighbor_kernel.cu
@@ -0,0 +1,660 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */ + +#define SBBITS 30 + +__global__ void Binning_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, int bin_dim_z, + CUDA_FLOAT rez_bin_size_x, CUDA_FLOAT rez_bin_size_y, CUDA_FLOAT rez_bin_size_z) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + /*int* bin_count=(int*) _buffer; + bin_count=bin_count+20; + CUDA_FLOAT* binned_x=(CUDA_FLOAT*)(bin_count+bin_dim_x*bin_dim_y*bin_dim_z);*/ + CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer; + binned_x = &binned_x[2]; + int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax]; + + if(i < _nall) { + // copy atom position from global device memory to local register + // in this 3 steps to get as much coalesced access as possible + X_FLOAT* my_x = _x + i; + CUDA_FLOAT x_i = *my_x; + my_x += _nmax; + CUDA_FLOAT y_i = *my_x; + my_x += _nmax; + CUDA_FLOAT z_i = *my_x; + + + // calculate flat bin index + int bx = __float2int_rd(rez_bin_size_x * (x_i - _sublo[0])) + 2; + int by = __float2int_rd(rez_bin_size_y * (y_i - _sublo[1])) + 2; + int bz = __float2int_rd(rez_bin_size_z * (z_i - _sublo[2])) + 2; + + bx -= bx * negativCUDA(1.0f * bx); + bx -= (bx - bin_dim_x + 1) * negativCUDA(1.0f * bin_dim_x - 1.0f - 1.0f * bx); + by -= by * negativCUDA(1.0f * by); + by -= (by - bin_dim_y + 1) * negativCUDA(1.0f * bin_dim_y - 1.0f - 1.0f * by); + bz -= bz * negativCUDA(1.0f * bz); + bz -= (bz - bin_dim_z + 1) * negativCUDA(1.0f * bin_dim_z - 1.0f - 1.0f * bz); + + + const unsigned j = bin_dim_z * (bin_dim_y * bx + by) + bz; + + // add new atom to bin, get bin-array position + const unsigned k = atomicAdd(& bin_count[j], 1); + + if(k < bin_nmax) { + binned_id [bin_nmax * j + k] = i; + binned_x [3 * bin_nmax * j + k] = x_i; + binned_x [3 * bin_nmax * j + k + bin_nmax] = y_i; + binned_x [3 * bin_nmax * j + k + 2 * bin_nmax] = z_i; + } else { + // normally, this should not happen: + int errorn = atomicAdd((int*) _buffer, 1); + MYEMUDBG(printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", i, j);) + } + } +} + + +__device__ inline int exclusion(int &i, int &j, int &itype, int &jtype) +{ + int m; + + if(_nex_type) + if(_ex_type[itype * _cuda_ntypes + jtype]) return 1; + + if(_nex_group) { + for(m = 0; m < _nex_group; m++) { + if(_mask[i] & _ex1_bit[m] && _mask[j] & _ex2_bit[m]) return 1; + + if(_mask[i] & _ex2_bit[m] && _mask[j] & _ex1_bit[m]) return 1; + } + } + + if(_nex_mol) { + if(_molecule[i] == _molecule[j]) + for(m = 0; m < _nex_mol; m++) + if(_mask[i] & _ex_mol_bit[m] && _mask[j] & _ex_mol_bit[m]) return 1; + } + + return 0; +} + +extern __shared__ CUDA_FLOAT shared[]; + +__device__ inline int find_special(int3 &n, int* list, int &tag, int3 flag) +{ + int k = n.z; + + for(int l = 0; l < n.z; l++) k = ((list[l] == tag) ? l : k); + + return k < n.x ? flag.x : (k < n.y ? flag.y : (k < n.z ? flag.z : 0)); +} + +template <const unsigned int exclude> +__global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_FLOAT globcutoff, int block_style, bool neighall) +{ + int natoms = neighall ? 
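+  /* neighall selects the range of atoms that get a neighbor list: when the
+     pair style sets it (passed in from sdata->pair.neighall), lists are
+     built for every atom known to this process (_nall, locals plus
+     ghosts); otherwise only for the _nlocal owned atoms. */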
_nall : _nlocal; + //const bool domol=false; + int bin_dim_z = gridDim.y; + CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer; + binned_x = &binned_x[2]; + int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax]; + int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y; + int bin_x = blockIdx.x / bin_dim_y; + int bin_y = blockIdx.x - bin_x * bin_dim_y; + int bin_z = blockIdx.y; + int bin_c = bin_count[bin]; + + + CUDA_FLOAT cut; + + if(globcutoff > 0) + cut = globcutoff; + + int i = _nall; + CUDA_FLOAT* my_x; + CUDA_FLOAT x_i, y_i, z_i; + + for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) { + + int actIdx = threadIdx.x + actOffset; + CUDA_FLOAT* other_x = shared; + int* other_id = (int*) &other_x[3 * blockDim.x]; + + if(actIdx < bin_c) { + i = binned_id[__mul24(bin, bin_nmax) + actIdx]; + my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + actIdx; + x_i = *my_x; + my_x += bin_nmax; + y_i = *my_x; + my_x += bin_nmax; + z_i = *my_x; + } else + i = 2 * _nall; + + __syncthreads(); + + int jnum = 0; + int itype; + + if(i < natoms) { + jnum = 0; + _ilist[i] = i; + itype = _type[i]; + } + + //__syncthreads(); + + + for(int otherActOffset = 0; otherActOffset < bin_c; otherActOffset += blockDim.x) { + int otherActIdx = threadIdx.x + otherActOffset; + + if(otherActIdx < bin_c) { + if(otherActOffset == actOffset) { + other_id[threadIdx.x] = i; + other_x[threadIdx.x] = x_i; + other_x[threadIdx.x + blockDim.x] = y_i; + other_x[threadIdx.x + 2 * blockDim.x] = z_i; + } else { + other_id[threadIdx.x] = binned_id[__mul24(bin, bin_nmax) + otherActIdx]; + my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + otherActIdx; + other_x[threadIdx.x] = *my_x; + my_x += bin_nmax; + other_x[threadIdx.x + blockDim.x] = *my_x; + my_x += bin_nmax; + other_x[threadIdx.x + __mul24(2, blockDim.x)] = *my_x; + + } + } + + __syncthreads(); + int kk = threadIdx.x; + + for(int k = 0; k < MIN(bin_c - otherActOffset, blockDim.x); ++k) { + if(i < natoms) { + kk++; + kk = kk < MIN(bin_c - otherActOffset, blockDim.x) ? 
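+          /* kk starts at threadIdx.x and wraps to 0 in this ternary, so on
+             each pass of the k-loop every thread reads a different element
+             of the shared tile (other_id/other_x) rather than all threads
+             loading the same word simultaneously. */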
kk : 0; + int j = other_id[kk]; + + if(exclude && exclusion(i, j, itype, _type[j])) continue; + + if(globcutoff < 0) { + int jtype = _type[j]; + cut = _cutneighsq[itype * _cuda_ntypes + jtype]; + } + + CUDA_FLOAT delx = x_i - other_x[kk]; + CUDA_FLOAT dely = y_i - other_x[kk + blockDim.x]; + CUDA_FLOAT delz = z_i - other_x[kk + 2 * blockDim.x]; + CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz; + + + if(rsq <= cut && i != j) { + if(jnum < _maxneighbors) { + if(block_style) + _neighbors[i * _maxneighbors + jnum] = j; + else + _neighbors[i + jnum * natoms] = j; + } + + ++jnum; + } + } + } + + __syncthreads(); + + } + + for(int obin_x = bin_x - 1; obin_x < bin_x + 2; obin_x++) + for(int obin_y = bin_y - 1; obin_y < bin_y + 2; obin_y++) + for(int obin_z = bin_z - 1; obin_z < bin_z + 2; obin_z++) { + if(obin_x < 0 || obin_y < 0 || obin_z < 0) continue; + + if(obin_x >= bin_dim_x || obin_y >= bin_dim_y || obin_z >= bin_dim_z) continue; + + int other_bin = bin_dim_z * (bin_dim_y * obin_x + obin_y) + obin_z; + + if(other_bin == bin) continue; + + int obin_c = bin_count[other_bin]; + + for(int otherActOffset = 0; otherActOffset < obin_c; otherActOffset += blockDim.x) { + int otherActIdx = otherActOffset + threadIdx.x; + + if(threadIdx.x < MIN(blockDim.x, obin_c - otherActOffset)) { + other_id[threadIdx.x] = binned_id[__mul24(other_bin, bin_nmax) + otherActIdx]; + my_x = binned_x + __mul24(__mul24(other_bin, 3), bin_nmax) + otherActIdx; + other_x[threadIdx.x] = *my_x; + my_x += bin_nmax; + other_x[threadIdx.x + blockDim.x] = *my_x; + my_x += bin_nmax; + other_x[threadIdx.x + 2 * blockDim.x] = *my_x; + } + + __syncthreads(); + + for(int k = 0; k < MIN(blockDim.x, obin_c - otherActOffset); ++k) { + if(i < natoms) { + int j = other_id[k]; + + if(exclude && exclusion(i, j, itype, _type[j])) continue; + + if(globcutoff < 0) { + int jtype = _type[j]; + cut = _cutneighsq[itype * _cuda_ntypes + jtype]; + } + + CUDA_FLOAT delx = x_i - other_x[k]; + CUDA_FLOAT dely = y_i - other_x[k + blockDim.x]; + CUDA_FLOAT delz = z_i - other_x[k + 2 * blockDim.x]; + CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz; + + if(rsq <= cut && i != j) { + if(jnum < _maxneighbors) { + if(block_style) + _neighbors[i * _maxneighbors + jnum] = j; + else + _neighbors[i + jnum * natoms] = j; + } + + ++jnum; + } + } + } + + __syncthreads(); + + } + } + + if(jnum > _maxneighbors)((int*)_buffer)[0] = -jnum; + + if(i < natoms) + _numneigh[i] = jnum; + } +} + + +__global__ void FindSpecial(int block_style) +{ + int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + int which; + int tag_mask = 0; + int3 spec_flag; + + int3 mynspecial = {0, 0, 1}; + + if(ii >= _nlocal) return; + + int special_id[CUDA_MAX_NSPECIAL]; + + int i = _ilist[ii]; + + if(i >= _nlocal) return; + + int jnum = _numneigh[i]; + + if(_special_flag[1] == 0) spec_flag.x = -1; + else if(_special_flag[1] == 1) spec_flag.x = 0; + else spec_flag.x = 1; + + if(_special_flag[2] == 0) spec_flag.y = -1; + else if(_special_flag[2] == 1) spec_flag.y = 0; + else spec_flag.y = 2; + + if(_special_flag[3] == 0) spec_flag.z = -1; + else if(_special_flag[3] == 1) spec_flag.z = 0; + else spec_flag.z = 3; + + mynspecial.x = _nspecial[i]; + mynspecial.y = _nspecial[i + _nmax]; + mynspecial.z = _nspecial[i + 2 * _nmax]; + + if(i < _nlocal) { + int* list = &_special[i]; + + for(int k = 0; k < mynspecial.z; k++) { + special_id[k] = list[k * _nmax]; + tag_mask = tag_mask | special_id[k]; + } + } + + + for(int k = 0; k < MIN(jnum, _maxneighbors); k++) { + int 
j; + + if(block_style) + j = _neighbors[i * _maxneighbors + k]; + else + j = _neighbors[i + k * _nlocal]; + + int tag_j = _tag[j]; + which = 0; + + if((tag_mask & tag_j) == tag_j) { + which = find_special(mynspecial, special_id, tag_j, spec_flag); + + if(which > 0) { + if(block_style) + _neighbors[i * _maxneighbors + k] = j ^ (which << SBBITS); + else + _neighbors[i + k * _nlocal] = j ^ (which << SBBITS); + } else if(which < 0) { + if(block_style) + _neighbors[i * _maxneighbors + k] = _neighbors[i * _maxneighbors + jnum - 1]; + else + _neighbors[i + k * _nlocal] = _neighbors[i + (jnum - 1) * _nlocal]; + + jnum--; + k--; + } + } + } + + _numneigh[i] = jnum; +} + +__global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_FLOAT globcutoff, int block_style) +{ + int bin_dim_z = gridDim.y; + CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer; + binned_x = &binned_x[2]; + int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax]; + int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y; + int bin_x = blockIdx.x / bin_dim_y; + int bin_y = blockIdx.x - bin_x * bin_dim_y; + int bin_z = blockIdx.y; + int bin_c = bin_count[bin]; + + + CUDA_FLOAT cut; + + if(globcutoff > 0) + cut = globcutoff; + + int i = _nall; + CUDA_FLOAT* my_x; + CUDA_FLOAT x_i, y_i, z_i; + + for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) { + + int actIdx = threadIdx.x + actOffset; + CUDA_FLOAT* other_x = shared; + int* other_id = (int*) &other_x[3 * blockDim.x]; + + if(actIdx < bin_c) { + i = binned_id[__mul24(bin, bin_nmax) + actIdx]; + my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + actIdx; + x_i = *my_x; + my_x += bin_nmax; + y_i = *my_x; + my_x += bin_nmax; + z_i = *my_x; + } else + i = 2 * _nall; + + __syncthreads(); + + int jnum = 0; + int jnum_border = 0; + int jnum_inner = 0; + int i_border = -1; + int itype; + + if(i < _nlocal) { + jnum = 0; + _ilist[i] = i; + itype = _type[i]; + } + + __syncthreads(); + + + for(int otherActOffset = 0; otherActOffset < bin_c; otherActOffset += blockDim.x) { + int otherActIdx = threadIdx.x + otherActOffset; + + if(otherActIdx < bin_c) { + if(otherActOffset == actOffset) { + other_id[threadIdx.x] = i; + other_x[threadIdx.x] = x_i; + other_x[threadIdx.x + blockDim.x] = y_i; + other_x[threadIdx.x + 2 * blockDim.x] = z_i; + } else { + other_id[threadIdx.x] = binned_id[__mul24(bin, bin_nmax) + otherActIdx]; + my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + otherActIdx; + other_x[threadIdx.x] = *my_x; + my_x += bin_nmax; + other_x[threadIdx.x + blockDim.x] = *my_x; + my_x += bin_nmax; + other_x[threadIdx.x + __mul24(2, blockDim.x)] = *my_x; + + } + } + + __syncthreads(); + int kk = threadIdx.x; + + for(int k = 0; k < MIN(bin_c - otherActOffset, blockDim.x); ++k) { + if(i < _nlocal) { + kk++; + kk = kk < MIN(bin_c - otherActOffset, blockDim.x) ? 
kk : 0; + int j = other_id[kk]; + + if(globcutoff < 0) { + int jtype = _type[j]; + cut = _cutneighsq[itype * _cuda_ntypes + jtype]; + } + + CUDA_FLOAT delx = x_i - other_x[kk]; + CUDA_FLOAT dely = y_i - other_x[kk + blockDim.x]; + CUDA_FLOAT delz = z_i - other_x[kk + 2 * blockDim.x]; + CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz; + + + if(rsq <= cut && i != j) { + if((j >= _nlocal) && (i_border < 0)) + i_border = atomicAdd(_inum_border, 1); + + if(jnum < _maxneighbors) { + if(block_style) { + _neighbors[i * _maxneighbors + jnum] = j; + + if(j >= _nlocal) { + _neighbors_border[i_border * _maxneighbors + jnum_border] = j; + } else { + _neighbors_inner[i * _maxneighbors + jnum_inner] = j; + } + } else { + _neighbors[i + jnum * _nlocal] = j; + + if(j >= _nlocal) { + _neighbors_border[i_border + jnum_border * _nlocal] = j; + } else { + _neighbors_inner[i + jnum_inner * _nlocal] = j; + } + } + } + + ++jnum; + + if(j >= _nlocal) + jnum_border++; + else + jnum_inner++; + } + } + } + + __syncthreads(); + } + + for(int obin_x = bin_x - 1; obin_x < bin_x + 2; obin_x++) + for(int obin_y = bin_y - 1; obin_y < bin_y + 2; obin_y++) + for(int obin_z = bin_z - 1; obin_z < bin_z + 2; obin_z++) { + if(obin_x < 0 || obin_y < 0 || obin_z < 0) continue; + + if(obin_x >= bin_dim_x || obin_y >= bin_dim_y || obin_z >= bin_dim_z) continue; + + int other_bin = bin_dim_z * (bin_dim_y * obin_x + obin_y) + obin_z; + + if(other_bin == bin) continue; + + int obin_c = bin_count[other_bin]; + + for(int otherActOffset = 0; otherActOffset < obin_c; otherActOffset += blockDim.x) { + int otherActIdx = otherActOffset + threadIdx.x; + + if(threadIdx.x < MIN(blockDim.x, obin_c - otherActOffset)) { + other_id[threadIdx.x] = binned_id[__mul24(other_bin, bin_nmax) + otherActIdx]; + my_x = binned_x + __mul24(__mul24(other_bin, 3), bin_nmax) + otherActIdx; + other_x[threadIdx.x] = *my_x; + my_x += bin_nmax; + other_x[threadIdx.x + blockDim.x] = *my_x; + my_x += bin_nmax; + other_x[threadIdx.x + 2 * blockDim.x] = *my_x; + } + + __syncthreads(); + + for(int k = 0; k < MIN(blockDim.x, obin_c - otherActOffset); ++k) { + if(i < _nlocal) { + int j = other_id[k]; + + if(globcutoff < 0) { + int jtype = _type[j]; + cut = _cutneighsq[itype * _cuda_ntypes + jtype]; + } + + CUDA_FLOAT delx = x_i - other_x[k]; + CUDA_FLOAT dely = y_i - other_x[k + blockDim.x]; + CUDA_FLOAT delz = z_i - other_x[k + 2 * blockDim.x]; + CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz; + + if(rsq <= cut && i != j) { + if((j >= _nlocal) && (i_border < 0)) + i_border = atomicAdd(_inum_border, 1); + + if(jnum < _maxneighbors) { + if(block_style) { + _neighbors[i * _maxneighbors + jnum] = j; + + if(j >= _nlocal) { + _neighbors_border[i_border * _maxneighbors + jnum_border] = j; + } else { + _neighbors_inner[i * _maxneighbors + jnum_inner] = j; + } + } else { + _neighbors[i + jnum * _nlocal] = j; + + if(j >= _nlocal) { + _neighbors_border[i_border + jnum_border * _nlocal] = j; + } else { + _neighbors_inner[i + jnum_inner * _nlocal] = j; + } + } + } + + ++jnum; + + if(j >= _nlocal) + jnum_border++; + else + jnum_inner++; + } + } + } + + __syncthreads(); + } + } + + if(jnum > _maxneighbors)((int*)_buffer)[0] = -jnum; + + if(i < _nlocal) { + _numneigh[i] = jnum; + _numneigh_inner[i] = jnum_inner; + + if(i_border >= 0) _numneigh_border[i_border] = jnum_border; + + if(i_border >= 0) _ilist_border[i_border] = i; + + } + } +} + +__global__ void NeighborBuildFullNsq_Kernel() +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + int* 
buffer = (int*) _buffer; + + if(i < _nlocal) { + X_FLOAT* my_x = _x + i; + CUDA_FLOAT x_i = *my_x; + my_x += _nmax; + CUDA_FLOAT y_i = *my_x; + my_x += _nmax; + CUDA_FLOAT z_i = *my_x; + int jnum = 0; + int* jlist = _firstneigh[i]; + _ilist[i] = i; + + int itype = _type[i]; + __syncthreads(); + + for(int j = 0; j < _nall; ++j) { + my_x = _x + j; + CUDA_FLOAT x_j = *my_x; + my_x += _nmax; + CUDA_FLOAT y_j = *my_x; + my_x += _nmax; + CUDA_FLOAT z_j = *my_x; + CUDA_FLOAT delx = x_i - x_j; + CUDA_FLOAT dely = y_i - y_j; + CUDA_FLOAT delz = z_i - z_j; + CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz; + int jtype = _type[j]; + + if(rsq <= _cutneighsq[itype * _cuda_ntypes + jtype] && i != j) { + if(jnum < _maxneighbors) + jlist[jnum] = j; + + if(i == 151)((int*)_buffer)[jnum + 2] = j; + + ++jnum; + } + + __syncthreads(); + } + + if(jnum > _maxneighbors) buffer[0] = 0; + + _numneigh[i] = jnum; + + if(i == 151)((int*)_buffer)[1] = jnum; + } +} + diff --git a/lib/cuda/pair_born_coul_long_cuda.cu b/lib/cuda/pair_born_coul_long_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..e6d66733e73fa107980534ce92224147b7b66356 --- /dev/null +++ b/lib/cuda/pair_born_coul_long_cuda.cu @@ -0,0 +1,78 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _rhoinv MY_AP(coeff1) +#define _sigma MY_AP(coeff2) +#define _a MY_AP(coeff3) +#define _c MY_AP(coeff4) +#define _d MY_AP(coeff5) + +#include "pair_born_coul_long_cuda_cu.h" +#include "pair_born_coul_long_cuda_kernel_nc.cu" + +#include <time.h> + +void Cuda_PairBornCoulLongCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5, true); +} + +void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + + static short init = 0; + + if(! 
init) { + init = 1; + Cuda_PairBornCoulLongCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_BORN, COUL_LONG, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_BORN, COUL_LONG, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + + +#undef _rhoinv +#undef _sigma +#undef _a +#undef _c +#undef _d + diff --git a/lib/cuda/pair_born_coul_long_cuda_cu.h b/lib/cuda/pair_born_coul_long_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..cdd4e6cafae7f06c9b3edd47b0a91dee9c0838cc --- /dev/null +++ b/lib/cuda/pair_born_coul_long_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); +#endif diff --git a/lib/cuda/pair_born_coul_long_cuda_kernel_nc.cu b/lib/cuda/pair_born_coul_long_cuda_kernel_nc.cu new file mode 100644 index 0000000000000000000000000000000000000000..bc79848b08dc794fb3d77908a63a309882deef2d --- /dev/null +++ b/lib/cuda/pair_born_coul_long_cuda_kernel_nc.cu @@ -0,0 +1,36 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ +__device__ inline F_FLOAT PairBornCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) +{ + const F_FLOAT r2inv = F_F(1.0) / rsq; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT r6inv = r2inv * r2inv * r2inv; + const F_FLOAT rexp = _EXP_((_sigma[ij_type] - r) * _rhoinv[ij_type]); + const F_FLOAT forceborn = _a[ij_type] * _rhoinv[ij_type] * r * rexp - + F_F(6.0) * _c[ij_type] * r6inv + F_F(8.0) * _d[ij_type] * r2inv * r6inv; + + if(eflag) evdwl += factor_lj * (_a[ij_type] * rexp - _c[ij_type] * r6inv + + _d[ij_type] * r2inv * r6inv - _offset[ij_type]); + + return factor_lj * forceborn * r2inv; +} diff --git a/lib/cuda/pair_buck_coul_cut_cuda.cu b/lib/cuda/pair_buck_coul_cut_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..ba61f5e036302939297d4d577259de67c688faea --- /dev/null +++ b/lib/cuda/pair_buck_coul_cut_cuda.cu @@ -0,0 +1,75 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _rhoinv MY_AP(coeff1) +#define _buck1 MY_AP(coeff2) +#define _buck2 MY_AP(coeff3) +#define _a MY_AP(coeff4) +#define _c MY_AP(coeff5) + +#include "pair_buck_coul_cut_cuda_cu.h" + +#include <time.h> +void Cuda_PairBuckCoulCutCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5, true); +} + +void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + + static short init = 0; + + if(! 
init) { + init = 1; + Cuda_PairBuckCoulCutCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_BUCK, COUL_CUT, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_BUCK, COUL_CUT, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _rhoinv +#undef _buck1 +#undef _buck2 +#undef _a +#undef _c + diff --git a/lib/cuda/pair_buck_coul_cut_cuda_cu.h b/lib/cuda/pair_buck_coul_cut_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..25f916119bdadf065243851354f4090fbe627946 --- /dev/null +++ b/lib/cuda/pair_buck_coul_cut_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); +#endif diff --git a/lib/cuda/pair_buck_coul_long_cuda.cu b/lib/cuda/pair_buck_coul_long_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..f4e7203f839dbbad3acb352a2f104d1a43c2f79c --- /dev/null +++ b/lib/cuda/pair_buck_coul_long_cuda.cu @@ -0,0 +1,78 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _rhoinv MY_AP(coeff1) +#define _buck1 MY_AP(coeff2) +#define _buck2 MY_AP(coeff3) +#define _a MY_AP(coeff4) +#define _c MY_AP(coeff5) + +#include "pair_buck_coul_long_cuda_cu.h" + +#include <time.h> + +void Cuda_PairBuckCoulLongCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5, true); +} + +void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + + static short init = 0; + + if(! init) { + init = 1; + Cuda_PairBuckCoulLongCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_BUCK, COUL_LONG, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_BUCK, COUL_LONG, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + + + +#undef _rhoinv +#undef _buck1 +#undef _buck2 +#undef _a +#undef _c + diff --git a/lib/cuda/pair_buck_coul_long_cuda_cu.h b/lib/cuda/pair_buck_coul_long_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..b13476644ae88d88d8cae476894cc5975fed5b70 --- /dev/null +++ b/lib/cuda/pair_buck_coul_long_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); +#endif diff --git a/lib/cuda/pair_buck_cuda.cu b/lib/cuda/pair_buck_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..b7ca740c00567fb1f856291490d324d22d0a038c --- /dev/null +++ b/lib/cuda/pair_buck_cuda.cu @@ -0,0 +1,77 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _rhoinv MY_AP(coeff1) +#define _buck1 MY_AP(coeff2) +#define _buck2 MY_AP(coeff3) +#define _a MY_AP(coeff4) +#define _c MY_AP(coeff5) + +#include "pair_buck_cuda_cu.h" +#include "pair_buck_cuda_kernel_nc.cu" + +#include <time.h> + +void Cuda_PairBuckCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5); +} + +void Cuda_PairBuckCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + + static short init = 0; + + if(! init) { + init = 1; + Cuda_PairBuckCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_BUCK, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_BUCK, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _rhoinv +#undef _buck1 +#undef _buck2 +#undef _a +#undef _c + diff --git a/lib/cuda/pair_buck_cuda_cu.h b/lib/cuda/pair_buck_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..152fad94f44b9043831cbd13d6b61b2b2838e665 --- /dev/null +++ b/lib/cuda/pair_buck_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairBuckCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairBuckCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); +#endif diff --git a/lib/cuda/pair_buck_cuda_kernel_nc.cu b/lib/cuda/pair_buck_cuda_kernel_nc.cu new file mode 100644 index 0000000000000000000000000000000000000000..8ce7d28654e288e62ce3f281cc63c523db1505ec --- /dev/null +++ b/lib/cuda/pair_buck_cuda_kernel_nc.cu @@ -0,0 +1,35 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ +__device__ inline F_FLOAT PairBuckCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) +{ + const F_FLOAT r2inv = F_F(1.0) / rsq; + const F_FLOAT r6inv = r2inv * r2inv * r2inv; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT rexp = _EXP_(-r * _rhoinv[ij_type]); + const F_FLOAT forcebuck = _buck1[ij_type] * r * rexp - _buck2[ij_type] * r6inv; + + if(eflag) evdwl += factor_lj * (_a[ij_type] * rexp - _c[ij_type] * r6inv - + _offset[ij_type]); + + return (factor_lj * forcebuck) * r2inv; +} diff --git a/lib/cuda/pair_cg_cmm_coul_cut_cuda.cu b/lib/cuda/pair_cg_cmm_coul_cut_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..7f9f853437b4575390d807652dd7fcd85f79b216 --- /dev/null +++ b/lib/cuda/pair_cg_cmm_coul_cut_cuda.cu @@ -0,0 +1,81 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) +#define _cg_type MY_AP(coeff5) + + +#include "pair_cg_cmm_coul_cut_cuda_cu.h" +#include <time.h> + + + + +void Cuda_PairCGCMMCoulCutCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5, true, false); + +} + + + + +void Cuda_PairCGCMMCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + // initialize only on first call + static short init = 0; + + if(! init) { + init = 1; + Cuda_PairCGCMMCoulCutCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 128); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_CG_CMM, COUL_CUT, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_CG_CMM, COUL_CUT, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _cg_type + diff --git a/lib/cuda/pair_cg_cmm_coul_cut_cuda_cu.h b/lib/cuda/pair_cg_cmm_coul_cut_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..fff7db7a646a72d9676983f27326a4f907d3cd5f --- /dev/null +++ b/lib/cuda/pair_cg_cmm_coul_cut_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairCGCMMCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_cg_cmm_coul_debye_cuda.cu b/lib/cuda/pair_cg_cmm_coul_debye_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..43bedca88321f22e54916cf720911740bf7ec6ad --- /dev/null +++ b/lib/cuda/pair_cg_cmm_coul_debye_cuda.cu @@ -0,0 +1,81 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) +#define _cg_type MY_AP(coeff5) + + +#include "pair_cg_cmm_coul_debye_cuda_cu.h" +#include <time.h> + + + + +void Cuda_PairCGCMMCoulDebyeCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5, true, false); + +} + + + + +void Cuda_PairCGCMMCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + // initialize only on first call + static short init = 0; + + if(! init) { + init = 1; + Cuda_PairCGCMMCoulDebyeCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 128); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_CG_CMM, COUL_DEBYE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_CG_CMM, COUL_DEBYE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _cg_type + diff --git a/lib/cuda/pair_cg_cmm_coul_debye_cuda_cu.h b/lib/cuda/pair_cg_cmm_coul_debye_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..5b0b6af597f064ed9644f2920432540df9c1ecb7 --- /dev/null +++ b/lib/cuda/pair_cg_cmm_coul_debye_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairCGCMMCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_cg_cmm_coul_long_cuda.cu b/lib/cuda/pair_cg_cmm_coul_long_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..ed1bbf0cfc5da4f7e50d767384e69c68b355f82c --- /dev/null +++ b/lib/cuda/pair_cg_cmm_coul_long_cuda.cu @@ -0,0 +1,81 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) +#define _cg_type MY_AP(coeff5) + + +#include "pair_cg_cmm_coul_long_cuda_cu.h" +#include <time.h> + + + + +void Cuda_PairCGCMMCoulLongCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5, true, false); + +} + + + + +void Cuda_PairCGCMMCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + // initialize only on first call + static short init = 0; + + if(! init) { + init = 1; + Cuda_PairCGCMMCoulLongCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 128); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_CG_CMM, COUL_LONG, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_CG_CMM, COUL_LONG, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _cg_type + diff --git a/lib/cuda/pair_cg_cmm_coul_long_cuda_cu.h b/lib/cuda/pair_cg_cmm_coul_long_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..86befd78b8705dcbe46fa3bd30252f23c5b613bd --- /dev/null +++ b/lib/cuda/pair_cg_cmm_coul_long_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairCGCMMCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_cg_cmm_cuda.cu b/lib/cuda/pair_cg_cmm_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..7ec1ebff99911b98361c5bb32d21822a9870fba1 --- /dev/null +++ b/lib/cuda/pair_cg_cmm_cuda.cu @@ -0,0 +1,87 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) +#define _cg_type MY_AP(coeff5) + +enum {CG_NOT_SET = 0, CG_LJ9_6, CG_LJ12_4, CG_LJ12_6, NUM_CG_TYPES, + CG_COUL_NONE, CG_COUL_CUT, CG_COUL_DEBYE, CG_COUL_LONG + }; + +#include "pair_cg_cmm_cuda_cu.h" +#include "pair_cg_cmm_cuda_kernel_nc.cu" +#include <time.h> + + + + +void Cuda_PairCGCMMCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5, false, false); + +} + + + + +void Cuda_PairCGCMMCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + // initialize only on first call + static short init = 0; + + if(! 
init) { + init = 1; + Cuda_PairCGCMMCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + int maxthreads = 128; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, maxthreads); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_CG_CMM, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_CG_CMM, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _cg_type + diff --git a/lib/cuda/pair_cg_cmm_cuda_cu.h b/lib/cuda/pair_cg_cmm_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..739c0ae28f2885015ef50a1048ab7c8e7a90d29c --- /dev/null +++ b/lib/cuda/pair_cg_cmm_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairCGCMMCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_cg_cmm_cuda_kernel_nc.cu b/lib/cuda/pair_cg_cmm_cuda_kernel_nc.cu new file mode 100644 index 0000000000000000000000000000000000000000..85b41605bd00558e399434ec26541d541072a8ff --- /dev/null +++ b/lib/cuda/pair_cg_cmm_cuda_kernel_nc.cu @@ -0,0 +1,49 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT PairCGCMMCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) //0.11 of 0.4 +{ + const F_FLOAT r2inv = F_F(1.0) / rsq; + const int cg_type = _cg_type[ij_type]; + const F_FLOAT r4inv = r2inv * r2inv; + const F_FLOAT rNinv_first = cg_type != CG_LJ9_6 ? r4inv : _RSQRT_(rsq); + const F_FLOAT rNinv_second = cg_type != CG_LJ12_4 ? -r2inv : -F_F(1.0); + const F_FLOAT forcelj = r4inv * (_lj1[ij_type] * r4inv * rNinv_first + _lj2[ij_type] * rNinv_second); + + if(eflag) evdwl += factor_lj * (r4inv * (_lj3[ij_type] * r4inv * rNinv_first + _lj4[ij_type] * rNinv_second) - _offset[ij_type]); + + return factor_lj * forcelj * r2inv; +} + +/*__device__ inline F_FLOAT PairCGCMMCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const int cg_type = tex1Dfetch(_coeff5_gm_tex,ij_type); + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r4inv = r2inv*r2inv; + const F_FLOAT rNinv_first = cg_type!=CG_LJ9_6?r4inv:_RSQRT_(rsq); + const F_FLOAT rNinv_second = cg_type!=CG_LJ12_4?r2inv:F_F(1.0); + const F_FLOAT forcelj = r4inv * (tex1Dfetch(_coeff1_gm_tex,ij_type)*r4inv*rNinv_first - tex1Dfetch(_coeff2_gm_tex,ij_type)*rNinv_second); + + if(eflag) evdwl += factor_lj*(r4inv*(tex1Dfetch(_coeff3_gm_tex,ij_type)*r4inv*rNinv_first-tex1Dfetch(_coeff4_gm_tex,ij_type)*rNinv_second)); + return factor_lj*forcelj*r2inv; +}*/ diff --git a/lib/cuda/pair_eam_cuda.cu b/lib/cuda/pair_eam_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..cb20343770514e82824d1bb242015f7e568d85ff --- /dev/null +++ b/lib/cuda/pair_eam_cuda.cu @@ -0,0 +1,351 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _type2frho MY_AP(coeff1)
+#define _type2rhor MY_AP(coeff2)
+#define _type2z2r MY_AP(coeff3)
+#define _rdr MY_AP(rdr)
+#define _rdrho MY_AP(rdrho)
+#define _nr MY_AP(nr)
+#define _nrho MY_AP(nrho)
+#define _nfrho MY_AP(nfrho)
+#define _nrhor MY_AP(nrhor)
+#define _nz2r MY_AP(nz2r)
+#define _frho_spline MY_AP(frho_spline)
+#define _rhor_spline MY_AP(rhor_spline)
+#define _z2r_spline MY_AP(z2r_spline)
+#define _rho MY_AP(rho)
+#define _fp MY_AP(fp)
+
+__device__ __constant__ F_FLOAT MY_AP(rdr);
+__device__ __constant__ F_FLOAT MY_AP(rdrho);
+__device__ __constant__ int MY_AP(nr);
+__device__ __constant__ int MY_AP(nrho);
+__device__ __constant__ int MY_AP(nfrho);
+__device__ __constant__ int MY_AP(nrhor);
+__device__ __constant__ int MY_AP(nz2r);
+__device__ __constant__ F_FLOAT* MY_AP(frho_spline);
+__device__ __constant__ F_FLOAT* MY_AP(rhor_spline);
+__device__ __constant__ F_FLOAT* MY_AP(z2r_spline);
+__device__ __constant__ F_FLOAT* MY_AP(rho);
+__device__ __constant__ F_FLOAT* MY_AP(fp);
+
+#define _rhor_spline_tex MY_AP(rhor_spline_tex)
+#if F_PRECISION == 1
+texture<float4, 1> _rhor_spline_tex;
+#else
+texture<int4, 1> _rhor_spline_tex;
+#endif
+
+
+#define _z2r_spline_tex MY_AP(z2r_spline_tex)
+#if F_PRECISION == 1
+texture<float4, 1> _z2r_spline_tex;
+#else
+texture<int4, 1> _z2r_spline_tex;
+#endif
+
+
+
+#include "pair_eam_cuda_cu.h"
+#include "pair_eam_cuda_kernel_nc.cu"
+#include <time.h>
+
+int eam_buff_offset;
+int rhor_spline_size;
+void* rhor_spline_pointer;
+int z2r_spline_size;
+void* z2r_spline_pointer;
+
+
+inline void BindEAMTextures(cuda_shared_data* sdata)
+{
+  _rhor_spline_tex.normalized = false;                    // access with unnormalized texture coordinates
+  _rhor_spline_tex.filterMode = cudaFilterModePoint;      // point sampling, no interpolation between texels
+  _rhor_spline_tex.addressMode[0] = cudaAddressModeWrap;  // wrap texture coordinates
+
+  const textureReference* rhor_spline_texture_ptr = &MY_AP(rhor_spline_tex);
+
+#if F_PRECISION == 1
+  cudaChannelFormatDesc channelDescRhor = cudaCreateChannelDesc<float4>();
+  cudaBindTexture(0, rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size);
+#else
+  cudaChannelFormatDesc channelDescRhor = cudaCreateChannelDesc<int4>();
+  cudaBindTexture(0, rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size);
+#endif
+
+  _z2r_spline_tex.normalized = false;                    // access with unnormalized texture coordinates
+  _z2r_spline_tex.filterMode = cudaFilterModePoint;      // point sampling, no interpolation between texels
+  _z2r_spline_tex.addressMode[0] = cudaAddressModeWrap;  // wrap texture coordinates
+
+  const textureReference* z2r_spline_texture_ptr = &MY_AP(z2r_spline_tex);
+
+#if F_PRECISION == 1
+  cudaChannelFormatDesc channelDescZ2r = cudaCreateChannelDesc<float4>();
+  cudaBindTexture(0, z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size);
+#else
+  cudaChannelFormatDesc channelDescZ2r = cudaCreateChannelDesc<int4>();
+  cudaBindTexture(0, z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size);
+#endif
+
+}
+
+void Cuda_PairEAMCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
+{
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateBuffer failed");
+  int3 layout = getgrid(sneighlist->inum, 7 * sizeof(F_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(F_FLOAT);
+
+  if(sdata->buffersize < size) {
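+    // The reduction buffer provides 7 accumulator slots per block of the
+    // launch grid (1 energy + 6 virial components, matching the getgrid()
+    // call above); allocation is grow-only, smaller grids reuse the buffer.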
MYDBG(printf("Cuda_PairEAMCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) + + if(sdata->buffer != NULL) cudaFree(sdata->buffer); + + cudaMalloc((void**)&sdata->buffer, size); + sdata->buffersize = size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);) + } + + cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*)); + CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateBuffer failed"); +} + +void Cuda_PairEAMCuda_UpdateNeighbor(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned)); + cudaMemcpyToSymbol(MY_AP(firstneigh), & sneighlist->firstneigh.dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(ilist) , & sneighlist->ilist .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(inum) , & sneighlist->inum , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors , sizeof(int)); +} + +void Cuda_PairEAMCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateNmax failed"); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*)); + CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateNmax failed"); +} + + +void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata, double rdr, double rdrho, int nfrho, int nrhor, int nr, int nrho, int nz2r, + void* frho_spline, void* rhor_spline, void* z2r_spline, void* rho, void* fp, + int* type2frho, int** type2z2r, int** type2rhor) +{ + // !! LAMMPS indexes atom types starting with 1 !! + + unsigned cuda_ntypes = sdata->atom.ntypes + 1; + + if(cuda_ntypes * cuda_ntypes > CUDA_MAX_TYPES2) + printf("# CUDA: Cuda_PairEAMCuda_Init: you need %u types. this is more than %u " + "(assumed at compile time). 
re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=99 "
+           "or adjust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);
+
+  unsigned nI = sizeof(F_FLOAT) * cuda_ntypes * cuda_ntypes;
+
+  X_FLOAT cutsq_global;
+  cutsq_global = (X_FLOAT)(sdata->pair.cut_global);
+  cudaMemcpyToSymbol(MY_AP(cutsq_global) , &cutsq_global  , sizeof(X_FLOAT));
+
+
+  F_FLOAT* coeff_buf = new F_FLOAT[cuda_ntypes * cuda_ntypes];
+
+  for(int i = 0; i < cuda_ntypes; i++) coeff_buf[i] = type2frho[i];
+
+  cudaMemcpyToSymbol(MY_AP(coeff1) , coeff_buf , cuda_ntypes * sizeof(F_FLOAT));
+
+  for(int i = 0; i < cuda_ntypes * cuda_ntypes; i++) coeff_buf[i] = (&type2rhor[0][0])[i];
+
+  cudaMemcpyToSymbol(MY_AP(coeff2) , coeff_buf , nI);
+
+  for(int i = 0; i < cuda_ntypes * cuda_ntypes; i++) coeff_buf[i] = (&type2z2r[0][0])[i];
+
+  cudaMemcpyToSymbol(MY_AP(coeff3) , coeff_buf , nI);
+
+  delete [] coeff_buf;
+  X_FLOAT box_size[3] = {
+    sdata->domain.subhi[0] - sdata->domain.sublo[0],
+    sdata->domain.subhi[1] - sdata->domain.sublo[1],
+    sdata->domain.subhi[2] - sdata->domain.sublo[2]
+  };
+  F_FLOAT rdr_F = rdr;
+  F_FLOAT rdrho_F = rdrho;
+  cudaMemcpyToSymbol(MY_AP(box_size)   , box_size                 , sizeof(X_FLOAT) * 3);
+  cudaMemcpyToSymbol(MY_AP(cuda_ntypes), & cuda_ntypes            , sizeof(unsigned));
+  cudaMemcpyToSymbol(MY_AP(virial)     , &sdata->pair.virial.dev_data   , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(eng_vdwl)   , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity, sizeof(int) * 3);
+  cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(rdr), &rdr_F, sizeof(F_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(rdrho), &rdrho_F, sizeof(F_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(nr), &nr, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nrho), &nrho, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nfrho), &nfrho, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nrhor), &nrhor, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(rho), &rho, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(fp), &fp, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(frho_spline), &frho_spline, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(rhor_spline), &rhor_spline, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(z2r_spline), &z2r_spline, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(nz2r), &nz2r, sizeof(int));
+
+  rhor_spline_size = nrhor * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT);
+  z2r_spline_size = nz2r * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT);
+  rhor_spline_pointer = rhor_spline;
+  z2r_spline_pointer = z2r_spline;
+
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: init failed");
+
+}
+
+
+
+void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+  if(sdata->atom.update_nmax)
+    Cuda_PairEAMCuda_UpdateNmax(sdata, sneighlist);
+
+  if(sdata->atom.update_neigh)
+    Cuda_PairEAMCuda_UpdateNeighbor(sdata, sneighlist);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  if(sdata->buffer_new)
+    Cuda_PairEAMCuda_UpdateBuffer(sdata, sneighlist);
+
+  cudaMemcpyToSymbol(MY_AP(eatom)       , & sdata->atom.eatom    .dev_data, sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(vatom)       , & sdata->atom.vatom    .dev_data, sizeof(ENERGY_FLOAT*));
+
+  int sharedperproc = 0;
+
+  if(eflag || eflag_atom) sharedperproc = 1;
+
+  if(vflag || vflag_atom) sharedperproc = 7;
+
+  int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  eam_buff_offset = grid.x * grid.y;
+
+  BindXTypeTexture(sdata);
+  BindEAMTextures(sdata); // rebind the spline textures set up in Cuda_PairEAMCuda_Init
+
+
+  MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n", eflag, vflag);)
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 1 problems before kernel invocation");
+  PairEAMCuda_Kernel1 <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 1 execution failed");
+
+
+
+  MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel 1 done\n");)
+
+}
+
+void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+  int sharedperproc = 0;
+
+  if(eflag || eflag_atom) sharedperproc = 1;
+
+  if(vflag || vflag_atom) sharedperproc = 7;
+
+  int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  BindXTypeTexture(sdata);
+  BindEAMTextures(sdata); // rebind the spline textures set up in Cuda_PairEAMCuda_Init
+
+  sdata->pair.lastgridsize = grid.x * grid.y;
+  sdata->pair.n_energy_virial = sharedperproc;
+
+  MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n", eflag, vflag);)
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 2 problems before kernel invocation");
+  PairEAMCuda_Kernel2 <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom);
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 start failed");
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 execution failed");
+
+  if(eflag || vflag) {
+    int n = grid.x * grid.y;
+    grid.x = sharedperproc;
+    grid.y = 1;
+    threads.x = 256;
+    MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)*sharedperproc>>>(n);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_PairEAMCuda: virial compute Kernel execution failed");
+  }
+
+  MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel 2 done\n");)
+
+}
+
+void Cuda_PairEAMCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send)
+{
+  int3 layout = getgrid(n, 0);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  F_FLOAT* buf = (F_FLOAT*)(& ((double*)sdata->buffer)[eam_buff_offset]);
+
+  PairEAMCuda_PackComm_Kernel <<< grid, threads, 0>>> ((int*) sdata->comm.sendlist.dev_data, n
+      , sdata->comm.maxlistlength, iswap, buf);
+  cudaThreadSynchronize();
+  cudaMemcpy(buf_send, buf, n* sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
+  cudaThreadSynchronize();
+}
+
+void Cuda_PairEAMCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, void* fp)
+{
+  F_FLOAT* fp_first = &(((F_FLOAT*) fp)[first]);
+  cudaMemcpy(fp_first, buf_recv, n * sizeof(F_FLOAT), cudaMemcpyHostToDevice);
+}
+
+#undef _type2frho
+#undef _type2rhor
+#undef _type2z2r
+
diff --git a/lib/cuda/pair_eam_cuda_cu.h b/lib/cuda/pair_eam_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..d491efe70fadceef1ed2eb4faf6833d037dbd81e
--- /dev/null
+++ 
b/lib/cuda/pair_eam_cuda_cu.h @@ -0,0 +1,33 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" +extern "C" void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata, double rdr, double rdrho, int nfrho, int nrhor, int nr, int nrho, int nz2r, + void* frho_spline, void* rhor_spline, void* z2r_spline, void* rho, void* fp, + int* type2frho, int** type2z2r, int** type2rhor); +extern "C" void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); +extern "C" void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); +extern "C" void Cuda_PairEAMCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send); +extern "C" void Cuda_PairEAMCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, void* fp); + +#define EAM_COEFF_LENGTH 8 diff --git a/lib/cuda/pair_eam_cuda_kernel_nc.cu b/lib/cuda/pair_eam_cuda_kernel_nc.cu new file mode 100644 index 0000000000000000000000000000000000000000..458945418ab1131504faf4e86cec3f827c1f2e2c --- /dev/null +++ b/lib/cuda/pair_eam_cuda_kernel_nc.cu @@ -0,0 +1,341 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + + + + +static __device__ inline F_FLOAT4 fetchRhor(int i) +{ +#ifdef CUDA_USE_TEXTURE +#if F_PRECISION == 1 + return tex1Dfetch(_rhor_spline_tex, i); +#else + return tex1Dfetch_double_f(_rhor_spline_tex, i); +#endif +#else + return _rhor_spline[i]; +#endif +} + +static __device__ inline F_FLOAT4 fetchZ2r(int i) +{ +#ifdef CUDA_USE_TEXTURE +#if F_PRECISION == 1 + return tex1Dfetch(_z2r_spline_tex, i); +#else + return tex1Dfetch_double_f(_z2r_spline_tex, i); +#endif +#else + return _z2r_spline[i]; +#endif +} + +__global__ void PairEAMCuda_Kernel1(int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + ENERGY_FLOAT* sharedE; + ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + + + if(eflag || eflag_atom) { + sharedE = &sharedmem[threadIdx.x]; + sharedE[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + } + + if(vflag || vflag_atom) { + sharedV[0 * blockDim.x] = ENERGY_F(0.0); + sharedV[1 * blockDim.x] = ENERGY_F(0.0); + sharedV[2 * blockDim.x] = ENERGY_F(0.0); + sharedV[3 * blockDim.x] = ENERGY_F(0.0); + sharedV[4 * blockDim.x] = ENERGY_F(0.0); + sharedV[5 * blockDim.x] = ENERGY_F(0.0); + } + + int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + X_FLOAT xtmp, ytmp, ztmp; + X_FLOAT4 myxtype; + F_FLOAT delx, dely, delz; + int itype; + int i = _nlocal; + int jnum = 0; + int* jlist; + + if(ii < _inum) { + i = _ilist[ii]; + + myxtype = fetchXType(i); + xtmp = myxtype.x; + ytmp = myxtype.y; + ztmp = myxtype.z; + itype = static_cast <int>(myxtype.w); + + jnum = _numneigh[i]; + + jlist = &_neighbors[i]; + + if(i < _nlocal) + _rho[i] = F_F(0.0); + } + + __syncthreads(); + + for(int jj = 0; jj < jnum; jj++) { + if(ii < _inum) + if(jj < jnum) { + const int j = jlist[jj * _nlocal]; + myxtype = fetchXType(j); + delx = xtmp - myxtype.x; + dely = ytmp - myxtype.y; + delz = ztmp - myxtype.z; + int jtype = static_cast <int>(myxtype.w); + const F_FLOAT rsq = delx * delx + dely * dely + delz * delz; + + if(rsq < _cutsq_global) { + F_FLOAT p = sqrt(rsq) * _rdr + F_F(1.0); + int m = static_cast<int>(p); + m = MIN(m, _nr - 1); + p -= m; + p = MIN(p, F_F(1.0)); + + int k = (static_cast <int>(_type2rhor[jtype * _cuda_ntypes + itype]) * (_nr + 1) + m) * 2; + F_FLOAT4 c = fetchRhor(k + 1); + _rho[i] += ((c.w * p + c.x) * p + c.y) * p + c.z; + } + } + } + + if(ii < _inum) { + + F_FLOAT p = _rho[i] * _rdrho + F_F(1.0); + int m = static_cast<int>(p); + m = MAX(1, MIN(m, _nrho - 1)); + p -= m; + p = MIN(p, F_F(1.0)); + F_FLOAT* coeff = &_frho_spline[(static_cast <int>(_type2frho[itype]) * (_nrho + 1) + m) * EAM_COEFF_LENGTH]; + _fp[i] = (coeff[0] * p + coeff[1]) * p + coeff[2]; + + if(eflag || eflag_atom) { + sharedmem[threadIdx.x] += ((coeff[3] * p + coeff[4]) * p + coeff[5]) * p + coeff[6]; + } + + } + + __syncthreads(); + + if(eflag || eflag_atom) { + if(i < _nlocal && eflag_atom) + _eatom[i] += sharedmem[threadIdx.x]; + + reduceBlock(sharedmem); + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(2.0) * sharedmem[0]; + } +} + +__global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + + ENERGY_FLOAT* sharedE; + ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + + + if(eflag || eflag_atom) { + sharedE = &sharedmem[threadIdx.x]; + sharedE[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + } + + if(vflag || vflag_atom) { + sharedV[0 * blockDim.x] = ENERGY_F(0.0); + sharedV[1 * blockDim.x] = 
ENERGY_F(0.0); + sharedV[2 * blockDim.x] = ENERGY_F(0.0); + sharedV[3 * blockDim.x] = ENERGY_F(0.0); + sharedV[4 * blockDim.x] = ENERGY_F(0.0); + sharedV[5 * blockDim.x] = ENERGY_F(0.0); + } + + int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + X_FLOAT xtmp, ytmp, ztmp; + X_FLOAT4 myxtype; + F_FLOAT fxtmp, fytmp, fztmp, fpair; + F_FLOAT delx, dely, delz; + int itype, i; + int jnum = 0; + int* jlist; + + if(ii < _inum) { + i = _ilist[ii]; + + myxtype = fetchXType(i); + xtmp = myxtype.x; + ytmp = myxtype.y; + ztmp = myxtype.z; + itype = static_cast <int>(myxtype.w); + fxtmp = F_F(0.0); + fytmp = F_F(0.0); + fztmp = F_F(0.0); + + jnum = _numneigh[i]; + + jlist = &_neighbors[i]; + + if(i < _nlocal) + _rho[i] = F_F(0.0); + } + + if(ii < gridDim.x * gridDim.y) evdwl = ((ENERGY_FLOAT*) _buffer)[ii]; + + __syncthreads(); + + for(int jj = 0; jj < jnum; jj++) { + if(ii < _inum) + if(jj < jnum) { + const int j = jlist[jj * _nlocal]; + myxtype = fetchXType(j); + delx = xtmp - myxtype.x; + dely = ytmp - myxtype.y; + delz = ztmp - myxtype.z; + int jtype = static_cast <int>(myxtype.w); + const F_FLOAT rsq = delx * delx + dely * dely + delz * delz; + + if(rsq < _cutsq_global) { + F_FLOAT r = _SQRT_(rsq); + F_FLOAT p = r * _rdr + F_F(1.0); + int m = static_cast<int>(p); + m = MIN(m, _nr - 1); + p -= m; + p = MIN(p, F_F(1.0)); + + int k = (static_cast <int>(_type2rhor[itype * _cuda_ntypes + jtype]) * (_nr + 1) + m) * 2; + F_FLOAT4 c = fetchRhor(k); + F_FLOAT rhoip = (c.x * p + c.y) * p + c.z; + k = (static_cast <int>(_type2rhor[jtype * _cuda_ntypes + itype]) * (_nr + 1) + m) * 2; + c = fetchRhor(k); + F_FLOAT rhojp = (c.x * p + c.y) * p + c.z; + k = (static_cast <int>(_type2z2r[itype * _cuda_ntypes + jtype]) * (_nr + 1) + m) * 2; + c = fetchZ2r(k); + F_FLOAT z2p = (c.x * p + c.y) * p + c.z; + c = fetchZ2r(k + 1); + F_FLOAT z2 = ((c.w * p + c.x) * p + c.y) * p + c.z; + + F_FLOAT recip = F_F(1.0) / r; + F_FLOAT phi = z2 * recip; + F_FLOAT phip = z2p * recip - phi * recip; + F_FLOAT psip = _fp[i] * rhojp + _fp[j] * rhoip + phip; + fpair = -psip * recip; + + F_FLOAT dxfp, dyfp, dzfp; + fxtmp += dxfp = delx * fpair; + fytmp += dyfp = dely * fpair; + fztmp += dzfp = delz * fpair; + evdwl += phi; + + if(vflag || vflag_atom) { + sharedV[0 * blockDim.x] += delx * dxfp; + sharedV[1 * blockDim.x] += dely * dyfp; + sharedV[2 * blockDim.x] += delz * dzfp; + sharedV[3 * blockDim.x] += delx * dyfp; + sharedV[4 * blockDim.x] += delx * dzfp; + sharedV[5 * blockDim.x] += dely * dzfp; + } + } + } + } + + __syncthreads(); + + if(ii < _inum) { + F_FLOAT* my_f; + + if(_collect_forces_later) { + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + + if(eflag) { + buffer = &buffer[1 * gridDim.x * gridDim.y]; + } + + if(vflag) { + buffer = &buffer[6 * gridDim.x * gridDim.y]; + } + + my_f = (F_FLOAT*) buffer; + my_f += i; + *my_f = fxtmp; + my_f += _nmax; + *my_f = fytmp; + my_f += _nmax; + *my_f = fztmp; + } else { + my_f = _f + i; + *my_f += fxtmp; + my_f += _nmax; + *my_f += fytmp; + my_f += _nmax; + *my_f += fztmp; + } + } + + __syncthreads(); + + if(eflag) { + sharedE[0] = evdwl; + } + + if(eflag_atom && i < _nlocal) { + _eatom[i] += evdwl; + } + + if(vflag_atom && i < _nlocal) { + _vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x]; + _vatom[i + _nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x]; + _vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x]; + _vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x]; + _vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x]; + 
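+    // (the ENERGY_F(0.5) prefactor halves each pair term because a full
+    //  neighbor list tallies every i-j pair from both atoms; _nmax is the
+    //  stride between the six components of the per-atom virial array)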
_vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x]; + } + + if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, 0); +} + +__global__ void PairEAMCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, F_FLOAT* buffer) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + int* list = sendlist + iswap * maxlistlength; + + if(i < n) { + int j = list[i]; + buffer[i] = _fp[j]; + } +} + +__global__ void PairEAMCuda_UnpackComm_Kernel(int n, int first, F_FLOAT* buffer) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(i < n) { + _fp[i + first] = buffer[i]; + } +} diff --git a/lib/cuda/pair_gran_hooke_cuda.cu b/lib/cuda/pair_gran_hooke_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..5c143240cb5efb23af24d6ce174671516aae58c2 --- /dev/null +++ b/lib/cuda/pair_gran_hooke_cuda.cu @@ -0,0 +1,201 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _kn MY_AP(coeff1) //[0] +#define _kt MY_AP(coeff1) //[1] +#define _gamman MY_AP(coeff1) //[2] +#define _gammat MY_AP(coeff3) //[0] +#define _xmu MY_AP(coeff2) //[0] +#define _dampflag MY_AP(coeff2) //[1] + +#include "pair_gran_hooke_cuda_cu.h" +#include "pair_gran_hooke_cuda_kernel_nc.cu" +#include <time.h> + +void Cuda_PairGranHookeCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: before updateBuffer failed"); + int3 layout = getgrid(sneighlist->inum, 7 * sizeof(ENERGY_FLOAT)); + dim3 threads(layout.z, 1, 1); + dim3 grid(layout.x, layout.y, 1); + int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(ENERGY_FLOAT); + + if(sdata->buffersize < size) { + MYDBG(printf("Cuda_PairGranHookeCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);) + + if(sdata->buffer != NULL) cudaFree(sdata->buffer); + + cudaMalloc((void**)&sdata->buffer, size); + sdata->buffersize = size; + sdata->buffer_new++; + MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);) + } + + cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*)); + CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: updateBuffer failed"); +} + +void Cuda_PairGranHookeCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist) +{ + CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: before updateNmax failed"); + cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0] , sizeof(unsigned)); + //cudaMemcpyToSymbol(MY_AP(firstneigh), & sneighlist->firstneigh.dev_data, sizeof(int*) ); + cudaMemcpyToSymbol(MY_AP(ilist) , & sneighlist->ilist .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(inum) , & 
sneighlist->inum , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nall) , & sdata->atom.nall , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(neighbors) , & sneighlist->neighbors .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(x_type) , & sdata->atom.x_type .dev_data, sizeof(X_FLOAT4*)); + cudaMemcpyToSymbol(MY_AP(v_radius) , & sdata->atom.v_radius .dev_data, sizeof(V_FLOAT4*)); + cudaMemcpyToSymbol(MY_AP(omega_rmass), & sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*)); + cudaMemcpyToSymbol(MY_AP(torque) , & sdata->atom.torque .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(maxneighbors), &sneighlist->maxneighbors , sizeof(int)); + cudaMemcpyToSymbol(MY_AP(eatom) , & sdata->atom.eatom .dev_data, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom .dev_data, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(debugdata) , & sdata->debugdata , sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(freeze_group_bit) , & sdata->pair.freeze_group_bit, sizeof(int)); + + + CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: updateNmax failed"); +} + + +void Cuda_PairGranHookeCuda_Init(cuda_shared_data* sdata) +{ + // !! LAMMPS indexes atom types starting with 1 !! + + unsigned cuda_ntypes = sdata->atom.ntypes + 2; + + if(cuda_ntypes * cuda_ntypes > CUDA_MAX_TYPES2) + printf("# CUDA: Cuda_PairGranHookeCuda_Init: you need %u types. this is more than %u " + "(assumed at compile time). 
re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 "
+           "or adjust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES_PLUS_ONE - 1);
+
+  unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes;
+  unsigned n = sizeof(F_FLOAT) * cuda_ntypes2;
+
+  F_FLOAT coeffs1[cuda_ntypes2];
+  coeffs1[0] = (F_FLOAT) sdata->pair.coeff1[0][0];
+  coeffs1[1] = (F_FLOAT) sdata->pair.coeff1[0][1];
+  coeffs1[2] = (F_FLOAT) sdata->pair.coeff1[1][0];
+  F_FLOAT coeffs3[cuda_ntypes2];
+  coeffs3[0] = (F_FLOAT) sdata->pair.coeff1[1][1];
+  F_FLOAT coeffs2[cuda_ntypes2];
+  coeffs2[0] = (F_FLOAT) sdata->pair.coeff2[0][0];
+  coeffs2[1] = (F_FLOAT) sdata->pair.coeff2[0][1];
+
+
+  X_FLOAT box_size[3] = {
+    sdata->domain.subhi[0] - sdata->domain.sublo[0],
+    sdata->domain.subhi[1] - sdata->domain.sublo[1],
+    sdata->domain.subhi[2] - sdata->domain.sublo[2]
+  };
+  //printf("n: %i %i\n",n,CUDA_MAX_TYPES2);
+  cudaMemcpyToSymbol(MY_AP(box_size)   , box_size                 , sizeof(X_FLOAT) * 3);
+  cudaMemcpyToSymbol(MY_AP(cuda_ntypes), & cuda_ntypes            , sizeof(unsigned));
+  cudaMemcpyToSymbol(MY_AP(coeff1)     , coeffs1                  , n);
+  cudaMemcpyToSymbol(MY_AP(coeff2)     , coeffs2                  , n);
+  cudaMemcpyToSymbol(MY_AP(coeff3)     , coeffs3                  , n);
+  cudaMemcpyToSymbol(MY_AP(virial)     , &sdata->pair.virial.dev_data   , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(eng_vdwl)   , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity, sizeof(int) * 3);
+  CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: init failed");
+}
+
+
+
+void Cuda_PairGranHookeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+  //if(sdata->atom.update_nmax)
+  Cuda_PairGranHookeCuda_UpdateNmax(sdata, sneighlist);
+
+  //if(sdata->atom.update_nlocal)
+  {
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+    cudaMemcpyToSymbol(MY_AP(nall)    , & sdata->atom.nall          , sizeof(int));
+  }
+  //if(sdata->buffer_new)
+  Cuda_PairGranHookeCuda_UpdateBuffer(sdata, sneighlist);
+
+  BindXTypeTexture(sdata);
+  BindVRadiusTexture(sdata);
+  BindOmegaRmassTexture(sdata);
+
+  int sharedperproc = 0;
+
+  if(eflag) sharedperproc += 1;
+
+  if(vflag) sharedperproc += 6;
+
+  int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT), 128);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  // initialize only on first call
+  static short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairGranHookeCuda_Init(sdata);
+  }
+
+  MYDBG(printf("# CUDA: Cuda_PairGranHookeCuda: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);)
+
+  CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) pre pair gran hooke Kernel problems before kernel invocation");
+  PairGranHookeCuda_Kernel <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom, (int**)sneighlist->firstneigh.dev_data, sneighlist->binned_id
+      , (F_FLOAT) sdata->pair.coeff1[0][0], (F_FLOAT) sdata->pair.coeff1[1][0], (F_FLOAT) sdata->pair.coeff1[1][1], (F_FLOAT) sdata->pair.coeff2[0][0]);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) pair gran hooke Kernel execution failed");
+
+  if(eflag || vflag) {
+    int n = grid.x * grid.y;
+    grid.x = sharedperproc;
+    grid.y = 1;
+    threads.x = 256;
+    MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) virial compute Kernel execution failed");
+  }
+
+  MYDBG(printf("# CUDA: Cuda_PairGranHookeCuda: kernel done\n");)
+
+}
+
+
+#undef _kn
+#undef _kt
+#undef _gamman
+#undef _gammat
+#undef _xmu
+#undef _dampflag
+
+
diff --git a/lib/cuda/pair_gran_hooke_cuda_cu.h b/lib/cuda/pair_gran_hooke_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..78ad3c945266fc71a38c50ed9d8e241398b9133c
--- /dev/null
+++ b/lib/cuda/pair_gran_hooke_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairGranHookeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_gran_hooke_cuda_kernel_nc.cu b/lib/cuda/pair_gran_hooke_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e6a4ed2b8a75d7a40245531cccd404c3fd3d345e
--- /dev/null
+++ b/lib/cuda/pair_gran_hooke_cuda_kernel_nc.cu
@@ -0,0 +1,227 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + + +__global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, int vflag_atom, int** firstneight, int* binned_id + , F_FLOAT kn, F_FLOAT gamman, F_FLOAT gammat, F_FLOAT xmu) +{ + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + + ENERGY_FLOAT* sharedE; + ENERGY_FLOAT* sharedV; + + if(eflag || eflag_atom) { + sharedE = &sharedmem[threadIdx.x]; + sharedV = &sharedmem[0]; + sharedE[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + } + + if(vflag || vflag_atom) { + sharedV += threadIdx.x; + sharedV[0 * blockDim.x] = ENERGY_F(0.0); + sharedV[1 * blockDim.x] = ENERGY_F(0.0); + sharedV[2 * blockDim.x] = ENERGY_F(0.0); + sharedV[3 * blockDim.x] = ENERGY_F(0.0); + sharedV[4 * blockDim.x] = ENERGY_F(0.0); + sharedV[5 * blockDim.x] = ENERGY_F(0.0); + } + + int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + MYEMUDBG(if(ii == 0) printf("# CUDA: PairGranHookeCuda_Kernel: -- no binning --\n");) + + X_FLOAT xtmp, ytmp, ztmp; + + X_FLOAT4 myxtype; + V_FLOAT4 myvradius, ovradius; + F_FLOAT fxtmp, fytmp, fztmp, torquextmp, torqueytmp, torqueztmp; + F_FLOAT delx, dely, delz; + F_FLOAT radi, radj, radsum, r, rsqinv; + F_FLOAT vr1, vr2, vr3, vnnr, vn1, vn2, vn3, vt1, vt2, vt3; + F_FLOAT wr1, wr2, wr3; + F_FLOAT vtr1, vtr2, vtr3, vrel; + F_FLOAT meff, damp, ccel, tor1, tor2, tor3; + F_FLOAT fn, fs, ft, fs1, fs2, fs3; + + int jnum = 0; + int i, j; + int* jlist; + + if(ii < _inum) { + i = _ilist[ii]; + + myxtype = fetchXType(i); + myvradius = fetchVRadius(i); + + xtmp = myxtype.x; + ytmp = myxtype.y; + ztmp = myxtype.z; + radi = myvradius.w; + + fxtmp = F_F(0.0); + fytmp = F_F(0.0); + fztmp = F_F(0.0); + torquextmp = F_F(0.0); + torqueytmp = F_F(0.0); + torqueztmp = F_F(0.0); + + jnum = _numneigh[i]; + + jlist = &_neighbors[i]; + } + + __syncthreads(); + + for(int jj = 0; jj < jnum; jj++) { + if(ii < _inum) + if(jj < jnum) { + j = jlist[jj * _nlocal]; + + myxtype = fetchXType(j); + ovradius = fetchVRadius(j); + + delx = xtmp - myxtype.x; + dely = ytmp - myxtype.y; + delz = ztmp - myxtype.z; + + radj = ovradius.w; + radsum = radi + radj; + + const F_FLOAT rsq = delx * delx + dely * dely + delz * delz; + + if(rsq < radsum * radsum) { + const F_FLOAT rinv = _RSQRT_(rsq); + r = F_F(1.0) / rinv; + rsqinv = F_F(1.0) / rsq; + + // relative translational velocity + + vr1 = myvradius.x - ovradius.x; + vr2 = myvradius.y - ovradius.y; + vr3 = myvradius.z - ovradius.z; + + // normal component + + vnnr = vr1 * delx + vr2 * dely + vr3 * delz; + vn1 = delx * vnnr * rsqinv; + vn2 = dely * vnnr * rsqinv; + vn3 = delz * vnnr * rsqinv; + + // tangential component + + vt1 = vr1 - vn1; + vt2 = vr2 - vn2; + vt3 = vr3 - vn3; + + // relative rotational velocity + V_FLOAT4 omegarmass_i = fetchOmegaRmass(i); + V_FLOAT4 omegarmass_j = fetchOmegaRmass(j); + + wr1 = (radi * omegarmass_i.x + radj * omegarmass_j.x) * rinv; + wr2 = (radi * omegarmass_i.y + radj * omegarmass_j.y) * rinv; + wr3 = (radi * omegarmass_i.z + radj * omegarmass_j.z) * rinv; + + meff = omegarmass_i.w * 
omegarmass_j.w / (omegarmass_i.w + omegarmass_j.w); + + if(_mask[i] & _freeze_group_bit) meff = omegarmass_j.w; + + if(_mask[j] & _freeze_group_bit) meff = omegarmass_i.w; + + damp = meff * gamman * vnnr * rsqinv; + ccel = kn * (radsum - r) * rinv - damp; + + vtr1 = vt1 - (delz * wr2 - dely * wr3); + vtr2 = vt2 - (delx * wr3 - delz * wr1); + vtr3 = vt3 - (dely * wr1 - delx * wr2); + vrel = vtr1 * vtr1 + vtr2 * vtr2 + vtr3 * vtr3; + vrel = _SQRT_(vrel); + + fn = xmu * fabs(ccel * r); + fs = meff * gammat * vrel; + ft = (vrel != F_F(0.0)) ? MIN(fn, fs) / vrel : F_F(0.0); + + fs1 = -ft * vtr1; + fs2 = -ft * vtr2; + fs3 = -ft * vtr3; + + F_FLOAT dxfp, dyfp, dzfp; + fxtmp += dxfp = delx * ccel + fs1; + fytmp += dyfp = dely * ccel + fs2; + fztmp += dzfp = delz * ccel + fs3; + + tor1 = rinv * (dely * fs3 - delz * fs2); + tor2 = rinv * (delz * fs1 - delx * fs3); + tor3 = rinv * (delx * fs2 - dely * fs1); + + torquextmp -= radi * tor1; + torqueytmp -= radi * tor2; + torqueztmp -= radi * tor3; + + if(vflag) { + sharedV[0 * blockDim.x] += delx * dxfp; + sharedV[1 * blockDim.x] += dely * dyfp; + sharedV[2 * blockDim.x] += delz * dzfp; + sharedV[3 * blockDim.x] += delx * dyfp; + sharedV[4 * blockDim.x] += delx * dzfp; + sharedV[5 * blockDim.x] += dely * dzfp; + } + + } + } + } + + __syncthreads(); + + if(ii < _inum) { + F_FLOAT* my_f = _f + i; + *my_f += fxtmp; + my_f += _nmax; + *my_f += fytmp; + my_f += _nmax; + *my_f += fztmp; + F_FLOAT* my_torque = _torque + i; + *my_torque += torquextmp; + my_torque += _nmax; + *my_torque += torqueytmp; + my_torque += _nmax; + *my_torque += torqueztmp; + } + + __syncthreads(); + + if(eflag) sharedE[0] = evdwl; + + if(eflag_atom && i < _nlocal) _eatom[i] += evdwl; + + if(vflag_atom && i < _nlocal) { + _vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x]; + _vatom[i + _nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x]; + _vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x]; + _vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x]; + _vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x]; + _vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x]; + } + + if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, 0); +} diff --git a/lib/cuda/pair_lj96_cut_cuda.cu b/lib/cuda/pair_lj96_cut_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..1d40a3c82e9334fcdea8d0744710535316f6b531 --- /dev/null +++ b/lib/cuda/pair_lj96_cut_cuda.cu @@ -0,0 +1,79 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj96_cut_cuda_cu.h" +#include "pair_lj96_cut_cuda_kernel_nc.cu" +#include <time.h> + + + + +void Cuda_PairLJ96CutCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4, false, false); +} + + + + +void Cuda_PairLJ96CutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + // initialize only on first call + static short init = 0; + + if(! init) { + init = 1; + Cuda_PairLJ96CutCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 256); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ96_CUT, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ96_CUT, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 + + diff --git a/lib/cuda/pair_lj96_cut_cuda_cu.h b/lib/cuda/pair_lj96_cut_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..0e4b62daff2835855c6d51bf39e3d80b3c59a502 --- /dev/null +++ b/lib/cuda/pair_lj96_cut_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJ96CutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_lj96_cut_cuda_kernel_nc.cu b/lib/cuda/pair_lj96_cut_cuda_kernel_nc.cu new file mode 100644 index 0000000000000000000000000000000000000000..f3c2477be6f738e18c219cdadd929baad84d295a --- /dev/null +++ b/lib/cuda/pair_lj96_cut_cuda_kernel_nc.cu @@ -0,0 +1,35 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT PairLJ96CutCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) +{ + const F_FLOAT r2inv = F_F(1.0) / rsq; + const F_FLOAT r6inv = r2inv * r2inv * r2inv; + const F_FLOAT r3inv = _SQRT_(r6inv); + const F_FLOAT forcelj = r6inv * (_lj1[ij_type] * r3inv - _lj2[ij_type]); + + if(eflag) evdwl += factor_lj * (r6inv * (_lj3[ij_type] * r3inv - _lj4[ij_type]) - _offset[ij_type]); + + return factor_lj * forcelj * r2inv; +} + diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_cuda.cu b/lib/cuda/pair_lj_charmm_coul_charmm_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..752f3bd47dbb435a14f2d7ab08ba6c8697aefd85 --- /dev/null +++ b/lib/cuda/pair_lj_charmm_coul_charmm_cuda.cu @@ -0,0 +1,79 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1_gm) +#define _lj2 MY_AP(coeff2_gm) +#define _lj3 MY_AP(coeff3_gm) +#define _lj4 MY_AP(coeff4_gm) + +#include "pair_lj_charmm_coul_charmm_cuda_cu.h" +#include "pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu" + +#include <time.h> + +void Cuda_PairLJCharmmCoulCharmmCuda_Init(cuda_shared_data* sdata, F_FLOAT cut_coul_innersq, F_FLOAT denom_lj_inv, F_FLOAT denom_coul_inv) +{ + Cuda_Pair_Init_AllStyles(sdata, 4, true, true, true); + cudaMemcpyToSymbol(MY_AP(cut_coul_innersq_global) , &cut_coul_innersq , sizeof(F_FLOAT)); + cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT)); + cudaMemcpyToSymbol(MY_AP(denom_coul_inv) , &denom_coul_inv , sizeof(F_FLOAT)); + + return; +} + + + +void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, + int eflag_atom, int vflag_atom, F_FLOAT denom_lj, F_FLOAT cut_coul_innersq, F_FLOAT denom_coul) +{ + + static short init = 0; + + if(! 
init) { + init = 1; + Cuda_PairLJCharmmCoulCharmmCuda_Init(sdata, cut_coul_innersq, 1.0 / denom_lj, 1.0 / denom_coul); + } + + dim3 grid, threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_CHARMM, COUL_CHARMM, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_CHARMM, COUL_CHARMM, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_cuda_cu.h b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..e44d2941f2aa9de73b6ff4395df36cde895579aa --- /dev/null +++ b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_FLOAT denom_lj, F_FLOAT cut_coul_innersq, F_FLOAT denom_coul); diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu new file mode 100644 index 0000000000000000000000000000000000000000..d4ed2f48af11dd3f98983a995f072f1e924a0706 --- /dev/null +++ b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu @@ -0,0 +1,73 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */
+__device__ inline F_FLOAT PairLJCharmmCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
+{
+  const F_FLOAT r2inv = F_F(1.0) / rsq;
+  const F_FLOAT r6inv = r2inv * r2inv * r2inv;
+  F_FLOAT forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]);
+  F_FLOAT philj, switch1;
+
+  if(rsq > _cut_innersq_global) {
+    switch1 = (_cutsq_global - rsq) * (_cutsq_global - rsq) *
+              (_cutsq_global + F_F(2.0) * rsq - F_F(3.0) * _cut_innersq_global) * _denom_lj_inv;
+    const F_FLOAT switch2 = F_F(12.0) * rsq * (_cutsq_global - rsq) *
+                            (rsq - _cut_innersq_global) * _denom_lj_inv;
+    philj = r6inv * (_lj3[ij_type] * r6inv - _lj4[ij_type]);
+    forcelj = forcelj * switch1 + philj * switch2;
+  }
+
+  if(eflag) {
+    ENERGY_FLOAT evdwl_tmp = factor_lj;
+
+    if(rsq > _cut_innersq_global) {
+      evdwl_tmp *= philj * switch1;
+    } else
+      evdwl_tmp *= r6inv * (_lj3[ij_type] * r6inv - _lj4[ij_type]);
+
+    evdwl += evdwl_tmp;
+  }
+
+  return factor_lj * forcelj * r2inv;
+}
+
+__device__ inline F_FLOAT CoulCharmmCuda_Eval(const F_FLOAT &rsq, F_FLOAT &factor_coul, int &eflag, ENERGY_FLOAT &ecoul, F_FLOAT qij)
+{
+  F_FLOAT forcecoul;
+  ENERGY_FLOAT ecoul_tmp = forcecoul = _qqrd2e * qij * _RSQRT_(rsq) * factor_coul;
+
+  if(rsq > _cut_coul_innersq_global) {
+    const F_FLOAT switch1 = (_cut_coulsq_global - rsq) * (_cut_coulsq_global - rsq) *
+                            (_cut_coulsq_global + F_F(2.0) * rsq - F_F(3.0) * _cut_coul_innersq_global) * _denom_coul_inv;
+    ecoul_tmp *= switch1;
+    const F_FLOAT switch2 = F_F(12.0) * rsq * (_cut_coulsq_global - rsq) *
+                            (rsq - _cut_coul_innersq_global) * _denom_coul_inv;
+    forcecoul *= switch1 + switch2;
+  }
+
+  if(eflag) {
+    ecoul += ecoul_tmp; // factor_coul is already folded into ecoul_tmp above
+  }
+
+  return forcecoul * (F_F(1.0) / rsq);
+}
+
diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda.cu b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..31df02b2efe3569cc7478f0f55051233371a4e1e
--- /dev/null
+++ b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda.cu
@@ -0,0 +1,86 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1_gm) +#define _lj2 MY_AP(coeff2_gm) +#define _lj3 MY_AP(coeff3_gm) +#define _lj4 MY_AP(coeff4_gm) +#define _cut_coul_innersq_global MY_AP(cut_coul_innersq_global) +#define _denom_lj_inv MY_AP(denom_lj_inv) +#define _denom_coul_inv MY_AP(denom_coul_inv) +__device__ __constant__ F_FLOAT _cut_coul_innersq_global; +__device__ __constant__ F_FLOAT _denom_lj_inv; +__device__ __constant__ F_FLOAT _denom_coul_inv; + + +#include "pair_lj_charmm_coul_charmm_implicit_cuda_cu.h" +#include "pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu" + +#include <time.h> + +void Cuda_PairLJCharmmCoulCharmmImplicitCuda_Init(cuda_shared_data* sdata, F_FLOAT cut_coul_innersq, F_FLOAT denom_lj_inv, F_FLOAT denom_coul_inv) +{ + Cuda_Pair_Init_AllStyles(sdata, 4, true, true, true); + cudaMemcpyToSymbol(MY_AP(cut_coul_innersq_global) , &cut_coul_innersq , sizeof(F_FLOAT)); + cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT)); + cudaMemcpyToSymbol(MY_AP(denom_coul_inv) , &denom_coul_inv , sizeof(F_FLOAT)); + + return; +} + + + +void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, + int eflag_atom, int vflag_atom, F_FLOAT denom_lj, F_FLOAT cut_coul_innersq, F_FLOAT denom_coul) +{ + + static short init = 0; + + if(! init) { + init = 1; + Cuda_PairLJCharmmCoulCharmmImplicitCuda_Init(sdata, cut_coul_innersq, 1.0 / denom_lj, 1.0 / denom_coul); + } + + dim3 grid, threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_CHARMM, COUL_CHARMM_IMPLICIT, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_CHARMM, COUL_CHARMM_IMPLICIT, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_cu.h b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..c410906957cbb8e51b9e5e17745e648c93165846 --- /dev/null +++ b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_FLOAT denom_lj, F_FLOAT cut_coul_innersq, F_FLOAT denom_coul);
diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6a20b8626accdbf4cf9698d65bf5a6d5a7d61987
--- /dev/null
+++ b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu
@@ -0,0 +1,44 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+__device__ inline F_FLOAT CoulCharmmImplicitCuda_Eval(const F_FLOAT &rsq, F_FLOAT &factor_coul, int &eflag, ENERGY_FLOAT &ecoul, F_FLOAT qij)
+{
+  F_FLOAT forcecoul;
+  ENERGY_FLOAT ecoul_tmp = forcecoul = _qqrd2e * qij * (F_F(1.0) / rsq) * factor_coul;
+
+  if(rsq > _cut_coul_innersq_global) {
+    const F_FLOAT switch1 = (_cut_coulsq_global - rsq) * (_cut_coulsq_global - rsq) *
+                            (_cut_coulsq_global + F_F(2.0) * rsq - F_F(3.0) * _cut_coul_innersq_global) * _denom_coul_inv;
+    ecoul_tmp *= switch1;
+    const F_FLOAT switch2 = F_F(12.0) * rsq * (_cut_coulsq_global - rsq) *
+                            (rsq - _cut_coul_innersq_global) * _denom_coul_inv;
+    forcecoul *= (switch1 + switch2);
+  }
+
+  if(eflag) {
+    // factor_coul is already folded into ecoul_tmp above; do not apply it twice.
+    ecoul += ecoul_tmp;
+  }
+
+  return F_F(2.0) * forcecoul * (F_F(1.0) / rsq);
+}
+
diff --git a/lib/cuda/pair_lj_charmm_coul_long_cuda.cu b/lib/cuda/pair_lj_charmm_coul_long_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0096f7757e5545fd808bd89c6cf21d4f87862c22
--- /dev/null
+++ b/lib/cuda/pair_lj_charmm_coul_long_cuda.cu
@@ -0,0 +1,76 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1_gm) +#define _lj2 MY_AP(coeff2_gm) +#define _lj3 MY_AP(coeff3_gm) +#define _lj4 MY_AP(coeff4_gm) + +#include "pair_lj_charmm_coul_long_cuda_cu.h" + +#include <time.h> + +void Cuda_PairLJCharmmCoulLongCuda_Init(cuda_shared_data* sdata, F_FLOAT denom_lj_inv) +{ + Cuda_Pair_Init_AllStyles(sdata, 4, true, true, true); + cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv , sizeof(F_FLOAT)); + + return; +} + + + +void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, + int eflag_atom, int vflag_atom, F_FLOAT denom_lj) +{ + + static short init = 0; + + if(! init) { + init = 1; + Cuda_PairLJCharmmCoulLongCuda_Init(sdata, 1.0 / denom_lj); + } + + dim3 grid, threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_CHARMM, COUL_LONG, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_CHARMM, COUL_LONG, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_charmm_coul_long_cuda_cu.h b/lib/cuda/pair_lj_charmm_coul_long_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..34b0b722ef5f173552cf2486926ffada9815f73c --- /dev/null +++ b/lib/cuda/pair_lj_charmm_coul_long_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
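+------------------------------------------------------------------------- */
+
+/* Note the reciprocal-precompute pattern used by this launcher: the host
+   passes denom_lj once, the launcher uploads 1.0/denom_lj into the
+   constant-memory symbol denom_lj_inv, and the device kernels multiply by
+   _denom_lj_inv rather than dividing inside the inner loop.  The
+   charmm/coul/charmm variants handle denom_coul the same way. */
+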
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_FLOAT denom_lj);
diff --git a/lib/cuda/pair_lj_class2_coul_cut_cuda.cu b/lib/cuda/pair_lj_class2_coul_cut_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f09a480534c9786f43e55dfd7ed99a1a219735f2
--- /dev/null
+++ b/lib/cuda/pair_lj_class2_coul_cut_cuda.cu
@@ -0,0 +1,73 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1)
+#define _lj2 MY_AP(coeff2)
+#define _lj3 MY_AP(coeff3)
+#define _lj4 MY_AP(coeff4)
+
+#include "pair_lj_class2_coul_cut_cuda_cu.h"
+
+#include <time.h>
+
+void Cuda_PairLJClass2CoulCutCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 4, true);
+}
+
+void Cuda_PairLJClass2CoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+  static short init = 0;
+
+  if(!
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJClass2CoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_lj_class2_coul_long_cuda.cu b/lib/cuda/pair_lj_class2_coul_long_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..f20c74c33a550e8c251eb0df56e0b66b682524ee --- /dev/null +++ b/lib/cuda/pair_lj_class2_coul_long_cuda.cu @@ -0,0 +1,73 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_class2_coul_long_cuda_cu.h" + +#include <time.h> + +void Cuda_PairLJClass2CoulLongCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4, true); +} + +void Cuda_PairLJClass2CoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + + static short init = 0; + + if(! 
init) { + init = 1; + Cuda_PairLJClass2CoulLongCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_CLASS2, COUL_LONG, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_CLASS2, COUL_LONG, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_class2_coul_long_cuda_cu.h b/lib/cuda/pair_lj_class2_coul_long_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..3c2fba253a16652448d561d1e4d611c20e7415f7 --- /dev/null +++ b/lib/cuda/pair_lj_class2_coul_long_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJClass2CoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_lj_class2_cuda.cu b/lib/cuda/pair_lj_class2_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..a72e31fd9c4ab6ab6b7f159b80bbd85737f79038 --- /dev/null +++ b/lib/cuda/pair_lj_class2_cuda.cu @@ -0,0 +1,75 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_class2_cuda_cu.h" +#include "pair_lj_class2_cuda_kernel_nc.cu" + +#include <time.h> + +void Cuda_PairLJClass2Cuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4); +} + +void Cuda_PairLJClass2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + + static short init = 0; + + if(! init) { + init = 1; + Cuda_PairLJClass2Cuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + //int maxthreads=192*sizeof(double)/sizeof(F_FLOAT); + //if(CUDA_ARCH==20) maxthreads*=2; + //cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt<PAIR_LJ_CUT,COUL_NONE,DATA_NONE>,cudaFuncCachePreferL1); + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192); + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_CLASS2, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_CLASS2, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_class2_cuda_cu.h b/lib/cuda/pair_lj_class2_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..e58da30450c2ca594005a4c4db4f58e13450bbef --- /dev/null +++ b/lib/cuda/pair_lj_class2_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJClass2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_lj_class2_cuda_kernel_nc.cu b/lib/cuda/pair_lj_class2_cuda_kernel_nc.cu new file mode 100644 index 0000000000000000000000000000000000000000..761e985ec8c017b105b4bb6cb31489088d6ecf91 --- /dev/null +++ b/lib/cuda/pair_lj_class2_cuda_kernel_nc.cu @@ -0,0 +1,35 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT PairLJClass2Cuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) +{ + const F_FLOAT r2inv = F_F(1.0) / rsq; + const F_FLOAT r6inv = r2inv * r2inv * r2inv; + const F_FLOAT r3inv = _SQRT_(r6inv); + + if(eflag) evdwl += factor_lj * (r6inv * (_lj3[ij_type] * r3inv - + _lj4[ij_type]) - _offset[ij_type]); + + return factor_lj * r6inv * (_lj1[ij_type] * r3inv - _lj2[ij_type]) * r2inv; +} + diff --git a/lib/cuda/pair_lj_cut_coul_cut_cuda.cu b/lib/cuda/pair_lj_cut_coul_cut_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..88ba0300cf44d3ad3ffef3713abdc1b50264f96d --- /dev/null +++ b/lib/cuda/pair_lj_cut_coul_cut_cuda.cu @@ -0,0 +1,73 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_cut_coul_cut_cuda_cu.h" + +#include <time.h> + +void Cuda_PairLJCutCoulCutCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4, true); +} + +void Cuda_PairLJCutCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + + static short init = 0; + + if(! 
init) { + init = 1; + Cuda_PairLJCutCoulCutCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_CUT, COUL_CUT, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_CUT, COUL_CUT, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_cut_coul_cut_cuda_cu.h b/lib/cuda/pair_lj_cut_coul_cut_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..5aa5ce22c8ad3f57797cd21a27c5be9c5ab7f7f6 --- /dev/null +++ b/lib/cuda/pair_lj_cut_coul_cut_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJCutCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_lj_cut_coul_debye_cuda.cu b/lib/cuda/pair_lj_cut_coul_debye_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..fdbe594768ca93d0c83744a8231a50f413fa2449 --- /dev/null +++ b/lib/cuda/pair_lj_cut_coul_debye_cuda.cu @@ -0,0 +1,72 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_cut_coul_debye_cuda_cu.h" + +#include <time.h> + +void Cuda_PairLJCutCoulDebyeCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4, true); +} + +void Cuda_PairLJCutCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + + static short init = 0; + + if(! init) { + init = 1; + Cuda_PairLJCutCoulDebyeCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_CUT, COUL_DEBYE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_CUT, COUL_DEBYE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_cut_coul_debye_cuda_cu.h b/lib/cuda/pair_lj_cut_coul_debye_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..59e4cdbc159008343c5d615542c876fc8b58e5a1 --- /dev/null +++ b/lib/cuda/pair_lj_cut_coul_debye_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJCutCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_lj_cut_coul_long_cuda.cu b/lib/cuda/pair_lj_cut_coul_long_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..316bb68351c50cb53fb26cc6cfb4986984283fae --- /dev/null +++ b/lib/cuda/pair_lj_cut_coul_long_cuda.cu @@ -0,0 +1,73 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_cut_coul_long_cuda_cu.h" + +#include <time.h> + +void Cuda_PairLJCutCoulLongCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4, true); +} + +void Cuda_PairLJCutCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + + static short init = 0; + + if(! init) { + init = 1; + Cuda_PairLJCutCoulLongCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_CUT, COUL_LONG, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_CUT, COUL_LONG, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_cut_coul_long_cuda_cu.h b/lib/cuda/pair_lj_cut_coul_long_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..a5ed5be999506308613f5ed83e56b4c7c6487ad8 --- /dev/null +++ b/lib/cuda/pair_lj_cut_coul_long_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairLJCutCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairLJCutCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); +#endif diff --git a/lib/cuda/pair_lj_cut_cuda.cu b/lib/cuda/pair_lj_cut_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..4f2796e95883495c569955e9e2cdfaba48a26308 --- /dev/null +++ b/lib/cuda/pair_lj_cut_cuda.cu @@ -0,0 +1,75 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) + +#include "pair_lj_cut_cuda_cu.h" +#include "pair_lj_cut_cuda_kernel_nc.cu" + +#include <time.h> + +void Cuda_PairLJCutCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4); +} + +void Cuda_PairLJCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + + static short init = 0; + + if(! 
init) { + init = 1; + Cuda_PairLJCutCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + //int maxthreads=192*sizeof(double)/sizeof(F_FLOAT); + //if(CUDA_ARCH==20) maxthreads*=2; + //cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt<PAIR_LJ_CUT,COUL_NONE,DATA_NONE>,cudaFuncCachePreferL1); + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192); + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_CUT, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_CUT, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_cut_cuda_cu.h b/lib/cuda/pair_lj_cut_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..b8a6365c8cba1be1f117755ede9bd57555b56a6c --- /dev/null +++ b/lib/cuda/pair_lj_cut_cuda_cu.h @@ -0,0 +1,30 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairLJCutCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairLJCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); +#endif diff --git a/lib/cuda/pair_lj_cut_cuda_kernel_nc.cu b/lib/cuda/pair_lj_cut_cuda_kernel_nc.cu new file mode 100644 index 0000000000000000000000000000000000000000..2517a006e9bdc4e925bdd50bf40099f3f8f18d60 --- /dev/null +++ b/lib/cuda/pair_lj_cut_cuda_kernel_nc.cu @@ -0,0 +1,34 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
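+------------------------------------------------------------------------- */
+
+/* A short reference for the coefficient convention assumed by
+   PairLJCutCuda_Eval below; the host-side lj/cut pair style precomputes,
+   per type pair,
+
+     lj1 = 48*eps*sigma^12    lj2 = 24*eps*sigma^6    (force)
+     lj3 =  4*eps*sigma^12    lj4 =  4*eps*sigma^6    (energy)
+
+   so E(r) = r6inv*(lj3*r6inv - lj4) - offset, and the returned value
+   r6inv*(lj1*r6inv - lj2)*r2inv equals -dE/dr divided by r, i.e. the
+   scalar the caller multiplies into the distance vector. */
+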
+
+__device__ inline F_FLOAT PairLJCutCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
+{
+  const F_FLOAT r2inv = F_F(1.0) / rsq;
+  const F_FLOAT r6inv = r2inv * r2inv * r2inv;
+
+  if(eflag) evdwl += factor_lj * (r6inv * (_lj3[ij_type] * r6inv -
+                                  _lj4[ij_type]) - _offset[ij_type]);
+
+  return factor_lj * r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]) * r2inv;
+}
+
diff --git a/lib/cuda/pair_lj_cut_experimental_cuda.cu b/lib/cuda/pair_lj_cut_experimental_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4df5755326d49c31b3cf1b6e219ddfe218e9ea85
--- /dev/null
+++ b/lib/cuda/pair_lj_cut_experimental_cuda.cu
@@ -0,0 +1,78 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1)
+#define _lj2 MY_AP(coeff2)
+#define _lj3 MY_AP(coeff3)
+#define _lj4 MY_AP(coeff4)
+
+#include "pair_lj_cut_experimental_cuda_cu.h"
+
+#include <time.h>
+
+void Cuda_PairLJCutExperimentalCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 4);
+}
+
+void Cuda_PairLJCutExperimentalCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+  static short init = 0;
+
+  if(!
init) { + init = 1; + Cuda_PairLJCutExperimentalCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + //int maxthreads=192*sizeof(double)/sizeof(F_FLOAT); + //if(CUDA_ARCH==20) maxthreads*=2; + //cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt<PAIR_LJ_CUT,COUL_NONE,DATA_NONE>,cudaFuncCachePreferL1); + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192); + + if(sharedperproc == 0) sharedperproc++; + + //printf("comm_phase: %i\n",sdata->comm.comm_phase); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_CUT, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA_opt<PAIR_LJ_CUT, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom, sdata->comm.comm_phase); + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 diff --git a/lib/cuda/pair_lj_cut_experimental_cuda_cu.h b/lib/cuda/pair_lj_cut_experimental_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..34e9f5417875096cd8cb7b76da1a80bbda0787a6 --- /dev/null +++ b/lib/cuda/pair_lj_cut_experimental_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJCutExperimentalCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_lj_expand_cuda.cu b/lib/cuda/pair_lj_expand_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..290c9a7a97c1d9fbad9d27f6d7dc5ac27d3bd032 --- /dev/null +++ b/lib/cuda/pair_lj_expand_cuda.cu @@ -0,0 +1,78 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) +#define _shift MY_AP(coeff5) + +#include "pair_lj_expand_cuda_cu.h" +#include "pair_lj_expand_cuda_kernel_nc.cu" +#include <time.h> + + +void Cuda_PairLJExpandCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5); +} + + + + +void Cuda_PairLJExpandCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + // initialize only on first call + static short init = 0; + + if(! init) { + init = 1; + Cuda_PairLJExpandCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 256); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_EXPAND, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_EXPAND, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 + + diff --git a/lib/cuda/pair_lj_expand_cuda_cu.h b/lib/cuda/pair_lj_expand_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..fff9a09fbfe2506ebfaa159e835b0b0be47195d5 --- /dev/null +++ b/lib/cuda/pair_lj_expand_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJExpandCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_lj_expand_cuda_kernel_nc.cu b/lib/cuda/pair_lj_expand_cuda_kernel_nc.cu new file mode 100644 index 0000000000000000000000000000000000000000..fc03d6fbf48e18544a1267b6ae890ae263834f84 --- /dev/null +++ b/lib/cuda/pair_lj_expand_cuda_kernel_nc.cu @@ -0,0 +1,35 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ +__device__ inline F_FLOAT PairLJExpandCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) +{ + const F_FLOAT r = _SQRT_(rsq); + const F_FLOAT rshift = r - _shift[ij_type]; + const F_FLOAT rshiftsq = rshift * rshift; + const F_FLOAT r2inv = F_F(1.0) / rshiftsq; + const F_FLOAT r6inv = r2inv * r2inv * r2inv; + const F_FLOAT forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]); + + if(eflag) evdwl += factor_lj * (r6inv * (_lj3[ij_type] * r6inv - _lj4[ij_type]) - _offset[ij_type]); + + return factor_lj * forcelj * (F_F(1.0) / rshift) * (F_F(1.0) / r); +} diff --git a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda.cu b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..354f06b54b714fea10735c367c8fd6ca3f27caf2 --- /dev/null +++ b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda.cu @@ -0,0 +1,103 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
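+------------------------------------------------------------------------- */
+
+/* The four scalars uploaded to constant memory below parametrize the
+   GROMACS-style coulomb switch.  Reading CoulGromacsCuda_Eval, with
+   t = r - cut_coul_inner and qprod = qqrd2e*qij*factor_coul:
+
+     E(r) = qprod * (1/r - t^3*(coulsw1/3 + coulsw2*t/4) - coulsw5)
+     F(r) = qprod * (1/r^2 + t^2*(coulsw1 + coulsw2*t))
+
+   so F = -dE/dr holds exactly, and coulsw5 is the constant shift that
+   brings the energy to zero at the outer cutoff. */
+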
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1_gm)
+#define _lj2 MY_AP(coeff2_gm)
+#define _lj3 MY_AP(coeff3_gm)
+#define _lj4 MY_AP(coeff4_gm)
+#define _ljsw1 MY_AP(coeff5_gm)
+#define _ljsw2 MY_AP(coeff6_gm)
+#define _ljsw3 MY_AP(coeff7_gm)
+#define _ljsw4 MY_AP(coeff8_gm)
+#define _ljsw5 MY_AP(coeff9_gm)
+
+#define _cut_coul_inner_global MY_AP(cut_coul_inner_global)
+#define _coulsw1 MY_AP(coulsw1)
+#define _coulsw2 MY_AP(coulsw2)
+#define _coulsw5 MY_AP(coulsw5)
+__device__ __constant__ F_FLOAT _cut_coul_inner_global;
+__device__ __constant__ F_FLOAT _coulsw1;
+__device__ __constant__ F_FLOAT _coulsw2;
+__device__ __constant__ F_FLOAT _coulsw5;
+
+
+#include "pair_lj_gromacs_coul_gromacs_cuda_cu.h"
+#include "pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu"
+
+#include <time.h>
+
+void Cuda_PairLJGromacsCoulGromacsCuda_Init(cuda_shared_data* sdata, F_FLOAT cut_coul_inner, F_FLOAT coulsw1, F_FLOAT coulsw2, F_FLOAT coulsw5)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 9, true, true, true);
+  cudaMemcpyToSymbol(MY_AP(cut_coul_inner_global) , &cut_coul_inner , sizeof(F_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(coulsw1) , &coulsw1 , sizeof(F_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(coulsw2) , &coulsw2 , sizeof(F_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(coulsw5) , &coulsw5 , sizeof(F_FLOAT));
+
+  return;
+}
+
+
+
+void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,
+    int eflag_atom, int vflag_atom, F_FLOAT cut_coul_inner, F_FLOAT coulsw1, F_FLOAT coulsw2, F_FLOAT coulsw5)
+{
+  static short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJGromacsCoulGromacsCuda_Init(sdata, cut_coul_inner, coulsw1, coulsw2, coulsw5);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_LJ_GROMACS, COUL_GROMACS, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_LJ_GROMACS, COUL_GROMACS, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
+#undef _ljsw1
+#undef _ljsw2
+#undef _ljsw3
+#undef _ljsw4
+#undef _ljsw5
+#undef _cut_coul_inner_global
+#undef _coulsw1
+#undef _coulsw2
+#undef _coulsw5
diff --git a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_cu.h b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e3b078166ea8a6839d2a9105e612b6fb4101c56
--- /dev/null
+++ b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_FLOAT cut_coul_inner, F_FLOAT coulsw1, F_FLOAT coulsw2, F_FLOAT coulsw5); diff --git a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu new file mode 100644 index 0000000000000000000000000000000000000000..ee6dda06f019ef1d8852331372245c440c72dfbc --- /dev/null +++ b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu @@ -0,0 +1,51 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT CoulGromacsCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_coul, int &eflag, ENERGY_FLOAT &ecoul, F_FLOAT qij) +{ + if(qij != F_F(0.0)) { + F_FLOAT ecoul_tmp; + F_FLOAT forcecoul = _RSQRT_(rsq); + + if(eflag) ecoul_tmp = forcecoul - _coulsw5; + + if(rsq > _cut_coul_inner_global * _cut_coul_inner_global) { + const F_FLOAT r = F_F(1.0) / forcecoul; + const F_FLOAT tc = r - _cut_coul_inner_global; + forcecoul += r * tc * tc * (_coulsw1 + _coulsw2 * tc); + + if(eflag) ecoul_tmp -= tc * tc * tc * (_coulsw1 * (F_F(1.0) / F_F(3.0)) + _coulsw2 * tc * (F_F(1.0) / F_F(4.0))); + } + + F_FLOAT qprod = _qqrd2e * qij * factor_coul; + forcecoul *= qprod; + + if(eflag) { + ecoul += ecoul_tmp * qprod; + } + + return forcecoul * (F_F(1.0) / rsq); + } + + return F_F(0.0); +} diff --git a/lib/cuda/pair_lj_gromacs_cuda.cu b/lib/cuda/pair_lj_gromacs_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..35cc94a3cf6bc2559ddcc8f5c1d7ab91f664b534 --- /dev/null +++ b/lib/cuda/pair_lj_gromacs_cuda.cu @@ -0,0 +1,84 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1_gm) +#define _lj2 MY_AP(coeff2_gm) +#define _lj3 MY_AP(coeff3_gm) +#define _lj4 MY_AP(coeff4_gm) +#define _ljsw1 MY_AP(coeff5_gm) +#define _ljsw2 MY_AP(coeff6_gm) +#define _ljsw3 MY_AP(coeff7_gm) +#define _ljsw4 MY_AP(coeff8_gm) +#define _ljsw5 MY_AP(coeff9_gm) + +#include "pair_lj_gromacs_cuda_cu.h" +#include "pair_lj_gromacs_cuda_kernel_nc.cu" + +#include <time.h> + +void Cuda_PairLJGromacsCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 9, false, true, true); +} + + + +void Cuda_PairLJGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, + int eflag_atom, int vflag_atom) +{ + static short init = 0; + + if(! init) { + init = 1; + Cuda_PairLJGromacsCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_GROMACS, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_GROMACS, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); + +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _ljsw1 +#undef _ljsw2 +#undef _ljsw3 +#undef _ljsw4 +#undef _ljsw5 diff --git a/lib/cuda/pair_lj_gromacs_cuda_cu.h b/lib/cuda/pair_lj_gromacs_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..273d190e0afe02d0cb720ede95b41271226f279b --- /dev/null +++ b/lib/cuda/pair_lj_gromacs_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
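+------------------------------------------------------------------------- */
+
+/* Roles of the nine coefficient slots consumed by PairLJGromacsCuda_Eval
+   in the kernel file that follows: lj1..lj4 are the usual 12-6
+   force/energy prefactors, ljsw1/ljsw2 the force-switch polynomial
+   coefficients, ljsw3/ljsw4 the matching energy-switch coefficients, and
+   ljsw5 the constant energy shift beyond the inner cutoff.  With
+   tlj = r - r_inner the kernel adds r*tlj^2*(ljsw1 + ljsw2*tlj) to the
+   force term and tlj^3*(ljsw3 + ljsw4*tlj) + ljsw5 to the energy; the
+   inner cutoff falls back to the per-type value when no global one is
+   set. */
+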
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_lj_gromacs_cuda_kernel_nc.cu b/lib/cuda/pair_lj_gromacs_cuda_kernel_nc.cu new file mode 100644 index 0000000000000000000000000000000000000000..bf9d042e65bb9409871a7dcf0c2a50ff1c9d27ab --- /dev/null +++ b/lib/cuda/pair_lj_gromacs_cuda_kernel_nc.cu @@ -0,0 +1,50 @@ +/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+__device__ inline F_FLOAT PairLJGromacsCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
+{
+  F_FLOAT tlj;
+  const F_FLOAT r2inv = F_F(1.0) / rsq;
+  const F_FLOAT r = _RSQRT_(r2inv);
+  const F_FLOAT r6inv = r2inv * r2inv * r2inv;
+  F_FLOAT forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]);
+  const X_FLOAT cut_lj_innersq = (_cut_innersq_global > X_F(0.0) ? _cut_innersq_global : _cut_innersq[ij_type]);
+
+  if(rsq > cut_lj_innersq) {
+    tlj = r - _SQRT_(cut_lj_innersq);
+    forcelj += r * tlj * tlj * (_ljsw1[ij_type] + _ljsw2[ij_type] * tlj);
+  }
+
+  if(eflag) {
+    ENERGY_FLOAT evdwl_tmp = r6inv * (_lj3[ij_type] * r6inv - _lj4[ij_type]);
+
+    if(rsq > cut_lj_innersq) {
+      evdwl_tmp += tlj * tlj * tlj *
+                   (_ljsw3[ij_type] + _ljsw4[ij_type] * tlj) + _ljsw5[ij_type];
+    }
+
+    evdwl += evdwl_tmp * factor_lj;
+  }
+
+  return factor_lj * forcelj * r2inv;
+} diff --git a/lib/cuda/pair_lj_sdk_coul_cut_cuda.cu b/lib/cuda/pair_lj_sdk_coul_cut_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..8647b1a62e7d2141dadefcba3989744c18fc1854 --- /dev/null +++ b/lib/cuda/pair_lj_sdk_coul_cut_cuda.cu @@ -0,0 +1,81 @@ +/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
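+
+   This driver and the coul/debye and coul/long siblings that follow are
+   identical apart from the Coulomb template argument: the LJ part is
+   always PAIR_CG_CMM, while COUL_CUT, COUL_DEBYE or COUL_LONG selects the
+   electrostatics variant compiled into the generic Pair_Kernel_TpA /
+   Pair_Kernel_BpA kernels.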
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) +#define _lj_type MY_AP(coeff5) + + +#include "pair_lj_sdk_coul_cut_cuda_cu.h" +#include <time.h> + + + + +void Cuda_PairLJSDKCoulCutCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5, true, false); + +} + + + + +void Cuda_PairLJSDKCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + // initialize only on first call + static short init = 0; + + if(! init) { + init = 1; + Cuda_PairLJSDKCoulCutCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 128); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_CG_CMM, COUL_CUT, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_CG_CMM, COUL_CUT, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _lj_type + diff --git a/lib/cuda/pair_lj_sdk_coul_cut_cuda_cu.h b/lib/cuda/pair_lj_sdk_coul_cut_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..2a7cea4086d49ca1b005774a170bbb6251d3292f --- /dev/null +++ b/lib/cuda/pair_lj_sdk_coul_cut_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJSDKCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_lj_sdk_coul_debye_cuda.cu b/lib/cuda/pair_lj_sdk_coul_debye_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..48dddcae6a9b547234f3d8fd1b3c349736e010e9 --- /dev/null +++ b/lib/cuda/pair_lj_sdk_coul_debye_cuda.cu @@ -0,0 +1,81 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) +#define _lj_type MY_AP(coeff5) + + +#include "pair_lj_sdk_coul_debye_cuda_cu.h" +#include <time.h> + + + + +void Cuda_PairLJSDKCoulDebyeCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5, true, false); + +} + + + + +void Cuda_PairLJSDKCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + // initialize only on first call + static short init = 0; + + if(! init) { + init = 1; + Cuda_PairLJSDKCoulDebyeCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 128); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_CG_CMM, COUL_DEBYE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_CG_CMM, COUL_DEBYE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _lj_type + diff --git a/lib/cuda/pair_lj_sdk_coul_debye_cuda_cu.h b/lib/cuda/pair_lj_sdk_coul_debye_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..7cee6319d60c16025be80f07948bf9fe6a0cb5fb --- /dev/null +++ b/lib/cuda/pair_lj_sdk_coul_debye_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJSDKCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_lj_sdk_coul_long_cuda.cu b/lib/cuda/pair_lj_sdk_coul_long_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..6cbe15c7ab48df4b55e1bf8a76e6819446793c00 --- /dev/null +++ b/lib/cuda/pair_lj_sdk_coul_long_cuda.cu @@ -0,0 +1,81 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) +#define _lj_type MY_AP(coeff5) + + +#include "pair_lj_sdk_coul_long_cuda_cu.h" +#include <time.h> + + + + +void Cuda_PairLJSDKCoulLongCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5, true, false); + +} + + + + +void Cuda_PairLJSDKCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + // initialize only on first call + static short init = 0; + + if(! init) { + init = 1; + Cuda_PairLJSDKCoulLongCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 128); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_CG_CMM, COUL_LONG, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_CG_CMM, COUL_LONG, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _lj_type + diff --git a/lib/cuda/pair_lj_sdk_coul_long_cuda_cu.h b/lib/cuda/pair_lj_sdk_coul_long_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..ae407b4841973c77f6e79541eef2d8af2ae83187 --- /dev/null +++ b/lib/cuda/pair_lj_sdk_coul_long_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJSDKCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_lj_sdk_cuda.cu b/lib/cuda/pair_lj_sdk_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..a6fcf7f7a0f0da81d0b826db0a36bd22e0c9a39d --- /dev/null +++ b/lib/cuda/pair_lj_sdk_cuda.cu @@ -0,0 +1,87 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1) +#define _lj2 MY_AP(coeff2) +#define _lj3 MY_AP(coeff3) +#define _lj4 MY_AP(coeff4) +#define _lj_type MY_AP(coeff5) + +enum {CG_NOT_SET = 0, CG_LJ9_6, CG_LJ12_4, CG_LJ12_6, NUM_CG_TYPES, + CG_COUL_NONE, CG_COUL_CUT, CG_COUL_DEBYE, CG_COUL_LONG + }; + +#include "pair_lj_sdk_cuda_cu.h" +#include "pair_lj_sdk_cuda_kernel_nc.cu" +#include <time.h> + + + + +void Cuda_PairLJSDKCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 5, false, false); + +} + + + + +void Cuda_PairLJSDKCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + // initialize only on first call + static short init = 0; + + if(! 
init) { + init = 1; + Cuda_PairLJSDKCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + int maxthreads = 128; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, maxthreads); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_CG_CMM, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_CG_CMM, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _lj_type + diff --git a/lib/cuda/pair_lj_sdk_cuda_cu.h b/lib/cuda/pair_lj_sdk_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..6b858cf7075c95d193810438f0debfb1f107784c --- /dev/null +++ b/lib/cuda/pair_lj_sdk_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJSDKCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_lj_sdk_cuda_kernel_nc.cu b/lib/cuda/pair_lj_sdk_cuda_kernel_nc.cu new file mode 100644 index 0000000000000000000000000000000000000000..f8f2474551840bb13113345c21b2d226b333df92 --- /dev/null +++ b/lib/cuda/pair_lj_sdk_cuda_kernel_nc.cu @@ -0,0 +1,49 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
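+
+   PairLJSDKCuda_Eval below evaluates the three SDK/CG-CMM forms (9-6,
+   12-4, 12-6) without branching on the exponents: rNinv_first is r^-4
+   except for CG_LJ9_6, where it is r^-1, so r^-4 * r^-4 * rNinv_first
+   yields r^-12 or r^-9; rNinv_second is -r^-2 except for CG_LJ12_4, where
+   it is -1, so r^-4 * rNinv_second yields -r^-6 or -r^-4. The lj1-lj4
+   coefficients are assumed to carry the per-form prefactors, as set up by
+   the CPU pair_style lj/sdk.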
+------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT PairLJSDKCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) //0.11 of 0.4 +{ + const F_FLOAT r2inv = F_F(1.0) / rsq; + const int lj_type = _lj_type[ij_type]; + const F_FLOAT r4inv = r2inv * r2inv; + const F_FLOAT rNinv_first = lj_type != CG_LJ9_6 ? r4inv : _RSQRT_(rsq); + const F_FLOAT rNinv_second = lj_type != CG_LJ12_4 ? -r2inv : -F_F(1.0); + const F_FLOAT forcelj = r4inv * (_lj1[ij_type] * r4inv * rNinv_first + _lj2[ij_type] * rNinv_second); + + if(eflag) evdwl += factor_lj * (r4inv * (_lj3[ij_type] * r4inv * rNinv_first + _lj4[ij_type] * rNinv_second) - _offset[ij_type]); + + return factor_lj * forcelj * r2inv; +} + +/*__device__ inline F_FLOAT PairLJSDKCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl) +{ + const int lj_type = tex1Dfetch(_coeff5_gm_tex,ij_type); + const F_FLOAT r2inv = F_F(1.0)/rsq; + const F_FLOAT r4inv = r2inv*r2inv; + const F_FLOAT rNinv_first = lj_type!=CG_LJ9_6?r4inv:_RSQRT_(rsq); + const F_FLOAT rNinv_second = lj_type!=CG_LJ12_4?r2inv:F_F(1.0); + const F_FLOAT forcelj = r4inv * (tex1Dfetch(_coeff1_gm_tex,ij_type)*r4inv*rNinv_first - tex1Dfetch(_coeff2_gm_tex,ij_type)*rNinv_second); + + if(eflag) evdwl += factor_lj*(r4inv*(tex1Dfetch(_coeff3_gm_tex,ij_type)*r4inv*rNinv_first-tex1Dfetch(_coeff4_gm_tex,ij_type)*rNinv_second)); + return factor_lj*forcelj*r2inv; +}*/ diff --git a/lib/cuda/pair_lj_smooth_cuda.cu b/lib/cuda/pair_lj_smooth_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..aa1df9e6bec193c2ce4f06fbb88fd7786d93ad04 --- /dev/null +++ b/lib/cuda/pair_lj_smooth_cuda.cu @@ -0,0 +1,84 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _lj1 MY_AP(coeff1_gm) +#define _lj2 MY_AP(coeff2_gm) +#define _lj3 MY_AP(coeff3_gm) +#define _lj4 MY_AP(coeff4_gm) +#define _ljsw1 MY_AP(coeff5_gm) +#define _ljsw2 MY_AP(coeff6_gm) +#define _ljsw3 MY_AP(coeff7_gm) +#define _ljsw4 MY_AP(coeff8_gm) +#define _ljsw0 MY_AP(coeff9_gm) + +#include "pair_lj_smooth_cuda_cu.h" +#include "pair_lj_smooth_cuda_kernel_nc.cu" + +#include <time.h> + +void Cuda_PairLJSmoothCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 9, false, true, true); +} + + + +void Cuda_PairLJSmoothCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, + int eflag_atom, int vflag_atom) +{ + // initialize only on first call + static short init = 0; + + if(! 
init) { + init = 1; + Cuda_PairLJSmoothCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_LJ_SMOOTH, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_LJ_SMOOTH, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _lj1 +#undef _lj2 +#undef _lj3 +#undef _lj4 +#undef _ljsw1 +#undef _ljsw2 +#undef _ljsw3 +#undef _ljsw4 +#undef _ljsw0 diff --git a/lib/cuda/pair_lj_smooth_cuda_cu.h b/lib/cuda/pair_lj_smooth_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..78a04db227252f229c99b7bcec9316d343c74439 --- /dev/null +++ b/lib/cuda/pair_lj_smooth_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairLJSmoothCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_lj_smooth_cuda_kernel_nc.cu b/lib/cuda/pair_lj_smooth_cuda_kernel_nc.cu new file mode 100644 index 0000000000000000000000000000000000000000..c1bb3b07854493195b6ee0caa2db089f6a2822ca --- /dev/null +++ b/lib/cuda/pair_lj_smooth_cuda_kernel_nc.cu @@ -0,0 +1,61 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
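+
+   PairLJSmoothCuda_Eval below mirrors pair_style lj/smooth: inside the
+   inner cutoff the standard 12-6 force applies; between the inner and
+   outer cutoffs the force magnitude follows the cubic
+     fskin(t) = ljsw1 + ljsw2*t + ljsw3*t^2 + ljsw4*t^3,  t = r - r_inner,
+   while the energy is the constant ljsw0 minus the running integral of
+   fskin, which keeps energy and force continuous at r_inner.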
+------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT PairLJSmoothCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) +{ + F_FLOAT fskin, t, tsq, forcelj; + const F_FLOAT r2inv = F_F(1.0) / rsq; + const F_FLOAT r = _RSQRT_(r2inv); + const F_FLOAT r6inv = r2inv * r2inv * r2inv; + + + X_FLOAT cut_lj_innersq = (_cut_innersq_global > X_F(0.0) ? _cut_innersq_global : _cut_innersq[ij_type]); + + if(rsq < cut_lj_innersq) { + forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]); + } else { + t = r - _SQRT_(cut_lj_innersq); + tsq = t * t; + fskin = _ljsw1[ij_type] + _ljsw2[ij_type] * t + + _ljsw3[ij_type] * tsq + _ljsw4[ij_type] * tsq * t; + forcelj = fskin * r; + + } + + if(eflag) { + ENERGY_FLOAT evdwl_tmp; + + if(rsq < cut_lj_innersq) { + evdwl_tmp = r6inv * (_lj3[ij_type] * r6inv - _lj4[ij_type]) - + _offset[ij_type]; + } else { + evdwl_tmp = _ljsw0[ij_type] - _ljsw1[ij_type] * t - + _ljsw2[ij_type] * tsq * F_F(0.5) - _ljsw3[ij_type] * tsq * t * (F_F(1.0) / F_F(3.0)) - + _ljsw4[ij_type] * tsq * tsq * (F_F(1.0) / F_F(4.0)) - _offset[ij_type]; + } + + evdwl += evdwl_tmp * factor_lj; + } + + return factor_lj * forcelj * r2inv; +} diff --git a/lib/cuda/pair_manybody_const.h b/lib/cuda/pair_manybody_const.h new file mode 100644 index 0000000000000000000000000000000000000000..94d644a9a1346b931d671db67d8915149833a4a5 --- /dev/null +++ b/lib/cuda/pair_manybody_const.h @@ -0,0 +1,16 @@ +/* + * pair_manybody_const.h + * + * Created on: Oct 11, 2011 + * Author: chmu-tph + */ + +#define MANYBODY_NPAIR 3 + +__device__ __constant__ int elem2param[(MANYBODY_NPAIR + 1) * (MANYBODY_NPAIR + 1) * (MANYBODY_NPAIR + 1)]; +__device__ __constant__ int nelements; +__device__ __constant__ int map[MANYBODY_NPAIR + 2]; +__device__ __constant__ int* _glob_numneigh_red; //number of neighbors within force cutoff (as opposed to neighbor cutoff) +__device__ __constant__ int* _glob_neighbors_red; //indices of neighbors within force cutoff +__device__ __constant__ int* _glob_neightype_red; //type of neighbors within force cutoff + diff --git a/lib/cuda/pair_morse_coul_long_cuda.cu b/lib/cuda/pair_morse_coul_long_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..7512eb05679a3e75f1cfac2a437682af0b5e50fb --- /dev/null +++ b/lib/cuda/pair_morse_coul_long_cuda.cu @@ -0,0 +1,79 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
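+
+   This driver combines the PAIR_MORSE_R6 evaluator (a Morse well plus an
+   extra c0 power-law term, see the kernel file below) with COUL_LONG
+   real-space electrostatics; the five coefficient slots map to r0, alpha,
+   morse1, d0 and c0.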
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _r0 MY_AP(coeff1)
+#define _alpha MY_AP(coeff2)
+#define _morse1 MY_AP(coeff3)
+#define _d0 MY_AP(coeff4)
+#define _c0 MY_AP(coeff5)
+
+#include "pair_morse_coul_long_cuda_cu.h"
+#include "pair_morse_coul_long_cuda_kernel_nc.cu"
+
+#include <time.h>
+
+void Cuda_PairMorseCoulLongCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 5, true);
+}
+
+void Cuda_PairMorseCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+  static short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairMorseCoulLongCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_MORSE_R6, COUL_LONG, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_MORSE_R6, COUL_LONG, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+#undef _r0
+#undef _alpha
+#undef _morse1
+#undef _d0
+#undef _c0
+ diff --git a/lib/cuda/pair_morse_coul_long_cuda_cu.h b/lib/cuda/pair_morse_coul_long_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..c6d031a1ce89539854a70bedb68ee8ed74693ab0 --- /dev/null +++ b/lib/cuda/pair_morse_coul_long_cuda_cu.h @@ -0,0 +1,30 @@ +/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +#ifdef CUDA_USE_BINNING +extern "C" void Cuda_PairMorseCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag); +#else +extern "C" void Cuda_PairMorseCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); +#endif diff --git a/lib/cuda/pair_morse_coul_long_cuda_kernel_nc.cu b/lib/cuda/pair_morse_coul_long_cuda_kernel_nc.cu new file mode 100644 index 0000000000000000000000000000000000000000..f6b436f0d681e0ee9d9c7c24dd0ec63e7b365a48 --- /dev/null +++ b/lib/cuda/pair_morse_coul_long_cuda_kernel_nc.cu @@ -0,0 +1,35 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ +__device__ inline F_FLOAT PairMorseR6Cuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) +{ + const F_FLOAT r2inv = F_F(1.0) / rsq; + const F_FLOAT r = _SQRT_(rsq); + const F_FLOAT r4inv = r2inv * r2inv; + const F_FLOAT dr = r - _r0[ij_type]; + const F_FLOAT dexp = _EXP_(-_alpha[ij_type] * dr); + + if(eflag) evdwl += factor_lj * (_d0[ij_type] * (dexp * dexp - F_F(2.0) * dexp) + _c0[ij_type] * r4inv * r4inv * r4inv + - _offset[ij_type]); + + return factor_lj * (_morse1[ij_type] * (dexp * dexp - dexp) * (F_F(1.0) / r) - F_F(12.0) * _c0[ij_type] * r4inv * r4inv * r4inv * r2inv); +} diff --git a/lib/cuda/pair_morse_cuda.cu b/lib/cuda/pair_morse_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..b2a651b916347a401d7a19ebec051a0f8d414abc --- /dev/null +++ b/lib/cuda/pair_morse_cuda.cu @@ -0,0 +1,78 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
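+
+   Both Morse evaluators in this package use the standard form
+     E(r) = d0 * (exp(-2*alpha*(r - r0)) - 2*exp(-alpha*(r - r0))),
+   computed via dexp = exp(-alpha*(r - r0)); the pair force then reduces
+   to morse1 * (dexp*dexp - dexp) / r, with morse1 presumably precomputed
+   on the host as 2*alpha*d0, as in the CPU pair_style morse.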
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#define _r0 MY_AP(coeff1) +#define _alpha MY_AP(coeff2) +#define _morse1 MY_AP(coeff3) +#define _d0 MY_AP(coeff4) + +#include "pair_morse_cuda_cu.h" +#include "pair_morse_cuda_kernel_nc.cu" +#include <time.h> + + + +void Cuda_PairMorseCuda_Init(cuda_shared_data* sdata) +{ + Cuda_Pair_Init_AllStyles(sdata, 4); +} + + + + +void Cuda_PairMorseCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + + // initialize only on first call + static short init = 0; + + if(! init) { + init = 1; + Cuda_PairMorseCuda_Init(sdata); + } + + dim3 grid, threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 256); + + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + if(sdata->pair.use_block_per_atom) + Pair_Kernel_BpA<PAIR_MORSE, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + else + Pair_Kernel_TpA<PAIR_MORSE, COUL_NONE, DATA_NONE> + <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom); + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + +#undef _r0 +#undef _alpha +#undef _morse1 +#undef _d0 + + diff --git a/lib/cuda/pair_morse_cuda_cu.h b/lib/cuda/pair_morse_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..a9f923d20b1b88381e84d9c720f42432eec81432 --- /dev/null +++ b/lib/cuda/pair_morse_cuda_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairMorseCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_morse_cuda_kernel_nc.cu b/lib/cuda/pair_morse_cuda_kernel_nc.cu new file mode 100644 index 0000000000000000000000000000000000000000..0b3baac412ac0d6284a190ee94ac030c0e8b59fd --- /dev/null +++ b/lib/cuda/pair_morse_cuda_kernel_nc.cu @@ -0,0 +1,34 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ +__device__ inline F_FLOAT PairMorseCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) +{ + const F_FLOAT r = _SQRT_(rsq); + const F_FLOAT dr = r - _r0[ij_type]; + const F_FLOAT dexp = _EXP_(-_alpha[ij_type] * dr); + + if(eflag) evdwl += factor_lj * (_d0[ij_type] * (dexp * dexp - F_F(2.0) * dexp) + - _offset[ij_type]); + + return factor_lj * _morse1[ij_type] * (dexp * dexp - dexp) * (F_F(1.0) / r); +} + diff --git a/lib/cuda/pair_sw_cuda.cu b/lib/cuda/pair_sw_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..491d4d666fc4289084e216c00d78c1bfd1a4bf12 --- /dev/null +++ b/lib/cuda/pair_sw_cuda.cu @@ -0,0 +1,139 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
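+
+   Stillinger-Weber decomposes the energy into two-body terms phi2(r_ij)
+   and three-body terms phi3(r_ij, r_ik, theta_jik). This driver first
+   launches a pre-pass kernel that builds a reduced neighbor list (only
+   neighbors inside the force cutoff) and caches the r_ij vectors and
+   squared distances in global buffers, then launches the force kernel;
+   the two phases are timed separately into sdata->cuda_timings.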
+------------------------------------------------------------------------- */ + +#include <stdio.h> + +#include "pair_sw_cuda_cu.h" +__device__ __constant__ ParamSW_Float params_sw[MANYBODY_NPAIR* MANYBODY_NPAIR* MANYBODY_NPAIR]; + +#include "pair_sw_cuda_kernel_nc.cu" + +#include <time.h> + + +void Cuda_PairSWCuda_Init(cuda_shared_data* sdata, ParamSW_Float* params_host, void* map_host, void* elem2param_host, int nelements_h) +{ + unsigned cuda_ntypes = sdata->atom.ntypes + 1; + X_FLOAT box_size[3] = { + sdata->domain.subhi[0] - sdata->domain.sublo[0], + sdata->domain.subhi[1] - sdata->domain.sublo[1], + sdata->domain.subhi[2] - sdata->domain.sublo[2] + }; + + cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3); + cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned)); + cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3); + cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int)); + cudaMemcpyToSymbol(params_sw, params_host , sizeof(ParamSW_Float)*nelements_h * nelements_h * nelements_h); + cudaMemcpyToSymbol(elem2param, elem2param_host , sizeof(int)*nelements_h * nelements_h * nelements_h); + cudaMemcpyToSymbol(map, map_host , sizeof(int)*cuda_ntypes); + cudaMemcpyToSymbol(nelements, &nelements_h, sizeof(int)); +} + +void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + static int glob_ij_size = 0; + static F_FLOAT4* glob_r_ij = NULL; + static int* glob_numneigh_red = NULL; + static int* glob_neighbors_red = NULL; + static int* glob_neightype_red = NULL; + + if(glob_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT)) { + glob_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT); + cudaFree(glob_r_ij); + cudaFree(glob_numneigh_red); + cudaFree(glob_neighbors_red); + cudaFree(glob_neightype_red); + cudaMalloc(&glob_r_ij, glob_ij_size * 4); + cudaMalloc(&glob_numneigh_red, sdata->atom.nall * sizeof(int)); + cudaMalloc(&glob_neighbors_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int)); + cudaMalloc(&glob_neightype_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int)); + cudaMemcpyToSymbol(_glob_numneigh_red, &glob_numneigh_red , sizeof(int*)); + cudaMemcpyToSymbol(_glob_neighbors_red, &glob_neighbors_red , sizeof(int*)); + cudaMemcpyToSymbol(_glob_neightype_red, &glob_neightype_red , sizeof(int*)); + cudaMemcpyToSymbol(_glob_r_ij, &glob_r_ij , sizeof(F_FLOAT4*)); + } + + dim3 grid, threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 64); + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + + + dim3 grid2; + + if(sdata->atom.nall <= 256 * 64000) { + grid2.x = (sdata->atom.nall + 255) / 256; + grid2.y = 1; + } else { + grid2.x = (sdata->atom.nall + 256 * 128 - 1) / (256 * 128); + grid2.y = 128; + } + + grid2.z = 1; + dim3 threads2; + threads2.x = 256; + threads2.y = 1; + threads2.z = 1; + + timespec time1, time2; + + //pre-calculate all neighbordistances and zeta_ij + clock_gettime(CLOCK_REALTIME, &time1); + Pair_SW_Kernel_TpA_RIJ <<< grid2, threads2, 0, streams[1]>>>(); + cudaThreadSynchronize(); + clock_gettime(CLOCK_REALTIME, &time2); + 
sdata->cuda_timings.test1 += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; + clock_gettime(CLOCK_REALTIME, &time1); + + //actual force calculation + unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure + + if(eflag) { + if(vflag) + Pair_SW_Kernel_TpA<1, 1> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); + else + Pair_SW_Kernel_TpA<1, 0> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); + } else { + if(vflag) + Pair_SW_Kernel_TpA<0, 1> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); + else + Pair_SW_Kernel_TpA<0, 0> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); + } + cudaThreadSynchronize(); + clock_gettime(CLOCK_REALTIME, &time2); + sdata->cuda_timings.test2 += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + diff --git a/lib/cuda/pair_sw_cuda_cu.h b/lib/cuda/pair_sw_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..c3713a398902e20e8738f707a8658007111f46fe --- /dev/null +++ b/lib/cuda/pair_sw_cuda_cu.h @@ -0,0 +1,39 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +struct ParamSW_Float { + F_FLOAT epsilon, sigma; + F_FLOAT littlea, lambda, gamma, costheta; + F_FLOAT biga, bigb; + F_FLOAT powerp, powerq; + F_FLOAT tol; + F_FLOAT cut, cutsq; + F_FLOAT sigma_gamma, lambda_epsilon, lambda_epsilon2; + F_FLOAT c1, c2, c3, c4, c5, c6; + int ielement, jelement, kelement; +}; + +extern "C" void Cuda_PairSWCuda_Init(cuda_shared_data* sdata, ParamSW_Float* params_host, void* map_host, void* elem2param_host, int nelements_h); +extern "C" void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_sw_cuda_kernel_nc.cu b/lib/cuda/pair_sw_cuda_kernel_nc.cu new file mode 100644 index 0000000000000000000000000000000000000000..ade74808eefdf20b5a0e533d632914ed56a80b0a --- /dev/null +++ b/lib/cuda/pair_sw_cuda_kernel_nc.cu @@ -0,0 +1,457 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
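+
+   Kernel layout: twobody() and threebody() closely follow the routines of
+   the same name in the CPU pair_sw.cpp; threebody_fj() is a reduced
+   variant that computes only the force on atom j, used when the central
+   atom of a triplet is a neighbor rather than i itself. The main kernel
+   keeps its force accumulators in shared memory (fxtmp/fytmp/fztmp) to
+   lower register pressure.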
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ +#define Pi F_F(3.1415926535897932384626433832795) +#define PI Pi +#define PI2 F_F(0.5)*Pi +#define PI4 F_F(0.25)*Pi + + + +__device__ void twobody(int iparam, F_FLOAT rsq, F_FLOAT &fforce, + int eflag, ENERGY_FLOAT &eng) +{ + F_FLOAT r, rp, rq, rainv, expsrainv; + + r = sqrt(rsq); + rp = pow(r, -params_sw[iparam].powerp); + rq = pow(r, -params_sw[iparam].powerq); + rainv = 1.0 / (r - params_sw[iparam].cut); + expsrainv = exp(params_sw[iparam].sigma * rainv); + fforce = (params_sw[iparam].c1 * rp - params_sw[iparam].c2 * rq + + (params_sw[iparam].c3 * rp - params_sw[iparam].c4 * rq) * rainv * rainv * r) * expsrainv / rsq; + + if(eflag) eng += (params_sw[iparam].c5 * rp - params_sw[iparam].c6 * rq) * expsrainv; +} + +__device__ void threebody(int paramij, int paramik, int paramijk, + F_FLOAT4 &delr1, + F_FLOAT4 &delr2, + F_FLOAT3 &fj, F_FLOAT3 &fk, int eflag, ENERGY_FLOAT &eng) +{ + F_FLOAT r1, rinvsq1, rainv1, gsrainv1, gsrainvsq1, expgsrainv1; + F_FLOAT r2, rinvsq2, rainv2, gsrainv2, gsrainvsq2, expgsrainv2; + F_FLOAT rinv12, cs, delcs, delcssq, facexp, facrad, frad1, frad2; + F_FLOAT facang, facang12, csfacang, csfac1, csfac2; + + r1 = sqrt(delr1.w); + rinvsq1 = F_F(1.0) / delr1.w; + rainv1 = F_F(1.0) / (r1 - params_sw[paramij].cut); + gsrainv1 = params_sw[paramij].sigma_gamma * rainv1; + gsrainvsq1 = gsrainv1 * rainv1 / r1; + expgsrainv1 = exp(gsrainv1); + + r2 = sqrt(delr2.w); + rinvsq2 = F_F(1.0) / delr2.w; + rainv2 = F_F(1.0) / (r2 - params_sw[paramik].cut); + gsrainv2 = params_sw[paramik].sigma_gamma * rainv2; + gsrainvsq2 = gsrainv2 * rainv2 / r2; + expgsrainv2 = exp(gsrainv2); + + rinv12 = F_F(1.0) / (r1 * r2); + cs = (delr1.x * delr2.x + delr1.y * delr2.y + delr1.z * delr2.z) * rinv12; + delcs = cs - params_sw[paramijk].costheta; + delcssq = delcs * delcs; + + facexp = expgsrainv1 * expgsrainv2; + + // facrad = sqrt(paramij->lambda_epsilon*paramik->lambda_epsilon) * + // facexp*delcssq; + + facrad = params_sw[paramijk].lambda_epsilon * facexp * delcssq; + frad1 = facrad * gsrainvsq1; + frad2 = facrad * gsrainvsq2; + facang = params_sw[paramijk].lambda_epsilon2 * facexp * delcs; + facang12 = rinv12 * facang; + csfacang = cs * facang; + csfac1 = rinvsq1 * csfacang; + + fj.x = delr1.x * (frad1 + csfac1) - delr2.x * facang12; + fj.y = delr1.y * (frad1 + csfac1) - delr2.y * facang12; + fj.z = delr1.z * (frad1 + csfac1) - delr2.z * facang12; + + csfac2 = rinvsq2 * csfacang; + + fk.x = delr2.x * (frad2 + csfac2) - delr1.x * facang12; + fk.y = delr2.y * (frad2 + csfac2) - delr1.y * facang12; + fk.z = delr2.z * (frad2 + csfac2) - delr1.z * facang12; + + if(eflag) eng += F_F(2.0) * facrad; +} + +__device__ void threebody_fj(int paramij, int paramik, int paramijk, + F_FLOAT4 &delr1, + F_FLOAT4 &delr2, + F_FLOAT3 &fj) +{ + F_FLOAT r1, rinvsq1, rainv1, gsrainv1, gsrainvsq1, expgsrainv1; + F_FLOAT r2, rainv2, gsrainv2, expgsrainv2; + F_FLOAT rinv12, cs, delcs, delcssq, facexp, facrad, frad1; + F_FLOAT facang, facang12, csfacang, csfac1; + + r1 = sqrt(delr1.w); 
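+  // delr1.w carries the squared pair distance cached by the
+  // Pair_SW_Kernel_TpA_RIJ pre-pass, so r1 is |r_ij|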
+ rinvsq1 = F_F(1.0) / delr1.w; + rainv1 = F_F(1.0) / (r1 - params_sw[paramij].cut); + gsrainv1 = params_sw[paramij].sigma_gamma * rainv1; + gsrainvsq1 = gsrainv1 * rainv1 / r1; + expgsrainv1 = exp(gsrainv1); + + r2 = sqrt(delr2.w); + rainv2 = F_F(1.0) / (r2 - params_sw[paramik].cut); + gsrainv2 = params_sw[paramik].sigma_gamma * rainv2; + expgsrainv2 = exp(gsrainv2); + + rinv12 = F_F(1.0) / (r1 * r2); + cs = (delr1.x * delr2.x + delr1.y * delr2.y + delr1.z * delr2.z) * rinv12; + delcs = cs - params_sw[paramijk].costheta; + delcssq = delcs * delcs; + + facexp = expgsrainv1 * expgsrainv2; + + // facrad = sqrt(paramij->lambda_epsilon*paramik->lambda_epsilon) * + // facexp*delcssq; + + facrad = params_sw[paramijk].lambda_epsilon * facexp * delcssq; + frad1 = facrad * gsrainvsq1; + facang = params_sw[paramijk].lambda_epsilon2 * facexp * delcs; + facang12 = rinv12 * facang; + csfacang = cs * facang; + csfac1 = rinvsq1 * csfacang; + + fj.x = delr1.x * (frad1 + csfac1) - delr2.x * facang12; + fj.y = delr1.y * (frad1 + csfac1) - delr2.y * facang12; + fj.z = delr1.z * (frad1 + csfac1) - delr2.z * facang12; +} + + +__global__ void Pair_SW_Kernel_TpA_RIJ()//F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) +{ + int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(ii >= _nall) return; + + X_FLOAT4 myxtype; + F_FLOAT4 delij; + F_FLOAT xtmp, ytmp, ztmp; + int itype, jnum, i, j; + int* jlist; + int neigh_red = 0; + i = ii;//_ilist[ii]; + myxtype = fetchXType(i); + + xtmp = myxtype.x; + ytmp = myxtype.y; + ztmp = myxtype.z; + itype = map[(static_cast <int>(myxtype.w))]; + + jnum = _numneigh[i]; + jlist = &_neighbors[i]; + + __syncthreads(); + + for(int jj = 0; jj < jnum; jj++) { + if(jj < jnum) { + + j = jlist[jj * _nall]; + j &= NEIGHMASK; + myxtype = fetchXType(j); + delij.x = xtmp - myxtype.x; + delij.y = ytmp - myxtype.y; + delij.z = ztmp - myxtype.z; + int jtype = map[(static_cast <int>(myxtype.w))]; + int iparam_ij = elem2param[(itype * nelements + jtype) * nelements + jtype]; + delij.w = vec3_dot(delij, delij); + + if(delij.w < params_sw[iparam_ij].cutsq) { + _glob_neighbors_red[i + neigh_red * _nall] = j; + _glob_neightype_red[i + neigh_red * _nall] = jtype; + _glob_r_ij[i + neigh_red * _nall] = delij; + neigh_red++; + } + } + } + + _glob_numneigh_red[i] = neigh_red; +} + + +template <int eflag, int vflagm> +__global__ void Pair_SW_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLOAT* _glob_zeta_ij,F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) +{ + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + + ENERGY_FLOAT* sharedE = &sharedmem[threadIdx.x]; + ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + + F_FLOAT* shared_F_F = (F_FLOAT*) sharedmem; + + if((eflag || eflag_atom) && (vflagm || vflag_atom)) shared_F_F = (F_FLOAT*) &sharedmem[7 * blockDim.x]; + else if(eflag) shared_F_F = (F_FLOAT*) &sharedmem[blockDim.x]; + else if(vflagm) shared_F_F = (F_FLOAT*) &sharedmem[6 * blockDim.x]; + + shared_F_F += threadIdx.x; + + if(eflag_atom || eflag) { + sharedE[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + } + + if(vflagm || vflag_atom) { + sharedV[0 * blockDim.x] = ENERGY_F(0.0); + sharedV[1 * blockDim.x] = ENERGY_F(0.0); + sharedV[2 * blockDim.x] = ENERGY_F(0.0); + sharedV[3 * blockDim.x] = ENERGY_F(0.0); + sharedV[4 * blockDim.x] = ENERGY_F(0.0); + sharedV[5 * blockDim.x] = ENERGY_F(0.0); + } + + int jnum_red = 0; +#define fxtmp shared_F_F[0] +#define fytmp shared_F_F[blockDim.x] +#define 
fztmp shared_F_F[2*blockDim.x] + //#define jnum_red (static_cast <int> (shared_F_F[3*blockDim.x])) + + int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + X_FLOAT4 myxtype_i, myxtype_j, myxtype_k; + F_FLOAT4 delij, delik, deljk; + F_FLOAT fpair; + + int itype, i, j; + int* jlist_red; + + if(ii < _inum) { + i = _ilist[ii]; + + if(vflagm) + myxtype_i = fetchXType(i); + + //itype=map[(static_cast <int> (myxtype_i.w))]; + itype = map[_type[i]]; + + + fxtmp = F_F(0.0); + fytmp = F_F(0.0); + fztmp = F_F(0.0); + + + //shared_F_F[3*blockDim.x] = _glob_numneigh_red[i]; + jnum_red = _glob_numneigh_red[i]; + jlist_red = &_glob_neighbors_red[i]; + } + + __syncthreads(); +#pragma unroll 1 + + for(int jj = 0; jj < jnum_red; jj++) { + if(i < _nlocal) { + fpair = F_F(0.0); + j = jlist_red[jj * _nall]; + j &= NEIGHMASK; + + if(vflagm) + myxtype_j = fetchXType(j); + + int jtype = _glob_neightype_red[i + jj * _nall]; + delij = _glob_r_ij[i + jj * _nall]; + + volatile int iparam_ij = elem2param[(itype * nelements + jtype) * nelements + jtype]; + volatile int iparam_ji = elem2param[(jtype * nelements + itype) * nelements + itype]; + + if(delij.w < params_sw[iparam_ij].cutsq) { + F_FLOAT dxfp, dyfp, dzfp; + twobody(iparam_ij, delij.w, fpair, eflag, evdwl); + fxtmp += dxfp = delij.x * fpair; + fytmp += dyfp = delij.y * fpair; + fztmp += dzfp = delij.z * fpair; + + if(vflagm) { + sharedV[0 * blockDim.x] += delij.x * dxfp; + sharedV[1 * blockDim.x] += delij.y * dyfp; + sharedV[2 * blockDim.x] += delij.z * dzfp; + sharedV[3 * blockDim.x] += delij.x * dyfp; + sharedV[4 * blockDim.x] += delij.x * dzfp; + sharedV[5 * blockDim.x] += delij.y * dzfp; + } + + + + + + + vec3_scale(F_F(-1.0), delij, delij); + +#pragma unroll 1 + + for(int kk = jj + 1; kk < jnum_red; kk++) { + int k = jlist_red[kk * _nall]; + k &= NEIGHMASK; + + if(vflagm) + myxtype_k = fetchXType(k); + + delik = _glob_r_ij[i + kk * _nall]; + + int ktype = _glob_neightype_red[i + kk * _nall]; + int iparam_ik = elem2param[(itype * nelements + ktype) * nelements + ktype]; + int iparam_ijk = elem2param[(itype * nelements + jtype) * nelements + ktype]; + vec3_scale(F_F(-1.0), delik, delik); + + if(delik.w <= params_sw[iparam_ijk].cutsq) { + F_FLOAT3 fj, fk; + threebody(iparam_ij, iparam_ik, iparam_ijk, + delij, delik, fj, fk, eflag, evdwl); + fxtmp -= fj.x + fk.x; + fytmp -= fj.y + fk.y; + fztmp -= fj.z + fk.z; + + if(vflagm) { + sharedV[0 * blockDim.x] -= ENERGY_F(2.0) * myxtype_i.x * (fj.x + fk.x); + sharedV[1 * blockDim.x] -= ENERGY_F(2.0) * myxtype_i.y * (fj.y + fk.y); + sharedV[2 * blockDim.x] -= ENERGY_F(2.0) * myxtype_i.z * (fj.z + fk.z); + sharedV[3 * blockDim.x] -= ENERGY_F(2.0) * myxtype_i.x * (fj.y + fk.y); + sharedV[4 * blockDim.x] -= ENERGY_F(2.0) * myxtype_i.x * (fj.z + fk.z); + sharedV[5 * blockDim.x] -= ENERGY_F(2.0) * myxtype_i.y * (fj.z + fk.z); + + sharedV[0 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.x * fj.x; + sharedV[1 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.y * fj.y; + sharedV[2 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.z * fj.z; + sharedV[3 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.x * fj.y; + sharedV[4 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.x * fj.z; + sharedV[5 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.y * fj.z; + + sharedV[0 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.x * fk.x; + sharedV[1 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.y * fk.y; + sharedV[2 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.z * fk.z; + sharedV[3 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.x * fk.y; + sharedV[4 * blockDim.x] += 
ENERGY_F(2.0) * myxtype_k.x * fk.z; + sharedV[5 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.y * fk.z; + } + } + } + + int j_jnum_red = _glob_numneigh_red[j]; + int* j_jlist_red = &_glob_neighbors_red[j]; + + int j_ii = 0; + + //#pragma unroll 1 + for(int j_kk = 0; j_kk < j_jnum_red; j_kk++) { + if(j_jlist_red[j_kk * _nall] == i) j_ii = j_kk; + } + +#pragma unroll 1 + + for(int kk = 0; kk < j_jnum_red; kk++) { + if(j_ii == kk) continue; + + int k = j_jlist_red[kk * _nall]; + k &= NEIGHMASK; + deljk = _glob_r_ij[j + kk * _nall]; + vec3_scale(F_F(-1.0), deljk, deljk); + int ktype = _glob_neightype_red[j + kk * _nall]; + + int iparam_ji = elem2param[(jtype * nelements + itype) * nelements + itype]; + int iparam_jk = elem2param[(jtype * nelements + ktype) * nelements + ktype]; + int iparam_jik = elem2param[(jtype * nelements + itype) * nelements + ktype]; + + + vec3_scale(F_F(-1.0), delij, delij); + + if(deljk.w <= params_sw[iparam_jik].cutsq) { + F_FLOAT3 fj; + + threebody_fj(iparam_ji, iparam_jk, iparam_jik, + delij, deljk, fj); + fxtmp += fj.x; + fytmp += fj.y; + fztmp += fj.z; + + } + + vec3_scale(F_F(-1.0), delij, delij); + } + } + } + + } + + __syncthreads(); + + if(ii < _inum) { + F_FLOAT* my_f; + + if(_collect_forces_later) { + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + + if(eflag) { + buffer = &buffer[1 * gridDim.x * gridDim.y]; + } + + if(vflagm) { + buffer = &buffer[6 * gridDim.x * gridDim.y]; + } + + my_f = (F_FLOAT*) buffer; + my_f += i; + *my_f = fxtmp; + my_f += _nmax; + *my_f = fytmp; + my_f += _nmax; + *my_f = fztmp; + } else { + my_f = _f + i; + *my_f += fxtmp; + my_f += _nmax; + *my_f += fytmp; + my_f += _nmax; + *my_f += fztmp; + } + } + + __syncthreads(); + + if(eflag) { + sharedE[0] = evdwl; + } + + if(eflag_atom && i < _nlocal) { + _eatom[i] = ENERGY_F(0.5) * evdwl; + } + + if(vflag_atom && i < _nlocal) { + _vatom[i] = ENERGY_F(0.5) * sharedV[0 * blockDim.x]; + _vatom[i + _nmax] = ENERGY_F(0.5) * sharedV[1 * blockDim.x]; + _vatom[i + 2 * _nmax] = ENERGY_F(0.5) * sharedV[2 * blockDim.x]; + _vatom[i + 3 * _nmax] = ENERGY_F(0.5) * sharedV[3 * blockDim.x]; + _vatom[i + 4 * _nmax] = ENERGY_F(0.5) * sharedV[4 * blockDim.x]; + _vatom[i + 5 * _nmax] = ENERGY_F(0.5) * sharedV[5 * blockDim.x]; + } + + if(vflagm && eflag) PairVirialCompute_A_Kernel_Template<1, 1>(); + else if(eflag) PairVirialCompute_A_Kernel_Template<1, 0>(); + else if(vflagm) PairVirialCompute_A_Kernel_Template<0, 1>(); + +#undef fxtmp +#undef fytmp +#undef fztmp + //#undef jnum_red +} diff --git a/lib/cuda/pair_tersoff_cuda.cu b/lib/cuda/pair_tersoff_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..0ae5e846a000545c12377f7718bb4e48ee1cf154 --- /dev/null +++ b/lib/cuda/pair_tersoff_cuda.cu @@ -0,0 +1,154 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include <stdio.h> + + +#include "pair_tersoff_cuda_cu.h" +__device__ __constant__ Param_Float params[MANYBODY_NPAIR* MANYBODY_NPAIR* MANYBODY_NPAIR]; +__device__ __constant__ F_FLOAT* _glob_zeta_ij; //zeta_ij +__device__ __constant__ F_FLOAT4* _glob_r_ij; //r_ij (x,y,z,r^2) for pairs within force cutoff +__device__ __constant__ bool _zbl; //is tersoff zbl? + + +#include "pair_tersoff_cuda_kernel_nc.cu" + +#include <time.h> + + +void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata, Param_Float* params_host, void* map_host, void* elem2param_host, int nelements_h, bool zbl) +{ + unsigned cuda_ntypes = sdata->atom.ntypes + 1; + X_FLOAT box_size[3] = { + sdata->domain.subhi[0] - sdata->domain.sublo[0], + sdata->domain.subhi[1] - sdata->domain.sublo[1], + sdata->domain.subhi[2] - sdata->domain.sublo[2] + }; + + cudaMemcpyToSymbol(MY_AP(box_size) , box_size , sizeof(X_FLOAT) * 3); + cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &cuda_ntypes , sizeof(unsigned)); + cudaMemcpyToSymbol(MY_AP(virial) , &sdata->pair.virial.dev_data , sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(eng_vdwl) , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , sizeof(int) * 3); + cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later , sizeof(int)); + cudaMemcpyToSymbol(params, params_host , sizeof(Param_Float)*nelements_h * nelements_h * nelements_h); + cudaMemcpyToSymbol(elem2param, elem2param_host , sizeof(int)*nelements_h * nelements_h * nelements_h); + cudaMemcpyToSymbol(map, map_host , sizeof(int)*cuda_ntypes); + cudaMemcpyToSymbol(nelements, &nelements_h, sizeof(int)); + cudaMemcpyToSymbol(_zbl, &zbl, sizeof(bool)); + +} + +void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom) +{ + static F_FLOAT* glob_zeta_ij = NULL; + static int glob_zeta_ij_size = 0; + static F_FLOAT4* glob_r_ij = NULL; + static int* glob_numneigh_red = NULL; + static int* glob_neighbors_red = NULL; + static int* glob_neightype_red = NULL; + + if(glob_zeta_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT)) { + glob_zeta_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT); + cudaFree(glob_zeta_ij); + cudaFree(glob_r_ij); + cudaFree(glob_numneigh_red); + cudaFree(glob_neighbors_red); + cudaFree(glob_neightype_red); + cudaMalloc(&glob_zeta_ij, glob_zeta_ij_size); + cudaMalloc(&glob_r_ij, glob_zeta_ij_size * 4); + cudaMalloc(&glob_numneigh_red, sdata->atom.nall * sizeof(int)); + cudaMalloc(&glob_neighbors_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int)); + cudaMalloc(&glob_neightype_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int)); + cudaMemcpyToSymbol(_glob_numneigh_red, &glob_numneigh_red , sizeof(int*)); + cudaMemcpyToSymbol(_glob_neighbors_red, &glob_neighbors_red , sizeof(int*)); + cudaMemcpyToSymbol(_glob_neightype_red, &glob_neightype_red , sizeof(int*)); + cudaMemcpyToSymbol(_glob_r_ij, &glob_r_ij , sizeof(F_FLOAT4*)); + cudaMemcpyToSymbol(_glob_zeta_ij, &glob_zeta_ij , sizeof(F_FLOAT*)); + } + + dim3 grid, threads; + int sharedperproc; + + Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 64); + cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams(); + + + + dim3 grid2; + + if(sdata->atom.nall <= 256 * 64000) { + grid2.x = (sdata->atom.nall 
+ 255) / 256; + grid2.y = 1; + } else { + grid2.x = (sdata->atom.nall + 256 * 128 - 1) / (256 * 128); + grid2.y = 128; + } + + grid2.z = 1; + dim3 threads2; + threads2.x = 256; + threads2.y = 1; + threads2.z = 1; + + timespec time1, time2; + + //pre-calculate all neighbordistances and zeta_ij + clock_gettime(CLOCK_REALTIME, &time1); + Pair_Tersoff_Kernel_TpA_RIJ <<< grid2, threads2, 0, streams[1]>>> + (); + cudaThreadSynchronize(); + Pair_Tersoff_Kernel_TpA_ZetaIJ <<< grid2, threads2, 0, streams[1]>>> + (); + cudaThreadSynchronize(); + clock_gettime(CLOCK_REALTIME, &time2); + sdata->cuda_timings.test1 += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; + clock_gettime(CLOCK_REALTIME, &time1); + + //actual force calculation + unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure + + if(eflag) { + if(vflag) + Pair_Tersoff_Kernel_TpA<1, 1> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); + else + Pair_Tersoff_Kernel_TpA<1, 0> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); + } else { + if(vflag) + Pair_Tersoff_Kernel_TpA<0, 1> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); + else + Pair_Tersoff_Kernel_TpA<0, 0> <<< grid, threads, sharedsize, streams[1]>>> + (eflag_atom, vflag_atom); + } + cudaThreadSynchronize(); + clock_gettime(CLOCK_REALTIME, &time2); + sdata->cuda_timings.test2 += + time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000; + + Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag); +} + diff --git a/lib/cuda/pair_tersoff_cuda_cu.h b/lib/cuda/pair_tersoff_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..e4eb81827f6a8deb9e8a89c141f86a685091db48 --- /dev/null +++ b/lib/cuda/pair_tersoff_cuda_cu.h @@ -0,0 +1,42 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +struct Param_Float { + F_FLOAT lam1, lam2, lam3; + F_FLOAT c, d, h; + F_FLOAT gamma, powerm; + F_FLOAT powern, beta; + F_FLOAT biga, bigb, bigd, bigr; + F_FLOAT cut, cutsq; + F_FLOAT c1, c2, c3, c4; + int ielement, jelement, kelement; + int powermint; + //F_FLOAT Z_i,Z_j; + F_FLOAT ZBLcut, ZBLexpscale; + F_FLOAT a_ij, premult; +}; + +extern "C" void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata, Param_Float* params_host, void* map_host, void* elem2param_host, int nelements_h, bool zbl); +extern "C" void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom); diff --git a/lib/cuda/pair_tersoff_cuda_kernel_nc.cu b/lib/cuda/pair_tersoff_cuda_kernel_nc.cu new file mode 100644 index 0000000000000000000000000000000000000000..e5143b36adc5d4b0efd409e9999d6885240f121a --- /dev/null +++ b/lib/cuda/pair_tersoff_cuda_kernel_nc.cu @@ -0,0 +1,1097 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. 
+------------------------------------------------------------------------- */ +#define Pi F_F(3.1415926535897932384626433832795) +#define PI Pi +#define PI2 F_F(0.5)*Pi +#define PI4 F_F(0.25)*Pi +template <const int eflag, const int vflag> +static inline __device__ void PairVirialCompute_A_Kernel_Template() +{ + __syncthreads(); + ENERGY_FLOAT* shared = sharedmem; + + if(eflag) { + reduceBlock(shared); + shared += blockDim.x; + } + + if(vflag) { + reduceBlock(shared + 0 * blockDim.x); + reduceBlock(shared + 1 * blockDim.x); + reduceBlock(shared + 2 * blockDim.x); + reduceBlock(shared + 3 * blockDim.x); + reduceBlock(shared + 4 * blockDim.x); + reduceBlock(shared + 5 * blockDim.x); + } + + if(threadIdx.x == 0) { + shared = sharedmem; + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + + if(eflag) { + buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5) * shared[0]; + shared += blockDim.x; + buffer += gridDim.x * gridDim.y; + } + + if(vflag) { + buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[0 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[1 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[2 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[3 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[4 * blockDim.x]; + buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[5 * blockDim.x]; + } + } + + __syncthreads(); +} + +__global__ void virial_fdotr_compute_kernel(int eflag) +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + ENERGY_FLOAT* sharedE = (ENERGY_FLOAT*) &sharedmem[0]; + ENERGY_FLOAT* sharedVirial = (ENERGY_FLOAT*) &sharedE[blockDim.x]; + sharedE += threadIdx.x; + sharedVirial += threadIdx.x; + + if(i < _nlocal) { + + F_FLOAT x = _x[i]; + F_FLOAT y = _x[i + _nmax]; + F_FLOAT z = _x[i + 2 * _nmax]; + F_FLOAT fx = _f[i]; + F_FLOAT fy = _f[i + _nmax]; + F_FLOAT fz = _f[i + 2 * _nmax]; + //if(fz*z*fz*z>1e-5) printf("V %i %i %e %e %e %e %e %e\n",i,_tag[i],x,y,z,fx,fy,fz); + sharedVirial[0] = fx * x; + sharedVirial[1 * blockDim.x] = fy * y; + sharedVirial[2 * blockDim.x] = fz * z; + sharedVirial[3 * blockDim.x] = fy * x; + sharedVirial[4 * blockDim.x] = fz * x; + sharedVirial[5 * blockDim.x] = fz * y; + } else { + sharedVirial[0] = 0; + sharedVirial[1 * blockDim.x] = 0; + sharedVirial[2 * blockDim.x] = 0; + sharedVirial[3 * blockDim.x] = 0; + sharedVirial[4 * blockDim.x] = 0; + sharedVirial[5 * blockDim.x] = 0; + } + + sharedVirial = (ENERGY_FLOAT*) &sharedmem[0]; + sharedVirial += blockDim.x; + reduceBlockP2(sharedVirial); + reduceBlockP2(&sharedVirial[1 * blockDim.x]); + reduceBlockP2(&sharedVirial[2 * blockDim.x]); + reduceBlockP2(&sharedVirial[3 * blockDim.x]); + reduceBlockP2(&sharedVirial[4 * blockDim.x]); + reduceBlockP2(&sharedVirial[5 * blockDim.x]); + + if(threadIdx.x < 6) { + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + + if(eflag) buffer = &buffer[gridDim.x * gridDim.y]; + + buffer[blockIdx.x * gridDim.y + blockIdx.y + threadIdx.x * gridDim.x * gridDim.y] = sharedVirial[threadIdx.x * blockDim.x]; + } +} + +/*#define vec3_scale(K,X,Y) Y.x = K*X.x; Y.y = K*X.y; Y.z = K*X.z; +#define vec3_scaleadd(K,X,Y,Z) Z.x = K*X.x+Y.x; Z.y = K*X.y+Y.y; Z.z = K*X.z+Y.z; +#define vec3_add(X,Y,Z) Z.x = X.x+Y.x; 
Z.y = X.y+Y.y; Z.z = X.z+Y.z; +#define vec3_dot(X,Y) (X.x*Y.x + X.y*Y.y + X.z*Y.z)*/ + +__device__ inline void vec3_scale(F_FLOAT k, F_FLOAT3 &x, F_FLOAT3 &y) +{ + y.x = k * x.x; + y.y = k * x.y; + y.z = k * x.z; +} + +__device__ inline void vec3_scale(F_FLOAT k, F_FLOAT4 &x, F_FLOAT3 &y) +{ + y.x = k * x.x; + y.y = k * x.y; + y.z = k * x.z; +} + +__device__ inline void vec3_scale(F_FLOAT k, F_FLOAT4 &x, F_FLOAT4 &y) +{ + y.x = k * x.x; + y.y = k * x.y; + y.z = k * x.z; +} + +__device__ inline void vec3_scaleadd(F_FLOAT k, F_FLOAT3 &x, F_FLOAT3 &y, F_FLOAT3 &z) +{ + z.x = k * x.x + y.x; + z.y = k * x.y + y.y; + z.z = k * x.z + y.z; +} + +__device__ inline void vec3_add(F_FLOAT3 &x, F_FLOAT3 &y, F_FLOAT3 &z) +{ + z.x = x.x + y.x; + z.y = x.y + y.y; + z.z = x.z + y.z; +} + +__device__ inline F_FLOAT vec3_dot(F_FLOAT3 x, F_FLOAT3 y) +{ + return x.x * y.x + x.y * y.y + x.z * y.z; +} + +__device__ inline F_FLOAT vec3_dot(F_FLOAT4 x, F_FLOAT4 y) +{ + return x.x * y.x + x.y * y.y + x.z * y.z; +} + +/* ---------------------------------------------------------------------- + Fermi-like smoothing function +------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT F_fermi(F_FLOAT &r, int &iparam) +{ + return F_F(1.0) / (F_F(1.0) + exp(-params[iparam].ZBLexpscale * (r - params[iparam].ZBLcut))); +} + +/* ---------------------------------------------------------------------- + Fermi-like smoothing function derivative with respect to r +------------------------------------------------------------------------- */ + +__device__ inline F_FLOAT F_fermi_d(F_FLOAT &r, int &iparam) +{ + volatile const F_FLOAT tmp = exp(-params[iparam].ZBLexpscale * (r - params[iparam].ZBLcut)); + return params[iparam].ZBLexpscale * tmp / + ((F_F(1.0) + tmp) * (F_F(1.0) + tmp)); +} + +__device__ inline F_FLOAT ters_fc(F_FLOAT r, F_FLOAT ters_R, F_FLOAT ters_D) +{ + return (r < ters_R - ters_D) ? F_F(1.0) : ((r > ters_R + ters_D) ? + F_F(0.0) : F_F(0.5) * (F_F(1.0) - sin(PI2 * (r - ters_R) / ters_D))); +} + +__device__ inline F_FLOAT ters_fc_d(F_FLOAT r, F_FLOAT ters_R, F_FLOAT ters_D) +{ + return ((r < ters_R - ters_D) || (r > ters_R + ters_D)) ? 
+ F_F(0.0) : -(PI4 / ters_D) * cos(PI2 * (r - ters_R) / ters_D); +} + + +__device__ inline F_FLOAT ters_gijk(F_FLOAT &cos_theta, int iparam) +{ + F_FLOAT ters_c = params[iparam].c; + F_FLOAT ters_d = params[iparam].d; + + return params[iparam].gamma * (F_F(1.0) + pow(params[iparam].c / params[iparam].d, F_F(2.0)) - + pow(ters_c, F_F(2.0)) / (pow(ters_d, F_F(2.0)) + pow(params[iparam].h - cos_theta, F_F(2.0)))); +} + +__device__ F_FLOAT ters_gijk2(F_FLOAT &cos_theta, int iparam) +{ + F_FLOAT ters_c = params[iparam].c; + F_FLOAT ters_d = params[iparam].d; + + return params[iparam].gamma * (F_F(1.0) + pow(ters_c / ters_d, F_F(2.0)) - + pow(ters_c, F_F(2.0)) / (pow(ters_d, F_F(2.0)) + pow(params[iparam].h - cos_theta, F_F(2.0)))); +} + +__device__ inline F_FLOAT ters_gijk_d(F_FLOAT costheta, int iparam) +{ + F_FLOAT numerator = -F_F(2.0) * pow(params[iparam].c, F_F(2.0)) * (params[iparam].h - costheta); + F_FLOAT denominator = pow(pow(params[iparam].d, F_F(2.0)) + + pow(params[iparam].h - costheta, F_F(2.0)), F_F(2.0)); + return params[iparam].gamma * numerator / denominator; +} + +__device__ inline F_FLOAT zeta(int iparam, const F_FLOAT rsqij, const F_FLOAT rsqik, + F_FLOAT3 &delij, F_FLOAT3 &delik) +{ + F_FLOAT rij, rik, costheta, arg, ex_delr; + + rij = sqrt(rsqij); + rik = sqrt(rsqik); + costheta = vec3_dot(delij, delik) / (rij * rik); + + arg = (params[iparam].powermint == 3) ? (params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik)) : params[iparam].lam3 * (rij - rik); + + if(arg > F_F(69.0776)) ex_delr = F_F(1.e30); + else if(arg < -F_F(69.0776)) ex_delr = F_F(0.0); + else ex_delr = exp(arg); + + return ters_fc(rik, params[iparam].bigr, params[iparam].bigd) * ex_delr * params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c / (params[iparam].d * params[iparam].d)) - + (params[iparam].c * params[iparam].c) / ((params[iparam].d * params[iparam].d) + (params[iparam].h - costheta) * (params[iparam].h - costheta))); +} + +__device__ void repulsive(int iparam, F_FLOAT rsq, F_FLOAT &fforce, + int eflag, ENERGY_FLOAT &eng) +{ + F_FLOAT r, tmp_fc, tmp_fc_d, tmp_exp; + + F_FLOAT ters_R = params[iparam].bigr; + F_FLOAT ters_D = params[iparam].bigd; + r = sqrt(rsq); + tmp_fc = ters_fc(r, ters_R, ters_D); + tmp_fc_d = ters_fc_d(r, ters_R, ters_D); + tmp_exp = exp(-params[iparam].lam1 * r); + + if(!_zbl) { + fforce = -params[iparam].biga * tmp_exp * (tmp_fc_d - tmp_fc * params[iparam].lam1) / r; + + if(eflag) eng += tmp_fc * params[iparam].biga * tmp_exp; + } else { + F_FLOAT const fforce_ters = params[iparam].biga * tmp_exp * (tmp_fc_d - tmp_fc * params[iparam].lam1); + ENERGY_FLOAT eng_ters = tmp_fc * params[iparam].biga * tmp_exp; + + F_FLOAT r_ov_a = r / params[iparam].a_ij; + F_FLOAT phi = F_F(0.1818) * exp(-F_F(3.2) * r_ov_a) + F_F(0.5099) * exp(-F_F(0.9423) * r_ov_a) + + F_F(0.2802) * exp(-F_F(0.4029) * r_ov_a) + F_F(0.02817) * exp(-F_F(0.2016) * r_ov_a); + F_FLOAT dphi = (F_F(1.0) / params[iparam].a_ij) * (-F_F(3.2) * F_F(0.1818) * exp(-F_F(3.2) * r_ov_a) - + F_F(0.9423) * F_F(0.5099) * exp(-F_F(0.9423) * r_ov_a) - + F_F(0.4029) * F_F(0.2802) * exp(-F_F(0.4029) * r_ov_a) - + F_F(0.2016) * F_F(0.02817) * exp(-F_F(0.2016) * r_ov_a)); + F_FLOAT fforce_ZBL = params[iparam].premult / (-r * r) * phi + params[iparam].premult / r * dphi; + ENERGY_FLOAT eng_ZBL = params[iparam].premult * (F_F(1.0) / r) * phi; + + fforce = -(-F_fermi_d(r, iparam) * (eng_ZBL - eng_ters) + fforce_ZBL + F_fermi(r, iparam) * (fforce_ters - fforce_ZBL)) / r; 
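+    // A sketch of the algebra behind the expression above (an annotation,
+    // assuming only F_fermi and F_fermi_d as defined earlier in this file):
+    // the ZBL screened-Coulomb core and the Tersoff repulsive term are
+    // blended by the Fermi-like switch f_F(r),
+    //
+    //   E(r)  = E_ZBL(r) + f_F(r) * (E_ters(r) - E_ZBL(r))
+    //   dE/dr = dE_ZBL/dr + f_F'(r) * (E_ters - E_ZBL)
+    //           + f_F(r) * (dE_ters/dr - dE_ZBL/dr)
+    //
+    // where eng_ters/fforce_ters hold the unswitched Tersoff energy and its
+    // radial derivative and eng_ZBL/fforce_ZBL the ZBL counterparts. fforce
+    // stores -(dE/dr)/r, the scalar that multiplies the displacement vector
+    // delij when the caller accumulates the pair force, which accounts for
+    // the leading minus sign and the division by r.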
+ + if(eflag) + eng += eng_ZBL + F_fermi(r, iparam) * (eng_ters - eng_ZBL); + } + + +} + +/* ---------------------------------------------------------------------- */ + +__device__ inline F_FLOAT ters_fa(F_FLOAT r, int iparam, F_FLOAT ters_R, F_FLOAT ters_D) +{ + if(r > ters_R + ters_D) return F_F(0.0); + + if(_zbl) + return -params[iparam].bigb * exp(-params[iparam].lam2 * r) * ters_fc(r, ters_R, ters_D) * F_fermi(r, iparam); + else + return -params[iparam].bigb * exp(-params[iparam].lam2 * r) * ters_fc(r, ters_R, ters_D); +} + +/* ---------------------------------------------------------------------- */ + +__device__ inline F_FLOAT ters_fa_d(F_FLOAT r, int iparam, F_FLOAT ters_R, F_FLOAT ters_D) +{ + if(r > ters_R + ters_D) return F_F(0.0); + + if(_zbl) + return params[iparam].bigb * exp(-params[iparam].lam2 * r) * + ((params[iparam].lam2 * ters_fc(r, ters_R, ters_D) - ters_fc_d(r, ters_R, ters_D)) * F_fermi(r, iparam) + - ters_fc(r, ters_R, ters_D) * F_fermi_d(r, iparam)); + else + return params[iparam].bigb * exp(-params[iparam].lam2 * r) * + (params[iparam].lam2 * ters_fc(r, ters_R, ters_D) - ters_fc_d(r, ters_R, ters_D)); +} + +/* ---------------------------------------------------------------------- */ + +__device__ inline F_FLOAT ters_bij(F_FLOAT zeta, int iparam) +{ + F_FLOAT tmp = params[iparam].beta * zeta; + + if(tmp > params[iparam].c1) return F_F(1.0) / sqrt(tmp); + + if(tmp > params[iparam].c2) + return (F_F(1.0) - pow(tmp, -params[iparam].powern) / (F_F(2.0) * params[iparam].powern)) / sqrt(tmp); + + if(tmp < params[iparam].c4) return F_F(1.0); + + if(tmp < params[iparam].c3) + return F_F(1.0) - pow(tmp, params[iparam].powern) / (F_F(2.0) * params[iparam].powern); + + return pow(F_F(1.0) + pow(tmp, params[iparam].powern), -F_F(1.0) / (F_F(2.0) * params[iparam].powern)); +} + +/* ---------------------------------------------------------------------- */ + +__device__ inline F_FLOAT ters_bij_d(F_FLOAT zeta, int iparam) +{ + F_FLOAT tmp = params[iparam].beta * zeta; + + if(tmp > params[iparam].c1) return params[iparam].beta * -F_F(0.5) * pow(tmp, -F_F(1.5)); + + if(tmp > params[iparam].c2) + return params[iparam].beta * (-F_F(0.5) * pow(tmp, -F_F(1.5)) * + (F_F(1.0) - F_F(0.5) * (F_F(1.0) + F_F(1.0) / (F_F(2.0) * params[iparam].powern)) * + pow(tmp, -params[iparam].powern))); + + if(tmp < params[iparam].c4) return F_F(0.0); + + if(tmp < params[iparam].c3) + return -F_F(0.5) * params[iparam].beta * pow(tmp, params[iparam].powern - F_F(1.0)); + + F_FLOAT tmp_n = pow(tmp, params[iparam].powern); + return -F_F(0.5) * pow(F_F(1.0) + tmp_n, -F_F(1.0) - (F_F(1.0) / (F_F(2.0) * params[iparam].powern))) * tmp_n / zeta; +} + +__device__ void force_zeta(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij, + F_FLOAT &fforce, F_FLOAT &prefactor, + int eflag, F_FLOAT &eng) +{ + F_FLOAT r, fa, fa_d, bij; + F_FLOAT ters_R = params[iparam].bigr; + F_FLOAT ters_D = params[iparam].bigd; + r = sqrt(rsq); + fa = ters_fa(r, iparam, ters_R, ters_D); + fa_d = ters_fa_d(r, iparam, ters_R, ters_D); + bij = ters_bij(zeta_ij, iparam); + fforce = F_F(0.5) * bij * fa_d / r; + prefactor = -F_F(0.5) * fa * ters_bij_d(zeta_ij, iparam); + + if(eflag) eng += bij * fa; +} + +__device__ void force_zeta_prefactor_force(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij, + F_FLOAT &fforce, F_FLOAT &prefactor) +{ + F_FLOAT r, fa, fa_d, bij; + F_FLOAT ters_R = params[iparam].bigr; + F_FLOAT ters_D = params[iparam].bigd; + r = sqrt(rsq); + fa = ters_fa(r, iparam, ters_R, ters_D); + fa_d = ters_fa_d(r, iparam, ters_R, ters_D); + bij = 
ters_bij(zeta_ij, iparam); + fforce = F_F(0.5) * bij * fa_d / r; + prefactor = -F_F(0.5) * fa * ters_bij_d(zeta_ij, iparam); +} + +__device__ void force_zeta_prefactor(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij, + F_FLOAT &prefactor) +{ + F_FLOAT r, fa; + r = sqrt(rsq); + fa = ters_fa(r, iparam, params[iparam].bigr, params[iparam].bigd); + prefactor = -F_F(0.5) * fa * ters_bij_d(zeta_ij, iparam); +} + + +__device__ void costheta_d(F_FLOAT3 &rij_hat, F_FLOAT &rij, + F_FLOAT3 &rik_hat, F_FLOAT &rik, + F_FLOAT3 &dri, F_FLOAT3 &drj, F_FLOAT3 &drk) +{ + // first element is derivative wrt Ri, second wrt Rj, third wrt Rk + + F_FLOAT cos_theta = vec3_dot(rij_hat, rik_hat); + + vec3_scaleadd(-cos_theta, rij_hat, rik_hat, drj); + vec3_scale(F_F(1.0) / rij, drj, drj); + vec3_scaleadd(-cos_theta, rik_hat, rij_hat, drk); + vec3_scale(F_F(1.0) / rik, drk, drk); + vec3_add(drj, drk, dri); + vec3_scale(-F_F(1.0), dri, dri); +} + +__device__ void ters_zetaterm_d(F_FLOAT prefactor, + F_FLOAT3 &rij_hat, F_FLOAT rij, + F_FLOAT3 &rik_hat, F_FLOAT rik, + F_FLOAT3 &dri, F_FLOAT3 &drj, F_FLOAT3 &drk, + int iparam) +{ + F_FLOAT ex_delr, ex_delr_d, tmp; + F_FLOAT3 dcosdri, dcosdrj, dcosdrk; + + if(params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik)); + else tmp = params[iparam].lam3 * (rij - rik); + + if(tmp > F_F(69.0776)) ex_delr = F_F(1.e30); + else if(tmp < -F_F(69.0776)) ex_delr = F_F(0.0); + else ex_delr = exp(tmp); + + if(params[iparam].powermint == 3) + ex_delr_d = F_F(3.0) * (params[iparam].lam3 * params[iparam].lam3 * params[iparam].lam3) * (rij - rik) * (rij - rik) * ex_delr; + else ex_delr_d = params[iparam].lam3 * ex_delr; + + + const F_FLOAT cos_theta = vec3_dot(rij_hat, rik_hat); + costheta_d(rij_hat, rij, rik_hat, rik, dcosdri, dcosdrj, dcosdrk); + + const F_FLOAT gijk = params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d) - + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta))); + const F_FLOAT numerator = -F_F(2.0) * params[iparam].c * params[iparam].c * (params[iparam].h - cos_theta); + const F_FLOAT denominator = (params[iparam].d * params[iparam].d) + + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta); + const F_FLOAT gijk_d = params[iparam].gamma * numerator / (denominator * denominator); // compute the derivative wrt Ri + // dri = -dfc*gijk*ex_delr*rik_hat; + // dri += fc*gijk_d*ex_delr*dcosdri; + // dri += fc*gijk*ex_delr_d*(rik_hat - rij_hat); + const F_FLOAT fc = ters_fc(rik, params[iparam].bigr, params[iparam].bigd); + const F_FLOAT dfc = ters_fc_d(rik, params[iparam].bigr, params[iparam].bigd); + + + vec3_scale(-dfc * gijk * ex_delr, rik_hat, dri); + vec3_scaleadd(fc * gijk_d * ex_delr, dcosdri, dri, dri); + vec3_scaleadd(fc * gijk * ex_delr_d, rik_hat, dri, dri); + vec3_scaleadd(-fc * gijk * ex_delr_d, rij_hat, dri, dri); + vec3_scale(prefactor, dri, dri); + // compute the derivative wrt Rj + // drj = fc*gijk_d*ex_delr*dcosdrj; + // drj += fc*gijk*ex_delr_d*rij_hat; + + vec3_scale(fc * gijk_d * ex_delr, dcosdrj, drj); + vec3_scaleadd(fc * gijk * ex_delr_d, rij_hat, drj, drj); + vec3_scale(prefactor, drj, drj); + + // compute the derivative wrt Rk + // drk = dfc*gijk*ex_delr*rik_hat; + // drk += fc*gijk_d*ex_delr*dcosdrk; + // drk += -fc*gijk*ex_delr_d*rik_hat; + + vec3_scale(dfc * gijk * ex_delr, rik_hat, drk); + vec3_scaleadd(fc * 
gijk_d * ex_delr, dcosdrk, drk, drk); + vec3_scaleadd(-fc * gijk * ex_delr_d, rik_hat, drk, drk); + vec3_scale(prefactor, drk, drk); +} + +__device__ void ters_zetaterm_d_fi(F_FLOAT &prefactor, + F_FLOAT3 &rij_hat, F_FLOAT &rij, + F_FLOAT3 &rik_hat, F_FLOAT &rik, + F_FLOAT3 &dri, int &iparam) +{ + F_FLOAT ex_delr, ex_delr_d, tmp; + + if(params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik)); + else tmp = params[iparam].lam3 * (rij - rik); + + if(tmp > F_F(69.0776)) ex_delr = F_F(1.e30); + else if(tmp < -F_F(69.0776)) ex_delr = F_F(0.0); + else ex_delr = exp(tmp); + + if(params[iparam].powermint == 3) + ex_delr_d = F_F(3.0) * (params[iparam].lam3 * params[iparam].lam3 * params[iparam].lam3) * (rij - rik) * (rij - rik) * ex_delr; + else ex_delr_d = params[iparam].lam3 * ex_delr; + + const F_FLOAT cos_theta = vec3_dot(rij_hat, rik_hat); + //costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); + + + F_FLOAT3 dcosdri; + vec3_scaleadd(-cos_theta, rij_hat, rik_hat, dri); + vec3_scale(F_F(1.0) / rij, dri, dri); + vec3_scaleadd(-cos_theta, rik_hat, rij_hat, dcosdri); + vec3_scale(F_F(1.0) / rik, dcosdri, dcosdri); + vec3_add(dri, dcosdri, dcosdri); + vec3_scale(-F_F(1.0), dcosdri, dcosdri); + + const F_FLOAT gijk = params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d) - + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta))); + const F_FLOAT numerator = -F_F(2.0) * params[iparam].c * params[iparam].c * (params[iparam].h - cos_theta); + const F_FLOAT denominator = (params[iparam].d * params[iparam].d) + + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta); + const F_FLOAT gijk_d = params[iparam].gamma * numerator / (denominator * denominator); // compute the derivative wrt Ri + // + const F_FLOAT fc = ters_fc(rik, params[iparam].bigr, params[iparam].bigd); + const F_FLOAT dfc = ters_fc_d(rik, params[iparam].bigr, params[iparam].bigd); + + vec3_scale(-dfc * gijk * ex_delr, rik_hat, dri); + vec3_scaleadd(fc * gijk_d * ex_delr, dcosdri, dri, dri); + vec3_scaleadd(fc * gijk * ex_delr_d, rik_hat, dri, dri); + vec3_scaleadd(-fc * gijk * ex_delr_d, rij_hat, dri, dri); + vec3_scale(prefactor, dri, dri); + +} + +__device__ void ters_zetaterm_d_fj(F_FLOAT &prefactor, + F_FLOAT3 &rij_hat, F_FLOAT &rij, + F_FLOAT3 &rik_hat, F_FLOAT &rik, + F_FLOAT3 &drj, int &iparam) +{ + F_FLOAT ex_delr, ex_delr_d, tmp; + + if(params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik)); + else tmp = params[iparam].lam3 * (rij - rik); + + if(tmp > F_F(69.0776)) ex_delr = F_F(1.e30); + else if(tmp < -F_F(69.0776)) ex_delr = F_F(0.0); + else ex_delr = exp(tmp); + + if(params[iparam].powermint == 3) + ex_delr_d = F_F(3.0) * (params[iparam].lam3 * params[iparam].lam3 * params[iparam].lam3) * (rij - rik) * (rij - rik) * ex_delr; + else ex_delr_d = params[iparam].lam3 * ex_delr; + + const F_FLOAT cos_theta = vec3_dot(rij_hat, rik_hat); + vec3_scaleadd(-cos_theta, rij_hat, rik_hat, drj); + vec3_scale(F_F(1.0) / rij, drj, drj); + + const F_FLOAT gijk = params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d) - + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d + (params[iparam].h - cos_theta) * (params[iparam].h - 
cos_theta))); + const F_FLOAT numerator = -F_F(2.0) * params[iparam].c * params[iparam].c * (params[iparam].h - cos_theta); + const F_FLOAT denominator = (params[iparam].d * params[iparam].d) + + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta); + const F_FLOAT gijk_d = params[iparam].gamma * numerator / (denominator * denominator); // compute the derivative wrt Ri + + const F_FLOAT fc = ters_fc(rik, params[iparam].bigr, params[iparam].bigd); + + vec3_scale(fc * gijk_d * ex_delr, drj, drj); + vec3_scaleadd(fc * gijk * ex_delr_d, rij_hat, drj, drj); + vec3_scale(prefactor, drj, drj); +} + +__device__ void ters_zetaterm_d_fk(F_FLOAT &prefactor, + F_FLOAT3 &rij_hat, F_FLOAT &rij, + F_FLOAT3 &rik_hat, F_FLOAT &rik, + F_FLOAT3 &drk, int &iparam) +{ + F_FLOAT ex_delr, ex_delr_d, tmp; + + if(params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik)); + else tmp = params[iparam].lam3 * (rij - rik); + + if(tmp > F_F(69.0776)) ex_delr = F_F(1.e30); + else if(tmp < -F_F(69.0776)) ex_delr = F_F(0.0); + else ex_delr = exp(tmp); + + if(params[iparam].powermint == 3) + ex_delr_d = F_F(3.0) * (params[iparam].lam3 * params[iparam].lam3 * params[iparam].lam3) * (rij - rik) * (rij - rik) * ex_delr; + else ex_delr_d = params[iparam].lam3 * ex_delr; + + const F_FLOAT cos_theta = vec3_dot(rij_hat, rik_hat); + vec3_scaleadd(-cos_theta, rik_hat, rij_hat, drk); + vec3_scale(F_F(1.0) / rik, drk, drk); + + const F_FLOAT gijk = params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d) - + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta))); + const F_FLOAT numerator = -F_F(2.0) * params[iparam].c * params[iparam].c * (params[iparam].h - cos_theta); + const F_FLOAT denominator = (params[iparam].d * params[iparam].d) + + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta); + const F_FLOAT gijk_d = params[iparam].gamma * numerator / (denominator * denominator); // compute the derivative wrt Ri + + const F_FLOAT fc = ters_fc(rik, params[iparam].bigr, params[iparam].bigd); + const F_FLOAT dfc = ters_fc_d(rik, params[iparam].bigr, params[iparam].bigd); + + vec3_scale(fc * gijk_d * ex_delr, drk, drk); + vec3_scaleadd(dfc * gijk * ex_delr, rik_hat, drk, drk); + vec3_scaleadd(-fc * gijk * ex_delr_d, rik_hat, drk, drk); + vec3_scale(prefactor, drk, drk); +} + +__device__ void attractive(int iparam, F_FLOAT prefactor, + F_FLOAT4 &delij, + F_FLOAT4 &delik, + F_FLOAT3 &fi, F_FLOAT3 &fj, F_FLOAT3 &fk) +{ + F_FLOAT3 rij_hat, rik_hat; + F_FLOAT rij, rijinv, rik, rikinv; + + rij = sqrt(delij.w); + rijinv = F_F(1.0) / rij; + vec3_scale(rijinv, delij, rij_hat); + + rik = sqrt(delik.w); + rikinv = F_F(1.0) / rik; + vec3_scale(rikinv, delik, rik_hat); + + ters_zetaterm_d(prefactor, rij_hat, rij, rik_hat, rik, fi, fj, fk, iparam); +} + +__device__ void attractive_fi(int &iparam, F_FLOAT &prefactor, + F_FLOAT4 &delij, + F_FLOAT4 &delik, + F_FLOAT3 &f) +{ + F_FLOAT3 rij_hat, rik_hat; + F_FLOAT rij, rijinv, rik, rikinv; + + rij = sqrt(delij.w); + rijinv = F_F(1.0) / rij; + vec3_scale(rijinv, delij, rij_hat); + + rik = sqrt(delik.w); + rikinv = F_F(1.0) / rik; + vec3_scale(rikinv, delik, rik_hat); + + ters_zetaterm_d_fi(prefactor, rij_hat, rij, rik_hat, rik, f, iparam); +} + +__device__ void attractive_fj(int iparam, F_FLOAT prefactor, + F_FLOAT4 &delij, + F_FLOAT4 &delik, + F_FLOAT3 
&f) +{ + F_FLOAT3 rij_hat, rik_hat; + F_FLOAT rij, rijinv, rik, rikinv; + + rij = sqrt(delij.w); + rijinv = F_F(1.0) / rij; + vec3_scale(rijinv, delij, rij_hat); + + rik = sqrt(delik.w); + rikinv = F_F(1.0) / rik; + vec3_scale(rikinv, delik, rik_hat); + + ters_zetaterm_d_fj(prefactor, rij_hat, rij, rik_hat, rik, f, iparam); +} + +__device__ void attractive_fk(int iparam, F_FLOAT prefactor, + F_FLOAT4 &delij, + F_FLOAT4 &delik, + F_FLOAT3 &f) +{ + F_FLOAT3 rij_hat, rik_hat; + F_FLOAT rij, rijinv, rik, rikinv; + + rij = sqrt(delij.w); + rijinv = F_F(1.0) / rij; + vec3_scale(rijinv, delij, rij_hat); + + rik = sqrt(delik.w); + rikinv = F_F(1.0) / rik; + vec3_scale(rikinv, delik, rik_hat); + + ters_zetaterm_d_fk(prefactor, rij_hat, rij, rik_hat, rik, f, iparam); +} + +__global__ void Pair_Tersoff_Kernel_TpA_RIJ()//F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) +{ + int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(ii >= _nall) return; + + X_FLOAT4 myxtype; + F_FLOAT4 delij; + F_FLOAT xtmp, ytmp, ztmp; + int itype, jnum, i, j; + int* jlist; + int neigh_red = 0; + i = ii;//_ilist[ii]; + myxtype = fetchXType(i); + + xtmp = myxtype.x; + ytmp = myxtype.y; + ztmp = myxtype.z; + itype = map[(static_cast <int>(myxtype.w))]; + + jnum = _numneigh[i]; + jlist = &_neighbors[i]; + + __syncthreads(); + + for(int jj = 0; jj < jnum; jj++) { + if(jj < jnum) { + + j = jlist[jj * _nall]; + j &= NEIGHMASK; + myxtype = fetchXType(j); + delij.x = xtmp - myxtype.x; + delij.y = ytmp - myxtype.y; + delij.z = ztmp - myxtype.z; + int jtype = map[(static_cast <int>(myxtype.w))]; + int iparam_ij = elem2param[(itype * nelements + jtype) * nelements + jtype]; + delij.w = vec3_dot(delij, delij); + + if(delij.w < params[iparam_ij].cutsq) { + _glob_neighbors_red[i + neigh_red * _nall] = j; + _glob_neightype_red[i + neigh_red * _nall] = jtype; + _glob_r_ij[i + neigh_red * _nall] = delij; + neigh_red++; + } + } + } + + _glob_numneigh_red[i] = neigh_red; +} + + +__global__ void Pair_Tersoff_Kernel_TpA_ZetaIJ()//F_FLOAT* _glob_zeta_ij,F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) +{ + + int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + if(ii >= _nall) return; + + + F_FLOAT4 delij; + F_FLOAT4 delik; + + int itype, jnum, i, j; + int* jlist; + i = ii; + itype = map[(static_cast <int>(_type[i]))]; + + jnum = _glob_numneigh_red[i]; + jlist = &_glob_neighbors_red[i]; + + __syncthreads(); + + for(int jj = 0; jj < jnum; jj++) { + if(jj < jnum) { + + j = jlist[jj * _nall]; + j &= NEIGHMASK; + int jtype = _glob_neightype_red[i + jj * _nall]; + delij = _glob_r_ij[i + jj * _nall]; + + int iparam_ij = elem2param[(itype * nelements + jtype) * nelements + jtype]; + + if(delij.w < params[iparam_ij].cutsq) { + F_FLOAT zeta_ij = 0.0; + F_FLOAT3 delij3 = {delij.x, delij.y, delij.z}; + + for(int kk = 0; kk < jnum; kk++) { + if(jj == kk) continue; + + int k = jlist[kk * _nall]; + k &= NEIGHMASK; + + int ktype = _glob_neightype_red[i + kk * _nall]; + delik = _glob_r_ij[i + kk * _nall]; + F_FLOAT3 delik3 = {delik.x, delik.y, delik.z}; + int iparam_ijk = elem2param[(itype * nelements + jtype) * nelements + ktype]; + const F_FLOAT rsqki = delik.w; + + if(rsqki <= params[iparam_ijk].cutsq) + zeta_ij += zeta(iparam_ijk, delij.w, rsqki, delij3, delik3); + } + + _glob_zeta_ij[i + jj * _nall] = zeta_ij; + } + } + } +} + +//back3: num 12 steps 10: ZetaIJ/TPA 0.255/0.106 +//back5: num 12 steps 10: 
ZetaIJ/TPA 0.257/0.098 +//back6: num 12 steps 10: ZetaIJ/TPA 0.027/0.097 /rij berechnung extra +//back12: num 12 steps 10: ZetaIJ/TPA 0.026/0.070 +//back15: num 12 steps 10: ZetaIJ/TPA 0.0137/0.0287 //pow beseitigt +// num 12 steps 10: ZetaIJ/TPA 0.0137/0.027 +template <int eflag, int vflagm> +__global__ void Pair_Tersoff_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLOAT* _glob_zeta_ij,F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red) +{ + ENERGY_FLOAT evdwl = ENERGY_F(0.0); + + ENERGY_FLOAT* sharedE = &sharedmem[threadIdx.x]; + ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x]; + + F_FLOAT* shared_F_F = (F_FLOAT*) sharedmem; + + if((eflag || eflag_atom) && (vflagm || vflag_atom)) shared_F_F = (F_FLOAT*) &sharedmem[7 * blockDim.x]; + else if(eflag) shared_F_F = (F_FLOAT*) &sharedmem[blockDim.x]; + else if(vflagm) shared_F_F = (F_FLOAT*) &sharedmem[6 * blockDim.x]; + + shared_F_F += threadIdx.x; + + if(eflag_atom || eflag) { + sharedE[0] = ENERGY_F(0.0); + sharedV += blockDim.x; + } + + if(vflagm || vflag_atom) { + sharedV[0 * blockDim.x] = ENERGY_F(0.0); + sharedV[1 * blockDim.x] = ENERGY_F(0.0); + sharedV[2 * blockDim.x] = ENERGY_F(0.0); + sharedV[3 * blockDim.x] = ENERGY_F(0.0); + sharedV[4 * blockDim.x] = ENERGY_F(0.0); + sharedV[5 * blockDim.x] = ENERGY_F(0.0); + } + + int jnum_red = 0; +#define fxtmp shared_F_F[0] +#define fytmp shared_F_F[blockDim.x] +#define fztmp shared_F_F[2*blockDim.x] + //#define jnum_red (static_cast <int> (shared_F_F[3*blockDim.x])) + + int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + + X_FLOAT4 myxtype_i, myxtype_j, myxtype_k; + F_FLOAT4 delij, delik, deljk; + F_FLOAT fpair; + F_FLOAT prefactor_ij, prefactor_ji; + + int itype, i, j; + int* jlist_red; + + if(ii < _inum) { + i = _ilist[ii]; + + if(vflagm) + myxtype_i = fetchXType(i); + + //itype=map[(static_cast <int> (myxtype_i.w))]; + itype = map[_type[i]]; + + + fxtmp = F_F(0.0); + fytmp = F_F(0.0); + fztmp = F_F(0.0); + + + //shared_F_F[3*blockDim.x] = _glob_numneigh_red[i]; + jnum_red = _glob_numneigh_red[i]; + jlist_red = &_glob_neighbors_red[i]; + } + + __syncthreads(); + +#pragma unroll 1 + + for(int jj = 0; jj < jnum_red; jj++) { + if(i < _nlocal) { + fpair = F_F(0.0); + j = jlist_red[jj * _nall]; + j &= NEIGHMASK; + + if(vflagm) + myxtype_j = fetchXType(j); + + int jtype = _glob_neightype_red[i + jj * _nall]; + delij = _glob_r_ij[i + jj * _nall]; + + volatile int iparam_ij = elem2param[(itype * nelements + jtype) * nelements + jtype]; + volatile int iparam_ji = elem2param[(jtype * nelements + itype) * nelements + itype]; + + if(delij.w < params[iparam_ij].cutsq) { + F_FLOAT dxfp, dyfp, dzfp; + repulsive(iparam_ij, delij.w, fpair, eflag, evdwl); + fxtmp += dxfp = delij.x * fpair; + fytmp += dyfp = delij.y * fpair; + fztmp += dzfp = delij.z * fpair; + + if(vflagm) { + sharedV[0 * blockDim.x] += delij.x * dxfp; + sharedV[1 * blockDim.x] += delij.y * dyfp; + sharedV[2 * blockDim.x] += delij.z * dzfp; + sharedV[3 * blockDim.x] += delij.x * dyfp; + sharedV[4 * blockDim.x] += delij.x * dzfp; + sharedV[5 * blockDim.x] += delij.y * dzfp; + } + + + + force_zeta(iparam_ij, delij.w, _glob_zeta_ij[i + jj * _nall], fpair, prefactor_ij, eflag, evdwl); + fxtmp -= + dxfp = delij.x * fpair; + fytmp -= + dyfp = delij.y * fpair; + fztmp -= + dzfp = delij.z * fpair; + + if(vflagm) { + sharedV[0 * blockDim.x] -= ENERGY_F(2.0) * delij.x * dxfp; + sharedV[1 * blockDim.x] -= ENERGY_F(2.0) * delij.y * dyfp; + sharedV[2 * blockDim.x] -= ENERGY_F(2.0) * 
delij.z * dzfp; + sharedV[3 * blockDim.x] -= ENERGY_F(2.0) * delij.x * dyfp; + sharedV[4 * blockDim.x] -= ENERGY_F(2.0) * delij.x * dzfp; + sharedV[5 * blockDim.x] -= ENERGY_F(2.0) * delij.y * dzfp; + } + + int j_jj = 0; + + //#pragma unroll 1 + for(int kk = 0; kk < _glob_numneigh_red[j]; kk++) { + if(_glob_neighbors_red[j + kk * _nall] == i) j_jj = kk; + } + + force_zeta_prefactor_force(iparam_ji, delij.w, _glob_zeta_ij[j + j_jj * _nall], fpair, prefactor_ji); + + fxtmp -= + dxfp = delij.x * fpair; + fytmp -= + dyfp = delij.y * fpair; + fztmp -= + dzfp = delij.z * fpair; + + + + vec3_scale(F_F(-1.0), delij, delij); + +#pragma unroll 1 + + for(int kk = 0; kk < jnum_red; kk++) { + if(jj == kk) continue; + + int k = jlist_red[kk * _nall]; + k &= NEIGHMASK; + + if(vflagm) + myxtype_k = fetchXType(k); + + delik = _glob_r_ij[i + kk * _nall]; + + int ktype = _glob_neightype_red[i + kk * _nall]; + int iparam_ijk = elem2param[(itype * nelements + jtype) * nelements + ktype]; + vec3_scale(F_F(-1.0), delik, delik); + + if(delik.w <= params[iparam_ijk].cutsq) { + if(vflagm) { + F_FLOAT3 fi, fj, fk; + attractive(iparam_ijk, prefactor_ij, + delij, delik, fi, fj, fk); + fxtmp += fi.x; + fytmp += fi.y; + fztmp += fi.z; + + sharedV[0 * blockDim.x] += ENERGY_F(2.0) * myxtype_i.x * fi.x; + sharedV[1 * blockDim.x] += ENERGY_F(2.0) * myxtype_i.y * fi.y; + sharedV[2 * blockDim.x] += ENERGY_F(2.0) * myxtype_i.z * fi.z; + sharedV[3 * blockDim.x] += ENERGY_F(2.0) * myxtype_i.x * fi.y; + sharedV[4 * blockDim.x] += ENERGY_F(2.0) * myxtype_i.x * fi.z; + sharedV[5 * blockDim.x] += ENERGY_F(2.0) * myxtype_i.y * fi.z; + + sharedV[0 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.x * fj.x; + sharedV[1 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.y * fj.y; + sharedV[2 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.z * fj.z; + sharedV[3 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.x * fj.y; + sharedV[4 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.x * fj.z; + sharedV[5 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.y * fj.z; + + sharedV[0 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.x * fk.x; + sharedV[1 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.y * fk.y; + sharedV[2 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.z * fk.z; + sharedV[3 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.x * fk.y; + sharedV[4 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.x * fk.z; + sharedV[5 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.y * fk.z; + } else { + F_FLOAT3 fi; //local variable + attractive_fi(iparam_ijk, prefactor_ij, + delij, delik, fi); + fxtmp += fi.x; + fytmp += fi.y; + fztmp += fi.z; + + } + } + } + + int j_jnum_red = _glob_numneigh_red[j]; + int* j_jlist_red = &_glob_neighbors_red[j]; + + int j_ii = 0; + + //#pragma unroll 1 + for(int j_kk = 0; j_kk < j_jnum_red; j_kk++) { + if(j_jlist_red[j_kk * _nall] == i) j_ii = j_kk; + } + +#pragma unroll 1 + + for(int kk = 0; kk < j_jnum_red; kk++) { + if(j_ii == kk) continue; + + int k = j_jlist_red[kk * _nall]; + k &= NEIGHMASK; + deljk = _glob_r_ij[j + kk * _nall]; + vec3_scale(F_F(-1.0), deljk, deljk); + int ktype = _glob_neightype_red[j + kk * _nall]; + + int iparam_jik = elem2param[(jtype * nelements + itype) * nelements + ktype]; + int iparam_jki = elem2param[(jtype * nelements + ktype) * nelements + itype]; + + + vec3_scale(F_F(-1.0), delij, delij); + + if(deljk.w <= params[iparam_jik].cutsq) { + F_FLOAT3 ftmp; //local variable + + attractive_fj(iparam_jik, prefactor_ji, + delij, deljk, ftmp); + fxtmp += ftmp.x; + fytmp += ftmp.y; + fztmp += ftmp.z; + int iparam_jk = elem2param[(jtype * nelements + ktype) * 
nelements + ktype]; + F_FLOAT prefactor_jk; + force_zeta_prefactor(iparam_jk, deljk.w, _glob_zeta_ij[j + kk * _nall], prefactor_jk); + + attractive_fk(iparam_jki, prefactor_jk, + deljk, delij, ftmp); + fxtmp += ftmp.x; + fytmp += ftmp.y; + fztmp += ftmp.z; + + } + + vec3_scale(F_F(-1.0), delij, delij); + } + } + } + + } + + __syncthreads(); + + if(ii < _inum) { + F_FLOAT* my_f; + + if(_collect_forces_later) { + ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer; + + if(eflag) { + buffer = &buffer[1 * gridDim.x * gridDim.y]; + } + + if(vflagm) { + buffer = &buffer[6 * gridDim.x * gridDim.y]; + } + + my_f = (F_FLOAT*) buffer; + my_f += i; + *my_f = fxtmp; + my_f += _nmax; + *my_f = fytmp; + my_f += _nmax; + *my_f = fztmp; + } else { + my_f = _f + i; + *my_f += fxtmp; + my_f += _nmax; + *my_f += fytmp; + my_f += _nmax; + *my_f += fztmp; + } + } + + __syncthreads(); + + if(eflag) { + sharedE[0] = evdwl; + } + + if(eflag_atom && i < _nlocal) { + _eatom[i] = ENERGY_F(0.5) * evdwl; + } + + if(vflag_atom && i < _nlocal) { + _vatom[i] = ENERGY_F(0.5) * sharedV[0 * blockDim.x]; + _vatom[i + _nmax] = ENERGY_F(0.5) * sharedV[1 * blockDim.x]; + _vatom[i + 2 * _nmax] = ENERGY_F(0.5) * sharedV[2 * blockDim.x]; + _vatom[i + 3 * _nmax] = ENERGY_F(0.5) * sharedV[3 * blockDim.x]; + _vatom[i + 4 * _nmax] = ENERGY_F(0.5) * sharedV[4 * blockDim.x]; + _vatom[i + 5 * _nmax] = ENERGY_F(0.5) * sharedV[5 * blockDim.x]; + } + + if(vflagm && eflag) PairVirialCompute_A_Kernel_Template<1, 1>(); + else if(eflag) PairVirialCompute_A_Kernel_Template<1, 0>(); + else if(vflagm) PairVirialCompute_A_Kernel_Template<0, 1>(); + +#undef fxtmp +#undef fytmp +#undef fztmp + //#undef jnum_red +} diff --git a/lib/cuda/pair_virial_compute_cu.h b/lib/cuda/pair_virial_compute_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..872471537e573ccf6700bf51d60fbd5f4daaba1a --- /dev/null +++ b/lib/cuda/pair_virial_compute_cu.h @@ -0,0 +1,26 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_shared.h" + +extern "C" void Cuda_PairVirialCompute(cuda_shared_data* sdata, int offset, int end); diff --git a/lib/cuda/pppm_cuda.cu b/lib/cuda/pppm_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..19c2a23a68e8e9805f5bd0684496775291f0bef1 --- /dev/null +++ b/lib/cuda/pppm_cuda.cu @@ -0,0 +1,588 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. 
+ + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#include "cuda_precision.h" +//#define FFT_CUFFT +#define MY_PREFIX pppm +#include "cuda_shared.h" +#include "cuda_common.h" +#include "pppm_cuda_cu.h" +#include "cuda_runtime.h" +#include <stdio.h> + +//#include "crm_cuda_utils.cu" +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +__device__ __constant__ FFT_FLOAT* work1; +__device__ __constant__ FFT_FLOAT* work2; +__device__ __constant__ FFT_FLOAT* work3; +__device__ __constant__ PPPM_FLOAT* greensfn; +__device__ __constant__ PPPM_FLOAT* gf_b; +__device__ __constant__ PPPM_FLOAT* fkx; +__device__ __constant__ PPPM_FLOAT* fky; +__device__ __constant__ PPPM_FLOAT* fkz; +__device__ __constant__ PPPM_FLOAT* vg; +__device__ __constant__ int* part2grid; +__device__ __constant__ PPPM_FLOAT* density_brick; +__device__ __constant__ int* density_brick_int; +__device__ __constant__ PPPM_FLOAT density_intScale; +__device__ __constant__ PPPM_FLOAT* vdx_brick; +__device__ __constant__ PPPM_FLOAT* vdy_brick; +__device__ __constant__ PPPM_FLOAT* vdz_brick; +__device__ __constant__ PPPM_FLOAT* density_fft; +__device__ __constant__ ENERGY_FLOAT* energy; +__device__ __constant__ ENERGY_FLOAT* virial; +__device__ __constant__ int nxlo_in; +__device__ __constant__ int nxhi_in; +__device__ __constant__ int nxlo_out; +__device__ __constant__ int nxhi_out; +__device__ __constant__ int nylo_in; +__device__ __constant__ int nyhi_in; +__device__ __constant__ int nylo_out; +__device__ __constant__ int nyhi_out; +__device__ __constant__ int nzlo_in; +__device__ __constant__ int nzhi_in; +__device__ __constant__ int nzlo_out; +__device__ __constant__ int nzhi_out; +__device__ __constant__ int nxlo_fft; +__device__ __constant__ int nxhi_fft; +__device__ __constant__ int nylo_fft; +__device__ __constant__ int nyhi_fft; +__device__ __constant__ int nzlo_fft; +__device__ __constant__ int nzhi_fft; +__device__ __constant__ int nx_pppm; +__device__ __constant__ int ny_pppm; +__device__ __constant__ int nz_pppm; +__device__ __constant__ int slabflag; +__device__ __constant__ PPPM_FLOAT qqrd2e; +__device__ __constant__ int order; +//__device__ __constant__ float3 sublo; +__device__ __constant__ PPPM_FLOAT* rho_coeff; +__device__ __constant__ int nmax; +__device__ __constant__ int nlocal; +__device__ __constant__ PPPM_FLOAT* debugdata; +__device__ __constant__ PPPM_FLOAT delxinv; +__device__ __constant__ PPPM_FLOAT delyinv; +__device__ __constant__ PPPM_FLOAT delzinv; +__device__ __constant__ int nlower; +__device__ __constant__ int nupper; +__device__ __constant__ PPPM_FLOAT shiftone; + + +#include "pppm_cuda_kernel.cu" +#include "stdio.h" +void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_brick, void* cu_vdz_brick, void* cu_density_fft, void* cu_energy, void* cu_virial + , void* cu_work1, void* cu_work2, void* cu_work3, void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg + , int cu_nxlo_in, int cu_nxhi_in, int cu_nylo_in, int cu_nyhi_in, int 
cu_nzlo_in, int cu_nzhi_in, int cu_nxlo_out, int cu_nxhi_out, int cu_nylo_out, int cu_nyhi_out, int cu_nzlo_out, int cu_nzhi_out, int cu_nx_pppm, int cu_ny_pppm, int cu_nz_pppm + , int cu_nxlo_fft, int cu_nxhi_fft, int cu_nylo_fft, int cu_nyhi_fft, int cu_nzlo_fft, int cu_nzhi_fft, void* cu_gf_b + , double cu_qqrd2e, int cu_order, void* cu_rho_coeff, void* cu_debugdata, void* cu_density_brick_int, int cu_slabflag + ) +{ + CUT_CHECK_ERROR("ERROR-CUDA poisson_init Start"); + cudaMemcpyToSymbol(density_brick, &cu_density_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(density_brick_int, &cu_density_brick_int, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(vdx_brick, &cu_vdx_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(vdy_brick, &cu_vdy_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(vdz_brick, &cu_vdz_brick, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(density_fft, &cu_density_fft, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(energy, &cu_energy, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(virial, &cu_virial, sizeof(ENERGY_FLOAT*)); + cudaMemcpyToSymbol(nxlo_in, &cu_nxlo_in, sizeof(int)); + cudaMemcpyToSymbol(nxhi_in, &cu_nxhi_in, sizeof(int)); + cudaMemcpyToSymbol(nxlo_out, &cu_nxlo_out, sizeof(int)); + cudaMemcpyToSymbol(nxhi_out, &cu_nxhi_out, sizeof(int)); + cudaMemcpyToSymbol(nylo_in, &cu_nylo_in, sizeof(int)); + cudaMemcpyToSymbol(nyhi_in, &cu_nyhi_in, sizeof(int)); + cudaMemcpyToSymbol(nylo_out, &cu_nylo_out, sizeof(int)); + cudaMemcpyToSymbol(nyhi_out, &cu_nyhi_out, sizeof(int)); + cudaMemcpyToSymbol(nzlo_in, &cu_nzlo_in, sizeof(int)); + cudaMemcpyToSymbol(nzhi_in, &cu_nzhi_in, sizeof(int)); + cudaMemcpyToSymbol(nzlo_out, &cu_nzlo_out, sizeof(int)); + cudaMemcpyToSymbol(nzhi_out, &cu_nzhi_out, sizeof(int)); + cudaMemcpyToSymbol(nxlo_fft, &cu_nxlo_fft, sizeof(int)); + cudaMemcpyToSymbol(nxhi_fft, &cu_nxhi_fft, sizeof(int)); + cudaMemcpyToSymbol(nylo_fft, &cu_nylo_fft, sizeof(int)); + cudaMemcpyToSymbol(nyhi_fft, &cu_nyhi_fft, sizeof(int)); + cudaMemcpyToSymbol(nzlo_fft, &cu_nzlo_fft, sizeof(int)); + cudaMemcpyToSymbol(nzhi_fft, &cu_nzhi_fft, sizeof(int)); + cudaMemcpyToSymbol(slabflag, &cu_slabflag, sizeof(int)); + cudaMemcpyToSymbol(nx_pppm, &cu_nx_pppm, sizeof(int)); + cudaMemcpyToSymbol(ny_pppm, &cu_ny_pppm, sizeof(int)); + cudaMemcpyToSymbol(nz_pppm, &cu_nz_pppm, sizeof(int)); + cudaMemcpyToSymbol(work1, &cu_work1, sizeof(FFT_FLOAT*)); + cudaMemcpyToSymbol(work2, &cu_work2, sizeof(FFT_FLOAT*)); + cudaMemcpyToSymbol(work3, &cu_work3, sizeof(FFT_FLOAT*)); + cudaMemcpyToSymbol(greensfn, &cu_greensfn, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(gf_b, &cu_gf_b, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(fkx, &cu_fkx, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(fky, &cu_fky, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(fkz, &cu_fkz, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(vg, &cu_vg, sizeof(PPPM_FLOAT*)); + + PPPM_FLOAT cu_qqrd2e_a = cu_qqrd2e; + cudaMemcpyToSymbol(qqrd2e, &cu_qqrd2e_a, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol(order, &cu_order, sizeof(int)); + cudaMemcpyToSymbol(rho_coeff, &cu_rho_coeff, sizeof(PPPM_FLOAT*)); + cudaMemcpyToSymbol(debugdata, &cu_debugdata, sizeof(PPPM_FLOAT*)); + + CUT_CHECK_ERROR("ERROR-CUDA poisson_init"); + + /*if(sizeof(CUDA_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision\n"); + + #ifdef PPPM_PRECISION + if(sizeof(PPPM_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for pppm core\n"); + if(sizeof(PPPM_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for pppm core\n"); + #endif + 
#ifdef ENERGY_PRECISION + if(sizeof(ENERGY_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for energy\n"); + if(sizeof(ENERGY_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for energy\n"); + #endif + #ifdef ENERGY_PRECISION + if(sizeof(FFT_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for fft\n"); + if(sizeof(FFT_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for fft\n"); + #endif + #ifdef X_PRECISION + if(sizeof(X_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for positions\n"); + if(sizeof(X_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for positions\n"); + #endif + #ifdef F_PRECISION + if(sizeof(F_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for forces\n"); + if(sizeof(F_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for forces\n"); + #endif*/ +} + +void pppm_device_init_setup(cuda_shared_data* sdata, PPPM_FLOAT cu_shiftone, PPPM_FLOAT cu_delxinv, PPPM_FLOAT cu_delyinv, PPPM_FLOAT cu_delzinv, int cu_nlower, int cu_nupper) +{ + cudaMemcpyToSymbol(delxinv, &cu_delxinv, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol(delyinv, &cu_delyinv, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol(delzinv, &cu_delzinv, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol(shiftone, &cu_shiftone, sizeof(PPPM_FLOAT)); + cudaMemcpyToSymbol(nlower, &cu_nlower, sizeof(int)); + cudaMemcpyToSymbol(nupper, &cu_nupper, sizeof(int)); + cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo, 3 * sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(subhi) , sdata->domain.subhi, 3 * sizeof(X_FLOAT)); + cudaMemcpyToSymbol(MY_AP(boxlo) , sdata->domain.boxlo, 3 * sizeof(X_FLOAT)); + CUT_CHECK_ERROR("ERROR-CUDA pppm_init_setup"); +} + +void pppm_device_update(cuda_shared_data* sdata, void* cu_part2grid, int nlocala, int nmaxa) +{ + cudaMemcpyToSymbol(part2grid, &cu_part2grid, sizeof(int*)); + cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*)); + cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*)); + //cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal .dev_data, sizeof(int)); + cudaMemcpyToSymbol(nlocal , &nlocala, sizeof(int)); + cudaMemcpyToSymbol(nmax , &nmaxa, sizeof(int)); + CUT_CHECK_ERROR("ERROR-CUDA pppm_device_update"); + +} + +void pppm_update_nlocal(int nlocala) +{ + cudaMemcpyToSymbol(nlocal , &nlocala, sizeof(int)); + CUT_CHECK_ERROR("ERROR-CUDA update_nlocal b"); +} + + +void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald) +{ + dim3 grid; + dim3 threads; + grid.x = nz_pppma; + grid.y = ny_pppma; + grid.z = 1; + threads.x = nx_pppma; + threads.y = 1; + threads.z = 1; + setup_fkxyz_vg <<< grid, threads, 0>>>(unitkx, unitky, unitkz, g_ewald); + cudaThreadSynchronize(); + + CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_fkxyz_vg "); +} + +void Cuda_PPPM_setup_greensfn(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald, + int nbx, int nby, int nbz, PPPM_FLOAT xprd, PPPM_FLOAT yprd, PPPM_FLOAT zprd_slab) +{ + dim3 grid; + dim3 threads; + grid.x = nz_pppma; + grid.y = ny_pppma; + grid.z = 1; + threads.x = nx_pppma; + threads.y = 1; + threads.z = 1; + setup_greensfn <<< grid, threads, 
0>>>(unitkx, unitky, unitkz, g_ewald, nbx, nby, nbz, xprd, yprd, zprd_slab); + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_greensfn "); +} + +void poisson_scale(int nx_pppma, int ny_pppma, int nz_pppma) +{ + dim3 grid; + dim3 threads; + grid.x = nz_pppma; + grid.y = ny_pppma; + grid.z = 1; + threads.x = nx_pppma; + threads.y = 1; + threads.z = 1; + poisson_scale_kernel <<< grid, threads, 0>>>(); + CUT_CHECK_ERROR("ERROR-CUDA poisson_scale "); + +} + +void poisson_xgrad(int nx_pppma, int ny_pppma, int nz_pppma) +{ + dim3 grid; + dim3 threads; + grid.x = nz_pppma; + grid.y = ny_pppma; + grid.z = 1; + threads.x = nx_pppma; + threads.y = 1; + threads.z = 1; + poisson_xgrad_kernel <<< grid, threads, 0>>>(); + CUT_CHECK_ERROR("ERROR-CUDA poisson_xgrad "); +} + +void poisson_ygrad(int nx_pppma, int ny_pppma, int nz_pppma) +{ + dim3 grid; + dim3 threads; + grid.x = nz_pppma; + grid.y = ny_pppma; + grid.z = 1; + threads.x = nx_pppma; + threads.y = 1; + threads.z = 1; + poisson_ygrad_kernel <<< grid, threads, 0>>>(); + CUT_CHECK_ERROR("ERROR-CUDA poisson_ygrad "); +} + +void poisson_zgrad(int nx_pppma, int ny_pppma, int nz_pppma) +{ + dim3 grid; + dim3 threads; + grid.x = nz_pppma; + grid.y = ny_pppma; + grid.z = 1; + threads.x = nx_pppma; + threads.y = 1; + threads.z = 1; + poisson_zgrad_kernel <<< grid, threads, 0>>>(); + CUT_CHECK_ERROR("ERROR-CUDA poisson_zgrad "); +} + +void poisson_vdx_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppma, int ny_pppma, int nz_pppma) +{ + + dim3 grid; + dim3 threads; + grid.x = khi - klo + 1; + grid.y = jhi - jlo + 1; + grid.z = 1; + threads.x = ihi - ilo + 1; + threads.y = 1; + threads.z = 1; + //printf("VDX_BRICK CUDA: %i %i %i\n",grid.x,grid.y,threads.x); + poisson_vdx_brick_kernel <<< grid, threads, 0>>>(ilo, jlo, klo); + CUT_CHECK_ERROR("ERROR-CUDA poisson_vdxbrick "); + cudaThreadSynchronize(); +} + +void poisson_vdy_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm) +{ + dim3 grid; + dim3 threads; + grid.x = khi - klo + 1; + grid.y = jhi - jlo + 1; + grid.z = 1; + threads.x = ihi - ilo + 1; + threads.y = 1; + threads.z = 1; + poisson_vdy_brick_kernel <<< grid, threads, 0>>>(ilo, jlo, klo); + CUT_CHECK_ERROR("ERROR-CUDA poisson_vdybrick "); + cudaThreadSynchronize(); +} + +void poisson_vdz_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm) +{ + dim3 grid; + dim3 threads; + grid.x = khi - klo + 1; + grid.y = jhi - jlo + 1; + grid.z = 1; + threads.x = ihi - ilo + 1; + threads.y = 1; + threads.z = 1; + poisson_vdz_brick_kernel <<< grid, threads, 0>>>(ilo, jlo, klo); + CUT_CHECK_ERROR("ERROR-CUDA poisson_vdzbrick "); + cudaThreadSynchronize(); +} + + +void poisson_energy(int nxlo_fft, int nxhi_fft, int nylo_fft, int nyhi_fft, int nzlo_fft, int nzhi_fft, int vflag) +{ + //printf("VFLAG_GPU: %i\n",vflag); + CUT_CHECK_ERROR("ERROR-CUDA poisson_energy start "); + dim3 grid; + dim3 threads; + grid.x = nzhi_fft - nzlo_fft + 1; + grid.y = nyhi_fft - nylo_fft + 1; + grid.z = 1; + threads.x = nxhi_fft - nxlo_fft + 1; + threads.y = 1; + threads.z = 1; + poisson_energy_kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(nxlo_fft, nylo_fft, nzlo_fft, vflag); + + cudaThreadSynchronize(); + CUT_CHECK_ERROR("ERROR-CUDA poisson_energy end "); +} + +ENERGY_FLOAT sum_energy(void* cu_virial, void* cu_energy, int nx_pppma, int ny_pppma, int nz_pppma, int vflag, ENERGY_FLOAT* cpu_virial) +{ + ENERGY_FLOAT host_energy = 0; + dim3 
grid;
+  dim3 threads;
+
+  grid.x = nz_pppma;
+  grid.y = 1;
+  grid.z = 1;
+  threads.x = ny_pppma;
+  threads.y = 1;
+  threads.z = 1;
+  sum_energy_kernel1 <<< grid, threads, ny_pppma* sizeof(ENERGY_FLOAT)>>>(vflag);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel1 ");
+
+  grid.x = 1;
+  grid.y = 1;
+  grid.z = 1;
+  threads.x = nz_pppma;
+  threads.y = 1;
+  threads.z = 1;
+  sum_energy_kernel2 <<< grid, threads, nz_pppma* sizeof(ENERGY_FLOAT)>>>(vflag);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel2 ");
+
+  cudaMemcpy((void*)(&host_energy), cu_energy, sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost);
+
+  if(vflag)
+    cudaMemcpy((void*) cpu_virial, (void*) cu_virial, 6 * sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost);
+  CUT_CHECK_ERROR("ERROR-CUDA sumenergy_memcopy");
+
+  return host_energy;
+}
+
+void cuda_make_rho(cuda_shared_data* sdata, void* flag, PPPM_FLOAT* cu_density_intScale, int ihi, int ilo, int jhi, int jlo, int khi, int klo, void* cu_density_brick, void* cu_density_brick_int)
+{
+  CUT_CHECK_ERROR("cuda_make_rho begin");
+  dim3 grid, threads;
+  int cpu_flag[3];
+  grid.x = (sdata->atom.nlocal + 31) / 32;
+  grid.y = 1;
+  grid.z = 1;
+  threads.x = 32;
+  threads.y = 1;
+  threads.z = 1;
+  int sharedmemsize = (32 + 32 * (sdata->pppm.nupper - sdata->pppm.nlower + 1) + sdata->pppm.order * (sdata->pppm.order / 2 - (1 - sdata->pppm.order) / 2 + 1)) * sizeof(PPPM_FLOAT);
+
+  // adaptive fixed-point scaling: grow density_intScale until the integer
+  // accumulators use their upper value bits (cpu_flag[1]), shrink it again
+  // when they overflow (cpu_flag[0])
+  do {
+    cpu_flag[0] = 0;
+    cpu_flag[1] = 0;
+    cpu_flag[2] = 0;
+    // copy the current scale value (not a pointer) to the device symbol
+    cudaMemcpyToSymbol(density_intScale, cu_density_intScale, sizeof(PPPM_FLOAT));
+    CUT_CHECK_ERROR("ERROR-CUDA make_rho pre Z");
+    cudaMemset(flag, 0, 3 * sizeof(int));
+    CUT_CHECK_ERROR("ERROR-CUDA make_rho pre A");
+    cudaMemset(cu_density_brick, 0, (khi - klo + 1) * (jhi - jlo + 1) * (ihi - ilo + 1)*sizeof(PPPM_FLOAT));
+    CUT_CHECK_ERROR("ERROR-CUDA make_rho pre B");
+    cudaMemset(cu_density_brick_int, 0, (khi - klo + 1) * (jhi - jlo + 1) * (ihi - ilo + 1)*sizeof(int));
+    CUT_CHECK_ERROR("ERROR-CUDA make_rho pre C");
+    make_rho_kernel <<< grid, threads, sharedmemsize>>>((int*) flag, 32 / (sdata->pppm.nupper - sdata->pppm.nlower + 1));
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("ERROR-CUDA make_rho A");
+    cudaMemcpy((void*) &cpu_flag, flag, 3 * sizeof(int), cudaMemcpyDeviceToHost);
+
+    if(cpu_flag[0] != 0) {
+      (*cu_density_intScale) /= 2;
+      MYDBG(printf("PPPM_Cuda::cuda_make_rho: Decrease cu_density_intScale to: %e\n", *cu_density_intScale);)
+    }
+    if((cpu_flag[0] == 0) && (cpu_flag[1] == 0)) {
+      (*cu_density_intScale) *= 2;
+      MYDBG(printf("PPPM_Cuda::cuda_make_rho: Increase cu_density_intScale to: %e\n", *cu_density_intScale);)
+    }
+    /* if((*cu_density_intScale)>0xe0000000)
+      {
+        printf("Error Scaling\n");
+        cpu_flag[0]=0;
+        cpu_flag[1]=1;
+      }*/
+    CUT_CHECK_ERROR("ERROR-CUDA make_rho B");
+  } while((cpu_flag[0] != 0) || (cpu_flag[1] == 0));
+
+
+  grid.x = khi - klo + 1;
+  grid.y = jhi - jlo + 1;
+  threads.x = ihi - ilo + 1;
+  scale_rho_kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA make_rho_scale");
+}
+
+
+int cuda_particle_map(cuda_shared_data* sdata, void* flag)
+{
+  dim3 grid, threads;
+  int cpu_flag;
+  grid.x = (sdata->atom.nlocal + 31) / 32;
+  grid.y = 1;
+  grid.z = 1;
+  threads.x = 32;
+  threads.y = 1;
+  threads.z = 1;
+  CUT_CHECK_ERROR("ERROR-CUDA particle_map ..pre");
+  particle_map_kernel <<< grid, threads, 0>>>((int*) flag);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA particle_map a");
+  cudaMemcpy((void*) &cpu_flag, flag, sizeof(int), cudaMemcpyDeviceToHost);
+  CUT_CHECK_ERROR("ERROR-CUDA particle_map b");
+  return cpu_flag;
+}
+
+
+void cuda_fieldforce(cuda_shared_data* sdata, void* flag)
+{
+  dim3 grid, threads;
+  grid.x = (sdata->atom.nlocal + 31) / 32;
+  grid.y = 1;
+  grid.z = 1;
+  threads.x = 32;
+  threads.y = 1;
+  threads.z = 1;
+  int sharedmemsize = (32 + 3 * 32 * (sdata->pppm.nupper - sdata->pppm.nlower + 1) + sdata->pppm.order * (sdata->pppm.order / 2 - (1 - sdata->pppm.order) / 2 + 1)) * sizeof(PPPM_FLOAT);
+  fieldforce_kernel <<< grid, threads, sharedmemsize>>>
+  (sdata->pppm.nupper - sdata->pppm.nlower + 1, 32 / (sdata->pppm.nupper - sdata->pppm.nlower + 1), (int*) flag);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA fieldforce");
+}
+
+double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_FLOAT* dev_buf)
+{
+  dim3 grid, threads;
+  grid.x = (sdata->atom.nlocal + 31) / 32;
+  grid.y = 1;
+  grid.z = 1;
+  threads.x = 32;
+  threads.y = 1;
+  threads.z = 1;
+  slabcorr_energy_kernel <<< grid, threads, 32* sizeof(ENERGY_FLOAT)>>>(dev_buf);
+  cudaThreadSynchronize();
+  cudaMemcpy((void*) buf, dev_buf, grid.x* sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost);
+
+  double dipole_all = 0.0;
+
+  for(int i = 0; i < grid.x; i++)
+    dipole_all += buf[i];
+
+  return dipole_all;
+}
+
+void cuda_slabcorr_force(cuda_shared_data* sdata, F_FLOAT ffact)
+{
+  dim3 grid, threads;
+  grid.x = (sdata->atom.nlocal + 31) / 32;
+  grid.y = 1;
+  grid.z = 1;
+  threads.x = 32;
+  threads.y = 1;
+  threads.z = 1;
+  slabcorr_force_kernel <<< grid, threads>>>(ffact);
+  cudaThreadSynchronize();
+}
+
+void sum_virial(double* host_virial)
+{
+  // no-op; the virial is reduced and copied back to the host in sum_energy()
+}
+
+void pppm_initfftdata(cuda_shared_data* sdata, PPPM_FLOAT* in, FFT_FLOAT* out)
+{
+  // the core kernel copies the interior brick into the complex FFT array;
+  // the x/y/z, xy/xz/yz and xyz kernels then fold the ghost rim cells of the
+  // density brick onto their periodic images inside the brick
+  int nslow = sdata->pppm.nzhi_in - sdata->pppm.nzlo_in;
+  int nmid = sdata->pppm.nyhi_in - sdata->pppm.nylo_in;
+  int nfast = sdata->pppm.nxhi_in - sdata->pppm.nxlo_in;
+  int nrimz = MAX(sdata->pppm.nzlo_in - sdata->pppm.nzlo_out, sdata->pppm.nzhi_out - sdata->pppm.nzhi_in);
+  int nrimy = MAX(sdata->pppm.nylo_in - sdata->pppm.nylo_out, sdata->pppm.nyhi_out - sdata->pppm.nyhi_in);
+  int nrimx = MAX(sdata->pppm.nxlo_in - sdata->pppm.nxlo_out, sdata->pppm.nxhi_out - sdata->pppm.nxhi_in);
+  dim3 grid;
+  grid.x = nslow + 1;
+  grid.y = nmid + 1;
+  grid.z = 1;
+  dim3 threads;
+  threads.x = nfast + 1;
+  threads.y = 1;
+  threads.z = 1;
+  cudaThreadSynchronize();
+  initfftdata_core_kernel <<< grid, threads, 0>>>(in, out);
+  cudaThreadSynchronize();
+  grid.x = nrimz;
+  grid.y = nmid + 1;
+  threads.x = nfast + 1;
+  initfftdata_z_kernel <<< grid, threads, 0>>>(in, out);
+  cudaThreadSynchronize();
+  grid.x = nslow + 1;
+  grid.y = nrimy;
+  threads.x = nfast + 1;
+  initfftdata_y_kernel <<< grid, threads, 0>>>(in, out);
+  cudaThreadSynchronize();
+  grid.x = nslow + 1;
+  grid.y = nmid + 1;
+  threads.x = nrimx;
+  initfftdata_x_kernel <<< grid, threads, 0>>>(in, out);
+  cudaThreadSynchronize();
+  grid.x = nrimz;
+  grid.y = nrimy;
+  threads.x = nfast + 1;
+  initfftdata_yz_kernel <<< grid, threads, 0>>>(in, out);
+  cudaThreadSynchronize();
+  grid.x = nrimz;
+  grid.y = nmid + 1;
+  threads.x = nrimx;
+  initfftdata_xz_kernel <<< grid, threads, 0>>>(in, out);
+  cudaThreadSynchronize();
+  grid.x = nslow + 1;
+  grid.y = nrimy;
+  threads.x = nrimx;
+  initfftdata_xy_kernel <<< grid, threads, 0>>>(in, out);
+  cudaThreadSynchronize();
+  grid.x = nrimz;
+  grid.y = nrimy;
+  threads.x = nrimx;
+  initfftdata_xyz_kernel <<< grid, threads, 0>>>(in, out);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA initfftdata_kernel");
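+  // note: the cudaThreadSynchronize() calls between the launches above are
+  // presumably there so that CUT_CHECK_ERROR can attribute a failure to the
+  // kernel that caused it; the launches themselves already execute in order
+  // within the default stream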
+} + + diff --git a/lib/cuda/pppm_cuda_cu.h b/lib/cuda/pppm_cuda_cu.h new file mode 100644 index 0000000000000000000000000000000000000000..a22e811c3830c266b08010baf339dc381e0faae9 --- /dev/null +++ b/lib/cuda/pppm_cuda_cu.h @@ -0,0 +1,55 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#ifndef PPPM_CUDA_CU_H_ +#define PPPM_CUDA_CU_H_ + +extern "C" void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_brick, void* cu_vdz_brick, void* cu_density_fft, void* cu_energy, void* cu_virial + , void* cu_work1, void* cu_work2, void* cu_work3, void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg + , int nxlo_in, int nxhi_in, int nylo_in, int nyhi_in, int nzlo_in, int nzhi_in, int nxlo_out, int nxhi_out, int nylo_out, int nyhi_out, int nzlo_out, int nzhi_out, int nx_pppm, int ny_pppm, int nz_pppm + , int cu_nxlo_fft, int cu_nxhi_fft, int cu_nylo_fft, int cu_nyhi_fft, int cu_nzlo_fft, int cu_nzhi_fft, void* cu_gf_b + , double cu_qqrd2e, int cu_order, void* cu_rho_coeff, void* cu_debugdata, void* cu_density_brick_lock, int slabflag + ); +extern "C" void pppm_device_init_setup(cuda_shared_data* sdata, PPPM_FLOAT shiftone, PPPM_FLOAT delxinv, PPPM_FLOAT delyinv, PPPM_FLOAT delzinv, int nlower, int nupper); +extern "C" void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald); +extern "C" void Cuda_PPPM_setup_greensfn(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald, + int nbx, int nby, int nbz, PPPM_FLOAT xprd, PPPM_FLOAT yprd, PPPM_FLOAT zprd_slab); + +extern "C" void pppm_device_update(cuda_shared_data* sdata, void* cu_part2grid, int nlocala, int nmaxa); +extern "C" void pppm_update_nlocal(int nlocala); +extern "C" void poisson_scale(int nx_pppm, int ny_pppm, int nz_pppm); +extern "C" void poisson_xgrad(int nx_pppm, int ny_pppm, int nz_pppm); +extern "C" void poisson_ygrad(int nx_pppm, int ny_pppm, int nz_pppm); +extern "C" void poisson_zgrad(int nx_pppm, int ny_pppm, int nz_pppm); +extern "C" void poisson_vdx_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm); +extern "C" void poisson_vdy_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm); +extern "C" void poisson_vdz_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm); +extern "C" void poisson_energy(int nxlo_fft, int nxhi_fft, int nylo_fft, int nyhi_fft, int nzlo_fft, int nzhi_fft, int vflag); +extern "C" ENERGY_FLOAT sum_energy(void* cu_virial, void* cu_energy, int nx_pppma, int ny_pppma, int nz_pppma, 
int vflag, ENERGY_FLOAT* cpu_virial); +extern "C" int cuda_particle_map(cuda_shared_data* sdata, void* flag); +extern "C" void cuda_make_rho(cuda_shared_data* sdata, void* flag, PPPM_FLOAT* cu_density_intScale, int ihi, int ilo, int jhi, int jlo, int khi, int klo, void* cu_density_brick, void* cu_density_brick_int); +extern "C" void cuda_fieldforce(cuda_shared_data* sdata, void* flag); +extern "C" double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_FLOAT* dev_buf); +extern "C" void cuda_slabcorr_force(cuda_shared_data* sdata, F_FLOAT ffact); +extern "C" void pppm_initfftdata(cuda_shared_data* sdata, PPPM_FLOAT* in, FFT_FLOAT* out); +#endif /*PPPM_CUDA_CU_H_*/ diff --git a/lib/cuda/pppm_cuda_kernel.cu b/lib/cuda/pppm_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..25a81866f05305f17357a5f398f89715cd0be426 --- /dev/null +++ b/lib/cuda/pppm_cuda_kernel.cu @@ -0,0 +1,858 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + + Original Version: + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + See the README file in the top-level LAMMPS directory. + + ----------------------------------------------------------------------- + + USER-CUDA Package and associated modifications: + https://sourceforge.net/projects/lammpscuda/ + + Christian Trott, christian.trott@tu-ilmenau.de + Lars Winterfeld, lars.winterfeld@tu-ilmenau.de + Theoretical Physics II, University of Technology Ilmenau, Germany + + See the README file in the USER-CUDA directory. + + This software is distributed under the GNU General Public License. +------------------------------------------------------------------------- */ + +#define OFFSET 4096 +__device__ int negativCUDA(float f) +{ + return ((unsigned int)1 << 31 & (__float_as_int(f))) >> 31; +} + +__device__ void reduceBlock(float* data) +{ + int p2 = 1; + + while(p2 * 2 < blockDim.x) p2 *= 2; + + if(threadIdx.x < blockDim.x - p2) + data[threadIdx.x] += data[threadIdx.x + p2]; + + __syncthreads(); + + for(int i = 2; i <= p2; i *= 2) { + if(threadIdx.x < p2 / i) + data[threadIdx.x] += data[threadIdx.x + p2 / i]; + + __syncthreads(); + } +} + +__device__ void reduceBlock(double* data) +{ + int p2 = 1; + + while(p2 * 2 < blockDim.x) p2 *= 2; + + if(threadIdx.x < blockDim.x - p2) + data[threadIdx.x] += data[threadIdx.x + p2]; + + __syncthreads(); + + for(int i = 2; i <= p2; i *= 2) { + if(threadIdx.x < p2 / i) + data[threadIdx.x] += data[threadIdx.x + p2 / i]; + + __syncthreads(); + } +} + +extern __shared__ PPPM_FLOAT sharedmem[]; + +__global__ void setup_fkxyz_vg(PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald) +{ + PPPM_FLOAT my_fkx = unitkx * (int(threadIdx.x) - nx_pppm * (2 * int(threadIdx.x) / nx_pppm)); + PPPM_FLOAT my_fky = unitky * (int(blockIdx.y) - ny_pppm * (2 * int(blockIdx.y) / ny_pppm)); + PPPM_FLOAT my_fkz = unitkz * (int(blockIdx.x) - nz_pppm * (2 * int(blockIdx.x) / nz_pppm)); + + if((blockIdx.x == 0) && (blockIdx.y == 0)) fkx[threadIdx.x] = my_fkx; + + if((blockIdx.x == 0) && (threadIdx.x == 0)) fky[blockIdx.y] = my_fky; + + if((threadIdx.x == 0) && (blockIdx.y == 0)) fkz[blockIdx.x] = my_fkz; + + __syncthreads(); + + if((blockIdx.x >= nzlo_fft) && (blockIdx.x <= nzhi_fft) && + (blockIdx.y >= nylo_fft) && (blockIdx.y <= nyhi_fft) && + (threadIdx.x >= nxlo_fft) && (threadIdx.x <= nxhi_fft)) { + int n = ((int(blockIdx.x) - nzlo_fft) * 
(nyhi_fft - nylo_fft + 1) + int(blockIdx.y) - nylo_fft) * (nxhi_fft - nxlo_fft + 1) + int(threadIdx.x) - nxlo_fft; + PPPM_FLOAT sqk = my_fkx * my_fkx + my_fky * my_fky + my_fkz * my_fkz; + PPPM_FLOAT vterm = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(-2.0) * (PPPM_F(1.0) / sqk + PPPM_F(0.25) / (g_ewald * g_ewald)); + vg[6 * n + 0] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fkx * my_fkx; + vg[6 * n + 1] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fky * my_fky; + vg[6 * n + 2] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fkz * my_fkz; + vg[6 * n + 3] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fkx * my_fky; + vg[6 * n + 4] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fkx * my_fkz; + vg[6 * n + 5] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fky * my_fkz; + + } +} + +__device__ PPPM_FLOAT gf_denom(PPPM_FLOAT x, PPPM_FLOAT y, PPPM_FLOAT z) +{ + PPPM_FLOAT sx, sy, sz; + sz = sy = sx = PPPM_F(0.0); + + for(int l = order - 1; l >= 0; l--) { + sx = gf_b[l] + sx * x; + sy = gf_b[l] + sy * y; + sz = gf_b[l] + sz * z; + } + + PPPM_FLOAT s = sx * sy * sz; + return s * s; +} + +__global__ void setup_greensfn(PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald, + int nbx, int nby, int nbz, + PPPM_FLOAT xprd, PPPM_FLOAT yprd, PPPM_FLOAT zprd_slab) +{ + PPPM_FLOAT sqk; + int nx, ny, nz, kper, lper, mper, k, l, m; + PPPM_FLOAT snx, sny, snz, snx2, sny2, snz2; + PPPM_FLOAT argx, argy, argz, wx, wy, wz, sx, sy, sz, qx, qy, qz; + PPPM_FLOAT sum1, dot1, dot2; + PPPM_FLOAT numerator, denominator; + + PPPM_FLOAT form = PPPM_F(1.0); + int n = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + m = blockIdx.x; + l = blockIdx.y; + k = threadIdx.x; + + mper = m - nz_pppm * (2 * m / nz_pppm); + snz = sin(PPPM_F(0.5) * unitkz * mper * zprd_slab / nz_pppm); + snz2 = snz * snz; + + + lper = l - ny_pppm * (2 * l / ny_pppm); + sny = sin(PPPM_F(0.5) * unitky * lper * yprd / ny_pppm); + sny2 = sny * sny; + + kper = k - nx_pppm * (2 * k / nx_pppm); + snx = sin(PPPM_F(0.5) * unitkx * kper * xprd / nx_pppm); + snx2 = snx * snx; + + sqk = pow(unitkx * kper, PPPM_F(2.0)) + pow(unitky * lper, PPPM_F(2.0)) + + pow(unitkz * mper, PPPM_F(2.0)); + + if(sqk != PPPM_F(0.0)) { + numerator = form * PPPM_F(12.5663706) / sqk; + denominator = gf_denom(snx2, sny2, snz2); + sum1 = PPPM_F(0.0); + + for(nx = -nbx; nx <= nbx; nx++) { + qx = unitkx * (kper + nx_pppm * nx); + sx = exp(PPPM_F(-.25) * pow(qx / g_ewald, PPPM_F(2.0))); + wx = PPPM_F(1.0); + argx = PPPM_F(0.5) * qx * xprd / nx_pppm; + + if(argx != PPPM_F(0.0)) wx = pow(sin(argx) / argx, order); + + for(ny = -nby; ny <= nby; ny++) { + qy = unitky * (lper + ny_pppm * ny); + sy = exp(PPPM_F(-.25) * pow(qy / g_ewald, PPPM_F(2.0))); + wy = PPPM_F(1.0); + argy = PPPM_F(0.5) * qy * yprd / ny_pppm; + + if(argy != PPPM_F(0.0)) wy = pow(sin(argy) / argy, order); + + for(nz = -nbz; nz <= nbz; nz++) { + qz = unitkz * (mper + nz_pppm * nz); + sz = exp(PPPM_F(-.25) * pow(qz / g_ewald, PPPM_F(2.0))); + wz = PPPM_F(1.0); + argz = PPPM_F(0.5) * qz * zprd_slab / nz_pppm; + + if(argz != PPPM_F(0.0)) wz = pow(sin(argz) / argz, order); + + dot1 = unitkx * kper * qx + unitky * lper * qy + unitkz * mper * qz; + dot2 = qx * qx + qy * qy + qz * qz; + sum1 += (dot1 / dot2) * sx * sy * sz * pow(wx * wy * wz, PPPM_F(2.0)); + } + } + } + + greensfn[n] = numerator * sum1 / denominator; + } else greensfn[n] = PPPM_F(0.0); +} + +__global__ void poisson_scale_kernel() +{ + int i = (blockIdx.x * 
gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + FFT_FLOAT scaleinv = FFT_F(1.0) / (gridDim.x * gridDim.y * blockDim.x); + work1[2 * i] *= scaleinv * greensfn[i]; + work1[2 * i + 1] *= scaleinv * greensfn[i]; +} + +__global__ void poisson_xgrad_kernel() +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + work2[2 * i] = fkx[threadIdx.x] * work1[2 * i + 1]; + work2[2 * i + 1] = -fkx[threadIdx.x] * work1[2 * i]; +} + +__global__ void poisson_ygrad_kernel() +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + work2[2 * i] = fky[blockIdx.y] * work1[2 * i + 1]; + work2[2 * i + 1] = -fky[blockIdx.y] * work1[2 * i]; +} + +__global__ void poisson_zgrad_kernel() +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + work2[2 * i] = fkz[blockIdx.x] * work1[2 * i + 1]; + work2[2 * i + 1] = -fkz[blockIdx.x] * work1[2 * i]; +} + +__global__ void poisson_vdx_brick_kernel(int ilo, int jlo, int klo) +{ + int k = blockIdx.x + klo; + k += nz_pppm * negativCUDA(CUDA_F(1.0) * k) - nz_pppm * negativCUDA(CUDA_F(1.0) * (nz_pppm - k - 1)); + int j = blockIdx.y + jlo; + j += ny_pppm * negativCUDA(CUDA_F(1.0) * j) - ny_pppm * negativCUDA(CUDA_F(1.0) * (ny_pppm - j - 1)); + int i = threadIdx.x + ilo; + i += nx_pppm * negativCUDA(CUDA_F(1.0) * i) - nx_pppm * negativCUDA(CUDA_F(1.0) * (nx_pppm - i - 1)); + vdx_brick[((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1) + threadIdx.x] = work3[2 * (((k) * ny_pppm + (j)) * nx_pppm + i)]; +} + +__global__ void poisson_vdy_brick_kernel(int ilo, int jlo, int klo) +{ + int k = blockIdx.x + klo; + k += nz_pppm * negativCUDA(CUDA_F(1.0) * k) - nz_pppm * negativCUDA(CUDA_F(1.0) * (nz_pppm - k - 1)); + int j = blockIdx.y + jlo; + j += ny_pppm * negativCUDA(CUDA_F(1.0) * j) - ny_pppm * negativCUDA(CUDA_F(1.0) * (ny_pppm - j - 1)); + int i = threadIdx.x + ilo; + i += nx_pppm * negativCUDA(CUDA_F(1.0) * i) - nx_pppm * negativCUDA(CUDA_F(1.0) * (nx_pppm - i - 1)); + vdy_brick[((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1) + threadIdx.x] = work3[2 * (((k) * ny_pppm + (j)) * nx_pppm + i)]; +} + +__global__ void poisson_vdz_brick_kernel(int ilo, int jlo, int klo) +{ + int k = blockIdx.x + klo; + k += nz_pppm * negativCUDA(CUDA_F(1.0) * k) - nz_pppm * negativCUDA(CUDA_F(1.0) * (nz_pppm - k - 1)); + int j = blockIdx.y + jlo; + j += ny_pppm * negativCUDA(CUDA_F(1.0) * j) - ny_pppm * negativCUDA(CUDA_F(1.0) * (ny_pppm - j - 1)); + int i = threadIdx.x + ilo; + i += nx_pppm * negativCUDA(CUDA_F(1.0) * i) - nx_pppm * negativCUDA(CUDA_F(1.0) * (nx_pppm - i - 1)); + vdz_brick[((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1) + threadIdx.x] = work3[2 * (((k) * ny_pppm + (j)) * nx_pppm + i)]; +} + +__global__ void poisson_energy_kernel(int nxlo_fft, int nylo_fft, int nzlo_fft, int vflag) +{ + ENERGY_FLOAT scaleinv = FFT_F(1.0) / (nx_pppm * ny_pppm * nz_pppm); + int i = (blockIdx.x + nzlo_fft) * ny_pppm * nx_pppm + (blockIdx.y + nylo_fft) * nx_pppm + threadIdx.x + nxlo_fft; + ENERGY_FLOAT* s_energy = (ENERGY_FLOAT*) sharedmem; + ENERGY_FLOAT myenergy = scaleinv * scaleinv * greensfn[i] * (work1[2 * i] * work1[2 * i] + work1[2 * i + 1] * work1[2 * i + 1]); + s_energy[threadIdx.x] = myenergy; + + __syncthreads(); + reduceBlock(s_energy); + + if(threadIdx.x == 0) + energy[blockIdx.x * ny_pppm + blockIdx.y] = s_energy[0]; + + if(vflag) { + __syncthreads(); + + for(int j = 0; j < 6; j++) { + s_energy[threadIdx.x] = 
myenergy * vg[((blockIdx.x * gridDim.y + blockIdx.y) * (blockDim.x) + threadIdx.x) * 6 + j]; + __syncthreads(); + reduceBlock(s_energy); + + if(threadIdx.x == 0) + virial[blockIdx.x * ny_pppm + blockIdx.y + j * nz_pppm * ny_pppm] = s_energy[0]; + } + } +} + + +__global__ void sum_energy_kernel1(int vflag) +{ + ENERGY_FLOAT myenergy = energy[(blockIdx.x * ny_pppm + threadIdx.x)]; + ENERGY_FLOAT* s_energy = (ENERGY_FLOAT*) sharedmem; + s_energy[threadIdx.x] = myenergy; + __syncthreads(); + reduceBlock(s_energy); + + if(threadIdx.x == 0) + energy[blockIdx.x * ny_pppm] = s_energy[0]; + + if(vflag) { + __syncthreads(); + + for(int j = 0; j < 6; j++) { + myenergy = virial[blockIdx.x * ny_pppm + threadIdx.x + j * ny_pppm * nz_pppm]; + s_energy[threadIdx.x] = myenergy; + __syncthreads(); + reduceBlock(s_energy); + + if(threadIdx.x == 0) + virial[blockIdx.x * ny_pppm + j * ny_pppm * nz_pppm] = s_energy[0]; + } + } + +} + +__global__ void sum_energy_kernel2(int vflag) +{ + ENERGY_FLOAT myenergy = energy[threadIdx.x * ny_pppm]; + ENERGY_FLOAT* s_energy = (ENERGY_FLOAT*) sharedmem; + s_energy[threadIdx.x] = myenergy; + __syncthreads(); + reduceBlock(s_energy); + + if(threadIdx.x == 0) + energy[0] = s_energy[0]; + + if(vflag) { + __syncthreads(); + + for(int j = 0; j < 6; j++) { + myenergy = virial[threadIdx.x * ny_pppm + j * ny_pppm * nz_pppm]; + s_energy[threadIdx.x] = myenergy; + __syncthreads(); + reduceBlock(s_energy); + + if(threadIdx.x == 0) + virial[j] = s_energy[0]; + } + } +} + +__device__ PPPM_FLOAT rho1d(int k, PPPM_FLOAT d, PPPM_FLOAT* srho_coeff) +{ + PPPM_FLOAT rho1d_tmp = PPPM_F(0.0); + + for(int l = order - 1; l >= 0; l--) + rho1d_tmp = srho_coeff[l * order + k - (1 - order) / 2] + rho1d_tmp * d; + + return rho1d_tmp; +} + +__global__ void particle_map_kernel(int* flag) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if(i < nlocal) { + int nx, ny, nz; + PPPM_FLOAT shift = PPPM_F(0.5) - shiftone; //+OFFSET; + nx = (int)((_x[i] - _boxlo[0]) * delxinv + shift); // - OFFSET; + ny = (int)((_x[i + nmax] - _boxlo[1]) * delyinv + shift); // - OFFSET; + nz = (int)((_x[i + 2 * nmax] - _boxlo[2]) * delzinv + shift); // - OFFSET; + + part2grid[i] = nx; + part2grid[i + nmax] = ny; + part2grid[i + 2 * nmax] = nz; + + // check that entire stencil around nx,ny,nz will fit in my 3d brick + if(nx + nlower < nxlo_out || nx + nupper > nxhi_out || + ny + nlower < nylo_out || ny + nupper > nyhi_out || + nz + nlower < nzlo_out || nz + nupper > nzhi_out) { + flag[0]++; + debugdata[0] = i; + debugdata[1] = _boxlo[0]; + debugdata[2] = _boxlo[1]; + debugdata[3] = _boxlo[2]; + debugdata[4] = nx; + debugdata[5] = ny; + debugdata[6] = nz; + debugdata[7] = _x[i]; + debugdata[8] = _x[i + _nmax]; + debugdata[9] = _x[i + 2 * _nmax]; + debugdata[10] = nlocal; + + } + } +} + +__global__ void make_rho_kernelA() +{ + int i, l, m, n, nx, ny, nz, mx, my, mz; + + // clear 3d density array + + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + i = blockIdx.x * blockDim.x + threadIdx.x; + + if(i < nlocal) { + + PPPM_FLOAT dx, dy, dz, x0, y0, z0; + nx = part2grid[i]; + ny = part2grid[i + nmax]; + nz = part2grid[i + 2 * nmax]; + dx = nx + shiftone - (_x[i] - _boxlo[0]) * delxinv; + dy = ny + shiftone - (_x[i + nmax] - _boxlo[1]) * delyinv; + dz = nz + shiftone - (_x[i + 2 * nmax] - _boxlo[2]) * delzinv; + + z0 = delxinv * 
delyinv * delzinv * _q[i]; + + for(n = nlower; n <= nupper; n++) { + mz = n + nz; + y0 = z0 * rho1d(n, dz, rho_coeff); + + for(m = nlower; m <= nupper; m++) { + my = m + ny; + x0 = y0 * rho1d(m, dy, rho_coeff); + + for(l = nlower; l <= nupper; l++) { + mx = l + nx; + int mzyx = ((mz - nzlo_out) * (nyhi_out - nylo_out + 1) + my - nylo_out) * (nxhi_out - nxlo_out + 1) + mx - nxlo_out; + + while(atomicAdd(&density_brick_int[mzyx], 1) != 0) atomicAdd(&density_brick_int[mzyx], -1); + + density_brick[mzyx] += x0 * rho1d(l, dx, rho_coeff); + __threadfence(); + atomicAdd(&density_brick_int[mzyx], -1); + __syncthreads(); + + } + } + } + } +} + +__global__ void make_rho_kernel(int* flag, int read_threads_at_same_time) +{ + int i, l, m, n, nx, ny, nz, mx, my, mz, a, b; + + // clear 3d density array + + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // int nzxy=blockIdx.x*gridDim.y+blockIdx.y; + + int nelements = nupper - nlower + 1; + int* idx = (int*) sharedmem; + int* sdensity_brick_int = &idx[blockDim.x]; + PPPM_FLOAT* srho_coeff = (PPPM_FLOAT*) &sdensity_brick_int[nelements * blockDim.x]; + + if(threadIdx.x < order * (order / 2 - (1 - order) / 2 + 1)) + srho_coeff[threadIdx.x] = rho_coeff[threadIdx.x]; + + __syncthreads(); + + i = blockIdx.x * blockDim.x + threadIdx.x; + + if(false) { + if(i < nlocal) { + + PPPM_FLOAT dx, dy, dz, x0, y0, z0; + nx = part2grid[i]; + ny = part2grid[i + nmax]; + nz = part2grid[i + 2 * nmax]; + dx = nx + shiftone - (_x[i] - _boxlo[0]) * delxinv; + dy = ny + shiftone - (_x[i + nmax] - _boxlo[1]) * delyinv; + dz = nz + shiftone - (_x[i + 2 * nmax] - _boxlo[2]) * delzinv; + + z0 = delxinv * delyinv * delzinv * _q[i]; + + for(n = nlower; n <= nupper; n++) { + mz = n + nz; + y0 = z0 * rho1d(n, dz, srho_coeff); + + for(m = nlower; m <= nupper; m++) { + my = m + ny; + x0 = y0 * rho1d(m, dy, srho_coeff); + + for(l = nlower; l <= nupper; l++) { + mx = l + nx; + int mzyx = ((mz - nzlo_out) * (nyhi_out - nylo_out + 1) + my - nylo_out) * (nxhi_out - nxlo_out + 1) + mx - nxlo_out; + + a = int(x0 * rho1d(l, dx, srho_coeff) * density_intScale); + b = (atomicAdd(&density_brick_int[mzyx], a) | a); + + if(((b) & (0x7c000000)) && (not((b) & (0x80000000)))) { + flag[1]++; + + if((b) & (0x60000000)) flag[0]++; + } + + __syncthreads(); + } + } + } + } + + return; + } + + i = blockIdx.x * blockDim.x + threadIdx.x; + { + + PPPM_FLOAT dx, dy, dz, x0, y0, z0, qtmp; + + if(i < nlocal) { + qtmp = _q[i]; + nx = part2grid[i]; + ny = part2grid[i + nmax]; + nz = part2grid[i + 2 * nmax]; + dx = nx + shiftone - (_x[i] - _boxlo[0]) * delxinv; + dy = ny + shiftone - (_x[i + nmax] - _boxlo[1]) * delyinv; + dz = nz + shiftone - (_x[i + 2 * nmax] - _boxlo[2]) * delzinv; + z0 = delxinv * delyinv * delzinv * qtmp; + } else { + nx = ny = nz = 1; + dx = dy = dz = PPPM_F(0.1); + } + + __syncthreads(); + + for(n = nlower; n <= nupper; n++) { + mz = n + nz; + y0 = z0 * rho1d(n, dz, srho_coeff); + + for(m = nlower; m <= nupper; m++) { + my = m + ny; + x0 = y0 * rho1d(m, dy, srho_coeff); + + if(i < nlocal) { + idx[threadIdx.x] = ((mz - nzlo_out) * (nyhi_out - nylo_out + 1) + my - nylo_out) * (nxhi_out - nxlo_out + 1) + nx + nlower - nxlo_out; + + for(l = nlower; l <= nupper; l++) { + sdensity_brick_int[threadIdx.x * nelements + l - nlower] = int(x0 * rho1d(l, dx, srho_coeff) * density_intScale); + } + } else 
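+          // padding threads of the last block (i >= nlocal) publish an
+          // invalid index so that the scatter loop below skips their
+          // shared-memory slots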
idx[threadIdx.x] = -1; + + __syncthreads(); + + for(int ii = 0; ii < blockDim.x; ii += read_threads_at_same_time) { + int kk = threadIdx.x / nelements; + + if((threadIdx.x < nelements * read_threads_at_same_time) && (kk + ii < blockDim.x) && (idx[ii + kk] > -1)) { + a = sdensity_brick_int[ii * nelements + threadIdx.x]; + //if(a*a>1e-100) + b = (atomicAdd(&density_brick_int[idx[ii + kk] + threadIdx.x - kk * nelements], a) | a); + + //else + //b=(density_brick_int[idx[ii+kk]+threadIdx.x-kk*nelements]|a); + if(((b) & (0x7c000000)) && (not((b) & (0x80000000)))) { + flag[1]++; + + if((b) & (0x60000000)) flag[0]++; + } + } + } + + __syncthreads(); //*/ + } + } + + } +} + +__global__ void scale_rho_kernel() +{ + int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; + density_brick[i] = (1.0 / density_intScale) * density_brick_int[i]; +} + +__global__ void fieldforce_kernel(int elements_per_thread, int read_threads_at_same_time, int* flag) //20*x64 0.36 +{ + int i; + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + i = blockIdx.x * blockDim.x + threadIdx.x; + int* idx = (int*) sharedmem; + PPPM_FLOAT* tmp_brick = (PPPM_FLOAT*) &idx[blockDim.x]; + PPPM_FLOAT* srho_coeff = (PPPM_FLOAT*) &tmp_brick[3 * blockDim.x * elements_per_thread]; + + if(threadIdx.x < order * (order / 2 - (1 - order) / 2 + 1)) + srho_coeff[threadIdx.x] = rho_coeff[threadIdx.x]; + + __syncthreads(); + { + int l, m, n, nx, ny, nz, my, mz; + PPPM_FLOAT dx, dy, dz, x0, y0, z0; + PPPM_FLOAT ek[3]; + + if(i < nlocal) { + nx = part2grid[i]; + ny = part2grid[i + nmax]; + nz = part2grid[i + 2 * nmax]; + dx = nx + shiftone - (_x[i] - _boxlo[0]) * delxinv; + dy = ny + shiftone - (_x[i + nmax] - _boxlo[1]) * delyinv; + dz = nz + shiftone - (_x[i + 2 * nmax] - _boxlo[2]) * delzinv; + + ek[0] = ek[1] = ek[2] = PPPM_F(0.0); + } else { + nx = ny = nz = 1; + dx = dy = dz = PPPM_F(0.1); + } + + __syncthreads(); + + for(n = nlower; n <= nupper; n++) { + mz = n + nz; + z0 = rho1d(n, dz, srho_coeff); + + for(m = nlower; m <= nupper; m++) { + my = m + ny; + y0 = z0 * rho1d(m, dy, srho_coeff); + + + if(i < nlocal) + idx[threadIdx.x] = ((mz - nzlo_out) * (nyhi_out - nylo_out + 1) + my - nylo_out) * (nxhi_out - nxlo_out + 1) + nx + nlower - nxlo_out; + else idx[threadIdx.x] = -1; + + __syncthreads(); + + for(int ii = 0; ii < blockDim.x; ii += read_threads_at_same_time) { + int kk = threadIdx.x / elements_per_thread; + + if((threadIdx.x < elements_per_thread * read_threads_at_same_time) && (kk + ii < blockDim.x) && (idx[ii + kk] > -1)) { + tmp_brick[ii * elements_per_thread + threadIdx.x] = vdx_brick[idx[ii + kk] + threadIdx.x - kk * elements_per_thread]; + tmp_brick[(ii + blockDim.x)*elements_per_thread + threadIdx.x] = vdy_brick[idx[ii + kk] + threadIdx.x - kk * elements_per_thread]; + tmp_brick[(ii + 2 * blockDim.x)*elements_per_thread + threadIdx.x] = vdz_brick[idx[ii + kk] + threadIdx.x - kk * elements_per_thread]; + } + } + + __syncthreads(); + + if(i < nlocal) + for(l = nlower; l <= nupper; l++) { + x0 = y0 * rho1d(l, dx, srho_coeff); + + ek[0] -= x0 * tmp_brick[threadIdx.x * elements_per_thread + l - nlower]; + ek[1] -= x0 * tmp_brick[threadIdx.x * elements_per_thread + l - nlower + blockDim.x * elements_per_thread]; + ek[2] -= x0 * tmp_brick[threadIdx.x * elements_per_thread + l 
- nlower + 2 * blockDim.x * elements_per_thread];
+        }
+
+      __syncthreads();
+    }
+  }
+
+  // convert E-field to force; only threads that own a real atom may write
+  // back (padding threads of the last block have i >= nlocal and carry
+  // uninitialized ek values)
+
+  if(i < nlocal) {
+    _f[i] += qqrd2e * _q[i] * ek[0];
+    _f[i + nmax] += qqrd2e * _q[i] * ek[1];
+    _f[i + 2 * nmax] += qqrd2e * _q[i] * ek[2];
+  }
+  }
+}
+
+__global__ void slabcorr_energy_kernel(ENERGY_FLOAT* buf)
+{
+  ENERGY_FLOAT* dipole = (ENERGY_FLOAT*) sharedmem;
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if(i < nlocal)
+    dipole[threadIdx.x] = _q[i] * _x[i + 2 * nmax];
+  else
+    dipole[threadIdx.x] = ENERGY_F(0.0);
+
+  __syncthreads();
+  reduceBlock(dipole);
+
+  if(threadIdx.x == 0) buf[blockIdx.x] = dipole[0];
+}
+
+__global__ void slabcorr_force_kernel(F_FLOAT ffact)
+{
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if(i < nlocal)
+    _f[i + 2 * nmax] += qqrd2e * _q[i] * ffact;
+}
+
+
+__global__ void initfftdata_core_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
+{
+  out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] = in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
+  out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x) + 1] = 0;
+}
+
+__global__ void initfftdata_z_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
+{
+  if(slabflag) {
+    if(blockIdx.x < nzlo_in - nzlo_out)
+      out[2 * (((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
+  } else {
+    if(blockIdx.x < nzlo_in - nzlo_out)
+      out[2 * (((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
+  }
+
+  if(blockIdx.x < nzhi_out - nzhi_in)
+    out[2 * ((((blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + (nzhi_out - nzlo_in)) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
+}
+
+__global__ void initfftdata_y_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
+{
+  if(blockIdx.y < nylo_in - nylo_out)
+    out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + (2 * (nyhi_in + 1) - nylo_in - nyhi_out) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
+
+  if(blockIdx.y < nyhi_out - nyhi_in)
+    out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + (nyhi_out - nylo_in)) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
+}
+
+__global__ void initfftdata_x_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
+{
+  if(threadIdx.x < nxlo_in - nxlo_out)
+    out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
+
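+  // fold the ghost cells beyond the high-x boundary of the brick onto the
+  // low-x interior cells (periodic image); the statement above handled the
+  // rim below the low-x boundary
+  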
if(threadIdx.x < nxhi_out - nxhi_in) + out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1]; +} + +__global__ void initfftdata_yz_kernel(PPPM_FLOAT* in, FFT_FLOAT* out) +{ + if(slabflag) { + if(blockIdx.x < nzlo_in - nzlo_out) + if(blockIdx.y < nyhi_out - nyhi_in) + out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out]; + + if(blockIdx.x < nzlo_in - nzlo_out) + if(blockIdx.y < nylo_in - nylo_out) + out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out]; + } else { + if(blockIdx.x < nzlo_in - nzlo_out) + if(blockIdx.y < nyhi_out - nyhi_in) + out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out]; + + if(blockIdx.x < nzlo_in - nzlo_out) + if(blockIdx.y < nylo_in - nylo_out) + out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out]; + } + + if(blockIdx.x < nzhi_out - nzhi_in) + if(blockIdx.y < nyhi_out - nyhi_in) + out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out]; + + if(blockIdx.x < nzhi_out - nzhi_in) + if(blockIdx.y < nylo_in - nylo_out) + out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out]; +} + +__global__ void initfftdata_xz_kernel(PPPM_FLOAT* in, FFT_FLOAT* out) +{ + if(blockIdx.x < nzhi_out - nzhi_in) + if(threadIdx.x < nxlo_in - nxlo_out) + out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x]; + + if(blockIdx.x < nzhi_out - nzhi_in) + if(threadIdx.x < nxhi_out - nxhi_in) + out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 
1]; + + if(slabflag) { + if(blockIdx.x < nzlo_in - nzlo_out) + if(threadIdx.x < nxlo_in - nxlo_out) + out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x]; + + if(blockIdx.x < nzlo_in - nzlo_out) + if(threadIdx.x < nxhi_out - nxhi_in) + out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1]; + } else { + if(blockIdx.x < nzlo_in - nzlo_out) + if(threadIdx.x < nxlo_in - nxlo_out) + out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x]; + + if(blockIdx.x < nzlo_in - nzlo_out) + if(threadIdx.x < nxhi_out - nxhi_in) + out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1]; + } +} + +__global__ void initfftdata_xy_kernel(PPPM_FLOAT* in, FFT_FLOAT* out) +{ + if(blockIdx.y < nyhi_out - nyhi_in) + if(threadIdx.x < nxlo_in - nxlo_out) + out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x]; + + if(blockIdx.y < nyhi_out - nyhi_in) + if(threadIdx.x < nxhi_out - nxhi_in) + out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1]; + + if(blockIdx.y < nylo_in - nylo_out) + if(threadIdx.x < nxlo_in - nxlo_out) + out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x]; + + if(blockIdx.y < nylo_in - nylo_out) + if(threadIdx.x < nxhi_out - nxhi_in) + out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1]; +} + +__global__ void initfftdata_xyz_kernel(PPPM_FLOAT* in, FFT_FLOAT* out) +{ + if(blockIdx.x < nzhi_out - nzhi_in) + if(blockIdx.y < nyhi_out - nyhi_in) + if(threadIdx.x < nxlo_in - nxlo_out) + out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += 
in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x]; + + if(blockIdx.x < nzhi_out - nzhi_in) + if(blockIdx.y < nyhi_out - nyhi_in) + if(threadIdx.x < nxhi_out - nxhi_in) + out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1]; + + if(blockIdx.x < nzhi_out - nzhi_in) + if(blockIdx.y < nylo_in - nylo_out) + if(threadIdx.x < nxlo_in - nxlo_out) + out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x]; + + if(blockIdx.x < nzhi_out - nzhi_in) + if(blockIdx.y < nylo_in - nylo_out) + if(threadIdx.x < nxhi_out - nxhi_in) + out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1]; + + if(slabflag) { + if(blockIdx.x < nzlo_in - nzlo_out) + if(blockIdx.y < nyhi_out - nyhi_in) + if(threadIdx.x < nxlo_in - nxlo_out) + out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x]; + + if(blockIdx.x < nzlo_in - nzlo_out) + if(blockIdx.y < nyhi_out - nyhi_in) + if(threadIdx.x < nxhi_out - nxhi_in) + out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1]; + + if(blockIdx.x < nzlo_in - nzlo_out) + if(blockIdx.y < nylo_in - nylo_out) + if(threadIdx.x < nxlo_in - nxlo_out) + out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x]; + + if(blockIdx.x < nzlo_in - nzlo_out) + if(blockIdx.y < nylo_in - nylo_out) + if(threadIdx.x < nxhi_out - nxhi_in) + out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1]; + } else { + if(blockIdx.x < nzlo_in - nzlo_out) + if(blockIdx.y < nyhi_out - nyhi_in) + if(threadIdx.x < nxlo_in - nxlo_out) + out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) 
* (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x]; + + if(blockIdx.x < nzlo_in - nzlo_out) + if(blockIdx.y < nyhi_out - nyhi_in) + if(threadIdx.x < nxhi_out - nxhi_in) + out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1]; + + if(blockIdx.x < nzlo_in - nzlo_out) + if(blockIdx.y < nylo_in - nylo_out) + if(threadIdx.x < nxlo_in - nxlo_out) + out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x]; + + if(blockIdx.x < nzlo_in - nzlo_out) + if(blockIdx.y < nylo_in - nylo_out) + if(threadIdx.x < nxhi_out - nxhi_in) + out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1]; + } +}
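+
+// A plausible per-timestep launch order for the kernels above, inferred from
+// the host wrappers earlier in this patch (a sketch only; the authoritative
+// sequence lives in the PPPMCuda host class of the USER-CUDA package):
+//   particle_map_kernel                   map each local atom to a grid cell
+//   make_rho_kernel + scale_rho_kernel    fixed-point charge assignment
+//   initfftdata_*_kernel                  fold ghost rims, fill FFT array
+//   forward FFT (cufft), then poisson_scale_kernel
+//   poisson_{x,y,z}grad_kernel            one k-space gradient component per
+//                                         backward FFT
+//   poisson_vd{x,y,z}_brick_kernel        scatter the field onto the bricks
+//   fieldforce_kernel                     interpolate field to atoms -> forces
+//   poisson_energy_kernel, sum_energy_kernel1/2   energy/virial when requested
+//   slabcorr_energy_kernel / slabcorr_force_kernel for slab geometries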