From 3545d44d0d160a3c8e111741c54e2edf20c26e9a Mon Sep 17 00:00:00 2001 From: sjplimp <sjplimp@f3b2605a-c512-4ea7-a41b-209d697bcdaa> Date: Thu, 29 May 2014 22:51:58 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@12040 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/KOKKOS/Install.sh | 71 ++ src/KOKKOS/atom_kokkos.cpp | 190 ++++ src/KOKKOS/atom_kokkos.h | 86 ++ src/KOKKOS/atom_vec_atomic_kokkos.cpp | 1371 ++++++++++++++++++++++ src/KOKKOS/atom_vec_atomic_kokkos.h | 111 ++ src/KOKKOS/atom_vec_kokkos.cpp | 23 + src/KOKKOS/atom_vec_kokkos.h | 76 ++ src/KOKKOS/comm_kokkos.cpp | 820 ++++++++++++++ src/KOKKOS/comm_kokkos.h | 63 ++ src/KOKKOS/domain_kokkos.cpp | 207 ++++ src/KOKKOS/domain_kokkos.h | 38 + src/KOKKOS/fix_nve_kokkos.cpp | 177 +++ src/KOKKOS/fix_nve_kokkos.h | 110 ++ src/KOKKOS/kokkos.cpp | 220 ++++ src/KOKKOS/kokkos.h | 40 + src/KOKKOS/kokkos_type.h | 617 ++++++++++ src/KOKKOS/memory_kokkos.h | 208 ++++ src/KOKKOS/modify_kokkos.cpp | 585 ++++++++++ src/KOKKOS/modify_kokkos.h | 73 ++ src/KOKKOS/neigh_full_kokkos.h | 507 +++++++++ src/KOKKOS/neigh_list_kokkos.cpp | 118 ++ src/KOKKOS/neigh_list_kokkos.h | 104 ++ src/KOKKOS/neighbor_kokkos.cpp | 269 +++++ src/KOKKOS/neighbor_kokkos.h | 257 +++++ src/KOKKOS/pair_kokkos.h | 655 +++++++++++ src/KOKKOS/pair_lj_cut_kokkos.cpp | 267 +++++ src/KOKKOS/pair_lj_cut_kokkos.h | 112 ++ src/KOKKOS/pair_table_kokkos.cpp | 1500 +++++++++++++++++++++++++ src/KOKKOS/pair_table_kokkos.h | 352 ++++++ src/KOKKOS/verlet_kokkos.cpp | 443 ++++++++ src/KOKKOS/verlet_kokkos.h | 48 + src/MAKE/Makefile.cuda | 111 ++ src/Makefile | 2 +- src/atom_vec.cpp | 4 +- src/compute_property_local.cpp | 4 +- 35 files changed, 9835 insertions(+), 4 deletions(-) create mode 100644 src/KOKKOS/Install.sh create mode 100644 src/KOKKOS/atom_kokkos.cpp create mode 100644 src/KOKKOS/atom_kokkos.h create mode 100644 src/KOKKOS/atom_vec_atomic_kokkos.cpp create mode 100644 src/KOKKOS/atom_vec_atomic_kokkos.h create mode 100644 
src/KOKKOS/atom_vec_kokkos.cpp
 create mode 100644 src/KOKKOS/atom_vec_kokkos.h
 create mode 100644 src/KOKKOS/comm_kokkos.cpp
 create mode 100644 src/KOKKOS/comm_kokkos.h
 create mode 100644 src/KOKKOS/domain_kokkos.cpp
 create mode 100644 src/KOKKOS/domain_kokkos.h
 create mode 100644 src/KOKKOS/fix_nve_kokkos.cpp
 create mode 100644 src/KOKKOS/fix_nve_kokkos.h
 create mode 100644 src/KOKKOS/kokkos.cpp
 create mode 100644 src/KOKKOS/kokkos.h
 create mode 100644 src/KOKKOS/kokkos_type.h
 create mode 100644 src/KOKKOS/memory_kokkos.h
 create mode 100644 src/KOKKOS/modify_kokkos.cpp
 create mode 100644 src/KOKKOS/modify_kokkos.h
 create mode 100644 src/KOKKOS/neigh_full_kokkos.h
 create mode 100644 src/KOKKOS/neigh_list_kokkos.cpp
 create mode 100644 src/KOKKOS/neigh_list_kokkos.h
 create mode 100644 src/KOKKOS/neighbor_kokkos.cpp
 create mode 100644 src/KOKKOS/neighbor_kokkos.h
 create mode 100644 src/KOKKOS/pair_kokkos.h
 create mode 100644 src/KOKKOS/pair_lj_cut_kokkos.cpp
 create mode 100644 src/KOKKOS/pair_lj_cut_kokkos.h
 create mode 100644 src/KOKKOS/pair_table_kokkos.cpp
 create mode 100644 src/KOKKOS/pair_table_kokkos.h
 create mode 100644 src/KOKKOS/verlet_kokkos.cpp
 create mode 100644 src/KOKKOS/verlet_kokkos.h
 create mode 100755 src/MAKE/Makefile.cuda

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
new file mode 100644
index 0000000000..9378eccfc7
--- /dev/null
+++ b/src/KOKKOS/Install.sh
@@ -0,0 +1,71 @@
+# Install/unInstall package files in LAMMPS
+# mode = 0/1/2 for uninstall/install/update
+
+mode=$1
+
+# arg1 = file, arg2 = file it depends on
+# mode 0: remove ../arg1
+# else if arg1 differs from ../arg1: copy arg1 to .. provided arg2 is empty
+#   or ../arg2 exists (mode 2 additionally echoes an "updating" message)
+# else (files identical) if arg2 is given but ../arg2 is missing: remove ../arg1
+
+action () {
+  if (test $mode = 0) then
+    rm -f ../$1
+  elif (! cmp -s $1 ../$1) then
+    if (test -z "$2" || test -e ../$2) then
+      cp $1 ..
+      if (test $mode = 2) then
+        echo " updating src/$1"
+      fi
+    fi
+  elif (test -n "$2") then
+    if (test ! -e ../$2) then
+      rm -f ../$1
+    fi
+  fi
+}
+
+# force rebuild of files with LMP_KOKKOS switch
+
+touch ../accelerator_kokkos.h
+touch ../memory.h
+
+# all package files with no dependencies
+
+for file in *.cpp *.h; do
+  action $file
+done
+
+# edit 2 Makefile.package files to include/exclude package info
+# only mode 1 (install) and mode 0 (uninstall) touch the Makefiles
+# both first strip any existing kokkos/KOKKOS words so the edit can be repeated
+
+if (test $1 = 1) then
+
+  if (test -e ../Makefile.package) then
+    sed -i -e 's/[^ \t]*kokkos[^ \t]* //g' ../Makefile.package
+    sed -i -e 's/[^ \t]*KOKKOS[^ \t]* //g' ../Makefile.package
+    sed -i -e 's|^PKG_INC =[ \t]*|&-I..\/..\/lib\/kokkos\/core\/src -I../../lib/kokkos/containers/src -DLMP_KOKKOS |' ../Makefile.package
+    sed -i -e 's|^PKG_PATH =[ \t]*|&-L..\/..\/lib\/kokkos\/core\/src |' ../Makefile.package
+    sed -i -e 's|^PKG_LIB =[ \t]*|&-lkokkoscore |' ../Makefile.package
+    sed -i -e 's|^PKG_SYSINC =[ \t]*|&$(kokkos_SYSINC) |' ../Makefile.package
+    sed -i -e 's|^PKG_SYSLIB =[ \t]*|&$(kokkos_SYSLIB) |' ../Makefile.package
+    sed -i -e 's|^PKG_SYSPATH =[ \t]*|&$(kokkos_SYSPATH) |' ../Makefile.package
+  fi
+
+  if (test -e ../Makefile.package.settings) then
+    sed -i -e '/^include.*kokkos.*$/d' ../Makefile.package.settings
+    # multiline form needed for BSD sed on Macs
+    sed -i -e '4 i \
+include ..\/..\/lib\/kokkos\/Makefile.lammps
+' ../Makefile.package.settings
+
+  fi
+
+elif (test $1 = 0) then
+
+  if (test -e ../Makefile.package) then
+    sed -i -e 's/[^ \t]*kokkos[^ \t]* //g' ../Makefile.package
+    sed -i -e 's/[^ \t]*KOKKOS[^ \t]* //g' ../Makefile.package
+  fi
+
+  if (test -e ../Makefile.package.settings) then
+    sed -i -e '/^include.*kokkos.*$/d' ../Makefile.package.settings
+  fi
+
+fi
diff --git a/src/KOKKOS/atom_kokkos.cpp b/src/KOKKOS/atom_kokkos.cpp
new file mode 100644
index 0000000000..e36a5a926c
--- /dev/null
+++ b/src/KOKKOS/atom_kokkos.cpp
@@ -0,0 +1,190 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National
Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include "atom_kokkos.h"
+#include "atom_vec.h"
+#include "atom_vec_kokkos.h"
+#include "comm_kokkos.h"
+#include "update.h"
+#include "domain.h"
+#include "atom_masks.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ----------------------------------------------------------------------
+   construct on top of Atom and hand this object to CommKokkos
+   comm is cast to CommKokkos and its atomKK member is pointed at this
+------------------------------------------------------------------------- */
+
+AtomKokkos::AtomKokkos(LAMMPS *lmp) : Atom(lmp)
+{
+  // set CommKokkos pointer to Atom class, since CommKokkos allocated first
+
+  ((CommKokkos *) comm)->atomKK = this;
+}
+
+/* ----------------------------------------------------------------------
+   reset every dual view to a default-constructed view of its own type,
+   NULL the raw per-atom and per-type pointers, then pass x, v and f to
+   memory->sfree() and NULL them as well
+   NOTE(review): the NULL assignments presumably stop the Atom base
+   destructor from freeing storage owned by the views - confirm in Atom
+------------------------------------------------------------------------- */
+
+AtomKokkos::~AtomKokkos()
+{
+  k_tag = DAT::tdual_int_1d();
+  k_mask = DAT::tdual_int_1d();
+  k_type = DAT::tdual_int_1d();
+  k_image = DAT::tdual_tagint_1d();  // k_image is declared as tdual_tagint_1d in atom_kokkos.h
+  k_molecule = DAT::tdual_int_1d();
+
+  k_x = DAT::tdual_x_array();
+  k_v = DAT::tdual_v_array();
+  k_f = DAT::tdual_f_array();
+
+  k_mass = DAT::tdual_float_1d();
+
+  tag = NULL;
+  mask = NULL;
+  type = NULL;
+  image = NULL;
+  molecule = NULL;
+  mass = NULL;
+
+  memory->sfree(x);
+  memory->sfree(v);
+  memory->sfree(f);
+  x = NULL;
+  v = NULL;
+  f = NULL;
+}
+
+/* ----------------------------------------------------------------------
+   forward a sync request for the fields in mask to the atom style
+   avec is cast to AtomVecKokkos without a runtime check
+------------------------------------------------------------------------- */
+
+void AtomKokkos::sync(const ExecutionSpace space, unsigned int mask)
+{
+  ((AtomVecKokkos *) avec)->sync(space,mask);
+}
+
+/* ----------------------------------------------------------------------
+   forward a modified notification for the fields in mask to the atom style
+   avec is cast to AtomVecKokkos without a runtime check
+------------------------------------------------------------------------- */
+
+void AtomKokkos::modified(const ExecutionSpace space, unsigned int mask)
+{
+  ((AtomVecKokkos *) avec)->modified(space,mask);
+}
+
+/*
----------------------------------------------------------------------
+   allocate the per-type arrays when avec->mass_type is set
+   k_mass gets ntypes+1 entries and mass is pointed at its host view
+   mass_setflag entries 1..ntypes are zeroed, entry 0 is not written
+------------------------------------------------------------------------- */
+
+void AtomKokkos::allocate_type_arrays()
+{
+  if (avec->mass_type) {
+    k_mass = DAT::tdual_float_1d("Mass",ntypes+1);
+    mass = k_mass.h_view.ptr_on_device();
+    mass_setflag = new int[ntypes+1];
+    for (int itype = 1; itype <= ntypes; itype++) mass_setflag[itype] = 0;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   spatially sort the local atoms into bins, working on host data
+   all per-atom data is synced to the host and flagged as modified there
+   before anything else, i.e. also when the single-bin early return is hit
+   atoms are reordered in place through avec->copy(), one permutation
+   cycle at a time, with slot nlocal used as the temporary location
+------------------------------------------------------------------------- */
+
+void AtomKokkos::sort()
+{
+  int i,m,n,ix,iy,iz,ibin,empty;
+
+  sync(Host,ALL_MASK);
+  modified(Host,ALL_MASK);
+
+  // set next timestep for sorting to take place
+
+  nextsort = (update->ntimestep/sortfreq)*sortfreq + sortfreq;
+
+  // re-setup sort bins if needed
+
+  if (domain->box_change) setup_sort_bins();
+  if (nbins == 1) return;
+
+  // reallocate per-atom vectors if needed
+
+  if (nlocal > maxnext) {
+    memory->destroy(next);
+    memory->destroy(permute);
+    maxnext = atom->nmax;
+    memory->create(next,maxnext,"atom:next");
+    memory->create(permute,maxnext,"atom:permute");
+  }
+
+  // insure there is one extra atom location at end of arrays for swaps
+
+  if (nlocal == nmax) avec->grow(0);
+
+  // bin atoms in reverse order so linked list will be in forward order
+
+  for (i = 0; i < nbins; i++) binhead[i] = -1;
+
+  HAT::t_x_array_const h_x = k_x.view<LMPHostType>();  // fetched after the possible grow() above
+  for (i = nlocal-1; i >= 0; i--) {
+    ix = static_cast<int> ((h_x(i,0)-bboxlo[0])*bininvx);
+    iy = static_cast<int> ((h_x(i,1)-bboxlo[1])*bininvy);
+    iz = static_cast<int> ((h_x(i,2)-bboxlo[2])*bininvz);
+    ix = MAX(ix,0);  // clamp bin indices into [0,nbin-1] in each dimension
+    iy = MAX(iy,0);
+    iz = MAX(iz,0);
+    ix = MIN(ix,nbinx-1);
+    iy = MIN(iy,nbiny-1);
+    iz = MIN(iz,nbinz-1);
+    ibin = iz*nbiny*nbinx + iy*nbinx + ix;
+    next[i] = binhead[ibin];
+    binhead[ibin] = i;
+  }
+
+  // permute = desired permutation of atoms
+  // permute[I] = J means Ith new atom will be Jth old atom
+
+  n = 0;
+  for (m = 0; m < nbins; m++) {
+    i = binhead[m];
+    while (i >= 0) {
+      permute[n++] = i;
+      i = next[i];
+    }
+  }
+
+  // current = current permutation, just reuse next vector
+  // current[I] = J means Ith current atom is Jth old atom
+
+  int *current = next;
+  for (i = 0; i < nlocal; i++) current[i] = i;
+
+  // reorder local atom list, when done, current = permute
+  // perform "in place" using copy() to extra atom location at end of list
+  // inner while loop processes one cycle of the permutation
+  // copy before inner-loop moves an atom to end of atom list
+  // copy after inner-loop moves atom at end of list back into list
+  // empty = location in atom list that is currently empty
+
+  for (i = 0; i < nlocal; i++) {
+    if (current[i] == permute[i]) continue;
+    avec->copy(i,nlocal,0);
+    empty = i;
+    while (permute[empty] != i) {
+      avec->copy(permute[empty],empty,0);
+      empty = current[empty] = permute[empty];
+    }
+    avec->copy(nlocal,empty,0);
+    current[empty] = permute[empty];
+  }
+
+  // sanity check that current = permute
+
+  //int flag = 0;
+  //for (i = 0; i < nlocal; i++)
+  //  if (current[i] != permute[i]) flag = 1;
+  //int flagall;
+  //MPI_Allreduce(&flag,&flagall,1,MPI_INT,MPI_SUM,world);
+  //if (flagall) error->all(FLERR,"Atom sort did not operate correctly");
+}
diff --git a/src/KOKKOS/atom_kokkos.h b/src/KOKKOS/atom_kokkos.h
new file mode 100644
index 0000000000..594bf80e5f
--- /dev/null
+++ b/src/KOKKOS/atom_kokkos.h
@@ -0,0 +1,86 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "atom.h"
+#include "kokkos_type.h"
+
+#ifndef LMP_ATOM_KOKKOS_H
+#define LMP_ATOM_KOKKOS_H
+
+namespace LAMMPS_NS {
+/* ----------------------------------------------------------------------
+   Atom subclass that keeps the per-atom and per-type data in dual views
+   (k_* members) and adds sync()/modified() to move or flag that data
+   for a given execution space
+------------------------------------------------------------------------- */
+class AtomKokkos : public Atom {
+ public:
+  DAT::tdual_int_1d k_tag, k_type, k_mask, k_molecule;
+  DAT::tdual_tagint_1d k_image;
+  DAT::tdual_x_array k_x;
+  DAT::tdual_v_array k_v;
+  DAT::tdual_f_array k_f;
+
+  DAT::tdual_float_1d k_mass;
+
+  AtomKokkos(class LAMMPS *);
+  ~AtomKokkos();
+
+  virtual void allocate_type_arrays();
+  void sync(const ExecutionSpace space, unsigned int mask);
+  void modified(const ExecutionSpace space, unsigned int mask);
+  virtual void sort();
+};
+/* ----------------------------------------------------------------------
+   functor that gathers source(index(i),...) into a newly allocated dest
+   view with the same extents as source, for views of rank 1 to 4
+   NOTE(review): no access specifier follows "class", so the constructors
+   and the operator() overloads are private as written
+   NOTE(review): only the rank-1 operator() carries KOKKOS_INLINE_FUNCTION
+   and none of the overloads is const
+   NOTE(review): enable_if is applied to parameters of non-template
+   members, so the overloads whose condition is false presumably become
+   hard errors on instantiation - confirm before using this class
+------------------------------------------------------------------------- */
+template<class ViewType, class IndexView>
+class SortFunctor {
+  typedef typename ViewType::device_type device_type;
+  ViewType source;
+  Kokkos::View<typename ViewType::non_const_data_type,typename ViewType::array_type,device_type> dest;
+  IndexView index;
+  SortFunctor(ViewType src, typename Kokkos::Impl::enable_if<ViewType::dynamic_rank==1,IndexView>::type ind):source(src),index(ind){
+    dest = Kokkos::View<typename ViewType::non_const_data_type,typename ViewType::array_type,device_type>("",src.dimension_0());
+  }
+  SortFunctor(ViewType src, typename Kokkos::Impl::enable_if<ViewType::dynamic_rank==2,IndexView>::type ind):source(src),index(ind){
+    dest = Kokkos::View<typename ViewType::non_const_data_type,typename ViewType::array_type,device_type>("",src.dimension_0(),src.dimension_1());
+  }
+  SortFunctor(ViewType src, typename Kokkos::Impl::enable_if<ViewType::dynamic_rank==3,IndexView>::type ind):source(src),index(ind){
+    dest = Kokkos::View<typename ViewType::non_const_data_type,typename ViewType::array_type,device_type>("",src.dimension_0(),src.dimension_1(),src.dimension_2());
+  }
+  SortFunctor(ViewType src, typename Kokkos::Impl::enable_if<ViewType::dynamic_rank==4,IndexView>::type ind):source(src),index(ind){
+    dest = Kokkos::View<typename ViewType::non_const_data_type,typename ViewType::array_type,device_type>("",src.dimension_0(),src.dimension_1(),src.dimension_2(),src.dimension_3());
+  }
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const typename Kokkos::Impl::enable_if<ViewType::rank==1, int>::type& i) {
+    dest(i) = source(index(i));
+  }
+  void operator()(const typename Kokkos::Impl::enable_if<ViewType::rank==2, int>::type& i) {
+    for(int j=0;j<source.dimension_1();j++)
+      dest(i,j) = source(index(i),j);
+  }
+  void operator()(const typename Kokkos::Impl::enable_if<ViewType::rank==3, int>::type& i) {
+    for(int j=0;j<source.dimension_1();j++)
+    for(int k=0;k<source.dimension_2();k++)
+      dest(i,j,k) = source(index(i),j,k);
+  }
+  void operator()(const typename Kokkos::Impl::enable_if<ViewType::rank==4, int>::type& i) {
+    for(int j=0;j<source.dimension_1();j++)
+    for(int k=0;k<source.dimension_2();k++)
+    for(int l=0;l<source.dimension_3();l++)
+      dest(i,j,k,l) = source(index(i),j,k,l);
+  }
+};
+
+}
+
+#endif
+
+/* ERROR/WARNING messages:
+
+*/
diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.cpp b/src/KOKKOS/atom_vec_atomic_kokkos.cpp
new file mode 100644
index 0000000000..1db293cd44
--- /dev/null
+++ b/src/KOKKOS/atom_vec_atomic_kokkos.cpp
@@ -0,0 +1,1371 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "stdlib.h"
+#include "atom_vec_atomic_kokkos.h"
+#include "atom_kokkos.h"
+#include "comm_kokkos.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "atom_masks.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define DELTA 10000
+
+/* ----------------------------------------------------------------------
+   set the flags and per-atom communication sizes of this atom style,
+   allocate the one-element k_count view and cache atom and comm cast
+   to their Kokkos classes
+------------------------------------------------------------------------- */
+
+AtomVecAtomicKokkos::AtomVecAtomicKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)
+{
+  molecular = 0;
+  mass_type = 1;
+
+  comm_x_only = comm_f_only = 1;
+  size_forward = 3;
+  size_reverse = 3;
+  size_border = 6;
+  size_velocity = 3;
+  size_data_atom = 5;
+  size_data_vel = 4;
+  xcol_data = 3;
+
+  k_count = DAT::tdual_int_1d("atom::k_count",1);
+  atomKK = (AtomKokkos *) atom;
+  commKK = (CommKokkos *) comm;
+}
+
+/* ----------------------------------------------------------------------
+   grow atom arrays
+   n = 0 grows arrays by DELTA
+   n > 0 allocates arrays to size n
+   data is synced to and flagged as modified on the device before the
+   views are grown, then the local pointers are refreshed by grow_reset()
+   and the data is synced back to the host
+   fixes listed in atom->extra_grow get grow_arrays(nmax) called last
+------------------------------------------------------------------------- */
+
+void AtomVecAtomicKokkos::grow(int n)
+{
+  if (n == 0) nmax += DELTA;
+  else nmax = n;
+  atomKK->nmax = nmax;
+  if (nmax < 0 || nmax > MAXSMALLINT)
+    error->one(FLERR,"Per-processor system is too big");
+
+  sync(Device,ALL_MASK);
+  modified(Device,ALL_MASK);
+
+  memory->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
+  memory->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
+  memory->grow_kokkos(atomKK->k_mask,atomKK->mask,nmax,"atom:mask");
+  memory->grow_kokkos(atomKK->k_image,atomKK->image,nmax,"atom:image");
+
+  memory->grow_kokkos(atomKK->k_x,atomKK->x,nmax,3,"atom:x");
+  memory->grow_kokkos(atomKK->k_v,atomKK->v,nmax,3,"atom:v");
+  memory->grow_kokkos(atomKK->k_f,atomKK->f,nmax,3,"atom:f");
+
+  grow_reset();
+  sync(Host,ALL_MASK);
+
+  if (atom->nextra_grow)
+    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+      modify->fix[atom->extra_grow[iextra]]->grow_arrays(nmax);
+}
+
+/* ----------------------------------------------------------------------
+   reset local array ptrs
+   for each field the raw pointer plus the device (d_) and host (h_)
+   views are re-read from atomKK
+------------------------------------------------------------------------- */
+
+void AtomVecAtomicKokkos::grow_reset()
+{
+  tag = atomKK->tag;
+  d_tag = atomKK->k_tag.d_view;
+  h_tag = atomKK->k_tag.h_view;
+
+  type = atomKK->type;
+  d_type = atomKK->k_type.d_view;
+  h_type = atomKK->k_type.h_view;
+  mask = atomKK->mask;
+  d_mask = atomKK->k_mask.d_view;
+  h_mask = atomKK->k_mask.h_view;
+  image = atomKK->image;
+  d_image = atomKK->k_image.d_view;
+  h_image = atomKK->k_image.h_view;
+
+  x = atomKK->x;
+  d_x = atomKK->k_x.d_view;
+  h_x = atomKK->k_x.h_view;
+  v = atomKK->v;
+  d_v = atomKK->k_v.d_view;
+  h_v = atomKK->k_v.h_view;
+  f = atomKK->f;
+  d_f = atomKK->k_f.d_view;
+  h_f = atomKK->k_f.h_view;
+}
+
+/* ----------------------------------------------------------------------
+   copy atom I info to atom J
+   copies tag, type, mask, image, x and v on the host, f is not copied
+   fixes listed in atom->extra_grow get copy_arrays(i,j,delflag) called
+------------------------------------------------------------------------- */
+
+void AtomVecAtomicKokkos::copy(int i, int j, int delflag)
+{
+  h_tag[j] = h_tag[i];
+  h_type[j] = h_type[i];
+  mask[j] = mask[i];  // raw pointer here, host views for the other fields
+  h_image[j] = h_image[i];
+  h_x(j,0) = h_x(i,0);
+  h_x(j,1) = h_x(i,1);
+  h_x(j,2) = h_x(i,2);
+  h_v(j,0) = h_v(i,0);
+  h_v(j,1) = h_v(i,1);
+  h_v(j,2) = h_v(i,2);
+
+  if (atom->nextra_grow)
+    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+      modify->fix[atom->extra_grow[iextra]]->copy_arrays(i,j,delflag);
+}
+
+/* ----------------------------------------------------------------------
+   functor that packs the positions of the atoms in row iswap of the
+   send list into a buffer with 3 values per atom
+   PBC_FLAG != 0 adds the periodic shift, TRICLINIC != 0 includes the
+   tilt factors in that shift
+------------------------------------------------------------------------- */
+
+template<class DeviceType,int PBC_FLAG,int TRICLINIC>
+struct AtomVecAtomicKokkos_PackComm {
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
+  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  const int _iswap;
+  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
+  X_FLOAT _pbc[6];
+
+
AtomVecAtomicKokkos_PackComm(
+      const typename DAT::tdual_x_array &x,
+      const typename DAT::tdual_xfloat_2d &buf,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap,
+      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
+      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
+      _x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
+      _xprd(xprd),_yprd(yprd),_zprd(zprd),
+      _xy(xy),_xz(xz),_yz(yz) {
+        const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;  // total buffer entries / 3 values per atom
+        const size_t elements = 3;
+        buffer_view<DeviceType>(_buf,buf,maxsend,elements);
+        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];  // copied by value into the functor
+        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+        const int j = _list(_iswap,i);
+      if (PBC_FLAG == 0) {
+          _buf(i,0) = _x(j,0);
+          _buf(i,1) = _x(j,1);
+          _buf(i,2) = _x(j,2);
+      } else {
+        if (TRICLINIC == 0) {
+          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
+          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
+          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
+        } else {
+          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
+          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
+          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
+        }
+      }
+  }
+};
+
+/* ----------------------------------------------------------------------
+   pack positions of n atoms from row iswap of list into buf
+   runs on the host when commKK->forward_comm_on_host is set, else on
+   the device; x is synced to that space first and the space is fenced
+   after the kernel
+   pbc_flag and domain->triclinic pick the functor instantiation
+   returns n*size_forward
+------------------------------------------------------------------------- */
+
+int AtomVecAtomicKokkos::pack_comm_kokkos(const int &n,
+                                          const DAT::tdual_int_2d &list,
+                                          const int & iswap,
+                                          const DAT::tdual_xfloat_2d &buf,
+                                          const int &pbc_flag,
+                                          const int* const pbc)
+{
+  // Check whether to always run forward communication on the host
+  // Choose correct forward PackComm kernel
+
+  if(commKK->forward_comm_on_host) {
+    sync(Host,X_MASK);
+    if(pbc_flag) {
+      if(domain->triclinic) {
+        struct AtomVecAtomicKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecAtomicKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+        struct AtomVecAtomicKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecAtomicKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    }
+    LMPHostType::fence();
+  } else {
+    sync(Device,X_MASK);
+    if(pbc_flag) {
+      if(domain->triclinic) {
+        struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+        struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    }
+    LMPDeviceType::fence();
+  }
+
+  return n*size_forward;
+}
+
+/* ----------------------------------------------------------------------
+   functor for a swap with the own process: writes the (optionally
+   shifted) positions of the listed atoms straight into x starting at
+   index nfirst instead of going through a buffer
+------------------------------------------------------------------------- */
+
+template<class DeviceType,int PBC_FLAG,int TRICLINIC>
+struct AtomVecAtomicKokkos_PackCommSelf {
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
+  typename ArrayTypes<DeviceType>::t_x_array
_xw;
+  int _nfirst;
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  const int _iswap;
+  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
+  X_FLOAT _pbc[6];
+
+  AtomVecAtomicKokkos_PackCommSelf(
+      const typename DAT::tdual_x_array &x,
+      const int &nfirst,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap,
+      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
+      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
+      _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),  // _x and _xw are two views of the same x data
+      _xprd(xprd),_yprd(yprd),_zprd(zprd),
+      _xy(xy),_xz(xz),_yz(yz) {
+        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
+        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+        const int j = _list(_iswap,i);
+      if (PBC_FLAG == 0) {
+          _xw(i+_nfirst,0) = _x(j,0);
+          _xw(i+_nfirst,1) = _x(j,1);
+          _xw(i+_nfirst,2) = _x(j,2);
+      } else {
+        if (TRICLINIC == 0) {
+          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
+          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
+          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
+        } else {
+          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
+          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
+          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
+        }
+      }
+
+  }
+};
+
+/* ----------------------------------------------------------------------
+   copy positions of n atoms from row iswap of list to x[nfirst...],
+   without an intermediate buffer
+   runs on the host when commKK->forward_comm_on_host is set, else on
+   the device; x is synced to and flagged as modified in that space and
+   the space is fenced after the kernel
+   returns n*3
+------------------------------------------------------------------------- */
+
+int AtomVecAtomicKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
+                                        const int nfirst, const int &pbc_flag, const int* const pbc) {
+  if(commKK->forward_comm_on_host) {
+    sync(Host,X_MASK);
+    modified(Host,X_MASK);
+    if(pbc_flag) {
+      if(domain->triclinic) {
+      struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      } else {
+      struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+      struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      } else {
+      struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      }
+    }
+    LMPHostType::fence();
+  } else {
+    sync(Device,X_MASK);
+    modified(Device,X_MASK);
+    if(pbc_flag) {
+      if(domain->triclinic) {
+      struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      } else {
+      struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+      struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      } else {
+      struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      }
+    }
+    LMPDeviceType::fence();
+  }
+    return n*3;  // literal 3 here, pack_comm_kokkos returns n*size_forward
+}
+
+/* ----------------------------------------------------------------------
+   functor that copies 3 values per atom from a receive buffer into x,
+   starting at atom index first
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecAtomicKokkos_UnpackComm {
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_x_array _x;
+  typename
ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
+  int _first;
+
+  AtomVecAtomicKokkos_UnpackComm(
+      const typename DAT::tdual_x_array &x,
+      const typename DAT::tdual_xfloat_2d &buf,
+      const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
+                        _first(first) {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+      _x(i+_first,0) = _buf(i,0);
+      _x(i+_first,1) = _buf(i,1);
+      _x(i+_first,2) = _buf(i,2);
+  }
+};
+
+/* ----------------------------------------------------------------------
+   unpack positions of n atoms from buf into x starting at atom first
+   runs on the host when commKK->forward_comm_on_host is set, else on
+   the device; x is synced to and flagged as modified in that space
+   the execution space that ran the kernel is fenced afterwards, the
+   same way pack_comm_kokkos() and pack_comm_self() do it
+------------------------------------------------------------------------- */
+
+void AtomVecAtomicKokkos::unpack_comm_kokkos(const int &n, const int &first,
+    const DAT::tdual_xfloat_2d &buf ) {
+  if(commKK->forward_comm_on_host) {
+    sync(Host,X_MASK);
+    modified(Host,X_MASK);
+    struct AtomVecAtomicKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
+    Kokkos::parallel_for(n,f);
+    LMPHostType::fence();  // kernel ran in the host space, so fence that space
+  } else {
+    sync(Device,X_MASK);
+    modified(Device,X_MASK);
+    struct AtomVecAtomicKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
+    Kokkos::parallel_for(n,f);
+    LMPDeviceType::fence();
+  }
+}
+
+/* ----------------------------------------------------------------------
+   host version: pack positions of the n atoms in list into buf,
+   3 values per atom, adding the periodic shift when pbc_flag is set
+   returns the number of values written
+------------------------------------------------------------------------- */
+
+int AtomVecAtomicKokkos::pack_comm(int n, int *list, double *buf,
+                             int pbc_flag, int *pbc)
+{
+  int i,j,m;
+  double dx,dy,dz;
+
+  m = 0;
+  if (pbc_flag == 0) {
+    for (i = 0; i < n; i++) {
+      j = list[i];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+    }
+  } else {
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+      dz = pbc[2]*domain->zprd;
+    }
+    for (i = 0; i < n; i++) {
+      j = list[i];
+      buf[m++] = h_x(j,0) + dx;
+      buf[m++] = h_x(j,1) + dy;
+      buf[m++] = h_x(j,2) + dz;
+    }
+  }
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   host version: pack positions and velocities of the n atoms in list
+   into buf, 6 values per atom
+   positions get the periodic shift when pbc_flag is set
+   with deform_vremap set, velocities of atoms whose mask matches
+   deform_groupbit are additionally shifted by the h_rate terms
+   returns the number of values written
+------------------------------------------------------------------------- */
+
+int AtomVecAtomicKokkos::pack_comm_vel(int n, int *list, double *buf,
+                                 int pbc_flag, int *pbc)
+{
+  int i,j,m;
+  double dx,dy,dz,dvx,dvy,dvz;
+
+  m = 0;
+  if (pbc_flag == 0) {
+    for (i = 0; i < n; i++) {
+      j = list[i];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+      buf[m++] = h_v(j,0);
+      buf[m++] = h_v(j,1);
+      buf[m++] = h_v(j,2);
+    }
+  } else {
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+      dz = pbc[2]*domain->zprd;
+    }
+    if (!deform_vremap) {
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = h_v(j,0);
+        buf[m++] = h_v(j,1);
+        buf[m++] = h_v(j,2);
+      }
+    } else {
+      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
+      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
+      dvz = pbc[2]*h_rate[2];
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        if (mask[i] & deform_groupbit) {  // NOTE(review): tests mask[i] while x and v are read at j = list[i] - confirm which index is intended
+          buf[m++] = h_v(j,0) + dvx;
+          buf[m++] = h_v(j,1) + dvy;
+          buf[m++] = h_v(j,2) + dvz;
+        } else {
+          buf[m++] = h_v(j,0);
+          buf[m++] = h_v(j,1);
+          buf[m++] = h_v(j,2);
+        }
+      }
+    }
+  }
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   host version: unpack 3 position values per atom from buf into
+   atoms first .. first+n-1
+------------------------------------------------------------------------- */
+
+void AtomVecAtomicKokkos::unpack_comm(int n, int first, double *buf)
+{
+  int i,m,last;
+
+  m = 0;
+  last = first + n;
+  for (i = first; i < last; i++) {
+    h_x(i,0) = buf[m++];
+    h_x(i,1) = buf[m++];
+    h_x(i,2) = buf[m++];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   host version: unpack 3 position and 3 velocity values per atom from
+   buf into atoms first .. first+n-1
+------------------------------------------------------------------------- */
+
+void AtomVecAtomicKokkos::unpack_comm_vel(int n, int first, double *buf)
+{
+  int i,m,last;
+
+  m = 0;
+  last = first + n;
+  for (i = first; i < last; i++) {
+    h_x(i,0) = buf[m++];
+    h_x(i,1) = buf[m++];
+    h_x(i,2) = buf[m++];
+    h_v(i,0) = buf[m++];
+    h_v(i,1) = buf[m++];
+
h_v(i,2) = buf[m++]; + } +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::pack_reverse(int n, int first, double *buf) +{ + int i,m,last; + + m = 0; + last = first + n; + for (i = first; i < last; i++) { + buf[m++] = f[i][0]; + buf[m++] = f[i][1]; + buf[m++] = f[i][2]; + } + return m; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::unpack_reverse(int n, int *list, double *buf) +{ + int i,j,m; + + m = 0; + for (i = 0; i < n; i++) { + j = list[i]; + f[j][0] += buf[m++]; + f[j][1] += buf[m++]; + f[j][2] += buf[m++]; + } +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType,int PBC_FLAG> +struct AtomVecAtomicKokkos_PackBorder { + typedef DeviceType device_type; + + typename ArrayTypes<DeviceType>::t_xfloat_2d _buf; + const typename ArrayTypes<DeviceType>::t_int_2d_const _list; + const typename ArrayTypes<DeviceType>::t_x_array_randomread _x; + const typename ArrayTypes<DeviceType>::t_tagint_1d _tag; + const typename ArrayTypes<DeviceType>::t_int_1d _type; + const typename ArrayTypes<DeviceType>::t_int_1d _mask; + const int _iswap; + X_FLOAT _dx,_dy,_dz; + + AtomVecAtomicKokkos_PackBorder( + const typename ArrayTypes<DeviceType>::t_xfloat_2d &buf, + const typename ArrayTypes<DeviceType>::t_int_2d_const &list, + const int & iswap, + const typename ArrayTypes<DeviceType>::t_x_array &x, + const typename ArrayTypes<DeviceType>::t_tagint_1d &tag, + const typename ArrayTypes<DeviceType>::t_int_1d &type, + const typename ArrayTypes<DeviceType>::t_int_1d &mask, + const X_FLOAT &dx, const X_FLOAT &dy, const X_FLOAT &dz): + _buf(buf),_list(list),_iswap(iswap), + _x(x),_tag(tag),_type(type),_mask(mask), + _dx(dx),_dy(dy),_dz(dz) {} + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + const int j = _list(_iswap,i); + if (PBC_FLAG == 0) { + _buf(i,0) = _x(j,0); + _buf(i,1) = _x(j,1); + 
_buf(i,2) = _x(j,2); + _buf(i,3) = _tag(j); + _buf(i,4) = _type(j); + _buf(i,5) = _mask(j); + } else { + _buf(i,0) = _x(j,0) + _dx; + _buf(i,1) = _x(j,1) + _dy; + _buf(i,2) = _x(j,2) + _dz; + _buf(i,3) = _tag(j); + _buf(i,4) = _type(j); + _buf(i,5) = _mask(j); + } + } +}; + +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, + int pbc_flag, int *pbc, ExecutionSpace space) +{ + X_FLOAT dx,dy,dz; + + if (pbc_flag != 0) { + if (domain->triclinic == 0) { + dx = pbc[0]*domain->xprd; + dy = pbc[1]*domain->yprd; + dz = pbc[2]*domain->zprd; + } else { + dx = pbc[0]; + dy = pbc[1]; + dz = pbc[2]; + } + if(space==Host) { + AtomVecAtomicKokkos_PackBorder<LMPHostType,1> f( + buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(), + iswap,h_x,h_tag,h_type,h_mask,dx,dy,dz); + Kokkos::parallel_for(n,f); + LMPHostType::fence(); + } else { + AtomVecAtomicKokkos_PackBorder<LMPDeviceType,1> f( + buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(), + iswap,d_x,d_tag,d_type,d_mask,dx,dy,dz); + Kokkos::parallel_for(n,f); + LMPDeviceType::fence(); + } + + } else { + dx = dy = dz = 0; + if(space==Host) { + AtomVecAtomicKokkos_PackBorder<LMPHostType,0> f( + buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(), + iswap,h_x,h_tag,h_type,h_mask,dx,dy,dz); + Kokkos::parallel_for(n,f); + LMPHostType::fence(); + } else { + AtomVecAtomicKokkos_PackBorder<LMPDeviceType,0> f( + buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(), + iswap,d_x,d_tag,d_type,d_mask,dx,dy,dz); + Kokkos::parallel_for(n,f); + LMPDeviceType::fence(); + } + } + return n*6; +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::pack_border(int n, int *list, double *buf, + int pbc_flag, int *pbc) +{ + int i,j,m; + double dx,dy,dz; + + m = 0; + if (pbc_flag == 0) { + for (i = 0; i < n; i++) { + j = list[i]; + 
buf[m++] = h_x(j,0); + buf[m++] = h_x(j,1); + buf[m++] = h_x(j,2); + buf[m++] = h_tag[j]; + buf[m++] = h_type[j]; + buf[m++] = h_mask[j]; + } + } else { + if (domain->triclinic == 0) { + dx = pbc[0]*domain->xprd; + dy = pbc[1]*domain->yprd; + dz = pbc[2]*domain->zprd; + } else { + dx = pbc[0]; + dy = pbc[1]; + dz = pbc[2]; + } + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0) + dx; + buf[m++] = h_x(j,1) + dy; + buf[m++] = h_x(j,2) + dz; + buf[m++] = h_tag[j]; + buf[m++] = h_type[j]; + buf[m++] = h_mask[j]; + } + } + return m; +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::pack_border_vel(int n, int *list, double *buf, + int pbc_flag, int *pbc) +{ + int i,j,m; + double dx,dy,dz,dvx,dvy,dvz; + + m = 0; + if (pbc_flag == 0) { + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0); + buf[m++] = h_x(j,1); + buf[m++] = h_x(j,2); + buf[m++] = h_tag[j]; + buf[m++] = h_type[j]; + buf[m++] = h_mask[j]; + buf[m++] = h_v(j,0); + buf[m++] = h_v(j,1); + buf[m++] = h_v(j,2); + } + } else { + if (domain->triclinic == 0) { + dx = pbc[0]*domain->xprd; + dy = pbc[1]*domain->yprd; + dz = pbc[2]*domain->zprd; + } else { + dx = pbc[0]; + dy = pbc[1]; + dz = pbc[2]; + } + if (!deform_vremap) { + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0) + dx; + buf[m++] = h_x(j,1) + dy; + buf[m++] = h_x(j,2) + dz; + buf[m++] = h_tag[j]; + buf[m++] = h_type[j]; + buf[m++] = h_mask[j]; + buf[m++] = h_v(j,0); + buf[m++] = h_v(j,1); + buf[m++] = h_v(j,2); + } + } else { + dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4]; + dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3]; + dvz = pbc[2]*h_rate[2]; + for (i = 0; i < n; i++) { + j = list[i]; + buf[m++] = h_x(j,0) + dx; + buf[m++] = h_x(j,1) + dy; + buf[m++] = h_x(j,2) + dz; + buf[m++] = h_tag[j]; + buf[m++] = h_type[j]; + buf[m++] = h_mask[j]; + if (mask[i] & deform_groupbit) { + buf[m++] = h_v(j,0) + dvx; + buf[m++] = h_v(j,1) + dvy; + buf[m++] = 
h_v(j,2) + dvz; + } else { + buf[m++] = h_v(j,0); + buf[m++] = h_v(j,1); + buf[m++] = h_v(j,2); + } + } + } + } + return m; +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +struct AtomVecAtomicKokkos_UnpackBorder { + typedef DeviceType device_type; + + const typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf; + typename ArrayTypes<DeviceType>::t_x_array _x; + typename ArrayTypes<DeviceType>::t_tagint_1d _tag; + typename ArrayTypes<DeviceType>::t_int_1d _type; + typename ArrayTypes<DeviceType>::t_int_1d _mask; + int _first; + + + AtomVecAtomicKokkos_UnpackBorder( + const typename ArrayTypes<DeviceType>::t_xfloat_2d_const &buf, + typename ArrayTypes<DeviceType>::t_x_array &x, + typename ArrayTypes<DeviceType>::t_tagint_1d &tag, + typename ArrayTypes<DeviceType>::t_int_1d &type, + typename ArrayTypes<DeviceType>::t_int_1d &mask, + const int& first): + _buf(buf),_x(x),_tag(tag),_type(type),_mask(mask),_first(first){ + }; + + KOKKOS_INLINE_FUNCTION + void operator() (const int& i) const { + _x(i+_first,0) = _buf(i,0); + _x(i+_first,1) = _buf(i,1); + _x(i+_first,2) = _buf(i,2); + _tag(i+_first) = static_cast<int> (_buf(i,3)); + _type(i+_first) = static_cast<int> (_buf(i,4)); + _mask(i+_first) = static_cast<int> (_buf(i,5)); +// printf("%i %i %lf %lf %lf %i BORDER\n",_tag(i+_first),i+_first,_x(i+_first,0),_x(i+_first,1),_x(i+_first,2),_type(i+_first)); + } +}; + +/* ---------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::unpack_border_kokkos(const int &n, const int &first, + const DAT::tdual_xfloat_2d &buf,ExecutionSpace space) { + modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK); + while (first+n >= nmax) grow(0); + modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK); + if(space==Host) { + struct AtomVecAtomicKokkos_UnpackBorder<LMPHostType> f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,first); + Kokkos::parallel_for(n,f); + LMPHostType::fence(); + 
} else { + struct AtomVecAtomicKokkos_UnpackBorder<LMPDeviceType> f(buf.view<LMPDeviceType>(),d_x,d_tag,d_type,d_mask,first); + Kokkos::parallel_for(n,f); + LMPDeviceType::fence(); + } +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::unpack_border(int n, int first, double *buf) +{ + int i,m,last; + + m = 0; + last = first + n; + for (i = first; i < last; i++) { + if (i == nmax) grow(0); + modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK); + h_x(i,0) = buf[m++]; + h_x(i,1) = buf[m++]; + h_x(i,2) = buf[m++]; + h_tag[i] = static_cast<int> (buf[m++]); + h_type[i] = static_cast<int> (buf[m++]); + h_mask[i] = static_cast<int> (buf[m++]); + } +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::unpack_border_vel(int n, int first, double *buf) +{ + int i,m,last; + + m = 0; + last = first + n; + for (i = first; i < last; i++) { + if (i == nmax) grow(0); + h_x(i,0) = buf[m++]; + h_x(i,1) = buf[m++]; + h_x(i,2) = buf[m++]; + h_tag[i] = static_cast<int> (buf[m++]); + h_type[i] = static_cast<int> (buf[m++]); + h_mask[i] = static_cast<int> (buf[m++]); + h_v(i,0) = buf[m++]; + h_v(i,1) = buf[m++]; + h_v(i,2) = buf[m++]; + } +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +struct AtomVecAtomicKokkos_PackExchangeFunctor { + typedef DeviceType device_type; + typedef ArrayTypes<DeviceType> AT; + X_FLOAT _lo,_hi; + typename AT::t_x_array_randomread _x; + typename AT::t_v_array_randomread _v; + typename AT::t_tagint_1d_randomread _tag; + typename AT::t_int_1d_randomread _type; + typename AT::t_int_1d_randomread _mask; + typename AT::t_int_1d_randomread _image; + typename AT::t_x_array _xw; + typename AT::t_v_array _vw; + typename AT::t_tagint_1d _tagw; + typename AT::t_int_1d _typew; + typename AT::t_int_1d _maskw; + typename AT::t_int_1d _imagew; + + typename AT::t_xfloat_2d_um _buf; + int 
_nlocal,_dim; + typename AT::t_int_1d_const _sendlist; + typename AT::t_int_1d_const _copylist; + + AtomVecAtomicKokkos_PackExchangeFunctor( + const AtomKokkos* atom, + const typename AT::tdual_xfloat_2d buf, + typename AT::tdual_int_1d sendlist, + typename AT::tdual_int_1d copylist,int nlocal, int dim, + X_FLOAT lo, X_FLOAT hi): + _x(atom->k_x.view<DeviceType>()), + _v(atom->k_v.view<DeviceType>()), + _tag(atom->k_tag.view<DeviceType>()), + _type(atom->k_type.view<DeviceType>()), + _mask(atom->k_mask.view<DeviceType>()), + _image(atom->k_image.view<DeviceType>()), + _xw(atom->k_x.view<DeviceType>()), + _vw(atom->k_v.view<DeviceType>()), + _tagw(atom->k_tag.view<DeviceType>()), + _typew(atom->k_type.view<DeviceType>()), + _maskw(atom->k_mask.view<DeviceType>()), + _imagew(atom->k_image.view<DeviceType>()), + _sendlist(sendlist.template view<DeviceType>()), + _copylist(copylist.template view<DeviceType>()), + _nlocal(nlocal),_dim(dim), + _lo(lo),_hi(hi){ + const size_t elements = 11; + const int maxsendlist = (buf.template view<DeviceType>().dimension_0()*buf.template view<DeviceType>().dimension_1())/elements; + + buffer_view<DeviceType>(_buf,buf,maxsendlist,elements); + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &mysend) const { + const int i = _sendlist(mysend); + _buf(mysend,0) = 11; + _buf(mysend,1) = _x(i,0); + _buf(mysend,2) = _x(i,1); + _buf(mysend,3) = _x(i,2); + _buf(mysend,4) = _v(i,0); + _buf(mysend,5) = _v(i,1); + _buf(mysend,6) = _v(i,2); + _buf(mysend,7) = _tag[i]; + _buf(mysend,8) = _type[i]; + _buf(mysend,9) = _mask[i]; + _buf(mysend,10) = _image[i]; + const int j = _copylist(mysend); + + if(j>-1) { + _xw(i,0) = _x(j,0); + _xw(i,1) = _x(j,1); + _xw(i,2) = _x(j,2); + _vw(i,0) = _v(j,0); + _vw(i,1) = _v(j,1); + _vw(i,2) = _v(j,2); + _tagw[i] = _tag(j); + _typew[i] = _type(j); + _maskw[i] = _mask(j); + _imagew[i] = _image(j); + } + } +}; + +/* ---------------------------------------------------------------------- */ + +int 
AtomVecAtomicKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &k_buf, DAT::tdual_int_1d k_sendlist,DAT::tdual_int_1d k_copylist,ExecutionSpace space,int dim,X_FLOAT lo,X_FLOAT hi ) +{ + if(nsend > (k_buf.view<LMPHostType>().dimension_0()*k_buf.view<LMPHostType>().dimension_1())/11) { + int newsize = nsend*11/k_buf.view<LMPHostType>().dimension_1()+1; + k_buf.resize(newsize,k_buf.view<LMPHostType>().dimension_1()); + } + if(space == Host) { + AtomVecAtomicKokkos_PackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi); + Kokkos::parallel_for(nsend,f); + LMPHostType::fence(); + return nsend*11; + } else { + AtomVecAtomicKokkos_PackExchangeFunctor<LMPDeviceType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi); + Kokkos::parallel_for(nsend,f); + LMPDeviceType::fence(); + return nsend*11; + } +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::pack_exchange(int i, double *buf) +{ + int m = 1; + buf[m++] = h_x(i,0); + buf[m++] = h_x(i,1); + buf[m++] = h_x(i,2); + buf[m++] = h_v(i,0); + buf[m++] = h_v(i,1); + buf[m++] = h_v(i,2); + buf[m++] = h_tag[i]; + buf[m++] = h_type[i]; + buf[m++] = h_mask[i]; + *((tagint *) &buf[m++]) = h_image[i]; + + if (atom->nextra_grow) + for (int iextra = 0; iextra < atom->nextra_grow; iextra++) + m += modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&buf[m]); + + buf[0] = m; + return m; +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +struct AtomVecAtomicKokkos_UnpackExchangeFunctor { + typedef DeviceType device_type; + typedef ArrayTypes<DeviceType> AT; + X_FLOAT _lo,_hi; + typename AT::t_x_array _x; + typename AT::t_v_array _v; + typename AT::t_tagint_1d _tag; + typename AT::t_int_1d _type; + typename AT::t_int_1d _mask; + typename AT::t_int_1d _image; + + typename AT::t_xfloat_2d_um _buf; + int _dim; + typename AT::t_int_1d _nlocal; + + 
AtomVecAtomicKokkos_UnpackExchangeFunctor( + const AtomKokkos* atom, + const typename AT::tdual_xfloat_2d buf, + typename AT::tdual_int_1d nlocal, + int dim, X_FLOAT lo, X_FLOAT hi): + _x(atom->k_x.view<DeviceType>()), + _v(atom->k_v.view<DeviceType>()), + _tag(atom->k_tag.view<DeviceType>()), + _type(atom->k_type.view<DeviceType>()), + _mask(atom->k_mask.view<DeviceType>()), + _image(atom->k_image.view<DeviceType>()), + _nlocal(nlocal.template view<DeviceType>()),_dim(dim), + _lo(lo),_hi(hi){ + const size_t elements = 11; + const int maxsendlist = (buf.template view<DeviceType>().dimension_0()*buf.template view<DeviceType>().dimension_1())/elements; + + buffer_view<DeviceType>(_buf,buf,maxsendlist,elements); + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &myrecv) const { + X_FLOAT x = _buf(myrecv,_dim+1); + if (x >= _lo && x < _hi) { + int i = Kokkos::atomic_fetch_add(&_nlocal(0),1); + _x(i,0) = _buf(myrecv,1); + _x(i,1) = _buf(myrecv,2); + _x(i,2) = _buf(myrecv,3); + _v(i,0) = _buf(myrecv,4); + _v(i,1) = _buf(myrecv,5); + _v(i,2) = _buf(myrecv,6); + _tag[i] = _buf(myrecv,7); + _type[i] = _buf(myrecv,8); + _mask[i] = _buf(myrecv,9); + _image[i] = _buf(myrecv,10); + } + } +}; + +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nrecv,int nlocal,int dim,X_FLOAT lo,X_FLOAT hi,ExecutionSpace space) { + if(space == Host) { + k_count.h_view(0) = nlocal; + AtomVecAtomicKokkos_UnpackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_count,dim,lo,hi); + Kokkos::parallel_for(nrecv/11,f); + LMPHostType::fence(); + return k_count.h_view(0); + } else { + k_count.h_view(0) = nlocal; + k_count.modify<LMPHostType>(); + k_count.sync<LMPDeviceType>(); + AtomVecAtomicKokkos_UnpackExchangeFunctor<LMPDeviceType> f(atomKK,k_buf,k_count,dim,lo,hi); + Kokkos::parallel_for(nrecv/11,f); + LMPDeviceType::fence(); + k_count.modify<LMPDeviceType>(); + 
k_count.sync<LMPHostType>(); + + return k_count.h_view(0); + } +} + +/* ---------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::unpack_exchange(double *buf) +{ + int nlocal = atom->nlocal; + if (nlocal == nmax) grow(0); + modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK | + MASK_MASK | IMAGE_MASK); + + int m = 1; + h_x(nlocal,0) = buf[m++]; + h_x(nlocal,1) = buf[m++]; + h_x(nlocal,2) = buf[m++]; + h_v(nlocal,0) = buf[m++]; + h_v(nlocal,1) = buf[m++]; + h_v(nlocal,2) = buf[m++]; + h_tag[nlocal] = static_cast<int> (buf[m++]); + h_type[nlocal] = static_cast<int> (buf[m++]); + h_mask[nlocal] = static_cast<int> (buf[m++]); + h_image[nlocal] = static_cast<int> (buf[m++]); + + if (atom->nextra_grow) + for (int iextra = 0; iextra < atom->nextra_grow; iextra++) + m += modify->fix[atom->extra_grow[iextra]]-> + unpack_exchange(nlocal,&buf[m]); + + atom->nlocal++; + return m; +} + +/* ---------------------------------------------------------------------- + size of restart data for all atoms owned by this proc + include extra data stored by fixes +------------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::size_restart() +{ + int i; + + int nlocal = atom->nlocal; + int n = 11 * nlocal; + + if (atom->nextra_restart) + for (int iextra = 0; iextra < atom->nextra_restart; iextra++) + for (i = 0; i < nlocal; i++) + n += modify->fix[atom->extra_restart[iextra]]->size_restart(i); + + return n; +} + +/* ---------------------------------------------------------------------- + pack atom I's data for restart file including extra quantities + xyz must be 1st 3 values, so that read_restart can test on them + molecular types may be negative, but write as positive +------------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::pack_restart(int i, double *buf) +{ + int m = 1; + buf[m++] = h_x(i,0); + buf[m++] = h_x(i,1); + buf[m++] = h_x(i,2); + buf[m++] = 
h_tag[i]; + buf[m++] = h_type[i]; + buf[m++] = h_mask[i]; + buf[m++] = h_image[i]; + buf[m++] = h_v(i,0); + buf[m++] = h_v(i,1); + buf[m++] = h_v(i,2); + + if (atom->nextra_restart) + for (int iextra = 0; iextra < atom->nextra_restart; iextra++) + m += modify->fix[atom->extra_restart[iextra]]->pack_restart(i,&buf[m]); + + buf[0] = m; + return m; +} + +/* ---------------------------------------------------------------------- + unpack data for one atom from restart file including extra quantities +------------------------------------------------------------------------- */ + +int AtomVecAtomicKokkos::unpack_restart(double *buf) +{ + int nlocal = atom->nlocal; + if (nlocal == nmax) { + grow(0); + if (atom->nextra_store) + memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra"); + } + + int m = 1; + h_x(nlocal,0) = buf[m++]; + h_x(nlocal,1) = buf[m++]; + h_x(nlocal,2) = buf[m++]; + h_tag[nlocal] = static_cast<int> (buf[m++]); + h_type[nlocal] = static_cast<int> (buf[m++]); + h_mask[nlocal] = static_cast<int> (buf[m++]); + h_image[nlocal] = *((tagint *) &buf[m++]); + h_v(nlocal,0) = buf[m++]; + h_v(nlocal,1) = buf[m++]; + h_v(nlocal,2) = buf[m++]; + + double **extra = atom->extra; + if (atom->nextra_store) { + int size = static_cast<int> (buf[0]) - m; + for (int i = 0; i < size; i++) extra[nlocal][i] = buf[m++]; + } + + atom->nlocal++; + return m; +} + +/* ---------------------------------------------------------------------- + create one atom of itype at coord + set other values to defaults +------------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::create_atom(int itype, double *coord) +{ + int nlocal = atom->nlocal; + if (nlocal == nmax) { + //if(nlocal>2) printf("typeA: %i %i\n",type[0],type[1]); + atomKK->modified(Host,ALL_MASK); + grow(0); + //if(nlocal>2) printf("typeB: %i %i\n",type[0],type[1]); + } + atomKK->modified(Host,ALL_MASK); + + tag[nlocal] = 0; + type[nlocal] = itype; + h_x(nlocal,0) = coord[0]; + 
h_x(nlocal,1) = coord[1]; + h_x(nlocal,2) = coord[2]; + h_mask[nlocal] = 1; + h_image[nlocal] = ((tagint) IMGMAX << IMG2BITS) | + ((tagint) IMGMAX << IMGBITS) | IMGMAX; + h_v(nlocal,0) = 0.0; + h_v(nlocal,1) = 0.0; + h_v(nlocal,2) = 0.0; + + atom->nlocal++; +} + +/* ---------------------------------------------------------------------- + unpack one line from Atoms section of data file + initialize other atom quantities +------------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::data_atom(double *coord, tagint imagetmp, + char **values) +{ + int nlocal = atom->nlocal; + if (nlocal == nmax) grow(0); + + h_tag[nlocal] = atoi(values[0]); + if (tag[nlocal] <= 0) + error->one(FLERR,"Invalid atom ID in Atoms section of data file"); + + h_type[nlocal] = atoi(values[1]); + if (type[nlocal] <= 0 || type[nlocal] > atom->ntypes) + error->one(FLERR,"Invalid atom type in Atoms section of data file"); + + h_x(nlocal,0) = coord[0]; + h_x(nlocal,1) = coord[1]; + h_x(nlocal,2) = coord[2]; + + h_image[nlocal] = imagetmp; + + h_mask[nlocal] = 1; + h_v(nlocal,0) = 0.0; + h_v(nlocal,1) = 0.0; + h_v(nlocal,2) = 0.0; + + atom->nlocal++; +} + +/* ---------------------------------------------------------------------- + pack atom info for data file including 3 image flags +------------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::pack_data(double **buf) +{ + int nlocal = atom->nlocal; + for (int i = 0; i < nlocal; i++) { + buf[i][0] = h_tag[i]; + buf[i][1] = h_type[i]; + buf[i][2] = h_x(i,0); + buf[i][3] = h_x(i,1); + buf[i][4] = h_x(i,2); + buf[i][5] = (h_image[i] & IMGMASK) - IMGMAX; + buf[i][6] = (h_image[i] >> IMGBITS & IMGMASK) - IMGMAX; + buf[i][7] = (h_image[i] >> IMG2BITS) - IMGMAX; + } +} + +/* ---------------------------------------------------------------------- + write atom info to data file including 3 image flags 
+------------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::write_data(FILE *fp, int n, double **buf) +{ + for (int i = 0; i < n; i++) + fprintf(fp,"%d %d %-1.16e %-1.16e %-1.16e %d %d %d\n", + (int) buf[i][0],(int) buf[i][1],buf[i][2],buf[i][3],buf[i][4], + (int) buf[i][5],(int) buf[i][6],(int) buf[i][7]); +} + +/* ---------------------------------------------------------------------- + return # of bytes of allocated memory +------------------------------------------------------------------------- */ + +bigint AtomVecAtomicKokkos::memory_usage() +{ + bigint bytes = 0; + + if (atom->memcheck("tag")) bytes += memory->usage(tag,nmax); + if (atom->memcheck("type")) bytes += memory->usage(type,nmax); + if (atom->memcheck("mask")) bytes += memory->usage(mask,nmax); + if (atom->memcheck("image")) bytes += memory->usage(image,nmax); + if (atom->memcheck("x")) bytes += memory->usage(x,nmax,3); + if (atom->memcheck("v")) bytes += memory->usage(v,nmax,3); + if (atom->memcheck("f")) bytes += memory->usage(f,nmax*commKK->nthreads,3); + + return bytes; +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::sync(ExecutionSpace space, unsigned int mask) +{ + if (space == Device) { + if (mask & X_MASK) atomKK->k_x.sync<LMPDeviceType>(); + if (mask & V_MASK) atomKK->k_v.sync<LMPDeviceType>(); + if (mask & F_MASK) atomKK->k_f.sync<LMPDeviceType>(); + if (mask & TAG_MASK) atomKK->k_tag.sync<LMPDeviceType>(); + if (mask & TYPE_MASK) atomKK->k_type.sync<LMPDeviceType>(); + if (mask & MASK_MASK) atomKK->k_mask.sync<LMPDeviceType>(); + if (mask & IMAGE_MASK) atomKK->k_image.sync<LMPDeviceType>(); + } else { + if (mask & X_MASK) atomKK->k_x.sync<LMPHostType>(); + if (mask & V_MASK) atomKK->k_v.sync<LMPHostType>(); + if (mask & F_MASK) atomKK->k_f.sync<LMPHostType>(); + if (mask & TAG_MASK) atomKK->k_tag.sync<LMPHostType>(); + if (mask & TYPE_MASK) atomKK->k_type.sync<LMPHostType>(); + 
if (mask & MASK_MASK) atomKK->k_mask.sync<LMPHostType>(); + if (mask & IMAGE_MASK) atomKK->k_image.sync<LMPHostType>(); + } +} + +/* ---------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::modified(ExecutionSpace space, unsigned int mask) +{ + if (space == Device) { + if (mask & X_MASK) atomKK->k_x.modify<LMPDeviceType>(); + if (mask & V_MASK) atomKK->k_v.modify<LMPDeviceType>(); + if (mask & F_MASK) atomKK->k_f.modify<LMPDeviceType>(); + if (mask & TAG_MASK) atomKK->k_tag.modify<LMPDeviceType>(); + if (mask & TYPE_MASK) atomKK->k_type.modify<LMPDeviceType>(); + if (mask & MASK_MASK) atomKK->k_mask.modify<LMPDeviceType>(); + if (mask & IMAGE_MASK) atomKK->k_image.modify<LMPDeviceType>(); + } else { + if (mask & X_MASK) atomKK->k_x.modify<LMPHostType>(); + if (mask & V_MASK) atomKK->k_v.modify<LMPHostType>(); + if (mask & F_MASK) atomKK->k_f.modify<LMPHostType>(); + if (mask & TAG_MASK) atomKK->k_tag.modify<LMPHostType>(); + if (mask & TYPE_MASK) atomKK->k_type.modify<LMPHostType>(); + if (mask & MASK_MASK) atomKK->k_mask.modify<LMPHostType>(); + if (mask & IMAGE_MASK) atomKK->k_image.modify<LMPHostType>(); + } +} diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.h b/src/KOKKOS/atom_vec_atomic_kokkos.h new file mode 100644 index 0000000000..dc96cbb650 --- /dev/null +++ b/src/KOKKOS/atom_vec_atomic_kokkos.h @@ -0,0 +1,111 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale AtomicKokkos/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef ATOM_CLASS + +AtomStyle(atomic/kk,AtomVecAtomicKokkos) + +#else + +#ifndef LMP_ATOM_VEC_ATOMIC_KOKKOS_H +#define LMP_ATOM_VEC_ATOMIC_KOKKOS_H + +#include "atom_vec_kokkos.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +class AtomVecAtomicKokkos : public AtomVecKokkos { + public: + AtomVecAtomicKokkos(class LAMMPS *); + virtual ~AtomVecAtomicKokkos() {} + void grow(int); + void copy(int, int, int); + int pack_comm(int, int *, double *, int, int *); + int pack_comm_vel(int, int *, double *, int, int *); + void unpack_comm(int, int, double *); + void unpack_comm_vel(int, int, double *); + int pack_reverse(int, int, double *); + void unpack_reverse(int, int *, double *); + int pack_border(int, int *, double *, int, int *); + int pack_border_vel(int, int *, double *, int, int *); + void unpack_border(int, int, double *); + void unpack_border_vel(int, int, double *); + int pack_exchange(int, double *); + int unpack_exchange(double *); + int size_restart(); + int pack_restart(int, double *); + int unpack_restart(double *); + void create_atom(int, double *); + void data_atom(double *, tagint, char **); + void pack_data(double **); + void write_data(FILE *, int, double **); + bigint memory_usage(); + + void grow_reset(); + int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist, + const int & iswap, + const DAT::tdual_xfloat_2d &buf, + const int &pbc_flag, const int pbc[]); + void unpack_comm_kokkos(const int &n, const int &nfirst, + const DAT::tdual_xfloat_2d &buf); + int pack_comm_self(const int &n, const DAT::tdual_int_2d &list, + const int & iswap, const int nfirst, + const int &pbc_flag, const int pbc[]); + int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, + DAT::tdual_xfloat_2d buf,int iswap, + int pbc_flag, int *pbc, ExecutionSpace space); + void unpack_border_kokkos(const int &n, const int &nfirst, + const DAT::tdual_xfloat_2d &buf, + 
ExecutionSpace space); + int pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &buf, + DAT::tdual_int_1d k_sendlist, + DAT::tdual_int_1d k_copylist, + ExecutionSpace space, int dim, + X_FLOAT lo, X_FLOAT hi); + int unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv, + int nlocal, int dim, X_FLOAT lo, X_FLOAT hi, + ExecutionSpace space); + + void sync(ExecutionSpace space, unsigned int mask); + void modified(ExecutionSpace space, unsigned int mask); + + protected: + int *tag,*type,*mask; + tagint *image; + double **x,**v,**f; + + DAT::t_int_1d d_tag, d_type, d_mask; + HAT::t_int_1d h_tag, h_type, h_mask; + + DAT::t_tagint_1d d_image; + HAT::t_tagint_1d h_image; + + DAT::t_x_array d_x; + DAT::t_v_array d_v; + DAT::t_f_array d_f; + HAT::t_x_array h_x; + HAT::t_v_array h_v; + HAT::t_f_array h_f; + + DAT::tdual_int_1d k_count; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/KOKKOS/atom_vec_kokkos.cpp b/src/KOKKOS/atom_vec_kokkos.cpp new file mode 100644 index 0000000000..1d9174196a --- /dev/null +++ b/src/KOKKOS/atom_vec_kokkos.cpp @@ -0,0 +1,23 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include "atom_vec_kokkos.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +AtomVecKokkos::AtomVecKokkos(LAMMPS *lmp) : AtomVec(lmp) +{ + kokkosable = 1; +} diff --git a/src/KOKKOS/atom_vec_kokkos.h b/src/KOKKOS/atom_vec_kokkos.h new file mode 100644 index 0000000000..ac651b0b5a --- /dev/null +++ b/src/KOKKOS/atom_vec_kokkos.h @@ -0,0 +1,76 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */
+
+#ifndef LMP_ATOM_VEC_KOKKOS_H
+#define LMP_ATOM_VEC_KOKKOS_H
+
+#include "atom_vec.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+// Base class for atom styles that offer Kokkos pack/unpack routines.
+// Every virtual method below has a default body that does nothing or
+// returns 0, so a derived style only overrides what it supports.
+// CommKokkos casts atom->avec to this type and calls these methods.
+
+class AtomVecKokkos : public AtomVec {
+ public:
+  AtomVecKokkos(class LAMMPS *);
+  virtual ~AtomVecKokkos() {}
+
+  // data-movement hooks for the per-atom arrays selected by mask
+  // defaults are no-ops
+  // NOTE(review): presumably sync() makes the copy in `space` current and
+  //   modified() marks it as changed -- confirm against a derived style
+
+  virtual void sync(ExecutionSpace space, unsigned int mask) {};
+  virtual void modified(ExecutionSpace space, unsigned int mask) {};
+
+  // forward comm when the swap partner is this same proc
+  // CommKokkos passes firstrecv[iswap] as nfirst
+  // default returns 0
+
+  virtual int
+    pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
+                   const int & iswap, const int nfirst,
+                   const int &pbc_flag, const int pbc[])
+  {return 0;}
+
+  // pack n atoms of swap iswap from list into buf for forward comm
+  // CommKokkos uses the return value as the number of MPI_DOUBLE to send
+  // default returns 0
+
+  virtual int
+    pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &list,
+                     const int & iswap, const DAT::tdual_xfloat_2d &buf,
+                     const int &pbc_flag, const int pbc[])
+  {return 0;}
+
+  // unpack n forward-comm atoms from buf, starting at atom index nfirst
+  // default does nothing
+
+  virtual void
+    unpack_comm_kokkos(const int &n, const int &nfirst,
+                       const DAT::tdual_xfloat_2d &buf) {};
+
+  // pack n border atoms of swap iswap into buf, executing in `space`
+  // CommKokkos uses the return value as the number of MPI_DOUBLE to send
+  // default returns 0
+
+  virtual int
+    pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
+                       DAT::tdual_xfloat_2d buf,int iswap,
+                       int pbc_flag, int *pbc, ExecutionSpace space)
+  {return 0;};
+
+  // unpack n border atoms from buf, starting at atom index nfirst
+  // default does nothing
+
+  virtual void
+    unpack_border_kokkos(const int &n, const int &nfirst,
+                         const DAT::tdual_xfloat_2d &buf,
+                         ExecutionSpace space) {};
+
+  // pack the nsend atoms listed in k_sendlist into buf for exchange
+  // CommKokkos fills k_copylist with, per sent atom, either the index of
+  //   an owned atom to copy into the vacated slot or -1
+  // CommKokkos uses the return value as the number of MPI_DOUBLE to send
+  // default returns 0
+
+  virtual int
+    pack_exchange_kokkos(const int &nsend, DAT::tdual_xfloat_2d &buf,
+                         DAT::tdual_int_1d k_sendlist,
+                         DAT::tdual_int_1d k_copylist,
+                         ExecutionSpace space, int dim, X_FLOAT lo, X_FLOAT hi)
+  {return 0;};
+
+  // unpack nrecv received values from k_buf
+  // CommKokkos stores the return value as the new atom->nlocal
+  // default returns 0
+  // NOTE(review): lo/hi in dimension dim presumably select which incoming
+  //   atoms this proc keeps -- confirm against a derived style
+
+  virtual int
+    unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv,
+                           int nlocal, int dim, X_FLOAT lo, X_FLOAT hi,
+                           ExecutionSpace space)
+  {return 0;};
+
+ protected:
+  // not assigned anywhere in this header; see the constructor / derived
+  // classes for where they are set
+  class AtomKokkos *atomKK;
+  class CommKokkos *commKK;
+};
+
+}
+
+#endif
+
+/* ERROR/WARNING messages:
+
+*/
diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp
new file mode 100644
index 0000000000..5211d11a02
--- /dev/null
+++ b/src/KOKKOS/comm_kokkos.cpp
@@ -0,0 +1,820 @@
+/*
----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "comm_kokkos.h"
+#include "kokkos.h"
+#include "atom.h"
+#include "atom_kokkos.h"
+#include "atom_vec.h"
+#include "atom_vec_kokkos.h"
+#include "domain.h"
+#include "atom_masks.h"
+#include "error.h"
+#include "memory.h"
+
+using namespace LAMMPS_NS;
+
+// buffer sizing constants, counted in doubles
+// BUFFACTOR = growth factor applied when a buffer or list is regrown
+// BUFMIN = initial size of send/recv buffers and of each swap's sendlist
+// BUFEXTRA = extra slack added to the send buffer
+
+#define BUFFACTOR 1.5
+#define BUFMIN 10000
+#define BUFEXTRA 1000
+
+enum{SINGLE,MULTI};
+
+/* ----------------------------------------------------------------------
+   setup MPI and allocate buffer space
+   send/recv buffers are created as 2d dual views with 6 columns and
+     buf_send/buf_recv are pointed at their host copies
+   the exchange lists and the send flags start with 100 entries
+------------------------------------------------------------------------- */
+
+CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp)
+{
+  sendlist = NULL;  // need to free this since parent allocated?
+  k_sendlist = ArrayTypes<LMPDeviceType>::tdual_int_2d();
+
+  // error check for disallow of OpenMP threads?
+
+  // initialize comm buffers & exchange memory
+
+  maxsend = BUFMIN;
+  k_buf_send = ArrayTypes<LMPDeviceType>::
+    tdual_xfloat_2d("comm:k_buf_send",(maxsend+BUFEXTRA+5)/6,6);
+  buf_send = k_buf_send.view<LMPHostType>().ptr_on_device();
+
+  maxrecv = BUFMIN;
+  k_buf_recv = ArrayTypes<LMPDeviceType>::
+    tdual_xfloat_2d("comm:k_buf_recv",(maxrecv+5)/6,6);
+  buf_recv = k_buf_recv.view<LMPHostType>().ptr_on_device();
+
+  k_exchange_sendlist = ArrayTypes<LMPDeviceType>::
+    tdual_int_1d("comm:k_exchange_sendlist",100);
+  k_exchange_copylist = ArrayTypes<LMPDeviceType>::
+    tdual_int_1d("comm:k_exchange_copylist",100);
+  k_count = ArrayTypes<LMPDeviceType>::tdual_int_1d("comm:k_count",1);
+  k_sendflag = ArrayTypes<LMPDeviceType>::tdual_int_1d("comm:k_sendflag",100);
+
+  // next line is bogus?
+  // NOTE(review): if the CommBrick constructor already allocated
+  //   maxsendlist, this create() would orphan that allocation -- confirm
+
+  memory->create(maxsendlist,maxswap,"comm:maxsendlist");
+  for (int i = 0; i < maxswap; i++) {
+    maxsendlist[i] = BUFMIN;
+  }
+  memory->create_kokkos(k_sendlist,sendlist,maxswap,BUFMIN,"comm:sendlist");
+}
+
+/* ----------------------------------------------------------------------
+   release the Kokkos-backed send list and send/recv buffers
+------------------------------------------------------------------------- */
+
+CommKokkos::~CommKokkos()
+{
+  memory->destroy_kokkos(k_sendlist,sendlist);
+  memory->destroy_kokkos(k_buf_send,buf_send);
+  memory->destroy_kokkos(k_buf_recv,buf_recv);
+}
+
+/* ----------------------------------------------------------------------
+   cache the AtomKokkos pointer and copy the four comm mode flags from
+   lmp->kokkos, then run the CommBrick initialization
+------------------------------------------------------------------------- */
+
+void CommKokkos::init()
+{
+  atomKK = (AtomKokkos *) atom;
+  exchange_comm_classic = lmp->kokkos->exchange_comm_classic;
+  forward_comm_classic = lmp->kokkos->forward_comm_classic;
+  exchange_comm_on_host = lmp->kokkos->exchange_comm_on_host;
+  forward_comm_on_host = lmp->kokkos->forward_comm_on_host;
+
+  CommBrick::init();
+}
+
+/* ----------------------------------------------------------------------
+   forward communication of atom coords every timestep
+   other per-atom attributes may also be sent via pack/unpack routines
+   if forward_comm_classic is not set, use forward_comm_device() on the
+     host or the device, as selected by forward_comm_on_host
+   otherwise sync the needed per-atom data to the host, flag it as
+     modified there, and call CommBrick::forward_comm()
+------------------------------------------------------------------------- */
+
+void CommKokkos::forward_comm(int dummy)
+{
+
+  if (!forward_comm_classic) {
+    if (forward_comm_on_host) forward_comm_device<LMPHostType>(dummy);
+    else forward_comm_device<LMPDeviceType>(dummy);
+    return;
+  }
+
+  k_sendlist.sync<LMPHostType>();
+
+  // the mask mirrors what the classic forward comm will touch:
+  // x only, x and v, or everything
+
+  if (comm_x_only) {
+    atomKK->sync(Host,X_MASK);
+    atomKK->modified(Host,X_MASK);
+  } else if (ghost_velocity) {
+    atomKK->sync(Host,X_MASK | V_MASK);
+    atomKK->modified(Host,X_MASK | V_MASK);
+  } else {
+    atomKK->sync(Host,ALL_MASK);
+    atomKK->modified(Host,ALL_MASK);
+  }
+
+  CommBrick::forward_comm(dummy);
+}
+
+/* ----------------------------------------------------------------------
+   forward communication with pack/unpack done by the Kokkos atom style
+   DeviceType selects which copy of the dual views is used
+   MPI calls are handed the pointers of the DeviceType views directly
+   ghost_velocity is not supported: those branches call error->all() first
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void CommKokkos::forward_comm_device(int dummy)
+{
+  int n;
+  MPI_Request request;
+  MPI_Status status;
+  AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;
+  double **x = atom->x;
+  double *buf;
+
+  // exchange data with another proc
+  // if other proc is self, just copy
+  // if comm_x_only set, exchange or copy directly to x, don't unpack
+
+  k_sendlist.sync<DeviceType>();
+
+  for (int iswap = 0; iswap < nswap; iswap++) {
+
+    if (sendproc[iswap] != me) {
+      if (comm_x_only) {
+        atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
+
+        // this first value of buf is replaced below before it is used
+
+        if (size_forward_recv[iswap]) buf = x[firstrecv[iswap]];
+        else buf = NULL;
+
+        // receive straight into the rows of k_x that start at firstrecv
+
+        if (size_forward_recv[iswap]) {
+          buf = atomKK->k_x.view<DeviceType>().ptr_on_device() +
+            firstrecv[iswap]*atomKK->k_x.view<DeviceType>().dimension_1();
+          MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE,
+                    recvproc[iswap],0,world,&request);
+        }
+        n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,
+                                   iswap,k_buf_send,pbc_flag[iswap],pbc[iswap]);
+
+        if (n) {
+          MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),
+                   n,MPI_DOUBLE,sendproc[iswap],0,world);
+        }
+
+        if (size_forward_recv[iswap]) MPI_Wait(&request,&status);
+        atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
+                         space,X_MASK);
+      } else if (ghost_velocity) {
+
+        // NOTE(review): the statements after error->all() are presumably
+        //   never reached if error->all() does not return -- confirm
+
+        error->all(FLERR,"Ghost velocity forward comm not yet "
+                   "implemented with Kokkos");
+        if (size_forward_recv[iswap])
+          MPI_Irecv(k_buf_recv.view<LMPHostType>().ptr_on_device(),
+                    size_forward_recv[iswap],MPI_DOUBLE,
+                    recvproc[iswap],0,world,&request);
+        n = avec->pack_comm_vel(sendnum[iswap],sendlist[iswap],
+                                buf_send,pbc_flag[iswap],pbc[iswap]);
+        if (n) MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
+        if (size_forward_recv[iswap]) MPI_Wait(&request,&status);
+        avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],buf_recv);
+      } else {
+
+        // general case: receive into k_buf_recv, then unpack
+
+        if (size_forward_recv[iswap])
+          MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(),
+                    size_forward_recv[iswap],MPI_DOUBLE,
+                    recvproc[iswap],0,world,&request);
+        n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,iswap,
+                                   k_buf_send,pbc_flag[iswap],pbc[iswap]);
+        if (n)
+          MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),n,
+                   MPI_DOUBLE,sendproc[iswap],0,world);
+        if (size_forward_recv[iswap]) MPI_Wait(&request,&status);
+        avec->unpack_comm_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv);
+      }
+
+    } else {
+
+      // swap with self: no messages, the atom style copies in place
+
+      if (!ghost_velocity) {
+        if (sendnum[iswap])
+          n = avec->pack_comm_self(sendnum[iswap],k_sendlist,iswap,
+                                   firstrecv[iswap],pbc_flag[iswap],pbc[iswap]);
+      } else if (ghost_velocity) {
+        error->all(FLERR,"Ghost velocity forward comm not yet "
+                   "implemented with Kokkos");
+        n = avec->pack_comm_vel(sendnum[iswap],sendlist[iswap],
+                                buf_send,pbc_flag[iswap],pbc[iswap]);
+        avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],buf_send);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   exchange: move atoms to correct processors
+   atoms exchanged with all 6 stencil neighbors
+   send out atoms that have left my box, receive ones entering my box
+   atoms will be lost if not inside some proc's box
+     can happen if atom moves outside of non-periodic boundary
+     or if atom moves more than one proc away
+   this routine called before every reneighboring
+   for triclinic, atoms must be in lamda coords (0-1) before exchange is called
+------------------------------------------------------------------------- */ + +void CommKokkos::exchange() +{ + if (!exchange_comm_classic) { + if (exchange_comm_on_host) exchange_device<LMPHostType>(); + else exchange_device<LMPDeviceType>(); + return; + } + + atomKK->sync(Host,ALL_MASK); + atomKK->modified(Host,ALL_MASK); + + CommBrick::exchange(); +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +struct BuildExchangeListFunctor { + typedef DeviceType device_type; + typedef ArrayTypes<DeviceType> AT; + X_FLOAT _lo,_hi; + typename AT::t_x_array _x; + + int _nlocal,_dim; + typename AT::t_int_1d _nsend; + typename AT::t_int_1d _sendlist; + typename AT::t_int_1d _sendflag; + + + BuildExchangeListFunctor( + const typename AT::tdual_x_array x, + const typename AT::tdual_int_1d sendlist, + typename AT::tdual_int_1d nsend, + typename AT::tdual_int_1d sendflag,int nlocal, int dim, + X_FLOAT lo, X_FLOAT hi): + _x(x.template view<DeviceType>()), + _sendlist(sendlist.template view<DeviceType>()), + _nsend(nsend.template view<DeviceType>()), + _sendflag(sendflag.template view<DeviceType>()), + _nlocal(nlocal),_dim(dim), + _lo(lo),_hi(hi){ + } + + KOKKOS_INLINE_FUNCTION + void operator() (int i) const { + if (_x(i,_dim) < _lo || _x(i,_dim) >= _hi) { + const int mysend=Kokkos::atomic_fetch_add(&_nsend(0),1); + if(mysend<_sendlist.dimension_0()) { + _sendlist(mysend) = i; + _sendflag(i) = 1; + } + } else + _sendflag(i) = 0; + } +}; + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +void CommKokkos::exchange_device() +{ + int i,m,nsend,nrecv,nrecv1,nrecv2,nlocal; + double lo,hi,value; + double **x; + double *sublo,*subhi,*buf; + MPI_Request request; + MPI_Status status; + AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec; + + // clear global->local map for owned and ghost atoms + // b/c atoms migrate to new procs in exchange() and + // new ghosts are 
created in borders() + // map_set() is done at end of borders() + // clear ghost count and any ghost bonus data internal to AtomVec + + if (map_style) atom->map_clear(); + atom->nghost = 0; + atom->avec->clear_bonus(); + + // subbox bounds for orthogonal or triclinic + + if (triclinic == 0) { + sublo = domain->sublo; + subhi = domain->subhi; + } else { + sublo = domain->sublo_lamda; + subhi = domain->subhi_lamda; + } + + atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,ALL_MASK); + + // loop over dimensions + for (int dim = 0; dim < 3; dim++) { + + // fill buffer with atoms leaving my box, using < and >= + // when atom is deleted, fill it in with last atom + + x = atom->x; + lo = sublo[dim]; + hi = subhi[dim]; + nlocal = atom->nlocal; + i = nsend = 0; + + if (true) { + if (k_sendflag.h_view.dimension_0()<nlocal) k_sendflag.resize(nlocal); + k_count.h_view(0) = k_exchange_sendlist.h_view.dimension_0(); + while (k_count.h_view(0)>=k_exchange_sendlist.h_view.dimension_0()) { + k_count.h_view(0) = 0; + k_count.modify<LMPHostType>(); + k_count.sync<DeviceType>(); + + BuildExchangeListFunctor<DeviceType> + f(atomKK->k_x,k_exchange_sendlist,k_count,k_sendflag, + nlocal,dim,lo,hi); + Kokkos::parallel_for(nlocal,f); + DeviceType::fence(); + k_exchange_sendlist.modify<DeviceType>(); + k_sendflag.modify<DeviceType>(); + k_count.modify<DeviceType>(); + + k_count.sync<LMPHostType>(); + if (k_count.h_view(0)>=k_exchange_sendlist.h_view.dimension_0()) { + k_exchange_sendlist.resize(k_count.h_view(0)*1.1); + k_exchange_copylist.resize(k_count.h_view(0)*1.1); + k_count.h_view(0)=k_exchange_sendlist.h_view.dimension_0(); + } + } + k_exchange_sendlist.sync<LMPHostType>(); + k_sendflag.sync<LMPHostType>(); + + int sendpos = nlocal-1; + nlocal -= k_count.h_view(0); + for(int i = 0; i < k_count.h_view(0); i++) { + if (k_exchange_sendlist.h_view(i)<nlocal) { + while (k_sendflag.h_view(sendpos)) sendpos--; + k_exchange_copylist.h_view(i) = sendpos; + sendpos--; + } else + 
k_exchange_copylist.h_view(i) = -1; + } + + k_exchange_copylist.modify<LMPHostType>(); + k_exchange_copylist.sync<DeviceType>(); + nsend = + avec->pack_exchange_kokkos(k_count.h_view(0),k_buf_send, + k_exchange_sendlist,k_exchange_copylist, + ExecutionSpaceFromDevice<DeviceType>:: + space,dim,lo,hi); + DeviceType::fence(); + + } else { + while (i < nlocal) { + if (x[i][dim] < lo || x[i][dim] >= hi) { + if (nsend > maxsend) grow_send_kokkos(nsend,1); + nsend += avec->pack_exchange(i,&buf_send[nsend]); + avec->copy(nlocal-1,i,1); + nlocal--; + } else i++; + } + } + atom->nlocal = nlocal; + + // send/recv atoms in both directions + // if 1 proc in dimension, no send/recv, set recv buf to send buf + // if 2 procs in dimension, single send/recv + // if more than 2 procs in dimension, send/recv to both neighbors + + if (procgrid[dim] == 1) { + nrecv = nsend; + buf = buf_send; + if (nrecv) { + atom->nlocal=avec-> + unpack_exchange_kokkos(k_buf_send,nrecv,atom->nlocal,dim,lo,hi, + ExecutionSpaceFromDevice<DeviceType>::space); + DeviceType::fence(); + } + } else { + MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][0],0, + &nrecv1,1,MPI_INT,procneigh[dim][1],0,world,&status); + nrecv = nrecv1; + if (procgrid[dim] > 2) { + MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][1],0, + &nrecv2,1,MPI_INT,procneigh[dim][0],0,world,&status); + nrecv += nrecv2; + } + if (nrecv > maxrecv) grow_recv_kokkos(nrecv); + + MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(),nrecv1, + MPI_DOUBLE,procneigh[dim][1],0, + world,&request); + MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),nsend, + MPI_DOUBLE,procneigh[dim][0],0,world); + MPI_Wait(&request,&status); + + if (procgrid[dim] > 2) { + MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device()+nrecv1, + nrecv2,MPI_DOUBLE,procneigh[dim][0],0, + world,&request); + MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),nsend, + MPI_DOUBLE,procneigh[dim][1],0,world); + MPI_Wait(&request,&status); + } + + buf = buf_recv; + if (nrecv) { + 
atom->nlocal = avec-> + unpack_exchange_kokkos(k_buf_recv,nrecv,atom->nlocal,dim,lo,hi, + ExecutionSpaceFromDevice<DeviceType>::space); + DeviceType::fence(); + } + } + + // check incoming atoms to see if they are in my box + // if so, add to my list + + } + + atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::space,ALL_MASK); + + if (atom->firstgroupname) { + /* this is not yet implemented with Kokkos */ + atomKK->sync(Host,ALL_MASK); + atom->first_reorder(); + atomKK->modified(Host,ALL_MASK); + } +} + +/* ---------------------------------------------------------------------- + borders: list nearby atoms to send to neighboring procs at every timestep + one list is created for every swap that will be made + as list is made, actually do swaps + this does equivalent of a communicate, so don't need to explicitly + call communicate routine on reneighboring timestep + this routine is called before every reneighboring + for triclinic, atoms must be in lamda coords (0-1) before borders is called +------------------------------------------------------------------------- */ + +void CommKokkos::borders() +{ + if (!exchange_comm_classic) { + if (exchange_comm_on_host) borders_device<LMPHostType>(); + else borders_device<LMPDeviceType>(); + return; + } + + atomKK->sync(Host,ALL_MASK); + k_sendlist.modify<LMPHostType>(); + atomKK->modified(Host,ALL_MASK); + + CommBrick::borders(); +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +struct BuildBorderListFunctor { + typedef DeviceType device_type; + typedef ArrayTypes<DeviceType> AT; + X_FLOAT lo,hi; + typename AT::t_x_array x; + int iswap,maxsendlist; + int nfirst,nlast,dim; + typename AT::t_int_2d sendlist; + typename AT::t_int_1d nsend; + + BuildBorderListFunctor(typename AT::tdual_x_array _x, + typename AT::tdual_int_2d _sendlist, + typename AT::tdual_int_1d _nsend,int _nfirst, + int _nlast, int _dim, + X_FLOAT _lo, X_FLOAT _hi, int _iswap, + int 
_maxsendlist): + x(_x.template view<DeviceType>()), + sendlist(_sendlist.template view<DeviceType>()), + nsend(_nsend.template view<DeviceType>()), + nfirst(_nfirst),nlast(_nlast),dim(_dim), + lo(_lo),hi(_hi),iswap(_iswap),maxsendlist(_maxsendlist){} + + + KOKKOS_INLINE_FUNCTION + void operator() (DeviceType dev) const { + const int chunk = ((nlast - nfirst + dev.league_size() - 1 ) / + dev.league_size()); + const int teamstart = chunk*dev.league_rank() + nfirst; + const int teamend = (teamstart + chunk) < nlast?(teamstart + chunk):nlast; + int mysend = 0; + for (int i=teamstart + dev.team_rank(); i<teamend; i+=dev.team_size()) { + if (x(i,dim) >= lo && x(i,dim) <= hi) mysend++; + } + const int my_store_pos = dev.team_scan(mysend,&nsend(0)); + + if (my_store_pos+mysend < maxsendlist) { + mysend = my_store_pos; + for(int i=teamstart + dev.team_rank(); i<teamend; i+=dev.team_size()){ + if (x(i,dim) >= lo && x(i,dim) <= hi) { + sendlist(iswap,mysend++) = i; + } + } + } + } + + size_t shmem_size() const { return 1000u;} +}; + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +void CommKokkos::borders_device() { + int i,n,itype,iswap,dim,ineed,twoneed,smax,rmax; + int nsend,nrecv,sendflag,nfirst,nlast,ngroup; + double lo,hi; + int *type; + double **x; + double *buf,*mlo,*mhi; + MPI_Request request; + MPI_Status status; + AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec; + + ExecutionSpace exec_space = ExecutionSpaceFromDevice<DeviceType>::space; + k_sendlist.modify<DeviceType>(); + atomKK->sync(exec_space,ALL_MASK); + + // do swaps over all 3 dimensions + + iswap = 0; + smax = rmax = 0; + + for (dim = 0; dim < 3; dim++) { + nlast = 0; + twoneed = 2*maxneed[dim]; + for (ineed = 0; ineed < twoneed; ineed++) { + + // find atoms within slab boundaries lo/hi using <= and >= + // check atoms between nfirst and nlast + // for first swaps in a dim, check owned and ghost + // for later swaps in a dim, only check newly 
arrived ghosts + // store sent atom indices in list for use in future timesteps + + x = atom->x; + if (style == SINGLE) { + lo = slablo[iswap]; + hi = slabhi[iswap]; + } else { + type = atom->type; + mlo = multilo[iswap]; + mhi = multihi[iswap]; + } + if (ineed % 2 == 0) { + nfirst = nlast; + nlast = atom->nlocal + atom->nghost; + } + + nsend = 0; + + // sendflag = 0 if I do not send on this swap + // sendneed test indicates receiver no longer requires data + // e.g. due to non-PBC or non-uniform sub-domains + + if (ineed/2 >= sendneed[dim][ineed % 2]) sendflag = 0; + else sendflag = 1; + + // find send atoms according to SINGLE vs MULTI + // all atoms eligible versus atoms in bordergroup + // only need to limit loop to bordergroup for first sends (ineed < 2) + // on these sends, break loop in two: owned (in group) and ghost + + if (sendflag) { + if (!bordergroup || ineed >= 2) { + if (style == SINGLE) { + typename ArrayTypes<DeviceType>::tdual_int_1d total_send("TS",1); + total_send.h_view(0) = 0; + if(exec_space == Device) { + total_send.template modify<DeviceType>(); + total_send.template sync<LMPDeviceType>(); + } + BuildBorderListFunctor<DeviceType> f(atomKK->k_x,k_sendlist, + total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]); + Kokkos::ParallelWorkRequest config((nlast-nfirst+127)/128,128); + Kokkos::parallel_for(config,f); + DeviceType::fence(); + total_send.template modify<DeviceType>(); + total_send.template sync<LMPHostType>(); + + if(total_send.h_view(0) >= maxsendlist[iswap]) { + grow_list(iswap,total_send.h_view(0)); + total_send.h_view(0) = 0; + if(exec_space == Device) { + total_send.template modify<LMPHostType>(); + total_send.template sync<LMPDeviceType>(); + } + BuildBorderListFunctor<DeviceType> f(atomKK->k_x,k_sendlist, + total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]); + Kokkos::ParallelWorkRequest config((nlast-nfirst+127)/128,128); + Kokkos::parallel_for(config,f); + DeviceType::fence(); + total_send.template 
modify<DeviceType>(); + total_send.template sync<LMPHostType>(); + } + nsend = total_send.h_view(0); + } else { + error->all(FLERR,"Required border comm not yet " + "implemented with Kokkos\n"); + for (i = nfirst; i < nlast; i++) { + itype = type[i]; + if (x[i][dim] >= mlo[itype] && x[i][dim] <= mhi[itype]) { + if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); + sendlist[iswap][nsend++] = i; + } + } + } + + } else { + error->all(FLERR,"Required border comm not yet " + "implemented with Kokkos\n"); + if (style == SINGLE) { + ngroup = atom->nfirst; + for (i = 0; i < ngroup; i++) + if (x[i][dim] >= lo && x[i][dim] <= hi) { + if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); + sendlist[iswap][nsend++] = i; + } + for (i = atom->nlocal; i < nlast; i++) + if (x[i][dim] >= lo && x[i][dim] <= hi) { + if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); + sendlist[iswap][nsend++] = i; + } + } else { + ngroup = atom->nfirst; + for (i = 0; i < ngroup; i++) { + itype = type[i]; + if (x[i][dim] >= mlo[itype] && x[i][dim] <= mhi[itype]) { + if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); + sendlist[iswap][nsend++] = i; + } + } + for (i = atom->nlocal; i < nlast; i++) { + itype = type[i]; + if (x[i][dim] >= mlo[itype] && x[i][dim] <= mhi[itype]) { + if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend); + sendlist[iswap][nsend++] = i; + } + } + } + } + } + + // pack up list of border atoms + + if (nsend*size_border > maxsend) + grow_send_kokkos(nsend*size_border,0); + if (ghost_velocity) { + error->all(FLERR,"Required border comm not yet " + "implemented with Kokkos\n"); + n = avec->pack_border_vel(nsend,sendlist[iswap],buf_send, + pbc_flag[iswap],pbc[iswap]); + } + else + n = avec-> + pack_border_kokkos(nsend,k_sendlist,k_buf_send,iswap, + pbc_flag[iswap],pbc[iswap],exec_space); + + // swap atoms with other proc + // no MPI calls except SendRecv if nsend/nrecv = 0 + // put incoming ghosts at end of my atom arrays + // if swapping with self, simply 
copy, no messages + + if (sendproc[iswap] != me) { + MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0, + &nrecv,1,MPI_INT,recvproc[iswap],0,world,&status); + if (nrecv*size_border > maxrecv) grow_recv_kokkos(nrecv*size_border); + if (nrecv) MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(), + nrecv*size_border,MPI_DOUBLE, + recvproc[iswap],0,world,&request); + if (n) MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),n, + MPI_DOUBLE,sendproc[iswap],0,world); + if (nrecv) MPI_Wait(&request,&status); + buf = buf_recv; + } else { + nrecv = nsend; + buf = buf_send; + } + + // unpack buffer + + if (ghost_velocity) { + error->all(FLERR,"Required border comm not yet " + "implemented with Kokkos\n"); + avec->unpack_border_vel(nrecv,atom->nlocal+atom->nghost,buf); + } + else + if (sendproc[iswap] != me) + avec->unpack_border_kokkos(nrecv,atom->nlocal+atom->nghost, + k_buf_recv,exec_space); + else + avec->unpack_border_kokkos(nrecv,atom->nlocal+atom->nghost, + k_buf_send,exec_space); + + // set all pointers & counters + + smax = MAX(smax,nsend); + rmax = MAX(rmax,nrecv); + sendnum[iswap] = nsend; + recvnum[iswap] = nrecv; + size_forward_recv[iswap] = nrecv*size_forward; + size_reverse_send[iswap] = nrecv*size_reverse; + size_reverse_recv[iswap] = nsend*size_reverse; + firstrecv[iswap] = atom->nlocal + atom->nghost; + atom->nghost += nrecv; + iswap++; + } + } + + // insure send/recv buffers are long enough for all forward & reverse comm + + int max = MAX(maxforward*smax,maxreverse*rmax); + if (max > maxsend) grow_send_kokkos(max,0); + max = MAX(maxforward*rmax,maxreverse*smax); + if (max > maxrecv) grow_recv_kokkos(max); + + // reset global->local map + + if (map_style) atom->map_set(); + if (exec_space == Host) k_sendlist.sync<LMPDeviceType>(); + atomKK->modified(exec_space,ALL_MASK); + DeviceType::fence(); +} + +/* ---------------------------------------------------------------------- + realloc the size of the send buffer as needed with BUFFACTOR & BUFEXTRA + if 
flag = 1, realloc + if flag = 0, don't need to realloc with copy, just free/malloc +------------------------------------------------------------------------- */ + +void CommKokkos::grow_send_kokkos(int n, int flag, ExecutionSpace space) +{ + maxsend = static_cast<int> (BUFFACTOR * n); + int maxsend_border = (maxsend+BUFEXTRA+5)/atom->avec->size_border + 2; + if (flag) { + if(space == Device) + k_buf_send.modify<LMPDeviceType>(); + else + k_buf_send.modify<LMPHostType>(); + + k_buf_send.resize(maxsend_border,atom->avec->size_border); + buf_send = k_buf_send.view<LMPHostType>().ptr_on_device(); + } + else { + k_buf_send = ArrayTypes<LMPDeviceType>:: + tdual_xfloat_2d("comm:k_buf_send",maxsend_border,atom->avec->size_border); + buf_send = k_buf_send.view<LMPHostType>().ptr_on_device(); + } +} + +/* ---------------------------------------------------------------------- + free/malloc the size of the recv buffer as needed with BUFFACTOR +------------------------------------------------------------------------- */ + +void CommKokkos::grow_recv_kokkos(int n, ExecutionSpace space) +{ + maxrecv = static_cast<int> (BUFFACTOR * n); + int maxrecv_border = (maxrecv+BUFEXTRA+5)/atom->avec->size_border + 2; + k_buf_recv = ArrayTypes<LMPDeviceType>:: + tdual_xfloat_2d("comm:k_buf_recv",maxrecv_border,atom->avec->size_border); + buf_recv = k_buf_recv.view<LMPHostType>().ptr_on_device(); +} + +/* ---------------------------------------------------------------------- + realloc the size of the iswap sendlist as needed with BUFFACTOR +------------------------------------------------------------------------- */ + +void CommKokkos::grow_list(int iswap, int n) +{ + int size = static_cast<int> (BUFFACTOR * n); + + memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist"); + + for(int i=0;i<maxswap;i++) { + maxsendlist[i]=size; sendlist[i]=&k_sendlist.view<LMPHostType>()(i,0); + } +} + +/* ---------------------------------------------------------------------- + realloc the 
buffers needed for swaps
+   n = new number of swaps, stored in maxswap
+   the per-swap arrays are freed and allocated again via free_swap() and
+     allocate_swap(), likewise the multi arrays for style MULTI
+   the sendlist is regrown to n rows; its row length is the current
+     second dimension of k_sendlist, but at least BUFMIN
+   every entry of maxsendlist is set to that row length
+------------------------------------------------------------------------- */
+
+void CommKokkos::grow_swap(int n)
+{
+  free_swap();
+  allocate_swap(n);
+  if (style == MULTI) {
+    free_multi();
+    allocate_multi(n);
+  }
+
+  maxswap = n;
+  int size = MAX(k_sendlist.d_view.dimension_1(),BUFMIN);
+
+  // NOTE(review): grow_list() refreshes the sendlist[i] row pointers after
+  //   grow_kokkos(), this routine does not; presumably grow_kokkos() sets
+  //   them itself -- confirm in memory_kokkos.h
+
+  memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist");
+
+  memory->grow(maxsendlist,n,"comm:maxsendlist");
+  for (int i=0;i<maxswap;i++) maxsendlist[i]=size;
+}
diff --git a/src/KOKKOS/comm_kokkos.h b/src/KOKKOS/comm_kokkos.h
new file mode 100644
index 0000000000..46d3552d2d
--- /dev/null
+++ b/src/KOKKOS/comm_kokkos.h
@@ -0,0 +1,63 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef LMP_COMM_KOKKOS_H
+#define LMP_COMM_KOKKOS_H
+
+#include "comm_brick.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+// Kokkos variant of CommBrick.
+// Declares forward_comm(), exchange() and borders() plus templated
+// *_device versions of each, and keeps the send list and the send/recv
+// buffers as Kokkos dual views.
+
+class CommKokkos : public CommBrick {
+ public:
+  class AtomKokkos *atomKK;            // atom pointer cast to AtomKokkos
+
+  // comm mode flags
+  // *_classic: use the CommBrick routine instead of the *_device version
+  // *_on_host: instantiate the *_device version with the host type
+  // exchange_* flags also control borders()
+
+  bool exchange_comm_classic;
+  bool forward_comm_classic;
+  bool exchange_comm_on_host;
+  bool forward_comm_on_host;
+
+  CommKokkos(class LAMMPS *);
+  ~CommKokkos();
+  void init();
+
+  void forward_comm(int dummy = 0);    // forward comm of atom coords
+  void exchange();                     // move atoms to new procs
+  void borders();                      // setup list of atoms to comm
+
+  // Kokkos implementations of the three routines above
+  // DeviceType selects which copy of the dual views is used
+
+  template<class DeviceType> void forward_comm_device(int dummy);
+  template<class DeviceType> void exchange_device();
+  template<class DeviceType> void borders_device();
+
+ protected:
+  DAT::tdual_int_2d k_sendlist;               // send list, one row per swap
+  DAT::tdual_xfloat_2d k_buf_send,k_buf_recv; // send and recv buffers
+
+  // work arrays of exchange_device(): atoms to send, atoms to copy into
+  // the vacated slots, per-atom send flag
+
+  DAT::tdual_int_1d k_exchange_sendlist,k_exchange_copylist,k_sendflag;
+  DAT::tdual_int_1d k_count;                  // one-element send counter
+  //double *buf_send;                 // send buffer for all comm
+  //double *buf_recv;                 // recv buffer for all comm
+
+  // buffer and list regrowth; space defaults to Host
+
+  void grow_send_kokkos(int, int, ExecutionSpace space = Host);
+  void grow_recv_kokkos(int, ExecutionSpace space = Host);
+  void grow_list(int, int);
+  void grow_swap(int);
+};
+
+}
+
+#endif
+
+/* ERROR/WARNING messages:
+
+*/
diff --git a/src/KOKKOS/domain_kokkos.cpp b/src/KOKKOS/domain_kokkos.cpp
new file mode 100644
index 0000000000..c2214b611b
--- /dev/null
+++ b/src/KOKKOS/domain_kokkos.cpp
@@ -0,0 +1,207 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.
This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "domain_kokkos.h"
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+
+using namespace LAMMPS_NS;
+
+/* ----------------------------------------------------------------------
+   constructor only forwards lmp to Domain
+------------------------------------------------------------------------- */
+
+DomainKokkos::DomainKokkos(LAMMPS *lmp) : Domain(lmp) {}
+
+/* ----------------------------------------------------------------------
+   cache the atom pointer cast to AtomKokkos, then run Domain::init()
+------------------------------------------------------------------------- */
+
+void DomainKokkos::init()
+{
+  atomKK = (AtomKokkos *) atom;
+  Domain::init();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType, int PERIODIC, int DEFORM_VREMAP>
+struct DomainPBCFunctor {
+  typedef DeviceType device_type;
+  double lo[3],hi[3],period[3];
+  typename ArrayTypes<DeviceType>::t_x_array x;
+  typename ArrayTypes<DeviceType>::t_v_array v;
+  typename ArrayTypes<DeviceType>::t_int_1d mask;
+  typename ArrayTypes<DeviceType>::t_int_1d image;
+  int deform_groupbit;
+  double h_rate[6];
+  int xperiodic,yperiodic,zperiodic;
+
+  DomainPBCFunctor(double* _lo, double* _hi, double* _period,
+                   DAT::tdual_x_array _x, DAT::tdual_v_array _v,
+                   DAT::tdual_int_1d _mask, DAT::tdual_int_1d _image,
+                   int _deform_groupbit, double* _h_rate,
+                   int _xperiodic, int _yperiodic, int _zperiodic):
+    x(_x.view<DeviceType>()), v(_v.view<DeviceType>()),
+    mask(_mask.view<DeviceType>()), image(_image.view<DeviceType>()),
+    deform_groupbit(_deform_groupbit),
+    xperiodic(_xperiodic), yperiodic(_yperiodic), zperiodic(_zperiodic){
+    lo[0]=_lo[0]; lo[1]=_lo[1]; lo[2]=_lo[2];
+    hi[0]=_hi[0]; hi[1]=_hi[1]; hi[2]=_hi[2];
+    period[0]=_period[0]; period[1]=_period[1]; period[2]=_period[2];
+    h_rate[0]=_h_rate[0]; h_rate[1]=_h_rate[1]; h_rate[2]=_h_rate[2];
+    h_rate[3]=_h_rate[3]; h_rate[4]=_h_rate[4]; h_rate[5]=_h_rate[5];
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &i) const {
+    if
(PERIODIC && xperiodic) { + if (x(i,0) < lo[0]) { + x(i,0) += period[0]; + if (DEFORM_VREMAP && (mask[i] & deform_groupbit)) v(i,0) += h_rate[0]; + int idim = image[i] & IMGMASK; + const int otherdims = image[i] ^ idim; + idim--; + idim &= IMGMASK; + image[i] = otherdims | idim; + } + if (x(i,0) >= hi[0]) { + x(i,0) -= period[0]; + x(i,0) = MAX(x(i,0),lo[0]); + if (DEFORM_VREMAP && (mask[i] & deform_groupbit)) v(i,0) -= h_rate[0]; + int idim = image[i] & IMGMASK; + const int otherdims = image[i] ^ idim; + idim++; + idim &= IMGMASK; + image[i] = otherdims | idim; + } + } + + if (PERIODIC && yperiodic) { + if (x(i,1) < lo[1]) { + x(i,1) += period[1]; + if (DEFORM_VREMAP && (mask[i] & deform_groupbit)) { + v(i,0) += h_rate[5]; + v(i,1) += h_rate[1]; + } + int idim = (image[i] >> IMGBITS) & IMGMASK; + const int otherdims = image[i] ^ (idim << IMGBITS); + idim--; + idim &= IMGMASK; + image[i] = otherdims | (idim << IMGBITS); + } + if (x(i,1) >= hi[1]) { + x(i,1) -= period[1]; + x(i,1) = MAX(x(i,1),lo[1]); + if (DEFORM_VREMAP && (mask[i] & deform_groupbit)) { + v(i,0) -= h_rate[5]; + v(i,1) -= h_rate[1]; + } + int idim = (image[i] >> IMGBITS) & IMGMASK; + const int otherdims = image[i] ^ (idim << IMGBITS); + idim++; + idim &= IMGMASK; + image[i] = otherdims | (idim << IMGBITS); + } + } + + if (PERIODIC && zperiodic) { + if (x(i,2) < lo[2]) { + x(i,2) += period[2]; + if (DEFORM_VREMAP && (mask[i] & deform_groupbit)) { + v(i,0) += h_rate[4]; + v(i,1) += h_rate[3]; + v(i,2) += h_rate[2]; + } + int idim = image[i] >> IMG2BITS; + const int otherdims = image[i] ^ (idim << IMG2BITS); + idim--; + idim &= IMGMASK; + image[i] = otherdims | (idim << IMG2BITS); + } + if (x(i,2) >= hi[2]) { + x(i,2) -= period[2]; + x(i,2) = MAX(x(i,2),lo[2]); + if (DEFORM_VREMAP && (mask[i] & deform_groupbit)) { + v(i,0) -= h_rate[4]; + v(i,1) -= h_rate[3]; + v(i,2) -= h_rate[2]; + } + int idim = image[i] >> IMG2BITS; + const int otherdims = image[i] ^ (idim << IMG2BITS); + idim++; + idim &= IMGMASK; 
+ image[i] = otherdims | (idim << IMG2BITS); + } + } + } +}; + +/* ---------------------------------------------------------------------- + enforce PBC and modify box image flags for each atom + called every reneighboring and by other commands that change atoms + resulting coord must satisfy lo <= coord < hi + MAX is important since coord - prd < lo can happen when coord = hi + if fix deform, remap velocity of fix group atoms by box edge velocities + for triclinic, atoms must be in lamda coords (0-1) before pbc is called + image = 10 bits for each dimension + increment/decrement in wrap-around fashion +------------------------------------------------------------------------- */ + +void DomainKokkos::pbc() +{ + double *lo,*hi,*period; + int nlocal = atomKK->nlocal; + + if (triclinic == 0) { + lo = boxlo; + hi = boxhi; + period = prd; + } else { + lo = boxlo_lamda; + hi = boxhi_lamda; + period = prd_lamda; + } + + atomKK->sync(Device,X_MASK|V_MASK|MASK_MASK|IMAGE_MASK); + atomKK->modified(Device,X_MASK|V_MASK); + + if (xperiodic || yperiodic || zperiodic) { + if (deform_vremap) { + DomainPBCFunctor<LMPDeviceType,1,1> + f(lo,hi,period, + atomKK->k_x,atomKK->k_v,atomKK->k_mask,atomKK->k_image, + deform_groupbit,h_rate,xperiodic,yperiodic,zperiodic); + Kokkos::parallel_for(nlocal,f); + } else { + DomainPBCFunctor<LMPDeviceType,1,0> + f(lo,hi,period, + atomKK->k_x,atomKK->k_v,atomKK->k_mask,atomKK->k_image, + deform_groupbit,h_rate,xperiodic,yperiodic,zperiodic); + Kokkos::parallel_for(nlocal,f); + } + } else { + if (deform_vremap) { + DomainPBCFunctor<LMPDeviceType,0,1> + f(lo,hi,period, + atomKK->k_x,atomKK->k_v,atomKK->k_mask,atomKK->k_image, + deform_groupbit,h_rate,xperiodic,yperiodic,zperiodic); + Kokkos::parallel_for(nlocal,f); + } else { + DomainPBCFunctor<LMPDeviceType,0,0> + f(lo,hi,period, + atomKK->k_x,atomKK->k_v,atomKK->k_mask,atomKK->k_image, + deform_groupbit,h_rate,xperiodic,yperiodic,zperiodic); + Kokkos::parallel_for(nlocal,f); + } + } + + 
LMPDeviceType::fence(); +} + diff --git a/src/KOKKOS/domain_kokkos.h b/src/KOKKOS/domain_kokkos.h new file mode 100644 index 0000000000..36e0aa4aaa --- /dev/null +++ b/src/KOKKOS/domain_kokkos.h @@ -0,0 +1,38 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_DOMAIN_KOKKOS_H +#define LMP_DOMAIN_KOKKOS_H + +#include "domain.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +class DomainKokkos : public Domain { + public: + class AtomKokkos *atomKK; + + DomainKokkos(class LAMMPS *); + ~DomainKokkos() {} + void init(); + void pbc(); +}; + +} + +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/KOKKOS/fix_nve_kokkos.cpp b/src/KOKKOS/fix_nve_kokkos.cpp new file mode 100644 index 0000000000..3076dca4fa --- /dev/null +++ b/src/KOKKOS/fix_nve_kokkos.cpp @@ -0,0 +1,177 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include "stdio.h" +#include "string.h" +#include "fix_nve_kokkos.h" +#include "atom_masks.h" +#include "atom_kokkos.h" +#include "force.h" +#include "update.h" +#include "respa.h" +#include "error.h" + +using namespace LAMMPS_NS; +using namespace FixConst; + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +FixNVEKokkos<DeviceType>::FixNVEKokkos(LAMMPS *lmp, int narg, char **arg) : + FixNVE(lmp, narg, arg) +{ + atomKK = (AtomKokkos *) atom; + execution_space = ExecutionSpaceFromDevice<DeviceType>::space; + + datamask_read = X_MASK | V_MASK | F_MASK | MASK_MASK | RMASS_MASK | TYPE_MASK; + datamask_modify = X_MASK | V_MASK | F_MASK; +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +void FixNVEKokkos<DeviceType>::init() +{ + FixNVE::init(); + + atomKK->k_mass.modify<LMPHostType>(); + atomKK->k_mass.sync<LMPDeviceType>(); +} + +/* ---------------------------------------------------------------------- + allow for both per-type and per-atom mass +------------------------------------------------------------------------- */ + +template<class DeviceType> +void FixNVEKokkos<DeviceType>::initial_integrate(int vflag) +{ + atomKK->sync(execution_space,datamask_read); + atomKK->modified(execution_space,datamask_modify); + + x = atomKK->k_x.view<DeviceType>(); + v = atomKK->k_v.view<DeviceType>(); + f = atomKK->k_f.view<DeviceType>(); + rmass = atomKK->rmass; + mass = atomKK->k_mass.view<DeviceType>(); + type = atomKK->k_type.view<DeviceType>(); + mask = atomKK->k_mask.view<DeviceType>(); + int nlocal = atomKK->nlocal; + if (igroup == atomKK->firstgroup) nlocal = atomKK->nfirst; + + if (rmass) { + FixNVEKokkosInitialIntegrateFunctor<DeviceType,1> functor(this); + Kokkos::parallel_for(nlocal,functor); + } else { + FixNVEKokkosInitialIntegrateFunctor<DeviceType,0> functor(this); + 
Kokkos::parallel_for(nlocal,functor); + } + DeviceType::fence(); +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void FixNVEKokkos<DeviceType>::initial_integrate_item(int i) const +{ + if (mask[i] & groupbit) { + const double dtfm = dtf / mass[type[i]]; + v(i,0) += dtfm * f(i,0); + v(i,1) += dtfm * f(i,1); + v(i,2) += dtfm * f(i,2); + x(i,0) += dtv * v(i,0); + x(i,1) += dtv * v(i,1); + x(i,2) += dtv * v(i,2); + } +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void FixNVEKokkos<DeviceType>::initial_integrate_rmass_item(int i) const +{ + if (mask[i] & groupbit) { + const double dtfm = dtf / rmass[type[i]]; + v(i,0) += dtfm * f(i,0); + v(i,1) += dtfm * f(i,1); + v(i,2) += dtfm * f(i,2); + x(i,0) += dtv * v(i,0); + x(i,1) += dtv * v(i,1); + x(i,2) += dtv * v(i,2); + } +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +void FixNVEKokkos<DeviceType>::final_integrate() +{ + atomKK->sync(execution_space,datamask_read); + atomKK->modified(execution_space,datamask_modify); + + v = atomKK->k_v.view<DeviceType>(); + f = atomKK->k_f.view<DeviceType>(); + rmass = atomKK->rmass; + mass = atomKK->k_mass.view<DeviceType>(); + type = atomKK->k_type.view<DeviceType>(); + mask = atomKK->k_mask.view<DeviceType>(); + int nlocal = atomKK->nlocal; + if (igroup == atomKK->firstgroup) nlocal = atomKK->nfirst; + + if (rmass) { + FixNVEKokkosFinalIntegrateFunctor<DeviceType,1> functor(this); + Kokkos::parallel_for(nlocal,functor); + } else { + FixNVEKokkosFinalIntegrateFunctor<DeviceType,0> functor(this); + Kokkos::parallel_for(nlocal,functor); + } + DeviceType::fence(); + + // debug + //atomKK->sync(Host,datamask_read); +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void FixNVEKokkos<DeviceType>::final_integrate_item(int i) const +{ + if (mask[i] & groupbit) { + const double dtfm = dtf / mass[type[i]]; + v(i,0) += dtfm * f(i,0); + v(i,1) += dtfm * f(i,1); + v(i,2) += dtfm * f(i,2); + } +} + 
+template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void FixNVEKokkos<DeviceType>::final_integrate_rmass_item(int i) const +{ + if (mask[i] & groupbit) { + const double dtfm = dtf / rmass[type[i]]; + v(i,0) += dtfm * f(i,0); + v(i,1) += dtfm * f(i,1); + v(i,2) += dtfm * f(i,2); + } +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +void FixNVEKokkos<DeviceType>::cleanup_copy() +{ + id = style = NULL; + vatom = NULL; +} + +template class FixNVEKokkos<LMPDeviceType>; +#if DEVICE==2 +template class FixNVEKokkos<LMPHostType>; +#endif diff --git a/src/KOKKOS/fix_nve_kokkos.h b/src/KOKKOS/fix_nve_kokkos.h new file mode 100644 index 0000000000..bd9ec4d816 --- /dev/null +++ b/src/KOKKOS/fix_nve_kokkos.h @@ -0,0 +1,110 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(nve/kk,FixNVEKokkos<LMPDeviceType>) +FixStyle(nve/kk/device,FixNVEKokkos<LMPDeviceType>) +FixStyle(nve/kk/host,FixNVEKokkos<LMPHostType>) + +#else + +#ifndef LMP_FIX_NVE_KOKKOS_H +#define LMP_FIX_NVE_KOKKOS_H + +#include "fix_nve.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +template<class DeviceType> +class FixNVEKokkos; + +template <class DeviceType, int RMass> +class FixNVEKokkosInitialIntegrateFunctor; +template <class DeviceType, int RMass> +class FixNVEKokkosFinalIntegrateFunctor; + +template<class DeviceType> +class FixNVEKokkos : public FixNVE { + public: + FixNVEKokkos(class LAMMPS *, int, char **); + ~FixNVEKokkos() {} + void cleanup_copy(); + void init(); + void initial_integrate(int); + void final_integrate(); + + KOKKOS_INLINE_FUNCTION + void initial_integrate_item(int) const; + KOKKOS_INLINE_FUNCTION + void initial_integrate_rmass_item(int) const; + KOKKOS_INLINE_FUNCTION + void final_integrate_item(int) const; + KOKKOS_INLINE_FUNCTION + void final_integrate_rmass_item(int) const; + + private: + class AtomKokkos *atomKK; + + typename ArrayTypes<DeviceType>::t_x_array x; + typename ArrayTypes<DeviceType>::t_v_array v; + typename ArrayTypes<DeviceType>::t_f_array_const f; + double *rmass; + typename ArrayTypes<DeviceType>::t_float_1d_randomread mass; + typename ArrayTypes<DeviceType>::t_int_1d type; + typename ArrayTypes<DeviceType>::t_int_1d mask; +}; + +template <class DeviceType, int RMass> +struct FixNVEKokkosInitialIntegrateFunctor { + typedef DeviceType device_type ; + FixNVEKokkos<DeviceType> c; + + FixNVEKokkosInitialIntegrateFunctor(FixNVEKokkos<DeviceType>* c_ptr): + c(*c_ptr) {c.cleanup_copy();}; + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + if (RMass) c.initial_integrate_rmass_item(i); + else c.initial_integrate_item(i); + } +}; + +template <class DeviceType, int RMass> +struct 
FixNVEKokkosFinalIntegrateFunctor { + typedef DeviceType device_type ; + FixNVEKokkos<DeviceType> c; + + FixNVEKokkosFinalIntegrateFunctor(FixNVEKokkos<DeviceType>* c_ptr): + c(*c_ptr) {c.cleanup_copy();}; + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + if (RMass) c.final_integrate_rmass_item(i); + else c.final_integrate_item(i); + } +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Illegal ... command + +Self-explanatory. Check the input script syntax and compare to the +documentation for the command. You can use -echo screen as a +command-line option when running LAMMPS to see the offending line. + +*/ diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp new file mode 100644 index 0000000000..4f6031f229 --- /dev/null +++ b/src/KOKKOS/kokkos.cpp @@ -0,0 +1,220 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include "stdio.h" +#include "string.h" +#include "stdlib.h" +#include "ctype.h" +#include "kokkos.h" +#include "lammps.h" +#include "neighbor_kokkos.h" +#include "neigh_list_kokkos.h" +#include "error.h" + +using namespace LAMMPS_NS; + +enum{FULL,HALFTHREAD,HALF}; + +/* ---------------------------------------------------------------------- */ + +KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) +{ + kokkos_exists = 1; + lmp->kokkos = this; + + // process any command-line args that invoke Kokkos settings + + int device = 0; + int num_threads = 1; + int numa = 1; + + int iarg = 0; + while (iarg < narg) { + if (strcmp(arg[iarg],"d") == 0 || strcmp(arg[iarg],"device") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Invalid Kokkos command-line args"); + device = atoi(arg[iarg+1]); + iarg += 2; + + } else if (strcmp(arg[iarg],"g") == 0 || + strcmp(arg[iarg],"gpus") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Invalid Kokkos command-line args"); + int ngpu = atoi(arg[iarg+1]); + iarg += 2; + + int skip_gpu = 9999; + if (iarg+2 < narg && isdigit(arg[iarg+2][0])) { + skip_gpu = atoi(arg[iarg+2]); + iarg++; + } + + char *str; + if (str = getenv("SLURM_LOCALID")) { + int local_rank = atoi(str); + device = local_rank % ngpu; + if (device >= skip_gpu) device++; + } + if (str = getenv("MV2_COMM_WORLD_LOCAL_RANK")) { + int local_rank = atoi(str); + device = local_rank % ngpu; + if (device >= skip_gpu) device++; + } + if (str = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) { + int local_rank = atoi(str); + device = local_rank % ngpu; + if (device >= skip_gpu) device++; + } + + } else if (strcmp(arg[iarg],"t") == 0 || + strcmp(arg[iarg],"threads") == 0) { + num_threads = atoi(arg[iarg+1]); + iarg += 2; + + } else if (strcmp(arg[iarg],"n") == 0 || + strcmp(arg[iarg],"numa") == 0) { + numa = atoi(arg[iarg+1]); + iarg += 2; + + } else error->all(FLERR,"Invalid Kokkos command-line args"); + 
} + + // initialize Kokkos + +#if DEVICE==2 + Kokkos::Cuda::host_mirror_device_type::initialize(num_threads,numa); + Kokkos::Cuda::SelectDevice select_device(device); + Kokkos::Cuda::initialize(select_device); +#else + LMPHostType::initialize(num_threads,numa); +#endif + + // default settings for package kokkos command + + neighflag = FULL; + exchange_comm_classic = 0; + forward_comm_classic = 0; + exchange_comm_on_host = 1; + forward_comm_on_host = 1; +} + +/* ---------------------------------------------------------------------- */ + +KokkosLMP::~KokkosLMP() +{ + // finalize Kokkos + +#if DEVICE==2 + Kokkos::Cuda::finalize(); + Kokkos::Cuda::host_mirror_device_type::finalize(); +#else + LMPHostType::finalize(); +#endif +} + +/* ---------------------------------------------------------------------- + invoked by package kokkos command +------------------------------------------------------------------------- */ + +void KokkosLMP::accelerator(int narg, char **arg) +{ + int iarg = 0; + while (iarg < narg) { + if (strcmp(arg[iarg],"neigh") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package command"); + if (strcmp(arg[iarg+1],"full") == 0) neighflag = FULL; + else if (strcmp(arg[iarg+1],"half/thread") == 0) neighflag = HALFTHREAD; + else if (strcmp(arg[iarg+1],"half") == 0) neighflag = HALF; + else if (strcmp(arg[iarg+1],"n2") == 0) neighflag = N2; + else if (strcmp(arg[iarg+1],"full/cluster") == 0) neighflag = FULLCLUSTER; + else error->all(FLERR,"Illegal package command"); + iarg += 2; + } else if (strcmp(arg[iarg],"comm/exchange") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package command"); + if (strcmp(arg[iarg+1],"no") == 0) exchange_comm_classic = 1; + else if (strcmp(arg[iarg+1],"host") == 0) { + exchange_comm_classic = 0; + exchange_comm_on_host = 1; + } else if (strcmp(arg[iarg+1],"device") == 0) { + exchange_comm_classic = 0; + exchange_comm_on_host = 0; + } else error->all(FLERR,"Illegal package command"); + iarg += 2; + } else if 
(strcmp(arg[iarg],"comm/forward") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package command"); + if (strcmp(arg[iarg+1],"no") == 0) forward_comm_classic = 1; + else if (strcmp(arg[iarg+1],"host") == 0) { + forward_comm_classic = 0; + forward_comm_on_host = 1; + } else if (strcmp(arg[iarg+1],"device") == 0) { + forward_comm_classic = 0; + forward_comm_on_host = 0; + } else error->all(FLERR,"Illegal package command"); + iarg += 2; + } else error->all(FLERR,"Illegal package command"); + } +} + +/* ---------------------------------------------------------------------- + called by Finish +------------------------------------------------------------------------- */ + +int KokkosLMP::neigh_list_kokkos(int m) +{ + NeighborKokkos *nk = (NeighborKokkos *) neighbor; + if (nk->lists_host[m] && nk->lists_host[m]->d_numneigh.dimension_0()) + return 1; + if (nk->lists_device[m] && nk->lists_device[m]->d_numneigh.dimension_0()) + return 1; + return 0; +} + +/* ---------------------------------------------------------------------- + called by Finish +------------------------------------------------------------------------- */ + +int KokkosLMP::neigh_count(int m) +{ + int inum; + int nneigh = 0; + + ArrayTypes<LMPHostType>::t_int_1d h_ilist; + ArrayTypes<LMPHostType>::t_int_1d h_numneigh; + + NeighborKokkos *nk = (NeighborKokkos *) neighbor; + if (nk->lists_host[m]) { + inum = nk->lists_host[m]->inum; +#ifndef KOKKOS_USE_UVM + h_ilist = Kokkos::create_mirror_view(nk->lists_host[m]->d_ilist); + h_numneigh = Kokkos::create_mirror_view(nk->lists_host[m]->d_numneigh); +#else + h_ilist = nk->lists_host[m]->d_ilist; + h_numneigh = nk->lists_host[m]->d_numneigh; +#endif + Kokkos::deep_copy(h_ilist,nk->lists_host[m]->d_ilist); + Kokkos::deep_copy(h_numneigh,nk->lists_host[m]->d_numneigh); + } else if (nk->lists_device[m]) { + inum = nk->lists_device[m]->inum; +#ifndef KOKKOS_USE_UVM + h_ilist = Kokkos::create_mirror_view(nk->lists_device[m]->d_ilist); + h_numneigh = 
Kokkos::create_mirror_view(nk->lists_device[m]->d_numneigh); +#else + h_ilist = nk->lists_device[m]->d_ilist; + h_numneigh = nk->lists_device[m]->d_numneigh; +#endif + Kokkos::deep_copy(h_ilist,nk->lists_device[m]->d_ilist); + Kokkos::deep_copy(h_numneigh,nk->lists_device[m]->d_numneigh); + } + + for (int i = 0; i < inum; i++) nneigh += h_numneigh[h_ilist[i]]; + + return nneigh; +} diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h new file mode 100644 index 0000000000..512c76a489 --- /dev/null +++ b/src/KOKKOS/kokkos.h @@ -0,0 +1,40 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifndef KOKKOS_LMP_H +#define KOKKOS_LMP_H + +#include "pointers.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +class KokkosLMP : protected Pointers { + public: + int kokkos_exists; + int neighflag; + int exchange_comm_classic; + int forward_comm_classic; + int exchange_comm_on_host; + int forward_comm_on_host; + + KokkosLMP(class LAMMPS *, int, char **); + ~KokkosLMP(); + void accelerator(int, char **); + int neigh_list_kokkos(int); + int neigh_count(int); +}; + +} + +#endif diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h new file mode 100644 index 0000000000..4887b91b10 --- /dev/null +++ b/src/KOKKOS/kokkos_type.h @@ -0,0 +1,617 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifndef LMP_LMPTYPE_KOKKOS_H +#define LMP_LMPTYPE_KOKKOS_H + +#include <Kokkos_View.hpp> +#include <Kokkos_Macros.hpp> +#include <Kokkos_Atomic.hpp> +#include <Kokkos_DualView.hpp> +#include <impl/Kokkos_Timer.hpp> +#include <Kokkos_Vectorization.hpp> + +#define MAX_TYPES_STACKPARAMS 12 +#define NeighClusterSize 8 +// set LMPHostype and LMPDeviceType + +#ifndef DEVICE +#define DEVICE 1 +#endif + +#if DEVICE==1 + #ifdef KOKKOS_HAVE_OPENMP + #include "Kokkos_OpenMP.hpp" + typedef Kokkos::OpenMP LMPDeviceType; + typedef Kokkos::OpenMP LMPHostType; + #else + #include "Kokkos_Threads.hpp" + typedef Kokkos::Threads LMPDeviceType; + typedef Kokkos::Threads LMPHostType; + #endif + #ifndef __CUDACC__ + struct double2 { + double x, y; + }; + struct float2 { + float x, y; + }; + struct double4 { + double x, y, z, w; + }; + struct float4 { + float x, y, z, w; + }; + #endif +#else + #include "cuda.h" + #include "cuda_runtime.h" + #include "Kokkos_Cuda.hpp" + #include "Kokkos_Threads.hpp" + typedef Kokkos::Cuda LMPDeviceType; + typedef Kokkos::Cuda::host_mirror_device_type LMPHostType; +#endif + +// set ExecutionSpace stuct with variable "space" + +template<class Device> +struct ExecutionSpaceFromDevice; + +#ifdef KOKKOS_HAVE_OPENMP +template<> +struct ExecutionSpaceFromDevice<Kokkos::OpenMP> { + static const LAMMPS_NS::ExecutionSpace space = LAMMPS_NS::Host; +}; +#else +template<> +struct ExecutionSpaceFromDevice<Kokkos::Threads> { + static const LAMMPS_NS::ExecutionSpace space = LAMMPS_NS::Host; +}; +#endif +#if DEVICE==2 +template<> +struct ExecutionSpaceFromDevice<Kokkos::Cuda> { + static const LAMMPS_NS::ExecutionSpace space = LAMMPS_NS::Device; +}; +#endif + +// define precision +// handle global precision, force, energy, positions, kspace separately + +#ifndef PRECISION +#define PRECISION 2 +#endif +#if PRECISION==1 +typedef float LMP_FLOAT; +typedef float2 LMP_FLOAT2; +typedef float4 
LMP_FLOAT4; +#else +typedef double LMP_FLOAT; +typedef double2 LMP_FLOAT2; +typedef double4 LMP_FLOAT4; +#endif + +#ifndef PREC_FORCE +#define PREC_FORCE PRECISION +#endif + +#if PREC_FORCE==1 +typedef float F_FLOAT; +typedef float2 F_FLOAT2; +typedef float4 F_FLOAT4; +#else +typedef double F_FLOAT; +typedef double2 F_FLOAT2; +typedef double4 F_FLOAT4; +#endif + +#ifndef PREC_ENERGY +#define PREC_ENERGY PRECISION +#endif + +#if PREC_ENERGY==1 +typedef float E_FLOAT; +typedef float2 E_FLOAT2; +typedef float4 E_FLOAT4; +#else +typedef double E_FLOAT; +typedef double2 E_FLOAT2; +typedef double4 E_FLOAT4; +#endif + +struct s_EV_FLOAT { + E_FLOAT evdwl; + E_FLOAT ecoul; + E_FLOAT v[6]; + KOKKOS_INLINE_FUNCTION + s_EV_FLOAT() { + evdwl = 0; + ecoul = 0; + v[0] = 0; v[1] = 0; v[2] = 0; + v[3] = 0; v[4] = 0; v[5] = 0; + } + + KOKKOS_INLINE_FUNCTION + s_EV_FLOAT& operator+=(const s_EV_FLOAT &rhs) { + evdwl += rhs.evdwl; + ecoul += rhs.ecoul; + v[0] += rhs.v[0]; + v[1] += rhs.v[1]; + v[2] += rhs.v[2]; + v[3] += rhs.v[3]; + v[4] += rhs.v[4]; + v[5] += rhs.v[5]; + return *this; + } +}; +typedef struct s_EV_FLOAT EV_FLOAT; + +#ifndef PREC_POS +#define PREC_POS PRECISION +#endif + +#if PREC_POS==1 +typedef float X_FLOAT; +typedef float2 X_FLOAT2; +typedef float4 X_FLOAT4; +#else +typedef double X_FLOAT; +typedef double2 X_FLOAT2; +typedef double4 X_FLOAT4; +#endif + +#ifndef PREC_VELOCITIES +#define PREC_VELOCITIES PRECISION +#endif + +#if PREC_VELOCITIES==1 +typedef float V_FLOAT; +typedef float2 V_FLOAT2; +typedef float4 V_FLOAT4; +#else +typedef double V_FLOAT; +typedef double2 V_FLOAT2; +typedef double4 V_FLOAT4; +#endif + +#if PREC_KSPACE==1 +typedef float K_FLOAT; +typedef float2 K_FLOAT2; +typedef float4 K_FLOAT4; +#else +typedef double K_FLOAT; +typedef double2 K_FLOAT2; +typedef double4 K_FLOAT4; +#endif + +// ------------------------------------------------------------------------ + +// LAMMPS types + +template <class DeviceType> +struct ArrayTypes; + +template <> 
+struct ArrayTypes<LMPDeviceType> { + +// scalar types + +typedef Kokkos:: + DualView<int, LMPDeviceType::array_layout, LMPDeviceType> tdual_int_scalar; +typedef tdual_int_scalar::t_dev t_int_scalar; +typedef tdual_int_scalar::t_dev_const t_int_scalar_const; +typedef tdual_int_scalar::t_dev_um t_int_scalar_um; +typedef tdual_int_scalar::t_dev_const_um t_int_scalar_const_um; + +typedef Kokkos:: + DualView<LMP_FLOAT, LMPDeviceType::array_layout, LMPDeviceType> + tdual_float_scalar; +typedef tdual_float_scalar::t_dev t_float_scalar; +typedef tdual_float_scalar::t_dev_const t_float_scalar_const; +typedef tdual_float_scalar::t_dev_um t_float_scalar_um; +typedef tdual_float_scalar::t_dev_const_um t_float_scalar_const_um; + +// generic array types + +typedef Kokkos:: + DualView<int*, LMPDeviceType::array_layout, LMPDeviceType> tdual_int_1d; +typedef tdual_int_1d::t_dev t_int_1d; +typedef tdual_int_1d::t_dev_const t_int_1d_const; +typedef tdual_int_1d::t_dev_um t_int_1d_um; +typedef tdual_int_1d::t_dev_const_um t_int_1d_const_um; +typedef tdual_int_1d::t_dev_const_randomread t_int_1d_randomread; + +typedef Kokkos:: + DualView<int**, Kokkos::LayoutRight, LMPDeviceType> tdual_int_2d; +typedef tdual_int_2d::t_dev t_int_2d; +typedef tdual_int_2d::t_dev_const t_int_2d_const; +typedef tdual_int_2d::t_dev_um t_int_2d_um; +typedef tdual_int_2d::t_dev_const_um t_int_2d_const_um; +typedef tdual_int_2d::t_dev_const_randomread t_int_2d_randomread; + +typedef Kokkos:: + DualView<LAMMPS_NS::tagint*, LMPDeviceType::array_layout, LMPDeviceType> + tdual_tagint_1d; +typedef tdual_tagint_1d::t_dev t_tagint_1d; +typedef tdual_tagint_1d::t_dev_const t_tagint_1d_const; +typedef tdual_tagint_1d::t_dev_um t_tagint_1d_um; +typedef tdual_tagint_1d::t_dev_const_um t_tagint_1d_const_um; +typedef tdual_tagint_1d::t_dev_const_randomread t_tagint_1d_randomread; + +// 1d float array n + +typedef Kokkos::DualView<LMP_FLOAT*, LMPDeviceType::array_layout, LMPDeviceType> tdual_float_1d; +typedef 
tdual_float_1d::t_dev t_float_1d; +typedef tdual_float_1d::t_dev_const t_float_1d_const; +typedef tdual_float_1d::t_dev_um t_float_1d_um; +typedef tdual_float_1d::t_dev_const_um t_float_1d_const_um; +typedef tdual_float_1d::t_dev_const_randomread t_float_1d_randomread; + +//2d float array n +typedef Kokkos::DualView<LMP_FLOAT**, Kokkos::LayoutRight, LMPDeviceType> tdual_float_2d; +typedef tdual_float_2d::t_dev t_float_2d; +typedef tdual_float_2d::t_dev_const t_float_2d_const; +typedef tdual_float_2d::t_dev_um t_float_2d_um; +typedef tdual_float_2d::t_dev_const_um t_float_2d_const_um; +typedef tdual_float_2d::t_dev_const_randomread t_float_2d_randomread; + +//Position Types +//1d X_FLOAT array n +typedef Kokkos::DualView<X_FLOAT*, LMPDeviceType::array_layout, LMPDeviceType> tdual_xfloat_1d; +typedef tdual_xfloat_1d::t_dev t_xfloat_1d; +typedef tdual_xfloat_1d::t_dev_const t_xfloat_1d_const; +typedef tdual_xfloat_1d::t_dev_um t_xfloat_1d_um; +typedef tdual_xfloat_1d::t_dev_const_um t_xfloat_1d_const_um; +typedef tdual_xfloat_1d::t_dev_const_randomread t_xfloat_1d_randomread; + +//2d X_FLOAT array n*m +typedef Kokkos::DualView<X_FLOAT**, Kokkos::LayoutRight, LMPDeviceType> tdual_xfloat_2d; +typedef tdual_xfloat_2d::t_dev t_xfloat_2d; +typedef tdual_xfloat_2d::t_dev_const t_xfloat_2d_const; +typedef tdual_xfloat_2d::t_dev_um t_xfloat_2d_um; +typedef tdual_xfloat_2d::t_dev_const_um t_xfloat_2d_const_um; +typedef tdual_xfloat_2d::t_dev_const_randomread t_xfloat_2d_randomread; + +//2d X_FLOAT array n*4 +#ifdef LMP_KOKKOS_NO_LEGACY +typedef Kokkos::DualView<X_FLOAT*[3], Kokkos::LayoutLeft, LMPDeviceType> tdual_x_array; +#else +typedef Kokkos::DualView<X_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_x_array; +#endif +typedef tdual_x_array::t_dev t_x_array; +typedef tdual_x_array::t_dev_const t_x_array_const; +typedef tdual_x_array::t_dev_um t_x_array_um; +typedef tdual_x_array::t_dev_const_um t_x_array_const_um; +typedef tdual_x_array::t_dev_const_randomread 
t_x_array_randomread; + +//Velocity Types +//1d V_FLOAT array n +typedef Kokkos::DualView<V_FLOAT*, LMPDeviceType::array_layout, LMPDeviceType> tdual_vfloat_1d; +typedef tdual_vfloat_1d::t_dev t_vfloat_1d; +typedef tdual_vfloat_1d::t_dev_const t_vfloat_1d_const; +typedef tdual_vfloat_1d::t_dev_um t_vfloat_1d_um; +typedef tdual_vfloat_1d::t_dev_const_um t_vfloat_1d_const_um; +typedef tdual_vfloat_1d::t_dev_const_randomread t_vfloat_1d_randomread; + +//2d V_FLOAT array n*m +typedef Kokkos::DualView<V_FLOAT**, Kokkos::LayoutRight, LMPDeviceType> tdual_vfloat_2d; +typedef tdual_vfloat_2d::t_dev t_vfloat_2d; +typedef tdual_vfloat_2d::t_dev_const t_vfloat_2d_const; +typedef tdual_vfloat_2d::t_dev_um t_vfloat_2d_um; +typedef tdual_vfloat_2d::t_dev_const_um t_vfloat_2d_const_um; +typedef tdual_vfloat_2d::t_dev_const_randomread t_vfloat_2d_randomread; + +//2d V_FLOAT array n*3 +typedef Kokkos::DualView<V_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_v_array; +//typedef Kokkos::DualView<V_FLOAT*[3], LMPDeviceType::array_layout, LMPDeviceType> tdual_v_array; +typedef tdual_v_array::t_dev t_v_array; +typedef tdual_v_array::t_dev_const t_v_array_const; +typedef tdual_v_array::t_dev_um t_v_array_um; +typedef tdual_v_array::t_dev_const_um t_v_array_const_um; +typedef tdual_v_array::t_dev_const_randomread t_v_array_randomread; + +//Force Types +//1d F_FLOAT array n + +typedef Kokkos::DualView<F_FLOAT*, LMPDeviceType::array_layout, LMPDeviceType> tdual_ffloat_1d; +typedef tdual_ffloat_1d::t_dev t_ffloat_1d; +typedef tdual_ffloat_1d::t_dev_const t_ffloat_1d_const; +typedef tdual_ffloat_1d::t_dev_um t_ffloat_1d_um; +typedef tdual_ffloat_1d::t_dev_const_um t_ffloat_1d_const_um; +typedef tdual_ffloat_1d::t_dev_const_randomread t_ffloat_1d_randomread; + +//2d F_FLOAT array n*m + +typedef Kokkos::DualView<F_FLOAT**, Kokkos::LayoutRight, LMPDeviceType> tdual_ffloat_2d; +typedef tdual_ffloat_2d::t_dev t_ffloat_2d; +typedef tdual_ffloat_2d::t_dev_const t_ffloat_2d_const; +typedef 
tdual_ffloat_2d::t_dev_um t_ffloat_2d_um; +typedef tdual_ffloat_2d::t_dev_const_um t_ffloat_2d_const_um; +typedef tdual_ffloat_2d::t_dev_const_randomread t_ffloat_2d_randomread; + +//2d F_FLOAT array n*3 + +typedef Kokkos::DualView<F_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_f_array; +//typedef Kokkos::DualView<F_FLOAT*[3], LMPDeviceType::array_layout, LMPDeviceType> tdual_f_array; +typedef tdual_f_array::t_dev t_f_array; +typedef tdual_f_array::t_dev_const t_f_array_const; +typedef tdual_f_array::t_dev_um t_f_array_um; +typedef tdual_f_array::t_dev_const_um t_f_array_const_um; +typedef tdual_f_array::t_dev_const_randomread t_f_array_randomread; + +//2d F_FLOAT array n*6 (for virial) + +typedef Kokkos::DualView<F_FLOAT*[6], Kokkos::LayoutRight, LMPDeviceType> tdual_virial_array; +typedef tdual_virial_array::t_dev t_virial_array; +typedef tdual_virial_array::t_dev_const t_virial_array_const; +typedef tdual_virial_array::t_dev_um t_virial_array_um; +typedef tdual_virial_array::t_dev_const_um t_virial_array_const_um; +typedef tdual_virial_array::t_dev_const_randomread t_virial_array_randomread; + +//Energy Types +//1d E_FLOAT array n + +typedef Kokkos::DualView<E_FLOAT*, LMPDeviceType::array_layout, LMPDeviceType> tdual_efloat_1d; +typedef tdual_efloat_1d::t_dev t_efloat_1d; +typedef tdual_efloat_1d::t_dev_const t_efloat_1d_const; +typedef tdual_efloat_1d::t_dev_um t_efloat_1d_um; +typedef tdual_efloat_1d::t_dev_const_um t_efloat_1d_const_um; +typedef tdual_efloat_1d::t_dev_const_randomread t_efloat_1d_randomread; + +//2d E_FLOAT array n*m + +typedef Kokkos::DualView<E_FLOAT**, Kokkos::LayoutRight, LMPDeviceType> tdual_efloat_2d; +typedef tdual_efloat_2d::t_dev t_efloat_2d; +typedef tdual_efloat_2d::t_dev_const t_efloat_2d_const; +typedef tdual_efloat_2d::t_dev_um t_efloat_2d_um; +typedef tdual_efloat_2d::t_dev_const_um t_efloat_2d_const_um; +typedef tdual_efloat_2d::t_dev_const_randomread t_efloat_2d_randomread; + +//2d E_FLOAT array n*3 + +typedef 
Kokkos::DualView<E_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_e_array; +typedef tdual_e_array::t_dev t_e_array; +typedef tdual_e_array::t_dev_const t_e_array_const; +typedef tdual_e_array::t_dev_um t_e_array_um; +typedef tdual_e_array::t_dev_const_um t_e_array_const_um; +typedef tdual_e_array::t_dev_const_randomread t_e_array_randomread; + +//Neighbor Types + +typedef Kokkos::DualView<int**, LMPDeviceType::array_layout, LMPDeviceType> tdual_neighbors_2d; +typedef tdual_neighbors_2d::t_dev t_neighbors_2d; +typedef tdual_neighbors_2d::t_dev_const t_neighbors_2d_const; +typedef tdual_neighbors_2d::t_dev_um t_neighbors_2d_um; +typedef tdual_neighbors_2d::t_dev_const_um t_neighbors_2d_const_um; +typedef tdual_neighbors_2d::t_dev_const_randomread t_neighbors_2d_randomread; + +}; + +#if DEVICE==2 +template <> +struct ArrayTypes<LMPHostType> { + +//Scalar Types + +typedef Kokkos::DualView<int, LMPDeviceType::array_layout, LMPDeviceType> tdual_int_scalar; +typedef tdual_int_scalar::t_host t_int_scalar; +typedef tdual_int_scalar::t_host_const t_int_scalar_const; +typedef tdual_int_scalar::t_host_um t_int_scalar_um; +typedef tdual_int_scalar::t_host_const_um t_int_scalar_const_um; + +typedef Kokkos::DualView<LMP_FLOAT, LMPDeviceType::array_layout, LMPDeviceType> tdual_float_scalar; +typedef tdual_float_scalar::t_host t_float_scalar; +typedef tdual_float_scalar::t_host_const t_float_scalar_const; +typedef tdual_float_scalar::t_host_um t_float_scalar_um; +typedef tdual_float_scalar::t_host_const_um t_float_scalar_const_um; + +//Generic ArrayTypes +typedef Kokkos::DualView<int*, LMPDeviceType::array_layout, LMPDeviceType> tdual_int_1d; +typedef tdual_int_1d::t_host t_int_1d; +typedef tdual_int_1d::t_host_const t_int_1d_const; +typedef tdual_int_1d::t_host_um t_int_1d_um; +typedef tdual_int_1d::t_host_const_um t_int_1d_const_um; +typedef tdual_int_1d::t_host_const_randomread t_int_1d_randomread; + +typedef Kokkos::DualView<int**, Kokkos::LayoutRight, LMPDeviceType> 
tdual_int_2d; +typedef tdual_int_2d::t_host t_int_2d; +typedef tdual_int_2d::t_host_const t_int_2d_const; +typedef tdual_int_2d::t_host_um t_int_2d_um; +typedef tdual_int_2d::t_host_const_um t_int_2d_const_um; +typedef tdual_int_2d::t_host_const_randomread t_int_2d_randomread; + +typedef Kokkos::DualView<LAMMPS_NS::tagint*, LMPDeviceType::array_layout, LMPDeviceType> tdual_tagint_1d; +typedef tdual_tagint_1d::t_host t_tagint_1d; +typedef tdual_tagint_1d::t_host_const t_tagint_1d_const; +typedef tdual_tagint_1d::t_host_um t_tagint_1d_um; +typedef tdual_tagint_1d::t_host_const_um t_tagint_1d_const_um; +typedef tdual_tagint_1d::t_host_const_randomread t_tagint_1d_randomread; + +//1d float array n +typedef Kokkos::DualView<LMP_FLOAT*, LMPDeviceType::array_layout, LMPDeviceType> tdual_float_1d; +typedef tdual_float_1d::t_host t_float_1d; +typedef tdual_float_1d::t_host_const t_float_1d_const; +typedef tdual_float_1d::t_host_um t_float_1d_um; +typedef tdual_float_1d::t_host_const_um t_float_1d_const_um; +typedef tdual_float_1d::t_host_const_randomread t_float_1d_randomread; + +//2d float array n +typedef Kokkos::DualView<LMP_FLOAT**, Kokkos::LayoutRight, LMPDeviceType> tdual_float_2d; +typedef tdual_float_2d::t_host t_float_2d; +typedef tdual_float_2d::t_host_const t_float_2d_const; +typedef tdual_float_2d::t_host_um t_float_2d_um; +typedef tdual_float_2d::t_host_const_um t_float_2d_const_um; +typedef tdual_float_2d::t_host_const_randomread t_float_2d_randomread; + +//Position Types +//1d X_FLOAT array n +typedef Kokkos::DualView<X_FLOAT*, LMPDeviceType::array_layout, LMPDeviceType> tdual_xfloat_1d; +typedef tdual_xfloat_1d::t_host t_xfloat_1d; +typedef tdual_xfloat_1d::t_host_const t_xfloat_1d_const; +typedef tdual_xfloat_1d::t_host_um t_xfloat_1d_um; +typedef tdual_xfloat_1d::t_host_const_um t_xfloat_1d_const_um; +typedef tdual_xfloat_1d::t_host_const_randomread t_xfloat_1d_randomread; + +//2d X_FLOAT array n*m +typedef Kokkos::DualView<X_FLOAT**, Kokkos::LayoutRight, 
LMPDeviceType> tdual_xfloat_2d; +typedef tdual_xfloat_2d::t_host t_xfloat_2d; +typedef tdual_xfloat_2d::t_host_const t_xfloat_2d_const; +typedef tdual_xfloat_2d::t_host_um t_xfloat_2d_um; +typedef tdual_xfloat_2d::t_host_const_um t_xfloat_2d_const_um; +typedef tdual_xfloat_2d::t_host_const_randomread t_xfloat_2d_randomread; + +//2d X_FLOAT array n*3 +typedef Kokkos::DualView<X_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_x_array; +typedef tdual_x_array::t_host t_x_array; +typedef tdual_x_array::t_host_const t_x_array_const; +typedef tdual_x_array::t_host_um t_x_array_um; +typedef tdual_x_array::t_host_const_um t_x_array_const_um; +typedef tdual_x_array::t_host_const_randomread t_x_array_randomread; + +//Velocity Types +//1d V_FLOAT array n +typedef Kokkos::DualView<V_FLOAT*, LMPDeviceType::array_layout, LMPDeviceType> tdual_vfloat_1d; +typedef tdual_vfloat_1d::t_host t_vfloat_1d; +typedef tdual_vfloat_1d::t_host_const t_vfloat_1d_const; +typedef tdual_vfloat_1d::t_host_um t_vfloat_1d_um; +typedef tdual_vfloat_1d::t_host_const_um t_vfloat_1d_const_um; +typedef tdual_vfloat_1d::t_host_const_randomread t_vfloat_1d_randomread; + +//2d V_FLOAT array n*m +typedef Kokkos::DualView<V_FLOAT**, Kokkos::LayoutRight, LMPDeviceType> tdual_vfloat_2d; +typedef tdual_vfloat_2d::t_host t_vfloat_2d; +typedef tdual_vfloat_2d::t_host_const t_vfloat_2d_const; +typedef tdual_vfloat_2d::t_host_um t_vfloat_2d_um; +typedef tdual_vfloat_2d::t_host_const_um t_vfloat_2d_const_um; +typedef tdual_vfloat_2d::t_host_const_randomread t_vfloat_2d_randomread; + +//2d V_FLOAT array n*3 +typedef Kokkos::DualView<V_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_v_array; +//typedef Kokkos::DualView<V_FLOAT*[3], LMPDeviceType::array_layout, LMPDeviceType> tdual_v_array; +typedef tdual_v_array::t_host t_v_array; +typedef tdual_v_array::t_host_const t_v_array_const; +typedef tdual_v_array::t_host_um t_v_array_um; +typedef tdual_v_array::t_host_const_um t_v_array_const_um; +typedef 
tdual_v_array::t_host_const_randomread t_v_array_randomread; + +//Force Types +//1d F_FLOAT array n +typedef Kokkos::DualView<F_FLOAT*, LMPDeviceType::array_layout, LMPDeviceType> tdual_ffloat_1d; +typedef tdual_ffloat_1d::t_host t_ffloat_1d; +typedef tdual_ffloat_1d::t_host_const t_ffloat_1d_const; +typedef tdual_ffloat_1d::t_host_um t_ffloat_1d_um; +typedef tdual_ffloat_1d::t_host_const_um t_ffloat_1d_const_um; +typedef tdual_ffloat_1d::t_host_const_randomread t_ffloat_1d_randomread; + +//2d F_FLOAT array n*m +typedef Kokkos::DualView<F_FLOAT**, Kokkos::LayoutRight, LMPDeviceType> tdual_ffloat_2d; +typedef tdual_ffloat_2d::t_host t_ffloat_2d; +typedef tdual_ffloat_2d::t_host_const t_ffloat_2d_const; +typedef tdual_ffloat_2d::t_host_um t_ffloat_2d_um; +typedef tdual_ffloat_2d::t_host_const_um t_ffloat_2d_const_um; +typedef tdual_ffloat_2d::t_host_const_randomread t_ffloat_2d_randomread; + +//2d F_FLOAT array n*3 +typedef Kokkos::DualView<F_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_f_array; +//typedef Kokkos::DualView<F_FLOAT*[3], LMPDeviceType::array_layout, LMPDeviceType> tdual_f_array; +typedef tdual_f_array::t_host t_f_array; +typedef tdual_f_array::t_host_const t_f_array_const; +typedef tdual_f_array::t_host_um t_f_array_um; +typedef tdual_f_array::t_host_const_um t_f_array_const_um; +typedef tdual_f_array::t_host_const_randomread t_f_array_randomread; + +//2d F_FLOAT array n*6 (for virial) +typedef Kokkos::DualView<F_FLOAT*[6], Kokkos::LayoutRight, LMPDeviceType> tdual_virial_array; +typedef tdual_virial_array::t_host t_virial_array; +typedef tdual_virial_array::t_host_const t_virial_array_const; +typedef tdual_virial_array::t_host_um t_virial_array_um; +typedef tdual_virial_array::t_host_const_um t_virial_array_const_um; +typedef tdual_virial_array::t_host_const_randomread t_virial_array_randomread; + + + +//Energy Types +//1d E_FLOAT array n +typedef Kokkos::DualView<E_FLOAT*, LMPDeviceType::array_layout, LMPDeviceType> tdual_efloat_1d; +typedef 
tdual_efloat_1d::t_host t_efloat_1d; +typedef tdual_efloat_1d::t_host_const t_efloat_1d_const; +typedef tdual_efloat_1d::t_host_um t_efloat_1d_um; +typedef tdual_efloat_1d::t_host_const_um t_efloat_1d_const_um; +typedef tdual_efloat_1d::t_host_const_randomread t_efloat_1d_randomread; + +//2d E_FLOAT array n*m +typedef Kokkos::DualView<E_FLOAT**, Kokkos::LayoutRight, LMPDeviceType> tdual_efloat_2d; +typedef tdual_efloat_2d::t_host t_efloat_2d; +typedef tdual_efloat_2d::t_host_const t_efloat_2d_const; +typedef tdual_efloat_2d::t_host_um t_efloat_2d_um; +typedef tdual_efloat_2d::t_host_const_um t_efloat_2d_const_um; +typedef tdual_efloat_2d::t_host_const_randomread t_efloat_2d_randomread; + +//2d E_FLOAT array n*3 +typedef Kokkos::DualView<E_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_e_array; +typedef tdual_e_array::t_host t_e_array; +typedef tdual_e_array::t_host_const t_e_array_const; +typedef tdual_e_array::t_host_um t_e_array_um; +typedef tdual_e_array::t_host_const_um t_e_array_const_um; +typedef tdual_e_array::t_host_const_randomread t_e_array_randomread; + +//Neighbor Types +typedef Kokkos::DualView<int**, LMPDeviceType::array_layout, LMPDeviceType> tdual_neighbors_2d; +typedef tdual_neighbors_2d::t_host t_neighbors_2d; +typedef tdual_neighbors_2d::t_host_const t_neighbors_2d_const; +typedef tdual_neighbors_2d::t_host_um t_neighbors_2d_um; +typedef tdual_neighbors_2d::t_host_const_um t_neighbors_2d_const_um; +typedef tdual_neighbors_2d::t_host_const_randomread t_neighbors_2d_randomread; + +}; +#endif +//default LAMMPS Types +typedef struct ArrayTypes<LMPDeviceType> DAT; +typedef struct ArrayTypes<LMPHostType> HAT; + +template<class DeviceType, class BufferView, class DualView> +void buffer_view(BufferView &buf, DualView &view, + const size_t n0, + const size_t n1 = 0, + const size_t n2 = 0, + const size_t n3 = 0, + const size_t n4 = 0, + const size_t n5 = 0, + const size_t n6 = 0, + const size_t n7 = 0) { + + buf = BufferView( + view.template 
view<DeviceType>().ptr_on_device(), + n0,n1,n2,n3,n4,n5,n6,n7); + +} + +template<class DeviceType> +struct MemsetZeroFunctor { + typedef DeviceType device_type ; + void* ptr; + KOKKOS_INLINE_FUNCTION void operator()(const int i) const { + ((int*)ptr)[i] = 0; + } +}; + +template<class ViewType> +void memset_kokkos (ViewType &view) { + static MemsetZeroFunctor<typename ViewType::device_type> f; + f.ptr = view.ptr_on_device(); + Kokkos::parallel_for(view.capacity()*sizeof(typename ViewType::value_type)/4, f); + ViewType::device_type::fence(); +} + + +#endif diff --git a/src/KOKKOS/memory_kokkos.h b/src/KOKKOS/memory_kokkos.h new file mode 100644 index 0000000000..2651c5e5c0 --- /dev/null +++ b/src/KOKKOS/memory_kokkos.h @@ -0,0 +1,208 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Kokkos versions of create/grow/destroy multi-dimensional arrays
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   create a 1d array
+   each overload (re)assigns data to a freshly constructed TYPE labelled
+   with name and returns a copy of it
+------------------------------------------------------------------------- */
+
+// allocate data with n1 entries and point array at data.h_view's storage
+
+template <typename TYPE>
+TYPE create_kokkos(TYPE &data, typename TYPE::value_type *&array,
+                   int n1, const char *name)
+{
+  data = TYPE(name,n1);
+
+  // raw pointer handed back to the caller comes from the h_view member
+  array = data.h_view.ptr_on_device();
+
+  return data;
+}
+
+// allocate data with n1 entries, set up h_data alongside it, and point
+// array at h_data's storage
+
+template <typename TYPE, typename HTYPE>
+  TYPE create_kokkos(TYPE &data, HTYPE &h_data,
+                     typename TYPE::value_type *&array, int n1,
+                     const char *name)
+{
+  data = TYPE(std::string(name),n1);
+
+  // with KOKKOS_USE_UVM defined h_data is assigned data itself,
+  // otherwise it is a mirror view created from data
+#ifdef KOKKOS_USE_UVM
+  h_data = data;
+#else
+  h_data = Kokkos::create_mirror_view(data);
+#endif
+
+  array = h_data.ptr_on_device();
+  return data;
+}
+
+
+// same as the overload above, but without exporting a raw pointer
+
+template <typename TYPE, typename HTYPE>
+  TYPE create_kokkos(TYPE &data, HTYPE &h_data,
+                     int n1, const char *name)
+{
+  data = TYPE(std::string(name),n1);
+
+#ifdef KOKKOS_USE_UVM
+  h_data = data;
+#else
+  h_data = Kokkos::create_mirror_view(data);
+#endif
+
+  return data;
+}
+
+/* ----------------------------------------------------------------------
+   grow or shrink a 1d array to n1 entries
+   a NULL array means nothing exists yet, so the array is created instead
+------------------------------------------------------------------------- */
+
+template <typename TYPE>
+TYPE grow_kokkos(TYPE &data, typename TYPE::value_type *&array,
+                 int n1, const char *name)
+{
+  if (!array) return create_kokkos(data,array,n1,name);
+
+  // resize the existing object, then re-read the pointer from h_view
+  data.resize(n1);
+  array = data.h_view.ptr_on_device();
+
+  return data;
+}
+
+/* ----------------------------------------------------------------------
+   destroy a 1d array
+   no-op when array is already NULL
+------------------------------------------------------------------------- */
+
+template <typename TYPE>
+void destroy_kokkos(TYPE data, typename TYPE::value_type* &array)
+{
+  if (array != NULL) {
+    // NOTE(review): data is received by value, so this assignment only
+    // resets the local copy -- confirm that callers do not rely on their
+    // own object being reset here
+    data = TYPE();
+    array = NULL;
+  }
+}
+
+/* 
----------------------------------------------------------------------
+   create a 2d array
+   each overload (re)assigns data to a freshly constructed TYPE labelled
+   with name and returns a copy of it
+------------------------------------------------------------------------- */
+
+// allocate data as n1 x n2, no raw pointers exported
+
+template <typename TYPE>
+TYPE create_kokkos(TYPE &data, int n1, int n2, const char *name)
+{
+  data = TYPE(name,n1,n2);
+  return data;
+}
+
+// allocate data as n1 x n2 and set up h_data alongside it
+
+template <typename TYPE, typename HTYPE>
+  TYPE create_kokkos(TYPE &data, HTYPE &h_data, int n1, int n2,
+                     const char *name)
+{
+  data = TYPE(std::string(name),n1,n2);
+
+  // with KOKKOS_USE_UVM defined h_data is assigned data itself,
+  // otherwise it is a mirror view created from data
+#ifndef KOKKOS_USE_UVM
+  h_data = Kokkos::create_mirror_view(data);
+#else
+  h_data = data;
+#endif
+  return data;
+}
+
+// allocate data as n1 x n2 and build array, a table of n1 row pointers
+// where array[i] is the address of data.h_view(i,0)
+// the table itself comes from smalloc() and must be released with sfree()
+
+template <typename TYPE>
+TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array,
+                   int n1, int n2, const char *name)
+{
+  data = TYPE(std::string(name),n1,n2);
+
+  // bigint arithmetic so the byte count is not computed in int
+  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1;
+  array = (typename TYPE::value_type **) smalloc(nbytes,name);
+
+  for (int i = 0; i < n1; i++)
+    array[i] = &data.h_view(i,0);
+
+  return data;
+}
+
+// as above, but the row pointers refer to h_data(i,0) instead of
+// data.h_view(i,0)
+
+template <typename TYPE, typename HTYPE>
+  TYPE create_kokkos(TYPE &data, HTYPE &h_data,
+                     typename TYPE::value_type **&array, int n1, int n2,
+                     const char *name)
+{
+  data = TYPE(std::string(name),n1,n2);
+#ifndef KOKKOS_USE_UVM
+  h_data = Kokkos::create_mirror_view(data);
+#else
+  h_data = data;
+#endif
+
+  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1;
+  array = (typename TYPE::value_type **) smalloc(nbytes,name);
+
+  for (int i = 0; i < n1; i++)
+    array[i] = &h_data(i,0);
+
+  return data;
+}
+
+/* ----------------------------------------------------------------------
+   grow or shrink 1st dim of a 2d array
+   last dim must stay the same
+   a NULL array means nothing exists yet, so the array is created instead
+------------------------------------------------------------------------- */
+
+template <typename TYPE>
+TYPE grow_kokkos(TYPE &data, typename TYPE::value_type **&array,
+                 int n1, int n2, const char *name)
+{
+  if (array == NULL) return create_kokkos(data,array,n1,n2,name);
+
+  data.resize(n1,n2);
+
+  // resize the existing row-pointer table rather than allocating a new one
+  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1;
+  array = (typename TYPE::value_type**) srealloc(array,nbytes,name);
+
+  // every row pointer is recomputed after the resize
+  for (int i = 0; i < n1; i++)
+    array[i] = &data.h_view(i,0);
+
+  return data;
+}
+
+// create variant for a TYPE constructed from a single extent n1
+// array still receives n1 row pointers, each the address of data.h_view(i,0)
+
+template <typename TYPE>
+TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array,
+                   int n1, const char *name)
+{
+  data = TYPE(std::string(name),n1);
+
+  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1;
+  array = (typename TYPE::value_type **) smalloc(nbytes,name);
+
+  for (int i = 0; i < n1; i++)
+    array[i] = &data.h_view(i,0);
+
+  return data;
+}
+
+// grow variant matching the single-extent create above
+
+template <typename TYPE>
+TYPE grow_kokkos(TYPE &data, typename TYPE::value_type **&array,
+                 int n1, const char *name)
+{
+  if (array == NULL) return create_kokkos(data,array,n1,name);
+
+  data.resize(n1);
+
+  // array already owns a row-pointer table at this point, so it has to be
+  // resized with srealloc(); a fresh smalloc() would overwrite the only
+  // pointer to the old table and leak it on every grow
+  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1;
+  array = (typename TYPE::value_type **) srealloc(array,nbytes,name);
+
+  for (int i = 0; i < n1; i++)
+    array[i] = &data.h_view(i,0);
+
+  return data;
+}
+
+/* ----------------------------------------------------------------------
+   destroy a 2d array
+   frees the row-pointer table and NULLs array; no-op when array is NULL
+------------------------------------------------------------------------- */
+
+template <typename TYPE>
+void destroy_kokkos(TYPE data, typename TYPE::value_type** &array)
+{
+  if (array == NULL) return;
+
+  // NOTE(review): data is received by value, so this assignment only
+  // resets the local copy -- confirm that callers do not rely on their
+  // own object being reset here
+  data = TYPE();
+
+  sfree(array);
+  array = NULL;
+}
diff --git a/src/KOKKOS/modify_kokkos.cpp b/src/KOKKOS/modify_kokkos.cpp
new file mode 100644
index 0000000000..4fcd136156
--- /dev/null
+++ b/src/KOKKOS/modify_kokkos.cpp
@@ -0,0 +1,585 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation. 
Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software. This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "modify_kokkos.h"
+#include "atom_kokkos.h"
+#include "update.h"
+#include "fix.h"
+#include "compute.h"
+
+using namespace LAMMPS_NS;
+
+#define BIG 1.0e20
+
+// Pattern shared by every fix dispatch below: before a fix method is
+// invoked, atomKK->sync() is called with the fix's execution_space and
+// datamask_read, then atomKK->modified() with its execution_space and
+// datamask_modify.
+// NOTE(review): presumably sync() makes the fields in datamask_read current
+// in that execution space and modified() flags the datamask_modify fields
+// as changed there -- confirm against AtomKokkos
+
+/* ---------------------------------------------------------------------- */
+
+ModifyKokkos::ModifyKokkos(LAMMPS *lmp) : Modify(lmp)
+{
+  // keep a typed handle on the atom object for the sync/modified calls
+  atomKK = (AtomKokkos *) atom;
+}
+
+/* ----------------------------------------------------------------------
+   setup for run, calls setup() of all fixes and computes
+   called from Verlet, RESPA, Min
+   whichflag 1 -> fix setup(), whichflag 2 -> fix min_setup(),
+   any other value -> only the computes are set up
+------------------------------------------------------------------------- */
+
+void ModifyKokkos::setup(int vflag)
+{
+  // computes go first: NH fixes need the DOF of temperature computes
+  // by the time their own setup runs
+
+  for (int ic = 0; ic < ncompute; ic++) compute[ic]->setup();
+
+  const int which = update->whichflag;
+  if (which != 1 && which != 2) return;
+
+  for (int m = 0; m < nfix; m++) {
+    Fix *f = fix[m];
+    atomKK->sync(f->execution_space,f->datamask_read);
+    atomKK->modified(f->execution_space,f->datamask_modify);
+    if (which == 1) f->setup(vflag);
+    else f->min_setup(vflag);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   setup pre_exchange call, only for fixes that define pre_exchange
+   called from Verlet, RESPA, Min, and WriteRestart with whichflag = 0
+------------------------------------------------------------------------- */
+
+void ModifyKokkos::setup_pre_exchange()
+{
+  if (update->whichflag <= 1) {
+    for (int m = 0; m < n_pre_exchange; m++) {
+      Fix *f = fix[list_pre_exchange[m]];
+      atomKK->sync(f->execution_space,f->datamask_read);
+      atomKK->modified(f->execution_space,f->datamask_modify);
+      f->setup_pre_exchange();
+    }
+  } else if (update->whichflag == 2) {
+    for (int m = 0; m < n_min_pre_exchange; m++) {
+      Fix *f = fix[list_min_pre_exchange[m]];
+      atomKK->sync(f->execution_space,f->datamask_read);
+      atomKK->modified(f->execution_space,f->datamask_modify);
+      f->min_setup_pre_exchange();
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   setup pre_neighbor call, only for fixes that define pre_neighbor
+   called from Verlet, RESPA
+------------------------------------------------------------------------- */
+
+void ModifyKokkos::setup_pre_neighbor()
+{
+  if (update->whichflag == 1) {
+    for (int m = 0; m < n_pre_neighbor; m++) {
+      Fix *f = fix[list_pre_neighbor[m]];
+      atomKK->sync(f->execution_space,f->datamask_read);
+      atomKK->modified(f->execution_space,f->datamask_modify);
+      f->setup_pre_neighbor();
+    }
+  } else if (update->whichflag == 2) {
+    for (int m = 0; m < n_min_pre_neighbor; m++) {
+      Fix *f = fix[list_min_pre_neighbor[m]];
+      atomKK->sync(f->execution_space,f->datamask_read);
+      atomKK->modified(f->execution_space,f->datamask_modify);
+      f->min_setup_pre_neighbor();
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   setup pre_force call, only for fixes that define pre_force
+   called from Verlet, RESPA, Min
+------------------------------------------------------------------------- */
+
+void ModifyKokkos::setup_pre_force(int vflag)
+{
+  if (update->whichflag == 1) {
+    for (int m = 0; m < n_pre_force; m++) {
+      Fix *f = fix[list_pre_force[m]];
+      atomKK->sync(f->execution_space,f->datamask_read);
+      atomKK->modified(f->execution_space,f->datamask_modify);
+      f->setup_pre_force(vflag);
+    }
+  } else if (update->whichflag == 2) {
+    for (int m = 0; m < n_min_pre_force; m++) {
+      Fix *f = fix[list_min_pre_force[m]];
+      atomKK->sync(f->execution_space,f->datamask_read);
+      atomKK->modified(f->execution_space,f->datamask_modify);
+      f->min_setup_pre_force(vflag);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   1st half of integrate call, only for relevant fixes
+------------------------------------------------------------------------- */
+
+void ModifyKokkos::initial_integrate(int vflag)
+{
+  for (int m = 0; m < n_initial_integrate; m++) {
+    Fix *f = fix[list_initial_integrate[m]];
+    atomKK->sync(f->execution_space,f->datamask_read);
+    atomKK->modified(f->execution_space,f->datamask_modify);
+    f->initial_integrate(vflag);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   post_integrate call, only for relevant fixes
+------------------------------------------------------------------------- */
+
+void ModifyKokkos::post_integrate()
+{
+  for (int m = 0; m < n_post_integrate; m++) {
+    Fix *f = fix[list_post_integrate[m]];
+    atomKK->sync(f->execution_space,f->datamask_read);
+    atomKK->modified(f->execution_space,f->datamask_modify);
+    f->post_integrate();
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pre_exchange call, only for relevant fixes 
+------------------------------------------------------------------------- */ + +void ModifyKokkos::pre_exchange() +{ + for (int i = 0; i < n_pre_exchange; i++) { + atomKK->sync(fix[list_pre_exchange[i]]->execution_space, + fix[list_pre_exchange[i]]->datamask_read); + atomKK->modified(fix[list_pre_exchange[i]]->execution_space, + fix[list_pre_exchange[i]]->datamask_modify); + fix[list_pre_exchange[i]]->pre_exchange(); + } +} + +/* ---------------------------------------------------------------------- + pre_neighbor call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::pre_neighbor() +{ + for (int i = 0; i < n_pre_neighbor; i++) { + atomKK->sync(fix[list_pre_neighbor[i]]->execution_space, + fix[list_pre_neighbor[i]]->datamask_read); + atomKK->modified(fix[list_pre_neighbor[i]]->execution_space, + fix[list_pre_neighbor[i]]->datamask_modify); + fix[list_pre_neighbor[i]]->pre_neighbor(); + } +} + +/* ---------------------------------------------------------------------- + pre_force call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::pre_force(int vflag) +{ + for (int i = 0; i < n_pre_force; i++) { + atomKK->sync(fix[list_pre_force[i]]->execution_space, + fix[list_pre_force[i]]->datamask_read); + atomKK->modified(fix[list_pre_force[i]]->execution_space, + fix[list_pre_force[i]]->datamask_modify); + fix[list_pre_force[i]]->pre_force(vflag); + } +} + +/* ---------------------------------------------------------------------- + post_force call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::post_force(int vflag) +{ + for (int i = 0; i < n_post_force; i++) { + atomKK->sync(fix[list_post_force[i]]->execution_space, + fix[list_post_force[i]]->datamask_read); + atomKK->modified(fix[list_post_force[i]]->execution_space, + 
fix[list_post_force[i]]->datamask_modify); + fix[list_post_force[i]]->post_force(vflag); + } +} + +/* ---------------------------------------------------------------------- + 2nd half of integrate call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::final_integrate() +{ + for (int i = 0; i < n_final_integrate; i++) { + atomKK->sync(fix[list_final_integrate[i]]->execution_space, + fix[list_final_integrate[i]]->datamask_read); + atomKK->modified(fix[list_final_integrate[i]]->execution_space, + fix[list_final_integrate[i]]->datamask_modify); + fix[list_final_integrate[i]]->final_integrate(); + } +} + +/* ---------------------------------------------------------------------- + end-of-timestep call, only for relevant fixes + only call fix->end_of_step() on timesteps that are multiples of nevery +------------------------------------------------------------------------- */ + +void ModifyKokkos::end_of_step() +{ + for (int i = 0; i < n_end_of_step; i++) + if (update->ntimestep % end_of_step_every[i] == 0) { + atomKK->sync(fix[list_end_of_step[i]]->execution_space, + fix[list_end_of_step[i]]->datamask_read); + atomKK->modified(fix[list_end_of_step[i]]->execution_space, + fix[list_end_of_step[i]]->datamask_modify); + fix[list_end_of_step[i]]->end_of_step(); + } +} + +/* ---------------------------------------------------------------------- + thermo energy call, only for relevant fixes + called by Thermo class + compute_scalar() is fix call to return energy +------------------------------------------------------------------------- */ + +double ModifyKokkos::thermo_energy() +{ + double energy = 0.0; + for (int i = 0; i < n_thermo_energy; i++) { + atomKK->sync(fix[list_thermo_energy[i]]->execution_space, + fix[list_thermo_energy[i]]->datamask_read); + atomKK->modified(fix[list_thermo_energy[i]]->execution_space, + fix[list_thermo_energy[i]]->datamask_modify); + energy += 
fix[list_thermo_energy[i]]->compute_scalar(); + } + return energy; +} + +/* ---------------------------------------------------------------------- + post_run call +------------------------------------------------------------------------- */ + +void ModifyKokkos::post_run() +{ + for (int i = 0; i < nfix; i++) { + atomKK->sync(fix[i]->execution_space, + fix[i]->datamask_read); + atomKK->modified(fix[i]->execution_space, + fix[i]->datamask_modify); + fix[i]->post_run(); + } +} + +/* ---------------------------------------------------------------------- + setup rRESPA pre_force call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::setup_pre_force_respa(int vflag, int ilevel) +{ + for (int i = 0; i < n_pre_force; i++) { + atomKK->sync(fix[list_pre_force[i]]->execution_space, + fix[list_pre_force[i]]->datamask_read); + atomKK->modified(fix[list_pre_force[i]]->execution_space, + fix[list_pre_force[i]]->datamask_modify); + fix[list_pre_force[i]]->setup_pre_force_respa(vflag,ilevel); + } +} + +/* ---------------------------------------------------------------------- + 1st half of rRESPA integrate call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::initial_integrate_respa(int vflag, int ilevel, int iloop) +{ + for (int i = 0; i < n_initial_integrate_respa; i++) { + atomKK->sync(fix[list_initial_integrate_respa[i]]->execution_space, + fix[list_initial_integrate_respa[i]]->datamask_read); + atomKK->modified(fix[list_initial_integrate_respa[i]]->execution_space, + fix[list_initial_integrate_respa[i]]->datamask_modify); + fix[list_initial_integrate_respa[i]]-> + initial_integrate_respa(vflag,ilevel,iloop); + } +} + +/* ---------------------------------------------------------------------- + rRESPA post_integrate call, only for relevant fixes +------------------------------------------------------------------------- */ + +void 
ModifyKokkos::post_integrate_respa(int ilevel, int iloop) +{ + for (int i = 0; i < n_post_integrate_respa; i++) { + atomKK->sync(fix[list_post_integrate_respa[i]]->execution_space, + fix[list_post_integrate_respa[i]]->datamask_read); + atomKK->modified(fix[list_post_integrate_respa[i]]->execution_space, + fix[list_post_integrate_respa[i]]->datamask_modify); + fix[list_post_integrate_respa[i]]->post_integrate_respa(ilevel,iloop); + } +} + +/* ---------------------------------------------------------------------- + rRESPA pre_force call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::pre_force_respa(int vflag, int ilevel, int iloop) +{ + for (int i = 0; i < n_pre_force_respa; i++) { + atomKK->sync(fix[list_pre_force_respa[i]]->execution_space, + fix[list_pre_force_respa[i]]->datamask_read); + atomKK->modified(fix[list_pre_force_respa[i]]->execution_space, + fix[list_pre_force_respa[i]]->datamask_modify); + fix[list_pre_force_respa[i]]->pre_force_respa(vflag,ilevel,iloop); + } +} + +/* ---------------------------------------------------------------------- + rRESPA post_force call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::post_force_respa(int vflag, int ilevel, int iloop) +{ + for (int i = 0; i < n_post_force_respa; i++) { + atomKK->sync(fix[list_post_force_respa[i]]->execution_space, + fix[list_post_force_respa[i]]->datamask_read); + atomKK->modified(fix[list_post_force_respa[i]]->execution_space, + fix[list_post_force_respa[i]]->datamask_modify); + fix[list_post_force_respa[i]]->post_force_respa(vflag,ilevel,iloop); + } +} + +/* ---------------------------------------------------------------------- + 2nd half of rRESPA integrate call, only for relevant fixes +------------------------------------------------------------------------- */ + +void ModifyKokkos::final_integrate_respa(int ilevel, int iloop) +{ + for 
(int i = 0; i < n_final_integrate_respa; i++) { + atomKK->sync(fix[list_final_integrate_respa[i]]->execution_space, + fix[list_final_integrate_respa[i]]->datamask_read); + atomKK->modified(fix[list_final_integrate_respa[i]]->execution_space, + fix[list_final_integrate_respa[i]]->datamask_modify); + fix[list_final_integrate_respa[i]]->final_integrate_respa(ilevel,iloop); + } +} + +/* ---------------------------------------------------------------------- + minimizer pre-exchange call, only for relevant fixes (iterates list_min_pre_exchange; sync on datamask_read and modified on datamask_modify precede each fix call) +------------------------------------------------------------------------- */ + +void ModifyKokkos::min_pre_exchange() +{ + for (int i = 0; i < n_min_pre_exchange; i++) { + atomKK->sync(fix[list_min_pre_exchange[i]]->execution_space, + fix[list_min_pre_exchange[i]]->datamask_read); + atomKK->modified(fix[list_min_pre_exchange[i]]->execution_space, + fix[list_min_pre_exchange[i]]->datamask_modify); + fix[list_min_pre_exchange[i]]->min_pre_exchange(); + } +} + +/* ---------------------------------------------------------------------- + minimizer pre-neighbor call, only for relevant fixes (iterates list_min_pre_neighbor with the same sync and modified calls) +------------------------------------------------------------------------- */ + +void ModifyKokkos::min_pre_neighbor() +{ + for (int i = 0; i < n_min_pre_neighbor; i++) { + atomKK->sync(fix[list_min_pre_neighbor[i]]->execution_space, + fix[list_min_pre_neighbor[i]]->datamask_read); + atomKK->modified(fix[list_min_pre_neighbor[i]]->execution_space, + fix[list_min_pre_neighbor[i]]->datamask_modify); + fix[list_min_pre_neighbor[i]]->min_pre_neighbor(); + } +} + +/* ---------------------------------------------------------------------- + minimizer pre-force call, only for relevant fixes (vflag is forwarded unchanged to each fix) +------------------------------------------------------------------------- */ + +void ModifyKokkos::min_pre_force(int vflag) +{ + for (int i = 0; i < n_min_pre_force; i++) { + atomKK->sync(fix[list_min_pre_force[i]]->execution_space, + fix[list_min_pre_force[i]]->datamask_read); +
atomKK->modified(fix[list_min_pre_force[i]]->execution_space, + fix[list_min_pre_force[i]]->datamask_modify); + fix[list_min_pre_force[i]]->min_pre_force(vflag); + } +} + +/* ---------------------------------------------------------------------- + minimizer force adjustment call, only for relevant fixes (iterates list_min_post_force; vflag is forwarded unchanged) +------------------------------------------------------------------------- */ + +void ModifyKokkos::min_post_force(int vflag) +{ + for (int i = 0; i < n_min_post_force; i++) { + atomKK->sync(fix[list_min_post_force[i]]->execution_space, + fix[list_min_post_force[i]]->datamask_read); + atomKK->modified(fix[list_min_post_force[i]]->execution_space, + fix[list_min_post_force[i]]->datamask_modify); + fix[list_min_post_force[i]]->min_post_force(vflag); + } +} + +/* ---------------------------------------------------------------------- + minimizer energy/force evaluation, only for relevant fixes + return energy and forces on extra degrees of freedom (each fix is handed &fextra[index], where index is a running offset advanced by that fix min_dof(); the return value is the sum of the per-fix energies) +------------------------------------------------------------------------- */ + +double ModifyKokkos::min_energy(double *fextra) +{ + int ifix,index; + + index = 0; + double eng = 0.0; + for (int i = 0; i < n_min_energy; i++) { + ifix = list_min_energy[i]; + atomKK->sync(fix[ifix]->execution_space,fix[ifix]->datamask_read); + atomKK->modified(fix[ifix]->execution_space,fix[ifix]->datamask_modify); + eng += fix[ifix]->min_energy(&fextra[index]); + index += fix[ifix]->min_dof(); + } + return eng; +} + +/* ---------------------------------------------------------------------- + store current state of extra dof, only for relevant fixes (loops over list_min_energy and forwards to each fix min_store) +------------------------------------------------------------------------- */ + +void ModifyKokkos::min_store() +{ + for (int i = 0; i < n_min_energy; i++) { + atomKK->sync(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_read); + atomKK->modified(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_modify); +
fix[list_min_energy[i]]->min_store(); + } +} + +/* ---------------------------------------------------------------------- + manage state of extra dof on a stack, only for relevant fixes (min_clearstore, min_pushstore and min_popstore each loop over list_min_energy and forward to the same-named fix method) +------------------------------------------------------------------------- */ + +void ModifyKokkos::min_clearstore() +{ + for (int i = 0; i < n_min_energy; i++) { + atomKK->sync(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_read); + atomKK->modified(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_modify); + fix[list_min_energy[i]]->min_clearstore(); + } +} + +void ModifyKokkos::min_pushstore() +{ + for (int i = 0; i < n_min_energy; i++) { + atomKK->sync(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_read); + atomKK->modified(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_modify); + fix[list_min_energy[i]]->min_pushstore(); + } +} + +void ModifyKokkos::min_popstore() +{ + for (int i = 0; i < n_min_energy; i++) { + atomKK->sync(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_read); + atomKK->modified(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_modify); + fix[list_min_energy[i]]->min_popstore(); + } +} + +/* ---------------------------------------------------------------------- + displace extra dof along vector hextra, only for relevant fixes (each fix receives &hextra[index], with index advanced by its min_dof()) +------------------------------------------------------------------------- */ + +void ModifyKokkos::min_step(double alpha, double *hextra) +{ + int ifix,index; + + index = 0; + for (int i = 0; i < n_min_energy; i++) { + ifix = list_min_energy[i]; + atomKK->sync(fix[ifix]->execution_space,fix[ifix]->datamask_read); + atomKK->modified(fix[ifix]->execution_space,fix[ifix]->datamask_modify); + fix[ifix]->min_step(alpha,&hextra[index]); + index += fix[ifix]->min_dof(); + } +} + +/* ---------------------------------------------------------------------- + compute max
allowed step size along vector hextra, only for relevant fixes +------------------------------------------------------------------------- */ + +double ModifyKokkos::max_alpha(double *hextra) +{ + int ifix,index; + + double alpha = BIG; /* BIG is defined outside this view; it seeds the running minimum over the per-fix values */ + index = 0; + for (int i = 0; i < n_min_energy; i++) { + ifix = list_min_energy[i]; + atomKK->sync(fix[ifix]->execution_space,fix[ifix]->datamask_read); + atomKK->modified(fix[ifix]->execution_space,fix[ifix]->datamask_modify); + double alpha_one = fix[ifix]->max_alpha(&hextra[index]); + alpha = MIN(alpha,alpha_one); + index += fix[ifix]->min_dof(); + } + return alpha; +} + +/* ---------------------------------------------------------------------- + extract extra dof for minimization, only for relevant fixes (returns the sum of min_dof() over list_min_energy) +------------------------------------------------------------------------- */ + +int ModifyKokkos::min_dof() +{ + int ndof = 0; + for (int i = 0; i < n_min_energy; i++) { + atomKK->sync(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_read); + atomKK->modified(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_modify); + ndof += fix[list_min_energy[i]]->min_dof(); + } + return ndof; +} + +/* ---------------------------------------------------------------------- + reset reference state of fix, only for relevant fixes (returns 1 if any fix returned nonzero, else 0) +------------------------------------------------------------------------- */ + +int ModifyKokkos::min_reset_ref() +{ + int itmp,itmpall; + itmpall = 0; + for (int i = 0; i < n_min_energy; i++) { + atomKK->sync(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_read); + atomKK->modified(fix[list_min_energy[i]]->execution_space, + fix[list_min_energy[i]]->datamask_modify); + itmp = fix[list_min_energy[i]]->min_reset_ref(); + if (itmp) itmpall = 1; + } + return itmpall; +} diff --git a/src/KOKKOS/modify_kokkos.h b/src/KOKKOS/modify_kokkos.h new file mode 100644 index 0000000000..c0c3a8d680 --- /dev/null +++ b/src/KOKKOS/modify_kokkos.h
@@ -0,0 +1,73 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_MODIFY_KOKKOS_H +#define LMP_MODIFY_KOKKOS_H + +#include "modify.h" + +namespace LAMMPS_NS { + +class ModifyKokkos : public Modify { /* Modify subclass declaring Kokkos-aware versions of the fix-dispatch entry points; presumably these override virtuals declared in modify.h - confirm */ + public: + ModifyKokkos(class LAMMPS *); + ~ModifyKokkos() {} + void setup(int); + void setup_pre_exchange(); + void setup_pre_neighbor(); + void setup_pre_force(int); + void initial_integrate(int); + void post_integrate(); + void pre_decide(); + void pre_exchange(); + void pre_neighbor(); + void pre_force(int); + void post_force(int); + void final_integrate(); + void end_of_step(); + double thermo_energy(); + void post_run(); + + void setup_pre_force_respa(int, int); + void initial_integrate_respa(int, int, int); + void post_integrate_respa(int, int); + void pre_force_respa(int, int, int); + void post_force_respa(int, int, int); + void final_integrate_respa(int, int); + + void min_pre_exchange(); + void min_pre_neighbor(); + void min_pre_force(int); + void min_post_force(int); + + double min_energy(double *); + void min_store(); + void min_step(double, double *); + void min_clearstore(); + void min_pushstore(); + void min_popstore(); + double max_alpha(double *); + int min_dof(); + int min_reset_ref(); + + protected: + class AtomKokkos *atomKK; /* used by the member functions for their sync() and modified() calls; where it is assigned is not visible in this header */ +}; + +} + +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/KOKKOS/neigh_full_kokkos.h b/src/KOKKOS/neigh_full_kokkos.h new file mode
100644 index 0000000000..9112e5049a --- /dev/null +++ b/src/KOKKOS/neigh_full_kokkos.h @@ -0,0 +1,507 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "atom_kokkos.h" +#include "atom_masks.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType, int HALF_NEIGH> +void NeighborKokkos::full_bin_kokkos(NeighListKokkos<DeviceType> *list) +{ + const int nall = includegroup?atom->nfirst:atom->nlocal; /* atoms that get a list row: nfirst when includegroup is set, else nlocal */ + list->grow(nall); + + NeighborKokkosExecute<DeviceType> + data(*list, + k_cutneighsq.view<DeviceType>(), + k_bincount.view<DeviceType>(), + k_bins.view<DeviceType>(),nall, + atomKK->k_x.view<DeviceType>(), + atomKK->k_type.view<DeviceType>(), + atomKK->k_mask.view<DeviceType>(), + atomKK->k_molecule.view<DeviceType>(), + nbinx,nbiny,nbinz,mbinx,mbiny,mbinz,mbinxlo,mbinylo,mbinzlo, + bininvx,bininvy,bininvz, + bboxhi,bboxlo); + + k_cutneighsq.sync<DeviceType>(); + atomKK->sync(Device,X_MASK|TYPE_MASK|MASK_MASK); + Kokkos::deep_copy(list->d_stencil,list->h_stencil); + + while(data.h_resize() > 0) { /* binning pass: zero bincount, bin all local and ghost atoms, then retry with wider bins if any bin overflowed; assumes h_resize starts nonzero - TODO confirm in NeighborKokkosExecute */ + data.h_resize() = 0; + deep_copy(data.resize, data.h_resize); + + MemsetZeroFunctor<DeviceType> f_zero; + f_zero.ptr = (void*) k_bincount.view<DeviceType>().ptr_on_device(); + Kokkos::parallel_for(mbins, f_zero); + DeviceType::fence(); + + NeighborKokkosBinAtomsFunctor<DeviceType> f(data); + +
Kokkos::parallel_for(atom->nlocal+atom->nghost, f); + DeviceType::fence(); + + deep_copy(data.h_resize, data.resize); + if(data.h_resize()) { + + atoms_per_bin += 16; /* overflow: widen every bin by 16 slots and reallocate before rebinning */ + k_bins = DAT::tdual_int_2d("bins", mbins, atoms_per_bin); + data.bins = k_bins.view<DeviceType>(); + data.c_bins = data.bins; + } + } + + if(list->d_neighbors.dimension_0()<nall) { + list->d_neighbors = typename ArrayTypes<DeviceType>::t_neighbors_2d("neighbors", nall*1.1, list->maxneighs); + list->d_numneigh = typename ArrayTypes<DeviceType>::t_int_1d("numneigh", nall*1.1); + data.neigh_list.d_neighbors = list->d_neighbors; + data.neigh_list.d_numneigh = list->d_numneigh; + } + data.h_resize()=1; /* force at least one build pass */ + while(data.h_resize()) { + data.h_new_maxneighs() = list->maxneighs; + data.h_resize() = 0; + + Kokkos::deep_copy(data.resize, data.h_resize); + Kokkos::deep_copy(data.new_maxneighs, data.h_new_maxneighs); +#if DEVICE==2 + #define BINS_PER_BLOCK 2 + const int factor = atoms_per_bin<64?2:1; + Kokkos::ParallelWorkRequest config((mbins+factor-1)/factor,atoms_per_bin*factor); +#else + const int factor = 1; +#endif + +if(newton_pair) { + NeighborKokkosBuildFunctor<DeviceType,HALF_NEIGH,1> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor); +#if DEVICE==2 + Kokkos::parallel_for(config, f); +#else + Kokkos::parallel_for(nall, f); +#endif +} else { + NeighborKokkosBuildFunctor<DeviceType,HALF_NEIGH,0> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor); +#if DEVICE==2 + Kokkos::parallel_for(config, f); +#else + Kokkos::parallel_for(nall, f); +#endif +} + DeviceType::fence(); + deep_copy(data.h_resize, data.resize); + + if(data.h_resize()) { + deep_copy(data.h_new_maxneighs, data.new_maxneighs); + list->maxneighs = data.h_new_maxneighs() * 1.2; /* overflow: resize rows to 1.2x the largest count the kernel reported, then rebuild */ + list->d_neighbors = typename ArrayTypes<DeviceType>::t_neighbors_2d("neighbors", list->d_neighbors.dimension_0(), list->maxneighs); + data.neigh_list.d_neighbors = list->d_neighbors; + data.neigh_list.maxneighs = list->maxneighs; + } + } + + list->inum = nall; +
list->gnum = 0; + +} + +/* ---------------------------------------------------------------------- */ + +template<class Device> +KOKKOS_INLINE_FUNCTION +void NeighborKokkosExecute<Device>::binatomsItem(const int &i) const +{ + const int ibin = coord2bin(x(i, 0), x(i, 1), x(i, 2)); + + const int ac = Kokkos::atomic_fetch_add(&bincount[ibin], (int)1); /* ac = slot index reserved for atom i in bin ibin (the count before the increment) */ + if(ac < bins.dimension_1()) { + bins(ibin, ac) = i; + } else { + resize() = 1; /* bin is full: raise the flag that the host loop copies back and polls */ + } +} + +/* ---------------------------------------------------------------------- */ + +template<class Device> template<int HalfNeigh,int GhostNewton> +void NeighborKokkosExecute<Device>:: + build_Item(const int &i) const +{ + /* if necessary, goto next page and add pages */ + int n = 0; + + // get subview of neighbors of i + + const AtomNeighbors neighbors_i = neigh_list.get_neighbors(i); + const X_FLOAT xtmp = x(i, 0); + const X_FLOAT ytmp = x(i, 1); + const X_FLOAT ztmp = x(i, 2); + const int itype = type(i); + + const int ibin = coord2bin(xtmp, ytmp, ztmp); + + const int nstencil = neigh_list.nstencil; + const typename ArrayTypes<Device>::t_int_1d_const_um stencil + = neigh_list.d_stencil; + + // loop over all bins in neighborhood (includes ibin) + if(HalfNeigh) /* same-bin pass over ibin, executed only when HalfNeigh is nonzero */ + for(int m = 0; m < c_bincount(ibin); m++) { + const int j = c_bins(ibin,m); + // printf("%i %i %i\n",i,ibin,m,c_bincount(ibin),j); + const int jtype = type(j); + //for same bin as atom i skip j if i==j and skip atoms "below and to the left" if using HalfNeighborlists + if((j == i) || (HalfNeigh && !GhostNewton && (j < i)) || + (HalfNeigh && GhostNewton && ((j < i) || ((j >= nlocal) && + ((x(j, 2) < ztmp) || (x(j, 2) == ztmp && x(j, 1) < ytmp) || + (x(j, 2) == ztmp && x(j, 1) == ytmp && x(j, 0) < xtmp))))) + ) continue; + //if(Exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue; + + + const X_FLOAT delx = xtmp - x(j, 0); + const X_FLOAT dely = ytmp - x(j, 1); + const X_FLOAT delz = ztmp - x(j, 2); + const X_FLOAT rsq = delx * delx + dely * dely + delz * delz; +
if(rsq <= cutneighsq(itype,jtype)) { + if(n<neigh_list.maxneighs) neighbors_i(n) = j; + n++; /* n counts every in-range neighbor even once the row is full, so the needed size can be reported */ + } + } + + for(int k = 0; k < nstencil; k++) { + const int jbin = ibin + stencil[k]; + // get subview of jbin + if(!GhostNewton&&HalfNeigh&&(ibin==jbin)) continue; + //const ArrayTypes<Device>::t_int_1d_const_um =Kokkos::subview<t_int_1d_const_um>(bins,jbin,ALL); + for(int m = 0; m < c_bincount(jbin); m++) { + const int j = c_bins(jbin,m); + //if(i==0) + //printf("%i %i %i %i %i %i %i\n",i,jbin,m,c_bincount(jbin),j,k,stencil[k]); + const int jtype = type(j); + + if(HalfNeigh && !GhostNewton && (j < i)) continue; + if(!HalfNeigh && j==i) continue; + //if(Exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue; + + const X_FLOAT delx = xtmp - x(j, 0); + const X_FLOAT dely = ytmp - x(j, 1); + const X_FLOAT delz = ztmp - x(j, 2); + const X_FLOAT rsq = delx * delx + dely * dely + delz * delz; + //if(i==0) + //printf("%i %i %lf %lf NEIGHS\n",i,j,rsq,cutneighsq(itype,jtype)); + + if(rsq <= cutneighsq(itype,jtype)) { + if(n<neigh_list.maxneighs) neighbors_i(n) = j; + n++; + } + + } + } + + neigh_list.d_numneigh(i) = n; + + if(n >= neigh_list.maxneighs) { + resize() = 1; + + if(n >= new_maxneighs()) new_maxneighs() = n; /* NOTE(review): plain read-then-write, presumably from concurrent work items - confirm a lost update is acceptable */ + } + neigh_list.d_ilist(i) = i; +} + +#if DEVICE==2 +extern __shared__ X_FLOAT sharedmem[]; + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> template<int HalfNeigh> +__device__ inline +void NeighborKokkosExecute<DeviceType>::build_ItemCuda(DeviceType dev) const +{ + /* loop over atoms in i's bin, + */ + const int atoms_per_bin = c_bins.dimension_1(); + const int BINS_PER_TEAM = blockDim.x/atoms_per_bin; + const int MY_BIN = threadIdx.x/atoms_per_bin; + const int MY_II = threadIdx.x%atoms_per_bin; + + const int ibin = (blockIdx.x)*BINS_PER_TEAM+MY_BIN; + + if(ibin >=c_bincount.dimension_0()) return; /* NOTE(review): this early exit precedes the block-wide barriers further down - confirm no block mixes exiting and continuing threads */ + X_FLOAT* other_x = sharedmem; + other_x = other_x + 5*atoms_per_bin*MY_BIN; + + int* other_id =
(int*) &other_x[4 * atoms_per_bin]; /* ids occupy the fifth strip, after four strips holding x, y, z and type */ + + int bincount_current = c_bincount[ibin]; + + const int i = MY_II < bincount_current ? c_bins(ibin, MY_II) : -1; + /* if necessary, goto next page and add pages */ + + int n = 0; + + X_FLOAT xtmp; + X_FLOAT ytmp; + X_FLOAT ztmp; + int itype; + const AtomNeighbors neighbors_i = neigh_list.get_neighbors((i>=0&&i<nlocal)?i:0); + + if(i >= 0) { + xtmp = x(i, 0); + ytmp = x(i, 1); + ztmp = x(i, 2); + itype = type(i); + other_x[MY_II] = xtmp; + other_x[MY_II + atoms_per_bin] = ytmp; + other_x[MY_II + 2 * atoms_per_bin] = ztmp; + other_x[MY_II + 3 * atoms_per_bin] = itype; /* the integer type is stored as X_FLOAT and read back into an int below */ + } + other_id[MY_II] = i; + int test = (__syncthreads_count(i >= 0 && i <= nlocal) == 0); /* NOTE(review): uses i <= nlocal while the other tests in this function use i < nlocal - confirm intended */ + + if(test) return; + + if(i >= 0 && i < nlocal) { + #pragma unroll 4 + for(int m = 0; m < bincount_current; m++) { + int j = other_id[m]; + + //for same bin as atom i skip j if i==j and skip atoms "below and to the left" if using halfneighborlists + //if(j==i) continue; + if((j == i) || (HalfNeigh && (j < i))) continue; + + const X_FLOAT delx = xtmp - other_x[m]; + const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin]; + const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin]; + const int jtype = other_x[m + 3 * atoms_per_bin]; + const X_FLOAT rsq = delx * delx + dely * dely + delz * delz; + if((rsq <= cutneighsq(itype,jtype)) && (n < neigh_list.maxneighs)) neighbors_i(n++) = j; + } + } + __syncthreads(); + + const int nstencil = neigh_list.nstencil; + const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil + = neigh_list.d_stencil; + for(int k = 0; k < nstencil; k++) { + const int jbin = ibin + stencil[k]; + + if(ibin == jbin) continue; + + bincount_current = c_bincount[jbin]; + int j = MY_II < bincount_current ?
c_bins(jbin, MY_II) : -1; + + if(j >= 0) { + other_x[MY_II] = x(j, 0); + other_x[MY_II + atoms_per_bin] = x(j, 1); + other_x[MY_II + 2 * atoms_per_bin] = x(j, 2); + other_x[MY_II + 3 * atoms_per_bin] = type(j); + } + + other_id[MY_II] = j; /* -1 marks a slot beyond the atom count of jbin */ + + __syncthreads(); + + if(i >= 0 && i < nlocal) { + #pragma unroll 8 + for(int m = 0; m < bincount_current; m++) { + const int j = other_id[m]; + + if(HalfNeigh && (j < i)) continue; + + const X_FLOAT delx = xtmp - other_x[m]; + const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin]; + const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin]; + const int jtype = other_x[m + 3 * atoms_per_bin]; + const X_FLOAT rsq = delx * delx + dely * dely + delz * delz; + if((rsq <= cutneighsq(itype,jtype)) && (n < neigh_list.maxneighs)) neighbors_i(n++) = j; + } + } + __syncthreads(); + } + + if(i >= 0 && i < nlocal) { + neigh_list.d_numneigh(i) = n; + neigh_list.d_ilist(i) = i; + } + + if(n >= neigh_list.maxneighs) { /* in this kernel n stops at maxneighs, because neighbors_i(n++) only runs while n < maxneighs */ + resize() = 1; + + if(n >= new_maxneighs()) new_maxneighs() = n; + } +} +#endif + +template<class DeviceType> +void NeighborKokkos::full_bin_cluster_kokkos(NeighListKokkos<DeviceType> *list) /* same bin-then-build retry structure as full_bin_kokkos, but launching the cluster build functor */ +{ + const int nall = includegroup?atom->nfirst:atom->nlocal; + list->grow(nall); + + NeighborKokkosExecute<DeviceType> + data(*list, + k_cutneighsq.view<DeviceType>(), + k_bincount.view<DeviceType>(), + k_bins.view<DeviceType>(),nall, + atomKK->k_x.view<DeviceType>(), + atomKK->k_type.view<DeviceType>(), + atomKK->k_mask.view<DeviceType>(), + atomKK->k_molecule.view<DeviceType>(), + nbinx,nbiny,nbinz,mbinx,mbiny,mbinz,mbinxlo,mbinylo,mbinzlo, + bininvx,bininvy,bininvz, + bboxhi,bboxlo); + + k_cutneighsq.sync<DeviceType>(); + atomKK->sync(Device,X_MASK|TYPE_MASK|MASK_MASK); + Kokkos::deep_copy(list->d_stencil,list->h_stencil); + DeviceType::fence(); + + while(data.h_resize() > 0) { + data.h_resize() = 0; + deep_copy(data.resize, data.h_resize); + + MemsetZeroFunctor<DeviceType> f_zero; + f_zero.ptr = (void*)
k_bincount.view<DeviceType>().ptr_on_device(); + Kokkos::parallel_for(mbins, f_zero); + DeviceType::fence(); + + NeighborKokkosBinAtomsFunctor<DeviceType> f(data); + + Kokkos::parallel_for(atom->nlocal+atom->nghost, f); + DeviceType::fence(); + + deep_copy(data.h_resize, data.resize); + if(data.h_resize()) { + + atoms_per_bin += 16; /* overflow: widen every bin by 16 slots and reallocate before rebinning */ + k_bins = DAT::tdual_int_2d("bins", mbins, atoms_per_bin); + data.bins = k_bins.view<DeviceType>(); + data.c_bins = data.bins; + } + } + + if(list->d_neighbors.dimension_0()<nall) { + list->d_neighbors = typename ArrayTypes<DeviceType>::t_neighbors_2d("neighbors", nall*1.1, list->maxneighs); + list->d_numneigh = typename ArrayTypes<DeviceType>::t_int_1d("numneigh", nall*1.1); + data.neigh_list.d_neighbors = list->d_neighbors; + data.neigh_list.d_numneigh = list->d_numneigh; + } + data.h_resize()=1; + while(data.h_resize()) { + data.h_new_maxneighs() = list->maxneighs; + data.h_resize() = 0; + + Kokkos::deep_copy(data.resize, data.h_resize); + Kokkos::deep_copy(data.new_maxneighs, data.h_new_maxneighs); +#if DEVICE==2 + #define BINS_PER_BLOCK 2 + const int factor = atoms_per_bin<64?2:1; + Kokkos::ParallelWorkRequest config((mbins+factor-1)/factor,atoms_per_bin*factor); +#else + const int factor = 1; +#endif + +if(newton_pair) { /* NOTE(review): both branches of this test construct the same functor type and launch it identically */ + NeighborClusterKokkosBuildFunctor<DeviceType,NeighClusterSize> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor); +//#if DEVICE==2 +// Kokkos::parallel_for(config, f); +//#else + Kokkos::parallel_for(nall, f); +//#endif +} else { + NeighborClusterKokkosBuildFunctor<DeviceType,NeighClusterSize> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor); +//#if DEVICE==2 +// Kokkos::parallel_for(config, f); +//#else + Kokkos::parallel_for(nall, f); +//#endif +} + DeviceType::fence(); + deep_copy(data.h_resize, data.resize); + + if(data.h_resize()) { + deep_copy(data.h_new_maxneighs, data.new_maxneighs); + list->maxneighs = data.h_new_maxneighs() * 1.2; + list->d_neighbors = typename
ArrayTypes<DeviceType>::t_neighbors_2d("neighbors", list->d_neighbors.dimension_0(), list->maxneighs); + data.neigh_list.d_neighbors = list->d_neighbors; + data.neigh_list.maxneighs = list->maxneighs; + } + } + + list->inum = nall; + list->gnum = 0; + +} + +/* ---------------------------------------------------------------------- */ + +template<class Device> template<int ClusterSize> +void NeighborKokkosExecute<Device>:: + build_cluster_Item(const int &i) const +{ + /* if necessary, goto next page and add pages */ + int n = 0; + + // get subview of neighbors of i + + const AtomNeighbors neighbors_i = neigh_list.get_neighbors(i); + const X_FLOAT xtmp = x(i, 0); + const X_FLOAT ytmp = x(i, 1); + const X_FLOAT ztmp = x(i, 2); + const int itype = type(i); + + const int ibin = coord2bin(xtmp, ytmp, ztmp); + + const int nstencil = neigh_list.nstencil; + const typename ArrayTypes<Device>::t_int_1d_const_um stencil + = neigh_list.d_stencil; + + for(int k = 0; k < nstencil; k++) { + const int jbin = ibin + stencil[k]; + for(int m = 0; m < c_bincount(jbin); m++) { + const int j = c_bins(jbin,m); + bool skip = i == j; + for(int k = 0; k< (n<neigh_list.maxneighs?n:neigh_list.maxneighs); k++) /* this inner k shadows the stencil index k; the loop scans the entries stored so far for the cluster base of j */ + if((j-(j%ClusterSize)) == neighbors_i(k)) {skip=true;};//{m += ClusterSize - j&(ClusterSize-1)-1; skip=true;} + + if(!skip) { + const int jtype = type(j); + + const X_FLOAT delx = xtmp - x(j, 0); + const X_FLOAT dely = ytmp - x(j, 1); + const X_FLOAT delz = ztmp - x(j, 2); + const X_FLOAT rsq = delx * delx + dely * dely + delz * delz; + + if(rsq <= cutneighsq(itype,jtype)) { + if(n<neigh_list.maxneighs) neighbors_i(n) = (j-(j%ClusterSize)); /* stores the first index of the cluster containing j, not j itself */ + n++; + //m += ClusterSize - j&(ClusterSize-1)-1; + } + } + + } + } + + neigh_list.d_numneigh(i) = n; + + if(n >= neigh_list.maxneighs) { + resize() = 1; + + if(n >= new_maxneighs()) new_maxneighs() = n; + } + neigh_list.d_ilist(i) = i; +} diff --git a/src/KOKKOS/neigh_list_kokkos.cpp b/src/KOKKOS/neigh_list_kokkos.cpp new file mode 100644 index
0000000000..dbb0aa5727 --- /dev/null +++ b/src/KOKKOS/neigh_list_kokkos.cpp @@ -0,0 +1,118 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "neigh_list_kokkos.h" +#include "atom.h" +#include "memory.h" + +using namespace LAMMPS_NS; + +enum{NSQ,BIN,MULTI}; + +/* ---------------------------------------------------------------------- */ + +template<class Device> +void NeighListKokkos<Device>::clean_copy() /* resets the members listed below to NULL or 0; nothing is freed here */ +{ + ilist = NULL; + numneigh = NULL; + firstneigh = NULL; + firstdouble = NULL; + dnum = 0; + iskip = NULL; + ijskip = NULL; + + ipage = NULL; + dpage = NULL; + maxstencil = 0; + ghostflag = 0; + maxstencil_multi = 0; +} + +/* ---------------------------------------------------------------------- */ + +template<class Device> +void NeighListKokkos<Device>::grow(int nmax) +{ + // skip if this list is already long enough to store nmax atoms + + if (nmax <= maxatoms) return; + maxatoms = nmax; /* the views below are re-created at the new size; no copy of the old contents is made in this function */ + + d_ilist = + typename ArrayTypes<Device>::t_int_1d("neighlist:ilist",maxatoms); + d_numneigh = + typename ArrayTypes<Device>::t_int_1d("neighlist:numneigh",maxatoms); + d_neighbors = + typename ArrayTypes<Device>::t_neighbors_2d("neighlist:neighbors", + maxatoms,maxneighs); + + memory->sfree(firstneigh); + memory->sfree(firstdouble); + + firstneigh = (int **) memory->smalloc(maxatoms*sizeof(int *), + "neighlist:firstneigh"); + if (dnum) + firstdouble = (double **) memory->smalloc(maxatoms*sizeof(double
*), + "neighlist:firstdouble"); +} + +/* ---------------------------------------------------------------------- */ + +template<class Device> +void NeighListKokkos<Device>::stencil_allocate(int smax, int style) +{ + int i; + + if (style == BIN) { + if (smax > maxstencil) { + maxstencil = smax; + d_stencil = + memory->create_kokkos(d_stencil,h_stencil,stencil,maxstencil, + "neighlist:stencil"); + if (ghostflag) { + memory->destroy(stencilxyz); + memory->create(stencilxyz,maxstencil,3,"neighlist:stencilxyz"); + } + } + + } else { + int n = atom->ntypes; + if (maxstencil_multi == 0) { /* first call: allocate the per-type tables; entries 1..n are initialized, index 0 is left unset */ + nstencil_multi = new int[n+1]; + stencil_multi = new int*[n+1]; + distsq_multi = new double*[n+1]; + for (i = 1; i <= n; i++) { + nstencil_multi[i] = 0; + stencil_multi[i] = NULL; + distsq_multi[i] = NULL; + } + } + if (smax > maxstencil_multi) { + maxstencil_multi = smax; + for (i = 1; i <= n; i++) { + memory->destroy(stencil_multi[i]); + memory->destroy(distsq_multi[i]); + memory->create(stencil_multi[i],maxstencil_multi, + "neighlist:stencil_multi"); + memory->create(distsq_multi[i],maxstencil_multi, + "neighlist:distsq_multi"); + } + } + } +} + +template class NeighListKokkos<LMPDeviceType>; +#if DEVICE==2 /* in this configuration the template is also instantiated for the host type */ +template class NeighListKokkos<LMPHostType>; +#endif diff --git a/src/KOKKOS/neigh_list_kokkos.h b/src/KOKKOS/neigh_list_kokkos.h new file mode 100644 index 0000000000..fd4ac3acc9 --- /dev/null +++ b/src/KOKKOS/neigh_list_kokkos.h @@ -0,0 +1,104 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License.
+ + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_NEIGH_LIST_KOKKOS_H +#define LMP_NEIGH_LIST_KOKKOS_H + +#include "pointers.h" +#include "neigh_list.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +enum{FULL,HALFTHREAD,HALF,N2,FULLCLUSTER}; + +class AtomNeighbors +{ + public: + const int num_neighs; + + KOKKOS_INLINE_FUNCTION + AtomNeighbors(int* const & firstneigh, const int & _num_neighs, + const int & stride): + _firstneigh(firstneigh), _stride(stride), num_neighs(_num_neighs) {}; + KOKKOS_INLINE_FUNCTION + int& operator()(const int &i) const { /* i-th neighbor, located i*_stride elements past _firstneigh */ + return _firstneigh[i*_stride]; + } + + private: + int* const _firstneigh; + const int _stride; +}; + +class AtomNeighborsConst +{ + public: + const int* const _firstneigh; + const int numneigh; + + KOKKOS_INLINE_FUNCTION + AtomNeighborsConst(int* const & firstneigh, const int & _numneigh, + const int & stride): + _firstneigh(firstneigh), _stride(stride), numneigh(_numneigh) {}; + KOKKOS_INLINE_FUNCTION + const int& operator()(const int &i) const { /* read-only access to the i-th neighbor, same addressing as AtomNeighbors */ + return _firstneigh[i*_stride]; + } + + private: + //const int* const _firstneigh; + const int _stride; +}; + +template<class Device> +class NeighListKokkos: public NeighList { + int _stride; + +public: + int maxneighs; + + void clean_copy(); + void grow(int nmax); + typename ArrayTypes<Device>::t_neighbors_2d d_neighbors; + typename ArrayTypes<Device>::t_int_1d d_ilist; // local indices of I atoms + typename ArrayTypes<Device>::t_int_1d d_numneigh; // # of J neighs for each I + typename ArrayTypes<Device>::t_int_1d d_stencil; // stencil of bin-index offsets read by the build kernels + typename ArrayTypes<LMPHostType>::t_int_1d h_stencil; // host stencil, deep-copied into d_stencil before a build + + NeighListKokkos(class LAMMPS *lmp): + NeighList(lmp) {_stride = 1; maxneighs = 16;}; + ~NeighListKokkos() {stencil = NULL; numneigh = NULL; ilist = NULL;}; + + KOKKOS_INLINE_FUNCTION + AtomNeighbors get_neighbors(const int &i) const {
+ return AtomNeighbors(&d_neighbors(i,0),d_numneigh(i), + &d_neighbors(i,1)-&d_neighbors(i,0)); /* stride = element distance between columns 0 and 1 of row i */ + } + + KOKKOS_INLINE_FUNCTION + AtomNeighborsConst get_neighbors_const(const int &i) const { + return AtomNeighborsConst(&d_neighbors(i,0),d_numneigh(i), + &d_neighbors(i,1)-&d_neighbors(i,0)); + } + + KOKKOS_INLINE_FUNCTION + int& num_neighs(const int & i) const { + return d_numneigh(i); + } + void stencil_allocate(int smax, int style); +}; + +} + +#endif diff --git a/src/KOKKOS/neighbor_kokkos.cpp b/src/KOKKOS/neighbor_kokkos.cpp new file mode 100644 index 0000000000..adea823976 --- /dev/null +++ b/src/KOKKOS/neighbor_kokkos.cpp @@ -0,0 +1,269 @@ +;/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */ + +#include "neighbor_kokkos.h" +#include "atom.h" +#include "pair.h" +#include "neigh_request.h" +#include "memory.h" + +using namespace LAMMPS_NS; + +enum{NSQ,BIN,MULTI}; // also in neigh_list.cpp + +/* ---------------------------------------------------------------------- */ + +NeighborKokkos::NeighborKokkos(LAMMPS *lmp) : Neighbor(lmp) +{ + atoms_per_bin = 16; + + nlist_host = 0; + lists_host = NULL; + pair_build_host = NULL; + stencil_create_host = NULL; + nlist_device = 0; + lists_device = NULL; + pair_build_device = NULL; + stencil_create_device = NULL; +} + +/* ---------------------------------------------------------------------- */ + +NeighborKokkos::~NeighborKokkos() +{ + memory->destroy_kokkos(k_cutneighsq,cutneighsq); + cutneighsq = NULL; + + for (int i = 0; i < nlist_host; i++) delete lists_host[i]; /* NOTE(review): init_lists_kokkos stores lists at the request index, so slots at index >= nlist_host are not visited here - confirm */ + delete [] lists_host; + for (int i = 0; i < nlist_device; i++) delete lists_device[i]; + delete [] lists_device; + + delete [] pair_build_device; + delete [] pair_build_host; /* NOTE(review): stencil_create_host and stencil_create_device are allocated in init_lists_kokkos but not released in this destructor - confirm and free if owned here */ +} + +/* ---------------------------------------------------------------------- */ + +void NeighborKokkos::init() +{ + atomKK = (AtomKokkos *) atom; + Neighbor::init(); +} + +/* ---------------------------------------------------------------------- */ + +void NeighborKokkos::init_cutneighsq_kokkos(int n) +{ + memory->create_kokkos(k_cutneighsq,cutneighsq,n+1,n+1,"neigh:cutneighsq"); + k_cutneighsq.modify<LMPHostType>(); /* flags the host side of the dual view as modified */ +} + +/* ---------------------------------------------------------------------- */ + +int NeighborKokkos::init_lists_kokkos() +{ + int i; + + for (i = 0; i < nlist_host; i++) delete lists_host[i]; + delete [] lists_host; + delete [] pair_build_host; + delete [] stencil_create_host; + nlist_host = 0; + + for (i = 0; i < nlist_device; i++) delete lists_device[i]; + delete [] lists_device; + delete [] pair_build_device; + delete [] stencil_create_device; + nlist_device = 0; + + nlist = 0; +
for (i = 0; i < nrequest; i++) { + if (requests[i]->kokkos_device) nlist_device++; + else if (requests[i]->kokkos_host) nlist_host++; + else nlist++; + } + + lists_host = new NeighListKokkos<LMPHostType>*[nrequest]; + pair_build_host = new PairPtrHost[nrequest]; + stencil_create_host = new StencilPtrHost[nrequest]; + for (i = 0; i < nrequest; i++) { + lists_host[i] = NULL; + pair_build_host[i] = NULL; + stencil_create_host[i] = NULL; + } + + for (i = 0; i < nrequest; i++) { + if (!requests[i]->kokkos_host) continue; + lists_host[i] = new NeighListKokkos<LMPHostType>(lmp); + lists_host[i]->index = i; + lists_host[i]->dnum = requests[i]->dnum; + if (requests[i]->pair) { + Pair *pair = (Pair *) requests[i]->requestor; + pair->init_list(requests[i]->id,lists_host[i]); + } + } + + lists_device = new NeighListKokkos<LMPDeviceType>*[nrequest]; + pair_build_device = new PairPtrDevice[nrequest]; + stencil_create_device = new StencilPtrDevice[nrequest]; + for (i = 0; i < nrequest; i++) { + lists_device[i] = NULL; + pair_build_device[i] = NULL; + stencil_create_device[i] = NULL; + } + + for (i = 0; i < nrequest; i++) { + if (!requests[i]->kokkos_device) continue; + lists_device[i] = new NeighListKokkos<LMPDeviceType>(lmp); + lists_device[i]->index = i; + lists_device[i]->dnum = requests[i]->dnum; + if (requests[i]->pair) { + Pair *pair = (Pair *) requests[i]->requestor; + pair->init_list(requests[i]->id,lists_device[i]); + } + } + + // return # of non-Kokkos lists + + return nlist; +} + +/* ---------------------------------------------------------------------- */ + +void NeighborKokkos::init_list_flags1_kokkos(int i) +{ + if (lists_host[i]) { + lists_host[i]->buildflag = 1; + if (pair_build_host[i] == NULL) lists_host[i]->buildflag = 0; + if (requests[i]->occasional) lists_host[i]->buildflag = 0; + + lists_host[i]->growflag = 1; + if (requests[i]->copy) lists_host[i]->growflag = 0; + + lists_host[i]->stencilflag = 1; + if (style == NSQ) lists_host[i]->stencilflag = 0; + if 
(stencil_create[i] == NULL) lists_host[i]->stencilflag = 0; + + lists_host[i]->ghostflag = 0; + if (requests[i]->ghost) lists_host[i]->ghostflag = 1; + if (requests[i]->ghost && !requests[i]->occasional) anyghostlist = 1; + } + + if (lists_device[i]) { + lists_device[i]->buildflag = 1; + if (pair_build_device[i] == NULL) lists_device[i]->buildflag = 0; + if (requests[i]->occasional) lists_device[i]->buildflag = 0; + + lists_device[i]->growflag = 1; + if (requests[i]->copy) lists_device[i]->growflag = 0; + + lists_device[i]->stencilflag = 1; + if (style == NSQ) lists_device[i]->stencilflag = 0; + if (stencil_create[i] == NULL) lists_device[i]->stencilflag = 0; + + lists_device[i]->ghostflag = 0; + if (requests[i]->ghost) lists_device[i]->ghostflag = 1; + if (requests[i]->ghost && !requests[i]->occasional) anyghostlist = 1; + } +} + +/* ---------------------------------------------------------------------- */ + +void NeighborKokkos::init_list_flags2_kokkos(int i) +{ + if (lists_host[i]) { + if (lists_host[i]->buildflag) blist[nblist++] = i; + if (lists_host[i]->growflag && requests[i]->occasional == 0) + glist[nglist++] = i; + if (lists_host[i]->stencilflag && requests[i]->occasional == 0) + slist[nslist++] = i; + } + + if (lists_device[i]) { + if (lists_device[i]->buildflag) blist[nblist++] = i; + if (lists_device[i]->growflag && requests[i]->occasional == 0) + glist[nglist++] = i; + if (lists_device[i]->stencilflag && requests[i]->occasional == 0) + slist[nslist++] = i; + } +} + +/* ---------------------------------------------------------------------- */ + +void NeighborKokkos::init_list_grow_kokkos(int i) +{ + if (lists_host[i]!=NULL && lists_host[i]->growflag) + lists_host[i]->grow(maxatom); + if (lists_device[i]!=NULL && lists_device[i]->growflag) + lists_device[i]->grow(maxatom); +} + +/* ---------------------------------------------------------------------- */ + +void NeighborKokkos::choose_build(int index, NeighRequest *rq) +{ + if (rq->kokkos_host != 0) { + 
PairPtrHost pb = NULL; + if (rq->full) pb = &NeighborKokkos::full_bin_kokkos<LMPHostType,0>; + else if (rq->half) pb = &NeighborKokkos::full_bin_kokkos<LMPHostType,1>; + pair_build_host[index] = pb; + return; + } + if (rq->kokkos_device != 0) { + PairPtrDevice pb = NULL; + if (rq->full) { + if (rq->full_cluster) pb = &NeighborKokkos::full_bin_cluster_kokkos<LMPDeviceType>; + else pb = &NeighborKokkos::full_bin_kokkos<LMPDeviceType,0>; + } + else if (rq->half) pb = &NeighborKokkos::full_bin_kokkos<LMPDeviceType,1>; + pair_build_device[index] = pb; + return; + } + + Neighbor::choose_build(index,rq); +} + +/* ---------------------------------------------------------------------- */ + +void NeighborKokkos::build_kokkos(int i) +{ + if (lists_host[blist[i]]) + (this->*pair_build_host[blist[i]])(lists_host[blist[i]]); + else if (lists_device[blist[i]]) + (this->*pair_build_device[blist[i]])(lists_device[blist[i]]); +} + +/* ---------------------------------------------------------------------- */ + +void NeighborKokkos::setup_bins_kokkos(int i) +{ + if (lists_host[slist[i]]) { + lists_host[slist[i]]->stencil_allocate(smax,style); + (this->*stencil_create[slist[i]])(lists_host[slist[i]],sx,sy,sz); + } else if (lists_device[slist[i]]) { + lists_device[slist[i]]->stencil_allocate(smax,style); + (this->*stencil_create[slist[i]])(lists_device[slist[i]],sx,sy,sz); + } + + if (i < nslist-1) return; + + if (maxhead > k_bins.d_view.dimension_0()) { + k_bins = DAT::tdual_int_2d("Neighbor::d_bins",maxhead,atoms_per_bin); + k_bincount = DAT::tdual_int_1d("Neighbor::d_bincount",maxhead); + } +} + +// include to trigger instantiation of templated functions + +#include "neigh_full_kokkos.h" diff --git a/src/KOKKOS/neighbor_kokkos.h b/src/KOKKOS/neighbor_kokkos.h new file mode 100644 index 0000000000..30e73792e4 --- /dev/null +++ b/src/KOKKOS/neighbor_kokkos.h @@ -0,0 +1,257 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale 
Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_NEIGHBOR_KOKKOS_H +#define LMP_NEIGHBOR_KOKKOS_H + +#include "neighbor.h" +#include "neigh_list_kokkos.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +template<class Device> +class NeighborKokkosExecute +{ + typedef ArrayTypes<Device> AT; + + public: + NeighListKokkos<Device> neigh_list; + const typename AT::t_xfloat_2d_randomread cutneighsq; + const typename AT::t_int_1d bincount; + const typename AT::t_int_1d_const c_bincount; + typename AT::t_int_2d bins; + typename AT::t_int_2d_const c_bins; + const typename AT::t_x_array_randomread x; + const typename AT::t_int_1d_const type,mask,molecule; + + const int nbinx,nbiny,nbinz; + const int mbinx,mbiny,mbinz; + const int mbinxlo,mbinylo,mbinzlo; + const X_FLOAT bininvx,bininvy,bininvz; + X_FLOAT bboxhi[3],bboxlo[3]; + + const int nlocal; + + typename AT::t_int_scalar resize; + typename AT::t_int_scalar new_maxneighs; + typename ArrayTypes<LMPHostType>::t_int_scalar h_resize; + typename ArrayTypes<LMPHostType>::t_int_scalar h_new_maxneighs; + + NeighborKokkosExecute( + const NeighListKokkos<Device> &_neigh_list, + const typename AT::t_xfloat_2d_randomread &_cutneighsq, + const typename AT::t_int_1d &_bincount, + const typename AT::t_int_2d &_bins, + const int _nlocal, + const typename AT::t_x_array_randomread &_x, + const typename AT::t_int_1d_const &_type, + const typename AT::t_int_1d_const &_mask, + const typename AT::t_int_1d_const &_molecule, + const int & _nbinx,const int & 
_nbiny,const int & _nbinz, + const int & _mbinx,const int & _mbiny,const int & _mbinz, + const int & _mbinxlo,const int & _mbinylo,const int & _mbinzlo, + const X_FLOAT &_bininvx,const X_FLOAT &_bininvy,const X_FLOAT &_bininvz, + const X_FLOAT *_bboxhi, const X_FLOAT* _bboxlo): + neigh_list(_neigh_list), cutneighsq(_cutneighsq), + bincount(_bincount),c_bincount(_bincount),bins(_bins),c_bins(_bins), + nlocal(_nlocal), + x(_x),type(_type),mask(_mask),molecule(_molecule), + nbinx(_nbinx),nbiny(_nbiny),nbinz(_nbinz), + mbinx(_mbinx),mbiny(_mbiny),mbinz(_mbinz), + mbinxlo(_mbinxlo),mbinylo(_mbinylo),mbinzlo(_mbinzlo), + bininvx(_bininvx),bininvy(_bininvy),bininvz(_bininvz) { + + bboxlo[0] = _bboxlo[0]; bboxlo[1] = _bboxlo[1]; bboxlo[2] = _bboxlo[2]; + bboxhi[0] = _bboxhi[0]; bboxhi[1] = _bboxhi[1]; bboxhi[2] = _bboxhi[2]; + + resize = typename AT::t_int_scalar("NeighborKokkosFunctor::resize"); +#ifndef KOKKOS_USE_UVM + h_resize = Kokkos::create_mirror_view(resize); +#else + h_resize = resize; +#endif + h_resize() = 1; + new_maxneighs = typename AT:: + t_int_scalar("NeighborKokkosFunctor::new_maxneighs"); +#ifndef KOKKOS_USE_UVM + h_new_maxneighs = Kokkos::create_mirror_view(new_maxneighs); +#else + h_new_maxneighs = new_maxneighs; +#endif + h_new_maxneighs() = neigh_list.maxneighs; + }; + + ~NeighborKokkosExecute() {neigh_list.clean_copy();}; + + template<int HalfNeigh, int GhostNewton> + KOKKOS_FUNCTION + void build_Item(const int &i) const; + + template<int ClusterSize> + KOKKOS_FUNCTION + void build_cluster_Item(const int &i) const; + +#if DEVICE==2 + template<int HalfNeigh> + __device__ inline + void build_ItemCuda(Device dev) const; +#endif + + KOKKOS_INLINE_FUNCTION + void binatomsItem(const int &i) const; + + KOKKOS_INLINE_FUNCTION + int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z) const + { + int ix,iy,iz; + + if (x >= bboxhi[0]) + ix = static_cast<int> ((x-bboxhi[0])*bininvx) + nbinx; + else if (x >= bboxlo[0]) { + ix = static_cast<int> 
((x-bboxlo[0])*bininvx); + ix = MIN(ix,nbinx-1); + } else + ix = static_cast<int> ((x-bboxlo[0])*bininvx) - 1; + + if (y >= bboxhi[1]) + iy = static_cast<int> ((y-bboxhi[1])*bininvy) + nbiny; + else if (y >= bboxlo[1]) { + iy = static_cast<int> ((y-bboxlo[1])*bininvy); + iy = MIN(iy,nbiny-1); + } else + iy = static_cast<int> ((y-bboxlo[1])*bininvy) - 1; + + if (z >= bboxhi[2]) + iz = static_cast<int> ((z-bboxhi[2])*bininvz) + nbinz; + else if (z >= bboxlo[2]) { + iz = static_cast<int> ((z-bboxlo[2])*bininvz); + iz = MIN(iz,nbinz-1); + } else + iz = static_cast<int> ((z-bboxlo[2])*bininvz) - 1; + + return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo); + } +}; + +template<class Device> +struct NeighborKokkosBinAtomsFunctor { + typedef Device device_type; + + const NeighborKokkosExecute<Device> c; + + NeighborKokkosBinAtomsFunctor(const NeighborKokkosExecute<Device> &_c): + c(_c) {}; + ~NeighborKokkosBinAtomsFunctor() {} + KOKKOS_INLINE_FUNCTION + void operator() (const int & i) const { + c.binatomsItem(i); + } +}; + +template<class Device,int HALF_NEIGH,int GHOST_NEWTON> +struct NeighborKokkosBuildFunctor { + typedef Device device_type; + + const NeighborKokkosExecute<Device> c; + const size_t sharedsize; + + NeighborKokkosBuildFunctor(const NeighborKokkosExecute<Device> &_c, + const size_t _sharedsize):c(_c), + sharedsize(_sharedsize) {}; + + KOKKOS_INLINE_FUNCTION + void operator() (const int & i) const { + c.template build_Item<HALF_NEIGH,GHOST_NEWTON>(i); + } +#if DEVICE==2 + KOKKOS_INLINE_FUNCTION + void operator() (Device dev) const { + c.template build_ItemCuda<HALF_NEIGH>(dev); + } + size_t shmem_size() const { return sharedsize; } +#endif +}; + +template<class Device,int ClusterSize> +struct NeighborClusterKokkosBuildFunctor { + typedef Device device_type; + + const NeighborKokkosExecute<Device> c; + const size_t sharedsize; + + NeighborClusterKokkosBuildFunctor(const NeighborKokkosExecute<Device> &_c, + const size_t _sharedsize):c(_c), + 
sharedsize(_sharedsize) {}; + + KOKKOS_INLINE_FUNCTION + void operator() (const int & i) const { + c.template build_cluster_Item<ClusterSize>(i); + } +}; + +class NeighborKokkos : public Neighbor { + public: + class AtomKokkos *atomKK; + + int nlist_host; // pairwise neighbor lists on Host + NeighListKokkos<LMPHostType> **lists_host; + int nlist_device; // pairwise neighbor lists on Device + NeighListKokkos<LMPDeviceType> **lists_device; + + NeighborKokkos(class LAMMPS *); + ~NeighborKokkos(); + void init(); + + private: + int atoms_per_bin; + DAT::tdual_xfloat_2d k_cutneighsq; + DAT::tdual_int_1d k_bincount; + DAT::tdual_int_2d k_bins; + + void init_cutneighsq_kokkos(int); + int init_lists_kokkos(); + void init_list_flags1_kokkos(int); + void init_list_flags2_kokkos(int); + void init_list_grow_kokkos(int); + void choose_build(int, NeighRequest *); + void build_kokkos(int); + void setup_bins_kokkos(int); + + typedef void (NeighborKokkos::*PairPtrHost) + (class NeighListKokkos<LMPHostType> *); + PairPtrHost *pair_build_host; + typedef void (NeighborKokkos::*PairPtrDevice) + (class NeighListKokkos<LMPDeviceType> *); + PairPtrDevice *pair_build_device; + + template<class DeviceType,int HALF_NEIGH> + void full_bin_kokkos(NeighListKokkos<DeviceType> *list); + template<class DeviceType> + void full_bin_cluster_kokkos(NeighListKokkos<DeviceType> *list); + + typedef void (NeighborKokkos::*StencilPtrHost) + (class NeighListKokkos<LMPHostType> *, int, int, int); + StencilPtrHost *stencil_create_host; + typedef void (NeighborKokkos::*StencilPtrDevice) + (class NeighListKokkos<LMPDeviceType> *, int, int, int); + StencilPtrDevice *stencil_create_device; +}; + +} + +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/KOKKOS/pair_kokkos.h b/src/KOKKOS/pair_kokkos.h new file mode 100644 index 0000000000..de67e7df0b --- /dev/null +++ b/src/KOKKOS/pair_kokkos.h @@ -0,0 +1,655 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale 
Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +#else + +#ifndef LMP_PAIR_KOKKOS_H +#define LMP_PAIR_KOKKOS_H + +#include "Kokkos_Macros.hpp" +#include "pair.h" +#include "neigh_list_kokkos.h" +#include "Kokkos_Vectorization.hpp" + +namespace LAMMPS_NS { + +template <class PairStyle, int NEIGHFLAG, bool STACKPARAMS, class Specialisation = void> +struct PairComputeFunctor { + typedef typename PairStyle::device_type device_type ; + typedef EV_FLOAT value_type; + + PairStyle c; + NeighListKokkos<device_type> list; + + PairComputeFunctor(PairStyle* c_ptr, + NeighListKokkos<device_type>* list_ptr): + c(*c_ptr),list(*list_ptr) {}; + ~PairComputeFunctor() {c.cleanup_copy();list.clean_copy();}; + + KOKKOS_INLINE_FUNCTION int sbmask(const int& j) const { + return j >> SBBITS & 3; + } + + template<int EVFLAG, int NEWTON_PAIR> + KOKKOS_FUNCTION + EV_FLOAT compute_item(const int& ii, + const NeighListKokkos<device_type> &list) const { + EV_FLOAT ev; + const int i = list.d_ilist[ii]; + const X_FLOAT xtmp = c.x(i,0); + const X_FLOAT ytmp = c.x(i,1); + const X_FLOAT ztmp = c.x(i,2); + const int itype = c.type(i); + + const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i); + const int jnum = list.d_numneigh[i]; + + F_FLOAT fxtmp = 0.0; + F_FLOAT fytmp = 0.0; + F_FLOAT fztmp = 0.0; + + for (int jj = 0; jj < jnum; jj++) { + int j = neighbors_i(jj); + const F_FLOAT factor_lj = c.special_lj[sbmask(j)]; + j &= NEIGHMASK; + const X_FLOAT delx = xtmp - c.x(j,0); + const X_FLOAT dely = 
ytmp - c.x(j,1); + const X_FLOAT delz = ztmp - c.x(j,2); + const int jtype = c.type(j); + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) { + + const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype); + + fxtmp += delx*fpair; + fytmp += dely*fpair; + fztmp += delz*fpair; + if ((NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < c.nlocal)) { + Kokkos::atomic_fetch_add(&c.f(j,0),-delx*fpair); + Kokkos::atomic_fetch_add(&c.f(j,1),-dely*fpair); + Kokkos::atomic_fetch_add(&c.f(j,2),-delz*fpair); + } + + if ((NEIGHFLAG==HALF) && (NEWTON_PAIR || j < c.nlocal)) { + c.f(j,0) -= delx*fpair; + c.f(j,1) -= dely*fpair; + c.f(j,2) -= delz*fpair; + } + + if (EVFLAG) { + if (c.eflag) { + ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j<c.nlocal)))?1.0:0.5)* + factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype); + if (c.COUL_FLAG) + ev.ecoul += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD)&&(NEWTON_PAIR||(j<c.nlocal)))?1.0:0.5)* + factor_lj * c.template compute_ecoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype); + } + + if (c.vflag_either) ev_tally(ev,i,j,fpair,delx,dely,delz); + } + } + + } + if (NEIGHFLAG == HALFTHREAD) { + Kokkos::atomic_fetch_add(&c.f(i,0),fxtmp); + Kokkos::atomic_fetch_add(&c.f(i,1),fytmp); + Kokkos::atomic_fetch_add(&c.f(i,2),fztmp); + } else { + c.f(i,0) += fxtmp; + c.f(i,1) += fytmp; + c.f(i,2) += fztmp; + } + + return ev; + } + + KOKKOS_INLINE_FUNCTION + void ev_tally(EV_FLOAT &ev, const int &i, const int &j, + const F_FLOAT &fpair, const F_FLOAT &delx, + const F_FLOAT &dely, const F_FLOAT &delz) const + { + const int EFLAG = c.eflag; + const int NEWTON_PAIR = c.newton_pair; + const int VFLAG = c.vflag_either; + + if (EFLAG) { + if (c.eflag_atom) { + const E_FLOAT epairhalf = 0.5 * (ev.evdwl + ev.ecoul); + if (NEWTON_PAIR || i < c.nlocal) c.eatom[i] += epairhalf; + if 
(NEWTON_PAIR || j < c.nlocal) c.eatom[j] += epairhalf; + } + } + + if (VFLAG) { + const E_FLOAT v0 = delx*delx*fpair; + const E_FLOAT v1 = dely*dely*fpair; + const E_FLOAT v2 = delz*delz*fpair; + const E_FLOAT v3 = delx*dely*fpair; + const E_FLOAT v4 = delx*delz*fpair; + const E_FLOAT v5 = dely*delz*fpair; + + if (c.vflag_global) { + if (NEIGHFLAG) { + if (NEWTON_PAIR) { + ev.v[0] += v0; + ev.v[1] += v1; + ev.v[2] += v2; + ev.v[3] += v3; + ev.v[4] += v4; + ev.v[5] += v5; + } else { + if (i < c.nlocal) { + ev.v[0] += 0.5*v0; + ev.v[1] += 0.5*v1; + ev.v[2] += 0.5*v2; + ev.v[3] += 0.5*v3; + ev.v[4] += 0.5*v4; + ev.v[5] += 0.5*v5; + } + if (j < c.nlocal) { + ev.v[0] += 0.5*v0; + ev.v[1] += 0.5*v1; + ev.v[2] += 0.5*v2; + ev.v[3] += 0.5*v3; + ev.v[4] += 0.5*v4; + ev.v[5] += 0.5*v5; + } + } + } else { + ev.v[0] += 0.5*v0; + ev.v[1] += 0.5*v1; + ev.v[2] += 0.5*v2; + ev.v[3] += 0.5*v3; + ev.v[4] += 0.5*v4; + ev.v[5] += 0.5*v5; + } + } + + if (c.vflag_atom) { + if (NEWTON_PAIR || i < c.nlocal) { + c.d_vatom(i,0) += 0.5*v0; + c.d_vatom(i,1) += 0.5*v1; + c.d_vatom(i,2) += 0.5*v2; + c.d_vatom(i,3) += 0.5*v3; + c.d_vatom(i,4) += 0.5*v4; + c.d_vatom(i,5) += 0.5*v5; + } + if (NEWTON_PAIR || (NEIGHFLAG && j < c.nlocal)) { + c.d_vatom(j,0) += 0.5*v0; + c.d_vatom(j,1) += 0.5*v1; + c.d_vatom(j,2) += 0.5*v2; + c.d_vatom(j,3) += 0.5*v3; + c.d_vatom(j,4) += 0.5*v4; + c.d_vatom(j,5) += 0.5*v5; + } + } + } + } + + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + if (c.newton_pair) compute_item<0,1>(i,list); + else compute_item<0,0>(i,list); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, value_type &energy_virial) const { + if (c.newton_pair) + energy_virial += compute_item<1,1>(i,list); + else + energy_virial += compute_item<1,0>(i,list); + } + + KOKKOS_INLINE_FUNCTION + static void init(volatile value_type &update) { + update.evdwl = 0; + update.ecoul = 0; + update.v[0] = 0; + update.v[1] = 0; + update.v[2] = 0; + update.v[3] = 0; + update.v[4] = 0; + 
update.v[5] = 0; + } + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type &update, + const volatile value_type &source) { + update.evdwl += source.evdwl; + update.ecoul += source.ecoul; + update.v[0] += source.v[0]; + update.v[1] += source.v[1]; + update.v[2] += source.v[2]; + update.v[3] += source.v[3]; + update.v[4] += source.v[4]; + update.v[5] += source.v[5]; + } + + +}; + +template <class PairStyle, bool STACKPARAMS, class Specialisation> +struct PairComputeFunctor<PairStyle,FULLCLUSTER,STACKPARAMS,Specialisation> { + typedef typename PairStyle::device_type device_type ; + typedef Kokkos::Vectorization<device_type,NeighClusterSize> vectorization; + typedef EV_FLOAT value_type; + + PairStyle c; + NeighListKokkos<device_type> list; + + PairComputeFunctor(PairStyle* c_ptr, + NeighListKokkos<device_type>* list_ptr): + c(*c_ptr),list(*list_ptr) {}; + ~PairComputeFunctor() {c.cleanup_copy();list.clean_copy();}; + + KOKKOS_INLINE_FUNCTION int sbmask(const int& j) const { + return j >> SBBITS & 3; + } + + template<int EVFLAG, int NEWTON_PAIR> + KOKKOS_FUNCTION + EV_FLOAT compute_item(const device_type& dev, + const NeighListKokkos<device_type> &list) const { + EV_FLOAT ev; + const int i = vectorization::global_thread_rank(dev); + + const X_FLOAT xtmp = c.c_x(i,0); + const X_FLOAT ytmp = c.c_x(i,1); + const X_FLOAT ztmp = c.c_x(i,2); + const int itype = c.type(i); + + const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i); + const int jnum = list.d_numneigh[i]; + + F_FLOAT fxtmp = 0.0; + F_FLOAT fytmp = 0.0; + F_FLOAT fztmp = 0.0; + + for (int jj = 0; jj < jnum; jj++) { + const int jjj = neighbors_i(jj); + + for (int k = vectorization::begin(); k<NeighClusterSize; k+=vectorization::increment) { + const F_FLOAT factor_lj = c.special_lj[sbmask(jjj+k)]; + const int j = (jjj + k)&NEIGHMASK; + if((j==i)||(j>=c.nall)) continue; + const X_FLOAT delx = xtmp - c.c_x(j,0); + const X_FLOAT dely = ytmp - c.c_x(j,1); + const X_FLOAT delz = ztmp - c.c_x(j,2); 
+ const int jtype = c.type(j); + const F_FLOAT rsq = (delx*delx + dely*dely + delz*delz); + + if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) { + + const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype); + fxtmp += delx*fpair; + fytmp += dely*fpair; + fztmp += delz*fpair; + + if (EVFLAG) { + if (c.eflag) { + ev.evdwl += 0.5* + factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype); + if (c.COUL_FLAG) + ev.ecoul += 0.5* + factor_lj * c.template compute_ecoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype); + } + + if (c.vflag_either) ev_tally(ev,i,j,fpair,delx,dely,delz); + } + } + } + } + + const F_FLOAT fx = vectorization::reduce(fxtmp); + const F_FLOAT fy = vectorization::reduce(fytmp); + const F_FLOAT fz = vectorization::reduce(fztmp); + if(vectorization::is_lane_0(dev)) { + c.f(i,0) += fx; + c.f(i,1) += fy; + c.f(i,2) += fz; + } + + return ev; + } + + KOKKOS_INLINE_FUNCTION + void ev_tally(EV_FLOAT &ev, const int &i, const int &j, + const F_FLOAT &fpair, const F_FLOAT &delx, + const F_FLOAT &dely, const F_FLOAT &delz) const + { + const int EFLAG = c.eflag; + const int NEWTON_PAIR = c.newton_pair; + const int VFLAG = c.vflag_either; + + if (EFLAG) { + if (c.eflag_atom) { + const E_FLOAT epairhalf = 0.5 * (ev.evdwl + ev.ecoul); + if (NEWTON_PAIR || i < c.nlocal) c.eatom[i] += epairhalf; + if (NEWTON_PAIR || j < c.nlocal) c.eatom[j] += epairhalf; + } + } + + if (VFLAG) { + const E_FLOAT v0 = delx*delx*fpair; + const E_FLOAT v1 = dely*dely*fpair; + const E_FLOAT v2 = delz*delz*fpair; + const E_FLOAT v3 = delx*dely*fpair; + const E_FLOAT v4 = delx*delz*fpair; + const E_FLOAT v5 = dely*delz*fpair; + + if (c.vflag_global) { + ev.v[0] += 0.5*v0; + ev.v[1] += 0.5*v1; + ev.v[2] += 0.5*v2; + ev.v[3] += 0.5*v3; + ev.v[4] += 0.5*v4; + ev.v[5] += 0.5*v5; + } + + if (c.vflag_atom) { + if (i < c.nlocal) { + c.d_vatom(i,0) += 0.5*v0; + c.d_vatom(i,1) += 0.5*v1; + 
c.d_vatom(i,2) += 0.5*v2; + c.d_vatom(i,3) += 0.5*v3; + c.d_vatom(i,4) += 0.5*v4; + c.d_vatom(i,5) += 0.5*v5; + } + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const device_type& dev) const { + if (c.newton_pair) compute_item<0,1>(dev,list); + else compute_item<0,0>(dev,list); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const device_type& dev, value_type &energy_virial) const { + if (c.newton_pair) + energy_virial += compute_item<1,1>(dev,list); + else + energy_virial += compute_item<1,0>(dev,list); + } + + KOKKOS_INLINE_FUNCTION + static void init(volatile value_type &update) { + update.evdwl = 0; + update.ecoul = 0; + update.v[0] = 0; + update.v[1] = 0; + update.v[2] = 0; + update.v[3] = 0; + update.v[4] = 0; + update.v[5] = 0; + } + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type &update, + const volatile value_type &source) { + update.evdwl += source.evdwl; + update.ecoul += source.ecoul; + update.v[0] += source.v[0]; + update.v[1] += source.v[1]; + update.v[2] += source.v[2]; + update.v[3] += source.v[3]; + update.v[4] += source.v[4]; + update.v[5] += source.v[5]; + } + + +}; + +template <class PairStyle, bool STACKPARAMS, class Specialisation> +struct PairComputeFunctor<PairStyle,N2,STACKPARAMS,Specialisation> { + typedef typename PairStyle::device_type device_type ; + typedef EV_FLOAT value_type; + + PairStyle c; + NeighListKokkos<device_type> list; + + PairComputeFunctor(PairStyle* c_ptr, + NeighListKokkos<device_type>* list_ptr): + c(*c_ptr),list(*list_ptr) {}; + ~PairComputeFunctor() {c.cleanup_copy();list.clean_copy();}; + + KOKKOS_INLINE_FUNCTION int sbmask(const int& j) const { + return j >> SBBITS & 3; + } + + template<int EVFLAG, int NEWTON_PAIR> + KOKKOS_FUNCTION + EV_FLOAT compute_item(const int& ii, + const NeighListKokkos<device_type> &list) const { + EV_FLOAT ev; + const int i = ii;//list.d_ilist[ii]; + const X_FLOAT xtmp = c.x(i,0); + const X_FLOAT ytmp = c.x(i,1); + const X_FLOAT ztmp = c.x(i,2); + const int itype 
= c.type(i); + + //const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i); + const int jnum = c.nall; + + F_FLOAT fxtmp = 0.0; + F_FLOAT fytmp = 0.0; + F_FLOAT fztmp = 0.0; + + for (int jj = 0; jj < jnum; jj++) { + int j = jj;//neighbors_i(jj); + if(i==j) continue; + const F_FLOAT factor_lj = c.special_lj[sbmask(j)]; + j &= NEIGHMASK; + const X_FLOAT delx = xtmp - c.x(j,0); + const X_FLOAT dely = ytmp - c.x(j,1); + const X_FLOAT delz = ztmp - c.x(j,2); + const int jtype = c.type(j); + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + if(rsq < (STACKPARAMS?c.m_cutsq[itype][jtype]:c.d_cutsq(itype,jtype))) { + + const F_FLOAT fpair = factor_lj*c.template compute_fpair<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype); + fxtmp += delx*fpair; + fytmp += dely*fpair; + fztmp += delz*fpair; + + if (EVFLAG) { + if (c.eflag) { + ev.evdwl += 0.5* + factor_lj * c.template compute_evdwl<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype); + if (c.COUL_FLAG) + ev.ecoul += 0.5* + factor_lj * c.template compute_ecoul<STACKPARAMS,Specialisation>(rsq,i,j,itype,jtype); + } + + if (c.vflag_either) ev_tally(ev,i,j,fpair,delx,dely,delz); + } + } + } + + c.f(i,0) += fxtmp; + c.f(i,1) += fytmp; + c.f(i,2) += fztmp; + + return ev; + } + + KOKKOS_INLINE_FUNCTION + void ev_tally(EV_FLOAT &ev, const int &i, const int &j, + const F_FLOAT &fpair, const F_FLOAT &delx, + const F_FLOAT &dely, const F_FLOAT &delz) const + { + const int EFLAG = c.eflag; + const int VFLAG = c.vflag_either; + + if (EFLAG) { + if (c.eflag_atom) { + const E_FLOAT epairhalf = 0.5 * (ev.evdwl + ev.ecoul); + if (i < c.nlocal) c.eatom[i] += epairhalf; + if (j < c.nlocal) c.eatom[j] += epairhalf; + } + } + + if (VFLAG) { + const E_FLOAT v0 = delx*delx*fpair; + const E_FLOAT v1 = dely*dely*fpair; + const E_FLOAT v2 = delz*delz*fpair; + const E_FLOAT v3 = delx*dely*fpair; + const E_FLOAT v4 = delx*delz*fpair; + const E_FLOAT v5 = dely*delz*fpair; + + if (c.vflag_global) { + ev.v[0] += 0.5*v0; + ev.v[1] += 
0.5*v1; + ev.v[2] += 0.5*v2; + ev.v[3] += 0.5*v3; + ev.v[4] += 0.5*v4; + ev.v[5] += 0.5*v5; + } + + if (c.vflag_atom) { + if (i < c.nlocal) { + c.d_vatom(i,0) += 0.5*v0; + c.d_vatom(i,1) += 0.5*v1; + c.d_vatom(i,2) += 0.5*v2; + c.d_vatom(i,3) += 0.5*v3; + c.d_vatom(i,4) += 0.5*v4; + c.d_vatom(i,5) += 0.5*v5; + } + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + compute_item<0,0>(i,list); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, value_type &energy_virial) const { + energy_virial += compute_item<1,0>(i,list); + } + + KOKKOS_INLINE_FUNCTION + static void init(volatile value_type &update) { + update.evdwl = 0; + update.ecoul = 0; + update.v[0] = 0; + update.v[1] = 0; + update.v[2] = 0; + update.v[3] = 0; + update.v[4] = 0; + update.v[5] = 0; + } + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type &update, + const volatile value_type &source) { + update.evdwl += source.evdwl; + update.ecoul += source.ecoul; + update.v[0] += source.v[0]; + update.v[1] += source.v[1]; + update.v[2] += source.v[2]; + update.v[3] += source.v[3]; + update.v[4] += source.v[4]; + update.v[5] += source.v[5]; + } + + +}; + +template<class PairStyle, class Specialisation> +EV_FLOAT pair_compute (PairStyle* fpair, NeighListKokkos<typename PairStyle::device_type>* list) { + EV_FLOAT ev; + if(fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) { + if (fpair->neighflag == FULL) { + PairComputeFunctor<PairStyle,FULL,false,Specialisation > + ff(fpair, list); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev); + else Kokkos::parallel_for(list->inum,ff); + } else if (fpair->neighflag == HALFTHREAD) { + PairComputeFunctor<PairStyle,HALFTHREAD,false,Specialisation > + ff(fpair, list); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev); + else Kokkos::parallel_for(list->inum,ff); + } else if (fpair->neighflag == HALF) { + PairComputeFunctor<PairStyle,HALF,false,Specialisation > + ff(fpair, list); + 
if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev); + else Kokkos::parallel_for(list->inum,ff); + } else if (fpair->neighflag == N2) { + PairComputeFunctor<PairStyle,N2,false,Specialisation > + ff(fpair, list); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(fpair->nlocal,ff,ev); + else Kokkos::parallel_for(fpair->nlocal,ff); + } else if (fpair->neighflag == FULLCLUSTER) { + typedef PairComputeFunctor<PairStyle,FULLCLUSTER,false,Specialisation > + f_type; + f_type ff(fpair, list); + #ifdef KOKKOS_HAVE_CUDA + const int teamsize = Kokkos::Impl::is_same<typename f_type::device_type, Kokkos::Cuda>::value ? 256 : 1; + #else + const int teamsize = 1; + #endif + const int nteams = (list->inum*f_type::vectorization::increment+teamsize-1)/teamsize; + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(Kokkos::ParallelWorkRequest(nteams,teamsize),ff,ev); + else Kokkos::parallel_for(Kokkos::ParallelWorkRequest(nteams,teamsize),ff); + } + } else { + if (fpair->neighflag == FULL) { + PairComputeFunctor<PairStyle,FULL,true,Specialisation > + ff(fpair, list); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev); + else Kokkos::parallel_for(list->inum,ff); + } else if (fpair->neighflag == HALFTHREAD) { + PairComputeFunctor<PairStyle,HALFTHREAD,true,Specialisation > + ff(fpair, list); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev); + else Kokkos::parallel_for(list->inum,ff); + } else if (fpair->neighflag == HALF) { + PairComputeFunctor<PairStyle,HALF,true,Specialisation > + ff(fpair, list); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev); + else Kokkos::parallel_for(list->inum,ff); + } else if (fpair->neighflag == N2) { + PairComputeFunctor<PairStyle,N2,true,Specialisation > + ff(fpair, list); + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(fpair->nlocal,ff,ev); + else Kokkos::parallel_for(fpair->nlocal,ff); + } else if (fpair->neighflag == 
FULLCLUSTER) { + typedef PairComputeFunctor<PairStyle,FULLCLUSTER,true,Specialisation > + f_type; + f_type ff(fpair, list); + #ifdef KOKKOS_HAVE_CUDA + const int teamsize = Kokkos::Impl::is_same<typename f_type::device_type, Kokkos::Cuda>::value ? 256 : 1; + #else + const int teamsize = 1; + #endif + const int nteams = (list->inum*f_type::vectorization::increment+teamsize-1)/teamsize; + if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(Kokkos::ParallelWorkRequest(nteams,teamsize),ff,ev); + else Kokkos::parallel_for(Kokkos::ParallelWorkRequest(nteams,teamsize),ff); + } + } + return ev; +} + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/KOKKOS/pair_lj_cut_kokkos.cpp b/src/KOKKOS/pair_lj_cut_kokkos.cpp new file mode 100644 index 0000000000..94576a36c7 --- /dev/null +++ b/src/KOKKOS/pair_lj_cut_kokkos.cpp @@ -0,0 +1,267 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include "math.h" +#include "stdio.h" +#include "stdlib.h" +#include "string.h" +#include "pair_lj_cut_kokkos.h" +#include "kokkos.h" +#include "atom_kokkos.h" +#include "comm.h" +#include "force.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "update.h" +#include "integrate.h" +#include "respa.h" +#include "math_const.h" +#include "memory.h" +#include "error.h" +#include "atom_masks.h" + +using namespace LAMMPS_NS; +using namespace MathConst; + +#define KOKKOS_CUDA_MAX_THREADS 256 +#define KOKKOS_CUDA_MIN_BLOCKS 8 + +/* ---- constructor: sets respa_enable to 0, stores atom as an AtomKokkos pointer in atomKK, records the execution space selected for DeviceType plus the read and modify data masks that compute() hands to atomKK, and sets cutsq to NULL ---- */ + +template<class DeviceType> +PairLJCutKokkos<DeviceType>::PairLJCutKokkos(LAMMPS *lmp) : PairLJCut(lmp) +{ + respa_enable = 0; + + atomKK = (AtomKokkos *) atom; + execution_space = ExecutionSpaceFromDevice<DeviceType>::space; + datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK; + datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK; + cutsq = NULL; +} + +/* ---- destructor: only when allocated is set, reassigns k_cutsq to a default-constructed DAT::tdual_ffloat_2d, releases cutsq with memory->sfree() and sets the pointer to NULL ---- */ + +template<class DeviceType> +PairLJCutKokkos<DeviceType>::~PairLJCutKokkos() +{ + if (allocated) { + k_cutsq = DAT::tdual_ffloat_2d(); + memory->sfree(cutsq); + cutsq = NULL; + } +} + +/* ---- cleanup_copy: clears allocated and sets cutsq, eatom and vatom to NULL on this object; the reason is given by the note inside the body ---- */ + +template<class DeviceType> +void PairLJCutKokkos<DeviceType>::cleanup_copy() { + // WHY needed: this prevents parent copy from deallocating any arrays + allocated = 0; + cutsq = NULL; + eatom = NULL; + vatom = NULL; +} + +/* ---- compute: stores eflag_in and vflag_in, syncs atom data and the cutsq and params views for DeviceType, runs pair_compute over the neighbor list and adds the returned energy and virial to eng_vdwl and virial ---- */ + +template<class DeviceType> +void PairLJCutKokkos<DeviceType>::compute(int eflag_in, int vflag_in) +{ + eflag = eflag_in; + vflag = vflag_in; + + + if (neighflag == FULL || neighflag == FULLCLUSTER) no_virial_fdotr_compute = 1; + + double evdwl = 
0.0; + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = vflag_fdotr = 0; + + atomKK->sync(execution_space,datamask_read); /* request the atom fields named in datamask_read on execution_space before the kernel reads them */ + k_cutsq.template sync<DeviceType>(); + k_params.template sync<DeviceType>(); + if (eflag || vflag) atomKK->modified(execution_space,datamask_modify); /* mark what the kernel writes: the full datamask_modify set when energy or virial is requested, forces only otherwise */ + else atomKK->modified(execution_space,F_MASK); + + x = atomKK->k_x.view<DeviceType>(); + c_x = atomKK->k_x.view<DeviceType>(); /* x and c_x are both taken from k_x */ + f = atomKK->k_f.view<DeviceType>(); + type = atomKK->k_type.view<DeviceType>(); + nlocal = atom->nlocal; + nall = atom->nlocal + atom->nghost; + special_lj[0] = force->special_lj[0]; + special_lj[1] = force->special_lj[1]; + special_lj[2] = force->special_lj[2]; + special_lj[3] = force->special_lj[3]; + newton_pair = force->newton_pair; + + // loop over neighbors of my atoms + + EV_FLOAT ev = pair_compute<PairLJCutKokkos<DeviceType>,void >(this,(NeighListKokkos<DeviceType>*)list); + + DeviceType::fence(); + + if (eflag) eng_vdwl += ev.evdwl; + if (vflag_global) { + virial[0] += ev.v[0]; + virial[1] += ev.v[1]; + virial[2] += ev.v[2]; + virial[3] += ev.v[3]; + virial[4] += ev.v[4]; + virial[5] += ev.v[5]; + } + + if (vflag_fdotr) virial_fdotr_compute(); +} + /* compute_fpair: returns r2inv * r6inv * (lj1*r6inv - lj2) with r2inv = 1/rsq and r6inv = r2inv^3. lj1 and lj2 for (itype,jtype) come from the fixed-size m_params table when STACKPARAMS is true and from the params view otherwise. i and j are not used; rsq is not checked against zero or against a cutoff here. */ +template<class DeviceType> +template<bool STACKPARAMS, class Specialisation> +KOKKOS_INLINE_FUNCTION +F_FLOAT PairLJCutKokkos<DeviceType>:: +compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const { + const F_FLOAT r2inv = 1.0/rsq; + const F_FLOAT r6inv = r2inv*r2inv*r2inv; + + const F_FLOAT forcelj = r6inv * + ((STACKPARAMS?m_params[itype][jtype].lj1:params(itype,jtype).lj1)*r6inv - + (STACKPARAMS?m_params[itype][jtype].lj2:params(itype,jtype).lj2)); + return forcelj*r2inv; +} + /* compute_evdwl: returns r6inv*(lj3*r6inv - lj4) - offset for (itype,jtype), selecting m_params or params by STACKPARAMS in the same way as compute_fpair; i and j are not used */ +template<class DeviceType> +template<bool STACKPARAMS, class Specialisation> +KOKKOS_INLINE_FUNCTION +F_FLOAT PairLJCutKokkos<DeviceType>:: +compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const { + const F_FLOAT r2inv = 1.0/rsq; + 
const F_FLOAT r6inv = r2inv*r2inv*r2inv; + return r6inv*((STACKPARAMS?m_params[itype][jtype].lj3:params(itype,jtype).lj3)*r6inv - + (STACKPARAMS?m_params[itype][jtype].lj4:params(itype,jtype).lj4)) - + (STACKPARAMS?m_params[itype][jtype].offset:params(itype,jtype).offset); +} + +/* ---------------------------------------------------------------------- + allocate all arrays: runs PairLJCut::allocate(), then destroys cutsq and re-creates it together with k_cutsq through memory->create_kokkos, takes d_cutsq as the DeviceType view of k_cutsq, and creates k_params with params as its device view; every table is (ntypes+1) x (ntypes+1) +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairLJCutKokkos<DeviceType>::allocate() +{ + PairLJCut::allocate(); + + int n = atom->ntypes; + memory->destroy(cutsq); + memory->create_kokkos(k_cutsq,cutsq,n+1,n+1,"pair:cutsq"); + d_cutsq = k_cutsq.template view<DeviceType>(); + k_params = Kokkos::DualView<params_lj**,Kokkos::LayoutRight,DeviceType>("PairLJCut::params",n+1,n+1); + params = k_params.d_view; +} + +/* ---------------------------------------------------------------------- + global settings: raises an error for more than two arguments, then calls PairLJCut::settings with narg fixed at 1. NOTE(review): a second argument passes the check but is not forwarded - confirm this is intended +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairLJCutKokkos<DeviceType>::settings(int narg, char **arg) +{ + if (narg > 2) error->all(FLERR,"Illegal pair_style command"); + + PairLJCut::settings(1,arg); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style: runs PairLJCut::init_style(), raises an error when a respa integrator has an inner or middle level, then configures the last neighbor request according to lmp->kokkos->neighflag +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairLJCutKokkos<DeviceType>::init_style() +{ + PairLJCut::init_style(); + + // error if rRESPA with inner levels + + if (update->whichflag == 1 && strstr(update->integrate_style,"respa")) { + int respa = 0; + if (((Respa *) update->integrate)->level_inner >= 0) respa = 1; + if (((Respa *) update->integrate)->level_middle >= 0) respa = 2; + if (respa) + error->all(FLERR,"Cannot use Kokkos pair style with rRESPA inner/middle"); + } + + // irequest = neigh request made by parent class + + neighflag = lmp->kokkos->neighflag; + int irequest = 
neighbor->nrequest - 1; + + neighbor->requests[irequest]-> + kokkos_host = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value && + !Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value; + neighbor->requests[irequest]-> + kokkos_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value; + + if (neighflag == FULL) { + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full_cluster = 0; + } else if (neighflag == HALF || neighflag == HALFTHREAD) { + neighbor->requests[irequest]->full = 0; + neighbor->requests[irequest]->half = 1; + neighbor->requests[irequest]->full_cluster = 0; + } else if (neighflag == N2) { + neighbor->requests[irequest]->full = 0; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full_cluster = 0; + } else if (neighflag == FULLCLUSTER) { + neighbor->requests[irequest]->full_cluster = 1; + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + } else { + error->all(FLERR,"Cannot use chosen neighbor list style with lj/cut/kk"); + } +} + +/* ---------------------------------------------------------------------- + init for one type pair i,j and corresponding j,i +------------------------------------------------------------------------- */ + +template<class DeviceType> +double PairLJCutKokkos<DeviceType>::init_one(int i, int j) +{ + double cutone = PairLJCut::init_one(i,j); + + k_params.h_view(i,j).lj1 = lj1[i][j]; + k_params.h_view(i,j).lj2 = lj2[i][j]; + k_params.h_view(i,j).lj3 = lj3[i][j]; + k_params.h_view(i,j).lj4 = lj4[i][j]; + k_params.h_view(i,j).offset = offset[i][j]; + k_params.h_view(i,j).cutsq = cutone*cutone; + k_params.h_view(j,i) = k_params.h_view(i,j); + if(i<MAX_TYPES_STACKPARAMS+1 && j<MAX_TYPES_STACKPARAMS+1) { + m_params[i][j] = m_params[j][i] = k_params.h_view(i,j); + m_cutsq[j][i] = m_cutsq[i][j] = cutone*cutone; + } + k_cutsq.h_view(i,j) = cutone*cutone; + k_cutsq.template modify<LMPHostType>(); + 
k_params.template modify<LMPHostType>(); + + return cutone; +} + + + +template class PairLJCutKokkos<LMPDeviceType>; /* explicit instantiation for LMPDeviceType; the LMPHostType instantiation below is compiled only when DEVICE==2 */ +#if DEVICE==2 +template class PairLJCutKokkos<LMPHostType>; +#endif diff --git a/src/KOKKOS/pair_lj_cut_kokkos.h b/src/KOKKOS/pair_lj_cut_kokkos.h new file mode 100644 index 0000000000..5c3c002af5 --- /dev/null +++ b/src/KOKKOS/pair_lj_cut_kokkos.h @@ -0,0 +1,112 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + /* style names: lj/cut/kk and lj/cut/kk/device map to the LMPDeviceType instantiation, lj/cut/kk/host to the LMPHostType one */ +PairStyle(lj/cut/kk,PairLJCutKokkos<LMPDeviceType>) +PairStyle(lj/cut/kk/device,PairLJCutKokkos<LMPDeviceType>) +PairStyle(lj/cut/kk/host,PairLJCutKokkos<LMPHostType>) + +#else + +#ifndef LMP_PAIR_LJ_CUT_KOKKOS_H +#define LMP_PAIR_LJ_CUT_KOKKOS_H + +#include "pair_kokkos.h" +#include "pair_lj_cut.h" +#include "neigh_list_kokkos.h" + +namespace LAMMPS_NS { + /* PairLJCutKokkos: Kokkos variant of PairLJCut, templated on the device type it runs on; it overrides compute, settings, init_style and init_one and supplies the per-pair compute_fpair, compute_evdwl and compute_ecoul hooks */ +template<class DeviceType> +class PairLJCutKokkos : public PairLJCut { + public: + enum {COUL_FLAG=0}; + typedef DeviceType device_type; + PairLJCutKokkos(class LAMMPS *); + ~PairLJCutKokkos(); + + void compute(int, int); + + void settings(int, char **); + void init_style(); + double init_one(int, int); + + struct params_lj{ /* coefficients for one type pair; both constructors set every field to 0 and the int argument of the second one is ignored */ + params_lj(){cutsq=0,lj1=0;lj2=0;lj3=0;lj4=0;offset=0;}; + params_lj(int i){cutsq=0,lj1=0;lj2=0;lj3=0;lj4=0;offset=0;}; + F_FLOAT cutsq,lj1,lj2,lj3,lj4,offset; + }; + + protected: + void cleanup_copy(); + + template<bool STACKPARAMS, class Specialisation> + KOKKOS_INLINE_FUNCTION 
+ F_FLOAT compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const; + + template<bool STACKPARAMS, class Specialisation> + KOKKOS_INLINE_FUNCTION + F_FLOAT compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const; + + template<bool STACKPARAMS, class Specialisation> + KOKKOS_INLINE_FUNCTION /* compute_ecoul: this style has no Coulomb term, so the hook ignores its arguments and always returns 0 */ + F_FLOAT compute_ecoul(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const { + return 0; + } + + + Kokkos::DualView<params_lj**,Kokkos::LayoutRight,DeviceType> k_params; /* per type-pair coefficients as a dual view; the host side is filled in init_one() */ + typename Kokkos::DualView<params_lj**,Kokkos::LayoutRight,DeviceType>::t_dev_const params; /* const device view of k_params, assigned in allocate() */ + params_lj m_params[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1]; // fixed-size copy of the coefficients read by the STACKPARAMS=true code path; sized by MAX_TYPES_STACKPARAMS (NOTE(review): the earlier remark about 15 atom types could not be confirmed from here) + F_FLOAT m_cutsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1]; + typename ArrayTypes<DeviceType>::t_x_array_randomread x; + typename ArrayTypes<DeviceType>::t_x_array c_x; + typename ArrayTypes<DeviceType>::t_f_array f; + typename ArrayTypes<DeviceType>::t_int_1d_randomread type; + typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom; + typename ArrayTypes<DeviceType>::t_virial_array d_vatom; + //typename ArrayTypes<DeviceType>::t_ffloat_1d special_lj; + + int newton_pair; + + typename ArrayTypes<DeviceType>::tdual_ffloat_2d k_cutsq; + typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq; + + class AtomKokkos *atomKK; + int neighflag; + int nlocal,nall,eflag,vflag; + + void allocate(); + friend class PairComputeFunctor<PairLJCutKokkos,FULL,true>; + friend class PairComputeFunctor<PairLJCutKokkos,HALF,true>; + friend class PairComputeFunctor<PairLJCutKokkos,HALFTHREAD,true>; + friend class PairComputeFunctor<PairLJCutKokkos,N2,true>; + friend class PairComputeFunctor<PairLJCutKokkos,FULLCLUSTER,true >; + friend class PairComputeFunctor<PairLJCutKokkos,FULL,false>; + friend class PairComputeFunctor<PairLJCutKokkos,HALF,false>; + friend class 
PairComputeFunctor<PairLJCutKokkos,HALFTHREAD,false>; + friend class PairComputeFunctor<PairLJCutKokkos,N2,false>; + friend class PairComputeFunctor<PairLJCutKokkos,FULLCLUSTER,false >; + friend EV_FLOAT pair_compute<PairLJCutKokkos,void>(PairLJCutKokkos*,NeighListKokkos<DeviceType>*); + +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/KOKKOS/pair_table_kokkos.cpp b/src/KOKKOS/pair_table_kokkos.cpp new file mode 100644 index 0000000000..cc8072991a --- /dev/null +++ b/src/KOKKOS/pair_table_kokkos.cpp @@ -0,0 +1,1500 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Paul Crozier (SNL) +------------------------------------------------------------------------- */ + +#include "mpi.h" +#include "math.h" +#include "stdlib.h" +#include "string.h" +#include "pair_table_kokkos.h" +#include "kokkos.h" +#include "atom.h" +#include "force.h" +#include "comm.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "memory.h" +#include "error.h" +#include "atom_masks.h" + +using namespace LAMMPS_NS; + +enum{NONE,RLINEAR,RSQ,BMP}; +enum{FULL,HALFTHREAD,HALF}; /* NOTE(review): compute_style() also uses N2 and FULLCLUSTER, which are not part of this enum - presumably they come from an included header; confirm there is no conflicting definition of FULL, HALFTHREAD and HALF */ + +#define MAXLINE 1024 + +/* ---- constructor: clears update_table, ntables and tables, stores atom as an AtomKokkos pointer, records the execution space selected for DeviceType plus the read and modify data masks, and allocates the TableHost and TableDevice holders with new ---- */ + +template<class DeviceType> +PairTableKokkos<DeviceType>::PairTableKokkos(LAMMPS *lmp) : Pair(lmp) +{ + update_table = 0; + atomKK = (AtomKokkos *) atom; + ntables = 0; + tables = NULL; + execution_space = ExecutionSpaceFromDevice<DeviceType>::space; + datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK; + datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK; + h_table = new TableHost(); + d_table = new TableDevice(); +} + +/* ---- destructor: deletes h_table and d_table. NOTE(review): the block that frees tables, setflag, cutsq and tabindex is commented out, so nothing in this destructor releases them - confirm they are freed elsewhere ---- */ + +template<class DeviceType> +PairTableKokkos<DeviceType>::~PairTableKokkos() +{ +/* for (int m = 0; m < ntables; m++) free_table(&tables[m]); + memory->sfree(tables); + + if (allocated) { + memory->destroy(setflag); + memory->destroy(cutsq); + memory->destroy(tabindex); + }*/ + delete h_table; + delete d_table; + +} + +/* ---- compute: rebuilds the Kokkos tables first when update_table is set, then forwards eflag_in and vflag_in to the compute_style instantiation that matches tabstyle (LOOKUP, LINEAR, SPLINE or BITMAP) ---- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::compute(int eflag_in, int vflag_in) +{ + if(update_table) + create_kokkos_tables(); + if(tabstyle == LOOKUP) + compute_style<LOOKUP>(eflag_in,vflag_in); + if(tabstyle == LINEAR) + 
compute_style<LINEAR>(eflag_in,vflag_in); + if(tabstyle == SPLINE) + compute_style<SPLINE>(eflag_in,vflag_in); + if(tabstyle == BITMAP) + compute_style<BITMAP>(eflag_in,vflag_in); +} + /* compute_style<TABSTYLE>: stores the flags, syncs atom data for execution_space, copies nlocal, nall, special_lj and newton_pair from atom and force, then launches a PairComputeFunctor specialised with S_TableCompute<DeviceType,TABSTYLE> for the active neighflag and adds the reduced energy and virial to eng_vdwl and virial */ +template<class DeviceType> +template<int TABSTYLE> +void PairTableKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in) +{ + eflag = eflag_in; + vflag = vflag_in; + + if (neighflag == FULL || neighflag == FULLCLUSTER) no_virial_fdotr_compute = 1; + + double evdwl = 0.0; /* NOTE(review): this local is not referenced again in the function; the energy is taken from ev.evdwl */ + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = vflag_fdotr = 0; + + atomKK->sync(execution_space,datamask_read); + //k_cutsq.template sync<DeviceType>(); + //k_params.template sync<DeviceType>(); + if (eflag || vflag) atomKK->modified(execution_space,datamask_modify); + else atomKK->modified(execution_space,F_MASK); + + x = c_x = atomKK->k_x.view<DeviceType>(); + f = atomKK->k_f.view<DeviceType>(); + type = atomKK->k_type.view<DeviceType>(); + nlocal = atom->nlocal; + nall = atom->nlocal + atom->nghost; + special_lj[0] = force->special_lj[0]; + special_lj[1] = force->special_lj[1]; + special_lj[2] = force->special_lj[2]; + special_lj[3] = force->special_lj[3]; + newton_pair = force->newton_pair; + d_cutsq = d_table->cutsq; + // loop over neighbors of my atoms + + EV_FLOAT ev; + if(atom->ntypes > MAX_TYPES_STACKPARAMS) { + if (neighflag == FULL) { + PairComputeFunctor<PairTableKokkos<DeviceType>,FULL,false,S_TableCompute<DeviceType,TABSTYLE> > + ff(this,(NeighListKokkos<DeviceType>*) list); + if (eflag || vflag) Kokkos::parallel_reduce(list->inum,ff,ev); + else Kokkos::parallel_for(list->inum,ff); + } else if (neighflag == HALFTHREAD) { + PairComputeFunctor<PairTableKokkos<DeviceType>,HALFTHREAD,false,S_TableCompute<DeviceType,TABSTYLE> > + ff(this,(NeighListKokkos<DeviceType>*) list); + if (eflag || vflag) Kokkos::parallel_reduce(list->inum,ff,ev); + else Kokkos::parallel_for(list->inum,ff); + } else if (neighflag == HALF) { + 
PairComputeFunctor<PairTableKokkos<DeviceType>,HALF,false,S_TableCompute<DeviceType,TABSTYLE> > + f(this,(NeighListKokkos<DeviceType>*) list); + if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev); + else Kokkos::parallel_for(list->inum,f); + } else if (neighflag == N2) { + PairComputeFunctor<PairTableKokkos<DeviceType>,N2,false,S_TableCompute<DeviceType,TABSTYLE> > + f(this,(NeighListKokkos<DeviceType>*) list); + if (eflag || vflag) Kokkos::parallel_reduce(nlocal,f,ev); + else Kokkos::parallel_for(nlocal,f); + } else if (neighflag == FULLCLUSTER) { + typedef PairComputeFunctor<PairTableKokkos<DeviceType>,FULLCLUSTER,false,S_TableCompute<DeviceType,TABSTYLE> > + f_type; + f_type f(this,(NeighListKokkos<DeviceType>*) list); + #ifdef KOKKOS_HAVE_CUDA + const int teamsize = Kokkos::Impl::is_same<typename f_type::device_type, Kokkos::Cuda>::value ? 256 : 1; + #else + const int teamsize = 1; + #endif + const int nteams = (list->inum*f_type::vectorization::increment+teamsize-1)/teamsize; + if (eflag || vflag) Kokkos::parallel_reduce(Kokkos::ParallelWorkRequest(nteams,teamsize),f,ev); + else Kokkos::parallel_for(Kokkos::ParallelWorkRequest(nteams,teamsize),f); + } + } else { + if (neighflag == FULL) { + PairComputeFunctor<PairTableKokkos<DeviceType>,FULL,true,S_TableCompute<DeviceType,TABSTYLE> > + f(this,(NeighListKokkos<DeviceType>*) list); + if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev); + else Kokkos::parallel_for(list->inum,f); + } else if (neighflag == HALFTHREAD) { + PairComputeFunctor<PairTableKokkos<DeviceType>,HALFTHREAD,true,S_TableCompute<DeviceType,TABSTYLE> > + f(this,(NeighListKokkos<DeviceType>*) list); + if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev); + else Kokkos::parallel_for(list->inum,f); + } else if (neighflag == HALF) { + PairComputeFunctor<PairTableKokkos<DeviceType>,HALF,true,S_TableCompute<DeviceType,TABSTYLE> > + f(this,(NeighListKokkos<DeviceType>*) list); + if (eflag || vflag) 
Kokkos::parallel_reduce(list->inum,f,ev); + else Kokkos::parallel_for(list->inum,f); + } else if (neighflag == N2) { + PairComputeFunctor<PairTableKokkos<DeviceType>,N2,true,S_TableCompute<DeviceType,TABSTYLE> > + f(this,(NeighListKokkos<DeviceType>*) list); + if (eflag || vflag) Kokkos::parallel_reduce(nlocal,f,ev); + else Kokkos::parallel_for(nlocal,f); + } else if (neighflag == FULLCLUSTER) { + typedef PairComputeFunctor<PairTableKokkos<DeviceType>,FULLCLUSTER,true,S_TableCompute<DeviceType,TABSTYLE> > + f_type; + f_type f(this,(NeighListKokkos<DeviceType>*) list); + #ifdef KOKKOS_HAVE_CUDA + const int teamsize = Kokkos::Impl::is_same<typename f_type::device_type, Kokkos::Cuda>::value ? 256 : 1; + #else + const int teamsize = 1; + #endif + const int nteams = (list->inum*f_type::vectorization::increment+teamsize-1)/teamsize; /* inum*increment divided by teamsize, rounded up */ + if (eflag || vflag) Kokkos::parallel_reduce(Kokkos::ParallelWorkRequest(nteams,teamsize),f,ev); + else Kokkos::parallel_for(Kokkos::ParallelWorkRequest(nteams,teamsize),f); + } + } + DeviceType::fence(); + + if (eflag) eng_vdwl += ev.evdwl; /* ev keeps its default-constructed value when only parallel_for ran */ + if (vflag_global) { + virial[0] += ev.v[0]; + virial[1] += ev.v[1]; + virial[2] += ev.v[2]; + virial[3] += ev.v[3]; + virial[4] += ev.v[4]; + virial[5] += ev.v[5]; + } + + if (vflag_fdotr) virial_fdotr_compute(); +} + /* compute_fpair: looks up the table index for (itype,jtype) in d_table_const.tabindex and evaluates the tabulated force value at rsq according to Specialisation::TabStyle; i and j are not used. NOTE(review): the inner and outer cutoff checks are commented out, so itable is not range-checked before it indexes the tables. */ +template<class DeviceType> +template<bool STACKPARAMS, class Specialisation> +KOKKOS_INLINE_FUNCTION +F_FLOAT PairTableKokkos<DeviceType>:: +compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const { + union_int_float_t rsq_lookup; + double fpair; + const int tidx = d_table_const.tabindex(itype,jtype); + //const Table* const tb = &tables[tabindex[itype][jtype]]; + + //if (rsq < d_table_const.innersq(tidx)) + // error->one(FLERR,"Pair distance < table inner cutoff"); + + if (Specialisation::TabStyle == LOOKUP) { /* LOOKUP: value of the bin containing rsq, no interpolation */ + const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx)); + //if (itable >= tlm1) + 
// error->one(FLERR,"Pair distance > table outer cutoff"); + fpair = d_table_const.f(tidx,itable); + } else if (Specialisation::TabStyle == LINEAR) { /* LINEAR: f at the bin start plus fraction * df, where fraction is the offset of rsq inside the bin scaled by invdelta */ + const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx)); + //if (itable >= tlm1) + // error->one(FLERR,"Pair distance > table outer cutoff"); + const double fraction = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx); + fpair = d_table_const.f(tidx,itable) + fraction*d_table_const.df(tidx,itable); + } else if (Specialisation::TabStyle == SPLINE) { /* SPLINE: blends entries itable and itable+1 with weights a and b plus the f2 correction terms scaled by deltasq6 */ + const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx)); + //if (itable >= tlm1) + // error->one(FLERR,"Pair distance > table outer cutoff"); + const double b = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx); + const double a = 1.0 - b; + fpair = a * d_table_const.f(tidx,itable) + b * d_table_const.f(tidx,itable+1) + + ((a*a*a-a)*d_table_const.f2(tidx,itable) + (b*b*b-b)*d_table_const.f2(tidx,itable+1)) * + d_table_const.deltasq6(tidx); + } else { /* remaining style (BITMAP in compute()): the index is the bit pattern of rsq stored as a float, masked with nmask and shifted right by nshiftbits */ + rsq_lookup.f = rsq; + int itable = rsq_lookup.i & d_table_const.nmask(tidx); + itable >>= d_table_const.nshiftbits(tidx); + const double fraction = (rsq_lookup.f - d_table_const.rsq(tidx,itable)) * d_table_const.drsq(tidx,itable); + fpair = d_table_const.f(tidx,itable) + fraction*d_table_const.df(tidx,itable); + } + return fpair; +} + /* compute_evdwl: energy counterpart of compute_fpair; it uses the same table index and the same per-style index arithmetic but reads the e, de and e2 tables. i and j are not used, and the cutoff checks are commented out here as well. */ +template<class DeviceType> +template<bool STACKPARAMS, class Specialisation> +KOKKOS_INLINE_FUNCTION +F_FLOAT PairTableKokkos<DeviceType>:: +compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const { + double evdwl; + union_int_float_t rsq_lookup; + const int tidx = d_table_const.tabindex(itype,jtype); + //const Table* const tb = &tables[tabindex[itype][jtype]]; + + //if (rsq < d_table_const.innersq(tidx)) + // error->one(FLERR,"Pair distance < table inner cutoff"); + + if (Specialisation::TabStyle == LOOKUP) 
{ + const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx)); + //if (itable >= tlm1) + // error->one(FLERR,"Pair distance > table outer cutoff"); + evdwl = d_table_const.e(tidx,itable); + } else if (Specialisation::TabStyle == LINEAR) { + const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx)); + //if (itable >= tlm1) + // error->one(FLERR,"Pair distance > table outer cutoff"); + const double fraction = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx); + evdwl = d_table_const.e(tidx,itable) + fraction*d_table_const.de(tidx,itable); + } else if (Specialisation::TabStyle == SPLINE) { + const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx)); + //if (itable >= tlm1) + // error->one(FLERR,"Pair distance > table outer cutoff"); + const double b = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx); + const double a = 1.0 - b; + evdwl = a * d_table_const.e(tidx,itable) + b * d_table_const.e(tidx,itable+1) + + ((a*a*a-a)*d_table_const.e2(tidx,itable) + (b*b*b-b)*d_table_const.e2(tidx,itable+1)) * + d_table_const.deltasq6(tidx); + } else { + rsq_lookup.f = rsq; + int itable = rsq_lookup.i & d_table_const.nmask(tidx); + itable >>= d_table_const.nshiftbits(tidx); + const double fraction = (rsq_lookup.f - d_table_const.rsq(tidx,itable)) * d_table_const.drsq(tidx,itable); + evdwl = d_table_const.e(tidx,itable) + fraction*d_table_const.de(tidx,itable); + } + return evdwl; +} + +/* +template<class DeviceType> +template<int EVFLAG, int NEIGHFLAG, int NEWTON_PAIR,int TABSTYLE> +KOKKOS_FUNCTION +EV_FLOAT PairTableKokkos<DeviceType>:: +compute_item(const int &ii, const NeighListKokkos<DeviceType> &list) const +{ + EV_FLOAT ev; + const int tlm1 = tablength - 1; + union_int_float_t rsq_lookup; + const int i = list.d_ilist[ii]; + const X_FLOAT xtmp = x(i,0); + const X_FLOAT ytmp = x(i,1); + const 
X_FLOAT ztmp = x(i,2); + const int itype = type(i); + + const AtomNeighborsConst neighbors_i = list.get_neighbors_const(i); + const int jnum = list.d_numneigh[i]; + + F_FLOAT fxtmp = 0.0; + F_FLOAT fytmp = 0.0; + F_FLOAT fztmp = 0.0; + + for (int jj = 0; jj < jnum; jj++) { + int j = neighbors_i(jj); + const F_FLOAT factor_lj = 1.0; //special_lj[sbmask(j)]; + j &= NEIGHMASK; + const X_FLOAT delx = xtmp - x(j,0); + const X_FLOAT dely = ytmp - x(j,1); + const X_FLOAT delz = ztmp - x(j,2); + const int jtype = type(j); + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + if (rsq < d_table_const.cutsq(itype,jtype)) { + double fpair; + const int tidx = d_table_const.tabindex(itype,jtype); + //const Table* const tb = &tables[tabindex[itype][jtype]]; + + //if (rsq < d_table_const.innersq(tidx)) + // error->one(FLERR,"Pair distance < table inner cutoff"); + + if (TABSTYLE == LOOKUP) { + const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx)); + //if (itable >= tlm1) + // error->one(FLERR,"Pair distance > table outer cutoff"); + fpair = factor_lj * d_table_const.f(tidx,itable); + if (EVFLAG) + ev.evdwl = d_table_const.e(tidx,itable); + } else if (TABSTYLE == LINEAR) { + const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx)); + //if (itable >= tlm1) + // error->one(FLERR,"Pair distance > table outer cutoff"); + const double fraction = (rsq - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx); + const double value = d_table_const.f(tidx,itable) + fraction*d_table_const.df(tidx,itable); + fpair = factor_lj * value; + if (EVFLAG) + ev.evdwl = d_table_const.e(tidx,itable) + fraction*d_table_const.de(tidx,itable); + } else if (TABSTYLE == SPLINE) { + const int itable = static_cast<int> ((rsq - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx)); + //if (itable >= tlm1) + // error->one(FLERR,"Pair distance > table outer cutoff"); + const double b = (rsq - 
d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx);
        const double a = 1.0 - b;
        const double value = a * d_table_const.f(tidx,itable) + b * d_table_const.f(tidx,itable+1) +
          ((a*a*a-a)*d_table_const.f2(tidx,itable) + (b*b*b-b)*d_table_const.f2(tidx,itable+1)) *
          d_table_const.deltasq6(tidx);
        fpair = factor_lj * value;
        if (EVFLAG)
          ev.evdwl = a * d_table_const.e(tidx,itable) + b * d_table_const.e(tidx,itable+1) +
            ((a*a*a-a)*d_table_const.e2(tidx,itable) + (b*b*b-b)*d_table_const.e2(tidx,itable+1)) *
            d_table_const.deltasq6(tidx);
      } else {
        rsq_lookup.f = rsq;
        int itable = rsq_lookup.i & d_table_const.nmask(tidx);
        itable >>= d_table_const.nshiftbits(tidx);
        const double fraction = (rsq_lookup.f - d_table_const.rsq(tidx,itable)) * d_table_const.drsq(tidx,itable);
        const double value = d_table_const.f(tidx,itable) + fraction*d_table_const.df(tidx,itable);
        fpair = factor_lj * value;
        if (EVFLAG)
          ev.evdwl = d_table_const.e(tidx,itable) + fraction*d_table_const.de(tidx,itable);
      }

      fxtmp += delx*fpair;
      fytmp += dely*fpair;
      fztmp += delz*fpair;
      if ((NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
        Kokkos::atomic_fetch_add(&f(j,0),-delx*fpair);
        Kokkos::atomic_fetch_add(&f(j,1),-dely*fpair);
        Kokkos::atomic_fetch_add(&f(j,2),-delz*fpair);
      }

      if ((NEIGHFLAG==HALF) && (NEWTON_PAIR || j < nlocal)) {
        f(j,0) -= delx*fpair;
        f(j,1) -= dely*fpair;
        f(j,2) -= delz*fpair;
      }

      if(EVFLAG) {
        if (eflag) {
          ev.evdwl *= factor_lj;
        }

        if (evflag) ev_tally<NEIGHFLAG>(ev,i,j
,fpair,delx,dely,delz);
      }
    }
  }

  if (NEIGHFLAG == HALFTHREAD) {
    Kokkos::atomic_fetch_add(&f(i,0),fxtmp);
    Kokkos::atomic_fetch_add(&f(i,1),fytmp);
    Kokkos::atomic_fetch_add(&f(i,2),fztmp);
  } else {
    f(i,0) += fxtmp;
    f(i,1) += fytmp;
    f(i,2) += fztmp;
  }

  return ev;
}
*/

/* ----------------------------------------------------------------------
   pack the per-table data held in tables[0..ntables-1] into 2d host views
   (one row per table), deep_copy them to the matching device views, and
   point the d_table_const members at the device views
   which views are allocated, and how long their rows are, depends on tabstyle
   clears update_table when done
------------------------------------------------------------------------- */

template<class DeviceType>
void PairTableKokkos<DeviceType>::create_kokkos_tables()
{
  const int tlm1 = tablength-1;

  // per-table scalars: one entry per table, allocated for every tabstyle

  memory->create_kokkos(d_table->nshiftbits,h_table->nshiftbits,ntables,"Table::nshiftbits");
  memory->create_kokkos(d_table->nmask,h_table->nmask,ntables,"Table::nmask");
  memory->create_kokkos(d_table->innersq,h_table->innersq,ntables,"Table::innersq");
  memory->create_kokkos(d_table->invdelta,h_table->invdelta,ntables,"Table::invdelta");
  memory->create_kokkos(d_table->deltasq6,h_table->deltasq6,ntables,"Table::deltasq6");

  // per-table vectors: row lengths mirror the allocations in compute_table()

  if(tabstyle == LOOKUP) {
    memory->create_kokkos(d_table->e,h_table->e,ntables,tlm1,"Table::e");
    memory->create_kokkos(d_table->f,h_table->f,ntables,tlm1,"Table::f");
  }

  if(tabstyle == LINEAR) {
    memory->create_kokkos(d_table->rsq,h_table->rsq,ntables,tablength,"Table::rsq");
    memory->create_kokkos(d_table->e,h_table->e,ntables,tablength,"Table::e");
    memory->create_kokkos(d_table->f,h_table->f,ntables,tablength,"Table::f");
    memory->create_kokkos(d_table->de,h_table->de,ntables,tlm1,"Table::de");
    memory->create_kokkos(d_table->df,h_table->df,ntables,tlm1,"Table::df");
  }

  if(tabstyle == SPLINE) {
    memory->create_kokkos(d_table->rsq,h_table->rsq,ntables,tablength,"Table::rsq");
    memory->create_kokkos(d_table->e,h_table->e,ntables,tablength,"Table::e");
    memory->create_kokkos(d_table->f,h_table->f,ntables,tablength,"Table::f");
    memory->create_kokkos(d_table->e2,h_table->e2,ntables,tablength,"Table::e2");
    memory->create_kokkos(d_table->f2,h_table->f2,ntables,tablength,"Table::f2");
  }

  if(tabstyle == BITMAP) {
    // for BITMAP, tablength is a bit count: each table has 2^tablength entries
    int ntable = 1 << tablength;
    memory->create_kokkos(d_table->rsq,h_table->rsq,ntables,ntable,"Table::rsq");
    memory->create_kokkos(d_table->e,h_table->e,ntables,ntable,"Table::e");
    memory->create_kokkos(d_table->f,h_table->f,ntables,ntable,"Table::f");
    memory->create_kokkos(d_table->de,h_table->de,ntables,ntable,"Table::de");
    memory->create_kokkos(d_table->df,h_table->df,ntables,ntable,"Table::df");
    memory->create_kokkos(d_table->drsq,h_table->drsq,ntables,ntable,"Table::drsq");
  }



  // fill the host views from the Table structs
  // each inner loop is bounded by the view's own 2nd dimension
  // NOTE(review): presumably a view that was not allocated above reports
  //   dimension 0, so its loop body never touches a NULL Table array - confirm

  for(int i=0; i < ntables; i++) {
    Table* tb = &tables[i];

    h_table->nshiftbits[i] = tb->nshiftbits;
    h_table->nmask[i] = tb->nmask;
    h_table->innersq[i] = tb->innersq;
    h_table->invdelta[i] = tb->invdelta;
    h_table->deltasq6[i] = tb->deltasq6;

    for(int j = 0; j<h_table->rsq.dimension_1(); j++)
      h_table->rsq(i,j) = tb->rsq[j];
    for(int j = 0; j<h_table->drsq.dimension_1(); j++)
      h_table->drsq(i,j) = tb->drsq[j];
    for(int j = 0; j<h_table->e.dimension_1(); j++)
      h_table->e(i,j) = tb->e[j];
    for(int j = 0; j<h_table->de.dimension_1(); j++)
      h_table->de(i,j) = tb->de[j];
    for(int j = 0; j<h_table->f.dimension_1(); j++)
      h_table->f(i,j) = tb->f[j];
    for(int j = 0; j<h_table->df.dimension_1(); j++)
      h_table->df(i,j) = tb->df[j];
    for(int j = 0; j<h_table->e2.dimension_1(); j++)
      h_table->e2(i,j) = tb->e2[j];
    for(int j = 0; j<h_table->f2.dimension_1(); j++)
      h_table->f2(i,j) = tb->f2[j];
  }


  // host -> device copies

  Kokkos::deep_copy(d_table->nshiftbits,h_table->nshiftbits);
  Kokkos::deep_copy(d_table->nmask,h_table->nmask);
  Kokkos::deep_copy(d_table->innersq,h_table->innersq);
  Kokkos::deep_copy(d_table->invdelta,h_table->invdelta);
  Kokkos::deep_copy(d_table->deltasq6,h_table->deltasq6);
  Kokkos::deep_copy(d_table->rsq,h_table->rsq);
  Kokkos::deep_copy(d_table->drsq,h_table->drsq);
  Kokkos::deep_copy(d_table->e,h_table->e);
  Kokkos::deep_copy(d_table->de,h_table->de);
  Kokkos::deep_copy(d_table->f,h_table->f);
  Kokkos::deep_copy(d_table->df,h_table->df);
  Kokkos::deep_copy(d_table->e2,h_table->e2);
  Kokkos::deep_copy(d_table->f2,h_table->f2);
  Kokkos::deep_copy(d_table->tabindex,h_table->tabindex);

  // re-point the d_table_const members at the freshly filled device views

  d_table_const.nshiftbits = d_table->nshiftbits;
  d_table_const.nmask = d_table->nmask;
  d_table_const.innersq = d_table->innersq;
  d_table_const.invdelta = d_table->invdelta;
  d_table_const.deltasq6 = d_table->deltasq6;
  d_table_const.rsq = d_table->rsq;
  d_table_const.drsq = d_table->drsq;
  d_table_const.e = d_table->e;
  d_table_const.de = d_table->de;
  d_table_const.f = d_table->f;
  d_table_const.df = d_table->df;
  d_table_const.e2 = d_table->e2;
  d_table_const.f2 = d_table->f2;


  Kokkos::deep_copy(d_table->cutsq,h_table->cutsq);
  update_table = 0;
}

/* ----------------------------------------------------------------------
   allocate all arrays
   setflag is a plain 2d array; cutsq and tabindex are created together
   with their host/device views via create_kokkos
   all three are sized (ntypes+1) x (ntypes+1) and zeroed
------------------------------------------------------------------------- */

template<class DeviceType>
void PairTableKokkos<DeviceType>::allocate()
{
  allocated = 1;
  const int nt = atom->ntypes + 1;

  memory->create(setflag,nt,nt,"pair:setflag");
  memory->create_kokkos(d_table->cutsq,h_table->cutsq,cutsq,nt,nt,"pair:cutsq");
  memory->create_kokkos(d_table->tabindex,h_table->tabindex,tabindex,nt,nt,"pair:tabindex");

  d_table_const.cutsq = d_table->cutsq;
  d_table_const.tabindex = d_table->tabindex;
  memset(&setflag[0][0],0,nt*nt*sizeof(int));
  memset(&cutsq[0][0],0,nt*nt*sizeof(double));
  memset(&tabindex[0][0],0,nt*nt*sizeof(int));
}

/* ----------------------------------------------------------------------
   global settings
   arg[0] = table style (lookup/linear/spline/bitmap)
   arg[1] = table length (>= 2)
   remaining args = optional long-range solver keywords
   any previously read tables and per-type arrays are discarded
------------------------------------------------------------------------- */

template<class DeviceType>
void PairTableKokkos<DeviceType>::settings(int narg, char **arg)
{
  if (narg < 2) error->all(FLERR,"Illegal pair_style command");

  // new settings

  if (strcmp(arg[0],"lookup") == 0) tabstyle = LOOKUP;
  else if (strcmp(arg[0],"linear") == 0) tabstyle = LINEAR;
  else if (strcmp(arg[0],"spline") == 0) tabstyle = SPLINE;
  else if (strcmp(arg[0],"bitmap") == 0) tabstyle = BITMAP;
  else error->all(FLERR,"Unknown table style in pair_style command");

  tablength = force->inumeric(FLERR,arg[1]);
  if (tablength < 2) error->all(FLERR,"Illegal number of pair table entries");

  // optional keywords
  // assert the tabulation is compatible with a specific long-range solver

  int iarg = 2;
  while (iarg < narg) {
    if (strcmp(arg[iarg],"ewald") == 0) ewaldflag = 1;
    else if (strcmp(arg[iarg],"pppm") == 0) pppmflag = 1;
    else if (strcmp(arg[iarg],"msm") == 0) msmflag = 1;
    else if (strcmp(arg[iarg],"dispersion") == 0) dispersionflag = 1;
    else if (strcmp(arg[iarg],"tip4p") == 0) tip4pflag = 1;
    else error->all(FLERR,"Illegal pair_style command");
    iarg++;
  }

  // delete old tables, since cannot just change settings

  for (int m = 0; m < ntables; m++) free_table(&tables[m]);
  memory->sfree(tables);

  if (allocated) {
    memory->destroy(setflag);

    // drop the tabindex/cutsq views by assigning default-constructed ones

    d_table_const.tabindex = d_table->tabindex = typename ArrayTypes<DeviceType>::t_int_2d();
    h_table->tabindex = typename ArrayTypes<LMPHostType>::t_int_2d();

    d_table_const.cutsq = d_table->cutsq = typename ArrayTypes<DeviceType>::t_ffloat_2d();
    h_table->cutsq = typename ArrayTypes<LMPHostType>::t_ffloat_2d();
  }
  allocated = 0;

  ntables = 0;
  tables = NULL;
}

/* ----------------------------------------------------------------------
   set coeffs for one or more type pairs
   arg[0],arg[1] = type ranges, arg[2] = table file, arg[3] = section keyword,
   optional arg[4] = cutoff
   appends one new Table to tables[], read on proc 0 and broadcast,
   and assigns it to every selected (i,j) pair with j >= i
------------------------------------------------------------------------- */

template<class DeviceType>
void PairTableKokkos<DeviceType>::coeff(int narg, char **arg)
{
  if (narg != 4 && narg != 5) error->all(FLERR,"Illegal pair_coeff command");
  if (!allocated) allocate();

  int ilo,ihi,jlo,jhi;
  force->bounds(arg[0],atom->ntypes,ilo,ihi);
  force->bounds(arg[1],atom->ntypes,jlo,jhi);

  int me;
  MPI_Comm_rank(world,&me);
  tables = (Table *)
    memory->srealloc(tables,(ntables+1)*sizeof(Table),"pair:tables");
  Table *tb = &tables[ntables];
  null_table(tb);
  if (me == 0) read_table(tb,arg[2],arg[3]);
  bcast_table(tb);

  // set table cutoff

  if (narg == 5) tb->cut = force->numeric(FLERR,arg[4]);
  else if (tb->rflag) tb->cut = tb->rhi;
  else tb->cut = tb->rfile[tb->ninput-1];

  // error check on table parameters
  // insure cutoff is within table
  // for BITMAP tables, file values can be in non-ascending order

  if (tb->ninput <= 1) error->one(FLERR,"Invalid pair table length");
  double rlo,rhi;
  if (tb->rflag == 0) {
    rlo = tb->rfile[0];
    rhi = tb->rfile[tb->ninput-1];
  } else {
    rlo = tb->rlo;
    rhi = tb->rhi;
  }
  if (tb->cut <= rlo || tb->cut > rhi)
    error->all(FLERR,"Invalid pair table cutoff");
  if (rlo <= 0.0) error->all(FLERR,"Invalid pair table cutoff");

  // match = 1 if don't need to spline read-in tables
  // this is only the case if r values needed by final tables
  //   exactly match r values read from file
  // for tabstyle SPLINE, always need to build spline tables

  tb->match = 0;
  if (tabstyle == LINEAR && tb->ninput == tablength &&
      tb->rflag == RSQ && tb->rhi == tb->cut) tb->match = 1;
  if (tabstyle == BITMAP && tb->ninput == 1 << tablength &&
      tb->rflag == BMP && tb->rhi == tb->cut) tb->match = 1;
  if (tb->rflag == BMP && tb->match == 0)
    error->all(FLERR,"Bitmapped table in file does not match requested table");

  // spline read-in values and compute r,e,f vectors within table

  if (tb->match == 0) spline_table(tb);
  compute_table(tb);

  // store ptr to table in tabindex

  int count = 0;
  for (int i = ilo; i <= ihi; i++) {
    for (int j = MAX(jlo,i); j <= jhi; j++) {
      tabindex[i][j] = ntables;
      setflag[i][j] = 1;
      count++;
    }
  }

  if (count == 0) error->all(FLERR,"Illegal pair_coeff command");
  ntables++;
}

/* ----------------------------------------------------------------------
   init for one type pair i,j and corresponding j,i
   returns the cutoff of the table assigned to (i,j)
------------------------------------------------------------------------- */

template<class DeviceType>
double PairTableKokkos<DeviceType>::init_one(int i, int j)
{
  if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");

  tabindex[j][i] = tabindex[i][j];

  // m_cutsq is only filled for type indices up to MAX_TYPES_STACKPARAMS

  if(i<MAX_TYPES_STACKPARAMS+1 && j<MAX_TYPES_STACKPARAMS+1) {
    m_cutsq[j][i] = m_cutsq[i][j] = tables[tabindex[i][j]].cut*tables[tabindex[i][j]].cut;
  }

  return tables[tabindex[i][j]].cut;
}

/*
---------------------------------------------------------------------- + read a table section from a tabulated potential file + only called by proc 0 + this function sets these values in Table: + ninput,rfile,efile,ffile,rflag,rlo,rhi,fpflag,fplo,fphi,ntablebits +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::read_table(Table *tb, char *file, char *keyword) +{ + char line[MAXLINE]; + + // open file + + FILE *fp = force->open_potential(file); + if (fp == NULL) { + char str[128]; + sprintf(str,"Cannot open file %s",file); + error->one(FLERR,str); + } + + // loop until section found with matching keyword + + while (1) { + if (fgets(line,MAXLINE,fp) == NULL) + error->one(FLERR,"Did not find keyword in table file"); + if (strspn(line," \t\n\r") == strlen(line)) continue; // blank line + if (line[0] == '#') continue; // comment + char *word = strtok(line," \t\n\r"); + if (strcmp(word,keyword) == 0) break; // matching keyword + fgets(line,MAXLINE,fp); // no match, skip section + param_extract(tb,line); + fgets(line,MAXLINE,fp); + for (int i = 0; i < tb->ninput; i++) fgets(line,MAXLINE,fp); + } + + // read args on 2nd line of section + // allocate table arrays for file values + + fgets(line,MAXLINE,fp); + param_extract(tb,line); + memory->create(tb->rfile,tb->ninput,"pair:rfile"); + memory->create(tb->efile,tb->ninput,"pair:efile"); + memory->create(tb->ffile,tb->ninput,"pair:ffile"); + + // setup bitmap parameters for table to read in + + tb->ntablebits = 0; + int masklo,maskhi,nmask,nshiftbits; + if (tb->rflag == BMP) { + while (1 << tb->ntablebits < tb->ninput) tb->ntablebits++; + if (1 << tb->ntablebits != tb->ninput) + error->one(FLERR,"Bitmapped table is incorrect length in table file"); + init_bitmap(tb->rlo,tb->rhi,tb->ntablebits,masklo,maskhi,nmask,nshiftbits); + } + + // read r,e,f table values from file + // if rflag set, compute r + // if rflag not set, use r from 
file + + int itmp; + double rtmp; + union_int_float_t rsq_lookup; + + fgets(line,MAXLINE,fp); + for (int i = 0; i < tb->ninput; i++) { + fgets(line,MAXLINE,fp); + sscanf(line,"%d %lg %lg %lg",&itmp,&rtmp,&tb->efile[i],&tb->ffile[i]); + + if (tb->rflag == RLINEAR) + rtmp = tb->rlo + (tb->rhi - tb->rlo)*i/(tb->ninput-1); + else if (tb->rflag == RSQ) { + rtmp = tb->rlo*tb->rlo + + (tb->rhi*tb->rhi - tb->rlo*tb->rlo)*i/(tb->ninput-1); + rtmp = sqrt(rtmp); + } else if (tb->rflag == BMP) { + rsq_lookup.i = i << nshiftbits; + rsq_lookup.i |= masklo; + if (rsq_lookup.f < tb->rlo*tb->rlo) { + rsq_lookup.i = i << nshiftbits; + rsq_lookup.i |= maskhi; + } + rtmp = sqrtf(rsq_lookup.f); + } + + tb->rfile[i] = rtmp; + } + + // close file + + fclose(fp); +} + +/* ---------------------------------------------------------------------- + broadcast read-in table info from proc 0 to other procs + this function communicates these values in Table: + ninput,rfile,efile,ffile,rflag,rlo,rhi,fpflag,fplo,fphi +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::bcast_table(Table *tb) +{ + MPI_Bcast(&tb->ninput,1,MPI_INT,0,world); + + int me; + MPI_Comm_rank(world,&me); + if (me > 0) { + memory->create(tb->rfile,tb->ninput,"pair:rfile"); + memory->create(tb->efile,tb->ninput,"pair:efile"); + memory->create(tb->ffile,tb->ninput,"pair:ffile"); + } + + MPI_Bcast(tb->rfile,tb->ninput,MPI_DOUBLE,0,world); + MPI_Bcast(tb->efile,tb->ninput,MPI_DOUBLE,0,world); + MPI_Bcast(tb->ffile,tb->ninput,MPI_DOUBLE,0,world); + + MPI_Bcast(&tb->rflag,1,MPI_INT,0,world); + if (tb->rflag) { + MPI_Bcast(&tb->rlo,1,MPI_DOUBLE,0,world); + MPI_Bcast(&tb->rhi,1,MPI_DOUBLE,0,world); + } + MPI_Bcast(&tb->fpflag,1,MPI_INT,0,world); + if (tb->fpflag) { + MPI_Bcast(&tb->fplo,1,MPI_DOUBLE,0,world); + MPI_Bcast(&tb->fphi,1,MPI_DOUBLE,0,world); + } +} + +/* ---------------------------------------------------------------------- + build 
spline representation of e,f over entire range of read-in table + this function sets these values in Table: e2file,f2file +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::spline_table(Table *tb) +{ + memory->create(tb->e2file,tb->ninput,"pair:e2file"); + memory->create(tb->f2file,tb->ninput,"pair:f2file"); + + double ep0 = - tb->ffile[0]; + double epn = - tb->ffile[tb->ninput-1]; + spline(tb->rfile,tb->efile,tb->ninput,ep0,epn,tb->e2file); + + if (tb->fpflag == 0) { + tb->fplo = (tb->ffile[1] - tb->ffile[0]) / (tb->rfile[1] - tb->rfile[0]); + tb->fphi = (tb->ffile[tb->ninput-1] - tb->ffile[tb->ninput-2]) / + (tb->rfile[tb->ninput-1] - tb->rfile[tb->ninput-2]); + } + + double fp0 = tb->fplo; + double fpn = tb->fphi; + spline(tb->rfile,tb->ffile,tb->ninput,fp0,fpn,tb->f2file); +} + +/* ---------------------------------------------------------------------- + extract attributes from parameter line in table section + format of line: N value R/RSQ/BITMAP lo hi FP fplo fphi + N is required, other params are optional +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::param_extract(Table *tb, char *line) +{ + tb->ninput = 0; + tb->rflag = NONE; + tb->fpflag = 0; + + char *word = strtok(line," \t\n\r\f"); + while (word) { + if (strcmp(word,"N") == 0) { + word = strtok(NULL," \t\n\r\f"); + tb->ninput = atoi(word); + } else if (strcmp(word,"R") == 0 || strcmp(word,"RSQ") == 0 || + strcmp(word,"BITMAP") == 0) { + if (strcmp(word,"R") == 0) tb->rflag = RLINEAR; + else if (strcmp(word,"RSQ") == 0) tb->rflag = RSQ; + else if (strcmp(word,"BITMAP") == 0) tb->rflag = BMP; + word = strtok(NULL," \t\n\r\f"); + tb->rlo = atof(word); + word = strtok(NULL," \t\n\r\f"); + tb->rhi = atof(word); + } else if (strcmp(word,"FP") == 0) { + tb->fpflag = 1; + word = strtok(NULL," \t\n\r\f"); + tb->fplo = 
atof(word); + word = strtok(NULL," \t\n\r\f"); + tb->fphi = atof(word); + } else { + printf("WORD: %s\n",word); + error->one(FLERR,"Invalid keyword in pair table parameters"); + } + word = strtok(NULL," \t\n\r\f"); + } + + if (tb->ninput == 0) error->one(FLERR,"Pair table parameters did not set N"); +} + +/* ---------------------------------------------------------------------- + compute r,e,f vectors from splined values +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::compute_table(Table *tb) +{ + update_table = 1; + int tlm1 = tablength-1; + + // inner = inner table bound + // cut = outer table bound + // delta = table spacing in rsq for N-1 bins + + double inner; + if (tb->rflag) inner = tb->rlo; + else inner = tb->rfile[0]; + tb->innersq = inner*inner; + tb->delta = (tb->cut*tb->cut - tb->innersq) / tlm1; + tb->invdelta = 1.0/tb->delta; + + // direct lookup tables + // N-1 evenly spaced bins in rsq from inner to cut + // e,f = value at midpt of bin + // e,f are N-1 in length since store 1 value at bin midpt + // f is converted to f/r when stored in f[i] + // e,f are never a match to read-in values, always computed via spline interp + + if (tabstyle == LOOKUP) { + memory->create(tb->e,tlm1,"pair:e"); + memory->create(tb->f,tlm1,"pair:f"); + + double r,rsq; + for (int i = 0; i < tlm1; i++) { + rsq = tb->innersq + (i+0.5)*tb->delta; + r = sqrt(rsq); + tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r); + tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r; + } + } + + // linear tables + // N-1 evenly spaced bins in rsq from inner to cut + // rsq,e,f = value at lower edge of bin + // de,df values = delta from lower edge to upper edge of bin + // rsq,e,f are N in length so de,df arrays can compute difference + // f is converted to f/r when stored in f[i] + // e,f can match read-in values, else compute via spline interp + + if (tabstyle == LINEAR) { 
+ memory->create(tb->rsq,tablength,"pair:rsq"); + memory->create(tb->e,tablength,"pair:e"); + memory->create(tb->f,tablength,"pair:f"); + memory->create(tb->de,tlm1,"pair:de"); + memory->create(tb->df,tlm1,"pair:df"); + + double r,rsq; + for (int i = 0; i < tablength; i++) { + rsq = tb->innersq + i*tb->delta; + r = sqrt(rsq); + tb->rsq[i] = rsq; + if (tb->match) { + tb->e[i] = tb->efile[i]; + tb->f[i] = tb->ffile[i]/r; + } else { + tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r); + tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r; + } + } + + for (int i = 0; i < tlm1; i++) { + tb->de[i] = tb->e[i+1] - tb->e[i]; + tb->df[i] = tb->f[i+1] - tb->f[i]; + } + } + + // cubic spline tables + // N-1 evenly spaced bins in rsq from inner to cut + // rsq,e,f = value at lower edge of bin + // e2,f2 = spline coefficient for each bin + // rsq,e,f,e2,f2 are N in length so have N-1 spline bins + // f is converted to f/r after e is splined + // e,f can match read-in values, else compute via spline interp + + if (tabstyle == SPLINE) { + memory->create(tb->rsq,tablength,"pair:rsq"); + memory->create(tb->e,tablength,"pair:e"); + memory->create(tb->f,tablength,"pair:f"); + memory->create(tb->e2,tablength,"pair:e2"); + memory->create(tb->f2,tablength,"pair:f2"); + + tb->deltasq6 = tb->delta*tb->delta / 6.0; + + double r,rsq; + for (int i = 0; i < tablength; i++) { + rsq = tb->innersq + i*tb->delta; + r = sqrt(rsq); + tb->rsq[i] = rsq; + if (tb->match) { + tb->e[i] = tb->efile[i]; + tb->f[i] = tb->ffile[i]/r; + } else { + tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r); + tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r); + } + } + + // ep0,epn = dh/dg at inner and at cut + // h(r) = e(r) and g(r) = r^2 + // dh/dg = (de/dr) / 2r = -f/2r + + double ep0 = - tb->f[0] / (2.0 * sqrt(tb->innersq)); + double epn = - tb->f[tlm1] / (2.0 * tb->cut); + spline(tb->rsq,tb->e,tablength,ep0,epn,tb->e2); + + // fp0,fpn = dh/dg at inner and at cut 
+ // h(r) = f(r)/r and g(r) = r^2 + // dh/dg = (1/r df/dr - f/r^2) / 2r + // dh/dg in secant approx = (f(r2)/r2 - f(r1)/r1) / (g(r2) - g(r1)) + + double fp0,fpn; + double secant_factor = 0.1; + if (tb->fpflag) fp0 = (tb->fplo/sqrt(tb->innersq) - tb->f[0]/tb->innersq) / + (2.0 * sqrt(tb->innersq)); + else { + double rsq1 = tb->innersq; + double rsq2 = rsq1 + secant_factor*tb->delta; + fp0 = (splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,sqrt(rsq2)) / + sqrt(rsq2) - tb->f[0] / sqrt(rsq1)) / (secant_factor*tb->delta); + } + + if (tb->fpflag && tb->cut == tb->rfile[tb->ninput-1]) fpn = + (tb->fphi/tb->cut - tb->f[tlm1]/(tb->cut*tb->cut)) / (2.0 * tb->cut); + else { + double rsq2 = tb->cut * tb->cut; + double rsq1 = rsq2 - secant_factor*tb->delta; + fpn = (tb->f[tlm1] / sqrt(rsq2) - + splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,sqrt(rsq1)) / + sqrt(rsq1)) / (secant_factor*tb->delta); + } + + for (int i = 0; i < tablength; i++) tb->f[i] /= sqrt(tb->rsq[i]); + spline(tb->rsq,tb->f,tablength,fp0,fpn,tb->f2); + } + + // bitmapped linear tables + // 2^N bins from inner to cut, spaced in bitmapped manner + // f is converted to f/r when stored in f[i] + // e,f can match read-in values, else compute via spline interp + + if (tabstyle == BITMAP) { + double r; + union_int_float_t rsq_lookup; + int masklo,maskhi; + + // linear lookup tables of length ntable = 2^n + // stored value = value at lower edge of bin + + init_bitmap(inner,tb->cut,tablength,masklo,maskhi,tb->nmask,tb->nshiftbits); + int ntable = 1 << tablength; + int ntablem1 = ntable - 1; + + memory->create(tb->rsq,ntable,"pair:rsq"); + memory->create(tb->e,ntable,"pair:e"); + memory->create(tb->f,ntable,"pair:f"); + memory->create(tb->de,ntable,"pair:de"); + memory->create(tb->df,ntable,"pair:df"); + memory->create(tb->drsq,ntable,"pair:drsq"); + + union_int_float_t minrsq_lookup; + minrsq_lookup.i = 0 << tb->nshiftbits; + minrsq_lookup.i |= maskhi; + + for (int i = 0; i < ntable; i++) { + rsq_lookup.i = i << 
tb->nshiftbits; + rsq_lookup.i |= masklo; + if (rsq_lookup.f < tb->innersq) { + rsq_lookup.i = i << tb->nshiftbits; + rsq_lookup.i |= maskhi; + } + r = sqrtf(rsq_lookup.f); + tb->rsq[i] = rsq_lookup.f; + if (tb->match) { + tb->e[i] = tb->efile[i]; + tb->f[i] = tb->ffile[i]/r; + } else { + tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r); + tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r; + } + minrsq_lookup.f = MIN(minrsq_lookup.f,rsq_lookup.f); + } + + tb->innersq = minrsq_lookup.f; + + for (int i = 0; i < ntablem1; i++) { + tb->de[i] = tb->e[i+1] - tb->e[i]; + tb->df[i] = tb->f[i+1] - tb->f[i]; + tb->drsq[i] = 1.0/(tb->rsq[i+1] - tb->rsq[i]); + } + + // get the delta values for the last table entries + // tables are connected periodically between 0 and ntablem1 + + tb->de[ntablem1] = tb->e[0] - tb->e[ntablem1]; + tb->df[ntablem1] = tb->f[0] - tb->f[ntablem1]; + tb->drsq[ntablem1] = 1.0/(tb->rsq[0] - tb->rsq[ntablem1]); + + // get the correct delta values at itablemax + // smallest r is in bin itablemin + // largest r is in bin itablemax, which is itablemin-1, + // or ntablem1 if itablemin=0 + + // deltas at itablemax only needed if corresponding rsq < cut*cut + // if so, compute deltas between rsq and cut*cut + // if tb->match, data at cut*cut is unavailable, so we'll take + // deltas at itablemax-1 as a good approximation + + double e_tmp,f_tmp; + int itablemin = minrsq_lookup.i & tb->nmask; + itablemin >>= tb->nshiftbits; + int itablemax = itablemin - 1; + if (itablemin == 0) itablemax = ntablem1; + int itablemaxm1 = itablemax - 1; + if (itablemax == 0) itablemaxm1 = ntablem1; + rsq_lookup.i = itablemax << tb->nshiftbits; + rsq_lookup.i |= maskhi; + if (rsq_lookup.f < tb->cut*tb->cut) { + if (tb->match) { + tb->de[itablemax] = tb->de[itablemaxm1]; + tb->df[itablemax] = tb->df[itablemaxm1]; + tb->drsq[itablemax] = tb->drsq[itablemaxm1]; + } else { + rsq_lookup.f = tb->cut*tb->cut; + r = sqrtf(rsq_lookup.f); + e_tmp = 
splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r); + f_tmp = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r; + tb->de[itablemax] = e_tmp - tb->e[itablemax]; + tb->df[itablemax] = f_tmp - tb->f[itablemax]; + tb->drsq[itablemax] = 1.0/(rsq_lookup.f - tb->rsq[itablemax]); + } + } + } +} + +/* ---------------------------------------------------------------------- + set all ptrs in a table to NULL, so can be freed safely +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::null_table(Table *tb) +{ + tb->rfile = tb->efile = tb->ffile = NULL; + tb->e2file = tb->f2file = NULL; + tb->rsq = tb->drsq = tb->e = tb->de = NULL; + tb->f = tb->df = tb->e2 = tb->f2 = NULL; +} + +/* ---------------------------------------------------------------------- + free all arrays in a table +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::free_table(Table *tb) +{ + memory->destroy(tb->rfile); + memory->destroy(tb->efile); + memory->destroy(tb->ffile); + memory->destroy(tb->e2file); + memory->destroy(tb->f2file); + + memory->destroy(tb->rsq); + memory->destroy(tb->drsq); + memory->destroy(tb->e); + memory->destroy(tb->de); + memory->destroy(tb->f); + memory->destroy(tb->df); + memory->destroy(tb->e2); + memory->destroy(tb->f2); +} + +/* ---------------------------------------------------------------------- + spline and splint routines modified from Numerical Recipes +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::spline(double *x, double *y, int n, + double yp1, double ypn, double *y2) +{ + int i,k; + double p,qn,sig,un; + double *u = new double[n]; + + if (yp1 > 0.99e30) y2[0] = u[0] = 0.0; + else { + y2[0] = -0.5; + u[0] = (3.0/(x[1]-x[0])) * ((y[1]-y[0]) / (x[1]-x[0]) - yp1); + } + for (i = 1; i < n-1; i++) 
{ + sig = (x[i]-x[i-1]) / (x[i+1]-x[i-1]); + p = sig*y2[i-1] + 2.0; + y2[i] = (sig-1.0) / p; + u[i] = (y[i+1]-y[i]) / (x[i+1]-x[i]) - (y[i]-y[i-1]) / (x[i]-x[i-1]); + u[i] = (6.0*u[i] / (x[i+1]-x[i-1]) - sig*u[i-1]) / p; + } + if (ypn > 0.99e30) qn = un = 0.0; + else { + qn = 0.5; + un = (3.0/(x[n-1]-x[n-2])) * (ypn - (y[n-1]-y[n-2]) / (x[n-1]-x[n-2])); + } + y2[n-1] = (un-qn*u[n-2]) / (qn*y2[n-2] + 1.0); + for (k = n-2; k >= 0; k--) y2[k] = y2[k]*y2[k+1] + u[k]; + + delete [] u; +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +double PairTableKokkos<DeviceType>::splint(double *xa, double *ya, double *y2a, int n, double x) +{ + int klo,khi,k; + double h,b,a,y; + + klo = 0; + khi = n-1; + while (khi-klo > 1) { + k = (khi+klo) >> 1; + if (xa[k] > x) khi = k; + else klo = k; + } + h = xa[khi]-xa[klo]; + a = (xa[khi]-x) / h; + b = (x-xa[klo]) / h; + y = a*ya[klo] + b*ya[khi] + + ((a*a*a-a)*y2a[klo] + (b*b*b-b)*y2a[khi]) * (h*h)/6.0; + return y; +} + +/* ---------------------------------------------------------------------- + proc 0 writes to restart file +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::write_restart(FILE *fp) +{ + write_restart_settings(fp); +} + +/* ---------------------------------------------------------------------- + proc 0 reads from restart file, bcasts +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::read_restart(FILE *fp) +{ + read_restart_settings(fp); + allocate(); +} + +/* ---------------------------------------------------------------------- + proc 0 writes to restart file +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::write_restart_settings(FILE *fp) +{ + fwrite(&tabstyle,sizeof(int),1,fp); + 
fwrite(&tablength,sizeof(int),1,fp); + fwrite(&ewaldflag,sizeof(int),1,fp); + fwrite(&pppmflag,sizeof(int),1,fp); + fwrite(&msmflag,sizeof(int),1,fp); + fwrite(&dispersionflag,sizeof(int),1,fp); + fwrite(&tip4pflag,sizeof(int),1,fp); +} + +/* ---------------------------------------------------------------------- + proc 0 reads from restart file, bcasts +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PairTableKokkos<DeviceType>::read_restart_settings(FILE *fp) +{ + if (comm->me == 0) { + fread(&tabstyle,sizeof(int),1,fp); + fread(&tablength,sizeof(int),1,fp); + fread(&ewaldflag,sizeof(int),1,fp); + fread(&pppmflag,sizeof(int),1,fp); + fread(&msmflag,sizeof(int),1,fp); + fread(&dispersionflag,sizeof(int),1,fp); + fread(&tip4pflag,sizeof(int),1,fp); + } + MPI_Bcast(&tabstyle,1,MPI_INT,0,world); + MPI_Bcast(&tablength,1,MPI_INT,0,world); + MPI_Bcast(&ewaldflag,1,MPI_INT,0,world); + MPI_Bcast(&pppmflag,1,MPI_INT,0,world); + MPI_Bcast(&msmflag,1,MPI_INT,0,world); + MPI_Bcast(&dispersionflag,1,MPI_INT,0,world); + MPI_Bcast(&tip4pflag,1,MPI_INT,0,world); +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +double PairTableKokkos<DeviceType>::single(int i, int j, int itype, int jtype, double rsq, + double factor_coul, double factor_lj, + double &fforce) +{ + int itable; + double fraction,value,a,b,phi; + int tlm1 = tablength - 1; + + Table *tb = &tables[tabindex[itype][jtype]]; + if (rsq < tb->innersq) error->one(FLERR,"Pair distance < table inner cutoff"); + + if (tabstyle == LOOKUP) { + itable = static_cast<int> ((rsq-tb->innersq) * tb->invdelta); + if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff"); + fforce = factor_lj * tb->f[itable]; + } else if (tabstyle == LINEAR) { + itable = static_cast<int> ((rsq-tb->innersq) * tb->invdelta); + if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff"); + fraction = 
(rsq - tb->rsq[itable]) * tb->invdelta; + value = tb->f[itable] + fraction*tb->df[itable]; + fforce = factor_lj * value; + } else if (tabstyle == SPLINE) { + itable = static_cast<int> ((rsq-tb->innersq) * tb->invdelta); + if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff"); + b = (rsq - tb->rsq[itable]) * tb->invdelta; + a = 1.0 - b; + value = a * tb->f[itable] + b * tb->f[itable+1] + + ((a*a*a-a)*tb->f2[itable] + (b*b*b-b)*tb->f2[itable+1]) * + tb->deltasq6; + fforce = factor_lj * value; + } else { + union_int_float_t rsq_lookup; + rsq_lookup.f = rsq; + itable = rsq_lookup.i & tb->nmask; + itable >>= tb->nshiftbits; + fraction = (rsq_lookup.f - tb->rsq[itable]) * tb->drsq[itable]; + value = tb->f[itable] + fraction*tb->df[itable]; + fforce = factor_lj * value; + } + + if (tabstyle == LOOKUP) + phi = tb->e[itable]; + else if (tabstyle == LINEAR || tabstyle == BITMAP) + phi = tb->e[itable] + fraction*tb->de[itable]; + else + phi = a * tb->e[itable] + b * tb->e[itable+1] + + ((a*a*a-a)*tb->e2[itable] + (b*b*b-b)*tb->e2[itable+1]) * tb->deltasq6; + return factor_lj*phi; +} + +/* ---------------------------------------------------------------------- + return the Coulomb cutoff for tabled potentials + called by KSpace solvers which require that all pairwise cutoffs be the same + loop over all tables not just those indexed by tabindex[i][j] since + no way to know which tables are active since pair::init() not yet called +------------------------------------------------------------------------- */ + +template<class DeviceType> +void *PairTableKokkos<DeviceType>::extract(const char *str, int &dim) +{ + if (strcmp(str,"cut_coul") != 0) return NULL; + if (ntables == 0) error->all(FLERR,"All pair coeffs are not set"); + + double cut_coul = tables[0].cut; + for (int m = 1; m < ntables; m++) + if (tables[m].cut != cut_coul) + error->all(FLERR, + "Pair table cutoffs must all be equal to use with KSpace"); + dim = 0; + return &tables[0].cut; +} + 
+template<class DeviceType> +void PairTableKokkos<DeviceType>::init_style() +{ + neighbor->request(this); + neighflag = lmp->kokkos->neighflag; + int irequest = neighbor->nrequest - 1; + + neighbor->requests[irequest]-> + kokkos_host = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value && + !Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value; + neighbor->requests[irequest]-> + kokkos_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value; + + if (neighflag == FULL) { + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full_cluster = 0; + } else if (neighflag == HALF || neighflag == HALFTHREAD) { + neighbor->requests[irequest]->full = 0; + neighbor->requests[irequest]->half = 1; + neighbor->requests[irequest]->full_cluster = 0; + } else if (neighflag == N2) { + neighbor->requests[irequest]->full = 0; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full_cluster = 0; + } else if (neighflag == FULLCLUSTER) { + neighbor->requests[irequest]->full_cluster = 1; + neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + } else { + error->all(FLERR,"Cannot use chosen neighbor list style with lj/cut/kk"); + } +} + +/* +template <class DeviceType> template<int NEIGHFLAG> +KOKKOS_INLINE_FUNCTION +void PairTableKokkos<DeviceType>:: +ev_tally(EV_FLOAT &ev, const int &i, const int &j, const F_FLOAT &fpair, + const F_FLOAT &delx, const F_FLOAT &dely, const F_FLOAT &delz) const +{ + const int EFLAG = eflag; + const int NEWTON_PAIR = newton_pair; + const int VFLAG = vflag_either; + + if (EFLAG) { + if (eflag_atom) { + E_FLOAT epairhalf = 0.5 * (ev.evdwl + ev.ecoul); + if (NEWTON_PAIR || i < nlocal) eatom[i] += epairhalf; + if (NEWTON_PAIR || j < nlocal) eatom[j] += epairhalf; + } + } + + if (VFLAG) { + const E_FLOAT v0 = delx*delx*fpair; + const E_FLOAT v1 = dely*dely*fpair; + const E_FLOAT v2 = delz*delz*fpair; + const E_FLOAT v3 = delx*dely*fpair; + 
const E_FLOAT v4 = delx*delz*fpair; + const E_FLOAT v5 = dely*delz*fpair; + + if (vflag_global) { + if (NEIGHFLAG) { + if (NEWTON_PAIR) { + ev.v[0] += v0; + ev.v[1] += v1; + ev.v[2] += v2; + ev.v[3] += v3; + ev.v[4] += v4; + ev.v[5] += v5; + } else { + if (i < nlocal) { + ev.v[0] += 0.5*v0; + ev.v[1] += 0.5*v1; + ev.v[2] += 0.5*v2; + ev.v[3] += 0.5*v3; + ev.v[4] += 0.5*v4; + ev.v[5] += 0.5*v5; + } + if (j < nlocal) { + ev.v[0] += 0.5*v0; + ev.v[1] += 0.5*v1; + ev.v[2] += 0.5*v2; + ev.v[3] += 0.5*v3; + ev.v[4] += 0.5*v4; + ev.v[5] += 0.5*v5; + } + } + } else { + ev.v[0] += 0.5*v0; + ev.v[1] += 0.5*v1; + ev.v[2] += 0.5*v2; + ev.v[3] += 0.5*v3; + ev.v[4] += 0.5*v4; + ev.v[5] += 0.5*v5; + } + } + + if (vflag_atom) { + if (NEWTON_PAIR || i < nlocal) { + d_vatom(i,0) += 0.5*v0; + d_vatom(i,1) += 0.5*v1; + d_vatom(i,2) += 0.5*v2; + d_vatom(i,3) += 0.5*v3; + d_vatom(i,4) += 0.5*v4; + d_vatom(i,5) += 0.5*v5; + } + if (NEWTON_PAIR || (NEIGHFLAG && j < nlocal)) { + d_vatom(j,0) += 0.5*v0; + d_vatom(j,1) += 0.5*v1; + d_vatom(j,2) += 0.5*v2; + d_vatom(j,3) += 0.5*v3; + d_vatom(j,4) += 0.5*v4; + d_vatom(j,5) += 0.5*v5; + } + } + } +} +*/ +template<class DeviceType> +void PairTableKokkos<DeviceType>::cleanup_copy() { + // WHY needed: this prevents parent copy from deallocating any arrays + allocated = 0; + cutsq = NULL; + eatom = NULL; + vatom = NULL; + h_table=NULL; d_table=NULL; +} + +template class PairTableKokkos<LMPDeviceType>; +#if DEVICE==2 +template class PairTableKokkos<LMPHostType>; +#endif + diff --git a/src/KOKKOS/pair_table_kokkos.h b/src/KOKKOS/pair_table_kokkos.h new file mode 100644 index 0000000000..317703c895 --- /dev/null +++ b/src/KOKKOS/pair_table_kokkos.h @@ -0,0 +1,352 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. 
Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(table/kk,PairTableKokkos<LMPDeviceType>) +PairStyle(table/kk/device,PairTableKokkos<LMPDeviceType>) +PairStyle(table/kk/host,PairTableKokkos<LMPHostType>) + +#else + +#ifndef LMP_PAIR_TABLE_KOKKOS_H +#define LMP_PAIR_TABLE_KOKKOS_H + +#include "pair.h" +#include "pair_kokkos.h" +#include "neigh_list_kokkos.h" +#include "atom_kokkos.h" + +namespace LAMMPS_NS { + +template<class Device,int TABSTYLE> +struct S_TableCompute { + enum {TabStyle = TABSTYLE}; +}; + +template <class DeviceType, int NEIGHFLAG, int TABSTYLE> +class PairTableComputeFunctor; + +template<class DeviceType> +class PairTableKokkos : public Pair { + public: + + enum {COUL_FLAG=0}; + typedef DeviceType device_type; + + PairTableKokkos(class LAMMPS *); + virtual ~PairTableKokkos(); + + virtual void compute(int, int); + + template<int TABSTYLE> + void compute_style(int, int); + + /*template<int EVFLAG, int NEIGHFLAG, int NEWTON_PAIR, int TABSTYLE> + KOKKOS_FUNCTION + EV_FLOAT compute_item(const int& i, + const NeighListKokkos<DeviceType> &list) const; +*/ + void settings(int, char **); + void coeff(int, char **); + double init_one(int, int); + void write_restart(FILE *); + void read_restart(FILE *); + void write_restart_settings(FILE *); + void read_restart_settings(FILE *); + double single(int, int, int, int, double, double, double, double &); + void *extract(const char *, int &); + + void init_style(); + + + protected: + enum{LOOKUP,LINEAR,SPLINE,BITMAP}; + + int tabstyle,tablength; + /*struct TableDeviceConst { + typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread cutsq; + typename ArrayTypes<DeviceType>::t_int_2d_randomread 
tabindex; + typename ArrayTypes<DeviceType>::t_int_1d_randomread nshiftbits,nmask; + typename ArrayTypes<DeviceType>::t_ffloat_1d_randomread innersq,invdelta,deltasq6; + typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread rsq,drsq,e,de,f,df,e2,f2; + };*/ + //Its faster not to use texture fetch if the number of tables is less than 32! + struct TableDeviceConst { + typename ArrayTypes<DeviceType>::t_ffloat_2d cutsq; + typename ArrayTypes<DeviceType>::t_int_2d tabindex; + typename ArrayTypes<DeviceType>::t_int_1d nshiftbits,nmask; + typename ArrayTypes<DeviceType>::t_ffloat_1d innersq,invdelta,deltasq6; + typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread rsq,drsq,e,de,f,df,e2,f2; + }; + + struct TableDevice { + typename ArrayTypes<DeviceType>::t_ffloat_2d cutsq; + typename ArrayTypes<DeviceType>::t_int_2d tabindex; + typename ArrayTypes<DeviceType>::t_int_1d nshiftbits,nmask; + typename ArrayTypes<DeviceType>::t_ffloat_1d innersq,invdelta,deltasq6; + typename ArrayTypes<DeviceType>::t_ffloat_2d rsq,drsq,e,de,f,df,e2,f2; + }; + + struct TableHost { + typename ArrayTypes<LMPHostType>::t_ffloat_2d cutsq; + typename ArrayTypes<LMPHostType>::t_int_2d tabindex; + typename ArrayTypes<LMPHostType>::t_int_1d nshiftbits,nmask; + typename ArrayTypes<LMPHostType>::t_ffloat_1d innersq,invdelta,deltasq6; + typename ArrayTypes<LMPHostType>::t_ffloat_2d rsq,drsq,e,de,f,df,e2,f2; + }; + + struct Table { + int ninput,rflag,fpflag,match,ntablebits; + int nshiftbits,nmask; + double rlo,rhi,fplo,fphi,cut; + double *rfile,*efile,*ffile; + double *e2file,*f2file; + double innersq,delta,invdelta,deltasq6; + double *rsq,*drsq,*e,*de,*f,*df,*e2,*f2; + }; + int ntables; + Table *tables; + TableDeviceConst d_table_const; + TableDevice* d_table; + TableHost* h_table; + + int **tabindex; + F_FLOAT m_cutsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1]; + + typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq; + + void allocate(); + void read_table(Table *, char *, char *); + void 
param_extract(Table *, char *); + void bcast_table(Table *); + void spline_table(Table *); + void compute_table(Table *); + void null_table(Table *); + void free_table(Table *); + void spline(double *, double *, int, double, double, double *); + double splint(double *, double *, double *, int, double); + + typename ArrayTypes<DeviceType>::t_x_array_randomread x; + typename ArrayTypes<DeviceType>::t_x_array_const c_x; + typename ArrayTypes<DeviceType>::t_f_array f; + typename ArrayTypes<DeviceType>::t_int_1d_randomread type; + typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom; + typename ArrayTypes<DeviceType>::t_virial_array d_vatom; + + protected: + int nlocal,nall,eflag,vflag,neighflag,newton_pair; + class AtomKokkos *atomKK; + int update_table; + void create_kokkos_tables(); + void cleanup_copy(); + + template<bool STACKPARAMS, class Specialisation> + KOKKOS_INLINE_FUNCTION + F_FLOAT compute_fpair(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const; + + template<bool STACKPARAMS, class Specialisation> + KOKKOS_INLINE_FUNCTION + F_FLOAT compute_evdwl(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const; + + template<bool STACKPARAMS, class Specialisation> + KOKKOS_INLINE_FUNCTION + F_FLOAT compute_ecoul(const F_FLOAT& rsq, const int& i, const int&j, const int& itype, const int& jtype) const { + return 0; + } + + friend class PairComputeFunctor<PairTableKokkos,FULL,true,S_TableCompute<DeviceType,LOOKUP> >; + friend class PairComputeFunctor<PairTableKokkos,HALF,true,S_TableCompute<DeviceType,LOOKUP> >; + friend class PairComputeFunctor<PairTableKokkos,HALFTHREAD,true,S_TableCompute<DeviceType,LOOKUP> >; + friend class PairComputeFunctor<PairTableKokkos,N2,true,S_TableCompute<DeviceType,LOOKUP> >; + friend class PairComputeFunctor<PairTableKokkos,FULLCLUSTER,true,S_TableCompute<DeviceType,LOOKUP> >; + friend class 
PairComputeFunctor<PairTableKokkos,FULL,false,S_TableCompute<DeviceType,LOOKUP> >; + friend class PairComputeFunctor<PairTableKokkos,HALF,false,S_TableCompute<DeviceType,LOOKUP> >; + friend class PairComputeFunctor<PairTableKokkos,HALFTHREAD,false,S_TableCompute<DeviceType,LOOKUP> >; + friend class PairComputeFunctor<PairTableKokkos,N2,false,S_TableCompute<DeviceType,LOOKUP> >; + friend class PairComputeFunctor<PairTableKokkos,FULLCLUSTER,false,S_TableCompute<DeviceType,LOOKUP> >; + + friend class PairComputeFunctor<PairTableKokkos,FULL,true,S_TableCompute<DeviceType,LINEAR> >; + friend class PairComputeFunctor<PairTableKokkos,HALF,true,S_TableCompute<DeviceType,LINEAR> >; + friend class PairComputeFunctor<PairTableKokkos,HALFTHREAD,true,S_TableCompute<DeviceType,LINEAR> >; + friend class PairComputeFunctor<PairTableKokkos,N2,true,S_TableCompute<DeviceType,LINEAR> >; + friend class PairComputeFunctor<PairTableKokkos,FULLCLUSTER,true,S_TableCompute<DeviceType,LINEAR> >; + friend class PairComputeFunctor<PairTableKokkos,FULL,false,S_TableCompute<DeviceType,LINEAR> >; + friend class PairComputeFunctor<PairTableKokkos,HALF,false,S_TableCompute<DeviceType,LINEAR> >; + friend class PairComputeFunctor<PairTableKokkos,HALFTHREAD,false,S_TableCompute<DeviceType,LINEAR> >; + friend class PairComputeFunctor<PairTableKokkos,N2,false,S_TableCompute<DeviceType,LINEAR> >; + friend class PairComputeFunctor<PairTableKokkos,FULLCLUSTER,false,S_TableCompute<DeviceType,LINEAR> >; + + friend class PairComputeFunctor<PairTableKokkos,FULL,true,S_TableCompute<DeviceType,SPLINE> >; + friend class PairComputeFunctor<PairTableKokkos,HALF,true,S_TableCompute<DeviceType,SPLINE> >; + friend class PairComputeFunctor<PairTableKokkos,HALFTHREAD,true,S_TableCompute<DeviceType,SPLINE> >; + friend class PairComputeFunctor<PairTableKokkos,N2,true,S_TableCompute<DeviceType,SPLINE> >; + friend class PairComputeFunctor<PairTableKokkos,FULLCLUSTER,true,S_TableCompute<DeviceType,SPLINE> >; + friend class 
PairComputeFunctor<PairTableKokkos,FULL,false,S_TableCompute<DeviceType,SPLINE> >; + friend class PairComputeFunctor<PairTableKokkos,HALF,false,S_TableCompute<DeviceType,SPLINE> >; + friend class PairComputeFunctor<PairTableKokkos,HALFTHREAD,false,S_TableCompute<DeviceType,SPLINE> >; + friend class PairComputeFunctor<PairTableKokkos,N2,false,S_TableCompute<DeviceType,SPLINE> >; + friend class PairComputeFunctor<PairTableKokkos,FULLCLUSTER,false,S_TableCompute<DeviceType,SPLINE> >; + + friend class PairComputeFunctor<PairTableKokkos,FULL,true,S_TableCompute<DeviceType,BITMAP> >; + friend class PairComputeFunctor<PairTableKokkos,HALF,true,S_TableCompute<DeviceType,BITMAP> >; + friend class PairComputeFunctor<PairTableKokkos,HALFTHREAD,true,S_TableCompute<DeviceType,BITMAP> >; + friend class PairComputeFunctor<PairTableKokkos,N2,true,S_TableCompute<DeviceType,BITMAP> >; + friend class PairComputeFunctor<PairTableKokkos,FULLCLUSTER,true,S_TableCompute<DeviceType,BITMAP> >; + friend class PairComputeFunctor<PairTableKokkos,FULL,false,S_TableCompute<DeviceType,BITMAP> >; + friend class PairComputeFunctor<PairTableKokkos,HALF,false,S_TableCompute<DeviceType,BITMAP> >; + friend class PairComputeFunctor<PairTableKokkos,HALFTHREAD,false,S_TableCompute<DeviceType,BITMAP> >; + friend class PairComputeFunctor<PairTableKokkos,N2,false,S_TableCompute<DeviceType,BITMAP> >; + friend class PairComputeFunctor<PairTableKokkos,FULLCLUSTER,false,S_TableCompute<DeviceType,BITMAP> >; +/*template<int FULL_NEIGH> + KOKKOS_INLINE_FUNCTION + void ev_tally(EV_FLOAT &ev, const int &i, const int &j, + const F_FLOAT &fpair, const F_FLOAT &delx, + const F_FLOAT &dely, const F_FLOAT &delz) const; +*/ +}; +/* +template <class DeviceType, int NEIGHFLAG, int TABSTYLE> +struct PairTableComputeFunctor { + typedef DeviceType device_type ; + typedef EV_FLOAT value_type; + + PairTableKokkos<DeviceType> c; + NeighListKokkos<DeviceType> list; + + PairTableComputeFunctor(PairTableKokkos<DeviceType>* c_ptr, + 
NeighListKokkos<DeviceType>* list_ptr): + c(*c_ptr),list(*list_ptr) {}; + ~PairTableComputeFunctor() {c.cleanup_copy();list.clean_copy();}; + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + if (c.newton_pair) c.template compute_item<0,NEIGHFLAG,1,TABSTYLE>(i,list); + else c.template compute_item<0,NEIGHFLAG,0,TABSTYLE>(i,list); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, value_type &energy_virial) const { + if (c.newton_pair) + energy_virial += c.template compute_item<1,NEIGHFLAG,1,TABSTYLE>(i,list); + else + energy_virial += c.template compute_item<1,NEIGHFLAG,0,TABSTYLE>(i,list); + } + + KOKKOS_INLINE_FUNCTION + static void init(volatile value_type &update) { + update.evdwl = 0; + update.ecoul = 0; + update.v[0] = 0; + update.v[1] = 0; + update.v[2] = 0; + update.v[3] = 0; + update.v[4] = 0; + update.v[5] = 0; + } + KOKKOS_INLINE_FUNCTION + static void join(volatile value_type &update, + const volatile value_type &source) { + update.evdwl += source.evdwl; + update.ecoul += source.ecoul; + update.v[0] += source.v[0]; + update.v[1] += source.v[1]; + update.v[2] += source.v[2]; + update.v[3] += source.v[3]; + update.v[4] += source.v[4]; + update.v[5] += source.v[5]; + } +}; + +*/ + + + + + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Pair distance < table inner cutoff + +Two atoms are closer together than the pairwise table allows. + +E: Pair distance > table outer cutoff + +Two atoms are further apart than the pairwise table allows. + +E: Illegal ... command + +Self-explanatory. Check the input script syntax and compare to the +documentation for the command. You can use -echo screen as a +command-line option when running LAMMPS to see the offending line. + +E: Unknown table style in pair_style command + +Style of table is invalid for use with pair_style table command. + +E: Illegal number of pair table entries + +There must be at least 2 table entries. 
+ +E: Invalid pair table length + +Length of read-in pair table is invalid + +E: Invalid pair table cutoff + +Cutoffs in pair_coeff command are not valid with read-in pair table. + +E: Bitmapped table in file does not match requested table + +Setting for bitmapped table in pair_coeff command must match table +in file exactly. + +E: All pair coeffs are not set + +All pair coefficients must be set in the data file or by the +pair_coeff command before running a simulation. + +E: Cannot open file %s + +The specified file cannot be opened. Check that the path and name are +correct. If the file is a compressed file, also check that the gzip +executable can be found and run. + +E: Did not find keyword in table file + +Keyword used in pair_coeff command was not found in table file. + +E: Bitmapped table is incorrect length in table file + +Number of table entries is not a correct power of 2. + +E: Invalid keyword in pair table parameters + +Keyword used in list of table parameters is not recognized. + +E: Pair table parameters did not set N + +List of pair table parameters must include N setting. + +E: Pair table cutoffs must all be equal to use with KSpace + +When using pair style table with a long-range KSpace solver, the +cutoffs for all atom type pairs must all be the same, since the +long-range solver starts at that cutoff. + +*/ diff --git a/src/KOKKOS/verlet_kokkos.cpp b/src/KOKKOS/verlet_kokkos.cpp new file mode 100644 index 0000000000..2883cb06e3 --- /dev/null +++ b/src/KOKKOS/verlet_kokkos.cpp @@ -0,0 +1,443 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. 
This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "string.h" +#include "verlet_kokkos.h" +#include "neighbor.h" +#include "domain.h" +#include "comm.h" +#include "atom.h" +#include "atom_kokkos.h" +#include "atom_masks.h" +#include "force.h" +#include "pair.h" +#include "bond.h" +#include "angle.h" +#include "dihedral.h" +#include "improper.h" +#include "kspace.h" +#include "output.h" +#include "update.h" +#include "modify.h" +#include "compute.h" +#include "fix.h" +#include "timer.h" +#include "memory.h" +#include "error.h" + +#include <ctime> + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +VerletKokkos::VerletKokkos(LAMMPS *lmp, int narg, char **arg) : + Verlet(lmp, narg, arg) +{ + atomKK = (AtomKokkos *) atom; +} + +/* ---------------------------------------------------------------------- + setup before run +------------------------------------------------------------------------- */ + +void VerletKokkos::setup() +{ + if (comm->me == 0 && screen) fprintf(screen,"Setting up run ...\n"); + update->setupflag = 1; + + // setup domain, communication and neighboring + // acquire ghosts + // build neighbor lists + + atomKK->modified(Host,ALL_MASK); + + atomKK->setup(); + modify->setup_pre_exchange(); + // debug + atomKK->sync(Host,ALL_MASK); + atomKK->modified(Host,ALL_MASK); + if (triclinic) domain->x2lamda(atomKK->nlocal); + domain->pbc(); + + atomKK->sync(Host,ALL_MASK); + + domain->reset_box(); + comm->setup(); + if (neighbor->style) neighbor->setup_bins(); + comm->exchange(); + if (atomKK->sortfreq > 0) atomKK->sort(); + comm->borders(); + if (triclinic) domain->lamda2x(atomKK->nlocal+atomKK->nghost); + + atomKK->sync(Host,ALL_MASK); + + domain->image_check(); + domain->box_too_small_check(); + modify->setup_pre_neighbor(); + + 
atomKK->modified(Host,ALL_MASK); + + neighbor->build(); + neighbor->ncalls = 0; + + // compute all forces + + ev_set(update->ntimestep); + force_clear(); + modify->setup_pre_force(vflag); + + if (pair_compute_flag) force->pair->compute(eflag,vflag); + else if (force->pair) force->pair->compute_dummy(eflag,vflag); + + if (atomKK->molecular) { + if (force->bond) force->bond->compute(eflag,vflag); + if (force->angle) force->angle->compute(eflag,vflag); + if (force->dihedral) force->dihedral->compute(eflag,vflag); + if (force->improper) force->improper->compute(eflag,vflag); + } + + if (force->kspace) { + force->kspace->setup(); + if (kspace_compute_flag) force->kspace->compute(eflag,vflag); + else force->kspace->compute_dummy(eflag,vflag); + } + + if (force->newton) comm->reverse_comm(); + + modify->setup(vflag); + output->setup(); + update->setupflag = 0; +} + +/* ---------------------------------------------------------------------- + setup without output + flag = 0 = just force calculation + flag = 1 = reneighbor and force calculation +------------------------------------------------------------------------- */ + +void VerletKokkos::setup_minimal(int flag) +{ + update->setupflag = 1; + + // setup domain, communication and neighboring + // acquire ghosts + // build neighbor lists + + if (flag) { + atomKK->modified(Host,ALL_MASK); + + modify->setup_pre_exchange(); + // debug + atomKK->sync(Host,ALL_MASK); + atomKK->modified(Host,ALL_MASK); + + if (triclinic) domain->x2lamda(atomKK->nlocal); + domain->pbc(); + + atomKK->sync(Host,ALL_MASK); + + domain->reset_box(); + comm->setup(); + if (neighbor->style) neighbor->setup_bins(); + comm->exchange(); + comm->borders(); + if (triclinic) domain->lamda2x(atomKK->nlocal+atomKK->nghost); + + atomKK->sync(Host,ALL_MASK); + + domain->image_check(); + domain->box_too_small_check(); + modify->setup_pre_neighbor(); + + atomKK->modified(Host,ALL_MASK); + + neighbor->build(); + neighbor->ncalls = 0; + } + + // compute all forces + + 
ev_set(update->ntimestep); + force_clear(); + modify->setup_pre_force(vflag); + + if (pair_compute_flag) force->pair->compute(eflag,vflag); + else if (force->pair) force->pair->compute_dummy(eflag,vflag); + + if (atomKK->molecular) { + if (force->bond) force->bond->compute(eflag,vflag); + if (force->angle) force->angle->compute(eflag,vflag); + if (force->dihedral) force->dihedral->compute(eflag,vflag); + if (force->improper) force->improper->compute(eflag,vflag); + } + + if (force->kspace) { + force->kspace->setup(); + if (kspace_compute_flag) force->kspace->compute(eflag,vflag); + else force->kspace->compute_dummy(eflag,vflag); + } + + if (force->newton) comm->reverse_comm(); + + modify->setup(vflag); + update->setupflag = 0; +} + +/* ---------------------------------------------------------------------- + run for N steps +------------------------------------------------------------------------- */ + +void VerletKokkos::run(int n) +{ + bigint ntimestep; + int nflag,sortflag; + + int n_post_integrate = modify->n_post_integrate; + int n_pre_exchange = modify->n_pre_exchange; + int n_pre_neighbor = modify->n_pre_neighbor; + int n_pre_force = modify->n_pre_force; + int n_post_force = modify->n_post_force; + int n_end_of_step = modify->n_end_of_step; + + if (atomKK->sortfreq > 0) sortflag = 1; + else sortflag = 0; + + static double time = 0.0; + static int count = 0; + atomKK->sync(Device,ALL_MASK); + Kokkos::Impl::Timer ktimer; + + for (int i = 0; i < n; i++) { + + ntimestep = ++update->ntimestep; + ev_set(ntimestep); + + // initial time integration + + ktimer.reset(); + modify->initial_integrate(vflag); + time += ktimer.seconds(); + if (n_post_integrate) modify->post_integrate(); + + // regular communication vs neighbor list rebuild + + nflag = neighbor->decide(); + + if (nflag == 0) { + timer->stamp(); + comm->forward_comm(); + timer->stamp(TIME_COMM); + } else { + // added debug + //atomKK->sync(Host,ALL_MASK); + //atomKK->modified(Host,ALL_MASK); + + if 
(n_pre_exchange) modify->pre_exchange(); + // debug + //atomKK->sync(Host,ALL_MASK); + //atomKK->modified(Host,ALL_MASK); + if (triclinic) domain->x2lamda(atomKK->nlocal); + domain->pbc(); + if (domain->box_change) { + domain->reset_box(); + comm->setup(); + if (neighbor->style) neighbor->setup_bins(); + } + timer->stamp(); + + // added debug + //atomKK->sync(Device,ALL_MASK); + //atomKK->modified(Device,ALL_MASK); + + comm->exchange(); + if (sortflag && ntimestep >= atomKK->nextsort) atomKK->sort(); + comm->borders(); + + // added debug + //atomKK->sync(Host,ALL_MASK); + //atomKK->modified(Host,ALL_MASK); + + if (triclinic) domain->lamda2x(atomKK->nlocal+atomKK->nghost); + + timer->stamp(TIME_COMM); + if (n_pre_neighbor) modify->pre_neighbor(); + neighbor->build(); + timer->stamp(TIME_NEIGHBOR); + } + + // force computations + // important for pair to come before bonded contributions + // since some bonded potentials tally pairwise energy/virial + // and Pair:ev_tally() needs to be called before any tallying + + force_clear(); + // added for debug + //atomKK->k_x.sync<LMPHostType>(); + //atomKK->k_f.sync<LMPHostType>(); + //atomKK->k_f.modify<LMPHostType>(); + if (n_pre_force) modify->pre_force(vflag); + + timer->stamp(); + + if (pair_compute_flag) { + force->pair->compute(eflag,vflag); + timer->stamp(TIME_PAIR); + } + + if (atomKK->molecular) { + if (force->bond) force->bond->compute(eflag,vflag); + if (force->angle) force->angle->compute(eflag,vflag); + if (force->dihedral) force->dihedral->compute(eflag,vflag); + if (force->improper) force->improper->compute(eflag,vflag); + timer->stamp(TIME_BOND); + } + + if (kspace_compute_flag) { + force->kspace->compute(eflag,vflag); + timer->stamp(TIME_KSPACE); + } + + // reverse communication of forces + + if (force->newton) { + atomKK->sync(Host,F_MASK); + comm->reverse_comm(); + atomKK->modified(Host,F_MASK); + timer->stamp(TIME_COMM); + } + + // force modifications, final time integration, diagnostics + + 
ktimer.reset(); + + if (n_post_force) modify->post_force(vflag); + modify->final_integrate(); + if (n_end_of_step) modify->end_of_step(); + + time += ktimer.seconds(); + + // all output + + if (ntimestep == output->next) { + atomKK->sync(Host,ALL_MASK); + + timer->stamp(); + output->write(ntimestep); + timer->stamp(TIME_OUTPUT); + } + } +} + +/* ---------------------------------------------------------------------- + clear force on own & ghost atoms + clear other arrays as needed +------------------------------------------------------------------------- */ + +void VerletKokkos::force_clear() +{ + int i; + + if (external_force_clear) return; + + // clear force on all particles + // if either newton flag is set, also include ghosts + // when using threads always clear all forces. + + if (neighbor->includegroup == 0) { + int nall; + if (force->newton) nall = atomKK->nlocal + atomKK->nghost; + else nall = atomKK->nlocal; + + size_t nbytes = sizeof(double) * nall; + + if (nbytes) { + if (atomKK->k_f.modified_host > atomKK->k_f.modified_device) { + memset_kokkos(atomKK->k_f.view<LMPHostType>()); + atomKK->modified(Host,F_MASK); + } else { + memset_kokkos(atomKK->k_f.view<LMPDeviceType>()); + atomKK->modified(Device,F_MASK); + } + if (torqueflag) memset(&(atomKK->torque[0][0]),0,3*nbytes); + if (erforceflag) memset(&(atomKK->erforce[0]), 0, nbytes); + if (e_flag) memset(&(atomKK->de[0]), 0, nbytes); + if (rho_flag) memset(&(atomKK->drho[0]), 0, nbytes); + } + + // neighbor includegroup flag is set + // clear force only on initial nfirst particles + // if either newton flag is set, also include ghosts + + } else { + int nall = atomKK->nfirst; + if (atomKK->k_f.modified_host > atomKK->k_f.modified_device) { + memset_kokkos(atomKK->k_f.view<LMPHostType>()); + atomKK->modified(Host,F_MASK); + } else { + memset_kokkos(atomKK->k_f.view<LMPDeviceType>()); + atomKK->modified(Device,F_MASK); + } + if (torqueflag) { + double **torque = atomKK->torque; + for (i = 0; i < nall; i++) { 
+ torque[i][0] = 0.0; + torque[i][1] = 0.0; + torque[i][2] = 0.0; + } + } + + if (erforceflag) { + double *erforce = atomKK->erforce; + for (i = 0; i < nall; i++) erforce[i] = 0.0; + } + + if (e_flag) { + double *de = atomKK->de; + for (i = 0; i < nall; i++) de[i] = 0.0; + } + + if (rho_flag) { + double *drho = atomKK->drho; + for (i = 0; i < nall; i++) drho[i] = 0.0; + } + + if (force->newton) { + nall = atomKK->nlocal + atomKK->nghost; + + if (torqueflag) { + double **torque = atomKK->torque; + for (i = atomKK->nlocal; i < nall; i++) { + torque[i][0] = 0.0; + torque[i][1] = 0.0; + torque[i][2] = 0.0; + } + } + + if (erforceflag) { + double *erforce = atomKK->erforce; + for (i = atomKK->nlocal; i < nall; i++) erforce[i] = 0.0; + } + + if (e_flag) { + double *de = atomKK->de; + for (i = 0; i < nall; i++) de[i] = 0.0; + } + + if (rho_flag) { + double *drho = atomKK->drho; + for (i = 0; i < nall; i++) drho[i] = 0.0; + } + } + } +} diff --git a/src/KOKKOS/verlet_kokkos.h b/src/KOKKOS/verlet_kokkos.h new file mode 100644 index 0000000000..63531bda2d --- /dev/null +++ b/src/KOKKOS/verlet_kokkos.h @@ -0,0 +1,48 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef INTEGRATE_CLASS + +IntegrateStyle(verlet/kk,VerletKokkos) + +#else + +#ifndef LMP_VERLET_KOKKOS_H +#define LMP_VERLET_KOKKOS_H + +#include "verlet.h" + +namespace LAMMPS_NS { + +class VerletKokkos : public Verlet { + public: + VerletKokkos(class LAMMPS *, int, char **); + ~VerletKokkos() {} + void setup(); + void setup_minimal(int); + void run(int); + + protected: + class AtomKokkos *atomKK; + + void force_clear(); +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/MAKE/Makefile.cuda b/src/MAKE/Makefile.cuda new file mode 100755 index 0000000000..61b1738ba8 --- /dev/null +++ b/src/MAKE/Makefile.cuda @@ -0,0 +1,111 @@ +# cuda = RedHat Linux box, nvcc for Kokkos, MPICH2, FFTW + +SHELL = /bin/sh + +# --------------------------------------------------------------------- +# compiler/linker settings +# specify flags and libraries needed for your compiler + +CC = nvcc +CCFLAGS = -g -O3 -arch=sm_20 +SHFLAGS = -fPIC +DEPFLAGS = -M + +LINK = g++ +LINKFLAGS = -g -O +LIB = +SIZE = size + +ARCHIVE = ar +ARFLAGS = -rc +SHLIBFLAGS = -shared + +# --------------------------------------------------------------------- +# LAMMPS-specific settings +# specify settings for LAMMPS features you will use +# if you change any -D setting, do full re-compile after "make clean" + +# LAMMPS ifdef settings, OPTIONAL +# see possible settings in doc/Section_start.html#2_2 (step 4) + +LMP_INC = -DLAMMPS_GZIP -DLAMMPS_JPEG + +# MPI library, REQUIRED +# see discussion in doc/Section_start.html#2_2 (step 5) +# can point to dummy MPI library in src/STUBS as in Makefile.serial +# INC = path for mpi.h, MPI compiler settings +# PATH = path for MPI library +# LIB = name of MPI library + +MPI_INC = -DMPICH_SKIP_MPICXX +MPI_PATH = +MPI_LIB = -lmpich -lmpl -lpthread + +# FFT library, OPTIONAL +# see discussion in doc/Section_start.html#2_2 (step 6) +# can be left blank to use provided KISS FFT 
library +# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings +# PATH = path for FFT library +# LIB = name of FFT library + +FFT_INC = -DFFT_FFTW +FFT_PATH = +FFT_LIB = -lfftw + +# JPEG and/or PNG library, OPTIONAL +# see discussion in doc/Section_start.html#2_2 (step 7) +# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC +# INC = path(s) for jpeglib.h and/or png.h +# PATH = path(s) for JPEG library and/or PNG library +# LIB = name(s) of JPEG library and/or PNG library + +JPG_INC = +JPG_PATH = +JPG_LIB = -ljpeg + +# --------------------------------------------------------------------- +# build rules and dependencies +# no need to edit this section + +include Makefile.package.settings +include Makefile.package + +EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC) +EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH) +EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB) + +# Path to src files + +vpath %.cpp .. +vpath %.h .. 
+ +# Link target + +$(EXE): $(OBJ) + $(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE) + $(SIZE) $(EXE) + +# Library targets + +lib: $(OBJ) + $(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ) + +shlib: $(OBJ) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + +# Compilation rules + +%.o:%.cu + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< + +%.o:%.cpp + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< + +%.d:%.cpp + $(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@ + +# Individual dependencies + +DEPENDS = $(OBJ:.o=.d) +sinclude $(DEPENDS) diff --git a/src/Makefile b/src/Makefile index 8241135cc2..f8e70a94dc 100755 --- a/src/Makefile +++ b/src/Makefile @@ -14,7 +14,7 @@ OBJ = $(SRC:.cpp=.o) # Package variables PACKAGE = asphere body class2 colloid dipole fld gpu granular kim \ - kspace manybody mc meam misc molecule mpiio opt peri poems \ + kokkos kspace manybody mc meam misc molecule mpiio opt peri poems \ reax replica rigid shock srd voronoi xtc PACKUSER = user-atc user-awpmd user-cg-cmm user-colvars \ diff --git a/src/atom_vec.cpp b/src/atom_vec.cpp index 0fd8043d85..06c61caa2f 100644 --- a/src/atom_vec.cpp +++ b/src/atom_vec.cpp @@ -80,8 +80,10 @@ void AtomVec::init() deform_groupbit = domain->deform_groupbit; h_rate = domain->h_rate; - if (lmp->cuda != NULL && cudable == false) + if (lmp->cuda != NULL && !cudable) error->all(FLERR,"USER-CUDA package requires a cuda enabled atom_style"); + if (lmp->kokkos != NULL && !kokkosable) + error->all(FLERR,"KOKKOS package requires a kokkos enabled atom_style"); } /* ---------------------------------------------------------------------- diff --git a/src/compute_property_local.cpp b/src/compute_property_local.cpp index da34de08ae..82d85d4bec 100644 --- a/src/compute_property_local.cpp +++ b/src/compute_property_local.cpp @@ -334,7 +334,7 @@ void ComputePropertyLocal::compute_local() int ComputePropertyLocal::count_pairs(int allflag, int forceflag) { - int 
i,j,m,n,ii,jj,inum,jnum,itype,jtype; + int i,j,m,ii,jj,inum,jnum,itype,jtype; double xtmp,ytmp,ztmp,delx,dely,delz,rsq; int *ilist,*jlist,*numneigh,**firstneigh; @@ -358,7 +358,7 @@ int ComputePropertyLocal::count_pairs(int allflag, int forceflag) double **cutsq = force->pair->cutsq; - m = n = 0; + m = 0; for (ii = 0; ii < inum; ii++) { i = ilist[ii]; if (!(mask[i] & groupbit)) continue; -- GitLab