From b4785cd03877b05ef147b0b4679299181f8c4d3a Mon Sep 17 00:00:00 2001 From: stamoor <stamoor@f3b2605a-c512-4ea7-a41b-209d697bcdaa> Date: Thu, 1 Sep 2016 20:53:40 +0000 Subject: [PATCH] Adding Kokkos version of PPPM git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15535 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/KOKKOS/Install.sh | 6 + src/KOKKOS/gridcomm_kokkos.cpp | 614 ++++ src/KOKKOS/gridcomm_kokkos.h | 96 + src/KOKKOS/kokkos_type.h | 70 + src/KOKKOS/math_special_kokkos.cpp | 534 +++ src/KOKKOS/math_special_kokkos.h | 100 + .../pair_lj_charmm_coul_long_kokkos.cpp | 23 + src/KOKKOS/pair_lj_cut_coul_long_kokkos.cpp | 28 + src/KOKKOS/pppm_kokkos.cpp | 3193 +++++++++++++++++ src/KOKKOS/pppm_kokkos.h | 552 +++ src/KSPACE/pppm.cpp | 2 + src/kspace.cpp | 3 + src/kspace.h | 7 + 13 files changed, 5228 insertions(+) create mode 100644 src/KOKKOS/gridcomm_kokkos.cpp create mode 100644 src/KOKKOS/gridcomm_kokkos.h create mode 100644 src/KOKKOS/math_special_kokkos.cpp create mode 100644 src/KOKKOS/math_special_kokkos.h create mode 100644 src/KOKKOS/pppm_kokkos.cpp create mode 100644 src/KOKKOS/pppm_kokkos.h diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh index c269de41d3..c45be8c973 100644 --- a/src/KOKKOS/Install.sh +++ b/src/KOKKOS/Install.sh @@ -87,6 +87,8 @@ action fix_setforce_kokkos.cpp action fix_setforce_kokkos.h action fix_wall_reflect_kokkos.cpp action fix_wall_reflect_kokkos.h +action gridcomm_kokkos.cpp gridcomm.cpp +action gridcomm_kokkos.h gridcomm.h action improper_harmonic_kokkos.cpp improper_harmonic.cpp action improper_harmonic_kokkos.h improper_harmonic.h action kokkos.cpp @@ -102,6 +104,8 @@ action neigh_list_kokkos.cpp action neigh_list_kokkos.h action neighbor_kokkos.cpp action neighbor_kokkos.h +action math_special_kokkos.cpp +action math_special_kokkos.h action pair_buck_coul_cut_kokkos.cpp action pair_buck_coul_cut_kokkos.h action pair_buck_coul_long_kokkos.cpp pair_buck_coul_long.cpp @@ -167,6 +171,8 @@ action pair_tersoff_mod_kokkos.cpp pair_tersoff_mod.cpp action pair_tersoff_mod_kokkos.h pair_tersoff_mod.h action pair_tersoff_zbl_kokkos.cpp pair_tersoff_zbl.cpp action pair_tersoff_zbl_kokkos.h pair_tersoff_zbl.h +action pppm_kokkos.cpp pppm.cpp +action pppm_kokkos.h pppm.h action region_block_kokkos.cpp action region_block_kokkos.h action verlet_kokkos.cpp diff --git a/src/KOKKOS/gridcomm_kokkos.cpp b/src/KOKKOS/gridcomm_kokkos.cpp new file mode 100644 index 0000000000..6871ef67ae --- /dev/null +++ b/src/KOKKOS/gridcomm_kokkos.cpp @@ -0,0 +1,614 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include <mpi.h> +#include "gridcomm_kokkos.h" +#include "comm.h" +#include "kspace.h" +#include "memory.h" +#include "error.h" + +using namespace LAMMPS_NS; + +#define SWAPDELTA 8 + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +GridCommKokkos<DeviceType>::GridCommKokkos(LAMMPS *lmp, MPI_Comm gcomm, int forward, int reverse, + int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, + int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi, + int pxlo, int pxhi, int pylo, int pyhi, int pzlo, int pzhi) + : Pointers(lmp) +{ + gridcomm = gcomm; + MPI_Comm_rank(gridcomm,&me); + + nforward = forward; + nreverse = reverse; + + inxlo = ixlo; + inxhi = ixhi; + inylo = iylo; + inyhi = iyhi; + inzlo = izlo; + inzhi = izhi; + + outxlo = oxlo; + outxhi = oxhi; + outylo = oylo; + outyhi = oyhi; + outzlo = ozlo; + outzhi = ozhi; + + outxlo_max = oxlo; + outxhi_max = oxhi; + outylo_max = oylo; + outyhi_max = oyhi; + outzlo_max = ozlo; + outzhi_max = ozhi; + + procxlo = pxlo; + procxhi = pxhi; + procylo = pylo; + procyhi = pyhi; + proczlo = pzlo; + proczhi = pzhi; + + nswap = 0; + swap = NULL; + //buf1 = buf2 = NULL; +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +GridCommKokkos<DeviceType>::GridCommKokkos(LAMMPS *lmp, MPI_Comm gcomm, int forward, int reverse, + int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, + int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi, + int oxlo_max, int oxhi_max, int oylo_max, int oyhi_max, + int ozlo_max, int ozhi_max, + int pxlo, int pxhi, int pylo, int pyhi, int pzlo, int pzhi) + : Pointers(lmp) +{ + gridcomm = gcomm; + MPI_Comm_rank(gridcomm,&me); + + nforward = forward; + nreverse = reverse; + + inxlo = ixlo; + inxhi = ixhi; + inylo = iylo; + inyhi = iyhi; + inzlo = izlo; + inzhi = izhi; + + outxlo = oxlo; + outxhi = oxhi; + outylo = oylo; + outyhi = oyhi; + outzlo = ozlo; + outzhi = ozhi; + + outxlo_max = oxlo_max; + outxhi_max = oxhi_max; + outylo_max = oylo_max; + outyhi_max = oyhi_max; + outzlo_max = ozlo_max; + outzhi_max = ozhi_max; + + procxlo = pxlo; + procxhi = pxhi; + procylo = pylo; + procyhi = pyhi; + proczlo = pzlo; + proczhi = pzhi; + + nswap = 0; + swap = NULL; + //buf1 = buf2 = NULL; +} + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +GridCommKokkos<DeviceType>::~GridCommKokkos() +{ + for (int i = 0; i < nswap; i++) { + //memory->destroy_kokkos(swap[i].k_packlist,swap[i].packlist); + //memory->destroy_kokkos(swap[i].k_unpacklist,swap[i].unpacklist); + } + memory->sfree(swap); + + //memory->destroy(buf1); + //memory->destroy(buf2); +} + +/* ---------------------------------------------------------------------- + notify 6 neighbor procs how many ghost grid planes I need from them + ghostxlo = # of lower grid planes I own that are needed from me + by procxlo to become its upper ghost planes + ghostxhi = # of upper grid planes I own that are needed from me + by procxhi to become its lower ghost planes + if no neighbor proc, value is from self +------------------------------------------------------------------------- */ + +template<class DeviceType> +void GridCommKokkos<DeviceType>::ghost_notify() +{ + int nplanes = inxlo - outxlo; + if (procxlo != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,procxlo,0, + &ghostxhi,1,MPI_INT,procxhi,0,gridcomm,MPI_STATUS_IGNORE); + else ghostxhi = nplanes; + + nplanes = outxhi - inxhi; + if (procxhi != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,procxhi,0, + &ghostxlo,1,MPI_INT,procxlo,0,gridcomm,MPI_STATUS_IGNORE); + else ghostxlo = nplanes; + + nplanes = inylo - outylo; + if (procylo != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,procylo,0, + &ghostyhi,1,MPI_INT,procyhi,0,gridcomm,MPI_STATUS_IGNORE); + else ghostyhi = nplanes; + + nplanes = outyhi - inyhi; + if (procyhi != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,procyhi,0, + &ghostylo,1,MPI_INT,procylo,0,gridcomm,MPI_STATUS_IGNORE); + else ghostylo = nplanes; + + nplanes = inzlo - outzlo; + if (proczlo != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,proczlo,0, + &ghostzhi,1,MPI_INT,proczhi,0,gridcomm,MPI_STATUS_IGNORE); + else ghostzhi = nplanes; + + nplanes = outzhi - inzhi; + if (proczhi != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,proczhi,0, + &ghostzlo,1,MPI_INT,proczlo,0,gridcomm,MPI_STATUS_IGNORE); + else ghostzlo = nplanes; +} + +/* ---------------------------------------------------------------------- + check if all ghost grid comm needs overlap into non nearest-neighbor proc + if yes, return 1, else return 0 +------------------------------------------------------------------------- */ + +template<class DeviceType> +int GridCommKokkos<DeviceType>::ghost_overlap() +{ + int nearest = 0; + if (ghostxlo > inxhi-inxlo+1) nearest = 1; + if (ghostxhi > inxhi-inxlo+1) nearest = 1; + if (ghostylo > inyhi-inylo+1) nearest = 1; + if (ghostyhi > inyhi-inylo+1) nearest = 1; + if (ghostzlo > inzhi-inzlo+1) nearest = 1; + if (ghostzhi > inzhi-inzlo+1) nearest = 1; + + int nearest_all; + MPI_Allreduce(&nearest,&nearest_all,1,MPI_INT,MPI_MIN,gridcomm); + + return nearest_all; +} + +/* ---------------------------------------------------------------------- + create swap stencil for grid own/ghost communication + swaps covers all 3 dimensions and both directions + swaps cover multiple iterations in a direction if need grid pts + from further away than nearest-neighbor proc + same swap list used by forward and reverse communication +------------------------------------------------------------------------- */ + +template<class DeviceType> +void GridCommKokkos<DeviceType>::setup() +{ + int nsent,sendfirst,sendlast,recvfirst,recvlast; + int sendplanes,recvplanes; + int notdoneme,notdone; + + int maxswap = 6; + swap = (Swap *) memory->smalloc(maxswap*sizeof(Swap),"Commgrid:swap"); + k_packlist = DAT::tdual_int_2d("Commgrid:packlist",maxswap,1); + k_unpacklist = DAT::tdual_int_2d("Commgrid:unpacklist",maxswap,1); + nswap = 0; + + // send own grid pts to -x processor, recv ghost grid pts from +x processor + + nsent = 0; + sendfirst = inxlo; + sendlast = inxhi; + recvfirst = inxhi+1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) { + maxswap += SWAPDELTA; + swap = (Swap *) + memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap"); + k_packlist.resize(maxswap,k_packlist.dimension_1()); + k_unpacklist.resize(maxswap,k_unpacklist.dimension_1()); + } + + swap[nswap].sendproc = procxlo; + swap[nswap].recvproc = procxhi; + sendplanes = MIN(sendlast-sendfirst+1,ghostxlo-nsent); + swap[nswap].npack = + indices(k_packlist,nswap, + sendfirst,sendfirst+sendplanes-1,inylo,inyhi,inzlo,inzhi); + + if (procxlo != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,procxlo,0, + &recvplanes,1,MPI_INT,procxhi,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(k_unpacklist,nswap, + recvfirst,recvfirst+recvplanes-1,inylo,inyhi,inzlo,inzhi); + + nsent += sendplanes; + sendfirst += sendplanes; + sendlast += recvplanes; + recvfirst += recvplanes; + nswap++; + + if (nsent < ghostxlo) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // send own grid pts to +x processor, recv ghost grid pts from -x processor + + nsent = 0; + sendfirst = inxlo; + sendlast = inxhi; + recvlast = inxlo-1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) { + maxswap += 1; + swap = (Swap *) + memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap"); + k_packlist.resize(maxswap,k_packlist.dimension_1()); + k_unpacklist.resize(maxswap,k_unpacklist.dimension_1()); + } + + swap[nswap].sendproc = procxhi; + swap[nswap].recvproc = procxlo; + sendplanes = MIN(sendlast-sendfirst+1,ghostxhi-nsent); + swap[nswap].npack = + indices(k_packlist,nswap, + sendlast-sendplanes+1,sendlast,inylo,inyhi,inzlo,inzhi); + + if (procxhi != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,procxhi,0, + &recvplanes,1,MPI_INT,procxlo,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(k_unpacklist,nswap, + recvlast-recvplanes+1,recvlast,inylo,inyhi,inzlo,inzhi); + + nsent += sendplanes; + sendfirst -= recvplanes; + sendlast -= sendplanes; + recvlast -= recvplanes; + nswap++; + + if (nsent < ghostxhi) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // send own grid pts to -y processor, recv ghost grid pts from +y processor + + nsent = 0; + sendfirst = inylo; + sendlast = inyhi; + recvfirst = inyhi+1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) { + maxswap += SWAPDELTA; + swap = (Swap *) + memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap"); + k_packlist.resize(maxswap,k_packlist.dimension_1()); + k_unpacklist.resize(maxswap,k_unpacklist.dimension_1()); + } + + swap[nswap].sendproc = procylo; + swap[nswap].recvproc = procyhi; + sendplanes = MIN(sendlast-sendfirst+1,ghostylo-nsent); + swap[nswap].npack = + indices(k_packlist,nswap, + outxlo,outxhi,sendfirst,sendfirst+sendplanes-1,inzlo,inzhi); + + if (procylo != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,procylo,0, + &recvplanes,1,MPI_INT,procyhi,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(k_unpacklist,nswap, + outxlo,outxhi,recvfirst,recvfirst+recvplanes-1,inzlo,inzhi); + + nsent += sendplanes; + sendfirst += sendplanes; + sendlast += recvplanes; + recvfirst += recvplanes; + nswap++; + + if (nsent < ghostylo) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // send own grid pts to +y processor, recv ghost grid pts from -y processor + + nsent = 0; + sendfirst = inylo; + sendlast = inyhi; + recvlast = inylo-1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) { + maxswap += 1; + swap = (Swap *) + memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap"); + k_packlist.resize(maxswap,k_packlist.dimension_1()); + k_unpacklist.resize(maxswap,k_unpacklist.dimension_1()); + } + + swap[nswap].sendproc = procyhi; + swap[nswap].recvproc = procylo; + sendplanes = MIN(sendlast-sendfirst+1,ghostyhi-nsent); + swap[nswap].npack = + indices(k_packlist,nswap, + outxlo,outxhi,sendlast-sendplanes+1,sendlast,inzlo,inzhi); + + if (procyhi != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,procyhi,0, + &recvplanes,1,MPI_INT,procylo,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(k_unpacklist,nswap, + outxlo,outxhi,recvlast-recvplanes+1,recvlast,inzlo,inzhi); + + nsent += sendplanes; + sendfirst -= recvplanes; + sendlast -= sendplanes; + recvlast -= recvplanes; + nswap++; + + if (nsent < ghostyhi) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // send own grid pts to -z processor, recv ghost grid pts from +z processor + + nsent = 0; + sendfirst = inzlo; + sendlast = inzhi; + recvfirst = inzhi+1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) { + maxswap += SWAPDELTA; + swap = (Swap *) + memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap"); + k_packlist.resize(maxswap,k_packlist.dimension_1()); + k_unpacklist.resize(maxswap,k_unpacklist.dimension_1()); + } + + swap[nswap].sendproc = proczlo; + swap[nswap].recvproc = proczhi; + sendplanes = MIN(sendlast-sendfirst+1,ghostzlo-nsent); + swap[nswap].npack = + indices(k_packlist,nswap, + outxlo,outxhi,outylo,outyhi,sendfirst,sendfirst+sendplanes-1); + + if (proczlo != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,proczlo,0, + &recvplanes,1,MPI_INT,proczhi,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(k_unpacklist,nswap, + outxlo,outxhi,outylo,outyhi,recvfirst,recvfirst+recvplanes-1); + + nsent += sendplanes; + sendfirst += sendplanes; + sendlast += recvplanes; + recvfirst += recvplanes; + nswap++; + + if (nsent < ghostzlo) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // send own grid pts to +z processor, recv ghost grid pts from -z processor + + nsent = 0; + sendfirst = inzlo; + sendlast = inzhi; + recvlast = inzlo-1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) { + maxswap += 1; + swap = (Swap *) + memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap"); + k_packlist.resize(maxswap,k_packlist.dimension_1()); + k_unpacklist.resize(maxswap,k_unpacklist.dimension_1()); + } + + swap[nswap].sendproc = proczhi; + swap[nswap].recvproc = proczlo; + sendplanes = MIN(sendlast-sendfirst+1,ghostzhi-nsent); + swap[nswap].npack = + indices(k_packlist,nswap, + outxlo,outxhi,outylo,outyhi,sendlast-sendplanes+1,sendlast); + + if (proczhi != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,proczhi,0, + &recvplanes,1,MPI_INT,proczlo,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(k_unpacklist,nswap, + outxlo,outxhi,outylo,outyhi,recvlast-recvplanes+1,recvlast); + + nsent += sendplanes; + sendfirst -= recvplanes; + sendlast -= sendplanes; + recvlast -= recvplanes; + nswap++; + + if (nsent < ghostzhi) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // nbuf = max of any forward/reverse pack/unpack + + nbuf = 0; + for (int i = 0; i < nswap; i++) { + nbuf = MAX(nbuf,swap[i].npack); + nbuf = MAX(nbuf,swap[i].nunpack); + } + nbuf *= MAX(nforward,nreverse); + //memory->create(buf1,nbuf,"Commgrid:buf1"); + k_buf1 = DAT::tdual_FFT_SCALAR_1d("Commgrid:buf1",nbuf); + //memory->create(buf2,nbuf,"Commgrid:buf2"); + k_buf2 = DAT::tdual_FFT_SCALAR_1d("Commgrid:buf2",nbuf); +} + +/* ---------------------------------------------------------------------- + use swap list in forward order to acquire copy of all needed ghost grid pts +------------------------------------------------------------------------- */ + +template<class DeviceType> +void GridCommKokkos<DeviceType>::forward_comm(KSpace *kspace, int which) +{ + k_packlist.sync<DeviceType>(); + k_unpacklist.sync<DeviceType>(); + + for (int m = 0; m < nswap; m++) { + if (swap[m].sendproc == me) + kspace->pack_forward_kokkos(which,k_buf2,swap[m].npack,k_packlist,m); + else + kspace->pack_forward_kokkos(which,k_buf1,swap[m].npack,k_packlist,m); + + if (swap[m].sendproc != me) { + MPI_Irecv(k_buf2.view<DeviceType>().ptr_on_device(),nforward*swap[m].nunpack,MPI_FFT_SCALAR, + swap[m].recvproc,0,gridcomm,&request); + MPI_Send(k_buf1.view<DeviceType>().ptr_on_device(),nforward*swap[m].npack,MPI_FFT_SCALAR, + swap[m].sendproc,0,gridcomm); + MPI_Wait(&request,MPI_STATUS_IGNORE); + } + + kspace->unpack_forward_kokkos(which,k_buf2,swap[m].nunpack,k_unpacklist,m); + } +} + +/* ---------------------------------------------------------------------- + use swap list in reverse order to compute fully summed value + for each owned grid pt that some other proc has copy of as a ghost grid pt +------------------------------------------------------------------------- */ + +template<class DeviceType> +void GridCommKokkos<DeviceType>::reverse_comm(KSpace *kspace, int which) +{ + k_packlist.sync<DeviceType>(); + k_unpacklist.sync<DeviceType>(); + + for (int m = nswap-1; m >= 0; m--) { + if (swap[m].recvproc == me) + kspace->pack_reverse_kokkos(which,k_buf2,swap[m].nunpack,k_unpacklist,m); + else + kspace->pack_reverse_kokkos(which,k_buf1,swap[m].nunpack,k_unpacklist,m); + + if (swap[m].recvproc != me) { + MPI_Irecv(k_buf2.view<DeviceType>().ptr_on_device(),nreverse*swap[m].npack,MPI_FFT_SCALAR, + swap[m].sendproc,0,gridcomm,&request); + MPI_Send(k_buf1.view<DeviceType>().ptr_on_device(),nreverse*swap[m].nunpack,MPI_FFT_SCALAR, + swap[m].recvproc,0,gridcomm); + MPI_Wait(&request,MPI_STATUS_IGNORE); + } + + kspace->unpack_reverse_kokkos(which,k_buf2,swap[m].npack,k_packlist,m); + } +} + +/* ---------------------------------------------------------------------- + create 1d list of offsets into 3d array section (xlo:xhi,ylo:yhi,zlo:zhi) + assume 3d array is allocated as (0:outxhi_max-outxlo_max+1,0:outyhi_max-outylo_max+1, + 0:outzhi_max-outzlo_max+1) +------------------------------------------------------------------------- */ + +template<class DeviceType> +int GridCommKokkos<DeviceType>::indices(DAT::tdual_int_2d &k_list, int index, + int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) +{ + int nmax = (xhi-xlo+1) * (yhi-ylo+1) * (zhi-zlo+1); + if (k_list.dimension_1() < nmax) + k_list.resize(k_list.dimension_0(),nmax); + + int nx = (outxhi_max-outxlo_max+1); + int ny = (outyhi_max-outylo_max+1); + + k_list.sync<LMPHostType>(); + + int n = 0; + int ix,iy,iz; + for (iz = zlo; iz <= zhi; iz++) + for (iy = ylo; iy <= yhi; iy++) + for (ix = xlo; ix <= xhi; ix++) + k_list.h_view(index,n++) = (iz-outzlo_max)*ny*nx + (iy-outylo_max)*nx + (ix-outxlo_max); + + k_list.modify<LMPHostType>(); + k_list.sync<DeviceType>(); + + return nmax; +} + + +/* ---------------------------------------------------------------------- + memory usage of send/recv bufs +------------------------------------------------------------------------- */ + +template<class DeviceType> +double GridCommKokkos<DeviceType>::memory_usage() +{ + double bytes = 2*nbuf * sizeof(double); + return bytes; +} + +namespace LAMMPS_NS { +template class GridCommKokkos<LMPDeviceType>; +#ifdef KOKKOS_HAVE_CUDA +template class GridCommKokkos<LMPHostType>; +#endif +} diff --git a/src/KOKKOS/gridcomm_kokkos.h b/src/KOKKOS/gridcomm_kokkos.h new file mode 100644 index 0000000000..a8b3dfdc85 --- /dev/null +++ b/src/KOKKOS/gridcomm_kokkos.h @@ -0,0 +1,96 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_GRIDCOMM_KOKKOS_H +#define LMP_GRIDCOMM_KOKKOS_H + +#include "pointers.h" +#include "kokkos_type.h" + +#ifdef FFT_SINGLE +typedef float FFT_SCALAR; +#define MPI_FFT_SCALAR MPI_FLOAT +#else +typedef double FFT_SCALAR; +#define MPI_FFT_SCALAR MPI_DOUBLE +#endif + +namespace LAMMPS_NS { + +template<class DeviceType> +class GridCommKokkos : protected Pointers { + public: + typedef DeviceType device_type; + typedef ArrayTypes<DeviceType> AT; + + GridCommKokkos(class LAMMPS *, MPI_Comm, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int); + GridCommKokkos(class LAMMPS *, MPI_Comm, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int); + ~GridCommKokkos(); + void ghost_notify(); + int ghost_overlap(); + void setup(); + void forward_comm(class KSpace *, int); + void reverse_comm(class KSpace *, int); + double memory_usage(); + + private: + int me; + int nforward,nreverse; + MPI_Comm gridcomm; + MPI_Request request; + + // in = inclusive indices of 3d grid chunk that I own + // out = inclusive indices of 3d grid chunk I own plus ghosts I use + // proc = 6 neighbor procs that surround me + // ghost = # of my owned grid planes needed from me + // by each of 6 neighbor procs to become their ghost planes + + int inxlo,inxhi,inylo,inyhi,inzlo,inzhi; + int outxlo,outxhi,outylo,outyhi,outzlo,outzhi; + int outxlo_max,outxhi_max,outylo_max,outyhi_max,outzlo_max,outzhi_max; + int procxlo,procxhi,procylo,procyhi,proczlo,proczhi; + int ghostxlo,ghostxhi,ghostylo,ghostyhi,ghostzlo,ghostzhi; + + int nbuf; + //FFT_SCALAR *buf1,*buf2; + DAT::tdual_FFT_SCALAR_1d k_buf1; + DAT::tdual_FFT_SCALAR_1d k_buf2; + + struct Swap { + int sendproc; // proc to send to for forward comm + int recvproc; // proc to recv from for forward comm + int npack; // # of datums to pack + int nunpack; // # of datums to unpack + //int *packlist; // 3d array offsets to pack + //int *unpacklist; // 3d array offsets to unpack + }; + + DAT::tdual_int_2d k_packlist; + DAT::tdual_int_2d k_unpacklist; + + int nswap; + Swap *swap; + + int indices(DAT::tdual_int_2d &, int, int, int, int, int, int, int); +}; + +} + +#endif diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h index 07a6e20f01..c1176122a7 100644 --- a/src/KOKKOS/kokkos_type.h +++ b/src/KOKKOS/kokkos_type.h @@ -14,6 +14,8 @@ #ifndef LMP_LMPTYPE_KOKKOS_H #define LMP_LMPTYPE_KOKKOS_H +#include "lmptype.h" + #include <Kokkos_Core.hpp> #include <Kokkos_DualView.hpp> #include <impl/Kokkos_Timer.hpp> @@ -24,6 +26,21 @@ #define ISFINITE(x) std::isfinite(x) #endif +// User-settable FFT precision + +// FFT_PRECISION = 1 is single-precision complex (4-byte real, 4-byte imag) +// FFT_PRECISION = 2 is double-precision complex (8-byte real, 8-byte imag) + +#ifdef FFT_SINGLE +#define FFT_PRECISION 1 +#define MPI_FFT_SCALAR MPI_FLOAT +typedef float FFT_SCALAR; +#else +#define FFT_PRECISION 2 +#define MPI_FFT_SCALAR MPI_DOUBLE +typedef double FFT_SCALAR; +#endif + #define MAX_TYPES_STACKPARAMS 12 #define NeighClusterSize 8 @@ -567,6 +584,32 @@ typedef tdual_neighbors_2d::t_dev_um t_neighbors_2d_um; typedef tdual_neighbors_2d::t_dev_const_um t_neighbors_2d_const_um; typedef tdual_neighbors_2d::t_dev_const_randomread t_neighbors_2d_randomread; +//Kspace + +typedef Kokkos:: + DualView<FFT_SCALAR*, Kokkos::LayoutRight, LMPDeviceType> tdual_FFT_SCALAR_1d; +typedef tdual_FFT_SCALAR_1d::t_dev t_FFT_SCALAR_1d; +typedef tdual_FFT_SCALAR_1d::t_dev_um t_FFT_SCALAR_1d_um; + +typedef Kokkos::DualView<FFT_SCALAR**,Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_2d; +typedef tdual_FFT_SCALAR_2d::t_dev t_FFT_SCALAR_2d; + +typedef Kokkos::DualView<FFT_SCALAR**[3],Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_2d_3; +typedef tdual_FFT_SCALAR_2d_3::t_dev t_FFT_SCALAR_2d_3; + +typedef Kokkos::DualView<FFT_SCALAR***,Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_3d; +typedef tdual_FFT_SCALAR_3d::t_dev t_FFT_SCALAR_3d; + +typedef Kokkos:: + DualView<FFT_SCALAR*[2], Kokkos::LayoutRight, LMPDeviceType> tdual_FFT_DATA_1d; +typedef tdual_FFT_DATA_1d::t_dev t_FFT_DATA_1d; +typedef tdual_FFT_DATA_1d::t_dev_um t_FFT_DATA_1d_um; + +typedef Kokkos:: + DualView<int*, LMPDeviceType::array_layout, LMPDeviceType> tdual_int_64; +typedef tdual_int_64::t_dev t_int_64; +typedef tdual_int_64::t_dev_um t_int_64_um; + }; #ifdef KOKKOS_HAVE_CUDA @@ -801,6 +844,33 @@ typedef tdual_neighbors_2d::t_host_um t_neighbors_2d_um; typedef tdual_neighbors_2d::t_host_const_um t_neighbors_2d_const_um; typedef tdual_neighbors_2d::t_host_const_randomread t_neighbors_2d_randomread; + +//Kspace + +typedef Kokkos:: + DualView<FFT_SCALAR*, Kokkos::LayoutRight, LMPDeviceType> tdual_FFT_SCALAR_1d; +typedef tdual_FFT_SCALAR_1d::t_host t_FFT_SCALAR_1d; +typedef tdual_FFT_SCALAR_1d::t_host_um t_FFT_SCALAR_1d_um; + +typedef Kokkos::DualView<FFT_SCALAR**,Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_2d; +typedef tdual_FFT_SCALAR_2d::t_host t_FFT_SCALAR_2d; + +typedef Kokkos::DualView<FFT_SCALAR**[3],Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_2d_3; +typedef tdual_FFT_SCALAR_2d_3::t_host t_FFT_SCALAR_2d_3; + +typedef Kokkos::DualView<FFT_SCALAR***,Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_3d; +typedef tdual_FFT_SCALAR_3d::t_host t_FFT_SCALAR_3d; + +typedef Kokkos:: + DualView<FFT_SCALAR*[2], Kokkos::LayoutRight, LMPDeviceType> tdual_FFT_DATA_1d; +typedef tdual_FFT_DATA_1d::t_host t_FFT_DATA_1d; +typedef tdual_FFT_DATA_1d::t_host_um t_FFT_DATA_1d_um; + +typedef Kokkos:: + DualView<int*, LMPDeviceType::array_layout, LMPDeviceType> tdual_int_64; +typedef tdual_int_64::t_host t_int_64; +typedef tdual_int_64::t_host_um t_int_64_um; + }; #endif //default LAMMPS Types diff --git a/src/KOKKOS/math_special_kokkos.cpp b/src/KOKKOS/math_special_kokkos.cpp new file mode 100644 index 0000000000..a8c35b12b8 --- /dev/null +++ b/src/KOKKOS/math_special_kokkos.cpp @@ -0,0 +1,534 @@ + +#include <math.h> +#include <stdint.h> +#include "math_special_kokkos.h" + +using namespace LAMMPS_NS; + +/* Library libcerf: + * Compute complex error functions, based on a new implementation of + * Faddeeva's w_of_z. Also provide Dawson and Voigt functions. + * + * File erfcx.c: + * Compute erfcx(x) = exp(x^2) erfc(x) function, for real x, + * using a novel algorithm that is much faster than DERFC of SLATEC. + * This function is used in the computation of Faddeeva, Dawson, and + * other complex error functions. + * + * Copyright: + * (C) 2012 Massachusetts Institute of Technology + * (C) 2013 Forschungszentrum Jülich GmbH + * + * Licence: + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Steven G. Johnson, Massachusetts Institute of Technology, 2012, core author + * Joachim Wuttke, Forschungszentrum Jülich, 2013, package maintainer + * + * Website: + * http://apps.jcns.fz-juelich.de/libcerf + * + * Revision history: + * ../CHANGELOG + * + * Manual page: + * man 3 erfcx + */ + +/******************************************************************************/ +/* Lookup-table for Chebyshev polynomials for smaller |x| */ +/******************************************************************************/ + +double MathSpecialKokkos::erfcx_y100(const double y100) +{ + // Steven G. Johnson, October 2012. + + // Given y100=100*y, where y = 4/(4+x) for x >= 0, compute erfc(x). + + // Uses a look-up table of 100 different Chebyshev polynomials + // for y intervals [0,0.01], [0.01,0.02], ...., [0.99,1], generated + // with the help of Maple and a little shell script. This allows + // the Chebyshev polynomials to be of significantly lower degree (about 1/4) + // compared to fitting the whole [0,1] interval with a single polynomial. + + switch ((int) y100) { + case 0: { + double t = 2*y100 - 1; + return 0.70878032454106438663e-3 + (0.71234091047026302958e-3 + (0.35779077297597742384e-5 + (0.17403143962587937815e-7 + (0.81710660047307788845e-10 + (0.36885022360434957634e-12 + 0.15917038551111111111e-14 * t) * t) * t) * t) * t) * t; + } + case 1: { + double t = 2*y100 - 3; + return 0.21479143208285144230e-2 + (0.72686402367379996033e-3 + (0.36843175430938995552e-5 + (0.18071841272149201685e-7 + (0.85496449296040325555e-10 + (0.38852037518534291510e-12 + 0.16868473576888888889e-14 * t) * t) * t) * t) * t) * t; + } + case 2: { + double t = 2*y100 - 5; + return 0.36165255935630175090e-2 + (0.74182092323555510862e-3 + (0.37948319957528242260e-5 + (0.18771627021793087350e-7 + (0.89484715122415089123e-10 + (0.40935858517772440862e-12 + 0.17872061464888888889e-14 * t) * t) * t) * t) * t) * t; + } + case 3: { + double t = 2*y100 - 7; + return 0.51154983860031979264e-2 + (0.75722840734791660540e-3 + (0.39096425726735703941e-5 + (0.19504168704300468210e-7 + (0.93687503063178993915e-10 + (0.43143925959079664747e-12 + 0.18939926435555555556e-14 * t) * t) * t) * t) * t) * t; + } + case 4: { + double t = 2*y100 - 9; + return 0.66457513172673049824e-2 + (0.77310406054447454920e-3 + (0.40289510589399439385e-5 + (0.20271233238288381092e-7 + (0.98117631321709100264e-10 + (0.45484207406017752971e-12 + 0.20076352213333333333e-14 * t) * t) * t) * t) * t) * t; + } + case 5: { + double t = 2*y100 - 11; + return 0.82082389970241207883e-2 + (0.78946629611881710721e-3 + (0.41529701552622656574e-5 + (0.21074693344544655714e-7 + (0.10278874108587317989e-9 + (0.47965201390613339638e-12 + 0.21285907413333333333e-14 * t) * t) * t) * t) * t) * t; + } + case 6: { + double t = 2*y100 - 13; + return 0.98039537275352193165e-2 + (0.80633440108342840956e-3 + (0.42819241329736982942e-5 + (0.21916534346907168612e-7 + (0.10771535136565470914e-9 + (0.50595972623692822410e-12 + 0.22573462684444444444e-14 * t) * t) * t) * t) * t) * t; + } + case 7: { + double t = 2*y100 - 15; + return 0.11433927298290302370e-1 + (0.82372858383196561209e-3 + (0.44160495311765438816e-5 + (0.22798861426211986056e-7 + (0.11291291745879239736e-9 + (0.53386189365816880454e-12 + 0.23944209546666666667e-14 * t) * t) * t) * t) * t) * t; + } + case 8: { + double t = 2*y100 - 17; + return 0.13099232878814653979e-1 + (0.84167002467906968214e-3 + (0.45555958988457506002e-5 + (0.23723907357214175198e-7 + (0.11839789326602695603e-9 + (0.56346163067550237877e-12 + 0.25403679644444444444e-14 * t) * t) * t) * t) * t) * t; + } + case 9: { + double t = 2*y100 - 19; + return 0.14800987015587535621e-1 + (0.86018092946345943214e-3 + (0.47008265848816866105e-5 + (0.24694040760197315333e-7 + (0.12418779768752299093e-9 + (0.59486890370320261949e-12 + 0.26957764568888888889e-14 * t) * t) * t) * t) * t) * t; + } + case 10: { + double t = 2*y100 - 21; + return 0.16540351739394069380e-1 + (0.87928458641241463952e-3 + (0.48520195793001753903e-5 + (0.25711774900881709176e-7 + (0.13030128534230822419e-9 + (0.62820097586874779402e-12 + 0.28612737351111111111e-14 * t) * t) * t) * t) * t) * t; + } + case 11: { + double t = 2*y100 - 23; + return 0.18318536789842392647e-1 + (0.89900542647891721692e-3 + (0.50094684089553365810e-5 + (0.26779777074218070482e-7 + (0.13675822186304615566e-9 + (0.66358287745352705725e-12 + 0.30375273884444444444e-14 * t) * t) * t) * t) * t) * t; + } + case 12: { + double t = 2*y100 - 25; + return 0.20136801964214276775e-1 + (0.91936908737673676012e-3 + (0.51734830914104276820e-5 + (0.27900878609710432673e-7 + (0.14357976402809042257e-9 + (0.70114790311043728387e-12 + 0.32252476000000000000e-14 * t) * t) * t) * t) * t) * t; + } + case 13: { + double t = 2*y100 - 27; + return 0.21996459598282740954e-1 + (0.94040248155366777784e-3 + (0.53443911508041164739e-5 + (0.29078085538049374673e-7 + (0.15078844500329731137e-9 + (0.74103813647499204269e-12 + 0.34251892320000000000e-14 * t) * t) * t) * t) * t) * t; + } + case 14: { + double t = 2*y100 - 29; + return 0.23898877187226319502e-1 + (0.96213386835900177540e-3 + (0.55225386998049012752e-5 + (0.30314589961047687059e-7 + (0.15840826497296335264e-9 + (0.78340500472414454395e-12 + 0.36381553564444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 15: { + double t = 2*y100 - 31; + return 0.25845480155298518485e-1 + (0.98459293067820123389e-3 + (0.57082915920051843672e-5 + (0.31613782169164830118e-7 + (0.16646478745529630813e-9 + (0.82840985928785407942e-12 + 0.38649975768888888890e-14 * t) * t) * t) * t) * t) * t; + } + case 16: { + double t = 2*y100 - 33; + return 0.27837754783474696598e-1 + (0.10078108563256892757e-2 + (0.59020366493792212221e-5 + (0.32979263553246520417e-7 + (0.17498524159268458073e-9 + (0.87622459124842525110e-12 + 0.41066206488888888890e-14 * t) * t) * t) * t) * t) * t; + } + case 17: { + double t = 2*y100 - 35; + return 0.29877251304899307550e-1 + (0.10318204245057349310e-2 + (0.61041829697162055093e-5 + (0.34414860359542720579e-7 + (0.18399863072934089607e-9 + (0.92703227366365046533e-12 + 0.43639844053333333334e-14 * t) * t) * t) * t) * t) * t; + } + case 18: { + double t = 2*y100 - 37; + return 0.31965587178596443475e-1 + (0.10566560976716574401e-2 + (0.63151633192414586770e-5 + (0.35924638339521924242e-7 + (0.19353584758781174038e-9 + (0.98102783859889264382e-12 + 0.46381060817777777779e-14 * t) * t) * t) * t) * t) * t; + } + case 19: { + double t = 2*y100 - 39; + return 0.34104450552588334840e-1 + (0.10823541191350532574e-2 + (0.65354356159553934436e-5 + (0.37512918348533521149e-7 + (0.20362979635817883229e-9 + (0.10384187833037282363e-11 + 0.49300625262222222221e-14 * t) * t) * t) * t) * t) * t; + } + case 20: { + double t = 2*y100 - 41; + return 0.36295603928292425716e-1 + (0.11089526167995268200e-2 + (0.67654845095518363577e-5 + (0.39184292949913591646e-7 + (0.21431552202133775150e-9 + (0.10994259106646731797e-11 + 0.52409949102222222221e-14 * t) * t) * t) * t) * t) * t; + } + case 21: { + double t = 2*y100 - 43; + return 0.38540888038840509795e-1 + (0.11364917134175420009e-2 + (0.70058230641246312003e-5 + (0.40943644083718586939e-7 + (0.22563034723692881631e-9 + (0.11642841011361992885e-11 + 0.55721092871111111110e-14 * t) * t) * t) * t) * t) * t; + } + case 22: { + double t = 2*y100 - 45; + return 0.40842225954785960651e-1 + (0.11650136437945673891e-2 + (0.72569945502343006619e-5 + (0.42796161861855042273e-7 + (0.23761401711005024162e-9 + (0.12332431172381557035e-11 + 0.59246802364444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 23: { + double t = 2*y100 - 47; + return 0.43201627431540222422e-1 + (0.11945628793917272199e-2 + (0.75195743532849206263e-5 + (0.44747364553960993492e-7 + (0.25030885216472953674e-9 + (0.13065684400300476484e-11 + 0.63000532853333333334e-14 * t) * t) * t) * t) * t) * t; + } + case 24: { + double t = 2*y100 - 49; + return 0.45621193513810471438e-1 + (0.12251862608067529503e-2 + (0.77941720055551920319e-5 + (0.46803119830954460212e-7 + (0.26375990983978426273e-9 + (0.13845421370977119765e-11 + 0.66996477404444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 25: { + double t = 2*y100 - 51; + return 0.48103121413299865517e-1 + (0.12569331386432195113e-2 + (0.80814333496367673980e-5 + (0.48969667335682018324e-7 + (0.27801515481905748484e-9 + (0.14674637611609884208e-11 + 0.71249589351111111110e-14 * t) * t) * t) * t) * t) * t; + } + case 26: { + double t = 2*y100 - 53; + return 0.50649709676983338501e-1 + (0.12898555233099055810e-2 + (0.83820428414568799654e-5 + (0.51253642652551838659e-7 + (0.29312563849675507232e-9 + (0.15556512782814827846e-11 + 0.75775607822222222221e-14 * t) * t) * t) * t) * t) * t; + } + case 27: { + double t = 2*y100 - 55; + return 0.53263363664388864181e-1 + (0.13240082443256975769e-2 + (0.86967260015007658418e-5 + (0.53662102750396795566e-7 + (0.30914568786634796807e-9 + (0.16494420240828493176e-11 + 0.80591079644444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 28: { + double t = 2*y100 - 57; + return 0.55946601353500013794e-1 + (0.13594491197408190706e-2 + (0.90262520233016380987e-5 + (0.56202552975056695376e-7 + (0.32613310410503135996e-9 + (0.17491936862246367398e-11 + 0.85713381688888888890e-14 * t) * t) * t) * t) * t) * t; + } + case 29: { + double t = 2*y100 - 59; + return 0.58702059496154081813e-1 + (0.13962391363223647892e-2 + (0.93714365487312784270e-5 + (0.58882975670265286526e-7 + (0.34414937110591753387e-9 + (0.18552853109751857859e-11 + 0.91160736711111111110e-14 * t) * t) * t) * t) * t) * t; + } + case 30: { + double t = 2*y100 - 61; + return 0.61532500145144778048e-1 + (0.14344426411912015247e-2 + (0.97331446201016809696e-5 + (0.61711860507347175097e-7 + (0.36325987418295300221e-9 + (0.19681183310134518232e-11 + 0.96952238400000000000e-14 * t) * t) * t) * t) * t) * t; + } + case 31: { + double t = 2*y100 - 63; + return 0.64440817576653297993e-1 + (0.14741275456383131151e-2 + (0.10112293819576437838e-4 + (0.64698236605933246196e-7 + (0.38353412915303665586e-9 + (0.20881176114385120186e-11 + 0.10310784480000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 32: { + double t = 2*y100 - 65; + return 0.67430045633130393282e-1 + (0.15153655418916540370e-2 + (0.10509857606888328667e-4 + (0.67851706529363332855e-7 + (0.40504602194811140006e-9 + (0.22157325110542534469e-11 + 0.10964842115555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 33: { + double t = 2*y100 - 67; + return 0.70503365513338850709e-1 + (0.15582323336495709827e-2 + (0.10926868866865231089e-4 + (0.71182482239613507542e-7 + (0.42787405890153386710e-9 + (0.23514379522274416437e-11 + 0.11659571751111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 34: { + double t = 2*y100 - 69; + return 0.73664114037944596353e-1 + (0.16028078812438820413e-2 + (0.11364423678778207991e-4 + (0.74701423097423182009e-7 + (0.45210162777476488324e-9 + (0.24957355004088569134e-11 + 0.12397238257777777778e-13 * t) * t) * t) * t) * t) * t; + } + case 35: { + double t = 2*y100 - 71; + return 0.76915792420819562379e-1 + (0.16491766623447889354e-2 + (0.11823685320041302169e-4 + (0.78420075993781544386e-7 + (0.47781726956916478925e-9 + (0.26491544403815724749e-11 + 0.13180196462222222222e-13 * t) * t) * t) * t) * t) * t; + } + case 36: { + double t = 2*y100 - 73; + return 0.80262075578094612819e-1 + (0.16974279491709504117e-2 + (0.12305888517309891674e-4 + (0.82350717698979042290e-7 + (0.50511496109857113929e-9 + (0.28122528497626897696e-11 + 0.14010889635555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 37: { + double t = 2*y100 - 75; + return 0.83706822008980357446e-1 + (0.17476561032212656962e-2 + (0.12812343958540763368e-4 + (0.86506399515036435592e-7 + (0.53409440823869467453e-9 + (0.29856186620887555043e-11 + 0.14891851591111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 38: { + double t = 2*y100 - 77; + return 0.87254084284461718231e-1 + (0.17999608886001962327e-2 + (0.13344443080089492218e-4 + (0.90900994316429008631e-7 + (0.56486134972616465316e-9 + (0.31698707080033956934e-11 + 0.15825697795555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 39: { + double t = 2*y100 - 79; + return 0.90908120182172748487e-1 + (0.18544478050657699758e-2 + (0.13903663143426120077e-4 + (0.95549246062549906177e-7 + (0.59752787125242054315e-9 + (0.33656597366099099413e-11 + 0.16815130613333333333e-13 * t) * t) * t) * t) * t) * t; + } + case 40: { + double t = 2*y100 - 81; + return 0.94673404508075481121e-1 + (0.19112284419887303347e-2 + (0.14491572616545004930e-4 + (0.10046682186333613697e-6 + (0.63221272959791000515e-9 + (0.35736693975589130818e-11 + 0.17862931591111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 41: { + double t = 2*y100 - 83; + return 0.98554641648004456555e-1 + (0.19704208544725622126e-2 + (0.15109836875625443935e-4 + (0.10567036667675984067e-6 + (0.66904168640019354565e-9 + (0.37946171850824333014e-11 + 0.18971959040000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 42: { + double t = 2*y100 - 85; + return 0.10255677889470089531e0 + (0.20321499629472857418e-2 + (0.15760224242962179564e-4 + (0.11117756071353507391e-6 + (0.70814785110097658502e-9 + (0.40292553276632563925e-11 + 0.20145143075555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 43: { + double t = 2*y100 - 87; + return 0.10668502059865093318e0 + (0.20965479776148731610e-2 + (0.16444612377624983565e-4 + (0.11700717962026152749e-6 + (0.74967203250938418991e-9 + (0.42783716186085922176e-11 + 0.21385479360000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 44: { + double t = 2*y100 - 89; + return 0.11094484319386444474e0 + (0.21637548491908170841e-2 + (0.17164995035719657111e-4 + (0.12317915750735938089e-6 + (0.79376309831499633734e-9 + (0.45427901763106353914e-11 + 0.22696025653333333333e-13 * t) * t) * t) * t) * t) * t; + } + case 45: { + double t = 2*y100 - 91; + return 0.11534201115268804714e0 + (0.22339187474546420375e-2 + (0.17923489217504226813e-4 + (0.12971465288245997681e-6 + (0.84057834180389073587e-9 + (0.48233721206418027227e-11 + 0.24079890062222222222e-13 * t) * t) * t) * t) * t) * t; + } + case 46: { + double t = 2*y100 - 93; + return 0.11988259392684094740e0 + (0.23071965691918689601e-2 + (0.18722342718958935446e-4 + (0.13663611754337957520e-6 + (0.89028385488493287005e-9 + (0.51210161569225846701e-11 + 0.25540227111111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 47: { + double t = 2*y100 - 95; + return 0.12457298393509812907e0 + (0.23837544771809575380e-2 + (0.19563942105711612475e-4 + (0.14396736847739470782e-6 + (0.94305490646459247016e-9 + (0.54366590583134218096e-11 + 0.27080225920000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 48: { + double t = 2*y100 - 97; + return 0.12941991566142438816e0 + (0.24637684719508859484e-2 + (0.20450821127475879816e-4 + (0.15173366280523906622e-6 + (0.99907632506389027739e-9 + (0.57712760311351625221e-11 + 0.28703099555555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 49: { + double t = 2*y100 - 99; + return 0.13443048593088696613e0 + (0.25474249981080823877e-2 + (0.21385669591362915223e-4 + (0.15996177579900443030e-6 + (0.10585428844575134013e-8 + (0.61258809536787882989e-11 + 0.30412080142222222222e-13 * t) * t) * t) * t) * t) * t; + } + case 50: { + double t = 2*y100 - 101; + return 0.13961217543434561353e0 + (0.26349215871051761416e-2 + (0.22371342712572567744e-4 + (0.16868008199296822247e-6 + (0.11216596910444996246e-8 + (0.65015264753090890662e-11 + 0.32210394506666666666e-13 * t) * t) * t) * t) * t) * t; + } + case 51: { + double t = 2*y100 - 103; + return 0.14497287157673800690e0 + (0.27264675383982439814e-2 + (0.23410870961050950197e-4 + (0.17791863939526376477e-6 + (0.11886425714330958106e-8 + (0.68993039665054288034e-11 + 0.34101266222222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 52: { + double t = 2*y100 - 105; + return 0.15052089272774618151e0 + (0.28222846410136238008e-2 + (0.24507470422713397006e-4 + (0.18770927679626136909e-6 + (0.12597184587583370712e-8 + (0.73203433049229821618e-11 + 0.36087889048888888890e-13 * t) * t) * t) * t) * t) * t; + } + case 53: { + double t = 2*y100 - 107; + return 0.15626501395774612325e0 + (0.29226079376196624949e-2 + (0.25664553693768450545e-4 + (0.19808568415654461964e-6 + (0.13351257759815557897e-8 + (0.77658124891046760667e-11 + 0.38173420035555555555e-13 * t) * t) * t) * t) * t) * t; + } + case 54: { + double t = 2*y100 - 109; + return 0.16221449434620737567e0 + (0.30276865332726475672e-2 + (0.26885741326534564336e-4 + (0.20908350604346384143e-6 + (0.14151148144240728728e-8 + (0.82369170665974313027e-11 + 0.40360957457777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 55: { + double t = 2*y100 - 111; + return 0.16837910595412130659e0 + (0.31377844510793082301e-2 + (0.28174873844911175026e-4 + (0.22074043807045782387e-6 + (0.14999481055996090039e-8 + (0.87348993661930809254e-11 + 0.42653528977777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 56: { + double t = 2*y100 - 113; + return 0.17476916455659369953e0 + (0.32531815370903068316e-2 + (0.29536024347344364074e-4 + (0.23309632627767074202e-6 + (0.15899007843582444846e-8 + (0.92610375235427359475e-11 + 0.45054073102222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 57: { + double t = 2*y100 - 115; + return 0.18139556223643701364e0 + (0.33741744168096996041e-2 + (0.30973511714709500836e-4 + (0.24619326937592290996e-6 + (0.16852609412267750744e-8 + (0.98166442942854895573e-11 + 0.47565418097777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 58: { + double t = 2*y100 - 117; + return 0.18826980194443664549e0 + (0.35010775057740317997e-2 + (0.32491914440014267480e-4 + (0.26007572375886319028e-6 + (0.17863299617388376116e-8 + (0.10403065638343878679e-10 + 0.50190265831111111110e-13 * t) * t) * t) * t) * t) * t; + } + case 59: { + double t = 2*y100 - 119; + return 0.19540403413693967350e0 + (0.36342240767211326315e-2 + (0.34096085096200907289e-4 + (0.27479061117017637474e-6 + (0.18934228504790032826e-8 + (0.11021679075323598664e-10 + 0.52931171733333333334e-13 * t) * t) * t) * t) * t) * t; + } + case 60: { + double t = 2*y100 - 121; + return 0.20281109560651886959e0 + (0.37739673859323597060e-2 + (0.35791165457592409054e-4 + (0.29038742889416172404e-6 + (0.20068685374849001770e-8 + (0.11673891799578381999e-10 + 0.55790523093333333334e-13 * t) * t) * t) * t) * t) * t; + } + case 61: { + double t = 2*y100 - 123; + return 0.21050455062669334978e0 + (0.39206818613925652425e-2 + (0.37582602289680101704e-4 + (0.30691836231886877385e-6 + (0.21270101645763677824e-8 + (0.12361138551062899455e-10 + 0.58770520160000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 62: { + double t = 2*y100 - 125; + return 0.21849873453703332479e0 + (0.40747643554689586041e-2 + (0.39476163820986711501e-4 + (0.32443839970139918836e-6 + (0.22542053491518680200e-8 + (0.13084879235290858490e-10 + 0.61873153262222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 63: { + double t = 2*y100 - 127; + return 0.22680879990043229327e0 + (0.42366354648628516935e-2 + (0.41477956909656896779e-4 + (0.34300544894502810002e-6 + (0.23888264229264067658e-8 + (0.13846596292818514601e-10 + 0.65100183751111111110e-13 * t) * t) * t) * t) * t) * t; + } + case 64: { + double t = 2*y100 - 129; + return 0.23545076536988703937e0 + (0.44067409206365170888e-2 + (0.43594444916224700881e-4 + (0.36268045617760415178e-6 + (0.25312606430853202748e-8 + (0.14647791812837903061e-10 + 0.68453122631111111110e-13 * t) * t) * t) * t) * t) * t; + } + case 65: { + double t = 2*y100 - 131; + return 0.24444156740777432838e0 + (0.45855530511605787178e-2 + (0.45832466292683085475e-4 + (0.38352752590033030472e-6 + (0.26819103733055603460e-8 + (0.15489984390884756993e-10 + 0.71933206364444444445e-13 * t) * t) * t) * t) * t) * t; + } + case 66: { + double t = 2*y100 - 133; + return 0.25379911500634264643e0 + (0.47735723208650032167e-2 + (0.48199253896534185372e-4 + (0.40561404245564732314e-6 + (0.28411932320871165585e-8 + (0.16374705736458320149e-10 + 0.75541379822222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 67: { + double t = 2*y100 - 135; + return 0.26354234756393613032e0 + (0.49713289477083781266e-2 + (0.50702455036930367504e-4 + (0.42901079254268185722e-6 + (0.30095422058900481753e-8 + (0.17303497025347342498e-10 + 0.79278273368888888890e-13 * t) * t) * t) * t) * t) * t; + } + case 68: { + double t = 2*y100 - 137; + return 0.27369129607732343398e0 + (0.51793846023052643767e-2 + (0.53350152258326602629e-4 + (0.45379208848865015485e-6 + (0.31874057245814381257e-8 + (0.18277905010245111046e-10 + 0.83144182364444444445e-13 * t) * t) * t) * t) * t) * t; + } + case 69: { + double t = 2*y100 - 139; + return 0.28426714781640316172e0 + (0.53983341916695141966e-2 + (0.56150884865255810638e-4 + (0.48003589196494734238e-6 + (0.33752476967570796349e-8 + (0.19299477888083469086e-10 + 0.87139049137777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 70: { + double t = 2*y100 - 141; + return 0.29529231465348519920e0 + (0.56288077305420795663e-2 + (0.59113671189913307427e-4 + (0.50782393781744840482e-6 + (0.35735475025851713168e-8 + (0.20369760937017070382e-10 + 0.91262442613333333334e-13 * t) * t) * t) * t) * t) * t; + } + case 71: { + double t = 2*y100 - 143; + return 0.30679050522528838613e0 + (0.58714723032745403331e-2 + (0.62248031602197686791e-4 + (0.53724185766200945789e-6 + (0.37827999418960232678e-8 + (0.21490291930444538307e-10 + 0.95513539182222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 72: { + double t = 2*y100 - 145; + return 0.31878680111173319425e0 + (0.61270341192339103514e-2 + (0.65564012259707640976e-4 + (0.56837930287837738996e-6 + (0.40035151353392378882e-8 + (0.22662596341239294792e-10 + 0.99891109760000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 73: { + double t = 2*y100 - 147; + return 0.33130773722152622027e0 + (0.63962406646798080903e-2 + (0.69072209592942396666e-4 + (0.60133006661885941812e-6 + (0.42362183765883466691e-8 + (0.23888182347073698382e-10 + 0.10439349811555555556e-12 * t) * t) * t) * t) * t) * t; + } + case 74: { + double t = 2*y100 - 149; + return 0.34438138658041336523e0 + (0.66798829540414007258e-2 + (0.72783795518603561144e-4 + (0.63619220443228800680e-6 + (0.44814499336514453364e-8 + (0.25168535651285475274e-10 + 0.10901861383111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 75: { + double t = 2*y100 - 151; + return 0.35803744972380175583e0 + (0.69787978834882685031e-2 + (0.76710543371454822497e-4 + (0.67306815308917386747e-6 + (0.47397647975845228205e-8 + (0.26505114141143050509e-10 + 0.11376390933333333333e-12 * t) * t) * t) * t) * t) * t; + } + case 76: { + double t = 2*y100 - 153; + return 0.37230734890119724188e0 + (0.72938706896461381003e-2 + (0.80864854542670714092e-4 + (0.71206484718062688779e-6 + (0.50117323769745883805e-8 + (0.27899342394100074165e-10 + 0.11862637614222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 77: { + double t = 2*y100 - 155; + return 0.38722432730555448223e0 + (0.76260375162549802745e-2 + (0.85259785810004603848e-4 + (0.75329383305171327677e-6 + (0.52979361368388119355e-8 + (0.29352606054164086709e-10 + 0.12360253370666666667e-12 * t) * t) * t) * t) * t) * t; + } + case 78: { + double t = 2*y100 - 157; + return 0.40282355354616940667e0 + (0.79762880915029728079e-2 + (0.89909077342438246452e-4 + (0.79687137961956194579e-6 + (0.55989731807360403195e-8 + (0.30866246101464869050e-10 + 0.12868841946666666667e-12 * t) * t) * t) * t) * t) * t; + } + case 79: { + double t = 2*y100 - 159; + return 0.41914223158913787649e0 + (0.83456685186950463538e-2 + (0.94827181359250161335e-4 + (0.84291858561783141014e-6 + (0.59154537751083485684e-8 + (0.32441553034347469291e-10 + 0.13387957943111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 80: { + double t = 2*y100 - 161; + return 0.43621971639463786896e0 + (0.87352841828289495773e-2 + (0.10002929142066799966e-3 + (0.89156148280219880024e-6 + (0.62480008150788597147e-8 + (0.34079760983458878910e-10 + 0.13917107176888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 81: { + double t = 2*y100 - 163; + return 0.45409763548534330981e0 + (0.91463027755548240654e-2 + (0.10553137232446167258e-3 + (0.94293113464638623798e-6 + (0.65972492312219959885e-8 + (0.35782041795476563662e-10 + 0.14455745872000000000e-12 * t) * t) * t) * t) * t) * t; + } + case 82: { + double t = 2*y100 - 165; + return 0.47282001668512331468e0 + (0.95799574408860463394e-2 + (0.11135019058000067469e-3 + (0.99716373005509038080e-6 + (0.69638453369956970347e-8 + (0.37549499088161345850e-10 + 0.15003280712888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 83: { + double t = 2*y100 - 167; + return 0.49243342227179841649e0 + (0.10037550043909497071e-1 + (0.11750334542845234952e-3 + (0.10544006716188967172e-5 + (0.73484461168242224872e-8 + (0.39383162326435752965e-10 + 0.15559069118222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 84: { + double t = 2*y100 - 169; + return 0.51298708979209258326e0 + (0.10520454564612427224e-1 + (0.12400930037494996655e-3 + (0.11147886579371265246e-5 + (0.77517184550568711454e-8 + (0.41283980931872622611e-10 + 0.16122419680000000000e-12 * t) * t) * t) * t) * t) * t; + } + case 85: { + double t = 2*y100 - 171; + return 0.53453307979101369843e0 + (0.11030120618800726938e-1 + (0.13088741519572269581e-3 + (0.11784797595374515432e-5 + (0.81743383063044825400e-8 + (0.43252818449517081051e-10 + 0.16692592640000000000e-12 * t) * t) * t) * t) * t) * t; + } + case 86: { + double t = 2*y100 - 173; + return 0.55712643071169299478e0 + (0.11568077107929735233e-1 + (0.13815797838036651289e-3 + (0.12456314879260904558e-5 + (0.86169898078969313597e-8 + (0.45290446811539652525e-10 + 0.17268801084444444444e-12 * t) * t) * t) * t) * t) * t; + } + case 87: { + double t = 2*y100 - 175; + return 0.58082532122519320968e0 + (0.12135935999503877077e-1 + (0.14584223996665838559e-3 + (0.13164068573095710742e-5 + (0.90803643355106020163e-8 + (0.47397540713124619155e-10 + 0.17850211608888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 88: { + double t = 2*y100 - 177; + return 0.60569124025293375554e0 + (0.12735396239525550361e-1 + (0.15396244472258863344e-3 + (0.13909744385382818253e-5 + (0.95651595032306228245e-8 + (0.49574672127669041550e-10 + 0.18435945564444444444e-12 * t) * t) * t) * t) * t) * t; + } + case 89: { + double t = 2*y100 - 179; + return 0.63178916494715716894e0 + (0.13368247798287030927e-1 + (0.16254186562762076141e-3 + (0.14695084048334056083e-5 + (0.10072078109604152350e-7 + (0.51822304995680707483e-10 + 0.19025081422222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 90: { + double t = 2*y100 - 181; + return 0.65918774689725319200e0 + (0.14036375850601992063e-1 + (0.17160483760259706354e-3 + (0.15521885688723188371e-5 + (0.10601827031535280590e-7 + (0.54140790105837520499e-10 + 0.19616655146666666667e-12 * t) * t) * t) * t) * t) * t; + } + case 91: { + double t = 2*y100 - 183; + return 0.68795950683174433822e0 + (0.14741765091365869084e-1 + (0.18117679143520433835e-3 + (0.16392004108230585213e-5 + (0.11155116068018043001e-7 + (0.56530360194925690374e-10 + 0.20209663662222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 92: { + double t = 2*y100 - 185; + return 0.71818103808729967036e0 + (0.15486504187117112279e-1 + (0.19128428784550923217e-3 + (0.17307350969359975848e-5 + (0.11732656736113607751e-7 + (0.58991125287563833603e-10 + 0.20803065333333333333e-12 * t) * t) * t) * t) * t) * t; + } + case 93: { + double t = 2*y100 - 187; + return 0.74993321911726254661e0 + (0.16272790364044783382e-1 + (0.20195505163377912645e-3 + (0.18269894883203346953e-5 + (0.12335161021630225535e-7 + (0.61523068312169087227e-10 + 0.21395783431111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 94: { + double t = 2*y100 - 189; + return 0.78330143531283492729e0 + (0.17102934132652429240e-1 + (0.21321800585063327041e-3 + (0.19281661395543913713e-5 + (0.12963340087354341574e-7 + (0.64126040998066348872e-10 + 0.21986708942222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 95: { + double t = 2*y100 - 191; + return 0.81837581041023811832e0 + (0.17979364149044223802e-1 + (0.22510330592753129006e-3 + (0.20344732868018175389e-5 + (0.13617902941839949718e-7 + (0.66799760083972474642e-10 + 0.22574701262222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 96: { + double t = 2*y100 - 193; + return 0.85525144775685126237e0 + (0.18904632212547561026e-1 + (0.23764237370371255638e-3 + (0.21461248251306387979e-5 + (0.14299555071870523786e-7 + (0.69543803864694171934e-10 + 0.23158593688888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 97: { + double t = 2*y100 - 195; + return 0.89402868170849933734e0 + (0.19881418399127202569e-1 + (0.25086793128395995798e-3 + (0.22633402747585233180e-5 + (0.15008997042116532283e-7 + (0.72357609075043941261e-10 + 0.23737194737777777778e-12 * t) * t) * t) * t) * t) * t; + } + case 98: { + double t = 2*y100 - 197; + return 0.93481333942870796363e0 + (0.20912536329780368893e-1 + (0.26481403465998477969e-3 + (0.23863447359754921676e-5 + (0.15746923065472184451e-7 + (0.75240468141720143653e-10 + 0.24309291271111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 99: { + double t = 2*y100 - 199; + return 0.97771701335885035464e0 + (0.22000938572830479551e-1 + (0.27951610702682383001e-3 + (0.25153688325245314530e-5 + (0.16514019547822821453e-7 + (0.78191526829368231251e-10 + 0.24873652355555555556e-12 * t) * t) * t) * t) * t) * t; + } + } + /* we only get here if y = 1, i.e. |x| < 4*eps, in which case + * erfcx is within 1e-15 of 1.. */ + return 1.0; +} /* erfcx_y100 */ + +/* optimizer friendly implementation of exp2(x). + * + * strategy: + * + * split argument into an integer part and a fraction: + * ipart = floor(x+0.5); + * fpart = x - ipart; + * + * compute exp2(ipart) from setting the ieee754 exponent + * compute exp2(fpart) using a pade' approximation for x in [-0.5;0.5[ + * + * the result becomes: exp2(x) = exp2(ipart) * exp2(fpart) + */ + +/* IEEE 754 double precision floating point data manipulation */ +typedef union +{ + double f; + uint64_t u; + struct {int32_t i0,i1;} s; +} udi_t; + +static const double fm_exp2_q[] = { +/* 1.00000000000000000000e0, */ + 2.33184211722314911771e2, + 4.36821166879210612817e3 +}; +static const double fm_exp2_p[] = { + 2.30933477057345225087e-2, + 2.02020656693165307700e1, + 1.51390680115615096133e3 +}; + +double MathSpecialKokkos::exp2_x86(double x) +{ + double ipart, fpart, px, qx; + udi_t epart; + + ipart = floor(x+0.5); + fpart = x - ipart; + epart.s.i0 = 0; + epart.s.i1 = (((int) ipart) + 1023) << 20; + + x = fpart*fpart; + + px = fm_exp2_p[0]; + px = px*x + fm_exp2_p[1]; + qx = x + fm_exp2_q[0]; + px = px*x + fm_exp2_p[2]; + qx = qx*x + fm_exp2_q[1]; + + px = px * fpart; + + x = 1.0 + 2.0*(px/(qx-px)); + return epart.f*x; +} diff --git a/src/KOKKOS/math_special_kokkos.h b/src/KOKKOS/math_special_kokkos.h new file mode 100644 index 0000000000..c177e88574 --- /dev/null +++ b/src/KOKKOS/math_special_kokkos.h @@ -0,0 +1,100 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_MATH_SPECIAL_KOKKOS_H +#define LMP_MATH_SPECIAL_KOKKOS_H + +#include <math.h> +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +namespace MathSpecialKokkos { + + // support function for scaled error function complement + + extern double erfcx_y100(const double y100); + + // fast 2**x function without argument checks for little endian CPUs + extern double exp2_x86(double x); + + // scaled error function complement exp(x*x)*erfc(x) for coul/long styles + + static inline double my_erfcx(const double x) + { + if (x >= 0.0) return erfcx_y100(400.0/(4.0+x)); + else return 2.0*exp(x*x) - erfcx_y100(400.0/(4.0-x)); + } + + // exp(-x*x) for coul/long styles + + static inline double expmsq(double x) + { + x *= x; + x *= 1.4426950408889634074; // log_2(e) +#if defined(__BYTE_ORDER__) +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return (x < 1023.0) ? exp2_x86(-x) : 0.0; +#endif +#endif + return (x < 1023.0) ? exp2(-x) : 0.0; + } + + // x**2, use instead of pow(x,2.0) + KOKKOS_INLINE_FUNCTION + static double square(const double &x) { return x*x; } + + // x**3, use instead of pow(x,3.0) + KOKKOS_INLINE_FUNCTION + static double cube(const double &x) { return x*x*x; } + + // return -1.0 for odd n, 1.0 for even n, like pow(-1.0,n) + KOKKOS_INLINE_FUNCTION + static double powsign(const int n) { return (n & 1) ? -1.0 : 1.0; } + + // optimized version of pow(x,n) with n being integer + // up to 10x faster than pow(x,y) + + KOKKOS_INLINE_FUNCTION + static double powint(const double &x, const int n) { + double yy,ww; + + if (x == 0.0) return 0.0; + int nn = (n > 0) ? n : -n; + ww = x; + + for (yy = 1.0; nn != 0; nn >>= 1, ww *=ww) + if (nn & 1) yy *= ww; + + return (n > 0) ? yy : 1.0/yy; + } + + // optimized version of (sin(x)/x)**n with n being a _positive_ integer + + KOKKOS_INLINE_FUNCTION + static double powsinxx(const double &x, int n) { + double yy,ww; + + if (x == 0.0) return 1.0; + + ww = sin(x)/x; + + for (yy = 1.0; n != 0; n >>= 1, ww *=ww) + if (n & 1) yy *= ww; + + return yy; + } +} +} + +#endif diff --git a/src/KOKKOS/pair_lj_charmm_coul_long_kokkos.cpp b/src/KOKKOS/pair_lj_charmm_coul_long_kokkos.cpp index c749b85f3c..3b2b13f40b 100644 --- a/src/KOKKOS/pair_lj_charmm_coul_long_kokkos.cpp +++ b/src/KOKKOS/pair_lj_charmm_coul_long_kokkos.cpp @@ -115,6 +115,19 @@ void PairLJCharmmCoulLongKokkos<DeviceType>::compute(int eflag_in, int vflag_in) if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; + // reallocate per-atom arrays if necessary + + if (eflag_atom) { + memory->destroy_kokkos(k_eatom,eatom); + memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom"); + d_eatom = k_eatom.view<DeviceType>(); + } + if (vflag_atom) { + memory->destroy_kokkos(k_vatom,vatom); + memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom"); + d_vatom = k_vatom.view<DeviceType>(); + } + atomKK->sync(execution_space,datamask_read); k_cutsq.template sync<DeviceType>(); k_cut_ljsq.template sync<DeviceType>(); @@ -169,6 +182,16 @@ void PairLJCharmmCoulLongKokkos<DeviceType>::compute(int eflag_in, int vflag_in) if (vflag_fdotr) pair_virial_fdotr_compute(this); + if (eflag_atom) { + k_eatom.template modify<DeviceType>(); + k_eatom.template sync<LMPHostType>(); + } + + if (vflag_atom) { + k_vatom.template modify<DeviceType>(); + k_vatom.template sync<LMPHostType>(); + } + copymode = 0; } diff --git a/src/KOKKOS/pair_lj_cut_coul_long_kokkos.cpp b/src/KOKKOS/pair_lj_cut_coul_long_kokkos.cpp index 00d1561bc3..2a1a124460 100644 --- a/src/KOKKOS/pair_lj_cut_coul_long_kokkos.cpp +++ b/src/KOKKOS/pair_lj_cut_coul_long_kokkos.cpp @@ -68,6 +68,10 @@ PairLJCutCoulLongKokkos<DeviceType>::PairLJCutCoulLongKokkos(LAMMPS *lmp):PairLJ template<class DeviceType> PairLJCutCoulLongKokkos<DeviceType>::~PairLJCutCoulLongKokkos() { + memory->destroy_kokkos(k_eatom,eatom); + memory->destroy_kokkos(k_vatom,vatom); + eatom = NULL; + vatom = NULL; if (allocated){ memory->destroy_kokkos(k_cutsq, cutsq); memory->destroy_kokkos(k_cut_ljsq, cut_ljsq); @@ -100,6 +104,20 @@ void PairLJCutCoulLongKokkos<DeviceType>::compute(int eflag_in, int vflag_in) if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; + + // reallocate per-atom arrays if necessary + + if (eflag_atom) { + memory->destroy_kokkos(k_eatom,eatom); + memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom"); + d_eatom = k_eatom.view<DeviceType>(); + } + if (vflag_atom) { + memory->destroy_kokkos(k_vatom,vatom); + memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom"); + d_vatom = k_vatom.view<DeviceType>(); + } + atomKK->sync(execution_space,datamask_read); k_cutsq.template sync<DeviceType>(); k_cut_ljsq.template sync<DeviceType>(); @@ -151,6 +169,16 @@ void PairLJCutCoulLongKokkos<DeviceType>::compute(int eflag_in, int vflag_in) } if (vflag_fdotr) pair_virial_fdotr_compute(this); + + if (eflag_atom) { + k_eatom.template modify<DeviceType>(); + k_eatom.template sync<LMPHostType>(); + } + + if (vflag_atom) { + k_vatom.template modify<DeviceType>(); + k_vatom.template sync<LMPHostType>(); + } } /* ---------------------------------------------------------------------- diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp new file mode 100644 index 0000000000..de9c0ae630 --- /dev/null +++ b/src/KOKKOS/pppm_kokkos.cpp @@ -0,0 +1,3193 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Stan Moore (SNL) +------------------------------------------------------------------------- */ + +#include <mpi.h> +#include <string.h> +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include "pppm_kokkos.h" +#include "atom_kokkos.h" +#include "comm.h" +#include "gridcomm_kokkos.h" +#include "neighbor.h" +#include "force.h" +#include "pair.h" +#include "bond.h" +#include "angle.h" +#include "domain.h" +#include "fft3d_wrap.h" +#include "remap_wrap.h" +#include "memory.h" +#include "error.h" +#include "atom_masks.h" + +#include "math_const.h" +#include "math_special_kokkos.h" + +using namespace LAMMPS_NS; +using namespace MathConst; +using namespace MathSpecialKokkos; + +#define MAXORDER 7 +#define OFFSET 16384 +#define LARGE 10000.0 +#define SMALL 0.00001 +#define EPS_HOC 1.0e-7 + +enum{REVERSE_RHO}; +enum{FORWARD_IK,FORWARD_IK_PERATOM}; + +#ifdef FFT_SINGLE +#define ZEROF 0.0f +#define ONEF 1.0f +#else +#define ZEROF 0.0 +#define ONEF 1.0 +#endif + +/* ---------------------------------------------------------------------- */ + +template<class DeviceType> +PPPMKokkos<DeviceType>::PPPMKokkos(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg) +{ + if (narg < 1) error->all(FLERR,"Illegal kspace_style pppm command"); + + atomKK = (AtomKokkos *) atom; + execution_space = ExecutionSpaceFromDevice<DeviceType>::space; + datamask_read = X_MASK | F_MASK | TYPE_MASK | Q_MASK; + datamask_modify = F_MASK; + + pppmflag = 1; + group_group_enable = 0; + triclinic_support = 0; + + accuracy_relative = fabs(force->numeric(FLERR,arg[0])); + + nfactors = 3; + //factors = new int[nfactors]; + factors[0] = 2; + factors[1] = 3; + factors[2] = 5; + + MPI_Comm_rank(world,&me); + MPI_Comm_size(world,&nprocs); + + //density_brick = d_vdx_brick = d_vdy_brick = d_vdz_brick = NULL; + //d_density_fft = NULL; + //d_u_brick = NULL; + //d_v0_brick = d_v1_brick = d_v2_brick = d_v3_brick = d_v4_brick = d_v5_brick = NULL; + //greensfn = NULL; + //d_work1 = d_work2 = NULL; + //vg = NULL; + //d_fkx = d_fky = d_fkz = NULL; + + + //gf_b = NULL; + //rho1d = rho_coeff = drho1d = drho_coeff = NULL; + + fft1 = fft2 = NULL; + remap = NULL; + cg = NULL; + cg_peratom = NULL; + + nmax = 0; + //part2grid = NULL; + + peratom_allocate_flag = 0; + + // define acons coefficients for estimation of kspace errors + // see JCP 109, pg 7698 for derivation of coefficients + // higher order coefficients may be computed if needed + + acons = typename Kokkos::DualView<F_FLOAT[8][7],Kokkos::LayoutRight,LMPDeviceType>::t_host("pppm:acons"); + acons(1,0) = 2.0 / 3.0; + acons(2,0) = 1.0 / 50.0; + acons(2,1) = 5.0 / 294.0; + acons(3,0) = 1.0 / 588.0; + acons(3,1) = 7.0 / 1440.0; + acons(3,2) = 21.0 / 3872.0; + acons(4,0) = 1.0 / 4320.0; + acons(4,1) = 3.0 / 1936.0; + acons(4,2) = 7601.0 / 2271360.0; + acons(4,3) = 143.0 / 28800.0; + acons(5,0) = 1.0 / 23232.0; + acons(5,1) = 7601.0 / 13628160.0; + acons(5,2) = 143.0 / 69120.0; + acons(5,3) = 517231.0 / 106536960.0; + acons(5,4) = 106640677.0 / 11737571328.0; + acons(6,0) = 691.0 / 68140800.0; + acons(6,1) = 13.0 / 57600.0; + acons(6,2) = 47021.0 / 35512320.0; + acons(6,3) = 9694607.0 / 2095994880.0; + acons(6,4) = 733191589.0 / 59609088000.0; + acons(6,5) = 326190917.0 / 11700633600.0; + acons(7,0) = 1.0 / 345600.0; + acons(7,1) = 3617.0 / 35512320.0; + acons(7,2) = 745739.0 / 838397952.0; + acons(7,3) = 56399353.0 / 12773376000.0; + acons(7,4) = 25091609.0 / 1560084480.0; + acons(7,5) = 1755948832039.0 / 36229939200000.0; + acons(7,6) = 4887769399.0 / 37838389248.0; + + k_flag = DAT::tdual_int_scalar("PPPM:flag"); +} + +/* ---------------------------------------------------------------------- + free all memory +------------------------------------------------------------------------- */ + +template<class DeviceType> +PPPMKokkos<DeviceType>::~PPPMKokkos() +{ + if (copymode) return; + + //delete [] factors; + deallocate(); + if (peratom_allocate_flag) deallocate_peratom(); + //memory->destroy(part2grid); + //memory->destroy(acons); + + memory->destroy_kokkos(k_eatom,eatom); + memory->destroy_kokkos(k_vatom,vatom); + eatom = NULL; + vatom = NULL; +} + +/* ---------------------------------------------------------------------- + called once before run +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::init() +{ + if (me == 0) { + if (screen) fprintf(screen,"PPPM initialization ...\n"); + if (logfile) fprintf(logfile,"PPPM initialization ...\n"); + } + + // error check + + if (differentiation_flag == 1) + error->all(FLERR,"Cannot (yet) use PPPM Kokkos with 'kspace_modify diff ad'"); + + triclinic_check(); + if (domain->triclinic && slabflag) + error->all(FLERR,"Cannot (yet) use PPPM with triclinic box and " + "slab correction"); + if (domain->dimension == 2) error->all(FLERR, + "Cannot use PPPM with 2d simulation"); + if (comm->style != 0) + error->universe_all(FLERR,"PPPM can only currently be used with " + "comm_style brick"); + + if (!atomKK->q_flag) error->all(FLERR,"Kspace style requires atomKK attribute q"); + + if (slabflag == 0 && domain->nonperiodic > 0) + error->all(FLERR,"Cannot use nonperiodic boundaries with PPPM"); + if (slabflag) { + if (domain->xperiodic != 1 || domain->yperiodic != 1 || + domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1) + error->all(FLERR,"Incorrect boundaries with slab PPPM"); + } + + if (order < 2 || order > MAXORDER) { + char str[128]; + sprintf(str,"PPPM order cannot be < 2 or > than %d",MAXORDER); + error->all(FLERR,str); + } + + // extract short-range Coulombic cutoff from pair style + + triclinic = domain->triclinic; + pair_check(); + + int itmp = 0; + double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp); + if (p_cutoff == NULL) + error->all(FLERR,"KSpace style is incompatible with Pair style"); + cutoff = *p_cutoff; + + // if kspace is TIP4P, extract TIP4P params from pair style + // bond/angle are not yet init(), so insure equilibrium request is valid + + qdist = 0.0; + + if (tip4pflag) + error->all(FLERR,"Cannot (yet) use PPPM Kokkos TIP4P"); + + // compute qsum & qsqsum and warn if not charge-neutral + + scale = 1.0; + qqrd2e = force->qqrd2e; + qsum_qsq(); + natoms_original = atomKK->natoms; + + // set accuracy (force units) from accuracy_relative or accuracy_absolute + + if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute; + else accuracy = accuracy_relative * two_charge_force; + + // free all arrays previously allocated + + deallocate(); + if (peratom_allocate_flag) deallocate_peratom(); + + // setup FFT grid resolution and g_ewald + // normally one iteration thru while loop is all that is required + // if grid stencil does not extend beyond neighbor proc + // or overlap is allowed, then done + // else reduce order and try again + + int (*procneigh)[2] = comm->procneigh; + + GridCommKokkos<DeviceType> *cgtmp = NULL; + int iteration = 0; + + while (order >= minorder) { + if (iteration && me == 0) + error->warning(FLERR,"Reducing PPPM order b/c stencil extends " + "beyond nearest neighbor processor"); + + set_grid_global(); + set_grid_local(); + if (overlap_allowed) break; + + cgtmp = new GridCommKokkos<DeviceType>(lmp,world,1,1, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out, + procneigh[0][0],procneigh[0][1],procneigh[1][0], + procneigh[1][1],procneigh[2][0],procneigh[2][1]); + cgtmp->ghost_notify(); + if (!cgtmp->ghost_overlap()) break; + delete cgtmp; + + order--; + iteration++; + } + + if (order < minorder) error->all(FLERR,"PPPM order < minimum allowed order"); + if (!overlap_allowed && cgtmp->ghost_overlap()) + error->all(FLERR,"PPPM grid stencil extends " + "beyond nearest neighbor processor"); + if (cgtmp) delete cgtmp; + + // adjust g_ewald + + if (!gewaldflag) adjust_gewald(); + + // calculate the final accuracy + + double estimated_accuracy = final_accuracy(); + + // print stats + + int ngrid_max,nfft_both_max; + MPI_Allreduce(&ngrid,&ngrid_max,1,MPI_INT,MPI_MAX,world); + MPI_Allreduce(&nfft_both,&nfft_both_max,1,MPI_INT,MPI_MAX,world); + + if (me == 0) { + +#ifdef FFT_SINGLE + const char fft_prec[] = "single"; +#else + const char fft_prec[] = "double"; +#endif + + if (screen) { + fprintf(screen," G vector (1/distance) = %g\n",g_ewald); + fprintf(screen," grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm); + fprintf(screen," stencil order = %d\n",order); + fprintf(screen," estimated absolute RMS force accuracy = %g\n", + estimated_accuracy); + fprintf(screen," estimated relative force accuracy = %g\n", + estimated_accuracy/two_charge_force); + fprintf(screen," using %s precision FFTs\n",fft_prec); + fprintf(screen," 3d grid and FFT values/proc = %d %d\n", + ngrid_max,nfft_both_max); + } + if (logfile) { + fprintf(logfile," G vector (1/distance) = %g\n",g_ewald); + fprintf(logfile," grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm); + fprintf(logfile," stencil order = %d\n",order); + fprintf(logfile," estimated absolute RMS force accuracy = %g\n", + estimated_accuracy); + fprintf(logfile," estimated relative force accuracy = %g\n", + estimated_accuracy/two_charge_force); + fprintf(logfile," using %s precision FFTs\n",fft_prec); + fprintf(logfile," 3d grid and FFT values/proc = %d %d\n", + ngrid_max,nfft_both_max); + } + } + + // allocate K-space dependent memory + // don't invoke allocate peratom(), will be allocated when needed + + allocate(); + cg->ghost_notify(); + cg->setup(); + + // pre-compute Green's function denomiator expansion + // pre-compute 1d charge distribution coefficients + + compute_gf_denom(); + compute_rho_coeff(); + + k_rho_coeff.template modify<LMPHostType>(); + k_rho_coeff.template sync<DeviceType>(); + +} + +/* ---------------------------------------------------------------------- + adjust PPPM coeffs, called initially and whenever volume has changed +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::setup() +{ + if (triclinic) { + setup_triclinic(); + return; + } + + // perform some checks to avoid illegal boundaries with read_data + + if (slabflag == 0 && domain->nonperiodic > 0) + error->all(FLERR,"Cannot use nonperiodic boundaries with PPPM"); + if (slabflag) { + if (domain->xperiodic != 1 || domain->yperiodic != 1 || + domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1) + error->all(FLERR,"Incorrect boundaries with slab PPPM"); + } + + int i,j,k,n; + double *prd; + + // volume-dependent factors + // adjust z dimension for 2d slab PPPM + // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0 + + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + volume = xprd * yprd * zprd_slab; + + delxinv = nx_pppm/xprd; + delyinv = ny_pppm/yprd; + delzinv = nz_pppm/zprd_slab; + + delvolinv = delxinv*delyinv*delzinv; + + unitkx = (MY_2PI/xprd); + unitky = (MY_2PI/yprd); + unitkz = (MY_2PI/zprd_slab); + + // d_fkx,d_fky,d_fkz for my FFT grid pts + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_setup1>(nxlo_fft,nxhi_fft+1),*this); + DeviceType::fence(); + copymode = 0; + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_setup2>(nylo_fft,nyhi_fft+1),*this); + DeviceType::fence(); + copymode = 0; + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_setup3>(nzlo_fft,nzhi_fft+1),*this); + DeviceType::fence(); + copymode = 0; + + // virial coefficients + + double sqk,vterm; + + // merge three outer loops into one for better threading + + numz_fft = nzhi_fft-nzlo_fft + 1; + numy_fft = nyhi_fft-nylo_fft + 1; + numx_fft = nxhi_fft-nxlo_fft + 1; + const int inum_fft = numz_fft*numy_fft*numx_fft; + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_setup4>(0,inum_fft),*this); + DeviceType::fence(); + copymode = 0; + + compute_gf_ik(); +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_setup1, const int &i) const +{ + double per = i - nx_pppm*(2*i/nx_pppm); + d_fkx[i-nxlo_fft] = unitkx*per; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_setup2, const int &i) const +{ + double per = i - ny_pppm*(2*i/ny_pppm); + d_fky[i-nylo_fft] = unitky*per; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_setup3, const int &i) const +{ + double per = i - nz_pppm*(2*i/nz_pppm); + d_fkz[i-nzlo_fft] = unitkz*per; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_setup4, const int &n) const +{ + const int k = n/(numy_fft*numx_fft); + const int j = (n - k*numy_fft*numx_fft) / numx_fft; + const int i = n - k*numy_fft*numx_fft - j*numx_fft; + const double sqk = d_fkx[i]*d_fkx[i] + d_fky[j]*d_fky[j] + d_fkz[k]*d_fkz[k]; + if (sqk == 0.0) { + d_vg(n,0) = 0.0; + d_vg(n,1) = 0.0; + d_vg(n,2) = 0.0; + d_vg(n,3) = 0.0; + d_vg(n,4) = 0.0; + d_vg(n,5) = 0.0; + } else { + const double vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald)); + d_vg(n,0) = 1.0 + vterm*d_fkx[i]*d_fkx[i]; + d_vg(n,1) = 1.0 + vterm*d_fky[j]*d_fky[j]; + d_vg(n,2) = 1.0 + vterm*d_fkz[k]*d_fkz[k]; + d_vg(n,3) = vterm*d_fkx[i]*d_fky[j]; + d_vg(n,4) = vterm*d_fkx[i]*d_fkz[k]; + d_vg(n,5) = vterm*d_fky[j]*d_fkz[k]; + } +} + +/* ---------------------------------------------------------------------- + adjust PPPM coeffs, called initially and whenever volume has changed + for a triclinic system +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::setup_triclinic() +{ +// int i,j,k,n; +// double *prd; +// +// // volume-dependent factors +// // adjust z dimension for 2d slab PPPM +// // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0 +// +// prd = domain->prd; +// +// double xprd = prd[0]; +// double yprd = prd[1]; +// double zprd = prd[2]; +// double zprd_slab = zprd*slab_volfactor; +// volume = xprd * yprd * zprd_slab; +// +// // use lamda (0-1) coordinates +// +// delxinv = nx_pppm; +// delyinv = ny_pppm; +// delzinv = nz_pppm; +// delvolinv = delxinv*delyinv*delzinv/volume; +// +// // d_fkx,d_fky,d_fkz for my FFT grid pts +// +// double per_i,per_j,per_k; +// +// n = 0; +// for (k = nzlo_fft; k <= nzhi_fft; k++) { // parallel_for +// per_k = k - nz_pppm*(2*k/nz_pppm); +// for (j = nylo_fft; j <= nyhi_fft; j++) { +// per_j = j - ny_pppm*(2*j/ny_pppm); +// for (i = nxlo_fft; i <= nxhi_fft; i++) { +// per_i = i - nx_pppm*(2*i/nx_pppm); +// +// double unitk_lamda[3]; +// unitk_lamda[0] = 2.0*MY_PI*per_i; +// unitk_lamda[1] = 2.0*MY_PI*per_j; +// unitk_lamda[2] = 2.0*MY_PI*per_k; +// x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]); +// d_fkx[n] = unitk_lamda[0]; +// d_fky[n] = unitk_lamda[1]; +// d_fkz[n] = unitk_lamda[2]; +// n++; +// } +// } +// } +// +// // virial coefficients +// +// double sqk,vterm; +// +// for (n = 0; n < nfft; n++) { // parallel_for +// sqk = d_fkx[n]*d_fkx[n] + d_fky[n]*d_fky[n] + d_fkz[n]*d_fkz[n]; +// if (sqk == 0.0) { +// d_vg(n,0) = 0.0; +// d_vg(n,1) = 0.0; +// d_vg(n,2) = 0.0; +// d_vg(n,3) = 0.0; +// d_vg(n,4) = 0.0; +// d_vg(n,5) = 0.0; +// } else { +// vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald)); +// d_vg(n,0) = 1.0 + vterm*d_fkx[n]*d_fkx[n]; +// d_vg(n,1) = 1.0 + vterm*d_fky[n]*d_fky[n]; +// d_vg(n,2) = 1.0 + vterm*d_fkz[n]*d_fkz[n]; +// d_vg(n,3) = vterm*d_fkx[n]*d_fky[n]; +// d_vg(n,4) = vterm*d_fkx[n]*d_fkz[n]; +// d_vg(n,5) = vterm*d_fky[n]*d_fkz[n]; +// } +// } +// +// compute_gf_ik_triclinic(); +} + +/* ---------------------------------------------------------------------- + reset local grid arrays and communication stencils + called by fix balance b/c it changed sizes of processor sub-domains +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::setup_grid() +{ + // free all arrays previously allocated + + deallocate(); + if (peratom_allocate_flag) deallocate_peratom(); + + // reset portion of global grid that each proc owns + + set_grid_local(); + + // reallocate K-space dependent memory + // check if grid communication is now overlapping if not allowed + // don't invoke allocate peratom(), will be allocated when needed + + allocate(); + + cg->ghost_notify(); + if (overlap_allowed == 0 && cg->ghost_overlap()) + error->all(FLERR,"PPPM grid stencil extends " + "beyond nearest neighbor processor"); + cg->setup(); + + // pre-compute Green's function denomiator expansion + // pre-compute 1d charge distribution coefficients + + compute_gf_denom(); + compute_rho_coeff(); + + // pre-compute volume-dependent coeffs + + setup(); +} + +/* ---------------------------------------------------------------------- + compute the PPPM long-range force, energy, virial +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::compute(int eflag, int vflag) +{ + int i,j; + + // set energy/virial flags + // invoke allocate_peratom() if needed for first time + + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = evflag_atom = eflag_global = vflag_global = + eflag_atom = vflag_atom = 0; + + // reallocate per-atom arrays if necessary + + if (eflag_atom) { + memory->destroy_kokkos(k_eatom,eatom); + memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom"); + d_eatom = k_eatom.view<DeviceType>(); + } + if (vflag_atom) { + memory->destroy_kokkos(k_vatom,vatom); + memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom"); + d_vatom = k_vatom.view<DeviceType>(); + } + + if (evflag_atom && !peratom_allocate_flag) { + allocate_peratom(); + cg_peratom->ghost_notify(); + cg_peratom->setup(); + } + + x = atomKK->k_x.view<DeviceType>(); + f = atomKK->k_f.view<DeviceType>(); + q = atomKK->k_q.view<DeviceType>(); + + atomKK->sync(execution_space,datamask_read); + atomKK->modified(execution_space,datamask_modify); + + //nlocal = atomKK->nlocal; + //nall = atomKK->nlocal + atomKK->nghost; + //newton_pair = force->newton_pair; + + // if atomKK count has changed, update qsum and qsqsum + + if (atomKK->natoms != natoms_original) { + qsum_qsq(); + natoms_original = atomKK->natoms; + } + + // return if there are no charges + + if (qsqsum == 0.0) return; + + // convert atoms from box to lamda coords + + if (triclinic == 0) { + boxlo[0] = domain->boxlo[0]; + boxlo[1] = domain->boxlo[1]; + boxlo[2] = domain->boxlo[2]; + } else { + boxlo[0] = domain->boxlo_lamda[0]; + boxlo[1] = domain->boxlo_lamda[1]; + boxlo[2] = domain->boxlo_lamda[2]; + domain->x2lamda(atomKK->nlocal); + } + + // extend size of per-atomKK arrays if necessary + + if (atomKK->nmax > nmax) { + //memory->destroy(part2grid); + nmax = atomKK->nmax; + //memory->create(part2grid,nmax,3,"pppm:part2grid"); + d_part2grid = typename AT::t_int_1d_3("pppm:part2grid",nmax); + d_rho1d = typename AT::t_FFT_SCALAR_2d_3("pppm:rho1d",nmax,order/2+order/2+1); + } + + // find grid points for all my particles + // map my particle charge onto my local 3d density grid + + particle_map(); + make_rho(); + + // all procs communicate density values from their ghost cells + // to fully sum contribution in their 3d bricks + // remap from 3d decomposition to FFT decomposition + + cg->reverse_comm(this,REVERSE_RHO); + brick2fft(); + + // compute potential gradient on my FFT grid and + // portion of e_long on this proc's FFT grid + // return gradients (electric fields) in 3d brick decomposition + // also performs per-atomKK calculations via poisson_peratom() + + poisson(); + + // all procs communicate E-field values + // to fill ghost cells surrounding their 3d bricks + + cg->forward_comm(this,FORWARD_IK); + + // extra per-atomKK energy/virial communication + + if (evflag_atom) + cg_peratom->forward_comm(this,FORWARD_IK_PERATOM); + + // calculate the force on my particles + + fieldforce(); + + // extra per-atomKK energy/virial communication + + if (evflag_atom) fieldforce_peratom(); + + // sum global energy across procs and add in volume-dependent term + + qscale = qqrd2e * scale; + + if (eflag_global) { + double energy_all; + MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); + energy = energy_all; + + energy *= 0.5*volume; + energy -= g_ewald*qsqsum/MY_PIS + + MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume); + energy *= qscale; + } + + // sum global virial across procs + + if (vflag_global) { + double virial_all[6]; + MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world); + for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i]; + } + + // per-atomKK energy/virial + // energy includes self-energy correction + // notal accounts for TIP4P tallying d_eatom/vatom for ghost atoms + + if (evflag_atom) { + int nlocal = atomKK->nlocal; + int ntotal = nlocal; + //if (tip4pflag) ntotal += atomKK->nghost; + + if (eflag_atom) { + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_self1>(0,nlocal),*this); + DeviceType::fence(); + copymode = 0; + //for (i = nlocal; i < ntotal; i++) d_eatom[i] *= 0.5*qscale; + } + + if (vflag_atom) { + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_self2>(0,ntotal),*this); + DeviceType::fence(); + copymode = 0; + } + } + + // 2d slab correction + + if (slabflag == 1) slabcorr(); + + // convert atoms back from lamda to box coords + + if (triclinic) domain->lamda2x(atomKK->nlocal); + + if (eflag_atom) { + k_eatom.template modify<DeviceType>(); + k_eatom.template sync<LMPHostType>(); + } + + if (vflag_atom) { + k_vatom.template modify<DeviceType>(); + k_vatom.template sync<LMPHostType>(); + } +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_self1, const int &i) const +{ + d_eatom[i] *= 0.5; + d_eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum / + (g_ewald*g_ewald*volume); + d_eatom[i] *= qscale; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_self2, const int &i) const +{ + for (int j = 0; j < 6; j++) d_vatom(i,j) *= 0.5*qscale; +} + +/* ---------------------------------------------------------------------- + allocate memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::allocate() +{ + d_density_brick = typename AT::t_FFT_SCALAR_3d("pppm:density_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1); + + memory->create_kokkos(k_density_fft,density_fft,nfft_both,"pppm:d_density_fft"); + d_density_fft = k_density_fft.view<DeviceType>(); + + d_greensfn = typename AT::t_float_1d("pppm:greensfn",nfft_both); + memory->create_kokkos(k_work1,work1,2*nfft_both,"pppm:work1"); + memory->create_kokkos(k_work2,work2,2*nfft_both,"pppm:work2"); + d_work1 = k_work1.view<DeviceType>(); + d_work2 = k_work2.view<DeviceType>(); + d_vg = typename AT::t_virial_array("pppm:vg",nfft_both); + + if (triclinic == 0) { + d_fkx = typename AT::t_float_1d("pppm:d_fkx",nxhi_fft-nxlo_fft+1); + d_fky = typename AT::t_float_1d("pppm:d_fky",nyhi_fft-nylo_fft+1); + d_fkz = typename AT::t_float_1d("pppm:d_fkz",nzhi_fft-nzlo_fft+1); + } else { + d_fkx = typename AT::t_float_1d("pppm:d_fkx",nfft_both); + d_fky = typename AT::t_float_1d("pppm:d_fky",nfft_both); + d_fkz = typename AT::t_float_1d("pppm:d_fkz",nfft_both); + } + + d_vdx_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_vdx_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1); + d_vdy_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_vdy_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1); + d_vdz_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_vdz_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1); + + // summation coeffs + + order_allocated = order; + k_gf_b = typename DAT::tdual_float_1d("pppm:gf_b",order); + d_gf_b = k_gf_b.view<DeviceType>(); + d_rho1d = typename AT::t_FFT_SCALAR_2d_3("pppm:rho1d",nmax,order/2+order/2+1); + k_rho_coeff = DAT::tdual_FFT_SCALAR_2d("pppm:rho_coeff",order,order/2-(1-order)/2+1); + d_rho_coeff = k_rho_coeff.view<DeviceType>(); + h_rho_coeff = k_rho_coeff.h_view; + + // create 2 FFTs and a Remap + // 1st FFT keeps data in FFT decompostion + // 2nd FFT returns data in 3d brick decomposition + // remap takes data from 3d brick to FFT decomposition + + int tmp; + + fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + 0,0,&tmp,collective_flag); + + fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + 0,0,&tmp,collective_flag); + remap = new Remap(lmp,world, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + 1,0,0,FFT_PRECISION,collective_flag); + + // create ghost grid object for rho and electric field communication + + int (*procneigh)[2] = comm->procneigh; + + cg = new GridCommKokkos<DeviceType>(lmp,world,3,1, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out, + procneigh[0][0],procneigh[0][1],procneigh[1][0], + procneigh[1][1],procneigh[2][0],procneigh[2][1]); +} + +/* ---------------------------------------------------------------------- + deallocate memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::deallocate() +{ + memory->destroy_kokkos(d_density_fft,density_fft); + density_fft = NULL; + memory->destroy_kokkos(d_greensfn,greensfn); + greensfn = NULL; + memory->destroy_kokkos(d_work1,work1); + work1 = NULL; + memory->destroy_kokkos(d_work2,work2); + work2 = NULL; + + delete fft1; + fft1 = NULL; + delete fft2; + fft2 = NULL; + delete remap; + remap = NULL; + delete cg; + cg = NULL; +} + +/* ---------------------------------------------------------------------- + allocate per-atomKK memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::allocate_peratom() +{ + peratom_allocate_flag = 1; + + d_u_brick = typename AT::t_FFT_SCALAR_3d("pppm:u_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1); + + d_v0_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_v0_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1); + d_v1_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_v1_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1); + d_v2_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_v2_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1); + d_v3_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_v3_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1); + d_v4_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_v4_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1); + d_v5_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_v5_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1); + + + // create ghost grid object for rho and electric field communication + + int (*procneigh)[2] = comm->procneigh; + + cg_peratom = + new GridCommKokkos<DeviceType>(lmp,world,7,1, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out, + procneigh[0][0],procneigh[0][1],procneigh[1][0], + procneigh[1][1],procneigh[2][0],procneigh[2][1]); +} + +/* ---------------------------------------------------------------------- + deallocate per-atomKK memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::deallocate_peratom() +{ + peratom_allocate_flag = 0; + + delete cg_peratom; + cg_peratom = NULL; +} + +/* ---------------------------------------------------------------------- + set global size of PPPM grid = nx,ny,nz_pppm + used for charge accumulation, FFTs, and electric field interpolation +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::set_grid_global() +{ + // use xprd,yprd,zprd (even if triclinic, and then scale later) + // adjust z dimension for 2d slab PPPM + // 3d PPPM just uses zprd since slab_volfactor = 1.0 + + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + double zprd_slab = zprd*slab_volfactor; + + // make initial g_ewald estimate + // based on desired accuracy and real space cutoff + // fluid-occupied volume used to estimate real-space error + // zprd used rather than zprd_slab + + double h; + bigint natoms = atomKK->natoms; + + if (!gewaldflag) { + if (accuracy <= 0.0) + error->all(FLERR,"KSpace accuracy must be > 0"); + g_ewald = accuracy*sqrt(natoms*cutoff*xprd*yprd*zprd) / (2.0*q2); + if (g_ewald >= 1.0) g_ewald = (1.35 - 0.15*log(accuracy))/cutoff; + else g_ewald = sqrt(-log(g_ewald)) / cutoff; + } + + // set optimal nx_pppm,ny_pppm,nz_pppm based on order and accuracy + // nz_pppm uses extended zprd_slab instead of zprd + // reduce it until accuracy target is met + + if (!gridflag) { + + double err; + h_x = h_y = h_z = 1.0/g_ewald; + + nx_pppm = static_cast<int> (xprd/h_x) + 1; + ny_pppm = static_cast<int> (yprd/h_y) + 1; + nz_pppm = static_cast<int> (zprd_slab/h_z) + 1; + + err = estimate_ik_error(h_x,xprd,natoms); + while (err > accuracy) { + err = estimate_ik_error(h_x,xprd,natoms); + nx_pppm++; + h_x = xprd/nx_pppm; + } + + err = estimate_ik_error(h_y,yprd,natoms); + while (err > accuracy) { + err = estimate_ik_error(h_y,yprd,natoms); + ny_pppm++; + h_y = yprd/ny_pppm; + } + + err = estimate_ik_error(h_z,zprd_slab,natoms); + while (err > accuracy) { + err = estimate_ik_error(h_z,zprd_slab,natoms); + nz_pppm++; + h_z = zprd_slab/nz_pppm; + } + + // scale grid for triclinic skew + + if (triclinic) { + double tmp[3]; + tmp[0] = nx_pppm/xprd; + tmp[1] = ny_pppm/yprd; + tmp[2] = nz_pppm/zprd; + lamda2xT(&tmp[0],&tmp[0]); + nx_pppm = static_cast<int>(tmp[0]) + 1; + ny_pppm = static_cast<int>(tmp[1]) + 1; + nz_pppm = static_cast<int>(tmp[2]) + 1; + } + } + + // boost grid size until it is factorable + + while (!factorable(nx_pppm)) nx_pppm++; + while (!factorable(ny_pppm)) ny_pppm++; + while (!factorable(nz_pppm)) nz_pppm++; + + if (triclinic == 0) { + h_x = xprd/nx_pppm; + h_y = yprd/ny_pppm; + h_z = zprd_slab/nz_pppm; + } else { + double tmp[3]; + tmp[0] = nx_pppm; + tmp[1] = ny_pppm; + tmp[2] = nz_pppm; + x2lamdaT(&tmp[0],&tmp[0]); + h_x = 1.0/tmp[0]; + h_y = 1.0/tmp[1]; + h_z = 1.0/tmp[2]; + } + + if (nx_pppm >= OFFSET || ny_pppm >= OFFSET || nz_pppm >= OFFSET) + error->all(FLERR,"PPPM grid is too large"); +} + +/* ---------------------------------------------------------------------- + check if all factors of n are in list of factors + return 1 if yes, 0 if no +------------------------------------------------------------------------- */ + +template<class DeviceType> +int PPPMKokkos<DeviceType>::factorable(int n) +{ + int i; + + while (n > 1) { + for (i = 0; i < nfactors; i++) { + if (n % factors[i] == 0) { + n /= factors[i]; + break; + } + } + if (i == nfactors) return 0; + } + + return 1; +} + +/* ---------------------------------------------------------------------- + compute estimated kspace force error +------------------------------------------------------------------------- */ + +template<class DeviceType> +double PPPMKokkos<DeviceType>::compute_df_kspace() +{ + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + double zprd_slab = zprd*slab_volfactor; + bigint natoms = atomKK->natoms; + double df_kspace = 0.0; + double lprx = estimate_ik_error(h_x,xprd,natoms); + double lpry = estimate_ik_error(h_y,yprd,natoms); + double lprz = estimate_ik_error(h_z,zprd_slab,natoms); + df_kspace = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0); + return df_kspace; +} + +/* ---------------------------------------------------------------------- + estimate kspace force error for ik method +------------------------------------------------------------------------- */ + +template<class DeviceType> +double PPPMKokkos<DeviceType>::estimate_ik_error(double h, double prd, bigint natoms) +{ + double sum = 0.0; + for (int m = 0; m < order; m++) + sum += acons(order,m) * pow(h*g_ewald,2.0*m); + double value = q2 * pow(h*g_ewald,(double)order) * + sqrt(g_ewald*prd*sqrt(MY_2PI)*sum/natoms) / (prd*prd); + + return value; +} + +/* ---------------------------------------------------------------------- + adjust the g_ewald parameter to near its optimal value + using a Newton-Raphson solver +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::adjust_gewald() +{ + double dx; + + for (int i = 0; i < LARGE; i++) { + dx = newton_raphson_f() / derivf(); + g_ewald -= dx; + if (fabs(newton_raphson_f()) < SMALL) return; + } + + char str[128]; + sprintf(str, "Could not compute g_ewald"); + error->all(FLERR, str); +} + +/* ---------------------------------------------------------------------- + calculate f(x) using Newton-Raphson solver +------------------------------------------------------------------------- */ + +template<class DeviceType> +double PPPMKokkos<DeviceType>::newton_raphson_f() +{ + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + bigint natoms = atomKK->natoms; + + double df_rspace = 2.0*q2*exp(-g_ewald*g_ewald*cutoff*cutoff) / + sqrt(natoms*cutoff*xprd*yprd*zprd); + + double df_kspace = compute_df_kspace(); + + return df_rspace - df_kspace; +} + +/* ---------------------------------------------------------------------- + calculate numerical derivative f'(x) using forward difference + [f(x + h) - f(x)] / h +------------------------------------------------------------------------- */ + +template<class DeviceType> +double PPPMKokkos<DeviceType>::derivf() +{ + double h = 0.000001; //Derivative step-size + double df,f1,f2,g_ewald_old; + + f1 = newton_raphson_f(); + g_ewald_old = g_ewald; + g_ewald += h; + f2 = newton_raphson_f(); + g_ewald = g_ewald_old; + df = (f2 - f1)/h; + + return df; +} + +/* ---------------------------------------------------------------------- + calculate the final estimate of the accuracy +------------------------------------------------------------------------- */ + +template<class DeviceType> +double PPPMKokkos<DeviceType>::final_accuracy() +{ + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + bigint natoms = atomKK->natoms; + + double df_kspace = compute_df_kspace(); + double q2_over_sqrt = q2 / sqrt(natoms*cutoff*xprd*yprd*zprd); + double df_rspace = 2.0 * q2_over_sqrt * exp(-g_ewald*g_ewald*cutoff*cutoff); + double df_table = estimate_table_accuracy(q2_over_sqrt,df_rspace); + double estimated_accuracy = sqrt(df_kspace*df_kspace + df_rspace*df_rspace + + df_table*df_table); + + return estimated_accuracy; +} + +/* ---------------------------------------------------------------------- + set local subset of PPPM/FFT grid that I own + n xyz lo/hi in = 3d brick that I own (inclusive) + n xyz lo/hi out = 3d brick + ghost cells in 6 directions (inclusive) + n xyz lo/hi fft = FFT columns that I own (all of x dim, 2d decomp in yz) +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::set_grid_local() +{ + // global indices of PPPM grid range from 0 to N-1 + // nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of + // global PPPM grid that I own without ghost cells + // for slab PPPM, assign z grid as if it were not extended + + nxlo_in = static_cast<int> (comm->xsplit[comm->myloc[0]] * nx_pppm); + nxhi_in = static_cast<int> (comm->xsplit[comm->myloc[0]+1] * nx_pppm) - 1; + + nylo_in = static_cast<int> (comm->ysplit[comm->myloc[1]] * ny_pppm); + nyhi_in = static_cast<int> (comm->ysplit[comm->myloc[1]+1] * ny_pppm) - 1; + + nzlo_in = static_cast<int> + (comm->zsplit[comm->myloc[2]] * nz_pppm/slab_volfactor); + nzhi_in = static_cast<int> + (comm->zsplit[comm->myloc[2]+1] * nz_pppm/slab_volfactor) - 1; + + // nlower,nupper = stencil size for mapping particles to PPPM grid + + nlower = -(order-1)/2; + nupper = order/2; + + // shift values for particle <-> grid mapping + // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 + + if (order % 2) shift = OFFSET + 0.5; + else shift = OFFSET; + if (order % 2) shiftone = 0.0; + else shiftone = 0.5; + + // nlo_out,nhi_out = lower/upper limits of the 3d sub-brick of + // global PPPM grid that my particles can contribute charge to + // effectively nlo_in,nhi_in + ghost cells + // nlo,nhi = global coords of grid pt to "lower left" of smallest/largest + // position a particle in my box can be at + // dist[3] = particle position bound = subbox + skin/2.0 + qdist + // qdist = offset due to TIP4P fictitious charge + // convert to triclinic if necessary + // nlo_out,nhi_out = nlo,nhi + stencil size for particle mapping + // for slab PPPM, assign z grid as if it were not extended + + double *prd,*sublo,*subhi; + + if (triclinic == 0) { + prd = domain->prd; + boxlo[0] = domain->boxlo[0]; + boxlo[1] = domain->boxlo[1]; + boxlo[2] = domain->boxlo[2]; + sublo = domain->sublo; + subhi = domain->subhi; + } else { + prd = domain->prd_lamda; + boxlo[0] = domain->boxlo_lamda[0]; + boxlo[1] = domain->boxlo_lamda[1]; + boxlo[2] = domain->boxlo_lamda[2]; + domain->x2lamda(atomKK->nlocal); + sublo = domain->sublo_lamda; + subhi = domain->subhi_lamda; + } + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + + double dist[3]; + double cuthalf = 0.5*neighbor->skin + qdist; + if (triclinic == 0) dist[0] = dist[1] = dist[2] = cuthalf; + else kspacebbox(cuthalf,&dist[0]); + + int nlo,nhi; + + nlo = static_cast<int> ((sublo[0]-dist[0]-boxlo[0]) * + nx_pppm/xprd + shift) - OFFSET; + nhi = static_cast<int> ((subhi[0]+dist[0]-boxlo[0]) * + nx_pppm/xprd + shift) - OFFSET; + nxlo_out = nlo + nlower; + nxhi_out = nhi + nupper; + + nlo = static_cast<int> ((sublo[1]-dist[1]-boxlo[1]) * + ny_pppm/yprd + shift) - OFFSET; + nhi = static_cast<int> ((subhi[1]+dist[1]-boxlo[1]) * + ny_pppm/yprd + shift) - OFFSET; + nylo_out = nlo + nlower; + nyhi_out = nhi + nupper; + + nlo = static_cast<int> ((sublo[2]-dist[2]-boxlo[2]) * + nz_pppm/zprd_slab + shift) - OFFSET; + nhi = static_cast<int> ((subhi[2]+dist[2]-boxlo[2]) * + nz_pppm/zprd_slab + shift) - OFFSET; + nzlo_out = nlo + nlower; + nzhi_out = nhi + nupper; + + // for slab PPPM, change the grid boundary for processors at +z end + // to include the empty volume between periodically repeating slabs + // for slab PPPM, want charge data communicated from -z proc to +z proc, + // but not vice versa, also want field data communicated from +z proc to + // -z proc, but not vice versa + // this is accomplished by nzhi_in = nzhi_out on +z end (no ghost cells) + // also insure no other procs use ghost cells beyond +z limit + + if (slabflag == 1) { + if (comm->myloc[2] == comm->procgrid[2]-1) + nzhi_in = nzhi_out = nz_pppm - 1; + nzhi_out = MIN(nzhi_out,nz_pppm-1); + } + + // decomposition of FFT mesh + // global indices range from 0 to N-1 + // proc owns entire x-dimension, clumps of columns in y,z dimensions + // npey_fft,npez_fft = # of procs in y,z dims + // if nprocs is small enough, proc can own 1 or more entire xy planes, + // else proc owns 2d sub-blocks of yz plane + // me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions + // nlo_fft,nhi_fft = lower/upper limit of the section + // of the global FFT mesh that I own + + int npey_fft,npez_fft; + if (nz_pppm >= nprocs) { + npey_fft = 1; + npez_fft = nprocs; + } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft); + + int me_y = me % npey_fft; + int me_z = me / npey_fft; + + nxlo_fft = 0; + nxhi_fft = nx_pppm - 1; + nylo_fft = me_y*ny_pppm/npey_fft; + nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1; + nzlo_fft = me_z*nz_pppm/npez_fft; + nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1; + + // PPPM grid pts owned by this proc, including ghosts + + ngrid = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * + (nzhi_out-nzlo_out+1); + + // FFT grids owned by this proc, without ghosts + // nfft = FFT points in FFT decomposition on this proc + // nfft_brick = FFT points in 3d brick-decomposition on this proc + // nfft_both = greater of 2 values + + nfft = (nxhi_fft-nxlo_fft+1) * (nyhi_fft-nylo_fft+1) * + (nzhi_fft-nzlo_fft+1); + int nfft_brick = (nxhi_in-nxlo_in+1) * (nyhi_in-nylo_in+1) * + (nzhi_in-nzlo_in+1); + nfft_both = MAX(nfft,nfft_brick); +} + +/* ---------------------------------------------------------------------- + pre-compute Green's function denominator expansion coeffs, Gamma(2n) +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::compute_gf_denom() +{ + int k,l,m; + + for (l = 1; l < order; l++) k_gf_b.h_view[l] = 0.0; + k_gf_b.h_view[0] = 1.0; + + for (m = 1; m < order; m++) { + for (l = m; l > 0; l--) + k_gf_b.h_view[l] = 4.0 * (k_gf_b.h_view[l]*(l-m)*(l-m-0.5)-k_gf_b.h_view[l-1]*(l-m-1)*(l-m-1)); + k_gf_b.h_view[0] = 4.0 * (k_gf_b.h_view[0]*(l-m)*(l-m-0.5)); + } + + bigint ifact = 1; + for (k = 1; k < 2*order; k++) ifact *= k; + double gaminv = 1.0/ifact; + for (l = 0; l < order; l++) k_gf_b.h_view[l] *= gaminv; + + k_gf_b.template modify<LMPHostType>(); + k_gf_b.template sync<DeviceType>(); +} + +/* ---------------------------------------------------------------------- + pre-compute modified (Hockney-Eastwood) Coulomb Green's function +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::compute_gf_ik() +{ + const double * const prd = domain->prd; + + xprd = prd[0]; + yprd = prd[1]; + const double zprd = prd[2]; + zprd_slab = zprd*slab_volfactor; + unitkx = (MY_2PI/xprd); + unitky = (MY_2PI/yprd); + unitkz = (MY_2PI/zprd_slab); + + nbx = static_cast<int> ((g_ewald*xprd/(MY_PI*nx_pppm)) * + pow(-log(EPS_HOC),0.25)); + nby = static_cast<int> ((g_ewald*yprd/(MY_PI*ny_pppm)) * + pow(-log(EPS_HOC),0.25)); + nbz = static_cast<int> ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) * + pow(-log(EPS_HOC),0.25)); + twoorder = 2*order; + + // merge three outer loops into one for better threading + + numz_fft = nzhi_fft-nzlo_fft + 1; + numy_fft = nyhi_fft-nylo_fft + 1; + numx_fft = nxhi_fft-nxlo_fft + 1; + const int inum_fft = numz_fft*numy_fft*numx_fft; + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_compute_gf_ik>(0,inum_fft),*this); + DeviceType::fence(); + copymode = 0; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_compute_gf_ik, const int &n) const +{ + int m = n/(numy_fft*numx_fft); + int l = (n - m*numy_fft*numx_fft) / numx_fft; + int k = n - m*numy_fft*numx_fft - l*numx_fft; + m += nzlo_fft; + l += nylo_fft; + k += nxlo_fft; + + const int mper = m - nz_pppm*(2*m/nz_pppm); + const double snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm)); + + const int lper = l - ny_pppm*(2*l/ny_pppm); + const double sny = square(sin(0.5*unitky*lper*yprd/ny_pppm)); + + const int kper = k - nx_pppm*(2*k/nx_pppm); + const double snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm)); + + const double sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper); + + if (sqk != 0.0) { + const double numerator = 12.5663706/sqk; + const double denominator = gf_denom(snx,sny,snz); + double sum1 = 0.0; + + for (int nx = -nbx; nx <= nbx; nx++) { + const double qx = unitkx*(kper+nx_pppm*nx); + const double sx = exp(-0.25*square(qx/g_ewald)); + const double argx = 0.5*qx*xprd/nx_pppm; + const double wx = powsinxx(argx,twoorder); + + for (int ny = -nby; ny <= nby; ny++) { + const double qy = unitky*(lper+ny_pppm*ny); + const double sy = exp(-0.25*square(qy/g_ewald)); + const double argy = 0.5*qy*yprd/ny_pppm; + const double wy = powsinxx(argy,twoorder); + + for (int nz = -nbz; nz <= nbz; nz++) { + const double qz = unitkz*(mper+nz_pppm*nz); + const double sz = exp(-0.25*square(qz/g_ewald)); + const double argz = 0.5*qz*zprd_slab/nz_pppm; + const double wz = powsinxx(argz,twoorder); + + const double dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; + const double dot2 = qx*qx+qy*qy+qz*qz; + sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz; + } + } + } + d_greensfn[n] = numerator*sum1/denominator; + } else d_greensfn[n] = 0.0; +} + +/* ---------------------------------------------------------------------- + pre-compute modified (Hockney-Eastwood) Coulomb Green's function + for a triclinic system +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::compute_gf_ik_triclinic() +{ + double tmp[3]; + tmp[0] = (g_ewald/(MY_PI*nx_pppm)) * pow(-log(EPS_HOC),0.25); + tmp[1] = (g_ewald/(MY_PI*ny_pppm)) * pow(-log(EPS_HOC),0.25); + tmp[2] = (g_ewald/(MY_PI*nz_pppm)) * pow(-log(EPS_HOC),0.25); + lamda2xT(&tmp[0],&tmp[0]); + nbx = static_cast<int> (tmp[0]); + nby = static_cast<int> (tmp[1]); + nbz = static_cast<int> (tmp[2]); + + twoorder = 2*order; + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_compute_gf_ik_triclinic>(nzlo_fft,nzhi_fft+1),*this); + DeviceType::fence(); + copymode = 0; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_compute_gf_ik_triclinic, const int &m) const +{ + //int n = (m - nzlo_fft)*(nyhi_fft+1 - nylo_fft)*(nxhi_fft+1 - nxlo_fft); + // + //const int mper = m - nz_pppm*(2*m/nz_pppm); + //const double snz = square(sin(MY_PI*mper/nz_pppm)); + // + //for (int l = nylo_fft; l <= nyhi_fft; l++) { + // const int lper = l - ny_pppm*(2*l/ny_pppm); + // const double sny = square(sin(MY_PI*lper/ny_pppm)); + // + // for (int k = nxlo_fft; k <= nxhi_fft; k++) { + // const int kper = k - nx_pppm*(2*k/nx_pppm); + // const double snx = square(sin(MY_PI*kper/nx_pppm)); + // + // double unitk_lamda[3]; + // unitk_lamda[0] = 2.0*MY_PI*kper; + // unitk_lamda[1] = 2.0*MY_PI*lper; + // unitk_lamda[2] = 2.0*MY_PI*mper; + // x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]); + // + // const double sqk = square(unitk_lamda[0]) + square(unitk_lamda[1]) + square(unitk_lamda[2]); + // + // if (sqk != 0.0) { + // const double numerator = 12.5663706/sqk; + // const double denominator = gf_denom(snx,sny,snz); + // double sum1 = 0.0; + // + // for (int nx = -nbx; nx <= nbx; nx++) { + // const double argx = MY_PI*kper/nx_pppm + MY_PI*nx; + // const double wx = powsinxx(argx,twoorder); + // + // for (int ny = -nby; ny <= nby; ny++) { + // const double argy = MY_PI*lper/ny_pppm + MY_PI*ny; + // const double wy = powsinxx(argy,twoorder); + // + // for (int nz = -nbz; nz <= nbz; nz++) { + // const double argz = MY_PI*mper/nz_pppm + MY_PI*nz; + // const double wz = powsinxx(argz,twoorder); + // + // double b[3]; + // b[0] = 2.0*MY_PI*nx_pppm*nx; + // b[1] = 2.0*MY_PI*ny_pppm*ny; + // b[2] = 2.0*MY_PI*nz_pppm*nz; + // x2lamdaT(&b[0],&b[0]); + // + // const double qx = unitk_lamda[0]+b[0]; + // const double sx = exp(-0.25*square(qx/g_ewald)); + // + // const double qy = unitk_lamda[1]+b[1]; + // const double sy = exp(-0.25*square(qy/g_ewald)); + // + // const double qz = unitk_lamda[2]+b[2]; + // const double sz = exp(-0.25*square(qz/g_ewald)); + // + // const double dot1 = unitk_lamda[0]*qx + unitk_lamda[1]*qy + unitk_lamda[2]*qz; + // const double dot2 = qx*qx+qy*qy+qz*qz; + // sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz; + // } + // } + // } + // d_greensfn[n++] = numerator*sum1/denominator; + // } else d_greensfn[n++] = 0.0; + // } + //} +} + +/* ---------------------------------------------------------------------- + find center grid pt for each of my particles + check that full stencil for the particle will fit in my 3d brick + store central grid pt indices in part2grid array +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::particle_map() +{ + int nlocal = atomKK->nlocal; + + k_flag.h_view() = 0; + k_flag.template modify<LMPHostType>(); + k_flag.template sync<DeviceType>(); + + if (!ISFINITE(boxlo[0]) || !ISFINITE(boxlo[1]) || !ISFINITE(boxlo[2])) + error->one(FLERR,"Non-numeric box dimensions - simulation unstable"); + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_particle_map>(0,nlocal),*this); + DeviceType::fence(); + copymode = 0; + + k_flag.template modify<DeviceType>(); + k_flag.template sync<LMPHostType>(); + if (k_flag.h_view()) error->one(FLERR,"Out of range atoms - cannot compute PPPM"); +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_particle_map, const int &i) const +{ + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // current particle coord can be outside global and local box + // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 + + const int nx = static_cast<int> ((x(i,0)-boxlo[0])*delxinv+shift) - OFFSET; + const int ny = static_cast<int> ((x(i,1)-boxlo[1])*delyinv+shift) - OFFSET; + const int nz = static_cast<int> ((x(i,2)-boxlo[2])*delzinv+shift) - OFFSET; + + d_part2grid(i,0) = nx; + d_part2grid(i,1) = ny; + d_part2grid(i,2) = nz; + + // check that entire stencil around nx,ny,nz will fit in my 3d brick + + if (nx+nlower < nxlo_out || nx+nupper > nxhi_out || + ny+nlower < nylo_out || ny+nupper > nyhi_out || + nz+nlower < nzlo_out || nz+nupper > nzhi_out) + k_flag.view<DeviceType>()() = 1; +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = charge "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::make_rho() +{ + int l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + + // clear 3d density array + + //memset(&(density_brick(nzlo_out,nylo_out,nxlo_out)),0, + // ngrid*sizeof(FFT_SCALAR)); + numz_out = nzhi_out-nzlo_out + 1; + numy_out = nyhi_out-nylo_out + 1; + numx_out = nxhi_out-nxlo_out + 1; + const int inum_out = numz_out*numy_out*numx_out; + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_make_rho_zero>(0,inum_out),*this); + DeviceType::fence(); + copymode = 0; + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + nlocal = atomKK->nlocal; + +#ifdef KOKKOS_HAVE_CUDA + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_make_rho_atomic>(0,nlocal),*this); + DeviceType::fence(); + copymode = 0; +#else + ix = nxhi_out-nxlo_out + 1; + iy = nyhi_out-nylo_out + 1; + + copymode = 1; + Kokkos::TeamPolicy<DeviceType, TagPPPM_make_rho> config(lmp->kokkos->num_threads,1); + Kokkos::parallel_for(config,*this); + DeviceType::fence(); + copymode = 0; +#endif +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_make_rho_zero, const int &ii) const +{ + int iz = ii/(numy_out*numx_out); + int iy = (ii - iz*numy_out*numx_out) / numx_out; + int ix = ii - iz*numy_out*numx_out - iy*numx_out; + d_density_brick(iz,iy,ix) = 0.0; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_make_rho_atomic, const int &i) const +{ + // The density_brick array is atomic for Half/Thread neighbor style + Kokkos::View<FFT_SCALAR***,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Atomic|Kokkos::Unmanaged> > a_density_brick = d_density_brick; + + int nx = d_part2grid(i,0); + int ny = d_part2grid(i,1); + int nz = d_part2grid(i,2); + const FFT_SCALAR dx = nx+shiftone - (x(i,0)-boxlo[0])*delxinv; + const FFT_SCALAR dy = ny+shiftone - (x(i,1)-boxlo[1])*delyinv; + const FFT_SCALAR dz = nz+shiftone - (x(i,2)-boxlo[2])*delzinv; + + + nz -= nzlo_out; + ny -= nylo_out; + nx -= nxlo_out; + + compute_rho1d(i,dx,dy,dz); + + const FFT_SCALAR z0 = delvolinv * q[i]; + for (int n = nlower; n <= nupper; n++) { + const int mz = n+nz; + const FFT_SCALAR y0 = z0*d_rho1d(i,n+order/2,2); + for (int m = nlower; m <= nupper; m++) { + const int my = m+ny; + const FFT_SCALAR x0 = y0*d_rho1d(i,m+order/2,1); + for (int l = nlower; l <= nupper; l++) { + const int mx = l+nx; + a_density_brick(mz,my,mx) += x0*d_rho1d(i,l+order/2,0); + } + } + } +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator() (TagPPPM_make_rho, typename Kokkos::TeamPolicy<DeviceType, TagPPPM_make_rho>::member_type dev) const { + // adapted from USER-OMP/pppm.cpp: + + // determine range of grid points handled by this thread + int tid = dev.league_rank(); + // each thread works on a fixed chunk of grid points + const int nthreads = dev.league_size(); + const int idelta = 1 + ngrid/nthreads; + int ifrom = tid*idelta; + int ito = ((ifrom + idelta) > ngrid) ? ngrid : ifrom + idelta; + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + + // loop over all local atoms for all threads + for (int i = 0; i < nlocal; i++) { + + int nx = d_part2grid(i,0); + int ny = d_part2grid(i,1); + int nz = d_part2grid(i,2); + + // pre-screen whether this atom will ever come within + // reach of the data segement this thread is updating. + if ( ((nz+nlower-nzlo_out)*ix*iy >= ito) + || ((nz+nupper-nzlo_out+1)*ix*iy < ifrom) ) continue; + + const FFT_SCALAR dx = nx+shiftone - (x(i,0)-boxlo[0])*delxinv; + const FFT_SCALAR dy = ny+shiftone - (x(i,1)-boxlo[1])*delyinv; + const FFT_SCALAR dz = nz+shiftone - (x(i,2)-boxlo[2])*delzinv; + + nz -= nzlo_out; + ny -= nylo_out; + nx -= nxlo_out; + + compute_rho1d(i,dx,dy,dz); + + const FFT_SCALAR z0 = delvolinv * q[i]; + for (int n = nlower; n <= nupper; n++) { + const int mz = n+nz; + const int in = mz*ix*iy; + const FFT_SCALAR y0 = z0*d_rho1d(i,n+order/2,2); + for (int m = nlower; m <= nupper; m++) { + const int my = m+ny; + const int im = in+my*ix; + const FFT_SCALAR x0 = y0*d_rho1d(i,m+order/2,1); + for (int l = nlower; l <= nupper; l++) { + const int mx = l+nx; + const int il = im+mx; + // make sure each thread only updates + // their elements of the density grid + if (il >= ito) break; + if (il < ifrom) continue; + d_density_brick(mz,my,mx) += x0*d_rho1d(i,l+order/2,0); + } + } + } + } +} + +/* ---------------------------------------------------------------------- + remap density from 3d brick decomposition to FFT decomposition +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::brick2fft() +{ + // copy grabs inner portion of density from 3d brick + // remap could be done as pre-stage of FFT, + // but this works optimally on only double values, not complex values + + numz_inout = (nzhi_in-nzlo_out)-(nzlo_in-nzlo_out) + 1; + numy_inout = (nyhi_in-nylo_out)-(nylo_in-nylo_out) + 1; + numx_inout = (nxhi_in-nxlo_out)-(nxlo_in-nxlo_out) + 1; + const int inum_inout = numz_inout*numy_inout*numx_inout; + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_brick2fft>(0,inum_inout),*this); + DeviceType::fence(); + copymode = 0; + + k_density_fft.template modify<DeviceType>(); + k_density_fft.template sync<LMPHostType>(); + remap->perform(density_fft,density_fft,work1); + k_density_fft.template modify<LMPHostType>(); + k_density_fft.template sync<DeviceType>(); + +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_brick2fft, const int &ii) const +{ + const int n = ii; + int k = ii/(numy_inout*numx_inout); + int j = (ii - k*numy_inout*numx_inout) / numx_inout; + int i = ii - k*numy_inout*numx_inout - j*numx_inout; + k += nzlo_in-nzlo_out; + j += nylo_in-nylo_out; + i += nxlo_in-nxlo_out; + d_density_fft[n] = d_density_brick(k,j,i); +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::poisson() +{ + poisson_ik(); +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for ik +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::poisson_ik() +{ + int i,j,k,n; + double eng; + + // transform charge density (r -> k) + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik1>(0,nfft),*this); + DeviceType::fence(); + copymode = 0; + + k_work1.template modify<DeviceType>(); + k_work1.template sync<LMPHostType>(); + fft1->compute(work1,work1,1); + k_work1.template modify<LMPHostType>(); + k_work1.template sync<DeviceType>(); + + + // global energy and virial contribution + + scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm); + s2 = scaleinv*scaleinv; + + if (eflag_global || vflag_global) { + EV_FLOAT ev; + if (vflag_global) { + copymode = 1; + Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik2>(0,nfft),*this,ev); + DeviceType::fence(); + copymode = 0; + for (j = 0; j < 6; j++) virial[j] += ev.v[j]; + energy += ev.ecoul; + } else { + n = 0; + copymode = 1; + Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik3>(0,nfft),*this,ev); + DeviceType::fence(); + copymode = 0; + energy += ev.ecoul; + } + } + + // scale by 1/total-grid-pts to get rho(k) + // multiply by Green's function to get V(k) + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik4>(0,nfft),*this); + DeviceType::fence(); + copymode = 0; + + // extra FFTs for per-atomKK energy/virial + + if (evflag_atom) poisson_peratom(); + + // triclinic system + + if (triclinic) { + poisson_ik_triclinic(); + return; + } + + // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k) + // FFT leaves data in 3d brick decomposition + // copy it into inner portion of vdx,vdy,vdz arrays + + // merge three outer loops into one for better threading + + numz_fft = nzhi_fft-nzlo_fft + 1; + numy_fft = nyhi_fft-nylo_fft + 1; + numx_fft = nxhi_fft-nxlo_fft + 1; + const int inum_fft = numz_fft*numy_fft*numx_fft; + + numz_inout = (nzhi_in-nzlo_out)-(nzlo_in-nzlo_out) + 1; + numy_inout = (nyhi_in-nylo_out)-(nylo_in-nylo_out) + 1; + numx_inout = (nxhi_in-nxlo_out)-(nxlo_in-nxlo_out) + 1; + const int inum_inout = numz_inout*numy_inout*numx_inout; + + // x direction gradient + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik5>(0,inum_fft),*this); + DeviceType::fence(); + copymode = 0; + + k_work2.template modify<DeviceType>(); + k_work2.template sync<LMPHostType>(); + fft2->compute(work2,work2,-1); + k_work2.template modify<LMPHostType>(); + k_work2.template sync<DeviceType>(); + + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik6>(0,inum_inout),*this); + DeviceType::fence(); + copymode = 0; + + + // y direction gradient + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik7>(0,inum_fft),*this); + DeviceType::fence(); + copymode = 0; + + k_work2.template modify<DeviceType>(); + k_work2.template sync<LMPHostType>(); + fft2->compute(work2,work2,-1); + k_work2.template modify<LMPHostType>(); + k_work2.template sync<DeviceType>(); + + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik8>(0,inum_inout),*this); + DeviceType::fence(); + copymode = 0; + + // z direction gradient + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik9>(0,inum_fft),*this); + DeviceType::fence(); + copymode = 0; + + k_work2.template modify<DeviceType>(); + k_work2.template sync<LMPHostType>(); + fft2->compute(work2,work2,-1); + k_work2.template modify<LMPHostType>(); + k_work2.template sync<DeviceType>(); + + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik10>(0,inum_inout),*this); + DeviceType::fence(); + copymode = 0; + +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik1, const int &i) const +{ + d_work1[2*i] = d_density_fft[i]; + d_work1[2*i+1] = ZEROF; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik2, const int &i, EV_FLOAT& ev) const +{ + const double eng = s2 * d_greensfn[i] * (d_work1[2*i]*d_work1[2*i] + d_work1[2*i+1]*d_work1[2*i+1]); + for (int j = 0; j < 6; j++) ev.v[j] += eng*d_vg(i,j); + if (eflag_global) ev.ecoul += eng; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik3, const int &i, EV_FLOAT& ev) const +{ + ev.ecoul += + s2 * d_greensfn[i] * (d_work1[2*i]*d_work1[2*i] + d_work1[2*i+1]*d_work1[2*i+1]); +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik4, const int &i) const +{ + d_work1[2*i] *= scaleinv * d_greensfn[i]; + d_work1[2*i+1] *= scaleinv * d_greensfn[i]; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik5, const int &ii) const +{ + const int n = ii*2; + const int k = ii/(numy_fft*numx_fft); + const int j = (ii - k*numy_fft*numx_fft) / numx_fft; + const int i = ii - k*numy_fft*numx_fft - j*numx_fft; + d_work2[n] = d_fkx[i]*d_work1[n+1]; + d_work2[n+1] = -d_fkx[i]*d_work1[n]; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik6, const int &ii) const +{ + const int n = ii*2; + int k = ii/(numy_inout*numx_inout); + int j = (ii - k*numy_inout*numx_inout) / numx_inout; + int i = ii - k*numy_inout*numx_inout - j*numx_inout; + k += nzlo_in-nzlo_out; + j += nylo_in-nylo_out; + i += nxlo_in-nxlo_out; + d_vdx_brick(k,j,i) = d_work2[n]; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik7, const int &ii) const +{ + const int n = ii*2; + const int k = ii/(numy_fft*numx_fft); + const int j = (ii - k*numy_fft*numx_fft) / numx_fft; + d_work2[n] = d_fky[j]*d_work1[n+1]; + d_work2[n+1] = -d_fky[j]*d_work1[n]; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik8, const int &ii) const +{ + const int n = ii*2; + int k = ii/(numy_inout*numx_inout); + int j = (ii - k*numy_inout*numx_inout) / numx_inout; + int i = ii - k*numy_inout*numx_inout - j*numx_inout; + k += nzlo_in-nzlo_out; + j += nylo_in-nylo_out; + i += nxlo_in-nxlo_out; + d_vdy_brick(k,j,i) = d_work2[n]; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik9, const int &ii) const +{ + const int n = ii*2; + const int k = ii/(numy_fft*numx_fft); + d_work2[n] = d_fkz[k]*d_work1[n+1]; + d_work2[n+1] = -d_fkz[k]*d_work1[n]; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik10, const int &ii) const +{ + const int n = ii*2; + int k = ii/(numy_inout*numx_inout); + int j = (ii - k*numy_inout*numx_inout) / numx_inout; + int i = ii - k*numy_inout*numx_inout - j*numx_inout; + k += nzlo_in-nzlo_out; + j += nylo_in-nylo_out; + i += nxlo_in-nxlo_out; + d_vdz_brick(k,j,i) = d_work2[n]; +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for ik for a triclinic system +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::poisson_ik_triclinic() +{ +// int i,j,k,n; +// +// // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k) +// // FFT leaves data in 3d brick decomposition +// // copy it into inner portion of vdx,vdy,vdz arrays +// +// // x direction gradient +// +// n = 0; +// for (i = 0; i < nfft; i++) { // parallel_for1 +// d_work2[n] = d_fkx[i]*d_work1[n+1]; +// d_work2[n+1] = -d_fkx[i]*d_work1[n]; +// n += 2; +// } +// +// fft2->compute(d_work2,d_work2,-1); +// +// n = 0; +// for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for2 +// +// +// // y direction gradient +// +// n = 0; +// for (i = 0; i < nfft; i++) { // parallel_for3 +// d_work2[n] = d_fky[i]*d_work1[n+1]; +// d_work2[n+1] = -d_fky[i]*d_work1[n]; +// n += 2; +// } +// +// fft2->compute(d_work2,d_work2,-1); +// +// n = 0; +// for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for4 +// for (j = nylo_in-nylo_out; j <= nyhi_in-nylo_out; j++) +// for (i = nxlo_in-nxlo_out; i <= nxhi_in-nxlo_out; i++) { +// d_vdy_brick(k,j,i) = d_work2[n]; +// n += 2; +// } +// +// // z direction gradient +// +// n = 0; +// for (i = 0; i < nfft; i++) { // parallel_for5 +// d_work2[n] = d_fkz[i]*d_work1[n+1]; +// d_work2[n+1] = -d_fkz[i]*d_work1[n]; +// n += 2; +// } +// +// fft2->compute(d_work2,d_work2,-1); +// +// n = 0; +// for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for6 +// +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic1, const int &k) const +{ + +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic2, const int &k) const +{ +// for (j = nylo_in-nylo_out; j <= nyhi_in-nylo_out; j++) +// for (i = nxlo_in-nxlo_out; i <= nxhi_in-nxlo_out; i++) { +// d_vdx_brick(k,j,i) = d_work2[n]; +// n += 2; +// } +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic3, const int &k) const +{ +// int n = (k - (nzlo_in-nzlo_out))*((nyhi_in-nylo_out) - (nylo_in-nylo_out) + 1)*((nxhi_in-nxlo_out) - (nxlo_in-nxlo_out) + 1)*2; + +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic4, const int &k) const +{ +// int n = (k - (nzlo_in-nzlo_out))*((nyhi_in-nylo_out) - (nylo_in-nylo_out) + 1)*((nxhi_in-nxlo_out) - (nxlo_in-nxlo_out) + 1)*2; +// +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic5, const int &k) const +{ +// int n = (k - (nzlo_in-nzlo_out))*((nyhi_in-nylo_out) - (nylo_in-nylo_out) + 1)*((nxhi_in-nxlo_out) - (nxlo_in-nxlo_out) + 1)*2; +// +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic6, const int &k) const +{ +// int n = (k - (nzlo_in-nzlo_out))*((nyhi_in-nylo_out) - (nylo_in-nylo_out) + 1)*((nxhi_in-nxlo_out) - (nxlo_in-nxlo_out) + 1)*2; +// +// for (j = nylo_in-nylo_out; j <= nyhi_in-nylo_out; j++) +// for (i = nxlo_in-nxlo_out; i <= nxhi_in-nxlo_out; i++) { +// d_vdz_brick(k,j,i) = d_work2[n]; +// n += 2; +// } +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for per-atom energy/virial +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::poisson_peratom() +{ + int i,j,k,n; + + // merge three outer loops into one for better threading + + numz_inout = (nzhi_in-nzlo_out)-(nzlo_in-nzlo_out) + 1; + numy_inout = (nyhi_in-nylo_out)-(nylo_in-nylo_out) + 1; + numx_inout = (nxhi_in-nxlo_out)-(nxlo_in-nxlo_out) + 1; + const int inum_inout = numz_inout*numy_inout*numx_inout; + + // energy + + if (eflag_atom) { + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom1>(0,nfft),*this); + DeviceType::fence(); + copymode = 0; + + k_work2.template modify<DeviceType>(); + k_work2.template sync<LMPHostType>(); + fft2->compute(work2,work2,-1); + k_work2.template modify<LMPHostType>(); + k_work2.template sync<DeviceType>(); + + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom2>(0,inum_inout),*this); + DeviceType::fence(); + copymode = 0; + + } + + // 6 components of virial in v0 thru v5 + + if (!vflag_atom) return; + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom3>(0,nfft),*this); + DeviceType::fence(); + copymode = 0; + + k_work2.template modify<DeviceType>(); + k_work2.template sync<LMPHostType>(); + fft2->compute(work2,work2,-1); + k_work2.template modify<LMPHostType>(); + k_work2.template sync<DeviceType>(); + + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom4>(0,inum_inout),*this); + DeviceType::fence(); + copymode = 0; + + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom5>(0,nfft),*this); + DeviceType::fence(); + copymode = 0; + + k_work2.template modify<DeviceType>(); + k_work2.template sync<LMPHostType>(); + fft2->compute(work2,work2,-1); + k_work2.template modify<LMPHostType>(); + k_work2.template sync<DeviceType>(); + + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom6>(0,inum_inout),*this); + DeviceType::fence(); + copymode = 0; + + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom7>(0,nfft),*this); + DeviceType::fence(); + copymode = 0; + + k_work2.template modify<DeviceType>(); + k_work2.template sync<LMPHostType>(); + fft2->compute(work2,work2,-1); + k_work2.template modify<LMPHostType>(); + k_work2.template sync<DeviceType>(); + + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom8>(0,inum_inout),*this); + DeviceType::fence(); + copymode = 0; + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom9>(0,nfft),*this); + DeviceType::fence(); + copymode = 0; + + k_work2.template modify<DeviceType>(); + k_work2.template sync<LMPHostType>(); + fft2->compute(work2,work2,-1); + k_work2.template modify<LMPHostType>(); + k_work2.template sync<DeviceType>(); + + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom10>(0,inum_inout),*this); + DeviceType::fence(); + copymode = 0; + + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom11>(0,nfft),*this); + DeviceType::fence(); + copymode = 0; + + k_work2.template modify<DeviceType>(); + k_work2.template sync<LMPHostType>(); + fft2->compute(work2,work2,-1); + k_work2.template modify<LMPHostType>(); + k_work2.template sync<DeviceType>(); + + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom12>(0,inum_inout),*this); + DeviceType::fence(); + copymode = 0; + + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom13>(0,nfft),*this); + DeviceType::fence(); + copymode = 0; + + k_work2.template modify<DeviceType>(); + k_work2.template sync<LMPHostType>(); + fft2->compute(work2,work2,-1); + k_work2.template modify<LMPHostType>(); + k_work2.template sync<DeviceType>(); + + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom14>(0,inum_inout),*this); + DeviceType::fence(); + copymode = 0; + +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom1, const int &i) const +{ + int n = 2*i; + + d_work2[n] = d_work1[n]; + d_work2[n+1] = d_work1[n+1]; + n += 2; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom2, const int &ii) const +{ + const int n = ii*2; + int k = ii/(numy_inout*numx_inout); + int j = (ii - k*numy_inout*numx_inout) / numx_inout; + int i = ii - k*numy_inout*numx_inout - j*numx_inout; + k += nzlo_in-nzlo_out; + j += nylo_in-nylo_out; + i += nxlo_in-nxlo_out; + d_u_brick(k,j,i) = d_work2[n]; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom3, const int &i) const +{ + int n = 2*i; + + d_work2[n] = d_work1[n]*d_vg(i,0); + d_work2[n+1] = d_work1[n+1]*d_vg(i,0); + n += 2; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom4, const int &ii) const +{ + const int n = ii*2; + int k = ii/(numy_inout*numx_inout); + int j = (ii - k*numy_inout*numx_inout) / numx_inout; + int i = ii - k*numy_inout*numx_inout - j*numx_inout; + k += nzlo_in-nzlo_out; + j += nylo_in-nylo_out; + i += nxlo_in-nxlo_out; + d_v0_brick(k,j,i) = d_work2[n]; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom5, const int &i) const +{ + int n = 2*i; + + d_work2[n] = d_work1[n]*d_vg(i,1); + d_work2[n+1] = d_work1[n+1]*d_vg(i,1); + n += 2; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom6, const int &ii) const +{ + const int n = ii*2; + int k = ii/(numy_inout*numx_inout); + int j = (ii - k*numy_inout*numx_inout) / numx_inout; + int i = ii - k*numy_inout*numx_inout - j*numx_inout; + k += nzlo_in-nzlo_out; + j += nylo_in-nylo_out; + i += nxlo_in-nxlo_out; + d_v1_brick(k,j,i) = d_work2[n]; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom7, const int &i) const +{ + int n = 2*i; + + d_work2[n] = d_work1[n]*d_vg(i,2); + d_work2[n+1] = d_work1[n+1]*d_vg(i,2); + n += 2; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom8, const int &ii) const +{ + const int n = ii*2; + int k = ii/(numy_inout*numx_inout); + int j = (ii - k*numy_inout*numx_inout) / numx_inout; + int i = ii - k*numy_inout*numx_inout - j*numx_inout; + k += nzlo_in-nzlo_out; + j += nylo_in-nylo_out; + i += nxlo_in-nxlo_out; + d_v2_brick(k,j,i) = d_work2[n]; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom9, const int &i) const +{ + int n = 2*i; + + d_work2[n] = d_work1[n]*d_vg(i,3); + d_work2[n+1] = d_work1[n+1]*d_vg(i,3); + n += 2; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom10, const int &ii) const +{ + const int n = ii*2; + int k = ii/(numy_inout*numx_inout); + int j = (ii - k*numy_inout*numx_inout) / numx_inout; + int i = ii - k*numy_inout*numx_inout - j*numx_inout; + k += nzlo_in-nzlo_out; + j += nylo_in-nylo_out; + i += nxlo_in-nxlo_out; + d_v3_brick(k,j,i) = d_work2[n]; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom11, const int &i) const +{ + int n = 2*i; + + d_work2[n] = d_work1[n]*d_vg(i,4); + d_work2[n+1] = d_work1[n+1]*d_vg(i,4); + n += 2; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom12, const int &ii) const +{ + const int n = ii*2; + int k = ii/(numy_inout*numx_inout); + int j = (ii - k*numy_inout*numx_inout) / numx_inout; + int i = ii - k*numy_inout*numx_inout - j*numx_inout; + k += nzlo_in-nzlo_out; + j += nylo_in-nylo_out; + i += nxlo_in-nxlo_out; + d_v4_brick(k,j,i) = d_work2[n]; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom13, const int &i) const +{ + int n = 2*i; + + d_work2[n] = d_work1[n]*d_vg(i,5); + d_work2[n+1] = d_work1[n+1]*d_vg(i,5); + n += 2; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom14, const int &ii) const +{ + const int n = ii*2; + int k = ii/(numy_inout*numx_inout); + int j = (ii - k*numy_inout*numx_inout) / numx_inout; + int i = ii - k*numy_inout*numx_inout - j*numx_inout; + k += nzlo_in-nzlo_out; + j += nylo_in-nylo_out; + i += nxlo_in-nxlo_out; + d_v5_brick(k,j,i) = d_work2[n]; +} +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::fieldforce() +{ + fieldforce_ik(); +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles for ik +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::fieldforce_ik() +{ + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + int nlocal = atomKK->nlocal; + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_fieldforce_ik>(0,nlocal),*this); + DeviceType::fence(); + copymode = 0; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_fieldforce_ik, const int &i) const +{ + int l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR ekx,eky,ekz; + + nx = d_part2grid(i,0); + ny = d_part2grid(i,1); + nz = d_part2grid(i,2); + dx = nx+shiftone - (x(i,0)-boxlo[0])*delxinv; + dy = ny+shiftone - (x(i,1)-boxlo[1])*delyinv; + dz = nz+shiftone - (x(i,2)-boxlo[2])*delzinv; + + nz -= nzlo_out; + ny -= nylo_out; + nx -= nxlo_out; + + //compute_rho1d(i,dx,dy,dz); // hasn't changed from make_rho + + ekx = eky = ekz = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + z0 = d_rho1d(i,n+order/2,2); + for (m = nlower; m <= nupper; m++) { + my = m+ny; + y0 = z0*d_rho1d(i,m+order/2,1); + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + x0 = y0*d_rho1d(i,l+order/2,0); + ekx -= x0*d_vdx_brick(mz,my,mx); + eky -= x0*d_vdy_brick(mz,my,mx); + ekz -= x0*d_vdz_brick(mz,my,mx); + } + } + } + + // convert E-field to force + + const double qfactor = qqrd2e * scale * q[i]; + f(i,0) += qfactor*ekx; + f(i,1) += qfactor*eky; + if (slabflag != 2) f(i,2) += qfactor*ekz; +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get per-atom energy/virial +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::fieldforce_peratom() +{ + // loop over my charges, interpolate from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + int nlocal = atomKK->nlocal; + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_fieldforce_peratom>(0,nlocal),*this); + DeviceType::fence(); + copymode = 0; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_fieldforce_peratom, const int &i) const +{ + int l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR u,v0,v1,v2,v3,v4,v5; + + nx = d_part2grid(i,0); + ny = d_part2grid(i,1); + nz = d_part2grid(i,2); + dx = nx+shiftone - (x(i,0)-boxlo[0])*delxinv; + dy = ny+shiftone - (x(i,1)-boxlo[1])*delyinv; + dz = nz+shiftone - (x(i,2)-boxlo[2])*delzinv; + + nz -= nzlo_out; + ny -= nylo_out; + nx -= nxlo_out; + + compute_rho1d(i,dx,dy,dz); + + u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + z0 = d_rho1d(i,n+order/2,2); + for (m = nlower; m <= nupper; m++) { + my = m+ny; + y0 = z0*d_rho1d(i,m+order/2,1); + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + x0 = y0*d_rho1d(i,l+order/2,0); + if (eflag_atom) u += x0*d_u_brick(mz,my,mx); + if (vflag_atom) { + v0 += x0*d_v0_brick(mz,my,mx); + v1 += x0*d_v1_brick(mz,my,mx); + v2 += x0*d_v2_brick(mz,my,mx); + v3 += x0*d_v3_brick(mz,my,mx); + v4 += x0*d_v4_brick(mz,my,mx); + v5 += x0*d_v5_brick(mz,my,mx); + } + } + } + } + + if (eflag_atom) d_eatom[i] += q[i]*u; + if (vflag_atom) { + d_vatom(i,0) += q[i]*v0; + d_vatom(i,1) += q[i]*v1; + d_vatom(i,2) += q[i]*v2; + d_vatom(i,3) += q[i]*v3; + d_vatom(i,4) += q[i]*v4; + d_vatom(i,5) += q[i]*v5; + } +} + +/* ---------------------------------------------------------------------- + pack own values to buf to send to another proc +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::pack_forward_kokkos(int flag, Kokkos::DualView<FFT_SCALAR*,Kokkos::LayoutRight,LMPDeviceType> &k_buf, int nlist, DAT::tdual_int_2d &k_list, int index) +{ + typename AT::t_int_2d_um d_list = k_list.view<DeviceType>(); + d_list_index = Kokkos::subview(d_list,index,Kokkos::ALL()); + d_buf = k_buf.view<DeviceType>(); + + nx = (nxhi_out-nxlo_out+1); + ny = (nyhi_out-nylo_out+1); + + if (flag == FORWARD_IK) { + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_pack_forward1>(0,nlist),*this); + DeviceType::fence(); + copymode = 0; + } else if (flag == FORWARD_IK_PERATOM) { + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_pack_forward2>(0,nlist),*this); + DeviceType::fence(); + copymode = 0; + } +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_pack_forward1, const int &i) const +{ + const double dlist = (double) d_list_index[i]; + const int iz = (int) (dlist/(nx*ny)); + const int iy = (int) ((dlist - iz*nx*ny)/nx); + const int ix = d_list_index[i] - iz*nx*ny - iy*nx; + d_buf[3*i] = d_vdx_brick(iz,iy,ix); + d_buf[3*i+1] = d_vdy_brick(iz,iy,ix); + d_buf[3*i+2] = d_vdz_brick(iz,iy,ix); +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_pack_forward2, const int &i) const +{ + const double dlist = (double) d_list_index[i]; + const int iz = (int) (dlist/(nx*ny)); + const int iy = (int) ((dlist - iz*nx*ny)/nx); + const int ix = d_list_index[i] - iz*nx*ny - iy*nx; + if (eflag_atom) d_buf[7*i] = d_u_brick(iz,iy,ix); + if (vflag_atom) { + d_buf[7*i+1] = d_v0_brick(iz,iy,ix); + d_buf[7*i+2] = d_v1_brick(iz,iy,ix); + d_buf[7*i+3] = d_v2_brick(iz,iy,ix); + d_buf[7*i+4] = d_v3_brick(iz,iy,ix); + d_buf[7*i+5] = d_v4_brick(iz,iy,ix); + d_buf[7*i+6] = d_v5_brick(iz,iy,ix); + } +} +/* ---------------------------------------------------------------------- + unpack another proc's own values from buf and set own ghost values +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::unpack_forward_kokkos(int flag, Kokkos::DualView<FFT_SCALAR*,Kokkos::LayoutRight,LMPDeviceType> &k_buf, int nlist, DAT::tdual_int_2d &k_list, int index) +{ + typename AT::t_int_2d_um d_list = k_list.view<DeviceType>(); + d_list_index = Kokkos::subview(d_list,index,Kokkos::ALL()); + d_buf = k_buf.view<DeviceType>(); + + nx = (nxhi_out-nxlo_out+1); + ny = (nyhi_out-nylo_out+1); + + if (flag == FORWARD_IK) { + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_unpack_forward1>(0,nlist),*this); + DeviceType::fence(); + copymode = 0; + } else if (flag == FORWARD_IK_PERATOM) { + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_unpack_forward2>(0,nlist),*this); + DeviceType::fence(); + copymode = 0; + } +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_unpack_forward1, const int &i) const +{ + const double dlist = (double) d_list_index[i]; + const int iz = (int) (dlist/(nx*ny)); + const int iy = (int) ((dlist - iz*nx*ny)/nx); + const int ix = d_list_index[i] - iz*nx*ny - iy*nx; + d_vdx_brick(iz,iy,ix) = d_buf[3*i]; + d_vdy_brick(iz,iy,ix) = d_buf[3*i+1]; + d_vdz_brick(iz,iy,ix) = d_buf[3*i+2]; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_unpack_forward2, const int &i) const +{ + const double dlist = (double) d_list_index[i]; + const int iz = (int) (dlist/(nx*ny)); + const int iy = (int) ((dlist - iz*nx*ny)/nx); + const int ix = d_list_index[i] - iz*nx*ny - iy*nx; + if (eflag_atom) d_u_brick(iz,iy,ix) = d_buf[7*i]; + if (vflag_atom) { + d_v0_brick(iz,iy,ix) = d_buf[7*i+1]; + d_v1_brick(iz,iy,ix) = d_buf[7*i+2]; + d_v2_brick(iz,iy,ix) = d_buf[7*i+3]; + d_v3_brick(iz,iy,ix) = d_buf[7*i+4]; + d_v4_brick(iz,iy,ix) = d_buf[7*i+5]; + d_v5_brick(iz,iy,ix) = d_buf[7*i+6]; + } +} + +/* ---------------------------------------------------------------------- + pack ghost values into buf to send to another proc +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::pack_reverse_kokkos(int flag, Kokkos::DualView<FFT_SCALAR*,Kokkos::LayoutRight,LMPDeviceType> &k_buf, int nlist, DAT::tdual_int_2d &k_list, int index) +{ + typename AT::t_int_2d_um d_list = k_list.view<DeviceType>(); + d_list_index = Kokkos::subview(d_list,index,Kokkos::ALL()); + d_buf = k_buf.view<DeviceType>(); + + nx = (nxhi_out-nxlo_out+1); + ny = (nyhi_out-nylo_out+1); + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_pack_reverse>(0,nlist),*this); + DeviceType::fence(); + copymode = 0; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_pack_reverse, const int &i) const +{ + const double dlist = (double) d_list_index[i]; + const int iz = (int) (dlist/(nx*ny)); + const int iy = (int) ((dlist - iz*nx*ny)/nx); + const int ix = d_list_index[i] - iz*nx*ny - iy*nx; + d_buf[i] = d_density_brick(iz,iy,ix); +} + +/* ---------------------------------------------------------------------- + unpack another proc's ghost values from buf and add to own values +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::unpack_reverse_kokkos(int flag, Kokkos::DualView<FFT_SCALAR*,Kokkos::LayoutRight,LMPDeviceType> &k_buf, int nlist, DAT::tdual_int_2d &k_list, int index) +{ + typename AT::t_int_2d_um d_list = k_list.view<DeviceType>(); + d_list_index = Kokkos::subview(d_list,index,Kokkos::ALL()); + d_buf = k_buf.view<DeviceType>(); + + int nx = (nxhi_out-nxlo_out+1); + int ny = (nyhi_out-nylo_out+1); + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_unpack_reverse>(0,nlist),*this); + DeviceType::fence(); + copymode = 0; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_unpack_reverse, const int &i) const +{ + const double dlist = (double) d_list_index[i]; + const int iz = (int) (dlist/(nx*ny)); + const int iy = (int) ((dlist - iz*nx*ny)/nx); + const int ix = d_list_index[i] - iz*nx*ny - iy*nx; + d_density_brick(iz,iy,ix) += d_buf[i]; +} + +/* ---------------------------------------------------------------------- + map nprocs to NX by NY grid as PX by PY procs - return optimal px,py +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::procs2grid2d(int nprocs, int nx, int ny, int *px, int *py) +{ + // loop thru all possible factorizations of nprocs + // surf = surface area of largest proc sub-domain + // innermost if test minimizes surface area and surface/volume ratio + + int bestsurf = 2 * (nx + ny); + int bestboxx = 0; + int bestboxy = 0; + + int boxx,boxy,surf,ipx,ipy; + + ipx = 1; + while (ipx <= nprocs) { + if (nprocs % ipx == 0) { + ipy = nprocs/ipx; + boxx = nx/ipx; + if (nx % ipx) boxx++; + boxy = ny/ipy; + if (ny % ipy) boxy++; + surf = boxx + boxy; + if (surf < bestsurf || + (surf == bestsurf && boxx*boxy > bestboxx*bestboxy)) { + bestsurf = surf; + bestboxx = boxx; + bestboxy = boxy; + *px = ipx; + *py = ipy; + } + } + ipx++; + } +} + +/* ---------------------------------------------------------------------- + charge assignment into rho1d + dx,dy,dz = distance of particle from "lower left" grid point +------------------------------------------------------------------------- */ + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::compute_rho1d(const int i, const FFT_SCALAR &dx, const FFT_SCALAR &dy, + const FFT_SCALAR &dz) const +{ + int k,l; + FFT_SCALAR r1,r2,r3; + + for (k = (1-order)/2; k <= order/2; k++) { + r1 = r2 = r3 = ZEROF; + + for (l = order-1; l >= 0; l--) { + r1 = d_rho_coeff(l,k-(1-order)/2) + r1*dx; + r2 = d_rho_coeff(l,k-(1-order)/2) + r2*dy; + r3 = d_rho_coeff(l,k-(1-order)/2) + r3*dz; + } + d_rho1d(i,k+order/2,0) = r1; + d_rho1d(i,k+order/2,1) = r2; + d_rho1d(i,k+order/2,2) = r3; + } +} + +/* ---------------------------------------------------------------------- + generate coeffients for the weight function of order n + + (n-1) + Wn(x) = Sum wn(k,x) , Sum is over every other integer + k=-(n-1) + For k=-(n-1),-(n-1)+2, ....., (n-1)-2,n-1 + k is odd integers if n is even and even integers if n is odd + --- + | n-1 + | Sum a(l,j)*(x-k/2)**l if abs(x-k/2) < 1/2 + wn(k,x) = < l=0 + | + | 0 otherwise + --- + a coeffients are packed into the array rho_coeff to eliminate zeros + rho_coeff(l,((k+mod(n+1,2))/2) = a(l,k) +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::compute_rho_coeff() +{ + int j,k,l,m; + FFT_SCALAR s; + + //FFT_SCALAR **a; + //memory->create2d_offset(a,order,-order,order,"pppm:a"); + FFT_SCALAR a[order][2*order+1]; + + for (k = 0; k <= 2*order; k++) + for (l = 0; l < order; l++) + a[l][k] = 0.0; + + a[0][order] = 1.0; + for (j = 1; j < order; j++) { + for (k = -j; k <= j; k += 2) { + s = 0.0; + for (l = 0; l < j; l++) { + a[l+1][k+order] = (a[l][k+1+order]-a[l][k-1+order]) / (l+1); +#ifdef FFT_SINGLE + s += powf(0.5,(float) l+1) * + (a[l][k-1+order] + powf(-1.0,(float) l) * a[l][k+1+order]) / (l+1); +#else + s += pow(0.5,(double) l+1) * + (a[l][k-1+order] + pow(-1.0,(double) l) * a[l][k+1+order]) / (l+1); +#endif + } + a[0][k+order] = s; + } + } + + m = (1-order)/2; + for (k = -(order-1); k < order; k += 2) { + for (l = 0; l < order; l++) + h_rho_coeff(l,m-(1-order)/2) = a[l][k+order]; + m++; + } + //memory->destroy2d_offset(a,-order); +} + +/* ---------------------------------------------------------------------- + Slab-geometry correction term to dampen inter-slab interactions between + periodically repeating slabs. Yields good approximation to 2D Ewald if + adequate empty space is left between repeating slabs (J. Chem. Phys. + 111, 3155). Slabs defined here to be parallel to the xy plane. Also + extended to non-neutral systems (J. Chem. Phys. 131, 094107). +------------------------------------------------------------------------- */ + +template<class DeviceType> +void PPPMKokkos<DeviceType>::slabcorr() +{ + // compute local contribution to global dipole moment + + zprd = domain->zprd; + int nlocal = atomKK->nlocal; + + double dipole = 0.0; + copymode = 1; + Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPPPM_slabcorr1>(0,nlocal),*this,dipole); + DeviceType::fence(); + copymode = 0; + + // sum local contributions to get global dipole moment + + dipole_all; + MPI_Allreduce(&dipole,&dipole_all,1,MPI_DOUBLE,MPI_SUM,world); + + // need to make non-neutral systems and/or + // per-atomKK energy translationally invariant + + dipole_r2 = 0.0; + if (eflag_atom || fabs(qsum) > SMALL) { + copymode = 1; + Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPPPM_slabcorr2>(0,nlocal),*this,dipole_r2); + DeviceType::fence(); + copymode = 0; + + // sum local contributions + + double tmp; + MPI_Allreduce(&dipole_r2,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + dipole_r2 = tmp; + } + + // compute corrections + + const double e_slabcorr = MY_2PI*(dipole_all*dipole_all - + qsum*dipole_r2 - qsum*qsum*zprd*zprd/12.0)/volume; + qscale = qqrd2e * scale; + + if (eflag_global) energy += qscale * e_slabcorr; + + // per-atomKK energy + + if (eflag_atom) { + efact = qscale * MY_2PI/volume; + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_slabcorr3>(0,nlocal),*this); + DeviceType::fence(); + copymode = 0; + } + + // add on force corrections + + ffact = qscale * (-4.0*MY_PI/volume); + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_slabcorr4>(0,nlocal),*this); + DeviceType::fence(); + copymode = 0; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_slabcorr1, const int &i, double &dipole) const +{ + dipole += q[i]*x(i,2); +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_slabcorr2, const int &i, double &dipole_r2) const +{ + dipole_r2 += q[i]*x(i,2)*x(i,2); +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_slabcorr3, const int &i) const +{ + d_eatom[i] += efact * q[i]*(x(i,2)*dipole_all - 0.5*(dipole_r2 + + qsum*x(i,2)*x(i,2)) - qsum*zprd*zprd/12.0); +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_slabcorr4, const int &i) const +{ + f(i,2) += ffact * q[i]*(dipole_all - qsum*x(i,2)); +} + +/* ---------------------------------------------------------------------- + perform and time the 1d FFTs required for N timesteps +------------------------------------------------------------------------- */ + +template<class DeviceType> +int PPPMKokkos<DeviceType>::timing_1d(int n, double &time1d) +{ + double time1,time2; + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_timing_zero>(0,2*nfft_both),*this); + DeviceType::fence(); + copymode = 0; + + MPI_Barrier(world); + time1 = MPI_Wtime(); + + for (int i = 0; i < n; i++) { + fft1->timing1d(work1,nfft_both,1); + fft2->timing1d(work1,nfft_both,-1); + fft2->timing1d(work1,nfft_both,-1); + fft2->timing1d(work1,nfft_both,-1); + } + + MPI_Barrier(world); + time2 = MPI_Wtime(); + time1d = time2 - time1; + + return 4; +} + +template<class DeviceType> +KOKKOS_INLINE_FUNCTION +void PPPMKokkos<DeviceType>::operator()(TagPPPM_timing_zero, const int &i) const +{ + d_work1[i] = ZEROF; +} + +/* ---------------------------------------------------------------------- + perform and time the 3d FFTs required for N timesteps +------------------------------------------------------------------------- */ + +template<class DeviceType> +int PPPMKokkos<DeviceType>::timing_3d(int n, double &time3d) +{ + double time1,time2; + + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_timing_zero>(0,2*nfft_both),*this); + DeviceType::fence(); + copymode = 0; + + MPI_Barrier(world); + time1 = MPI_Wtime(); + + for (int i = 0; i < n; i++) { + fft1->compute(work1,work1,1); + fft2->compute(work1,work1,-1); + fft2->compute(work1,work1,-1); + fft2->compute(work1,work1,-1); + } + + MPI_Barrier(world); + time2 = MPI_Wtime(); + time3d = time2 - time1; + + return 4; +} + +/* ---------------------------------------------------------------------- + memory usage of local arrays +------------------------------------------------------------------------- */ + +template<class DeviceType> +double PPPMKokkos<DeviceType>::memory_usage() +{ + double bytes = nmax*3 * sizeof(double); + int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * + (nzhi_out-nzlo_out+1); + bytes += 4 * nbrick * sizeof(FFT_SCALAR); + if (triclinic) bytes += 3 * nfft_both * sizeof(double); + bytes += 6 * nfft_both * sizeof(double); + bytes += nfft_both * sizeof(double); + bytes += nfft_both*5 * sizeof(FFT_SCALAR); + + if (peratom_allocate_flag) + bytes += 6 * nbrick * sizeof(FFT_SCALAR); + + bytes += cg->memory_usage(); + + return bytes; +} + +namespace LAMMPS_NS { +template class PPPMKokkos<LMPDeviceType>; +#ifdef KOKKOS_HAVE_CUDA +template class PPPMKokkos<LMPHostType>; +#endif +} diff --git a/src/KOKKOS/pppm_kokkos.h b/src/KOKKOS/pppm_kokkos.h new file mode 100644 index 0000000000..4e6bb1d74c --- /dev/null +++ b/src/KOKKOS/pppm_kokkos.h @@ -0,0 +1,552 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef KSPACE_CLASS + +KSpaceStyle(pppm/kk,PPPMKokkos<LMPDeviceType>) +KSpaceStyle(pppm/kk/device,PPPMKokkos<LMPDeviceType>) +KSpaceStyle(pppm/kk/host,PPPMKokkos<LMPHostType>) + +#else + +#ifndef LMP_PPPM_KOKKOS_H +#define LMP_PPPM_KOKKOS_H + +#include "pppm.h" +#include "gridcomm_kokkos.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +struct TagPPPM_setup1{}; +struct TagPPPM_setup2{}; +struct TagPPPM_setup3{}; +struct TagPPPM_setup4{}; +struct TagPPPM_compute_gf_ik{}; +struct TagPPPM_compute_gf_ik_triclinic{}; +struct TagPPPM_self1{}; +struct TagPPPM_self2{}; +struct TagPPPM_brick2fft{}; +struct TagPPPM_particle_map{}; +struct TagPPPM_make_rho_zero{}; +struct TagPPPM_make_rho_atomic{}; +struct TagPPPM_make_rho{}; +struct TagPPPM_poisson_ik1{}; +struct TagPPPM_poisson_ik2{}; +struct TagPPPM_poisson_ik3{}; +struct TagPPPM_poisson_ik4{}; +struct TagPPPM_poisson_ik5{}; +struct TagPPPM_poisson_ik6{}; +struct TagPPPM_poisson_ik7{}; +struct TagPPPM_poisson_ik8{}; +struct TagPPPM_poisson_ik9{}; +struct TagPPPM_poisson_ik10{}; +struct TagPPPM_poisson_ik_triclinic1{}; +struct TagPPPM_poisson_ik_triclinic2{}; +struct TagPPPM_poisson_ik_triclinic3{}; +struct TagPPPM_poisson_ik_triclinic4{}; +struct TagPPPM_poisson_ik_triclinic5{}; +struct TagPPPM_poisson_ik_triclinic6{}; +struct TagPPPM_poisson_peratom1{}; +struct TagPPPM_poisson_peratom2{}; +struct TagPPPM_poisson_peratom3{}; +struct TagPPPM_poisson_peratom4{}; +struct TagPPPM_poisson_peratom5{}; +struct TagPPPM_poisson_peratom6{}; +struct TagPPPM_poisson_peratom7{}; +struct TagPPPM_poisson_peratom8{}; +struct TagPPPM_poisson_peratom9{}; +struct TagPPPM_poisson_peratom10{}; +struct TagPPPM_poisson_peratom11{}; +struct TagPPPM_poisson_peratom12{}; +struct TagPPPM_poisson_peratom13{}; +struct TagPPPM_poisson_peratom14{}; +struct TagPPPM_fieldforce_ik{}; +struct TagPPPM_fieldforce_peratom{}; +struct TagPPPM_pack_forward1{}; +struct TagPPPM_pack_forward2{}; +struct TagPPPM_unpack_forward1{}; +struct TagPPPM_unpack_forward2{}; +struct TagPPPM_pack_reverse{}; +struct TagPPPM_unpack_reverse{}; +struct TagPPPM_slabcorr1{}; +struct TagPPPM_slabcorr2{}; +struct TagPPPM_slabcorr3{}; +struct TagPPPM_slabcorr4{}; +struct TagPPPM_timing_zero{}; + +template<class DeviceType> +class PPPMKokkos : public PPPM { + public: + typedef DeviceType device_type; + typedef ArrayTypes<DeviceType> AT; + + PPPMKokkos(class LAMMPS *, int, char **); + virtual ~PPPMKokkos(); + virtual void init(); + virtual void setup(); + void setup_grid(); + virtual void compute(int, int); + virtual int timing_1d(int, double &); + virtual int timing_3d(int, double &); + virtual double memory_usage(); + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_setup1, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_setup2, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_setup3, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_setup4, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_compute_gf_ik, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_compute_gf_ik_triclinic, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_self1, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_self2, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_brick2fft, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_particle_map, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_make_rho_zero, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_make_rho_atomic, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_make_rho, typename Kokkos::TeamPolicy<DeviceType, TagPPPM_make_rho>::member_type) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_ik1, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_ik2, const int&, EV_FLOAT &) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_ik3, const int&, EV_FLOAT &) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_ik4, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_ik5, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_ik6, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_ik7, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_ik8, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_ik9, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_ik10, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_ik_triclinic1, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_ik_triclinic2, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_ik_triclinic3, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_ik_triclinic4, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_ik_triclinic5, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_ik_triclinic6, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_peratom1, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_peratom2, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_peratom3, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_peratom4, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_peratom5, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_peratom6, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_peratom7, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_peratom8, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_peratom9, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_peratom10, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_peratom11, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_peratom12, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_peratom13, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_poisson_peratom14, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_fieldforce_ik, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_fieldforce_peratom, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_pack_forward1, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_pack_forward2, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_unpack_forward1, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_unpack_forward2, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_pack_reverse, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_unpack_reverse, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_slabcorr1, const int&, double&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_slabcorr2, const int&, double&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_slabcorr3, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_slabcorr4, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPPPM_timing_zero, const int&) const; + + //template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG> + //KOKKOS_INLINE_FUNCTION + //void operator()(TagPPPMKernelA<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&) const; + + protected: + double unitkx,unitky,unitkz; + double scaleinv,s2; + double qscale,efact,ffact,dipole_all,dipole_r2,zprd; + double xprd,yprd,zprd_slab; + int nbx,nby,nbz,twoorder; + int numx_fft,numy_fft,numz_fft; + int numx_inout,numy_inout,numz_inout; + int numx_out,numy_out,numz_out; + int ix,iy,nlocal; + + int nx,ny,nz; + typename AT::t_int_1d_um d_list_index; + typename AT::t_FFT_SCALAR_1d_um d_buf; + + DAT::tdual_int_scalar k_flag; + + typename AT::t_x_array_randomread x; + typename AT::t_f_array f; + typename AT::t_float_1d_randomread q; + + DAT::tdual_efloat_1d k_eatom; + DAT::tdual_virial_array k_vatom; + typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom; + typename ArrayTypes<DeviceType>::t_virial_array d_vatom; + + int factors[3]; + + typename AT::t_FFT_SCALAR_3d d_density_brick; + typename AT::t_FFT_SCALAR_3d d_vdx_brick,d_vdy_brick,d_vdz_brick; + typename AT::t_FFT_SCALAR_3d d_u_brick; + typename AT::t_FFT_SCALAR_3d d_v0_brick,d_v1_brick,d_v2_brick; + typename AT::t_FFT_SCALAR_3d d_v3_brick,d_v4_brick,d_v5_brick; + typename AT::t_float_1d d_greensfn; + typename AT::t_virial_array d_vg; + typename AT::t_float_1d d_fkx; + typename AT::t_float_1d d_fky; + typename AT::t_float_1d d_fkz; + DAT::tdual_FFT_SCALAR_1d k_density_fft; + DAT::tdual_FFT_SCALAR_1d k_work1; + DAT::tdual_FFT_SCALAR_1d k_work2; + typename AT::t_FFT_SCALAR_1d d_density_fft; + typename AT::t_FFT_SCALAR_1d d_work1; + typename AT::t_FFT_SCALAR_1d d_work2; + + DAT::tdual_float_1d k_gf_b; + typename AT::t_float_1d d_gf_b; + + //FFT_SCALAR **rho1d,**rho_coeff,**drho1d,**drho_coeff; + typename AT::t_FFT_SCALAR_2d_3 d_rho1d; + DAT::tdual_FFT_SCALAR_2d k_rho_coeff; + typename AT::t_FFT_SCALAR_2d d_rho_coeff; + HAT::t_FFT_SCALAR_2d h_rho_coeff; + //double **acons; + typename Kokkos::DualView<F_FLOAT[8][7],Kokkos::LayoutRight,DeviceType>::t_host acons; + + class FFT3d *fft1,*fft2; + class Remap *remap; + GridCommKokkos<DeviceType> *cg; + GridCommKokkos<DeviceType> *cg_peratom; + + //int **part2grid; // storage for particle -> grid mapping + typename AT::t_int_1d_3 d_part2grid; + + //double *boxlo; + double boxlo[3]; + + void set_grid_global(); + void set_grid_local(); + void adjust_gewald(); + double newton_raphson_f(); + double derivf(); + double final_accuracy(); + + virtual void allocate(); + virtual void allocate_peratom(); + virtual void deallocate(); + virtual void deallocate_peratom(); + int factorable(int); + double compute_df_kspace(); + double estimate_ik_error(double, double, bigint); + virtual void compute_gf_denom(); + virtual void compute_gf_ik(); + + virtual void particle_map(); + virtual void make_rho(); + virtual void brick2fft(); + + virtual void poisson(); + virtual void poisson_ik(); + + virtual void fieldforce(); + virtual void fieldforce_ik(); + + virtual void poisson_peratom(); + virtual void fieldforce_peratom(); + void procs2grid2d(int,int,int,int *, int*); + + KOKKOS_INLINE_FUNCTION + void compute_rho1d(const int i, const FFT_SCALAR &, const FFT_SCALAR &, + const FFT_SCALAR &) const; + void compute_rho_coeff(); + void slabcorr(); + + // grid communication + + virtual void pack_forward_kokkos(int, Kokkos::DualView<FFT_SCALAR*,Kokkos::LayoutRight,LMPDeviceType> &, int, DAT::tdual_int_2d &, int); + virtual void unpack_forward_kokkos(int, Kokkos::DualView<FFT_SCALAR*,Kokkos::LayoutRight,LMPDeviceType> &, int, DAT::tdual_int_2d &, int); + virtual void pack_reverse_kokkos(int, Kokkos::DualView<FFT_SCALAR*,Kokkos::LayoutRight,LMPDeviceType> &, int, DAT::tdual_int_2d &, int); + virtual void unpack_reverse_kokkos(int, Kokkos::DualView<FFT_SCALAR*,Kokkos::LayoutRight,LMPDeviceType> &, int, DAT::tdual_int_2d &, int); + + // triclinic + + int triclinic; // domain settings, orthog or triclinic + void setup_triclinic(); + void compute_gf_ik_triclinic(); + void poisson_ik_triclinic(); + +/* ---------------------------------------------------------------------- + denominator for Hockney-Eastwood Green's function + of x,y,z = sin(kx*deltax/2), etc + + inf n-1 + S(n,k) = Sum W(k+pi*j)**2 = Sum b(l)*(z*z)**l + j=-inf l=0 + + = -(z*z)**n /(2n-1)! * (d/dx)**(2n-1) cot(x) at z = sin(x) + gf_b = denominator expansion coeffs +------------------------------------------------------------------------- */ + + KOKKOS_INLINE_FUNCTION + double gf_denom(const double &x, const double &y, + const double &z) const { + double sx,sy,sz; + sz = sy = sx = 0.0; + for (int l = order-1; l >= 0; l--) { + sx = d_gf_b[l] + sx*x; + sy = d_gf_b[l] + sy*y; + sz = d_gf_b[l] + sz*z; + } + double s = sx*sy*sz; + return s*s; + }; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Illegal ... command + +Self-explanatory. Check the input script syntax and compare to the +documentation for the command. You can use -echo screen as a +command-line option when running LAMMPS to see the offending line. + +E: Cannot (yet) use PPPM with triclinic box and kspace_modify diff ad + +This feature is not yet supported. + +E: Cannot (yet) use PPPM with triclinic box and slab correction + +This feature is not yet supported. + +E: Cannot use PPPM with 2d simulation + +The kspace style pppm cannot be used in 2d simulations. You can use +2d PPPM in a 3d simulation; see the kspace_modify command. + +E: PPPM can only currently be used with comm_style brick + +This is a current restriction in LAMMPS. + +E: Kspace style requires atom attribute q + +The atom style defined does not have these attributes. + +E: Cannot use nonperiodic boundaries with PPPM + +For kspace style pppm, all 3 dimensions must have periodic boundaries +unless you use the kspace_modify command to define a 2d slab with a +non-periodic z dimension. + +E: Incorrect boundaries with slab PPPM + +Must have periodic x,y dimensions and non-periodic z dimension to use +2d slab option with PPPM. + +E: PPPM order cannot be < 2 or > than %d + +This is a limitation of the PPPM implementation in LAMMPS. + +E: KSpace style is incompatible with Pair style + +Setting a kspace style requires that a pair style with matching +long-range Coulombic or dispersion components be used. + +E: Pair style is incompatible with TIP4P KSpace style + +The pair style does not have the requires TIP4P settings. + +E: Bond and angle potentials must be defined for TIP4P + +Cannot use TIP4P pair potential unless bond and angle potentials +are defined. + +E: Bad TIP4P angle type for PPPM/TIP4P + +Specified angle type is not valid. + +E: Bad TIP4P bond type for PPPM/TIP4P + +Specified bond type is not valid. + +E: Cannot (yet) use PPPM with triclinic box and TIP4P + +This feature is not yet supported. + +W: Reducing PPPM order b/c stencil extends beyond nearest neighbor processor + +This may lead to a larger grid than desired. See the kspace_modify overlap +command to prevent changing of the PPPM order. + +E: PPPM order < minimum allowed order + +The default minimum order is 2. This can be reset by the +kspace_modify minorder command. + +E: PPPM grid stencil extends beyond nearest neighbor processor + +This is not allowed if the kspace_modify overlap setting is no. + +E: KSpace accuracy must be > 0 + +The kspace accuracy designated in the input must be greater than zero. + +E: Could not compute grid size + +The code is unable to compute a grid size consistent with the desired +accuracy. This error should not occur for typical problems. Please +send an email to the developers. + +E: PPPM grid is too large + +The global PPPM grid is larger than OFFSET in one or more dimensions. +OFFSET is currently set to 4096. You likely need to decrease the +requested accuracy. + +E: Could not compute g_ewald + +The Newton-Raphson solver failed to converge to a good value for +g_ewald. This error should not occur for typical problems. Please +send an email to the developers. + +E: Non-numeric box dimensions - simulation unstable + +The box size has apparently blown up. + +E: Out of range atoms - cannot compute PPPM + +One or more atoms are attempting to map their charge to a PPPM grid +point that is not owned by a processor. This is likely for one of two +reasons, both of them bad. First, it may mean that an atom near the +boundary of a processor's sub-domain has moved more than 1/2 the +"neighbor skin distance"_neighbor.html without neighbor lists being +rebuilt and atoms being migrated to new processors. This also means +you may be missing pairwise interactions that need to be computed. +The solution is to change the re-neighboring criteria via the +"neigh_modify"_neigh_modify command. The safest settings are "delay 0 +every 1 check yes". Second, it may mean that an atom has moved far +outside a processor's sub-domain or even the entire simulation box. +This indicates bad physics, e.g. due to highly overlapping atoms, too +large a timestep, etc. + +*/ diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp index 9b18ad88bb..84b77a1905 100644 --- a/src/KSPACE/pppm.cpp +++ b/src/KSPACE/pppm.cpp @@ -152,6 +152,8 @@ PPPM::PPPM(LAMMPS *lmp, int narg, char **arg) : KSpace(lmp, narg, arg) PPPM::~PPPM() { + if (copymode) return; + delete [] factors; deallocate(); if (peratom_allocate_flag) deallocate_peratom(); diff --git a/src/kspace.cpp b/src/kspace.cpp index d943b02c78..d5123958a1 100644 --- a/src/kspace.cpp +++ b/src/kspace.cpp @@ -94,6 +94,7 @@ KSpace::KSpace(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) execution_space = Host; datamask_read = ALL_MASK; datamask_modify = ALL_MASK; + copymode = 0; memory->create(gcons,7,7,"kspace:gcons"); gcons[2][0] = 15.0 / 8.0; @@ -149,6 +150,8 @@ KSpace::KSpace(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) KSpace::~KSpace() { + if (copymode) return; + memory->destroy(eatom); memory->destroy(vatom); memory->destroy(gcons); diff --git a/src/kspace.h b/src/kspace.h index 5519a8c56f..4d1c44c00c 100644 --- a/src/kspace.h +++ b/src/kspace.h @@ -15,6 +15,7 @@ #define LMP_KSPACE_H #include "pointers.h" +#include "accelerator_kokkos.h" #ifdef FFT_SINGLE typedef float FFT_SCALAR; @@ -85,6 +86,7 @@ class KSpace : protected Pointers { // KOKKOS host/device flag and data masks ExecutionSpace execution_space; unsigned int datamask_read,datamask_modify; + int copymode; int compute_flag; // 0 if skip compute() int fftbench; // 0 if skip FFT timing @@ -124,6 +126,11 @@ class KSpace : protected Pointers { virtual void pack_reverse(int, FFT_SCALAR *, int, int *) {}; virtual void unpack_reverse(int, FFT_SCALAR *, int, int *) {}; + virtual void pack_forward_kokkos(int, Kokkos::DualView<FFT_SCALAR*,Kokkos::LayoutRight,LMPDeviceType> &, int, DAT::tdual_int_2d &, int) {}; + virtual void unpack_forward_kokkos(int, Kokkos::DualView<FFT_SCALAR*,Kokkos::LayoutRight,LMPDeviceType> &, int, DAT::tdual_int_2d &, int) {}; + virtual void pack_reverse_kokkos(int, Kokkos::DualView<FFT_SCALAR*,Kokkos::LayoutRight,LMPDeviceType> &, int, DAT::tdual_int_2d &, int) {}; + virtual void unpack_reverse_kokkos(int, Kokkos::DualView<FFT_SCALAR*,Kokkos::LayoutRight,LMPDeviceType> &, int, DAT::tdual_int_2d &, int) {}; + virtual int timing(int, double &, double &) {return 0;} virtual int timing_1d(int, double &) {return 0;} virtual int timing_3d(int, double &) {return 0;} -- GitLab