diff --git a/doc/src/Speed_kokkos.txt b/doc/src/Speed_kokkos.txt index 04cf53691be0c7379bd031b8feefe46119627748..f74c9c9ed77de96e6cc619499453e0b6db0048f4 100644 --- a/doc/src/Speed_kokkos.txt +++ b/doc/src/Speed_kokkos.txt @@ -106,6 +106,11 @@ modification to the input script is needed. Alternatively, one can run with the KOKKOS package by editing the input script as described below. +NOTE: When using a single OpenMP thread, the Kokkos Serial backend (i.e. +Makefile.kokkos_mpi_only) will give better performance than the OpenMP +backend (i.e. Makefile.kokkos_omp) because some of the overhead to make +the code thread-safe is removed. + NOTE: The default for the "package kokkos"_package.html command is to use "full" neighbor lists and set the Newton flag to "off" for both pairwise and bonded interactions. However, when running on CPUs, it @@ -122,6 +127,22 @@ mpirun -np 16 lmp_kokkos_mpi_only -k on -sf kk -pk kokkos newton on neigh half c If the "newton"_newton.html command is used in the input script, it can also override the Newton flag defaults. +For half neighbor lists and OpenMP, the KOKKOS package uses data +duplication (i.e. thread-private arrays) by default to avoid +thread-level write conflicts in the force arrays (and other data +structures as necessary). Data duplication is typically fastest for +small numbers of threads (i.e. 8 or fewer) but does increase memory +footprint and is not scalable to large numbers of threads. An +alternative to data duplication is to use thread-level atomics, which +don't require duplication. The use of atomics can be forced by compiling +with the "-DLMP_KOKKOS_USE_ATOMICS" compile switch. Most but not all +Kokkos-enabled pair_styles support data duplication. Alternatively, full +neighbor lists avoid the need for duplication or atomics but require +more compute operations per atom. When using the Kokkos Serial backend +or the OpenMP backend with a single thread, no duplication or atomics are +used. 
For CUDA and half neighbor lists, the KOKKOS package always uses +atomics. + [Core and Thread Affinity:] When using multi-threading, it is important for performance to bind diff --git a/src/KOKKOS/fix_qeq_reax_kokkos.cpp b/src/KOKKOS/fix_qeq_reax_kokkos.cpp index 52822856ee014cbb5b13b09ef87e3f54186a4a53..578afd20775ca2c79b5904a905b7b4b0acbb2eac 100644 --- a/src/KOKKOS/fix_qeq_reax_kokkos.cpp +++ b/src/KOKKOS/fix_qeq_reax_kokkos.cpp @@ -247,6 +247,13 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag) k_t.template modify<LMPHostType>(); k_t.template sync<DeviceType>(); + need_dup = lmp->kokkos->need_dup<DeviceType>(); + + if (need_dup) + dup_o = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated> (d_o); // allocate duplicated memory + else + ndup_o = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated> (d_o); + // 1st cg solve over b_s, s cg_solve1(); @@ -262,6 +269,10 @@ void FixQEqReaxKokkos<DeviceType>::pre_force(int vflag) if (!allocated_flag) allocated_flag = 1; + + // free duplicated memory + if (need_dup) + dup_o = decltype(dup_o)(); } /* ---------------------------------------------------------------------- */ @@ -480,10 +491,12 @@ void FixQEqReaxKokkos<DeviceType>::cg_solve1() if (neighflag == HALF) { FixQEqReaxKokkosSparse13Functor<DeviceType,HALF> sparse13_functor(this); Kokkos::parallel_for(inum,sparse13_functor); - } else { + } else if (neighflag == HALFTHREAD) { FixQEqReaxKokkosSparse13Functor<DeviceType,HALFTHREAD> sparse13_functor(this); Kokkos::parallel_for(inum,sparse13_functor); } + if (need_dup) + Kokkos::Experimental::contribute(d_o, dup_o); } else { Kokkos::parallel_for(Kokkos::TeamPolicy <DeviceType, TagSparseMatvec1> (inum, teamsize), *this); } @@ -531,18 +544,21 @@ void FixQEqReaxKokkos<DeviceType>::cg_solve1() Kokkos::parallel_for(inum,sparse22_functor); if (neighflag != FULL) { 
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagZeroQGhosts>(nlocal,nlocal+atom->nghost),*this); + if (need_dup) + dup_o.reset_except(d_o); if (neighflag == HALF) { FixQEqReaxKokkosSparse23Functor<DeviceType,HALF> sparse23_functor(this); Kokkos::parallel_for(inum,sparse23_functor); - } else { + } else if (neighflag == HALFTHREAD) { FixQEqReaxKokkosSparse23Functor<DeviceType,HALFTHREAD> sparse23_functor(this); Kokkos::parallel_for(inum,sparse23_functor); } + if (need_dup) + Kokkos::Experimental::contribute(d_o, dup_o); } else { Kokkos::parallel_for(Kokkos::TeamPolicy <DeviceType, TagSparseMatvec2> (inum, teamsize), *this); } - if (neighflag != FULL) { k_o.template modify<DeviceType>(); k_o.template sync<LMPHostType>(); @@ -607,13 +623,17 @@ void FixQEqReaxKokkos<DeviceType>::cg_solve2() Kokkos::parallel_for(inum,sparse32_functor); if (neighflag != FULL) { Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagZeroQGhosts>(nlocal,nlocal+atom->nghost),*this); + if (need_dup) + dup_o.reset_except(d_o); if (neighflag == HALF) { FixQEqReaxKokkosSparse33Functor<DeviceType,HALF> sparse33_functor(this); Kokkos::parallel_for(inum,sparse33_functor); - } else { + } else if (neighflag == HALFTHREAD) { FixQEqReaxKokkosSparse33Functor<DeviceType,HALFTHREAD> sparse33_functor(this); Kokkos::parallel_for(inum,sparse33_functor); } + if (need_dup) + Kokkos::Experimental::contribute(d_o, dup_o); } else { Kokkos::parallel_for(Kokkos::TeamPolicy <DeviceType, TagSparseMatvec3> (inum, teamsize), *this); } @@ -661,13 +681,17 @@ void FixQEqReaxKokkos<DeviceType>::cg_solve2() Kokkos::parallel_for(inum,sparse22_functor); if (neighflag != FULL) { Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagZeroQGhosts>(nlocal,nlocal+atom->nghost),*this); + if (need_dup) + dup_o.reset_except(d_o); if (neighflag == HALF) { FixQEqReaxKokkosSparse23Functor<DeviceType,HALF> sparse23_functor(this); Kokkos::parallel_for(inum,sparse23_functor); - } else { + } else if (neighflag == HALFTHREAD) { 
FixQEqReaxKokkosSparse23Functor<DeviceType,HALFTHREAD> sparse23_functor(this); Kokkos::parallel_for(inum,sparse23_functor); } + if (need_dup) + Kokkos::Experimental::contribute(d_o, dup_o); } else { Kokkos::parallel_for(Kokkos::TeamPolicy <DeviceType, TagSparseMatvec2> (inum, teamsize), *this); } @@ -779,8 +803,9 @@ template<int NEIGHFLAG> KOKKOS_INLINE_FUNCTION void FixQEqReaxKokkos<DeviceType>::sparse13_item(int ii) const { - // The q array is atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*, typename DAT::t_float_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_o = d_o; + // The o array (d_o) is duplicated for OpenMP, atomic for CUDA, and neither for Serial + auto v_o = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_o),decltype(ndup_o)>::get(dup_o,ndup_o); + auto a_o = v_o.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); const int i = d_ilist[ii]; if (mask[i] & groupbit) { @@ -831,8 +856,9 @@ template<int NEIGHFLAG> KOKKOS_INLINE_FUNCTION void FixQEqReaxKokkos<DeviceType>::sparse23_item(int ii) const { - // The q array is atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*, typename DAT::t_float_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_o = d_o; + // The o array (d_o) is duplicated for OpenMP, atomic for CUDA, and neither for Serial + auto v_o = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_o),decltype(ndup_o)>::get(dup_o,ndup_o); + auto a_o = v_o.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); const int i = d_ilist[ii]; if (mask[i] & groupbit) { @@ -890,8 +916,9 @@ template<int NEIGHFLAG> KOKKOS_INLINE_FUNCTION void FixQEqReaxKokkos<DeviceType>::sparse33_item(int ii) const { - // The q array is atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*, typename DAT::t_float_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_o = d_o; + // The o array (d_o) is duplicated for OpenMP, atomic for 
CUDA, and neither for Serial + auto v_o = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_o),decltype(ndup_o)>::get(dup_o,ndup_o); + auto a_o = v_o.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); const int i = d_ilist[ii]; if (mask[i] & groupbit) { diff --git a/src/KOKKOS/fix_qeq_reax_kokkos.h b/src/KOKKOS/fix_qeq_reax_kokkos.h index 517b541f6f0b328efe02b9939d472f38584d84c4..23bb4f32eea1e46e3bb9d04b0e36a571c3626826 100644 --- a/src/KOKKOS/fix_qeq_reax_kokkos.h +++ b/src/KOKKOS/fix_qeq_reax_kokkos.h @@ -148,6 +148,7 @@ class FixQEqReaxKokkos : public FixQEqReax { private: int inum; int allocated_flag; + int need_dup; typedef Kokkos::DualView<int***,DeviceType> tdual_int_1d; Kokkos::DualView<params_qeq*,Kokkos::LayoutRight,DeviceType> k_params; @@ -192,6 +193,9 @@ class FixQEqReaxKokkos : public FixQEqReax { HAT::t_ffloat_2d h_s_hist, h_t_hist; typename AT::t_ffloat_2d_randomread r_s_hist, r_t_hist; + Kokkos::Experimental::ScatterView<F_FLOAT*, typename AT::t_ffloat_1d::array_layout, DeviceType, Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated> dup_o; + Kokkos::Experimental::ScatterView<F_FLOAT*, typename AT::t_ffloat_1d::array_layout, DeviceType, Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated> ndup_o; + void init_shielding_k(); void init_hist(); void allocate_matrix(); diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp index fb6b8d8d45aa5d4e1e582c9b745ec53906aa999b..9973b5a68812be2d162874a348547a9b226fec88 100644 --- a/src/KOKKOS/kokkos.cpp +++ b/src/KOKKOS/kokkos.cpp @@ -166,6 +166,13 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) } #endif +#ifndef KOKKOS_HAVE_SERIAL + if (num_threads == 1) + error->warning(FLERR,"When using a single thread, the Kokkos Serial backend " + "(i.e. 
Makefile.kokkos_mpi_only) gives better performance " + "than the OpenMP backend"); +#endif + Kokkos::InitArguments args; args.num_threads = num_threads; args.num_numa = numa; diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h index cf209c0adb97e10a7bf5b45daa4502b3ad15d46a..e1e17a425a3c3e99fff4f615cfa958431e0448ea 100644 --- a/src/KOKKOS/kokkos.h +++ b/src/KOKKOS/kokkos.h @@ -16,6 +16,7 @@ #include "pointers.h" #include "kokkos_type.h" +#include "pair_kokkos.h" namespace LAMMPS_NS { @@ -40,6 +41,18 @@ class KokkosLMP : protected Pointers { ~KokkosLMP(); void accelerator(int, char **); int neigh_count(int); + + template<class DeviceType> + int need_dup() + { + int value = 0; + + if (neighflag == HALFTHREAD) + value = NeedDup<HALFTHREAD,DeviceType>::value; + + return value; + } + private: static void my_signal_handler(int); }; diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h index ddea35ca8892244c053f232af7fda8f6c73ca82c..b88c92ff7309667fa2c300efdd31b4590ebd2b4a 100644 --- a/src/KOKKOS/kokkos_type.h +++ b/src/KOKKOS/kokkos_type.h @@ -20,6 +20,9 @@ #include <Kokkos_DualView.hpp> #include <impl/Kokkos_Timer.hpp> #include <Kokkos_Vectorization.hpp> +#include <Kokkos_ScatterView.hpp> + +enum{FULL=1u,HALFTHREAD=2u,HALF=4u,N2=8u}; #if defined(KOKKOS_HAVE_CXX11) #undef ISFINITE @@ -205,6 +208,100 @@ struct ExecutionSpaceFromDevice<Kokkos::Cuda> { }; #endif + +// Determine memory traits for force array +// Use the Atomic trait (together with Unmanaged) when running HALFTHREAD neighbor list style, Unmanaged only otherwise +template<int NEIGHFLAG> +struct AtomicF { + enum {value = Kokkos::Unmanaged}; +}; + +template<> +struct AtomicF<HALFTHREAD> { + enum {value = Kokkos::Atomic|Kokkos::Unmanaged}; +}; + + +// Determine ScatterView contribution mode (atomic or non-atomic) for force array +// Use ScatterAtomic for HALFTHREAD neighbor list style with CUDA, or with OpenMP/Threads if LMP_KOKKOS_USE_ATOMICS is defined +template<int NEIGHFLAG, class DeviceType> +struct AtomicDup { + enum {value = Kokkos::Experimental::ScatterNonAtomic}; +}; + +#ifdef KOKKOS_ENABLE_CUDA +template<> +struct 
AtomicDup<HALFTHREAD,Kokkos::Cuda> { + enum {value = Kokkos::Experimental::ScatterAtomic}; +}; +#endif + +#ifdef LMP_KOKKOS_USE_ATOMICS + +#ifdef KOKKOS_ENABLE_OPENMP +template<> +struct AtomicDup<HALFTHREAD,Kokkos::OpenMP> { + enum {value = Kokkos::Experimental::ScatterAtomic}; +}; +#endif + +#ifdef KOKKOS_ENABLE_THREADS +template<> +struct AtomicDup<HALFTHREAD,Kokkos::Threads> { + enum {value = Kokkos::Experimental::ScatterAtomic}; +}; +#endif + +#endif + + +// Determine duplication traits for force array +// Use ScatterDuplicated for HALFTHREAD neighbor list style with OpenMP/Threads unless LMP_KOKKOS_USE_ATOMICS is defined +template<int NEIGHFLAG, class DeviceType> +struct NeedDup { + enum {value = Kokkos::Experimental::ScatterNonDuplicated}; +}; + +#ifndef LMP_KOKKOS_USE_ATOMICS + +#ifdef KOKKOS_ENABLE_OPENMP +template<> +struct NeedDup<HALFTHREAD,Kokkos::OpenMP> { + enum {value = Kokkos::Experimental::ScatterDuplicated}; +}; +#endif + +#ifdef KOKKOS_ENABLE_THREADS +template<> +struct NeedDup<HALFTHREAD,Kokkos::Threads> { + enum {value = Kokkos::Experimental::ScatterDuplicated}; +}; +#endif + +#endif + +template<int value, typename T1, typename T2> +class ScatterViewHelper {}; + +template<typename T1, typename T2> +class ScatterViewHelper<Kokkos::Experimental::ScatterDuplicated,T1,T2> { +public: + KOKKOS_INLINE_FUNCTION + static T1 get(const T1 &dup, const T2 &nondup) { + return dup; + } +}; + +template<typename T1, typename T2> +class ScatterViewHelper<Kokkos::Experimental::ScatterNonDuplicated,T1,T2> { +public: + KOKKOS_INLINE_FUNCTION + static T2 get(const T1 &dup, const T2 &nondup) { + return nondup; + } +}; + + // define precision // handle global precision, force, energy, positions, kspace separately diff --git a/src/KOKKOS/neigh_list_kokkos.h b/src/KOKKOS/neigh_list_kokkos.h index 1c433f321c9684b34b2181214b65b6d39bc069bf..585422c54f3d51a4a0ea2cfbadaa2acdd4a0ec68 100644 --- a/src/KOKKOS/neigh_list_kokkos.h +++ b/src/KOKKOS/neigh_list_kokkos.h @@ -20,8 +20,6 @@ namespace LAMMPS_NS { 
-enum{FULL=1u,HALFTHREAD=2u,HALF=4u,N2=8u}; - class AtomNeighbors { public: diff --git a/src/KOKKOS/pair_eam_alloy_kokkos.cpp b/src/KOKKOS/pair_eam_alloy_kokkos.cpp index f21ea2335dd36c00429064fd3924dd78055ce88f..fc19da1c8a7e07ba3446a9efec1e6f062fe9eea5 100644 --- a/src/KOKKOS/pair_eam_alloy_kokkos.cpp +++ b/src/KOKKOS/pair_eam_alloy_kokkos.cpp @@ -109,7 +109,6 @@ void PairEAMAlloyKokkos<DeviceType>::compute(int eflag_in, int vflag_in) x = atomKK->k_x.view<DeviceType>(); f = atomKK->k_f.view<DeviceType>(); - v_rho = k_rho.view<DeviceType>(); type = atomKK->k_type.view<DeviceType>(); tag = atomKK->k_tag.view<DeviceType>(); nlocal = atom->nlocal; @@ -122,6 +121,19 @@ void PairEAMAlloyKokkos<DeviceType>::compute(int eflag_in, int vflag_in) d_ilist = k_list->d_ilist; int inum = list->inum; + need_dup = lmp->kokkos->need_dup<DeviceType>(); + if (need_dup) { + dup_rho = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_rho); + dup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(f); + dup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_eatom); + dup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_vatom); + } else { + ndup_rho = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_rho); + ndup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(f); + ndup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_eatom); + ndup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_vatom); + } + 
copymode = 1; // zero out density @@ -233,6 +245,9 @@ void PairEAMAlloyKokkos<DeviceType>::compute(int eflag_in, int vflag_in) } } + if (need_dup) + Kokkos::Experimental::contribute(f, dup_f); + if (eflag_global) eng_vdwl += ev.evdwl; if (vflag_global) { virial[0] += ev.v[0]; @@ -244,11 +259,15 @@ void PairEAMAlloyKokkos<DeviceType>::compute(int eflag_in, int vflag_in) } if (eflag_atom) { + if (need_dup) + Kokkos::Experimental::contribute(d_eatom, dup_eatom); k_eatom.template modify<DeviceType>(); k_eatom.template sync<LMPHostType>(); } if (vflag_atom) { + if (need_dup) + Kokkos::Experimental::contribute(d_vatom, dup_vatom); k_vatom.template modify<DeviceType>(); k_vatom.template sync<LMPHostType>(); } @@ -256,6 +275,14 @@ void PairEAMAlloyKokkos<DeviceType>::compute(int eflag_in, int vflag_in) if (vflag_fdotr) pair_virial_fdotr_compute(this); copymode = 0; + + // free duplicated memory + if (need_dup) { + dup_rho = decltype(dup_rho)(); + dup_f = decltype(dup_f)(); + dup_eatom = decltype(dup_eatom)(); + dup_vatom = decltype(dup_vatom)(); + } } /* ---------------------------------------------------------------------- @@ -503,8 +530,10 @@ void PairEAMAlloyKokkos<DeviceType>::operator()(TagPairEAMAlloyKernelA<NEIGHFLAG // rho = density at each atom // loop over neighbors of my atoms - // The rho array is atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*, typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > rho = v_rho; + // The rho array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_rho = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_rho),decltype(ndup_rho)>::get(dup_rho,ndup_rho); + auto a_rho = v_rho.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); const int i = d_ilist[ii]; const X_FLOAT xtmp = x(i,0); @@ -672,8 +701,10 @@ template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG> KOKKOS_INLINE_FUNCTION void 
PairEAMAlloyKokkos<DeviceType>::operator()(TagPairEAMAlloyKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const { - // The f array is atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f; + // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_f = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + auto a_f = v_f.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); const int i = d_ilist[ii]; const X_FLOAT xtmp = x(i,0); @@ -780,18 +811,22 @@ void PairEAMAlloyKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, const const int EFLAG = eflag; const int VFLAG = vflag_either; - // The eatom and vatom arrays are atomic for Half/Thread neighbor style - Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom = k_eatom.view<DeviceType>(); - Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom = k_vatom.view<DeviceType>(); + // The eatom and vatom arrays are duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_eatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_eatom),decltype(ndup_eatom)>::get(dup_eatom,ndup_eatom); + auto a_eatom = v_eatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); + + auto v_vatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom); + auto a_vatom = v_vatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); if (EFLAG) { if (eflag_atom) { const E_FLOAT epairhalf = 0.5 * epair; if (NEIGHFLAG!=FULL) { - if (NEWTON_PAIR || i < nlocal) v_eatom[i] += epairhalf; - if (NEWTON_PAIR || j < nlocal) v_eatom[j] += epairhalf; + 
if (NEWTON_PAIR || i < nlocal) a_eatom[i] += epairhalf; + if (NEWTON_PAIR || j < nlocal) a_eatom[j] += epairhalf; } else { - v_eatom[i] += epairhalf; + a_eatom[i] += epairhalf; } } } @@ -835,28 +870,28 @@ void PairEAMAlloyKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, const if (vflag_atom) { if (NEIGHFLAG!=FULL) { if (NEWTON_PAIR || i < nlocal) { - v_vatom(i,0) += 0.5*v0; - v_vatom(i,1) += 0.5*v1; - v_vatom(i,2) += 0.5*v2; - v_vatom(i,3) += 0.5*v3; - v_vatom(i,4) += 0.5*v4; - v_vatom(i,5) += 0.5*v5; + a_vatom(i,0) += 0.5*v0; + a_vatom(i,1) += 0.5*v1; + a_vatom(i,2) += 0.5*v2; + a_vatom(i,3) += 0.5*v3; + a_vatom(i,4) += 0.5*v4; + a_vatom(i,5) += 0.5*v5; } if (NEWTON_PAIR || j < nlocal) { - v_vatom(j,0) += 0.5*v0; - v_vatom(j,1) += 0.5*v1; - v_vatom(j,2) += 0.5*v2; - v_vatom(j,3) += 0.5*v3; - v_vatom(j,4) += 0.5*v4; - v_vatom(j,5) += 0.5*v5; + a_vatom(j,0) += 0.5*v0; + a_vatom(j,1) += 0.5*v1; + a_vatom(j,2) += 0.5*v2; + a_vatom(j,3) += 0.5*v3; + a_vatom(j,4) += 0.5*v4; + a_vatom(j,5) += 0.5*v5; } } else { - v_vatom(i,0) += 0.5*v0; - v_vatom(i,1) += 0.5*v1; - v_vatom(i,2) += 0.5*v2; - v_vatom(i,3) += 0.5*v3; - v_vatom(i,4) += 0.5*v4; - v_vatom(i,5) += 0.5*v5; + a_vatom(i,0) += 0.5*v0; + a_vatom(i,1) += 0.5*v1; + a_vatom(i,2) += 0.5*v2; + a_vatom(i,3) += 0.5*v3; + a_vatom(i,4) += 0.5*v4; + a_vatom(i,5) += 0.5*v5; } } } @@ -1165,4 +1200,3 @@ template class PairEAMAlloyKokkos<LMPDeviceType>; template class PairEAMAlloyKokkos<LMPHostType>; #endif } - diff --git a/src/KOKKOS/pair_eam_alloy_kokkos.h b/src/KOKKOS/pair_eam_alloy_kokkos.h index 584839967218b3d8ea1079c9cbf7a2ea597436e4..6593ccae734ba038d21b2b85b70522c67512abee 100644 --- a/src/KOKKOS/pair_eam_alloy_kokkos.h +++ b/src/KOKKOS/pair_eam_alloy_kokkos.h @@ -129,10 +129,19 @@ class PairEAMAlloyKokkos : public PairEAM, public KokkosBase { typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom; typename ArrayTypes<DeviceType>::t_virial_array d_vatom; + int need_dup; + Kokkos::Experimental::ScatterView<F_FLOAT*, 
typename DAT::t_ffloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_rho; + Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_f; + Kokkos::Experimental::ScatterView<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_eatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_vatom; + Kokkos::Experimental::ScatterView<F_FLOAT*, typename DAT::t_ffloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_rho; + Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_f; + Kokkos::Experimental::ScatterView<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_eatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_vatom; + DAT::tdual_ffloat_1d k_rho; DAT::tdual_ffloat_1d k_fp; typename AT::t_ffloat_1d d_rho; - typename AT::t_ffloat_1d v_rho; typename AT::t_ffloat_1d d_fp; HAT::t_ffloat_1d h_rho; HAT::t_ffloat_1d h_fp; diff --git a/src/KOKKOS/pair_eam_fs_kokkos.cpp b/src/KOKKOS/pair_eam_fs_kokkos.cpp index 627b56b84ac10e76e9c189d80abde07fe2b8678f..8f5571bf29fc999787fd03ae6ea396d4c75cf9bc 100644 --- a/src/KOKKOS/pair_eam_fs_kokkos.cpp +++ b/src/KOKKOS/pair_eam_fs_kokkos.cpp @@ -109,7 +109,6 @@ void PairEAMFSKokkos<DeviceType>::compute(int eflag_in, int vflag_in) x = 
atomKK->k_x.view<DeviceType>(); f = atomKK->k_f.view<DeviceType>(); - v_rho = k_rho.view<DeviceType>(); type = atomKK->k_type.view<DeviceType>(); tag = atomKK->k_tag.view<DeviceType>(); nlocal = atom->nlocal; @@ -122,6 +121,19 @@ void PairEAMFSKokkos<DeviceType>::compute(int eflag_in, int vflag_in) d_ilist = k_list->d_ilist; int inum = list->inum; + need_dup = lmp->kokkos->need_dup<DeviceType>(); + if (need_dup) { + dup_rho = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_rho); + dup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(f); + dup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_eatom); + dup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_vatom); + } else { + ndup_rho = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_rho); + ndup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(f); + ndup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_eatom); + ndup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_vatom); + } + copymode = 1; // zero out density @@ -233,6 +245,9 @@ void PairEAMFSKokkos<DeviceType>::compute(int eflag_in, int vflag_in) } } + if (need_dup) + Kokkos::Experimental::contribute(f, dup_f); + if (eflag_global) eng_vdwl += ev.evdwl; if (vflag_global) { virial[0] += ev.v[0]; @@ -246,16 +261,28 @@ void PairEAMFSKokkos<DeviceType>::compute(int eflag_in, int vflag_in) if (vflag_fdotr) pair_virial_fdotr_compute(this); if (eflag_atom) { + if 
(need_dup) + Kokkos::Experimental::contribute(d_eatom, dup_eatom); k_eatom.template modify<DeviceType>(); k_eatom.template sync<LMPHostType>(); } if (vflag_atom) { + if (need_dup) + Kokkos::Experimental::contribute(d_vatom, dup_vatom); k_vatom.template modify<DeviceType>(); k_vatom.template sync<LMPHostType>(); } copymode = 0; + + // free duplicated memory + if (need_dup) { + dup_rho = decltype(dup_rho)(); + dup_f = decltype(dup_f)(); + dup_eatom = decltype(dup_eatom)(); + dup_vatom = decltype(dup_vatom)(); + } } /* ---------------------------------------------------------------------- @@ -503,8 +530,10 @@ void PairEAMFSKokkos<DeviceType>::operator()(TagPairEAMFSKernelA<NEIGHFLAG,NEWTO // rho = density at each atom // loop over neighbors of my atoms - // The rho array is atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*, typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > rho = v_rho; + // The rho array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_rho = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_rho),decltype(ndup_rho)>::get(dup_rho,ndup_rho); + auto a_rho = v_rho.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); const int i = d_ilist[ii]; const X_FLOAT xtmp = x(i,0); @@ -672,8 +701,10 @@ template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG> KOKKOS_INLINE_FUNCTION void PairEAMFSKokkos<DeviceType>::operator()(TagPairEAMFSKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const { - // The f array is atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f; + // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_f = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + auto a_f = v_f.template 
access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); const int i = d_ilist[ii]; const X_FLOAT xtmp = x(i,0); @@ -780,18 +811,22 @@ void PairEAMFSKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, const int const int EFLAG = eflag; const int VFLAG = vflag_either; - // The eatom and vatom arrays are atomic for Half/Thread neighbor style - Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom = k_eatom.view<DeviceType>(); - Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom = k_vatom.view<DeviceType>(); + // The eatom and vatom arrays are duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_eatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_eatom),decltype(ndup_eatom)>::get(dup_eatom,ndup_eatom); + auto a_eatom = v_eatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); + + auto v_vatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom); + auto a_vatom = v_vatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); if (EFLAG) { if (eflag_atom) { const E_FLOAT epairhalf = 0.5 * epair; if (NEIGHFLAG!=FULL) { - if (NEWTON_PAIR || i < nlocal) v_eatom[i] += epairhalf; - if (NEWTON_PAIR || j < nlocal) v_eatom[j] += epairhalf; + if (NEWTON_PAIR || i < nlocal) a_eatom[i] += epairhalf; + if (NEWTON_PAIR || j < nlocal) a_eatom[j] += epairhalf; } else { - v_eatom[i] += epairhalf; + a_eatom[i] += epairhalf; } } } @@ -835,28 +870,28 @@ void PairEAMFSKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, const int if (vflag_atom) { if (NEIGHFLAG!=FULL) { if (NEWTON_PAIR || i < nlocal) { - v_vatom(i,0) += 0.5*v0; - v_vatom(i,1) += 0.5*v1; - v_vatom(i,2) += 0.5*v2; - v_vatom(i,3) += 0.5*v3; - v_vatom(i,4) += 0.5*v4; - v_vatom(i,5) += 0.5*v5; + a_vatom(i,0) += 0.5*v0; + a_vatom(i,1) += 
0.5*v1; + a_vatom(i,2) += 0.5*v2; + a_vatom(i,3) += 0.5*v3; + a_vatom(i,4) += 0.5*v4; + a_vatom(i,5) += 0.5*v5; } if (NEWTON_PAIR || j < nlocal) { - v_vatom(j,0) += 0.5*v0; - v_vatom(j,1) += 0.5*v1; - v_vatom(j,2) += 0.5*v2; - v_vatom(j,3) += 0.5*v3; - v_vatom(j,4) += 0.5*v4; - v_vatom(j,5) += 0.5*v5; + a_vatom(j,0) += 0.5*v0; + a_vatom(j,1) += 0.5*v1; + a_vatom(j,2) += 0.5*v2; + a_vatom(j,3) += 0.5*v3; + a_vatom(j,4) += 0.5*v4; + a_vatom(j,5) += 0.5*v5; } } else { - v_vatom(i,0) += 0.5*v0; - v_vatom(i,1) += 0.5*v1; - v_vatom(i,2) += 0.5*v2; - v_vatom(i,3) += 0.5*v3; - v_vatom(i,4) += 0.5*v4; - v_vatom(i,5) += 0.5*v5; + a_vatom(i,0) += 0.5*v0; + a_vatom(i,1) += 0.5*v1; + a_vatom(i,2) += 0.5*v2; + a_vatom(i,3) += 0.5*v3; + a_vatom(i,4) += 0.5*v4; + a_vatom(i,5) += 0.5*v5; } } } @@ -1174,4 +1209,3 @@ template class PairEAMFSKokkos<LMPDeviceType>; template class PairEAMFSKokkos<LMPHostType>; #endif } - diff --git a/src/KOKKOS/pair_eam_fs_kokkos.h b/src/KOKKOS/pair_eam_fs_kokkos.h index ce0b572ea2054113b32064e748a970027b66ab35..f75605ff6dbca4a131f14fea488c57164f2d14b1 100644 --- a/src/KOKKOS/pair_eam_fs_kokkos.h +++ b/src/KOKKOS/pair_eam_fs_kokkos.h @@ -129,10 +129,19 @@ class PairEAMFSKokkos : public PairEAM, public KokkosBase { typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom; typename ArrayTypes<DeviceType>::t_virial_array d_vatom; + int need_dup; + Kokkos::Experimental::ScatterView<F_FLOAT*, typename DAT::t_ffloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_rho; + Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_f; + Kokkos::Experimental::ScatterView<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_eatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename 
DAT::t_virial_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_vatom; + Kokkos::Experimental::ScatterView<F_FLOAT*, typename DAT::t_ffloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_rho; + Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_f; + Kokkos::Experimental::ScatterView<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_eatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_vatom; + DAT::tdual_ffloat_1d k_rho; DAT::tdual_ffloat_1d k_fp; typename AT::t_ffloat_1d d_rho; - typename AT::t_ffloat_1d v_rho; typename AT::t_ffloat_1d d_fp; HAT::t_ffloat_1d h_rho; HAT::t_ffloat_1d h_fp; diff --git a/src/KOKKOS/pair_eam_kokkos.cpp b/src/KOKKOS/pair_eam_kokkos.cpp index 1bfb8255dc68b089bc7362501691a577970d4af4..a1431334b47ba07a25d59e9225516c741158f286 100644 --- a/src/KOKKOS/pair_eam_kokkos.cpp +++ b/src/KOKKOS/pair_eam_kokkos.cpp @@ -34,7 +34,6 @@ using namespace LAMMPS_NS; - /* ---------------------------------------------------------------------- */ template<class DeviceType> @@ -104,7 +103,6 @@ void PairEAMKokkos<DeviceType>::compute(int eflag_in, int vflag_in) x = atomKK->k_x.view<DeviceType>(); f = atomKK->k_f.view<DeviceType>(); - v_rho = k_rho.view<DeviceType>(); type = atomKK->k_type.view<DeviceType>(); tag = atomKK->k_tag.view<DeviceType>(); nlocal = atom->nlocal; @@ -117,6 +115,19 @@ void PairEAMKokkos<DeviceType>::compute(int eflag_in, int vflag_in) d_ilist = k_list->d_ilist; int inum = list->inum; + need_dup = lmp->kokkos->need_dup<DeviceType>(); + if (need_dup) { + dup_rho = 
Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_rho); + dup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(f); + dup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_eatom); + dup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_vatom); + } else { + ndup_rho = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_rho); + ndup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(f); + ndup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_eatom); + ndup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_vatom); + } + copymode = 1; // zero out density @@ -228,6 +239,9 @@ void PairEAMKokkos<DeviceType>::compute(int eflag_in, int vflag_in) } } + if (need_dup) + Kokkos::Experimental::contribute(f, dup_f); + if (eflag_global) eng_vdwl += ev.evdwl; if (vflag_global) { virial[0] += ev.v[0]; @@ -241,16 +255,28 @@ void PairEAMKokkos<DeviceType>::compute(int eflag_in, int vflag_in) if (vflag_fdotr) pair_virial_fdotr_compute(this); if (eflag_atom) { + if (need_dup) + Kokkos::Experimental::contribute(d_eatom, dup_eatom); k_eatom.template modify<DeviceType>(); k_eatom.template sync<LMPHostType>(); } if (vflag_atom) { + if (need_dup) + Kokkos::Experimental::contribute(d_vatom, dup_vatom); k_vatom.template modify<DeviceType>(); k_vatom.template sync<LMPHostType>(); } copymode = 0; + + // free duplicated memory + if (need_dup) { + dup_rho = decltype(dup_rho)(); + dup_f = decltype(dup_f)(); + 
dup_eatom = decltype(dup_eatom)(); + dup_vatom = decltype(dup_vatom)(); + } } /* ---------------------------------------------------------------------- @@ -503,8 +529,10 @@ void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelA<NEIGHFLAG,NEWTON_PA // rho = density at each atom // loop over neighbors of my atoms - // The rho array is atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*, typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > rho = v_rho; + // The rho array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_rho = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_rho),decltype(ndup_rho)>::get(dup_rho,ndup_rho); + auto a_rho = v_rho.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); const int i = d_ilist[ii]; const X_FLOAT xtmp = x(i,0); @@ -538,13 +566,13 @@ void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelA<NEIGHFLAG,NEWTON_PA d_rhor_spline(d_type2rhor_ji,m,5))*p + d_rhor_spline(d_type2rhor_ji,m,6); if (NEWTON_PAIR || j < nlocal) { const int d_type2rhor_ij = d_type2rhor(itype,jtype); - rho[j] += ((d_rhor_spline(d_type2rhor_ij,m,3)*p + d_rhor_spline(d_type2rhor_ij,m,4))*p + - d_rhor_spline(d_type2rhor_ij,m,5))*p + d_rhor_spline(d_type2rhor_ij,m,6); + a_rho[j] += ((d_rhor_spline(d_type2rhor_ij,m,3)*p + d_rhor_spline(d_type2rhor_ij,m,4))*p + + d_rhor_spline(d_type2rhor_ij,m,5))*p + d_rhor_spline(d_type2rhor_ij,m,6); } } } - rho[i] += rhotmp; + a_rho[i] += rhotmp; } /* ---------------------------------------------------------------------- */ @@ -670,8 +698,10 @@ template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG> KOKKOS_INLINE_FUNCTION void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelC<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const { - // The f array is atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*[3], typename 
DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f; + // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_f = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + auto a_f = v_f.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); const int i = d_ilist[ii]; const X_FLOAT xtmp = x(i,0); @@ -778,18 +808,22 @@ void PairEAMKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, const int & const int EFLAG = eflag; const int VFLAG = vflag_either; - // The eatom and vatom arrays are atomic for Half/Thread neighbor style - Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom = k_eatom.view<DeviceType>(); - Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom = k_vatom.view<DeviceType>(); + // The eatom and vatom arrays are duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_eatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_eatom),decltype(ndup_eatom)>::get(dup_eatom,ndup_eatom); + auto a_eatom = v_eatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); + + auto v_vatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom); + auto a_vatom = v_vatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); if (EFLAG) { if (eflag_atom) { const E_FLOAT epairhalf = 0.5 * epair; if (NEIGHFLAG!=FULL) { - if (NEWTON_PAIR || i < nlocal) v_eatom[i] += epairhalf; - if (NEWTON_PAIR || j < nlocal) v_eatom[j] += epairhalf; + if (NEWTON_PAIR || i < nlocal) a_eatom[i] += epairhalf; + if (NEWTON_PAIR || j < nlocal) a_eatom[j] += epairhalf; } else { - v_eatom[i] += epairhalf; + a_eatom[i] += epairhalf; } } } @@ -833,28 +867,28 @@ void 
PairEAMKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, const int & if (vflag_atom) { if (NEIGHFLAG!=FULL) { if (NEWTON_PAIR || i < nlocal) { - v_vatom(i,0) += 0.5*v0; - v_vatom(i,1) += 0.5*v1; - v_vatom(i,2) += 0.5*v2; - v_vatom(i,3) += 0.5*v3; - v_vatom(i,4) += 0.5*v4; - v_vatom(i,5) += 0.5*v5; + a_vatom(i,0) += 0.5*v0; + a_vatom(i,1) += 0.5*v1; + a_vatom(i,2) += 0.5*v2; + a_vatom(i,3) += 0.5*v3; + a_vatom(i,4) += 0.5*v4; + a_vatom(i,5) += 0.5*v5; } if (NEWTON_PAIR || j < nlocal) { - v_vatom(j,0) += 0.5*v0; - v_vatom(j,1) += 0.5*v1; - v_vatom(j,2) += 0.5*v2; - v_vatom(j,3) += 0.5*v3; - v_vatom(j,4) += 0.5*v4; - v_vatom(j,5) += 0.5*v5; + a_vatom(j,0) += 0.5*v0; + a_vatom(j,1) += 0.5*v1; + a_vatom(j,2) += 0.5*v2; + a_vatom(j,3) += 0.5*v3; + a_vatom(j,4) += 0.5*v4; + a_vatom(j,5) += 0.5*v5; } } else { - v_vatom(i,0) += 0.5*v0; - v_vatom(i,1) += 0.5*v1; - v_vatom(i,2) += 0.5*v2; - v_vatom(i,3) += 0.5*v3; - v_vatom(i,4) += 0.5*v4; - v_vatom(i,5) += 0.5*v5; + a_vatom(i,0) += 0.5*v0; + a_vatom(i,1) += 0.5*v1; + a_vatom(i,2) += 0.5*v2; + a_vatom(i,3) += 0.5*v3; + a_vatom(i,4) += 0.5*v4; + a_vatom(i,5) += 0.5*v5; } } } diff --git a/src/KOKKOS/pair_eam_kokkos.h b/src/KOKKOS/pair_eam_kokkos.h index c821750da54a0c2faf2eeb49ad0689f016bf1e41..4040eba8588413c61e5b1984e28026fa9db34372 100644 --- a/src/KOKKOS/pair_eam_kokkos.h +++ b/src/KOKKOS/pair_eam_kokkos.h @@ -126,10 +126,19 @@ class PairEAMKokkos : public PairEAM, public KokkosBase { typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom; typename ArrayTypes<DeviceType>::t_virial_array d_vatom; + int need_dup; + Kokkos::Experimental::ScatterView<F_FLOAT*, typename DAT::t_ffloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_rho; + Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_f; + Kokkos::Experimental::ScatterView<E_FLOAT*, typename 
DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_eatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_vatom; + Kokkos::Experimental::ScatterView<F_FLOAT*, typename DAT::t_ffloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_rho; + Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_f; + Kokkos::Experimental::ScatterView<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_eatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_vatom; + DAT::tdual_ffloat_1d k_rho; DAT::tdual_ffloat_1d k_fp; typename AT::t_ffloat_1d d_rho; - typename AT::t_ffloat_1d v_rho; typename AT::t_ffloat_1d d_fp; HAT::t_ffloat_1d h_rho; HAT::t_ffloat_1d h_fp; diff --git a/src/KOKKOS/pair_kokkos.h b/src/KOKKOS/pair_kokkos.h index 6257566ba761bfc1314c6b403cb25545f9ebba12..ab616d2c07ab5cca684821483ab81766ae83fca4 100644 --- a/src/KOKKOS/pair_kokkos.h +++ b/src/KOKKOS/pair_kokkos.h @@ -23,6 +23,7 @@ #include "neighbor_kokkos.h" #include "neigh_list_kokkos.h" #include "Kokkos_Vectorization.hpp" +#include "Kokkos_ScatterView.hpp" namespace LAMMPS_NS { @@ -47,45 +48,48 @@ struct DoCoul<1> { typedef CoulTag type; }; -// Determine memory traits for force array -// Do atomic trait when running HALFTHREAD neighbor list style -template<int NEIGHFLAG> -struct AtomicF { - enum {value = Kokkos::Unmanaged}; -}; - -template<> -struct AtomicF<HALFTHREAD> { - enum {value = 
Kokkos::Atomic|Kokkos::Unmanaged}; -}; //Specialisation for Neighborlist types Half, HalfThread, Full template <class PairStyle, int NEIGHFLAG, bool STACKPARAMS, class Specialisation = void> struct PairComputeFunctor { typedef typename PairStyle::device_type device_type ; + typedef ArrayTypes<device_type> AT; // Reduction type, contains evdwl, ecoul and virial[6] typedef EV_FLOAT value_type; // The copy of the pair style PairStyle c; + typename AT::t_f_array f; + typename AT::t_efloat_1d d_eatom; + typename AT::t_virial_array d_vatom; // The force array is atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout, - device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > f; + //Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout, + // device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > f; + Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,device_type,Kokkos::Experimental::ScatterSum,NeedDup<NEIGHFLAG,device_type>::value > dup_f; // The eatom and vatom arrays are atomic for Half/Thread neighbor style - Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout, - device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > eatom; - Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout, - device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > vatom; + //Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout, + // device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > eatom; + Kokkos::Experimental::ScatterView<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,device_type,Kokkos::Experimental::ScatterSum,NeedDup<NEIGHFLAG,device_type>::value > dup_eatom; + + //Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout, + // device_type,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > vatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename 
DAT::t_virial_array::array_layout,device_type,Kokkos::Experimental::ScatterSum,NeedDup<NEIGHFLAG,device_type>::value > dup_vatom; + + NeighListKokkos<device_type> list; PairComputeFunctor(PairStyle* c_ptr, NeighListKokkos<device_type>* list_ptr): - c(*c_ptr),f(c.f),eatom(c.d_eatom), - vatom(c.d_vatom),list(*list_ptr) {}; + c(*c_ptr),list(*list_ptr) { + // allocate duplicated memory + dup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, NeedDup<NEIGHFLAG,device_type>::value >(c.f); + dup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, NeedDup<NEIGHFLAG,device_type>::value >(c.d_eatom); + dup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, NeedDup<NEIGHFLAG,device_type>::value >(c.d_vatom); + }; // Call cleanup_copy which sets allocations NULL which are destructed by the PairStyle ~PairComputeFunctor() {c.cleanup_copy();list.copymode = 1;}; @@ -94,12 +98,25 @@ struct PairComputeFunctor { return j >> SBBITS & 3; } + void contribute() { + Kokkos::Experimental::contribute(c.f, dup_f); + + if (c.eflag_atom) + Kokkos::Experimental::contribute(c.d_eatom, dup_eatom); + + if (c.vflag_atom) + Kokkos::Experimental::contribute(c.d_vatom, dup_vatom); + } + // Loop over neighbors of one atom without coulomb interaction // This function is called in parallel template<int EVFLAG, int NEWTON_PAIR> KOKKOS_FUNCTION EV_FLOAT compute_item(const int& ii, const NeighListKokkos<device_type> &list, const NoCoulTag&) const { + + auto a_f = dup_f.template access<AtomicDup<NEIGHFLAG,device_type>::value>(); + EV_FLOAT ev; const int i = list.d_ilist[ii]; const X_FLOAT xtmp = c.x(i,0); @@ -133,9 +150,9 @@ struct PairComputeFunctor { fztmp += delz*fpair; if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < c.nlocal)) { - f(j,0) -= delx*fpair; - f(j,1) -= dely*fpair; - f(j,2) -= delz*fpair; + a_f(j,0) -= delx*fpair; + a_f(j,1) -= dely*fpair; + a_f(j,2) -= delz*fpair; } if (EVFLAG) 
{ @@ -151,9 +168,9 @@ struct PairComputeFunctor { } - f(i,0) += fxtmp; - f(i,1) += fytmp; - f(i,2) += fztmp; + a_f(i,0) += fxtmp; + a_f(i,1) += fytmp; + a_f(i,2) += fztmp; return ev; } @@ -164,6 +181,9 @@ struct PairComputeFunctor { KOKKOS_FUNCTION EV_FLOAT compute_item(const int& ii, const NeighListKokkos<device_type> &list, const CoulTag& ) const { + + auto a_f = dup_f.template access<AtomicDup<NEIGHFLAG,device_type>::value>(); + EV_FLOAT ev; const int i = list.d_ilist[ii]; const X_FLOAT xtmp = c.x(i,0); @@ -204,9 +224,9 @@ struct PairComputeFunctor { fztmp += delz*fpair; if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < c.nlocal)) { - f(j,0) -= delx*fpair; - f(j,1) -= dely*fpair; - f(j,2) -= delz*fpair; + a_f(j,0) -= delx*fpair; + a_f(j,1) -= dely*fpair; + a_f(j,2) -= delz*fpair; } if (EVFLAG) { @@ -228,9 +248,9 @@ struct PairComputeFunctor { } } - f(i,0) += fxtmp; - f(i,1) += fytmp; - f(i,2) += fztmp; + a_f(i,0) += fxtmp; + a_f(i,1) += fytmp; + a_f(i,2) += fztmp; return ev; } @@ -240,6 +260,9 @@ struct PairComputeFunctor { const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx, const F_FLOAT &dely, const F_FLOAT &delz) const { + auto a_eatom = dup_eatom.template access<AtomicDup<NEIGHFLAG,device_type>::value>(); + auto a_vatom = dup_vatom.template access<AtomicDup<NEIGHFLAG,device_type>::value>(); + const int EFLAG = c.eflag; const int NEWTON_PAIR = c.newton_pair; const int VFLAG = c.vflag_either; @@ -247,8 +270,8 @@ struct PairComputeFunctor { if (EFLAG) { if (c.eflag_atom) { const E_FLOAT epairhalf = 0.5 * epair; - if (NEWTON_PAIR || i < c.nlocal) eatom[i] += epairhalf; - if ((NEWTON_PAIR || j < c.nlocal) && NEIGHFLAG != FULL) eatom[j] += epairhalf; + if (NEWTON_PAIR || i < c.nlocal) a_eatom[i] += epairhalf; + if ((NEWTON_PAIR || j < c.nlocal) && NEIGHFLAG != FULL) a_eatom[j] += epairhalf; } } @@ -299,20 +322,20 @@ struct PairComputeFunctor { if (c.vflag_atom) { if (NEWTON_PAIR || i < c.nlocal) { - vatom(i,0) += 0.5*v0; - 
vatom(i,1) += 0.5*v1; - vatom(i,2) += 0.5*v2; - vatom(i,3) += 0.5*v3; - vatom(i,4) += 0.5*v4; - vatom(i,5) += 0.5*v5; + a_vatom(i,0) += 0.5*v0; + a_vatom(i,1) += 0.5*v1; + a_vatom(i,2) += 0.5*v2; + a_vatom(i,3) += 0.5*v3; + a_vatom(i,4) += 0.5*v4; + a_vatom(i,5) += 0.5*v5; } if ((NEWTON_PAIR || j < c.nlocal) && NEIGHFLAG != FULL) { - vatom(j,0) += 0.5*v0; - vatom(j,1) += 0.5*v1; - vatom(j,2) += 0.5*v2; - vatom(j,3) += 0.5*v3; - vatom(j,4) += 0.5*v4; - vatom(j,5) += 0.5*v5; + a_vatom(j,0) += 0.5*v0; + a_vatom(j,1) += 0.5*v1; + a_vatom(j,2) += 0.5*v2; + a_vatom(j,3) += 0.5*v3; + a_vatom(j,4) += 0.5*v4; + a_vatom(j,5) += 0.5*v5; } } } @@ -351,6 +374,9 @@ struct PairComputeFunctor<PairStyle,N2,STACKPARAMS,Specialisation> { return j >> SBBITS & 3; } + + void contribute() {} + template<int EVFLAG, int NEWTON_PAIR> KOKKOS_FUNCTION EV_FLOAT compute_item(const int& ii, @@ -489,10 +515,12 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, typename Kokkos::Impl::enable PairComputeFunctor<PairStyle,NEIGHFLAG,false,Specialisation > ff(fpair,list); if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev); else Kokkos::parallel_for(list->inum,ff); + ff.contribute(); } else { PairComputeFunctor<PairStyle,NEIGHFLAG,true,Specialisation > ff(fpair,list); if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev); else Kokkos::parallel_for(list->inum,ff); + ff.contribute(); } return ev; } diff --git a/src/KOKKOS/pair_reaxc_kokkos.cpp b/src/KOKKOS/pair_reaxc_kokkos.cpp index e2e2e6f6de96197435a3bcb4dcca3af53ed3ec45..46ecddfd83849f0f0b6fcfcfa2d5aa301faa1758 100644 --- a/src/KOKKOS/pair_reaxc_kokkos.cpp +++ b/src/KOKKOS/pair_reaxc_kokkos.cpp @@ -708,6 +708,19 @@ void PairReaxCKokkos<DeviceType>::compute(int eflag_in, int vflag_in) d_neighbors = k_list->d_neighbors; d_ilist = k_list->d_ilist; + need_dup = lmp->kokkos->need_dup<DeviceType>(); + + // allocate duplicated memory + if (need_dup) { + dup_f = 
Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(f); + dup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_eatom); + dup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_vatom); + } else { + ndup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(f); + ndup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_eatom); + ndup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_vatom); + } + if (eflag_global) { for (int i = 0; i < 14; i++) pvector[i] = 0.0; @@ -777,6 +790,15 @@ void PairReaxCKokkos<DeviceType>::compute(int eflag_in, int vflag_in) allocate_array(); } + // allocate duplicated memory + if (need_dup) { + dup_dDeltap_self = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_dDeltap_self); + dup_total_bo = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_total_bo); + } else { + ndup_dDeltap_self = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_dDeltap_self); + ndup_total_bo = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_total_bo); + } + // Neighbor lists for bond and hbond // try, resize if necessary @@ -799,7 +821,7 @@ void PairReaxCKokkos<DeviceType>::compute(int eflag_in, int vflag_in) if (neighflag == HALF) Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxBuildListsHalf<HALF> >(0,ignum),*this); else if (neighflag 
== HALFTHREAD) - Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxBuildListsHalf_LessAtomics<HALFTHREAD> >(0,ignum),*this); + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxBuildListsHalf<HALFTHREAD> >(0,ignum),*this); else //(neighflag == FULL) Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxBuildListsFull>(0,ignum),*this); @@ -814,14 +836,40 @@ void PairReaxCKokkos<DeviceType>::compute(int eflag_in, int vflag_in) if (resize_hb) maxhb++; resize = resize_bo || resize_hb; - if (resize) allocate_array(); + if (resize) { + allocate_array(); + if (need_dup) { + dup_dDeltap_self = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_dDeltap_self); + dup_total_bo = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_total_bo); + } else { + ndup_dDeltap_self = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_dDeltap_self); + ndup_total_bo = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_total_bo); + } + } } + // allocate duplicated memory + if (need_dup) { + dup_CdDelta = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_CdDelta); + //dup_Cdbo = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_Cdbo); + //dup_Cdbopi = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_Cdbopi); + //dup_Cdbopi2 = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_Cdbopi2); + } else { + ndup_CdDelta = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, 
Kokkos::Experimental::ScatterNonDuplicated>(d_CdDelta); + //ndup_Cdbo = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_Cdbo); + //ndup_Cdbopi = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_Cdbopi); + //ndup_Cdbopi2 = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_Cdbopi2); + } + + // reduction over duplicated memory + if (need_dup) + Kokkos::Experimental::contribute(d_total_bo, dup_total_bo); // needed in BondOrder1 + // Bond order if (neighflag == HALF) { Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxBondOrder1>(0,ignum),*this); } else if (neighflag == HALFTHREAD) { - Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxBondOrder1_LessAtomics>(0,ignum),*this); + Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxBondOrder1>(0,ignum),*this); } Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxBondOrder2>(0,ignum),*this); Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxBondOrder3>(0,ignum),*this); @@ -920,9 +968,30 @@ void PairReaxCKokkos<DeviceType>::compute(int eflag_in, int vflag_in) pvector[7] = ev.ereax[8]; ev_all.evdwl += ev.ereax[8]; + // reduction over duplicated memory + if (need_dup) { + Kokkos::Experimental::contribute(d_dDeltap_self, dup_dDeltap_self); // needed in ComputeBond2 + Kokkos::Experimental::contribute(d_CdDelta, dup_CdDelta); // needed in ComputeBond2 + + //Kokkos::Experimental::contribute(d_Cdbo, dup_Cdbo); // needed in UpdateBond, but also used in UpdateBond + //Kokkos::Experimental::contribute(d_Cdbopi, dup_Cdbopi); // needed in UpdateBond, but also used in UpdateBond + //Kokkos::Experimental::contribute(d_Cdbopi2, dup_Cdbopi2); // needed in UpdateBond, but also used in UpdateBond + //dup_Cdbo.reset_except(d_Cdbo); + //dup_Cdbopi.reset_except(d_Cdbopi); + 
//dup_Cdbopi2.reset_except(d_Cdbopi2); + } + // Bond force if (neighflag == HALF) { Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxUpdateBond<HALF> >(0,ignum),*this); + + // reduction over duplicated memory + //if (need_dup) { + // Kokkos::Experimental::contribute(d_Cdbo, dup_Cdbo); // needed in ComputeBond2 + // Kokkos::Experimental::contribute(d_Cdbopi, dup_Cdbopi); // needed in ComputeBond2 + // Kokkos::Experimental::contribute(d_Cdbopi2, dup_Cdbopi2); // needed in ComputeBond2 + //} + if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputeBond2<HALF,1> >(0,ignum),*this,ev); else @@ -931,6 +1000,14 @@ void PairReaxCKokkos<DeviceType>::compute(int eflag_in, int vflag_in) pvector[0] += ev.evdwl; } else { //if (neighflag == HALFTHREAD) { Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxUpdateBond<HALFTHREAD> >(0,ignum),*this); + + // reduction over duplicated memory + //if (need_dup) { + // Kokkos::Experimental::contribute(d_Cdbo, dup_Cdbo); // needed in ComputeBond2 + // Kokkos::Experimental::contribute(d_Cdbopi, dup_Cdbopi); // needed in ComputeBond2 + // Kokkos::Experimental::contribute(d_Cdbopi2, dup_Cdbopi2); // needed in ComputeBond2 + //} + if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, PairReaxComputeBond2<HALFTHREAD,1> >(0,ignum),*this,ev); else @@ -939,6 +1016,10 @@ void PairReaxCKokkos<DeviceType>::compute(int eflag_in, int vflag_in) pvector[0] += ev.evdwl; } + // reduction over duplicated memory + if (need_dup) + Kokkos::Experimental::contribute(f, dup_f); + if (eflag_global) { eng_vdwl += ev_all.evdwl; eng_coul += ev_all.ecoul; @@ -955,11 +1036,15 @@ void PairReaxCKokkos<DeviceType>::compute(int eflag_in, int vflag_in) if (vflag_fdotr) pair_virial_fdotr_compute(this); if (eflag_atom) { + if (need_dup) + Kokkos::Experimental::contribute(d_eatom, dup_eatom); k_eatom.template modify<DeviceType>(); k_eatom.template sync<LMPHostType>(); } if (vflag_atom) { + if (need_dup) + 
Kokkos::Experimental::contribute(d_vatom, dup_vatom); k_vatom.template modify<DeviceType>(); k_vatom.template sync<LMPHostType>(); } @@ -968,6 +1053,19 @@ void PairReaxCKokkos<DeviceType>::compute(int eflag_in, int vflag_in) FindBondSpecies(); copymode = 0; + + // free duplicated memory + if (need_dup) { + dup_f = decltype(dup_f)(); + dup_dDeltap_self = decltype(dup_dDeltap_self)(); + dup_total_bo = decltype(dup_total_bo)(); + dup_CdDelta = decltype(dup_CdDelta)(); + //dup_Cdbo = decltype(dup_Cdbo)(); + //dup_Cdbopi = decltype(dup_Cdbopi)(); + //dup_Cdbopi2 = decltype(dup_Cdbopi2)(); + dup_eatom = decltype(dup_eatom)(); + dup_vatom = decltype(dup_vatom)(); + } } /* ---------------------------------------------------------------------- */ @@ -1006,8 +1104,10 @@ template<int NEIGHFLAG, int EVFLAG> KOKKOS_INLINE_FUNCTION void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeLJCoulomb<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT_REAX& ev) const { - // The f array is atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f; + // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_f = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + auto a_f = v_f.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); F_FLOAT powr_vdw, powgi_vdw, fn13, dfn13, exp1, exp2, etmp; F_FLOAT evdwl, fvdwl; @@ -1165,8 +1265,10 @@ template<int NEIGHFLAG, int EVFLAG> KOKKOS_INLINE_FUNCTION void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeTabulatedLJCoulomb<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT_REAX& ev) const { - // The f array is atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f; + // The f array is duplicated for OpenMP, atomic for CUDA, and 
neither for Serial + + auto v_f = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + auto a_f = v_f.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); const int i = d_ilist[ii]; const X_FLOAT xtmp = x(i,0); @@ -1366,18 +1468,18 @@ void PairReaxCKokkos<DeviceType>::operator()(PairReaxZero, const int &n) const { template<class DeviceType> KOKKOS_INLINE_FUNCTION void PairReaxCKokkos<DeviceType>::operator()(PairReaxZeroEAtom, const int &i) const { - v_eatom(i) = 0.0; + d_eatom(i) = 0.0; } template<class DeviceType> KOKKOS_INLINE_FUNCTION void PairReaxCKokkos<DeviceType>::operator()(PairReaxZeroVAtom, const int &i) const { - v_vatom(i,0) = 0.0; - v_vatom(i,1) = 0.0; - v_vatom(i,2) = 0.0; - v_vatom(i,3) = 0.0; - v_vatom(i,4) = 0.0; - v_vatom(i,5) = 0.0; + d_vatom(i,0) = 0.0; + d_vatom(i,1) = 0.0; + d_vatom(i,2) = 0.0; + d_vatom(i,3) = 0.0; + d_vatom(i,4) = 0.0; + d_vatom(i,5) = 0.0; } /* ---------------------------------------------------------------------- */ @@ -1547,8 +1649,11 @@ void PairReaxCKokkos<DeviceType>::operator()(PairReaxBuildListsHalf<NEIGHFLAG>, if (d_resize_bo() || d_resize_hb()) return; - Kokkos::View<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_dDeltap_self = d_dDeltap_self; - Kokkos::View<F_FLOAT*, typename DAT::t_float_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_total_bo = d_total_bo; + auto v_dDeltap_self = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_dDeltap_self),decltype(ndup_dDeltap_self)>::get(dup_dDeltap_self,ndup_dDeltap_self); + auto a_dDeltap_self = v_dDeltap_self.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); + + auto v_total_bo = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_total_bo),decltype(ndup_total_bo)>::get(dup_total_bo,ndup_total_bo); + auto a_total_bo = v_total_bo.template 
access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); const int i = d_ilist[ii]; const X_FLOAT xtmp = x(i,0); @@ -2239,10 +2344,8 @@ template<int NEIGHFLAG, int EVFLAG> KOKKOS_INLINE_FUNCTION void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeMulti2<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT_REAX& ev) const { - Kokkos::View<F_FLOAT*, typename DAT::t_float_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_CdDelta = d_CdDelta; - Kokkos::View<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_Cdbo = d_Cdbo; - Kokkos::View<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_Cdbopi = d_Cdbopi; - Kokkos::View<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_Cdbopi2 = d_Cdbopi2; + auto v_CdDelta = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_CdDelta),decltype(ndup_CdDelta)>::get(dup_CdDelta,ndup_CdDelta); + auto a_CdDelta = v_CdDelta.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); const int i = d_ilist[ii]; const int itype = type(i); @@ -2393,9 +2496,12 @@ template<int NEIGHFLAG, int EVFLAG> KOKKOS_INLINE_FUNCTION void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeAngular<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT_REAX& ev) const { - Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f; + auto v_f = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + auto a_f = v_f.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); Kokkos::View<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_Cdbo = d_Cdbo; - Kokkos::View<F_FLOAT*, typename 
DAT::t_float_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_CdDelta = d_CdDelta; + + auto v_CdDelta = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_CdDelta),decltype(ndup_CdDelta)>::get(dup_CdDelta,ndup_CdDelta); + auto a_CdDelta = v_CdDelta.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); const int i = d_ilist[ii]; const int itype = type(i); @@ -2702,9 +2808,13 @@ template<int NEIGHFLAG, int EVFLAG> KOKKOS_INLINE_FUNCTION void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeTorsion<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT_REAX& ev) const { - Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f; - Kokkos::View<F_FLOAT*, typename DAT::t_float_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_CdDelta = d_CdDelta; + auto v_f = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + auto a_f = v_f.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); + + auto v_CdDelta = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_CdDelta),decltype(ndup_CdDelta)>::get(dup_CdDelta,ndup_CdDelta); + auto a_CdDelta = v_CdDelta.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); Kokkos::View<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_Cdbo = d_Cdbo; + //auto a_Cdbo = dup_Cdbo.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); // in reaxc_torsion_angles: j = i, k = j, i = k; @@ -3074,7 +3184,8 @@ template<int NEIGHFLAG, int EVFLAG> KOKKOS_INLINE_FUNCTION void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeHydrogen<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT_REAX& ev) const { - Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f; + auto v_f = 
ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + auto a_f = v_f.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); int hblist[MAX_BONDS]; F_FLOAT theta, cos_theta, sin_xhz4, cos_xhz1, sin_theta2; @@ -3224,6 +3335,9 @@ void PairReaxCKokkos<DeviceType>::operator()(PairReaxUpdateBond<NEIGHFLAG>, cons Kokkos::View<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_Cdbo = d_Cdbo; Kokkos::View<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_Cdbopi = d_Cdbopi; Kokkos::View<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_Cdbopi2 = d_Cdbopi2; + //auto a_Cdbo = dup_Cdbo.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); + //auto a_Cdbopi = dup_Cdbopi.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); + //auto a_Cdbopi2 = dup_Cdbopi2.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); const int i = d_ilist[ii]; const tagint itag = tag(i); @@ -3270,8 +3384,11 @@ template<int NEIGHFLAG, int EVFLAG> KOKKOS_INLINE_FUNCTION void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeBond1<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT_REAX& ev) const { - Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f; - Kokkos::View<F_FLOAT*, typename DAT::t_ffloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_CdDelta = d_CdDelta; + auto v_f = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + auto a_f = v_f.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); + + auto v_CdDelta = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_CdDelta),decltype(ndup_CdDelta)>::get(dup_CdDelta,ndup_CdDelta); + auto a_CdDelta 
= v_CdDelta.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); F_FLOAT delij[3]; F_FLOAT p_be1, p_be2, De_s, De_p, De_pp, pow_BOs_be2, exp_be12, CEbo, ebond; @@ -3408,7 +3525,8 @@ template<int NEIGHFLAG, int EVFLAG> KOKKOS_INLINE_FUNCTION void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeBond2<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT_REAX& ev) const { - Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f; + auto v_f = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + auto a_f = v_f.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); F_FLOAT delij[3], delik[3], deljk[3], tmpvec[3]; F_FLOAT dBOp_i[3], dBOp_k[3], dln_BOp_pi[3], dln_BOp_pi2[3]; @@ -3620,9 +3738,13 @@ void PairReaxCKokkos<DeviceType>::ev_tally(EV_FLOAT_REAX &ev, const int &i, cons { const int VFLAG = vflag_either; - // The eatom and vatom arrays are atomic for Half/Thread neighbor style - Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_eatom = v_eatom; - Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_vatom = v_vatom; + // The eatom and vatom arrays are duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_eatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_eatom),decltype(ndup_eatom)>::get(dup_eatom,ndup_eatom); + auto a_eatom = v_eatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); + + auto v_vatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom); + auto a_vatom = v_vatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); if (eflag_atom) { const E_FLOAT epairhalf = 0.5 * epair; @@ -3685,10 +3807,13 @@ void 
PairReaxCKokkos<DeviceType>::e_tally(EV_FLOAT_REAX &ev, const int &i, const const F_FLOAT &epair) const { - // The eatom array is atomic for Half/Thread neighbor style + // The eatom array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + if (eflag_atom) { - Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_eatom = v_eatom; + auto v_eatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_eatom),decltype(ndup_eatom)>::get(dup_eatom,ndup_eatom); + auto a_eatom = v_eatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); + const E_FLOAT epairhalf = 0.5 * epair; a_eatom[i] += epairhalf; a_eatom[j] += epairhalf; @@ -3703,8 +3828,9 @@ KOKKOS_INLINE_FUNCTION void PairReaxCKokkos<DeviceType>::e_tally_single(EV_FLOAT_REAX &ev, const int &i, const F_FLOAT &epair) const { - // The eatom array is atomic for Half/Thread neighbor style - Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_eatom = v_eatom; + // The eatom array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + auto v_eatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_eatom),decltype(ndup_eatom)>::get(dup_eatom,ndup_eatom); + auto a_eatom = v_eatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); a_eatom[i] += epair; } @@ -3737,7 +3863,9 @@ void PairReaxCKokkos<DeviceType>::v_tally(EV_FLOAT_REAX &ev, const int &i, } if (vflag_atom) { - Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_vatom = v_vatom; + auto v_vatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom); + auto a_vatom = v_vatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); + a_vatom(i,0) += v[0]; a_vatom(i,1) += v[1]; a_vatom(i,2) += v[2]; 
a_vatom(i,3) += v[3]; a_vatom(i,4) += v[4]; a_vatom(i,5) += v[5]; } @@ -3752,8 +3880,9 @@ void PairReaxCKokkos<DeviceType>::v_tally3(EV_FLOAT_REAX &ev, const int &i, cons F_FLOAT *fj, F_FLOAT *fk, F_FLOAT *drij, F_FLOAT *drik) const { - // The eatom and vatom arrays are atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_vatom = v_vatom; + // The eatom and vatom arrays are duplicated for OpenMP, atomic for CUDA, and neither for Serial + auto v_vatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom); + auto a_vatom = v_vatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); F_FLOAT v[6]; @@ -3793,7 +3922,8 @@ void PairReaxCKokkos<DeviceType>::v_tally4(EV_FLOAT_REAX &ev, const int &i, cons const int &l, F_FLOAT *fi, F_FLOAT *fj, F_FLOAT *fk, F_FLOAT *dril, F_FLOAT *drjl, F_FLOAT *drkl) const { - // The vatom array is atomic for Half/Thread neighbor style + // The vatom array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + F_FLOAT v[6]; v[0] = dril[0]*fi[0] + drjl[0]*fj[0] + drkl[0]*fk[0]; @@ -3813,7 +3943,9 @@ void PairReaxCKokkos<DeviceType>::v_tally4(EV_FLOAT_REAX &ev, const int &i, cons } if (vflag_atom) { - Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_vatom = v_vatom; + auto v_vatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom); + auto a_vatom = v_vatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); + a_vatom(i,0) += 0.25 * v[0]; a_vatom(i,1) += 0.25 * v[1]; a_vatom(i,2) += 0.25 * v[2]; a_vatom(i,3) += 0.25 * v[3]; a_vatom(i,4) += 0.25 * v[4]; a_vatom(i,5) += 0.25 * v[5]; a_vatom(j,0) += 0.25 * v[0]; a_vatom(j,1) += 0.25 * v[1]; a_vatom(j,2) += 0.25 * v[2]; @@ 
-3910,13 +4042,13 @@ void PairReaxCKokkos<DeviceType>::ev_setup(int eflag, int vflag, int) maxeatom = atom->nmax; memoryKK->destroy_kokkos(k_eatom,eatom); memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom"); - v_eatom = k_eatom.view<DeviceType>(); + d_eatom = k_eatom.view<DeviceType>(); } if (vflag_atom && atom->nmax > maxvatom) { maxvatom = atom->nmax; memoryKK->destroy_kokkos(k_vatom,vatom); memoryKK->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom"); - v_vatom = k_vatom.view<DeviceType>(); + d_vatom = k_vatom.view<DeviceType>(); } // zero accumulators diff --git a/src/KOKKOS/pair_reaxc_kokkos.h b/src/KOKKOS/pair_reaxc_kokkos.h index 5c96d44618361ca3828ba26ceb1738a1dac4de51..89dfc4d884309eb642a594438ded96e29100c209 100644 --- a/src/KOKKOS/pair_reaxc_kokkos.h +++ b/src/KOKKOS/pair_reaxc_kokkos.h @@ -380,11 +380,10 @@ class PairReaxCKokkos : public PairReaxC { typename AT::t_tagint_1d_randomread molecule; DAT::tdual_efloat_1d k_eatom; - typename AT::t_efloat_1d v_eatom; + typename AT::t_efloat_1d d_eatom; DAT::tdual_virial_array k_vatom; - typename ArrayTypes<DeviceType>::t_virial_array d_vatom; - typename AT::t_virial_array v_vatom; + typename AT::t_virial_array d_vatom; HAT::t_virial_array h_vatom; DAT::tdual_float_1d k_tap; @@ -401,6 +400,28 @@ class PairReaxCKokkos : public PairReaxC { typename AT::t_ffloat_2d_dl d_C1dbopi2, d_C2dbopi2, d_C3dbopi2, d_C4dbopi2; typename AT::t_ffloat_2d_dl d_Cdbo, d_Cdbopi, d_Cdbopi2, d_dDeltap_self; + Kokkos::Experimental::ScatterView<F_FLOAT*, typename DAT::t_float_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_total_bo; + Kokkos::Experimental::ScatterView<F_FLOAT*, typename DAT::t_float_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_CdDelta; + Kokkos::Experimental::ScatterView<E_FLOAT*, typename 
DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_eatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_f; + Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_vatom; + Kokkos::Experimental::ScatterView<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_dDeltap_self; + Kokkos::Experimental::ScatterView<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_Cdbo; + Kokkos::Experimental::ScatterView<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_Cdbopi; + Kokkos::Experimental::ScatterView<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_Cdbopi2; + + Kokkos::Experimental::ScatterView<F_FLOAT*, typename DAT::t_float_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_total_bo; + Kokkos::Experimental::ScatterView<F_FLOAT*, typename DAT::t_float_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_CdDelta; + Kokkos::Experimental::ScatterView<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_eatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_f; + 
Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_vatom; + Kokkos::Experimental::ScatterView<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_dDeltap_self; + Kokkos::Experimental::ScatterView<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_Cdbo; + Kokkos::Experimental::ScatterView<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_Cdbopi; + Kokkos::Experimental::ScatterView<F_FLOAT**, typename DAT::t_ffloat_2d_dl::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_Cdbopi2; + + int need_dup; + typedef Kokkos::DualView<F_FLOAT**[7],typename DeviceType::array_layout,DeviceType> tdual_ffloat_2d_n7; typedef typename tdual_ffloat_2d_n7::t_dev_const_randomread t_ffloat_2d_n7_randomread; typedef typename tdual_ffloat_2d_n7::t_host t_host_ffloat_2d_n7; diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index 5c68284219fe2829e589a6dcd5c041587cc4cd71..b2019879edf0449459d734c69e6d204e818282d2 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -129,6 +129,12 @@ inline double dist2(double* x,double* y); typename AT::t_f_array f; typename AT::t_int_1d_randomread type; + int need_dup; + Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_f; + Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_vatom; + 
Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_f; + Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_vatom; + friend void pair_virial_fdotr_compute<PairSNAPKokkos>(PairSNAPKokkos*); }; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index a024e5cbd19238a739663870c8e21fcabaea0a41..c452042cfecbd5730a2a35cfda2c5fc64e5eaecf 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -170,6 +170,15 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in) d_ilist = k_list->d_ilist; int inum = list->inum; + need_dup = lmp->kokkos->need_dup<DeviceType>(); + if (need_dup) { + dup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(f); + dup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_vatom); + } else { + ndup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(f); + ndup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_vatom); + } + /* for (int i = 0; i < nlocal; i++) { typename t_neigh_list::t_neighs neighs_i = neigh_list.get_neighs(i); @@ -232,6 +241,9 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in) //if (step%10==0) // printf(" %e %e %e %e %e (%e %e): %e\n",t1,t2,t3,t4,t5,t6,t7,t1+t2+t3+t4+t5); + if (need_dup) + Kokkos::Experimental::contribute(f, dup_f); + if (eflag_global) eng_vdwl += ev.evdwl; if (vflag_global) { virial[0] += ev.v[0]; @@ -244,18 +256,28 @@ void PairSNAPKokkos<DeviceType>::compute(int 
eflag_in, int vflag_in) if (vflag_fdotr) pair_virial_fdotr_compute(this); + if (eflag_atom) { k_eatom.template modify<DeviceType>(); k_eatom.template sync<LMPHostType>(); } if (vflag_atom) { + if (need_dup) + Kokkos::Experimental::contribute(d_vatom, dup_vatom); k_vatom.template modify<DeviceType>(); k_vatom.template sync<LMPHostType>(); } atomKK->modified(execution_space,F_MASK); + copymode = 0; + + // free duplicated memory + if (need_dup) { + dup_f = decltype(dup_f)(); + dup_vatom = decltype(dup_vatom)(); + } } /* ---------------------------------------------------------------------- @@ -349,8 +371,11 @@ template<class DeviceType> template<int NEIGHFLAG, int EVFLAG> KOKKOS_INLINE_FUNCTION void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAP<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAP<NEIGHFLAG,EVFLAG> >::member_type& team, EV_FLOAT& ev) const { - // The f array is atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f; + + // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_f = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + auto a_f = v_f.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); const int ii = team.league_rank(); const int i = d_ilist[ii]; @@ -591,8 +616,10 @@ void PairSNAPKokkos<DeviceType>::v_tally_xyz(EV_FLOAT &ev, const int &i, const i const F_FLOAT &fx, const F_FLOAT &fy, const F_FLOAT &fz, const F_FLOAT &delx, const F_FLOAT &dely, const F_FLOAT &delz) const { - // The vatom array is atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom = k_vatom.view<DeviceType>(); + // The vatom array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto 
v_vatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom); + auto a_vatom = v_vatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); const E_FLOAT v0 = delx*fx; const E_FLOAT v1 = dely*fy; @@ -611,18 +638,18 @@ void PairSNAPKokkos<DeviceType>::v_tally_xyz(EV_FLOAT &ev, const int &i, const i } if (vflag_atom) { - v_vatom(i,0) += 0.5*v0; - v_vatom(i,1) += 0.5*v1; - v_vatom(i,2) += 0.5*v2; - v_vatom(i,3) += 0.5*v3; - v_vatom(i,4) += 0.5*v4; - v_vatom(i,5) += 0.5*v5; - v_vatom(j,0) += 0.5*v0; - v_vatom(j,1) += 0.5*v1; - v_vatom(j,2) += 0.5*v2; - v_vatom(j,3) += 0.5*v3; - v_vatom(j,4) += 0.5*v4; - v_vatom(j,5) += 0.5*v5; + a_vatom(i,0) += 0.5*v0; + a_vatom(i,1) += 0.5*v1; + a_vatom(i,2) += 0.5*v2; + a_vatom(i,3) += 0.5*v3; + a_vatom(i,4) += 0.5*v4; + a_vatom(i,5) += 0.5*v5; + a_vatom(j,0) += 0.5*v0; + a_vatom(j,1) += 0.5*v1; + a_vatom(j,2) += 0.5*v2; + a_vatom(j,3) += 0.5*v3; + a_vatom(j,4) += 0.5*v4; + a_vatom(j,5) += 0.5*v5; } } diff --git a/src/KOKKOS/pair_sw_kokkos.cpp b/src/KOKKOS/pair_sw_kokkos.cpp index 8f4903c76794a4c7a1d38b5bbbf1b53894788bda..5452d2293f269f59abeed800b7bb1ebc6bef4a0d 100644 --- a/src/KOKKOS/pair_sw_kokkos.cpp +++ b/src/KOKKOS/pair_sw_kokkos.cpp @@ -115,6 +115,17 @@ void PairSWKokkos<DeviceType>::compute(int eflag_in, int vflag_in) d_numneigh = k_list->d_numneigh; d_neighbors = k_list->d_neighbors; + need_dup = lmp->kokkos->need_dup<DeviceType>(); + if (need_dup) { + dup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(f); + dup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_eatom); + dup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_vatom); + } else { + ndup_f = 
Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(f); + ndup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_eatom); + ndup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_vatom); + } + copymode = 1; EV_FLOAT ev; @@ -160,6 +171,9 @@ void PairSWKokkos<DeviceType>::compute(int eflag_in, int vflag_in) ev_all += ev; } + if (need_dup) + Kokkos::Experimental::contribute(f, dup_f); + if (eflag_global) eng_vdwl += ev_all.evdwl; if (vflag_global) { virial[0] += ev_all.v[0]; @@ -171,11 +185,15 @@ void PairSWKokkos<DeviceType>::compute(int eflag_in, int vflag_in) } if (eflag_atom) { + if (need_dup) + Kokkos::Experimental::contribute(d_eatom, dup_eatom); k_eatom.template modify<DeviceType>(); k_eatom.template sync<LMPHostType>(); } if (vflag_atom) { + if (need_dup) + Kokkos::Experimental::contribute(d_vatom, dup_vatom); k_vatom.template modify<DeviceType>(); k_vatom.template sync<LMPHostType>(); } @@ -183,6 +201,13 @@ void PairSWKokkos<DeviceType>::compute(int eflag_in, int vflag_in) if (vflag_fdotr) pair_virial_fdotr_compute(this); copymode = 0; + + // free duplicated memory + if (need_dup) { + dup_f = decltype(dup_f)(); + dup_eatom = decltype(dup_eatom)(); + dup_vatom = decltype(dup_vatom)(); + } } @@ -222,9 +247,10 @@ template<int NEIGHFLAG, int EVFLAG> KOKKOS_INLINE_FUNCTION void PairSWKokkos<DeviceType>::operator()(TagPairSWComputeHalf<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT& ev) const { - // The f array is atomic + // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial - Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f; + auto v_f = 
ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + auto a_f = v_f.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); F_FLOAT delr1[3],delr2[3],fj[3],fk[3]; F_FLOAT evdwl = 0.0; @@ -777,17 +803,19 @@ void PairSWKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, const int &j { const int VFLAG = vflag_either; - // The eatom and vatom arrays are atomic for half/thread neighbor list + // The eatom and vatom arrays are duplicated for OpenMP, atomic for CUDA, and neither for Serial - Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom = k_eatom.view<DeviceType>(); - Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom = k_vatom.view<DeviceType>(); + auto v_eatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_eatom),decltype(ndup_eatom)>::get(dup_eatom,ndup_eatom); + auto a_eatom = v_eatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); + auto v_vatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom); + auto a_vatom = v_vatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); if (eflag_atom) { const E_FLOAT epairhalf = 0.5 * epair; - v_eatom[i] += epairhalf; + a_eatom[i] += epairhalf; if (NEIGHFLAG != FULL) - v_eatom[j] += epairhalf; + a_eatom[j] += epairhalf; } if (VFLAG) { @@ -817,20 +845,20 @@ void PairSWKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, const int &j } if (vflag_atom) { - v_vatom(i,0) += 0.5*v0; - v_vatom(i,1) += 0.5*v1; - v_vatom(i,2) += 0.5*v2; - v_vatom(i,3) += 0.5*v3; - v_vatom(i,4) += 0.5*v4; - v_vatom(i,5) += 0.5*v5; + a_vatom(i,0) += 0.5*v0; + a_vatom(i,1) += 0.5*v1; + a_vatom(i,2) += 0.5*v2; + a_vatom(i,3) += 0.5*v3; + a_vatom(i,4) += 0.5*v4; + a_vatom(i,5) += 0.5*v5; if (NEIGHFLAG != 
FULL) { - v_vatom(j,0) += 0.5*v0; - v_vatom(j,1) += 0.5*v1; - v_vatom(j,2) += 0.5*v2; - v_vatom(j,3) += 0.5*v3; - v_vatom(j,4) += 0.5*v4; - v_vatom(j,5) += 0.5*v5; + a_vatom(j,0) += 0.5*v0; + a_vatom(j,1) += 0.5*v1; + a_vatom(j,2) += 0.5*v2; + a_vatom(j,3) += 0.5*v3; + a_vatom(j,4) += 0.5*v4; + a_vatom(j,5) += 0.5*v5; } } } @@ -853,17 +881,20 @@ void PairSWKokkos<DeviceType>::ev_tally3(EV_FLOAT &ev, const int &i, const int & const int VFLAG = vflag_either; -// The eatom and vatom arrays are atomic for half/thread neighbor list + // The eatom and vatom arrays are duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_eatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_eatom),decltype(ndup_eatom)>::get(dup_eatom,ndup_eatom); + auto a_eatom = v_eatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); - Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom = k_eatom.view<DeviceType>(); - Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom = k_vatom.view<DeviceType>(); + auto v_vatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom); + auto a_vatom = v_vatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); if (eflag_atom) { epairthird = THIRD * (evdwl + ecoul); - v_eatom[i] += epairthird; + a_eatom[i] += epairthird; if (NEIGHFLAG != FULL) { - v_eatom[j] += epairthird; - v_eatom[k] += epairthird; + a_eatom[j] += epairthird; + a_eatom[k] += epairthird; } } @@ -885,18 +916,18 @@ void PairSWKokkos<DeviceType>::ev_tally3(EV_FLOAT &ev, const int &i, const int & } if (vflag_atom) { - v_vatom(i,0) += THIRD*v[0]; v_vatom(i,1) += THIRD*v[1]; - v_vatom(i,2) += THIRD*v[2]; v_vatom(i,3) += THIRD*v[3]; - v_vatom(i,4) += THIRD*v[4]; v_vatom(i,5) += THIRD*v[5]; + a_vatom(i,0) += THIRD*v[0]; 
a_vatom(i,1) += THIRD*v[1]; + a_vatom(i,2) += THIRD*v[2]; a_vatom(i,3) += THIRD*v[3]; + a_vatom(i,4) += THIRD*v[4]; a_vatom(i,5) += THIRD*v[5]; if (NEIGHFLAG != FULL) { - v_vatom(j,0) += THIRD*v[0]; v_vatom(j,1) += THIRD*v[1]; - v_vatom(j,2) += THIRD*v[2]; v_vatom(j,3) += THIRD*v[3]; - v_vatom(j,4) += THIRD*v[4]; v_vatom(j,5) += THIRD*v[5]; + a_vatom(j,0) += THIRD*v[0]; a_vatom(j,1) += THIRD*v[1]; + a_vatom(j,2) += THIRD*v[2]; a_vatom(j,3) += THIRD*v[3]; + a_vatom(j,4) += THIRD*v[4]; a_vatom(j,5) += THIRD*v[5]; - v_vatom(k,0) += THIRD*v[0]; v_vatom(k,1) += THIRD*v[1]; - v_vatom(k,2) += THIRD*v[2]; v_vatom(k,3) += THIRD*v[3]; - v_vatom(k,4) += THIRD*v[4]; v_vatom(k,5) += THIRD*v[5]; + a_vatom(k,0) += THIRD*v[0]; a_vatom(k,1) += THIRD*v[1]; + a_vatom(k,2) += THIRD*v[2]; a_vatom(k,3) += THIRD*v[3]; + a_vatom(k,4) += THIRD*v[4]; a_vatom(k,5) += THIRD*v[5]; } } } diff --git a/src/KOKKOS/pair_sw_kokkos.h b/src/KOKKOS/pair_sw_kokkos.h index eb59d5bc3ced9f73a1a9f4f446bd083ab7c532a1..1a3f0b862f860d1c36a3db5df953ece351e620f6 100644 --- a/src/KOKKOS/pair_sw_kokkos.h +++ b/src/KOKKOS/pair_sw_kokkos.h @@ -134,6 +134,14 @@ class PairSWKokkos : public PairSW { typename AT::t_efloat_1d d_eatom; typename AT::t_virial_array d_vatom; + int need_dup; + Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_f; + Kokkos::Experimental::ScatterView<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_eatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_vatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_f; + 
Kokkos::Experimental::ScatterView<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_eatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_vatom; + typename AT::t_int_1d_randomread d_type2frho; typename AT::t_int_2d_randomread d_type2rhor; typename AT::t_int_2d_randomread d_type2z2r; diff --git a/src/KOKKOS/pair_table_kokkos.cpp b/src/KOKKOS/pair_table_kokkos.cpp index 7b06f814bb4624874ca16f313f9faeda83cea9db..b72df04c5f626479ad0d8304e36a523d26f60e91 100644 --- a/src/KOKKOS/pair_table_kokkos.cpp +++ b/src/KOKKOS/pair_table_kokkos.cpp @@ -128,21 +128,25 @@ void PairTableKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in) ff(this,(NeighListKokkos<DeviceType>*) list); if (eflag || vflag) Kokkos::parallel_reduce(list->inum,ff,ev); else Kokkos::parallel_for(list->inum,ff); + ff.contribute(); } else if (neighflag == HALFTHREAD) { PairComputeFunctor<PairTableKokkos<DeviceType>,HALFTHREAD,false,S_TableCompute<DeviceType,TABSTYLE> > ff(this,(NeighListKokkos<DeviceType>*) list); if (eflag || vflag) Kokkos::parallel_reduce(list->inum,ff,ev); else Kokkos::parallel_for(list->inum,ff); + ff.contribute(); } else if (neighflag == HALF) { PairComputeFunctor<PairTableKokkos<DeviceType>,HALF,false,S_TableCompute<DeviceType,TABSTYLE> > f(this,(NeighListKokkos<DeviceType>*) list); if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev); else Kokkos::parallel_for(list->inum,f); + f.contribute(); } else if (neighflag == N2) { PairComputeFunctor<PairTableKokkos<DeviceType>,N2,false,S_TableCompute<DeviceType,TABSTYLE> > f(this,(NeighListKokkos<DeviceType>*) list); if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev); else Kokkos::parallel_for(list->inum,f); + f.contribute(); } } else { if (neighflag == FULL) { @@ -150,21 +154,25 @@ void 
PairTableKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in) f(this,(NeighListKokkos<DeviceType>*) list); if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev); else Kokkos::parallel_for(list->inum,f); + f.contribute(); } else if (neighflag == HALFTHREAD) { PairComputeFunctor<PairTableKokkos<DeviceType>,HALFTHREAD,true,S_TableCompute<DeviceType,TABSTYLE> > f(this,(NeighListKokkos<DeviceType>*) list); if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev); else Kokkos::parallel_for(list->inum,f); + f.contribute(); } else if (neighflag == HALF) { PairComputeFunctor<PairTableKokkos<DeviceType>,HALF,true,S_TableCompute<DeviceType,TABSTYLE> > f(this,(NeighListKokkos<DeviceType>*) list); if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev); else Kokkos::parallel_for(list->inum,f); + f.contribute(); } else if (neighflag == N2) { PairComputeFunctor<PairTableKokkos<DeviceType>,N2,true,S_TableCompute<DeviceType,TABSTYLE> > f(this,(NeighListKokkos<DeviceType>*) list); if (eflag || vflag) Kokkos::parallel_reduce(list->inum,f,ev); else Kokkos::parallel_for(list->inum,f); + f.contribute(); } } diff --git a/src/KOKKOS/pair_tersoff_kokkos.cpp b/src/KOKKOS/pair_tersoff_kokkos.cpp index cb4def2d373b4d3950334997b98ab09ffe0efe14..c3e1494d0b172676e29266817d9434b6c8d601ff 100644 --- a/src/KOKKOS/pair_tersoff_kokkos.cpp +++ b/src/KOKKOS/pair_tersoff_kokkos.cpp @@ -200,6 +200,17 @@ void PairTersoffKokkos<DeviceType>::compute(int eflag_in, int vflag_in) d_neighbors = k_list->d_neighbors; d_ilist = k_list->d_ilist; + need_dup = lmp->kokkos->need_dup<DeviceType>(); + if (need_dup) { + dup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(f); + dup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_eatom); + dup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, 
Kokkos::Experimental::ScatterDuplicated>(d_vatom); + } else { + ndup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(f); + ndup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_eatom); + ndup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_vatom); + } + copymode = 1; EV_FLOAT ev; @@ -243,6 +254,9 @@ void PairTersoffKokkos<DeviceType>::compute(int eflag_in, int vflag_in) ev_all += ev; } + if (need_dup) + Kokkos::Experimental::contribute(f, dup_f); + if (eflag_global) eng_vdwl += ev_all.evdwl; if (vflag_global) { virial[0] += ev_all.v[0]; @@ -254,11 +268,15 @@ void PairTersoffKokkos<DeviceType>::compute(int eflag_in, int vflag_in) } if (eflag_atom) { + if (need_dup) + Kokkos::Experimental::contribute(d_eatom, dup_eatom); k_eatom.template modify<DeviceType>(); k_eatom.template sync<LMPHostType>(); } if (vflag_atom) { + if (need_dup) + Kokkos::Experimental::contribute(d_vatom, dup_vatom); k_vatom.template modify<DeviceType>(); k_vatom.template sync<LMPHostType>(); } @@ -266,6 +284,13 @@ void PairTersoffKokkos<DeviceType>::compute(int eflag_in, int vflag_in) if (vflag_fdotr) pair_virial_fdotr_compute(this); copymode = 0; + + // free duplicated memory + if (need_dup) { + dup_f = decltype(dup_f)(); + dup_eatom = decltype(dup_eatom)(); + dup_vatom = decltype(dup_vatom)(); + } } /* ---------------------------------------------------------------------- */ @@ -304,8 +329,10 @@ template<int NEIGHFLAG, int EVFLAG> KOKKOS_INLINE_FUNCTION void PairTersoffKokkos<DeviceType>::operator()(TagPairTersoffComputeHalf<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT& ev) const { - // The f array is atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> 
> a_f = f; + // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_f = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + auto a_f = v_f.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); const int i = d_ilist[ii]; if (i >= nlocal) return; @@ -1117,14 +1144,18 @@ void PairTersoffKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, const i { const int VFLAG = vflag_either; - // The eatom and vatom arrays are atomic for Half/Thread neighbor style - Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom = k_eatom.view<DeviceType>(); - Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom = k_vatom.view<DeviceType>(); + // The eatom and vatom arrays are duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_eatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_eatom),decltype(ndup_eatom)>::get(dup_eatom,ndup_eatom); + auto a_eatom = v_eatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); + + auto v_vatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom); + auto a_vatom = v_vatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); if (eflag_atom) { const E_FLOAT epairhalf = 0.5 * epair; - v_eatom[i] += epairhalf; - if (NEIGHFLAG != FULL) v_eatom[j] += epairhalf; + a_eatom[i] += epairhalf; + if (NEIGHFLAG != FULL) a_eatom[j] += epairhalf; } if (VFLAG) { @@ -1154,20 +1185,20 @@ void PairTersoffKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, const i } if (vflag_atom) { - v_vatom(i,0) += 0.5*v0; - v_vatom(i,1) += 0.5*v1; - v_vatom(i,2) += 0.5*v2; - v_vatom(i,3) += 0.5*v3; - v_vatom(i,4) += 0.5*v4; - v_vatom(i,5) += 0.5*v5; + a_vatom(i,0) += 0.5*v0; + a_vatom(i,1) += 
0.5*v1; + a_vatom(i,2) += 0.5*v2; + a_vatom(i,3) += 0.5*v3; + a_vatom(i,4) += 0.5*v4; + a_vatom(i,5) += 0.5*v5; if (NEIGHFLAG != FULL) { - v_vatom(j,0) += 0.5*v0; - v_vatom(j,1) += 0.5*v1; - v_vatom(j,2) += 0.5*v2; - v_vatom(j,3) += 0.5*v3; - v_vatom(j,4) += 0.5*v4; - v_vatom(j,5) += 0.5*v5; + a_vatom(j,0) += 0.5*v0; + a_vatom(j,1) += 0.5*v1; + a_vatom(j,2) += 0.5*v2; + a_vatom(j,3) += 0.5*v3; + a_vatom(j,4) += 0.5*v4; + a_vatom(j,5) += 0.5*v5; } } } @@ -1181,9 +1212,10 @@ KOKKOS_INLINE_FUNCTION void PairTersoffKokkos<DeviceType>::v_tally3(EV_FLOAT &ev, const int &i, const int &j, const int &k, F_FLOAT *fj, F_FLOAT *fk, F_FLOAT *drij, F_FLOAT *drik) const { + // The vatom array is duplicated for OpenMP, atomic for CUDA, and neither for Serial - // The eatom and vatom arrays are atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom = k_vatom.view<DeviceType>(); + auto v_vatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom); + auto a_vatom = v_vatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); F_FLOAT v[6]; @@ -1204,13 +1236,13 @@ void PairTersoffKokkos<DeviceType>::v_tally3(EV_FLOAT &ev, const int &i, const i } if (vflag_atom) { - v_vatom(i,0) += v[0]; v_vatom(i,1) += v[1]; v_vatom(i,2) += v[2]; - v_vatom(i,3) += v[3]; v_vatom(i,4) += v[4]; v_vatom(i,5) += v[5]; + a_vatom(i,0) += v[0]; a_vatom(i,1) += v[1]; a_vatom(i,2) += v[2]; + a_vatom(i,3) += v[3]; a_vatom(i,4) += v[4]; a_vatom(i,5) += v[5]; if (NEIGHFLAG != FULL) { - v_vatom(j,0) += v[0]; v_vatom(j,1) += v[1]; v_vatom(j,2) += v[2]; - v_vatom(j,3) += v[3]; v_vatom(j,4) += v[4]; v_vatom(j,5) += v[5]; - v_vatom(k,0) += v[0]; v_vatom(k,1) += v[1]; v_vatom(k,2) += v[2]; - v_vatom(k,3) += v[3]; v_vatom(k,4) += v[4]; v_vatom(k,5) += v[5]; + a_vatom(j,0) += v[0]; a_vatom(j,1) += v[1]; a_vatom(j,2) += v[2]; 
+ a_vatom(j,3) += v[3]; a_vatom(j,4) += v[4]; a_vatom(j,5) += v[5]; + a_vatom(k,0) += v[0]; a_vatom(k,1) += v[1]; a_vatom(k,2) += v[2]; + a_vatom(k,3) += v[3]; a_vatom(k,4) += v[4]; a_vatom(k,5) += v[5]; } } diff --git a/src/KOKKOS/pair_tersoff_kokkos.h b/src/KOKKOS/pair_tersoff_kokkos.h index 2dac2c5991fe1ba733b7ae1bedb7c9e640c1b645..f73d4fe2d89e8990466a0a8edafba1cc3d94a9ab 100644 --- a/src/KOKKOS/pair_tersoff_kokkos.h +++ b/src/KOKKOS/pair_tersoff_kokkos.h @@ -202,6 +202,14 @@ class PairTersoffKokkos : public PairTersoff { typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom; typename ArrayTypes<DeviceType>::t_virial_array d_vatom; + int need_dup; + Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_f; + Kokkos::Experimental::ScatterView<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_eatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_vatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_f; + Kokkos::Experimental::ScatterView<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_eatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_vatom; + typedef Kokkos::DualView<F_FLOAT**[7],Kokkos::LayoutRight,DeviceType> tdual_ffloat_2d_n7; typedef typename tdual_ffloat_2d_n7::t_dev_const_randomread t_ffloat_2d_n7_randomread; typedef typename tdual_ffloat_2d_n7::t_host 
t_host_ffloat_2d_n7; diff --git a/src/KOKKOS/pair_tersoff_mod_kokkos.cpp b/src/KOKKOS/pair_tersoff_mod_kokkos.cpp index 0d150f83e4a084a8d92a24a79f806aecea3e34c3..61493794baa68ee5672b48845262f4d718c1cf59 100644 --- a/src/KOKKOS/pair_tersoff_mod_kokkos.cpp +++ b/src/KOKKOS/pair_tersoff_mod_kokkos.cpp @@ -200,6 +200,17 @@ void PairTersoffMODKokkos<DeviceType>::compute(int eflag_in, int vflag_in) d_neighbors = k_list->d_neighbors; d_ilist = k_list->d_ilist; + need_dup = lmp->kokkos->need_dup<DeviceType>(); + if (need_dup) { + dup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(f); + dup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_eatom); + dup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_vatom); + } else { + ndup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(f); + ndup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_eatom); + ndup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_vatom); + } + copymode = 1; EV_FLOAT ev; @@ -243,6 +254,9 @@ void PairTersoffMODKokkos<DeviceType>::compute(int eflag_in, int vflag_in) ev_all += ev; } + if (need_dup) + Kokkos::Experimental::contribute(f, dup_f); + if (eflag_global) eng_vdwl += ev_all.evdwl; if (vflag_global) { virial[0] += ev_all.v[0]; @@ -254,11 +268,15 @@ void PairTersoffMODKokkos<DeviceType>::compute(int eflag_in, int vflag_in) } if (eflag_atom) { + if (need_dup) + Kokkos::Experimental::contribute(d_eatom, dup_eatom); k_eatom.template modify<DeviceType>(); k_eatom.template sync<LMPHostType>(); } if (vflag_atom) { + if (need_dup) + 
Kokkos::Experimental::contribute(d_vatom, dup_vatom); k_vatom.template modify<DeviceType>(); k_vatom.template sync<LMPHostType>(); } @@ -266,6 +284,13 @@ void PairTersoffMODKokkos<DeviceType>::compute(int eflag_in, int vflag_in) if (vflag_fdotr) pair_virial_fdotr_compute(this); copymode = 0; + + // free duplicated memory + if (need_dup) { + dup_f = decltype(dup_f)(); + dup_eatom = decltype(dup_eatom)(); + dup_vatom = decltype(dup_vatom)(); + } } /* ---------------------------------------------------------------------- */ @@ -304,8 +329,10 @@ template<int NEIGHFLAG, int EVFLAG> KOKKOS_INLINE_FUNCTION void PairTersoffMODKokkos<DeviceType>::operator()(TagPairTersoffMODComputeHalf<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT& ev) const { - // The f array is atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f; + // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_f = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + auto a_f = v_f.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); const int i = d_ilist[ii]; if (i >= nlocal) return; @@ -1120,14 +1147,18 @@ void PairTersoffMODKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, cons { const int VFLAG = vflag_either; - // The eatom and vatom arrays are atomic for Half/Thread neighbor style - Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom = k_eatom.view<DeviceType>(); - Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom = k_vatom.view<DeviceType>(); + // The eatom and vatom arrays are duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_eatom = 
ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_eatom),decltype(ndup_eatom)>::get(dup_eatom,ndup_eatom); + auto a_eatom = v_eatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); + + auto v_vatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom); + auto a_vatom = v_vatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); if (eflag_atom) { const E_FLOAT epairhalf = 0.5 * epair; - v_eatom[i] += epairhalf; - if (NEIGHFLAG != FULL) v_eatom[j] += epairhalf; + a_eatom[i] += epairhalf; + if (NEIGHFLAG != FULL) a_eatom[j] += epairhalf; } if (VFLAG) { @@ -1157,20 +1188,20 @@ void PairTersoffMODKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, cons } if (vflag_atom) { - v_vatom(i,0) += 0.5*v0; - v_vatom(i,1) += 0.5*v1; - v_vatom(i,2) += 0.5*v2; - v_vatom(i,3) += 0.5*v3; - v_vatom(i,4) += 0.5*v4; - v_vatom(i,5) += 0.5*v5; + a_vatom(i,0) += 0.5*v0; + a_vatom(i,1) += 0.5*v1; + a_vatom(i,2) += 0.5*v2; + a_vatom(i,3) += 0.5*v3; + a_vatom(i,4) += 0.5*v4; + a_vatom(i,5) += 0.5*v5; if (NEIGHFLAG != FULL) { - v_vatom(j,0) += 0.5*v0; - v_vatom(j,1) += 0.5*v1; - v_vatom(j,2) += 0.5*v2; - v_vatom(j,3) += 0.5*v3; - v_vatom(j,4) += 0.5*v4; - v_vatom(j,5) += 0.5*v5; + a_vatom(j,0) += 0.5*v0; + a_vatom(j,1) += 0.5*v1; + a_vatom(j,2) += 0.5*v2; + a_vatom(j,3) += 0.5*v3; + a_vatom(j,4) += 0.5*v4; + a_vatom(j,5) += 0.5*v5; } } } @@ -1184,9 +1215,10 @@ KOKKOS_INLINE_FUNCTION void PairTersoffMODKokkos<DeviceType>::v_tally3(EV_FLOAT &ev, const int &i, const int &j, const int &k, F_FLOAT *fj, F_FLOAT *fk, F_FLOAT *drij, F_FLOAT *drik) const { + // The vatom array is duplicated for OpenMP, atomic for CUDA, and neither for Serial - // The eatom and vatom arrays are atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom = k_vatom.view<DeviceType>(); + auto 
v_vatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom); + auto a_vatom = v_vatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); F_FLOAT v[6]; @@ -1207,13 +1239,13 @@ void PairTersoffMODKokkos<DeviceType>::v_tally3(EV_FLOAT &ev, const int &i, cons } if (vflag_atom) { - v_vatom(i,0) += v[0]; v_vatom(i,1) += v[1]; v_vatom(i,2) += v[2]; - v_vatom(i,3) += v[3]; v_vatom(i,4) += v[4]; v_vatom(i,5) += v[5]; + a_vatom(i,0) += v[0]; a_vatom(i,1) += v[1]; a_vatom(i,2) += v[2]; + a_vatom(i,3) += v[3]; a_vatom(i,4) += v[4]; a_vatom(i,5) += v[5]; if (NEIGHFLAG != FULL) { - v_vatom(j,0) += v[0]; v_vatom(j,1) += v[1]; v_vatom(j,2) += v[2]; - v_vatom(j,3) += v[3]; v_vatom(j,4) += v[4]; v_vatom(j,5) += v[5]; - v_vatom(k,0) += v[0]; v_vatom(k,1) += v[1]; v_vatom(k,2) += v[2]; - v_vatom(k,3) += v[3]; v_vatom(k,4) += v[4]; v_vatom(k,5) += v[5]; + a_vatom(j,0) += v[0]; a_vatom(j,1) += v[1]; a_vatom(j,2) += v[2]; + a_vatom(j,3) += v[3]; a_vatom(j,4) += v[4]; a_vatom(j,5) += v[5]; + a_vatom(k,0) += v[0]; a_vatom(k,1) += v[1]; a_vatom(k,2) += v[2]; + a_vatom(k,3) += v[3]; a_vatom(k,4) += v[4]; a_vatom(k,5) += v[5]; } } diff --git a/src/KOKKOS/pair_tersoff_mod_kokkos.h b/src/KOKKOS/pair_tersoff_mod_kokkos.h index dd5efb50f1da33e7743f6c8c4d51c5e1e9086dee..d7c94ffc93e62c733b8e4e1f7035721a02f3edef 100644 --- a/src/KOKKOS/pair_tersoff_mod_kokkos.h +++ b/src/KOKKOS/pair_tersoff_mod_kokkos.h @@ -202,6 +202,14 @@ class PairTersoffMODKokkos : public PairTersoffMOD { typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom; typename ArrayTypes<DeviceType>::t_virial_array d_vatom; + int need_dup; + Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_f; + Kokkos::Experimental::ScatterView<E_FLOAT*, typename 
DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_eatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_vatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_f; + Kokkos::Experimental::ScatterView<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_eatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_vatom; + typedef Kokkos::DualView<F_FLOAT**[7],Kokkos::LayoutRight,DeviceType> tdual_ffloat_2d_n7; typedef typename tdual_ffloat_2d_n7::t_dev_const_randomread t_ffloat_2d_n7_randomread; typedef typename tdual_ffloat_2d_n7::t_host t_host_ffloat_2d_n7; diff --git a/src/KOKKOS/pair_tersoff_zbl_kokkos.cpp b/src/KOKKOS/pair_tersoff_zbl_kokkos.cpp index 2fb9675db60efa9b5ebda7e81dee5e1271d021fd..f51da2afde8779e2898ca2cdb1be9a68c94618e3 100644 --- a/src/KOKKOS/pair_tersoff_zbl_kokkos.cpp +++ b/src/KOKKOS/pair_tersoff_zbl_kokkos.cpp @@ -214,6 +214,17 @@ void PairTersoffZBLKokkos<DeviceType>::compute(int eflag_in, int vflag_in) d_neighbors = k_list->d_neighbors; d_ilist = k_list->d_ilist; + need_dup = lmp->kokkos->need_dup<DeviceType>(); + if (need_dup) { + dup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(f); + dup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_eatom); + dup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, 
Kokkos::Experimental::ScatterDuplicated>(d_vatom); + } else { + ndup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(f); + ndup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_eatom); + ndup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_vatom); + } + copymode = 1; EV_FLOAT ev; @@ -257,6 +268,9 @@ void PairTersoffZBLKokkos<DeviceType>::compute(int eflag_in, int vflag_in) ev_all += ev; } + if (need_dup) + Kokkos::Experimental::contribute(f, dup_f); + if (eflag_global) eng_vdwl += ev_all.evdwl; if (vflag_global) { virial[0] += ev_all.v[0]; @@ -268,11 +282,15 @@ void PairTersoffZBLKokkos<DeviceType>::compute(int eflag_in, int vflag_in) } if (eflag_atom) { + if (need_dup) + Kokkos::Experimental::contribute(d_eatom, dup_eatom); k_eatom.template modify<DeviceType>(); k_eatom.template sync<LMPHostType>(); } if (vflag_atom) { + if (need_dup) + Kokkos::Experimental::contribute(d_vatom, dup_vatom); k_vatom.template modify<DeviceType>(); k_vatom.template sync<LMPHostType>(); } @@ -280,6 +298,13 @@ void PairTersoffZBLKokkos<DeviceType>::compute(int eflag_in, int vflag_in) if (vflag_fdotr) pair_virial_fdotr_compute(this); copymode = 0; + + // free duplicated memory + if (need_dup) { + dup_f = decltype(dup_f)(); + dup_eatom = decltype(dup_eatom)(); + dup_vatom = decltype(dup_vatom)(); + } } /* ---------------------------------------------------------------------- */ @@ -318,8 +343,10 @@ template<int NEIGHFLAG, int EVFLAG> KOKKOS_INLINE_FUNCTION void PairTersoffZBLKokkos<DeviceType>::operator()(TagPairTersoffZBLComputeHalf<NEIGHFLAG,EVFLAG>, const int &ii, EV_FLOAT& ev) const { - // The f array is atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*[3], typename 
DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f; + // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_f = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + auto a_f = v_f.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); const int i = d_ilist[ii]; if (i >= nlocal) return; @@ -1214,14 +1241,18 @@ void PairTersoffZBLKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, cons { const int VFLAG = vflag_either; - // The eatom and vatom arrays are atomic for Half/Thread neighbor style - Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom = k_eatom.view<DeviceType>(); - Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom = k_vatom.view<DeviceType>(); + // The eatom and vatom arrays are duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_eatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_eatom),decltype(ndup_eatom)>::get(dup_eatom,ndup_eatom); + auto a_eatom = v_eatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); + + auto v_vatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom); + auto a_vatom = v_vatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); if (eflag_atom) { const E_FLOAT epairhalf = 0.5 * epair; - v_eatom[i] += epairhalf; - if (NEIGHFLAG != FULL) v_eatom[j] += epairhalf; + a_eatom[i] += epairhalf; + if (NEIGHFLAG != FULL) a_eatom[j] += epairhalf; } if (VFLAG) { @@ -1251,20 +1282,20 @@ void PairTersoffZBLKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, cons } if (vflag_atom) { - v_vatom(i,0) += 0.5*v0; - v_vatom(i,1) += 0.5*v1; - v_vatom(i,2) += 0.5*v2; - v_vatom(i,3) += 0.5*v3; - 
v_vatom(i,4) += 0.5*v4; - v_vatom(i,5) += 0.5*v5; + a_vatom(i,0) += 0.5*v0; + a_vatom(i,1) += 0.5*v1; + a_vatom(i,2) += 0.5*v2; + a_vatom(i,3) += 0.5*v3; + a_vatom(i,4) += 0.5*v4; + a_vatom(i,5) += 0.5*v5; if (NEIGHFLAG != FULL) { - v_vatom(j,0) += 0.5*v0; - v_vatom(j,1) += 0.5*v1; - v_vatom(j,2) += 0.5*v2; - v_vatom(j,3) += 0.5*v3; - v_vatom(j,4) += 0.5*v4; - v_vatom(j,5) += 0.5*v5; + a_vatom(j,0) += 0.5*v0; + a_vatom(j,1) += 0.5*v1; + a_vatom(j,2) += 0.5*v2; + a_vatom(j,3) += 0.5*v3; + a_vatom(j,4) += 0.5*v4; + a_vatom(j,5) += 0.5*v5; } } } @@ -1278,9 +1309,10 @@ KOKKOS_INLINE_FUNCTION void PairTersoffZBLKokkos<DeviceType>::v_tally3(EV_FLOAT &ev, const int &i, const int &j, const int &k, F_FLOAT *fj, F_FLOAT *fk, F_FLOAT *drij, F_FLOAT *drik) const { + // The vatom array is duplicated for OpenMP, atomic for CUDA, and neither for Serial - // The eatom and vatom arrays are atomic for Half/Thread neighbor style - Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom = k_vatom.view<DeviceType>(); + auto v_vatom = ScatterViewHelper<NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom); + auto a_vatom = v_vatom.template access<AtomicDup<NEIGHFLAG,DeviceType>::value>(); F_FLOAT v[6]; @@ -1301,13 +1333,13 @@ void PairTersoffZBLKokkos<DeviceType>::v_tally3(EV_FLOAT &ev, const int &i, cons } if (vflag_atom) { - v_vatom(i,0) += v[0]; v_vatom(i,1) += v[1]; v_vatom(i,2) += v[2]; - v_vatom(i,3) += v[3]; v_vatom(i,4) += v[4]; v_vatom(i,5) += v[5]; + a_vatom(i,0) += v[0]; a_vatom(i,1) += v[1]; a_vatom(i,2) += v[2]; + a_vatom(i,3) += v[3]; a_vatom(i,4) += v[4]; a_vatom(i,5) += v[5]; if (NEIGHFLAG != FULL) { - v_vatom(j,0) += v[0]; v_vatom(j,1) += v[1]; v_vatom(j,2) += v[2]; - v_vatom(j,3) += v[3]; v_vatom(j,4) += v[4]; v_vatom(j,5) += v[5]; - v_vatom(k,0) += v[0]; v_vatom(k,1) += v[1]; v_vatom(k,2) += v[2]; - v_vatom(k,3) += v[3]; v_vatom(k,4) += 
v[4]; v_vatom(k,5) += v[5]; + a_vatom(j,0) += v[0]; a_vatom(j,1) += v[1]; a_vatom(j,2) += v[2]; + a_vatom(j,3) += v[3]; a_vatom(j,4) += v[4]; a_vatom(j,5) += v[5]; + a_vatom(k,0) += v[0]; a_vatom(k,1) += v[1]; a_vatom(k,2) += v[2]; + a_vatom(k,3) += v[3]; a_vatom(k,4) += v[4]; a_vatom(k,5) += v[5]; } } diff --git a/src/KOKKOS/pair_tersoff_zbl_kokkos.h b/src/KOKKOS/pair_tersoff_zbl_kokkos.h index 45982bd420b5b860291a54e29ecf0f9f79581063..3af4e0d8ebd410c9d18ff717340fd32e767c589c 100644 --- a/src/KOKKOS/pair_tersoff_zbl_kokkos.h +++ b/src/KOKKOS/pair_tersoff_zbl_kokkos.h @@ -207,6 +207,14 @@ class PairTersoffZBLKokkos : public PairTersoffZBL { typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom; typename ArrayTypes<DeviceType>::t_virial_array d_vatom; + int need_dup; + Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_f; + Kokkos::Experimental::ScatterView<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_eatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_vatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_f; + Kokkos::Experimental::ScatterView<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_eatom; + Kokkos::Experimental::ScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_vatom; + typedef Kokkos::DualView<F_FLOAT**[7],Kokkos::LayoutRight,DeviceType> tdual_ffloat_2d_n7; typedef typename 
tdual_ffloat_2d_n7::t_dev_const_randomread t_ffloat_2d_n7_randomread; typedef typename tdual_ffloat_2d_n7::t_host t_host_ffloat_2d_n7;