diff --git a/src/finish.cpp b/src/finish.cpp index 8fff3a8a54784fe4147a52d848e015e577ad9bc5..8670987573d13d86c3474669de715153bd4c2f3a 100644 --- a/src/finish.cpp +++ b/src/finish.cpp @@ -33,6 +33,7 @@ #include "neigh_request.h" #include "output.h" #include "memory.h" +#include "error.h" #ifdef LMP_USER_OMP #include "modify.h" @@ -515,6 +516,13 @@ void Finish::end(int flag) } #endif + if (lmp->kokkos && lmp->kokkos->ngpu > 0) + if (const char* env_clb = std::getenv("CUDA_LAUNCH_BLOCKING")) + if (!(strcmp(env_clb,"1") == 0)) { + error->warning(FLERR,"Timing breakdown may not be accurate since GPU/CPU overlap is enabled. " + "Using 'export CUDA_LAUNCH_BLOCKING=1' will give an accurate timing breakdown but will reduce performance"); + } + // FFT timing statistics // time3d,time1d = total time during run for 3d and 1d FFTs // loop on timing() until nsample FFTs require at least 1.0 CPU sec