From 7ec52784cb34e80704ce1e7d9f19e4b1f059c973 Mon Sep 17 00:00:00 2001 From: "Steven J. Plimpton" <sjplimp@singsing.sandia.gov> Date: Tue, 14 Aug 2018 15:44:25 -0600 Subject: [PATCH] more changes to doc pages and CMakeLists.txt --- cmake/CMakeLists.txt | 49 +++--- doc/src/Build.txt | 3 + doc/src/Build_basics.txt | 1 - doc/src/Build_extras.txt | 125 ++++++++++------ doc/src/Build_package.txt | 23 +-- doc/src/Build_settings.txt | 21 ++- doc/src/Commands.txt | 2 + doc/src/Errors.txt | 1 + doc/src/Install.txt | 1 + doc/src/Install_linux.txt | 1 - doc/src/Intro.txt | 1 + doc/src/Intro_authors.txt | 3 +- doc/src/Manual.txt | 2 +- .../{Build_manual.txt => Manual_build.txt} | 1 - doc/src/Modify.txt | 4 + doc/src/Packages.txt | 1 + doc/src/Packages_details.txt | 2 - doc/src/Python.txt | 3 + doc/src/Run.txt | 1 + doc/src/Run_options.txt | 1 - doc/src/Speed.txt | 3 + doc/src/Speed_gpu.txt | 83 +---------- doc/src/Speed_intel.txt | 30 ++-- doc/src/Speed_kokkos.txt | 140 +++--------------- doc/src/Speed_omp.txt | 30 ++-- doc/src/Speed_opt.txt | 25 +--- doc/src/Tools.txt | 1 - doc/src/compute_chunk_atom.txt | 3 - doc/src/dump_modify.txt | 3 - doc/src/fix_box_relax.txt | 2 - lib/gpu/Install.py | 11 +- src/KSPACE/fft3d.cpp | 4 +- src/KSPACE/fft3d.h | 13 +- src/KSPACE/kissfft.h | 1 + src/pack.h | 11 +- 35 files changed, 232 insertions(+), 374 deletions(-) rename doc/src/{Build_manual.txt => Manual_build.txt} (99%) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 58311928d9..1bd9eb22b1 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -371,8 +371,8 @@ if(PKG_USER-NETCDF) endif() if(PKG_USER-SMD) - option(DOWNLOAD_Eigen3 "Download Eigen3 (instead of using the system's one)" OFF) - if(DOWNLOAD_Eigen3) + option(DOWNLOAD_EIGEN3 "Download Eigen3 (instead of using the system's one)" OFF) + if(DOWNLOAD_EIGEN3) include(ExternalProject) ExternalProject_Add(Eigen3_build URL http://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz @@ -385,7 +385,7 @@ if(PKG_USER-SMD) 
else() find_package(Eigen3) if(NOT Eigen3_FOUND) - message(FATAL_ERROR "Eigen3 not found, help CMake to find it by setting EIGEN3_INCLUDE_DIR, or set DOWNLOAD_Eigen3=ON to download it") + message(FATAL_ERROR "Eigen3 not found, help CMake to find it by setting EIGEN3_INCLUDE_DIR, or set DOWNLOAD_EIGEN3=ON to download it") endif() endif() include_directories(${EIGEN3_INCLUDE_DIR}) @@ -807,16 +807,26 @@ if(PKG_GPU) ${GPU_SOURCES_DIR}/fix_gpu.h ${GPU_SOURCES_DIR}/fix_gpu.cpp) - set(GPU_API "OpenCL" CACHE STRING "API used by GPU package") - set_property(CACHE GPU_API PROPERTY STRINGS OpenCL CUDA) + set(GPU_API "opencl" CACHE STRING "API used by GPU package") + set_property(CACHE GPU_API PROPERTY STRINGS opencl cuda) + string(TOUPPER ${GPU_API} GPU_API_DEFINE) - set(GPU_PREC "SINGLE_DOUBLE" CACHE STRING "LAMMPS GPU precision size") - set_property(CACHE GPU_PREC PROPERTY STRINGS SINGLE_DOUBLE SINGLE_SINGLE DOUBLE_DOUBLE) + set(GPU_PREC "mixed" CACHE STRING "LAMMPS GPU precision") + set_property(CACHE GPU_PREC PROPERTY STRINGS double mixed single) + string(TOUPPER ${GPU_PREC} GPU_PREC_DEFINE) + + if(GPU_PREC_DEFINE STREQUAL "DOUBLE") + set(GPU_PREC_SETTING "DOUBLE_DOUBLE") + elseif(GPU_PREC_DEFINE STREQUAL "MIXED") + set(GPU_PREC_SETTING "SINGLE_DOUBLE") + elseif(GPU_PREC_DEFINE STREQUAL "SINGLE") + set(GPU_PREC_SETTING "SINGLE_SINGLE") + endif() file(GLOB GPU_LIB_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/*.cpp) file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu) - if(GPU_API STREQUAL "CUDA") + if(GPU_API_DEFINE STREQUAL "CUDA") find_package(CUDA REQUIRED) find_program(BIN2C bin2c) if(NOT BIN2C) @@ -824,7 +834,7 @@ if(PKG_GPU) endif() option(CUDPP_OPT "Enable CUDPP_OPT" ON) - set(GPU_ARCH "sm_30" CACHE STRING "LAMMPS GPU CUDA SM architecture (e.g. sm_60)") + set(GPU_ARCH "30" CACHE STRING "LAMMPS GPU CUDA SM architecture (e.g. 
60)") file(GLOB GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/*.cu ${CMAKE_CURRENT_SOURCE_DIR}/gpu/*.cu) list(REMOVE_ITEM GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_pppm.cu) @@ -838,10 +848,10 @@ if(PKG_GPU) endif() cuda_compile_cubin(GPU_GEN_OBJS ${GPU_LIB_CU} OPTIONS - -DUNIX -O3 -Xptxas -v --use_fast_math -DNV_KERNEL -DUCL_CUDADR -arch=${GPU_ARCH} -D_${GPU_PREC}) + -DUNIX -O3 -Xptxas -v --use_fast_math -DNV_KERNEL -DUCL_CUDADR -arch=sm_${GPU_ARCH} -D_${GPU_PREC_SETTING}) cuda_compile(GPU_OBJS ${GPU_LIB_CUDPP_CU} OPTIONS $<$<BOOL:${BUILD_SHARED_LIBS}>:-Xcompiler=-fPIC> - -DUNIX -O3 -Xptxas -v --use_fast_math -DUCL_CUDADR -arch=${GPU_ARCH} -D_${GPU_PREC}) + -DUNIX -O3 -Xptxas -v --use_fast_math -DUCL_CUDADR -arch=sm_${GPU_ARCH} -D_${GPU_PREC_SETTING}) foreach(CU_OBJ ${GPU_GEN_OBJS}) get_filename_component(CU_NAME ${CU_OBJ} NAME_WE) @@ -858,7 +868,7 @@ if(PKG_GPU) add_library(gpu STATIC ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS}) target_link_libraries(gpu ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY}) target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu ${CUDA_INCLUDE_DIRS}) - target_compile_definitions(gpu PRIVATE -D_${GPU_PREC} -DMPI_GERYON -DUCL_NO_EXIT) + target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT) if(CUDPP_OPT) target_include_directories(gpu PRIVATE ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini) target_compile_definitions(gpu PRIVATE -DUSE_CUDPP) @@ -872,10 +882,11 @@ if(PKG_GPU) target_include_directories(nvc_get_devices PRIVATE ${CUDA_INCLUDE_DIRS}) - elseif(GPU_API STREQUAL "OpenCL") + elseif(GPU_API_DEFINE STREQUAL "OPENCL") find_package(OpenCL REQUIRED) - set(OCL_TUNE "GENERIC" CACHE STRING "OpenCL Device Tuning") - set_property(CACHE OCL_TUNE PROPERTY STRINGS INTEL FERMI KEPLER CYPRESS GENERIC) + set(OCL_TUNE "generic" CACHE STRING "OpenCL Device Tuning") + set_property(CACHE OCL_TUNE PROPERTY STRINGS intel fermi kepler cypress generic) + string(TOUPPER ${OCL_TUNE} OCL_TUNE_DEFINE) 
include(OpenCLUtils) set(OCL_COMMON_HEADERS ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_preprocessor.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_aux_fun1.h) @@ -897,7 +908,7 @@ if(PKG_GPU) add_library(gpu STATIC ${GPU_LIB_SOURCES}) target_link_libraries(gpu ${OpenCL_LIBRARIES}) target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu ${OpenCL_INCLUDE_DIRS}) - target_compile_definitions(gpu PRIVATE -D_${GPU_PREC} -D${OCL_TUNE}_OCL -DMPI_GERYON -DUCL_NO_EXIT) + target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -D${OCL_TUNE_DEFINE}_OCL -DMPI_GERYON -DUCL_NO_EXIT) target_compile_definitions(gpu PRIVATE -DUSE_OPENCL) list(APPEND LAMMPS_LINK_LIBS gpu) @@ -1148,9 +1159,9 @@ if(BUILD_MPI) endif() if(PKG_GPU) message(STATUS "GPU Api: ${GPU_API}") - if(GPU_API STREQUAL "CUDA") - message(STATUS "GPU Arch: ${GPU_ARCH}") - elseif(GPU_API STREQUAL "OpenCL") + if(GPU_API_DEFINE STREQUAL "CUDA") + message(STATUS "GPU Arch: sm_${GPU_ARCH}") + elseif(GPU_API_DEFINE STREQUAL "OPENCL") message(STATUS "OCL Tune: ${OCL_TUNE}") endif() message(STATUS "GPU Precision: ${GPU_PREC}") diff --git a/doc/src/Build.txt b/doc/src/Build.txt index 76aa95de55..218664897f 100644 --- a/doc/src/Build.txt +++ b/doc/src/Build.txt @@ -19,18 +19,21 @@ as described on the "Install"_Install.html doc page. <!-- RST .. toctree:: + :maxdepth: 1 Build_cmake Build_make Build_link .. toctree:: + :maxdepth: 1 Build_basics Build_settings Build_package .. 
toctree:: + :maxdepth: 1 Build_extras diff --git a/doc/src/Build_basics.txt b/doc/src/Build_basics.txt index 806144256c..79c22d8fe4 100644 --- a/doc/src/Build_basics.txt +++ b/doc/src/Build_basics.txt @@ -18,7 +18,6 @@ CMake and make: "Build the LAMMPS documentation"_#doc "Install LAMMPS after a build"_#install :ul -:line :line Serial vs parallel build :h3,link(serial) diff --git a/doc/src/Build_extras.txt b/doc/src/Build_extras.txt index 14f17aa981..69f65a451f 100644 --- a/doc/src/Build_extras.txt +++ b/doc/src/Build_extras.txt @@ -49,7 +49,6 @@ This is the list of packages that may require additional steps. "USER-SMD"_#user-smd, "USER-VTK"_#user-vtk :tb(c=6,ea=c) -:line :line COMPRESS package :h4,link(compress) @@ -81,15 +80,15 @@ which GPU hardware to build for. -D GPU_API=value # value = opencl (default) or cuda -D GPU_PREC=value # precision setting - # value = single or mixed (default) or double + # value = double or mixed (default) or single -D OCL_TUNE=value # hardware choice for GPU_API=opencl - # generic (default) or intel (Intel CPU) or phi (Intel Xeon Phi) or fermi, kepler, cypress (NVIDIA) + # generic (default) or intel (Intel CPU) or fermi, kepler, cypress (NVIDIA) -D GPU_ARCH=value # hardware choice for GPU_API=cuda - # value = sm20 (Fermi) or sm30 (Kepler) or sm50 (Maxwell) or sm60 (Pascal) or sm70 (Volta) + # value = 20 (Fermi) or 30 (Kepler) or 50 (Maxwell) or 60 (Pascal) or 70 (Volta) # default is Cuda-compiler dependent, but typically Fermi -D CUDPP_OPT=value # optimization setting for GPU_API=cudea # enables CUDA Performance Primitives Optimizations - # on (default) or off :pre + # yes (default) or no :pre [Traditional make]: @@ -119,7 +118,7 @@ Makefile.machine you start from via the -h, -a, -p, -e switches, and also save a copy of the new Makefile if desired: CUDA_HOME = where NVIDIA CUDA software is installed on your system -CUDA_ARCH = what GPU hardware you have (see help message for details) +CUDA_ARCH = what GPU hardware you have (same as 
CMake, see help message for details) CUDA_PRECISION = precision (double, mixed, single) EXTRAMAKE = which Makefile.lammps.* file to copy to Makefile.lammps :ul @@ -163,7 +162,7 @@ package?" page. [CMake build]: --D DOWNLOAD_KIM=value # download OpenKIM API v1 for build, value = off (default) or on +-D DOWNLOAD_KIM=value # download OpenKIM API v1 for build, value = no (default) or yes -D KIM_LIBRARY=path # path to KIM shared library (only needed if a custom location) -D KIM_INCLUDE_DIR=path # path to KIM include directory (only needed if a custom location) :pre @@ -183,17 +182,65 @@ make lib-kim args="-p /usr/local/kim-api" # use an existing KIM API installation make lib-kim args="-p /usr/local/kim-api -a EAM_Dynamo_Ackland_W__MO_141627196590_002" # ditto but add one model or driver :pre :line - + KOKKOS package :h4,link(kokkos) To build with this package, you must choose which hardware you want to build for, either CPUs (multi-threading via OpenMP) or KNLs (OpenMP) -or GPUs (Cuda). +or GPUs (NVIDIA Cuda). + +For a CMake or make build, these are the possible choices for the +KOKKOS_ARCH settings described below. Note that for CMake, these are +really Kokkos variables, not LAMMPS variables. Hence you must use +case-sensitive values, e.g. BDW, not bdw. 
+ +ARMv80 = ARMv8.0 Compatible CPU +ARMv81 = ARMv8.1 Compatible CPU +ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU +BGQ = IBM Blue Gene/Q CPUs +Power8 = IBM POWER8 CPUs +Power9 = IBM POWER9 CPUs +SNB = Intel Sandy/Ivy Bridge CPUs +HSW = Intel Haswell CPUs +BDW = Intel Broadwell Xeon E-class CPUs +SKX = Intel Sky Lake Xeon E-class HPC CPUs (AVX512) +KNC = Intel Knights Corner Xeon Phi +KNL = Intel Knights Landing Xeon Phi +Kepler30 = NVIDIA Kepler generation CC 3.0 +Kepler32 = NVIDIA Kepler generation CC 3.2 +Kepler35 = NVIDIA Kepler generation CC 3.5 +Kepler37 = NVIDIA Kepler generation CC 3.7 +Maxwell50 = NVIDIA Maxwell generation CC 5.0 +Maxwell52 = NVIDIA Maxwell generation CC 5.2 +Maxwell53 = NVIDIA Maxwell generation CC 5.3 +Pascal60 = NVIDIA Pascal generation CC 6.0 +Pascal61 = NVIDIA Pascal generation CC 6.1 :ul [CMake build]: -TODO: how to do this, how to select CPU vs KNL vs GPU, and specify -the particular flavor of hardware: e.g. HSW vs BWL +For multicore CPUs using OpenMP, set these 2 variables. + +-D KOKKOS_ARCH=archCPU # archCPU = CPU from list above :pre +-D KOKKOS_ENABLE_OPENMP=yes :pre + +For Intel KNLs using OpenMP, set these 2 variables: + +-D KOKKOS_ARCH=KNL +-D KOKKOS_ENABLE_OPENMP=yes :pre + +For NVIDIA GPUs using CUDA, set these 4 variables: + +-D KOKKOS_ARCH="archCPU;archGPU" # archCPU = CPU from list above that is hosting the GPU + # archGPU = GPU from list above +-D KOKKOS_ENABLE_CUDA=yes +-D KOKKOS_ENABLE_OPENMP=yes +-D CMAKE_CXX_COMPILER=wrapper # wrapper = full path to Cuda nvcc wrapper :pre + +The wrapper value is the Cuda nvcc compiler wrapper provided in the +Kokkos library: lib/kokkos/bin/nvcc_wrapper. The setting should +include the full path name to the wrapper, e.g. + +-D CMAKE_CXX_COMPILER=/home/username/lammps/lib/kokkos/bin/nvcc_wrapper :pre [Traditional make]: @@ -204,16 +251,7 @@ src/MAKE/OPTIONS/Makefile.kokkos* files for examples. 
For multicore CPUs using OpenMP: KOKKOS_DEVICES = OpenMP -KOKKOS_ARCH = HSW :pre - -Possible values are: - -HSW for Intel Haswell -SNB for Intel SandyBridge -BDW for Intel Broadwell -BGQ for IBM BlueGene Q -Power7 for IBM -Power8 for IBM :ul +KOKKOS_ARCH = archCPU # archCPU = CPU from list above :pre For Intel KNLs using OpenMP: @@ -223,8 +261,8 @@ KOKKOS_ARCH = KNL :pre For NVIDIA GPUs using CUDA: KOKKOS_DEVICES = Cuda -KOKKOS_ARCH = Pascal60,Power8 # P100 hosted by an IBM Power8, etc -KOKKOS_ARCH = Kepler37,Power8 # K80 hosted by an IBM Power8, etc :pre +KOKKOS_ARCH = archCPU,archGPU # archCPU = CPU from list above that is hosting the GPU + # archGPU = GPU from list above :pre For GPUs, you also need these 2 lines in your Makefile.machine before the CC line is defined, in this case for use with OpenMPI mpicxx. The @@ -245,7 +283,7 @@ library. [CMake build]: --D DOWNLOAD_LATTE=value # download LATTE for build, value = off (default) or on +-D DOWNLOAD_LATTE=value # download LATTE for build, value = no (default) or yes -D LATTE_LIBRARY=path # path to LATTE shared library (only needed if a custom location) :pre [Traditional make]: @@ -320,7 +358,7 @@ lib/mscg/README and MSCG/Install files for more details. [CMake build]: --D DOWNLOAD_MSCG=value # download MSCG for build, value = off (default) or on +-D DOWNLOAD_MSCG=value # download MSCG for build, value = no (default) or yes -D MSCG_LIBRARY=path # path to MSCG shared library (only needed if a custom location) -D MSCG_INCLUDE_DIR=path # path to MSCG include directory (only needed if a custom location) :pre @@ -400,12 +438,12 @@ lib/python/README for more details. -D PYTHON_EXECUTABLE=path # path to Python executable to use :pre -Without this setting, CMake will you your system default Python. To -use a different Python version, you can either create a virtualenv, -activate it and then run cmake. Or you can set the PYTHON_EXECUTABLE -variable to specify which Python interpreter should be used. 
Note -note that you will also need to have the development headers installed -for this version, e.g. python2-devel. +Without this setting, CMake will use the default Python on your +system. To use a different Python version, you can either create a +virtualenv, activate it and then run cmake. Or you can set the +PYTHON_EXECUTABLE variable to specify which Python interpreter should +be used. Note that you will also need to have the development +headers installed for this version, e.g. python2-devel. [Traditional make]: @@ -464,7 +502,7 @@ library"_voro_home. [CMake build]: --D DOWNLOAD_VORO=value # download Voro++ for build, value = off (default) or on +-D DOWNLOAD_VORO=value # download Voro++ for build, value = no (default) or yes -D VORO_LIBRARY=path # (only needed if at custom location) path to VORO shared library -D VORO_INCLUDE_DIR=path # (only needed if at custom location) path to VORO include directory :pre @@ -486,7 +524,6 @@ created in lib/voronoi to point to the Voro++ src dir. When LAMMPS builds in src it will use these links. You should not need to edit the lib/voronoi/Makefile.lammps file. -:line :line USER-ATC package :h4,link(user-atc) @@ -642,15 +679,16 @@ USER-INTEL package :h4,link(user-intel) To build with this package, you must choose which hardware you want to build for, either Intel CPUs or Intel KNLs. You should also typically -install the USER-OMP package, as it can be used in tandem with the -USER-INTEL package to good effect, as explained on the "Speed +"install the USER-OMP package"_#user-omp, as it can be used in tandem +with the USER-INTEL package to good effect, as explained on the "Speed intel"_Speed_intel.html doc page. [CMake build]: -D INTEL_ARCH=value # value = cpu (default) or knl :pre +-D BUILD_OMP=yes # also required to build with the USER-INTEL package :pre -Requires an Intel compiler, Intel TBB and MKL and has to be built with "-D BUILD_OMP=on". +Requires an Intel compiler as well as the Intel TBB and MKL libraries.
[Traditional make]: @@ -821,22 +859,19 @@ successfully build on your system. USER-SMD package :h4,link(user-smd) -To build with this package, you must download the Eigen library. -Eigen is a template library, so you do not need to build it. +To build with this package, you must download the Eigen3 library. +Eigen3 is a template library, so you do not need to build it. [CMake build]: --D EIGEN3_INCLUDE_DIR=path # path to Eigen library :pre - -TODO: there is no download option for the Eigen lib? +-D DOWNLOAD_EIGEN3=value # download Eigen3, value = no (default) or yes +-D EIGEN3_INCLUDE_DIR=path # path to Eigen library (only needed if a custom location) :pre -CMake will not download the Eigen library. But once you have done -that, a CMake build of LAMMPS with "-D PKG_USER-SMD=yes" should work. -Set EIGEN3_INCLUDE_DIR if CMake cannot find the Eigen library. +Set EIGEN3_INCLUDE_DIR if CMake cannot find the Eigen3 library. [Traditional make]: -You can download the Eigen library manually if you prefer; follow the +You can download the Eigen3 library manually if you prefer; follow the instructions in lib/smd/README. You can also do it in one step from the lammps/src dir, using a command like these, which simply invoke the lib/smd/Install.py script with the specified args: diff --git a/doc/src/Build_package.txt b/doc/src/Build_package.txt index 3244e0f94a..45626dbbae 100644 --- a/doc/src/Build_package.txt +++ b/doc/src/Build_package.txt @@ -130,16 +130,16 @@ the Git or SVN repositories, no packages are pre-installed. [CMake shortcuts for installing many packages]: -Instead of specifying all the CMake options via the command-line, CMake allows -initializing the variable cache using script files. These are regular CMake -files which can manipulate and set variables, and can also contain control flow -constructs. +Instead of specifying all the CMake options via the command-line, +CMake allows initializing the variable cache using script files.
These +are regular CMake files which can manipulate and set variables, and +can also contain control flow constructs. -LAMMPS includes several of these files to define configuration "presets", -similar to the options that exist for the Make based system. Using these files -you can enable/disable portions of the available packages in LAMMPS. If you need a -custom preset you can take one of them as a starting point and customize it to your -needs. +LAMMPS includes several of these files to define configuration +"presets", similar to the options that exist for the Make based +system. Using these files you can enable/disable portions of the +available packages in LAMMPS. If you need a custom preset you can take +one of them as a starting point and customize it to your needs. cmake -C ../cmake/presets/all_on.cmake \[OPTIONS\] ../cmake | enable all packages cmake -C ../cmake/presets/all_off.cmake \[OPTIONS\] ../cmake | disable all packages @@ -149,8 +149,9 @@ cmake -C ../cmake/presets/std_nolib.cmake \[OPTIONS\] ../cmake | enable standard cmake -C ../cmake/presets/nolib.cmake \[OPTIONS\] ../cmake | disable all packages that do not require extra libraries cmake -C ../cmake/presets/manual_selection.cmake \[OPTIONS\] ../cmake | example of how to create a manual selection of packages :tb(s=|,a=l) -NOTE: Running cmake this way manipulates the variable cache in your current -build directory. You can combine presets and options with multiple cmake runs. +NOTE: Running cmake this way manipulates the variable cache in your +current build directory. You can combine presets and options with +multiple cmake runs. [Example:] diff --git a/doc/src/Build_settings.txt b/doc/src/Build_settings.txt index 736e9b9f0e..45a0827210 100644 --- a/doc/src/Build_settings.txt +++ b/doc/src/Build_settings.txt @@ -21,7 +21,6 @@ explain how to do this for building both with CMake and make. 
"Workaround for long long integers"_#longlong "Error handling exceptions"_#exceptions when using LAMMPS as a library :all(b) -:line :line FFT library :h3,link(fft) @@ -38,6 +37,10 @@ LAMMPS can use them if they are available on your system. -D FFT_SINGLE=value # yes or no (default), no = double precision -D FFT_PACK=value # array (default) or pointer or memcpy :pre +NOTE: The values for the FFT variable must be in upper-case. +This is an exception to the rule that all CMake variables can +be specified with lower-case values. + Usually these settings are all that is needed. If CMake cannot find the FFT library, you can set these variables: @@ -50,10 +53,11 @@ the FFT library, you can set these variables: [Makefile.machine settings]: -FFT_INC = -DFFT_FFTW3 # -DFFT_FFTW3, -DFFT_FFTW2, -DFFT_FFTW (same as -DFFT_FFTW3), -DFFT_MKL, or -DFFT_KISSFFT +FFT_INC = -DFFT_FFTW3 # -DFFT_FFTW3, -DFFT_FFTW2, -DFFT_FFTW (same as -DFFT_FFTW3), -DFFT_MKL, or -DFFT_KISS # default is KISS if not specified FFT_INC = -DFFT_SINGLE # do not specify for double precision FFT_INC = -DFFT_PACK_ARRAY # or -DFFT_PACK_POINTER or -DFFT_PACK_MEMCPY :pre + # default is FFT_PACK_ARRAY if not specified FFT_INC = -I/usr/local/include FFT_PATH = -L/usr/local/lib @@ -84,9 +88,10 @@ pppm"_kspace_style.html command. The "Run output"_doc page gives more details. FFTW is a fast, portable FFT library that should also work on any -platform and can be faster than KISS FFT. You can download it from -"www.fftw.org"_http://www.fftw.org. Both the (obsolete) legacy version -2.1.X and the newer 3.X versions are supported. +platform and can be faster than the KISS FFT library. You can +download it from "www.fftw.org"_http://www.fftw.org. Both the +(obsolete) legacy version 2.1.X and the newer 3.X versions are +supported. NOTE: FFTW2 has not been updated since 1999 and has been declared obsolete by its developers. @@ -148,7 +153,7 @@ adequate. 
[Makefile.machine setting]: LMP_INC = -DLAMMPS_SMALLBIG # or -DLAMMPS_BIGBIG or -DLAMMPS_SMALLSMALL :pre - + # default is LAMMPS_SMALLBIG if not specified [CMake and make info]: The default "smallbig" setting allows for simulations with: @@ -298,10 +303,10 @@ aligned on 64-byte boundaries. [CMake variable]: --D LAMMPS_MEMALIGN=value # 8, 16, 32, 64 (default) :pre +-D LAMMPS_MEMALIGN=value # 0, 8, 16, 32, 64 (default) :pre Use a LAMMPS_MEMALIGN value of 0 to disable using posix_memalign() -and revert to using the malloc() C-library function instead. When +and revert to using the malloc() C-library function instead. When compiling LAMMPS for Windows systems, malloc() will always be used and this setting ignored. diff --git a/doc/src/Commands.txt b/doc/src/Commands.txt index 30e3343bd2..84eac285f7 100644 --- a/doc/src/Commands.txt +++ b/doc/src/Commands.txt @@ -16,6 +16,7 @@ commands in it are used to define a LAMMPS simulation. <!-- RST .. toctree:: + :maxdepth: 1 Commands_input Commands_parse @@ -23,6 +24,7 @@ commands in it are used to define a LAMMPS simulation. Commands_category .. toctree:: + :maxdepth: 1 Commands_all Commands_fix diff --git a/doc/src/Errors.txt b/doc/src/Errors.txt index 1b6206c780..a8d8d3a18e 100644 --- a/doc/src/Errors.txt +++ b/doc/src/Errors.txt @@ -19,6 +19,7 @@ additional details for many of them. <!-- RST .. toctree:: + :maxdepth: 1 Errors_common Errors_bugs diff --git a/doc/src/Install.txt b/doc/src/Install.txt index d59c23d319..0a2e870a5d 100644 --- a/doc/src/Install.txt +++ b/doc/src/Install.txt @@ -20,6 +20,7 @@ need the source code. <!-- RST ..
toctree:: + :maxdepth: 1 Install_linux Install_mac diff --git a/doc/src/Install_linux.txt b/doc/src/Install_linux.txt index cddec0c069..cc15ac0ae0 100644 --- a/doc/src/Install_linux.txt +++ b/doc/src/Install_linux.txt @@ -15,7 +15,6 @@ Binaries are available for many different versions of Linux: "Pre-built Ubuntu Linux executables"_#ubuntu "Pre-built Gentoo Linux executable"_#gentoo :all(b) -:line :line Pre-built binary RPMs for Fedora/RedHat/CentOS/openSUSE :h4,link(rpm) diff --git a/doc/src/Intro.txt b/doc/src/Intro.txt index 4defbed8c4..c8725e0085 100644 --- a/doc/src/Intro.txt +++ b/doc/src/Intro.txt @@ -15,6 +15,7 @@ These pages provide a brief introduction to LAMMPS. <!-- RST .. toctree:: + :maxdepth: 1 Intro_overview Manual_version diff --git a/doc/src/Intro_authors.txt b/doc/src/Intro_authors.txt index ce418d0ce1..8bb0fa9c22 100644 --- a/doc/src/Intro_authors.txt +++ b/doc/src/Intro_authors.txt @@ -58,7 +58,6 @@ Terry Stouch (Lexicon Pharmaceuticals, formerly at Bristol Myers Squibb) Steve Lustig (Dupont) Jim Belak and Roy Pollock (LLNL) :ul -:line :line Here is a timeline for when various individuals contributed to a new @@ -239,7 +238,7 @@ Aug11 : angle_style cosine/shift and cosine/shift/exp : Carsten Svaneborg Aug11 : dihedral_style cosine/shift/exp : Carsten Svaneborg Aug11 : pair_style dipole/sf : Mario Orsi Aug11 : fix addtorque and compute temp/rotate : Laurent Joly (U Lyon) -Aug11 : FFT support via FFTW3, MKL, ACML, KISSFFT libraries : \ +Aug11 : FFT support via FFTW3, MKL, ACML, KISS FFT libraries : \ Axel Kohlmeyer (Temple U) Jun11 : pair_style adp : Chris Weinberger (Sandia), Stephen Foiles (Sandia), \ Chandra Veer Singh (Cornell) diff --git a/doc/src/Manual.txt b/doc/src/Manual.txt index 49fd8d4db2..0f0bd8f14e 100644 --- a/doc/src/Manual.txt +++ b/doc/src/Manual.txt @@ -84,7 +84,7 @@ every LAMMPS command. Modify Python Errors - Build_manual + Manual_build .. 
toctree:: :caption: Index diff --git a/doc/src/Build_manual.txt b/doc/src/Manual_build.txt similarity index 99% rename from doc/src/Build_manual.txt rename to doc/src/Manual_build.txt index 695ac21a13..2be4b98960 100644 --- a/doc/src/Build_manual.txt +++ b/doc/src/Manual_build.txt @@ -122,4 +122,3 @@ software installed. "http://calibre-ebook.com/"_http://calibre-ebook.com/ You first create the ePUB file with 'make epub' and then do: ebook-convert LAMMPS.epub LAMMPS.mobi :pre - diff --git a/doc/src/Modify.txt b/doc/src/Modify.txt index f828bd5d74..6189b9ba3e 100644 --- a/doc/src/Modify.txt +++ b/doc/src/Modify.txt @@ -24,11 +24,13 @@ contribute"_Modify_contribute.html doc page. <!-- RST .. toctree:: + :maxdepth: 1 Modify_overview Modify_contribute .. toctree:: + :maxdepth: 1 Modify_atom Modify_pair @@ -38,6 +40,7 @@ contribute"_Modify_contribute.html doc page. Modify_command .. toctree:: + :maxdepth: 1 Modify_dump Modify_kspace @@ -46,6 +49,7 @@ contribute"_Modify_contribute.html doc page. Modify_body .. toctree:: + :maxdepth: 1 Modify_thermo Modify_variable diff --git a/doc/src/Packages.txt b/doc/src/Packages.txt index e48c947be3..231c8528e9 100644 --- a/doc/src/Packages.txt +++ b/doc/src/Packages.txt @@ -23,6 +23,7 @@ LAMMPS build process. <!-- RST .. toctree:: + :maxdepth: 1 Packages_standard Packages_user diff --git a/doc/src/Packages_details.txt b/doc/src/Packages_details.txt index 46948a88b7..ff0ce7844c 100644 --- a/doc/src/Packages_details.txt +++ b/doc/src/Packages_details.txt @@ -99,7 +99,6 @@ as contained in the file name. 
"USER-UEF"_#PKG-USER-UEF, "USER-VTK"_#PKG-USER-VTK :tb(c=6,ea=c) -:line :line ASPHERE package :link(PKG-ASPHERE),h4 @@ -1007,7 +1006,6 @@ lib/voronoi/README "compute voronoi/atom"_compute_voronoi_atom.html examples/voronoi :ul -:line :line USER-ATC package :link(PKG-USER-ATC),h4 diff --git a/doc/src/Python.txt b/doc/src/Python.txt index 2c9c6872bb..b5d33c5daa 100644 --- a/doc/src/Python.txt +++ b/doc/src/Python.txt @@ -16,10 +16,12 @@ used together. <!-- RST .. toctree:: + :maxdepth: 1 Python_overview .. toctree:: + :maxdepth: 1 Python_run Python_shlib @@ -31,6 +33,7 @@ used together. Python_examples .. toctree:: + :maxdepth: 1 Python_call diff --git a/doc/src/Run.txt b/doc/src/Run.txt index 68c3f1e295..5e2c0fe235 100644 --- a/doc/src/Run.txt +++ b/doc/src/Run.txt @@ -19,6 +19,7 @@ they can contain. <!-- RST .. toctree:: + :maxdepth: 1 Run_basics Run_options diff --git a/doc/src/Run_options.txt b/doc/src/Run_options.txt index 64c16517d6..0704e3b619 100644 --- a/doc/src/Run_options.txt +++ b/doc/src/Run_options.txt @@ -34,7 +34,6 @@ For example, the lmp_mpi executable might be launched as follows: mpirun -np 16 lmp_mpi -v f tmp.out -l my.log -sc none -i in.alloy mpirun -np 16 lmp_mpi -var f tmp.out -log my.log -screen none -in in.alloy :pre -:line :line [-echo style] :link(echo) diff --git a/doc/src/Speed.txt b/doc/src/Speed.txt index 091657082a..7eac11ffa5 100644 --- a/doc/src/Speed.txt +++ b/doc/src/Speed.txt @@ -31,15 +31,18 @@ hardware platforms. <!-- RST .. toctree:: + :maxdepth: 1 Speed_bench Speed_measure .. toctree:: + :maxdepth: 1 Speed_tips .. toctree:: + :maxdepth: 1 Speed_packages Speed_compare diff --git a/doc/src/Speed_gpu.txt b/doc/src/Speed_gpu.txt index cd81c03ba0..3ae4639dc2 100644 --- a/doc/src/Speed_gpu.txt +++ b/doc/src/Speed_gpu.txt @@ -43,89 +43,22 @@ same functionality can eventually be supported on a variety of GPU hardware. 
:l :ule -Here is a quick overview of how to enable and use the GPU package: - -build the library in lib/gpu for your GPU hardware with the desired precision settings -install the GPU package and build LAMMPS as usual -use the mpirun command to set the number of MPI tasks/node which determines the number of MPI tasks/GPU -specify the # of GPUs per node -use GPU styles in your input script :ul - -The latter two steps can be done using the "-pk gpu" and "-sf gpu" -"command-line switches"_Run_options.html respectively. Or the effect -of the "-pk" or "-sf" switches can be duplicated by adding the -"package gpu"_package.html or "suffix gpu"_suffix.html commands -respectively to your input script. - [Required hardware/software:] To use this package, you currently need to have an NVIDIA GPU and install the NVIDIA CUDA software on your system: -Check if you have an NVIDIA GPU: cat /proc/driver/nvidia/gpus/0/information -Go to http://www.nvidia.com/object/cuda_get.html -Install a driver and toolkit appropriate for your system (SDK is not necessary) -Run lammps/lib/gpu/nvc_get_devices (after building the GPU library, see below) to list supported devices and properties :ul +Check if you have an NVIDIA GPU: cat +/proc/driver/nvidia/gpus/0/information Go to +http://www.nvidia.com/object/cuda_get.html Install a driver and +toolkit appropriate for your system (SDK is not necessary) Run +lammps/lib/gpu/nvc_get_devices (after building the GPU library, see +below) to list supported devices and properties :ul [Building LAMMPS with the GPU package:] -This requires two steps (a,b): build the GPU library, then build -LAMMPS with the GPU package. You can do both these steps in one line -as described on the "Packages details"_Packages_details.html#GPU doc -page. - -Or you can follow these two (a,b) steps: - -(a) Build the GPU library - -The GPU library is in lammps/lib/gpu. Select a Makefile.machine (in -lib/gpu) appropriate for your system. 
You should pay special -attention to 3 settings in this makefile. - -CUDA_HOME = needs to be where NVIDIA CUDA software is installed on your system -CUDA_ARCH = needs to be appropriate to your GPUs -CUDA_PREC = precision (double, mixed, single) you desire :ul - -See lib/gpu/Makefile.linux.double for examples of the ARCH settings -for different GPU choices, e.g. Fermi vs Kepler. It also lists the -possible precision settings: - -CUDA_PREC = -D_SINGLE_SINGLE # single precision for all calculations -CUDA_PREC = -D_DOUBLE_DOUBLE # double precision for all calculations -CUDA_PREC = -D_SINGLE_DOUBLE # accumulation of forces, etc, in double :pre - -The last setting is the mixed mode referred to above. Note that your -GPU must support double precision to use either the 2nd or 3rd of -these settings. - -To build the library, type: - -make -f Makefile.machine :pre - -If successful, it will produce the files libgpu.a and Makefile.lammps. - -The latter file has 3 settings that need to be appropriate for the -paths and settings for the CUDA system software on your machine. -Makefile.lammps is a copy of the file specified by the EXTRAMAKE -setting in Makefile.machine. You can change EXTRAMAKE or create your -own Makefile.lammps.machine if needed. - -Note that to change the precision of the GPU library, you need to -re-build the entire library. Do a "clean" first, e.g. "make -f -Makefile.linux clean", followed by the make command above. - -(b) Build LAMMPS with the GPU package - -cd lammps/src -make yes-gpu -make machine :pre - -No additional compile/link flags are needed in Makefile.machine. - -Note that if you change the GPU library precision (discussed above) -and rebuild the GPU library, then you also need to re-install the GPU -package and re-build LAMMPS, so that all affected files are -re-compiled and linked to the new GPU library. +See the "Build extras"_Build_extras.html#gpu doc page for +instructions. 
[Run with the GPU package from the command line:] diff --git a/doc/src/Speed_intel.txt b/doc/src/Speed_intel.txt index 2b29ec3b0f..ef876a7d42 100644 --- a/doc/src/Speed_intel.txt +++ b/doc/src/Speed_intel.txt @@ -203,16 +203,12 @@ cat /proc/cpuinfo :pre [Building LAMMPS with the USER-INTEL package:] -NOTE: See the src/USER-INTEL/README file for additional flags that -might be needed for best performance on Intel server processors -code-named "Skylake". - -The USER-INTEL package must be installed into the source directory: - -make yes-user-intel :pre +See the "Build extras"_Build_extras.html#user-intel doc page for +instructions. Some additional details are covered here. -Several example Makefiles for building with the Intel compiler are -included with LAMMPS in the src/MAKE/OPTIONS/ directory: +For building with make, several example Makefiles for building with +the Intel compiler are included with LAMMPS in the src/MAKE/OPTIONS/ +directory: Makefile.intel_cpu_intelmpi # Intel Compiler, Intel MPI, No Offload Makefile.knl # Intel Compiler, Intel MPI, No Offload @@ -221,20 +217,16 @@ Makefile.intel_cpu_openpmi # Intel Compiler, OpenMPI, No Offload Makefile.intel_coprocessor # Intel Compiler, Intel MPI, Offload :pre Makefile.knl is identical to Makefile.intel_cpu_intelmpi except that -it explicitly specifies that vectorization should be for Intel -Xeon Phi x200 processors making it easier to cross-compile. For -users with recent installations of Intel Parallel Studio, the -process can be as simple as: +it explicitly specifies that vectorization should be for Intel Xeon +Phi x200 processors making it easier to cross-compile. 
For users with +recent installations of Intel Parallel Studio, the process can be as +simple as: make yes-user-intel source /opt/intel/parallel_studio_xe_2016.3.067/psxevars.sh # or psxevars.csh for C-shell make intel_cpu_intelmpi :pre -Alternatively this can be done as a single command with suitable make -command invocations, as described on the "Packages -details"_Packages_details.html#USER-INTEL doc page. - Note that if you build with support for a Phi coprocessor, the same binary can be used on nodes with or without coprocessors installed. However, if you do not have coprocessors on your system, building @@ -253,6 +245,10 @@ required for CCFLAGS and "-qoffload" is required for LINKFLAGS. Other recommended CCFLAG options for best performance are "-O2 -fno-alias -ansi-alias -qoverride-limits fp-model fast=2 -no-prec-div". +NOTE: See the src/USER-INTEL/README file for additional flags that +might be needed for best performance on Intel server processors +code-named "Skylake". + NOTE: The vectorization and math capabilities can differ depending on the CPU. For Intel compilers, the "-x" flag specifies the type of processor for which to optimize. "-xHost" specifies that the compiler diff --git a/doc/src/Speed_kokkos.txt b/doc/src/Speed_kokkos.txt index 306bc398af..eb787df5d6 100644 --- a/doc/src/Speed_kokkos.txt +++ b/doc/src/Speed_kokkos.txt @@ -37,101 +37,29 @@ task). These are Serial (MPI-only for CPUs and Intel Phi), OpenMP GPUs). You choose the mode at build time to produce an executable compatible with specific hardware. -[Building LAMMPS with the KOKKOS package:] - NOTE: Kokkos support within LAMMPS must be built with a C++11 compatible compiler. This means GCC version 4.7.2 or later, Intel 14.0.4 or later, or Clang 3.5.2 or later is required. -The recommended method of building the KOKKOS package is to start with -the provided Kokkos Makefiles in /src/MAKE/OPTIONS/. 
You may need to -modify the KOKKOS_ARCH variable in the Makefile to match your specific -hardware. For example: - -for Sandy Bridge CPUs, set KOKKOS_ARCH=SNB -for Broadwell CPUs, set KOKKOS_ARCH=BWD -for K80 GPUs, set KOKKOS_ARCH=Kepler37 -for P100 GPUs and Power8 CPUs, set KOKKOS_ARCH=Pascal60,Power8 :ul - -See the [Advanced Kokkos Options] section below for a listing of all -KOKKOS_ARCH options. - -[Compile for CPU-only (MPI only, no threading):] - -use a C++11 compatible compiler and set KOKKOS_ARCH variable in -/src/MAKE/OPTIONS/Makefile.kokkos_mpi_only as described above. Then do the -following: - -cd lammps/src -make yes-kokkos -make kokkos_mpi_only :pre - -[Compile for CPU-only (MPI plus OpenMP threading):] - -NOTE: To build with Kokkos support for OpenMP threading, your compiler -must support the OpenMP interface. You should have one or more -multi-core CPUs so that multiple threads can be launched by each MPI -task running on a CPU. - -Use a C++11 compatible compiler and set KOKKOS_ARCH variable in -/src/MAKE/OPTIONS/Makefile.kokkos_omp as described above. Then do the -following: - -cd lammps/src -make yes-kokkos -make kokkos_omp :pre - -[Compile for Intel KNL Xeon Phi (Intel Compiler, OpenMPI):] - -use a C++11 compatible compiler and do the following: - -cd lammps/src -make yes-kokkos -make kokkos_phi :pre - -[Compile for CPUs and GPUs (with OpenMPI or MPICH):] - NOTE: To build with Kokkos support for NVIDIA GPUs, NVIDIA CUDA software version 7.5 or later must be installed on your system. See the discussion for the "GPU package"_Speed_gpu.html for details of how to check and do this. NOTE: Kokkos with CUDA currently implicitly assumes, that the MPI -library is CUDA-aware and has support for GPU-direct. This is not always -the case, especially when using pre-compiled MPI libraries provided by -a Linux distribution. This is not a problem when using only a single -GPU and a single MPI rank on a desktop. 
When running with multiple -MPI ranks, you may see segmentation faults without GPU-direct support. -These can be avoided by adding the flags -"-pk kokkos gpu/direct off"_Run_options.html -to the LAMMPS command line or by using the command -"package kokkos gpu/direct off"_package.html in the input file. - -Use a C++11 compatible compiler and set KOKKOS_ARCH variable in -/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi for both GPU and CPU as -described above. Then do the following: - -cd lammps/src -make yes-kokkos -make kokkos_cuda_mpi :pre - -[Alternative Methods of Compiling:] - -Alternatively, the KOKKOS package can be built by specifying Kokkos variables -on the make command line. For example: - -make mpi KOKKOS_DEVICES=OpenMP KOKKOS_ARCH=SNB # set the KOKKOS_DEVICES and KOKKOS_ARCH variable explicitly -make kokkos_cuda_mpi KOKKOS_ARCH=Pascal60,Power8 # set the KOKKOS_ARCH variable explicitly :pre - -Setting the KOKKOS_DEVICES and KOKKOS_ARCH variables on the make -command line requires a GNU-compatible make command. Try "gmake" if -your system's standard make complains. - -NOTE: If you build using make line variables and re-build LAMMPS twice -with different KOKKOS options and the *same* target, then you *must* -perform a "make clean-all" or "make clean-machine" before each -build. This is to force all the KOKKOS-dependent files to be -re-compiled with the new options. +library is CUDA-aware and has support for GPU-direct. This is not +always the case, especially when using pre-compiled MPI libraries +provided by a Linux distribution. This is not a problem when using +only a single GPU and a single MPI rank on a desktop. When running +with multiple MPI ranks, you may see segmentation faults without +GPU-direct support. These can be avoided by adding the flags "-pk +kokkos gpu/direct off"_Run_options.html to the LAMMPS command line or +by using the command "package kokkos gpu/direct off"_package.html in +the input file. 
+ +[Building LAMMPS with the KOKKOS package:] + +See the "Build extras"_Build_extras.html#kokkos doc page for instructions. [Running LAMMPS with the KOKKOS package:] @@ -411,50 +339,18 @@ hardware. [Advanced Kokkos options:] There are other allowed options when building with the KOKKOS package. -As above, they can be set either as variables on the make command line -or in Makefile.machine. This is the full list of options, including -those discussed above. Each takes a value shown below. The default -value is listed, which is set in the /lib/kokkos/Makefile.kokkos file. +As explained on the "Build extras"_Build_extras.html#kokkos doc page, +they can be set either as variables on the make command line or in +Makefile.machine, or they can be specified as CMake variables. Each +takes a value shown below. The default value is listed, which is set +in the lib/kokkos/Makefile.kokkos file. -KOKKOS_DEVICES, values = {Serial}, {OpenMP}, {Pthreads}, {Cuda}, default = {OpenMP} -KOKKOS_ARCH, values = {KNC}, {SNB}, {HSW}, {Kepler30}, {Kepler32}, {Kepler35}, {Kepler37}, {Maxwell50}, {Maxwell52}, {Maxwell53}, {Pascal60}, {Pascal61}, {ARMv80}, {ARMv81}, {ARMv81}, {ARMv8-ThunderX}, {BGQ}, {Power7}, {Power8}, {Power9}, {KNL}, {BDW}, {SKX}, default = {none} KOKKOS_DEBUG, values = {yes}, {no}, default = {no} KOKKOS_USE_TPLS, values = {hwloc}, {librt}, {experimental_memkind}, default = {none} KOKKOS_CXX_STANDARD, values = {c++11}, {c++1z}, default = {c++11} KOKKOS_OPTIONS, values = {aggressive_vectorization}, {disable_profiling}, default = {none} KOKKOS_CUDA_OPTIONS, values = {force_uvm}, {use_ldg}, {rdc}, {enable_lambda}, default = {enable_lambda} :ul -KOKKOS_DEVICES sets the parallelization method used for Kokkos code -(within LAMMPS). KOKKOS_DEVICES=Serial means that no threading will be used. -KOKKOS_DEVICES=OpenMP means that OpenMP threading will be -used. KOKKOS_DEVICES=Pthreads means that pthreads will be used. -KOKKOS_DEVICES=Cuda means an NVIDIA GPU running CUDA will be used. 
- -KOKKOS_ARCH enables compiler switches needed when compiling for a -specific hardware: - -ARMv80 = ARMv8.0 Compatible CPU -ARMv81 = ARMv8.1 Compatible CPU -ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU -SNB = Intel Sandy/Ivy Bridge CPUs -HSW = Intel Haswell CPUs -BDW = Intel Broadwell Xeon E-class CPUs -SKX = Intel Sky Lake Xeon E-class HPC CPUs (AVX512) -KNC = Intel Knights Corner Xeon Phi -KNL = Intel Knights Landing Xeon Phi -Kepler30 = NVIDIA Kepler generation CC 3.0 -Kepler32 = NVIDIA Kepler generation CC 3.2 -Kepler35 = NVIDIA Kepler generation CC 3.5 -Kepler37 = NVIDIA Kepler generation CC 3.7 -Maxwell50 = NVIDIA Maxwell generation CC 5.0 -Maxwell52 = NVIDIA Maxwell generation CC 5.2 -Maxwell53 = NVIDIA Maxwell generation CC 5.3 -Pascal60 = NVIDIA Pascal generation CC 6.0 -Pascal61 = NVIDIA Pascal generation CC 6.1 -BGQ = IBM Blue Gene/Q CPUs -Power8 = IBM POWER8 CPUs -Power9 = IBM POWER9 CPUs :ul - KOKKOS_USE_TPLS=hwloc binds threads to hardware cores, so they do not migrate during a simulation. KOKKOS_USE_TPLS=hwloc should always be used if running with KOKKOS_DEVICES=Pthreads for pthreads. It is not diff --git a/doc/src/Speed_omp.txt b/doc/src/Speed_omp.txt index cd85b06192..0abf54430e 100644 --- a/doc/src/Speed_omp.txt +++ b/doc/src/Speed_omp.txt @@ -16,18 +16,6 @@ improper), several Kspace styles, and a few fix styles. It uses the OpenMP interface for multi-threading, but can also be compiled without OpenMP support, providing optimized serial styles in that case. -Here is a quick overview of how to use the USER-OMP package, assuming -one or more 16-core nodes. More details follow. 
- -make yes-user-omp -make omp # Makefile.omp already has OpenMP settings for GNU compilers -make mpi # or build with USER-OMP package without OpenMP :pre - -env OMP_NUM_THREADS=16 lmp_omp -sf omp -in in.script # 1 MPI task, 16 threads according to OMP_NUM_THREADS -lmp_mpi -sf omp -in in.script # 1 MPI task, no threads, optimized kernels -mpirun -np 4 lmp_omp -sf omp -pk omp 4 -in in.script # 4 MPI tasks, 4 threads/task -mpirun -np 32 -ppn 4 lmp_omp -sf omp -pk omp 4 -in in.script # 8 nodes, 4 MPI tasks/node, 4 threads/task :pre - [Required hardware/software:] To enable multi-threading, your compiler must support the OpenMP interface. @@ -36,18 +24,18 @@ launched by each MPI task on the local node (using shared memory). [Building LAMMPS with the USER-OMP package:] -The lines above illustrate how to include/build with the USER-OMP -package in two steps, using the "make" command. Or how to do it with -one command as described on the "Packages -details"_Packages_details.html#USER-OMP doc page. - -Note that the CCFLAGS and LINKFLAGS settings in Makefile.machine must -include "-fopenmp" for the GNU compilers. If you use an Intel compiler, -the corresponding flag is "-qopenmp" and the CCFLAGS setting must also -include "-restrict". +See the "Build extras"_Build_extras.html#user-omp doc page for +instructions. [Run with the USER-OMP package from the command line:] +These examples assume one or more 16-core nodes. + +env OMP_NUM_THREADS=16 lmp_omp -sf omp -in in.script # 1 MPI task, 16 threads according to OMP_NUM_THREADS +lmp_mpi -sf omp -in in.script # 1 MPI task, no threads, optimized kernels +mpirun -np 4 lmp_omp -sf omp -pk omp 4 -in in.script # 4 MPI tasks, 4 threads/task +mpirun -np 32 -ppn 4 lmp_omp -sf omp -pk omp 4 -in in.script # 8 nodes, 4 MPI tasks/node, 4 threads/task :pre + The mpirun or mpiexec command sets the total number of MPI tasks used by LAMMPS (one or multiple per compute node) and the number of MPI tasks used per node. E.g. 
the mpirun command in MPICH does this via diff --git a/doc/src/Speed_opt.txt b/doc/src/Speed_opt.txt index bb0bcd255c..7dd83a84bf 100644 --- a/doc/src/Speed_opt.txt +++ b/doc/src/Speed_opt.txt @@ -15,34 +15,21 @@ Technologies). It contains a handful of pair styles whose compute() methods were rewritten in C++ templated form to reduce the overhead due to if tests and other conditional code. -Here is a quick overview of how to use the OPT package. More details -follow. - -make yes-opt -make mpi # build with the OPT package :pre - -lmp_mpi -sf opt -in in.script # run in serial -mpirun -np 4 lmp_mpi -sf opt -in in.script # run in parallel :pre - [Required hardware/software:] None. [Building LAMMPS with the OPT package:] -The lines above illustrate how to build LAMMPS with the OPT package in -two steps, using the "make" command. Or how to do it with one command -as described on the "Packages details"_Packages_details.html#OPT doc -page. - -Note that if you use an Intel compiler to build with the OPT package, -the CCFLAGS setting in your Makefile.machine must include "-restrict". +See the "Build extras"_Build_extras.html#opt doc page for instructions. [Run with the OPT package from the command line:] -As in the lines above, use the "-sf opt" "command-line -switch"_Run_options.html, which will automatically append "opt" to -styles that support it. +lmp_mpi -sf opt -in in.script # run in serial +mpirun -np 4 lmp_mpi -sf opt -in in.script # run in parallel :pre + +Use the "-sf opt" "command-line switch"_Run_options.html, which will +automatically append "opt" to styles that support it. [Or run with the OPT package by editing an input script:] diff --git a/doc/src/Tools.txt b/doc/src/Tools.txt index aa4adb7dc1..8b4e779cbe 100644 --- a/doc/src/Tools.txt +++ b/doc/src/Tools.txt @@ -74,7 +74,6 @@ own sub-directories with their own Makefiles and/or README files. 
"vim"_#vim "xmgrace"_#xmgrace :ul -:line :line amber2lmp tool :h3,link(amber) diff --git a/doc/src/compute_chunk_atom.txt b/doc/src/compute_chunk_atom.txt index 95e6e6c010..e76b51e6ec 100644 --- a/doc/src/compute_chunk_atom.txt +++ b/doc/src/compute_chunk_atom.txt @@ -134,7 +134,6 @@ timesteps it specifies, while it accumulates per-chunk averages. The details are described below. -:line :line The different chunk styles operate as follows. For each style, how it @@ -294,7 +293,6 @@ invoke other computes, fixes, or variables when they are evaluated, so this is a very general means of generating per-atom quantities to treat as a chunk ID. -:line :line Normally, {Nchunk} = the number of chunks, is re-calculated every time @@ -322,7 +320,6 @@ the same compute chunk/atom compute. However, the time windows they induce for holding {Nchunk} constant must be identical, else an error will be generated. -:line :line The various optional keywords operate as follows. Note that some of diff --git a/doc/src/dump_modify.txt b/doc/src/dump_modify.txt index 73107d07f7..98bcbc5e55 100644 --- a/doc/src/dump_modify.txt +++ b/doc/src/dump_modify.txt @@ -133,7 +133,6 @@ dump_modify option below is valid for the {atom} style, it is also valid for the {atom/mpiio} style, and similarly for the other styles which allow for use of MPI-IO. -:line :line These keywords apply to various dump styles, including the "dump @@ -629,7 +628,6 @@ the coordinate would be if it had not been wrapped back into the periodic box. Note that these coordinates may thus be far outside the box size stored with the snapshot. -:line :line These keywords apply only to the "dump image"_dump_image.html and @@ -894,7 +892,6 @@ frame rate higher than 24 is not recommended, as it will result in simply dropping the rendered images. It is more efficient to dump images less frequently. 
-:line :line [Restrictions:] none diff --git a/doc/src/fix_box_relax.txt b/doc/src/fix_box_relax.txt index 8e21ec2c74..29ebeaeef3 100644 --- a/doc/src/fix_box_relax.txt +++ b/doc/src/fix_box_relax.txt @@ -126,8 +126,6 @@ minimizer from the new adjusted box size/shape, since that creates a new objective function valid for the new box size/shape. Repeat as necessary until the box size/shape has reached its new equilibrium. -:line -:line :line The {couple} keyword allows two or three of the diagonal components of diff --git a/lib/gpu/Install.py b/lib/gpu/Install.py index 13d7ad157e..3b12db5091 100644 --- a/lib/gpu/Install.py +++ b/lib/gpu/Install.py @@ -26,12 +26,13 @@ optionally copies Makefile.auto to a new Makefile.osuffix -h = set CUDA_HOME variable in Makefile.auto to hdir hdir = path to NVIDIA Cuda software, e.g. /usr/local/cuda -a = set CUDA_ARCH variable in Makefile.auto to arch - use arch = 20 for Tesla C2050/C2070 (Fermi) (deprecated as of CUDA 8.0) + use arch = 20 for Fermi (C2050/C2070, deprecated as of CUDA 8.0) or GeForce GTX 580 or similar - use arch = 30 for Tesla K10 (Kepler) - use arch = 35 for Tesla K40 (Kepler) or GeForce GTX Titan or similar - use arch = 37 for Tesla dual K80 (Kepler) - use arch = 60 for Tesla P100 (Pascal) + use arch = 30 for Kepler (K10) + use arch = 35 for Kepler (K40) or GeForce GTX Titan or similar + use arch = 37 for Kepler (dual K80) + use arch = 60 for Pascal (P100) + use arch = 70 for Volta -p = set CUDA_PRECISION variable in Makefile.auto to precision use precision = double or mixed or single -e = set EXTRAMAKE variable in Makefile.auto to Makefile.lammps.esuffix diff --git a/src/KSPACE/fft3d.cpp b/src/KSPACE/fft3d.cpp index 6da7f197ee..db44feabcc 100644 --- a/src/KSPACE/fft3d.cpp +++ b/src/KSPACE/fft3d.cpp @@ -14,7 +14,7 @@ /* ---------------------------------------------------------------------- Contributing authors: Jim Shepherd (GA Tech) added SGI SCSL support Axel Kohlmeyer (Temple U) added support for - FFTW3, 
KISSFFT, Dfti/MKL, and ACML. + FFTW3, KISS FFT, Dfti/MKL, and ACML. Phil Blood (PSC) added single precision FFT. Paul Coffman (IBM) added MPI collectives remap ------------------------------------------------------------------------- */ @@ -26,7 +26,7 @@ #include "fft3d.h" #include "remap.h" -#ifdef FFT_KISSFFT +#ifdef FFT_KISS /* include kissfft implementation */ #include "kissfft.h" #endif diff --git a/src/KSPACE/fft3d.h b/src/KSPACE/fft3d.h index 9a9caaef26..ab3bca8358 100644 --- a/src/KSPACE/fft3d.h +++ b/src/KSPACE/fft3d.h @@ -24,8 +24,8 @@ typedef float FFT_SCALAR; typedef double FFT_SCALAR; #endif - // set default fftw library. switch to FFT_FFTW3 when convenient. + #ifdef FFT_FFTW #define FFT_FFTW3 #endif @@ -57,8 +57,9 @@ typedef fftwf_complex FFT_DATA; #else /* use a stripped down version of kiss fft as default fft */ -#ifndef FFT_KISSFFT -#define FFT_KISSFFT + +#ifndef FFT_KISS +#define FFT_KISS #endif #define kiss_fft_scalar float typedef struct { @@ -97,8 +98,8 @@ typedef fftw_complex FFT_DATA; #else /* use a stripped down version of kiss fft as default fft */ -#ifndef FFT_KISSFFT -#define FFT_KISSFFT +#ifndef FFT_KISS +#define FFT_KISS #endif #define kiss_fft_scalar double typedef struct { @@ -152,7 +153,7 @@ struct fft_plan_3d { FFTW_API(plan) plan_mid_backward; FFTW_API(plan) plan_slow_forward; FFTW_API(plan) plan_slow_backward; -#elif defined(FFT_KISSFFT) +#elif defined(FFT_KISS) kiss_fft_cfg cfg_fast_forward; kiss_fft_cfg cfg_fast_backward; kiss_fft_cfg cfg_mid_forward; diff --git a/src/KSPACE/kissfft.h b/src/KSPACE/kissfft.h index 4e15f494a9..c95b648dcb 100644 --- a/src/KSPACE/kissfft.h +++ b/src/KSPACE/kissfft.h @@ -13,6 +13,7 @@ changes 2008-2011 by Axel Kohlmeyer <akohlmey@gmail.com> */ + #ifndef LMP_FFT_KISSFFT #define LMP_FFT_KISSFFT diff --git a/src/pack.h b/src/pack.h index 066535f5c9..837c33d14b 100644 --- a/src/pack.h +++ b/src/pack.h @@ -22,9 +22,8 @@ struct pack_plan_3d { int nqty; // # of values/element }; - -#if 
!defined(PACK_POINTER) && !defined(PACK_MEMCPY) -#define PACK_ARRAY +#if !defined(FFT_PACK_POINTER) && !defined(FFT_PACK_MEMCPY) +#define FFT_PACK_ARRAY #endif #ifndef PACK_DATA @@ -47,7 +46,7 @@ struct pack_plan_3d { pack/unpack with array indices ------------------------------------------------------------------------- */ -#ifdef PACK_ARRAY +#ifdef FFT_PACK_ARRAY /* ---------------------------------------------------------------------- pack from data -> buf @@ -274,7 +273,7 @@ static void unpack_3d_permute2_n(PACK_DATA *buf, PACK_DATA *data, struct pack_pl pack/unpack with pointers ------------------------------------------------------------------------- */ -#ifdef PACK_POINTER +#ifdef FFT_PACK_POINTER /* ---------------------------------------------------------------------- pack from data -> buf @@ -523,7 +522,7 @@ static void unpack_3d_permute2_n(PACK_DATA *buf, PACK_DATA *data, struct pack_pl just use PACK_POINTER versions ------------------------------------------------------------------------- */ -#ifdef PACK_MEMCPY +#ifdef FFT_PACK_MEMCPY /* ---------------------------------------------------------------------- pack from data -> buf -- GitLab