# lammps

**Information :**
[HiPAC](https://event1.nchc.org.tw/2025/hipac/details.html)
[NCHC Cup info](https://ndrive.niar.org.tw/navigate/a/#/s/96650889CBB94A48AA0049D88FE6B2AE6BL)
[LAMMPS molecular dynamics software notes](https://drive.google.com/file/d/1gpxa9F2-YfGNRW2c-OcLe0f1_pYHzae4/view?usp=drive_link)
[LAMMPS official site](https://www.lammps.org)
[LAMMPS manual](https://docs.lammps.org)
[Download](https://www.lammps.org/download.html)
[TEAM1](https://drive.google.com/drive/folders/1FZD3ZC0eDzNQ5MUbt7k_G9wLrZOhyOcr)

**Reference :**
https://hackmd.io/@isc21/BJquuUbu_#processes-and-thread-affinity
https://hackmd.io/@isc21/rJY5v-AZO
https://www.hpc-carpentry.org/tuning_lammps/index.html

## GCC lmp_gcc

```bash
# GCC 10
module purge
module load gcc10/10.2.1
module load cuda/12.8
module load openmpi/4.1.6
module load intel/2020

cd ~/lammps/lammps-29Aug2024/
mkdir build_gcc10 && cd build_gcc10

cmake \
  -D CMAKE_C_COMPILER=$(which mpicc) \
  -D CMAKE_CXX_COMPILER=$(which mpicxx) \
  -D BUILD_MPI=yes \
  -D BUILD_OMP=yes \
  -D PKG_OPENMP=yes \
  -D PKG_MANYBODY=yes \
  -D PKG_KSPACE=yes \
  -D PKG_MOLECULE=yes \
  -D PKG_RIGID=yes \
  -D PKG_COMPRESS=yes \
  -D FFT_SINGLE=yes \
  -D BUILD_SHARED_LIBS=ON \
  -D CMAKE_INSTALL_PREFIX=$HOME/.local/lammps_gcc10 \
  ../cmake

make -j 16
make install

echo 'export PATH=$HOME/.local/lammps_gcc10/bin:$PATH' >> ~/.bashrc
source ~/.bashrc
```

## INTEL lmp_intel

```bash
# Intel 2020
module purge
module load intel/2020
module load cuda/12.8
module load openmpi/4.1.6

cd ~/lammps/lammps-29Aug2024/
mkdir build_intel2020 && cd build_intel2020

cmake \
  -D CMAKE_C_COMPILER=$(which mpicc) \
  -D CMAKE_CXX_COMPILER=$(which mpicxx) \
  -D BUILD_MPI=yes \
  -D BUILD_OMP=yes \
  -D PKG_OPENMP=yes \
  -D PKG_MANYBODY=yes \
  -D PKG_KSPACE=yes \
  -D PKG_MOLECULE=yes \
  -D PKG_RIGID=yes \
  -D PKG_COMPRESS=yes \
  -D FFT_SINGLE=yes \
  -D BUILD_SHARED_LIBS=ON \
  -D CMAKE_INSTALL_PREFIX=$HOME/.local/lammps_intel2020 \
  ../cmake

make -j 16
make install

echo 'export PATH=$HOME/.local/lammps_intel2020/bin:$PATH' >> ~/.bashrc
source ~/.bashrc
```

## NVHPC lmp

```bash
module purge
module load nvhpc-24.11_hpcx-2.14_cuda-11.8
module load cuda/12.8

cd ~/lammps/lammps-29Aug2024/
mkdir build_nvhpc24 && cd build_nvhpc24

cmake \
  -D CMAKE_C_COMPILER=$(which mpicc) \
  -D CMAKE_CXX_COMPILER=$(which mpicxx) \
  -D CMAKE_CUDA_HOST_COMPILER=$(which nvc++) \
  -D CUDA_HOST_COMPILER=$(which nvc++) \
  -D BUILD_MPI=yes \
  -D BUILD_OMP=yes \
  -D GPU_API=cuda \
  -D GPU_ARCH=sm_70 \
  -D PKG_KOKKOS=ON \
  -D Kokkos_ARCH_NATIVE=ON \
  -D Kokkos_ARCH_VOLTA70=ON \
  -D Kokkos_ENABLE_CUDA=ON \
  -D Kokkos_ENABLE_OPENMP=ON \
  -D PKG_OPENMP=yes \
  -D PKG_MANYBODY=yes \
  -D PKG_KSPACE=yes \
  -D PKG_MOLECULE=yes \
  -D PKG_RIGID=yes \
  -D PKG_COMPRESS=yes \
  -D FFT_SINGLE=yes \
  -D BUILD_SHARED_LIBS=ON \
  -D CMAKE_INSTALL_PREFIX=$HOME/.local/lammps_nvhpc24 \
  ../cmake

make -j 16
make install

echo 'export PATH=$HOME/.local/lammps_nvhpc24/bin:$PATH' >> ~/.bashrc
source ~/.bashrc
```

---

### sbatch

```shell
#!/bin/bash
#SBATCH --job-name=<job_name>
#SBATCH --output=<job_name>%j.out
#SBATCH --error=<job_name>%j.err
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=16
#SBATCH --cpus-per-task=1
#SBATCH --gpus-per-node=8
#SBATCH --time=01:00:00
#SBATCH --partition=gp2d
#SBATCH --account=ACD110018
```

---

### Commands

```bash
# list your queued/running jobs
squeue -u <account>
```

```bash
# show accounting info for a finished job
sacct -j <jobid>
```

```bash
# convert Windows (CRLF) line endings to Unix before submitting a script
dos2unix <.sh>
```

---

> pair style `lj/cut`

#### MPI (CPU only)

```bash
mpirun -np 8 lmp -in $INPUTFILE
```

#### OpenMP + MPI

```bash
mpirun -np 8 lmp -sf omp -pk omp 4 -in $INPUTFILE
```

> pair style `lj/cut/gpu`

#### GPU

```bash
mpirun -np 8 lmp -sf gpu -pk gpu 8 -in $INPUTFILE
```
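Putting the pieces together, a minimal sketch of a complete job script that wraps one of the run lines above in the sbatch header shown earlier. The binary name `lmp`, the input file `in.melt`, and the 8-rank/8-GPU layout are assumptions; adjust the `-sf`/`-pk` flags to match the pair style used in the input.

```bash
#!/bin/bash
#SBATCH --job-name=lmp_run
#SBATCH --output=lmp_run_%j.out
#SBATCH --error=lmp_run_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=1
#SBATCH --gpus-per-node=8
#SBATCH --time=01:00:00
#SBATCH --partition=gp2d
#SBATCH --account=ACD110018

module purge
module load nvhpc-24.11_hpcx-2.20_cuda-12.6
module load cuda/12.8

# one MPI rank per GPU; assumes the input uses lj/cut/gpu
INPUTFILE=in.melt
mpirun -np 8 lmp -sf gpu -pk gpu 8 -in $INPUTFILE
```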
> pair style `lj/cut/kk`

#### Kokkos (GPU)

```bash
mpirun -np 8 lmp -sf kk -k on g 8 -in $INPUTFILE
```

#### Kokkos (OMP)

```bash
mpirun -np 8 lmp -sf kk -k on t 4 -in $INPUTFILE
```

## MD

[csv](https://docs.google.com/spreadsheets/d/1i_n_qYDdEUQ-gkFpBfe5IR0dufAYljAcHHQr6AZ3-5w/edit?hl=zh-tw&gid=0#gid=0)

### baseline

| blocksize | atom     | method       | run | timestep | skin | every | check |
| --------- | -------- | ------------ | --- | -------- | ---- | ----- | ----- |
| 200       | 32000000 | 8 rank 8 GPU | 250 | 0.001    | 0.1  | 1     | no    |

![image](https://hackmd.io/_uploads/Hk5pwfp8ge.png)

> Temp : 1.6712881
> TotEng : -2.2800574

### blocksize

:::spoiler blocksize_test.sh
```bash
#!/bin/bash
#SBATCH --job-name=lmp_melt_kokkos
#SBATCH --output=lmp_melt_kokkos_%j.out
#SBATCH --error=lmp_melt_kokkos_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=2
#SBATCH --gpus-per-node=8
#SBATCH --time=10:00:00
#SBATCH --partition=gp2d
#SBATCH --account=ACD110018

module purge
module load nvhpc-24.11_hpcx-2.20_cuda-12.6
module load cuda/12.8

export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
export OMP_PROC_BIND=spread
export OMP_PLACES=threads

# ====================================
# Generate input scripts for each box size
# ====================================
for BOXSIZE in 100 150 200 250 300 350 400
do

cat << EOF > in.melt_${BOXSIZE}
# 3d Lennard-Jones melt with box size ${BOXSIZE}

units lj
atom_style atomic

lattice fcc 0.8442
region box block 0 ${BOXSIZE} 0 ${BOXSIZE} 0 ${BOXSIZE}
create_box 1 box
create_atoms 1 box
mass 1 1.0

velocity all create 3.0 87287 loop geom

pair_style lj/cut/kk 2.5
pair_coeff 1 1 1.0 1.0 2.5

neighbor 0.3 bin
neigh_modify every 20 delay 0 check no

fix 1 all nve

thermo 50
run 250
EOF

# ====================================
# Run LAMMPS with mpirun + kokkos GPU
# ====================================
echo "============================="
echo "Running box size: ${BOXSIZE}"
echo "============================="

mpirun -np 8 lmp -sf kk -k on g 8 -in in.melt_${BOXSIZE} > out.melt_${BOXSIZE}

# ====================================
# Extract atom count and performance
# ====================================
ATOMS=$(grep "Created" out.melt_${BOXSIZE} | awk '{print $8}')
PERF=$(grep "Performance:" out.melt_${BOXSIZE} | awk '{print $(NF-1),$NF}')

echo "Box size ${BOXSIZE}: ${ATOMS} atoms, Performance: ${PERF}" >> performance_scaling.txt

done
```
:::

![image](https://hackmd.io/_uploads/By9mWfaIle.png)

> 32000000 atoms, blocksize 200
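The atom counts follow directly from the lattice: `lattice fcc` places 4 atoms per unit cell, so a blocksize of B cells per side gives 4·B³ atoms. A quick sanity check of the 200-cell baseline, as plain bash arithmetic:

```bash
# fcc lattice: 4 atoms per unit cell, BOXSIZE cells per side
BOXSIZE=200
echo "$((4 * BOXSIZE ** 3)) atoms"   # 32000000, matching the baseline above
```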
### neigh skin every timesteps

:::spoiler **md_acc_test.sh**
```bash
#!/bin/bash
#SBATCH --job-name=md_acc_test
#SBATCH --output=md_acc_test_%j.out
#SBATCH --error=md_acc_test_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=2
#SBATCH --gpus-per-node=8
#SBATCH --time=05:00:00
#SBATCH --partition=gp2d
#SBATCH --account=ACD110018

module purge
module load nvhpc-24.11_hpcx-2.20_cuda-12.6
module load cuda/12.8

export LD_LIBRARY_PATH=/home/pkboie1019/.local/lammps_nvhpc24/lib64:$LD_LIBRARY_PATH
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
export OMP_PROC_BIND=spread
export OMP_PLACES=threads

# ====================================
# Define test parameter sets
# ====================================
declare -a TIMESTEPS=("0.001" "0.002" "0.005")
declare -a SKINS=("0.1" "0.2" "0.3" "0.5" "1.0" "2.0")
declare -a EVERYS=("1" "10" "20" "50" "100" "200" "500")

BOXSIZE=200

for TS in "${TIMESTEPS[@]}"
do
for SKIN in "${SKINS[@]}"
do
for EVERY in "${EVERYS[@]}"
do

INPUT=in.melt_${BOXSIZE}_ts${TS}_skin${SKIN}_every${EVERY}

cat << EOF > $INPUT
units lj
atom_style atomic

lattice fcc 0.8442
region box block 0 ${BOXSIZE} 0 ${BOXSIZE} 0 ${BOXSIZE}
create_box 1 box
create_atoms 1 box
mass 1 1.0

velocity all create 3.0 87287 loop geom

pair_style lj/cut/kk 2.5
pair_coeff 1 1 1.0 1.0 2.5

neighbor ${SKIN} bin
neigh_modify every ${EVERY} delay 0 check no

timestep ${TS}

fix 1 all nve

thermo 50
run 250
EOF

echo "============================="
echo "Running: ts=${TS}, skin=${SKIN}, every=${EVERY}"
echo "============================="

mpirun -np 8 lmp -sf kk -k on g 8 -in $INPUT > out.${INPUT}

# Check lost atoms
LOST=$(grep "Lost atoms" out.${INPUT})
if [ -z "$LOST" ]; then
    STATUS="OK"
else
    STATUS="LOST ATOMS"
fi

# Extract created atoms
ATOMS=$(grep "Created" out.${INPUT} | grep -o '[0-9]\+' | head -n 1)

# Extract performance
TAUDAY=$(grep "Performance:" out.${INPUT} | awk '{print $2}')
TIMESTEPPS=$(grep "Performance:" out.${INPUT} | awk '{print $4}')
GATOMSTEP=$(grep "Performance:" out.${INPUT} | awk '{print $6}')

# Write to summary file
echo "ts=${TS}, skin=${SKIN}, every=${EVERY}, atoms=${ATOMS}, runsteps=250, status=${STATUS}, tau/day=${TAUDAY}, timestep/s=${TIMESTEPPS}, Gatom-step/s=${GATOMSTEP}" >> performance_accuracy_test.txt

done
done
done
```
:::

```bash
# Variables swept
declare -a TIMESTEPS=("0.001" "0.002" "0.005")
declare -a SKINS=("0.1" "0.2" "0.3" "0.5" "1.0" "2.0")
declare -a EVERYS=("1" "10" "20" "50" "100" "200" "500")
```

**tau/day**
![tau](https://hackmd.io/_uploads/HkuzzGTLex.png)

**Gatom-step/s**
![ts](https://hackmd.io/_uploads/rJ6zfzaUlx.png)

**measurement error**
![diff](https://hackmd.io/_uploads/rJM-plRUlg.png)
![diff_log](https://hackmd.io/_uploads/ByTwTeRIex.png)
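One way to pick candidates out of such a sweep is to drop every run that lost atoms and rank the rest by throughput. A minimal sketch, assuming the `performance_accuracy_test.txt` line format written by the script above:

```bash
# keep runs that did not lose atoms, rank by Gatom-step/s (highest first), show the top 5
grep "status=OK" performance_accuracy_test.txt \
  | awk -F'Gatom-step/s=' '{print $2 "\t" $0}' \
  | sort -g -r \
  | cut -f2- \
  | head -n 5
```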
### every delay check (yes/no)

The value of `delay` must be `0` or a multiple of `every`.

:::spoiler md_delay.sh
```bash
#!/bin/bash
#SBATCH --job-name=p_md_acc_test
#SBATCH --output=p_md_acc_test_%j.out
#SBATCH --error=p_md_acc_test_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=2
#SBATCH --gpus-per-node=8
#SBATCH --time=05:00:00
#SBATCH --partition=gp2d
#SBATCH --account=ACD110018

module purge
module load nvhpc-24.11_hpcx-2.20_cuda-12.6
module load cuda/12.8

export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
export OMP_PROC_BIND=spread
export OMP_PLACES=threads

# ========================
# Parameters to test
# ========================
TS="0.001"
SKIN="0.5"
declare -a EVERYS=("1" "10" "20" "50" "100" "200" "500")
declare -a DELAYS=("0" "1" "5" "10" "20" "50" "100" "200" "500")
declare -a CHECKS=("no" "yes")

BOXSIZE=200

for EVERY in "${EVERYS[@]}"
do
for DELAY in "${DELAYS[@]}"
do

# Skip invalid delay combinations
if [ $((DELAY % EVERY)) -ne 0 ]; then
    continue
fi

for CHECK in "${CHECKS[@]}"
do

INPUT=in.melt_${BOXSIZE}_ts${TS}_skin${SKIN}_every${EVERY}_delay${DELAY}_check${CHECK}

cat << EOF > $INPUT
units lj
atom_style atomic

lattice fcc 0.8442
region box block 0 ${BOXSIZE} 0 ${BOXSIZE} 0 ${BOXSIZE}
create_box 1 box
create_atoms 1 box
mass 1 1.0

velocity all create 3.0 87287 loop geom

pair_style lj/cut/kk 2.5
pair_coeff 1 1 1.0 1.0 2.5

neighbor ${SKIN} bin
neigh_modify every ${EVERY} delay ${DELAY} check ${CHECK}

timestep ${TS}

fix 1 all nve

thermo 50
run 250
EOF

echo "============================="
echo "Running: every=${EVERY}, delay=${DELAY}, check=${CHECK}"
echo "============================="

mpirun -np 8 lmp -sf kk -k on g 8 -in $INPUT > out.${INPUT}

# Check lost atoms
LOST=$(grep "Lost atoms" out.${INPUT})
if [ -z "$LOST" ]; then
    STATUS="OK"
else
    STATUS="LOST ATOMS"
fi

# Extract created atoms
ATOMS=$(grep "Created" out.${INPUT} | grep -o '[0-9]\+' | head -n 1)

# Extract performance
TAUDAY=$(grep "Performance:" out.${INPUT} | awk '{print $2}')
TIMESTEPPS=$(grep "Performance:" out.${INPUT} | awk '{print $4}')
GATOMSTEP=$(grep "Performance:" out.${INPUT} | awk '{print $6}')

# Write to summary file
echo "ts=${TS}, skin=${SKIN}, every=${EVERY}, delay=${DELAY}, check=${CHECK}, atoms=${ATOMS}, runsteps=250, status=${STATUS}, tau/day=${TAUDAY}, timestep/s=${TIMESTEPPS}, Gatom-step/s=${GATOMSTEP}" >> performance_accuracy_test.txt

done
done
done
```
:::

```bash
# Variables swept
TS="0.001"
SKIN="0.5"
declare -a EVERYS=("1" "10" "20" "50" "100" "200" "500")
declare -a DELAYS=("0" "1" "5" "10" "20" "50" "100" "200" "500")
declare -a CHECKS=("no" "yes")
```

**Gatom-step/s**
![delay](https://hackmd.io/_uploads/BJ5vyWHwxe.png)

### N/A :x:

1. Temp and TotEng deviate too far from the baseline: the energy blows up.
![image](https://hackmd.io/_uploads/B15FDMpLxe.png)
2. Lost atoms: atoms moved out of their domain.
![image](https://hackmd.io/_uploads/B1pOdGpLeg.png)

#### For good performance

- Smaller skin is faster (but if skin is too small, accuracy becomes unstable).
- Larger every is faster (beyond a certain point it makes little difference).
- Larger timestep is faster (but too large easily fails).
- check (yes/no): no is marginally faster, but yes is less likely to fail.
- delay: usually larger is better (the sweet spot depends on every).

#### Accuracy

Temperature and total energy must not drift too far from the baseline.

### run_script

- timestep
- skin
- every
- delay (0 or multiples of every)
- check (yes / no)
- blocksize
- runstep

```bash
#!/bin/bash
#SBATCH --job-name=p_md_acc_test
#SBATCH --output=p_md_acc_test_%j.out
#SBATCH --error=p_md_acc_test_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=2
#SBATCH --gpus-per-node=8
#SBATCH --time=05:00:00
#SBATCH --partition=gp2d
#SBATCH --account=ACD110018

module purge
module load nvhpc-24.11_hpcx-2.20_cuda-12.6
module load cuda/12.8

export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
export OMP_PROC_BIND=spread
export OMP_PLACES=threads

# ========================
# Custom Parameters
# ========================
declare -a TS_LIST=("0.001")                      # timestep
declare -a SKINS=("0.1" "0.2")                    # skin distance
declare -a EVERYS=("50" "100" "200" "250")        # neighbor every
declare -a DELAYS=("0" "50" "100" "200" "250")    # neighbor delay
declare -a CHECKS=("yes" "no")                    # check
declare -a BOXSIZES=("200")                       # box size
declare -a RUNSTEPS=("250")                       # run steps

# ========================
# Begin Testing Loop
# ========================
for TS in "${TS_LIST[@]}"; do
for SKIN in "${SKINS[@]}"; do
for BOXSIZE in "${BOXSIZES[@]}"; do
for RUNSTEP in "${RUNSTEPS[@]}"; do
for EVERY in "${EVERYS[@]}"; do
for DELAY in "${DELAYS[@]}"; do

# delay must be 0 or a multiple of every
if [ $((DELAY % EVERY)) -ne 0 ]; then
    continue
fi

for CHECK in "${CHECKS[@]}"; do

INPUT=in.melt_${BOXSIZE}_ts${TS}_skin${SKIN}_every${EVERY}_delay${DELAY}_check${CHECK}_run${RUNSTEP}

cat << EOF > $INPUT
units lj
atom_style atomic

lattice fcc 0.8442
region box block 0 ${BOXSIZE} 0 ${BOXSIZE} 0 ${BOXSIZE}
create_box 1 box
create_atoms 1 box
mass 1 1.0

velocity all create 3.0 87287 loop geom

pair_style lj/cut/kk 2.5
pair_coeff 1 1 1.0 1.0 2.5

neighbor ${SKIN} bin
neigh_modify every ${EVERY} delay ${DELAY} check ${CHECK}

timestep ${TS}

fix 1 all nve

thermo 250
run ${RUNSTEP}
EOF

echo "============================="
echo "Running: ts=${TS}, skin=${SKIN}, every=${EVERY}, delay=${DELAY}, check=${CHECK}, box=${BOXSIZE}, run=${RUNSTEP}"
echo "============================="

mpirun -np 8 lmp -sf kk -k on g 8 -in $INPUT > out.${INPUT}

# Check lost atoms
LOST=$(grep "Lost atoms" out.${INPUT})
if [ -z "$LOST" ]; then
    STATUS="OK"
else
    STATUS="LOST ATOMS"
fi

# Extract atoms created
ATOMS=$(grep "Created" out.${INPUT} | grep -o '[0-9]\+' | head -n 1)

# Extract performance
TAUDAY=$(grep "Performance:" out.${INPUT} | awk '{print $2}')
TIMESTEPPS=$(grep "Performance:" out.${INPUT} | awk '{print $4}')
GATOMSTEP=$(grep "Performance:" out.${INPUT} | awk '{print $6}')

# Save to summary
echo "ts=${TS}, skin=${SKIN}, every=${EVERY}, delay=${DELAY}, check=${CHECK}, atoms=${ATOMS}, runsteps=${RUNSTEP}, status=${STATUS}, tau/day=${TAUDAY}, timestep/s=${TIMESTEPPS}, Gatom-step/s=${GATOMSTEP}" >> performance_accuracy_test.txt

done
done
done
done
done
done
done
```
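After a sweep it also helps to check accuracy, not just speed, by comparing the final thermo line of each run against the baseline (Temp 1.6712881, TotEng -2.2800574). A sketch, assuming the default `thermo_style one` column order (Step Temp E_pair E_mol TotEng Press) and the `out.in.melt_*` log names used above:

```bash
# Baseline reference values from the table above
BASE_TEMP=1.6712881
BASE_TOTENG=-2.2800574

for LOG in out.in.melt_*
do
    # the line just before "Loop time" is the last thermo row of the run
    FINAL=$(grep -B1 "^Loop time" "$LOG" | head -n 1)
    TEMP=$(echo "$FINAL" | awk '{print $2}')
    TOTENG=$(echo "$FINAL" | awk '{print $5}')
    echo "$LOG  Temp=$TEMP (baseline $BASE_TEMP)  TotEng=$TOTENG (baseline $BASE_TOTENG)"
done
```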
"Performance:" out.${INPUT} | awk '{print $4}') GATOMSTEP=$(grep "Performance:" out.${INPUT} | awk '{print $6}') # Save to summary echo "ts=${TS}, skin=${SKIN}, every=${EVERY}, delay=${DELAY}, check=${CHECK}, atoms=${ATOMS}, runsteps=${RUNSTEP}, status=${STATUS}, tau/day=${TAUDAY}, timestep/s=${TIMESTEPPS}, Gatom-step/s=${GATOMSTEP}" >> performance_accuracy_test.txt done done done done done done done ``` # Strategy ```bash Performance: 10075.215 tau/day, 122.749 timesteps/s, 3.928 Gatom-step/s 97.4% CPU use with 8 MPI tasks x 1 OpenMP threads MPI task timing breakdown: Section | min time | avg time | max time |%varavg| %total --------------------------------------------------------------- Pair | 0.033339 | 0.034016 | 0.034344 | 0.2 | 1.67 Neigh | 0.09415 | 0.10065 | 0.10867 | 1.5 | 4.94 Comm | 0.72955 | 0.73304 | 0.73525 | 0.2 | 35.99 Output | 0.00026695 | 0.010562 | 0.018416 | 5.5 | 0.52 Modify | 1.0641 | 1.0664 | 1.0697 | 0.2 | 52.36 Other | | 0.09199 | | | 4.52 ``` | Section | 意義 | 包含內容 | 代表什麼時間 | |:----------:|:------------------------:|:------------------------------------------------------:|:------------------------------------------------------:| | **Pair** | Pairwise 力計算 | 計算原子間兩兩交互作用力,如 Lennard-Jones、Coulomb 等 | 和 `pair_style` 有關,是大多數分子動力學計算的主力 | | **Neigh** | Neighbor List 建立 | 建立/更新每個原子的鄰近原子清單 | 和 `neighbor` 距離、`neigh_modify` 更新頻率有關 | | **Comm** | 通訊(MPI 間的資料交換) | 各 MPI Rank 間原子位置、力、速度的傳遞 | 和使用的平行模式、分割區域(domain decomposition)有關 | | **Output** | 輸出操作 | 包括 `thermo`, `dump`, `log` 的輸出時間 | 若很高代表太頻繁輸出 | | **Modify** | fix 與 compute 操作 | 如 `fix nve`, `fix langevin`, `compute temperature` 等 | fix 運算佔比高表示模型中 fix 的耗時操作多 | | **Other** | 其他時間 | 通常是初始化、同步、內部管理、閒置等待等 | 無法分類的時間,尤其在 load imbalance 時會高 | --- # temp ## might helpful ### neigh and newton For KOKKOS/GPU, the default is `neigh = full` and `newton = off`. For Maxwell and Kepler generations of GPUs, the default settings are typically the best. For Pascal generations, setting `neigh = half` and `newton = on` might produce faster runs. ### skin #### 1. neighbor skin style 的意義 **skin 是指鄰居搜尋範圍的額外距離 (buffer distance)。** >每個 timestep,LAMMPS 不會每次都重建 neighbor list,會先建好一個 list,當粒子移動距離沒有超過 skin 的一半時,就不必重建。 **舉例** >假設你的 cutoff = 10.0 Å,skin = 2.0 Å ➡️ neighbor list 會存 cutoff + skin = 12.0 Å 內的鄰居。 ➡️ 當任一粒子移動距離超過 skin/2 = 1.0 Å 時,LAMMPS 會重新建 neighbor list。 #### 2. skin 對效能與精度的影響 | **skin 大小** | **影響** | | ------------- | ---------------------------------------------------------------------------------------------------------------------- | | **較小** | :heavy_check_mark: neighbor list 中的鄰居較少,force 計算較快<br>:x: 但必須更頻繁地重建 neighbor list,增加 overhead | | **較大** | :heavy_check_mark: 減少重建 neighbor list 的頻率<br>:x: 但每次 force 計算要檢查更多無作用力的 pair,造成浪費 | #### 3. style 的選擇 - **bin** 幾乎所有情況都使用 bin,scaling 為 O(N)。 - **nsq** 小分子系統 (non-periodic),N < 500 atoms,可試試。 - **multi** 若有粒子大小差異極大的 granular or colloidal 系統,可搭配 neigh_modify 做最佳化。 ### neighbor every #### 1. neigh_modify every 的語法與意義 **基本語法** ```bash # M = 嘗試每 M timesteps 檢查一次是否需要重建 neighbor list。 neigh_modify every M ``` **運作邏輯 (搭配 delay, check)** - every M 從上次 build 後,每 M step 允許 LAMMPS 檢查是否需要 rebuild。 - delay N 從上次 build neighbor list 後,至少過 N 個 time steps 才允許再次 rebuild。 - check yes/no yes ➔ 只有當有 atom 移動距離 > skin/2 時才真的 rebuild。 no ➔ 達到 every 與 delay 條件後,就 強制 rebuild。 **舉例說明** ```bash # 預設最保守設定,精度最安全。 # 每 step (every=1) 都檢查是否需要 rebuild。 # delay=0 ➔ 不延遲,隨時可 rebuild。 # check=yes ➔ 只有當 atom 移動超過 skin/2 才 rebuild。 neighbor 2.0 bin neigh_modify every 1 delay 0 check yes ``` #### 2. 
---

# temp

## might be helpful

### neigh and newton

For KOKKOS/GPU, the default is `neigh = full` and `newton = off`. For Maxwell and Kepler generations of GPUs, the default settings are typically the best. For Pascal generations, setting `neigh = half` and `newton = on` might produce faster runs.

### skin

#### 1. What the neighbor skin is

**The skin is an extra buffer distance added to the neighbor search range.**

> LAMMPS does not rebuild the neighbor list at every timestep. It builds a list once, and as long as no particle has moved more than half the skin distance, the list does not need to be rebuilt.

**Example**
> Suppose the cutoff is 10.0 Å and the skin is 2.0 Å
> ➡️ the neighbor list stores all neighbors within cutoff + skin = 12.0 Å.
> ➡️ once any particle has moved more than skin/2 = 1.0 Å, LAMMPS rebuilds the neighbor list.

#### 2. How skin affects performance and accuracy

| **skin** | **Effect** |
| ------------- | ---------------------------------------------------------------------------------------------------------------------- |
| **smaller** | :heavy_check_mark: fewer neighbors in the list, so force computation is faster<br>:x: but the neighbor list must be rebuilt more often, adding overhead |
| **larger** | :heavy_check_mark: fewer neighbor-list rebuilds<br>:x: but every force computation has to check more non-interacting pairs, which is wasted work |

#### 3. Choosing the style

- **bin**: use it in almost every case; it scales as O(N).
- **nsq**: worth trying for small, non-periodic systems with N < 500 atoms.
- **multi**: for granular or colloidal systems with very different particle sizes, combine it with neigh_modify for best results.

### neighbor every

#### 1. Syntax and meaning of neigh_modify every

**Basic syntax**
```bash
# M = attempt to check whether the neighbor list needs rebuilding every M timesteps.
neigh_modify every M
```

**How it works (together with delay, check)**
- every M: after the last build, LAMMPS is allowed to check for a rebuild every M steps.
- delay N: after the last neighbor-list build, at least N timesteps must pass before another rebuild is allowed.
- check yes/no:
  yes ➔ only actually rebuild when some atom has moved more than skin/2.
  no ➔ once the every and delay conditions are met, rebuild unconditionally.

**Example**
```bash
# Default and most conservative setting; safest for accuracy.
# Check every step (every=1) whether a rebuild is needed.
# delay=0 ➔ no delay, a rebuild is allowed at any time.
# check=yes ➔ only rebuild when an atom has moved more than skin/2.
neighbor 2.0 bin
neigh_modify every 1 delay 0 check yes
```

#### 2. How every affects accuracy and performance

| **every** | **Effect** |
| ---------------- | ------------------------------------------------------------------------------------- |
| **small (e.g. 1)** | :heavy_check_mark: checks constantly, best accuracy<br>:x: may cause frequent neighbor rebuilds and extra overhead |
| **large (e.g. 10)** | :heavy_check_mark: fewer rebuild checks, better performance<br>:x: if atoms move fast, a needed rebuild can be missed ➔ **wrong forces, the simulation blows up** |

#### 3. Practical recommendations

**For accuracy (conservative)**
```bash
# Check every step
# Only rebuild when an atom has moved more than skin/2
# Suitable for: hot, liquid, fast-diffusing systems
neigh_modify every 1 delay 0 check yes
```

**For performance (validate accuracy with a benchmark)**
```bash
# Check every 5 steps
# delay=10 ➔ no rebuild during the first 10 steps after a build
# check=yes ➔ only rebuild when an atom has moved more than skin/2
# ✅ This reduces the number of neighbor builds, cutting MPI communication and rebuild overhead.
neigh_modify every 5 delay 10 check yes
```

**Extreme performance (only when atoms barely move, e.g. a cold crystal)**
```bash
# Rebuild every 10 steps (because check=no)
# delay=100 ➔ no rebuild during the first 100 steps
# ⚠️ Warning: if an atom moves farther than cutoff+skin, the forces will be wrong.
neigh_modify every 10 delay 100 check no
```

#### 4. How to choose the best setting

✅ Method
> Run short benchmarks (e.g. 1000 steps) with different every, delay, check combinations and watch, in the log file:
> the number of neighbor list builds, timesteps/s, and whether the energy drift is acceptable.

✅ Goal
> The fewest neighbor rebuilds and the highest timesteps/s while still conserving energy (e.g. an acceptable energy drift in an NVE system).

### timestep

#### 1. What the timestep means

```bash
# dt = the amount of simulated time each step represents (units set by the units command, e.g. real=fs, metal=ps)
timestep dt
```

> units lj: default dt = 0.005
> units real: default dt = 1.0 femtoseconds
> units metal: default dt = 0.001 picoseconds

#### 2. How the timestep affects performance

> ✅ Performance gain
> A larger timestep ➔ the simulation covers physical time faster.
> For example, to simulate 1 ns:
> timestep=1 fs ➔ 1,000,000 steps
> timestep=2 fs ➔ 500,000 steps
> Half the steps, roughly half the total compute cost.

#### 3. How the timestep affects accuracy and stability

> ❌ A timestep that is too large ➔ the simulation can become unstable.
> Reason: integration schemes (e.g. velocity-Verlet) assume a small timestep.
> Too large a timestep ➔ energy drift grows, temperature/pressure fluctuate too much, and the system can blow up (atom velocities explode and become NaN).

#### 4. timestep: performance vs accuracy

| Size | Effect |
| ----------- | ---------------------------------------------------------------------------- |
| large timestep | ✅ fewer steps ➔ better performance<br>❌ larger integration error ➔ energy drift grows, possible blow-up |
| small timestep | ✅ stable, good energy conservation<br>❌ more steps ➔ longer runtime |

![image](https://hackmd.io/_uploads/Sy_Y3fH8el.png)
![image](https://hackmd.io/_uploads/S1Fo2GHUxg.png)
![image](https://hackmd.io/_uploads/ryaThGHLxg.png)
![image](https://hackmd.io/_uploads/r1iepGH8ee.png)
![image](https://hackmd.io/_uploads/H1mwpGS8gl.png)

```bash
#!/bin/bash
#SBATCH --job-name=lammps_nvhpc
#SBATCH --output=lammps_nvhpc_%j.out
#SBATCH --error=lammps_nvhpc_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=16
#SBATCH --cpus-per-task=1
#SBATCH --gpus-per-node=8
#SBATCH --time=01:00:00
#SBATCH --partition=gp2d
#SBATCH --account=ACD110018

module purge
module load nvhpc-24.11_hpcx-2.20_cuda-12.6
module load cuda/12.8

export LD_LIBRARY_PATH=/home/pkboie1019/.local/lammps_nvhpc24/lib64:$LD_LIBRARY_PATH

mpirun -np 8 lmp_nvhpc -sf kk -k on g 8 -in input.lammps
```

| Item | Kokkos package | GPU package |
|:-------------:|:---------------------------------------------:|:----------------------------------------------:|
| **Run flags** | `-sf kk -k on g N` | `-sf gpu -pk gpu N` |
| **CUDA implementation** | Kokkos CUDA backend | native CUDA kernels |
| **Example command** | `mpirun -np 8 lmp -k on g N -sf kk -in input` | `mpirun -np 8 lmp -sf gpu -pk gpu N -in input` |

| tau/day | timesteps/s | atom-step/s |
|:----------------------------------------------------------------:|:----------------------------:|:-----------------------:|
| How many Lennard-Jones time units (tau) can be simulated in one day (24 h) | How many MD timesteps are executed per second | How many atom-steps are computed per second |

| Metric | Use | Comparable across runs? |
| ---------------- | ------------------- | --------------------------------- |
| **tau/day** | Amount of physical simulation time | Only within the same system |
| **timesteps/s** | Stepping speed | Depends on the atom count; only comparable for the same input |
| **Gatom-step/s** | **Throughput benchmark** | ✅ **best performance metric**, commonly used for HPC scaling plots |
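The throughput metric is tied to the stepping speed by the atom count: Gatom-step/s is simply timesteps/s times the number of atoms, divided by 10⁹. A quick consistency check against the numbers in the Strategy section (a sketch using awk for the arithmetic):

```bash
# Gatom-step/s = timesteps/s * atoms / 1e9
awk 'BEGIN {
  steps_per_s = 122.749   # from the Performance line above
  atoms = 32000000        # baseline system size
  printf "Gatom-step/s = %.3f\n", steps_per_s * atoms / 1e9   # ~3.928, matching the log
}'
```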
**For good performance**

1. Use as many ranks as GPUs; only consider multiple ranks per GPU when the input is very small.
2. Set threads to 1 (unless running a CPU-only backend).
3. Use a CUDA-aware MPI module so GPU Direct RDMA is available.

:::danger
Running input.lammps (2D-LJ) fails with:
lmp: symbol lookup error: /work/HPC_SYS/twnia2/pkg-rocky8/nvidia/hpc_sdk/Linux_x86_64/24.11/comm_libs/12.6/hpcx/hpcx-2.20/ucx/mt/lib/ucx/libuct_cuda_gdrcopy.so.0: undefined symbol: gdr_get_info_v2
:::

**before run**

```bash
module purge
module load nvhpc-24.11_hpcx-2.20_cuda-12.6
module load cuda/12.8

export OMP_PROC_BIND=spread
export OMP_PLACES=threads
```

> OMP_PROC_BIND=spread spreads threads evenly across the CPU cores, which usually avoids cores competing with each other.
> OMP_PLACES=threads defines the places (hardware threads) the OpenMP threads may be bound to.
> Together they pin OpenMP threads to distinct CPU cores, improving multi-core performance and preventing threads from hopping between cores and losing performance.

```bash
# bind threads
export OMP_PROC_BIND=spread
export OMP_PLACES=threads

# or undo the binding and run single-threaded
unset OMP_PROC_BIND
unset OMP_PLACES
export OMP_NUM_THREADS=1
```

```bash
# 1. Purge and load the required modules
module purge
module load nvhpc-24.11_hpcx-2.14_cuda-11.8
module load cuda/12.8

# 2. Enter the LAMMPS source directory
cd ~/lammps/lammps-29Aug2024/

# 3. Create the build directory
rm -rf build_new        # remove any previous build_new first
mkdir build_new && cd build_new

# 4. Run the CMake configuration (add packages as needed)
cmake \
  -D CMAKE_C_COMPILER=$(which nvc) \
  -D CMAKE_CXX_COMPILER=$(which nvc++) \
  -D CMAKE_Fortran_COMPILER=$(which nvfortran) \
  -D CMAKE_CUDA_HOST_COMPILER=$(which nvc++) \
  -D CUDA_HOST_COMPILER=$(which nvc++) \
  -D BUILD_MPI=ON \
  -D BUILD_OMP=ON \
  -D PKG_OPENMP=ON \
  -D PKG_GPU=ON \
  -D GPU_API=cuda \
  -D GPU_ARCH=sm_70 \
  -D PKG_KOKKOS=ON \
  -D Kokkos_ARCH_NATIVE=ON \
  -D Kokkos_ARCH_VOLTA70=ON \
  -D Kokkos_ENABLE_CUDA=ON \
  -D Kokkos_ENABLE_OPENMP=ON \
  -D FFT=FFTW3 \
  -D FFT_KOKKOS=CUFFT \
  -D FFT_SINGLE=YES \
  -D FFT_PACK=array \
  -D FFT_USE_HEFFTE=NO \
  -D PKG_OPT=ON \
  -D BUILD_SHARED_LIBS=ON \
  -D PKG_FEP=ON \
  -D PKG_TALLY=ON \
  -D PKG_REPLICA=ON \
  -D PKG_INTEL=ON \
  -D PKG_MOLECULE=ON \
  -D PKG_KSPACE=ON \
  -D PKG_GRANULAR=ON \
  -D PKG_RIGID=ON \
  -D PKG_CLASS2=ON \
  -D PKG_MANYBODY=ON \
  -D CMAKE_INSTALL_PREFIX=$HOME/.local/lammps_new \
  ../cmake

# 5. Compile and install (with 16 threads)
make -j 16
make install

# 6. Add to the environment (only needs to be done once)
echo 'export PATH=$HOME/.local/lammps_new/bin:$PATH' >> ~/.bashrc
source ~/.bashrc
```
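Before benchmarking with a freshly built binary, it can save time to confirm the expected packages actually made it into the build; recent LAMMPS versions list them in the `-h` help output. A sketch, where `lmp` stands for whichever binary name was installed:

```bash
# the help output includes the packages compiled into this binary
lmp -h | grep -A 4 -i "installed packages"

# or just check that the GPU-accelerated styles are present
lmp -h | grep -i "lj/cut/gpu"
```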
```bash
cmake ../cmake \
  -G Ninja \
  -D CMAKE_INSTALL_PREFIX=$HOME/.local/lammps_double \
  -D CMAKE_C_COMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/25.7/comm_libs/12.9/hpcx/hpcx-2.22.1/ompi/bin/mpicc \
  -D CMAKE_CXX_COMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/25.7/comm_libs/12.9/hpcx/hpcx-2.22.1/ompi/bin/mpicxx \
  -D CMAKE_Fortran_COMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/25.7/comm_libs/12.9/hpcx/hpcx-2.22.1/ompi/bin/mpifort \
  -D CMAKE_CUDA_HOST_COMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/25.7/compilers/bin/nvc++ \
  -D CUDAToolkit_ROOT=/usr/local/cuda-12.9 \
  -D CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.9 \
  -D BUILD_MPI=ON \
  -D BUILD_OMP=ON \
  -D PKG_OPENMP=ON \
  -D PKG_GPU=ON \
  -D GPU_PREC=double \
  -D GPU_API=cuda \
  -D GPU_ARCH=sm_70 \
  -D PKG_KOKKOS=ON \
  -D Kokkos_ARCH_NATIVE=ON \
  -D Kokkos_ARCH_VOLTA70=ON \
  -D Kokkos_ENABLE_CUDA=ON \
  -D Kokkos_ENABLE_OPENMP=ON \
  -D FFT=FFTW3 \
  -D FFT_KOKKOS=CUFFT \
  -D FFT_SINGLE=YES \
  -D FFT_PACK=array \
  -D FFT_USE_HEFFTE=NO \
  -D PKG_OPT=ON \
  -D BUILD_SHARED_LIBS=ON \
  -D PKG_FEP=ON \
  -D PKG_TALLY=ON \
  -D PKG_REPLICA=ON \
  -D PKG_INTEL=ON \
  -D PKG_MOLECULE=ON \
  -D PKG_KSPACE=ON \
  -D PKG_GRANULAR=ON \
  -D PKG_RIGID=ON \
  -D PKG_CLASS2=ON \
  -D PKG_MANYBODY=ON
```

```bash
module purge
module load tbb/2021.7.1 compiler-rt/2022.2.1 mkl/2022.2.1 hpcx nvhpc-hpcx-cuda12/25.7 cuda/12.9

spack unload -a
spack load fftw@3.3.10
spack load cmake@3.30.5
```

```bash
cmake ../cmake \
  -G Ninja \
  -D CMAKE_INSTALL_PREFIX=$HOME/.local/lammps_latest \
  -D CMAKE_C_COMPILER=$(which nvc) \
  -D CMAKE_CXX_COMPILER=$(which nvc++) \
  -D CMAKE_CUDA_HOST_COMPILER=$(which nvc++) \
  -D CUDA_HOST_COMPILER=$(which nvc++) \
  -D CUDAToolkit_ROOT=/usr/local/cuda-12.9 \
  -D CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.9 \
  -D CMAKE_CUDA_ARCHITECTURES=70 \
  -D CMAKE_C_FLAGS="-fast -mp -Ofast -tp host" \
  -D CMAKE_CXX_FLAGS="-fast -mp -Ofast -tp host" \
  -D CMAKE_CUDA_FLAGS="-Xcompiler -O3 -Xcompiler -march=native" \
  -D BUILD_MPI=yes \
  -D BUILD_OMP=yes \
  -D PKG_GPU=on \
  -D PKG_MANYBODY=on \
  -D PKG_REPLICA=on \
  -D FFT=MKL \
  -D FFT_SINGLE=yes \
  -D CUDA_MPS_SUPPORT=yes \
  -D CUDPP_OPT=no \
  -D GPU_API=cuda \
  -D GPU_ARCH=sm_70 \
  -D CUDA_BUILD_MULTIARCH=no \
  -D USE_STATIC_OPENCL_LOADER=no \
  -D BUILD_SHARED_LIBS=ON
```

```bash
cmake ../cmake \
  -G Ninja \
  -D CMAKE_INSTALL_PREFIX=$HOME/.local/lammps_double \
  -D CMAKE_C_COMPILER=$(which nvc) \
  -D CMAKE_CXX_COMPILER=$(which nvc++) \
  -D CMAKE_Fortran_COMPILER=$(which nvfortran) \
  -D CMAKE_CUDA_HOST_COMPILER=$(which nvc++) \
  -D BUILD_MPI=ON \
  -D MPI_C_COMPILER=$(which mpicc) \
  -D MPI_CXX_COMPILER=$(which mpicxx) \
  -D MPI_Fortran_COMPILER=$(which mpifort) \
  -D BUILD_OMP=ON \
  -D PKG_OPENMP=ON \
  -D PKG_GPU=ON \
  -D GPU_API=cuda \
  -D GPU_ARCH=sm_70 \
  -D GPU_PREC=double \
  -D PKG_KOKKOS=ON \
  -D Kokkos_ENABLE_CUDA=ON \
  -D Kokkos_ENABLE_OPENMP=ON \
  -D Kokkos_ARCH_VOLTA70=ON \
  -D Kokkos_ARCH_NATIVE=ON \
  -D FFT=FFTW3 \
  -D FFT_KOKKOS=CUFFT \
  -D PKG_OPT=ON \
  -D BUILD_SHARED_LIBS=ON \
  -D PKG_FEP=ON \
  -D PKG_TALLY=ON \
  -D PKG_REPLICA=ON \
  -D PKG_INTEL=OFF \
  -D PKG_MOLECULE=ON \
  -D PKG_KSPACE=ON \
  -D PKG_GRANULAR=ON \
  -D PKG_RIGID=ON \
  -D PKG_CLASS2=ON \
  -D PKG_MANYBODY=ON \
  -D CUDAToolkit_ROOT=/usr/local/cuda-12.9
```
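These last configurations use the Ninja generator, so the follow-up build/install step differs from the Makefile builds earlier in this note. A minimal sketch (the install prefix in the PATH line must match whichever prefix was configured above):

```bash
# build and install with Ninja instead of make
ninja -j 16
ninja install

# make the freshly installed binary visible (only needs to be added once)
echo 'export PATH=$HOME/.local/lammps_double/bin:$PATH' >> ~/.bashrc
source ~/.bashrc
```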