# LAMMPS
**Information:**
[HiPAC](https://event1.nchc.org.tw/2025/hipac/details.html)
[NCHC Cup (國網盃) information](https://ndrive.niar.org.tw/navigate/a/#/s/96650889CBB94A48AA0049D88FE6B2AE6BL)
[LAMMPS molecular dynamics software introduction](https://drive.google.com/file/d/1gpxa9F2-YfGNRW2c-OcLe0f1_pYHzae4/view?usp=drive_link)
[LAMMPS official site](https://www.lammps.org)
[LAMMPS documentation](https://docs.lammps.org)
[Download](https://www.lammps.org/download.html)
[TEAM1](https://drive.google.com/drive/folders/1FZD3ZC0eDzNQ5MUbt7k_G9wLrZOhyOcr)
**Reference:**
https://hackmd.io/@isc21/BJquuUbu_#processes-and-thread-affinity
https://hackmd.io/@isc21/rJY5v-AZO
https://www.hpc-carpentry.org/tuning_lammps/index.html
## GCC
lmp_gcc
```bash
# GCC 10
module purge
module load gcc10/10.2.1
module load cuda/12.8
module load openmpi/4.1.6
module load intel/2020
cd ~/lammps/lammps-29Aug2024/
mkdir build_gcc10 && cd build_gcc10
cmake \
-D CMAKE_C_COMPILER=$(which mpicc) \
-D CMAKE_CXX_COMPILER=$(which mpicxx) \
-D BUILD_MPI=yes \
-D BUILD_OMP=yes \
-D PKG_OPENMP=yes \
-D PKG_MANYBODY=yes \
-D PKG_KSPACE=yes \
-D PKG_MOLECULE=yes \
-D PKG_RIGID=yes \
-D PKG_COMPRESS=yes \
-D FFT_SINGLE=yes \
-D BUILD_SHARED_LIBS=ON \
-D CMAKE_INSTALL_PREFIX=$HOME/.local/lammps_gcc10 \
../cmake
make -j 16
make install
echo 'export PATH=$HOME/.local/lammps_gcc10/bin:$PATH' >> ~/.bashrc
source ~/.bashrc
```
## INTEL
lmp_intel
```bash
# Intel 2020
module purge
module load intel/2020
module load cuda/12.8
module load openmpi/4.1.6
cd ~/lammps/lammps-29Aug2024/
mkdir build_intel2020 && cd build_intel2020
cmake \
-D CMAKE_C_COMPILER=$(which mpicc) \
-D CMAKE_CXX_COMPILER=$(which mpicxx) \
-D BUILD_MPI=yes \
-D BUILD_OMP=yes \
-D PKG_OPENMP=yes \
-D PKG_MANYBODY=yes \
-D PKG_KSPACE=yes \
-D PKG_MOLECULE=yes \
-D PKG_RIGID=yes \
-D PKG_COMPRESS=yes \
-D FFT_SINGLE=yes \
-D BUILD_SHARED_LIBS=ON \
-D CMAKE_INSTALL_PREFIX=$HOME/.local/lammps_intel2020 \
../cmake
make -j 16
make install
echo 'export PATH=$HOME/.local/lammps_intel2020/bin:$PATH' >> ~/.bashrc
source ~/.bashrc
```
## NVHPC
lmp
```bash
module purge
module load nvhpc-24.11_hpcx-2.14_cuda-11.8
module load cuda/12.8
cd ~/lammps/lammps-29Aug2024/
mkdir build_nvhpc24 && cd build_nvhpc24
cmake \
-D CMAKE_C_COMPILER=$(which mpicc) \
-D CMAKE_CXX_COMPILER=$(which mpicxx) \
-D CMAKE_CUDA_HOST_COMPILER=$(which nvc++) \
-D CUDA_HOST_COMPILER=$(which nvc++) \
-D BUILD_MPI=yes \
-D BUILD_OMP=yes \
-D GPU_API=cuda \
-D GPU_ARCH=sm_70 \
-D PKG_KOKKOS=ON \
-D Kokkos_ARCH_NATIVE=ON \
-D Kokkos_ARCH_VOLTA70=ON \
-D Kokkos_ENABLE_CUDA=ON \
-D Kokkos_ENABLE_OPENMP=ON \
-D PKG_OPENMP=yes \
-D PKG_MANYBODY=yes \
-D PKG_KSPACE=yes \
-D PKG_MOLECULE=yes \
-D PKG_RIGID=yes \
-D PKG_COMPRESS=yes \
-D FFT_SINGLE=yes \
-D BUILD_SHARED_LIBS=ON \
-D CMAKE_INSTALL_PREFIX=$HOME/.local/lammps_nvhpc24 \
../cmake
make -j 16
make install
echo 'export PATH=$HOME/.local/lammps_nvhpc24/bin:$PATH' >> ~/.bashrc
source ~/.bashrc
```
---
### sbatch
```shell
#!/bin/bash
#SBATCH --job-name=<job_name>
#SBATCH --output=<job_name>%j.out
#SBATCH --error=<job_name>%j.err
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=16
#SBATCH --cpus-per-task=1
#SBATCH --gpus-per-node=8
#SBATCH --time=01:00:00
#SBATCH --partition=gp2d
#SBATCH --account=ACD110018
```
---
### Commands
```bash
squeue -u <username>   # list your pending/running jobs
```
```bash
sacct -j <jobid>       # accounting / exit status for a job
```
```bash
dos2unix <script.sh>   # strip Windows CR line endings from a script
```
---
> pair_style `lj/cut`
#### MPI (CPU only)
```bash
mpirun -np 8 lmp -in $INPUTFILE
```
#### OpenMP + MPI
```bash
mpirun -np 8 lmp -sf omp -pk omp 4 -in $INPUTFILE
```
> pair_style `lj/cut/gpu`
#### GPU
```bash
mpirun -np 8 lmp -sf gpu -pk gpu 8 -in $INPUTFILE
```
> pair_style `lj/cut/kk`
#### Kokkos (GPU)
```bash
mpirun -np 8 lmp -sf kk -k on g 8 -in $INPUTFILE
```
#### Kokkos (OpenMP)
```bash
mpirun -np 8 lmp -sf kk -k on t 4 -in $INPUTFILE
```
## MD
[csv](https://docs.google.com/spreadsheets/d/1i_n_qYDdEUQ-gkFpBfe5IR0dufAYljAcHHQr6AZ3-5w/edit?hl=zh-tw&gid=0#gid=0)
### baseline
| blocksize | atom | method | run | timestep | skin | every | check |
| --------- | -------- | ------------ | --- | -------- | ---- | ----- | ----- |
| 200 | 32000000 | 8 rank 8 GPU | 250 | 0.001 | 0.1 | 1 | no |

>Temp : 1.6712881
TotEng : -2.2800574
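To compare a new run against this baseline, the final Temp and TotEng can be pulled from the log. A minimal sketch (my addition, not part of the original scripts), assuming the default thermo columns `Step Temp E_pair E_mol TotEng Press` and a hypothetical output file name:
```bash
LOG=out.melt_200   # hypothetical log/output file name; adjust to your run
# take the last thermo row before "Loop time" and print Temp (col 2) and TotEng (col 5)
awk '$1=="Step"{in_thermo=1; next}
     /^Loop time/{in_thermo=0}
     in_thermo && NF{last=$0}
     END{n=split(last,f); if(n>=5) print "Temp="f[2], "TotEng="f[5]}' "$LOG"
```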
### blocksize
:::spoiler blocksize_test.sh
```bash
#!/bin/bash
#SBATCH --job-name=lmp_melt_kokkos
#SBATCH --output=lmp_melt_kokkos_%j.out
#SBATCH --error=lmp_melt_kokkos_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=2
#SBATCH --gpus-per-node=8
#SBATCH --time=10:00:00
#SBATCH --partition=gp2d
#SBATCH --account=ACD110018
module purge
module load nvhpc-24.11_hpcx-2.20_cuda-12.6
module load cuda/12.8
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
export OMP_PROC_BIND=spread
export OMP_PLACES=threads
# ====================================
# Generate input scripts for each box size
# ====================================
for BOXSIZE in 100 150 200 250 300 350 400
do
cat << EOF > in.melt_${BOXSIZE}
# 3d Lennard-Jones melt with box size ${BOXSIZE}
units lj
atom_style atomic
lattice fcc 0.8442
region box block 0 ${BOXSIZE} 0 ${BOXSIZE} 0 ${BOXSIZE}
create_box 1 box
create_atoms 1 box
mass 1 1.0
velocity all create 3.0 87287 loop geom
pair_style lj/cut/kk 2.5
pair_coeff 1 1 1.0 1.0 2.5
neighbor 0.3 bin
neigh_modify every 20 delay 0 check no
fix 1 all nve
thermo 50
run 250
EOF
# ====================================
# Run LAMMPS with mpirun + kokkos GPU
# ====================================
echo "============================="
echo "Running box size: ${BOXSIZE}"
echo "============================="
mpirun -np 8 lmp -sf kk -k on g 8 -in in.melt_${BOXSIZE} > out.melt_${BOXSIZE}
# ====================================
# Extract atom count and performance
# ====================================
ATOMS=$(grep "Created .* atoms" out.melt_${BOXSIZE} | awk '{print $2}')
PERF=$(grep "Performance:" out.melt_${BOXSIZE} | awk '{print $(NF-1),$NF}')
echo "Box size ${BOXSIZE}: ${ATOMS} atoms, Performance: ${PERF}" >> performance_scaling.txt
done
```
:::

>32000000 atoms
blocksize 200
### skin / every / timestep
:::spoiler **md_acc_test.sh**
```bash
#!/bin/bash
#SBATCH --job-name=md_acc_test
#SBATCH --output=md_acc_test_%j.out
#SBATCH --error=md_acc_test_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=2
#SBATCH --gpus-per-node=8
#SBATCH --time=05:00:00
#SBATCH --partition=gp2d
#SBATCH --account=ACD110018
module purge
module load nvhpc-24.11_hpcx-2.20_cuda-12.6
module load cuda/12.8
export LD_LIBRARY_PATH=/home/pkboie1019/.local/lammps_nvhpc24/lib64:$LD_LIBRARY_PATH
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
export OMP_PROC_BIND=spread
export OMP_PLACES=threads
# ====================================
# Define test parameter sets
# ====================================
declare -a TIMESTEPS=("0.001" "0.002" "0.005")
declare -a SKINS=("0.1" "0.2" "0.3" "0.5" "1.0" "2.0")
declare -a EVERYS=("1" "10" "20" "50" "100" "200" "500")
BOXSIZE=200
for TS in "${TIMESTEPS[@]}"
do
for SKIN in "${SKINS[@]}"
do
for EVERY in "${EVERYS[@]}"
do
INPUT=in.melt_${BOXSIZE}_ts${TS}_skin${SKIN}_every${EVERY}
cat << EOF > $INPUT
units lj
atom_style atomic
lattice fcc 0.8442
region box block 0 ${BOXSIZE} 0 ${BOXSIZE} 0 ${BOXSIZE}
create_box 1 box
create_atoms 1 box
mass 1 1.0
velocity all create 3.0 87287 loop geom
pair_style lj/cut/kk 2.5
pair_coeff 1 1 1.0 1.0 2.5
neighbor ${SKIN} bin
neigh_modify every ${EVERY} delay 0 check no
timestep ${TS}
fix 1 all nve
thermo 50
run 250
EOF
echo "============================="
echo "Running: ts=${TS}, skin=${SKIN}, every=${EVERY}"
echo "============================="
mpirun -np 8 lmp -sf kk -k on g 8 -in $INPUT > out.${INPUT}
# Check lost atoms
LOST=$(grep "Lost atoms" out.${INPUT})
if [ -z "$LOST" ]; then
STATUS="OK"
else
STATUS="LOST ATOMS"
fi
# Extract created atoms
ATOMS=$(grep "Created .* atoms" out.${INPUT} | awk '{print $2}')
# Extract performance
TAUDAY=$(grep "Performance:" out.${INPUT} | awk '{print $2}')
TIMESTEPPS=$(grep "Performance:" out.${INPUT} | awk '{print $4}')
GATOMSTEP=$(grep "Performance:" out.${INPUT} | awk '{print $6}')
# Write to summary file
echo "ts=${TS}, skin=${SKIN}, every=${EVERY}, atoms=${ATOMS}, runsteps=250, status=${STATUS}, tau/day=${TAUDAY}, timestep/s=${TIMESTEPPS}, Gatom-step/s=${GATOMSTEP}" >> performance_accuracy_test.txt
done
done
done
```
:::
```bash
# Manipulated variables
declare -a TIMESTEPS=("0.001" "0.002" "0.005")
declare -a SKINS=("0.1" "0.2" "0.3" "0.5" "1.0" "2.0")
declare -a EVERYS=("1" "10" "20" "50" "100" "200" "500")
```
**tau/day**

**Gatom-step/s**

**measurement error**


### every / delay / check (yes/no)
The `delay` value must be `0` or a multiple of `every`.
:::spoiler md_delay.sh
```bash
#!/bin/bash
#SBATCH --job-name=p_md_acc_test
#SBATCH --output=p_md_acc_test_%j.out
#SBATCH --error=p_md_acc_test_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=2
#SBATCH --gpus-per-node=8
#SBATCH --time=05:00:00
#SBATCH --partition=gp2d
#SBATCH --account=ACD110018
module purge
module load nvhpc-24.11_hpcx-2.20_cuda-12.6
module load cuda/12.8
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
export OMP_PROC_BIND=spread
export OMP_PLACES=threads
# ========================
# Parameters to test
# ========================
TS="0.001"
SKIN="0.5"
declare -a EVERYS=("1" "10" "20" "50" "100" "200" "500")
declare -a DELAYS=("0" "1" "5" "10" "20" "50" "100" "200" "500")
declare -a CHECKS=("no" "yes")
BOXSIZE=200
for EVERY in "${EVERYS[@]}"
do
for DELAY in "${DELAYS[@]}"
do
# Skip invalid delay combinations
if [ $((DELAY % EVERY)) -ne 0 ]; then
continue
fi
for CHECK in "${CHECKS[@]}"
do
INPUT=in.melt_${BOXSIZE}_ts${TS}_skin${SKIN}_every${EVERY}_delay${DELAY}_check${CHECK}
cat << EOF > $INPUT
units lj
atom_style atomic
lattice fcc 0.8442
region box block 0 ${BOXSIZE} 0 ${BOXSIZE} 0 ${BOXSIZE}
create_box 1 box
create_atoms 1 box
mass 1 1.0
velocity all create 3.0 87287 loop geom
pair_style lj/cut/kk 2.5
pair_coeff 1 1 1.0 1.0 2.5
neighbor ${SKIN} bin
neigh_modify every ${EVERY} delay ${DELAY} check ${CHECK}
timestep ${TS}
fix 1 all nve
thermo 50
run 250
EOF
echo "============================="
echo "Running: every=${EVERY}, delay=${DELAY}, check=${CHECK}"
echo "============================="
mpirun -np 8 lmp -sf kk -k on g 8 -in $INPUT > out.${INPUT}
# Check lost atoms
LOST=$(grep "Lost atoms" out.${INPUT})
if [ -z "$LOST" ]; then
STATUS="OK"
else
STATUS="LOST ATOMS"
fi
# Extract created atoms
ATOMS=$(grep "Created .* atoms" out.${INPUT} | awk '{print $2}')
# Extract performance
TAUDAY=$(grep "Performance:" out.${INPUT} | awk '{print $2}')
TIMESTEPPS=$(grep "Performance:" out.${INPUT} | awk '{print $4}')
GATOMSTEP=$(grep "Performance:" out.${INPUT} | awk '{print $6}')
# Write to summary file
echo "ts=${TS}, skin=${SKIN}, every=${EVERY}, delay=${DELAY}, check=${CHECK}, atoms=${ATOMS}, runsteps=250, status=${STATUS}, tau/day=${TAUDAY}, timestep/s=${TIMESTEPPS}, Gatom-step/s=${GATOMSTEP}" >> performance_accuracy_test.txt
done
done
done
```
:::
```bash
# Manipulated variables
TS="0.001"
SKIN="0.5"
declare -a EVERYS=("1" "10" "20" "50" "100" "200" "500")
declare -a DELAYS=("0" "1" "5" "10" "20" "50" "100" "200" "500")
declare -a CHECKS=("no" "yes")
```
**Gatom-step/s**

### N/A :x:
1. Temp and TotEng deviate too far from the baseline (energy blow-up)

2. Lost atoms (atoms moved out of the domain)

#### For good performance
- smaller skin is faster (but if the skin is too small, accuracy becomes unstable)
- larger every helps (past a certain point it makes little further difference)
- a larger timestep is better (but too large easily breaks the run)
- check (yes/no): no is marginally faster, but yes is much less likely to fail
- delay: usually the larger the better (the best value depends on every)
#### Accuracy
Temperature and total energy must not deviate too much from the baseline.
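A small post-processing sketch (my addition, assuming the summary line format written by the scripts above): keep only the runs without lost atoms and rank them by Gatom-step/s, so the fastest settings that still pass can be read off directly.
```bash
# rank the OK runs by the trailing Gatom-step/s value, highest first
grep "status=OK" performance_accuracy_test.txt \
  | awk -F'Gatom-step/s=' '{print $2, $0}' \
  | sort -rn \
  | cut -d' ' -f2- \
  | head -n 10
```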
### run_script
- timestep
- skin
- every
- delay (0 or multiples of every)
- check (yes / no)
- blocksize
- runstep
```bash
#!/bin/bash
#SBATCH --job-name=p_md_acc_test
#SBATCH --output=p_md_acc_test_%j.out
#SBATCH --error=p_md_acc_test_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=2
#SBATCH --gpus-per-node=8
#SBATCH --time=05:00:00
#SBATCH --partition=gp2d
#SBATCH --account=ACD110018
module purge
module load nvhpc-24.11_hpcx-2.20_cuda-12.6
module load cuda/12.8
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
export OMP_PROC_BIND=spread
export OMP_PLACES=threads
# ========================
# Custom Parameters
# ========================
declare -a TS_LIST=("0.001") # timestep
declare -a SKINS=("0.1" "0.2") # skin distance
declare -a EVERYS=("50" "100" "200" "250") # neighbor every
declare -a DELAYS=("0" "50" "100" "200" "250") # neighbor delay
declare -a CHECKS=("yes" "no") # check
declare -a BOXSIZES=("200") # box size
declare -a RUNSTEPS=("250") # run steps
# ========================
# Begin Testing Loop
# ========================
for TS in "${TS_LIST[@]}"; do
for SKIN in "${SKINS[@]}"; do
for BOXSIZE in "${BOXSIZES[@]}"; do
for RUNSTEP in "${RUNSTEPS[@]}"; do
for EVERY in "${EVERYS[@]}"; do
for DELAY in "${DELAYS[@]}"; do
# delay must be 0 or a multiple of every
if [ $((DELAY % EVERY)) -ne 0 ]; then
continue
fi
for CHECK in "${CHECKS[@]}"; do
INPUT=in.melt_${BOXSIZE}_ts${TS}_skin${SKIN}_every${EVERY}_delay${DELAY}_check${CHECK}_run${RUNSTEP}
cat << EOF > $INPUT
units lj
atom_style atomic
lattice fcc 0.8442
region box block 0 ${BOXSIZE} 0 ${BOXSIZE} 0 ${BOXSIZE}
create_box 1 box
create_atoms 1 box
mass 1 1.0
velocity all create 3.0 87287 loop geom
pair_style lj/cut/kk 2.5
pair_coeff 1 1 1.0 1.0 2.5
neighbor ${SKIN} bin
neigh_modify every ${EVERY} delay ${DELAY} check ${CHECK}
timestep ${TS}
fix 1 all nve
thermo 250
run ${RUNSTEP}
EOF
echo "============================="
echo "Running: ts=${TS}, skin=${SKIN}, every=${EVERY}, delay=${DELAY}, check=${CHECK}, box=${BOXSIZE}, run=${RUNSTEP}"
echo "============================="
mpirun -np 8 lmp -sf kk -k on g 8 -in $INPUT > out.${INPUT}
# Check lost atoms
LOST=$(grep "Lost atoms" out.${INPUT})
if [ -z "$LOST" ]; then
STATUS="OK"
else
STATUS="LOST ATOMS"
fi
# Extract atoms created
ATOMS=$(grep "Created .* atoms" out.${INPUT} | awk '{print $2}')
# Extract performance
TAUDAY=$(grep "Performance:" out.${INPUT} | awk '{print $2}')
TIMESTEPPS=$(grep "Performance:" out.${INPUT} | awk '{print $4}')
GATOMSTEP=$(grep "Performance:" out.${INPUT} | awk '{print $6}')
# Save to summary
echo "ts=${TS}, skin=${SKIN}, every=${EVERY}, delay=${DELAY}, check=${CHECK}, atoms=${ATOMS}, runsteps=${RUNSTEP}, status=${STATUS}, tau/day=${TAUDAY}, timestep/s=${TIMESTEPPS}, Gatom-step/s=${GATOMSTEP}" >> performance_accuracy_test.txt
done
done
done
done
done
done
done
```
# Strategy
```bash
Performance: 10075.215 tau/day, 122.749 timesteps/s, 3.928 Gatom-step/s
97.4% CPU use with 8 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 0.033339 | 0.034016 | 0.034344 | 0.2 | 1.67
Neigh | 0.09415 | 0.10065 | 0.10867 | 1.5 | 4.94
Comm | 0.72955 | 0.73304 | 0.73525 | 0.2 | 35.99
Output | 0.00026695 | 0.010562 | 0.018416 | 5.5 | 0.52
Modify | 1.0641 | 1.0664 | 1.0697 | 0.2 | 52.36
Other | | 0.09199 | | | 4.52
```
| Section | Meaning | What it includes | What the time represents |
|:----------:|:------------------------:|:------------------------------------------------------:|:------------------------------------------------------:|
| **Pair** | pairwise force computation | pair interactions between atoms, e.g. Lennard-Jones, Coulomb | governed by `pair_style`; the dominant cost in most MD runs |
| **Neigh** | neighbor list builds | building/updating each atom's neighbor list | depends on the `neighbor` skin and the `neigh_modify` rebuild frequency |
| **Comm** | communication (MPI data exchange) | exchange of atom positions, forces, and velocities between MPI ranks | depends on the parallel mode and the domain decomposition |
| **Output** | output | time spent in `thermo`, `dump`, and log output | a large value means output is written too often |
| **Modify** | fix and compute operations | e.g. `fix nve`, `fix langevin`, `compute temperature` | a large share means expensive fixes dominate the model |
| **Other** | everything else | initialization, synchronization, internal bookkeeping, idle waiting | unclassified time; grows when there is load imbalance |
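To see how this breakdown shifts between parameter sets, the %total column can be collected from each output file. A rough sketch (my addition), assuming the breakdown format shown above and the `out.*` naming used by the test scripts:
```bash
# print the %total share of the key sections for every run output
for f in out.*melt_*; do
  echo "== $f =="
  awk '/^(Pair|Neigh|Comm|Modify) /{printf "%-7s %s%%\n", $1, $NF}' "$f"
done
```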
---
# temp
## Might be helpful
### neigh and newton
For the KOKKOS package on GPUs, the default is `neigh = full` and `newton = off`.
For Maxwell- and Kepler-generation GPUs, the default settings are typically best.
For Pascal-generation GPUs, setting `neigh = half` and `newton = on` might produce faster runs.
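If you want to try the half-list / newton-on combination, it can be requested from the command line via the kokkos package options (sketch; ranks and GPU count as in the runs above):
```bash
mpirun -np 8 lmp -sf kk -k on g 8 -pk kokkos neigh half newton on -in $INPUTFILE
```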
### skin
#### 1. What the neighbor skin means
**The skin is an extra buffer distance added to the neighbor search range.**
> LAMMPS does not rebuild the neighbor list at every timestep. It builds the list once and, as long as no particle has moved more than half the skin distance, the list does not need to be rebuilt.

**Example**
> Suppose the cutoff is 10.0 Å and the skin is 2.0 Å.
➡️ The neighbor list stores all neighbors within cutoff + skin = 12.0 Å.
➡️ Once any particle has moved more than skin/2 = 1.0 Å, LAMMPS rebuilds the neighbor list.
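Written as LAMMPS input, the example above would look roughly like this (illustrative only; real units, Å):
```bash
pair_style   lj/cut 10.0                # force cutoff = 10.0
neighbor     2.0 bin                    # skin = 2.0 -> list stores pairs within 12.0
neigh_modify every 1 delay 0 check yes  # rebuild once an atom has moved > skin/2 = 1.0
```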
#### 2. Effect of skin on performance and accuracy
| **skin** | **effect** |
| ------------- | ---------------------------------------------------------------------------------------------------------------------- |
| **smaller** | :heavy_check_mark: fewer neighbors in the list, so the force computation is faster<br>:x: but the neighbor list must be rebuilt more often, which adds overhead |
| **larger** | :heavy_check_mark: the neighbor list is rebuilt less often<br>:x: but every force computation scans more non-interacting pairs, which is wasted work |
#### 3. Choosing the style
- **bin**
Use bin in almost all cases; it scales as O(N).
- **nsq**
Worth trying for small non-periodic systems with N < 500 atoms.
- **multi**
For granular or colloidal systems with widely varying particle sizes; combine with neigh_modify for further tuning (illustrated in the sketch below).
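Illustrative style lines (my addition; only one of these would be active in a real input):
```bash
neighbor 0.3 bin     # default choice, O(N) scaling, good for most systems
# neighbor 0.3 nsq   # small non-periodic systems, N < ~500 atoms
# neighbor 0.3 multi # granular/colloidal systems with very different particle sizes
```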
### neighbor every
#### 1. Syntax and meaning of neigh_modify every
**Basic syntax**
```bash
# M = attempt a check every M timesteps for whether the neighbor list needs rebuilding.
neigh_modify every M
```
**How it works (together with delay and check)**
- every M
After the last build, LAMMPS is allowed to check for a rebuild every M steps.
- delay N
After the last neighbor list build, at least N timesteps must pass before another rebuild is allowed.
- check yes/no
yes ➔ rebuild only if some atom has moved more than skin/2.
no ➔ force a rebuild as soon as the every and delay conditions are met.
**Example**
```bash
# The default and most conservative setting; safest for accuracy.
# Check at every step (every=1) whether a rebuild is needed.
# delay=0 ➔ no delay; a rebuild may happen at any time.
# check=yes ➔ rebuild only when an atom has moved more than skin/2.
neighbor 2.0 bin
neigh_modify every 1 delay 0 check yes
```
#### 2. Effect of every on accuracy and performance
| **every** | **effect** |
| ---------------- | ------------------------------------------------------------------------------------- |
| **smaller (e.g. 1)** | :heavy_check_mark: checks constantly, highest accuracy<br>:x: may cause frequent neighbor rebuilds and extra overhead |
| **larger (e.g. 10)** | :heavy_check_mark: fewer rebuild checks, better performance<br>:x: if atoms move quickly, a needed rebuild can be missed ➔ **wrong forces, the simulation blows up** |
#### 3. Practical recommendations
**Accuracy first (conservative)**
```bash
# check at every step
# rebuild only when an atom has moved more than skin/2
# suitable for: high-temperature, liquid, or fast-diffusing systems
neigh_modify every 1 delay 0 check yes
```
**Performance first (benchmark to verify accuracy)**
```bash
# check once every 5 steps
# delay=10 ➔ no rebuild during the first 10 steps after a build
# check=yes ➔ rebuild only when an atom has moved more than skin/2
# ✅ This reduces the number of neighbor builds, and with it MPI communication and rebuild overhead.
neigh_modify every 5 delay 10 check yes
```
**Extreme performance (only if atoms barely move, e.g. a low-temperature crystal)**
```bash
# rebuild every 10 steps unconditionally (because check=no)
# delay=100 ➔ no rebuild during the first 100 steps after a build
# ⚠️ Warning: if an atom moves farther than cutoff+skin, the computed forces become wrong.
neigh_modify every 10 delay 100 check no
```
#### 4. How to pick the best settings
✅ Method
> Run short benchmarks (e.g. 1000 steps) with different combinations of every, delay, and check.
In the log file, watch:
the number of neighbor list builds
timesteps/s
whether the energy drift is acceptable
✅ Goal
> the fewest neighbor rebuilds
the highest timesteps/s
while still conserving energy (e.g. NVE energy drift within an acceptable range)
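One way to pull those numbers out of a short benchmark log (my sketch; the log name is hypothetical, and `Neighbor list builds` / `Dangerous builds` are the counters LAMMPS prints at the end of a run):
```bash
LOG=out.in.melt_200_ts0.001_skin0.5_every50   # hypothetical output file name
grep -E "Neighbor list builds|Dangerous builds|Performance:" "$LOG"
```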
### timestep
#### 1. What timestep means
```bash
# dt = the physical time represented by one simulation step (the unit is set by the units command, e.g. real=fs, metal=ps)
timestep dt
```
>units lj: default dt = 0.005
units real: default dt = 1.0 femtoseconds
units metal: default dt = 0.001 picoseconds
#### 2. Effect of timestep on performance
>✅ Performance gain
A larger timestep ➔ the simulation covers physical time faster,
because, for example, to simulate 1 ns:
timestep=1 fs ➔ 1,000,000 steps
timestep=2 fs ➔ 500,000 steps
Half the steps means half the total compute cost.
#### 3. Effect of timestep on accuracy and stability
>❌ Too large a timestep ➔ the simulation can become unstable
Reasons:
integration schemes (e.g. velocity Verlet) assume a small timestep
a timestep that is too large ➔
energy drift grows
temperature/pressure fluctuate too strongly
the system may blow up (e.g. atom velocities explode and become NaN)
#### 4. timestep: performance vs. accuracy
| timestep | effect |
| ----------- | ---------------------------------------------------------------------------- |
| large | ✅ fewer steps ➔ better performance<br>❌ larger integration error ➔ more energy drift, possible crash |
| small | ✅ stable, good energy conservation<br>❌ more steps ➔ longer wall time |





```bash
#!/bin/bash
#SBATCH --job-name=lammps_nvhpc
#SBATCH --output=lammps_nvhpc_%j.out
#SBATCH --error=lammps_nvhpc_%j.err
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=16
#SBATCH --cpus-per-task=1
#SBATCH --gpus-per-node=8
#SBATCH --time=01:00:00
#SBATCH --partition=gp2d
#SBATCH --account=ACD110018
module purge
module load nvhpc-24.11_hpcx-2.20_cuda-12.6
module load cuda/12.8
export LD_LIBRARY_PATH=/home/pkboie1019/.local/lammps_nvhpc24/lib64:$LD_LIBRARY_PATH
mpirun -np 8 lmp_nvhpc -sf kk -k on g 8 -in input.lammps
```
| Item | KOKKOS package | GPU package |
|:-------------:|:---------------------------------------------:|:----------------------------------------------:|
| **run flags** | `-sf kk -k on g N` | `-sf gpu -pk gpu N` |
| **CUDA implementation** | Kokkos CUDA backend | native CUDA kernels |
| **example command** | `mpirun -np 8 lmp -k on g N -sf kk -in input` | `mpirun -np 8 lmp -sf gpu -pk gpu N -in input` |
| tau/day | timesteps/s | atom-step/s |
|:----------------------------------------------------------------:|:----------------------------:|:-----------------------:|
| how many Lennard-Jones time units (tau) can be simulated per day (24 h) | how many MD timesteps are executed per second | how many atom-steps are computed per second |
| Metric | Purpose | Comparable across runs? |
| ---------------- | ------------------- | --------------------------------- |
| **tau/day** | physical simulation time per day | only for the same system |
| **timesteps/s** | stepping speed | depends on the atom count; only compare runs of the same input |
| **Gatom-step/s** | **compute-throughput benchmark** | ✅ **best performance metric**, commonly used in HPC scaling plots |
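The three numbers are related by Gatom-step/s = atoms × timesteps/s / 10^9. Checking this against the sample Performance line in the Strategy section above:
```bash
# 32,000,000 atoms at 122.749 timesteps/s -> ~3.928 Gatom-step/s
awk 'BEGIN{print 32000000 * 122.749 / 1e9}'
```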
**For good performance**
1. use one MPI rank per GPU; unless the input is very small, multiple ranks per GPU can also be considered
2. set threads to 1 (unless you are on a CPU-only backend)
3. use a CUDA-aware MPI module so that GPUDirect RDMA is available (see the check below)
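A quick sanity check that the loaded MPI is actually CUDA-aware (my addition; assumes an Open MPI based stack such as HPC-X, so `ompi_info` is available):
```bash
ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
```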
:::danger
Running input.lammps (2D LJ) fails with:
lmp: symbol lookup error: /work/HPC_SYS/twnia2/pkg-rocky8/nvidia/hpc_sdk/Linux_x86_64/24.11/comm_libs/12.6/hpcx/hpcx-2.20/ucx/mt/lib/ucx/libuct_cuda_gdrcopy.so.0: undefined symbol: gdr_get_info_v2
:::
**before run**
```bash
module purge
module load nvhpc-24.11_hpcx-2.20_cuda-12.6
module load cuda/12.8
export OMP_PROC_BIND=spread
export OMP_PLACES=threads
```
>OMP_PROC_BIND=spread spreads the threads evenly across CPU cores, which usually avoids cores competing for the same resources.
OMP_PLACES=threads specifies the hardware threads that OpenMP threads may be placed on.
Together they pin each OpenMP thread to its own core, improving multi-core performance and preventing the slowdown caused by threads migrating between CPUs.
```bash
# enable thread binding
export OMP_PROC_BIND=spread
export OMP_PLACES=threads
# ...or turn binding back off
unset OMP_PROC_BIND
unset OMP_PLACES
# one thread per MPI rank (typical for Kokkos GPU runs)
export OMP_NUM_THREADS=1
```
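To confirm what the OpenMP runtime actually picked up, the standard display variables can be set before the run (my addition; `OMP_DISPLAY_ENV` is OpenMP ≥ 4.0, `OMP_DISPLAY_AFFINITY` is OpenMP ≥ 5.0):
```bash
export OMP_DISPLAY_ENV=verbose      # print the effective OpenMP settings at startup
export OMP_DISPLAY_AFFINITY=TRUE    # print per-thread binding (OpenMP 5.0 runtimes)
```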
```bash
# 1. Purge and load the required modules
module purge
module load nvhpc-24.11_hpcx-2.14_cuda-11.8
module load cuda/12.8
# 2. Enter the LAMMPS source directory
cd ~/lammps/lammps-29Aug2024/
# 3. Create the build directory
rm -rf build_new   # remove any previous build_new first
mkdir build_new && cd build_new
# 4. Run the CMake configuration (add packages as needed)
cmake \
-D CMAKE_C_COMPILER=$(which nvc) \
-D CMAKE_CXX_COMPILER=$(which nvc++) \
-D CMAKE_Fortran_COMPILER=$(which nvfortran) \
-D CMAKE_CUDA_HOST_COMPILER=$(which nvc++) \
-D CUDA_HOST_COMPILER=$(which nvc++) \
-D BUILD_MPI=ON \
-D BUILD_OMP=ON \
-D PKG_OPENMP=ON \
-D PKG_GPU=ON \
-D GPU_API=cuda \
-D GPU_ARCH=sm_70 \
-D PKG_KOKKOS=ON \
-D Kokkos_ARCH_NATIVE=ON \
-D Kokkos_ARCH_VOLTA70=ON \
-D Kokkos_ENABLE_CUDA=ON \
-D Kokkos_ENABLE_OPENMP=ON \
-D FFT=FFTW3 \
-D FFT_KOKKOS=CUFFT \
-D FFT_SINGLE=YES \
-D FFT_PACK=array \
-D FFT_USE_HEFFTE=NO \
-D PKG_OPT=ON \
-D BUILD_SHARED_LIBS=ON \
-D PKG_FEP=ON \
-D PKG_TALLY=ON \
-D PKG_REPLICA=ON \
-D PKG_INTEL=ON \
-D PKG_MOLECULE=ON \
-D PKG_KSPACE=ON \
-D PKG_GRANULAR=ON \
-D PKG_RIGID=ON \
-D PKG_CLASS2=ON \
-D PKG_MANYBODY=ON \
-D CMAKE_INSTALL_PREFIX=$HOME/.local/lammps_new \
../cmake
# 5. Build and install (with 16 parallel jobs)
make -j 16
make install
# 6. Add the install path to PATH (only needs to be done once)
echo 'export PATH=$HOME/.local/lammps_new/bin:$PATH' >> ~/.bashrc
source ~/.bashrc
```
```bash
cmake ../cmake \
-G Ninja \
-D CMAKE_INSTALL_PREFIX=$HOME/.local/lammps_double \
-DCMAKE_C_COMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/25.7/comm_libs/12.9/hpcx/hpcx-2.22.1/ompi/bin/mpicc \
-DCMAKE_CXX_COMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/25.7/comm_libs/12.9/hpcx/hpcx-2.22.1/ompi/bin/mpicxx \
-DCMAKE_Fortran_COMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/25.7/comm_libs/12.9/hpcx/hpcx-2.22.1/ompi/bin/mpifort \
-DCMAKE_CUDA_HOST_COMPILER=/opt/nvidia/hpc_sdk/Linux_x86_64/25.7/compilers/bin/nvc++ \
-D CUDAToolkit_ROOT=/usr/local/cuda-12.9 \
-D CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.9 \
-D BUILD_MPI=ON \
-D BUILD_OMP=ON \
-D PKG_OPENMP=ON \
-D PKG_GPU=ON \
-D GPU_PREC=double \
-D GPU_API=cuda \
-D GPU_ARCH=sm_70 \
-D PKG_KOKKOS=ON \
-D Kokkos_ARCH_NATIVE=ON \
-D Kokkos_ARCH_VOLTA70=ON \
-D Kokkos_ENABLE_CUDA=ON \
-D Kokkos_ENABLE_OPENMP=ON \
-D FFT=FFTW3 \
-D FFT_KOKKOS=CUFFT \
-D FFT_SINGLE=YES \
-D FFT_PACK=array \
-D FFT_USE_HEFFTE=NO \
-D PKG_OPT=ON \
-D BUILD_SHARED_LIBS=ON \
-D PKG_FEP=ON \
-D PKG_TALLY=ON \
-D PKG_REPLICA=ON \
-D PKG_INTEL=ON \
-D PKG_MOLECULE=ON \
-D PKG_KSPACE=ON \
-D PKG_GRANULAR=ON \
-D PKG_RIGID=ON \
-D PKG_CLASS2=ON \
-D PKG_MANYBODY=ON
```
```bash
module purge
module load tbb/2021.7.1 compiler-rt/2022.2.1 mkl/2022.2.1 hpcx nvhpc-hpcx-cuda12/25.7 cuda/12.9
spack unload -a
spack load fftw@3.3.10
spack load cmake@3.30.5
```
```bash
cmake ../cmake \
-G Ninja \
-D CMAKE_INSTALL_PREFIX=$HOME/.local/lammps_latest \
-D CMAKE_C_COMPILER=$(which nvc) \
-D CMAKE_CXX_COMPILER=$(which nvc++) \
-D CMAKE_CUDA_HOST_COMPILER=$(which nvc++) \
-D CUDA_HOST_COMPILER=$(which nvc++) \
-D CUDAToolkit_ROOT=/usr/local/cuda-12.9 \
-D CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.9 \
-D CMAKE_CUDA_ARCHITECTURES=70 \
-D CMAKE_C_FLAGS="-fast -mp -Ofast -tp host" \
-D CMAKE_CXX_FLAGS="-fast -mp -Ofast -tp host" \
-D CMAKE_CUDA_FLAGS="-Xcompiler -O3 -Xcompiler -march=native" \
-D BUILD_MPI=yes \
-D BUILD_OMP=yes \
-D PKG_GPU=on \
-D PKG_MANYBODY=on \
-D PKG_REPLICA=on \
-D FFT=MKL \
-D FFT_SINGLE=yes \
-D CUDA_MPS_SUPPORT=yes \
-D CUDPP_OPT=no \
-D GPU_API=cuda \
-D GPU_ARCH=sm_70 \
-D CUDA_BUILD_MULTIARCH=no \
-D USE_STATIC_OPENCL_LOADER=no \
-D BUILD_SHARED_LIBS=ON
```
```bash
cmake ../cmake \
-G Ninja \
-D CMAKE_INSTALL_PREFIX=$HOME/.local/lammps_double \
-D CMAKE_C_COMPILER=$(which nvc) \
-D CMAKE_CXX_COMPILER=$(which nvc++) \
-D CMAKE_Fortran_COMPILER=$(which nvfortran) \
-D CMAKE_CUDA_HOST_COMPILER=$(which nvc++) \
-D BUILD_MPI=ON \
-D MPI_C_COMPILER=$(which mpicc) \
-D MPI_CXX_COMPILER=$(which mpicxx) \
-D MPI_Fortran_COMPILER=$(which mpifort) \
-D BUILD_OMP=ON \
-D PKG_OPENMP=ON \
-D PKG_GPU=ON \
-D GPU_API=cuda \
-D GPU_ARCH=sm_70 \
-D GPU_PREC=double \
-D PKG_KOKKOS=ON \
-D Kokkos_ENABLE_CUDA=ON \
-D Kokkos_ENABLE_OPENMP=ON \
-D Kokkos_ARCH_VOLTA70=ON \
-D Kokkos_ARCH_NATIVE=ON \
-D FFT=FFTW3 \
-D FFT_KOKKOS=CUFFT \
-D PKG_OPT=ON \
-D BUILD_SHARED_LIBS=ON \
-D PKG_FEP=ON \
-D PKG_TALLY=ON \
-D PKG_REPLICA=ON \
-D PKG_INTEL=OFF \
-D PKG_MOLECULE=ON \
-D PKG_KSPACE=ON \
-D PKG_GRANULAR=ON \
-D PKG_RIGID=ON \
-D PKG_CLASS2=ON \
-D PKG_MANYBODY=ON \
-D CUDAToolkit_ROOT=/usr/local/cuda-12.9
```