# HOOMD-blue
[TOC]
## Code
```bash=
# Source code
git clone --recursive https://github.com/glotzerlab/hoomd-blue
# Benchmarks
git clone https://github.com/glotzerlab/hoomd-benchmarks
```
## Conda
```bash=
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash Miniconda3-latest-Linux-x86_64.sh
conda update -n base conda
conda install --solver=classic conda-forge::conda-libmamba-solver conda-forge::libmamba conda-forge::libmambapy conda-forge::libarchive
# -p creates the environment under the current directory; activate it by its full path
conda create -p hoomdpy312 python=3.12 -y
conda activate /home/552/es6155/hoomdpy312
```
## Dependencies
Install the Python prerequisites, then run HOOMD-blue's `install-prereq-headers.py` (with its pybind11 download bumped to 2.13.4) to place the pybind11, Eigen, and cereal headers into the active environment. On Gadi:
```bash=
pip install pybind11 numpy gsd
sed -i 's|pybind11/archive/v2.10.1.tar.gz|pybind11/archive/v2.13.4.tar.gz|g' /scratch/pi13/hoomd/hoomd-blue/install-prereq-headers.py
python3 /scratch/pi13/hoomd/hoomd-blue/install-prereq-headers.py -y
```
On Aspire:
```bash=
pip install pybind11 numpy gsd
sed -i 's|pybind11/archive/v2.10.1.tar.gz|pybind11/archive/v2.13.4.tar.gz|g' /home/users/industry/ai-hpc/apacsc39/hoomd-blue/install-prereq-headers.py
python3 /home/users/industry/ai-hpc/apacsc39/hoomd-blue/install-prereq-headers.py -y
```
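The CMake configures below point `pybind11_DIR`, `Eigen3_DIR`, and `cereal_DIR` into the environment prefix. If the layout differs, pybind11 can report its own CMake directory (a quick check; run it with the environment's Python):
```bash=
# Print the pybind11 CMake config directory; pass it to CMake as -D pybind11_DIR=<path>.
python3 -c "import pybind11; print(pybind11.get_cmake_dir())"
```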
## Compile
```bash=
module load openmpi/4.1.5
module load gcc/12.2.0
export CFLAGS="-Ofast -march=native -mtune=native -funroll-loops -fprefetch-loop-arrays -ftree-vectorize -fstack-protector-strong -fno-plt -fPIC -pipe"
export CXXFLAGS="$CFLAGS"
export LDFLAGS="-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--gc-sections"
PATH=hoomdgcc2/bin:$PATH \
cmake -B build/hoomd_gcc_flag -S /scratch/pi13/hoomd/hoomd-blue \
-D ENABLE_MPI=on -D MPI_HOME=/apps/openmpi/4.1.5 \
-D pybind11_DIR=$PWD/hoomdgcc2/lib/python3.12/site-packages/pybind11/share/cmake/pybind11 \
-D Eigen3_DIR=$PWD/hoomdgcc2/share/eigen3/cmake \
-D cereal_DIR=$PWD/hoomdgcc2/lib64/cmake/cereal \
-D CMAKE_C_FLAGS="$CFLAGS" \
-D CMAKE_CXX_FLAGS="$CXXFLAGS" \
-D CMAKE_EXE_LINKER_FLAGS="$LDFLAGS"
cmake --build build/hoomd_gcc_flag -j4
# the build blows up with anything above -j4
```
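A quick smoke test of the finished build: the build tree is importable directly without an install step (the run scripts below rely on the same trick). A minimal check, using the environment the build was configured against:
```bash=
# Import the freshly built module straight from the build tree and print its version.
PYTHONPATH=$PWD/build/hoomd_gcc_flag \
hoomdgcc2/bin/python3 -c "import hoomd; print(hoomd.version.version)"
```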
### Gadi
#### gcc11
Modules used for the GPU build (a configure sketch follows the list):
- openmpi/4.1.4
- cuda/12.5.1
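
No configure line was recorded for this combination. A minimal sketch following the pattern of the other builds, assuming the same environment layout (the `hoomdpy312cuda` env and `build/hoomd_cuda` directory match the run scripts below; `ENABLE_GPU` is HOOMD-blue's CUDA switch, and the `MPI_HOME` path is a guess):
```bash=
# Sketch only: run with the openmpi/4.1.4 and cuda/12.5.1 modules listed above loaded.
PATH=hoomdpy312cuda/bin:$PATH \
cmake -B build/hoomd_cuda -S /scratch/pi13/hoomd/hoomd-blue \
-D ENABLE_GPU=on \
-D ENABLE_MPI=on -D MPI_HOME=/apps/openmpi/4.1.4 \
-D pybind11_DIR=$PWD/hoomdpy312cuda/lib/python3.12/site-packages/pybind11/share/cmake/pybind11 \
-D Eigen3_DIR=$PWD/hoomdpy312cuda/share/eigen3/cmake \
-D cereal_DIR=$PWD/hoomdpy312cuda/lib64/cmake/cereal
cmake --build build/hoomd_cuda -j4
```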

#### Intel 2024.2.0
```bash=
module load intel-compiler-llvm/2024.2.0 intel-mpi/2021.13.0
export CXXFLAGS="-Ofast -march=broadwell -axSKYLAKE-AVX512,CASCADELAKE,SAPPHIRERAPIDS -mtune=native -funroll-loops -flto -qopenmp -qopt-prefetch -qopt-streaming-stores always -fp-model fast -qparallel -fopt-info-vec-all"
export CFLAGS="$CXXFLAGS"
PATH=hoomdintel/bin:$PATH \
cmake -B build/hoomd_intel_v2 -S /scratch/pi13/hoomd/hoomd-blue \
-D CMAKE_C_COMPILER=icx \
-D CMAKE_CXX_COMPILER=icpx \
-D MPI_C_COMPILER=mpicc \
-D MPI_CXX_COMPILER=mpicxx \
-D ENABLE_MPI=on -D MPI_HOME=$I_MPI_ROOT \
-D pybind11_DIR=$PWD/hoomdintel/lib/python3.12/site-packages/pybind11/share/cmake/pybind11 \
-D Eigen3_DIR=$PWD/hoomdintel/share/eigen3/cmake \
-D cereal_DIR=$PWD/hoomdintel/lib64/cmake/cereal \
-D ENABLE_TBB=on \
-D TBB_DIR=/apps/intel-tools/.packages/2024.2.0/tbb/2021.13/lib/cmake/tbb \
-D CMAKE_CXX_FLAGS="$CXXFLAGS" \
-D CMAKE_C_FLAGS="$CFLAGS"
cmake --build build/hoomd_intel_v2 -j4
```
#### GCC+Intel-MPI
```bash=
module load intel-mpi/2021.13.0
module load gcc/12.2.0
module load intel-tbb/2021.13.0
export CFLAGS="-Ofast -march=native -mtune=native -funroll-loops -fprefetch-loop-arrays -ftree-vectorize -fstack-protector-strong -fno-plt -fPIC -pipe"
export CXXFLAGS="$CFLAGS"
export LDFLAGS="-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--gc-sections"
PATH=hoomdpy312llvm/bin:$PATH \
cmake -B build/hoomd_gcc_intelmpi -S /scratch/pi13/hoomd/hoomd-blue \
-D ENABLE_MPI=on \
-D MPI_HOME=/apps/intel-tools/intel-mpi/2021.13.0 \
-D pybind11_DIR=$PWD/hoomdpy312llvm/lib/python3.12/site-packages/pybind11/share/cmake/pybind11 \
-D Eigen3_DIR=$PWD/hoomdpy312llvm/share/eigen3/cmake \
-D cereal_DIR=$PWD/hoomdpy312llvm/lib64/cmake/cereal \
-D ENABLE_TBB=on \
-D TBB_DIR=/apps/intel-tools/intel-tbb/2021.13.0/lib/cmake/tbb \
-D CMAKE_C_FLAGS="$CFLAGS" \
-D CMAKE_CXX_FLAGS="$CXXFLAGS" \
-D CMAKE_EXE_LINKER_FLAGS="$LDFLAGS"
cmake --build build/hoomd_gcc_intelmpi -j4
```
### Aspire
#### GCC
- openmpi/4.1.5-gcc11
- cuda/12.2.2

```bash=
module load gcc/11.2.0
module load openmpi/4.1.5-gcc11
export CFLAGS="-O3 -march=native -mtune=native -funroll-loops -fprefetch-loop-arrays -ftree-vectorize -fstack-protector-strong -fno-plt -fPIC -pipe"
export CXXFLAGS="$CFLAGS"
export LDFLAGS="-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--gc-sections"
PATH=hoomdpy312/bin:$PATH \
cmake -B build/hoomd_gcc -S $PWD/hoomd-blue \
-D ENABLE_MPI=on -D MPI_HOME=/app/apps/openmpi/4.1.5-gcc11 \
-D pybind11_DIR=$PWD/hoomdpy312/lib/python3.12/site-packages/pybind11/share/cmake/pybind11 \
-D Eigen3_DIR=$PWD/hoomdpy312/share/eigen3/cmake \
-D cereal_DIR=$PWD/hoomdpy312/lib64/cmake/cereal \
-D CMAKE_C_FLAGS="$CFLAGS" \
-D CMAKE_CXX_FLAGS="$CXXFLAGS" \
-D CMAKE_EXE_LINKER_FLAGS="$LDFLAGS"
cmake --build build/hoomd_gcc -j4
```
#### AOCC 4.0.0
```bash=
module load aocc/4.0.0
module load openmpi/4.1.5-aocc4
export CXXFLAGS="-O3 -march=znver3 -ffast-math -flto -fopenmp -fvectorize"
export CFLAGS="$CXXFLAGS"
PATH=hoomdpy312/bin:$PATH \
cmake -B build/hoomd_aocc -S $PWD/hoomd-blue \
-D CMAKE_C_COMPILER=clang \
-D CMAKE_CXX_COMPILER=clang++ \
-D MPI_C_COMPILER=mpicc \
-D MPI_CXX_COMPILER=mpicxx \
-D ENABLE_MPI=on -D MPI_HOME=$MPI_HOME \
-D pybind11_DIR=$PWD/hoomdpy312/lib/python3.12/site-packages/pybind11/share/cmake/pybind11 \
-D Eigen3_DIR=$PWD/hoomdpy312/share/eigen3/cmake \
-D cereal_DIR=$PWD/hoomdpy312/lib64/cmake/cereal \
-D CMAKE_CXX_FLAGS="$CXXFLAGS" \
-D CMAKE_C_FLAGS="$CFLAGS"
cmake --build build/hoomd_aocc -j4
```
#### Cray MPICH
No compiler is selected explicitly here; `-h aggress`, `-h vector3`, and `-hfp3` are Cray CCE flags, so this configure presumably relies on the Cray compiler wrappers being the default compilers in the environment.
```bash=
export CXXFLAGS="-Ofast -h aggress -h vector3 -hfp3"
export CFLAGS="$CXXFLAGS"
PATH=hoomdmpich/bin:$PATH \
cmake -B build/hoomd_mpich_v2 -S $PWD/hoomd-blue \
-D ENABLE_MPI=on \
-D pybind11_DIR=$PWD/hoomdmpich/lib/python3.12/site-packages/pybind11/share/cmake/pybind11 \
-D Eigen3_DIR=$PWD/hoomdmpich/share/eigen3/cmake \
-D cereal_DIR=$PWD/hoomdmpich/lib64/cmake/cereal
cmake --build build/hoomd_mpich_v2 -j4
```
## Run scripts
- Benchmark case: `hoomd_benchmarks.md_pair_wca`
- GPUs: 2
### GSD file (N=10000)
GSD (General Simulation Data) is the file format HOOMD-blue uses to store and exchange particle simulation data. Running the benchmark with zero warmup and benchmark steps, as below, just generates and validates the N=10000 initial configuration file.
```bash=
#!/bin/bash
#PBS -N test
#PBS -q gpuvolta
#PBS -l ncpus=24
#PBS -l ngpus=2
#PBS -l mem=96gb
#PBS -l walltime=00:10:00
#PBS -P pi13
#PBS -j oe
#PBS -o output_2.log
#PBS -e error.log
module load cuda openmpi
module load gcc/12.2.0
module load llvm/16.0.4 intel-tbb/2021.13.0
source /home/552/es6155/miniconda3/bin/activate /home/552/es6155/hoomdpy312cuda
nvidia-smi
export PYTHONPATH=$PWD/build/hoomd_cuda:$PWD/hoomd-benchmarks
mpirun \
-map-by ppr:2:node -oversubscribe \
-x PYTHONPATH \
/home/552/es6155/hoomdpy312cuda/bin/python3 \
-m hoomd_benchmarks.md_pair_wca \
--device GPU -N 10000 -v --warmup_steps 0 --benchmark_steps 0
```
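Once the job above has written the configuration file, the `gsd` package installed earlier can read it back as a quick sanity check. A minimal sketch, assuming a placeholder file name (substitute whatever hoomd-benchmarks actually wrote):
```bash=
# Inspect a GSD file with the gsd Python package (the file name is a placeholder).
python3 - <<'EOF'
import gsd.hoomd

# gsd >= 2.9 accepts mode='r'; older releases expect 'rb'
with gsd.hoomd.open('initial_configuration.gsd', mode='r') as traj:
    frame = traj[0]                            # first stored frame
    print('particles:', frame.particles.N)
    print('box:', frame.configuration.box)     # [Lx, Ly, Lz, xy, xz, yz]
EOF
```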
### Benchmarks
Same script as above; only the last line differs (this is the actual benchmark run).
```bash=
#!/bin/bash
#PBS -N test
#PBS -q gpuvolta
#PBS -l ncpus=24
#PBS -l ngpus=2
#PBS -l mem=96gb
#PBS -l walltime=00:10:00
#PBS -P pi13
#PBS -j oe
#PBS -o output_2.log
#PBS -e error.log
module load cuda openmpi
module load gcc/12.2.0
module load llvm/16.0.4 intel-tbb/2021.13.0
source /home/552/es6155/miniconda3/bin/activate /home/552/es6155/hoomdpy312cuda
nvidia-smi
export PYTHONPATH=$PWD/build/hoomd_cuda:$PWD/hoomd-benchmarks
mpirun \
-map-by ppr:2:node -oversubscribe \
-x PYTHONPATH \
/home/552/es6155/hoomdpy312cuda/bin/python3 \
-m hoomd_benchmarks.md_pair_wca \
--device GPU -N 1000000 -v --benchmark_steps 100
```


### Intel (CPU)
```bash=
#!/bin/bash
#PBS -N test
#PBS -q normal
#PBS -l ncpus=1536
#PBS -l mem=400gb
#PBS -l walltime=00:10:00
#PBS -P pi13
#PBS -j oe
#PBS -o intel_v2_1536_2_2_c.log
#PBS -e error.log
module purge
module load intel-compiler-llvm/2024.2.0 intel-mpi/2021.13.0
source /home/552/es6155/miniconda3/bin/activate /home/552/es6155/hoomdintel
export PYTHONPATH=$PWD/build/hoomd_intel_v2:$PWD/hoomd-benchmarks
export LD_PRELOAD=/apps/intel-tools/intel-mpi/2021.13.0/lib/release/libmpi.so
export I_MPI_PIN_DOMAIN=auto
export I_MPI_PIN_ORDER=scatter
export I_MPI_PIN_PROCESSOR_LIST=allcores
export OMP_NUM_THREADS=1
numactl --interleave=all \
mpirun -n 1536 -ppn 2 \
-genv PYTHONPATH $PYTHONPATH \
-genv I_MPI_PIN_DOMAIN $I_MPI_PIN_DOMAIN \
-genv I_MPI_PIN_ORDER $I_MPI_PIN_ORDER \
-genv OMP_NUM_THREADS $OMP_NUM_THREADS \
python3 -m hoomd_benchmarks.md_pair_wca \
--device CPU -v \
-N 200000 \
--warmup_steps 10000 --benchmark_steps 320000
```
### Config UCC
```bash=
./configure --prefix=$HOME/ucc/built --with-ucx=$HOME/ucx/built
make -j8 && make install
```
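Both the UCC configure above and the Open MPI configure below point at a UCX install in `$HOME/ucx/built`. That prefix was presumably produced along these lines (a sketch, not the exact options that were used):
```bash=
# Minimal UCX build into the prefix referenced above (options are assumptions).
cd $HOME/ucx
./autogen.sh                          # only needed for a git checkout, not a release tarball
./configure --prefix=$HOME/ucx/built
make -j8 && make install
```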
### Config OpenMPI
```bash=
module load intel-compiler-llvm;
module load ucx;
module load ucc;
CC="icx -O3 -Wno-tautological-constant-compare -xHost -Wno-error=incompatible-function-pointer-types"
CXX="icpx -O3 -Wno-tautological-constant-compare -xHost -Wno-error=incompatible-function-pointer-types"
FC="ifx -O3 -fPIC -xHost"
./configure \
--prefix=$HOME/ompi/built \
--enable-shared \
--enable-mpi-fortran \
--with-tm=/opt/pbs/default \
--with-pbs \
--with-ucx=$HOME/ucx/built \
--with-libevent=internal \
--enable-mpi1-compatibility \
--with-knem=/opt/knem \
--without-xpmem \
--without-verbs \
--enable-mca-static \
--with-hcoll=/apps/hcoll/4.8.3220 \
--with-lustre \
--with-io-romio-flags=--with-file-system=lustre+ufs \
--enable-orterun-prefix-by-default \
--with-platform=$HOME/ompi/contrib/platform/mellanox/optimized \
--with-ucc=$HOME/ucc/built \
--enable-sparse-groups \
--with-zlib
```
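After configuring, build and install, then confirm the UCX and UCC components were actually compiled in (standard Open MPI steps; the grep is just a quick check):
```bash=
make -j16 all && make install
# ompi_info lists the compiled-in MCA components; the ucx pml and ucc coll entries should appear.
$HOME/ompi/built/bin/ompi_info | grep -i -E 'ucx|ucc'
```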