# Binding and mapping on Leonardo Booster
- Fill in all the `XXX` placeholders in the desired SLURM template.
- The GPU binder writes a report to the standard output. If, for any reason, this is not the desired behaviour, redirect it or comment out the `echo [...]` line.
- An exact ratio between the number of processes per node and the number of GPUs per node is assumed, i.e. the tasks on each node are divided evenly among its GPUs (see the sketch after this list).
- When using less than a full node [e.g. 3 GPUs], do _not_ use `#SBATCH --exclusive`, which would override the choices above.
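A minimal sketch of the mapping that the binder scripts below implement, assuming a hypothetical job with 8 tasks and 4 GPUs per node: consecutive local ranks share a GPU via integer division.
```bash=
# Sketch only: hypothetical values, 8 tasks and 4 GPUs per node.
# The binder scripts below perform the same integer division per task.
NTASKS_PER_NODE=8
GPUS_ON_NODE=4
TASKS_PER_GPU=$((NTASKS_PER_NODE/GPUS_ON_NODE))   # = 2
for LOCALID in $(seq 0 $((NTASKS_PER_NODE-1))); do
    echo "local rank ${LOCALID} -> GPU $((LOCALID/TASKS_PER_GPU))"
done
```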
### SLURM and SRUN based
```bash=
#!/bin/sh
#SBATCH --partition=boost_usr_prod
#SBATCH --nodes=XXX
#SBATCH --gres=gpu:XXX
#SBATCH --ntasks-per-node=XXX
#SBATCH --cpus-per-task=XXX
#SBATCH --time=XXX
#SBATCH --output=XXX-%j.out
#
# Setup the environment
#
module purge
XXX
#
# GPU binding
#
GPU_BINDER=cin_gpu_binder.sh
TASKS_PER_GPU=$((SLURM_NTASKS_PER_NODE/SLURM_GPUS_ON_NODE))
cat > ${GPU_BINDER} <<-EOF
#!/usr/bin/env bash
NODEID=\${SLURM_NODEID}
TASKS_PER_GPU=${TASKS_PER_GPU}
MY_GPU=\$((SLURM_LOCALID/TASKS_PER_GPU))
export CUDA_VISIBLE_DEVICES=\$MY_GPU
BINDINGS="task \${SLURM_LOCALID}, bound to GPU \${CUDA_VISIBLE_DEVICES} : \$(taskset -cp \$\$)"
echo "# [BINDING REPORT, NODE(\${NODEID}) \$(hostname)] \${BINDINGS}"
"\$@"   # launch the wrapped executable with its arguments
EOF
chmod 755 ${GPU_BINDER}
#
# Run application with bindings
#
srun --cpu-bind=cores --cpus-per-task=${SLURM_CPUS_PER_TASK} -m block:block ./${GPU_BINDER} XXX # Executable here
```
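One possible way to fill in the header of the template above. The values (2 nodes, all 4 GPUs of each node, one task per GPU, 8 cores per task, 30 minutes) are illustrative assumptions, not recommendations; the same header also fits the Open MPI variant below.
```bash=
#SBATCH --partition=boost_usr_prod
#SBATCH --nodes=2
#SBATCH --gres=gpu:4
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --time=00:30:00
#SBATCH --output=myjob-%j.out
```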
### SLURM and Open MPI based
```bash=
#!/bin/sh
#SBATCH --partition=boost_usr_prod
#SBATCH --nodes=XXX
#SBATCH --gres=gpu:XXX
#SBATCH --ntasks-per-node=XXX
#SBATCH --cpus-per-task=XXX
#SBATCH --time=XXX
#SBATCH --output=XXX-%j.out
#
# Setup the environment
#
module purge
XXX
#
# GPU binding
#
GPU_BINDER=cin_gpu_binder.sh
TASKS_PER_GPU=$((SLURM_NTASKS_PER_NODE/SLURM_GPUS_ON_NODE))
cat > ${GPU_BINDER} <<-EOF
#!/usr/bin/env bash
NODEID=\$((OMPI_COMM_WORLD_RANK/${SLURM_NTASKS_PER_NODE}))
TASKS_PER_GPU=${TASKS_PER_GPU}
MY_GPU=\$((OMPI_COMM_WORLD_LOCAL_RANK/TASKS_PER_GPU))
export CUDA_VISIBLE_DEVICES=\$MY_GPU
BINDINGS="task \${OMPI_COMM_WORLD_LOCAL_RANK}, bound to GPU \${CUDA_VISIBLE_DEVICES} : \$(taskset -cp \$\$)"
echo "# [BINDING REPORT, NODE(\${NODEID}) \$(hostname)] \${BINDINGS}"
"\$@"   # launch the wrapped executable with its arguments
EOF
chmod 755 ${GPU_BINDER}
#
# Thread binding to cores
#
export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
export OMP_PLACES=cores
export OMP_PROC_BIND=close
#
# Run application with bindings
#
mpirun --report-bindings --map-by ppr:${SLURM_NTASKS_PER_NODE}:node:PE=${SLURM_CPUS_PER_TASK} ./${GPU_BINDER} XXX # Executable here
```
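With the same illustrative values (4 tasks per node, 8 cores per task), the `mpirun` line above expands as follows; `./my_app` is a placeholder for your executable.
```bash=
mpirun --report-bindings --map-by ppr:4:node:PE=8 ./cin_gpu_binder.sh ./my_app
```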
## MPS
Please use only 2, 4 or 8 processes _per GPU_ when using MPS [assuming that 32 processes per node will always be used].
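As a quick check with assumed values (a full Booster node, 4 GPUs, 32 tasks), the oversubscription factor computed by the binder stays within the allowed range:
```bash=
# Sketch: 32 tasks spread over the 4 GPUs of a node give 8 tasks per GPU.
NTASKS_PER_NODE=32
GPUS_ON_NODE=4
echo "tasks per GPU: $((NTASKS_PER_NODE/GPUS_ON_NODE))"   # -> 8
```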
### SLURM and SRUN based
```bash=
#!/bin/sh
#SBATCH --partition=boost_usr_prod
#SBATCH --nodes=XXX
#SBATCH --gres=gpu:XXX
#SBATCH --ntasks-per-node=XXX
#SBATCH --cpus-per-task=XXX
#SBATCH --time=XXX
#SBATCH --output=XXX-%j.out
#
# Setup the environment
#
module purge
XXX
#
# GPU binding (and MPS starter)
#
GPU_BINDER=cin_gpu_binder.sh
TASKS_PER_GPU=$((SLURM_NTASKS_PER_NODE/SLURM_GPUS_ON_NODE))
cat > ${GPU_BINDER} <<-EOF
#!/usr/bin/env bash
NODEID=\${SLURM_NODEID}
export CUDA_MPS_LOG_DIRECTORY=./pipe\${NODEID}
export CUDA_MPS_PIPE_DIRECTORY=./pipe\${NODEID}
if [ \$SLURM_LOCALID -eq 0 ]; then
nvidia-cuda-mps-control -d
fi
TASKS_PER_GPU=${TASKS_PER_GPU}
MY_GPU=\$((SLURM_LOCALID/TASKS_PER_GPU))
export CUDA_VISIBLE_DEVICES=\$MY_GPU
BINDINGS="task \${SLURM_LOCALID}, bound to GPU \${CUDA_VISIBLE_DEVICES} : \$(taskset -cp \$\$)"
echo "# [BINDING REPORT, NODE(\${NODEID}) \$(hostname)] \${BINDINGS}"
"\$@"   # launch the wrapped executable with its arguments
EOF
chmod 755 ${GPU_BINDER}
#
# Run application with bindings
#
srun --cpu-bind=cores --cpus-per-task=${SLURM_CPUS_PER_TASK} -m block:block ./${GPU_BINDER} XXX # Executable here
#
# Stop MPS daemon
#
MPS_STOPPER=cin_stop_mps.sh
cat > ${MPS_STOPPER} <<-EOF
#!/usr/bin/env bash
NODEID=\${SLURM_NODEID}
export CUDA_MPS_PIPE_DIRECTORY=./pipe\${NODEID}   # must match the directory used when starting the daemon
if [ \$SLURM_LOCALID -eq 0 ]; then
echo quit | nvidia-cuda-mps-control
echo "Stopped MPS daemon on node \${NODEID}"
fi
EOF
chmod 755 ${MPS_STOPPER}
srun --cpu-bind=cores --cpus-per-task=${SLURM_CPUS_PER_TASK} -m block:block ${MPS_STOPPER}
```
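A possible header for the MPS template above, again with assumed values: one full node, all 4 GPUs, 32 tasks per node (8 per GPU, within the recommendation above) and one core per task.
```bash=
#SBATCH --partition=boost_usr_prod
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --ntasks-per-node=32
#SBATCH --cpus-per-task=1
#SBATCH --time=00:30:00
#SBATCH --output=myjob_mps-%j.out
```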
### SLURM and Open MPI based
```bash=
#!/bin/sh
#SBATCH --partition=boost_usr_prod
#SBATCH --nodes=XXX
#SBATCH --gres=gpu:XXX
#SBATCH --ntasks-per-node=XXX
#SBATCH --cpus-per-task=XXX
#SBATCH --time=XXX
#SBATCH --output=XXX-%j.out
#
# Setup the environment
#
module purge
XXX
#
# GPU binding (and MPS starter)
#
GPU_BINDER=cin_gpu_binder.sh
TASKS_PER_GPU=$((SLURM_NTASKS_PER_NODE/SLURM_GPUS_ON_NODE))
cat > ${GPU_BINDER} <<-EOF
#!/usr/bin/env bash
NODEID=\$((OMPI_COMM_WORLD_RANK/${SLURM_NTASKS_PER_NODE}))
export CUDA_MPS_LOG_DIRECTORY=./pipe\${NODEID}
export CUDA_MPS_PIPE_DIRECTORY=./pipe\${NODEID}
if [ \$OMPI_COMM_WORLD_LOCAL_RANK -eq 0 ]; then
nvidia-cuda-mps-control -d
fi
TASKS_PER_GPU=${TASKS_PER_GPU}
MY_GPU=\$((OMPI_COMM_WORLD_LOCAL_RANK/TASKS_PER_GPU))
export CUDA_VISIBLE_DEVICES=\$MY_GPU
BINDINGS="task \${OMPI_COMM_WORLD_LOCAL_RANK}, bound to GPU \${CUDA_VISIBLE_DEVICES} : \$(taskset -cp \$\$)"
echo "# [BINDING REPORT, NODE(\${NODEID}) \$(hostname)] \${BINDINGS}"
"\$@"   # launch the wrapped executable with its arguments
EOF
chmod 755 ${GPU_BINDER}
#
# Thread binding to cores
#
export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}
export OMP_PLACES=cores
export OMP_PROC_BIND=close
#
# Run application with bindings
#
mpirun --report-bindings --map-by ppr:${SLURM_NTASKS_PER_NODE}:node:PE=${SLURM_CPUS_PER_TASK} ./${GPU_BINDER} XXX # Executable here
#
# Stop MPS daemon
#
MPS_STOPPER=cin_stop_mps.sh
cat > ${MPS_STOPPER} <<-EOF
#!/usr/bin/env bash
NODEID=\$((OMPI_COMM_WORLD_RANK/${SLURM_NTASKS_PER_NODE}))
export CUDA_MPS_PIPE_DIRECTORY=./pipe\${NODEID}   # must match the directory used when starting the daemon
if [ \$OMPI_COMM_WORLD_LOCAL_RANK -eq 0 ]; then
echo quit | nvidia-cuda-mps-control
echo "Stopped MPS daemon on node \${NODEID}"
fi
EOF
chmod 755 ${MPS_STOPPER}
mpirun --map-by ppr:${SLURM_NTASKS_PER_NODE}:node:PE=${SLURM_CPUS_PER_TASK} ${MPS_STOPPER}
```