# Run cuquantum and qiskit-machine-learning
## Prepare custom image
```
FROM nvcr.io/nvidia/cuquantum-appliance:23.10
# Install qiskit-machine-learning with the pip that belongs to the appliance's
# cuquantum-23.10 conda environment. --no-cache-dir keeps pip's download cache
# out of the image layer.
RUN /home/cuquantum/conda/envs/cuquantum-23.10/bin/pip install --no-cache-dir qiskit-machine-learning==0.6.1
USER root
# Open up /home/cuquantum (rwx for owner, r-x for everyone else), recursively.
# NOTE(review): presumably needed so the conda env is usable when the container
# runs under a different UID (e.g. via Singularity) -- confirm.
# NOTE(review): the image is left running as root; add a final USER instruction
# for the unprivileged account if root is not required at runtime.
RUN chmod 755 -R /home/cuquantum
```
## Build custom image
```
# Rebuild the image from the Dockerfile in the current directory, ignoring cached layers.
docker build --no-cache --tag mycuquantum .
# Show the local images to confirm that mycuquantum was created.
docker images
```
### nvidia-docker
Run in an nvidia-docker-ready environment (DGX-1 V100):
```
# Start an interactive, throw-away container that can see every GPU.
docker run --rm --interactive --tty --gpus all mycuquantum:latest
# Same, but restricted to GPU devices 0 and 3.
docker run --rm --interactive --tty --gpus '"device=0,3"' mycuquantum:latest
```
Inside the container, run `/usr/local/ucx/bin/ucx_info -d` to list the UCX transports and devices.
Run from the project folder:
```
# Mount the current directory at /host_pwd, use it as the working directory, and run the benchmark.
# "$PWD" is quoted so a project path containing spaces reaches -v as a single argument.
docker run --gpus all -v "$PWD":/host_pwd -w /host_pwd --rm mycuquantum:latest python benchmark_qsvm_tnsm-mpi_demo3.py
```
```
# Run ucx_info -d inside the container, with the current directory mounted at /host_pwd.
# "$PWD" is quoted so a project path containing spaces reaches -v as a single argument.
docker run --gpus all -v "$PWD":/host_pwd -w /host_pwd --rm mycuquantum:latest /usr/local/ucx/bin/ucx_info -d
```
## singularity
```
# Build cuquantum-qiskit.sif from the mycuquantum:latest image held by the local Docker daemon.
singularity build cuquantum-qiskit.sif docker-daemon://mycuquantum:latest
# Build a sandbox named cuquantum-qiskit from that SIF file.
singularity build --sandbox cuquantum-qiskit cuquantum-qiskit.sif
```
or
```
# Build the cuquantum-qiskit sandbox directly from the Docker daemon image, without an intermediate SIF file.
singularity build --sandbox cuquantum-qiskit docker-daemon://mycuquantum:latest
```
### DGX1 V100
### Taiwania2 H100
# openmpi
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/fdc7a2bc-b7a8-47eb-8876-de6201297144/l_BaseKit_p_2024.1.0.596_offline.sh
sh ./l_BaseKit_p_2024.1.0.596_offline.sh -a --cli --instance=qusim
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7f096850-dc7b-4c35-90b5-36c12abd9eaa/l_HPCKit_p_2024.1.0.560_offline.sh
sh ./l_HPCKit_p_2024.1.0.560_offline.sh -a --cli --instance=qusim
https://xconfigure.readthedocs.io/en/latest/
apps/oneapi-2024.1/modulefiles-setup.sh --output-dir="$HOME/apps/modulefiles" --ignore-latest
module load tbb/2021.12 compiler-rt/2024.1.0 oclfpga/2024.1.0 compiler/2024.1.0 mkl/2024.1 mpi/2021.12
ompi_info
Configure command line: '--prefix=/home/qusim/gcc8/openmpi-5.0.2'
'--with-slurm=/usr' '--with-pmix=internal'
'--with-libevent=internal' '--with-hwloc=internal'
'--without-xpmem'
'--with-ucx=/home/qusim/gcc8/ucx-1.15'
'--with-knem=/opt/knem-1.1.4.90mlnx2'
'--with-hcoll=/opt/mellanox/hcoll'
'--with-platform=../contrib/platform/mellanox/optimized'
'--enable-sparse-groups' '--disable-dlopen'
'--enable-mpi1-compatibility' '--without-verbs'
Configure command line: '--prefix=/home/qusim/toolchain/build/openmpi-5.0.2/icc'
'--enable-shared' '--enable-static'
'--enable-mpi-fortran' '--with-slurm'
'--with-ucx=/home/qusim/toolchain/build/ucx-1.16/icc'
'--with-ucx-libdir=/home/qusim/toolchain/build/ucx-1.16/icc/lib'
'--enable-mpi1-compatibility' '--with-libevent'
'--with-hwloc=internal'
'--with-knem=/opt/knem-1.1.4.90mlnx2'
'--with-hcoll=/opt/mellanox/hcoll'
'--without-xpmem' '--enable-mca-static'
'--enable-orterun-prefix-by-default'
'--enable-mpirun-prefix-by-default'
'--with-platform=contrib/platform/mellanox/optimized'
'--with-gpfs=/usr/lpp/mmfs'
'--enable-sparse-groups' '--with-zlib'
To install into a named environment, run:
```
conda install package-name=2.3.4 -n some-environment
```
conda config --append envs_dirs /home/cuquantum/conda/envs && source activate cuquantum-23.10
(cuquantum-23.10) cuquantum@3497c9b11f95:~$ /usr/local/openmpi/bin/orte-info
```
(cuquantum-23.10) cuquantum@3497c9b11f95:~$ /usr/local/openmpi/bin/orte-info
Open RTE: 4.1.4rc2
Open RTE repo revision: v4.1.4
Open RTE release date: Unreleased developer copy
Prefix: /usr/local/openmpi
Configured architecture: x86_64-pc-linux-gnu
Configure host: buildkitsandbox
Configured by: root
Configured on: Tue Nov 7 20:55:35 UTC 2023
Configure host: buildkitsandbox
Configure command line: '--prefix=/usr/local/openmpi' '--disable-debug'
'--disable-getpwuid' '--disable-mem-debug'
'--disable-mem-profile' '--disable-memchecker'
'--disable-static' '--enable-mca-no-build=btl-uct'
'--enable-mpi1-compatibility' '--enable-oshmem'
'--prefix=/usr/local/openmpi'
'--with-cuda=/usr/local/cuda'
'--with-pmi=/usr/local/pmi'
'--with-pmix=/usr/local/pmix' '--with-slurm'
'--with-ucx=/usr/local/ucx' '--without-verbs'
Built by:
Built on: Tue Nov 7 20:57:17 UTC 2023
Built host: buildkitsandbox
C compiler: gcc
C compiler absolute: /usr/bin/gcc
C compiler family name: GNU
C compiler version: 10.5.0
Thread support: posix (OPAL: yes, ORTE progress: yes, Event lib: yes)
Internal debug support: no
Memory profiling support: no
Memory debugging support: no
dl support: yes
Heterogeneous support: no
orterun default --prefix: no
MPI_WTIME support: native
Symbol vis. support: yes
FT Checkpoint support: no (checkpoint thread: no)
MCA allocator: bucket (MCA v2.1, API v2.0, Component v4.1.4)
MCA allocator: basic (MCA v2.1, API v2.0, Component v4.1.4)
MCA backtrace: execinfo (MCA v2.1, API v2.0, Component v4.1.4)
MCA btl: self (MCA v2.1, API v3.1, Component v4.1.4)
MCA btl: smcuda (MCA v2.1, API v3.1, Component v4.1.4)
MCA btl: vader (MCA v2.1, API v3.1, Component v4.1.4)
MCA btl: tcp (MCA v2.1, API v3.1, Component v4.1.4)
MCA compress: gzip (MCA v2.1, API v2.0, Component v4.1.4)
MCA compress: bzip (MCA v2.1, API v2.0, Component v4.1.4)
MCA crs: none (MCA v2.1, API v2.0, Component v4.1.4)
MCA dl: dlopen (MCA v2.1, API v1.0, Component v4.1.4)
MCA event: external (MCA v2.1, API v2.0, Component v4.1.4)
MCA hwloc: hwloc201 (MCA v2.1, API v2.0, Component v4.1.4)
MCA if: linux_ipv6 (MCA v2.1, API v2.0, Component v4.1.4)
MCA if: posix_ipv4 (MCA v2.1, API v2.0, Component v4.1.4)
MCA installdirs: env (MCA v2.1, API v2.0, Component v4.1.4)
MCA installdirs: config (MCA v2.1, API v2.0, Component v4.1.4)
MCA memory: patcher (MCA v2.1, API v2.0, Component v4.1.4)
MCA mpool: hugepage (MCA v2.1, API v3.0, Component v4.1.4)
MCA patcher: overwrite (MCA v2.1, API v1.0, Component v4.1.4)
MCA pmix: ext3x (MCA v2.1, API v2.0, Component v4.1.4)
MCA pmix: isolated (MCA v2.1, API v2.0, Component v4.1.4)
MCA pmix: s2 (MCA v2.1, API v2.0, Component v4.1.4)
MCA pmix: flux (MCA v2.1, API v2.0, Component v4.1.4)
MCA pstat: linux (MCA v2.1, API v2.0, Component v4.1.4)
MCA rcache: grdma (MCA v2.1, API v3.3, Component v4.1.4)
MCA rcache: gpusm (MCA v2.1, API v3.3, Component v4.1.4)
MCA rcache: rgpusm (MCA v2.1, API v3.3, Component v4.1.4)
MCA reachable: netlink (MCA v2.1, API v2.0, Component v4.1.4)
MCA reachable: weighted (MCA v2.1, API v2.0, Component v4.1.4)
MCA shmem: posix (MCA v2.1, API v2.0, Component v4.1.4)
MCA shmem: mmap (MCA v2.1, API v2.0, Component v4.1.4)
MCA shmem: sysv (MCA v2.1, API v2.0, Component v4.1.4)
MCA timer: linux (MCA v2.1, API v2.0, Component v4.1.4)
MCA errmgr: default_tool (MCA v2.1, API v3.0, Component v4.1.4)
MCA errmgr: default_app (MCA v2.1, API v3.0, Component v4.1.4)
MCA errmgr: default_orted (MCA v2.1, API v3.0, Component v4.1.4)
MCA errmgr: default_hnp (MCA v2.1, API v3.0, Component v4.1.4)
MCA ess: hnp (MCA v2.1, API v3.0, Component v4.1.4)
MCA ess: env (MCA v2.1, API v3.0, Component v4.1.4)
MCA ess: pmi (MCA v2.1, API v3.0, Component v4.1.4)
MCA ess: singleton (MCA v2.1, API v3.0, Component v4.1.4)
MCA ess: slurm (MCA v2.1, API v3.0, Component v4.1.4)
MCA ess: tool (MCA v2.1, API v3.0, Component v4.1.4)
MCA filem: raw (MCA v2.1, API v2.0, Component v4.1.4)
MCA grpcomm: direct (MCA v2.1, API v3.0, Component v4.1.4)
MCA iof: tool (MCA v2.1, API v2.0, Component v4.1.4)
MCA iof: orted (MCA v2.1, API v2.0, Component v4.1.4)
MCA iof: hnp (MCA v2.1, API v2.0, Component v4.1.4)
MCA odls: default (MCA v2.1, API v2.0, Component v4.1.4)
MCA odls: pspawn (MCA v2.1, API v2.0, Component v4.1.4)
MCA oob: tcp (MCA v2.1, API v2.0, Component v4.1.4)
MCA plm: slurm (MCA v2.1, API v2.0, Component v4.1.4)
MCA plm: rsh (MCA v2.1, API v2.0, Component v4.1.4)
MCA plm: isolated (MCA v2.1, API v2.0, Component v4.1.4)
MCA ras: slurm (MCA v2.1, API v2.0, Component v4.1.4)
MCA ras: simulator (MCA v2.1, API v2.0, Component v4.1.4)
MCA regx: fwd (MCA v2.1, API v1.0, Component v4.1.4)
MCA regx: reverse (MCA v2.1, API v1.0, Component v4.1.4)
MCA regx: naive (MCA v2.1, API v1.0, Component v4.1.4)
MCA rmaps: rank_file (MCA v2.1, API v2.0, Component v4.1.4)
MCA rmaps: mindist (MCA v2.1, API v2.0, Component v4.1.4)
MCA rmaps: resilient (MCA v2.1, API v2.0, Component v4.1.4)
MCA rmaps: round_robin (MCA v2.1, API v2.0, Component v4.1.4)
MCA rmaps: ppr (MCA v2.1, API v2.0, Component v4.1.4)
MCA rmaps: seq (MCA v2.1, API v2.0, Component v4.1.4)
MCA rml: oob (MCA v2.1, API v3.0, Component v4.1.4)
MCA routed: radix (MCA v2.1, API v3.0, Component v4.1.4)
MCA routed: direct (MCA v2.1, API v3.0, Component v4.1.4)
MCA routed: binomial (MCA v2.1, API v3.0, Component v4.1.4)
MCA rtc: hwloc (MCA v2.1, API v1.0, Component v4.1.4)
MCA schizo: ompi (MCA v2.1, API v1.0, Component v4.1.4)
MCA schizo: flux (MCA v2.1, API v1.0, Component v4.1.4)
MCA schizo: orte (MCA v2.1, API v1.0, Component v4.1.4)
MCA schizo: slurm (MCA v2.1, API v1.0, Component v4.1.4)
MCA schizo: jsm (MCA v2.1, API v1.0, Component v4.1.4)
MCA state: orted (MCA v2.1, API v1.0, Component v4.1.4)
MCA state: app (MCA v2.1, API v1.0, Component v4.1.4)
MCA state: hnp (MCA v2.1, API v1.0, Component v4.1.4)
MCA state: novm (MCA v2.1, API v1.0, Component v4.1.4)
MCA state: tool (MCA v2.1, API v1.0, Component v4.1.4)
```
module load singularity cuda/12.2 miniconda3
singularity exec --nv /home/p00acy00/images/cuquantum-qiskit.sif mpiexec -n 4 python3 ghz.py
singularity exec --nv -B "$PWD":/tmp --pwd /tmp /home/p00acy00/images/cuquantum-qiskit.sif python benchmark_qsvm_tnsm-mpi_demo3.py
conda create --name cuquantum python=3.10
conda activate cuquantum
conda install -c conda-forge cuquantum=23.10.0.6 cuquantum-python=23.10.0 openmpi=4.1.6 qiskit-machine-learning=0.6.1
conda install -c conda-forge pandas matplotlib mpi4py
```
# Build UCX (v1.15.x branch) and Open MPI 4.1.6 from source, both with CUDA
# support, and install them under $INSTALL.
INSTALL=$HOME/build
# Number of parallel make jobs. The original defined nproc=40 but then invoked
# the `nproc` command through $(nproc), so the setting was never used.
NPROC=40

module purge
# NOTE(review): CUDA_HOME is assumed to be exported by this module -- confirm.
module load cuda/12.2

# ---- UCX ----
cd "$HOME"
git clone -b v1.15.x https://github.com/openucx/ucx.git
cd ucx
./autogen.sh
CC=gcc CXX=g++ FC=gfortran \
./contrib/configure-release \
    --prefix="$INSTALL/ucx-1.15" \
    --with-gdrcopy=/usr \
    --with-cuda="$CUDA_HOME" \
    --with-cuda-libdir="$CUDA_HOME/lib64/stubs" \
    --disable-logging \
    --disable-debug \
    --disable-assertions \
    --disable-dependency-tracking \
    --disable-params-check \
    --disable-doxygen-doc \
    --disable-doxygen-dot \
    --disable-doxygen-man \
    --disable-doxygen-html \
    --disable-doxygen-pdf \
    --without-java \
    --enable-optimizations \
    --enable-shared \
    --enable-static \
    --enable-mt \
    --enable-cma \
    --with-mlx5-dv \
    --with-ib-hw-tm \
    --with-rc \
    --with-ud \
    --with-dc \
    --with-avx
make -j "$NPROC"
make install -j "$NPROC"

# ---- Open MPI, linked against the UCX build above ----
cd "$HOME"
wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.6.tar.gz
tar xf openmpi-4.1.6.tar.gz
cd openmpi-4.1.6
# The knem, hcoll and gpfs paths below are site-specific install locations.
CC=gcc CXX=g++ FC=gfortran \
CFLAGS="-O3 -march=native -fPIC" \
CXXFLAGS="-O3 -march=native -fPIC" \
FCFLAGS="-O3 -march=native -fPIC" \
LDFLAGS="-Wl,--build-id -L/lib64" \
./configure \
    --prefix="$INSTALL/openmpi-4.1.6" \
    --host=x86_64-redhat-linux-gnu \
    --with-cuda="$CUDA_HOME" \
    --with-cuda-libdir="$CUDA_HOME/lib64/stubs" \
    --enable-shared \
    --enable-static \
    --enable-mpi-fortran \
    --with-slurm \
    --with-ucx="$INSTALL/ucx-1.15" \
    --with-ucx-libdir="$INSTALL/ucx-1.15/lib" \
    --enable-mpi1-compatibility \
    --with-libevent \
    --with-hwloc=internal \
    --with-knem=/opt/knem-1.1.3.90mlnx1 \
    --with-hcoll=/opt/mellanox/hcoll \
    --without-xpmem \
    --without-verbs \
    --enable-mca-static \
    --enable-orterun-prefix-by-default \
    --enable-mpirun-prefix-by-default \
    --with-platform=contrib/platform/mellanox/optimized \
    --with-gpfs=/usr/lpp/mmfs \
    --enable-sparse-groups \
    --with-zlib
make -j "$NPROC"
make install -j "$NPROC"
```
mpirun -np 16 singularity exec --nv -B "$PWD":/tmp --pwd /tmp /home/p00acy00/images/cuquantum-qiskit.sif python benchmark_qsvm_tnsm-mpi_demo3.py
# LAMMPS on DGX
```
# Run ./run_lammps.sh from the current directory inside the NGC LAMMPS image, first with Docker,
# then with Singularity. "$PWD" is quoted so paths containing spaces are passed as one argument.
docker run --rm --gpus all --ipc=host -v "$PWD":/host_pwd -w /host_pwd nvcr.io/hpc/lammps:patch_15Jun2023 ./run_lammps.sh
singularity run --nv -B "$PWD":/host_pwd --pwd /host_pwd docker://nvcr.io/hpc/lammps:patch_15Jun2023 ./run_lammps.sh
```
# TWN2
https://docs.sylabs.io/guides/3.8/user-guide/build_a_container.html
module load singularity cuda/12.2
###export UCX_TLS=rc,cuda_copy,cuda_ipc,gdr_copy,sm
cp -r ~/images/cuquantum-qiskit /media/nvme/jobs
mpiexec -n 2 singularity exec --nv -B "$PWD":/tmp --pwd /tmp /media/nvme/jobs/cuquantum-qiskit /home/cuquantum/conda/envs/cuquantum-23.10/bin/python benchmark_qsvm_tnsm-mpi_demo3.py
salloc --partition=gp1d --account=GOV108018 --nodes=2 --ntasks-per-node=1 --gpus-per-node=1
# DGX
https://docs.sylabs.io/guides/latest/user-guide/build_a_container.html
export SINGULARITYENV_UCX_TLS=rc,cuda_copy,cuda_ipc,gdr_copy,sm
singularity exec --nv -w -B "$PWD":/tmp --pwd /tmp /raid/acyang/qsvm/cuquantum-qiskit mpiexec -n 8 /home/cuquantum/conda/envs/cuquantum-23.10/bin/python benchmark_qsvm_tnsm-mpi_demo3.py
singularity shell --nv -w -B "$PWD":/tmp --pwd /tmp /raid/acyang/qsvm/cuquantum-qiskit
singularity exec --nv -B "$PWD":/tmp --pwd /tmp /raid/acyang/qsvm/cuquantum-qiskit python benchmark_qsvm_tnsm-mpi_demo3.py
```
root@b8a4f9b28111:/home/cuquantum# /usr/local/ucx/bin/ucx_info -d
#
# Memory domain: self
# Component: self
# register: unlimited, cost: 0 nsec
# remote key: 0 bytes
#
# Transport: self
# Device: memory0
# Type: loopback
# System device: <unknown>
#
# capabilities:
# bandwidth: 0.00/ppn + 6911.00 MB/sec
# latency: 0 nsec
# overhead: 10 nsec
# put_short: <= 4294967295
# put_bcopy: unlimited
# get_bcopy: unlimited
# am_short: <= 8K
# am_bcopy: <= 8K
# domain: cpu
# atomic_add: 32, 64 bit
# atomic_and: 32, 64 bit
# atomic_or: 32, 64 bit
# atomic_xor: 32, 64 bit
# atomic_fadd: 32, 64 bit
# atomic_fand: 32, 64 bit
# atomic_for: 32, 64 bit
# atomic_fxor: 32, 64 bit
# atomic_swap: 32, 64 bit
# atomic_cswap: 32, 64 bit
# connection: to iface
# device priority: 0
# device num paths: 1
# max eps: inf
# device address: 0 bytes
# iface address: 8 bytes
# error handling: ep_check
#
#
# Memory domain: tcp
# Component: tcp
# register: unlimited, cost: 0 nsec
# remote key: 0 bytes
#
# Transport: tcp
# Device: lo
# Type: network
# System device: <unknown>
#
# capabilities:
# bandwidth: 11.91/ppn + 0.00 MB/sec
# latency: 10960 nsec
# overhead: 50000 nsec
# put_zcopy: <= 18446744073709551590, up to 6 iov
# put_opt_zcopy_align: <= 1
# put_align_mtu: <= 0
# am_short: <= 8K
# am_bcopy: <= 8K
# am_zcopy: <= 64K, up to 6 iov
# am_opt_zcopy_align: <= 1
# am_align_mtu: <= 0
# am header: <= 8037
# connection: to ep, to iface
# device priority: 1
# device num paths: 1
# max eps: 256
# device address: 18 bytes
# iface address: 2 bytes
# ep address: 10 bytes
# error handling: peer failure, ep_check, keepalive
#
# Transport: tcp
# Device: eth0
# Type: network
# System device: <unknown>
#
# capabilities:
# bandwidth: 1131.64/ppn + 0.00 MB/sec
# latency: 5258 nsec
# overhead: 50000 nsec
# put_zcopy: <= 18446744073709551590, up to 6 iov
# put_opt_zcopy_align: <= 1
# put_align_mtu: <= 0
# am_short: <= 8K
# am_bcopy: <= 8K
# am_zcopy: <= 64K, up to 6 iov
# am_opt_zcopy_align: <= 1
# am_align_mtu: <= 0
# am header: <= 8037
# connection: to ep, to iface
# device priority: 0
# device num paths: 1
# max eps: 256
# device address: 6 bytes
# iface address: 2 bytes
# ep address: 10 bytes
# error handling: peer failure, ep_check, keepalive
#
#
# Connection manager: tcp
# max_conn_priv: 2064 bytes
#
# Memory domain: sysv
# Component: sysv
# allocate: unlimited
# remote key: 12 bytes
# rkey_ptr is supported
#
# Transport: sysv
# Device: memory
# Type: intra-node
# System device: <unknown>
#
# capabilities:
# bandwidth: 0.00/ppn + 12179.00 MB/sec
# latency: 80 nsec
# overhead: 10 nsec
# put_short: <= 4294967295
# put_bcopy: unlimited
# get_bcopy: unlimited
# am_short: <= 100
# am_bcopy: <= 8256
# domain: cpu
# atomic_add: 32, 64 bit
# atomic_and: 32, 64 bit
# atomic_or: 32, 64 bit
# atomic_xor: 32, 64 bit
# atomic_fadd: 32, 64 bit
# atomic_fand: 32, 64 bit
# atomic_for: 32, 64 bit
# atomic_fxor: 32, 64 bit
# atomic_swap: 32, 64 bit
# atomic_cswap: 32, 64 bit
# connection: to iface
# device priority: 0
# device num paths: 1
# max eps: inf
# device address: 16 bytes
# iface address: 8 bytes
# error handling: ep_check
#
#
# Memory domain: posix
# Component: posix
# allocate: <= 1G
# remote key: 32 bytes
# rkey_ptr is supported
#
# Transport: posix
# Device: memory
# Type: intra-node
# System device: <unknown>
#
# capabilities:
# bandwidth: 0.00/ppn + 12179.00 MB/sec
# latency: 80 nsec
# overhead: 10 nsec
# put_short: <= 4294967295
# put_bcopy: unlimited
# get_bcopy: unlimited
# am_short: <= 100
# am_bcopy: <= 8256
# domain: cpu
# atomic_add: 32, 64 bit
# atomic_and: 32, 64 bit
# atomic_or: 32, 64 bit
# atomic_xor: 32, 64 bit
# atomic_fadd: 32, 64 bit
# atomic_fand: 32, 64 bit
# atomic_for: 32, 64 bit
# atomic_fxor: 32, 64 bit
# atomic_swap: 32, 64 bit
# atomic_cswap: 32, 64 bit
# connection: to iface
# device priority: 0
# device num paths: 1
# max eps: inf
# device address: 16 bytes
# iface address: 16 bytes
# error handling: ep_check
#
#
# Memory domain: cuda_cpy
# Component: cuda_cpy
# allocate: unlimited
# register: unlimited, cost: 0 nsec
#
# Transport: cuda_copy
# Device: cuda
# Type: accelerator
# System device: <unknown>
#
# capabilities:
# bandwidth: 10000.00/ppn + 0.00 MB/sec
# latency: 8000 nsec
# overhead: 0 nsec
# put_short: <= 4294967295
# put_zcopy: unlimited, up to 1 iov
# put_opt_zcopy_align: <= 1
# put_align_mtu: <= 1
# get_short: <= 4294967295
# get_zcopy: unlimited, up to 1 iov
# get_opt_zcopy_align: <= 1
# get_align_mtu: <= 1
# connection: to iface
# device priority: 0
# device num paths: 1
# max eps: inf
# device address: 0 bytes
# iface address: 8 bytes
# error handling: none
#
#
# Memory domain: cuda_ipc
# Component: cuda_ipc
# register: unlimited, cost: 0 nsec
# remote key: 112 bytes
# memory invalidation is supported
#
# Transport: cuda_ipc
# Device: cuda
# Type: intra-node
# System device: <unknown>
#
# capabilities:
# bandwidth: 250000.00/ppn + 0.00 MB/sec
# latency: 1 nsec
# overhead: 0 nsec
# put_zcopy: unlimited, up to 1 iov
# put_opt_zcopy_align: <= 1
# put_align_mtu: <= 1
# get_zcopy: unlimited, up to 1 iov
# get_opt_zcopy_align: <= 1
# get_align_mtu: <= 1
# connection: to iface
# device priority: 0
# device num paths: 1
# max eps: inf
# device address: 8 bytes
# iface address: 4 bytes
# error handling: peer failure, ep_check
#
# < failed to open connection manager rdmacm >
root@b8a4f9b28111:/home/cuquantum# echo $UCX_TLS
rc,cuda_copy,cuda_ipc,gdr_copy,sm
```
```
acyang@DGX101:~/qsvm$ export SINGULARITYENV_UCX_TLS=rc,cuda_copy,cuda_ipc,gdr_copy,sm
acyang@DGX101:~/qsvm$ echo $SINGULARITYENV_UCX_TLS
rc,cuda_copy,cuda_ipc,gdr_copy,sm
acyang@DGX101:~/qsvm$ singularity shell --nv -w -B $PWD:/tmp --pwd /tmp /raid/acyang/qsvm/cuquantum-qiskit
INFO: Setting 'NVIDIA_VISIBLE_DEVICES=all' to emulate legacy GPU binding.
WARNING: Skipping mount /etc/localtime [binds]: /etc/localtime doesn't exist in container
Singularity> echo $UCX_TLS
rc,cuda_copy,cuda_ipc,gdr_copy,sm
Singularity> /usr/local/ucx/bin/ucx_info -d
#
# Memory domain: self
# Component: self
# register: unlimited, cost: 0 nsec
# remote key: 0 bytes
#
# Transport: self
# Device: memory0
# Type: loopback
# System device: <unknown>
#
# capabilities:
# bandwidth: 0.00/ppn + 6911.00 MB/sec
# latency: 0 nsec
# overhead: 10 nsec
# put_short: <= 4294967295
# put_bcopy: unlimited
# get_bcopy: unlimited
# am_short: <= 8K
# am_bcopy: <= 8K
# domain: cpu
# atomic_add: 32, 64 bit
# atomic_and: 32, 64 bit
# atomic_or: 32, 64 bit
# atomic_xor: 32, 64 bit
# atomic_fadd: 32, 64 bit
# atomic_fand: 32, 64 bit
# atomic_for: 32, 64 bit
# atomic_fxor: 32, 64 bit
# atomic_swap: 32, 64 bit
# atomic_cswap: 32, 64 bit
# connection: to iface
# device priority: 0
# device num paths: 1
# max eps: inf
# device address: 0 bytes
# iface address: 8 bytes
# error handling: ep_check
#
#
# Memory domain: tcp
# Component: tcp
# register: unlimited, cost: 0 nsec
# remote key: 0 bytes
#
# Transport: tcp
# Device: ibs1
# Type: network
# System device: <unknown>
#
# capabilities:
# bandwidth: 11142.51/ppn + 0.00 MB/sec
# latency: 5206 nsec
# overhead: 50000 nsec
# put_zcopy: <= 18446744073709551590, up to 6 iov
# put_opt_zcopy_align: <= 1
# put_align_mtu: <= 0
# am_short: <= 8K
# am_bcopy: <= 8K
# am_zcopy: <= 64K, up to 6 iov
# am_opt_zcopy_align: <= 1
# am_align_mtu: <= 0
# am header: <= 8037
# connection: to ep, to iface
# device priority: 1
# device num paths: 1
# max eps: 256
# device address: 6 bytes
# iface address: 2 bytes
# ep address: 10 bytes
# error handling: peer failure, ep_check, keepalive
#
# Transport: tcp
# Device: lo
# Type: network
# System device: <unknown>
#
# capabilities:
# bandwidth: 11.91/ppn + 0.00 MB/sec
# latency: 10960 nsec
# overhead: 50000 nsec
# put_zcopy: <= 18446744073709551590, up to 6 iov
# put_opt_zcopy_align: <= 1
# put_align_mtu: <= 0
# am_short: <= 8K
# am_bcopy: <= 8K
# am_zcopy: <= 64K, up to 6 iov
# am_opt_zcopy_align: <= 1
# am_align_mtu: <= 0
# am header: <= 8037
# connection: to ep, to iface
# device priority: 1
# device num paths: 1
# max eps: 256
# device address: 18 bytes
# iface address: 2 bytes
# ep address: 10 bytes
# error handling: peer failure, ep_check, keepalive
#
# Transport: tcp
# Device: enp1s0f1
# Type: network
# System device: <unknown>
#
# capabilities:
# bandwidth: 1131.64/ppn + 0.00 MB/sec
# latency: 5258 nsec
# overhead: 50000 nsec
# put_zcopy: <= 18446744073709551590, up to 6 iov
# put_opt_zcopy_align: <= 1
# put_align_mtu: <= 0
# am_short: <= 8K
# am_bcopy: <= 8K
# am_zcopy: <= 64K, up to 6 iov
# am_opt_zcopy_align: <= 1
# am_align_mtu: <= 0
# am header: <= 8037
# connection: to ep, to iface
# device priority: 0
# device num paths: 1
# max eps: 256
# device address: 6 bytes
# iface address: 2 bytes
# ep address: 10 bytes
# error handling: peer failure, ep_check, keepalive
#
#
# Connection manager: tcp
# max_conn_priv: 2064 bytes
#
# Memory domain: sysv
# Component: sysv
# allocate: unlimited
# remote key: 12 bytes
# rkey_ptr is supported
#
# Transport: sysv
# Device: memory
# Type: intra-node
# System device: <unknown>
#
# capabilities:
# bandwidth: 0.00/ppn + 12179.00 MB/sec
# latency: 80 nsec
# overhead: 10 nsec
# put_short: <= 4294967295
# put_bcopy: unlimited
# get_bcopy: unlimited
# am_short: <= 100
# am_bcopy: <= 8256
# domain: cpu
# atomic_add: 32, 64 bit
# atomic_and: 32, 64 bit
# atomic_or: 32, 64 bit
# atomic_xor: 32, 64 bit
# atomic_fadd: 32, 64 bit
# atomic_fand: 32, 64 bit
# atomic_for: 32, 64 bit
# atomic_fxor: 32, 64 bit
# atomic_swap: 32, 64 bit
# atomic_cswap: 32, 64 bit
# connection: to iface
# device priority: 0
# device num paths: 1
# max eps: inf
# device address: 8 bytes
# iface address: 8 bytes
# error handling: ep_check
#
#
# Memory domain: posix
# Component: posix
# allocate: <= 264124268K
# remote key: 24 bytes
# rkey_ptr is supported
#
# Transport: posix
# Device: memory
# Type: intra-node
# System device: <unknown>
#
# capabilities:
# bandwidth: 0.00/ppn + 12179.00 MB/sec
# latency: 80 nsec
# overhead: 10 nsec
# put_short: <= 4294967295
# put_bcopy: unlimited
# get_bcopy: unlimited
# am_short: <= 100
# am_bcopy: <= 8256
# domain: cpu
# atomic_add: 32, 64 bit
# atomic_and: 32, 64 bit
# atomic_or: 32, 64 bit
# atomic_xor: 32, 64 bit
# atomic_fadd: 32, 64 bit
# atomic_fand: 32, 64 bit
# atomic_for: 32, 64 bit
# atomic_fxor: 32, 64 bit
# atomic_swap: 32, 64 bit
# atomic_cswap: 32, 64 bit
# connection: to iface
# device priority: 0
# device num paths: 1
# max eps: inf
# device address: 8 bytes
# iface address: 8 bytes
# error handling: ep_check
#
#
# Memory domain: cuda_cpy
# Component: cuda_cpy
# allocate: unlimited
# register: unlimited, cost: 0 nsec
#
# Transport: cuda_copy
# Device: cuda
# Type: accelerator
# System device: <unknown>
#
# capabilities:
# bandwidth: 10000.00/ppn + 0.00 MB/sec
# latency: 8000 nsec
# overhead: 0 nsec
# put_short: <= 4294967295
# put_zcopy: unlimited, up to 1 iov
# put_opt_zcopy_align: <= 1
# put_align_mtu: <= 1
# get_short: <= 4294967295
# get_zcopy: unlimited, up to 1 iov
# get_opt_zcopy_align: <= 1
# get_align_mtu: <= 1
# connection: to iface
# device priority: 0
# device num paths: 1
# max eps: inf
# device address: 0 bytes
# iface address: 8 bytes
# error handling: none
#
#
# Memory domain: cuda_ipc
# Component: cuda_ipc
# register: unlimited, cost: 0 nsec
# remote key: 112 bytes
# memory invalidation is supported
#
# Transport: cuda_ipc
# Device: cuda
# Type: intra-node
# System device: <unknown>
#
# capabilities:
# bandwidth: 250000.00/ppn + 0.00 MB/sec
# latency: 1 nsec
# overhead: 0 nsec
# put_zcopy: unlimited, up to 1 iov
# put_opt_zcopy_align: <= 1
# put_align_mtu: <= 1
# get_zcopy: unlimited, up to 1 iov
# get_opt_zcopy_align: <= 1
# get_align_mtu: <= 1
# connection: to iface
# device priority: 0
# device num paths: 1
# max eps: inf
# device address: 8 bytes
# iface address: 4 bytes
# error handling: peer failure, ep_check
#
#
# Memory domain: mlx5_0
# Component: ib
# register: unlimited, cost: 180 nsec
# remote key: 8 bytes
# local memory handle is required for zcopy
#
# Transport: dc_mlx5
# Device: mlx5_0:1
# Type: network
# System device: mlx5_0 (8)
#
# capabilities:
# bandwidth: 11794.23/ppn + 0.00 MB/sec
# latency: 660 nsec
# overhead: 40 nsec
# put_short: <= 172
# put_bcopy: <= 8256
# put_zcopy: <= 1G, up to 11 iov
# put_opt_zcopy_align: <= 512
# put_align_mtu: <= 4K
# get_bcopy: <= 8256
# get_zcopy: 65..1G, up to 11 iov
# get_opt_zcopy_align: <= 512
# get_align_mtu: <= 4K
# am_short: <= 186
# am_bcopy: <= 8254
# am_zcopy: <= 8254, up to 3 iov
# am_opt_zcopy_align: <= 512
# am_align_mtu: <= 4K
# am header: <= 138
# domain: device
# atomic_add: 32, 64 bit
# atomic_and: 32, 64 bit
# atomic_or: 32, 64 bit
# atomic_xor: 32, 64 bit
# atomic_fadd: 32, 64 bit
# atomic_fand: 32, 64 bit
# atomic_for: 32, 64 bit
# atomic_fxor: 32, 64 bit
# atomic_swap: 32, 64 bit
# atomic_cswap: 32, 64 bit
# connection: to iface
# device priority: 30
# device num paths: 1
# max eps: inf
# device address: 3 bytes
# iface address: 5 bytes
# error handling: buffer (zcopy), remote access, peer failure, ep_check
#
#
# Transport: rc_verbs
# Device: mlx5_0:1
# Type: network
# System device: mlx5_0 (8)
#
# capabilities:
# bandwidth: 11794.23/ppn + 0.00 MB/sec
# latency: 600 + 1.000 * N nsec
# overhead: 75 nsec
# put_short: <= 124
# put_bcopy: <= 8256
# put_zcopy: <= 1G, up to 5 iov
# put_opt_zcopy_align: <= 512
# put_align_mtu: <= 4K
# get_bcopy: <= 8256
# get_zcopy: 65..1G, up to 5 iov
# get_opt_zcopy_align: <= 512
# get_align_mtu: <= 4K
# am_short: <= 123
# am_bcopy: <= 8255
# am_zcopy: <= 8255, up to 4 iov
# am_opt_zcopy_align: <= 512
# am_align_mtu: <= 4K
# am header: <= 127
# domain: device
# atomic_add: 64 bit
# atomic_fadd: 64 bit
# atomic_cswap: 64 bit
# connection: to ep
# device priority: 30
# device num paths: 1
# max eps: 256
# device address: 3 bytes
# ep address: 5 bytes
# error handling: peer failure, ep_check
#
#
# Transport: rc_mlx5
# Device: mlx5_0:1
# Type: network
# System device: mlx5_0 (8)
#
# capabilities:
# bandwidth: 11794.23/ppn + 0.00 MB/sec
# latency: 600 + 1.000 * N nsec
# overhead: 40 nsec
# put_short: <= 220
# put_bcopy: <= 8256
# put_zcopy: <= 1G, up to 14 iov
# put_opt_zcopy_align: <= 512
# put_align_mtu: <= 4K
# get_bcopy: <= 8256
# get_zcopy: 65..1G, up to 14 iov
# get_opt_zcopy_align: <= 512
# get_align_mtu: <= 4K
# am_short: <= 234
# am_bcopy: <= 8254
# am_zcopy: <= 8254, up to 3 iov
# am_opt_zcopy_align: <= 512
# am_align_mtu: <= 4K
# am header: <= 186
# domain: device
# atomic_add: 32, 64 bit
# atomic_and: 32, 64 bit
# atomic_or: 32, 64 bit
# atomic_xor: 32, 64 bit
# atomic_fadd: 32, 64 bit
# atomic_fand: 32, 64 bit
# atomic_for: 32, 64 bit
# atomic_fxor: 32, 64 bit
# atomic_swap: 32, 64 bit
# atomic_cswap: 32, 64 bit
# connection: to ep
# device priority: 30
# device num paths: 1
# max eps: 256
# device address: 3 bytes
# ep address: 7 bytes
# error handling: buffer (zcopy), remote access, peer failure, ep_check
#
#
# Transport: ud_verbs
# Device: mlx5_0:1
# Type: network
# System device: mlx5_0 (8)
#
# capabilities:
# bandwidth: 11794.23/ppn + 0.00 MB/sec
# latency: 630 nsec
# overhead: 105 nsec
# am_short: <= 116
# am_bcopy: <= 4088
# am_zcopy: <= 4088, up to 5 iov
# am_opt_zcopy_align: <= 512
# am_align_mtu: <= 4K
# am header: <= 3952
# connection: to ep, to iface
# device priority: 30
# device num paths: 1
# max eps: inf
# device address: 3 bytes
# iface address: 3 bytes
# ep address: 6 bytes
# error handling: peer failure, ep_check
#
#
# Transport: ud_mlx5
# Device: mlx5_0:1
# Type: network
# System device: mlx5_0 (8)
#
# capabilities:
# bandwidth: 11794.23/ppn + 0.00 MB/sec
# latency: 630 nsec
# overhead: 80 nsec
# am_short: <= 180
# am_bcopy: <= 4088
# am_zcopy: <= 4088, up to 3 iov
# am_opt_zcopy_align: <= 512
# am_align_mtu: <= 4K
# am header: <= 132
# connection: to ep, to iface
# device priority: 30
# device num paths: 1
# max eps: inf
# device address: 3 bytes
# iface address: 3 bytes
# ep address: 6 bytes
# error handling: peer failure, ep_check
#
#
# Memory domain: mlx5_1
# Component: ib
# register: unlimited, cost: 180 nsec
# remote key: 8 bytes
# local memory handle is required for zcopy
#
# Transport: dc_mlx5
# Device: mlx5_1:1
# Type: network
# System device: mlx5_1 (9)
#
# capabilities:
# bandwidth: 11794.23/ppn + 0.00 MB/sec
# latency: 660 nsec
# overhead: 40 nsec
# put_short: <= 172
# put_bcopy: <= 8256
# put_zcopy: <= 1G, up to 11 iov
# put_opt_zcopy_align: <= 512
# put_align_mtu: <= 4K
# get_bcopy: <= 8256
# get_zcopy: 65..1G, up to 11 iov
# get_opt_zcopy_align: <= 512
# get_align_mtu: <= 4K
# am_short: <= 186
# am_bcopy: <= 8254
# am_zcopy: <= 8254, up to 3 iov
# am_opt_zcopy_align: <= 512
# am_align_mtu: <= 4K
# am header: <= 138
# domain: device
# atomic_add: 32, 64 bit
# atomic_and: 32, 64 bit
# atomic_or: 32, 64 bit
# atomic_xor: 32, 64 bit
# atomic_fadd: 32, 64 bit
# atomic_fand: 32, 64 bit
# atomic_for: 32, 64 bit
# atomic_fxor: 32, 64 bit
# atomic_swap: 32, 64 bit
# atomic_cswap: 32, 64 bit
# connection: to iface
# device priority: 30
# device num paths: 1
# max eps: inf
# device address: 3 bytes
# iface address: 5 bytes
# error handling: buffer (zcopy), remote access, peer failure, ep_check
#
#
# Transport: rc_verbs
# Device: mlx5_1:1
# Type: network
# System device: mlx5_1 (9)
#
# capabilities:
# bandwidth: 11794.23/ppn + 0.00 MB/sec
# latency: 600 + 1.000 * N nsec
# overhead: 75 nsec
# put_short: <= 124
# put_bcopy: <= 8256
# put_zcopy: <= 1G, up to 5 iov
# put_opt_zcopy_align: <= 512
# put_align_mtu: <= 4K
# get_bcopy: <= 8256
# get_zcopy: 65..1G, up to 5 iov
# get_opt_zcopy_align: <= 512
# get_align_mtu: <= 4K
# am_short: <= 123
# am_bcopy: <= 8255
# am_zcopy: <= 8255, up to 4 iov
# am_opt_zcopy_align: <= 512
# am_align_mtu: <= 4K
# am header: <= 127
# domain: device
# atomic_add: 64 bit
# atomic_fadd: 64 bit
# atomic_cswap: 64 bit
# connection: to ep
# device priority: 30
# device num paths: 1
# max eps: 256
# device address: 3 bytes
# ep address: 5 bytes
# error handling: peer failure, ep_check
#
#
# Transport: rc_mlx5
# Device: mlx5_1:1
# Type: network
# System device: mlx5_1 (9)
#
# capabilities:
# bandwidth: 11794.23/ppn + 0.00 MB/sec
# latency: 600 + 1.000 * N nsec
# overhead: 40 nsec
# put_short: <= 220
# put_bcopy: <= 8256
# put_zcopy: <= 1G, up to 14 iov
# put_opt_zcopy_align: <= 512
# put_align_mtu: <= 4K
# get_bcopy: <= 8256
# get_zcopy: 65..1G, up to 14 iov
# get_opt_zcopy_align: <= 512
# get_align_mtu: <= 4K
# am_short: <= 234
# am_bcopy: <= 8254
# am_zcopy: <= 8254, up to 3 iov
# am_opt_zcopy_align: <= 512
# am_align_mtu: <= 4K
# am header: <= 186
# domain: device
# atomic_add: 32, 64 bit
# atomic_and: 32, 64 bit
# atomic_or: 32, 64 bit
# atomic_xor: 32, 64 bit
# atomic_fadd: 32, 64 bit
# atomic_fand: 32, 64 bit
# atomic_for: 32, 64 bit
# atomic_fxor: 32, 64 bit
# atomic_swap: 32, 64 bit
# atomic_cswap: 32, 64 bit
# connection: to ep
# device priority: 30
# device num paths: 1
# max eps: 256
# device address: 3 bytes
# ep address: 7 bytes
# error handling: buffer (zcopy), remote access, peer failure, ep_check
#
#
# Transport: ud_verbs
# Device: mlx5_1:1
# Type: network
# System device: mlx5_1 (9)
#
# capabilities:
# bandwidth: 11794.23/ppn + 0.00 MB/sec
# latency: 630 nsec
# overhead: 105 nsec
# am_short: <= 116
# am_bcopy: <= 4088
# am_zcopy: <= 4088, up to 5 iov
# am_opt_zcopy_align: <= 512
# am_align_mtu: <= 4K
# am header: <= 3952
# connection: to ep, to iface
# device priority: 30
# device num paths: 1
# max eps: inf
# device address: 3 bytes
# iface address: 3 bytes
# ep address: 6 bytes
# error handling: peer failure, ep_check
#
#
# Transport: ud_mlx5
# Device: mlx5_1:1
# Type: network
# System device: mlx5_1 (9)
#
# capabilities:
# bandwidth: 11794.23/ppn + 0.00 MB/sec
# latency: 630 nsec
# overhead: 80 nsec
# am_short: <= 180
# am_bcopy: <= 4088
# am_zcopy: <= 4088, up to 3 iov
# am_opt_zcopy_align: <= 512
# am_align_mtu: <= 4K
# am header: <= 132
# connection: to ep, to iface
# device priority: 30
# device num paths: 1
# max eps: inf
# device address: 3 bytes
# iface address: 3 bytes
# ep address: 6 bytes
# error handling: peer failure, ep_check
#
#
# Memory domain: mlx5_2
# Component: ib
# register: unlimited, cost: 180 nsec
# remote key: 8 bytes
# local memory handle is required for zcopy
#
# Transport: dc_mlx5
# Device: mlx5_2:1
# Type: network
# System device: mlx5_2 (10)
#
# capabilities:
# bandwidth: 11794.23/ppn + 0.00 MB/sec
# latency: 660 nsec
# overhead: 40 nsec
# put_short: <= 172
# put_bcopy: <= 8256
# put_zcopy: <= 1G, up to 11 iov
# put_opt_zcopy_align: <= 512
# put_align_mtu: <= 4K
# get_bcopy: <= 8256
# get_zcopy: 65..1G, up to 11 iov
# get_opt_zcopy_align: <= 512
# get_align_mtu: <= 4K
# am_short: <= 186
# am_bcopy: <= 8254
# am_zcopy: <= 8254, up to 3 iov
# am_opt_zcopy_align: <= 512
# am_align_mtu: <= 4K
# am header: <= 138
# domain: device
# atomic_add: 32, 64 bit
# atomic_and: 32, 64 bit
# atomic_or: 32, 64 bit
# atomic_xor: 32, 64 bit
# atomic_fadd: 32, 64 bit
# atomic_fand: 32, 64 bit
# atomic_for: 32, 64 bit
# atomic_fxor: 32, 64 bit
# atomic_swap: 32, 64 bit
# atomic_cswap: 32, 64 bit
# connection: to iface
# device priority: 30
# device num paths: 1
# max eps: inf
# device address: 3 bytes
# iface address: 5 bytes
# error handling: buffer (zcopy), remote access, peer failure, ep_check
#
#
# Transport: rc_verbs
# Device: mlx5_2:1
# Type: network
# System device: mlx5_2 (10)
#
# capabilities:
# bandwidth: 11794.23/ppn + 0.00 MB/sec
# latency: 600 + 1.000 * N nsec
# overhead: 75 nsec
# put_short: <= 124
# put_bcopy: <= 8256
# put_zcopy: <= 1G, up to 5 iov
# put_opt_zcopy_align: <= 512
# put_align_mtu: <= 4K
# get_bcopy: <= 8256
# get_zcopy: 65..1G, up to 5 iov
# get_opt_zcopy_align: <= 512
# get_align_mtu: <= 4K
# am_short: <= 123
# am_bcopy: <= 8255
# am_zcopy: <= 8255, up to 4 iov
# am_opt_zcopy_align: <= 512
# am_align_mtu: <= 4K
# am header: <= 127
# domain: device
# atomic_add: 64 bit
# atomic_fadd: 64 bit
# atomic_cswap: 64 bit
# connection: to ep
# device priority: 30
# device num paths: 1
# max eps: 256
# device address: 3 bytes
# ep address: 5 bytes
# error handling: peer failure, ep_check
#
#
# Transport: rc_mlx5
# Device: mlx5_2:1
# Type: network
# System device: mlx5_2 (10)
#
# capabilities:
# bandwidth: 11794.23/ppn + 0.00 MB/sec
# latency: 600 + 1.000 * N nsec
# overhead: 40 nsec
# put_short: <= 220
# put_bcopy: <= 8256
# put_zcopy: <= 1G, up to 14 iov
# put_opt_zcopy_align: <= 512
# put_align_mtu: <= 4K
# get_bcopy: <= 8256
# get_zcopy: 65..1G, up to 14 iov
# get_opt_zcopy_align: <= 512
# get_align_mtu: <= 4K
# am_short: <= 234
# am_bcopy: <= 8254
# am_zcopy: <= 8254, up to 3 iov
# am_opt_zcopy_align: <= 512
# am_align_mtu: <= 4K
# am header: <= 186
# domain: device
# atomic_add: 32, 64 bit
# atomic_and: 32, 64 bit
# atomic_or: 32, 64 bit
# atomic_xor: 32, 64 bit
# atomic_fadd: 32, 64 bit
# atomic_fand: 32, 64 bit
# atomic_for: 32, 64 bit
# atomic_fxor: 32, 64 bit
# atomic_swap: 32, 64 bit
# atomic_cswap: 32, 64 bit
# connection: to ep
# device priority: 30
# device num paths: 1
# max eps: 256
# device address: 3 bytes
# ep address: 7 bytes
# error handling: buffer (zcopy), remote access, peer failure, ep_check
#
#
# Transport: ud_verbs
# Device: mlx5_2:1
# Type: network
# System device: mlx5_2 (10)
#
# capabilities:
# bandwidth: 11794.23/ppn + 0.00 MB/sec
# latency: 630 nsec
# overhead: 105 nsec
# am_short: <= 116
# am_bcopy: <= 4088
# am_zcopy: <= 4088, up to 5 iov
# am_opt_zcopy_align: <= 512
# am_align_mtu: <= 4K
# am header: <= 3952
# connection: to ep, to iface
# device priority: 30
# device num paths: 1
# max eps: inf
# device address: 3 bytes
# iface address: 3 bytes
# ep address: 6 bytes
# error handling: peer failure, ep_check
#
#
# Transport: ud_mlx5
# Device: mlx5_2:1
# Type: network
# System device: mlx5_2 (10)
#
# capabilities:
# bandwidth: 11794.23/ppn + 0.00 MB/sec
# latency: 630 nsec
# overhead: 80 nsec
# am_short: <= 180
# am_bcopy: <= 4088
# am_zcopy: <= 4088, up to 3 iov
# am_opt_zcopy_align: <= 512
# am_align_mtu: <= 4K
# am header: <= 132
# connection: to ep, to iface
# device priority: 30
# device num paths: 1
# max eps: inf
# device address: 3 bytes
# iface address: 3 bytes
# ep address: 6 bytes
# error handling: peer failure, ep_check
#
#
# Memory domain: mlx5_3
# Component: ib
# register: unlimited, cost: 180 nsec
# remote key: 8 bytes
# local memory handle is required for zcopy
# < no supported devices found >
#
# Connection manager: rdmacm
# max_conn_priv: 54 bytes
#
# Memory domain: cma
# Component: cma
# register: unlimited, cost: 9 nsec
#
# Transport: cma
# Device: memory
# Type: intra-node
# System device: <unknown>
#
# capabilities:
# bandwidth: 0.00/ppn + 11145.00 MB/sec
# latency: 80 nsec
# overhead: 2000 nsec
# put_zcopy: unlimited, up to 16 iov
# put_opt_zcopy_align: <= 1
# put_align_mtu: <= 1
# get_zcopy: unlimited, up to 16 iov
# get_opt_zcopy_align: <= 1
# get_align_mtu: <= 1
# connection: to iface
# device priority: 0
# device num paths: 1
# max eps: inf
# device address: 8 bytes
# iface address: 4 bytes
# error handling: peer failure, ep_check
#
```