# Run cuQuantum and qiskit-machine-learning

## Prepare custom image

```dockerfile
FROM nvcr.io/nvidia/cuquantum-appliance:23.10
RUN /home/cuquantum/conda/envs/cuquantum-23.10/bin/pip install qiskit-machine-learning==0.6.1
USER root
RUN chmod 755 -R /home/cuquantum
```

## Build custom image

```bash
docker build -t mycuquantum . --no-cache
docker image ls
```

### nvidia-docker

Run in an nvidia-docker-ready environment (DGX-1 V100):

```bash
docker run --gpus all -it --rm mycuquantum:latest
docker run --gpus '"device=0,3"' -it --rm mycuquantum:latest
```

Inside the interactive container, list the UCX devices and transports with `/usr/local/ucx/bin/ucx_info -d`.

Run from the project folder:

```bash
docker run --gpus all -v $PWD:/host_pwd -w /host_pwd --rm mycuquantum:latest python benchmark_qsvm_tnsm-mpi_demo3.py
```

```bash
docker run --gpus all -v $PWD:/host_pwd -w /host_pwd --rm mycuquantum:latest /usr/local/ucx/bin/ucx_info -d
```

## Singularity

Build a SIF image from the local Docker image, then unpack it into a sandbox:

```bash
singularity build cuquantum-qiskit.sif docker-daemon://mycuquantum:latest
singularity build --sandbox cuquantum-qiskit cuquantum-qiskit.sif
```

or build the sandbox directly from the Docker image:

```bash
singularity build --sandbox cuquantum-qiskit docker-daemon://mycuquantum:latest
```

### DGX-1 V100

### Taiwania2 H100

# Open MPI

Install the Intel oneAPI Base Kit and HPC Kit:

```bash
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/fdc7a2bc-b7a8-47eb-8876-de6201297144/l_BaseKit_p_2024.1.0.596_offline.sh
sh ./l_BaseKit_p_2024.1.0.596_offline.sh -a --cli --instance=qusim
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7f096850-dc7b-4c35-90b5-36c12abd9eaa/l_HPCKit_p_2024.1.0.560_offline.sh
sh ./l_HPCKit_p_2024.1.0.560_offline.sh -a --cli --instance=qusim
```

Reference: https://xconfigure.readthedocs.io/en/latest/

Generate the module files and load the toolchain (use `$HOME` rather than `~`, because the shell does not expand `~` after `=` in an option):

```bash
apps/oneapi-2024.1/modulefiles-setup.sh --output-dir=$HOME/apps/modulefiles --ignore-latest
module load tbb/2021.12 compiler-rt/2024.1.0 oclfpga/2024.1.0 compiler/2024.1.0 mkl/2024.1 mpi/2021.12
```

`ompi_info` output:

Configure command line: '--prefix=/home/qusim/gcc8/openmpi-5.0.2' '--with-slurm=/usr' '--with-pmix=internal' '--with-libevent=internal' '--with-hwloc=internal' '--without-xpmem'
'--with-ucx=/home/qusim/gcc8/ucx-1.15' '--with-knem=/opt/knem-1.1.4.90mlnx2' '--with-hcoll=/opt/mellanox/hcoll' '--with-platform=../contrib/platform/mellanox/optimized' '--enable-sparse-groups' '--disable-dlopen' '--enable-mpi1-compatibility' '--without-verbs' Configure command line: '--prefix=/home/qusim/toolchain/build/openmpi-5.0.2/icc' '--enable-shared' '--enable-static' '--enable-mpi-fortran' '--with-slurm' '--with-ucx=/home/qusim/toolchain/build/ucx-1.16/icc' '--with-ucx-libdir=/home/qusim/toolchain/build/ucx-1.16/icc/lib' '--enable-mpi1-compatibility' '--with-libevent' '--with-hwloc=internal' '--with-knem=/opt/knem-1.1.4.90mlnx2' '--with-hcoll=/opt/mellanox/hcoll' '--without-xpmem' '--enable-mca-static' '--enable-orterun-prefix-by-default' '--enable-mpirun-prefix-by-default' '--with-platform=contrib/platform/mellanox/optimized' '--with-gpfs=/usr/lpp/mmfs' '--enable-sparse-groups' '--with-zlib' To install into a named environment, run: ``` conda install package-name=2.3.4 -n some-environment ``` conda config --append envs_dirs /home/cuquantum/conda/envs && source activate cuquantum-23.10 (cuquantum-23.10) cuquantum@3497c9b11f95:~$ /usr/local/openmpi/bin/orte-info ``` (cuquantum-23.10) cuquantum@3497c9b11f95:~$ /usr/local/openmpi/bin/orte-info Open RTE: 4.1.4rc2 Open RTE repo revision: v4.1.4 Open RTE release date: Unreleased developer copy Prefix: /usr/local/openmpi Configured architecture: x86_64-pc-linux-gnu Configure host: buildkitsandbox Configured by: root Configured on: Tue Nov 7 20:55:35 UTC 2023 Configure host: buildkitsandbox Configure command line: '--prefix=/usr/local/openmpi' '--disable-debug' '--disable-getpwuid' '--disable-mem-debug' '--disable-mem-profile' '--disable-memchecker' '--disable-static' '--enable-mca-no-build=btl-uct' '--enable-mpi1-compatibility' '--enable-oshmem' '--prefix=/usr/local/openmpi' '--with-cuda=/usr/local/cuda' '--with-pmi=/usr/local/pmi' '--with-pmix=/usr/local/pmix' '--with-slurm' '--with-ucx=/usr/local/ucx' 
'--without-verbs' Built by: Built on: Tue Nov 7 20:57:17 UTC 2023 Built host: buildkitsandbox C compiler: gcc C compiler absolute: /usr/bin/gcc C compiler family name: GNU C compiler version: 10.5.0 Thread support: posix (OPAL: yes, ORTE progress: yes, Event lib: yes) Internal debug support: no Memory profiling support: no Memory debugging support: no dl support: yes Heterogeneous support: no orterun default --prefix: no MPI_WTIME support: native Symbol vis. support: yes FT Checkpoint support: no (checkpoint thread: no) MCA allocator: bucket (MCA v2.1, API v2.0, Component v4.1.4) MCA allocator: basic (MCA v2.1, API v2.0, Component v4.1.4) MCA backtrace: execinfo (MCA v2.1, API v2.0, Component v4.1.4) MCA btl: self (MCA v2.1, API v3.1, Component v4.1.4) MCA btl: smcuda (MCA v2.1, API v3.1, Component v4.1.4) MCA btl: vader (MCA v2.1, API v3.1, Component v4.1.4) MCA btl: tcp (MCA v2.1, API v3.1, Component v4.1.4) MCA compress: gzip (MCA v2.1, API v2.0, Component v4.1.4) MCA compress: bzip (MCA v2.1, API v2.0, Component v4.1.4) MCA crs: none (MCA v2.1, API v2.0, Component v4.1.4) MCA dl: dlopen (MCA v2.1, API v1.0, Component v4.1.4) MCA event: external (MCA v2.1, API v2.0, Component v4.1.4) MCA hwloc: hwloc201 (MCA v2.1, API v2.0, Component v4.1.4) MCA if: linux_ipv6 (MCA v2.1, API v2.0, Component v4.1.4) MCA if: posix_ipv4 (MCA v2.1, API v2.0, Component v4.1.4) MCA installdirs: env (MCA v2.1, API v2.0, Component v4.1.4) MCA installdirs: config (MCA v2.1, API v2.0, Component v4.1.4) MCA memory: patcher (MCA v2.1, API v2.0, Component v4.1.4) MCA mpool: hugepage (MCA v2.1, API v3.0, Component v4.1.4) MCA patcher: overwrite (MCA v2.1, API v1.0, Component v4.1.4) MCA pmix: ext3x (MCA v2.1, API v2.0, Component v4.1.4) MCA pmix: isolated (MCA v2.1, API v2.0, Component v4.1.4) MCA pmix: s2 (MCA v2.1, API v2.0, Component v4.1.4) MCA pmix: flux (MCA v2.1, API v2.0, Component v4.1.4) MCA pstat: linux (MCA v2.1, API v2.0, Component v4.1.4) MCA rcache: grdma (MCA v2.1, API v3.3, 
Component v4.1.4) MCA rcache: gpusm (MCA v2.1, API v3.3, Component v4.1.4) MCA rcache: rgpusm (MCA v2.1, API v3.3, Component v4.1.4) MCA reachable: netlink (MCA v2.1, API v2.0, Component v4.1.4) MCA reachable: weighted (MCA v2.1, API v2.0, Component v4.1.4) MCA shmem: posix (MCA v2.1, API v2.0, Component v4.1.4) MCA shmem: mmap (MCA v2.1, API v2.0, Component v4.1.4) MCA shmem: sysv (MCA v2.1, API v2.0, Component v4.1.4) MCA timer: linux (MCA v2.1, API v2.0, Component v4.1.4) MCA errmgr: default_tool (MCA v2.1, API v3.0, Component v4.1.4) MCA errmgr: default_app (MCA v2.1, API v3.0, Component v4.1.4) MCA errmgr: default_orted (MCA v2.1, API v3.0, Component v4.1.4) MCA errmgr: default_hnp (MCA v2.1, API v3.0, Component v4.1.4) MCA ess: hnp (MCA v2.1, API v3.0, Component v4.1.4) MCA ess: env (MCA v2.1, API v3.0, Component v4.1.4) MCA ess: pmi (MCA v2.1, API v3.0, Component v4.1.4) MCA ess: singleton (MCA v2.1, API v3.0, Component v4.1.4) MCA ess: slurm (MCA v2.1, API v3.0, Component v4.1.4) MCA ess: tool (MCA v2.1, API v3.0, Component v4.1.4) MCA filem: raw (MCA v2.1, API v2.0, Component v4.1.4) MCA grpcomm: direct (MCA v2.1, API v3.0, Component v4.1.4) MCA iof: tool (MCA v2.1, API v2.0, Component v4.1.4) MCA iof: orted (MCA v2.1, API v2.0, Component v4.1.4) MCA iof: hnp (MCA v2.1, API v2.0, Component v4.1.4) MCA odls: default (MCA v2.1, API v2.0, Component v4.1.4) MCA odls: pspawn (MCA v2.1, API v2.0, Component v4.1.4) MCA oob: tcp (MCA v2.1, API v2.0, Component v4.1.4) MCA plm: slurm (MCA v2.1, API v2.0, Component v4.1.4) MCA plm: rsh (MCA v2.1, API v2.0, Component v4.1.4) MCA plm: isolated (MCA v2.1, API v2.0, Component v4.1.4) MCA ras: slurm (MCA v2.1, API v2.0, Component v4.1.4) MCA ras: simulator (MCA v2.1, API v2.0, Component v4.1.4) MCA regx: fwd (MCA v2.1, API v1.0, Component v4.1.4) MCA regx: reverse (MCA v2.1, API v1.0, Component v4.1.4) MCA regx: naive (MCA v2.1, API v1.0, Component v4.1.4) MCA rmaps: rank_file (MCA v2.1, API v2.0, Component v4.1.4) MCA 
rmaps: mindist (MCA v2.1, API v2.0, Component v4.1.4) MCA rmaps: resilient (MCA v2.1, API v2.0, Component v4.1.4) MCA rmaps: round_robin (MCA v2.1, API v2.0, Component v4.1.4) MCA rmaps: ppr (MCA v2.1, API v2.0, Component v4.1.4) MCA rmaps: seq (MCA v2.1, API v2.0, Component v4.1.4) MCA rml: oob (MCA v2.1, API v3.0, Component v4.1.4) MCA routed: radix (MCA v2.1, API v3.0, Component v4.1.4) MCA routed: direct (MCA v2.1, API v3.0, Component v4.1.4) MCA routed: binomial (MCA v2.1, API v3.0, Component v4.1.4) MCA rtc: hwloc (MCA v2.1, API v1.0, Component v4.1.4) MCA schizo: ompi (MCA v2.1, API v1.0, Component v4.1.4) MCA schizo: flux (MCA v2.1, API v1.0, Component v4.1.4) MCA schizo: orte (MCA v2.1, API v1.0, Component v4.1.4) MCA schizo: slurm (MCA v2.1, API v1.0, Component v4.1.4) MCA schizo: jsm (MCA v2.1, API v1.0, Component v4.1.4) MCA state: orted (MCA v2.1, API v1.0, Component v4.1.4) MCA state: app (MCA v2.1, API v1.0, Component v4.1.4) MCA state: hnp (MCA v2.1, API v1.0, Component v4.1.4) MCA state: novm (MCA v2.1, API v1.0, Component v4.1.4) MCA state: tool (MCA v2.1, API v1.0, Component v4.1.4) ``` module load singularity cuda/12.2 miniconda3 singularity exec --nv /home/p00acy00/images/cuquantum-qiskit.sif mpiexec -n 4 python3 ghz.py singularity exec --nv -B $PWD:/tmp --pwd /tmp /home/p00acy00/images/cuquantum-qiskit.sif python benchmark_qsvm_tnsm-mpi_demo3.py conda create --name cuquantum python=3.10 conda activate cuquantum conda install -c conda-forge cuquantum=23.10.0.6 cuquantum-python=23.10.0 openmpi=4.1.6 qiskit-machine-learning=0.6.1 conda install -c conda-forge pandas matplotlib mpi4py ``` INSTALL=$HOME/build nproc=40 module purge module load cuda/12.2 cd $HOME git clone -b v1.15.x https://github.com/openucx/ucx.git cd ucx ./autogen.sh CC=gcc CXX=g++ FC=gfortran \ ./contrib/configure-release \ --prefix=$INSTALL/ucx-1.15 \ --with-gdrcopy=/usr \ --with-cuda=$CUDA_HOME \ --with-cuda-libdir=$CUDA_HOME/lib64/stubs \ --disable-logging \ --disable-debug \ 
--disable-assertions \ --disable-dependency-tracking \ --disable-params-check \ --disable-doxygen-doc \ --disable-doxygen-dot \ --disable-doxygen-man \ --disable-doxygen-html \ --disable-doxygen-pdf \ --without-java \ --enable-optimizations \ --enable-shared \ --enable-static \ --enable-mt \ --enable-cma \ --with-mlx5-dv \ --with-ib-hw-tm \ --with-rc \ --with-ud \ --with-dc \ --with-avx make -j $(nproc) make install -j $(nproc) cd $HOME wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.6.tar.gz tar xf openmpi-4.1.6.tar.gz cd openmpi-4.1.6 CC=gcc CXX=g++ FC=gfortran \ CFLAGS="-O3 -march=native -fPIC" \ CXXFLAGS="-O3 -march=native -fPIC" \ FCFLAGS="-O3 -march=native -fPIC" \ LDFLAGS="-Wl,--build-id -L/lib64" \ ./configure \ --prefix=$INSTALL/openmpi-4.1.6 \ --host=x86_64-redhat-linux-gnu \ --with-cuda=$CUDA_HOME \ --with-cuda-libdir=$CUDA_HOME/lib64/stubs \ --enable-shared \ --enable-static \ --enable-mpi-fortran \ --with-slurm \ --with-ucx=$INSTALL/ucx-1.15 \ --with-ucx-libdir=$INSTALL/ucx-1.15/lib \ --enable-mpi1-compatibility \ --with-libevent \ --with-hwloc=internal \ --with-knem=/opt/knem-1.1.3.90mlnx1 \ --with-hcoll=/opt/mellanox/hcoll \ --without-xpmem \ --without-verbs \ --enable-mca-static \ --enable-orterun-prefix-by-default \ --enable-mpirun-prefix-by-default \ --with-platform=contrib/platform/mellanox/optimized \ --with-gpfs=/usr/lpp/mmfs \ --enable-sparse-groups \ --with-zlib make -j $(nproc) make install -j $(nproc) ``` mpirun -np 16 singularity run --nv -B $PWD:/tmp --pwd /tmp /home/p00acy00/images/cuquantum-qiskit.sif python benchmark_qsvm_tnsm-mpi_demo3.py # LAMMPS on DGX ``` docker run --rm --gpus all --ipc=host -v $PWD:/host_pwd -w /host_pwd nvcr.io/hpc/lammps:patch_15Jun2023 ./run_lammps.sh singularity run --nv -B $PWD:/host_pwd --pwd /host_pwd docker://nvcr.io/hpc/lammps:patch_15Jun2023 ./run_lammps.sh ``` # TWN2 https://docs.sylabs.io/guides/3.8/user-guide/build_a_container.html module load singularity cuda/12.2 ###export UCX_TLS=rc,cuda_copy,cuda_ipc,gdr_copy,sm cp -r
~/images/cuquantum-qiskit /media/nvme/jobs mpiexec -n 2 singularity exec --nv -B $PWD:/tmp --pwd /tmp /media/nvme/jobs/cuquantum-qiskit /home/cuquantum/conda/envs/cuquantum-23.10/bin/python benchmark_qsvm_tnsm-mpi_demo3.py salloc --partition=gp1d --account=GOV108018 --nodes=2 --ntasks-per-node=1 --gpus-per-node=1 # DGX https://docs.sylabs.io/guides/latest/user-guide/build_a_container.html export SINGULARITYENV_UCX_TLS=rc,cuda_copy,cuda_ipc,gdr_copy,sm singularity exec --nv -w -B $PWD:/tmp --pwd /tmp /raid/acyang/qsvm/cuquantum-qiskit mpiexec -n 8 /home/cuquantum/conda/envs/cuquantum-23.10/bin/python benchmark_qsvm_tnsm-mpi_demo3.py singularity shell --nv -w -B $PWD:/tmp --pwd /tmp /raid/acyang/qsvm/cuquantum-qiskit singularity exec --nv -B $PWD:/tmp --pwd /tmp /raid/acyang/qsvm/cuquantum-qiskit python benchmark_qsvm_tnsm-mpi_demo3.py ``` root@b8a4f9b28111:/home/cuquantum# /usr/local/ucx/bin/ucx_info -d # # Memory domain: self # Component: self # register: unlimited, cost: 0 nsec # remote key: 0 bytes # # Transport: self # Device: memory0 # Type: loopback # System device: <unknown> # # capabilities: # bandwidth: 0.00/ppn + 6911.00 MB/sec # latency: 0 nsec # overhead: 10 nsec # put_short: <= 4294967295 # put_bcopy: unlimited # get_bcopy: unlimited # am_short: <= 8K # am_bcopy: <= 8K # domain: cpu # atomic_add: 32, 64 bit # atomic_and: 32, 64 bit # atomic_or: 32, 64 bit # atomic_xor: 32, 64 bit # atomic_fadd: 32, 64 bit # atomic_fand: 32, 64 bit # atomic_for: 32, 64 bit # atomic_fxor: 32, 64 bit # atomic_swap: 32, 64 bit # atomic_cswap: 32, 64 bit # connection: to iface # device priority: 0 # device num paths: 1 # max eps: inf # device address: 0 bytes # iface address: 8 bytes # error handling: ep_check # # # Memory domain: tcp # Component: tcp # register: unlimited, cost: 0 nsec # remote key: 0 bytes # # Transport: tcp # Device: lo # Type: network # System device: <unknown> # # capabilities: # bandwidth: 11.91/ppn + 0.00 MB/sec # latency: 10960 nsec # overhead: 50000 
nsec # put_zcopy: <= 18446744073709551590, up to 6 iov # put_opt_zcopy_align: <= 1 # put_align_mtu: <= 0 # am_short: <= 8K # am_bcopy: <= 8K # am_zcopy: <= 64K, up to 6 iov # am_opt_zcopy_align: <= 1 # am_align_mtu: <= 0 # am header: <= 8037 # connection: to ep, to iface # device priority: 1 # device num paths: 1 # max eps: 256 # device address: 18 bytes # iface address: 2 bytes # ep address: 10 bytes # error handling: peer failure, ep_check, keepalive # # Transport: tcp # Device: eth0 # Type: network # System device: <unknown> # # capabilities: # bandwidth: 1131.64/ppn + 0.00 MB/sec # latency: 5258 nsec # overhead: 50000 nsec # put_zcopy: <= 18446744073709551590, up to 6 iov # put_opt_zcopy_align: <= 1 # put_align_mtu: <= 0 # am_short: <= 8K # am_bcopy: <= 8K # am_zcopy: <= 64K, up to 6 iov # am_opt_zcopy_align: <= 1 # am_align_mtu: <= 0 # am header: <= 8037 # connection: to ep, to iface # device priority: 0 # device num paths: 1 # max eps: 256 # device address: 6 bytes # iface address: 2 bytes # ep address: 10 bytes # error handling: peer failure, ep_check, keepalive # # # Connection manager: tcp # max_conn_priv: 2064 bytes # # Memory domain: sysv # Component: sysv # allocate: unlimited # remote key: 12 bytes # rkey_ptr is supported # # Transport: sysv # Device: memory # Type: intra-node # System device: <unknown> # # capabilities: # bandwidth: 0.00/ppn + 12179.00 MB/sec # latency: 80 nsec # overhead: 10 nsec # put_short: <= 4294967295 # put_bcopy: unlimited # get_bcopy: unlimited # am_short: <= 100 # am_bcopy: <= 8256 # domain: cpu # atomic_add: 32, 64 bit # atomic_and: 32, 64 bit # atomic_or: 32, 64 bit # atomic_xor: 32, 64 bit # atomic_fadd: 32, 64 bit # atomic_fand: 32, 64 bit # atomic_for: 32, 64 bit # atomic_fxor: 32, 64 bit # atomic_swap: 32, 64 bit # atomic_cswap: 32, 64 bit # connection: to iface # device priority: 0 # device num paths: 1 # max eps: inf # device address: 16 bytes # iface address: 8 bytes # error handling: ep_check # # # Memory domain: 
posix # Component: posix # allocate: <= 1G # remote key: 32 bytes # rkey_ptr is supported # # Transport: posix # Device: memory # Type: intra-node # System device: <unknown> # # capabilities: # bandwidth: 0.00/ppn + 12179.00 MB/sec # latency: 80 nsec # overhead: 10 nsec # put_short: <= 4294967295 # put_bcopy: unlimited # get_bcopy: unlimited # am_short: <= 100 # am_bcopy: <= 8256 # domain: cpu # atomic_add: 32, 64 bit # atomic_and: 32, 64 bit # atomic_or: 32, 64 bit # atomic_xor: 32, 64 bit # atomic_fadd: 32, 64 bit # atomic_fand: 32, 64 bit # atomic_for: 32, 64 bit # atomic_fxor: 32, 64 bit # atomic_swap: 32, 64 bit # atomic_cswap: 32, 64 bit # connection: to iface # device priority: 0 # device num paths: 1 # max eps: inf # device address: 16 bytes # iface address: 16 bytes # error handling: ep_check # # # Memory domain: cuda_cpy # Component: cuda_cpy # allocate: unlimited # register: unlimited, cost: 0 nsec # # Transport: cuda_copy # Device: cuda # Type: accelerator # System device: <unknown> # # capabilities: # bandwidth: 10000.00/ppn + 0.00 MB/sec # latency: 8000 nsec # overhead: 0 nsec # put_short: <= 4294967295 # put_zcopy: unlimited, up to 1 iov # put_opt_zcopy_align: <= 1 # put_align_mtu: <= 1 # get_short: <= 4294967295 # get_zcopy: unlimited, up to 1 iov # get_opt_zcopy_align: <= 1 # get_align_mtu: <= 1 # connection: to iface # device priority: 0 # device num paths: 1 # max eps: inf # device address: 0 bytes # iface address: 8 bytes # error handling: none # # # Memory domain: cuda_ipc # Component: cuda_ipc # register: unlimited, cost: 0 nsec # remote key: 112 bytes # memory invalidation is supported # # Transport: cuda_ipc # Device: cuda # Type: intra-node # System device: <unknown> # # capabilities: # bandwidth: 250000.00/ppn + 0.00 MB/sec # latency: 1 nsec # overhead: 0 nsec # put_zcopy: unlimited, up to 1 iov # put_opt_zcopy_align: <= 1 # put_align_mtu: <= 1 # get_zcopy: unlimited, up to 1 iov # get_opt_zcopy_align: <= 1 # get_align_mtu: <= 1 # 
connection: to iface # device priority: 0 # device num paths: 1 # max eps: inf # device address: 8 bytes # iface address: 4 bytes # error handling: peer failure, ep_check # # < failed to open connection manager rdmacm > root@b8a4f9b28111:/home/cuquantum# echo $UCX_TLS rc,cuda_copy,cuda_ipc,gdr_copy,sm ``` ``` acyang@DGX101:~/qsvm$ export SINGULARITYENV_UCX_TLS=rc,cuda_copy,cuda_ipc,gdr_copy,sm acyang@DGX101:~/qsvm$ echo $SINGULARITYENV_UCX_TLS rc,cuda_copy,cuda_ipc,gdr_copy,sm acyang@DGX101:~/qsvm$ singularity shell --nv -w -B $PWD:/tmp --pwd /tmp /raid/acyang/qsvm/cuquantum-qiskit INFO: Setting 'NVIDIA_VISIBLE_DEVICES=all' to emulate legacy GPU binding. WARNING: Skipping mount /etc/localtime [binds]: /etc/localtime doesn't exist in container Singularity> echo $UCX_TLS rc,cuda_copy,cuda_ipc,gdr_copy,sm Singularity> /usr/local/ucx/bin/ucx_info -d # # Memory domain: self # Component: self # register: unlimited, cost: 0 nsec # remote key: 0 bytes # # Transport: self # Device: memory0 # Type: loopback # System device: <unknown> # # capabilities: # bandwidth: 0.00/ppn + 6911.00 MB/sec # latency: 0 nsec # overhead: 10 nsec # put_short: <= 4294967295 # put_bcopy: unlimited # get_bcopy: unlimited # am_short: <= 8K # am_bcopy: <= 8K # domain: cpu # atomic_add: 32, 64 bit # atomic_and: 32, 64 bit # atomic_or: 32, 64 bit # atomic_xor: 32, 64 bit # atomic_fadd: 32, 64 bit # atomic_fand: 32, 64 bit # atomic_for: 32, 64 bit # atomic_fxor: 32, 64 bit # atomic_swap: 32, 64 bit # atomic_cswap: 32, 64 bit # connection: to iface # device priority: 0 # device num paths: 1 # max eps: inf # device address: 0 bytes # iface address: 8 bytes # error handling: ep_check # # # Memory domain: tcp # Component: tcp # register: unlimited, cost: 0 nsec # remote key: 0 bytes # # Transport: tcp # Device: ibs1 # Type: network # System device: <unknown> # # capabilities: # bandwidth: 11142.51/ppn + 0.00 MB/sec # latency: 5206 nsec # overhead: 50000 nsec # put_zcopy: <= 18446744073709551590, up to 6 
iov # put_opt_zcopy_align: <= 1 # put_align_mtu: <= 0 # am_short: <= 8K # am_bcopy: <= 8K # am_zcopy: <= 64K, up to 6 iov # am_opt_zcopy_align: <= 1 # am_align_mtu: <= 0 # am header: <= 8037 # connection: to ep, to iface # device priority: 1 # device num paths: 1 # max eps: 256 # device address: 6 bytes # iface address: 2 bytes # ep address: 10 bytes # error handling: peer failure, ep_check, keepalive # # Transport: tcp # Device: lo # Type: network # System device: <unknown> # # capabilities: # bandwidth: 11.91/ppn + 0.00 MB/sec # latency: 10960 nsec # overhead: 50000 nsec # put_zcopy: <= 18446744073709551590, up to 6 iov # put_opt_zcopy_align: <= 1 # put_align_mtu: <= 0 # am_short: <= 8K # am_bcopy: <= 8K # am_zcopy: <= 64K, up to 6 iov # am_opt_zcopy_align: <= 1 # am_align_mtu: <= 0 # am header: <= 8037 # connection: to ep, to iface # device priority: 1 # device num paths: 1 # max eps: 256 # device address: 18 bytes # iface address: 2 bytes # ep address: 10 bytes # error handling: peer failure, ep_check, keepalive # # Transport: tcp # Device: enp1s0f1 # Type: network # System device: <unknown> # # capabilities: # bandwidth: 1131.64/ppn + 0.00 MB/sec # latency: 5258 nsec # overhead: 50000 nsec # put_zcopy: <= 18446744073709551590, up to 6 iov # put_opt_zcopy_align: <= 1 # put_align_mtu: <= 0 # am_short: <= 8K # am_bcopy: <= 8K # am_zcopy: <= 64K, up to 6 iov # am_opt_zcopy_align: <= 1 # am_align_mtu: <= 0 # am header: <= 8037 # connection: to ep, to iface # device priority: 0 # device num paths: 1 # max eps: 256 # device address: 6 bytes # iface address: 2 bytes # ep address: 10 bytes # error handling: peer failure, ep_check, keepalive # # # Connection manager: tcp # max_conn_priv: 2064 bytes # # Memory domain: sysv # Component: sysv # allocate: unlimited # remote key: 12 bytes # rkey_ptr is supported # # Transport: sysv # Device: memory # Type: intra-node # System device: <unknown> # # capabilities: # bandwidth: 0.00/ppn + 12179.00 MB/sec # latency: 80 nsec # 
overhead: 10 nsec # put_short: <= 4294967295 # put_bcopy: unlimited # get_bcopy: unlimited # am_short: <= 100 # am_bcopy: <= 8256 # domain: cpu # atomic_add: 32, 64 bit # atomic_and: 32, 64 bit # atomic_or: 32, 64 bit # atomic_xor: 32, 64 bit # atomic_fadd: 32, 64 bit # atomic_fand: 32, 64 bit # atomic_for: 32, 64 bit # atomic_fxor: 32, 64 bit # atomic_swap: 32, 64 bit # atomic_cswap: 32, 64 bit # connection: to iface # device priority: 0 # device num paths: 1 # max eps: inf # device address: 8 bytes # iface address: 8 bytes # error handling: ep_check # # # Memory domain: posix # Component: posix # allocate: <= 264124268K # remote key: 24 bytes # rkey_ptr is supported # # Transport: posix # Device: memory # Type: intra-node # System device: <unknown> # # capabilities: # bandwidth: 0.00/ppn + 12179.00 MB/sec # latency: 80 nsec # overhead: 10 nsec # put_short: <= 4294967295 # put_bcopy: unlimited # get_bcopy: unlimited # am_short: <= 100 # am_bcopy: <= 8256 # domain: cpu # atomic_add: 32, 64 bit # atomic_and: 32, 64 bit # atomic_or: 32, 64 bit # atomic_xor: 32, 64 bit # atomic_fadd: 32, 64 bit # atomic_fand: 32, 64 bit # atomic_for: 32, 64 bit # atomic_fxor: 32, 64 bit # atomic_swap: 32, 64 bit # atomic_cswap: 32, 64 bit # connection: to iface # device priority: 0 # device num paths: 1 # max eps: inf # device address: 8 bytes # iface address: 8 bytes # error handling: ep_check # # # Memory domain: cuda_cpy # Component: cuda_cpy # allocate: unlimited # register: unlimited, cost: 0 nsec # # Transport: cuda_copy # Device: cuda # Type: accelerator # System device: <unknown> # # capabilities: # bandwidth: 10000.00/ppn + 0.00 MB/sec # latency: 8000 nsec # overhead: 0 nsec # put_short: <= 4294967295 # put_zcopy: unlimited, up to 1 iov # put_opt_zcopy_align: <= 1 # put_align_mtu: <= 1 # get_short: <= 4294967295 # get_zcopy: unlimited, up to 1 iov # get_opt_zcopy_align: <= 1 # get_align_mtu: <= 1 # connection: to iface # device priority: 0 # device num paths: 1 # max eps: inf 
# device address: 0 bytes # iface address: 8 bytes # error handling: none # # # Memory domain: cuda_ipc # Component: cuda_ipc # register: unlimited, cost: 0 nsec # remote key: 112 bytes # memory invalidation is supported # # Transport: cuda_ipc # Device: cuda # Type: intra-node # System device: <unknown> # # capabilities: # bandwidth: 250000.00/ppn + 0.00 MB/sec # latency: 1 nsec # overhead: 0 nsec # put_zcopy: unlimited, up to 1 iov # put_opt_zcopy_align: <= 1 # put_align_mtu: <= 1 # get_zcopy: unlimited, up to 1 iov # get_opt_zcopy_align: <= 1 # get_align_mtu: <= 1 # connection: to iface # device priority: 0 # device num paths: 1 # max eps: inf # device address: 8 bytes # iface address: 4 bytes # error handling: peer failure, ep_check # # # Memory domain: mlx5_0 # Component: ib # register: unlimited, cost: 180 nsec # remote key: 8 bytes # local memory handle is required for zcopy # # Transport: dc_mlx5 # Device: mlx5_0:1 # Type: network # System device: mlx5_0 (8) # # capabilities: # bandwidth: 11794.23/ppn + 0.00 MB/sec # latency: 660 nsec # overhead: 40 nsec # put_short: <= 172 # put_bcopy: <= 8256 # put_zcopy: <= 1G, up to 11 iov # put_opt_zcopy_align: <= 512 # put_align_mtu: <= 4K # get_bcopy: <= 8256 # get_zcopy: 65..1G, up to 11 iov # get_opt_zcopy_align: <= 512 # get_align_mtu: <= 4K # am_short: <= 186 # am_bcopy: <= 8254 # am_zcopy: <= 8254, up to 3 iov # am_opt_zcopy_align: <= 512 # am_align_mtu: <= 4K # am header: <= 138 # domain: device # atomic_add: 32, 64 bit # atomic_and: 32, 64 bit # atomic_or: 32, 64 bit # atomic_xor: 32, 64 bit # atomic_fadd: 32, 64 bit # atomic_fand: 32, 64 bit # atomic_for: 32, 64 bit # atomic_fxor: 32, 64 bit # atomic_swap: 32, 64 bit # atomic_cswap: 32, 64 bit # connection: to iface # device priority: 30 # device num paths: 1 # max eps: inf # device address: 3 bytes # iface address: 5 bytes # error handling: buffer (zcopy), remote access, peer failure, ep_check # # # Transport: rc_verbs # Device: mlx5_0:1 # Type: network # 
System device: mlx5_0 (8) # # capabilities: # bandwidth: 11794.23/ppn + 0.00 MB/sec # latency: 600 + 1.000 * N nsec # overhead: 75 nsec # put_short: <= 124 # put_bcopy: <= 8256 # put_zcopy: <= 1G, up to 5 iov # put_opt_zcopy_align: <= 512 # put_align_mtu: <= 4K # get_bcopy: <= 8256 # get_zcopy: 65..1G, up to 5 iov # get_opt_zcopy_align: <= 512 # get_align_mtu: <= 4K # am_short: <= 123 # am_bcopy: <= 8255 # am_zcopy: <= 8255, up to 4 iov # am_opt_zcopy_align: <= 512 # am_align_mtu: <= 4K # am header: <= 127 # domain: device # atomic_add: 64 bit # atomic_fadd: 64 bit # atomic_cswap: 64 bit # connection: to ep # device priority: 30 # device num paths: 1 # max eps: 256 # device address: 3 bytes # ep address: 5 bytes # error handling: peer failure, ep_check # # # Transport: rc_mlx5 # Device: mlx5_0:1 # Type: network # System device: mlx5_0 (8) # # capabilities: # bandwidth: 11794.23/ppn + 0.00 MB/sec # latency: 600 + 1.000 * N nsec # overhead: 40 nsec # put_short: <= 220 # put_bcopy: <= 8256 # put_zcopy: <= 1G, up to 14 iov # put_opt_zcopy_align: <= 512 # put_align_mtu: <= 4K # get_bcopy: <= 8256 # get_zcopy: 65..1G, up to 14 iov # get_opt_zcopy_align: <= 512 # get_align_mtu: <= 4K # am_short: <= 234 # am_bcopy: <= 8254 # am_zcopy: <= 8254, up to 3 iov # am_opt_zcopy_align: <= 512 # am_align_mtu: <= 4K # am header: <= 186 # domain: device # atomic_add: 32, 64 bit # atomic_and: 32, 64 bit # atomic_or: 32, 64 bit # atomic_xor: 32, 64 bit # atomic_fadd: 32, 64 bit # atomic_fand: 32, 64 bit # atomic_for: 32, 64 bit # atomic_fxor: 32, 64 bit # atomic_swap: 32, 64 bit # atomic_cswap: 32, 64 bit # connection: to ep # device priority: 30 # device num paths: 1 # max eps: 256 # device address: 3 bytes # ep address: 7 bytes # error handling: buffer (zcopy), remote access, peer failure, ep_check # # # Transport: ud_verbs # Device: mlx5_0:1 # Type: network # System device: mlx5_0 (8) # # capabilities: # bandwidth: 11794.23/ppn + 0.00 MB/sec # latency: 630 nsec # overhead: 105 nsec # 
am_short: <= 116 # am_bcopy: <= 4088 # am_zcopy: <= 4088, up to 5 iov # am_opt_zcopy_align: <= 512 # am_align_mtu: <= 4K # am header: <= 3952 # connection: to ep, to iface # device priority: 30 # device num paths: 1 # max eps: inf # device address: 3 bytes # iface address: 3 bytes # ep address: 6 bytes # error handling: peer failure, ep_check # # # Transport: ud_mlx5 # Device: mlx5_0:1 # Type: network # System device: mlx5_0 (8) # # capabilities: # bandwidth: 11794.23/ppn + 0.00 MB/sec # latency: 630 nsec # overhead: 80 nsec # am_short: <= 180 # am_bcopy: <= 4088 # am_zcopy: <= 4088, up to 3 iov # am_opt_zcopy_align: <= 512 # am_align_mtu: <= 4K # am header: <= 132 # connection: to ep, to iface # device priority: 30 # device num paths: 1 # max eps: inf # device address: 3 bytes # iface address: 3 bytes # ep address: 6 bytes # error handling: peer failure, ep_check # # # Memory domain: mlx5_1 # Component: ib # register: unlimited, cost: 180 nsec # remote key: 8 bytes # local memory handle is required for zcopy # # Transport: dc_mlx5 # Device: mlx5_1:1 # Type: network # System device: mlx5_1 (9) # # capabilities: # bandwidth: 11794.23/ppn + 0.00 MB/sec # latency: 660 nsec # overhead: 40 nsec # put_short: <= 172 # put_bcopy: <= 8256 # put_zcopy: <= 1G, up to 11 iov # put_opt_zcopy_align: <= 512 # put_align_mtu: <= 4K # get_bcopy: <= 8256 # get_zcopy: 65..1G, up to 11 iov # get_opt_zcopy_align: <= 512 # get_align_mtu: <= 4K # am_short: <= 186 # am_bcopy: <= 8254 # am_zcopy: <= 8254, up to 3 iov # am_opt_zcopy_align: <= 512 # am_align_mtu: <= 4K # am header: <= 138 # domain: device # atomic_add: 32, 64 bit # atomic_and: 32, 64 bit # atomic_or: 32, 64 bit # atomic_xor: 32, 64 bit # atomic_fadd: 32, 64 bit # atomic_fand: 32, 64 bit # atomic_for: 32, 64 bit # atomic_fxor: 32, 64 bit # atomic_swap: 32, 64 bit # atomic_cswap: 32, 64 bit # connection: to iface # device priority: 30 # device num paths: 1 # max eps: inf # device address: 3 bytes # iface address: 5 bytes # error 
handling: buffer (zcopy), remote access, peer failure, ep_check # # # Transport: rc_verbs # Device: mlx5_1:1 # Type: network # System device: mlx5_1 (9) # # capabilities: # bandwidth: 11794.23/ppn + 0.00 MB/sec # latency: 600 + 1.000 * N nsec # overhead: 75 nsec # put_short: <= 124 # put_bcopy: <= 8256 # put_zcopy: <= 1G, up to 5 iov # put_opt_zcopy_align: <= 512 # put_align_mtu: <= 4K # get_bcopy: <= 8256 # get_zcopy: 65..1G, up to 5 iov # get_opt_zcopy_align: <= 512 # get_align_mtu: <= 4K # am_short: <= 123 # am_bcopy: <= 8255 # am_zcopy: <= 8255, up to 4 iov # am_opt_zcopy_align: <= 512 # am_align_mtu: <= 4K # am header: <= 127 # domain: device # atomic_add: 64 bit # atomic_fadd: 64 bit # atomic_cswap: 64 bit # connection: to ep # device priority: 30 # device num paths: 1 # max eps: 256 # device address: 3 bytes # ep address: 5 bytes # error handling: peer failure, ep_check # # # Transport: rc_mlx5 # Device: mlx5_1:1 # Type: network # System device: mlx5_1 (9) # # capabilities: # bandwidth: 11794.23/ppn + 0.00 MB/sec # latency: 600 + 1.000 * N nsec # overhead: 40 nsec # put_short: <= 220 # put_bcopy: <= 8256 # put_zcopy: <= 1G, up to 14 iov # put_opt_zcopy_align: <= 512 # put_align_mtu: <= 4K # get_bcopy: <= 8256 # get_zcopy: 65..1G, up to 14 iov # get_opt_zcopy_align: <= 512 # get_align_mtu: <= 4K # am_short: <= 234 # am_bcopy: <= 8254 # am_zcopy: <= 8254, up to 3 iov # am_opt_zcopy_align: <= 512 # am_align_mtu: <= 4K # am header: <= 186 # domain: device # atomic_add: 32, 64 bit # atomic_and: 32, 64 bit # atomic_or: 32, 64 bit # atomic_xor: 32, 64 bit # atomic_fadd: 32, 64 bit # atomic_fand: 32, 64 bit # atomic_for: 32, 64 bit # atomic_fxor: 32, 64 bit # atomic_swap: 32, 64 bit # atomic_cswap: 32, 64 bit # connection: to ep # device priority: 30 # device num paths: 1 # max eps: 256 # device address: 3 bytes # ep address: 7 bytes # error handling: buffer (zcopy), remote access, peer failure, ep_check # # # Transport: ud_verbs # Device: mlx5_1:1 # Type: network # 
System device: mlx5_1 (9) # # capabilities: # bandwidth: 11794.23/ppn + 0.00 MB/sec # latency: 630 nsec # overhead: 105 nsec # am_short: <= 116 # am_bcopy: <= 4088 # am_zcopy: <= 4088, up to 5 iov # am_opt_zcopy_align: <= 512 # am_align_mtu: <= 4K # am header: <= 3952 # connection: to ep, to iface # device priority: 30 # device num paths: 1 # max eps: inf # device address: 3 bytes # iface address: 3 bytes # ep address: 6 bytes # error handling: peer failure, ep_check # # # Transport: ud_mlx5 # Device: mlx5_1:1 # Type: network # System device: mlx5_1 (9) # # capabilities: # bandwidth: 11794.23/ppn + 0.00 MB/sec # latency: 630 nsec # overhead: 80 nsec # am_short: <= 180 # am_bcopy: <= 4088 # am_zcopy: <= 4088, up to 3 iov # am_opt_zcopy_align: <= 512 # am_align_mtu: <= 4K # am header: <= 132 # connection: to ep, to iface # device priority: 30 # device num paths: 1 # max eps: inf # device address: 3 bytes # iface address: 3 bytes # ep address: 6 bytes # error handling: peer failure, ep_check # # # Memory domain: mlx5_2 # Component: ib # register: unlimited, cost: 180 nsec # remote key: 8 bytes # local memory handle is required for zcopy # # Transport: dc_mlx5 # Device: mlx5_2:1 # Type: network # System device: mlx5_2 (10) # # capabilities: # bandwidth: 11794.23/ppn + 0.00 MB/sec # latency: 660 nsec # overhead: 40 nsec # put_short: <= 172 # put_bcopy: <= 8256 # put_zcopy: <= 1G, up to 11 iov # put_opt_zcopy_align: <= 512 # put_align_mtu: <= 4K # get_bcopy: <= 8256 # get_zcopy: 65..1G, up to 11 iov # get_opt_zcopy_align: <= 512 # get_align_mtu: <= 4K # am_short: <= 186 # am_bcopy: <= 8254 # am_zcopy: <= 8254, up to 3 iov # am_opt_zcopy_align: <= 512 # am_align_mtu: <= 4K # am header: <= 138 # domain: device # atomic_add: 32, 64 bit # atomic_and: 32, 64 bit # atomic_or: 32, 64 bit # atomic_xor: 32, 64 bit # atomic_fadd: 32, 64 bit # atomic_fand: 32, 64 bit # atomic_for: 32, 64 bit # atomic_fxor: 32, 64 bit # atomic_swap: 32, 64 bit # atomic_cswap: 32, 64 bit # 
connection: to iface # device priority: 30 # device num paths: 1 # max eps: inf # device address: 3 bytes # iface address: 5 bytes # error handling: buffer (zcopy), remote access, peer failure, ep_check # # # Transport: rc_verbs # Device: mlx5_2:1 # Type: network # System device: mlx5_2 (10) # # capabilities: # bandwidth: 11794.23/ppn + 0.00 MB/sec # latency: 600 + 1.000 * N nsec # overhead: 75 nsec # put_short: <= 124 # put_bcopy: <= 8256 # put_zcopy: <= 1G, up to 5 iov # put_opt_zcopy_align: <= 512 # put_align_mtu: <= 4K # get_bcopy: <= 8256 # get_zcopy: 65..1G, up to 5 iov # get_opt_zcopy_align: <= 512 # get_align_mtu: <= 4K # am_short: <= 123 # am_bcopy: <= 8255 # am_zcopy: <= 8255, up to 4 iov # am_opt_zcopy_align: <= 512 # am_align_mtu: <= 4K # am header: <= 127 # domain: device # atomic_add: 64 bit # atomic_fadd: 64 bit # atomic_cswap: 64 bit # connection: to ep # device priority: 30 # device num paths: 1 # max eps: 256 # device address: 3 bytes # ep address: 5 bytes # error handling: peer failure, ep_check # # # Transport: rc_mlx5 # Device: mlx5_2:1 # Type: network # System device: mlx5_2 (10) # # capabilities: # bandwidth: 11794.23/ppn + 0.00 MB/sec # latency: 600 + 1.000 * N nsec # overhead: 40 nsec # put_short: <= 220 # put_bcopy: <= 8256 # put_zcopy: <= 1G, up to 14 iov # put_opt_zcopy_align: <= 512 # put_align_mtu: <= 4K # get_bcopy: <= 8256 # get_zcopy: 65..1G, up to 14 iov # get_opt_zcopy_align: <= 512 # get_align_mtu: <= 4K # am_short: <= 234 # am_bcopy: <= 8254 # am_zcopy: <= 8254, up to 3 iov # am_opt_zcopy_align: <= 512 # am_align_mtu: <= 4K # am header: <= 186 # domain: device # atomic_add: 32, 64 bit # atomic_and: 32, 64 bit # atomic_or: 32, 64 bit # atomic_xor: 32, 64 bit # atomic_fadd: 32, 64 bit # atomic_fand: 32, 64 bit # atomic_for: 32, 64 bit # atomic_fxor: 32, 64 bit # atomic_swap: 32, 64 bit # atomic_cswap: 32, 64 bit # connection: to ep # device priority: 30 # device num paths: 1 # max eps: 256 # device address: 3 bytes # ep address: 7 
bytes # error handling: buffer (zcopy), remote access, peer failure, ep_check # # # Transport: ud_verbs # Device: mlx5_2:1 # Type: network # System device: mlx5_2 (10) # # capabilities: # bandwidth: 11794.23/ppn + 0.00 MB/sec # latency: 630 nsec # overhead: 105 nsec # am_short: <= 116 # am_bcopy: <= 4088 # am_zcopy: <= 4088, up to 5 iov # am_opt_zcopy_align: <= 512 # am_align_mtu: <= 4K # am header: <= 3952 # connection: to ep, to iface # device priority: 30 # device num paths: 1 # max eps: inf # device address: 3 bytes # iface address: 3 bytes # ep address: 6 bytes # error handling: peer failure, ep_check # # # Transport: ud_mlx5 # Device: mlx5_2:1 # Type: network # System device: mlx5_2 (10) # # capabilities: # bandwidth: 11794.23/ppn + 0.00 MB/sec # latency: 630 nsec # overhead: 80 nsec # am_short: <= 180 # am_bcopy: <= 4088 # am_zcopy: <= 4088, up to 3 iov # am_opt_zcopy_align: <= 512 # am_align_mtu: <= 4K # am header: <= 132 # connection: to ep, to iface # device priority: 30 # device num paths: 1 # max eps: inf # device address: 3 bytes # iface address: 3 bytes # ep address: 6 bytes # error handling: peer failure, ep_check # # # Memory domain: mlx5_3 # Component: ib # register: unlimited, cost: 180 nsec # remote key: 8 bytes # local memory handle is required for zcopy # < no supported devices found > # # Connection manager: rdmacm # max_conn_priv: 54 bytes # # Memory domain: cma # Component: cma # register: unlimited, cost: 9 nsec # # Transport: cma # Device: memory # Type: intra-node # System device: <unknown> # # capabilities: # bandwidth: 0.00/ppn + 11145.00 MB/sec # latency: 80 nsec # overhead: 2000 nsec # put_zcopy: unlimited, up to 16 iov # put_opt_zcopy_align: <= 1 # put_align_mtu: <= 1 # get_zcopy: unlimited, up to 16 iov # get_opt_zcopy_align: <= 1 # get_align_mtu: <= 1 # connection: to iface # device priority: 0 # device num paths: 1 # max eps: inf # device address: 8 bytes # iface address: 4 bytes # error handling: peer failure, ep_check # ```