# hlajungo - HiPac 4 Release Notes and Information
## Quantum
The quantum part is worth 25 points.
Lab 1 has 5 problems, 1 point each.
Lab 2-1 is worth 10 points, lab 2-2 another 10.
### lab1
[Problems and solutions](https://hackmd.io/@hlajungo/r1m0tDQdle)
### lab 2
Lab 2-1 is a contest for the largest-qubit Grover run.
The best entry reached 31 qubits; I only reached 30.
Lab 2-2 is a contest for the fastest 24-qubit Grover run.
With cudaq, 1 node / 1 process / 1 GPU finishes it in 8.38 seconds.
Judging required at least 1024 shots, and the "lottery" method was not allowed.
After judging, the judges said qiskit takes roughly 10-20 seconds and cudaq 0-10 seconds.
[Install scripts and code](https://hackmd.io/@hlajungo/ryG4JvMugg)
### new lab2-2 MPI code
The lab 2-2 MPI run failed during the competition, but I now know the cause: each rank has to sample its own share of the shots, with the counts combined via all_gather afterwards (see reflection 3 below).
This version was written after the competition and runs successfully.
:::spoiler code
```cpp
#include <chrono>
#include <cmath>
#include <cudaq.h>
#include <functional> // for std::reference_wrapper
#include <iomanip>    // for std::setw
#include <iostream>
#include <map>        // for std::map
#include <numeric>    // for std::accumulate
#include <string>
#include <vector>
#include "log.hpp"
// Apply the oracle: flip phase on the marked state
__qpu__ void
oracle (auto& q, const std::string& marked_state)
{
// Step 1: use marked_state to flip the selected qubits
for (int i = 0; i < marked_state.size (); ++i)
{
if (marked_state[i] == '0')
{
x (q[i]);
}
}
// Step 2: apply HXH = Z to flip the sign of "11...1"
h (q[q.size () - 1]);
// build the control list with reference_wrapper
std::vector<std::reference_wrapper<cudaq::qubit> > controls;
for (std::size_t i = 0; i < q.size () - 1; ++i)
{
controls.emplace_back (std::ref (q[i]));
}
x (controls, q[q.size () - 1]); // multi-controlled X gate
h (q[q.size () - 1]);
// Step 3: undo the flips from Step 1
for (int i = 0; i < marked_state.size (); ++i)
{
if (marked_state[i] == '0')
{
x (q[i]);
}
}
// At this point the marked state w has been multiplied by -1; everything else is unchanged.
}
// Diffusion operator: reflection about average
__qpu__ void
diffusion (auto& q)
{
// Step 1: map the uniform superposition to "00...0"
h (q);
// Step 2: map "00...0" to "11...1"
x (q);
// Step 3: apply HXH = Z to flip the sign of "11...1"
h (q[q.size () - 1]);
std::vector<std::reference_wrapper<cudaq::qubit> > controls;
for (std::size_t i = 0; i < q.size () - 1; ++i)
{
controls.emplace_back (std::ref (q[i]));
}
x (controls, q[q.size () - 1]);
h (q[q.size () - 1]);
// Step 4: undo, back to "00...0"
x (q);
// Step 5: undo, back to the previous basis
h (q);
}
/*
w = marked target state, e.g. "1111"
s = uniform superposition state
oracle    -> reflection that flips the sign of w
diffusion -> reflection about s (inversion about the mean)
*/
__qpu__ void
grover_kernel (std::size_t n, const std::string& marked, std::size_t iterations)
{
cudaq::qvector q (n);
// initialize to the uniform superposition
for (auto& qbit : q)
h (qbit);
// Grover iterations
for (std::size_t i = 0; i < iterations; ++i)
{
oracle (q, marked);
diffusion (q);
}
// measure
mz (q);
}
int
main_my ()
{
for (std::size_t n = 2; n <= 2; ++n)
{
std::string marked (n, '1');
std::size_t opt_it
= std::floor ((M_PI / 4.0) * std::sqrt (std::pow (2, n)));
std::size_t it = opt_it * 0.61;
auto start = std::chrono::high_resolution_clock::now ();
auto result = cudaq::sample (grover_kernel, n, marked, it);
auto end = std::chrono::high_resolution_clock::now ();
std::chrono::duration<double> elapsed = end - start;
std::cout << std::setprecision (3) << n << std::setw (10)
<< result.probability (result.most_probable ()) << std::setw (10)
<< elapsed.count () << std::setw (10) << "\n";
}
return 0;
}
int
main_mpi ()
{
cudaq::mpi::initialize ();
int rank = cudaq::mpi::rank ();
int size = cudaq::mpi::num_ranks ();
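// each rank samples total_shots / size shots; per-rank counts are merged below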
std::size_t total_shots = 1024;
std::size_t shots_per_rank = total_shots / size;
if (cudaq::mpi::rank () == 0)
{
std::cout << "qubits target amp run time(s)\n";
}
for (std::size_t n = 2; n <= 2; ++n)
{
std::string marked (n, '1');
std::size_t opt_it
= std::floor ((M_PI / 4.0) * std::sqrt (std::pow (2, n)));
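// scale down to ~61% of the optimal iteration count;
// note that for n = 2 this truncates to 0, so no Grover iteration actually runs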
std::size_t it = opt_it * 0.61;
auto start = std::chrono::high_resolution_clock::now ();
auto result = cudaq::sample (shots_per_rank, grover_kernel, n, marked, it);
// result acts like a map from bit string to count, e.g. result["110"] = 255;
auto end = std::chrono::high_resolution_clock::now ();
std::chrono::duration<double> elapsed = end - start;
// gather data from ranks
std::map<std::string, std::vector<int> > global_counts_all;
for (auto& [bits, count] : result)
{
LOG(bits);
std::vector<int> gathered (size);
// stage the local count for this bit string
std::vector<int> local = { static_cast<int> (count) };
// every rank gathers the count for this bit string
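// (assumes every rank observed the same set of bit strings, so the collective matches)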
cudaq::mpi::all_gather (gathered, local);
if (rank == 0)
{
global_counts_all[bits] = gathered;
}
}
if (rank == 0)
{
for (auto& [bits, count_1d] : global_counts_all)
{
int total = std::accumulate (count_1d.begin (), count_1d.end (), 0);
std::cout << bits << ':' << total << "\n";
}
}
// std::cout << std::setprecision (3) << n << std::setw (10)
//<< result.probability (result.most_probable ()) << std::setw (10)
//<< elapsed.count () << std::setw (10) << "\n";
}
cudaq::mpi::finalize ();
return 0;
}
int
main ()
{
main_mpi ();
return 0;
}
```
:::
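For reference, the iteration count used in the kernel is the standard Grover optimum scaled by 0.61; worked out for the contest's 24-qubit case:

$$
k_{\mathrm{opt}} = \left\lfloor \frac{\pi}{4}\sqrt{2^{n}} \right\rfloor,
\qquad n = 24:\quad \frac{\pi}{4}\cdot\sqrt{2^{24}} = \frac{\pi}{4}\cdot 4096 \approx 3216.99
\;\Rightarrow\; k_{\mathrm{opt}} = 3216,\qquad \lfloor 0.61\cdot 3216\rfloor = 1961.
$$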
Run:
```
mpirun -np 1 ./grover_exec
```
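Each rank takes `total_shots / size` shots, so with 4 processes each rank samples 256 of the 1024 shots:
```
mpirun -np 4 ./grover_exec
```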
Output. The four counts sum to 1024 = total_shots, and the distribution is nearly uniform because for n = 2 the iteration count truncates to 0 (see the comment in `main_mpi`):
```
hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices
(warning: it would break the library ABI, don't enable unless really needed).
qubits target amp run time(s)
LOG [std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>] bits = [00] At main_mpi ,line 00162
LOG [std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>] bits = [10] At main_mpi ,line 00162
LOG [std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>] bits = [01] At main_mpi ,line 00162
LOG [std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>] bits = [11] At main_mpi ,line 00162
00:255
01:275
10:240
11:254
```
## presentation
[Slides](https://www.canva.com/design/DAGvaQAqT2c/fUnR02Bd7_1oQkEdIN-2SQ/edit?utm_content=DAGvaQAqT2c&utm_campaign=designshare&utm_medium=link2&utm_source=sharebutton)
The slide's claim that `2^31*16/1024/1024/1024 = 32 GB = V100 RAM` is wrong.
An fp32 amplitude is 8 bytes (4 for the real part, 4 for the imaginary part), not 16, so 2^31 amplitudes need only 16 GB and 31 qubits fit on a 32 GB V100, not just 30.
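A quick standalone check of the corrected arithmetic (a minimal sketch, not part of the contest code):
```cpp
#include <complex>
#include <cstddef>
#include <iostream>

int
main ()
{
  // an fp32 state-vector amplitude is a std::complex<float>: 8 bytes
  for (std::size_t n = 30; n <= 32; ++n)
    {
      double gib = static_cast<double> (1ULL << n)
                   * sizeof (std::complex<float>) / (1024.0 * 1024.0 * 1024.0);
      std::cout << n << " qubits -> " << gib << " GiB\n";
    }
  return 0;
}
// prints: 30 qubits -> 8 GiB, 31 -> 16 GiB (fits V100's 32 GB), 32 -> 32 GiB
```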
## reflect
1. I was not familiar enough with Slurm. Next time, write some test cases locally, then run them at the competition site to confirm everything behaves as expected.
2. An optimized mpirun command is long and tedious to write; next time, write a script that generates it (a sketch follows the command below). For example:
```
mpirun --hostfile hostfile.txt --rankfile rankfile.txt \
--mca pml ucx \
--mca btl ^vader,tcp,smcuda,self \
-n 1 -x CUDA_VISIBLE_DEVICES=0 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_2:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=1 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_2:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=2 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_2:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=3 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_2:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=4 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_3:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=5 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_3:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=6 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_3:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=7 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_3:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=0 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_2:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" -x LD_LIBRARY_PATH ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=1 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_2:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" -x LD_LIBRARY_PATH ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=2 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_2:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" -x LD_LIBRARY_PATH ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=3 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_2:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" -x LD_LIBRARY_PATH ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=4 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_3:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" -x LD_LIBRARY_PATH ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=5 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_3:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" -x LD_LIBRARY_PATH ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=6 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_3:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" -x LD_LIBRARY_PATH ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=7 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_3:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" -x LD_LIBRARY_PATH ./grover_exec >> $OUT_FILE 2>&1
```
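A minimal sketch of such a generator, kept in C++ to match the rest of this note. It reproduces the first node's eight ranks above; the GPU-to-NIC mapping (mlx5_2 for GPUs 0-3, mlx5_3 for GPUs 4-7) is copied from that command and is specific to that node, and the second node's block with `LD_LIBRARY_PATH` would be emitted the same way:
```cpp
#include <iostream>
#include <string>

int
main ()
{
  const std::string tls = "ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc";
  std::cout << "mpirun --hostfile hostfile.txt --rankfile rankfile.txt \\\n"
            << "--mca pml ucx \\\n"
            << "--mca btl ^vader,tcp,smcuda,self \\\n";
  for (int gpu = 0; gpu < 8; ++gpu)
    {
      // GPUs 0-3 sit next to mlx5_2, GPUs 4-7 next to mlx5_3 on this node
      std::string nic = (gpu < 4) ? "mlx5_2:0" : "mlx5_3:0";
      std::cout << "-n 1 -x CUDA_VISIBLE_DEVICES=" << gpu
                << " -x UCX_NET_DEVICES=mlx5_0:1," << nic << " -x UCX_TLS=\""
                << tls << "\" ./grover_exec" << (gpu == 7 ? "\n" : " : \\\n");
    }
  return 0;
}
```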
3. The cudaq MPI run was broken at the time; I need to get more familiar with profilers and to think in MPI terms: each process handles its own share of the shots, and all_gather collects the counts at the end.
4. When optimizing, first run the most basic version on a small test case, e.g. 1 node / 1 process / 1 CPU/GPU, and treat it as the baseline. Each optimized version can then be compared against the baseline to get the speedup.