# hlajungo - HiPac 4 Release Note and information

## Quantum

佔 25 分。
lab1 5 題,每題 1 分。
lab2-1 10 分, 2-2 10分。

### lab1

[題目和題解](https://hackmd.io/@hlajungo/r1m0tDQdle)

### lab 2

lab2-1 在比跑最大 qubit grover。
最好的能用 31 qubit, 我只用 30 qubit。

lab2-2 在比跑最快 24 qubit grover。
cudaq 1 node 1 process 1 GPU 8.38 秒能完成。
評審賽中要求 shot=1024 以上,並且不能用抽獎法。
評審賽後說 qiskit 大概要 10-20 秒,cudaq 0-10 秒。

[安裝腳本與代碼](https://hackmd.io/@hlajungo/ryG4JvMugg)

### new lab2-2 MPI code

比賽時 lab2-2 MPI 失敗了,但現在知道原因了。
這個版本是比賽後寫的,能成功跑起來。

:::spoiler code
```cpp
#include <chrono>
#include <cmath>
#include <cudaq.h>
#include <functional> // for std::reference_wrapper
#include <iomanip>    // for std::setw
#include <iostream>
#include <map>        // FIX: was missing; std::map is used in main_mpi
#include <numeric>    // FIX: was missing; std::accumulate is used in main_mpi
#include <string>
#include <vector>     // FIX: was missing; std::vector is used throughout

#include "log.hpp"

// Oracle: flip the phase of the single marked basis state and leave
// every other amplitude untouched.
__qpu__ void
oracle (auto& q, const std::string& marked_state)
{
  // Step 1: X the qubits whose marked bit is '0', mapping the marked
  // state onto "11...1".
  for (std::size_t i = 0; i < marked_state.size (); ++i)
    {
      if (marked_state[i] == '0')
        {
          x (q[i]);
        }
    }

  // Step 2: multi-controlled Z built as H-(MCX)-H on the last qubit
  // (HXH = Z); this negates the amplitude of "11...1" only.
  h (q[q.size () - 1]);
  // reference_wrapper lets us build a runtime-sized control list.
  std::vector<std::reference_wrapper<cudaq::qubit> > controls;
  for (std::size_t i = 0; i < q.size () - 1; ++i)
    {
      controls.emplace_back (std::ref (q[i]));
    }
  x (controls, q[q.size () - 1]); // multi-controlled X gate
  h (q[q.size () - 1]);

  // Step 3: undo the bit flips from Step 1.
  for (std::size_t i = 0; i < marked_state.size (); ++i)
    {
      if (marked_state[i] == '0')
        {
          x (q[i]);
        }
    }
  // Net effect: the marked state w is multiplied by -1, everything
  // else is unchanged.
}

// Diffusion operator: reflection about the average amplitude.
__qpu__ void
diffusion (auto& q)
{
  // Step 1: rotate the uniform superposition onto "00...0".
  h (q);
  // Step 2: map "00...0" onto "11...1".
  x (q);
  // Step 3: multi-controlled Z via HXH = Z; negates "11...1".
  h (q[q.size () - 1]);
  std::vector<std::reference_wrapper<cudaq::qubit> > controls;
  for (std::size_t i = 0; i < q.size () - 1; ++i)
    {
      controls.emplace_back (std::ref (q[i]));
    }
  x (controls, q[q.size () - 1]);
  h (q[q.size () - 1]);
  // Step 4: undo Step 2.
  x (q);
  // Step 5: undo Step 1, back to the original basis.
  h (q);
}

/*
  s = target state "1111"
  w = uniform superposition state
  oracle    -> phase-flip on the target
  diffusion -> reflection about the average
*/
__qpu__ void
grover_kernel (std::size_t n, const std::string& marked,
               std::size_t iterations)
{
  cudaq::qvector q (n);

  // Initialize the uniform superposition.
  for (auto& qbit : q)
    h (qbit);

  // Grover iterations.
  for (std::size_t i = 0; i < iterations; ++i)
    {
      oracle (q, marked);
      diffusion (q);
    }

  // Measure all qubits.
  mz (q);
}

// Single-process benchmark loop (kept for reference; not called by main).
int
main_my ()
{
  for (std::size_t n = 2; n <= 2; ++n)
    {
      std::string marked (n, '1');
      std::size_t opt_it
          = std::floor ((M_PI / 4.0) * std::sqrt (std::pow (2, n)));
      // NOTE(review): opt_it * 0.61 truncates to 0 for small n (e.g.
      // n == 2), so no Grover iteration runs and sampling stays
      // uniform -- confirm the 0.61 factor is intended.
      std::size_t it = opt_it * 0.61;

      auto start = std::chrono::high_resolution_clock::now ();
      auto result = cudaq::sample (grover_kernel, n, marked, it);
      auto end = std::chrono::high_resolution_clock::now ();
      std::chrono::duration<double> elapsed = end - start;

      std::cout << std::setprecision (3) << n << std::setw (10)
                << result.probability (result.most_probable ())
                << std::setw (10) << elapsed.count () << std::setw (10)
                << "\n";
    }
  return 0;
}

// MPI version: every rank runs shots_per_rank shots; the counts for
// each bitstring are combined with all_gather and rank 0 prints the
// per-bitstring totals.
int
main_mpi ()
{
  cudaq::mpi::initialize ();
  int rank = cudaq::mpi::rank ();
  int size = cudaq::mpi::num_ranks ();

  std::size_t total_shots = 1024;
  // NOTE(review): the remainder of total_shots / size is dropped when
  // size does not divide 1024 evenly.
  std::size_t shots_per_rank = total_shots / size;

  if (rank == 0)
    {
      std::cout << "qubits target amp run time(s)\n";
    }

  for (std::size_t n = 2; n <= 2; ++n)
    {
      std::string marked (n, '1');
      std::size_t opt_it
          = std::floor ((M_PI / 4.0) * std::sqrt (std::pow (2, n)));
      std::size_t it = opt_it * 0.61;

      auto start = std::chrono::high_resolution_clock::now ();
      // result behaves like a map, e.g. result["110"] == 255.
      auto result
          = cudaq::sample (shots_per_rank, grover_kernel, n, marked, it);
      auto end = std::chrono::high_resolution_clock::now ();
      std::chrono::duration<double> elapsed = end - start;

      // Gather the per-bitstring counts from every rank.
      // NOTE(review): this assumes all ranks iterate over the same
      // bitstrings in the same order; a rank missing a key would
      // misalign the collective calls -- verify for larger n.
      std::map<std::string, std::vector<int> > global_counts_all;
      for (auto& [bits, count] : result)
        {
          LOG (bits);
          std::vector<int> gathered (size); // one slot per rank
          std::vector<int> local = { static_cast<int> (count) };
          // Every rank contributes its local count for this bitstring.
          cudaq::mpi::all_gather (gathered, local);
          if (rank == 0)
            {
              global_counts_all[bits] = gathered;
            }
        }

      if (rank == 0)
        {
          for (auto& [bits, count_1d] : global_counts_all)
            {
              int total
                  = std::accumulate (count_1d.begin (), count_1d.end (), 0);
              std::cout << bits << ':' << total << "\n";
            }
        }

      // std::cout << std::setprecision (3) << n << std::setw (10)
      //<< result.probability (result.most_probable ()) << std::setw (10)
      //<< elapsed.count () << std::setw (10) << "\n";
      (void) elapsed; // timing print is currently disabled above
    }

  cudaq::mpi::finalize ();
  return 0;
}

int
main ()
{
  // FIX: propagate main_mpi's exit status instead of discarding it.
  return main_mpi ();
}
```
:::

運行

```
mpirun -np 1 ./grover_exec
```

輸出

```
hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices
(warning: it would break the library ABI, don't enable unless really needed).
qubits target amp run time(s)
LOG [std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>] bits = [00] At main_mpi ,line 00162
LOG [std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>] bits = [10] At main_mpi ,line 00162
LOG [std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>] bits = [01] At main_mpi ,line 00162
LOG [std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>] bits = [11] At main_mpi ,line 00162
00:255
01:275
10:240
11:254
```

## presentation

[url](https://www.canva.com/design/DAGvaQAqT2c/fUnR02Bd7_1oQkEdIN-2SQ/edit?utm_content=DAGvaQAqT2c&utm_campaign=designshare&utm_medium=link2&utm_source=sharebutton)

簡報中 `2^31*16/1024/1024/1024 = 32 GB = V100 RAM` 是錯的。
一個 complex fp32 振幅是 8 bytes(4 bytes 實部 + 4 bytes 虛部),不是 16 bytes,因此 2^31 個振幅只需要 16 GB,所以可以跑 31 qubit 而非 30 qubit。

## reflect

1. 對 slurm 不太熟悉,在本地撰寫一些測試案例,搬到比賽現場跑跑看,確認行為正常。
2. mpirun 指令最佳化又長又麻煩,以後寫一個生成 mpirun 的腳本。 像是

```
mpirun --hostfile hostfile.txt --rankfile rankfile.txt \
--mca pml ucx \
--mca btl ^vader,tcp,smcuda,self \
-n 1 -x CUDA_VISIBLE_DEVICES=0 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_2:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=1 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_2:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=2 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_2:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=3 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_2:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=4 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_3:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=5 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_3:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" ./grover_exec : \
-n 1 -x CUDA_VISIBLE_DEVICES=6 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_3:0 -x
UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" ./grover_exec : \ -n 1 -x CUDA_VISIBLE_DEVICES=7 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_3:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" ./grover_exec : \ -n 1 -x CUDA_VISIBLE_DEVICES=0 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_2:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" -x LD_LIBRARY_PATH ./grover_exec : \ -n 1 -x CUDA_VISIBLE_DEVICES=1 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_2:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" -x LD_LIBRARY_PATH ./grover_exec : \ -n 1 -x CUDA_VISIBLE_DEVICES=2 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_2:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" -x LD_LIBRARY_PATH ./grover_exec : \ -n 1 -x CUDA_VISIBLE_DEVICES=3 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_2:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" -x LD_LIBRARY_PATH ./grover_exec : \ -n 1 -x CUDA_VISIBLE_DEVICES=4 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_3:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" -x LD_LIBRARY_PATH ./grover_exec : \ -n 1 -x CUDA_VISIBLE_DEVICES=5 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_3:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" -x LD_LIBRARY_PATH ./grover_exec : \ -n 1 -x CUDA_VISIBLE_DEVICES=6 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_3:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" -x LD_LIBRARY_PATH ./grover_exec : \ -n 1 -x CUDA_VISIBLE_DEVICES=7 -x UCX_NET_DEVICES=mlx5_0:1,mlx5_3:0 -x UCX_TLS="ud,rc,sm,cuda_copy,gdr_copy,cuda_ipc" -x LD_LIBRARY_PATH ./grover_exec >> $OUT_FILE 2>&1 ``` 3. cudaq 的 MPI 當時有錯誤,需要更熟悉 profiler,用 MPI 的想法思考。 它需要不同 process 負責一部分 shot 最後用 all_gather 收集起來。 4. 如果要做優化,先拿最基本的版本用在小型測試上, 像是 1 node 1 process 1 cpu/gpu, 把這當作基準點。得到進行優化行為的新版本後,拿去和基準點比,就能知道加速比。