PYNQ-Z2 加速器

1. Vivado HLS file (AMD) https://docs.amd.com/r/zh-CN/ug1399-vitis-hls/%E5%88%9B%E5%BB%BA-HLS-%E7%BB%84%E4%BB%B6
2. Get license to open Vitis HLS (2024) https://blog.csdn.net/weixin_74013425/article/details/142458347
3. Overlay Tutorial (AMD) https://pynq.readthedocs.io/en/v2.5.1/overlay_design_methodology/overlay_tutorial.html
4. Overlay Tutorial in Chinese (1) https://blog.csdn.net/CSD_N_csdn/article/details/105669069
5. Overlay Tutorial in Chinese (2) https://blog.csdn.net/CSD_N_csdn/article/details/105691939
6. Overlay Tutorial in Chinese (Filter) https://blog.csdn.net/CSD_N_csdn/article/details/105894172?sharetype=blogdetail&shareId=105894172&sharerefer=APP&sharesource=Azhuwizard&sharefrom=link

Network scan:

```
sudo apt install nmap
nmap -sn 192.168.3.0/24
```

adder.cpp #HLS

```
void adder(int a, int b, int& c) {
#pragma HLS INTERFACE ap_ctrl_none port=return
#pragma HLS INTERFACE s_axilite port=a
#pragma HLS INTERFACE s_axilite port=b
#pragma HLS INTERFACE s_axilite port=c
    c = a + b;
}
```

adder.tcl #PYNQ-Z2

```
# Open a new project
open_project my_adder_project

# Add the design source file
add_files adder.cpp

# Create and open a solution with the specified target device
open_solution -reset solution1

# Replace with your FPGA part number
set_part xc7z020clg400-1

# Set the top-level function
set_top adder

# Run synthesis
csynth_design

# Export the design as an IP
export_design -format ip_catalog -rtl verilog

# Print completion message
puts "IP core has been successfully generated and stored in the 'impl/ip' directory."
```

```
vitis_hls -f adder.tcl
```

/home/esslab/Pynq_adder/my_adder_project/solution1/impl/misc/drivers

/home/esslab/Pynq_adder/my_adder_project/solution1/impl/verilog

.bit .hw .tcl

```
import time
from pynq import Overlay, DefaultIP

# 自定義驅動程式
class AddDriver(DefaultIP):
    def __init__(self, description):
        super().__init__(description=description)

    bindto = ['xilinx.com:hls:add:1.0']  # 與 HLS 定義的名稱匹配

    def add(self, a, b):
        self.write(0x10, a)
        self.write(0x18, b)
        return self.read(0x20)

# 加載比特流
overlay = Overlay('adder.bit')

# 從 ip_dict 中獲取描述資訊
adder_description = overlay.ip_dict['adder_0']  # 'adder_0' 為 IP 名稱

# 手動將 adder_0 綁定到 AddDriver
overlay.adder_0 = AddDriver(adder_description)

# 測試數據
a, b = 123, 456

# 計算使用 IP 的運算時間
start_time_ip = time.time()
ip_result = overlay.adder_0.add(a, b)
end_time_ip = time.time()
time_ip = end_time_ip - start_time_ip
print(f"IP Result: {ip_result}, Time taken with IP: {time_ip:.6f} seconds")

# 計算使用純 Python 的運算時間
start_time_python = time.time()
python_result = a + b
end_time_python = time.time()
time_python = end_time_python - start_time_python
print(f"Python Result: {python_result}, Time taken with Python: {time_python:.6f} seconds")

# 驗證結果一致性
if ip_result == python_result:
    print("Results are consistent.")
else:
    print("Results are inconsistent.")
```

file

```
sudo chmod 777 /media/esslab/root/home/xilinx/jupyter_notebooks/matrix
```

3*3矩陣乘法版本

```
#include <ap_int.h>
#include <hls_stream.h>

#define MATRIX_SIZE 3

void matrix(
    int A[MATRIX_SIZE][MATRIX_SIZE],
    int B[MATRIX_SIZE][MATRIX_SIZE],
    int C[MATRIX_SIZE][MATRIX_SIZE]
) {
    // 使用標準控制介面
    #pragma HLS INTERFACE s_axilite port=return bundle=CTRL
    #pragma HLS INTERFACE s_axilite port=A bundle=CTRL
    #pragma HLS INTERFACE s_axilite port=B bundle=CTRL
    #pragma HLS INTERFACE s_axilite port=C bundle=CTRL

    // 初始化 C
    for (int i = 0; i < MATRIX_SIZE; i++) {
        for (int j = 0; j < MATRIX_SIZE; j++) {
            #pragma HLS PIPELINE II=1
            C[i][j] = 0;
        }
    }

    // 執行乘法
    for (int i = 0; i < MATRIX_SIZE; i++) {
        for (int j = 0; j < MATRIX_SIZE; j++) {
            for (int k = 0; k < MATRIX_SIZE; k++) {
                #pragma HLS PIPELINE II=1
                C[i][j] += A[i][k] * B[k][j];
            }
        }
    }
}
```

```
import time
from pynq import Overlay, DefaultIP
import numpy as np

class MatrixDriver(DefaultIP):
    # Match your IP’s "bindto" definition here
    bindto = ['xilinx.com:hls:matrix:1.0']

    def __init__(self, description):
        super().__init__(description=description)

    def multiply(self, matrix_a, matrix_b):
        """
        Send two matrices to the hardware for multiplication and retrieve the result.
        """
        # 1) Write size if required (assume offset 0x10)
        size = matrix_a.shape[0]
        self.write(0x10, size)

        # 2) Write matrix A
        for i in range(size):
            for j in range(size):
                addrA = 0x40 + 4 * (i * size + j)
                self.write(addrA, int(matrix_a[i][j]))

        # 3) Write matrix B
        for i in range(size):
            for j in range(size):
                addrB = 0x80 + 4 * (i * size + j)
                self.write(addrB, int(matrix_b[i][j]))

        # 4) Start the IP
        control_val = self.read(0x00)
        self.write(0x00, (control_val & 0x80) | 0x01)

        # 5) Poll for ap_done
        while True:
            status = self.read(0x00)
            if (status & 0x2) != 0:  # bit 1 for ap_done
                break

        # 6) Read matrix C
        result = np.zeros((size, size), dtype=int)
        for i in range(size):
            for j in range(size):
                addrC = 0xC0 + 4 * (i * size + j)
                result[i][j] = self.read(addrC)

        return result

# --------------------------------------------------------------------
# Now load the bitstream, create the driver instance, and test
# --------------------------------------------------------------------

# 1) Load the bitstream
overlay = Overlay('matrix.bit')

# 2) Bind the IP description to the custom driver
overlay.matrix_0 = MatrixDriver(overlay.ip_dict['matrix_0'])

# 3) Test with 3x3 matrices
matrix_a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=int)
matrix_b = np.array([[9, 8, 7], [6, 5, 4], [3, 2, 1]], dtype=int)

# 4) Check expected result via numpy
expected_result = np.dot(matrix_a, matrix_b)
print("Expected result (calculated with numpy):")
print(expected_result)

# 5) Multiply with hardware
print("\nPerforming multiplication with hardware...")
start_time_ip = time.time()
ip_result = overlay.matrix_0.multiply(matrix_a, matrix_b)
end_time_ip = time.time()

# 6) Display the result and check correctness
print("\nHardware Result:")
print(ip_result)
print(f"\nTime taken with IP: {end_time_ip - start_time_ip:.6f} seconds")

if np.array_equal(ip_result, expected_result):
    print("Verification successful: Results match.")
else:
    print("Verification failed: Results do NOT match.")
```

12*12矩陣

```
#include <ap_int.h>
#include <hls_stream.h>

#define MATRIX_SIZE 12

void matrix(
    int A[MATRIX_SIZE][MATRIX_SIZE], // 輸入矩陣 A
    int B[MATRIX_SIZE][MATRIX_SIZE], // 輸入矩陣 B
    int C[MATRIX_SIZE][MATRIX_SIZE] // 輸出矩陣 C
) {
    // AXI4-Lite interface for control signals
    #pragma HLS INTERFACE s_axilite port=return bundle=CTRL
    #pragma HLS INTERFACE s_axilite port=A bundle=CTRL
    #pragma HLS INTERFACE s_axilite port=B bundle=CTRL
    #pragma HLS INTERFACE s_axilite port=C bundle=CTRL

    // 初始化 C 矩陣
    for (int i = 0; i < MATRIX_SIZE; i++) {
        for (int j = 0; j < MATRIX_SIZE; j++) {
            #pragma HLS UNROLL // 完全展開以加速初始化
            C[i][j] = 0;
        }
    }

    // 執行矩陣乘法
    for (int i = 0; i < MATRIX_SIZE; i++) {
        for (int j = 0; j < MATRIX_SIZE; j++) {
            #pragma HLS UNROLL // 完全展開以加速初始化
            for (int k = 0; k < MATRIX_SIZE; k++) {
                #pragma HLS PIPELINE II=1 // 每次迴圈迭代佔用 1 個時鐘週期
                C[i][j] += A[i][k] * B[k][j];
            }
        }
    }
}
```