1. Vitis HLS user guide (AMD): creating an HLS component
https://docs.amd.com/r/zh-CN/ug1399-vitis-hls/%E5%88%9B%E5%BB%BA-HLS-%E7%BB%84%E4%BB%B6
2. Get a license to open Vitis HLS (2024)
https://blog.csdn.net/weixin_74013425/article/details/142458347
3.Overlay Tutorial (AMD)
https://pynq.readthedocs.io/en/v2.5.1/overlay_design_methodology/overlay_tutorial.html
4.Overlay Tutorial in Chinese (1)
https://blog.csdn.net/CSD_N_csdn/article/details/105669069
5.Overlay Tutorial in Chinese (2)
https://blog.csdn.net/CSD_N_csdn/article/details/105691939
6.Overlay Tutorial in Chinese (Filter)
https://blog.csdn.net/CSD_N_csdn/article/details/105894172?sharetype=blogdetail&shareId=105894172&sharerefer=APP&sharesource=Azhuwizard&sharefrom=link
Network scan (find hosts on the local subnet):
```
# Install nmap, then ping-scan the 192.168.3.0/24 subnet
# (-sn: host discovery only, no port scan).
sudo apt install nmap
nmap -sn 192.168.3.0/24
```
adder.cpp #HLS
```
// Top-level HLS function: stores the sum of a and b into c.
// All three arguments are exposed as AXI4-Lite registers; the Python driver
// later in these notes accesses a, b and c at offsets 0x10, 0x18 and 0x20.
void adder(int a, int b, int& c) {
// ap_ctrl_none on the return port: no block-level control handshake
// (ap_start / ap_done / ap_idle) is generated for this core.
#pragma HLS INTERFACE ap_ctrl_none port=return
// Map each argument onto the AXI4-Lite slave interface.
#pragma HLS INTERFACE s_axilite port=a
#pragma HLS INTERFACE s_axilite port=b
#pragma HLS INTERFACE s_axilite port=c
c = a + b;
}
```
adder.tcl #PYNQ-Z2
```
# Vitis HLS script: synthesizes adder.cpp and exports it as an IP core.
# Open the HLS project my_adder_project
open_project my_adder_project
# Add the design source file
add_files adder.cpp
# Open solution1; -reset discards any existing data for that solution
open_solution -reset solution1
# Target device (the heading above labels this script as PYNQ-Z2);
# replace with your FPGA part number
set_part xc7z020clg400-1
# Set the top-level function
set_top adder
# Run C synthesis
csynth_design
# Export the design as an IP-catalog package with Verilog RTL
export_design -format ip_catalog -rtl verilog
# Print completion message
puts "IP core has been successfully generated and stored in the 'impl/ip' directory."
```
```
# Run the adder.tcl script with Vitis HLS (-f takes the Tcl script to execute).
vitis_hls -f adder.tcl
```
/home/esslab/Pynq_adder/my_adder_project/solution1/impl/misc/drivers
/home/esslab/Pynq_adder/my_adder_project/solution1/impl/verilog
.bit .hwh .tcl
```
import time
from pynq import Overlay, DefaultIP
# 自定義驅動程式
class AddDriver(DefaultIP):
    """PYNQ driver for the HLS ``adder`` core.

    The driver writes the two operands to the AXI4-Lite registers at
    offsets 0x10 and 0x18 and reads the sum back from offset 0x20.
    """

    # Overlay uses ``bindto`` to attach this driver automatically to matching
    # IP. The HLS top function in these notes is ``adder`` (not ``add``), so
    # the IP type is expected to be ``xilinx.com:hls:adder:1.0``.
    # NOTE(review): confirm against overlay.ip_dict['adder_0']['type'].
    bindto = ['xilinx.com:hls:adder:1.0']

    def __init__(self, description):
        super().__init__(description=description)

    def add(self, a, b):
        """Return ``a + b`` as computed by the hardware adder.

        Args:
            a: First operand (integer, interpreted as signed 32-bit).
            b: Second operand (integer, interpreted as signed 32-bit).

        Returns:
            The sum read back from the core as a signed 32-bit Python int.
        """
        # Registers hold raw 32-bit words: mask the operands to their
        # two's-complement bit pattern so negative values can be written.
        self.write(0x10, int(a) & 0xFFFFFFFF)
        self.write(0x18, int(b) & 0xFFFFFFFF)
        raw = self.read(0x20)
        # read() yields an unsigned word; the core computes a signed int.
        return raw - (1 << 32) if raw & 0x80000000 else raw
# Load the bitstream
overlay = Overlay('adder.bit')
# Fetch the IP description from ip_dict ('adder_0' is the IP instance name)
adder_description = overlay.ip_dict['adder_0']
# Manually bind adder_0 to AddDriver
overlay.adder_0 = AddDriver(adder_description)
# Test data
a, b = 123, 456
# Time the addition performed by the IP.
# perf_counter() is used instead of time() because it is a monotonic,
# high-resolution clock intended for measuring short durations.
start_time_ip = time.perf_counter()
ip_result = overlay.adder_0.add(a, b)
end_time_ip = time.perf_counter()
time_ip = end_time_ip - start_time_ip
print(f"IP Result: {ip_result}, Time taken with IP: {time_ip:.6f} seconds")
# Time the same addition in pure Python
start_time_python = time.perf_counter()
python_result = a + b
end_time_python = time.perf_counter()
time_python = end_time_python - start_time_python
print(f"Python Result: {python_result}, Time taken with Python: {time_python:.6f} seconds")
# Check that both results agree
if ip_result == python_result:
    print("Results are consistent.")
else:
    print("Results are inconsistent.")
```
File permissions (make the matrix notebook directory accessible):
```
# Grant read/write/execute on the matrix notebook directory to every user
# (no -R, so only the directory entry itself is changed).
sudo chmod 777 /media/esslab/root/home/xilinx/jupyter_notebooks/matrix
```
3*3矩陣乘法版本
```
#include <ap_int.h>
#include <hls_stream.h>
#define MATRIX_SIZE 3

// Computes C = A * B for MATRIX_SIZE x MATRIX_SIZE int matrices.
//
// A, B: input matrices.
// C:    output matrix; every element is overwritten.
//
// All three arrays and the block-level control are mapped into the single
// AXI4-Lite bundle CTRL.
void matrix(
    int A[MATRIX_SIZE][MATRIX_SIZE],
    int B[MATRIX_SIZE][MATRIX_SIZE],
    int C[MATRIX_SIZE][MATRIX_SIZE]
) {
    // Standard block-level control interface plus the array memories.
#pragma HLS INTERFACE s_axilite port=return bundle=CTRL
#pragma HLS INTERFACE s_axilite port=A bundle=CTRL
#pragma HLS INTERFACE s_axilite port=B bundle=CTRL
#pragma HLS INTERFACE s_axilite port=C bundle=CTRL

    for (int i = 0; i < MATRIX_SIZE; i++) {
        for (int j = 0; j < MATRIX_SIZE; j++) {
            // Accumulate the dot product in a local variable instead of
            // doing a read-modify-write on C every iteration: the pipelined
            // loop then carries its dependence through a register rather
            // than through the interface memory, and no separate pass to
            // zero C is needed.
            int acc = 0;
            for (int k = 0; k < MATRIX_SIZE; k++) {
#pragma HLS PIPELINE II=1
                acc += A[i][k] * B[k][j];
            }
            C[i][j] = acc;
        }
    }
}
```
```
import time
from pynq import Overlay, DefaultIP
import numpy as np
class MatrixDriver(DefaultIP):
    """PYNQ driver for the HLS ``matrix`` multiplication core.

    The core exposes its block-level control register and the three matrix
    memories through one AXI4-Lite interface. This driver uses control at
    0x00 and matrices A, B and C at 0x40, 0x80 and 0xC0, stored row-major as
    32-bit words.
    NOTE(review): these offsets are for the 3x3 build; a larger MATRIX_SIZE
    changes the address map, so check the generated driver header.
    """

    # Match your IP's "bindto" definition here
    bindto = ['xilinx.com:hls:matrix:1.0']

    # Byte offsets inside the AXI4-Lite address space.
    _CTRL = 0x00
    _A_BASE = 0x40
    _B_BASE = 0x80
    _C_BASE = 0xC0
    # Bits of the control register used below.
    _AP_START = 0x01
    _AP_DONE = 0x02
    _AUTO_RESTART = 0x80

    def __init__(self, description):
        super().__init__(description=description)

    def _write_matrix(self, base, matrix, size):
        """Write a size x size matrix row-major as 32-bit words at ``base``."""
        for i in range(size):
            for j in range(size):
                # Mask to the two's-complement bit pattern so negative
                # elements can be written as raw 32-bit words.
                self.write(base + 4 * (i * size + j),
                           int(matrix[i][j]) & 0xFFFFFFFF)

    def multiply(self, matrix_a, matrix_b):
        """Multiply two square matrices on the hardware core.

        Args:
            matrix_a: Left operand, square 2-D array of integers.
            matrix_b: Right operand, same shape as ``matrix_a``.

        Returns:
            numpy array (dtype int) holding the product read back from the
            core, with each word interpreted as a signed 32-bit value.

        Raises:
            ValueError: If the inputs are not square matrices of equal
                shape, or do not fit the address window of one matrix.
        """
        matrix_a = np.asarray(matrix_a)
        matrix_b = np.asarray(matrix_b)
        if matrix_a.ndim != 2 or matrix_a.shape[0] != matrix_a.shape[1]:
            raise ValueError("matrix_a must be a square 2-D matrix")
        if matrix_b.shape != matrix_a.shape:
            raise ValueError("matrix_b must have the same shape as matrix_a")
        size = matrix_a.shape[0]
        # Each matrix owns the bytes up to the next base address; anything
        # larger would spill into the neighbouring matrix.
        if 4 * size * size > self._B_BASE - self._A_BASE:
            raise ValueError("matrix does not fit the core's address window")

        # The HLS function in these notes takes only A, B and C, so there is
        # no size register to program before loading the operands.
        self._write_matrix(self._A_BASE, matrix_a, size)
        self._write_matrix(self._B_BASE, matrix_b, size)

        # Start the IP: set ap_start while keeping the auto-restart bit.
        control_val = self.read(self._CTRL)
        self.write(self._CTRL,
                   (control_val & self._AUTO_RESTART) | self._AP_START)

        # Poll the control register until ap_done (bit 1) is set.
        while not (self.read(self._CTRL) & self._AP_DONE):
            pass

        # Read matrix C back, converting each unsigned word to a signed int.
        result = np.zeros((size, size), dtype=int)
        for i in range(size):
            for j in range(size):
                raw = self.read(self._C_BASE + 4 * (i * size + j))
                result[i][j] = raw - (1 << 32) if raw & 0x80000000 else raw
        return result
# --------------------------------------------------------------------
# Load the bitstream, create the driver instance, and test
# --------------------------------------------------------------------
# 1) Load the bitstream
overlay = Overlay('matrix.bit')
# 2) Bind the IP description to the custom driver
overlay.matrix_0 = MatrixDriver(overlay.ip_dict['matrix_0'])
# 3) Test with 3x3 matrices
matrix_a = np.array([[1, 2, 3],
                     [4, 5, 6],
                     [7, 8, 9]], dtype=int)
matrix_b = np.array([[9, 8, 7],
                     [6, 5, 4],
                     [3, 2, 1]], dtype=int)
# 4) Compute the reference result with numpy
expected_result = np.dot(matrix_a, matrix_b)
print("Expected result (calculated with numpy):")
print(expected_result)
# 5) Multiply with hardware.
# perf_counter() is used instead of time() because it is a monotonic,
# high-resolution clock intended for measuring elapsed time.
print("\nPerforming multiplication with hardware...")
start_time_ip = time.perf_counter()
ip_result = overlay.matrix_0.multiply(matrix_a, matrix_b)
end_time_ip = time.perf_counter()
# 6) Display the result and check correctness
print("\nHardware Result:")
print(ip_result)
print(f"\nTime taken with IP: {end_time_ip - start_time_ip:.6f} seconds")
if np.array_equal(ip_result, expected_result):
    print("Verification successful: Results match.")
else:
    print("Verification failed: Results do NOT match.")
```
12*12矩陣
```
#include <ap_int.h>
#include <hls_stream.h>
#define MATRIX_SIZE 12

// Computes C = A * B for MATRIX_SIZE x MATRIX_SIZE int matrices.
//
// A: input matrix A
// B: input matrix B
// C: output matrix C; every element is overwritten.
void matrix(
    int A[MATRIX_SIZE][MATRIX_SIZE],
    int B[MATRIX_SIZE][MATRIX_SIZE],
    int C[MATRIX_SIZE][MATRIX_SIZE]
) {
    // AXI4-Lite interface for the control signals and the three matrices
#pragma HLS INTERFACE s_axilite port=return bundle=CTRL
#pragma HLS INTERFACE s_axilite port=A bundle=CTRL
#pragma HLS INTERFACE s_axilite port=B bundle=CTRL
#pragma HLS INTERFACE s_axilite port=C bundle=CTRL

    for (int i = 0; i < MATRIX_SIZE; i++) {
        for (int j = 0; j < MATRIX_SIZE; j++) {
            // Accumulate in a local variable and store once per element.
            // This replaces the separate zeroing pass and the per-iteration
            // read-modify-write of C. The j loop is not unrolled: its copies
            // would all need to access the same interface memories at once.
            int acc = 0;
            for (int k = 0; k < MATRIX_SIZE; k++) {
#pragma HLS PIPELINE II=1
                acc += A[i][k] * B[k][j];
            }
            C[i][j] = acc;
        }
    }
}
```