# HLS-Lab
## Week 2 Question
https://github.com/GeeCheng/hls_lab/tree/main/lab2_different_fir

### Original
```c
/*
 * FIR filter, single-loop version: each call takes one new input sample x and
 * returns the sum of c[i] * (delay-line value at tap i) over all N taps.
 * The delay line is the static array shift_reg, so its contents persist
 * between calls. N, c, data_t and acc_t are declared outside this snippet.
 */
acc_t fir(data_t x) {
// Keep fir as a separate function in the synthesized design (no inlining into the caller).
#pragma HLS INLINE off
static data_t shift_reg[N];
// Ask the tool to generate reset logic for the static delay line.
#pragma HLS RESET variable=shift_reg
acc_t acc = 0;
// Iterate from the last tap down to tap 0 so that shift_reg[i - 1] is read
// before it is overwritten by the following iteration.
for (int i = N - 1; i >= 0; --i) {
// Trip-count hint used for reporting only; 11 presumably equals N (confirm).
#pragma HLS LOOP_TRIPCOUNT min=11 max=11
if (i == 0) {
// Tap 0: multiply the new sample directly, then store it at the head of the delay line.
acc += (acc_t)x * (acc_t)c[0];
shift_reg[0] = x;
} else {
// Taps 1..N-1: shift the older sample one position, then accumulate its product.
shift_reg[i] = shift_reg[i - 1];
acc += (acc_t)shift_reg[i] * (acc_t)c[i];
}
}
return acc;
}
```
| | Latency | DSP | FF | LUT |
| -------- | ------- | --- | --- | --- |
| FIR | 22 | 2 | 361 | 272 |
| FIR loop | 16 | 1 | 331 | 229 |


### Remove if-else
```c
/*
 * FIR filter with the tap-0 special case moved out of the loop: the loop
 * handles taps N-1..1 (shift + multiply-accumulate), and tap 0 is processed
 * once after the loop, so the loop body contains no branch.
 * shift_reg is static and keeps the delay line between calls.
 * N, c, data_t and acc_t are declared outside this snippet.
 */
acc_t fir(data_t x) {
// Keep fir as a separate function in the synthesized design (no inlining into the caller).
#pragma HLS INLINE off
static data_t shift_reg[N];
// Ask the tool to generate reset logic for the static delay line.
#pragma HLS RESET variable=shift_reg
acc_t acc = 0;
// Runs for i = N-1 down to 1; descending order reads shift_reg[i - 1]
// before a later iteration overwrites it.
for (int i = N - 1; i >= 1; --i) {
// Trip-count hint used for reporting only; 10 presumably equals N - 1 (confirm).
#pragma HLS LOOP_TRIPCOUNT min=10 max=10
shift_reg[i] = shift_reg[i - 1]; // shift
acc += (acc_t)shift_reg[i] * (acc_t)c[i]; // MAC with shifted value
}
// Tap 0 uses the new sample directly; it is stored afterwards for the next call.
acc += (acc_t)x * (acc_t)c[0]; // tap0
shift_reg[0] = x;
return acc;
}
```
| | Latency | DSP | FF | LUT |
| -------- | ------- | --- | --- | --- |
| FIR | 18 | 2 | 169 | 204 |
| FIR loop | 15 | 1 | 121 | 111 |


### Separate loop: TDL & MAC
```c
/*
 * FIR filter split into two loops: the first loop only shifts the delay line
 * (TDL), the second loop only multiplies and accumulates (MAC).
 * shift_reg is static and keeps the delay line between calls.
 * N, c, data_t and acc_t are declared outside this snippet.
 */
acc_t fir(data_t x) {
// Keep fir as a separate function in the synthesized design (no inlining into the caller).
#pragma HLS INLINE off
static data_t shift_reg[N];
// Ask the tool to generate reset logic for the static delay line.
#pragma HLS RESET variable=shift_reg
// Shift loop: move every stored sample one position towards the end,
// going from the last element down so no value is overwritten before it is copied.
for (int i = N - 1; i > 0; --i) {
// Trip-count hint used for reporting only; 10 presumably equals N - 1 (confirm).
#pragma HLS LOOP_TRIPCOUNT min=10 max=10
shift_reg[i] = shift_reg[i - 1];
}
// Insert the new sample at the head of the delay line.
shift_reg[0] = x;
acc_t acc = 0;
// MAC loop: accumulate coefficient * sample over all N taps.
for (int i = 0; i < N; ++i) {
// Trip-count hint used for reporting only; 11 presumably equals N (confirm).
#pragma HLS LOOP_TRIPCOUNT min=11 max=11
acc += (acc_t)shift_reg[i] * (acc_t)c[i];
}
return acc;
}
```
| | Latency | DSP | FF | LUT |
| --------- | ------- | --- | --- | --- |
| FIR | 34 | 1 | 179 | 295 |
| FIR loop1 | 12 | 0 | 11 | 59 |
| FIR loop2 | 16 | 1 | 117 | 111 |


### TDL unroll, MAC pipeline
```c
/*
 * Two-loop FIR filter with directives: the shift (TDL) loop is unrolled and
 * the multiply-accumulate (MAC) loop is pipelined.
 * shift_reg is static and keeps the delay line between calls; it is not
 * partitioned in this variant.
 * N, c, data_t and acc_t are declared outside this snippet.
 */
acc_t fir(data_t x) {
// Keep fir as a separate function in the synthesized design (no inlining into the caller).
#pragma HLS INLINE off
static data_t shift_reg[N];
// Ask the tool to generate reset logic for the static delay line.
#pragma HLS RESET variable=shift_reg
// Shift loop: descending order so each element is copied before it is overwritten.
for (int i = N - 1; i > 0; --i) {
// No factor is given, so this requests a full unroll of the shift loop.
#pragma HLS UNROLL
shift_reg[i] = shift_reg[i - 1];
}
// Insert the new sample at the head of the delay line.
shift_reg[0] = x;
acc_t acc = 0;
// MAC loop: accumulate coefficient * sample over all N taps.
for (int i = 0; i < N; ++i) {
// Request a pipelined loop with a target initiation interval of 1.
#pragma HLS PIPELINE II=1
acc += (acc_t)shift_reg[i] * (acc_t)c[i];
}
return acc;
}
```
| | Latency | DSP | FF | LUT |
| ----------------- | ------- | --- | --- | --- |
| FIR | 28 | 1 | 302 | 332 |
| FIR pipeline loop | 16 | 1 | 117 | 111 |



### TDL unroll, array partition complete, MAC pipeline
```c
/*
 * Two-loop FIR filter: unrolled shift (TDL) loop, pipelined multiply-accumulate
 * (MAC) loop, and a completely partitioned delay line.
 * shift_reg is static and keeps the delay line between calls.
 * N, c, data_t and acc_t are declared outside this snippet.
 */
acc_t fir(data_t x) {
// Keep fir as a separate function in the synthesized design (no inlining into the caller).
#pragma HLS INLINE off
static data_t shift_reg[N];
// Ask the tool to generate reset logic for the static delay line.
#pragma HLS RESET variable=shift_reg
// Split shift_reg into individual elements so the unrolled shift is not
// limited by the ports of a single memory.
#pragma HLS ARRAY_PARTITION variable=shift_reg complete dim=1
// Shift loop: descending order so each element is copied before it is overwritten.
for (int i = N - 1; i > 0; --i) {
// No factor is given, so this requests a full unroll of the shift loop.
#pragma HLS UNROLL
shift_reg[i] = shift_reg[i - 1];
}
// Insert the new sample at the head of the delay line.
shift_reg[0] = x;
acc_t acc = 0;
// MAC loop: accumulate coefficient * sample over all N taps.
for (int i = 0; i < N; ++i) {
// Request a pipelined loop with a target initiation interval of 1.
#pragma HLS PIPELINE II=1
acc += (acc_t)shift_reg[i] * (acc_t)c[i];
}
return acc;
}
```
| | Latency | DSP | FF | LUT |
| ----------------- | ------- | --- | --- | --- |
| FIR | 16 | 1 | 302 | 332 |
| FIR pipeline loop | 14 | 1 | 302 | 332 |



### TDL unroll, MAC unroll, array partition complete
```c
/*
 * Two-loop FIR filter with both the shift (TDL) loop and the multiply-accumulate
 * (MAC) loop unrolled, and a completely partitioned delay line.
 * shift_reg is static and keeps the delay line between calls.
 * N, c, data_t and acc_t are declared outside this snippet.
 */
acc_t fir(data_t x) {
// Keep fir as a separate function in the synthesized design (no inlining into the caller).
#pragma HLS INLINE off
static data_t shift_reg[N];
// Ask the tool to generate reset logic for the static delay line.
#pragma HLS RESET variable=shift_reg
// Split shift_reg into individual elements so every tap can be accessed
// without sharing the ports of a single memory.
#pragma HLS ARRAY_PARTITION variable=shift_reg complete dim=1
// Shift loop: descending order so each element is copied before it is overwritten.
for (int i = N - 1; i > 0; --i) {
// No factor is given, so this requests a full unroll of the shift loop.
#pragma HLS UNROLL
shift_reg[i] = shift_reg[i - 1];
}
// Insert the new sample at the head of the delay line.
shift_reg[0] = x;
acc_t acc = 0;
// MAC loop: accumulate coefficient * sample over all N taps.
for (int i = 0; i < N; ++i) {
// No factor is given, so this requests a full unroll of the MAC loop as well.
#pragma HLS UNROLL
acc += (acc_t)shift_reg[i] * (acc_t)c[i];
}
return acc;
}
```
| | Latency | DSP | FF | LUT |
| --- | ------- | --- | --- | --- |
| FIR | 5 | 5 | 198 | 71 |



## Week 3 Question
### Explain Vitis programming model and the following models of data movements
1. Kernel directly access host memory
2. Host to Kernel streaming
Watch the video & draw a diagram to assist explanation
**Vitis programming model**

**Kernel directly access host memory**

**Host to Kernel streaming**

### Why use hardware scheduler? Watch the video & draw a diagram to assist explanation

### Explain how double buffering improves performance


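Double buffering overlaps data transfer with computation: while the kernel processes one buffer, the next block of data is transferred into the other buffer. This keeps all resources fully utilized, so there is no idle time on the PCIe link, the DRAM bus, or the kernel.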
### Explain the cosim operation, and the requirement for cosim


## Week 4 Question
### Explain the use of IO interface for top function (kernel)


### Explain how the ap_ctrl_chain protocol and the input FIFO and output FIFO help the host/kernel data transfer

Each has an input thread and an output thread.
The input synchronization writes ap_start and places one piece of data into the input FIFO. When ap_ready becomes 1, ap_start can be written again.
The output synchronization reads ap_done, which indicates that one piece of output data is available. At this point, the input FIFO releases one piece of data, and after the output FIFO takes it, ap_continue is written to allow the kernel to keep operating.
FIFO depth, round-trip delay, and data transfer rate are closely related.
### Refer to the slide, given burst-length = B, data access latency = L, what is the minimum # of outstanding request?


As shown in the figure, to ensure that the entire access latency is covered by data transmission, the minimum number of outstanding requests is ⌈L/B⌉ (L/B rounded up to an integer).
### Determine AXI master read outstanding configuration

Using 8 works well, as 32×4 equals 16×8.
## Week 5 Question
### Explain the best practice for task/data-level parallelism


### Calculate the II for the given code snippet


Latency: 4
Distance: 2
II = 4/2 = 2
### Rewrite the code to eliminate data dependency

1.
ld/B[i], ld/A[k], + , sd/A[k]
WAR between ld/B[i], ld/A[k]
RAW when loading A[k] before "+"
2.
```c
// WAR: resolved by renaming (the running value is kept in the local acc).
// RAW: resolved by forwarding (acc is reused when the same index repeats,
// instead of reading A[k] back from memory).
// NOTE(review): old and acc are not initialized inside this snippet; on the
// first iteration the else branch stores acc into A[old] - confirm both are
// set up before the loop.
for (i = 0; i < N; i++) {
// Index selected for this iteration.
k = B[i];
// NOTE(review): when k == old the newest value for this index is still held
// in acc and has not been written to A yet, so this copies the older A[k] - confirm intent.
C[i] = A[k];
if (old == k) {
// Same index as the previous iteration: update the forwarded value only.
acc += 1;
} else {
// Index changed: write back the pending value, then start from the new element.
A[old] = acc;
acc = A[k] + 1;
}
// Remember which index acc currently belongs to.
old = k;
}
// Write back the value that is still pending after the last iteration.
A[old] = acc;
```
### What is the II for loop-k? rewrite the code to make loop-k II = 1

```c
// Inner part of a larger loop nest: j, diagSize, dataA and dataType come from
// the enclosing code, which is not shown in this snippet.
// The row index starts at j + 1; initializing it as "i + 1" would read i
// before it has a value.
for (int i = j + 1; i < diagSize; ++i) {
    // 16 separate partial sums: iteration k accumulates into slot k % 16, so
    // back-to-back iterations update different accumulators instead of
    // waiting on a single running sum.
    dataType tmp_i[16] = {0};
    double tmp2 = 0;
    for (int k = 0; k < j; k++) {
#pragma HLS pipeline
        // NOTE(review): both factors use row i; confirm this is intended
        // rather than dataA[i][k] * dataA[j][k].
        tmp_i[k % 16] += dataA[i][k] * dataA[i][k];
    }
    ...
}
```
## Week 6 Question
### Dataflow Latency and Rate Matching

1. Set task #3 to II = 1, or duplicate it in parallel to reach an effective II of 1.
2. FIFO #1 is set to depth 2, and FIFO #2 does not exist.
### Rewrite the code to meet dataflow rules

```c
/*
 * Chain of tasks: Loop1 scales the input into temp1, Split forwards temp1
 * into temp4 and temp5, Loop2 adds 123 to temp4, and Loop3 combines the
 * results of Loop2 and the second Split output into data_out1.
 * Each intermediate array is written by one stage and read by one stage.
 *
 * data_in   - input array of N values
 * scale     - factor applied to every input value
 * data_out1 - output array, data_out1[k] = temp2[k] + temp5[k]
 * data_out2 - not written by this function
 *
 * N and Split are declared outside this snippet; Split presumably copies its
 * first argument into both of its other arguments (confirm against its definition).
 */
void foo(int data_in[N], int scale, int data_out1[N], int data_out2[N]) {
    int temp1[N], temp2[N], temp4[N], temp5[N];
    Loop1: for (int i = 0; i < N; i++) {
        temp1[i] = data_in[i] * scale;
    }
    /* Give Loop2 and Loop3 their own copy instead of sharing temp1. */
    Split(temp1, temp4, temp5);
    Loop2: for (int j = 0; j < N; j++) {
        temp2[j] = temp4[j] + 123;
    }
    Loop3: for (int k = 0; k < N; k++) {
        data_out1[k] = temp2[k] + temp5[k];
    }
}
```