# Groq module - MEM & HLS 2

## Pipeline stalled
<details>
<summary>example</summary>
```cpp
extern "C" {
/**
 * Coupled memory-slice kernel: fetches, forwards and executes at most one
 * instruction per invocation, with the dskew idle logic in the same control flow.
 *
 * @param inst_in   incoming instruction stream; the call returns at once when it is empty
 * @param inst_out  every fetched instruction is forwarded here unchanged
 * @param west_in   write-data source when the instruction's side field is 0
 * @param west_out  read-data destination when the side field is 0
 * @param east_in   write-data source when the side field is 1
 * @param east_out  read-data destination when the side field is 1
 */
void mem_slice(
    hls::stream<mem_inst_t> &inst_in,
    hls::stream<mem_inst_t> &inst_out,
    hls::stream<data_full_t> &west_in,
    hls::stream<data_full_t> &west_out,
    hls::stream<data_full_t> &east_in,
    hls::stream<data_full_t> &east_out
) {
#pragma HLS INTERFACE axis port=inst_in
#pragma HLS INTERFACE axis port=inst_out
#pragma HLS INTERFACE axis port=west_in
#pragma HLS INTERFACE axis port=west_out
#pragma HLS INTERFACE axis port=east_in
#pragma HLS INTERFACE axis port=east_out
#pragma HLS INTERFACE ap_ctrl_none port=return
#pragma HLS PIPELINE II=1
#pragma HLS LATENCY min=1 max=6
    if (inst_in.empty()) return;
    mem_inst_t inst = inst_in.read();
    inst_out.write(inst); // forward
    // Decode the instruction fields.
    // dskew was used below without ever being declared; decode it like the other fields.
    ap_uint<MEM_DSKEW_BITS> dskew = get_dskew(inst);
    ap_uint<MEM_OPCODE_BITS> op = get_opcode(inst);
    ap_uint<MEM_ADDR_BITS> addr13 = get_addr(inst);
    ap_uint<MEM_SIDE_BITS> side = get_side(inst);
    ap_uint<MEM_SRC_DST_BITS> srcdst = get_srcdst(inst);
    ap_uint<MEM_RESERVED_BITS> resv = get_reserved(inst);
    ap_uint<MEM_ICU_BITS> icu = get_icu(inst);
    (void)resv; // unused field; silences the unused-variable warning
    (void)icu;
    // The 13-bit address field counts words; shift left by 4 to get a byte address.
    ap_uint<32> byte_addr = ((ap_uint<32>)addr13) << 4;
    // Skew state survives across invocations (function-static).
    static ap_uint<4> skew_cnt = 0;
    static bool skew_active = false;
#pragma HLS RESET variable=skew_cnt
#pragma HLS RESET variable=skew_active
    // --------------------------
    // dskew handling (idle-cycle control)
    // --------------------------
    // NOTE: whenever one of the returns below is taken, the instruction fetched
    // above has already been consumed from inst_in and forwarded to inst_out.
    if (!skew_active && dskew > 0) {
        skew_cnt = dskew;
        skew_active = true;
        return; // stall for one cycle first
    }
    if (skew_active) {
        if (skew_cnt > 0) {
            skew_cnt--;
            return; // keep idling until the countdown finishes
        } else {
            skew_active = false; // countdown finished -> fall through and execute
        }
    }
    bool west_valid = false, east_valid = false;
    data_full_t west_data = 0, east_data = 0;
    // Handle WRITE: take one beat from the selected side and store the selected lane.
    if (op == MEM_OP_WRITE) {
        if (side == 0 && !west_in.empty()) {
            data_full_t din = west_in.read();
            write_mem(byte_addr, get_stream_slice(din, srcdst));
        } else if (side == 1 && !east_in.empty()) {
            data_full_t din = east_in.read();
            write_mem(byte_addr, get_stream_slice(din, srcdst));
        }
    }
    // Handle READ: place the stored word into the selected lane of the outgoing beat.
    if (op == MEM_OP_READ) {
        DataUnit dout = read_mem(byte_addr);
        if (side == 0) {
            set_stream_slice(west_data, srcdst, dout);
            west_valid = true;
        } else {
            set_stream_slice(east_data, srcdst, dout);
            east_valid = true;
        }
    }
    // Emit the result on whichever side was selected.
    if (west_valid) west_out.write(west_data);
    if (east_valid) east_out.write(east_data);
}
} // extern "C"
```
</details>
In the current design of mem_slice.cpp:
* The dskew logic is handled inside the main pipeline.
* Once the module enters the idle (return) phase, the top-level function stops progressing.
* Because `#pragma HLS PIPELINE II=1` enforces a single control flow, the inst_in AXI stream interface will also experience back-pressure (TREADY = 0).
In other words:
* Idle (dskew active) → entire pipeline stalls → no new instruction is accepted.
### Solution - decouple inst fetch from engine
<details>
<summary> decouple version mem_slice.cpp</summary>
```cpp
#include "mem_slice.h"
// ---------- backing storage: a single file-scope bank ----------
static DataUnit mem_array[MEM_SLICE_WORDS];

// ---------- memory access ----------
// Stores `dw` into the word selected by `byte_addr`: the byte address is
// divided by WORD_BYTES and wrapped to the MEM_SLICE_WORDS entries of the bank.
void write_mem(ap_uint<32> byte_addr, const DataUnit &dw) {
#pragma HLS INLINE
    const unsigned int word_slot =
        static_cast<unsigned int>((byte_addr / WORD_BYTES) % MEM_SLICE_WORDS);
    mem_array[word_slot] = dw;
}

// Returns the word selected by `byte_addr`, using the same address mapping as write_mem.
DataUnit read_mem(ap_uint<32> byte_addr) {
#pragma HLS INLINE
    return mem_array[static_cast<unsigned int>((byte_addr / WORD_BYTES) % MEM_SLICE_WORDS)];
}
// ---------- slice helpers ----------
// These are deliberately NOT marked `inline`: mem_slice.h declares both as
// ordinary external functions and the test bench calls them from a different
// translation unit, so an out-of-line definition must exist here. Synthesis
// inlining is still requested through `#pragma HLS INLINE`.

// Returns lane `idx` of `full`, i.e. bits [(idx+1)*STREAM_WIDTH-1 : idx*STREAM_WIDTH].
DataUnit get_stream_slice(const data_full_t &full, unsigned idx) {
#pragma HLS INLINE
    return full.range((idx+1)*STREAM_WIDTH-1, idx*STREAM_WIDTH);
}

// Overwrites lane `idx` of `full` with `val`; all other lanes are left untouched.
void set_stream_slice(data_full_t &full, unsigned idx, const DataUnit &val) {
#pragma HLS INLINE
    full.range((idx+1)*STREAM_WIDTH-1, idx*STREAM_WIDTH) = val;
}
// ---------- top-level DUT ----------
extern "C" {
// Memory-slice kernel with an instruction FIFO placed between the fetch and
// execute steps.
//
// inst_in / inst_out : instruction input and forwarded copy of every executed fetch
// west_in / east_in  : write-data sources, selected by the instruction's side field (0 = west, 1 = east)
// west_out / east_out: read-data destinations, selected by the same side field
void mem_slice(
hls::stream<mem_inst_t> &inst_in,
hls::stream<mem_inst_t> &inst_out,
hls::stream<data_full_t> &west_in,
hls::stream<data_full_t> &west_out,
hls::stream<data_full_t> &east_in,
hls::stream<data_full_t> &east_out
) {
#pragma HLS INTERFACE axis port=inst_in
#pragma HLS INTERFACE axis port=inst_out
#pragma HLS INTERFACE axis port=west_in
#pragma HLS INTERFACE axis port=west_out
#pragma HLS INTERFACE axis port=east_in
#pragma HLS INTERFACE axis port=east_out
#pragma HLS INTERFACE ap_ctrl_none port=return
#pragma HLS PIPELINE II=1
#pragma HLS LATENCY min=1 max=6
// Single-level instruction FIFO between fetch and execute.
// NOTE(review): inst_fifo is a function-local, non-static stream that is written
// and then read within the same invocation — confirm it actually buffers
// instructions across calls as intended.
hls::stream<mem_inst_t> inst_fifo;
#pragma HLS STREAM variable=inst_fifo depth=8
// Fetch stage
if (!inst_in.empty() && !inst_fifo.full())
inst_fifo.write(inst_in.read());
// Execute stage
if (inst_fifo.empty()) return;
mem_inst_t inst = inst_fifo.read();
inst_out.write(inst); // forward
// Decode the instruction fields
ap_uint<MEM_DSKEW_BITS> dskew = get_dskew(inst);
ap_uint<MEM_OPCODE_BITS> op = get_opcode(inst);
ap_uint<MEM_ADDR_BITS> addr13 = get_addr(inst);
ap_uint<MEM_SIDE_BITS> side = get_side(inst);
ap_uint<MEM_SRC_DST_BITS> srcdst = get_srcdst(inst);
ap_uint<MEM_RESERVED_BITS> resv = get_reserved(inst);
ap_uint<MEM_ICU_BITS> icu = get_icu(inst);
(void)resv; // unused field; silences the unused-variable warning
(void)icu;
// Word address -> byte address (shift by 4, i.e. multiply by 16)
ap_uint<32> byte_addr = ((ap_uint<32>)addr13) << 4;
// Skew state is function-static, so it persists across invocations
static ap_uint<4> skew_cnt = 0;
static bool skew_active = false;
#pragma HLS RESET variable=skew_cnt
#pragma HLS RESET variable=skew_active
// --------------------------
// dskew handling (idle-cycle control)
// --------------------------
// Note: the instruction popped above has already been forwarded to inst_out
// whenever one of the returns below is taken.
if (!skew_active && dskew > 0) {
skew_cnt = dskew;
skew_active = true;
return; // stall for one cycle first
}
if (skew_active) {
if (skew_cnt > 0) {
skew_cnt--;
return; // keep idling until the countdown finishes
} else {
skew_active = false; // countdown finished -> fall through and execute
}
}
bool west_valid = false, east_valid = false;
data_full_t west_data = 0, east_data = 0;
// Handle WRITE: read one beat from the selected side and store the lane picked by srcdst
if (op == MEM_OP_WRITE) {
if (side == 0 && !west_in.empty()) {
data_full_t din = west_in.read();
write_mem(byte_addr, get_stream_slice(din, srcdst));
} else if (side == 1 && !east_in.empty()) {
data_full_t din = east_in.read();
write_mem(byte_addr, get_stream_slice(din, srcdst));
}
}
// Handle READ: place the stored word into lane srcdst of an otherwise-zero beat
if (op == MEM_OP_READ) {
DataUnit dout = read_mem(byte_addr);
if (side == 0) {
set_stream_slice(west_data, srcdst, dout);
west_valid = true;
} else {
set_stream_slice(east_data, srcdst, dout);
east_valid = true;
}
}
// Emit the result on the selected side
if (west_valid) west_out.write(west_data);
if (east_valid) east_out.write(east_data);
}
} // extern "C"
```
</details>
<details>
<summary>decouple version mem_slice.h</summary>
```cpp
#ifndef MEM_SLICE_H
#define MEM_SLICE_H
// Raise ap_int's maximum bit width before including ap_int.h:
// data_full_t below is STREAM_WIDTH * NUM_OF_STREAMS = 128 * 32 = 4096 bits wide.
#define AP_INT_MAX_W 4096
#include <ap_int.h>
#include <hls_stream.h>
#include <stdint.h>
// ---------------------------
// Instruction bit layout (LSB..MSB):
// 0:3 dskew (4)
// 4:5 opcode (2)
// 6:18 addr (13)
// 19 w_or_e (1)
// 20:24 stream_src_dst (5)
// 25:29 reserved (5)
// 30:31 ICU (2)
// ---------------------------
// Field widths, in bits, of the layout above.
#define MEM_DSKEW_BITS 4
#define MEM_OPCODE_BITS 2
#define MEM_ADDR_BITS 13
#define MEM_SIDE_BITS 1
#define MEM_SRC_DST_BITS 5
#define MEM_RESERVED_BITS 5
#define MEM_ICU_BITS 2
// Memory geometry: one word is WORD_BYTES bytes (= STREAM_WIDTH bits);
// a slice holds MEM_SLICE_BYTES / WORD_BYTES = 4096 words.
#define WORD_BYTES 16
#define STREAM_WIDTH 128
#define NUM_OF_STREAMS 32
#define MEM_SLICE_BYTES (16 * 4096)
#define MEM_SLICE_WORDS (MEM_SLICE_BYTES / WORD_BYTES)
// ---------- types ----------
// One raw 32-bit instruction word; fields are extracted by the get_* helpers below.
typedef struct {
ap_uint<32> raw;
} mem_inst_t;
// One memory word / one stream lane.
typedef ap_uint<STREAM_WIDTH> DataUnit;
// One full data beat: NUM_OF_STREAMS lanes of STREAM_WIDTH bits each.
typedef ap_uint<STREAM_WIDTH * NUM_OF_STREAMS> data_full_t;
// Values of the 2-bit opcode field (value 2 is not assigned here).
typedef enum mem_opcode {
MEM_OP_READ = 0,
MEM_OP_WRITE = 1,
NOP = 3
} mem_opcode_t;
// ---------- decode helpers ----------
// Each helper returns one field of the instruction word, per the bit layout above.
inline ap_uint<MEM_DSKEW_BITS> get_dskew (const mem_inst_t &i) { return i.raw.range(3,0); }
inline ap_uint<MEM_OPCODE_BITS> get_opcode (const mem_inst_t &i) { return i.raw.range(5,4); }
inline ap_uint<MEM_ADDR_BITS> get_addr (const mem_inst_t &i) { return i.raw.range(18,6); }
inline ap_uint<MEM_SIDE_BITS> get_side (const mem_inst_t &i) { return i.raw.bit(19); }
inline ap_uint<MEM_SRC_DST_BITS> get_srcdst (const mem_inst_t &i) { return i.raw.range(24,20); }
inline ap_uint<MEM_RESERVED_BITS> get_reserved (const mem_inst_t &i) { return i.raw.range(29,25); }
inline ap_uint<MEM_ICU_BITS> get_icu (const mem_inst_t &i) { return i.raw.range(31,30); }
// ---------- memory access function declarations ----------
// Word store/load by byte address, and per-lane access into a full data beat.
// These are declared here as ordinary (non-inline) external functions.
void write_mem(ap_uint<32> byte_addr, const DataUnit &dw);
DataUnit read_mem(ap_uint<32> byte_addr);
DataUnit get_stream_slice(const data_full_t &full, unsigned idx);
void set_stream_slice(data_full_t &full, unsigned idx, const DataUnit &val);
// ---------- top-level function ----------
// Processes at most one instruction per call; see mem_slice.cpp for the behaviour.
extern "C" void mem_slice(
hls::stream<mem_inst_t> &inst_in,
hls::stream<mem_inst_t> &inst_out,
hls::stream<data_full_t> &west_in,
hls::stream<data_full_t> &west_out,
hls::stream<data_full_t> &east_in,
hls::stream<data_full_t> &east_out
);
#endif // MEM_SLICE_H
```
</details>
<details>
<summary>decouple version mem_slice_test.cpp</summary>
```cpp
#include "mem_slice.h"
#include <iostream>
#include <iomanip>
using namespace std;
// ================================================================
// Helper: instruction builder
// ================================================================
// Packs the given fields into a 32-bit instruction word following the layout
// documented in mem_slice.h. The reserved and ICU fields are left at zero.
mem_inst_t make_inst(
    ap_uint<MEM_DSKEW_BITS> dskew,
    ap_uint<MEM_OPCODE_BITS> opcode,
    ap_uint<MEM_ADDR_BITS> addr,
    ap_uint<MEM_SIDE_BITS> side,
    ap_uint<MEM_SRC_DST_BITS> srcdst)
{
    mem_inst_t word;
    word.raw = 0;              // start from an all-zero word
    word.raw(3, 0)   = dskew;  // bits  3..0
    word.raw(5, 4)   = opcode; // bits  5..4
    word.raw(18, 6)  = addr;   // bits 18..6
    word.raw[19]     = side;   // bit  19
    word.raw(24, 20) = srcdst; // bits 24..20
    return word;
}
// ================================================================
// Main testbench
// ================================================================
// Self-checking test bench for mem_slice.
//
// Each case drives the DUT one call at a time. Read-back data is compared with
// the value that was written, and the number of failed checks decides the exit
// status, so a regression fails C simulation instead of only printing a marker.
int main() {
    hls::stream<mem_inst_t> inst_in, inst_out;
    hls::stream<data_full_t> west_in, west_out, east_in, east_out;
    int failures = 0; // number of failed checks; non-zero => non-zero exit status

    cout << "============================" << endl;
    cout << " mem_slice 全功能測試開始" << endl;
    cout << "============================" << endl;

    // ================================================================
    // Case 1: Basic WRITE / READ test (west side)
    // ================================================================
    {
        cout << "\n[CASE 1] Basic west WRITE/READ test\n";
        data_full_t input_data = 0;
        DataUnit dword = 0xAABBCCDDEEFF1122;
        set_stream_slice(input_data, 0, dword);
        mem_inst_t inst_w = make_inst(0, MEM_OP_WRITE, 0x001, 0, 0);
        mem_inst_t inst_r = make_inst(0, MEM_OP_READ , 0x001, 0, 0);
        west_in.write(input_data);
        inst_in.write(inst_w);
        mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out);
        inst_in.write(inst_r);
        mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out);
        if (!west_out.empty()) {
            data_full_t outdata = west_out.read();
            DataUnit dout = get_stream_slice(outdata, 0);
            cout << " READ result = " << hex << dout << endl;
            if (dout != dword) {
                cout << " ❌ Mismatch: expected " << hex << dword << endl;
                failures++;
            }
        } else {
            cout << " ❌ No output on west_out!" << endl;
            failures++;
        }
        while (!inst_out.empty()) inst_out.read();
    }

    // ================================================================
    // Case 2: East side test
    // ================================================================
    {
        cout << "\n[CASE 2] East side WRITE/READ test\n";
        data_full_t eastdata = 0;
        DataUnit dword = 0x123456789ABCDEF0;
        set_stream_slice(eastdata, 2, dword);
        mem_inst_t inst_w = make_inst(0, MEM_OP_WRITE, 0x002, 1, 2);
        mem_inst_t inst_r = make_inst(0, MEM_OP_READ , 0x002, 1, 2);
        east_in.write(eastdata);
        inst_in.write(inst_w);
        mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out);
        inst_in.write(inst_r);
        mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out);
        if (!east_out.empty()) {
            data_full_t outdata = east_out.read();
            DataUnit dout = get_stream_slice(outdata, 2);
            cout << " READ result = " << hex << dout << endl;
            if (dout != dword) {
                cout << " ❌ Mismatch: expected " << hex << dword << endl;
                failures++;
            }
        } else {
            cout << " ❌ No output on east_out!" << endl;
            failures++;
        }
        while (!inst_out.empty()) inst_out.read();
    }

    // ================================================================
    // Case 3: dskew delay test (write before read)
    // ================================================================
    {
        cout << "\n[CASE 3] dskew delay test\n";
        // Step 1: write a value to addr 0x003
        data_full_t input_data = 0;
        DataUnit write_word = 0xDEADBEEFCAFEBABE;
        set_stream_slice(input_data, 0, write_word);
        mem_inst_t inst_w = make_inst(0, MEM_OP_WRITE, 0x003, 0, 0);
        west_in.write(input_data);
        inst_in.write(inst_w);
        mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out);
        cout << " Wrote 0x" << hex << write_word << " to addr 0x003\n";
        // Step 2: issue READ with dskew = 3, one instruction per call
        mem_inst_t inst_delay = make_inst(3, MEM_OP_READ, 0x003, 0, 0);
        bool got_read = false; // set once any delayed read result shows up
        for (int cycle = 0; cycle < 6; cycle++) {
            inst_in.write(inst_delay);
            mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out);
            cout << " Cycle " << dec << cycle << " processed" << endl;
            if (!west_out.empty()) {
                data_full_t outdata = west_out.read();
                DataUnit dout = get_stream_slice(outdata, 0);
                got_read = true;
                if (dout == write_word) {
                    cout << " ✅ READ result = " << hex << dout
                         << " (at cycle " << dec << cycle << ")\n";
                } else {
                    cout << " ❌ READ result = " << hex << dout
                         << ", expected " << write_word
                         << " (at cycle " << dec << cycle << ")\n";
                    failures++;
                }
            }
        }
        if (!got_read) {
            cout << " ❌ No delayed READ result observed!\n";
            failures++;
        }
        cout << " ✅ Expected: READ occurs ~3 cycles after issue.\n";
        while (!inst_out.empty()) inst_out.read();
    }

    // ================================================================
    // Case 4: NOP test
    // ================================================================
    {
        cout << "\n[CASE 4] NOP instruction test\n";
        mem_inst_t inst_nop = make_inst(0, NOP, 0x000, 0, 0);
        inst_in.write(inst_nop);
        mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out);
        if (west_out.empty() && east_out.empty()) {
            cout << " ✅ No output as expected.\n";
        } else {
            cout << " ❌ Unexpected output detected!\n";
            failures++;
        }
        while (!inst_out.empty()) inst_out.read();
    }

    // ================================================================
    // Case 5: Continuous pipeline test (smoke test only: no value checks)
    // ================================================================
    {
        cout << "\n[CASE 5] Continuous instruction stream test\n";
        for (int i = 0; i < 4; i++) {
            mem_inst_t inst_w = make_inst(i % 2, MEM_OP_WRITE, 0x010 + i, i & 1, i);
            mem_inst_t inst_r = make_inst(0, MEM_OP_READ , 0x010 + i, i & 1, i);
            data_full_t ddata = 0;
            DataUnit word = (0xABCD0000 | i);
            set_stream_slice(ddata, i, word);
            if (i & 1) east_in.write(ddata);
            else west_in.write(ddata);
            inst_in.write(inst_w);
            inst_in.write(inst_r);
        }
        for (int t = 0; t < 10; t++) {
            mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out);
        }
        cout << " ✅ Continuous streaming executed.\n";
        // Drain every stream so nothing is left over when the test bench exits.
        while (!inst_out.empty()) inst_out.read();
        while (!west_out.empty()) west_out.read();
        while (!east_out.empty()) east_out.read();
        while (!west_in.empty()) west_in.read();
        while (!east_in.empty()) east_in.read();
    }

    cout << "\n============================" << endl;
    if (failures == 0) {
        cout << " ✅ All test cases done." << endl;
    } else {
        cout << " ❌ " << dec << failures << " check(s) failed." << endl;
    }
    cout << "============================" << endl;
    return failures == 0 ? 0 : 1;
}
```
</details>
## Integrate into Bundle
<details>
<summary>decouple version mem_bundle.cpp</summary>
```cpp
#include "mem_bundle.h"
// bundle: inst per-slice, data chained (slice0 -> slice1 -> slice2 -> slice3)
//
// inst_in[k] / inst_out[k] : private instruction input/forward streams of slice k
// west_in  -> w_mid[0..2] -> west_out : west data lane threaded through all four slices
// east_in  -> e_mid[0..2] -> east_out : east data lane threaded through all four slices
//
// All four calls target the same non-template mem_slice function.
extern "C" void mem_bundle(
hls::stream<mem_inst_t> inst_in[4],
hls::stream<mem_inst_t> inst_out[4],
hls::stream<data_full_t> &west_in,
hls::stream<data_full_t> &west_out,
hls::stream<data_full_t> &east_in,
hls::stream<data_full_t> &east_out
) {
#pragma HLS DATAFLOW
#pragma HLS INTERFACE ap_ctrl_none port=return
// mid data streams chain the slices (pass-through semantics)
static hls::stream<data_full_t> w_mid[3];
static hls::stream<data_full_t> e_mid[3];
#pragma HLS STREAM variable=w_mid depth=8
#pragma HLS STREAM variable=e_mid depth=8
// instantiate four independent slices, chaining the data streams:
// slice k writes w_mid[k] / e_mid[k], slice k+1 reads them
mem_slice(inst_in[0], inst_out[0], west_in, w_mid[0], east_in, e_mid[0]);
mem_slice(inst_in[1], inst_out[1], w_mid[0], w_mid[1], e_mid[0], e_mid[1]);
mem_slice(inst_in[2], inst_out[2], w_mid[1], w_mid[2], e_mid[1], e_mid[2]);
mem_slice(inst_in[3], inst_out[3], w_mid[2], west_out, e_mid[2], east_out);
}
```
</details>
### Problem1 - Data dependency

### Solution1 - template
```cpp
template<int SLICE_ID>
void mem_slice(
hls::stream<mem_inst_t> &inst_in,
hls::stream<mem_inst_t> &inst_out,
hls::stream<data_full_t> &west_in,
hls::stream<data_full_t> &west_out,
hls::stream<data_full_t> &east_in,
hls::stream<data_full_t> &east_out
) {
#pragma HLS PIPELINE II=1
#pragma HLS LATENCY min=1 max=6
static ap_uint<4> skew_cnt = 0;
static bool skew_active = false;
...
}
```
In HLS, a `template<int ID>` parameter makes the compiler generate a separate hardware instance for each ID value from the same piece of code, and each instance has its own independent pipeline state and static variables.
* Solves the “static feedback dependence” error in dataflow
* Enables multiple tasks to truly run in parallel
### Problem2 - Undefined template instantiation
```
obj/mem_bundle.o: in function mem_bundle': /home/ubuntu/MEM_hls_Ray/Groq_MEM/Integration1/csim/build/../../../../mem_bundle.cpp:22:
undefined reference to
void mem_slice<0>(
hls::stream<mem_inst_t, 0>&,
hls::stream<mem_inst_t, 0>&,
hls::stream<ap_uint<4096>, 0>&,
hls::stream<ap_uint<4096>, 0>&,
hls::stream<ap_uint<4096>, 0>&,
hls::stream<ap_uint<4096>, 0>&
)
```
### Solution2 - explicitly instantiate the template in mem_slice.cpp
```cpp
// Explicitly instantiate the four template specializations mem_slice<0>..mem_slice<3>.
// The template body lives in mem_slice.cpp, so these definitions are what
// mem_bundle.cpp links against (otherwise: "undefined reference to mem_slice<0>").
template void mem_slice<0>(
hls::stream<mem_inst_t> &,
hls::stream<mem_inst_t> &,
hls::stream<data_full_t> &,
hls::stream<data_full_t> &,
hls::stream<data_full_t> &,
hls::stream<data_full_t> &
);
template void mem_slice<1>(
hls::stream<mem_inst_t> &,
hls::stream<mem_inst_t> &,
hls::stream<data_full_t> &,
hls::stream<data_full_t> &,
hls::stream<data_full_t> &,
hls::stream<data_full_t> &
);
template void mem_slice<2>(
hls::stream<mem_inst_t> &,
hls::stream<mem_inst_t> &,
hls::stream<data_full_t> &,
hls::stream<data_full_t> &,
hls::stream<data_full_t> &,
hls::stream<data_full_t> &
);
template void mem_slice<3>(
hls::stream<mem_inst_t> &,
hls::stream<mem_inst_t> &,
hls::stream<data_full_t> &,
hls::stream<data_full_t> &,
hls::stream<data_full_t> &,
hls::stream<data_full_t> &
);
```
### Problem3 - Data Dependency

* This is one of the most common issues reported during the Vivado HLS dataflow checking stage.
* It indicates that a global or static array is being read and/or written by multiple dataflow tasks simultaneously, which violates the independence required by the dataflow execution model. As a result, HLS refuses to proceed with synthesis.
```cpp
// ---------- single memory bank ----------
// File-scope array: every caller of write_mem/read_mem in this file accesses
// this one array.
static DataUnit mem_array[MEM_SLICE_WORDS];
// ---------- memory access ----------
// Stores dw at word index (byte_addr / WORD_BYTES) modulo MEM_SLICE_WORDS.
void write_mem(ap_uint<32> byte_addr, const DataUnit &dw) {
#pragma HLS INLINE
unsigned int idx = (unsigned int)((byte_addr / WORD_BYTES) % MEM_SLICE_WORDS);
mem_array[idx] = dw;
}
// Loads the word at index (byte_addr / WORD_BYTES) modulo MEM_SLICE_WORDS.
DataUnit read_mem(ap_uint<32> byte_addr) {
#pragma HLS INLINE
unsigned int idx = (unsigned int)((byte_addr / WORD_BYTES) % MEM_SLICE_WORDS);
return mem_array[idx];
}
```
* Original Design
### Solution3 - Don't share memory
```cpp
// Memory bank plus its access helpers, written for local scope
// (presumably inside the templated mem_slice body — confirm placement).
static DataUnit mem_array[MEM_SLICE_WORDS];
#pragma HLS BIND_STORAGE variable=mem_array type=ram_t2p impl=bram
// NOTE(review): the two pragmas below tell the tool to assume there is no
// inter-/intra-iteration dependence on mem_array. Confirm that a read never
// needs the value of a still-in-flight write to the same address.
#pragma HLS DEPENDENCE variable=mem_array inter false
#pragma HLS DEPENDENCE variable=mem_array intra false
// ============================================================
// Inline memory access helpers (local scope)
// ============================================================
// Stores dw at word index (byte_addr / WORD_BYTES) modulo MEM_SLICE_WORDS.
auto write_mem = [&](ap_uint<32> byte_addr, const DataUnit &dw) {
#pragma HLS INLINE
unsigned int idx = (unsigned int)((byte_addr / WORD_BYTES) % MEM_SLICE_WORDS);
mem_array[idx] = dw;
};
// Loads the word at index (byte_addr / WORD_BYTES) modulo MEM_SLICE_WORDS.
auto read_mem = [&](ap_uint<32> byte_addr) -> DataUnit {
#pragma HLS INLINE
unsigned int idx = (unsigned int)((byte_addr / WORD_BYTES) % MEM_SLICE_WORDS);
return mem_array[idx];
};
```
### Problem4 - Deadlock
```cpp
RTL Simulation : 0 / 30 [142857142.86%] @ "100000135000"
```
* 0 / 30: Out of 30 total transactions, none have been completed yet.
* 142,857,142.86%: Indicates the progress of a single transaction (calculated as measured latency / estimated latency × 100%).
→ Since it has exceeded 100%, it means the latency estimation is incorrect.
* @ "100000135000": Represents the simulation time (in picoseconds or nanoseconds).
:::warning
I found the deadlock root cause: the design reads `east_in`, but the HLS TB provides zero tokens on `east_in` for all transactions, so dataflow stalls immediately. This matches the cosim warning about ap_ctrl_none and non-blocking FIFO and the progress stuck at 0/30.
:::
#1
```cpp
if (inst_in.empty()) return;
mem_inst_t inst = inst_in.read();
inst_out.write(inst);
```
* This part is one of the key problems.
→ Because this return prevents the slice from executing any west/east data forwarding.
* If the previous slice is waiting for you to read from `west_in`, and the next slice is waiting for you to write to `west_out`, this dataflow chain will become permanently stalled.
#2
```cpp
#pragma HLS DATAFLOW
static hls::stream<data_full_t> w_mid[3];
static hls::stream<data_full_t> e_mid[3];
```
You used static streams as the slice chain, but when there is no active consumer, it can lead to the following:
* If any slice does not forward its data every clock cycle,
the downstream slice will be blocked;
* As a result, the dataflow model will be detected by HLS as a potential deadlock.
Solution:
Ensure that each slice continues to forward data even when there is no memory operation — that is, always pass data through from west_in → east_out and east_in → west_out.
#3
```cpp
for (int j = 0; j < 30; j++){
for (int i = 0; i < 4; i++) {
inst_in[i].write(make_inst(0, NOP, 0, 0, 0));
}
}
// in mem_bundle_test.cpp
```
* You wrote 120 instructions in total, but each mem_slice() can only read one instruction per clock cycle.
* If the default inst_in FIFO depth is small (typically around 16 entries), it will quickly become full, causing a write-blocking stall.
→ During co-simulation, the HLS host side will then be unable to write to the AXIS channel, and the simulation will freeze while waiting for available FIFO space.
### Solution4 - add input into the fifo
Cause: `east_in` had zero depth; modules like mem_slice_1_U0 require `e_mid_empty_n/east_in_empty_n` to read, creating a deadlock.
Fix: Provide at least one token on east_in for the first transaction. Updated stream size and data files.
#1 & #2
```cpp
if (inst_in.empty()) {
// Even when there is no new instruction, keep the data streams flowing (pass-through):
// forward one pending beat per side, west_in -> west_out and east_in -> east_out.
if (!west_in.empty()) west_out.write(west_in.read());
if (!east_in.empty()) east_out.write(east_in.read());
return;
}
```
#3
```cpp
#pragma HLS STREAM variable=inst_in depth=64
#pragma HLS STREAM variable=inst_out depth=64
```
### Problem5
#1
```cpp
// mid data streams chain the slices (pass-through semantics)
static hls::stream<data_full_t> e_mid[6];
#pragma HLS STREAM variable=e_mid depth=8
// instantiate four independent slices, chaining the data streams
mem_slice<0>(inst_in[0], inst_out[0], west_in, west_out, e_mid[0], e_mid[1]);
mem_slice<1>(inst_in[1], inst_out[1], e_mid[1], e_mid[0], e_mid[2], e_mid[3]);
mem_slice<2>(inst_in[2], inst_out[2], e_mid[3], e_mid[2], e_mid[4], e_mid[5]);
mem_slice<3>(inst_in[3], inst_out[3], e_mid[5], e_mid[4], east_in, east_out);
}
```
* In your original version, e_mid[0] and e_mid[1] are used simultaneously as both input and output by slice0 and slice1.
→ For HLS, this creates a zero-latency loop, where both modules try to read and write to the same stream within the same clock cycle.
* In the C simulation, this looks fine because execution is sequential — the simulator processes one operation after another.
* However, in RTL cosimulation, the streams are implemented as AXIS FIFOs with handshake signals (ready / valid).
* If both ends attempt to read and write in the same cycle, their handshakes conflict: each waits for the other to be ready.
#2
```cpp
#include "mem_bundle.h"
// bundle: inst per-slice, data chained (slice0 -> slice1 -> slice2 -> slice3)
extern "C" void mem_bundle(
hls::stream<mem_inst_t> inst_in[4],
hls::stream<mem_inst_t> inst_out[4],
hls::stream<data_full_t> &west_in,
hls::stream<data_full_t> &west_out,
hls::stream<data_full_t> &east_in,
hls::stream<data_full_t> &east_out
) {
// mem_bundle.cpp
#pragma HLS DATAFLOW
#pragma HLS INTERFACE ap_ctrl_none port=return
```
* When the top-level module (`mem_bundle()`) uses `ap_ctrl_none`, Vivado HLS treats it as a **free-running hardware block** — it starts automatically and never finishes, so `ap_done` is never asserted. While this works fine for FPGA synthesis, it causes problems in C/RTL co-simulation, because the auto-generated testbench **expects a sequence of reset → start DUT → wait for ap_done → compare results**. Since ap_done never occurs, the simulator waits forever, appearing to hang even though the design itself isn’t deadlocked.
* Switching to `ap_ctrl_hs` inserts a small handshake FSM (IDLE → RUN → DONE → IDLE), giving Vivado clear start/done conditions. The co-simulation driver automatically asserts ap_start=1, waits until Vivado raises ap_done=1, and then terminates successfully. The functional behavior of your design—its dataflow and mem_slice modules—remains exactly the same; the only difference is that Vivado now knows when to stop the simulation.
* If you want both proper co-simulation termination and continuous dataflow operation, use `ap_ctrl_chain` instead. It behaves like `ap_ctrl_none` (always active, dataflow-friendly) but also provides start/done signals, allowing clean integration and automatic co-sim completion without hanging.
#3

* There is a 2-cycle gap between the first input and the second input. It may be caused by `#pragma HLS INTERFACE ap_ctrl_chain port=return`.
### Solution5
#1
```cpp
// mem_bundle.cpp
#pragma HLS DATAFLOW
#pragma HLS INTERFACE ap_ctrl_none port=return
// Two separate one-directional link arrays, so that no stream is both read
// and written by the same pair of slices.
static hls::stream<data_full_t> east_link[3]; // k.east_out -> (k+1).west_in
static hls::stream<data_full_t> west_link[3]; // (k+1).west_out -> k.east_in
#pragma HLS STREAM variable=east_link depth=8
#pragma HLS STREAM variable=west_link depth=8
// slice0 (leftmost)
mem_slice<0>(inst_in[0], inst_out[0],
/*west_in*/ west_in,
/*west_out*/ west_out, // connected directly to the outside
/*east_in*/ west_link[0], // westbound link coming back from slice1
/*east_out*/ east_link[0]); // eastbound, feeds slice1's west_in
// slice1
mem_slice<1>(inst_in[1], inst_out[1],
/*west_in*/ east_link[0], // arriving eastbound from slice0
/*west_out*/ west_link[0], // westbound, back to slice0
/*east_in*/ west_link[1], // westbound, coming back from slice2
/*east_out*/ east_link[1]); // eastbound, feeds slice2
// slice2
mem_slice<2>(inst_in[2], inst_out[2],
/*west_in*/ east_link[1],
/*west_out*/ west_link[1],
/*east_in*/ west_link[2],
/*east_out*/ east_link[2]);
// slice3 (rightmost)
mem_slice<3>(inst_in[3], inst_out[3],
/*west_in*/ east_link[2],
/*west_out*/ west_link[2], // back to slice2 (wire it separately if it should go outside)
/*east_in*/ east_in, // connected directly to the outside
/*east_out*/ east_out); // connected directly to the outside
```
#2
```cpp
// mem_bundle.cpp
#pragma HLS DATAFLOW
#pragma HLS INTERFACE ap_ctrl_chain port=return
//#pragma HLS INTERFACE ap_ctrl_hs port=return
//#pragma HLS INTERFACE ap_ctrl_none port=return
```
#3
## Newest Version

<details>
<summary>decouple version mem_bundle.cpp</summary>
```cpp
```
</details>