---
# System prepended metadata

title: Groq module - MEM & HLS 2

---

# Groq module - MEM & HLS 2
![image](https://hackmd.io/_uploads/HJxwEyRpel.png)

## Pipeline stalled
<details>
<summary>example</summary>

```cpp
extern "C" {
/**
 * Single memory slice (baseline version used to illustrate the stall).
 *
 * Each invocation consumes at most one instruction from inst_in, forwards it
 * unchanged on inst_out, and then either idles (dskew countdown) or performs
 * one WRITE (stream slice -> memory) or one READ (memory -> stream slice).
 *
 * @param inst_in   incoming instruction stream
 * @param inst_out  every consumed instruction is forwarded here
 * @param west_in   data source for WRITE when side == 0
 * @param west_out  data sink for READ when side == 0
 * @param east_in   data source for WRITE when side == 1
 * @param east_out  data sink for READ when side == 1
 */
void mem_slice(
    hls::stream<mem_inst_t> &inst_in,
    hls::stream<mem_inst_t> &inst_out,
    hls::stream<data_full_t> &west_in,
    hls::stream<data_full_t> &west_out,
    hls::stream<data_full_t> &east_in,
    hls::stream<data_full_t> &east_out
) {
#pragma HLS INTERFACE axis port=inst_in
#pragma HLS INTERFACE axis port=inst_out
#pragma HLS INTERFACE axis port=west_in
#pragma HLS INTERFACE axis port=west_out
#pragma HLS INTERFACE axis port=east_in
#pragma HLS INTERFACE axis port=east_out
#pragma HLS INTERFACE ap_ctrl_none port=return
#pragma HLS PIPELINE II=1
#pragma HLS LATENCY min=1 max=6

    // Nothing to do in this invocation when no instruction is waiting.
    if (inst_in.empty()) return;

    mem_inst_t inst = inst_in.read();
    inst_out.write(inst);  // forward

    // Decode the instruction fields.
    // dskew was used below without ever being decoded; extract it here.
    ap_uint<MEM_DSKEW_BITS> dskew = get_dskew(inst);
    ap_uint<MEM_OPCODE_BITS> op = get_opcode(inst);
    ap_uint<MEM_ADDR_BITS> addr13 = get_addr(inst);
    ap_uint<MEM_SIDE_BITS> side = get_side(inst);
    ap_uint<MEM_SRC_DST_BITS> srcdst = get_srcdst(inst);
    ap_uint<MEM_RESERVED_BITS> resv = get_reserved(inst);
    ap_uint<MEM_ICU_BITS> icu = get_icu(inst);
    (void)resv; // unused field; the cast only silences the warning
    (void)icu;

    // The instruction carries a word address; shift by 4 to get a byte address.
    ap_uint<32> byte_addr = ((ap_uint<32>)addr13) << 4;

    // Countdown state persists across invocations.
    static ap_uint<4> skew_cnt = 0;
    static bool skew_active = false;
#pragma HLS RESET variable=skew_cnt
#pragma HLS RESET variable=skew_active

    // --------------------------
    // dskew handling (idle-cycle control)
    // --------------------------
    // NOTE(review): `inst` has already been read and forwarded at this point.
    // Every early return below therefore discards it without executing it;
    // nothing stores it for a later invocation.
    if (!skew_active && dskew > 0) {
        skew_cnt = dskew;
        skew_active = true;
        return; // pause for this cycle first
    }

    if (skew_active) {
        if (skew_cnt > 0) {
            skew_cnt--;
            return; // keep idling until the countdown finishes
        } else {
            skew_active = false; // countdown finished -> fall through and execute
        }
    }


    bool west_valid = false, east_valid = false;
    data_full_t west_data = 0, east_data = 0;

    // Handle WRITE: take one word from the selected side and store one slice.
    // If the selected input stream is empty the write is silently skipped.
    if (op == MEM_OP_WRITE) {
        if (side == 0 && !west_in.empty()) {
            data_full_t din = west_in.read();
            write_mem(byte_addr, get_stream_slice(din, srcdst));
        } else if (side == 1 && !east_in.empty()) {
            data_full_t din = east_in.read();
            write_mem(byte_addr, get_stream_slice(din, srcdst));
        }
    }

    // Handle READ: place the memory word into slice `srcdst` of an otherwise
    // all-zero output word on the selected side.
    if (op == MEM_OP_READ) {
        DataUnit dout = read_mem(byte_addr);
        if (side == 0) {
            set_stream_slice(west_data, srcdst, dout);
            west_valid = true;
        } else {
            set_stream_slice(east_data, srcdst, dout);
            east_valid = true;
        }
    }

    // Emit results.
    if (west_valid) west_out.write(west_data);
    if (east_valid) east_out.write(east_data);
}
} // extern "C"

```
    
</details>

In the current design of mem_slice.cpp:

* The dskew logic is handled inside the main pipeline.
* Once the module enters the idle (return) phase, the top-level function stops progressing.
* Because `#pragma HLS PIPELINE II=1` enforces a single control flow, the inst_in AXI stream interface will also experience back-pressure (TREADY = 0).

In other words:
* Idle (dskew active) → entire pipeline stalls → no new instruction is accepted.

### Solution - decouple inst fetch from engine
<details>
<summary> decouple version mem_slice.cpp</summary>    

```cpp
#include "mem_slice.h"

// ---------- single memory bank ----------
static DataUnit mem_array[MEM_SLICE_WORDS];

// ---------- memory access ----------
// Store one word. The byte address is turned into a word index
// (divide by WORD_BYTES) that wraps around at MEM_SLICE_WORDS.
void write_mem(ap_uint<32> byte_addr, const DataUnit &dw) {
#pragma HLS INLINE
    const unsigned int word_slot =
        (unsigned int)((byte_addr / WORD_BYTES) % MEM_SLICE_WORDS);
    mem_array[word_slot] = dw;
}

// Load one word, using the same byte-address -> wrapped word-index mapping
// as write_mem.
DataUnit read_mem(ap_uint<32> byte_addr) {
#pragma HLS INLINE
    const unsigned int word_slot =
        (unsigned int)((byte_addr / WORD_BYTES) % MEM_SLICE_WORDS);
    return mem_array[word_slot];
}

// ---------- slice helper ----------
// Return lane `idx` of the wide data word, i.e. bits
// [(idx+1)*STREAM_WIDTH-1 : idx*STREAM_WIDTH].
// Not declared `inline`: mem_slice.h declares this function without `inline`
// and the testbench calls it from another translation unit, so it needs an
// ordinary external definition. HLS inlining is still requested by the pragma.
DataUnit get_stream_slice(const data_full_t &full, unsigned idx) {
#pragma HLS INLINE
    return full.range((idx+1)*STREAM_WIDTH-1, idx*STREAM_WIDTH);
}
// Overwrite lane `idx` of the wide data word with `val`; other lanes are
// left untouched.
// Not declared `inline`: mem_slice.h declares this function without `inline`
// and the testbench calls it from another translation unit, so it needs an
// ordinary external definition. HLS inlining is still requested by the pragma.
void set_stream_slice(data_full_t &full, unsigned idx, const DataUnit &val) {
#pragma HLS INLINE
    full.range((idx+1)*STREAM_WIDTH-1, idx*STREAM_WIDTH) = val;
}

// ---------- top-level DUT ----------
extern "C" {
// Memory slice with an instruction FIFO placed between fetch and execute.
// Per invocation: move at most one instruction from inst_in into inst_fifo,
// pop at most one instruction from inst_fifo, forward it on inst_out, then
// either idle (dskew countdown) or perform one WRITE / READ.
//   west_in / east_in   : data source for WRITE (side 0 / side 1)
//   west_out / east_out : data sink for READ   (side 0 / side 1)
void mem_slice(
    hls::stream<mem_inst_t> &inst_in,
    hls::stream<mem_inst_t> &inst_out,
    hls::stream<data_full_t> &west_in,
    hls::stream<data_full_t> &west_out,
    hls::stream<data_full_t> &east_in,
    hls::stream<data_full_t> &east_out
) {
#pragma HLS INTERFACE axis port=inst_in
#pragma HLS INTERFACE axis port=inst_out
#pragma HLS INTERFACE axis port=west_in
#pragma HLS INTERFACE axis port=west_out
#pragma HLS INTERFACE axis port=east_in
#pragma HLS INTERFACE axis port=east_out
#pragma HLS INTERFACE ap_ctrl_none port=return
#pragma HLS PIPELINE II=1
#pragma HLS LATENCY min=1 max=6
    // One-level instruction FIFO between the fetch and execute stages.
    // NOTE(review): inst_fifo is a non-static local, and the execute stage
    // below pops it in the same invocation whenever it is non-empty, so it
    // never holds more than one instruction. Confirm this gives the intended
    // decoupling.
    hls::stream<mem_inst_t> inst_fifo;
#pragma HLS STREAM variable=inst_fifo depth=8

    // Fetch stage
    if (!inst_in.empty() && !inst_fifo.full())
        inst_fifo.write(inst_in.read());

    // Execute stage
    if (inst_fifo.empty()) return;
    mem_inst_t inst = inst_fifo.read();

    inst_out.write(inst);  // forward

    // Decode the instruction fields.
    ap_uint<MEM_DSKEW_BITS> dskew = get_dskew(inst);
    ap_uint<MEM_OPCODE_BITS> op = get_opcode(inst);
    ap_uint<MEM_ADDR_BITS> addr13 = get_addr(inst);
    ap_uint<MEM_SIDE_BITS> side = get_side(inst);
    ap_uint<MEM_SRC_DST_BITS> srcdst = get_srcdst(inst);
    ap_uint<MEM_RESERVED_BITS> resv = get_reserved(inst);
    ap_uint<MEM_ICU_BITS> icu = get_icu(inst);
    (void)resv; // unused field; the cast only silences the warning
    (void)icu;

    // The instruction carries a word address; shift by 4 to get a byte address.
    ap_uint<32> byte_addr = ((ap_uint<32>)addr13) << 4;

    // Countdown state persists across invocations.
    static ap_uint<4> skew_cnt = 0;
    static bool skew_active = false;
#pragma HLS RESET variable=skew_cnt
#pragma HLS RESET variable=skew_active

    // --------------------------
    // dskew handling (idle-cycle control)
    // --------------------------
    // NOTE(review): `inst` has already been popped and forwarded. Each early
    // return below discards it without executing it, and nothing stores it
    // for a later invocation.
    if (!skew_active && dskew > 0) {
        skew_cnt = dskew;
        skew_active = true;
        return; // pause for this cycle first
    }

    if (skew_active) {
        if (skew_cnt > 0) {
            skew_cnt--;
            return; // keep idling until the countdown finishes
        } else {
            skew_active = false; // countdown finished -> fall through and execute
        }
    }


    bool west_valid = false, east_valid = false;
    data_full_t west_data = 0, east_data = 0;

    // Handle WRITE: take one word from the selected side and store one slice.
    // If the selected input stream is empty the write is silently skipped.
    if (op == MEM_OP_WRITE) {
        if (side == 0 && !west_in.empty()) {
            data_full_t din = west_in.read();
            write_mem(byte_addr, get_stream_slice(din, srcdst));
        } else if (side == 1 && !east_in.empty()) {
            data_full_t din = east_in.read();
            write_mem(byte_addr, get_stream_slice(din, srcdst));
        }
    }

    // Handle READ: place the memory word into slice `srcdst` of an otherwise
    // all-zero output word on the selected side.
    if (op == MEM_OP_READ) {
        DataUnit dout = read_mem(byte_addr);
        if (side == 0) {
            set_stream_slice(west_data, srcdst, dout);
            west_valid = true;
        } else {
            set_stream_slice(east_data, srcdst, dout);
            east_valid = true;
        }
    }

    // Emit results.
    if (west_valid) west_out.write(west_data);
    if (east_valid) east_out.write(east_data);
}
} // extern "C"

```

</details>


<details>     
<summary>decouple version mem_slice.h</summary>

```cpp
#ifndef MEM_SLICE_H
#define MEM_SLICE_H

#define AP_INT_MAX_W 4096
#include <ap_int.h>
#include <hls_stream.h>
#include <stdint.h>

// ---------------------------
// bit layout (LSB..MSB):
// 0:3    dskew (4)
// 4:5    opcode (2)
// 6:18   addr (13)
// 19     w_or_e (1)
// 20:24  stream_src_dst (5)
// 25:29  reserved (5)
// 30:31  ICU (2)
// ---------------------------
// Field widths (in bits) of the 32-bit instruction word; they sum to 32.
#define MEM_DSKEW_BITS      4
#define MEM_OPCODE_BITS     2
#define MEM_ADDR_BITS      13
#define MEM_SIDE_BITS       1
#define MEM_SRC_DST_BITS    5
#define MEM_RESERVED_BITS   5
#define MEM_ICU_BITS        2

// Memory / stream geometry.
// WORD_BYTES      : bytes per memory word (byte address / WORD_BYTES = word index)
// STREAM_WIDTH    : bit width of one DataUnit
// NUM_OF_STREAMS  : number of DataUnit-wide lanes packed into one data_full_t
// MEM_SLICE_WORDS : number of words in one slice's memory array
#define WORD_BYTES 16
#define STREAM_WIDTH 128
#define NUM_OF_STREAMS 32
#define MEM_SLICE_BYTES (16 * 4096)
#define MEM_SLICE_WORDS (MEM_SLICE_BYTES / WORD_BYTES)

// ---------- types ----------
// One instruction: the raw 32-bit word, decoded through the get_* helpers.
typedef struct {
    ap_uint<32> raw;
} mem_inst_t;

// One stream lane / one memory word (STREAM_WIDTH bits).
typedef ap_uint<STREAM_WIDTH> DataUnit;
// All NUM_OF_STREAMS lanes packed side by side into a single wide word.
typedef ap_uint<STREAM_WIDTH * NUM_OF_STREAMS> data_full_t;

// Values of the 2-bit opcode field. Value 2 is not assigned here.
typedef enum mem_opcode {
    MEM_OP_READ  = 0,
    MEM_OP_WRITE = 1,
    NOP          = 3
} mem_opcode_t;

// ---------- decode helpers ----------
// Each helper extracts one field of the 32-bit instruction word.

// dskew: bits [3:0]
inline ap_uint<MEM_DSKEW_BITS> get_dskew(const mem_inst_t &i) {
    return i.raw.range(3,0);
}

// opcode: bits [5:4]
inline ap_uint<MEM_OPCODE_BITS> get_opcode(const mem_inst_t &i) {
    return i.raw.range(5,4);
}

// addr: bits [18:6]
inline ap_uint<MEM_ADDR_BITS> get_addr(const mem_inst_t &i) {
    return i.raw.range(18,6);
}

// side (w_or_e): bit 19
inline ap_uint<MEM_SIDE_BITS> get_side(const mem_inst_t &i) {
    return i.raw.bit(19);
}

// stream_src_dst: bits [24:20]
inline ap_uint<MEM_SRC_DST_BITS> get_srcdst(const mem_inst_t &i) {
    return i.raw.range(24,20);
}

// reserved: bits [29:25]
inline ap_uint<MEM_RESERVED_BITS> get_reserved(const mem_inst_t &i) {
    return i.raw.range(29,25);
}

// ICU: bits [31:30]
inline ap_uint<MEM_ICU_BITS> get_icu(const mem_inst_t &i) {
    return i.raw.range(31,30);
}

// ---------- memory access / lane helper declarations ----------
// write_mem / read_mem take a byte address; get/set_stream_slice select lane
// `idx` of a data_full_t.
void write_mem(ap_uint<32> byte_addr, const DataUnit &dw);
DataUnit read_mem(ap_uint<32> byte_addr);
DataUnit get_stream_slice(const data_full_t &full, unsigned idx);
void set_stream_slice(data_full_t &full, unsigned idx, const DataUnit &val);

// ---------- top-level function ----------
// One memory slice: instruction stream in/out plus west/east data streams.
extern "C" void mem_slice(
    hls::stream<mem_inst_t> &inst_in,
    hls::stream<mem_inst_t> &inst_out,
    hls::stream<data_full_t> &west_in,
    hls::stream<data_full_t> &west_out,
    hls::stream<data_full_t> &east_in,
    hls::stream<data_full_t> &east_out
);

#endif // MEM_SLICE_H

```
    
</details>

<details>     
<summary>decouple version mem_slice_test.cpp</summary>

```cpp
#include "mem_slice.h"
#include <iostream>
#include <iomanip>
using namespace std;

// ================================================================
// Helper: instruction builder
// ================================================================
// Pack the given fields into a 32-bit instruction word.
// The reserved [29:25] and ICU [31:30] fields are left at zero.
mem_inst_t make_inst(
    ap_uint<MEM_DSKEW_BITS> dskew,
    ap_uint<MEM_OPCODE_BITS> opcode,
    ap_uint<MEM_ADDR_BITS> addr,
    ap_uint<MEM_SIDE_BITS> side,
    ap_uint<MEM_SRC_DST_BITS> srcdst)
{
    mem_inst_t packed;
    packed.raw = 0;

    // The fields occupy disjoint bit ranges, so the fill order is irrelevant.
    packed.raw.range(24,20) = srcdst;
    packed.raw.bit(19)      = side;
    packed.raw.range(18,6)  = addr;
    packed.raw.range(5,4)   = opcode;
    packed.raw.range(3,0)   = dskew;

    return packed;
}

// ================================================================
// Main testbench
// ================================================================
/**
 * Self-checking C-simulation testbench for mem_slice.
 *
 * Cases 1-4 compare what comes out of the DUT against the value that was
 * written (or against "no output" for NOP) and count every mismatch.
 * Case 5 is a smoke test only.
 *
 * @return 0 when every check passed, 1 otherwise, so that csim/cosim can
 *         report a failure instead of always succeeding.
 */
int main() {
    hls::stream<mem_inst_t> inst_in, inst_out;
    hls::stream<data_full_t> west_in, west_out, east_in, east_out;
    int errors = 0;  // number of failed checks; drives the exit code

    cout << "============================" << endl;
    cout << "   mem_slice 全功能測試開始" << endl;
    cout << "============================" << endl;

    // ================================================================
    // Case 1: Basic WRITE / READ test (west side)
    // ================================================================
    {
        cout << "\n[CASE 1] Basic west WRITE/READ test\n";

        data_full_t input_data = 0;
        DataUnit dword = 0xAABBCCDDEEFF1122;
        set_stream_slice(input_data, 0, dword);

        mem_inst_t inst_w = make_inst(0, MEM_OP_WRITE, 0x001, 0, 0);
        mem_inst_t inst_r = make_inst(0, MEM_OP_READ , 0x001, 0, 0);

        west_in.write(input_data);
        inst_in.write(inst_w);
        mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out);

        inst_in.write(inst_r);
        mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out);

        if (!west_out.empty()) {
            data_full_t outdata = west_out.read();
            DataUnit dout = get_stream_slice(outdata, 0);
            cout << "  READ result = " << hex << dout << endl;
            if (dout != dword) {
                cout << "  ❌ Mismatch, expected " << hex << dword << endl;
                errors++;
            }
        } else {
            cout << "  ❌ No output on west_out!" << endl;
            errors++;
        }
        while (!inst_out.empty()) inst_out.read();
    }

    // ================================================================
    // Case 2: East side test
    // ================================================================
    {
        cout << "\n[CASE 2] East side WRITE/READ test\n";
        data_full_t eastdata = 0;
        DataUnit dword = 0x123456789ABCDEF0;
        set_stream_slice(eastdata, 2, dword);

        mem_inst_t inst_w = make_inst(0, MEM_OP_WRITE, 0x002, 1, 2);
        mem_inst_t inst_r = make_inst(0, MEM_OP_READ , 0x002, 1, 2);

        east_in.write(eastdata);
        inst_in.write(inst_w);
        mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out);

        inst_in.write(inst_r);
        mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out);

        if (!east_out.empty()) {
            data_full_t outdata = east_out.read();
            DataUnit dout = get_stream_slice(outdata, 2);
            cout << "  READ result = " << hex << dout << endl;
            if (dout != dword) {
                cout << "  ❌ Mismatch, expected " << hex << dword << endl;
                errors++;
            }
        } else {
            cout << "  ❌ No output on east_out!" << endl;
            errors++;
        }
        while (!inst_out.empty()) inst_out.read();
    }

    // ================================================================
    // Case 3: dskew delay test (write before read)
    // ================================================================
    {
        cout << "\n[CASE 3] dskew delay test\n";

        // Step 1: write a known value to addr 0x003.
        data_full_t input_data = 0;
        DataUnit write_word = 0xDEADBEEFCAFEBABE;
        set_stream_slice(input_data, 0, write_word);

        mem_inst_t inst_w = make_inst(0, MEM_OP_WRITE, 0x003, 0, 0);
        west_in.write(input_data);
        inst_in.write(inst_w);
        mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out);
        cout << "  Wrote 0x" << hex << write_word << " to addr 0x003\n";

        // Step 2: issue READ with dskew = 3. The same instruction is offered
        // on every cycle; only the value is checked, not the exact cycle.
        mem_inst_t inst_delay = make_inst(3, MEM_OP_READ, 0x003, 0, 0);
        int reads_seen = 0;

        for (int cycle = 0; cycle < 6; cycle++) {
            inst_in.write(inst_delay);
            mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out);
            cout << "  Cycle " << dec << cycle << " processed" << endl;

            if (!west_out.empty()) {
                data_full_t outdata = west_out.read();
                DataUnit dout = get_stream_slice(outdata, 0);
                cout << "  ✅ READ result = " << hex << dout
                     << " (at cycle " << dec << cycle << ")\n";
                reads_seen++;
                if (dout != write_word) {
                    cout << "  ❌ Mismatch, expected " << hex << write_word << endl;
                    errors++;
                }
            }
        }
        if (reads_seen == 0) {
            cout << "  ❌ No READ result within 6 cycles!" << endl;
            errors++;
        }
        cout << "  ✅ Expected: READ occurs ~3 cycles after issue.\n";
        while (!inst_out.empty()) inst_out.read();
    }

    // ================================================================
    // Case 4: NOP test
    // ================================================================
    {
        cout << "\n[CASE 4] NOP instruction test\n";
        mem_inst_t inst_nop = make_inst(0, NOP, 0x000, 0, 0);
        inst_in.write(inst_nop);
        mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out);
        if (west_out.empty() && east_out.empty()) {
            cout << "  ✅ No output as expected.\n";
        } else {
            cout << "  ❌ Unexpected output detected!\n";
            errors++;
        }
        while (!inst_out.empty()) inst_out.read();
    }

    // ================================================================
    // Case 5: Continuous pipeline test
    // ================================================================
    {
        // NOTE(review): smoke test only, no results are compared. mem_slice
        // returns early while its skew countdown is running, so some of the
        // instructions queued here may be consumed without being executed.
        // Confirm the intended behaviour before adding value checks.
        cout << "\n[CASE 5] Continuous instruction stream test\n";
        for (int i = 0; i < 4; i++) {
            mem_inst_t inst_w = make_inst(i % 2, MEM_OP_WRITE, 0x010 + i, i & 1, i);
            mem_inst_t inst_r = make_inst(0, MEM_OP_READ , 0x010 + i, i & 1, i);
            data_full_t ddata = 0;
            DataUnit word = (0xABCD0000 | i);
            set_stream_slice(ddata, i, word);
            if (i & 1) east_in.write(ddata);
            else west_in.write(ddata);
            inst_in.write(inst_w);
            inst_in.write(inst_r);
        }
        for (int t = 0; t < 10; t++) {
            mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out);
        }
        cout << "  ✅ Continuous streaming executed.\n";
        // Drain every stream so nothing is left over when the streams go out of scope.
        while (!inst_out.empty()) inst_out.read();
        while (!west_out.empty()) west_out.read();
        while (!east_out.empty()) east_out.read();
        while (!west_in.empty()) west_in.read();
        while (!east_in.empty()) east_in.read();
    }

    cout << "\n============================" << endl;
    if (errors == 0) {
        cout << "   ✅ All test cases done." << endl;
    } else {
        cout << "   ❌ " << dec << errors << " check(s) failed." << endl;
    }
    cout << "============================" << endl;
    return errors == 0 ? 0 : 1;
}

```
    
</details>


## Integrate into Bundle
    
<details>     
<summary>decouple version mem_bundle.cpp</summary>

```cpp
#include "mem_bundle.h"

// bundle: inst per-slice, data chained (slice0 -> slice1 -> slice2 -> slice3)
// Each slice gets its own instruction stream pair (inst_in[k] / inst_out[k]).
// The west channel is chained west_in -> w_mid[0] -> w_mid[1] -> w_mid[2] -> west_out,
// and the east channel likewise through e_mid[0..2].
extern "C" void mem_bundle(
    hls::stream<mem_inst_t> inst_in[4],
    hls::stream<mem_inst_t> inst_out[4],
    hls::stream<data_full_t> &west_in,
    hls::stream<data_full_t> &west_out,
    hls::stream<data_full_t> &east_in,
    hls::stream<data_full_t> &east_out
) {
#pragma HLS DATAFLOW
#pragma HLS INTERFACE ap_ctrl_none port=return

    // mid data streams chain the slices (pass-through semantics)
    static hls::stream<data_full_t> w_mid[3];
    static hls::stream<data_full_t> e_mid[3];
#pragma HLS STREAM variable=w_mid depth=8
#pragma HLS STREAM variable=e_mid depth=8

    // instantiate four independent slices, chaining the data streams
    // NOTE(review): all four calls go to the same mem_slice function, so in
    // C++ terms they share that function's static variables and any
    // file-scope state it uses. Confirm that is acceptable under DATAFLOW.
    mem_slice(inst_in[0], inst_out[0], west_in, w_mid[0], east_in, e_mid[0]);
    mem_slice(inst_in[1], inst_out[1], w_mid[0], w_mid[1], e_mid[0], e_mid[1]);
    mem_slice(inst_in[2], inst_out[2], w_mid[1], w_mid[2], e_mid[1], e_mid[2]);
    mem_slice(inst_in[3], inst_out[3], w_mid[2], west_out,   e_mid[2], east_out);
}

```
    
</details>

### Problem1 - Data dependency
![image](https://hackmd.io/_uploads/BkXNUmKpgl.png)

### Solution1 - template
```cpp
template<int SLICE_ID>
void mem_slice(
    hls::stream<mem_inst_t> &inst_in,
    hls::stream<mem_inst_t> &inst_out,
    hls::stream<data_full_t> &west_in,
    hls::stream<data_full_t> &west_out,
    hls::stream<data_full_t> &east_in,
    hls::stream<data_full_t> &east_out
) {
#pragma HLS PIPELINE II=1
#pragma HLS LATENCY min=1 max=6

    static ap_uint<4> skew_cnt = 0;
    static bool skew_active = false;
    ...
}
```
In HLS, a `template<int ID>` parameter makes the tool generate a separate hardware instance of the same code for each `ID` value; every instance has its own independent pipeline state and its own copies of the function's `static` variables.
* Solves the “static feedback dependence” error in dataflow
* Enables multiple tasks to truly run in parallel

### Problem2 - Undefined template instantiation
```
obj/mem_bundle.o: in function mem_bundle': /home/ubuntu/MEM_hls_Ray/Groq_MEM/Integration1/csim/build/../../../../mem_bundle.cpp:22: 
undefined reference to 
void mem_slice<0>(
    hls::stream<mem_inst_t, 0>&, 
    hls::stream<mem_inst_t, 0>&, 
    hls::stream<ap_uint<4096>, 0>&,
    hls::stream<ap_uint<4096>, 0>&,
    hls::stream<ap_uint<4096>, 0>&,
    hls::stream<ap_uint<4096>, 0>&
)
```
### Solution2 - explicitly instantiate in mem_slice.cpp
```cpp
// Explicitly instantiate the four template specializations (SLICE_ID 0..3).
// This emits mem_slice<0>..mem_slice<3> in this translation unit so that
// other translation units calling them can link against them.
template void mem_slice<0>(
    hls::stream<mem_inst_t> &,
    hls::stream<mem_inst_t> &,
    hls::stream<data_full_t> &,
    hls::stream<data_full_t> &,
    hls::stream<data_full_t> &,
    hls::stream<data_full_t> &
);

template void mem_slice<1>(
    hls::stream<mem_inst_t> &,
    hls::stream<mem_inst_t> &,
    hls::stream<data_full_t> &,
    hls::stream<data_full_t> &,
    hls::stream<data_full_t> &,
    hls::stream<data_full_t> &
);

template void mem_slice<2>(
    hls::stream<mem_inst_t> &,
    hls::stream<mem_inst_t> &,
    hls::stream<data_full_t> &,
    hls::stream<data_full_t> &,
    hls::stream<data_full_t> &,
    hls::stream<data_full_t> &
);

template void mem_slice<3>(
    hls::stream<mem_inst_t> &,
    hls::stream<mem_inst_t> &,
    hls::stream<data_full_t> &,
    hls::stream<data_full_t> &,
    hls::stream<data_full_t> &,
    hls::stream<data_full_t> &
);

```

### Problem3 - Data Dependency
![image](https://hackmd.io/_uploads/BJ5I2wcTlg.png)
* This is one of the most common errors at the Vivado HLS dataflow checking stage.
* It indicates that a global or static array is being read and/or written by multiple dataflow tasks simultaneously, which violates the independence required by the dataflow execution model. As a result, HLS refuses to proceed with synthesis.
```cpp
// ---------- single memory bank ----------
static DataUnit mem_array[MEM_SLICE_WORDS];

// ---------- memory access ----------
// Store one word into the file-scope mem_array. The byte address is turned
// into a word index (divide by WORD_BYTES) that wraps at MEM_SLICE_WORDS.
void write_mem(ap_uint<32> byte_addr, const DataUnit &dw) {
#pragma HLS INLINE
    unsigned int idx = (unsigned int)((byte_addr / WORD_BYTES) % MEM_SLICE_WORDS);
    mem_array[idx] = dw;
}

// Load one word from the file-scope mem_array, using the same
// byte-address -> wrapped word-index mapping as write_mem.
DataUnit read_mem(ap_uint<32> byte_addr) {
#pragma HLS INLINE
    unsigned int idx = (unsigned int)((byte_addr / WORD_BYTES) % MEM_SLICE_WORDS);
    return mem_array[idx];
}

```
* Original Design
### Solution3 - Don't share memory
```cpp
    // Function-local memory array, declared inside the function body instead
    // of at file scope.
    // NOTE(review): the two DEPENDENCE pragmas tell the tool to assume there
    // are no inter- or intra-iteration dependences on mem_array. A WRITE
    // followed by a READ of the same address is a real dependence; confirm
    // the schedule still returns the freshly written data.
    static DataUnit mem_array[MEM_SLICE_WORDS];
#pragma HLS BIND_STORAGE variable=mem_array type=ram_t2p impl=bram
#pragma HLS DEPENDENCE variable=mem_array inter false
#pragma HLS DEPENDENCE variable=mem_array intra false

    // ============================================================
    // Inline memory access helpers (local scope)
    // ============================================================
    // Both lambdas capture mem_array by reference and map a byte address to
    // a word index that wraps at MEM_SLICE_WORDS.
    auto write_mem = [&](ap_uint<32> byte_addr, const DataUnit &dw) {
#pragma HLS INLINE
        unsigned int idx = (unsigned int)((byte_addr / WORD_BYTES) % MEM_SLICE_WORDS);
        mem_array[idx] = dw;
    };

    auto read_mem = [&](ap_uint<32> byte_addr) -> DataUnit {
#pragma HLS INLINE
        unsigned int idx = (unsigned int)((byte_addr / WORD_BYTES) % MEM_SLICE_WORDS);
        return mem_array[idx];
    };
```

### Problem4 - Deadlock
```cpp
RTL Simulation : 0 / 30 [142857142.86%] @ "100000135000"
```
* 0 / 30: Out of 30 total transactions, none have been completed yet.
* 142,857,142.86%: Indicates the progress of a single transaction (calculated as measured latency / estimated latency × 100%).
→ Since it has exceeded 100%, it means the latency estimation is incorrect.
* @ "100000135000": Represents the simulation time (in picoseconds or nanoseconds).
:::warning
I found the deadlock root cause: the design reads `east_in`, but the HLS TB provides zero tokens on `east_in` for all transactions, so dataflow stalls immediately. This matches the cosim warning about ap_ctrl_none and non-blocking FIFO and the progress stuck at 0/30.
:::
#1
```cpp
if (inst_in.empty()) return;
mem_inst_t inst = inst_in.read();
inst_out.write(inst);
```
* This part is one of the key problems.
→ Because this return prevents the slice from executing any west/east data forwarding.
* If the previous slice is waiting for you to read from `west_in`, and the next slice is waiting for you to write to `west_out`, this dataflow chain will become permanently stalled.

#2

```cpp
#pragma HLS DATAFLOW
static hls::stream<data_full_t> w_mid[3];
static hls::stream<data_full_t> e_mid[3];
```

You used static streams as the slice chain, but when there is no active consumer, it can lead to the following:

* If any slice does not forward its data every clock cycle,
the downstream slice will be blocked;

* As a result, the dataflow model will be detected by HLS as a potential deadlock.

Solution:
Ensure that each slice continues to forward data even when there is no memory operation — that is, always pass data through from west_in → east_out and east_in → west_out.

#3
```cpp
for (int j = 0; j < 30; j++){
    for (int i = 0; i < 4; i++) {
        inst_in[i].write(make_inst(0, NOP, 0, 0, 0));
    }
}
// in mem_bundle_test.cpp
```
* You wrote 120 instructions in total, but each mem_slice() can only read one instruction per clock cycle.
* If the default inst_in FIFO depth is small (typically around 16 entries), it will quickly become full, causing a write-blocking stall.
→ During co-simulation, the HLS host side will then be unable to write to the AXIS channel, and the simulation will freeze while waiting for available FIFO space.

### Solution4 - add input into the fifo
Cause: `east_in` had zero depth; modules like mem_slice_1_U0 require `e_mid_empty_n/east_in_empty_n` to read, creating a deadlock.
Fix: Provide at least one token on east_in for the first transaction. Updated stream size and data files.
#1 & #2
```cpp
if (inst_in.empty()) {
    // Even without a new instruction, keep the data flowing (pass-through).
    // NOTE(review): this forwards west_in -> west_out and east_in -> east_out,
    // which fits a chain where slice k's west_out feeds slice k+1's west_in.
    // If slices are wired east_out -> west_in instead, the pairs must be
    // crossed (west_in -> east_out, east_in -> west_out). Confirm against
    // the bundle wiring.
    if (!west_in.empty()) west_out.write(west_in.read());
    if (!east_in.empty()) east_out.write(east_in.read());
    return;
}
```
#3

```cpp
#pragma HLS STREAM variable=inst_in depth=64
#pragma HLS STREAM variable=inst_out depth=64
```

### Problem5
#1
```cpp
    // mid data streams chain the slices (pass-through semantics)
    static hls::stream<data_full_t> e_mid[6];
#pragma HLS STREAM variable=e_mid depth=8

    // instantiate four independent slices, chaining the data streams
    mem_slice<0>(inst_in[0], inst_out[0], west_in, west_out, e_mid[0], e_mid[1]);
    mem_slice<1>(inst_in[1], inst_out[1], e_mid[1], e_mid[0], e_mid[2], e_mid[3]);
    mem_slice<2>(inst_in[2], inst_out[2], e_mid[3], e_mid[2], e_mid[4], e_mid[5]);
    mem_slice<3>(inst_in[3], inst_out[3], e_mid[5], e_mid[4], east_in, east_out);
}

```
* In your original version, e_mid[0] and e_mid[1] are used simultaneously as both input and output by slice0 and slice1.
→ For HLS, this creates a zero-latency loop, where both modules try to read and write to the same stream within the same clock cycle. 
* In the C simulation, this looks fine because execution is sequential — the simulator processes one operation after another.
* However, in RTL cosimulation, the streams are implemented as AXIS FIFOs with handshake signals (ready / valid).
* If both ends attempt to read and write in the same cycle, their handshakes conflict: each waits for the other to be ready.

#2
```cpp
#include "mem_bundle.h"

// bundle: inst per-slice, data chained (slice0 -> slice1 -> slice2 -> slice3)
extern "C" void mem_bundle(
    hls::stream<mem_inst_t> inst_in[4],
    hls::stream<mem_inst_t> inst_out[4],
    hls::stream<data_full_t> &west_in,
    hls::stream<data_full_t> &west_out,
    hls::stream<data_full_t> &east_in,
    hls::stream<data_full_t> &east_out
) {
	// mem_bundle.cpp
#pragma HLS DATAFLOW
#pragma HLS INTERFACE ap_ctrl_none port=return
```
* When the top-level module (`mem_bundle()`) uses `ap_ctrl_none`, Vivado HLS treats it as a **free-running hardware block** — it starts automatically and never finishes, so `ap_done` is never asserted. While this works fine for FPGA synthesis, it causes problems in C/RTL co-simulation, because the auto-generated testbench **expects a sequence of reset → start DUT → wait for ap_done → compare results**. Since ap_done never occurs, the simulator waits forever, appearing to hang even though the design itself isn’t deadlocked.
* Switching to `ap_ctrl_hs` inserts a small handshake FSM (IDLE → RUN → DONE → IDLE), giving Vivado clear start/done conditions. The co-simulation driver automatically asserts ap_start=1, waits until Vivado raises ap_done=1, and then terminates successfully. The functional behavior of your design—its dataflow and mem_slice modules—remains exactly the same; the only difference is that Vivado now knows when to stop the simulation.
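* If you want both proper co-simulation termination and back-to-back dataflow operation, use `ap_ctrl_chain` instead. It is `ap_ctrl_hs` plus an `ap_continue` input: the block still has to be started with `ap_start` (it is not free-running like `ap_ctrl_none`), but keeping `ap_start` and `ap_continue` asserted lets it accept the next transaction without going idle, while `ap_done` still gives clean integration and a clear completion point so co-simulation finishes without hanging.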

#3
![image](https://hackmd.io/_uploads/B1FN7sJCgl.png)
* There is a 2-cycle gap between the first input and the second input. It may be caused by `#pragma HLS INTERFACE ap_ctrl_chain port=return`.
    
### Solution5 
#1
```cpp
// mem_bundle.cpp
#pragma HLS DATAFLOW
#pragma HLS INTERFACE ap_ctrl_none port=return

// Two one-directional link arrays between neighbouring slices, so that no
// stream is both read and written by the same slice.
static hls::stream<data_full_t> east_link[3]; // k.east_out -> (k+1).west_in
static hls::stream<data_full_t> west_link[3]; // (k+1).west_out -> k.east_in
#pragma HLS STREAM variable=east_link depth=8
#pragma HLS STREAM variable=west_link depth=8

// slice0 (leftmost)
mem_slice<0>(inst_in[0], inst_out[0],
             /*west_in*/  west_in,
             /*west_out*/ west_out,        // connected directly to the external port
             /*east_in*/  west_link[0],    // westward link coming back from slice1
             /*east_out*/ east_link[0]);   // eastward, feeds slice1's west_in

// slice1
mem_slice<1>(inst_in[1], inst_out[1],
             /*west_in*/  east_link[0],    // eastward traffic arriving from slice0
             /*west_out*/ west_link[0],    // westward, back to slice0
             /*east_in*/  west_link[1],    // westward traffic coming back from slice2
             /*east_out*/ east_link[1]);   // eastward, to slice2

// slice2
mem_slice<2>(inst_in[2], inst_out[2],
             /*west_in*/  east_link[1],
             /*west_out*/ west_link[1],
             /*east_in*/  west_link[2],
             /*east_out*/ east_link[2]);

// slice3 (rightmost)
mem_slice<3>(inst_in[3], inst_out[3],
             /*west_in*/  east_link[2],
             /*west_out*/ west_link[2],    // back to slice2 (wire it elsewhere if it should be external)
             /*east_in*/  east_in,         // connected directly to the external port
             /*east_out*/ east_out);       // connected directly to the external port

```
#2
```cpp
	// mem_bundle.cpp
#pragma HLS DATAFLOW
#pragma HLS INTERFACE ap_ctrl_chain port=return
//#pragma HLS INTERFACE ap_ctrl_hs port=return
//#pragma HLS INTERFACE ap_ctrl_none port=return
```

#3

    
## Newest Version
![image](https://hackmd.io/_uploads/r1ggfFyJ-g.png)


    
    
    
<details>     
<summary>decouple version mem_bundle.cpp</summary>

```cpp

```
    
</details>