Groq module - MEM & HLS 2

# Groq module - MEM & HLS 2 ![image](https://hackmd.io/_uploads/HJxwEyRpel.png) ## Pipeline stalled <details> <summary>example</summary> ```cpp extern "C" { void mem_slice( hls::stream<mem_inst_t> &inst_in, hls::stream<mem_inst_t> &inst_out, hls::stream<data_full_t> &west_in, hls::stream<data_full_t> &west_out, hls::stream<data_full_t> &east_in, hls::stream<data_full_t> &east_out ) { #pragma HLS INTERFACE axis port=inst_in #pragma HLS INTERFACE axis port=inst_out #pragma HLS INTERFACE axis port=west_in #pragma HLS INTERFACE axis port=west_out #pragma HLS INTERFACE axis port=east_in #pragma HLS INTERFACE axis port=east_out #pragma HLS INTERFACE ap_ctrl_none port=return #pragma HLS PIPELINE II=1 #pragma HLS LATENCY min=1 max=6 if (inst_in.empty()) return; mem_inst_t inst = inst_in.read(); inst_out.write(inst); // forward // 解碼 instruction ap_uint<MEM_OPCODE_BITS> op = get_opcode(inst); ap_uint<MEM_ADDR_BITS> addr13 = get_addr(inst); ap_uint<MEM_SIDE_BITS> side = get_side(inst); ap_uint<MEM_SRC_DST_BITS> srcdst = get_srcdst(inst); ap_uint<MEM_RESERVED_BITS> resv = get_reserved(inst); ap_uint<MEM_ICU_BITS> icu = get_icu(inst); (void)resv; // 未使用欄位避免 warning (void)icu; ap_uint<32> byte_addr = ((ap_uint<32>)addr13) << 4; static ap_uint<4> skew_cnt = 0; static bool skew_active = false; #pragma HLS RESET variable=skew_cnt #pragma HLS RESET variable=skew_active // -------------------------- // dskew 處理邏輯（空轉控制） // -------------------------- if (!skew_active && dskew > 0) { skew_cnt = dskew; skew_active = true; return; // 先暫停一個 cycle } if (skew_active) { if (skew_cnt > 0) { skew_cnt--; return; // 繼續空轉直到倒數完成 } else { skew_active = false; // 倒數結束 -> 執行下一步 } } bool west_valid = false, east_valid = false; data_full_t west_data = 0, east_data = 0; // 處理寫入 if (op == MEM_OP_WRITE) { if (side == 0 && !west_in.empty()) { data_full_t din = west_in.read(); write_mem(byte_addr, get_stream_slice(din, srcdst)); } else if (side == 1 && !east_in.empty()) { data_full_t din = east_in.read(); 
write_mem(byte_addr, get_stream_slice(din, srcdst)); } } // 處理讀取 if (op == MEM_OP_READ) { DataUnit dout = read_mem(byte_addr); if (side == 0) { set_stream_slice(west_data, srcdst, dout); west_valid = true; } else { set_stream_slice(east_data, srcdst, dout); east_valid = true; } } // 寫出結果 if (west_valid) west_out.write(west_data); if (east_valid) east_out.write(east_data); } } // extern "C" ``` </details> In the current design of mem_slice.cpp: * The dskew logic is handled inside the main pipeline. * Once the module enters the idle (return) phase, the top-level function stops progressing. * Because `#pragma HLS PIPELINE II=1` enforces a single control flow, the inst_in AXI stream interface will also experience back-pressure (TREADY = 0). In other words: * Idle (dskew active) → entire pipeline stalls → no new instruction is accepted. ### Solution - decouple inst fetch from engine <details> <summary> decouple version mem_slice.cpp</summary> ```cpp #include "mem_slice.h" // ---------- single memory bank ---------- static DataUnit mem_array[MEM_SLICE_WORDS]; // ---------- memory access ---------- void write_mem(ap_uint<32> byte_addr, const DataUnit &dw) { #pragma HLS INLINE unsigned int idx = (unsigned int)((byte_addr / WORD_BYTES) % MEM_SLICE_WORDS); mem_array[idx] = dw; } DataUnit read_mem(ap_uint<32> byte_addr) { #pragma HLS INLINE unsigned int idx = (unsigned int)((byte_addr / WORD_BYTES) % MEM_SLICE_WORDS); return mem_array[idx]; } // ---------- slice helper ---------- inline DataUnit get_stream_slice(const data_full_t &full, unsigned idx) { #pragma HLS INLINE return full.range((idx+1)*STREAM_WIDTH-1, idx*STREAM_WIDTH); } inline void set_stream_slice(data_full_t &full, unsigned idx, const DataUnit &val) { #pragma HLS INLINE full.range((idx+1)*STREAM_WIDTH-1, idx*STREAM_WIDTH) = val; } // ---------- top-level DUT ---------- extern "C" { void mem_slice( hls::stream<mem_inst_t> &inst_in, hls::stream<mem_inst_t> &inst_out, hls::stream<data_full_t> &west_in, 
hls::stream<data_full_t> &west_out, hls::stream<data_full_t> &east_in, hls::stream<data_full_t> &east_out ) { #pragma HLS INTERFACE axis port=inst_in #pragma HLS INTERFACE axis port=inst_out #pragma HLS INTERFACE axis port=west_in #pragma HLS INTERFACE axis port=west_out #pragma HLS INTERFACE axis port=east_in #pragma HLS INTERFACE axis port=east_out #pragma HLS INTERFACE ap_ctrl_none port=return #pragma HLS PIPELINE II=1 #pragma HLS LATENCY min=1 max=6 // 一層 FIFO hls::stream<mem_inst_t> inst_fifo; #pragma HLS STREAM variable=inst_fifo depth=8 // Fetch stage if (!inst_in.empty() && !inst_fifo.full()) inst_fifo.write(inst_in.read()); // Execute stage if (inst_fifo.empty()) return; mem_inst_t inst = inst_fifo.read(); inst_out.write(inst); // forward // 解碼 instruction ap_uint<MEM_DSKEW_BITS> dskew = get_dskew(inst); ap_uint<MEM_OPCODE_BITS> op = get_opcode(inst); ap_uint<MEM_ADDR_BITS> addr13 = get_addr(inst); ap_uint<MEM_SIDE_BITS> side = get_side(inst); ap_uint<MEM_SRC_DST_BITS> srcdst = get_srcdst(inst); ap_uint<MEM_RESERVED_BITS> resv = get_reserved(inst); ap_uint<MEM_ICU_BITS> icu = get_icu(inst); (void)resv; // 未使用欄位避免 warning (void)icu; ap_uint<32> byte_addr = ((ap_uint<32>)addr13) << 4; static ap_uint<4> skew_cnt = 0; static bool skew_active = false; #pragma HLS RESET variable=skew_cnt #pragma HLS RESET variable=skew_active // -------------------------- // dskew 處理邏輯（空轉控制） // -------------------------- if (!skew_active && dskew > 0) { skew_cnt = dskew; skew_active = true; return; // 先暫停一個 cycle } if (skew_active) { if (skew_cnt > 0) { skew_cnt--; return; // 繼續空轉直到倒數完成 } else { skew_active = false; // 倒數結束 -> 執行下一步 } } bool west_valid = false, east_valid = false; data_full_t west_data = 0, east_data = 0; // 處理寫入 if (op == MEM_OP_WRITE) { if (side == 0 && !west_in.empty()) { data_full_t din = west_in.read(); write_mem(byte_addr, get_stream_slice(din, srcdst)); } else if (side == 1 && !east_in.empty()) { data_full_t din = east_in.read(); write_mem(byte_addr, 
get_stream_slice(din, srcdst)); } } // 處理讀取 if (op == MEM_OP_READ) { DataUnit dout = read_mem(byte_addr); if (side == 0) { set_stream_slice(west_data, srcdst, dout); west_valid = true; } else { set_stream_slice(east_data, srcdst, dout); east_valid = true; } } // 寫出結果 if (west_valid) west_out.write(west_data); if (east_valid) east_out.write(east_data); } } // extern "C"= ``` </details> <details> <summary>decouple version mem_slice.h</summary> ```cpp #ifndef MEM_SLICE_H #define MEM_SLICE_H #define AP_INT_MAX_W 4096 #include <ap_int.h> #include <hls_stream.h> #include <stdint.h> // --------------------------- // bit layout (LSB..MSB): // 0:3 dskew (4) // 4:5 opcode (2) // 6:18 addr (13) // 19 w_or_e (1) // 20:24 stream_src_dst (5) // 25:29 reserved (5) // 30:31 ICU (2) // --------------------------- #define MEM_DSKEW_BITS 4 #define MEM_OPCODE_BITS 2 #define MEM_ADDR_BITS 13 #define MEM_SIDE_BITS 1 #define MEM_SRC_DST_BITS 5 #define MEM_RESERVED_BITS 5 #define MEM_ICU_BITS 2 #define WORD_BYTES 16 #define STREAM_WIDTH 128 #define NUM_OF_STREAMS 32 #define MEM_SLICE_BYTES (16 * 4096) #define MEM_SLICE_WORDS (MEM_SLICE_BYTES / WORD_BYTES) // ---------- 型別 ---------- typedef struct { ap_uint<32> raw; } mem_inst_t; typedef ap_uint<STREAM_WIDTH> DataUnit; typedef ap_uint<STREAM_WIDTH * NUM_OF_STREAMS> data_full_t; typedef enum mem_opcode { MEM_OP_READ = 0, MEM_OP_WRITE = 1, NOP = 3 } mem_opcode_t; // ---------- decode helpers ---------- inline ap_uint<MEM_DSKEW_BITS> get_dskew (const mem_inst_t &i) { return i.raw.range(3,0); } inline ap_uint<MEM_OPCODE_BITS> get_opcode (const mem_inst_t &i) { return i.raw.range(5,4); } inline ap_uint<MEM_ADDR_BITS> get_addr (const mem_inst_t &i) { return i.raw.range(18,6); } inline ap_uint<MEM_SIDE_BITS> get_side (const mem_inst_t &i) { return i.raw.bit(19); } inline ap_uint<MEM_SRC_DST_BITS> get_srcdst (const mem_inst_t &i) { return i.raw.range(24,20); } inline ap_uint<MEM_RESERVED_BITS> get_reserved (const mem_inst_t &i) { return 
i.raw.range(29,25); } inline ap_uint<MEM_ICU_BITS> get_icu (const mem_inst_t &i) { return i.raw.range(31,30); } // ---------- memory 存取函式宣告 ---------- void write_mem(ap_uint<32> byte_addr, const DataUnit &dw); DataUnit read_mem(ap_uint<32> byte_addr); DataUnit get_stream_slice(const data_full_t &full, unsigned idx); void set_stream_slice(data_full_t &full, unsigned idx, const DataUnit &val); // ---------- top-level function ---------- extern "C" void mem_slice( hls::stream<mem_inst_t> &inst_in, hls::stream<mem_inst_t> &inst_out, hls::stream<data_full_t> &west_in, hls::stream<data_full_t> &west_out, hls::stream<data_full_t> &east_in, hls::stream<data_full_t> &east_out ); #endif // MEM_SLICE_H ``` </details> <details> <summary>decouple version mem_slice_test.cpp</summary> ```cpp #include "mem_slice.h" #include <iostream> #include <iomanip> using namespace std; // ================================================================ // Helper: instruction builder // ================================================================ mem_inst_t make_inst( ap_uint<MEM_DSKEW_BITS> dskew, ap_uint<MEM_OPCODE_BITS> opcode, ap_uint<MEM_ADDR_BITS> addr, ap_uint<MEM_SIDE_BITS> side, ap_uint<MEM_SRC_DST_BITS> srcdst) { mem_inst_t inst; inst.raw = 0; inst.raw.range(3,0) = dskew; inst.raw.range(5,4) = opcode; inst.raw.range(18,6) = addr; inst.raw.bit(19) = side; inst.raw.range(24,20) = srcdst; return inst; } // ================================================================ // Main testbench // ================================================================ int main() { hls::stream<mem_inst_t> inst_in, inst_out; hls::stream<data_full_t> west_in, west_out, east_in, east_out; cout << "============================" << endl; cout << " mem_slice 全功能測試開始" << endl; cout << "============================" << endl; // ================================================================ // Case 1: Basic WRITE / READ test (west side) // ================================================================ 
{ cout << "\n[CASE 1] Basic west WRITE/READ test\n"; data_full_t input_data = 0; DataUnit dword = 0xAABBCCDDEEFF1122; set_stream_slice(input_data, 0, dword); mem_inst_t inst_w = make_inst(0, MEM_OP_WRITE, 0x001, 0, 0); mem_inst_t inst_r = make_inst(0, MEM_OP_READ , 0x001, 0, 0); west_in.write(input_data); inst_in.write(inst_w); mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out); inst_in.write(inst_r); mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out); if (!west_out.empty()) { data_full_t outdata = west_out.read(); DataUnit dout = get_stream_slice(outdata, 0); cout << " READ result = " << hex << dout << endl; } else { cout << " ❌ No output on west_out!" << endl; } while (!inst_out.empty()) inst_out.read(); } // ================================================================ // Case 2: East side test // ================================================================ { cout << "\n[CASE 2] East side WRITE/READ test\n"; data_full_t eastdata = 0; DataUnit dword = 0x123456789ABCDEF0; set_stream_slice(eastdata, 2, dword); mem_inst_t inst_w = make_inst(0, MEM_OP_WRITE, 0x002, 1, 2); mem_inst_t inst_r = make_inst(0, MEM_OP_READ , 0x002, 1, 2); east_in.write(eastdata); inst_in.write(inst_w); mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out); inst_in.write(inst_r); mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out); if (!east_out.empty()) { data_full_t outdata = east_out.read(); DataUnit dout = get_stream_slice(outdata, 2); cout << " READ result = " << hex << dout << endl; } else { cout << " ❌ No output on east_out!" 
<< endl; } while (!inst_out.empty()) inst_out.read(); } // ================================================================ // Case 3: dskew delay test (write before read) // ================================================================ { cout << "\n[CASE 3] dskew delay test\n"; // Step 1️⃣: Write value to addr 0x003 data_full_t input_data = 0; DataUnit write_word = 0xDEADBEEFCAFEBABE; set_stream_slice(input_data, 0, write_word); mem_inst_t inst_w = make_inst(0, MEM_OP_WRITE, 0x003, 0, 0); west_in.write(input_data); inst_in.write(inst_w); mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out); cout << " Wrote 0x" << hex << write_word << " to addr 0x003\n"; // Step 2️⃣: Issue READ with dskew = 3 mem_inst_t inst_delay = make_inst(3, MEM_OP_READ, 0x003, 0, 0); for (int cycle = 0; cycle < 6; cycle++) { inst_in.write(inst_delay); mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out); cout << " Cycle " << dec << cycle << " processed" << endl; if (!west_out.empty()) { data_full_t outdata = west_out.read(); DataUnit dout = get_stream_slice(outdata, 0); cout << " ✅ READ result = " << hex << dout << " (at cycle " << dec << cycle << ")\n"; } } cout << " ✅ Expected: READ occurs ~3 cycles after issue.\n"; while (!inst_out.empty()) inst_out.read(); } // ================================================================ // Case 4: NOP test // ================================================================ { cout << "\n[CASE 4] NOP instruction test\n"; mem_inst_t inst_nop = make_inst(0, NOP, 0x000, 0, 0); inst_in.write(inst_nop); mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out); if (west_out.empty() && east_out.empty()) cout << " ✅ No output as expected.\n"; else cout << " ❌ Unexpected output detected!\n"; while (!inst_out.empty()) inst_out.read(); } // ================================================================ // Case 5: Continuous pipeline test // ================================================================ { cout << 
"\n[CASE 5] Continuous instruction stream test\n"; for (int i = 0; i < 4; i++) { mem_inst_t inst_w = make_inst(i % 2, MEM_OP_WRITE, 0x010 + i, i & 1, i); mem_inst_t inst_r = make_inst(0, MEM_OP_READ , 0x010 + i, i & 1, i); data_full_t ddata = 0; DataUnit word = (0xABCD0000 | i); set_stream_slice(ddata, i, word); if (i & 1) east_in.write(ddata); else west_in.write(ddata); inst_in.write(inst_w); inst_in.write(inst_r); } for (int t = 0; t < 10; t++) { mem_slice(inst_in, inst_out, west_in, west_out, east_in, east_out); } cout << " ✅ Continuous streaming executed.\n"; while (!inst_out.empty()) inst_out.read(); while (!west_out.empty()) west_out.read(); while (!east_out.empty()) east_out.read(); while (!west_in.empty()) west_in.read(); while (!east_in.empty()) east_in.read(); } cout << "\n============================" << endl; cout << " ✅ All test cases done." << endl; cout << "============================" << endl; return 0; } ``` </details> ## Integrate into Bundle <details> <summary>decouple version mem_bundle.cpp</summary> ```cpp #include "mem_bundle.h" // bundle: inst per-slice, data chained (slice0 -> slice1 -> slice2 -> slice3) extern "C" void mem_bundle( hls::stream<mem_inst_t> inst_in[4], hls::stream<mem_inst_t> inst_out[4], hls::stream<data_full_t> &west_in, hls::stream<data_full_t> &west_out, hls::stream<data_full_t> &east_in, hls::stream<data_full_t> &east_out ) { #pragma HLS DATAFLOW #pragma HLS INTERFACE ap_ctrl_none port=return // mid data streams chain the slices (pass-through semantics) static hls::stream<data_full_t> w_mid[3]; static hls::stream<data_full_t> e_mid[3]; #pragma HLS STREAM variable=w_mid depth=8 #pragma HLS STREAM variable=e_mid depth=8 // instantiate four independent slices, chaining the data streams mem_slice(inst_in[0], inst_out[0], west_in, w_mid[0], east_in, e_mid[0]); mem_slice(inst_in[1], inst_out[1], w_mid[0], w_mid[1], e_mid[0], e_mid[1]); mem_slice(inst_in[2], inst_out[2], w_mid[1], w_mid[2], e_mid[1], e_mid[2]); 
mem_slice(inst_in[3], inst_out[3], w_mid[2], west_out, e_mid[2], east_out); } ``` </details> ### Problem1 - Data dependency ![image](https://hackmd.io/_uploads/BkXNUmKpgl.png) ### Solution1 - template ```cpp template<int SLICE_ID> void mem_slice( hls::stream<mem_inst_t> &inst_in, hls::stream<mem_inst_t> &inst_out, hls::stream<data_full_t> &west_in, hls::stream<data_full_t> &west_out, hls::stream<data_full_t> &east_in, hls::stream<data_full_t> &east_out ) { #pragma HLS PIPELINE II=1 #pragma HLS LATENCY min=1 max=6 static ap_uint<4> skew_cnt = 0; static bool skew_active = false; ... } ``` In HLS, a non-type template parameter such as `template<int SLICE_ID>` is used to automatically generate multiple hardware instances from the same piece of code: each distinct ID is a separate function instantiation, so each instance has its own independent pipeline state and static variables. * Solves the “static feedback dependence” error in dataflow * Enables multiple tasks to truly run in parallel ### Problem2 - Undefined template ``` obj/mem_bundle.o: in function mem_bundle': /home/ubuntu/MEM_hls_Ray/Groq_MEM/Integration1/csim/build/../../../../mem_bundle.cpp:22: undefined reference to void mem_slice<0>( hls::stream<mem_inst_t, 0>&, hls::stream<mem_inst_t, 0>&, hls::stream<ap_uint<4096>, 0>&, hls::stream<ap_uint<4096>, 0>&, hls::stream<ap_uint<4096>, 0>&, hls::stream<ap_uint<4096>, 0>& ) ``` The template body is defined in mem_slice.cpp, so it is not visible when mem_bundle.cpp is compiled; mem_bundle.o therefore only references `mem_slice<0>` … `mem_slice<3>`, and no object file actually contains them. ### Solution2 - define at mem_slice.cpp Explicitly instantiate the four specializations in mem_slice.cpp, where the template body is visible (alternatively, move the template definition into the header): ```cpp // 顯式生成四個 template 實體 template void mem_slice<0>( hls::stream<mem_inst_t> &, hls::stream<mem_inst_t> &, hls::stream<data_full_t> &, hls::stream<data_full_t> &, hls::stream<data_full_t> &, hls::stream<data_full_t> & ); template void mem_slice<1>( hls::stream<mem_inst_t> &, hls::stream<mem_inst_t> &, hls::stream<data_full_t> &, hls::stream<data_full_t> &, hls::stream<data_full_t> &, hls::stream<data_full_t> & ); template void mem_slice<2>( hls::stream<mem_inst_t> &, hls::stream<mem_inst_t> &, hls::stream<data_full_t> &, hls::stream<data_full_t> &, hls::stream<data_full_t> &, hls::stream<data_full_t> & ); template void mem_slice<3>( 
hls::stream<mem_inst_t> &, hls::stream<mem_inst_t> &, hls::stream<data_full_t> &, hls::stream<data_full_t> &, hls::stream<data_full_t> &, hls::stream<data_full_t> & ); ``` ### Problem3 - Data Dependency ![image](https://hackmd.io/_uploads/BJ5I2wcTlg.png) * These errors are among the most common issues reported during the Vivado HLS dataflow checking stage. * They indicate that your global or static array is being read and/or written by multiple dataflow tasks simultaneously, which violates the independence required by the dataflow execution model. As a result, HLS refuses to proceed with synthesis. ```cpp // ---------- single memory bank ---------- static DataUnit mem_array[MEM_SLICE_WORDS]; // ---------- memory access ---------- void write_mem(ap_uint<32> byte_addr, const DataUnit &dw) { #pragma HLS INLINE unsigned int idx = (unsigned int)((byte_addr / WORD_BYTES) % MEM_SLICE_WORDS); mem_array[idx] = dw; } DataUnit read_mem(ap_uint<32> byte_addr) { #pragma HLS INLINE unsigned int idx = (unsigned int)((byte_addr / WORD_BYTES) % MEM_SLICE_WORDS); return mem_array[idx]; } ``` * Original design: a single file-scope `mem_array` that every `mem_slice` instance reads and writes through the shared `write_mem` / `read_mem` helpers. ### Solution3 - Don't share memory Give each slice its own memory bank: declare `mem_array` as a `static` local inside the templated `mem_slice<SLICE_ID>` body, together with the lambda helpers below, so that each template instantiation owns a separate array and no array is shared between dataflow tasks. ```cpp static DataUnit mem_array[MEM_SLICE_WORDS]; #pragma HLS BIND_STORAGE variable=mem_array type=ram_t2p impl=bram #pragma HLS DEPENDENCE variable=mem_array inter false #pragma HLS DEPENDENCE variable=mem_array intra false // ============================================================ // Inline memory access helpers (local scope) // ============================================================ auto write_mem = [&](ap_uint<32> byte_addr, const DataUnit &dw) { #pragma HLS INLINE unsigned int idx = (unsigned int)((byte_addr / WORD_BYTES) % MEM_SLICE_WORDS); mem_array[idx] = dw; }; auto read_mem = [&](ap_uint<32> byte_addr) -> DataUnit { #pragma HLS INLINE unsigned int idx = (unsigned int)((byte_addr / WORD_BYTES) % MEM_SLICE_WORDS); return mem_array[idx]; }; ``` ### Problem4 - Deadlock ```cpp RTL Simulation : 0 / 30 [142857142.86%] @ "100000135000" ``` * 0 / 30: Out of 30 
total transactions, none have been completed yet. * 142,857,142.86%: Indicates the progress of a single transaction (calculated as measured latency / estimated latency × 100%). → Since it has exceeded 100%, it means the latency estimation is incorrect. * @ "100000135000": Represents the simulation time (in picoseconds or nanoseconds). :::warning I found the deadlock root cause: the design reads `east_in`, but the HLS TB provides zero tokens on `east_in` for all transactions, so dataflow stalls immediately. This matches the cosim warning about ap_ctrl_none and non-blocking FIFO and the progress stuck at 0/30. ::: #1 ```cpp if (inst_in.empty()) return; mem_inst_t inst = inst_in.read(); inst_out.write(inst); ``` * This part is one of the key problems. → Because this return prevents the slice from executing any west/east data forwarding. * If the previous slice is waiting for you to read from `west_in`, and the next slice is waiting for you to write to `west_out`, this dataflow chain will become permanently stalled. #2 ```cpp #pragma HLS DATAFLOW static hls::stream<data_full_t> w_mid[3]; static hls::stream<data_full_t> e_mid[3]; ``` You used static streams as the slice chain, but when there is no active consumer, it can lead to the following: * If any slice does not forward its data every clock cycle, the downstream slice will be blocked; * As a result, the dataflow model will be detected by HLS as a potential deadlock. Solution: Ensure that each slice continues to forward data even when there is no memory operation — that is, always pass data through from west_in → east_out and east_in → west_out. #3 ```cpp for (int j = 0; j < 30; j++){ for (int i = 0; i < 4; i++) { inst_in[i].write(make_inst(0, NOP, 0, 0, 0)); } } // in mem_bundle_test.cpp ``` * You wrote 120 instructions in total, but each mem_slice() can only read one instruction per clock cycle. 
* If the default inst_in FIFO depth is small (typically around 16 entries), it will quickly become full, causing a write-blocking stall. → During co-simulation, the HLS host side will then be unable to write to the AXIS channel, and the simulation will freeze while waiting for available FIFO space. ### Solution4 - add input into the fifo Cause: `east_in` had zero depth; modules like mem_slice_1_U0 require `e_mid_empty_n/east_in_empty_n` to read, creating a deadlock. Fix: Provide at least one token on east_in for the first transaction. Updated stream size and data files. #1 & #2 ```cpp if (inst_in.empty()) { // 即使沒有新指令，仍需維持資料流 pass-through if (!west_in.empty()) west_out.write(west_in.read()); if (!east_in.empty()) east_out.write(east_in.read()); return; } ``` #3 ```cpp #pragma HLS STREAM variable=inst_in depth=64 #pragma HLS STREAM variable=inst_out depth=64 ``` ### Problem5 #1 ```cpp // mid data streams chain the slices (pass-through semantics) static hls::stream<data_full_t> e_mid[6]; #pragma HLS STREAM variable=e_mid depth=8 // instantiate four independent slices, chaining the data streams mem_slice<0>(inst_in[0], inst_out[0], west_in, west_out, e_mid[0], e_mid[1]); mem_slice<1>(inst_in[1], inst_out[1], e_mid[1], e_mid[0], e_mid[2], e_mid[3]); mem_slice<2>(inst_in[2], inst_out[2], e_mid[3], e_mid[2], e_mid[4], e_mid[5]); mem_slice<3>(inst_in[3], inst_out[3], e_mid[5], e_mid[4], east_in, east_out); } ``` * In your original version, e_mid[0] and e_mid[1] are used simultaneously as both input and output by slice0 and slice1. → For HLS, this creates a zero-latency loop, where both modules try to read and write to the same stream within the same clock cycle. * In the C simulation, this looks fine because execution is sequential — the simulator processes one operation after another. * However, in RTL cosimulation, the streams are implemented as AXIS FIFOs with handshake signals (ready / valid). 
* If both ends attempt to read and write in the same cycle, their handshakes conflict: each waits for the other to be ready. #2 ```cpp #include "mem_bundle.h" // bundle: inst per-slice, data chained (slice0 -> slice1 -> slice2 -> slice3) extern "C" void mem_bundle( hls::stream<mem_inst_t> inst_in[4], hls::stream<mem_inst_t> inst_out[4], hls::stream<data_full_t> &west_in, hls::stream<data_full_t> &west_out, hls::stream<data_full_t> &east_in, hls::stream<data_full_t> &east_out ) { // mem_bundle.cpp #pragma HLS DATAFLOW #pragma HLS INTERFACE ap_ctrl_none port=return ``` * When the top-level module (`mem_bundle()`) uses `ap_ctrl_none`, Vivado HLS treats it as a **free-running hardware block** — it has no block-level control ports (`ap_start`, `ap_done`, `ap_idle`, `ap_ready`), so it starts automatically and there is never an `ap_done` event. While this works fine for FPGA synthesis, it causes problems in C/RTL co-simulation, because the auto-generated testbench **expects a sequence of reset → start DUT → wait for ap_done → compare results**. Since ap_done never occurs, the simulator waits forever, appearing to hang even though the design itself isn’t deadlocked. * Switching to `ap_ctrl_hs` inserts a small handshake FSM (IDLE → RUN → DONE → IDLE), giving Vivado clear start/done conditions. The co-simulation driver automatically asserts ap_start=1, waits until the DUT raises ap_done=1, and then terminates successfully. The functional behavior of your design—its dataflow and mem_slice modules—remains exactly the same; the only difference is that Vivado now knows when to stop the simulation. * If you want both proper co-simulation termination and continuous dataflow operation, use `ap_ctrl_chain` instead. It is `ap_ctrl_hs` plus an extra `ap_continue` input that lets a downstream block apply back-pressure; unlike `ap_ctrl_none` it is not free-running (it still waits for `ap_start`), but it is dataflow-friendly because successive transactions can overlap, and it provides the start/done signals, allowing clean integration and automatic co-sim completion without hanging. #3 ![image](https://hackmd.io/_uploads/B1FN7sJCgl.png) * There is a 2-cycle gap between the first input and the second input. 
It may result from the `#pragma HLS INTERFACE ap_ctrl_chain port=return` ### Solution5 #1 ```cpp // mem_bundle.cpp #pragma HLS DATAFLOW #pragma HLS INTERFACE ap_ctrl_none port=return static hls::stream<data_full_t> east_link[3]; // k.east_out -> (k+1).west_in static hls::stream<data_full_t> west_link[3]; // (k+1).west_out -> k.east_in #pragma HLS STREAM variable=east_link depth=8 #pragma HLS STREAM variable=west_link depth=8 // slice0（最左） mem_slice<0>(inst_in[0], inst_out[0], /*west_in*/ west_in, /*west_out*/ west_out, // 直接對外 /*east_in*/ west_link[0], // 從 slice1 回來的「往西」鏈 /*east_out*/ east_link[0]); // 往東，給 slice1 的 west_in // slice1 mem_slice<1>(inst_in[1], inst_out[1], /*west_in*/ east_link[0], // 從 slice0 往東過來 /*west_out*/ west_link[0], // 往西回去 slice0 /*east_in*/ west_link[1], // 從 slice2 往西回來 /*east_out*/ east_link[1]); // 往東給 slice2 // slice2 mem_slice<2>(inst_in[2], inst_out[2], /*west_in*/ east_link[1], /*west_out*/ west_link[1], /*east_in*/ west_link[2], /*east_out*/ east_link[2]); // slice3（最右） mem_slice<3>(inst_in[3], inst_out[3], /*west_in*/ east_link[2], /*west_out*/ west_link[2], // 回去 slice2（若你要對外可另接） /*east_in*/ east_in, // 直接對外 /*east_out*/ east_out); // 直接對外 ``` #2 ```cpp // mem_bundle.cpp #pragma HLS DATAFLOW #pragma HLS INTERFACE ap_ctrl_chain port=return //#pragma HLS INTERFACE ap_ctrl_hs port=return //#pragma HLS INTERFACE ap_ctrl_none port=return ``` #3 ## Newest Version ![image](https://hackmd.io/_uploads/r1ggfFyJ-g.png) <details> <summary>decouple version mem_bundle.cpp</summary> ```cpp ``` </details>