# 數位埃西
## ModelSim 操作
### Compile
- 先compile目錄下所有的檔案 all pass 才能simulate
### Simulation
- 上方工具列 -> `simulate` -> `start simulation` -> `work` -> `testbench`


- `start simulation` -> `verilog` 頁面 -> 勾選 `disable timing checks in specify block (+notimingchecks)`

- start simulation菜單中 others 頁面 最下方輸入 (避免程式執行完關閉)
```
-onfinish stop
```

### Check Result Waveform
- 上一步完成後 Simulate -> run -> run all 就會執行並出現結果 -> 可以查看 transcript 看 testcase 成功或是失敗(這是來自助教寫的tb.v)
```tex
////////////////////////////////////////////////////////////
/////注意simulate->load , 一定要run 才會有結果/////
/////////////////////////////////////////////////////////////
```
- sim標籤,右鍵你想查看元件的執行 `add wave` , 會跳出波形顯示的視窗

- 波形的+號點開可以看到每個bits的訊號

### 操作可能遇到的問題
#### 沒有顯示波形 ?
可能是使用 restart 後沒有重新執行,一旦使用 restart,之前的模擬結果(包括波形資料)就會被清空。 重新啟動後一定要再度執行 run 命令才會有波形資料
### Warmup
#### ver1
:::spoiler code
```verilog=
//
// Designer: NE6121084
//
// MAS_2input (ver1): add/subtract two signed 5-bit operands, classify the
// result against Q, and fold it back by +/-Q, all inside one module.
//   Din1, Din2 : signed 5-bit operands
//   Sel        : 2'b00 -> Din1 + Din2, 2'b11 -> Din1 - Din2, otherwise Din1
//   Q          : signed 5-bit value used for the comparison and correction
//   Tcmp       : 2'b11 when result >= Q, 2'b00 when result < 0, else 2'b01
//   TDout      : raw ALU result before correction
//   Dout       : low 4 bits of the corrected result
// All three outputs are driven by continuous assignments, so they are declared
// as nets. (Declaring them `output reg` and then using `assign` on them is
// rejected by Verilog-2001 compilers.)
module MAS_2input(
    input  signed [4:0] Din1,
    input  signed [4:0] Din2,
    input         [1:0] Sel,
    input  signed [4:0] Q,
    output        [1:0] Tcmp,
    output signed [4:0] TDout,
    output signed [3:0] Dout
);
    reg signed [4:0] alu_result;
    reg        [1:0] cmp_result;
    reg signed [4:0] adjusted_result;

    // Blocking assignments: each step reads the value computed just above it.
    always @(*) begin
        // ALU
        case (Sel)
            2'b00:   alu_result = Din1 + Din2;
            2'b11:   alu_result = Din1 - Din2;
            default: alu_result = Din1;
        endcase

        // Comparator
        if (alu_result >= Q)
            cmp_result = 2'b11;
        else if (alu_result < 0)
            cmp_result = 2'b00;
        else
            cmp_result = 2'b01;

        // Correction selected by the comparator code
        case (cmp_result)
            2'b11:   adjusted_result = alu_result - Q; // alu_result >= Q
            2'b00:   adjusted_result = alu_result + Q; // alu_result < 0
            default: adjusted_result = alu_result;
        endcase
    end

    // Outputs
    assign TDout = alu_result;           // uncorrected ALU result
    assign Tcmp  = cmp_result;           // comparator code
    assign Dout  = adjusted_result[3:0]; // low 4 bits of the corrected result
endmodule
```
```tex
Time: 3007500 ps Iteration: 1 Instance: /test
```
:::
#### ver2
:::spoiler code
```verilog=
/// comparator.v
// Comparater: classifies a signed 5-bit value against a signed 5-bit bound.
//   out = 2'b11 when in >= mod
//   out = 2'b00 when in <  0   (and not >= mod)
//   out = 2'b01 otherwise
module Comparater(
    input  signed [4:0] in,
    input  signed [4:0] mod,
    output reg    [1:0] out
);
    always @(*) begin
        // Start from the "in range" code, then override for the two other cases.
        out = 2'b01;
        if (in >= mod) begin
            out = 2'b11;
        end else if (in < 0) begin
            out = 2'b00;
        end
    end
endmodule
```
```verilog=
/// alu.v
// ALU: 5-bit add / subtract / pass-through.
//   select == 2'b00 : out = in1 + in2
//   select == 2'b11 : out = in1 - in2
//   otherwise       : out = in1
module ALU (
    input         [1:0] select,
    input  signed [4:0] in1,
    input  signed [4:0] in2,
    output reg    [4:0] out
);
    always @(*) begin
        if (select == 2'b00) begin
            out = in1 + in2;
        end else if (select == 2'b11) begin
            out = in1 - in2;
        end else begin
            out = in1;
        end
    end
endmodule
```
```verilog=
// MAS_2input.v
// Designer: NE6121084
//
// MAS_2input (ver2): structural version built from two ALU instances and one
// Comparater instance.
//   alu0        : computes Din1 (+/-) Din2 according to Sel
//   comparater0 : classifies that result against Q (2'b11 / 2'b00 / 2'b01)
//   alu1        : reuses the ALU with the comparator code as its select, so
//                 2'b11 subtracts Q, 2'b00 adds Q and 2'b01 passes through
// The comparator code is wired straight into alu1's 2-bit select port. Routing
// it through a 3-bit intermediate wire creates a port-width mismatch.
module MAS_2input(
    input  signed [4:0] Din1,
    input  signed [4:0] Din2,
    input         [1:0] Sel,
    input  signed [4:0] Q,
    output        [1:0] Tcmp,
    output signed [4:0] TDout,
    output signed [3:0] Dout
);
    wire signed [4:0] alu_result;
    wire        [1:0] cmp_result;
    wire signed [4:0] adjusted_result;

    ALU        alu0        (Sel, Din1, Din2, alu_result);
    Comparater comparater0 (alu_result, Q, cmp_result);
    ALU        alu1        (cmp_result, alu_result, Q, adjusted_result);

    // Outputs
    assign TDout = alu_result;           // uncorrected ALU result
    assign Tcmp  = cmp_result;           // comparator code
    assign Dout  = adjusted_result[3:0]; // low 4 bits of the corrected result
endmodule
```
```tex
# Time: 3007500 ps Iteration: 1 Instance: /test
```
:::
### 2025-HW1
```verilog=
//comparator2.v
// Comparator2: two-input compare-and-swap cell for unsigned 4-bit values.
// Drives the smaller input onto `min` and the larger onto `max`; when the
// inputs are equal both outputs carry that value.
module Comparator2 (
    input  [3:0] A,
    input  [3:0] B,
    output [3:0] min,
    output [3:0] max
);
    // Single shared comparison result used by both output muxes.
    wire a_below_b;

    assign a_below_b = A < B;
    assign max       = a_below_b ? B : A;
    assign min       = a_below_b ? A : B;
endmodule
```
```verilog=
// MedianFinder_3num.v
// MedianFinder_3num: median of three unsigned 4-bit values, built from three
// Comparator2 cells:
//   median = max( min(num1, num2), min( max(num1, num2), num3 ) )
module MedianFinder_3num(
    input  [3:0] num1,
    input  [3:0] num2,
    input  [3:0] num3,
    output [3:0] median
);
    wire [3:0] low_of_12;    // min(num1, num2)
    wire [3:0] high_of_12;   // max(num1, num2)
    wire [3:0] capped_by_3;  // min(high_of_12, num3)

    // Sort the first pair.
    Comparator2 stage1 (
        .A  (num1),
        .B  (num2),
        .min(low_of_12),
        .max(high_of_12)
    );

    // Cap the larger of the pair with num3; the max output is not needed.
    Comparator2 stage2 (
        .A  (high_of_12),
        .B  (num3),
        .min(capped_by_3),
        .max()
    );

    // The larger of the two remaining candidates is the median.
    Comparator2 stage3 (
        .A  (low_of_12),
        .B  (capped_by_3),
        .min(),
        .max(median)
    );
endmodule
```
```verilog=
//MedianFinder_5num.v
// MedianFinder_5num: median of five unsigned 4-bit values.
// num1..num4 are sorted pairwise; the larger of the two pair-minimums and the
// smaller of the two pair-maximums are then fed, together with num5, into a
// three-input median finder.
module MedianFinder_5num(
    input  [3:0] num1,
    input  [3:0] num2,
    input  [3:0] num3,
    input  [3:0] num4,
    input  [3:0] num5,
    output [3:0] median
);
    // Stage 1: sorted pairs (num1,num2) and (num3,num4)
    wire [3:0] pair_a_lo, pair_a_hi;
    wire [3:0] pair_b_lo, pair_b_hi;
    // Stage 2: surviving candidates
    wire [3:0] larger_lo;   // max(pair_a_lo, pair_b_lo)
    wire [3:0] smaller_hi;  // min(pair_a_hi, pair_b_hi)

    Comparator2 stage1_cmp1 (.A(num1), .B(num2), .min(pair_a_lo), .max(pair_a_hi));
    Comparator2 stage1_cmp2 (.A(num3), .B(num4), .min(pair_b_lo), .max(pair_b_hi));

    Comparator2 stage2_cmp1 (.A(pair_a_lo), .B(pair_b_lo), .min(),           .max(larger_lo));
    Comparator2 stage2_cmp2 (.A(pair_a_hi), .B(pair_b_hi), .min(smaller_hi), .max());

    MedianFinder_3num med3 (
        .num1  (larger_lo),
        .num2  (smaller_hi),
        .num3  (num5),
        .median(median)
    );
endmodule
```
```verilog=
// MedianFinder_7num.v
// MedianFinder_7num (first version): median of seven unsigned 4-bit values.
// Step 1 sorts num1..num6 into three (min, max) pairs, then drops the smallest
//        of the three minimums and the largest of the three maximums, which
//        are the overall min and max of num1..num6.
// Step 2 feeds the four survivors plus num7 into MedianFinder_5num.
// The selection logic is purely combinational, so it uses blocking
// assignments (the recommended form inside `always @(*)`).
module MedianFinder_7num(
    input  [3:0] num1,
    input  [3:0] num2,
    input  [3:0] num3,
    input  [3:0] num4,
    input  [3:0] num5,
    input  [3:0] num6,
    input  [3:0] num7,
    output [3:0] median
);
    wire [3:0] stage1_cmp1_min;
    wire [3:0] stage1_cmp1_max;
    wire [3:0] stage1_cmp2_min;
    wire [3:0] stage1_cmp2_max;
    wire [3:0] stage1_cmp3_min;
    wire [3:0] stage1_cmp3_max;
    reg  [3:0] stage2_num1;
    reg  [3:0] stage2_num2;
    reg  [3:0] stage2_num3;
    reg  [3:0] stage2_num4;

    // step1 : exclude the min and max of num1~6
    Comparator2 stage1_cmp1(.A(num1), .B(num2), .min(stage1_cmp1_min), .max(stage1_cmp1_max));
    Comparator2 stage1_cmp2(.A(num3), .B(num4), .min(stage1_cmp2_min), .max(stage1_cmp2_max));
    Comparator2 stage1_cmp3(.A(num5), .B(num6), .min(stage1_cmp3_min), .max(stage1_cmp3_max));

    // Keep the two pair-minimums that are NOT the smallest one.
    always @(*) begin
        if (stage1_cmp1_min <= stage1_cmp2_min && stage1_cmp1_min <= stage1_cmp3_min) begin
            stage2_num1 = stage1_cmp2_min;
            stage2_num2 = stage1_cmp3_min;
        end
        else if (stage1_cmp2_min <= stage1_cmp1_min && stage1_cmp2_min <= stage1_cmp3_min) begin
            stage2_num1 = stage1_cmp1_min;
            stage2_num2 = stage1_cmp3_min;
        end
        else begin
            stage2_num1 = stage1_cmp1_min;
            stage2_num2 = stage1_cmp2_min;
        end
    end

    // Keep the two pair-maximums that are NOT the largest one.
    always @(*) begin
        if (stage1_cmp1_max >= stage1_cmp2_max && stage1_cmp1_max >= stage1_cmp3_max) begin
            stage2_num3 = stage1_cmp2_max;
            stage2_num4 = stage1_cmp3_max;
        end
        else if (stage1_cmp2_max >= stage1_cmp1_max && stage1_cmp2_max >= stage1_cmp3_max) begin
            stage2_num3 = stage1_cmp1_max;
            stage2_num4 = stage1_cmp3_max;
        end
        else begin
            stage2_num3 = stage1_cmp1_max;
            stage2_num4 = stage1_cmp2_max;
        end
    end

    // step2 : take the 4 survivors from step1 + num7, use median5 to find the median
    MedianFinder_5num med5(.num1(stage2_num1), .num2(stage2_num2), .num3(stage2_num3), .num4(stage2_num4), .num5(num7), .median(median));
endmodule
```
```verilog=
//修正版
// MedianFinder_7num (revised): median of seven unsigned 4-bit values, built
// only from Comparator2 cells and one MedianFinder_5num.
// Step 1 sorts num1..num6 into three (min, max) pairs.
// Step 2 keeps the two larger pair-minimums and the two smaller pair-maximums,
//        which discards the overall min and max of num1..num6.
// Step 3 takes the median of those four values together with num7.
module MedianFinder_7num(
    input  [3:0] num1,
    input  [3:0] num2,
    input  [3:0] num3,
    input  [3:0] num4,
    input  [3:0] num5,
    input  [3:0] num6,
    input  [3:0] num7,
    output [3:0] median
);
    // Step 1 results: sorted pairs
    wire [3:0] p1_lo, p1_hi;
    wire [3:0] p2_lo, p2_hi;
    wire [3:0] p3_lo, p3_hi;

    // Step 2 intermediates
    wire [3:0] lo_of_p1p2;  // min(p1_lo, p2_lo)
    wire [3:0] hi_of_p1p2;  // max(p1_hi, p2_hi)

    // Step 2 survivors passed on to the 5-input median
    wire [3:0] keep_lo_a;   // max(p1_lo, p2_lo)
    wire [3:0] keep_lo_b;   // max(lo_of_p1p2, p3_lo)
    wire [3:0] keep_hi_a;   // min(p1_hi, p2_hi)
    wire [3:0] keep_hi_b;   // min(hi_of_p1p2, p3_hi)

    // step1 : sort the three pairs
    Comparator2 stage1_cmp1 (.A(num1), .B(num2), .min(p1_lo), .max(p1_hi));
    Comparator2 stage1_cmp2 (.A(num3), .B(num4), .min(p2_lo), .max(p2_hi));
    Comparator2 stage1_cmp3 (.A(num5), .B(num6), .min(p3_lo), .max(p3_hi));

    // step2a : drop the smallest of the three pair-minimums
    Comparator2 stage2_cmp1 (.A(p1_lo),      .B(p2_lo), .min(lo_of_p1p2), .max(keep_lo_a));
    Comparator2 stage2_cmp2 (.A(lo_of_p1p2), .B(p3_lo), .min(),           .max(keep_lo_b));

    // step2b : drop the largest of the three pair-maximums
    Comparator2 stage2_cmp3 (.A(p1_hi),      .B(p2_hi), .min(keep_hi_a), .max(hi_of_p1p2));
    Comparator2 stage2_cmp4 (.A(hi_of_p1p2), .B(p3_hi), .min(keep_hi_b), .max());

    // step3 : median of the four survivors and num7
    MedianFinder_5num med5 (
        .num1  (keep_lo_a),
        .num2  (keep_lo_b),
        .num3  (keep_hi_a),
        .num4  (keep_hi_b),
        .num5  (num7),
        .median(median)
    );
endmodule
```
median7的想法,先求出num1~6中,把最大最小移除後保留剩下4個,然後跟num7進行中位數比較,考慮三種狀況
1. `num7`,`min`, ... ... ... ... , `max` : 就算num7是最小,但其次小值(second minimal) 移除也不影響中位數判斷。
2. `min`, ... ... ... ... , `max`,`num7` : 就算num7是最大,但其次大值(second maximal) 移除也不影響中位數判斷。
3. `min`, ... `num7` ... ... ... , `max` : 移除最大最小值後不影響

### 2025-HW2
以下是第一個版本,這裡犯了幾個失誤,導致 CACHE[0] 一直沒辦法設定到正確的答案。如果用下面的 `$display` 印出結果,會發現第一個 cycle 都會得到未定義結果
```verilog
$display("[load] index :%d, val : %d" , (cache_wirte_addr-1), IROM_Q);
```
而在觀察作業文件會發現拉起rst -> IROM_rd -> IROM_A 其實會至少跑3個cycle,所以必須設定多二個狀態,分別重置時、READY時各要佔據一個時間週期。可以看修正版多加了兩個狀態。

```verilog=
//可以通過tb1
// LCD_CTRL, first attempt.
// Reads image bytes from IROM into the 512-bit CACHE_flat register, applies
// shift / max / min / average commands to a 4x4 window of that cache, and
// copies the cache to IRAM when the save command (cmd == 0) arrives.
// NOTE(review): CACHE_flat and ram_sw_counter are assigned from both a
// posedge-clk block and a negedge-clk block; this simulates, but confirm
// whether the target flow accepts it.
module LCD_CTRL(
input clk,
input rst,
input [3:0] cmd,
input cmd_valid,
input [7:0] IROM_Q, // image data received from IROM
output reg IROM_rd, // IROM read enable
output reg [5:0] IROM_A, // IROM address
output reg IRAM_ceb, // IRAM enable (active high)
output reg IRAM_web, // IRAM read/write select (0: write, 1: read)
output reg [7:0] IRAM_D, // data to be written to IRAM
output reg [5:0] IRAM_A, // IRAM address
input [7:0] IRAM_Q, // data read back from IRAM (not referenced in this module)
output busy,
output done
);
/////////////////////////////////
// State encoding
/////////////////////////////////
parameter FETCH = 1;
parameter OFFSET_ORGN = 2;
parameter PROCESS = 3;
parameter UPDATE = 4;
parameter SAVE_TORAM = 5;
parameter DONE = 6;
reg [3:0] current_state, next_state; // 4-bit state registers
//-----------------------------------------
// Internal cache: 64 bytes stored as one flat 512-bit reg vector
//-----------------------------------------
reg [511:0] CACHE_flat;
//-----------------------------------------
// Internal registers and counters
//-----------------------------------------
reg [6:0] rom_addr_cnt; // IROM read-address counter used in FETCH (compared against 65 below)
reg [6:0] cache_wirte_addr; // copy of rom_addr_cnt; the cache write index is this value minus 1
reg [6:0] ram_sw_counter; // IRAM write-address counter used in SAVE_TORAM
reg [6:0] cache_read_addr; // declared but not referenced in this module
//-----------------------------------------
// Operation point (reset value (4,4))
// The shift logic below keeps op_x / op_y within 2..6
//-----------------------------------------
reg [3:0] op_x, op_y;
//-----------------------------------------
// Result computed in the PROCESS state, written back in UPDATE
//-----------------------------------------
reg [7:0] kernel_result;
// done and busy signals: busy is low only while waiting for a command
assign done = (current_state == DONE);
assign busy = (current_state != OFFSET_ORGN);
//---------------------------------------------------------------------
// State machine: combinational next-state logic
//---------------------------------------------------------------------
always @(*) begin
case(current_state)
FETCH: begin
// Stay in FETCH until the address counter reaches 65
if (rom_addr_cnt == 7'd65)
next_state = OFFSET_ORGN;
else
next_state = FETCH;
end
OFFSET_ORGN: begin
if (cmd_valid) begin
case(cmd)
4'd0: next_state = SAVE_TORAM; // save command
4'd1: next_state = OFFSET_ORGN; // shift up
4'd2: next_state = OFFSET_ORGN; // shift down
4'd3: next_state = OFFSET_ORGN; // shift left
4'd4: next_state = OFFSET_ORGN; // shift right
4'd5: next_state = PROCESS; // Max
4'd6: next_state = PROCESS; // Min
4'd7: next_state = PROCESS; // Average
default: next_state = OFFSET_ORGN; // any other command code keeps the state
endcase
end else begin
next_state = OFFSET_ORGN;
end
end
PROCESS: next_state = UPDATE;
UPDATE: next_state = OFFSET_ORGN;
SAVE_TORAM: begin
// Leave SAVE_TORAM once the write counter reaches 64
if (ram_sw_counter == 7'd64)
next_state = DONE;
else
next_state = SAVE_TORAM;
end
DONE: next_state = DONE;
default: next_state = FETCH;
endcase
end
//---------------------------------------------------------------------
// State machine: sequential logic (rising clock edge, asynchronous reset)
//---------------------------------------------------------------------
integer idx;
integer i, j; // loop variables
reg cmd_handled; // declared but not referenced in this module
always @(posedge clk or posedge rst) begin
if (rst) begin
current_state <= FETCH;
rom_addr_cnt <= 7'd1;
cache_wirte_addr <= 7'd0;
op_x <= 4'd4;
op_y <= 4'd4;
kernel_result <= 8'd0;
ram_sw_counter <= 7'd0;
// Clear all 64 cache bytes
for (idx = 0; idx < 64; idx = idx + 1) begin
CACHE_flat[idx*8 +: 8] <= 8'd0;
end
IROM_rd <= 1'b0;
IROM_A <= 6'd0;
IRAM_ceb <= 1'b0;
IRAM_web <= 1'b1; // default to read mode
IRAM_D <= 8'd0;
IRAM_A <= 6'd0;
end else begin
current_state <= next_state;
case(current_state)
FETCH: begin
// Issue the next IROM address; the 7-bit counter is truncated to the 6-bit port
IROM_rd <= 1'b1;
IROM_A <= rom_addr_cnt;
cache_wirte_addr <= rom_addr_cnt;
rom_addr_cnt <= rom_addr_cnt + 1;
end
OFFSET_ORGN: begin
IROM_rd <= 1'b0; // FETCH is over
// NOTE(review): the shift is applied based on cmd alone; cmd_valid is not checked here
case(cmd)
4'd1:
if (op_y > 4'd2) op_y <= op_y - 1; // Shift Up
else op_y <= op_y;
4'd2: if (op_y < 4'd6) op_y <= op_y + 1; // Shift Down
4'd3:
if (op_x > 4'd2) op_x <= op_x - 1; // Shift Left
else op_x <= op_x ;
4'd4: if (op_x < 4'd6) op_x <= op_x + 1; // Shift Right
default: ;
endcase
end
PROCESS: begin
// cmd is sampled again here, one cycle after it was accepted in OFFSET_ORGN
case(cmd)
4'd5: kernel_result <= compute_max(CACHE_flat, op_x, op_y);
4'd6: kernel_result <= compute_min(CACHE_flat, op_x, op_y);
4'd7: kernel_result <= compute_average(CACHE_flat, op_x, op_y);
default: kernel_result <= 8'd0;
endcase
end
UPDATE: begin
// Overwrite the 4x4 window (rows op_y-2..op_y+1, columns op_x-2..op_x+1) with kernel_result
for (j = -2; j < 2; j = j + 1) begin
for (i = -2; i < 2; i = i + 1) begin
CACHE_flat[((op_y + j)*8 + (op_x + i))*8 +: 8] <= kernel_result;
end
end
end
SAVE_TORAM: begin
IRAM_ceb <= 1'b1;
IRAM_web <= 1'b0; // write mode
IRAM_A <= ram_sw_counter;
IRAM_D <= CACHE_flat[(ram_sw_counter)*8 +: 8];
end
DONE: begin
IRAM_ceb <= 1'b0;
IRAM_web <= 1'b1; // back to read mode
end
default: ;
endcase
end
end
//---------------------------------------------------------------------
// Falling edge: capture the IROM data byte into CACHE_flat
//---------------------------------------------------------------------
always @(negedge clk) begin
if (current_state == FETCH && IROM_rd) begin
// NOTE(review): cache_wirte_addr - 1 is not guarded against cache_wirte_addr == 0 here
CACHE_flat[(cache_wirte_addr-1)*8 +: 8] <= IROM_Q;
end
end
//---------------------------------------------------------------------
// Falling edge: advance the IRAM write counter while saving
//---------------------------------------------------------------------
always @(negedge clk) begin
if (current_state == SAVE_TORAM) begin
ram_sw_counter <= ram_sw_counter + 1;
end
end
//-----------------------------------------------------
// Functions: min, max and average over the 4x4 window of a flat cache image.
// The window covers rows y_orgin-2..y_orgin+1 and columns x_orgin-2..x_orgin+1;
// the byte at (row, col) starts at bit offset (row*8 + col)*8.
//-----------------------------------------------------
// compute_min: returns the smallest byte in the window.
function [7:0] compute_min(input [511:0] CACHE_IN, input [3:0] x_orgin, input [3:0] y_orgin);
reg [7:0] min_val;
integer i, j ;
integer index ;
begin
// Seed with the top-left byte of the window
index = ((y_orgin - 2) * 8 + (x_orgin - 2)) * 8;
min_val = CACHE_IN[index +: 8];
for (j = -2; j < 2; j = j + 1) begin
for (i = -2; i < 2; i = i + 1) begin
index = ((y_orgin + j) * 8 + (x_orgin + i)) * 8;
if (CACHE_IN[index +: 8] < min_val)
min_val = CACHE_IN[index +: 8];
end
end
compute_min = min_val;
end
endfunction
// compute_max: returns the largest byte in the window.
function [7:0] compute_max(input [511:0] CACHE_IN, input [3:0] x_orgin, input [3:0] y_orgin);
reg [7:0] max_val;
integer i, j ;
integer index ;
begin
// Seed with the top-left byte of the window
index = ((y_orgin - 2) * 8 + (x_orgin - 2)) * 8;
max_val = CACHE_IN[index +: 8];
for (j = -2; j < 2; j = j + 1) begin
for (i = -2; i < 2; i = i + 1) begin
index = ((y_orgin + j) * 8 + (x_orgin + i)) * 8;
if (CACHE_IN[index +: 8] > max_val)
max_val = CACHE_IN[index +: 8];
end
end
compute_max = max_val;
end
endfunction
// compute_average: returns the sum of the 16 window bytes divided by 16
// (integer division), truncated to 8 bits by the function's return width.
function [7:0] compute_average(input [511:0] CACHE_IN, input [3:0] x_orgin, input [3:0] y_orgin);
integer sum ;
integer i, j ;
integer index;
begin
sum = 0;
for (j = -2; j < 2; j = j + 1) begin
for (i = -2; i < 2; i = i + 1) begin
index = ((y_orgin + j) * 8 + (x_orgin + i)) * 8;
sum = sum + CACHE_IN[index +: 8];
end
end
compute_average = sum / 16;
end
endfunction
endmodule
```
以下是修正版可以通過所有測資,
首先一開始一直認為是邊界條件沒設完,找了整天
`崩╰(〒皿〒)╯潰▃▄▅▆◣`
如果沒有tb3有可能就完全無法發現這個bug,所以多善用`$display`驗證自己的思路。
```verilog=
// 可以通過tb1/tb2/tb3/tb4/tb5
// LCD_CTRL, revised version.
// Reads image bytes from IROM into the 512-bit CACHE_flat register, applies
// shift / max / min / average commands to a 4x4 window of that cache, and
// copies the cache to IRAM when the save command (cmd == 0) arrives.
// Two extra states (IDLE, READY) run before FETCH so that one cycle passes
// after reset and one more cycle raises IROM_rd before addresses are issued.
// NOTE(review): CACHE_flat, rom_addr_cnt and ram_sw_counter are assigned from
// both a posedge-clk block and a negedge-clk block; this simulates, but
// confirm whether the target flow accepts it.
module LCD_CTRL(
input clk,
input rst,
input [3:0] cmd,
input cmd_valid,
input [7:0] IROM_Q, // image data received from IROM
output reg IROM_rd, // IROM read enable
output reg [5:0] IROM_A, // IROM address
output reg IRAM_ceb, // IRAM enable (active high)
output reg IRAM_web, // IRAM read/write select (0: write, 1: read)
output reg [7:0] IRAM_D, // data to be written to IRAM
output reg [5:0] IRAM_A, // IRAM address
input [7:0] IRAM_Q, // data read back from IRAM (not referenced in this module)
output busy,
output done
);
/////////////////////////////////
// State encoding
/////////////////////////////////
parameter IDLE = 0;
parameter READY = 1;
parameter FETCH = 2;
parameter OFFSET_ORGN = 3;
parameter PROCESS = 4;
parameter UPDATE = 5;
parameter SAVE_TORAM = 6;
parameter DONE = 7;
reg [3:0] current_state, next_state; // 4-bit state registers
//-----------------------------------------
// Internal cache: 64 bytes stored as one flat 512-bit reg vector
//-----------------------------------------
reg [511:0] CACHE_flat;
//-----------------------------------------
// Internal registers and counters
//-----------------------------------------
reg [6:0] rom_addr_cnt; // IROM read-address counter used in FETCH (compared against 65 below)
reg [6:0] cache_wirte_addr; // copy of rom_addr_cnt; the cache write index is this value minus 1
reg [6:0] ram_sw_counter; // IRAM write counter used in SAVE_TORAM (compared against 65 below)
reg [6:0] cache_read_addr; // only reset below; never read in this module
//-----------------------------------------
// Operation point (reset value (4,4))
// The shift logic below keeps op_x / op_y within 2..6
//-----------------------------------------
reg [3:0] op_x, op_y;
//-----------------------------------------
// Result computed in the PROCESS state, written back in UPDATE
//-----------------------------------------
reg [7:0] kernel_result;
// done and busy signals: busy is low only while waiting for a command
assign done = (current_state == DONE);
assign busy = (current_state != OFFSET_ORGN);
//---------------------------------------------------------------------
// State machine: combinational next-state logic
//---------------------------------------------------------------------
always @(*) begin
case(current_state)
IDLE :next_state = READY;
READY:next_state = FETCH;
FETCH: begin
// Stay in FETCH until the address counter reaches 65
if (rom_addr_cnt == 7'd65)
next_state = OFFSET_ORGN;
else
next_state = FETCH;
end
OFFSET_ORGN: begin
if (cmd_valid) begin
case(cmd)
4'd0: next_state = SAVE_TORAM; // save command
4'd1: next_state = OFFSET_ORGN; // shift up
4'd2: next_state = OFFSET_ORGN; // shift down
4'd3: next_state = OFFSET_ORGN; // shift left
4'd4: next_state = OFFSET_ORGN; // shift right
4'd5: next_state = PROCESS; // Max
4'd6: next_state = PROCESS; // Min
4'd7: next_state = PROCESS; // Average
default: next_state = OFFSET_ORGN; // any other command code keeps the state
endcase
end else begin
next_state = OFFSET_ORGN;
end
end
PROCESS: next_state = UPDATE;
UPDATE: next_state = OFFSET_ORGN;
SAVE_TORAM: begin
// Leave SAVE_TORAM once the write counter reaches 65
if (ram_sw_counter == 7'd65)
next_state = DONE;
else
next_state = SAVE_TORAM;
end
DONE: next_state = DONE;
default: next_state = READY;
endcase
end
//---------------------------------------------------------------------
// State machine: sequential logic (rising clock edge, asynchronous reset)
//---------------------------------------------------------------------
integer idx;
integer i, j; // loop variables
reg cmd_handled; // declared but not referenced in this module
// State register
always@(posedge clk or posedge rst)begin
if (rst)begin
current_state <= IDLE;
end
else begin
current_state <= next_state;
end
end
// Datapath and memory-interface registers
always @(posedge clk or posedge rst) begin
if (rst) begin
rom_addr_cnt <= 7'd0;
cache_wirte_addr <= 7'd0;
cache_read_addr <= 7'd0;
op_x <= 4'd4;
op_y <= 4'd4;
kernel_result <= 8'd0;
ram_sw_counter <= 7'd0;
// Clear all 64 cache bytes
for (idx = 0; idx < 64; idx = idx + 1) begin
CACHE_flat[idx*8 +: 8] <= 8'd0;
end
IROM_rd <= 1'b0;
IROM_A <= 6'd0;
IRAM_ceb <= 1'b0;
IRAM_web <= 1'b1; // default to read mode
IRAM_D <= 8'd0;
IRAM_A <= 6'd0;
end else begin
case(current_state)
IDLE : ;
READY:begin
// Raise the read enable one cycle before the first address is issued
IROM_rd <= 1'b1;
end
FETCH: begin
// Issue the current address; the counter itself advances on the falling edge
IROM_A <= rom_addr_cnt;
cache_wirte_addr <= rom_addr_cnt;
end
OFFSET_ORGN: begin
IROM_rd <= 1'b0; // FETCH is over
// NOTE(review): the shift is applied based on cmd alone; cmd_valid is not checked here
case(cmd)
4'd1: if (op_y > 4'd2) op_y <= op_y - 1; // Shift Up
4'd2: if (op_y < 4'd6) op_y <= op_y + 1; // Shift Down
4'd3: if (op_x > 4'd2) op_x <= op_x - 1; // Shift Left
4'd4: if (op_x < 4'd6) op_x <= op_x + 1; // Shift Right
default: ;
endcase
end
PROCESS: begin
// cmd is sampled again here, one cycle after it was accepted in OFFSET_ORGN
case(cmd)
4'd5: kernel_result <= compute_max(CACHE_flat, op_x, op_y);
4'd6: kernel_result <= compute_min(CACHE_flat, op_x, op_y);
4'd7: kernel_result <= compute_average(CACHE_flat, op_x, op_y);
default: kernel_result <= 8'd0;
endcase
end
UPDATE: begin
// Overwrite the 4x4 window (rows op_y-2..op_y+1, columns op_x-2..op_x+1) with kernel_result
for (j = -2; j < 2; j = j + 1) begin
for (i = -2; i < 2; i = i + 1) begin
CACHE_flat[((op_y + j)*8 + (op_x + i))*8 +: 8] <= kernel_result;
end
end
end
SAVE_TORAM: begin
IRAM_ceb <= 1'b1;
IRAM_web <= 1'b0; // write mode
// Write index is the counter minus 1; the guard skips counter == 0 (where the
// unsigned subtraction wraps) and counter values above 64
if (ram_sw_counter-1 < 64)begin
IRAM_A <= ram_sw_counter-1;
//$display("[save] index :%d, val : %d" , (ram_sw_counter-1) , CACHE_flat[(ram_sw_counter-1)*8 +: 8]);
IRAM_D <= CACHE_flat[(ram_sw_counter-1)*8 +: 8];
end
end
DONE: begin
IRAM_ceb <= 1'b0;
IRAM_web <= 1'b1; // back to read mode
end
default: ;
endcase
end
end
//---------------------------------------------------------------------
// Falling edge: capture the IROM data byte into CACHE_flat and advance
// the IROM address counter
//---------------------------------------------------------------------
always @(negedge clk) begin
// Skip the write while cache_wirte_addr is still 0, so index (addr - 1) never underflows
if (current_state == FETCH && IROM_rd && (cache_wirte_addr > 0)) begin
CACHE_flat[(cache_wirte_addr-1)*8 +: 8] <= IROM_Q;
//$display("[load] index :%d, val : %d" , (cache_wirte_addr-1), IROM_Q);
end
if (current_state == FETCH && IROM_rd) rom_addr_cnt <= rom_addr_cnt + 1;
end
//---------------------------------------------------------------------
// Falling edge: advance the IRAM write counter while saving
//---------------------------------------------------------------------
always @(negedge clk) begin
if (current_state == SAVE_TORAM) begin
ram_sw_counter <= ram_sw_counter + 1;
end
end
//-----------------------------------------------------
// Functions: min, max and average over the 4x4 window of a flat cache image.
// The window covers rows y_orgin-2..y_orgin+1 and columns x_orgin-2..x_orgin+1;
// the byte at (row, col) starts at bit offset (row*8 + col)*8.
//-----------------------------------------------------
// compute_min: returns the smallest byte in the window.
function [7:0] compute_min(input [511:0] CACHE_IN, input [3:0] x_orgin, input [3:0] y_orgin);
reg [7:0] min_val;
integer i, j ;
integer index ;
begin
// Seed with the top-left byte of the window
index = ((y_orgin - 2) * 8 + (x_orgin - 2)) * 8;
min_val = CACHE_IN[index +: 8];
// $display("Initial: x_orgin=%d, y_orgin=%d, index=%d, value=%d", x_orgin, y_orgin, index, CACHE_IN[index +: 8]);
for (j = -2; j < 2; j = j + 1) begin
for (i = -2; i < 2; i = i + 1) begin
index = ((y_orgin + j) * 8 + (x_orgin + i)) * 8;
if (CACHE_IN[index +: 8] < min_val)
min_val = CACHE_IN[index +: 8];
end
end
compute_min = min_val;
end
endfunction
// compute_max: returns the largest byte in the window.
function [7:0] compute_max(input [511:0] CACHE_IN, input [3:0] x_orgin, input [3:0] y_orgin);
reg [7:0] max_val;
integer i, j ;
integer index ;
begin
// Seed with the top-left byte of the window
index = ((y_orgin - 2) * 8 + (x_orgin - 2)) * 8;
max_val = CACHE_IN[index +: 8];
for (j = -2; j < 2; j = j + 1) begin
for (i = -2; i < 2; i = i + 1) begin
index = ((y_orgin + j) * 8 + (x_orgin + i)) * 8;
if (CACHE_IN[index +: 8] > max_val)
max_val = CACHE_IN[index +: 8];
end
end
compute_max = max_val;
end
endfunction
// compute_average: returns the sum of the 16 window bytes divided by 16
// (integer division), truncated to 8 bits by the function's return width.
function [7:0] compute_average(input [511:0] CACHE_IN, input [3:0] x_orgin, input [3:0] y_orgin);
integer sum ;
integer i, j ;
integer index;
begin
sum = 0;
for (j = -2; j < 2; j = j + 1) begin
for (i = -2; i < 2; i = i + 1) begin
index = ((y_orgin + j) * 8 + (x_orgin + i)) * 8;
sum = sum + CACHE_IN[index +: 8];
end
end
compute_average = sum / 16;
end
endfunction
endmodule
```
#### Result
```tex Result
tb1 : 1355 ns
tb2 : 1405 ns
tb3 : 1445 ns
tb4 : 1485 ns
tb5 : 1565 ns
```
### waveform

### 2025-HW3
本題助教給的講義有坑,直接看table就好

```verilog=
// FFT: top-level wrapper around the FFTCAL sub-module.
// Collects 16 samples from fir_d while fir_valid is high, hands them to
// FFTCAL, and then presents bits [23:8] of the 16 real results followed by
// bits [23:8] of the 16 imaginary results on fft_d0..fft_d15 with fft_valid
// high. done is raised after the FSM reaches the DONE state.
module FFT(
input clk,
input rst,
input [15:0] fir_d,
input fir_valid,
output fft_valid,
output done,
output [15:0] fft_d0,
output [15:0] fft_d1,
output [15:0] fft_d2,
output [15:0] fft_d3,
output [15:0] fft_d4,
output [15:0] fft_d5,
output [15:0] fft_d6,
output [15:0] fft_d7,
output [15:0] fft_d8,
output [15:0] fft_d9,
output [15:0] fft_d10,
output [15:0] fft_d11,
output [15:0] fft_d12,
output [15:0] fft_d13,
output [15:0] fft_d14,
output [15:0] fft_d15
);
// Input buffer: receives 16 FIR samples
reg [15:0] data_mem[0:15];
reg [3:0] input_count; // write index; 4 bits, so it wraps back to 0 after 15
reg data_pass;
integer idx;
always @(posedge clk or posedge rst) begin
if (rst) begin
input_count <= 4'd0;
for (idx = 0; idx < 16; idx = idx + 1)
data_mem[idx] <= 16'd0;
end else if (fir_valid) begin
data_mem[input_count] <= fir_d;
input_count <= input_count + 4'd1;
end
end
// data_pass is high for the one cycle following the write of the 16th sample
always @(posedge clk or posedge rst) begin
if (rst)
data_pass <= 1'b0;
else
data_pass <= (fir_valid && input_count == 4'd15);
end
// FSM state encoding
parameter IDLE = 3'd0,
PROCESS = 3'd1,
OUTPUT_R = 3'd2,
OUTPUT_I = 3'd3,
DONE = 3'd4;
reg [2:0] state, next_state;
// Snapshot of the 16 samples passed to the sub-module, and its 32-bit results
reg [15:0] process_data[0:15];
wire [31:0] fft_real[0:15];
wire [31:0] fft_imag[0:15];
// Sub-module instance
FFTCAL u_fftcal (
.x0 (process_data[0]) , .x1 (process_data[1]) , .x2 (process_data[2]) , .x3 (process_data[3]) ,
.x4 (process_data[4]) , .x5 (process_data[5]) , .x6 (process_data[6]) , .x7 (process_data[7]) ,
.x8 (process_data[8]) , .x9 (process_data[9]) , .x10(process_data[10]) , .x11(process_data[11]) ,
.x12(process_data[12]) , .x13(process_data[13]) , .x14(process_data[14]) , .x15(process_data[15]) ,
.y_real_0 (fft_real[0]) , .y_real_1 (fft_real[1]) , .y_real_2 (fft_real[2]) , .y_real_3 (fft_real[3]) ,
.y_real_4 (fft_real[4]) , .y_real_5 (fft_real[5]) , .y_real_6 (fft_real[6]) , .y_real_7 (fft_real[7]) ,
.y_real_8 (fft_real[8]) , .y_real_9 (fft_real[9]) , .y_real_10(fft_real[10]), .y_real_11(fft_real[11]),
.y_real_12(fft_real[12]), .y_real_13(fft_real[13]), .y_real_14(fft_real[14]), .y_real_15(fft_real[15]),
.y_imag_0 (fft_imag[0]) , .y_imag_1 (fft_imag[1]) , .y_imag_2 (fft_imag[2]) , .y_imag_3 (fft_imag[3]) ,
.y_imag_4 (fft_imag[4]) , .y_imag_5 (fft_imag[5]) , .y_imag_6 (fft_imag[6]) , .y_imag_7 (fft_imag[7]) ,
.y_imag_8 (fft_imag[8]) , .y_imag_9 (fft_imag[9]) , .y_imag_10(fft_imag[10]), .y_imag_11(fft_imag[11]),
.y_imag_12(fft_imag[12]), .y_imag_13(fft_imag[13]), .y_imag_14(fft_imag[14]), .y_imag_15(fft_imag[15])
);
// FSM next-state logic
always @(*) begin
case (state)
IDLE : next_state = data_pass ? PROCESS : IDLE;
PROCESS : next_state = OUTPUT_R;
OUTPUT_R: next_state = OUTPUT_I;
// More input still arriving -> wait for the next 16 samples; otherwise finish
OUTPUT_I: next_state = fir_valid ? IDLE : DONE;
DONE : next_state = DONE;
default: next_state = IDLE;
endcase
end
// FSM state register
always @(posedge clk or posedge rst) begin
if (rst)
state <= IDLE;
else
state <= next_state;
end
// Output control
reg [15:0] out_data[0:15];
reg fft_valid_reg, done_reg;
assign fft_valid = fft_valid_reg;// fft_valid could instead be derived combinationally from state
assign done = done_reg;
integer i;
always @(posedge clk or posedge rst) begin
if (rst) begin
fft_valid_reg <= 1'b0;
done_reg <= 1'b0;
for (i = 0; i < 16; i = i + 1) begin
process_data[i] <= 16'd0;
out_data[i] <= 16'd0;
end
end else begin
case (state)
IDLE :begin
fft_valid_reg <= 1'b0;
// Snapshot the input buffer so later samples cannot disturb the computation
if (data_pass) begin
for (i = 0; i < 16; i = i + 1)
process_data[i] <= data_mem[i];
end
end
PROCESS: begin
// Register the real parts; they appear on the outputs during OUTPUT_R
fft_valid_reg <= 1'b1;
for (i = 0; i < 16; i = i + 1)
out_data[i] <= fft_real[i][23:8];
end
OUTPUT_R: begin
// Register the imaginary parts; they appear on the outputs during OUTPUT_I
fft_valid_reg <= 1'b1;
for (i = 0; i < 16; i = i + 1)
out_data[i] <= fft_imag[i][23:8];
end
OUTPUT_I: begin
fft_valid_reg <= 1'b0;
end
DONE: begin
fft_valid_reg <= 1'b0;
done_reg <= 1'b1;
end
default: begin
fft_valid_reg <= 1'b0;
done_reg <= 1'b0;
end
endcase
end
end
// Output mapping: out_data[k] drives fft_d<r>, where r is the 4-bit
// bit-reversal of k (e.g. 1 -> 8, 2 -> 4, 3 -> 12)
assign fft_d0 = out_data[0];
assign fft_d8 = out_data[1];
assign fft_d4 = out_data[2];
assign fft_d12 = out_data[3];
assign fft_d2 = out_data[4];
assign fft_d10 = out_data[5];
assign fft_d6 = out_data[6];
assign fft_d14 = out_data[7];
assign fft_d1 = out_data[8];
assign fft_d9 = out_data[9];
assign fft_d5 = out_data[10];
assign fft_d13 = out_data[11];
assign fft_d3 = out_data[12];
assign fft_d11 = out_data[13];
assign fft_d7 = out_data[14];
assign fft_d15 = out_data[15];
endmodule
```
```verilog=
//fft module
module FFTCAL (
input [15:0] x0,
input [15:0] x1,
input [15:0] x2,
input [15:0] x3,
input [15:0] x4,
input [15:0] x5,
input [15:0] x6,
input [15:0] x7,
input [15:0] x8,
input [15:0] x9,
input [15:0] x10,
input [15:0] x11,
input [15:0] x12,
input [15:0] x13,
input [15:0] x14,
input [15:0] x15,
output [31:0] y_real_0,
output [31:0] y_real_1,
output [31:0] y_real_2,
output [31:0] y_real_3,
output [31:0] y_real_4,
output [31:0] y_real_5,
output [31:0] y_real_6,
output [31:0] y_real_7,
output [31:0] y_real_8,
output [31:0] y_real_9,
output [31:0] y_real_10,
output [31:0] y_real_11,
output [31:0] y_real_12,
output [31:0] y_real_13,
output [31:0] y_real_14,
output [31:0] y_real_15,
output [31:0] y_imag_0,
output [31:0] y_imag_1,
output [31:0] y_imag_2,
output [31:0] y_imag_3,
output [31:0] y_imag_4,
output [31:0] y_imag_5,
output [31:0] y_imag_6,
output [31:0] y_imag_7,
output [31:0] y_imag_8,
output [31:0] y_imag_9,
output [31:0] y_imag_10,
output [31:0] y_imag_11,
output [31:0] y_imag_12,
output [31:0] y_imag_13,
output [31:0] y_imag_14,
output [31:0] y_imag_15
);
//00000000
//FFFFFFFF
parameter signed [63:0] W0_real = 64'h0000000000010000;
parameter signed [63:0] W0_imag = 64'h0000000000000000;
parameter signed [63:0] W1_real = 64'h000000000000EC83;
parameter signed [63:0] W1_imag = 64'hFFFFFFFFFFFF9E09;
parameter signed [63:0] W2_real = 64'h000000000000B504;
parameter signed [63:0] W2_imag = 64'hFFFFFFFFFFFF4AFC;
parameter signed [63:0] W3_real = 64'h00000000000061F7;
parameter signed [63:0] W3_imag = 64'hFFFFFFFFFFFF137D;
parameter signed [63:0] W4_real = 64'h0000000000000000;
parameter signed [63:0] W4_imag = 64'hFFFFFFFFFFFF0000;
parameter signed [63:0] W5_real = 64'hFFFFFFFFFFFF9E09;
parameter signed [63:0] W5_imag = 64'hFFFFFFFFFFFF137D;
parameter signed [63:0] W6_real = 64'hFFFFFFFFFFFF4AFC;
parameter signed [63:0] W6_imag = 64'hFFFFFFFFFFFF4AFC;
parameter signed [63:0] W7_real = 64'hFFFFFFFFFFFF137D;
parameter signed [63:0] W7_imag = 64'hFFFFFFFFFFFF9E09;
// ------------------------------
// Stage 0: sign‑extend + zero padding
// ------------------------------
reg [63:0] real_stage0 [0:15];
reg [63:0] imag_stage0 [0:15];
always @(*) begin
real_stage0[0] = { {40{x0[15]}}, x0, 8'd0 }; imag_stage0[0] = 64'd0;
real_stage0[1] = { {40{x1[15]}}, x1, 8'd0 }; imag_stage0[1] = 64'd0;
real_stage0[2] = { {40{x2[15]}}, x2, 8'd0 }; imag_stage0[2] = 64'd0;
real_stage0[3] = { {40{x3[15]}}, x3, 8'd0 }; imag_stage0[3] = 64'd0;
real_stage0[4] = { {40{x4[15]}}, x4, 8'd0 }; imag_stage0[4] = 64'd0;
real_stage0[5] = { {40{x5[15]}}, x5, 8'd0 }; imag_stage0[5] = 64'd0;
real_stage0[6] = { {40{x6[15]}}, x6, 8'd0 }; imag_stage0[6] = 64'd0;
real_stage0[7] = { {40{x7[15]}}, x7, 8'd0 }; imag_stage0[7] = 64'd0;
real_stage0[8] = { {40{x8[15]}}, x8, 8'd0 }; imag_stage0[8] = 64'd0;
real_stage0[9] = { {40{x9[15]}}, x9, 8'd0 }; imag_stage0[9] = 64'd0;
real_stage0[10] = { {40{x10[15]}}, x10, 8'd0 }; imag_stage0[10] = 64'd0;
real_stage0[11] = { {40{x11[15]}}, x11, 8'd0 }; imag_stage0[11] = 64'd0;
real_stage0[12] = { {40{x12[15]}}, x12, 8'd0 }; imag_stage0[12] = 64'd0;
real_stage0[13] = { {40{x13[15]}}, x13, 8'd0 }; imag_stage0[13] = 64'd0;
real_stage0[14] = { {40{x14[15]}}, x14, 8'd0 }; imag_stage0[14] = 64'd0;
real_stage0[15] = { {40{x15[15]}}, x15, 8'd0 }; imag_stage0[15] = 64'd0;
end
// ------------------------------
// Stage 1: butterfly 0↔8, 1↔9, …, 7↔15
// ------------------------------
// Stage 1 results. For every pair (i, i+8), i = 0..7:
//   index i   <- stage0[i] + stage0[i+8]                      (plain sum)
//   index i+8 <- ((stage0[i] - stage0[i+8]) * Wi) >>> 16      (complex product)
// where Wi is the complex constant (Wi_real, Wi_imag) declared elsewhere in this module.
reg [63:0] real_stage1 [0:15];
reg [63:0] imag_stage1 [0:15];
// Scratch registers: a = real difference, b = imaginary difference, c = -b,
// m / n = real / imaginary part of the complex product before scaling.
// a1/b1/c1 are 32 bits wide, so the 64-bit differences are truncated to their low 32 bits.
// NOTE(review): the >>> 16 presumably removes 16 fractional bits carried by the W constants - confirm against their definition.
reg signed [31:0] a1, b1, c1;
reg signed [63:0] m1, n1;
// The scratch registers are reused for every pair with blocking assignments,
// so the statements of one pair must stay together and in this order.
always @(*) begin
// pair 0 & 8
real_stage1[0] = real_stage0[0] + real_stage0[8];
imag_stage1[0] = imag_stage0[0] + imag_stage0[8];
a1 = $signed(real_stage0[0]) - $signed(real_stage0[8]);
b1 = $signed(imag_stage0[0]) - $signed(imag_stage0[8]);
// c1 = -b1, so the real part a*Wr - b*Wi can be written as a sum below
c1 = $signed(imag_stage0[8]) - $signed(imag_stage0[0]);
m1 = a1*W0_real + c1*W0_imag;
n1 = a1*W0_imag + b1*W0_real;
real_stage1[8] = m1 >>>16;
imag_stage1[8] = n1 >>>16;
// pair 1 & 9
real_stage1[1] = real_stage0[1] + real_stage0[9];
imag_stage1[1] = imag_stage0[1] + imag_stage0[9];
a1 = $signed(real_stage0[1]) - $signed(real_stage0[9]);
b1 = $signed(imag_stage0[1]) - $signed(imag_stage0[9]);
c1 = $signed(imag_stage0[9]) - $signed(imag_stage0[1]);
m1 = a1*W1_real + c1*W1_imag;
n1 = a1*W1_imag + b1*W1_real;
real_stage1[9] = m1 >>>16;
imag_stage1[9] = n1 >>>16;
// pair 2 & 10
real_stage1[2] = real_stage0[2] + real_stage0[10];
imag_stage1[2] = imag_stage0[2] + imag_stage0[10];
a1 = $signed(real_stage0[2]) - $signed(real_stage0[10]);
b1 = $signed(imag_stage0[2]) - $signed(imag_stage0[10]);
c1 = $signed(imag_stage0[10]) - $signed(imag_stage0[2]);
m1 = a1*W2_real + c1*W2_imag;
n1 = a1*W2_imag + b1*W2_real;
real_stage1[10] = m1 >>> 16;
imag_stage1[10] = n1 >>> 16;
// pair 3 & 11
real_stage1[3] = real_stage0[3] + real_stage0[11];
imag_stage1[3] = imag_stage0[3] + imag_stage0[11];
a1 = $signed(real_stage0[3]) - $signed(real_stage0[11]);
b1 = $signed(imag_stage0[3]) - $signed(imag_stage0[11]);
c1 = $signed(imag_stage0[11]) - $signed(imag_stage0[3]);
m1 = a1*W3_real + c1*W3_imag;
n1 = a1*W3_imag + b1*W3_real;
real_stage1[11] = m1 >>>16;
imag_stage1[11] = n1 >>>16;
// pair 4 & 12
real_stage1[4] = real_stage0[4] + real_stage0[12];
imag_stage1[4] = imag_stage0[4] + imag_stage0[12];
a1 = $signed(real_stage0[4]) - $signed(real_stage0[12]);
b1 = $signed(imag_stage0[4]) - $signed(imag_stage0[12]);
c1 = $signed(imag_stage0[12]) - $signed(imag_stage0[4]);
m1 = a1*W4_real + c1*W4_imag;
n1 = a1*W4_imag + b1*W4_real;
real_stage1[12] = m1 >>>16;
imag_stage1[12] = n1 >>>16;
// pair 5 & 13
real_stage1[5] = real_stage0[5] + real_stage0[13];
imag_stage1[5] = imag_stage0[5] + imag_stage0[13];
a1 = $signed(real_stage0[5]) - $signed(real_stage0[13]);
b1 = $signed(imag_stage0[5]) - $signed(imag_stage0[13]);
c1 = $signed(imag_stage0[13]) - $signed(imag_stage0[5]);
m1 = a1*W5_real + c1*W5_imag;
n1 = a1*W5_imag + b1*W5_real;
real_stage1[13] = m1 >>>16;
imag_stage1[13] = n1 >>>16;
// pair 6 & 14
real_stage1[6] = real_stage0[6] + real_stage0[14];
imag_stage1[6] = imag_stage0[6] + imag_stage0[14];
a1 = $signed(real_stage0[6]) - $signed(real_stage0[14]);
b1 = $signed(imag_stage0[6]) - $signed(imag_stage0[14]);
c1 = $signed(imag_stage0[14]) - $signed(imag_stage0[6]);
m1 = a1*W6_real + c1*W6_imag;
n1 = a1*W6_imag + b1*W6_real;
real_stage1[14] = m1 >>> 16;
imag_stage1[14] = n1 >>> 16;
// pair 7 & 15
real_stage1[7] = real_stage0[7] + real_stage0[15];
imag_stage1[7] = imag_stage0[7] + imag_stage0[15];
a1 = $signed(real_stage0[7]) - $signed(real_stage0[15]);
b1 = $signed(imag_stage0[7]) - $signed(imag_stage0[15]);
c1 = $signed(imag_stage0[15]) - $signed(imag_stage0[7]);
m1 = a1*W7_real + c1*W7_imag;
n1 = a1*W7_imag + b1*W7_real;
real_stage1[15] = m1 >>> 16;
imag_stage1[15] = n1 >>> 16;
end
// ------------------------------
// Stage 2: stride = 4, pairs (0,4),(1,5),(2,6),(3,7),(8,12),(9,13),(10,14),(11,15)
// ------------------------------
// Stage 2 results. Pairs are (i, i+4) inside each half: the lower index gets the sum,
// the upper index gets ((difference) * W) >>> 16. The constants used are W0, W2, W4, W6
// for the first, second, third and fourth pair of each half respectively.
reg [63:0] real_stage2[0:15], imag_stage2[0:15];
// Scratch registers: a = real difference, b = imaginary difference, c = -b,
// m / n = real / imaginary part of the complex product before scaling.
// a2/b2/c2 are 32 bits wide, so the 64-bit differences are truncated to their low 32 bits.
reg signed [31:0] a2, b2, c2;
reg signed [63:0] m2, n2;
// Scratch registers are shared by all pairs (blocking assignments); keep each pair's statements in order.
always @(*) begin
// pair 0 & 4
real_stage2[0] = real_stage1[0] + real_stage1[4];
imag_stage2[0] = imag_stage1[0] + imag_stage1[4];
a2 = $signed(real_stage1[0]) - $signed(real_stage1[4]);
b2 = $signed(imag_stage1[0]) - $signed(imag_stage1[4]);
// c2 = -b2, so the real part a*Wr - b*Wi can be written as a sum below
c2 = $signed(imag_stage1[4]) - $signed(imag_stage1[0]);
m2 = a2*W0_real + c2*W0_imag;
n2 = a2*W0_imag + b2*W0_real;
real_stage2[4] = m2 >>> 16;
imag_stage2[4] = n2 >>> 16;
// pair 1 & 5
real_stage2[1] = real_stage1[1] + real_stage1[5];
imag_stage2[1] = imag_stage1[1] + imag_stage1[5];
a2 = $signed(real_stage1[1]) - $signed(real_stage1[5]);
b2 = $signed(imag_stage1[1]) - $signed(imag_stage1[5]);
c2 = $signed(imag_stage1[5]) - $signed(imag_stage1[1]);
m2 = a2*W2_real + c2*W2_imag;
n2 = a2*W2_imag + b2*W2_real;
real_stage2[5] = m2 >>> 16;
imag_stage2[5] = n2 >>> 16;
// pair 2 & 6
real_stage2[2] = real_stage1[2] + real_stage1[6];
imag_stage2[2] = imag_stage1[2] + imag_stage1[6];
a2 = $signed(real_stage1[2]) - $signed(real_stage1[6]);
b2 = $signed(imag_stage1[2]) - $signed(imag_stage1[6]);
c2 = $signed(imag_stage1[6]) - $signed(imag_stage1[2]);
m2 = a2*W4_real + c2*W4_imag;
n2 = a2*W4_imag + b2*W4_real;
real_stage2[6] = m2 >>> 16;
imag_stage2[6] = n2 >>> 16;
// pair 3 & 7
real_stage2[3] = real_stage1[3] + real_stage1[7];
imag_stage2[3] = imag_stage1[3] + imag_stage1[7];
a2 = $signed(real_stage1[3]) - $signed(real_stage1[7]);
b2 = $signed(imag_stage1[3]) - $signed(imag_stage1[7]);
c2 = $signed(imag_stage1[7]) - $signed(imag_stage1[3]);
m2 = a2*W6_real + c2*W6_imag;
n2 = a2*W6_imag + b2*W6_real;
real_stage2[7] = m2 >>> 16;
imag_stage2[7] = n2 >>> 16;
// pair 8 & 12
real_stage2[8] = real_stage1[8] + real_stage1[12];
imag_stage2[8] = imag_stage1[8] + imag_stage1[12];
a2 = $signed(real_stage1[8]) - $signed(real_stage1[12]);
b2 = $signed(imag_stage1[8]) - $signed(imag_stage1[12]);
c2 = $signed(imag_stage1[12]) - $signed(imag_stage1[8]);
m2 = a2*W0_real + c2*W0_imag;
n2 = a2*W0_imag + b2*W0_real;
real_stage2[12] = m2 >>> 16;
imag_stage2[12] = n2 >>> 16;
// pair 9 & 13
real_stage2[9] = real_stage1[9] + real_stage1[13];
imag_stage2[9] = imag_stage1[9] + imag_stage1[13];
a2 = $signed(real_stage1[9]) - $signed(real_stage1[13]);
b2 = $signed(imag_stage1[9]) - $signed(imag_stage1[13]);
c2 = $signed(imag_stage1[13]) - $signed(imag_stage1[9]);
m2 = a2*W2_real + c2*W2_imag;
n2 = a2*W2_imag + b2*W2_real;
real_stage2[13] = m2 >>> 16;
imag_stage2[13] = n2 >>> 16;
// pair 10 & 14
real_stage2[10] = real_stage1[10] + real_stage1[14];
imag_stage2[10] = imag_stage1[10] + imag_stage1[14];
a2 = $signed(real_stage1[10]) - $signed(real_stage1[14]);
b2 = $signed(imag_stage1[10]) - $signed(imag_stage1[14]);
c2 = $signed(imag_stage1[14]) - $signed(imag_stage1[10]);
m2 = a2*W4_real + c2*W4_imag;
n2 = a2*W4_imag + b2*W4_real;
real_stage2[14] = m2 >>> 16;
imag_stage2[14] = n2 >>> 16;
// pair 11 & 15
real_stage2[11] = real_stage1[11] + real_stage1[15];
imag_stage2[11] = imag_stage1[11] + imag_stage1[15];
a2 = $signed(real_stage1[11]) - $signed(real_stage1[15]);
b2 = $signed(imag_stage1[11]) - $signed(imag_stage1[15]);
c2 = $signed(imag_stage1[15]) - $signed(imag_stage1[11]);
m2 = a2*W6_real + c2*W6_imag;
n2 = a2*W6_imag + b2*W6_real;
real_stage2[15] = m2 >>> 16;
imag_stage2[15] = n2 >>> 16;
end
// ------------------------------
// Stage 3: stride = 2, pairs (0,2),(1,3),(4,6),(5,7),(8,10),(9,11),(12,14),(13,15)
// ------------------------------
// Stage 3 results. Pairs are (i, i+2) inside each group of four: the lower index gets the sum,
// the upper index gets ((difference) * W) >>> 16. The first pair of every group uses W0,
// the second pair uses W4.
reg [63:0] real_stage3[0:15], imag_stage3[0:15];
// Scratch registers: a = real difference, b = imaginary difference, c = -b,
// m / n = real / imaginary part of the complex product before scaling.
// a3/b3/c3 are 32 bits wide, so the 64-bit differences are truncated to their low 32 bits.
reg signed [31:0] a3, b3, c3;
reg signed [63:0] m3, n3;
// Scratch registers are shared by all pairs (blocking assignments); keep each pair's statements in order.
always @(*) begin
// pair 0 & 2
real_stage3[0] = real_stage2[0] + real_stage2[2];
imag_stage3[0] = imag_stage2[0] + imag_stage2[2];
a3 = $signed(real_stage2[0]) - $signed(real_stage2[2]);
b3 = $signed(imag_stage2[0]) - $signed(imag_stage2[2]);
// c3 = -b3, so the real part a*Wr - b*Wi can be written as a sum below
c3 = $signed(imag_stage2[2]) - $signed(imag_stage2[0]);
m3 = a3*W0_real + c3*W0_imag;
n3 = a3*W0_imag + b3*W0_real;
real_stage3[2] = m3 >>> 16;
imag_stage3[2] = n3 >>> 16;
// pair 1 & 3
real_stage3[1] = real_stage2[1] + real_stage2[3];
imag_stage3[1] = imag_stage2[1] + imag_stage2[3];
a3 = $signed(real_stage2[1]) - $signed(real_stage2[3]);
b3 = $signed(imag_stage2[1]) - $signed(imag_stage2[3]);
c3 = $signed(imag_stage2[3]) - $signed(imag_stage2[1]);
m3 = a3*W4_real + c3*W4_imag;
n3 = a3*W4_imag + b3*W4_real;
real_stage3[3] = m3 >>> 16;
imag_stage3[3] = n3 >>> 16;
// pair 4 & 6
real_stage3[4] = real_stage2[4] + real_stage2[6];
imag_stage3[4] = imag_stage2[4] + imag_stage2[6];
a3 = $signed(real_stage2[4]) - $signed(real_stage2[6]);
b3 = $signed(imag_stage2[4]) - $signed(imag_stage2[6]);
c3 = $signed(imag_stage2[6]) - $signed(imag_stage2[4]);
m3 = a3*W0_real + c3*W0_imag;
n3 = a3*W0_imag + b3*W0_real;
real_stage3[6] = m3 >>> 16;
imag_stage3[6] = n3 >>> 16;
// pair 5 & 7
real_stage3[5] = real_stage2[5] + real_stage2[7];
imag_stage3[5] = imag_stage2[5] + imag_stage2[7];
a3 = $signed(real_stage2[5]) - $signed(real_stage2[7]);
b3 = $signed(imag_stage2[5]) - $signed(imag_stage2[7]);
c3 = $signed(imag_stage2[7]) - $signed(imag_stage2[5]);
m3 = a3*W4_real + c3*W4_imag;
n3 = a3*W4_imag + b3*W4_real;
real_stage3[7] = m3 >>> 16;
imag_stage3[7] = n3 >>> 16;
// pair 8 & 10
real_stage3[8] = real_stage2[8] + real_stage2[10];
imag_stage3[8] = imag_stage2[8] + imag_stage2[10];
a3 = $signed(real_stage2[8]) - $signed(real_stage2[10]);
b3 = $signed(imag_stage2[8]) - $signed(imag_stage2[10]);
c3 = $signed(imag_stage2[10]) - $signed(imag_stage2[8]);
m3 = a3*W0_real + c3*W0_imag;
n3 = a3*W0_imag + b3*W0_real;
real_stage3[10] = m3 >>> 16;
imag_stage3[10] = n3 >>> 16;
// pair 9 & 11
real_stage3[9] = real_stage2[9] + real_stage2[11];
imag_stage3[9] = imag_stage2[9] + imag_stage2[11];
a3 = $signed(real_stage2[9]) - $signed(real_stage2[11]);
b3 = $signed(imag_stage2[9]) - $signed(imag_stage2[11]);
c3 = $signed(imag_stage2[11]) - $signed(imag_stage2[9]);
m3 = a3*W4_real + c3*W4_imag;
n3 = a3*W4_imag + b3*W4_real;
real_stage3[11] = m3 >>> 16;
imag_stage3[11] = n3 >>> 16;
// pair 12 & 14
real_stage3[12] = real_stage2[12] + real_stage2[14];
imag_stage3[12] = imag_stage2[12] + imag_stage2[14];
a3 = $signed(real_stage2[12]) - $signed(real_stage2[14]);
b3 = $signed(imag_stage2[12]) - $signed(imag_stage2[14]);
c3 = $signed(imag_stage2[14]) - $signed(imag_stage2[12]);
m3 = a3*W0_real + c3*W0_imag;
n3 = a3*W0_imag + b3*W0_real;
real_stage3[14] = m3 >>> 16;
imag_stage3[14] = n3 >>> 16;
// pair 13 & 15
real_stage3[13] = real_stage2[13] + real_stage2[15];
imag_stage3[13] = imag_stage2[13] + imag_stage2[15];
a3 = $signed(real_stage2[13]) - $signed(real_stage2[15]);
b3 = $signed(imag_stage2[13]) - $signed(imag_stage2[15]);
c3 = $signed(imag_stage2[15]) - $signed(imag_stage2[13]);
m3 = a3*W4_real + c3*W4_imag;
n3 = a3*W4_imag + b3*W4_real;
real_stage3[15] = m3 >>> 16;
imag_stage3[15] = n3 >>> 16;
end
// ------------------------------
// Stage 4: stride = 1, pairs (0,1),(2,3),(4,5),(6,7),(8,9),(10,11),(12,13),(14,15)
// ------------------------------
// Stage 4 results. Pairs are adjacent indices (2k, 2k+1): the even index gets the sum,
// the odd index gets ((difference) * W0) >>> 16.
reg [63:0] real_stage4[0:15], imag_stage4[0:15];
// Scratch registers: a = real difference, b = imaginary difference, c = -b,
// m / n = real / imaginary part of the complex product before scaling.
// a4/b4/c4 are 32 bits wide, so the 64-bit differences are truncated to their low 32 bits.
reg signed [31:0] a4, b4, c4;
reg signed [63:0] m4, n4;
// FIX: c4 must be the negated imaginary difference of the *stage 3* values (c4 = -b4),
// exactly like c1/c2/c3 in the previous stages. It used to be computed from imag_stage2,
// which only went unnoticed as long as W0_imag is zero.
// Scratch registers are shared by all pairs (blocking assignments); keep each pair's statements in order.
always @(*) begin
// pair 0 & 1
real_stage4[0] = real_stage3[0] + real_stage3[1];
imag_stage4[0] = imag_stage3[0] + imag_stage3[1];
a4 = $signed(real_stage3[0]) - $signed(real_stage3[1]);
b4 = $signed(imag_stage3[0]) - $signed(imag_stage3[1]);
c4 = $signed(imag_stage3[1]) - $signed(imag_stage3[0]);
m4 = a4*W0_real + c4*W0_imag;
n4 = a4*W0_imag + b4*W0_real;
real_stage4[1] = m4 >>> 16;
imag_stage4[1] = n4 >>> 16;
// pair 2 & 3
real_stage4[2] = real_stage3[2] + real_stage3[3];
imag_stage4[2] = imag_stage3[2] + imag_stage3[3];
a4 = $signed(real_stage3[2]) - $signed(real_stage3[3]);
b4 = $signed(imag_stage3[2]) - $signed(imag_stage3[3]);
c4 = $signed(imag_stage3[3]) - $signed(imag_stage3[2]);
m4 = a4*W0_real + c4*W0_imag;
n4 = a4*W0_imag + b4*W0_real;
real_stage4[3] = m4 >>> 16;
imag_stage4[3] = n4 >>> 16;
// pair 4 & 5
real_stage4[4] = real_stage3[4] + real_stage3[5];
imag_stage4[4] = imag_stage3[4] + imag_stage3[5];
a4 = $signed(real_stage3[4]) - $signed(real_stage3[5]);
b4 = $signed(imag_stage3[4]) - $signed(imag_stage3[5]);
c4 = $signed(imag_stage3[5]) - $signed(imag_stage3[4]);
m4 = a4*W0_real + c4*W0_imag;
n4 = a4*W0_imag + b4*W0_real;
real_stage4[5] = m4 >>> 16;
imag_stage4[5] = n4 >>> 16;
// pair 6 & 7
real_stage4[6] = real_stage3[6] + real_stage3[7];
imag_stage4[6] = imag_stage3[6] + imag_stage3[7];
a4 = $signed(real_stage3[6]) - $signed(real_stage3[7]);
b4 = $signed(imag_stage3[6]) - $signed(imag_stage3[7]);
c4 = $signed(imag_stage3[7]) - $signed(imag_stage3[6]);
m4 = a4*W0_real + c4*W0_imag;
n4 = a4*W0_imag + b4*W0_real;
real_stage4[7] = m4 >>> 16;
imag_stage4[7] = n4 >>> 16;
// pair 8 & 9
real_stage4[8] = real_stage3[8] + real_stage3[9];
imag_stage4[8] = imag_stage3[8] + imag_stage3[9];
a4 = $signed(real_stage3[8]) - $signed(real_stage3[9]);
b4 = $signed(imag_stage3[8]) - $signed(imag_stage3[9]);
c4 = $signed(imag_stage3[9]) - $signed(imag_stage3[8]);
m4 = a4*W0_real + c4*W0_imag;
n4 = a4*W0_imag + b4*W0_real;
real_stage4[9] = m4 >>> 16;
imag_stage4[9] = n4 >>> 16;
// pair 10 & 11
real_stage4[10] = real_stage3[10] + real_stage3[11];
imag_stage4[10] = imag_stage3[10] + imag_stage3[11];
a4 = $signed(real_stage3[10]) - $signed(real_stage3[11]);
b4 = $signed(imag_stage3[10]) - $signed(imag_stage3[11]);
c4 = $signed(imag_stage3[11]) - $signed(imag_stage3[10]);
m4 = a4*W0_real + c4*W0_imag;
n4 = a4*W0_imag + b4*W0_real;
real_stage4[11] = m4 >>> 16;
imag_stage4[11] = n4 >>> 16;
// pair 12 & 13
real_stage4[12] = real_stage3[12] + real_stage3[13];
imag_stage4[12] = imag_stage3[12] + imag_stage3[13];
a4 = $signed(real_stage3[12]) - $signed(real_stage3[13]);
b4 = $signed(imag_stage3[12]) - $signed(imag_stage3[13]);
c4 = $signed(imag_stage3[13]) - $signed(imag_stage3[12]);
m4 = a4*W0_real + c4*W0_imag;
n4 = a4*W0_imag + b4*W0_real;
real_stage4[13] = m4 >>> 16;
imag_stage4[13] = n4 >>> 16;
// pair 14 & 15
real_stage4[14] = real_stage3[14] + real_stage3[15];
imag_stage4[14] = imag_stage3[14] + imag_stage3[15];
a4 = $signed(real_stage3[14]) - $signed(real_stage3[15]);
b4 = $signed(imag_stage3[14]) - $signed(imag_stage3[15]);
c4 = $signed(imag_stage3[15]) - $signed(imag_stage3[14]);
m4 = a4*W0_real + c4*W0_imag;
n4 = a4*W0_imag + b4*W0_real;
real_stage4[15] = m4 >>> 16;
imag_stage4[15] = n4 >>> 16;
end
// ------------------------------
// Final output assignments
// ------------------------------
// Drive the module outputs from the stage-4 arrays: y_real_k / y_imag_k take the low
// 32 bits of element k (the upper 32 bits of each 64-bit word are dropped).
// Element k maps to output k directly; no index reordering is done here.
assign y_real_0 = real_stage4[0][31:0];
assign y_imag_0 = imag_stage4[0][31:0];
assign y_real_1 = real_stage4[1][31:0];
assign y_imag_1 = imag_stage4[1][31:0];
assign y_real_2 = real_stage4[2][31:0];
assign y_imag_2 = imag_stage4[2][31:0];
assign y_real_3 = real_stage4[3][31:0];
assign y_imag_3 = imag_stage4[3][31:0];
assign y_real_4 = real_stage4[4][31:0];
assign y_imag_4 = imag_stage4[4][31:0];
assign y_real_5 = real_stage4[5][31:0];
assign y_imag_5 = imag_stage4[5][31:0];
assign y_real_6 = real_stage4[6][31:0];
assign y_imag_6 = imag_stage4[6][31:0];
assign y_real_7 = real_stage4[7][31:0];
assign y_imag_7 = imag_stage4[7][31:0];
assign y_real_8 = real_stage4[8][31:0];
assign y_imag_8 = imag_stage4[8][31:0];
assign y_real_9 = real_stage4[9][31:0];
assign y_imag_9 = imag_stage4[9][31:0];
assign y_real_10 = real_stage4[10][31:0];
assign y_imag_10 = imag_stage4[10][31:0];
assign y_real_11 = real_stage4[11][31:0];
assign y_imag_11 = imag_stage4[11][31:0];
assign y_real_12 = real_stage4[12][31:0];
assign y_imag_12 = imag_stage4[12][31:0];
assign y_real_13 = real_stage4[13][31:0];
assign y_imag_13 = imag_stage4[13][31:0];
assign y_real_14 = real_stage4[14][31:0];
assign y_imag_14 = imag_stage4[14][31:0];
assign y_real_15 = real_stage4[15][31:0];
assign y_imag_15 = imag_stage4[15][31:0];
endmodule
```
#### functional simulation result

#### flow summary
```area = 3331 + 778 + 84 * 9 = 4865```

#### Pre-Layout simulation
```tex=
minimum clock period : 26 ns
Time : 26866 ns
cycles : 1033
```

### 2025-HW4
```verilog=
`timescale 1ns/10ps
// -----------------------------------------------------------------------------
// ATCONV
//   Pass 1: for each of the 4096 addresses {row[5:0], col[5:0]}, fetch nine
//           samples through iaddr/idata at offsets of +/-DILATION around the
//           centre (coordinates clamped to 0..63), multiply them by k0..k8,
//           shift each product right by 4, add bias, apply ReLU and write the
//           low 16 bits to layer0 at the centre address.
//   Pass 2: for each of the 1024 layer1 addresses, read a 2x2 window from
//           layer0, take the unsigned maximum, round it up to the next
//           multiple of 16 (low four bits cleared) and write it to layer1.
//   done goes high once the FSM reaches DONE and stays high.
//
// Changes against the previous revision:
//   * Pass 1 now stops after centre 4095 and pass 2 after address 1023. The old
//     terminal counts (4096 / 1024) produced one extra iteration each: layer0
//     address 0 was rewritten and layer1 was written at address 1024.
//   * layer1_A / layer1_D and next_state have defaults, so no latches are inferred.
//   * No non-blocking assignments remain in combinational blocks.
//   * Sample and pooling buffers are flat arrays indexed by their counters
//     (no /3 and %3 on the write index).
//   * Literal widths match the registers they are assigned to.
// -----------------------------------------------------------------------------
module ATCONV(
    input             clk       ,
    input             rst       ,
    output            ROM_rd    ,
    output     [11:0] iaddr     ,
    input      [15:0] idata     ,
    output            layer0_ceb,
    output            layer0_web,
    output reg [11:0] layer0_A  ,
    output reg [15:0] layer0_D  ,
    input      [15:0] layer0_Q  ,
    output            layer1_ceb,
    output            layer1_web,
    output reg [11:0] layer1_A  ,
    output reg [15:0] layer1_D  ,
    input      [15:0] layer1_Q  ,
    output            done
);
    // ------------------------------------------------------------------
    // FSM state encoding
    // ------------------------------------------------------------------
    parameter IDLE             = 4'd0;
    parameter LOAD_FROM_ROM    = 4'd1;
    parameter PROCESS          = 4'd2; // convolution + ReLU (combinational, one cycle)
    parameter SAVE_TO_LAYER0   = 4'd3;
    parameter LOAD_FROM_LAYER0 = 4'd4;
    parameter MAXPOOLING       = 4'd5;
    parameter SAVE_TO_LAYER1   = 4'd6;
    parameter DONE             = 4'd7;

    // Kernel coefficients and bias (16-bit signed constants)
    parameter signed [15:0] k0   = 16'hFFFF;
    parameter signed [15:0] k1   = 16'hFFFE;
    parameter signed [15:0] k2   = 16'hFFFF;
    parameter signed [15:0] k3   = 16'hFFFC;
    parameter signed [15:0] k4   = 16'h0010;
    parameter signed [15:0] k5   = 16'hFFFC;
    parameter signed [15:0] k6   = 16'hFFFF;
    parameter signed [15:0] k7   = 16'hFFFE;
    parameter signed [15:0] k8   = 16'hFFFF;
    parameter signed [15:0] bias = 16'hFFF4; // same value as before, literal now matches the declared width

    parameter DILATION = 2;                  // distance between neighbouring taps
    parameter [1:0]  STRIDE    = 2'd2;       // not referenced by the logic; kept so existing overrides still elaborate
    parameter [15:0] MASK_FRAC = 16'b0000_0000_0000_1111;
    parameter [15:0] MASK_INT  = 16'b1111_1111_1111_0000;

    reg [3:0] state, next_state;

    // ------------------------------------------------------------------
    // Pass 1 datapath: nine buffered samples -> multiply/accumulate -> ReLU
    // ------------------------------------------------------------------
    reg [15:0] tap [0:8];        // samples in row-major tap order, written at index kernel_cnt
    reg [3:0]  kernel_cnt;       // index of the tap currently being fetched
    reg [11:0] kernel_center;    // {row, col} of the pixel being computed

    // Each product is evaluated at 32 bits (signed) and shifted right by 4 before summing.
    wire signed [31:0] t0, t1, t2, t3, t4, t5, t6, t7, t8;
    assign t0 = ($signed(tap[0]) * k0) >>> 4;
    assign t1 = ($signed(tap[1]) * k1) >>> 4;
    assign t2 = ($signed(tap[2]) * k2) >>> 4;
    assign t3 = ($signed(tap[3]) * k3) >>> 4;
    assign t4 = ($signed(tap[4]) * k4) >>> 4;
    assign t5 = ($signed(tap[5]) * k5) >>> 4;
    assign t6 = ($signed(tap[6]) * k6) >>> 4;
    assign t7 = ($signed(tap[7]) * k7) >>> 4;
    assign t8 = ($signed(tap[8]) * k8) >>> 4;

    wire signed [31:0] conv_result = t0 + t1 + t2 + t3 + t4 + t5 + t6 + t7 + t8 + bias;
    // ReLU: negative sums become zero
    wire signed [31:0] relu_result = conv_result[31] ? 32'sd0 : conv_result;

    // ------------------------------------------------------------------
    // Pass 1 address generation
    // ------------------------------------------------------------------
    wire signed [7:0] base_row = {2'b00, kernel_center[11:6]};
    wire signed [7:0] base_col = {2'b00, kernel_center[5:0]};

    reg signed [7:0] off_row, off_col;
    always @(*) begin
        case (kernel_cnt)
            4'd0:    begin off_row = -DILATION; off_col = -DILATION; end
            4'd1:    begin off_row = -DILATION; off_col = 0;         end
            4'd2:    begin off_row = -DILATION; off_col = DILATION;  end
            4'd3:    begin off_row = 0;         off_col = -DILATION; end
            4'd4:    begin off_row = 0;         off_col = 0;         end
            4'd5:    begin off_row = 0;         off_col = DILATION;  end
            4'd6:    begin off_row = DILATION;  off_col = -DILATION; end
            4'd7:    begin off_row = DILATION;  off_col = 0;         end
            4'd8:    begin off_row = DILATION;  off_col = DILATION;  end
            default: begin off_row = 0;         off_col = 0;         end
        endcase
    end

    wire signed [7:0] offset_row = base_row + off_row;
    wire signed [7:0] offset_col = base_col + off_col;
    // Clamp to the valid coordinate range, i.e. out-of-range taps reuse the nearest edge sample
    wire [5:0] target_row = (offset_row < 0) ? 6'd0 : (offset_row > 63) ? 6'd63 : offset_row[5:0];
    wire [5:0] target_col = (offset_col < 0) ? 6'd0 : (offset_col > 63) ? 6'd63 : offset_col[5:0];

    // address = row * 64 + col = {row, col}
    assign iaddr = {target_row, target_col};

    // ------------------------------------------------------------------
    // Pass 2 datapath: 2x2 max-pooling followed by round-up
    // ------------------------------------------------------------------
    reg [15:0] pool_win [0:3];   // window samples, index = {row offset, col offset} = pool_read_cnt
    reg [1:0]  pool_read_cnt;
    reg [9:0]  layer1_wrt_addr;  // {row[4:0], col[4:0]} of the layer1 word being produced

    wire [15:0] s0 = (pool_win[0] > pool_win[1]) ? pool_win[0] : pool_win[1];
    wire [15:0] s1 = (pool_win[2] > pool_win[3]) ? pool_win[2] : pool_win[3];
    wire [15:0] s2 = (s0 > s1) ? s0 : s1;
    wire [15:0] s3 = s2 & MASK_FRAC;
    wire [15:0] s4 = s2 & MASK_INT;
    wire [15:0] maxpooling_result = (s3 != 16'd0) ? (s4 + 16'd16) : s4;

    // layer0 read address: window origin is (2*row, 2*col); the counter bits pick the cell
    wire [11:0] layer0_read_addr = {layer1_wrt_addr[9:5], pool_read_cnt[1],
                                    layer1_wrt_addr[4:0], pool_read_cnt[0]};

    // ------------------------------------------------------------------
    // Next-state logic
    // ------------------------------------------------------------------
    always @(*) begin
        next_state = IDLE; // value for the unused encodings 8..15
        case (state)
            IDLE             : next_state = LOAD_FROM_ROM;
            LOAD_FROM_ROM    : next_state = (kernel_cnt == 4'd8) ? PROCESS : LOAD_FROM_ROM;
            PROCESS          : next_state = SAVE_TO_LAYER0;
            // kernel_center still holds the address being written in this state
            SAVE_TO_LAYER0   : next_state = (kernel_center == 12'd4095) ? LOAD_FROM_LAYER0 : LOAD_FROM_ROM;
            LOAD_FROM_LAYER0 : next_state = (pool_read_cnt == 2'd3) ? MAXPOOLING : LOAD_FROM_LAYER0;
            MAXPOOLING       : next_state = SAVE_TO_LAYER1;
            // layer1_wrt_addr still holds the address being written in this state
            SAVE_TO_LAYER1   : next_state = (layer1_wrt_addr == 10'd1023) ? DONE : LOAD_FROM_LAYER0;
            DONE             : next_state = DONE;
            default          : next_state = IDLE;
        endcase
    end

    // ------------------------------------------------------------------
    // Memory address / data outputs (purely combinational, zero when idle)
    // ------------------------------------------------------------------
    always @(*) begin
        layer0_A = 12'd0;
        layer0_D = 16'd0;
        case (state)
            SAVE_TO_LAYER0: begin
                layer0_A = kernel_center;
                layer0_D = relu_result[15:0];
            end
            LOAD_FROM_LAYER0: begin
                layer0_A = layer0_read_addr;
            end
            default: ;
        endcase
    end

    always @(*) begin
        layer1_A = 12'd0;
        layer1_D = 16'd0;
        if (state == SAVE_TO_LAYER1) begin
            layer1_A = {2'b00, layer1_wrt_addr};
            layer1_D = maxpooling_result;
        end
    end

    // ------------------------------------------------------------------
    // State register
    // ------------------------------------------------------------------
    always @(posedge clk or posedge rst) begin
        if (rst) state <= IDLE;
        else     state <= next_state;
    end

    // ------------------------------------------------------------------
    // Counters and address registers
    // ------------------------------------------------------------------
    always @(posedge clk or posedge rst) begin
        if (rst) begin
            kernel_cnt      <= 4'd0;
            kernel_center   <= 12'd0;
            pool_read_cnt   <= 2'd0;
            layer1_wrt_addr <= 10'd0;
        end else begin
            case (state)
                LOAD_FROM_ROM    : kernel_cnt      <= kernel_cnt + 4'd1;
                PROCESS          : kernel_cnt      <= 4'd0;               // ready for the next pixel
                SAVE_TO_LAYER0   : kernel_center   <= kernel_center + 12'd1;
                LOAD_FROM_LAYER0 : pool_read_cnt   <= pool_read_cnt + 2'd1;
                MAXPOOLING       : pool_read_cnt   <= 2'd0;               // ready for the next window
                SAVE_TO_LAYER1   : layer1_wrt_addr <= layer1_wrt_addr + 10'd1;
                default          : ;
            endcase
        end
    end

    // Sample buffers carry no reset value, so they live in their own clocked block.
    always @(posedge clk) begin
        if (state == LOAD_FROM_ROM)
            tap[kernel_cnt] <= idata;
        if (state == LOAD_FROM_LAYER0)
            pool_win[pool_read_cnt] <= layer0_Q;
    end

    // ------------------------------------------------------------------
    // Control outputs
    // ------------------------------------------------------------------
    assign layer0_ceb = (state == SAVE_TO_LAYER0) || (state == LOAD_FROM_LAYER0); // layer0 access enable
    assign layer0_web = ~(state == SAVE_TO_LAYER0);                               // 0 = write, 1 = read
    assign layer1_ceb = (state == SAVE_TO_LAYER1);                                // layer1 access enable
    assign layer1_web = ~(state == SAVE_TO_LAYER1);                               // 0 = write, 1 = read
    assign ROM_rd     = (state == LOAD_FROM_ROM);
    assign done       = (state == DONE);
endmodule
```
#### functional simulation result

#### flow summary

#### post-sim

##### 另一種寫法
```verilog=
`timescale 1ns/10ps
// ATCONV (registered-output variant)
//   Phase 1: for every centre address, issue nine reads through iaddr (taps at
//            +/-2 rows/cols, clamped to 0..63), accumulate idata * weight one
//            cycle after each address is issued, then write ReLU(acc[19:4]) to layer0.
//   Phase 2: for every layer1 address, read a 2x2 window from layer0, keep the
//            signed maximum, round it up to a multiple of 16 and write it to layer1.
//   All outputs are registered; enables are one-cycle strobes.
module ATCONV (
    input             clk,
    input             rst,
    output reg        ROM_rd,
    output reg [11:0] iaddr,
    input      [15:0] idata,
    output reg        layer0_ceb,
    output reg        layer0_web,
    output reg [11:0] layer0_A,
    output reg [15:0] layer0_D,
    input      [15:0] layer0_Q,
    output reg        layer1_ceb,
    output reg        layer1_web,
    output reg [11:0] layer1_A,
    output reg [15:0] layer1_D,
    input      [15:0] layer1_Q,
    output reg        done
);
    // FSM encoding
    localparam S_INIT   = 3'd0,
               S_CONV   = 3'd1,
               S_L0_WR  = 3'd2,
               S_POOL   = 3'd3,
               S_L1_WR  = 3'd4,
               S_FINISH = 3'd5;

    // Accumulator start value: the bias 16'hFFF4 sign-extended and shifted left by 4
    localparam signed [35:0] ACC_SEED = 36'shFFFFFFF40;

    reg [2:0]         state, state_nxt;
    reg [11:0]        center;   // pixel index in phase 1, layer1 index in phase 2
    reg [3:0]         counter;  // step within the current pixel / window
    reg signed [35:0] convSum;
    reg [15:0]        maxVal;   // running maximum of the pooling window

    // ---------------- weight for the sample arriving this cycle ----------------
    // counter = n means the data on idata belongs to tap n-1, hence weights 1..9.
    reg signed [15:0] weight;
    always @(*) begin
        case (counter)
            4'd1:    weight = 16'hFFFF;
            4'd2:    weight = 16'hFFFE;
            4'd3:    weight = 16'hFFFF;
            4'd4:    weight = 16'hFFFC;
            4'd5:    weight = 16'h0010;
            4'd6:    weight = 16'hFFFC;
            4'd7:    weight = 16'hFFFF;
            4'd8:    weight = 16'hFFFE;
            4'd9:    weight = 16'hFFFF;
            default: weight = 16'h0000; // never accumulated: counter 0 is skipped below
        endcase
    end
    wire signed [35:0] product = $signed(idata) * weight;

    // ---------------- tap coordinates with edge clamping ----------------
    wire [5:0] row    = center[11:6];
    wire [5:0] col    = center[5:0];
    wire [5:0] row_up = (row < 6'd2)  ? 6'd0  : (row - 6'd2);
    wire [5:0] row_dn = (row > 6'd61) ? 6'd63 : (row + 6'd2);
    wire [5:0] col_lf = (col < 6'd2)  ? 6'd0  : (col - 6'd2);
    wire [5:0] col_rt = (col > 6'd61) ? 6'd63 : (col + 6'd2);

    reg [5:0] rd_row, rd_col;
    always @(*) begin
        case (counter)
            4'd0, 4'd1, 4'd2: rd_row = row_up;
            4'd3, 4'd4, 4'd5: rd_row = row;
            4'd6, 4'd7, 4'd8: rd_row = row_dn;
            default:          rd_row = 6'd0;
        endcase
        case (counter)
            4'd0, 4'd3, 4'd6: rd_col = col_lf;
            4'd1, 4'd4, 4'd7: rd_col = col;
            4'd2, 4'd5, 4'd8: rd_col = col_rt;
            default:          rd_col = 6'd0;
        endcase
    end

    // ---------------- state register ----------------
    always @(posedge clk or posedge rst) begin
        if (rst) state <= S_INIT;
        else     state <= state_nxt;
    end

    // ---------------- next-state logic ----------------
    always @(*) begin
        state_nxt = S_INIT;
        case (state)
            S_INIT:   state_nxt = S_CONV;
            S_CONV:   state_nxt = (counter == 4'd9)    ? S_L0_WR  : S_CONV;
            S_L0_WR:  state_nxt = (center == 12'd4095) ? S_POOL   : S_CONV;
            S_POOL:   state_nxt = (counter == 4'd4)    ? S_L1_WR  : S_POOL;
            S_L1_WR:  state_nxt = (center == 12'd1023) ? S_FINISH : S_POOL;
            S_FINISH: state_nxt = S_FINISH;
            default:  state_nxt = S_INIT;
        endcase
    end

    // ---------------- datapath and registered outputs ----------------
    always @(posedge clk or posedge rst) begin
        if (rst) begin
            ROM_rd     <= 1'b0;
            iaddr      <= 12'd0;
            layer0_ceb <= 1'b0;
            layer0_web <= 1'b1;
            layer0_A   <= 12'd0;
            layer0_D   <= 16'd0;
            layer1_ceb <= 1'b0;
            layer1_web <= 1'b1;
            layer1_A   <= 12'd0;
            layer1_D   <= 16'd0;
            done       <= 1'b0;
            center     <= 12'd0;
            counter    <= 4'd0;
            convSum    <= ACC_SEED;
            maxVal     <= 16'd0;
        end else begin
            // strobes fall back to inactive unless the current state raises them
            ROM_rd     <= 1'b0;
            layer0_ceb <= 1'b0;
            layer0_web <= 1'b1;
            layer1_ceb <= 1'b0;
            layer1_web <= 1'b1;
            done       <= 1'b0;

            if (state == S_CONV) begin
                ROM_rd <= 1'b1;
                iaddr  <= {rd_row, rd_col};
                // the first cycle only issues an address; data follows one cycle later
                if (counter > 4'd0)
                    convSum <= convSum + product;
                counter <= counter + 4'd1;
            end else if (state == S_L0_WR) begin
                layer0_ceb <= 1'b1;
                layer0_web <= 1'b0;
                layer0_A   <= center;
                layer0_D   <= convSum[35] ? 16'd0 : convSum[19:4]; // ReLU, then drop 4 LSBs
                convSum    <= ACC_SEED;
                counter    <= 4'd0;
                center     <= center + 12'd1;
            end else if (state == S_POOL) begin
                layer0_ceb <= 1'b1;
                layer0_web <= 1'b1;
                // counter 0..3 selects the window cell; later steps keep the last address
                if (counter < 4'd4)
                    layer0_A <= {center[9:5], counter[1], center[4:0], counter[0]};
                if (counter == 4'd1)
                    maxVal <= layer0_Q;
                else if ((counter >= 4'd2 && counter <= 4'd4) && $signed(layer0_Q) > $signed(maxVal))
                    maxVal <= layer0_Q;
                counter <= counter + 4'd1;
            end else if (state == S_L1_WR) begin
                layer1_ceb <= 1'b1;
                layer1_web <= 1'b0;
                layer1_A   <= center;
                // round up: add one to the upper 12 bits when any of the low 4 bits is set
                layer1_D   <= { maxVal[15:4] + (|maxVal[3:0]), 4'd0 };
                center     <= center + 12'd1;
                counter    <= 4'd0;
            end else if (state == S_FINISH) begin
                done <= 1'b1;
            end
        end
    end
endmodule
```
**atconv_wrapper.v**
```verilog=
`timescale 1ns/10ps
`include "./include/define.v"
module ATCONV_Wrapper(
input bus_clk ,
input bus_rst ,
input [`BUS_DATA_BITS-1:0] RDATA_M ,
input RLAST_M ,
input WREADY_M ,
input RREADY_M ,
output reg [`BUS_ID_BITS -1:0] ID_M ,
output reg [`BUS_ADDR_BITS-1:0] ADDR_M ,
output reg [`BUS_DATA_BITS-1:0] WDATA_M ,
output [`BUS_LEN_BITS -1:0] BLEN_M ,
output reg WLAST_M ,
output reg WVALID_M ,
output reg RVALID_M ,
output done
);
//ROM
reg [`BUS_ADDR_BITS-1:0] iaddr_reg;
//RAM0
reg[`BUS_ADDR_BITS-1:0] layer0_A;
reg[`BUS_DATA_BITS-1:0] layer0_D;
reg[`BUS_DATA_BITS-1:0] layer0_Q;
//RAM1
reg[`BUS_ADDR_BITS-1:0] layer1_A;
reg[`BUS_DATA_BITS-1:0] layer1_D;
reg[`BUS_DATA_BITS-1:0] layer1_Q;
// 狀態編碼
parameter [3:0]IDLE = 4'd0;
parameter [3:0]WAIT_ROM_READ_READY = 4'd1;
parameter [3:0]LOAD_FROM_ROM = 4'd2;
parameter [3:0]PROCESS = 4'd3; // conv + relu
parameter [3:0]WAIT_LAYER0_WRITE_READY = 4'd4;
parameter [3:0]SAVE_TO_LAYER0 = 4'd5;
parameter [3:0]WAIT_LAYER0_READ_READY = 4'd6;
parameter [3:0]LOAD_FROM_LAYER0 = 4'd7;
parameter [3:0]MAXPOOLING = 4'd8;
parameter [3:0]WAIT_LAYER1_WRITE_READY = 4'd9;
parameter [3:0]SAVE_TO_LAYER1 = 4'd10;
parameter [3:0]DONE = 4'd11;
// 狀態暫存
reg [3:0] state, next_state;
//====================================================
// 1) AXI-like 主裝置握手訊號
//====================================================
reg [3:0] blen_reg;
reg [3:0] transmit_cnt ;//比對傳送資料比數
assign BLEN_M = blen_reg;
always @(*) begin
// 其他狀態維持預設
WVALID_M = 1'b0;
WLAST_M = 1'b0;
RVALID_M = 1'b0;
ID_M = 2'd3;
ADDR_M = 12'd0;
blen_reg= 4'd1;
case(state)
WAIT_ROM_READ_READY :begin
RVALID_M = 1'd1;
ID_M = 2'd0;
ADDR_M = iaddr_reg;
blen_reg = 4'b0001;
end
// 從 ROM 讀資料
LOAD_FROM_ROM: begin
RVALID_M = 1'd0;
ID_M = 2'd0;
ADDR_M = iaddr_reg;
blen_reg = 4'b0001;
end
PROCESS: ;
WAIT_LAYER0_WRITE_READY :begin//4
WVALID_M = 1'd1;
ID_M = 2'd1;
ADDR_M = layer0_A;
WDATA_M = layer0_D;
blen_reg = 4'b0001;
end
// 將卷積→ReLU 結果寫入 Layer0
SAVE_TO_LAYER0: begin//5
WVALID_M = 1'd0;
ID_M = 2'd1;
ADDR_M = layer0_A;
WDATA_M = layer0_D;
blen_reg = 4'b0001;
if (transmit_cnt == blen_reg - 1) begin
WLAST_M = 1'b1;
end
end
WAIT_LAYER0_READ_READY :begin
RVALID_M = 1'd1;
ID_M = 2'd1; // 建立通道ID
ADDR_M = layer0_A;
blen_reg = 4'b0001;
end
// 從 Layer0 讀資料(MaxPool)
LOAD_FROM_LAYER0: begin
RVALID_M = 1'd0;
ID_M = 2'd1; // 建立通道ID
ADDR_M = layer0_A;
blen_reg = 4'b0001;
end
WAIT_LAYER1_WRITE_READY: begin
WVALID_M = 1'd1;
ID_M = 2'd2;
ADDR_M = layer1_A;
WDATA_M = layer1_D;
blen_reg = 4'b0001;
end
// 將 MaxPool→RoundUp 結果寫入 Layer1
SAVE_TO_LAYER1: begin
WVALID_M = 1'b0;
ID_M = 2'd2;
ADDR_M = layer1_A;
WDATA_M = layer1_D;
blen_reg = 4'b0001;
if (transmit_cnt == blen_reg - 1) begin
WLAST_M = 1'b1;
end
end
default: begin
end
endcase
end
///////////////////////////////
///卷積操作暫存
//////////////////////////////
// 卷積核常數 (16-bit signed)
parameter signed[15:0] k0 = 16'hFFFF;
parameter signed[15:0] k1 = 16'hFFFE;
parameter signed[15:0] k2 = 16'hFFFF;
parameter signed[15:0] k3 = 16'hFFFC;
parameter signed[15:0] k4 = 16'h0010;
parameter signed[15:0] k5 = 16'hFFFC;
parameter signed[15:0] k6 = 16'hFFFF;
parameter signed[15:0] k7 = 16'hFFFE;
parameter signed[15:0] k8 = 16'hFFFF;
parameter signed[31:0] bias = 32'hFFFFFFF4;
// 卷積 kernel 參數
reg [15:0] kernel [0:2][0:2];
reg [3 :0] kernel_cnt ;
reg [12:0] kernel_center ;
// 卷積計算結果
reg signed [31:0] conv_result; // 32 位的卷積結果
reg signed [31:0] relu_result; // 32 位的 ReLU 結果
// 各乘法部分結果
wire signed [31:0] t0, t1, t2, t3, t4, t5, t6, t7, t8;
assign t0 = ($signed(kernel[0][0]) * k0) >>> 4;
assign t1 = ($signed(kernel[0][1]) * k1) >>> 4;
assign t2 = ($signed(kernel[0][2]) * k2) >>> 4;
assign t3 = ($signed(kernel[1][0]) * k3) >>> 4;
assign t4 = ($signed(kernel[1][1]) * k4) >>> 4;
assign t5 = ($signed(kernel[1][2]) * k5) >>> 4;
assign t6 = ($signed(kernel[2][0]) * k6) >>> 4;
assign t7 = ($signed(kernel[2][1]) * k7) >>> 4;
assign t8 = ($signed(kernel[2][2]) * k8) >>> 4;
//////////////////////////////////
// 組合電路: 卷積 + ReLU
//////////////////////////////////
always@(*)begin
conv_result = t0 + t1 + t2 + t3 + t4 + t5 + t6 + t7 + t8 + bias; // CONVOLUTION 操作,最後加上bias
relu_result = (conv_result[31] == 1'b1) ? 32'd0 : conv_result; // ReLU 操作:如果 conv_result 是負數則設為 0,否則保留原來的值
end
//////////////////////////////////
///MAXPOOL + ROUNDUP 操作暫存、參數
/////////////////////////////////
// 組合電路: maxpool 參數
reg [15:0] pool_win [0:1][0:1];
reg [1 :0] pool_read_cnt ;
reg [11:0] layer0_read_addr ;
reg [11:0] layer1_wrt_addr ;
reg [5:0] layer0_row ;
reg [5:0] layer0_col ;
reg [15:0] s0, s1, s2, s3, s4;
reg [15:0] maxpooling_result;
parameter [15:0] MASK_FRAC = 16'b0000_0000_0000_1111;
parameter [15:0] MASK_INT = 16'b1111_1111_1111_0000;
// 組合電路 : 計算 maxpooling + roundup
always@(*)begin
s0 = pool_win[0][0] > pool_win[0][1] ? pool_win[0][0] : pool_win[0][1];
s1 = pool_win[1][0] > pool_win[1][1] ? pool_win[1][0] : pool_win[1][1];
s2 = s0 > s1 ? s0 : s1;
s3 = MASK_FRAC & s2;
s4 = s2 & MASK_INT;
maxpooling_result = s3 > 0 ? s4 + 16'd16 : s4 ;
end
// 組合電路: 下一狀態邏輯
always@(*)begin
case(state)
IDLE : next_state = WAIT_ROM_READ_READY ; //0
WAIT_ROM_READ_READY : next_state = (RVALID_M && RREADY_M) ? LOAD_FROM_ROM : WAIT_ROM_READ_READY; //1
LOAD_FROM_ROM : next_state = (RLAST_M) ? (kernel_cnt == 8)? PROCESS : WAIT_ROM_READ_READY : LOAD_FROM_ROM ;//2
PROCESS : next_state = WAIT_LAYER0_WRITE_READY ;//3
WAIT_LAYER0_WRITE_READY : next_state = (WVALID_M && WREADY_M) ? SAVE_TO_LAYER0 : WAIT_LAYER0_WRITE_READY ;//4
SAVE_TO_LAYER0 : next_state = (kernel_center == 4095) ? WAIT_LAYER0_READ_READY : WAIT_ROM_READ_READY ;//5
WAIT_LAYER0_READ_READY : next_state = (RVALID_M && RREADY_M) ? LOAD_FROM_LAYER0 : WAIT_LAYER0_READ_READY;//6
LOAD_FROM_LAYER0 : next_state = (pool_read_cnt == 3) ? MAXPOOLING : WAIT_LAYER0_READ_READY; //7
MAXPOOLING : next_state = WAIT_LAYER1_WRITE_READY;//8
WAIT_LAYER1_WRITE_READY : next_state = (WVALID_M && WREADY_M) ? SAVE_TO_LAYER1 : WAIT_LAYER1_WRITE_READY;//9
SAVE_TO_LAYER1 : next_state = (layer1_wrt_addr == 1023) ? DONE : WAIT_LAYER0_READ_READY;//10
DONE : next_state = DONE ; //11
endcase
end
parameter DILATION = 2; // 空洞大小 (2 表示跳過 1 格)
// 拆解中心點 row/col
wire signed[7:0] base_row = {2'b00, kernel_center[11:6]};
wire signed[7:0] base_col = {2'b00, kernel_center[5:0]};
// 組合電路: 卷積偏移設定
reg signed [7:0] off_row, off_col;
always @(*) begin
case (kernel_cnt)
4'd0: begin off_row = -DILATION; off_col = -DILATION; end
4'd1: begin off_row = -DILATION; off_col = 0; end
4'd2: begin off_row = -DILATION; off_col = DILATION; end
4'd3: begin off_row = 0; off_col = -DILATION; end
4'd4: begin off_row = 0; off_col = 0; end
4'd5: begin off_row = 0; off_col = DILATION; end
4'd6: begin off_row = DILATION; off_col = -DILATION; end
4'd7: begin off_row = DILATION; off_col = 0; end
4'd8: begin off_row = DILATION; off_col = DILATION; end
default: begin off_row = 0; off_col = 0; end
endcase
end
// 限幅並計算地址
wire signed [7:0] offset_row = base_row + off_row;
wire signed [7:0] offset_col = base_col + off_col;
wire [5:0] target_row = offset_row < 0 ? 6'd0 : offset_row > 63 ? 6'd63 : offset_row[5:0];
wire [5:0] target_col = offset_col < 0 ? 6'd0 : offset_col > 63 ? 6'd63 : offset_col[5:0];
// Combinational: final read address = row * 64 + col, which is simply the
// concatenation {row, col} of the two clamped 6-bit coordinates.
always @(*)
    iaddr_reg = {target_row, target_col};
/////////////////////////////////////////////////////////////
// Part 2: read LAYER0 -> MAXPOOL + ROUNDUP -> write LAYER1
////////////////////////////////////////////////////////////
// Combinational: row/column offset inside the 2x2 max-pooling window,
// selected by pool_read_cnt (0: top-left, 1: top-right, 2: bottom-left,
// 3: bottom-right). Any other value selects offset (0, 0).
parameter [1:0] STRIDE = 2'd2;
reg off_row_pool, off_col_pool;
always @(*) begin
    // Start from offset (0, 0) and raise only the bits the selected element needs.
    off_row_pool = 1'b0;
    off_col_pool = 1'b0;
    case (pool_read_cnt)
        4'd1: off_col_pool = 1'b1;
        4'd2: off_row_pool = 1'b1;
        4'd3: begin
            off_row_pool = 1'b1;
            off_col_pool = 1'b1;
        end
        default: ; // 0 and any other value keep (0, 0)
    endcase
end
// Combinational: LAYER0 read address of the current pooling-window element.
// layer1_wrt_addr[9:5] and [4:0] each get the 1-bit window offset appended as
// the LSB to form layer0_row and layer0_col; the read address is {row, col}.
always @(*) begin
    layer0_row       = {layer1_wrt_addr[9:5], off_row_pool};
    layer0_col       = {layer1_wrt_addr[4:0], off_col_pool};
    // Must come last: it uses the row/column computed just above.
    layer0_read_addr = {layer0_row, layer0_col};
end
// Combinational: LAYER0 address and write-data outputs.
//   WAIT_LAYER0_WRITE_READY : address = kernel_center, data = relu_result[15:0]
//   WAIT_LAYER0_READ_READY  : address = layer0_read_addr, data = 0
//   any other state         : address = 0, data = 0
always @(*) begin
    if (state == WAIT_LAYER0_WRITE_READY) begin
        layer0_A = kernel_center;
        layer0_D = relu_result[15:0];
    end else if (state == WAIT_LAYER0_READ_READY) begin
        layer0_A = layer0_read_addr;
        layer0_D = 16'd0;
    end else begin
        layer0_A = 12'd0;
        layer0_D = 16'd0;
    end
end
// Combinational: LAYER1 write address and write data.
// Both outputs are given a default value first so that every path through the
// block assigns them. Without the defaults this always @(*) block describes
// storage (the outputs would have to hold their value in every other state),
// and synthesis infers latches for layer1_A / layer1_D. Driving zero outside
// WAIT_LAYER1_WRITE_READY matches how layer0_A / layer0_D are driven.
// NOTE(review): assumes nothing samples layer1_A / layer1_D outside
// WAIT_LAYER1_WRITE_READY - confirm against the bus wrapper.
always @(*) begin
    layer1_A = 0;
    layer1_D = 0;
    if (state == WAIT_LAYER1_WRITE_READY) begin
        layer1_A = layer1_wrt_addr;
        layer1_D = maxpooling_result;
    end
end
// Sequential: FSM state register.
// bus_rst is an asynchronous, active-high reset that forces the FSM to IDLE;
// otherwise the state follows next_state on every rising edge of bus_clk.
always @(posedge bus_clk or posedge bus_rst) begin
    if (bus_rst)
        state <= IDLE;
    else
        state <= next_state;
end
// Sequential: counters, kernel-centre pointer and data-capture registers.
// Reset clears all counters/pointers; afterwards exactly one action is taken
// per clock, chosen by the current FSM state. States listed with an empty
// item hold every register.
always @(posedge bus_clk or posedge bus_rst) begin
    if (bus_rst) begin
        kernel_cnt      <= 4'd0;
        kernel_center   <= 12'd0;
        pool_read_cnt   <= 2'd0;
        layer1_wrt_addr <= 12'd0;
        transmit_cnt    <= 4'd0;
    end else begin
        case (state)
            WAIT_ROM_READ_READY: ; // hold
            LOAD_FROM_ROM: begin
                // Store the incoming word at its row-major slot of the 3x3 window.
                kernel[kernel_cnt/3][kernel_cnt%3] <= RDATA_M;
                kernel_cnt <= kernel_cnt + 1;
            end
            PROCESS:
                kernel_cnt <= 4'd0; // clear before the next window is loaded
            WAIT_LAYER0_WRITE_READY: ; // hold
            SAVE_TO_LAYER0:
                kernel_center <= kernel_center + 1; // advance to the next centre
            WAIT_LAYER0_READ_READY: ; // hold
            LOAD_FROM_LAYER0: begin
                // Store the incoming word at its position in the pooling window.
                pool_win[off_row_pool][off_col_pool] <= RDATA_M;
                pool_read_cnt <= pool_read_cnt + 1;
            end
            MAXPOOLING:
                pool_read_cnt <= 4'd0; // clear before the next pooling window
            WAIT_LAYER1_WRITE_READY: ; // hold
            SAVE_TO_LAYER1:
                layer1_wrt_addr <= layer1_wrt_addr + 1; // next LAYER1 address
            default: ; // every other state holds all registers
        endcase
    end
end
// Completion flag: high while the FSM is in the DONE state.
assign done = (state == DONE);
endmodule
```
### 2025-HW5
[code](https://github.com/HowFunSong/2025_ICDesign/blob/main/HW5/prototype/v1/MCH.v)
### 練習資源
1. [HDLBits — Verilog Practice](https://hdlbits.01xz.net/wiki/Main_Page)
2. [HDLbits答案更新系列目录(直达答案链接)](https://blog.csdn.net/wangkai_2019/article/details/106664283)
3. [練習筆記](https://hackmd.io/@shinhao66/verilogNote/edit)
4. [牛客](https://www.nowcoder.com/exam/oj?page=1&tab=Verilog%E7%AF%87&topicId=302)
### 參考
1. [Verilog动态截取固定长度语法+:和-:](https://blog.csdn.net/whik1194/article/details/113874073)
2. [Synopsys Design Compiler DC综合教程](https://www.bilibili.com/video/BV1EP4y1a75W/?spm_id_from=333.337.search-card.all.click&vd_source=ee346ab269e4eba3052dfd39001b45f7)
3. [\[Day20\]泡沫排序法](https://ithelp.ithome.com.tw/m/articles/10195078)
### 大神筆記
### [Chapter 4: RTL Coding-Part II](https://hackmd.io/@maggie860326/BygPfLF-3)
### [Chapter 5: Digital System Design](https://hackmd.io/@maggie860326/rknSlakb3)
### [Chapter 6: Control Unit](https://hackmd.io/@maggie860326/rkEQfChTs)
### [Chapter 7: Datapath Optimization](https://hackmd.io/@maggie860326/BykscsNXn)
### [Chapter 8: Case Study](https://hackmd.io/@maggie860326/HyMGJVwEh)