# SOC final project: Operator
## member
Hsuan Jung(羅宣融), Jesse(張哲熙)
## Top-level Architecture of BPE

[top level butterfly](https://github.com/JJ-best/SOC/tree/master/Spring_Final_Project/Operator/butterfly)
I have written `butterfly.v` but have not tested it yet. However, all of its submodules have been tested; for example, the large blocks on the right and on the left have both been tested.
## FFT old version complex multiplier(Jesse)
[link: complex multiplier](https://github.com/JJ-best/SOC/tree/master/Spring_Final_Project/Operator/cmul)
:::spoiler fadd.v
```verilog=
//author: Jesse
//module: Double Precision Adder
//IEEE754 format
//64bit = 1bit | 11bit | 52bit
// Sign | Exponent | Mantissa
// FLOAT_ADD: 8-stage pipelined IEEE 754 double-precision adder.
//
// Ports:
//   num1, num2 : 64-bit operands (1 sign | 11 exponent | 52 mantissa bits)
//   clk        : clock, all stages register on the rising edge
//   rstn       : asynchronous active-low reset
//   valid      : qualifies num1/num2; travels down the pipeline with the data
//   result     : num1 + num2, available 8 rising edges after the operands
//   ready      : delayed copy of `valid`, high when `result` is meaningful
//
// Pipeline overview:
//   1 unpack            5 add / subtract mantissas
//   2 hidden bit, diff  6 take magnitude of a negative difference
//   3 align + round     7 normalize a carry out (1x.xxx -> 1.xxx)
//   4 operand ordering  8 normalize leading zeros / subnormal results
//
// Limitations: Inf/NaN operands and exponent overflow are not handled.
module FLOAT_ADD (
    input  [63:0] num1,
    input  [63:0] num2,
    input         clk,
    input         rstn,
    input         valid,
    output [63:0] result,
    output        ready
);
    localparam BIT          = 64;
    localparam BIAS         = 1023;
    localparam EXPONENT_LEN = 11;
    localparam MANTISSA_LEN = 52;

    //-----pipeline stage: 1-----//
    // Input buffer: split both operands into sign / exponent / mantissa.
    reg                      pip1_sign1;
    reg                      pip1_sign2;
    reg                      pip1_valid;
    reg [(EXPONENT_LEN-1):0] pip1_exponent1;
    reg [(EXPONENT_LEN-1):0] pip1_exponent2;
    reg [(MANTISSA_LEN-1):0] pip1_mantissa1;
    reg [(MANTISSA_LEN-1):0] pip1_mantissa2;

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip1_sign1     <= 0;
            pip1_sign2     <= 0;
            pip1_exponent1 <= 0;
            pip1_exponent2 <= 0;
            pip1_mantissa1 <= 0;
            pip1_mantissa2 <= 0;
            pip1_valid     <= 0;
        end else begin
            pip1_sign1     <= num1[63];
            pip1_sign2     <= num2[63];
            pip1_exponent1 <= num1[62:52];
            pip1_exponent2 <= num2[62:52];
            pip1_mantissa1 <= num1[51:0];
            pip1_mantissa2 <= num2[51:0];
            pip1_valid     <= valid;
        end
    end
    //---------------------------//

    //-----pipeline stage: 2-----//
    // Prepend the hidden bit (1 for normal numbers, 0 for zero/subnormal)
    // and compute the exponent difference used for alignment.
    reg                           pip2_sign1;
    reg                           pip2_sign2;
    reg                           pip2_valid;
    reg        [(EXPONENT_LEN-1):0] pip2_exponent1;
    reg        [(EXPONENT_LEN-1):0] pip2_exponent2;
    reg signed [(EXPONENT_LEN):0]   pip2_exponent_diff;
    reg        [(MANTISSA_LEN):0]   pip2_mantissa1;
    reg        [(MANTISSA_LEN):0]   pip2_mantissa2;

    // Exponent field of all zeros: zero or subnormal operand.
    wire check_denor1 = ~(|pip1_exponent1);
    wire check_denor2 = ~(|pip1_exponent2);

    // A subnormal has no hidden bit but scales like exponent field 1
    // (2^(1-BIAS)), so use 1 for alignment -- same convention as FLOAT_MUL.
    wire [(EXPONENT_LEN-1):0] eff_exponent1 = check_denor1 ? 11'd1 : pip1_exponent1;
    wire [(EXPONENT_LEN-1):0] eff_exponent2 = check_denor2 ? 11'd1 : pip1_exponent2;

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip2_sign1         <= 0;
            pip2_sign2         <= 0;
            pip2_valid         <= 0;
            pip2_exponent1     <= 0;
            pip2_exponent2     <= 0;
            pip2_mantissa1     <= 0;
            pip2_mantissa2     <= 0;
            pip2_exponent_diff <= 0;
        end else begin
            pip2_sign1         <= pip1_sign1;
            pip2_sign2         <= pip1_sign2;
            pip2_valid         <= pip1_valid;
            pip2_exponent1     <= eff_exponent1;
            pip2_exponent2     <= eff_exponent2;
            // 12-bit signed result: negative when exponent1 < exponent2.
            pip2_exponent_diff <= eff_exponent1 - eff_exponent2;
            pip2_mantissa1     <= {~check_denor1, pip1_mantissa1};
            pip2_mantissa2     <= {~check_denor2, pip1_mantissa2};
        end
    end
    //---------------------------//

    //-----pipeline stage: 3-----//
    // Align the binary points: the operand with the smaller exponent is
    // shifted right by the exponent difference and rounded to 53 bits.
    reg                      pip3_sign1;
    reg                      pip3_sign2;
    reg                      pip3_valid;
    reg [(EXPONENT_LEN-1):0] pip3_exponent1;
    reg [(EXPONENT_LEN-1):0] pip3_exponent2;
    reg [(MANTISSA_LEN):0]   pip3_mantissa1;
    reg [(MANTISSA_LEN):0]   pip3_mantissa2;

    // Each mantissa is extended with 52 zero bits so the bits shifted out
    // remain visible as guard / round / sticky information.
    wire [(MANTISSA_LEN+52):0] pip2_mantissa1_shift   = {pip2_mantissa1, 52'b0};
    wire [(MANTISSA_LEN+52):0] pip2_mantissa2_shift   = {pip2_mantissa2, 52'b0};
    // The shift amount is treated as unsigned; the "wrong direction" result
    // is simply not selected by the stage-3 register logic below.
    wire [(MANTISSA_LEN+52):0] pip2_mantissa1_shifted = pip2_mantissa1_shift >> -pip2_exponent_diff;
    wire [(MANTISSA_LEN+52):0] pip2_mantissa2_shifted = pip2_mantissa2_shift >> pip2_exponent_diff;
    wire [(MANTISSA_LEN):0]    pip3_mantissa1_tmp     = pip2_mantissa1_shifted[(MANTISSA_LEN+52):52];
    wire [(MANTISSA_LEN):0]    pip3_mantissa2_tmp     = pip2_mantissa2_shifted[(MANTISSA_LEN+52):52];

    wire LSB_1    = pip2_mantissa1_shifted[52];
    wire Guard_1  = pip2_mantissa1_shifted[51];
    wire Round_1  = pip2_mantissa1_shifted[50];
    wire Sticky_1 = |pip2_mantissa1_shifted[49:0];
    wire LSB_2    = pip2_mantissa2_shifted[52];
    wire Guard_2  = pip2_mantissa2_shifted[51];
    wire Round_2  = pip2_mantissa2_shifted[50];
    wire Sticky_2 = |pip2_mantissa2_shifted[49:0];

    // Round to nearest, ties to even.
    wire round_up1 = Guard_1 & (Round_1 | Sticky_1 | LSB_1);
    wire round_up2 = Guard_2 & (Round_2 | Sticky_2 | LSB_2);

    reg [(MANTISSA_LEN):0] pip3_mantissa1_next;
    reg [(MANTISSA_LEN):0] pip3_mantissa2_next;

    always @(*) begin
        pip3_mantissa1_next = round_up1 ? (pip3_mantissa1_tmp + 1) : pip3_mantissa1_tmp;
        pip3_mantissa2_next = round_up2 ? (pip3_mantissa2_tmp + 1) : pip3_mantissa2_tmp;
    end

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip3_sign1     <= 0;
            pip3_sign2     <= 0;
            pip3_exponent1 <= 0;
            pip3_exponent2 <= 0;
            pip3_mantissa1 <= 0;
            pip3_mantissa2 <= 0;
            pip3_valid     <= 0;
        end else begin
            pip3_sign1 <= pip2_sign1;
            pip3_sign2 <= pip2_sign2;
            pip3_valid <= pip2_valid;
            if (pip2_exponent_diff >= 0) begin
                // exponent1 >= exponent2: operand 2 is the one shifted.
                pip3_exponent1 <= pip2_exponent1;
                pip3_exponent2 <= pip2_exponent1;
                pip3_mantissa1 <= pip2_mantissa1;
                pip3_mantissa2 <= pip3_mantissa2_next;
            end else begin
                // exponent1 < exponent2: operand 1 is the one shifted.
                pip3_exponent1 <= pip2_exponent2;
                pip3_exponent2 <= pip2_exponent2;
                pip3_mantissa1 <= pip3_mantissa1_next;
                pip3_mantissa2 <= pip2_mantissa2;
            end
        end
    end
    //---------------------------//

    //-----pipeline stage: 4-----//
    // Order the mantissas for the next stage (A + B or A - B) and record
    // which aligned mantissa is larger.
    reg                      pip4_sign1;
    reg                      pip4_sign2;
    reg [(EXPONENT_LEN-1):0] pip4_exponent1;
    reg [(EXPONENT_LEN-1):0] pip4_exponent2;
    reg [(MANTISSA_LEN):0]   pip4_mantissaA;
    reg [(MANTISSA_LEN):0]   pip4_mantissaB;
    reg                      pip4_compare; // 1 when mantissa1 > mantissa2
    reg                      pip4_valid;

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip4_sign1     <= 0;
            pip4_sign2     <= 0;
            pip4_exponent1 <= 0;
            pip4_exponent2 <= 0;
            pip4_mantissaA <= 0;
            pip4_mantissaB <= 0;
            pip4_compare   <= 0;
            pip4_valid     <= 0;
        end else begin
            pip4_sign1     <= pip3_sign1;
            pip4_sign2     <= pip3_sign2;
            pip4_exponent1 <= pip3_exponent1;
            pip4_exponent2 <= pip3_exponent2;
            pip4_valid     <= pip3_valid;
            if ({pip3_sign1, pip3_sign2} == 2'b10) begin
                // (-num1) + num2: compute mantissa2 - mantissa1.
                pip4_mantissaA <= pip3_mantissa2;
                pip4_mantissaB <= pip3_mantissa1;
            end else begin
                pip4_mantissaA <= pip3_mantissa1;
                pip4_mantissaB <= pip3_mantissa2;
            end
            pip4_compare <= (pip3_mantissa1 > pip3_mantissa2) ? 1 : 0;
        end
    end
    //---------------------------//

    //-----pipeline stage: 5-----//
    // Add or subtract the mantissas.
    // pip5_complement = 1 means pip5_mantissa holds a negative (two's
    // complement) difference that stage 6 must negate.
    reg                              pip5_sign;
    reg                              pip5_valid;
    reg        [(EXPONENT_LEN-1):0]  pip5_exponent;
    reg signed [(MANTISSA_LEN+1):0]  pip5_mantissa; // 2 integer bits + 52 fraction bits
    reg                              pip5_complement;
    reg signed [(MANTISSA_LEN+1):0]  pip5_add;
    reg signed [(MANTISSA_LEN+1):0]  pip5_sub;
    reg                              pip5_sign_tmp;
    reg                              pip5_complement_tmp;

    always @(*) begin
        pip5_add = pip4_mantissaA + pip4_mantissaB;
        pip5_sub = pip4_mantissaA - pip4_mantissaB;

        // The result takes the sign of the operand with the larger magnitude.
        pip5_sign_tmp = pip4_compare ? pip4_sign1 : pip4_sign2;

        // Default keeps this block purely combinational for every input.
        pip5_complement_tmp = 0;
        case ({pip4_sign1, pip4_sign2})
            2'b01: begin // A = mantissa1, B = mantissa2, A - B
                if (~pip4_compare) pip5_complement_tmp = 1; // B >= A
            end
            2'b10: begin // A = mantissa2, B = mantissa1, A - B
                if (pip4_compare) pip5_complement_tmp = 1;  // B > A
            end
            default: begin // 2'b00 / 2'b11: magnitudes are added
                pip5_complement_tmp = 0;
            end
        endcase
    end

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip5_sign       <= 0;
            pip5_exponent   <= 0;
            pip5_complement <= 0;
            pip5_mantissa   <= 0;
            pip5_valid      <= 0;
        end else begin
            pip5_valid <= pip4_valid;
            if (~|pip5_sub && (pip4_sign1 ^ pip4_sign2)) begin
                // Exact cancellation (e.g. 2.0 - 2.0): force +0.
                pip5_sign     <= 0;
                pip5_exponent <= 0;
            end else begin
                pip5_sign     <= pip5_sign_tmp;
                // Both exponents were made equal in stage 3.
                pip5_exponent <= pip4_exponent1 & pip4_exponent2;
            end
            pip5_complement <= pip5_complement_tmp;
            if (pip4_sign1 ^ pip4_sign2) begin
                pip5_mantissa <= pip5_sub;
            end else begin
                pip5_mantissa <= pip5_add;
            end
        end
    end
    //---------------------------//

    //-----pipeline stage: 6-----//
    // Convert a negative difference back to a positive magnitude.
    reg                      pip6_sign;
    reg                      pip6_valid;
    reg [(EXPONENT_LEN-1):0] pip6_exponent;
    reg [(MANTISSA_LEN+1):0] pip6_mantissa;

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip6_sign     <= 0;
            pip6_exponent <= 0;
            pip6_mantissa <= 0;
            pip6_valid    <= 0;
        end else begin
            pip6_valid    <= pip5_valid;
            pip6_sign     <= pip5_sign;
            pip6_exponent <= pip5_exponent;
            if (pip5_complement) begin
                pip6_mantissa <= ~pip5_mantissa + 1;
            end else begin
                pip6_mantissa <= pip5_mantissa;
            end
        end
    end
    //---------------------------//

    //-----pipeline stage: 7-----//
    // Normalize a carry out: 10.00101 * 2^33 => 1.000101 * 2^34
    reg                      pip7_sign;
    reg                      pip7_valid;
    reg [(EXPONENT_LEN-1):0] pip7_exponent;
    reg [(MANTISSA_LEN+1):0] pip7_mantissa; // 54 bit = 2 integer + 52 fraction

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip7_sign     <= 0;
            pip7_exponent <= 0;
            pip7_mantissa <= 0;
            pip7_valid    <= 0;
        end else begin
            pip7_valid <= pip6_valid;
            pip7_sign  <= pip6_sign;
            if (pip6_mantissa[MANTISSA_LEN+1] == 1) begin
                pip7_exponent <= pip6_exponent + 1;
                // The dropped bit 0 is exactly one half: round up only when
                // the new LSB (old bit 1) is odd (ties to even).
                if (pip6_mantissa[0] & pip6_mantissa[1]) begin
                    pip7_mantissa <= {1'b0, pip6_mantissa[(MANTISSA_LEN+1):1]} + 1;
                end else begin
                    pip7_mantissa <= {1'b0, pip6_mantissa[(MANTISSA_LEN+1):1]};
                end
            end else begin
                pip7_exponent <= pip6_exponent;
                pip7_mantissa <= pip6_mantissa;
            end
        end
    end
    //---------------------------//

    //-----pipeline stage: 8-----//
    // Normalize leading zeros using leading-one detection:
    // 0.00000101 * 2^22 => 1.01000000 * 2^16
    // If the exponent is too small to absorb the whole shift, the result is
    // emitted as a subnormal (exponent field 0) instead of wrapping around.
    reg                      pip8_sign;
    reg                      pip8_valid;
    reg [(EXPONENT_LEN-1):0] pip8_exponent;
    reg [(MANTISSA_LEN-1):0] pip8_mantissa;
    assign ready = pip8_valid;

    reg [MANTISSA_LEN+1:0] pip8_mantissa_shift;
    // Distance of the leading one below bit 52 (1..52); 0 = none in [51:0].
    // Unsigned: a signed 6-bit value could not represent 32..52.
    reg [5:0] shift;
    // Left shift that is actually applied (limited for subnormal results).
    reg [5:0] norm_shift;
    integer i;

    always @(*) begin
        shift = 0;
        for (i = MANTISSA_LEN - 1; i >= 0; i = i - 1) begin
            if (pip7_mantissa[i] && shift == 0) begin
                shift = MANTISSA_LEN - i;
            end
        end

        if (pip7_exponent > shift) begin
            norm_shift = shift;                     // result stays normal
        end else if (pip7_exponent != 0) begin
            norm_shift = pip7_exponent[5:0] - 6'd1; // stop at exponent field 1 -> subnormal
        end else begin
            norm_shift = 0;
        end
        pip8_mantissa_shift = pip7_mantissa << norm_shift;
    end

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip8_sign     <= 0;
            pip8_valid    <= 0;
            pip8_exponent <= 0;
            pip8_mantissa <= 0;
        end else begin
            pip8_sign  <= pip7_sign;
            pip8_valid <= pip7_valid;
            if (pip7_mantissa[MANTISSA_LEN] == 1'b1) begin
                // Already normalized.
                pip8_exponent <= pip7_exponent;
                pip8_mantissa <= pip7_mantissa[MANTISSA_LEN-1:0];
            end else if (shift == 0) begin
                // No set bit at all: the result is zero.
                pip8_exponent <= 0;
                pip8_mantissa <= 0;
            end else if (pip7_exponent > shift) begin
                // Enough exponent headroom: align to the leading one.
                pip8_exponent <= pip7_exponent - shift;
                pip8_mantissa <= pip8_mantissa_shift[MANTISSA_LEN-1:0];
            end else begin
                // Gradual underflow: subnormal result.
                pip8_exponent <= 0;
                pip8_mantissa <= pip8_mantissa_shift[MANTISSA_LEN-1:0];
            end
        end
    end
    //---------------------------//

    assign result = {pip8_sign, pip8_exponent, pip8_mantissa[(MANTISSA_LEN-1):0]};
endmodule //IEEE754_64bit_adder
```
:::
:::spoiler fadd_tb.v
```verilog=
// Author: Jesse
`timescale 1ns / 1ps
//`define SDFFILE "../syn/netlist/FLOAT_ADD_syn.sdf"
`include "FLOAT_ADD.v"
`include "fp_add.v"
// Testbench for FLOAT_ADD.
// Reads "<num1> <num2> <expected>" hex triples from ./py/golden.dat, streams
// one pattern per clock into the DUT and compares every result flagged by
// `ready` against the expected value.
module FLOAT_ADD_tb ();
    localparam BIT          = 64;
    localparam BIAS         = 1023;
    localparam EXPONENT_LEN = 11;
    localparam MANTISSA_LEN = 52;
    localparam N            = 1000; // capacity of the pattern memories
    localparam PERIOD       = 2;
    // `ifdef SDF
    //     initial $sdf_annotate(`SDFFILE, uut);
    // `endif

    reg              clk;
    reg              rstn;
    reg  [(BIT-1):0] num1;
    reg  [(BIT-1):0] num2;
    wire [63:0]      result;
    reg              valid;
    wire             ready;

    FLOAT_ADD uut(
        .num1(num1),
        .num2(num2),
        .clk(clk),
        .rstn(rstn),
        .result(result),
        .valid(valid),
        .ready(ready)
    );

    //-----clock generate-----//
    initial begin
        clk = 0;
        forever begin
            #(PERIOD/2) clk = (~clk);
        end
    end

    //-----reset-----//
    initial begin
        rstn = 0;
        repeat (2) @(posedge clk);
        rstn = 1;
    end

    // ---------- memory ----------
    reg [63:0] num1_mem      [0:N-1];
    reg [63:0] num2_mem      [0:N-1];
    reg [63:0] golden_result [0:N-1];
    integer fd, i = 0, r;       // i = number of patterns actually loaded
    reg [63:0] t1, t2, t3;
    reg [8*256-1:0] bad_line;   // scratch buffer used to skip malformed lines

    // ---------- file reading ----------
    initial begin
        fd = $fopen("./py/golden.dat", "r"); // read("r") golden.dat
        if (!fd) begin
            $display("❌ Error: Cannot open golden.dat");
            $finish;
        end
        // $feof returns 1 at end of file. The i < N guard keeps the
        // memories from being written out of range.
        while (!$feof(fd) && i < N) begin
            // r is the number of fields successfully converted.
            r = $fscanf(fd, "%h %h %h\n", t1, t2, t3);
            if (r == 3) begin
                num1_mem[i]      = t1;
                num2_mem[i]      = t2;
                golden_result[i] = t3;
                i = i + 1;
            end else begin
                $display("⚠️ Warning: Invalid line format at index %0d", i);
                // Consume the offending line; otherwise $fscanf would fail on
                // the same characters forever and the loop would never end.
                r = $fgets(bad_line, fd);
            end
        end
        if (!$feof(fd)) begin
            $display("⚠️ Warning: golden.dat holds more than %0d patterns, the rest is ignored", N);
        end
        $fclose(fd);
        $display("✅ Loaded %0d test patterns from golden.dat", i);
    end

    // ---------- feed input ----------
    integer j;
    initial begin
        valid = 0;
        num1  = 0;
        num2  = 0;
        wait(rstn == 1);
        // Drive on the falling edge so the DUT samples stable data.
        for (j = 0; j < i; j = j + 1) begin
            @(negedge clk);
            valid <= 1;
            num1  <= num1_mem[j];
            num2  <= num2_mem[j];
        end
        // flush
        @(negedge clk);
        valid <= 0;
        num1  <= 0;
        num2  <= 0;
    end

    //-----prevent hang
    integer timeout = (100000);
    initial begin
        while (timeout > 0) begin
            @(posedge clk);
            timeout = timeout - 1;
        end
        $display($time, "Simualtion Hang ....");
        $finish;
    end

    // ---------- check result ----------
    integer k = 0;
    integer errors = 0;
    initial begin
        wait(rstn == 1);
        // Only the i loaded patterns are checked, and only on cycles where
        // the DUT flags its output with `ready`. `!==` makes X/Z bits count
        // as a mismatch instead of silently passing.
        while (k < i) begin
            @(posedge clk);
            if (ready) begin
                if (result !== golden_result[k]) begin
                    $display("❌ [Mismatch] Pattern %0d: result = %h, expected = %h",
                             k+1, result, golden_result[k]);
                    errors = errors + 1;
                end else begin
                    $display("✅ [Match] Pattern %0d: result = %h, expected = %h", k+1, result, golden_result[k]);
                end
                k = k + 1;
            end
        end
        $display("🎯 Test completed: %0d errors out of %0d patterns", errors, i);
        if (i == 0)
            $display("⚠️ No test patterns were loaded, nothing was checked.");
        else if (errors == 0)
            $display("🎉 ALL PASS!");
        else
            $display("⚠️ Some mismatches found. Please check your design.");
        repeat (10) @(posedge clk);
        $finish;
    end

    initial begin
        $dumpfile("fadd.vcd");
        $dumpvars(0, FLOAT_ADD_tb);
    end
endmodule //IEEE754_64bit_add_tb
```
:::


:::spoiler fmul.v
```verilog=
//author: Jesse
//module: Double Precision Multiplier
//IEEE754 format
//64bit = 1bit | 11bit | 52bit
// Sign | Exponent | Mantissa
// FLOAT_MUL: 14-stage pipelined IEEE 754 double-precision multiplier.
//
// Ports:
//   num1, num2 : 64-bit operands (1 sign | 11 exponent | 52 mantissa bits)
//   clk        : clock, all stages register on the rising edge
//   rstn       : asynchronous active-low reset
//   result     : num1 * num2, available 14 rising edges after the operands
//   valid      : qualifies num1/num2; travels down the pipeline with the data
//   ready      : delayed copy of `valid`, high when `result` is meaningful
//
// The 53 x 53 bit mantissa product is built from 53 partial products
// (stage 2) that are summed pairwise (stages 3-7) and then combined with
// split, carry-select style additions (stages 8-11). Stages 12-13 normalize
// and round to nearest even.
//
// Limitations: Inf/NaN operands, exponent overflow/underflow and subnormal
// results are not handled.
module FLOAT_MUL (
    // Ranges are written out because BIT is only declared inside the body;
    // an identifier must not be used before its declaration.
    input  [63:0] num1,
    input  [63:0] num2,
    input         clk,
    input         rstn,
    output [63:0] result,
    input         valid,
    output        ready
);
    localparam BIT          = 64;
    localparam BIAS         = 1023;
    localparam EXPONENT_LEN = 11;
    localparam MANTISSA_LEN = 52;

    integer i; // loop index shared by the array-handling always blocks

    //-----pipeline stage: 0-----//
    // Plain input register.
    reg [(BIT-1):0] pip0_num1;
    reg [(BIT-1):0] pip0_num2;
    reg             pip0_valid;

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip0_num1  <= 0;
            pip0_num2  <= 0;
            pip0_valid <= 0;
        end else begin
            pip0_num1  <= num1;
            pip0_num2  <= num2;
            pip0_valid <= valid;
        end
    end
    //---------------------------//

    //-----pipeline stage: 1-----//
    // Unpack the operands, detect zero and subnormal inputs.
    reg                      pip1_sign1;
    reg                      pip1_sign2;
    reg                      pip1_valid;
    reg [(EXPONENT_LEN-1):0] pip1_exponent1;
    reg [(EXPONENT_LEN-1):0] pip1_exponent2;
    reg                      pip1_zero;

    wire num1_zero    = ~(|pip0_num1[62:0]);  // operand 1 is +/-0
    wire num2_zero    = ~(|pip0_num2[62:0]);  // operand 2 is +/-0
    wire zero         = (num1_zero | num2_zero);
    wire check_denor1 = ~(|pip0_num1[62:52]); // exponent field 0: zero/subnormal
    wire check_denor2 = ~(|pip0_num2[62:52]);

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip1_sign1     <= 0;
            pip1_sign2     <= 0;
            pip1_valid     <= 0;
            pip1_exponent1 <= 0;
            pip1_exponent2 <= 0;
            pip1_zero      <= 0;
        end else begin
            pip1_sign1 <= pip0_num1[63];
            pip1_sign2 <= pip0_num2[63];
            pip1_valid <= pip0_valid;
            // A subnormal scales like exponent field 1 (1 - BIAS = -1022).
            pip1_exponent1 <= check_denor1 ? 11'd1 : pip0_num1[62:52];
            pip1_exponent2 <= check_denor2 ? 11'd1 : pip0_num2[62:52];
            pip1_zero      <= zero;
        end
    end

    // 53 bit = hidden bit + 52 mantissa bits.
    // Operand 1 is stored in 53 identical copies, one per partial product.
    reg [(MANTISSA_LEN):0] pip1_mantissa1 [(MANTISSA_LEN):0];
    reg [(MANTISSA_LEN):0] pip1_mantissa2;

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            for (i = 0; i <= MANTISSA_LEN; i = i + 1) begin
                pip1_mantissa1[i] <= 0;
            end
            pip1_mantissa2 <= 0;
        end else begin
            for (i = 0; i <= MANTISSA_LEN; i = i + 1) begin // copies 0..52
                pip1_mantissa1[i] <= {~check_denor1, pip0_num1[51:0]};
            end
            pip1_mantissa2 <= {~check_denor2, pip0_num2[51:0]};
        end
    end
    //---------------------------//

    //-----pipeline stage: 2-----//
    // Result sign, exponent sum and the 53 partial products.
    reg                    pip2_sign;
    reg                    pip2_valid;
    reg [(EXPONENT_LEN):0] pip2_exponent; // 1 extra bit for the sum
    reg                    pip2_zero;

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip2_sign     <= 0;
            pip2_exponent <= 0;
            pip2_zero     <= 0;
            pip2_valid    <= 0;
        end else begin
            pip2_sign     <= pip1_sign1 ^ pip1_sign2;
            pip2_exponent <= pip1_exponent1 + pip1_exponent2;
            pip2_zero     <= pip1_zero;
            pip2_valid    <= pip1_valid;
        end
    end

    // Partial product i = mantissa1 AND-ed with bit i of mantissa2
    // (binary weight 2^i, applied by the shifts of the following stages).
    reg [(MANTISSA_LEN):0] pip2_mantissa [(MANTISSA_LEN):0];

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            for (i = 0; i <= MANTISSA_LEN; i = i + 1) begin
                pip2_mantissa[i] <= 0;
            end
        end else begin
            for (i = 0; i <= MANTISSA_LEN; i = i + 1) begin
                pip2_mantissa[i] <= pip1_mantissa1[i] & {53{pip1_mantissa2[i]}};
            end
        end
    end
    //---------------------------//

    //-----pipeline stage: 3-----//
    // Remove the bias from the exponent sum; 53 -> 27 partial sums.
    reg                    pip3_sign;
    reg                    pip3_valid;
    reg [(EXPONENT_LEN):0] pip3_exponent;

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip3_sign     <= 0;
            pip3_exponent <= 0;
            pip3_valid    <= 0;
        end else begin
            pip3_valid <= pip2_valid;
            if (pip2_zero) begin
                // A zero operand forces a +0 result.
                pip3_exponent <= 0;
                pip3_sign     <= 0;
            end else begin
                pip3_exponent <= pip2_exponent - BIAS;
                pip3_sign     <= pip2_sign;
            end
        end
    end

    // 55 bit, 27 numbers: entry i has weight 2^(2i); entry 26 is the
    // unpaired partial product 52.
    reg [(MANTISSA_LEN+2):0] pip3_mantissa [(MANTISSA_LEN/2):0];

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            for (i = 0; i <= (MANTISSA_LEN/2); i = i + 1) begin
                pip3_mantissa[i] <= 0;
            end
        end else begin
            for (i = 0; i < (MANTISSA_LEN/2); i = i + 1) begin
                pip3_mantissa[i] <= pip2_mantissa[2*i] + {pip2_mantissa[2*i+1], 1'b0};
            end
            pip3_mantissa[(MANTISSA_LEN/2)] <= pip2_mantissa[MANTISSA_LEN];
        end
    end
    //---------------------------//

    //-----pipeline stage: 4-----//
    // 27 -> 14 partial sums (57 bit); entry i has weight 2^(4i), entry 13
    // still carries partial product 52.
    reg                        pip4_sign;
    reg                        pip4_valid;
    reg [(EXPONENT_LEN):0]     pip4_exponent;
    reg [(MANTISSA_LEN + 4):0] pip4_mantissa [(MANTISSA_LEN/4):0];

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip4_sign     <= 0;
            pip4_valid    <= 0;
            pip4_exponent <= 0;
            for (i = 0; i <= (MANTISSA_LEN/4); i = i + 1) begin
                pip4_mantissa[i] <= 0;
            end
        end else begin
            pip4_sign     <= pip3_sign;
            pip4_valid    <= pip3_valid;
            pip4_exponent <= pip3_exponent;
            for (i = 0; i < (MANTISSA_LEN/4); i = i + 1) begin
                pip4_mantissa[i] <= pip3_mantissa[2*i] + {pip3_mantissa[2*i+1], 2'b00};
            end
            pip4_mantissa[(MANTISSA_LEN/4)] <= pip3_mantissa[(MANTISSA_LEN/2)];
        end
    end
    //---------------------------//

    //-----pipeline stage: 5-----//
    // 14 -> 8 partial sums (61 bit). Entries 0..5 have weight 2^(8i);
    // entry 6 (weight 2^48) and entry 7 (weight 2^52) are passed through.
    reg                      pip5_sign;
    reg                      pip5_valid;
    reg [(EXPONENT_LEN):0]   pip5_exponent;
    reg [(MANTISSA_LEN+8):0] pip5_mantissa [7:0];

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip5_sign     <= 0;
            pip5_valid    <= 0;
            pip5_exponent <= 0;
            for (i = 0; i <= 7; i = i + 1) begin
                pip5_mantissa[i] <= 0;
            end
        end else begin
            pip5_sign     <= pip4_sign;
            pip5_valid    <= pip4_valid;
            pip5_exponent <= pip4_exponent;
            for (i = 0; i <= 5; i = i + 1) begin
                pip5_mantissa[i] <= pip4_mantissa[2*i] + {pip4_mantissa[2*i+1], 4'b0000};
            end
            pip5_mantissa[6] <= pip4_mantissa[(MANTISSA_LEN/4)-1];
            pip5_mantissa[7] <= pip4_mantissa[(MANTISSA_LEN/4)];
        end
    end
    //---------------------------//

    //-----pipeline stage: 6-----//
    // 8 -> 5 partial sums (69 bit). Entries 0..2 have weight 2^(16i);
    // entries 3 (2^48) and 4 (2^52) are passed through.
    reg                       pip6_sign;
    reg                       pip6_valid;
    reg [(EXPONENT_LEN):0]    pip6_exponent;
    reg [(MANTISSA_LEN+16):0] pip6_mantissa [4:0];

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip6_sign     <= 0;
            pip6_valid    <= 0;
            pip6_exponent <= 0;
            for (i = 0; i <= 4; i = i + 1) begin
                pip6_mantissa[i] <= 0;
            end
        end else begin
            pip6_sign     <= pip5_sign;
            pip6_valid    <= pip5_valid;
            pip6_exponent <= pip5_exponent;
            for (i = 0; i <= 2; i = i + 1) begin
                pip6_mantissa[i] <= pip5_mantissa[2*i] + {pip5_mantissa[2*i+1], 8'h00};
            end
            pip6_mantissa[3] <= pip5_mantissa[6];
            pip6_mantissa[4] <= pip5_mantissa[7];
        end
    end
    //---------------------------//

    //-----pipeline stage: 7-----//
    // 5 -> 4 partial sums (85 bit):
    //   [0] weight 2^0, [1] weight 2^32, [2] weight 2^48, [3] weight 2^52
    reg                       pip7_sign;
    reg                       pip7_valid;
    reg [(EXPONENT_LEN):0]    pip7_exponent;
    reg [(MANTISSA_LEN+32):0] pip7_mantissa [3:0];

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip7_sign     <= 0;
            pip7_valid    <= 0;
            pip7_exponent <= 0;
            for (i = 0; i <= 3; i = i + 1) begin
                pip7_mantissa[i] <= 0;
            end
        end else begin
            pip7_sign     <= pip6_sign;
            pip7_valid    <= pip6_valid;
            pip7_exponent <= pip6_exponent;
            pip7_mantissa[0] <= pip6_mantissa[0] + {pip6_mantissa[1], 16'h0000}; // Group A
            pip7_mantissa[1] <= pip6_mantissa[2];                                // Group B
            pip7_mantissa[2] <= pip6_mantissa[3];                                // Group C
            pip7_mantissa[3] <= pip6_mantissa[4];                                // Group D
        end
    end
    //---------------------------//

    //-----pipeline stage: 8-----//
    // First half of A + (B << 32) and C + (D << 4): the additions are split
    // into a low and a high slice; the low-slice carry is applied in stage 9.
    reg                    pip8_sign;
    reg                    pip8_valid;
    reg [(EXPONENT_LEN):0] pip8_exponent;
    reg [31:0]             pip8_A_remain; // A bits not overlapped by B
    reg [3:0]              pip8_C_remain; // C bits not overlapped by D
    reg [53:0]             pip8_w; // high slice of A + B: 1 carry + 53 bit
    reg [16:0]             pip8_x; // low  slice of A + B: 1 carry + 16 bit
    reg [16:0]             pip8_y; // high slice of C + D: 1 carry + 16 bit
    reg [37:0]             pip8_z; // low  slice of C + D: 1 carry + 37 bit

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip8_sign     <= 0;
            pip8_valid    <= 0;
            pip8_exponent <= 0;
            pip8_A_remain <= 0;
            pip8_C_remain <= 0;
            pip8_w        <= 0;
            pip8_x        <= 0;
            pip8_y        <= 0;
            pip8_z        <= 0;
        end else begin
            pip8_sign     <= pip7_sign;
            pip8_valid    <= pip7_valid;
            pip8_exponent <= pip7_exponent;
            pip8_A_remain <= pip7_mantissa[0][31:0];
            pip8_C_remain <= pip7_mantissa[2][3:0];
            pip8_w <= pip7_mantissa[0][84:48] + pip7_mantissa[1][68:16];
            pip8_x <= pip7_mantissa[0][47:32] + pip7_mantissa[1][15:0];
            pip8_y <= pip7_mantissa[2][56:41] + pip7_mantissa[3][52:37];
            pip8_z <= pip7_mantissa[2][40:4]  + pip7_mantissa[3][36:0];
        end
    end
    //---------------------------//

    //-----pipeline stage: 9-----//
    // Apply the low-slice carries and reassemble AB (weight 2^0) and
    // CD (weight 2^48).
    reg                    pip9_sign;
    reg                    pip9_valid;
    reg [(EXPONENT_LEN):0] pip9_exponent;
    reg [101:0]            pip9_AB;
    reg [57:0]             pip9_CD;
    wire [53:0] pip9_w = pip8_w + pip8_x[16]; // add carry
    wire [16:0] pip9_y = pip8_y + pip8_z[37]; // add carry

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip9_sign     <= 0;
            pip9_valid    <= 0;
            pip9_exponent <= 0;
            pip9_AB       <= 0;
            pip9_CD       <= 0;
        end else begin
            pip9_sign     <= pip8_sign;
            pip9_valid    <= pip8_valid;
            pip9_exponent <= pip8_exponent;
            pip9_AB <= {pip9_w, pip8_x[15:0], pip8_A_remain};
            pip9_CD <= {pip9_y, pip8_z[36:0], pip8_C_remain};
        end
    end
    //---------------------------//

    //-----pipeline stage: 10----//
    // First half of AB + (CD << 48), again split into two slices.
    reg                    pip10_sign;
    reg                    pip10_valid;
    reg [(EXPONENT_LEN):0] pip10_exponent;
    reg [47:0]             pip10_AB_remain;
    reg [31:0]             pip10_n; // low  slice: 1 carry + 31 bit
    reg [27:0]             pip10_m; // high slice: 1 carry + 27 bit

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip10_sign      <= 0;
            pip10_valid     <= 0;
            pip10_exponent  <= 0;
            pip10_AB_remain <= 0;
            pip10_n         <= 0;
            pip10_m         <= 0;
        end else begin
            pip10_sign      <= pip9_sign;
            pip10_valid     <= pip9_valid;
            pip10_exponent  <= pip9_exponent;
            pip10_AB_remain <= pip9_AB[47:0];
            pip10_n <= pip9_AB[78:48]  + pip9_CD[30:0];
            pip10_m <= pip9_AB[101:79] + pip9_CD[57:31];
        end
    end
    //---------------------------//

    //-----pipeline stage: 11----//
    // Apply the last carry and assemble the full 106-bit product.
    reg                    pip11_sign;
    reg                    pip11_valid;
    reg [(EXPONENT_LEN):0] pip11_exponent;
    reg [105:0]            pip11_mantissa;
    // 27 bits are enough: a 53 x 53 bit product never exceeds 106 bits.
    wire [26:0] pip11_mn = pip10_m + pip10_n[31];

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip11_sign     <= 0;
            pip11_valid    <= 0;
            pip11_exponent <= 0;
            pip11_mantissa <= 0;
        end else begin
            pip11_sign     <= pip10_sign;
            pip11_valid    <= pip10_valid;
            pip11_exponent <= pip10_exponent;
            pip11_mantissa <= {pip11_mn, pip10_n[30:0], pip10_AB_remain};
        end
    end
    //---------------------------//

    //-----pipeline stage: 12----//
    // Normalize: a product in [2, 4) is shifted right by one position.
    reg                      pip12_sign;
    reg                      pip12_valid;
    reg [(EXPONENT_LEN-1):0] pip12_exponent;
    reg [105:0]              pip12_mantissa;

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip12_sign     <= 0;
            pip12_valid    <= 0;
            pip12_exponent <= 0;
            pip12_mantissa <= 0;
        end else begin
            pip12_sign  <= pip11_sign;
            pip12_valid <= pip11_valid;
            if (pip11_mantissa[105] == 1'b1) begin
                pip12_exponent <= pip11_exponent + 1'b1;
                // Shift right by one, but OR the bit that falls off into the
                // new LSB so it still contributes to the sticky bit.
                pip12_mantissa <= {1'b0, pip11_mantissa[105:2],
                                   pip11_mantissa[1] | pip11_mantissa[0]};
            end else begin
                pip12_exponent <= pip11_exponent;
                pip12_mantissa <= pip11_mantissa;
            end
        end
    end
    //---------------------------//

    //-----pipeline stage: 13----//
    // Round to nearest even and pack the result.
    reg                      pip13_sign;
    reg                      pip13_valid;
    reg [(EXPONENT_LEN-1):0] pip13_exponent;
    reg [(MANTISSA_LEN-1):0] pip13_mantissa;
    assign ready = pip13_valid;

    wire LSB      = pip12_mantissa[52];
    wire Guard    = pip12_mantissa[51];
    wire Round    = pip12_mantissa[50];
    wire Sticky   = |pip12_mantissa[49:0];
    wire round_up = Guard & (Round | Sticky | LSB);

    // One extra bit catches the carry when an all-ones mantissa rounds up
    // (1.11..1 -> 10.00..0); the carry must bump the exponent.
    wire [MANTISSA_LEN:0] rounded = {1'b0, pip12_mantissa[103:52]} + round_up;

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip13_sign     <= 0;
            pip13_valid    <= 0;
            pip13_exponent <= 0;
            pip13_mantissa <= 0;
        end else begin
            pip13_sign     <= pip12_sign;
            pip13_valid    <= pip12_valid;
            pip13_exponent <= pip12_exponent + rounded[MANTISSA_LEN];
            pip13_mantissa <= rounded[(MANTISSA_LEN-1):0];
        end
    end

    assign result = {pip13_sign, pip13_exponent, pip13_mantissa};
    //---------------------------//
endmodule //IEEE754_64bit_mul
```
:::
:::spoiler fmul_tb.v
```verilog=
// Author: Jesse
`timescale 1ns / 1ps
`include "FLOAT_MUL.v"
// Testbench for FLOAT_MUL.
// Reads "<num1> <num2> <expected>" hex triples from ./py/golden.dat, streams
// one pattern per clock into the DUT and compares every result flagged by
// `ready` against the expected value.
module FLOAT_MUL_tb ();
    localparam BIT          = 64;
    localparam BIAS         = 1023;
    localparam EXPONENT_LEN = 11;
    localparam MANTISSA_LEN = 52;
    localparam N            = 1000; // capacity of the pattern memories

    reg              clk;
    reg              rstn;
    reg  [(BIT-1):0] num1;
    reg  [(BIT-1):0] num2;
    wire [63:0]      result;
    reg              valid;
    wire             ready;

    FLOAT_MUL uut(
        .num1(num1),
        .num2(num2),
        .clk(clk),
        .rstn(rstn),
        .result(result),
        .valid(valid),
        .ready(ready)
    );

    //-----clock generate-----//
    initial begin
        clk = 0;
        forever begin
            #5 clk = (~clk);
        end
    end

    //-----reset-----//
    initial begin
        rstn = 0;
        repeat (2) @(posedge clk);
        rstn = 1;
    end

    // ---------- memory ----------
    reg [63:0] num1_mem      [0:N-1];
    reg [63:0] num2_mem      [0:N-1];
    reg [63:0] golden_result [0:N-1];
    integer fd, i = 0, r;       // i = number of patterns actually loaded
    reg [63:0] t1, t2, t3;
    reg [8*256-1:0] bad_line;   // scratch buffer used to skip malformed lines

    // ---------- file reading ----------
    initial begin
        fd = $fopen("./py/golden.dat", "r"); // read("r") golden.dat
        if (!fd) begin
            $display("❌ Error: Cannot open golden.dat");
            $finish;
        end
        // $feof returns 1 at end of file. The i < N guard keeps the
        // memories from being written out of range.
        while (!$feof(fd) && i < N) begin
            // r is the number of fields successfully converted.
            r = $fscanf(fd, "%h %h %h\n", t1, t2, t3);
            if (r == 3) begin
                num1_mem[i]      = t1;
                num2_mem[i]      = t2;
                golden_result[i] = t3;
                i = i + 1;
            end else begin
                $display("⚠️ Warning: Invalid line format at index %0d", i);
                // Consume the offending line; otherwise $fscanf would fail on
                // the same characters forever and the loop would never end.
                r = $fgets(bad_line, fd);
            end
        end
        if (!$feof(fd)) begin
            $display("⚠️ Warning: golden.dat holds more than %0d patterns, the rest is ignored", N);
        end
        $fclose(fd);
        $display("✅ Loaded %0d test patterns from golden.dat", i);
    end

    // ---------- feed input ----------
    integer j;
    initial begin
        valid = 0;
        num1  = 0;
        num2  = 0;
        wait(rstn == 1);
        // Drive on the falling edge so the DUT samples stable data.
        for (j = 0; j < i; j = j + 1) begin
            @(negedge clk);
            valid <= 1;
            num1  <= num1_mem[j];
            num2  <= num2_mem[j];
        end
        // flush
        @(negedge clk);
        valid <= 0;
        num1  <= 0;
        num2  <= 0;
    end

    //-----prevent hang
    integer timeout = (100000);
    initial begin
        while (timeout > 0) begin
            @(posedge clk);
            timeout = timeout - 1;
        end
        $display($time, "Simualtion Hang ....");
        $finish;
    end

    // ---------- check result ----------
    integer k = 0;
    integer errors = 0;
    initial begin
        wait(rstn == 1);
        // Only the i loaded patterns are checked, and only on cycles where
        // the DUT flags its output with `ready`. `!==` makes X/Z bits count
        // as a mismatch instead of silently passing.
        while (k < i) begin
            @(posedge clk);
            if (ready) begin
                if (result !== golden_result[k]) begin
                    $display("❌ [Mismatch] Pattern %0d: result = %h, expected = %h",
                             k, result, golden_result[k]);
                    errors = errors + 1;
                end else begin
                    $display("✅ [Match] Pattern %0d: result = %h, expected = %h", k, result, golden_result[k]);
                end
                k = k + 1;
            end
        end
        $display("🎯 Test completed: %0d errors out of %0d patterns", errors, i);
        if (i == 0)
            $display("⚠️ No test patterns were loaded, nothing was checked.");
        else if (errors == 0)
            $display("🎉 ALL PASS!");
        else
            $display("⚠️ Some mismatches found. Please check your design.");
        repeat (10) @(posedge clk);
        $finish;
    end

    initial begin
        $dumpfile("mul.vcd");
        $dumpvars(0, FLOAT_MUL_tb);
    end
endmodule //IEEE754_64bit_mul_tb
```
:::


:::spoiler cmul.v
```verilog=
`include "FLOAT_ADD.v"
`include "FLOAT_MUL.v"
// COMPLEX_MUL: pipelined double-precision complex multiplier built from
// three FLOAT_MUL and five FLOAT_ADD instances (3-multiplication scheme).
//
// Data layout of num1, num2 and result: {real[63:0], imag[63:0]}.
//   num1 = Xr + jXi, num2 = Yr + jYi, result = Zr + jZi with
//     Zr = Xr*(Yr - Yi) + Yi*(Xr - Xi)   (= Xr*Yr - Xi*Yi)
//     Zi = Xi*(Yr + Yi) + Yi*(Xr - Xi)   (= Xi*Yr + Xr*Yi)
//
// valid : qualifies num1/num2
// ready : high when both halves of `result` are flagged ready by the
//         final adders
module COMPLEX_MUL #(
    parameter pDATA_WIDTH=128
)(
    input  [(pDATA_WIDTH-1):0] num1, // Xr(64bit) + jXi(64bit)
    input  [(pDATA_WIDTH-1):0] num2, // Yr(64bit) + jYi(64bit)
    input                      clk,
    input                      rstn,
    output [(pDATA_WIDTH-1):0] result,
    input                      valid,
    output                     ready
);
    //-----operand delay line-----//
    // The stage-2 multipliers need Xr, Xi and Yi at the moment the stage-1
    // adders deliver their sums, so the raw operands are delayed by eight
    // registers, one per FLOAT_ADD pipeline stage.
    // Declared ahead of the instances below: an identifier has to be
    // declared before it is referenced in a port connection.
    reg [127:0] pip1_num1, pip2_num1, pip3_num1, pip4_num1;
    reg [127:0] pip5_num1, pip6_num1, pip7_num1, pip8_num1;
    reg [63:0]  pip1_num2, pip2_num2, pip3_num2, pip4_num2;
    reg [63:0]  pip5_num2, pip6_num2, pip7_num2, pip8_num2;

    always @(posedge clk or negedge rstn) begin
        if (!rstn) begin
            pip1_num1 <= 0;  pip1_num2 <= 0;
            pip2_num1 <= 0;  pip2_num2 <= 0;
            pip3_num1 <= 0;  pip3_num2 <= 0;
            pip4_num1 <= 0;  pip4_num2 <= 0;
            pip5_num1 <= 0;  pip5_num2 <= 0;
            pip6_num1 <= 0;  pip6_num2 <= 0;
            pip7_num1 <= 0;  pip7_num2 <= 0;
            pip8_num1 <= 0;  pip8_num2 <= 0;
        end else begin
            pip1_num1 <= num1;       pip1_num2 <= num2[63:0]; // only Yi is needed later
            pip2_num1 <= pip1_num1;  pip2_num2 <= pip1_num2;
            pip3_num1 <= pip2_num1;  pip3_num2 <= pip2_num2;
            pip4_num1 <= pip3_num1;  pip4_num2 <= pip3_num2;
            pip5_num1 <= pip4_num1;  pip5_num2 <= pip4_num2;
            pip6_num1 <= pip5_num1;  pip6_num2 <= pip5_num2;
            pip7_num1 <= pip6_num1;  pip7_num2 <= pip6_num2;
            pip8_num1 <= pip7_num1;  pip8_num2 <= pip7_num2;
        end
    end
    //-----operand delay line-----//

    //-----stage 1-----//
    // 8 cycle: the three sums / differences feeding the multipliers.
    // Negation is done by flipping the IEEE 754 sign bit.
    wire [63:0] negative_num1i = {~num1[63], num1[62:0]}; // -Xi
    wire [63:0] negative_num2i = {~num2[63], num2[62:0]}; // -Yi
    wire [63:0] stage1_add1_result;
    wire [63:0] stage1_add2_result;
    wire [63:0] stage1_add3_result;
    wire        stage1_add1_ready;
    wire        stage1_add2_ready;
    wire        stage1_add3_ready;

    // Yr + Yi
    FLOAT_ADD stage1_add1(
        .num1(num2[127:64]),
        .num2(num2[63:0]),
        .clk(clk),
        .rstn(rstn),
        .result(stage1_add1_result),
        .valid(valid),
        .ready(stage1_add1_ready)
    );
    // Xr - Xi
    FLOAT_ADD stage1_add2(
        .num1(num1[127:64]),
        .num2(negative_num1i),
        .clk(clk),
        .rstn(rstn),
        .result(stage1_add2_result),
        .valid(valid),
        .ready(stage1_add2_ready)
    );
    // Yr - Yi
    FLOAT_ADD stage1_add3(
        .num1(num2[127:64]),
        .num2(negative_num2i),
        .clk(clk),
        .rstn(rstn),
        .result(stage1_add3_result),
        .valid(valid),
        .ready(stage1_add3_ready)
    );
    //-----stage 1-----//

    //-----stage 2-----//
    // 14 cycle: the three real multiplications.
    wire [63:0] stage2_mul1_result;
    wire [63:0] stage2_mul2_result;
    wire [63:0] stage2_mul3_result;
    wire        stage2_mul1_ready;
    wire        stage2_mul2_ready;
    wire        stage2_mul3_ready;

    // Xi * (Yr + Yi)
    FLOAT_MUL stage2_mul1(
        .num1(pip8_num1[63:0]),
        .num2(stage1_add1_result),
        .clk(clk),
        .rstn(rstn),
        .result(stage2_mul1_result),
        .valid(stage1_add1_ready),
        .ready(stage2_mul1_ready)
    );
    // Yi * (Xr - Xi)
    FLOAT_MUL stage2_mul2(
        .num1(pip8_num2[63:0]),
        .num2(stage1_add2_result),
        .clk(clk),
        .rstn(rstn),
        .result(stage2_mul2_result),
        .valid(stage1_add2_ready),
        .ready(stage2_mul2_ready)
    );
    // Xr * (Yr - Yi)
    FLOAT_MUL stage2_mul3(
        .num1(pip8_num1[127:64]),
        .num2(stage1_add3_result),
        .clk(clk),
        .rstn(rstn),
        .result(stage2_mul3_result),
        .valid(stage1_add3_ready),
        .ready(stage2_mul3_ready)
    );
    //-----stage 2-----//

    //-----stage 3-----//
    // 8 cycle: combine the products into the imaginary and real parts.
    wire stage3_add1_ready;
    wire stage3_add2_ready;
    wire stage2_1_ready = stage2_mul1_ready & stage2_mul2_ready;
    wire stage2_2_ready = stage2_mul2_ready & stage2_mul3_ready;
    assign ready = stage3_add1_ready & stage3_add2_ready;

    // Zi = Xi * (Yr + Yi) + Yi * (Xr - Xi)
    FLOAT_ADD stage3_add1(
        .num1(stage2_mul1_result),
        .num2(stage2_mul2_result),
        .clk(clk),
        .rstn(rstn),
        .result(result[63:0]),
        .valid(stage2_1_ready),
        .ready(stage3_add1_ready)
    );
    // Zr = Xr * (Yr - Yi) + Yi * (Xr - Xi)
    FLOAT_ADD stage3_add2(
        .num1(stage2_mul3_result),
        .num2(stage2_mul2_result),
        .clk(clk),
        .rstn(rstn),
        .result(result[127:64]),
        .valid(stage2_2_ready),
        .ready(stage3_add2_ready)
    );
    //-----stage 3-----//
endmodule //COMPLEX_MUL
```
:::
:::spoiler cmul_tb.v
```verilog=
// Author: Jesse & Hsuan
`timescale 1ns / 1ps
`include "COMPLEX_MUL.v"
// Self-checking testbench for COMPLEX_MUL.
// Loads up to N test vectors (Xr Xi Yr Yi Zr Zi, hexadecimal, one vector per
// line) from ./py/golden.dat, streams one vector per clock into the DUT and,
// after a fixed wait of LATENCY clocks, compares `result` against the golden
// value on every clock.
module COMPLEX_MUL_tb ();
    localparam BIT     = 128;
    localparam N       = 1000; // capacity of the pattern memories
    localparam LATENCY = 24;   // clocks waited after reset release before sampling starts

    reg                clk;
    reg                rstn;
    reg  [(BIT-1):0]   num1;
    reg  [(BIT-1):0]   num2;
    wire [(BIT-1):0]   result;
    reg                valid;
    wire               ready;

    COMPLEX_MUL uut (
        .num1(num1),
        .num2(num2),
        .clk(clk),
        .rstn(rstn),
        .result(result),
        .valid(valid),
        .ready(ready)
    );

    //-----clock generate-----//
    initial begin
        clk = 0;
        forever #5 clk = ~clk;
    end

    //-----reset-----//
    initial begin
        rstn = 0;
        repeat (2) @(posedge clk);
        rstn = 1;
    end

    // ---------- memory ---------- //
    reg [127:0] num1_mem [0:N-1];
    reg [127:0] num2_mem [0:N-1];
    reg [127:0] golden_result [0:N-1];
    integer fd, i = 0, r;
    reg [63:0] Xr, Xi, Yr, Yi, Zr, Zi;
    reg [8*512-1:0] junk_line; // scratch buffer used to throw away a malformed line

    // ---------- file reading ---------- //
    // Runs entirely at time 0, so `i` (number of loaded vectors) is final
    // before the stimulus and checker processes get past their reset wait.
    initial begin
        fd = $fopen("./py/golden.dat", "r");
        if (!fd) begin
            $display("❌ Error: Cannot open golden.dat");
            $finish;
        end
        // Stop at N vectors so the pattern memories are never indexed out of range.
        while (!$feof(fd) && (i < N)) begin
            r = $fscanf(fd, "%h %h %h %h %h %h\n", Xr, Xi, Yr, Yi, Zr, Zi);
            if (r == 6) begin
                num1_mem[i] = {Xr, Xi};
                num2_mem[i] = {Yr, Yi};
                golden_result[i] = {Zr, Zi};
                i = i + 1;
            end else begin
                $display("⚠️ Warning: Invalid line format at index %0d", i);
                // A failed conversion leaves the offending text unread; without
                // discarding it the loop would retry the same characters forever
                // at time 0, where the clock-based watchdog cannot fire.
                r = $fgets(junk_line, fd);
            end
        end
        if (!$feof(fd))
            $display("⚠️ Warning: golden.dat holds more than %0d patterns, extra lines ignored", N);
        $fclose(fd);
        $display("✅ Loaded %0d test patterns from golden.dat", i);
    end

    // ---------- feed input ---------- //
    // One vector per clock, driven on the falling edge; valid is dropped and
    // the operands are zeroed after the last vector.
    integer j;
    initial begin
        valid = 0;
        wait(rstn == 1);
        for (j = 0; j < i; j = j + 1) begin
            @(negedge clk);
            valid <= 1;
            num1 <= num1_mem[j];
            num2 <= num2_mem[j];
        end
        @(negedge clk);
        valid <= 0;
        num1 <= 0;
        num2 <= 0;
    end

    // ---------- timeout prevent ---------- //
    integer timeout = 100000;
    initial begin
        while (timeout > 0) begin
            @(posedge clk);
            timeout = timeout - 1;
        end
        $display($time, " Simulation Hang ....");
        $finish;
    end

    // ---------- check result ---------- //
    integer k = 0;
    integer errors = 0;
    //localparam MASK = 127 << 0 | 127 << 64;
    //use to mask pattern : if ((result|MASK) !== (golden_result[k]|MASK))
    initial begin
        wait(rstn == 1);
        repeat (LATENCY) @(posedge clk); // wait for pipeline latency
        while (k < i) begin
            @(posedge clk);
            // Case inequality: an X/Z anywhere in result is reported as a mismatch.
            if ((result) !== (golden_result[k])) begin
                $display("❌ [Mismatch] Pattern %0d:", k+1);
                $display("result = %h", result);
                $display("expected = %h", golden_result[k]);
                errors = errors + 1;
            end else begin
                $display("✅ [Match] Pattern %0d: result = %h", k+1, result);
            end
            k = k + 1;
        end
        $display("🎯 Test completed: %0d errors out of %0d patterns", errors, i);
        if (errors == 0)
            $display("🎉 ALL PASS!");
        else
            $display("⚠️ Some mismatches found. Please check your design.");
        repeat (10) @(posedge clk);
        $finish;
    end

    // ---------- waveform dump ---------- //
    initial begin
        $dumpfile("complex_mul.vcd");
        $dumpvars(0, COMPLEX_MUL_tb);
    end
endmodule
```
:::


Since the old version above cannot efficiently share resources with the NTT Montgomery multiplication, we use a different architecture, **array multiplication** built from **Wallace multipliers**, to get better timing and resource sharing, as follows.
## Design Methods
### Array multiplication
In FFT operation, we use floating point multiplication(add exponent, calculate sign, multiply significand(53bit $\times$ 53bit), rounding, ...etc).
In NTT operation, we want a way, in the world of integers, to give integers properties similar to those of complex numbers. That way is the **mod operation**: we multiply two numbers (16bit $\times$ 16bit) using Montgomery multiplication.
Since both of them spend a lot of resources on multiplication, we use array multiplication to implement the resource sharing.

As shown in the figure above, we divide the 64bit $\times$ 64bit multiplication into 16 parts, each of which is a 16 $\times$ 16 bit Wallace multiplier (for better timing). Then, when we do the NTT integer multiplication:
$A3(\text{16bit}), A2(\text{16bit}), A1(\text{16bit}), A0(\text{16bit}) \times B3(\text{16bit}), B2(\text{16bit}), B1(\text{16bit}), B0(\text{16bit})$
$=A3B3(\text{32bit}), A2B2(\text{32bit}), A1B1(\text{32bit}), A0B0(\text{32bit})$
This is the purple part in the figure (note that the 32bit results have not gone through the mod operation yet).
For FFT, since there is a 53bit $\times$ 53bit multiplication, we need all of the results from the small Wallace multipliers, 16 $\times$ 32bit results in total. We feed them into the next, bigger Wallace tree to add them up, obtain the result of the significand multiplication, and then do the rounding.
## FFT new version complex multiplier(Hsuan)

[link: resource sharing BPE](https://github.com/JJ-best/SOC/tree/master/Spring_Final_Project/Operator/fiFFNTT_mul/1.RTL_simulation)
This is not my part: Hsuan did the complex multiplication in the resource-sharing architecture and wrote the testbenches for the complex multiplication and the integer multiplication.
## NTT montgomery multiplier(Jesse)

[link: resource sharing BPE](https://github.com/JJ-best/SOC/tree/master/Spring_Final_Project/Operator/fiFFNTT_mul/1.RTL_simulation)
The montgomery multiplication can be shown as following:
$Monty(X, Y) = \frac{X \times Y}{R} \text{ mod } Q$
Since this needs a division, which costs more resources and hurts timing, we use a trick that gives the same result with no division, only a shift (here $T = X \times Y$):
$Monty(X, Y) = \frac{T+W}{R}\text{ mod } Q$
$W = (TQ^{-1}\text{ mod } R) \times Q$
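$Q^{-1}=12287=0x2FFF \quad Q=12289=0x3001 \quad R=2^{16}$
(Here $Q^{-1}$ denotes the negated inverse used by Montgomery reduction: $Q \cdot Q^{-1} = 12289 \times 12287 \equiv -1 \pmod R$, so $T + W \equiv T - T \equiv 0 \pmod R$ and the division by $R$ is an exact 16bit right shift.)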
To implement the mod operation, I use the following design method. Since the right two multiplication arrays (in the figure above) give 8 $\times$ 32bit results (purple part, $T$), we want to use these results to implement $(TQ^{-1}\text{ mod } R)$.
$T$ is 32bit and $Q^{-1}$ is 16bit, so the multiplication can be written as (16+16bit) $\times$ 16bit, which seems to need 2 small array blocks (16bit Wallace multipliers) in total. However, the $\text{ mod } R$ operation takes the remainder modulo $R=2^{16}=0x10000$, so we can write $(TQ^{-1}\text{ mod } R)$ as $(TQ^{-1} \mathbin{\&} 0xFFFF)$, which depends only on the low 16 bits of $T$.

That is, we only pass half of each result from the right two multiplication arrays (the low 16 bits of the 32bit $T$), so that a single small 16bit Wallace multiplier is enough to implement $(TQ^{-1}\text{ mod } R)$.
Next we need to get $W$ by $(TQ^{-1}\text{ mod } R)\times Q$. When the mode signal selects NTT, the data path of the left array changes as in the following figure: row 0 and row 1 perform the operation discussed above, and row 2 and row 3 multiply by $Q$ to get $W$.
To do the $T+W$ operation in the ii architecture, the result $T$ coming from the right two arrays needs to be put into pipeline registers that have the same latency as the left array multiplier, so that their outputs are added at the same time at the CLA33 level. Next, $Q$ is subtracted at the CLA17 level (a 16bit signed addition needs an extra sign bit); if the result is negative, the result from the CLA33 level is taken instead.

:::spoiler mul_array_ntt.v
```verilog=
// 4x4 array of 16x16-bit multipliers (mul_16) shared between two modes.
//
// Cell (row r, column c) is instance mul_16_<r><c>; its product is exported on
// port result_<c><r> (first digit = column, second digit = row).
//
//   mode == C_MUL  (0): cell (r,c) multiplies in_A[16r +: 16] by in_B0[16c +: 16];
//                       every cell is started by in_valid.
//   mode == INT_MUL(1): rows 0/1 multiply the constant Q01 by the 16-bit slices
//                       of in_B0 / in_B1. The low 16 bits of each of those
//                       products (and its out_valid) are registered for one
//                       clock and fed to the cell two rows below in the same
//                       column, which multiplies them by the constant Q.
//
// out_valid is the out_valid of cell (row 2, column 0).
module mul16_array_ntt #(
    parameter pDi_WIDTH = 64 ,
    parameter pDo_WIDTH = 32
)
(
    input[(pDi_WIDTH-1):0]  in_A,   // C_MUL: one 16-bit slice per row; not used when mode == INT_MUL
    input[(pDi_WIDTH-1):0]  in_B0,  // C_MUL: one 16-bit slice per column; INT_MUL: operands of row 0
    input[(pDi_WIDTH-1):0]  in_B1,  // INT_MUL: operands of row 1; not used when mode == C_MUL
    input                   clk,
    input                   rst_n,  // asynchronous, active low
    input                   mode,   // 0: C_MUL, 1: INT_MUL
    input                   in_valid,
    output                  out_valid,
    //-------- result from mul_16 ---------//
    output[(pDo_WIDTH-1):0] result_00,
    output[(pDo_WIDTH-1):0] result_01,
    output[(pDo_WIDTH-1):0] result_02,
    output[(pDo_WIDTH-1):0] result_03,
    output[(pDo_WIDTH-1):0] result_10,
    output[(pDo_WIDTH-1):0] result_11,
    output[(pDo_WIDTH-1):0] result_12,
    output[(pDo_WIDTH-1):0] result_13,
    output[(pDo_WIDTH-1):0] result_20,
    output[(pDo_WIDTH-1):0] result_21,
    output[(pDo_WIDTH-1):0] result_22,
    output[(pDo_WIDTH-1):0] result_23,
    output[(pDo_WIDTH-1):0] result_30,
    output[(pDo_WIDTH-1):0] result_31,
    output[(pDo_WIDTH-1):0] result_32,
    output[(pDo_WIDTH-1):0] result_33
);
    //========================================================================//
    localparam pMUL_WIDTH = 16;
    localparam C_MUL      = 1'b0;
    localparam INT_MUL    = 1'b1;
    localparam Q01        = 16'h2FFF; // 12287; Q * Q01 = -1 (mod 2^16)
    localparam Q          = 16'h3001; // 12289
    //========================================================================//
    wire ntt_mode = (mode == INT_MUL);

    // Flat cell index used below: idx = 4*row + col.
    wire [(pMUL_WIDTH-1):0] row_a      [0:3];  // operand A, shared by a whole row
    wire [(pMUL_WIDTH-1):0] cell_b     [0:15]; // operand B of every cell
    wire                    cell_valid [8:15]; // start strobe of rows 2 and 3
    wire [(pDo_WIDTH-1):0]  prod       [0:15]; // product of every cell
    wire                    prod_valid [0:15]; // out_valid of every cell
    reg  [(pMUL_WIDTH-1):0] fb_data    [0:7];  // registered low half of rows 0/1 products
    reg                     fb_valid   [0:7];  // registered out_valid of rows 0/1

    //-------------------------- operand A, per row --------------------------//
    assign row_a[0] = ntt_mode ? Q01 : in_A[(pMUL_WIDTH*0) +: pMUL_WIDTH];
    assign row_a[1] = ntt_mode ? Q01 : in_A[(pMUL_WIDTH*1) +: pMUL_WIDTH];
    assign row_a[2] = ntt_mode ? Q   : in_A[(pMUL_WIDTH*2) +: pMUL_WIDTH];
    assign row_a[3] = ntt_mode ? Q   : in_A[(pMUL_WIDTH*3) +: pMUL_WIDTH];

    //--------------- operand B and start strobe, per column ----------------//
    genvar c;
    generate
        for (c = 0; c < 4; c = c + 1) begin : GEN_COL
            wire [(pMUL_WIDTH-1):0] b0_slice = in_B0[(pMUL_WIDTH*c) +: pMUL_WIDTH];
            wire [(pMUL_WIDTH-1):0] b1_slice = in_B1[(pMUL_WIDTH*c) +: pMUL_WIDTH];

            // Row 0 takes the in_B0 slice in both modes.
            assign cell_b[0  + c] = b0_slice;
            assign cell_b[4  + c] = ntt_mode ? b1_slice      : b0_slice;
            // Rows 2/3 are chained behind rows 0/1 of the same column in INT_MUL mode.
            assign cell_b[8  + c] = ntt_mode ? fb_data[0 + c] : b0_slice;
            assign cell_b[12 + c] = ntt_mode ? fb_data[4 + c] : b0_slice;

            assign cell_valid[8  + c] = ntt_mode ? fb_valid[0 + c] : in_valid;
            assign cell_valid[12 + c] = ntt_mode ? fb_valid[4 + c] : in_valid;
        end
    endgenerate

    //---------- one-clock feedback registers behind rows 0 and 1 -----------//
    genvar k;
    generate
        for (k = 0; k < 8; k = k + 1) begin : GEN_FB
            always @(posedge clk or negedge rst_n) begin
                if (!rst_n) begin
                    fb_data[k]  <= {pMUL_WIDTH{1'b0}};
                    fb_valid[k] <= 1'b0;
                end else begin
                    // Only the low 16 bits of the product are kept.
                    fb_data[k]  <= prod[k][(pMUL_WIDTH-1):0];
                    fb_valid[k] <= prod_valid[k];
                end
            end
        end
    endgenerate

    //----------------------------- multipliers ------------------------------//
    // row 0
    mul_16 mul_16_00 (.in_a(row_a[0]), .in_b(cell_b[0]),  .in_valid(in_valid),       .out_valid(prod_valid[0]),  .result(prod[0]),  .clk(clk), .rst_n(rst_n));
    mul_16 mul_16_01 (.in_a(row_a[0]), .in_b(cell_b[1]),  .in_valid(in_valid),       .out_valid(prod_valid[1]),  .result(prod[1]),  .clk(clk), .rst_n(rst_n));
    mul_16 mul_16_02 (.in_a(row_a[0]), .in_b(cell_b[2]),  .in_valid(in_valid),       .out_valid(prod_valid[2]),  .result(prod[2]),  .clk(clk), .rst_n(rst_n));
    mul_16 mul_16_03 (.in_a(row_a[0]), .in_b(cell_b[3]),  .in_valid(in_valid),       .out_valid(prod_valid[3]),  .result(prod[3]),  .clk(clk), .rst_n(rst_n));
    // row 1
    mul_16 mul_16_10 (.in_a(row_a[1]), .in_b(cell_b[4]),  .in_valid(in_valid),       .out_valid(prod_valid[4]),  .result(prod[4]),  .clk(clk), .rst_n(rst_n));
    mul_16 mul_16_11 (.in_a(row_a[1]), .in_b(cell_b[5]),  .in_valid(in_valid),       .out_valid(prod_valid[5]),  .result(prod[5]),  .clk(clk), .rst_n(rst_n));
    mul_16 mul_16_12 (.in_a(row_a[1]), .in_b(cell_b[6]),  .in_valid(in_valid),       .out_valid(prod_valid[6]),  .result(prod[6]),  .clk(clk), .rst_n(rst_n));
    mul_16 mul_16_13 (.in_a(row_a[1]), .in_b(cell_b[7]),  .in_valid(in_valid),       .out_valid(prod_valid[7]),  .result(prod[7]),  .clk(clk), .rst_n(rst_n));
    // row 2
    mul_16 mul_16_20 (.in_a(row_a[2]), .in_b(cell_b[8]),  .in_valid(cell_valid[8]),  .out_valid(prod_valid[8]),  .result(prod[8]),  .clk(clk), .rst_n(rst_n));
    mul_16 mul_16_21 (.in_a(row_a[2]), .in_b(cell_b[9]),  .in_valid(cell_valid[9]),  .out_valid(prod_valid[9]),  .result(prod[9]),  .clk(clk), .rst_n(rst_n));
    mul_16 mul_16_22 (.in_a(row_a[2]), .in_b(cell_b[10]), .in_valid(cell_valid[10]), .out_valid(prod_valid[10]), .result(prod[10]), .clk(clk), .rst_n(rst_n));
    mul_16 mul_16_23 (.in_a(row_a[2]), .in_b(cell_b[11]), .in_valid(cell_valid[11]), .out_valid(prod_valid[11]), .result(prod[11]), .clk(clk), .rst_n(rst_n));
    // row 3
    mul_16 mul_16_30 (.in_a(row_a[3]), .in_b(cell_b[12]), .in_valid(cell_valid[12]), .out_valid(prod_valid[12]), .result(prod[12]), .clk(clk), .rst_n(rst_n));
    mul_16 mul_16_31 (.in_a(row_a[3]), .in_b(cell_b[13]), .in_valid(cell_valid[13]), .out_valid(prod_valid[13]), .result(prod[13]), .clk(clk), .rst_n(rst_n));
    mul_16 mul_16_32 (.in_a(row_a[3]), .in_b(cell_b[14]), .in_valid(cell_valid[14]), .out_valid(prod_valid[14]), .result(prod[14]), .clk(clk), .rst_n(rst_n));
    mul_16 mul_16_33 (.in_a(row_a[3]), .in_b(cell_b[15]), .in_valid(cell_valid[15]), .out_valid(prod_valid[15]), .result(prod[15]), .clk(clk), .rst_n(rst_n));

    //------------------- outputs: result_<col><row> -------------------------//
    assign result_00 = prod[0];   // (row 0, col 0)
    assign result_01 = prod[4];   // (row 1, col 0)
    assign result_02 = prod[8];   // (row 2, col 0)
    assign result_03 = prod[12];  // (row 3, col 0)
    assign result_10 = prod[1];   // (row 0, col 1)
    assign result_11 = prod[5];   // (row 1, col 1)
    assign result_12 = prod[9];   // (row 2, col 1)
    assign result_13 = prod[13];  // (row 3, col 1)
    assign result_20 = prod[2];   // (row 0, col 2)
    assign result_21 = prod[6];   // (row 1, col 2)
    assign result_22 = prod[10];  // (row 2, col 2)
    assign result_23 = prod[14];  // (row 3, col 2)
    assign result_30 = prod[3];   // (row 0, col 3)
    assign result_31 = prod[7];   // (row 1, col 3)
    assign result_32 = prod[11];  // (row 2, col 3)
    assign result_33 = prod[15];  // (row 3, col 3)

    // Row 2 finishes last in INT_MUL mode, so its out_valid marks the array output.
    assign out_valid = prod_valid[8];
endmodule
```
:::
:::spoiler mul16_array_ntt_tb.v
```verilog=
`include "mul_16.v"
`include "CLA8.v"
`include "mul16_array_ntt.v"
// tb_mul16_array.v
`timescale 1ns/1ps
`define CLK_PER 10 // 100 MHz
// Testbench for mul16_array_ntt in INT_MUL mode (mode tied to 1'b1).
// memB drives in_B0 and memA drives in_B1; in_A is tied to zero. Starting on
// the first falling clock edge after out_valid rises, the products of rows 2
// and 3 are compared with one golden word per clock.
module tb_mul16_array;
    reg clk = 0;
    always #(`CLK_PER/2) clk = ~clk;
    reg rst_n;

    // ==================== Pattern ROM ==========================
    parameter PAT = 1000;
    // Watchdog budget in clocks; scales with PAT so overriding PAT stays safe.
    localparam TIMEOUT_CYCLES = 10 * PAT + 1000;

    reg [63:0]  memA   [0:PAT-1];
    reg [63:0]  memB   [0:PAT-1];
    reg [255:0] golden [0:PAT-1];
    initial begin
        $readmemh("./py/A64.dat"   , memA);
        $readmemh("./py/B64.dat"   , memB);
        $readmemh("./py/golden.dat", golden);
        //$display("golden[0]:%h", golden[0]);
    end

    reg         in_valid;
    reg  [63:0] in_A, in_B;
    wire        out_valid;
    wire [31:0] result_00, result_01, result_02, result_03;
    wire [31:0] result_10, result_11, result_12, result_13;
    wire [31:0] result_20, result_21, result_22, result_23;
    wire [31:0] result_30, result_31, result_32, result_33;

    mul16_array_ntt dut(
        .clk(clk), .rst_n(rst_n),
        .in_valid(in_valid), .in_A(64'b0), .in_B0(in_B), .in_B1(in_A),
        .out_valid(out_valid), .mode(1'b1),
        .result_00(result_00), .result_01(result_01),
        .result_02(result_02), .result_03(result_03),
        .result_10(result_10), .result_11(result_11),
        .result_12(result_12), .result_13(result_13),
        .result_20(result_20), .result_21(result_21),
        .result_22(result_22), .result_23(result_23),
        .result_30(result_30), .result_31(result_31),
        .result_32(result_32), .result_33(result_33)
    );

    // All 16 products; not compared, kept for waveform inspection.
    wire [511:0] dut_pack = {
        result_33, result_32, result_31, result_30,
        result_23, result_22, result_21, result_20,
        result_13, result_12, result_11, result_10,
        result_03, result_02, result_01, result_00
    };
    // wire [255:0] dnt_pack = {
    //     result_33, result_23, result_13, result_03, result_32, result_22, result_12, result_02
    // };
    // Compared word: result_x2 outputs (columns 3..0) in the upper half,
    // result_x3 outputs (columns 3..0) in the lower half.
    wire [255:0] dnt_pack = {
        result_32, result_22, result_12, result_02, result_33, result_23, result_13, result_03
    };

    integer err = 0;
    integer i;

    // ---------- stimulus ---------- //
    initial begin
        $dumpfile("mul16_array_ntt_tb.vcd");
        $dumpvars(0, tb_mul16_array);
        // reset
        rst_n = 0; in_valid = 0; in_A = 0; in_B = 0;
        repeat (5) @(posedge clk);
        rst_n = 1; repeat (2) @(posedge clk);
        for (i = 0; i < PAT; i = i + 1) begin
            // ---- drive one pattern per clock ----
            @(posedge clk);
            in_A     <= memA[i];
            in_B     <= memB[i];
            in_valid <= 1'b1;
        end
    end

    // ---------- watchdog ---------- //
    // Without this, a DUT that never raises out_valid leaves the checker
    // blocked in wait(out_valid) while the free-running clock keeps the
    // simulation alive forever.
    integer wd;
    initial begin
        for (wd = 0; wd < TIMEOUT_CYCLES; wd = wd + 1)
            @(posedge clk);
        $display($time, " Simulation Hang ....");
        $finish;
    end

    // ---------- checker ---------- //
    integer j=0;
    initial begin
        wait(rst_n);
        wait(out_valid);
        @(negedge clk);
        for (j = 0; j < PAT; j = j + 1) begin
            // Logical equality: an X/Z in either operand yields X and takes the FAIL branch.
            if (dnt_pack == golden[j]) begin
                $display("[PASS] pat=%0d", j);
                $display("exp=%h", golden[j]);
                $display("get=%h", dnt_pack);
            end else begin
                $display("[FAIL] pat=%0d", j);
                $display("exp=%h", golden[j]);
                $display("get=%h", dnt_pack);
                err = err + 1;
            end
            @(negedge clk);
        end
        if (err == 0)
            $display("=========== ALL %0d PATTERN PASS ===========", PAT);
        else
            $display("**** %0d mismatches out of %0d ****", err, PAT);
        repeat (30) @(posedge clk);
        $finish;
    end
endmodule
```
:::


As the waveform shows, the results of row 2 and row 3 come out later.
The whole FFT complex multiplication and NTT Montgomery multiplication is as follows. Please click the link (since this HackMD note is too long, the whole code can't be pasted here).
[FFT+NTT multiplier github code](https://github.com/JJ-best/SOC/blob/master/Spring_Final_Project/Operator/fiFFNTT_mul/1.RTL_simulation/hdl/mul.v)





## Expected score
I think I can get 93 points in special research this semester, because Hsuan and I started working hard on the coding several weeks before finals week.