# Final Project (IOP)
[Code for final project IOP](https://github.com/Raywang908/final_iop)
## fiFFNTT.v
### Axi-lite
#### waveform
![axi-lite](https://hackmd.io/_uploads/B1lvKp0zex.png)
#### Spec
| Address | Bit | Name         | Description                                                                 |
|---------|-----|--------------|-----------------------------------------------------------------------------|
| 0x00    | [0] | ap_done      | Set by the kernel when the last data is transferred; reset when address 0x00 is read. |
|         | [1] | ap_idle      | Reset by metadata and set when the last data is transferred.                |
| 0x10    | -   | coef_done    | 1: coef has been streamed in; 0: waiting for coef to be streamed in.        |

* axi-write is not used for now.
### Code View
<details>
<summary>Verilog Code for fiFFNTT.v</summary>

```verilog
// forward / inverse FFT & NTT
//
// Top-level wrapper. It contains
//   * a minimal AXI-Lite slave (the read channel returns the status words
//     produced by the IOP; the write channel is acknowledged but ignored),
//   * the IOP (stage_top), which owns the AXI-Stream in/out ports,
//   * four kernel instances connected to the IOP load/store buses.
module fiFFNTT #(
    parameter pADDR_WIDTH = 32,
    parameter pDATA_WIDTH = 32,
    parameter pIOPS_WIDTH = 128
) (
    input  wire                       clk,
    input  wire                       rstn,

    // AXI-Lite write channel (not used for now)
    output wire                       awready,
    output wire                       wready,
    input  wire                       awvalid,
    input  wire [(pADDR_WIDTH-1):0]   awaddr,
    input  wire                       wvalid,
    input  wire [(pDATA_WIDTH-1):0]   wdata,

    // AXI-Lite read channel
    output wire                       arready,
    input  wire                       rready,
    input  wire                       arvalid,
    input  wire [(pADDR_WIDTH-1):0]   araddr,
    output wire                       rvalid,
    output wire [(pDATA_WIDTH-1):0]   rdata,

    // AXI-Stream slave (data / metadata in)
    input  wire                       ss_tvalid,
    input  wire [(pDATA_WIDTH-1):0]   ss_tdata,
    input  wire                       ss_tlast,
    output wire                       ss_tready,

    // AXI-Stream master (result out)
    input  wire                       sm_tready,
    output wire                       sm_tvalid,
    output wire [(pDATA_WIDTH-1):0]   sm_tdata,
    output wire                       sm_tlast
);

//========================== Declaration ==========================
// =============== axi-lite =============== //
// status words produced by the IOP
wire [31:0] ap_ctrl;
wire [31:0] coef_ctrl;

// axi write seems to be useless in the current plan
reg awready_tmp;
reg awready_next;
reg wready_tmp;
reg wready_next;

// axi read is used to read the ap_state of the kernel
// coef_done can be determined by the metadata -> dont need axi write for now
reg arready_tmp;
reg arready_next;
reg rvalid_tmp;
reg rvalid_next;
reg [(pADDR_WIDTH-1):0] araddr_tmp;
reg [(pADDR_WIDTH-1):0] araddr_next;
reg [(pDATA_WIDTH-1):0] rdata_tmp;

// telling IOP that done is read
reg  read_ap_stat_tmp;
reg  read_ap_stat_next;
wire ap_read;

// local parameter
localparam PULL_DN   = 0;      // pull down
localparam PULL_UP   = 1;
localparam AP_STAT   = 32'h00; // 0x00
localparam COEF_STAT = 32'h10; // 0x10

// =============== IOP <-> kernel buses =============== //
// Declared explicitly: an undeclared identifier in a port connection becomes
// an implicit 1-bit net, which would silently truncate the data buses.
wire                       k1_ld_vld, k2_ld_vld, k3_ld_vld, k4_ld_vld;
wire                       k1_ld_rdy, k2_ld_rdy, k3_ld_rdy, k4_ld_rdy;
wire [(pIOPS_WIDTH-1):0]   k1_ld_dat, k2_ld_dat, k3_ld_dat, k4_ld_dat;
wire                       k1_sw_vld, k2_sw_vld, k3_sw_vld, k4_sw_vld;
wire                       k1_sw_rdy, k2_sw_rdy, k3_sw_rdy, k4_sw_rdy;
wire [(pIOPS_WIDTH-1):0]   k1_sw_dat, k2_sw_dat, k3_sw_dat, k4_sw_dat;
wire [7:0]                 k1_mode,   k2_mode,   k3_mode,   k4_mode;
wire                       decode1,   decode2,   decode3,   decode4;
wire                       k1_sw_lst, k2_sw_lst, k3_sw_lst, k4_sw_lst;

//========================== Function ==========================
// =============== axi-lite =============== //
always @(posedge clk or negedge rstn) begin
    if (!rstn) begin
        awready_tmp      <= PULL_DN;
        wready_tmp       <= PULL_DN;
        arready_tmp      <= PULL_DN;
        rvalid_tmp       <= PULL_DN; // was `rvalid` (an output wire) -> rvalid_tmp was never driven
        araddr_tmp       <= PULL_DN;
        read_ap_stat_tmp <= PULL_DN;
    end else begin
        awready_tmp      <= awready_next;
        wready_tmp       <= wready_next;
        arready_tmp      <= arready_next;
        rvalid_tmp       <= rvalid_next;
        araddr_tmp       <= araddr_next;
        read_ap_stat_tmp <= read_ap_stat_next;
    end
end

always @(*) begin
    // axi write (not used for now): acknowledge address and data together
    // for one cycle; the written value is discarded.
    if (awvalid && wvalid && !wready) begin
        awready_next = PULL_UP;
        wready_next  = PULL_UP;
    end else begin
        awready_next = PULL_DN;
        wready_next  = PULL_DN;
    end

    // axi read - arready: one-cycle pulse in response to arvalid
    if (arvalid && !arready) begin
        arready_next = PULL_UP;
    end else begin
        arready_next = PULL_DN;
    end

    // axi read - rvalid: raised the cycle after arready, dropped once rready is seen
    if (arready) begin
        rvalid_next = PULL_UP;
    end else if (rready) begin
        rvalid_next = PULL_DN;
    end else begin
        rvalid_next = rvalid_tmp;
    end

    // axi read - araddr_buffer: latch while arvalid, clear after the data handshake
    if (arvalid) begin
        araddr_next = araddr;
    end else if (rready && rvalid) begin
        araddr_next = PULL_DN;
    end else begin
        araddr_next = araddr_tmp;
    end

    // determine rdata from the latched address
    if (araddr_tmp == AP_STAT) begin
        rdata_tmp = ap_ctrl;
    end else if (araddr_tmp == COEF_STAT) begin
        rdata_tmp = coef_ctrl;
    end else begin
        rdata_tmp = PULL_DN;
    end

    // read_ap_stat: one-cycle pulse after a completed read of AP_STAT
    if (araddr_tmp == AP_STAT && rready && rvalid && !read_ap_stat_tmp) begin
        read_ap_stat_next = PULL_UP;
    end else begin
        read_ap_stat_next = PULL_DN;
    end
end

// assign to port wire
assign awready = awready_tmp;
assign wready  = wready_tmp;
assign arready = arready_tmp;
assign rvalid  = rvalid_tmp;
assign rdata   = rdata_tmp;
assign ap_read = read_ap_stat_tmp;

/*================================================================================================
#                                              IOP                                               #
================================================================================================*/
// NOTE: the stage_top port names are spelled `ap_crtl` / `coef_crtl`; they are
// connected here to the local wires `ap_ctrl` / `coef_ctrl` that feed rdata.
stage_top #(
    .pDATA_WIDTH (pIOPS_WIDTH),
    .pSS_WIDTH   (pDATA_WIDTH)
) IOP (
    .clk       (clk),
    .rstn      (rstn),
    .in1_sw    (2'b00),        // not used for now; tied off instead of left floating

    .ap_crtl   (ap_ctrl),
    .coef_crtl (coef_ctrl),
    .ap_read   (ap_read),

    .ss_vld    (ss_tvalid),
    .ss_dat    (ss_tdata),
    .ss_lst    (ss_tlast),
    .ss_rdy    (ss_tready),

    .sm_rdy    (sm_tready),
    .sm_vld    (sm_tvalid),
    .sm_dat    (sm_tdata),
    .sm_lst    (sm_tlast),

    // The kernels below are clocked/reset directly from clk/rstn, so the
    // per-kernel clock/reset outputs of the IOP are left open.
    .clk1      ( ),
    .rstn1     ( ),
    .k1_ld_vld (k1_ld_vld),
    .k1_ld_rdy (k1_ld_rdy),
    .k1_ld_dat (k1_ld_dat),
    .k1_sw_vld (k1_sw_vld),
    .k1_sw_rdy (k1_sw_rdy),
    .k1_sw_dat (k1_sw_dat),
    .k1_mode   (k1_mode),
    .decode1   (decode1),
    .k1_sw_lst (k1_sw_lst),

    .clk2      ( ),
    .rstn2     ( ),
    .k2_ld_vld (k2_ld_vld),
    .k2_ld_rdy (k2_ld_rdy),
    .k2_ld_dat (k2_ld_dat),
    .k2_sw_vld (k2_sw_vld),
    .k2_sw_rdy (k2_sw_rdy),
    .k2_sw_dat (k2_sw_dat),
    .k2_mode   (k2_mode),
    .decode2   (decode2),
    .k2_sw_lst (k2_sw_lst),

    .clk3      ( ),
    .rstn3     ( ),
    .k3_ld_vld (k3_ld_vld),
    .k3_ld_rdy (k3_ld_rdy),
    .k3_ld_dat (k3_ld_dat),
    .k3_sw_vld (k3_sw_vld),
    .k3_sw_rdy (k3_sw_rdy),
    .k3_sw_dat (k3_sw_dat),
    .k3_mode   (k3_mode),
    .decode3   (decode3),
    .k3_sw_lst (k3_sw_lst),

    .clk4      ( ),
    .rstn4     ( ),
    .k4_ld_vld (k4_ld_vld),
    .k4_ld_rdy (k4_ld_rdy),
    .k4_ld_dat (k4_ld_dat),
    .k4_sw_vld (k4_sw_vld),
    .k4_sw_rdy (k4_sw_rdy),
    .k4_sw_dat (k4_sw_dat),
    .k4_mode   (k4_mode),
    .decode4   (decode4),
    .k4_sw_lst (k4_sw_lst)
);

/*================================================================================================
#                                            Kernels                                             #
================================================================================================*/
kernel #(.pDATA_WIDTH (pIOPS_WIDTH)) K1 (
    .clk    (clk),
    .rstn   (rstn),
    .ld_vld (k1_ld_vld),
    .ld_rdy (k1_ld_rdy),
    .ld_dat (k1_ld_dat),
    .sw_vld (k1_sw_vld),
    .sw_rdy (k1_sw_rdy),
    .sw_dat (k1_sw_dat),
    .mode   (k1_mode),
    .decode (decode1),
    .sw_lst (k1_sw_lst)
);

kernel #(.pDATA_WIDTH (pIOPS_WIDTH)) K2 (
    .clk    (clk),
    .rstn   (rstn),
    .ld_vld (k2_ld_vld),
    .ld_rdy (k2_ld_rdy),
    .ld_dat (k2_ld_dat),
    .sw_vld (k2_sw_vld),
    .sw_rdy (k2_sw_rdy),
    .sw_dat (k2_sw_dat),
    .mode   (k2_mode),
    .decode (decode2),
    .sw_lst (k2_sw_lst)
);

kernel #(.pDATA_WIDTH (pIOPS_WIDTH)) K3 (
    .clk    (clk),
    .rstn   (rstn),
    .ld_vld (k3_ld_vld),
    .ld_rdy (k3_ld_rdy),
    .ld_dat (k3_ld_dat),
    .sw_vld (k3_sw_vld),
    .sw_rdy (k3_sw_rdy),
    .sw_dat (k3_sw_dat),
    .mode   (k3_mode),
    .decode (decode3),
    .sw_lst (k3_sw_lst)
);

kernel #(.pDATA_WIDTH (pIOPS_WIDTH)) K4 (
    .clk    (clk),
    .rstn   (rstn),
    .ld_vld (k4_ld_vld),
    .ld_rdy (k4_ld_rdy),
    .ld_dat (k4_ld_dat),
    .sw_vld (k4_sw_vld),
    .sw_rdy (k4_sw_rdy),
    .sw_dat (k4_sw_dat),
    .mode   (k4_mode),
    .decode (decode4),
    .sw_lst (k4_sw_lst)
);

endmodule
```
</details>

## stage_top.v (IOP)
### Axi-stream
#### waveform
![iop_ss](https://hackmd.io/_uploads/Bk-CZyg7ee.png)
#### code
* I put one buffer on the input and one buffer on the output, because we don't know the latency of the DMA when it sends data.
* `pack_cnter` is used for packing four 32-bit words into one 128-bit word that is sent to the kernel.
### Metadata
#### code
* `read_meta` is a signal indicating whether the incoming data is metadata.
* `meta_cnter` counts from 1 to 1024. Here I only consider the case where every data length is 1024; when the counter reaches 1024, it is reset to 1 and `read_meta` is set to 1.
* The `data_length` field of the metadata has not been extracted yet; if needed, add `assign data_length = meta_buffer_tmp[15:0];`.
### Kernel
#### code
* If the sm_buffer is full (i.e. `sm_vld` is still high), `k_sw_rdy` will not be set to 1.
* `decode` is the signal that controls the update of the `butterfly_mode` register.
### Ap_ctrl & Coef_ctrl
#### code
* `coef_ctrl` is set to 1 when the last coef data is received.
* `ap_idle` is set to 0 when `decode_meta` goes high, and it is set back to 1 when `k_sw_lst` (the last data sent by the kernel) goes high. `k_sw_lst` is designed to rise only when the bus between the IOP and the kernel completes a handshake.
* `ap_read` is sent from `fiFFNTT.v`, indicating that the CPU is reading address `0x00`.
### address generator for tap
:::warning
This part hasn't been designed yet.
:::
### Code View
<details>
<summary>Verilog Code for stage_top.v</summary>

```verilog
// Using Deep-Feedback structure, we will have
//
// stage_top (IOP)
//   * receives pSS_WIDTH-bit words on the SS port; the first word of a
//     transfer is metadata ({dst[31:24], mode[23:16], ...}), the following
//     MAX_LEN words are data,
//   * packs four data words into one pDATA_WIDTH-bit word and offers it to
//     the kernel selected by the metadata destination,
//   * buffers one word coming back from a kernel and presents it on the SM port,
//   * maintains the ap_done / ap_idle / coef status read through AXI-Lite.
module stage_top #(
    parameter pDATA_WIDTH = 128, // two 64-bit numbers
    parameter pSS_WIDTH   = 32   // width of one SS/SM stream word
) (
    input  wire                       clk,
    input  wire                       rstn,
    input  wire [1:0]                 in1_sw,      // not used for now

    output wire [31:0]                ap_crtl,
    output wire [31:0]                coef_crtl,
    input  wire                       ap_read,

    // SS/SM interface:
    // FFT/iFFT SS: concat 4 32-bit data to 128-bit
    // FFT/iFFT SM: split 128-bit data to 32-bit
    input  wire                       ss_vld,
    input  wire [(pSS_WIDTH-1):0]     ss_dat,
    input  wire                       ss_lst,      // not used for now
    output wire                       ss_rdy,

    input  wire                       sm_rdy,
    output wire                       sm_vld,
    output wire [(pSS_WIDTH-1):0]     sm_dat,
    output wire                       sm_lst,      // not used for now

    // 1st Kernel
    output wire                       clk1,
    output wire                       rstn1,
    output wire                       k1_ld_vld,   // Stream: X[a], X[b], GM constant
    input  wire                       k1_ld_rdy,
    output wire [(pDATA_WIDTH-1):0]   k1_ld_dat,
    input  wire                       k1_sw_vld,   // Stream: X[a], X[b], GM constant//Stream-in IOP, then stream-out
    output wire                       k1_sw_rdy,
    input  wire [(pDATA_WIDTH-1):0]   k1_sw_dat,
    output wire [7:0]                 k1_mode,
    output wire                       decode1,
    input  wire                       k1_sw_lst,

    // 2nd Kernel
    output wire                       clk2,
    output wire                       rstn2,
    output wire                       k2_ld_vld,   // Stream: X[a], X[b], GM constant
    input  wire                       k2_ld_rdy,
    output wire [(pDATA_WIDTH-1):0]   k2_ld_dat,
    input  wire                       k2_sw_vld,   // Stream: X[a], X[b], GM constant//Stream-in IOP, then stream-out
    output wire                       k2_sw_rdy,
    input  wire [(pDATA_WIDTH-1):0]   k2_sw_dat,
    output wire [7:0]                 k2_mode,
    output wire                       decode2,
    input  wire                       k2_sw_lst,

    // 3rd Kernel
    output wire                       clk3,
    output wire                       rstn3,
    output wire                       k3_ld_vld,   // Stream: X[a], X[b], GM constant
    input  wire                       k3_ld_rdy,
    output wire [(pDATA_WIDTH-1):0]   k3_ld_dat,
    input  wire                       k3_sw_vld,   // Stream: X[a], X[b], GM constant//Stream-in IOP, then stream-out
    output wire                       k3_sw_rdy,
    input  wire [(pDATA_WIDTH-1):0]   k3_sw_dat,
    output wire [7:0]                 k3_mode,
    output wire                       decode3,
    input  wire                       k3_sw_lst,

    // 4th Kernel
    output wire                       clk4,
    output wire                       rstn4,
    output wire                       k4_ld_vld,   // Stream: X[a], X[b], GM constant
    input  wire                       k4_ld_rdy,
    output wire [(pDATA_WIDTH-1):0]   k4_ld_dat,
    input  wire                       k4_sw_vld,   // Stream: X[a], X[b], GM constant//Stream-in IOP, then stream-out
    output wire                       k4_sw_rdy,
    input  wire [(pDATA_WIDTH-1):0]   k4_sw_dat,
    output wire [7:0]                 k4_mode,
    output wire                       decode4,
    input  wire                       k4_sw_lst
);

//========================== Declaration ==========================
// =============== axi stream =============== //
reg ss_rdy_tmp;
reg ss_rdy_next;

// one stage pipe for receiving data from DMA
reg [(pSS_WIDTH-1):0] ss_buffer_tmp1;
reg [(pSS_WIDTH-1):0] ss_buffer_tmp2;
reg [(pSS_WIDTH-1):0] ss_buffer_tmp3;
reg [(pSS_WIDTH-1):0] ss_buffer_tmp4;
reg [(pSS_WIDTH-1):0] ss_buffer_next1;
reg [(pSS_WIDTH-1):0] ss_buffer_next2;
reg [(pSS_WIDTH-1):0] ss_buffer_next3;
reg [(pSS_WIDTH-1):0] ss_buffer_next4;
wire [(pDATA_WIDTH-1):0] ss_buffer;
wire [3:0] condition;

reg sm_vld_tmp;
reg sm_vld_next;
reg [(pDATA_WIDTH-1):0] sm_buffer_tmp;
reg [(pDATA_WIDTH-1):0] sm_buffer_next;

// cnter for packing 128 bit
reg [1:0] pack_cnter_tmp;
reg [1:0] pack_cnter_next;

// local parameter
localparam PULL_DN = 0;
localparam PULL_UP = 1;
localparam INDEX_3 = 2'b11;

// =============== metadata =============== //
// indicate if the coming data is metadata
reg read_meta_tmp;
reg read_meta_next;
// indicate the cycle for decoding
reg decode_meta_tmp;
reg decode_meta_next;

reg [(pSS_WIDTH-1):0] meta_buffer_tmp;
reg [(pSS_WIDTH-1):0] meta_buffer_next;

// cnter for data length
reg [10:0] meta_cnter_tmp;
reg [10:0] meta_cnter_next;

// destination & mode
wire [7:0] dst_tmp;  // destination
wire [7:0] mode_tmp;

// local parameter
localparam MAX_LEN = 1024;

// parameter for destination
localparam KERNEL_1 = 8'b00000100;
localparam KERNEL_2 = 8'b00000101;
localparam KERNEL_3 = 8'b00000110;
localparam KERNEL_4 = 8'b00000111;
localparam COEF     = 8'b00010000;

// =============== kernel =============== //
// kernel handshake
reg k1_ld_vld_tmp;
reg k1_ld_vld_next;
reg k1_sw_rdy_tmp;
reg k1_sw_rdy_next;

reg k2_ld_vld_tmp;
reg k2_ld_vld_next;
reg k2_sw_rdy_tmp;
reg k2_sw_rdy_next;

reg k3_ld_vld_tmp;
reg k3_ld_vld_next;
reg k3_sw_rdy_tmp;
reg k3_sw_rdy_next;

reg k4_ld_vld_tmp;
reg k4_ld_vld_next;
reg k4_sw_rdy_tmp;
reg k4_sw_rdy_next;

// destination
wire [15:0] status;

// local parameter for kernel mode
// need to make sure with the operator group
localparam FFT  = 8'd1;
localparam IFFT = 8'd2;
localparam NTT  = 8'd3;
localparam INTT = 8'd4;

// =============== ap_ctrl & coef_ctrl =============== //
// Each flag is stored in a 4-bit register so that the eight fields
// concatenate to the 32-bit status word; only bit 0 of each field is ever set.
reg [3:0] ap_done1_tmp;
reg [3:0] ap_done1_next;
reg [3:0] ap_done2_tmp;
reg [3:0] ap_done2_next;
reg [3:0] ap_done3_tmp;
reg [3:0] ap_done3_next;
reg [3:0] ap_done4_tmp;
reg [3:0] ap_done4_next;

reg [3:0] ap_idle1_tmp;
reg [3:0] ap_idle1_next;
reg [3:0] ap_idle2_tmp;
reg [3:0] ap_idle2_next;
reg [3:0] ap_idle3_tmp;
reg [3:0] ap_idle3_next;
reg [3:0] ap_idle4_tmp;
reg [3:0] ap_idle4_next;
// 0x00: Kernel status (configuration address: 0x3000_0000) read by middleware

//reg coef_done; // 0x10: Indicate coefficient is initialized
reg coef_crtl_tmp;
reg coef_crtl_next;

// =============== address generator for tap =============== //
// Write enable of the tap RAM. Declared explicitly instead of relying on an
// implicit net; it has no driver yet because the address generator is not
// designed. TODO: drive this when the address generator is added.
wire write_tmp;

//========================== Function ==========================
// =============== clock / reset forwarding =============== //
// The per-kernel clock/reset outputs had no driver; forward the IOP clock and
// reset so the ports never float.
assign clk1  = clk;
assign clk2  = clk;
assign clk3  = clk;
assign clk4  = clk;
assign rstn1 = rstn;
assign rstn2 = rstn;
assign rstn3 = rstn;
assign rstn4 = rstn;

// =============== axi stream =============== //
always @(posedge clk or negedge rstn) begin
    if (!rstn) begin
        ss_rdy_tmp     <= PULL_DN;
        ss_buffer_tmp1 <= PULL_DN;
        ss_buffer_tmp2 <= PULL_DN;
        ss_buffer_tmp3 <= PULL_DN;
        ss_buffer_tmp4 <= PULL_DN;
        pack_cnter_tmp <= PULL_DN;
        sm_vld_tmp     <= PULL_DN;
        sm_buffer_tmp  <= PULL_DN;
    end else begin
        ss_rdy_tmp     <= ss_rdy_next;
        ss_buffer_tmp1 <= ss_buffer_next1;
        ss_buffer_tmp2 <= ss_buffer_next2;
        ss_buffer_tmp3 <= ss_buffer_next3;
        ss_buffer_tmp4 <= ss_buffer_next4;
        pack_cnter_tmp <= pack_cnter_next;
        sm_vld_tmp     <= sm_vld_next;
        sm_buffer_tmp  <= sm_buffer_next;
    end
end

// {metadata phase, word accepted this cycle, slot index}
assign condition = {read_meta_tmp, ss_rdy, pack_cnter_tmp};

always @(*) begin
    // ss_rdy: one-cycle pulse in response to ss_vld.
    // (The `else` was missing, which made the second block unconditional and
    // forced ss_rdy_next to PULL_DN every cycle.)
    if (ss_vld && !ss_rdy) begin
        ss_rdy_next = PULL_UP;
    end else begin
        ss_rdy_next = PULL_DN;
    end

    // pack_cnter: advance once per accepted data word, wrapping 3 -> 0.
    // The wrap must also wait for ss_rdy; otherwise the counter leaves slot 3
    // before the 4th word arrives and that word is never stored.
    if (ss_rdy && !read_meta_tmp) begin
        if (pack_cnter_tmp == INDEX_3) begin
            pack_cnter_next = PULL_DN;
        end else begin
            pack_cnter_next = pack_cnter_tmp + 1;
        end
    end else begin
        pack_cnter_next = pack_cnter_tmp;
    end

    // fill in buffer: by default every slot keeps its value (the previous
    // default cleared all four slots on every cycle without a store, wiping
    // the words already collected).
    ss_buffer_next1 = ss_buffer_tmp1;
    ss_buffer_next2 = ss_buffer_tmp2;
    ss_buffer_next3 = ss_buffer_tmp3;
    ss_buffer_next4 = ss_buffer_tmp4;
    case (condition)
        4'b0100: ss_buffer_next1 = ss_dat;
        4'b0101: ss_buffer_next2 = ss_dat;
        4'b0110: ss_buffer_next3 = ss_dat;
        4'b0111: ss_buffer_next4 = ss_dat;
        default: ; // metadata word or no transfer: hold
    endcase

    // sm_vld: set when any kernel hands over a word, cleared when SM accepts it
    if ((k1_sw_vld && k1_sw_rdy) || (k2_sw_vld && k2_sw_rdy) ||
        (k3_sw_vld && k3_sw_rdy) || (k4_sw_vld && k4_sw_rdy)) begin
        sm_vld_next = PULL_UP;
    end else if (sm_rdy) begin
        sm_vld_next = PULL_DN;
    end else begin
        sm_vld_next = sm_vld_tmp;
    end

    // sm_buffer: capture the word of whichever kernel completed the handshake
    if (k1_sw_vld && k1_sw_rdy) begin
        sm_buffer_next = k1_sw_dat;
    end else if (k2_sw_vld && k2_sw_rdy) begin
        sm_buffer_next = k2_sw_dat;
    end else if (k3_sw_vld && k3_sw_rdy) begin
        sm_buffer_next = k3_sw_dat;
    end else if (k4_sw_vld && k4_sw_rdy) begin
        sm_buffer_next = k4_sw_dat;
    end else begin
        sm_buffer_next = sm_buffer_tmp;
    end
end

assign ss_buffer = {ss_buffer_tmp1, ss_buffer_tmp2, ss_buffer_tmp3, ss_buffer_tmp4};

// assign to port wire
assign ss_rdy = ss_rdy_tmp;
assign sm_vld = sm_vld_tmp;
// NOTE(review): only the low pSS_WIDTH bits of the pDATA_WIDTH-bit buffer are
// sent (this is what the previous implicit truncation did). The "split
// 128-bit data to 32-bit" step mentioned on the port list is not implemented.
assign sm_dat = sm_buffer_tmp[(pSS_WIDTH-1):0];
// sm_lst is "not used for now"; tie it low so the port is never floating.
assign sm_lst = PULL_DN;

// =============== metadata =============== //
always @(posedge clk or negedge rstn) begin
    if (!rstn) begin
        read_meta_tmp   <= PULL_UP;
        decode_meta_tmp <= PULL_DN;
        meta_buffer_tmp <= PULL_DN;
        meta_cnter_tmp  <= PULL_DN;
    end else begin
        read_meta_tmp   <= read_meta_next;
        decode_meta_tmp <= decode_meta_next;
        meta_buffer_tmp <= meta_buffer_next;
        meta_cnter_tmp  <= meta_cnter_next;
    end
end

always @(*) begin
    // read_meta: the word after the MAX_LEN-th data word is metadata again
    if (ss_rdy && !(meta_cnter_tmp == MAX_LEN)) begin
        read_meta_next = PULL_DN;
    end else if (ss_rdy && meta_cnter_tmp == MAX_LEN) begin
        read_meta_next = PULL_UP;
    end else begin
        read_meta_next = read_meta_tmp;
    end

    // decode_meta & meta_buffer & meta_cnter
    if (ss_rdy && read_meta_tmp) begin
        decode_meta_next = PULL_UP;
        meta_buffer_next = ss_dat;
        meta_cnter_next  = PULL_UP; // set to 1
    end else begin
        decode_meta_next = PULL_DN;
        meta_buffer_next = meta_buffer_tmp;
        // Count accepted data words only. Incrementing on every clock made the
        // counter track cycles instead of words, so it could reach MAX_LEN at
        // an arbitrary point in the stream.
        if (ss_rdy) begin
            meta_cnter_next = meta_cnter_tmp + 1;
        end else begin
            meta_cnter_next = meta_cnter_tmp;
        end
    end
end

assign dst_tmp  = meta_buffer_tmp[31:24];
assign mode_tmp = meta_buffer_tmp[23:16];
// I did not extract data length here
// add it if u need

// =============== kernel =============== //
always @(posedge clk or negedge rstn) begin
    if (!rstn) begin
        k1_ld_vld_tmp <= PULL_DN;
        k1_sw_rdy_tmp <= PULL_DN;
        k2_ld_vld_tmp <= PULL_DN;
        k2_sw_rdy_tmp <= PULL_DN;
        k3_ld_vld_tmp <= PULL_DN;
        k3_sw_rdy_tmp <= PULL_DN;
        k4_ld_vld_tmp <= PULL_DN;
        k4_sw_rdy_tmp <= PULL_DN;
    end else begin
        k1_ld_vld_tmp <= k1_ld_vld_next;
        k1_sw_rdy_tmp <= k1_sw_rdy_next;
        k2_ld_vld_tmp <= k2_ld_vld_next;
        k2_sw_rdy_tmp <= k2_sw_rdy_next;
        k3_ld_vld_tmp <= k3_ld_vld_next;
        k3_sw_rdy_tmp <= k3_sw_rdy_next;
        k4_ld_vld_tmp <= k4_ld_vld_next;
        k4_sw_rdy_tmp <= k4_sw_rdy_next;
    end
end

// sw_rdy: accept a word from a kernel only while the SM buffer is free
// (sm_vld low); kernel 1 has the highest priority.
always @(*) begin
    if (k1_sw_vld && !k1_sw_rdy_tmp && !sm_vld) begin
        k1_sw_rdy_next = PULL_UP;
        k2_sw_rdy_next = k2_sw_rdy_tmp;
        k3_sw_rdy_next = k3_sw_rdy_tmp;
        k4_sw_rdy_next = k4_sw_rdy_tmp;
    end else if (k2_sw_vld && !k2_sw_rdy_tmp && !sm_vld) begin
        k2_sw_rdy_next = PULL_UP;
        k1_sw_rdy_next = k1_sw_rdy_tmp;
        k3_sw_rdy_next = k3_sw_rdy_tmp;
        k4_sw_rdy_next = k4_sw_rdy_tmp;
    end else if (k3_sw_vld && !k3_sw_rdy_tmp && !sm_vld) begin
        k3_sw_rdy_next = PULL_UP;
        k1_sw_rdy_next = k1_sw_rdy_tmp;
        k2_sw_rdy_next = k2_sw_rdy_tmp;
        k4_sw_rdy_next = k4_sw_rdy_tmp;
    end else if (k4_sw_vld && !k4_sw_rdy_tmp && !sm_vld) begin
        k4_sw_rdy_next = PULL_UP;
        k1_sw_rdy_next = k1_sw_rdy_tmp;
        k2_sw_rdy_next = k2_sw_rdy_tmp;
        k3_sw_rdy_next = k3_sw_rdy_tmp;
    end else begin
        k1_sw_rdy_next = PULL_DN;
        k2_sw_rdy_next = PULL_DN;
        k3_sw_rdy_next = PULL_DN;
        k4_sw_rdy_next = PULL_DN;
    end
end

// status[15:8] = {read_meta, ss_rdy, pack_cnter[1:0], k1..k4 ld_rdy}
// status[7:0]  = destination decoded from the metadata
assign status = {read_meta_tmp, ss_rdy, pack_cnter_tmp,
                 k1_ld_rdy, k2_ld_rdy, k3_ld_rdy, k4_ld_rdy, dst_tmp};

// for destination
// ld_vld of the selected kernel is raised when the 4th word of a pack is
// accepted and that kernel's ld_rdy is low, and dropped when its ld_rdy is high.
// NOTE(review): the "drop" items also require ss_rdy to be high in that cycle;
// if the input stream stalls, ld_vld stays high - confirm this is intended.
always @(*) begin
    casez (status)
        {8'b01110???, KERNEL_1}: begin
            k1_ld_vld_next = PULL_UP;
            k2_ld_vld_next = k2_ld_vld_tmp;
            k3_ld_vld_next = k3_ld_vld_tmp;
            k4_ld_vld_next = k4_ld_vld_tmp;
        end
        {8'b01??1???, KERNEL_1}: begin
            k1_ld_vld_next = PULL_DN;
            k2_ld_vld_next = k2_ld_vld_tmp;
            k3_ld_vld_next = k3_ld_vld_tmp;
            k4_ld_vld_next = k4_ld_vld_tmp;
        end
        {8'b0111?0??, KERNEL_2}: begin
            k2_ld_vld_next = PULL_UP;
            k1_ld_vld_next = k1_ld_vld_tmp;
            k3_ld_vld_next = k3_ld_vld_tmp;
            k4_ld_vld_next = k4_ld_vld_tmp;
        end
        {8'b01???1??, KERNEL_2}: begin
            k2_ld_vld_next = PULL_DN;
            k1_ld_vld_next = k1_ld_vld_tmp;
            k3_ld_vld_next = k3_ld_vld_tmp;
            k4_ld_vld_next = k4_ld_vld_tmp;
        end
        {8'b0111??0?, KERNEL_3}: begin
            k3_ld_vld_next = PULL_UP;
            k1_ld_vld_next = k1_ld_vld_tmp;
            k2_ld_vld_next = k2_ld_vld_tmp;
            k4_ld_vld_next = k4_ld_vld_tmp;
        end
        {8'b01????1?, KERNEL_3}: begin
            k3_ld_vld_next = PULL_DN;
            k1_ld_vld_next = k1_ld_vld_tmp;
            k2_ld_vld_next = k2_ld_vld_tmp;
            k4_ld_vld_next = k4_ld_vld_tmp;
        end
        {8'b0111???0, KERNEL_4}: begin
            k4_ld_vld_next = PULL_UP;
            k1_ld_vld_next = k1_ld_vld_tmp;
            k2_ld_vld_next = k2_ld_vld_tmp;
            k3_ld_vld_next = k3_ld_vld_tmp;
        end
        {8'b01?????1, KERNEL_4}: begin
            k4_ld_vld_next = PULL_DN;
            k1_ld_vld_next = k1_ld_vld_tmp;
            k2_ld_vld_next = k2_ld_vld_tmp;
            k3_ld_vld_next = k3_ld_vld_tmp;
        end
        default: begin
            k1_ld_vld_next = k1_ld_vld_tmp;
            k2_ld_vld_next = k2_ld_vld_tmp;
            k3_ld_vld_next = k3_ld_vld_tmp;
            k4_ld_vld_next = k4_ld_vld_tmp;
        end
    endcase
end

// for kernel mode
assign decode1 = (decode_meta_tmp && dst_tmp == KERNEL_1) ? PULL_UP : PULL_DN;
assign decode2 = (decode_meta_tmp && dst_tmp == KERNEL_2) ? PULL_UP : PULL_DN;
assign decode3 = (decode_meta_tmp && dst_tmp == KERNEL_3) ? PULL_UP : PULL_DN;
assign decode4 = (decode_meta_tmp && dst_tmp == KERNEL_4) ? PULL_UP : PULL_DN;

// kernel mode is updated when decode is set in kernel.v
assign k1_mode = mode_tmp;
assign k2_mode = mode_tmp;
assign k3_mode = mode_tmp;
assign k4_mode = mode_tmp;

// whether input data is valid is determined by vld & rdy
assign k1_ld_dat = ss_buffer;
assign k2_ld_dat = ss_buffer;
assign k3_ld_dat = ss_buffer;
assign k4_ld_dat = ss_buffer;

assign k1_ld_vld = k1_ld_vld_tmp;
assign k2_ld_vld = k2_ld_vld_tmp;
assign k3_ld_vld = k3_ld_vld_tmp;
assign k4_ld_vld = k4_ld_vld_tmp;

// (k1_sw_rdy used to be assigned four times, leaving k2..k4 undriven)
assign k1_sw_rdy = k1_sw_rdy_tmp;
assign k2_sw_rdy = k2_sw_rdy_tmp;
assign k3_sw_rdy = k3_sw_rdy_tmp;
assign k4_sw_rdy = k4_sw_rdy_tmp;

// =============== ap_ctrl & coef_ctrl =============== //
always @(posedge clk or negedge rstn) begin
    if (!rstn) begin
        coef_crtl_tmp <= PULL_DN;
        ap_idle1_tmp  <= PULL_UP;
        ap_idle2_tmp  <= PULL_UP;
        ap_idle3_tmp  <= PULL_UP;
        ap_idle4_tmp  <= PULL_UP;
        ap_done1_tmp  <= PULL_DN;
        ap_done2_tmp  <= PULL_DN;
        ap_done3_tmp  <= PULL_DN;
        ap_done4_tmp  <= PULL_DN;
    end else begin
        coef_crtl_tmp <= coef_crtl_next;
        ap_idle1_tmp  <= ap_idle1_next;
        ap_idle2_tmp  <= ap_idle2_next;
        ap_idle3_tmp  <= ap_idle3_next;
        ap_idle4_tmp  <= ap_idle4_next;
        ap_done1_tmp  <= ap_done1_next;
        ap_done2_tmp  <= ap_done2_next;
        ap_done3_tmp  <= ap_done3_next;
        ap_done4_tmp  <= ap_done4_next;
    end
end

// assume that coef length is as same as 1024
always @(*) begin
    // coef_crtl: sticky, set when the last word of a COEF transfer is accepted
    if ((meta_cnter_tmp == MAX_LEN) && ss_rdy && (dst_tmp == COEF)) begin
        coef_crtl_next = PULL_UP;
    end else begin
        coef_crtl_next = coef_crtl_tmp;
    end

    // ap_idle1: cleared when metadata for this kernel is decoded, set on its last output
    if (decode_meta_tmp == PULL_UP && dst_tmp == KERNEL_1) begin
        ap_idle1_next = PULL_DN;
    end else if (k1_sw_lst) begin
        ap_idle1_next = PULL_UP;
    end else begin
        ap_idle1_next = ap_idle1_tmp;
    end

    // ap_idle2
    if (decode_meta_tmp == PULL_UP && dst_tmp == KERNEL_2) begin
        ap_idle2_next = PULL_DN;
    end else if (k2_sw_lst) begin
        ap_idle2_next = PULL_UP;
    end else begin
        ap_idle2_next = ap_idle2_tmp;
    end

    // ap_idle3
    if (decode_meta_tmp == PULL_UP && dst_tmp == KERNEL_3) begin
        ap_idle3_next = PULL_DN;
    end else if (k3_sw_lst) begin
        ap_idle3_next = PULL_UP;
    end else begin
        ap_idle3_next = ap_idle3_tmp;
    end

    // ap_idle4
    if (decode_meta_tmp == PULL_UP && dst_tmp == KERNEL_4) begin
        ap_idle4_next = PULL_DN;
    end else if (k4_sw_lst) begin
        ap_idle4_next = PULL_UP;
    end else begin
        ap_idle4_next = ap_idle4_tmp;
    end

    // ap_done1: set on the kernel's last output, cleared when the status is read
    if (k1_sw_lst) begin
        ap_done1_next = PULL_UP;
    end else if (ap_read && ap_done1_tmp) begin
        ap_done1_next = PULL_DN;
    end else begin
        ap_done1_next = ap_done1_tmp;
    end

    // ap_done2
    if (k2_sw_lst) begin
        ap_done2_next = PULL_UP;
    end else if (ap_read && ap_done2_tmp) begin
        ap_done2_next = PULL_DN;
    end else begin
        ap_done2_next = ap_done2_tmp;
    end

    // ap_done3
    if (k3_sw_lst) begin
        ap_done3_next = PULL_UP;
    end else if (ap_read && ap_done3_tmp) begin
        ap_done3_next = PULL_DN;
    end else begin
        ap_done3_next = ap_done3_tmp;
    end

    // ap_done4
    if (k4_sw_lst) begin
        ap_done4_next = PULL_UP;
    end else if (ap_read && ap_done4_tmp) begin
        ap_done4_next = PULL_DN;
    end else begin
        ap_done4_next = ap_done4_tmp;
    end
end

assign coef_crtl = coef_crtl_tmp;
// The port is spelled `ap_crtl`; assigning to the undeclared name `ap_ctrl`
// created an implicit net and left the port undriven.
// Layout (4 bits per field): [3:0] done1, [7:4] idle1, [11:8] done2,
// [15:12] idle2, [19:16] done3, [23:20] idle3, [27:24] done4, [31:28] idle4.
// NOTE(review): the spec table lists ap_idle at bit [1]; with this packing
// kernel 1's idle flag is at bit [4] - confirm which one software expects.
assign ap_crtl = {ap_idle4_next, ap_done4_next, ap_idle3_next, ap_done3_next,
                  ap_idle2_next, ap_done2_next, ap_idle1_next, ap_done1_next};

// =============== address generator for tap =============== //
// Placeholder instance: ports are left open until the address generator exists.
bram32 tap_RAM (
    .CLK (),
    .WE  (write_tmp),
    .EN  (),
    .Di  (),
    .A   (),
    .Do  ()
);

endmodule
```
</details>

## kernel.v
### Axi_lite
#### code
* Only the `ld_rdy` part is complete; the others need to take the form of the address generator for the buffer into account.
### Kernel mode
#### code
* This part is completed; `butterfly_mode` will be put into the port for butterfly in `.mode()`.
### Address generator for buffer
:::warning
This part hasn't been designed yet.
:::
### Code View
<details>
<summary>Verilog Code for kernel.v</summary>

```verilog
// kernel
//   * ld_* : load bus from the IOP (IOP drives ld_vld/ld_dat, kernel drives ld_rdy)
//   * sw_* : store bus back to the IOP (kernel drives sw_vld/sw_dat/sw_lst)
//   * mode/decode : `mode` is latched into the butterfly mode register in the
//     cycle where `decode` is high.
// The store side and the butterfly data path are not designed yet; their
// outputs are tied to inactive values so nothing undriven reaches the IOP.
module kernel #(
    parameter pDATA_WIDTH = 128 // two 64-bit numbers
) (
    input  wire                       clk,
    input  wire                       rstn,

    input  wire                       ld_vld,
    output wire                       ld_rdy,
    input  wire [(pDATA_WIDTH-1):0]   ld_dat,

    output wire                       sw_vld,
    input  wire                       sw_rdy,
    output wire [(pDATA_WIDTH-1):0]   sw_dat,

    input  wire [7:0]                 mode,
    input  wire                       decode,
    output wire                       sw_lst // this is set when handshake
    // (the trailing comma after the last port was a syntax error)
);

//========================== Declaration ==========================
// =============== axi-lite =============== //
reg ld_rdy_tmp;
reg ld_rdy_next;
reg sw_vld_tmp;
reg sw_vld_next;

// local parameter
localparam PULL_DN = 0;
localparam PULL_UP = 1;

// =============== kernel mode =============== //
reg  [7:0] mode_state_tmp;
reg  [7:0] mode_state_next;
wire [7:0] butterfly_mode;

//========================== Function ==========================
// =============== axi-lite =============== //
always @(posedge clk or negedge rstn) begin
    if (!rstn) begin
        ld_rdy_tmp <= PULL_DN;
        sw_vld_tmp <= PULL_DN;
    end else begin
        ld_rdy_tmp <= ld_rdy_next;
        sw_vld_tmp <= sw_vld_next;
    end
end

always @(*) begin
    // ld_rdy: one-cycle pulse in response to ld_vld
    if (ld_vld && !ld_rdy_tmp) begin
        ld_rdy_next = PULL_UP;
    end else begin
        ld_rdy_next = PULL_DN;
    end

    // TODO: the store side depends on the address generator for the buffer,
    // which is not designed yet; until then the kernel never offers data.
    sw_vld_next = PULL_DN;
end

// assign to port wire
assign ld_rdy = ld_rdy_tmp;              // was missing: the port had no driver
assign sw_vld = sw_vld_tmp;              // placeholder, always low for now
assign sw_dat = {pDATA_WIDTH{1'b0}};     // placeholder until the data path exists
assign sw_lst = PULL_DN;                 // placeholder until the data path exists

// =============== kernel mode =============== //
always @(posedge clk or negedge rstn) begin
    if (!rstn) begin
        mode_state_tmp <= PULL_DN;
    end else begin
        mode_state_tmp <= mode_state_next;
    end
end

always @(*) begin
    // latch the mode only in the decode cycle, otherwise hold
    if (decode) begin
        mode_state_next = mode;
    end else begin
        mode_state_next = mode_state_tmp;
    end
end

assign butterfly_mode = mode_state_tmp;

/*================================================================================================
#                                       Address Generation                                       #
================================================================================================*/
// Butterfly processing elements. Only clk/rstn/mode are connected; the data
// and handshake ports stay open until the address generator is designed.
butterfly BPE1 (
    .clk   (clk),
    .rstn  (rstn),
    .mode  (butterfly_mode),
    .i_vld (),
    .i_rdy (),
    .o_vld (),
    .o_rdy (),
    .ai    (),
    .bi    (),
    .gm    (),
    .ao    (),
    .bo    ()
);

butterfly BPE2 (
    .clk   (clk),
    .rstn  (rstn),
    .mode  (butterfly_mode),
    .i_vld (),
    .i_rdy (),
    .o_vld (),
    .o_rdy (),
    .ai    (),
    .bi    (),
    .gm    (),
    .ao    (),
    .bo    ()
);

butterfly BPE3 (
    .clk   (clk),
    .rstn  (rstn),
    .mode  (butterfly_mode),
    .i_vld (),
    .i_rdy (),
    .o_vld (),
    .o_rdy (),
    .ai    (),
    .bi    (),
    .gm    (),
    .ao    (),
    .bo    ()
);

butterfly BPE4 (
    .clk   (clk),
    .rstn  (rstn),
    .mode  (butterfly_mode),
    .i_vld (),
    .i_rdy (),
    .o_vld (),
    .o_rdy (),
    .ai    (),
    .bi    (),
    .gm    (),
    .ao    (),
    .bo    ()
);

butterfly BPE5 (
    .clk   (clk),
    .rstn  (rstn),
    .mode  (butterfly_mode),
    .i_vld (),
    .i_rdy (),
    .o_vld (),
    .o_rdy (),
    .ai    (),
    .bi    (),
    .gm    (),
    .ao    (),
    .bo    ()
);

endmodule
```
</details>