# Lab4_1 [1. Whole code package for simulation (caravel-soc_fpga-lab /lab-exmem_fir/)](https://github.com/bol-edu/caravel-soc_fpga-lab/tree/main/lab-exmem_fir) [2. Modified code and waveform summarized](https://github.com/Raywang908/lab4/tree/master/lab4_1) ## Overview ### run_sim #path : lab-exmem_fir/testbench/counter_la_fir/run_sim ``` rm -f counter_la_fir.hex riscv32-unknown-elf-gcc -Wl,--no-warn-rwx-segments -g \ --save-temps \ -Xlinker -Map=output.map \ -I../../firmware \ -march=rv32i -mabi=ilp32 -D__vexriscv__ \ -Wl,-Bstatic,-T,../../firmware/sections.lds,--strip-discarded \ -ffreestanding -nostartfiles -o counter_la_fir.elf ../../firmware/crt0_vex.S ../../firmware/isr.c fir.c counter_la_fir.c # -nostartfiles riscv32-unknown-elf-objcopy -O verilog counter_la_fir.elf counter_la_fir.hex riscv32-unknown-elf-objdump -D counter_la_fir.elf > counter_la_fir.out # to fix flash base address sed -ie 's/@10/@00/g' counter_la_fir.hex iverilog -Ttyp -DFUNCTIONAL -DSIM -DUNIT_DELAY=#1 \ -f./include.rtl.list -o counter_la_fir.vvp counter_la_fir_tb.v vvp counter_la_fir.vvp rm -f counter_la_fir.vvp counter_la_fir.elf counter_la_fir.hexe ``` * The source files are `firmware/crt0_vex.S`, `fir.c`, and `counter_la_fir.c` ### crt0_vex.S #path : lab-exmem_fir/firmware/crt0_vex.S ``` crt_init: la sp, _fstack la a0, trap_entry csrw mtvec, a0 sram_init: la a0, _fsram la a1, _esram la a2, _esram_rom sram_loop: beq a0,a1,sram_done lw a3,0(a2) sw a3,0(a0) add a0,a0,4 add a2,a2,4 j sram_loop sram_done: ... 
bss_init: la a0, _fbss la a1, _ebss bss_loop: beq a0,a1,bss_done sw zero,0(a0) add a0,a0,4 #ifndef SIM j bss_loop #endif bss_done: li a0, 0x880 //880 enable timer + external interrupt sources (until mstatus.MIE is set, they will never trigger an interrupt) csrw mie,a0 call main infinit_loop: j infinit_loop ``` * The important part is `call main`. * `lw a3,0(a2)`: the CPU fetches this instruction from a `flash` address. `PROVIDE(_esram_rom = LOADADDR(.mprjram));` and `la a2, _esram_rom` tell the CPU to load a word from `_esram_rom`, which is the load address of `.mprjram` in `flash` -> the CPU sends the request over dBus -> it fetches the data and stores it in the register. ### sections.lds #path : lab-exmem_fir/firmware/sections.lds ``` MEMORY { vexriscv_debug : ORIGIN = 0xf00f0000, LENGTH = 0x00000100 dff : ORIGIN = 0x00000000, LENGTH = 0x00000400 dff2 : ORIGIN = 0x00000400, LENGTH = 0x00000200 flash : ORIGIN = 0x10000000, LENGTH = 0x01000000 mprj : ORIGIN = 0x30000000, LENGTH = 0x00100000 mprjram : ORIGIN = 0x38000000, LENGTH = 0x00400000 hk : ORIGIN = 0x26000000, LENGTH = 0x00100000 csr : ORIGIN = 0xf0000000, LENGTH = 0x00010000 } SECTIONS { .text : { _ftext = .; /* Make sure crt0 files come first, and they, and the isr */ /* don't get disposed of by greedy optimisation */ *crt0*(.text) KEEP(*crt0*(.text)) KEEP(*(.text.isr)) /* *(.text .stub .text.* .gnu.linkonce.t.*) */ _etext = .; } > flash .rodata : { . = ALIGN(8); _frodata = .; *(.rodata .rodata.* .gnu.linkonce.r.*) *(.rodata1) . = ALIGN(8); _erodata = .; } > flash .data : { . = ALIGN(8); _fdata = .; *(.data .data.* .gnu.linkonce.d.*) *(.data1) _gp = ALIGN(16); *(.sdata .sdata.* .gnu.linkonce.s.*) . = ALIGN(8); _edata = .; } > dff AT > flash .bss : { . = ALIGN(8); _fbss = .; *(.dynsbss) *(.sbss .sbss.* .gnu.linkonce.sb.*) *(.scommon) *(.dynbss) *(.bss .bss.* .gnu.linkonce.b.*) *(COMMON) . = ALIGN(8); _ebss = .; _end = .; } > dff AT > flash .mprjram : { . 
= ALIGN(8); _fsram = .; *libgcc.a:*(.text .text.*) } > mprjram AT > flash } PROVIDE(_fstack = ORIGIN(dff2) + LENGTH(dff2)); PROVIDE(_fdata_rom = LOADADDR(.data)); PROVIDE(_edata_rom = LOADADDR(.data) + SIZEOF(.data)); PROVIDE(_esram = ORIGIN(mprjram) + SIZEOF(.mprjram)); PROVIDE(_esram_rom = LOADADDR(.mprjram)); ``` * `mprjram : ORIGIN = 0x38000000, LENGTH = 0x00400000` and `> mprjram AT > flash` mean that the code assigned to the `mprjram` section is originally stored in `flash` but is copied to `mprjram`, which is in the user area. * The code will be put into the BRAM we designed in `bram.v`. ### fir.c #path : lab-exmem_fir/testbench/counter_la_fir/fir.c ```cpp #include "fir.h" void __attribute__ ( ( section ( ".mprjram" ) ) ) initfir() { //initial your fir for (int i = 0; i < N; i++){ inputbuffer[i] = 0; outputsignal[i] = 0; } } int* __attribute__ ( ( section ( ".mprjram" ) ) ) fir(){ initfir(); //write down your fir for (int i = 0; i < N; i++) { for (int j = N - 1; j > 0; j--) { inputbuffer[j] = inputbuffer[j - 1]; } inputbuffer[0] = inputsignal[i]; int sum = 0; for (int k = 0; k < N; k++) { sum += inputbuffer[k] * taps[k]; } outputsignal[i] = sum; } return outputsignal; } ``` :::info It is very important that this part of the software code is moved to `.mprjram` ::: ### counter_la_fir.c #path : lab-exmem_fir/testbench/counter_la_fir/counter_la_fir.c ```cpp int* tmp = fir(); reg_mprj_datal = *tmp << 16; reg_mprj_datal = *(tmp+1) << 16; reg_mprj_datal = *(tmp+2) << 16; reg_mprj_datal = *(tmp+3) << 16; reg_mprj_datal = *(tmp+4) << 16; reg_mprj_datal = *(tmp+5) << 16; reg_mprj_datal = *(tmp+6) << 16; reg_mprj_datal = *(tmp+7) << 16; reg_mprj_datal = *(tmp+8) << 16; reg_mprj_datal = *(tmp+9) << 16; reg_mprj_datal = *(tmp+10) << 16; ``` * The `main` function is in this .c file. Recall that `crt0_vex.S` executes the boot code and, after the `.bss` initialization is done, it executes `call main`, so execution jumps to this file. 
Moreover, it will call the function `fir()`, whose instruction code is put into `mprjram`. To sum up, the CPU will first fetch instruction code from `flash` for initialization; then it will fetch instruction code from `mprjram` through iBus. * ` << 16` is to align with `checkbits` in `counter_la_fir_tb.v` :::info By doing so, the CPU can fetch and run instructions faster than when taking instructions from `flash`. ::: ## Waveform Process ### CPU ![image](https://hackmd.io/_uploads/BJg_bw4geg.png) ``` 100000b0 <crt_init>: 100000b0: 60000113 li sp,1536 100000b4: 00000517 auipc a0,0x0 100000b8: f6c50513 addi a0,a0,-148 # 10000020 <trap_entry> 100000bc: 30551073 csrw mtvec,a0 100000c0 <sram_init>: 100000c0: 28000517 auipc a0,0x28000 100000c4: f4050513 addi a0,a0,-192 # 38000000 <__mulsi3> 100000c8: 28000597 auipc a1,0x28000 100000cc: 0fc58593 addi a1,a1,252 # 380001c4 <_esram> 100000d0: 00000617 auipc a2,0x0 100000d4: 73860613 addi a2,a2,1848 # 10000808 <_esram_rom> 100000d8 <sram_loop>: 100000d8: 00b50c63 beq a0,a1,100000f0 <data_init> 100000dc: 00062683 lw a3,0(a2) 100000e0: 00d52023 sw a3,0(a0) 100000e4: 00450513 addi a0,a0,4 100000e8: 00460613 addi a2,a2,4 100000ec: fedff06f j 100000d8 <sram_loop> ``` * The first thing the CPU does is fetch instruction code from the beginning of `0x10000000`, and it keeps fetching the instructions listed in `counter_la_fir.out`. When the last 32-byte cache fill is done at address `0x100000E0`, the CPU is told to transfer instructions to `0x38000000`. ![image](https://hackmd.io/_uploads/Sk1XEDExge.png) ![image](https://hackmd.io/_uploads/SJgNrvEgll.png) * The instruction at `100000E0` stores a word to `0x38000000`, so the CPU first pulls up `dBus_cmd_valid` to fetch the instruction word from flash; once `dBus_rsp_valid` is pulled up, the CPU gets the instruction `00050613` and stores it in the pipeline buffer, and at the next `dBus_cmd_valid` the CPU transfers this `00050613` to `0x38000000`. :::warning It is not clear why the second `dBus_cmd_valid` is pulled up at PC `0x100000E4`. 
More specifically, when would `dBus_cmd_valid` be pulled up due to the reading instruction? ::: ![image](https://hackmd.io/_uploads/HJ4kDJ0egl.png) * `0x100000DC` determines when `dBus_cmd_valid` is pulled up. ![image](https://hackmd.io/_uploads/BJYvw10lex.png) * `0x100000E0` determines when `dBus_cmd_valid` is pulled up. :::success We assume that this RISC-V core is a 5-stage pipelined processor, and `dBus_cmd_valid` should be asserted at the MEM stage, so the instruction that asserts this `dBus_cmd_valid` should be two or three cycles before the current IF; it is proven in lab4-2 that **`dBus_cmd_valid` belongs to the instruction of two cycles before**. ::: ![image](https://hackmd.io/_uploads/H1cDDwNxgg.png) ![image](https://hackmd.io/_uploads/HJW5DPEege.png) ``` 10000100 <data_loop>: 10000100: 00b50c63 beq a0,a1,10000118 <bss_init> 10000104: 00062683 lw a3,0(a2) 10000108: 00d52023 sw a3,0(a0) 1000010c: 00450513 addi a0,a0,4 10000110: 00460613 addi a2,a2,4 10000114: fedff06f j 10000100 <data_loop> ``` * In the first waveform we can see that there are two periods when dBus is active: the first one is transferring instruction code to `0x38000000` and the other one is transferring `.data` to `dff = 0x00000000`. * We can see that when dBus is active, the wishbone in the user project is active too, implying that these two are the same bus. ### counter_la_fir.c ![image](https://hackmd.io/_uploads/SyUBowEeee.png) ```cpp reg_mprj_datal = 0xAB400000; ``` ![image](https://hackmd.io/_uploads/HkGVx_Egxx.png) ```cpp int* tmp = fir(); reg_mprj_datal = *tmp << 16; reg_mprj_datal = *(tmp+1) << 16; reg_mprj_datal = *(tmp+2) << 16; reg_mprj_datal = *(tmp+3) << 16; reg_mprj_datal = *(tmp+4) << 16; reg_mprj_datal = *(tmp+5) << 16; reg_mprj_datal = *(tmp+6) << 16; reg_mprj_datal = *(tmp+7) << 16; reg_mprj_datal = *(tmp+8) << 16; reg_mprj_datal = *(tmp+9) << 16; reg_mprj_datal = *(tmp+10) << 16; //print("\n"); //print("Monitor: Test 1 Passed\n\n"); // Makes simulation very long! 
reg_mprj_datal = 0xAB510000; ``` ### fir.c ![image](https://hackmd.io/_uploads/SkoYkd4gxe.png) * When `AB40` -> `flash_csb = 1`, the CPU starts fetching instructions from `mprjram` to do the convolution. ## Code ### fir.c ```cpp #include "fir.h" void __attribute__ ( ( section ( ".mprjram" ) ) ) initfir() { //initial your fir for (int i = 0; i < N; i++){ inputbuffer[i] = 0; outputsignal[i] = 0; } } int* __attribute__ ( ( section ( ".mprjram" ) ) ) fir(){ initfir(); //write down your fir for (int i = 0; i < N; i++) { for (int j = N - 1; j > 0; j--) { inputbuffer[j] = inputbuffer[j - 1]; } inputbuffer[0] = inputsignal[i]; int sum = 0; for (int k = 0; k < N; k++) { sum += inputbuffer[k] * taps[k]; } outputsignal[i] = sum; } return outputsignal; } ``` ### user_proj_example.counter.v ```verilog module user_proj_example #( parameter BITS = 32, parameter DELAYS=10 )( `ifdef USE_POWER_PINS inout vccd1, // User area 1 1.8V supply inout vssd1, // User area 1 digital ground `endif // Wishbone Slave ports (WB MI A) input wb_clk_i, input wb_rst_i, input wbs_stb_i, input wbs_cyc_i, input wbs_we_i, input [3:0] wbs_sel_i, input [31:0] wbs_dat_i, input [31:0] wbs_adr_i, output wbs_ack_o, output [31:0] wbs_dat_o, // Logic Analyzer Signals input [127:0] la_data_in, output [127:0] la_data_out, input [127:0] la_oenb, // IOs input [`MPRJ_IO_PADS-1:0] io_in, output [`MPRJ_IO_PADS-1:0] io_out, output [`MPRJ_IO_PADS-1:0] io_oeb, // IRQ output [2:0] irq ); wire clk; wire rst; assign clk = wb_clk_i; assign rst = wb_rst_i; wire [`MPRJ_IO_PADS-1:0] io_in; wire [`MPRJ_IO_PADS-1:0] io_out; wire [`MPRJ_IO_PADS-1:0] io_oeb; reg [3:0] counter; reg [3:0] counter_next; reg [31:0] wbs_dat_buffer; wire [31:0] wbs_dat_buffer_next; wire wbs_ack_o_next; reg wbs_ack_o_tmp; wire [3:0] bram_WE0; wire bram_EN0; wire [31:0] bram_Di0; wire [31:0] bram_Do0; wire [31:0] bram_A0; assign bram_WE0 = {(4){wbs_we_i}}; assign bram_EN0 = (wbs_cyc_i && wbs_stb_i && wbs_we_i && wbs_adr_i[29:27] == 3'b111) ? 
1 : (counter == 8 && wbs_adr_i[29:27] == 3'b111) ? 1 : 0; assign bram_Di0 = wbs_dat_i; assign bram_A0 = {{5'b00000}, wbs_adr_i[26:0]}; assign wbs_dat_o = (counter == 10) ? wbs_dat_buffer : 0; assign wbs_dat_buffer_next = (counter == 9) ? bram_Do0 : wbs_dat_buffer; assign io_out = 0; assign io_oeb = 1; always @(posedge clk or posedge rst) begin if (rst) begin counter <= 0; wbs_dat_buffer <= 0; wbs_ack_o_tmp <= 0; end else begin counter <= counter_next; wbs_dat_buffer <= wbs_dat_buffer_next; wbs_ack_o_tmp <= wbs_ack_o_next; end end always @(*) begin if (wbs_cyc_i && wbs_adr_i[29:27] == 3'b111 && !(|wbs_we_i) && !(counter == 10)) begin //meaning that wbs_adr_i is in 0x38000000 counter_next = counter + 1; end else if (counter == 10 || !wbs_cyc_i)begin counter_next = 0; end else begin counter_next = counter; end end assign wbs_ack_o = (wbs_cyc_i && wbs_stb_i && wbs_we_i && wbs_adr_i[29:27] == 3'b111) ? 1 : (counter == 10) ? 1 : 0; bram user_bram ( .CLK(clk), .WE0(bram_WE0), .EN0(bram_EN0), .Di0(bram_Di0), .Do0(bram_Do0), .A0(bram_A0) ); endmodule ``` * In this code I designed a buffer to catch the data output from the BRAM; when `wbs_ack_o` is pulled up -> `wbs_dat_o`, which is connected to the buffer, is a valid data output. * `bram_EN0` is pulled up at `counter == 8` -> `bram_Do0` is valid at `counter == 9` -> `wbs_dat_buffer` is valid at `counter == 10`