# Lab4_1
1. [Whole code package for simulation (caravel-soc_fpga-lab/lab-exmem_fir/)](https://github.com/bol-edu/caravel-soc_fpga-lab/tree/main/lab-exmem_fir)
2. [Modified code and waveform summary](https://github.com/Raywang908/lab4/tree/master/lab4_1)
## Overview
### run_sim
#path : lab-exmem_fir/testbench/counter_la_fir/run_sim
```
# Remove any stale firmware image left by a previous run.
rm -f counter_la_fir.hex
# Compile and link the firmware (crt0_vex.S, isr.c, fir.c, counter_la_fir.c)
# for rv32i/ilp32 using the linker script ../../firmware/sections.lds.
# --save-temps keeps the intermediate files; -Map=output.map writes the link map.
riscv32-unknown-elf-gcc -Wl,--no-warn-rwx-segments -g \
--save-temps \
-Xlinker -Map=output.map \
-I../../firmware \
-march=rv32i -mabi=ilp32 -D__vexriscv__ \
-Wl,-Bstatic,-T,../../firmware/sections.lds,--strip-discarded \
-ffreestanding -nostartfiles -o counter_la_fir.elf ../../firmware/crt0_vex.S ../../firmware/isr.c fir.c counter_la_fir.c
# -nostartfiles
# Convert the ELF into a Verilog-format hex image, and dump the full
# disassembly to counter_la_fir.out.
riscv32-unknown-elf-objcopy -O verilog counter_la_fir.elf counter_la_fir.hex
riscv32-unknown-elf-objdump -D counter_la_fir.elf > counter_la_fir.out
# to fix flash base address
# Every "@10" in the hex file is rewritten to "@00". Because the option is
# written as "-ie", GNU sed treats "e" as the backup suffix and leaves a copy of
# the unmodified file named counter_la_fir.hexe.
sed -ie 's/@10/@00/g' counter_la_fir.hex
# Compile the RTL listed in include.rtl.list together with the testbench, then
# run the simulation.
iverilog -Ttyp -DFUNCTIONAL -DSIM -DUNIT_DELAY=#1 \
-f./include.rtl.list -o counter_la_fir.vvp counter_la_fir_tb.v
vvp counter_la_fir.vvp
# Clean up intermediates. counter_la_fir.hexe is the sed backup file created
# above, not a misspelling of counter_la_fir.hex.
rm -f counter_la_fir.vvp counter_la_fir.elf counter_la_fir.hexe
```
* The source files are `firmware/crt0_vex.S`, `firmware/isr.c`, `fir.c`, and `counter_la_fir.c`.
### crt0_vex.S
#path : lab-exmem_fir/firmware/crt0_vex.S
```
// Startup code: set the stack pointer to _fstack and install trap_entry as
// the trap vector (mtvec).
crt_init:
la sp, _fstack
la a0, trap_entry
csrw mtvec, a0
// Copy loop setup: a0 = destination start (_fsram), a1 = destination end
// (_esram), a2 = source address (_esram_rom).
sram_init:
la a0, _fsram
la a1, _esram
la a2, _esram_rom
// Copy one 32-bit word per iteration from [a2] to [a0] until a0 reaches a1.
sram_loop:
beq a0,a1,sram_done
lw a3,0(a2)
sw a3,0(a0)
add a0,a0,4
add a2,a2,4
j sram_loop
sram_done:
...
// Zero the region from _fbss up to _ebss, one word at a time.
bss_init:
la a0, _fbss
la a1, _ebss
bss_loop:
beq a0,a1,bss_done
sw zero,0(a0)
add a0,a0,4
// The backward jump is only assembled when SIM is not defined; with SIM
// defined, at most one word is cleared before falling through to bss_done.
#ifndef SIM
j bss_loop
#endif
bss_done:
li a0, 0x880 //880 enable timer + external interrupt sources (until mstatus.MIE is set, they will never trigger an interrupt)
csrw mie,a0
// Hand control to the C entry point; if main ever returns, spin forever.
call main
infinit_loop:
j infinit_loop
```
* important part `call main`
* `lw a3,0(a2)`: the CPU fetches this instruction from a `flash` address. Because of `PROVIDE(_esram_rom = LOADADDR(.mprjram));`, `la a2, _esram_rom` puts into `a2` the address in `flash` where `.mprjram` is stored, so the `lw` tells the CPU to load a word from that address -> the CPU sends a request on the dBus -> it fetches the data and stores it in the register `a3`.
### sections.lds
#path : lab-exmem_fir/firmware/sections.lds
```
/* Memory regions the firmware can be linked into (name : base address, size). */
MEMORY {
vexriscv_debug : ORIGIN = 0xf00f0000, LENGTH = 0x00000100
dff : ORIGIN = 0x00000000, LENGTH = 0x00000400
dff2 : ORIGIN = 0x00000400, LENGTH = 0x00000200
flash : ORIGIN = 0x10000000, LENGTH = 0x01000000
mprj : ORIGIN = 0x30000000, LENGTH = 0x00100000
mprjram : ORIGIN = 0x38000000, LENGTH = 0x00400000
hk : ORIGIN = 0x26000000, LENGTH = 0x00100000
csr : ORIGIN = 0xf0000000, LENGTH = 0x00010000
}
SECTIONS
{
/* Startup and ISR code, placed in and run from flash. */
.text :
{
_ftext = .;
/* Make sure crt0 files come first, and they, and the isr */
/* don't get disposed of by greedy optimisation */
*crt0*(.text)
KEEP(*crt0*(.text))
KEEP(*(.text.isr))
/* *(.text .stub .text.* .gnu.linkonce.t.*) */
_etext = .;
} > flash
/* Read-only data, kept in flash. */
.rodata :
{
. = ALIGN(8);
_frodata = .;
*(.rodata .rodata.* .gnu.linkonce.r.*)
*(.rodata1)
. = ALIGN(8);
_erodata = .;
} > flash
/* Initialised data: run-time address in dff, load image stored in flash. */
.data :
{
. = ALIGN(8);
_fdata = .;
*(.data .data.* .gnu.linkonce.d.*)
*(.data1)
_gp = ALIGN(16);
*(.sdata .sdata.* .gnu.linkonce.s.*)
. = ALIGN(8);
_edata = .;
} > dff AT > flash
/* Zero-initialised data in dff; _fbss/_ebss bound the region cleared at startup. */
.bss :
{
. = ALIGN(8);
_fbss = .;
*(.dynsbss)
*(.sbss .sbss.* .gnu.linkonce.sb.*)
*(.scommon)
*(.dynbss)
*(.bss .bss.* .gnu.linkonce.b.*)
*(COMMON)
. = ALIGN(8);
_ebss = .;
_end = .;
} > dff AT > flash
/* Code that runs from mprjram (0x38000000) but is stored in flash. */
/* Only libgcc text is matched explicitly here. */
/* NOTE(review): functions tagged section(".mprjram") presumably join this */
/* output section as same-named input sections - confirm in output.map. */
.mprjram :
{
. = ALIGN(8);
_fsram = .;
*libgcc.a:*(.text .text.*)
} > mprjram AT > flash
}
/* Initial stack pointer: the end of the dff2 region. */
PROVIDE(_fstack = ORIGIN(dff2) + LENGTH(dff2));
/* Start and end of the .data load image in flash. */
PROVIDE(_fdata_rom = LOADADDR(.data));
PROVIDE(_edata_rom = LOADADDR(.data) + SIZEOF(.data));
/* End of .mprjram at its run-time address, computed from the region origin. */
PROVIDE(_esram = ORIGIN(mprjram) + SIZEOF(.mprjram));
/* Load address of .mprjram in flash, i.e. the START of the image to copy. */
PROVIDE(_esram_rom = LOADADDR(.mprjram));
```
* `mprjram : ORIGIN = 0x38000000, LENGTH = 0x00400000` and `> mprjram AT > flash` mean that the code assigned to the `.mprjram` section is originally stored in `flash` but is copied to `mprjram`, which is the user area.
* the code will be put into the BRAM we designed in `bram.v`
### fir.c
#path : lab-exmem_fir/testbench/counter_la_fir/fir.c
```cpp
#include "fir.h"
// Clears the first N entries of inputbuffer and outputsignal.
// The section attribute places this function in the ".mprjram" section.
// NOTE(review): N and the arrays are presumably declared in fir.h - confirm.
void __attribute__ ( ( section ( ".mprjram" ) ) ) initfir() {
//initial your fir
for (int i = 0; i < N; i++){
inputbuffer[i] = 0;
outputsignal[i] = 0;
}
}
// Runs the FIR filter over inputsignal[0..N-1] and returns a pointer to
// outputsignal, which holds the N results.
// The section attribute places this function in the ".mprjram" section.
int* __attribute__ ( ( section ( ".mprjram" ) ) ) fir(){
initfir();
//write down your fir
for (int i = 0; i < N; i++) {
// Shift the delay line up by one entry, starting from the top so that no
// value is overwritten before it has been moved.
for (int j = N - 1; j > 0; j--) {
inputbuffer[j] = inputbuffer[j - 1];
}
// The newest sample enters at index 0.
inputbuffer[0] = inputsignal[i];
// Multiply-accumulate the delay line against the taps.
int sum = 0;
for (int k = 0; k < N; k++) {
sum += inputbuffer[k] * taps[k];
}
outputsignal[i] = sum;
}
return outputsignal;
}
```
:::info
It is very important that this part of the software code is moved to `.mprjram`
:::
### counter_la_fir.c
#path : lab-exmem_fir/testbench/counter_la_fir/counter_la_fir.c
```cpp
int* tmp = fir();
reg_mprj_datal = *tmp << 16;
reg_mprj_datal = *(tmp+1) << 16;
reg_mprj_datal = *(tmp+2) << 16;
reg_mprj_datal = *(tmp+3) << 16;
reg_mprj_datal = *(tmp+4) << 16;
reg_mprj_datal = *(tmp+5) << 16;
reg_mprj_datal = *(tmp+6) << 16;
reg_mprj_datal = *(tmp+7) << 16;
reg_mprj_datal = *(tmp+8) << 16;
reg_mprj_datal = *(tmp+9) << 16;
reg_mprj_datal = *(tmp+10) << 16;
```
* The `main` function is in this .c file. Recall that `crt0_vex.S` executes the boot code and, after the `.bss` initialization is done, executes `call main`, so execution jumps into this file. `main` then calls the function `fir()`, whose instruction code has been placed in `mprjram`. To sum up, the CPU first fetches instruction code from `flash` for initialization, and then it fetches instruction code from `mprjram` over the iBus.
* ` << 16` is to align with `checkbits` in `counter_la_fir_tb.v`
:::info
By doing so, the CPU can fetch and run instructions faster than when taking instructions from `flash`.
:::
## Waveform Process
### CPU

```
100000b0 <crt_init>:
100000b0: 60000113 li sp,1536
100000b4: 00000517 auipc a0,0x0
100000b8: f6c50513 addi a0,a0,-148 # 10000020 <trap_entry>
100000bc: 30551073 csrw mtvec,a0
100000c0 <sram_init>:
100000c0: 28000517 auipc a0,0x28000
100000c4: f4050513 addi a0,a0,-192 # 38000000 <__mulsi3>
100000c8: 28000597 auipc a1,0x28000
100000cc: 0fc58593 addi a1,a1,252 # 380001c4 <_esram>
100000d0: 00000617 auipc a2,0x0
100000d4: 73860613 addi a2,a2,1848 # 10000808 <_esram_rom>
100000d8 <sram_loop>:
100000d8: 00b50c63 beq a0,a1,100000f0 <data_init>
100000dc: 00062683 lw a3,0(a2)
100000e0: 00d52023 sw a3,0(a0)
100000e4: 00450513 addi a0,a0,4
100000e8: 00460613 addi a2,a2,4
100000ec: fedff06f j 100000d8 <sram_loop>
```
* The first thing the CPU does is fetch instruction code starting from the beginning of flash at `0x10000000`, and it keeps fetching the instructions listed in `counter_la_fir.out`. When the last 32-byte cache line has been filled, at address `0x100000E0`, the CPU is told to transfer instructions to `0x38000000`.


* The instruction at `0x100000E0` stores a word to `0x38000000`. The CPU first pulls up `dBus_cmd_valid` to fetch the instruction word from flash; once `dBus_rsp_valid` is pulled up, the CPU has the instruction word `00050613` and keeps it in a pipeline buffer. At the next `dBus_cmd_valid`, the CPU transfers this `00050613` to `0x38000000`.
:::warning
It is not clear why the second `dBus_cmd_valid` is pulled up at PC `0x100000E4`. More specifically, when is `dBus_cmd_valid` pulled up for a read (load) instruction?
:::

* The instruction at `0x100000DC` determines when this `dBus_cmd_valid` is pulled up.

* The instruction at `0x100000E0` determines when this `dBus_cmd_valid` is pulled up.
:::success
We assume that this RISC-V core is a 5-stage pipelined processor and that `dBus_cmd_valid` is asserted in the MEM stage, so the instruction that asserts `dBus_cmd_valid` should have been fetched two or three cycles before the instruction currently in IF. Lab4-2 confirms that **`dBus_cmd_valid` belongs to the instruction fetched two cycles earlier**.
:::


```
10000100 <data_loop>:
10000100: 00b50c63 beq a0,a1,10000118 <bss_init>
10000104: 00062683 lw a3,0(a2)
10000108: 00d52023 sw a3,0(a0)
1000010c: 00450513 addi a0,a0,4
10000110: 00460613 addi a2,a2,4
10000114: fedff06f j 10000100 <data_loop>
```
* In the first waveform we can see two intervals in which the dBus is active: the first one transfers instruction code to `0x38000000`, and the second one transfers `.data` to `dff = 0x00000000`.
* We can also see that whenever the dBus is active, the Wishbone bus in the user project is active too, implying that these two are the same bus.
### counter_la_fir.c

```cpp
reg_mprj_datal = 0xAB400000;
```

```cpp
int* tmp = fir();
reg_mprj_datal = *tmp << 16;
reg_mprj_datal = *(tmp+1) << 16;
reg_mprj_datal = *(tmp+2) << 16;
reg_mprj_datal = *(tmp+3) << 16;
reg_mprj_datal = *(tmp+4) << 16;
reg_mprj_datal = *(tmp+5) << 16;
reg_mprj_datal = *(tmp+6) << 16;
reg_mprj_datal = *(tmp+7) << 16;
reg_mprj_datal = *(tmp+8) << 16;
reg_mprj_datal = *(tmp+9) << 16;
reg_mprj_datal = *(tmp+10) << 16;
//print("\n");
//print("Monitor: Test 1 Passed\n\n"); // Makes simulation very long!
reg_mprj_datal = 0xAB510000;
```
### fir.c

* After `AB40` appears -> `flash_csb = 1`, and the CPU starts fetching instructions from `mprjram` to do the convolution.
## Code
### fir.c
```cpp
#include "fir.h"
// Clears the first N entries of inputbuffer and outputsignal.
// The section attribute places this function in the ".mprjram" section.
// NOTE(review): N and the arrays are presumably declared in fir.h - confirm.
void __attribute__ ( ( section ( ".mprjram" ) ) ) initfir() {
//initial your fir
for (int i = 0; i < N; i++){
inputbuffer[i] = 0;
outputsignal[i] = 0;
}
}
// Runs the FIR filter over inputsignal[0..N-1] and returns a pointer to
// outputsignal, which holds the N results.
// The section attribute places this function in the ".mprjram" section.
int* __attribute__ ( ( section ( ".mprjram" ) ) ) fir(){
initfir();
//write down your fir
for (int i = 0; i < N; i++) {
// Shift the delay line up by one entry, starting from the top so that no
// value is overwritten before it has been moved.
for (int j = N - 1; j > 0; j--) {
inputbuffer[j] = inputbuffer[j - 1];
}
// The newest sample enters at index 0.
inputbuffer[0] = inputsignal[i];
// Multiply-accumulate the delay line against the taps.
int sum = 0;
for (int k = 0; k < N; k++) {
sum += inputbuffer[k] * taps[k];
}
outputsignal[i] = sum;
}
return outputsignal;
}
```
### user_proj_example.counter.v
```verilog
// user_proj_example
// Wishbone slave that maps a BRAM into the address window selected by
// wbs_adr_i[29:27] == 3'b111 (e.g. 0x38000000).
//   * Writes are acknowledged combinationally in the cycle they are presented;
//     only the byte lanes selected by wbs_sel_i are written.
//   * Reads are acknowledged after a fixed delay counted by `counter`:
//     BRAM enabled at counter == 8, BRAM output captured at counter == 9,
//     data and ack presented at counter == 10.
// The BITS and DELAYS parameters are kept for interface compatibility; this
// implementation does not use them.
module user_proj_example #(
    parameter BITS = 32,
    parameter DELAYS=10
)(
`ifdef USE_POWER_PINS
    inout vccd1, // User area 1 1.8V supply
    inout vssd1, // User area 1 digital ground
`endif
    // Wishbone Slave ports (WB MI A)
    input wb_clk_i,
    input wb_rst_i,
    input wbs_stb_i,
    input wbs_cyc_i,
    input wbs_we_i,
    input [3:0] wbs_sel_i,
    input [31:0] wbs_dat_i,
    input [31:0] wbs_adr_i,
    output wbs_ack_o,
    output [31:0] wbs_dat_o,
    // Logic Analyzer Signals
    input [127:0] la_data_in,
    output [127:0] la_data_out,
    input [127:0] la_oenb,
    // IOs
    input [`MPRJ_IO_PADS-1:0] io_in,
    output [`MPRJ_IO_PADS-1:0] io_out,
    output [`MPRJ_IO_PADS-1:0] io_oeb,
    // IRQ
    output [2:0] irq
);
    // Read-latency schedule, expressed as values of `counter`.
    localparam [3:0] RD_ENABLE  = 4'd8;   // assert the BRAM enable
    localparam [3:0] RD_CAPTURE = 4'd9;   // latch the BRAM output into the buffer
    localparam [3:0] RD_ACK     = 4'd10;  // drive wbs_dat_o and wbs_ack_o

    wire clk;
    wire rst;
    assign clk = wb_clk_i;
    assign rst = wb_rst_i;

    wire [`MPRJ_IO_PADS-1:0] io_in;
    wire [`MPRJ_IO_PADS-1:0] io_out;
    wire [`MPRJ_IO_PADS-1:0] io_oeb;

    reg  [3:0]  counter;
    reg  [3:0]  counter_next;
    reg  [31:0] wbs_dat_buffer;
    wire [31:0] wbs_dat_buffer_next;

    wire [3:0]  bram_WE0;
    wire        bram_EN0;
    wire [31:0] bram_Di0;
    wire [31:0] bram_Do0;
    wire [31:0] bram_A0;

    // High when the Wishbone address falls in the BRAM window.
    wire addr_hit;
    // High for a write strobe aimed at the BRAM window.
    wire write_hit;
    assign addr_hit  = (wbs_adr_i[29:27] == 3'b111);
    assign write_hit = wbs_cyc_i && wbs_stb_i && wbs_we_i && addr_hit;

    // Honour the Wishbone byte selects so that byte/half-word stores only
    // modify the selected lanes (previously all four lanes were always written).
    assign bram_WE0 = wbs_sel_i & {(4){wbs_we_i}};
    // Enable the BRAM for every write, and once per read at RD_ENABLE.
    assign bram_EN0 = write_hit || (counter == RD_ENABLE && addr_hit);
    assign bram_Di0 = wbs_dat_i;
    // Strip the window-select bits; the BRAM sees a zero-based address.
    assign bram_A0  = {{5'b00000}, wbs_adr_i[26:0]};

    // Read data is only driven in the acknowledge cycle.
    assign wbs_dat_o           = (counter == RD_ACK) ? wbs_dat_buffer : 0;
    assign wbs_dat_buffer_next = (counter == RD_CAPTURE) ? bram_Do0 : wbs_dat_buffer;

    // Writes are acknowledged immediately, reads when the counter reaches RD_ACK.
    assign wbs_ack_o = write_hit || (counter == RD_ACK);

    assign io_out = 0;
    // NOTE(review): the literal 1 sets only bit 0 and leaves the other bits 0 -
    // confirm this is the intended pad direction setting.
    assign io_oeb = 1;

    // These outputs are not used by this design; tie them off so they are
    // never left floating.
    assign la_data_out = 128'b0;
    assign irq         = 3'b000;

    always @(posedge clk or posedge rst) begin
        if (rst) begin
            counter        <= 0;
            wbs_dat_buffer <= 0;
        end else begin
            counter        <= counter_next;
            wbs_dat_buffer <= wbs_dat_buffer_next;
        end
    end

    // Read-latency counter: counts while a read of the BRAM window is pending,
    // and returns to 0 after the acknowledge cycle or when the bus cycle ends.
    always @(*) begin
        if (wbs_cyc_i && addr_hit && !wbs_we_i && !(counter == RD_ACK)) begin //meaning that wbs_adr_i is in 0x38000000
            counter_next = counter + 1;
        end else if (counter == RD_ACK || !wbs_cyc_i) begin
            counter_next = 0;
        end else begin
            counter_next = counter;
        end
    end

    bram user_bram (
        .CLK(clk),
        .WE0(bram_WE0),
        .EN0(bram_EN0),
        .Di0(bram_Di0),
        .Do0(bram_Do0),
        .A0(bram_A0)
    );
endmodule
```
* In this code I designed a buffer that captures the data output from the BRAM; when `wbs_ack_o` is pulled up, `wbs_dat_o`, which is connected to the buffer, carries valid output data.
* `bram_EN0` is pulled up at `counter == 8` -> `bram_Do0` is valid at `counter == 9` -> `wbs_dat_buffer` is valid at `counter == 10`