# Assignment3: SoftCPU
# install everything
### install **verilator**
```
Verilator is an open source verilog simulation tool.
you will use it to perform RTL function simulation,
so as to verify the RTL code you write.
```
step1.prepare some packages
```
sudo apt-get install git perl python3 make autoconf g++ flex bison ccache
sudo apt-get install libgoogle-perftools-dev numactl perl-doc
sudo apt-get install libfl-dev
sudo apt-get install zlibc zlib1g zlib1g-dev
```
step2.Copy source code on git
```
git clone https://github.com/verilator/verilator
```
step3
Preparations before compilation
```
unset VERILATOR_ROOT # For bash
cd verilator
git pull # Make sure git repository is up-to-date
git tag # See what versions exist
```
step4.Select the compiled version here
```
git checkout v5.002
```
step5.Configure, compile and install
```
autoconf # Create ./configure script
./configure # Configure and create Makefile
make -j `nproc` # Build Verilator itself (if error, try just 'make')
sudo make install
```
step6.If successful enter
```
verilator --version
```
### install **GTKWAVE**
```
sudo apt-get install tcl8.6-dev
sudo apt-get install tk8.6-dev
sudo apt install gperf
./configure --with-tcl=/usr/lib/tcl8.6 --with-tk=/usr/lib/tk8.6
make -j`nproc`
sudo make install
```
# prepare two question
## leetcode medium
Problem Description:
I pick [122. Best Time to Buy and Sell Stock II](https://leetcode.com/problems/best-time-to-buy-and-sell-stock-ii/)
#### C code
```
#include<stdio.h>
#include<stdlib.h>
int main(){
int arr[]={99,32,3,56,0,2,56,99};
int size=8;
int a= maxProfit(arr,size);
printf("%d\n",a);
}
int maxProfit(int* prices, int pricesSize){
int totalProfit = 0;
int i=0;
int increases = 0;
for (i = 1; i < pricesSize; i++) {
if (prices[i] <= prices[i - 1]) {
totalProfit += increases;
increases = 0;
}
else
increases += prices[i] - prices[i - 1];
}
totalProfit += increases;
return totalProfit;
}
```
result

#### assembly code
```
0000003c <maxProfit>:
3c: 00100793 li a5,1
40: 02b7dc63 bge a5,a1,78 <maxProfit+0x3c>
44: 00259593 slli a1,a1,0x2
48: 00052703 lw a4,0(a0)
4c: 00450793 addi a5,a0,4
50: 00b50633 add a2,a0,a1
54: 00000513 li a0,0
58: 00070693 mv a3,a4
5c: 0007a703 lw a4,0(a5)
60: 00478793 addi a5,a5,4
64: 40d705b3 sub a1,a4,a3
68: 00e6d463 bge a3,a4,70 <maxProfit+0x34>
6c: 00b50533 add a0,a0,a1
70: fec794e3 bne a5,a2,58 <maxProfit+0x1c>
74: 00008067 ret
78: 00000513 li a0,0
7c: 00008067 ret
```
### Waveform analysis
here is srv32's structure

so I just pick up some signals in GTKWAVE

and let's get started!
#### hazard
how srv32 deal with data hazard
#### Forwarding

```
00009504 <strlen>:
9504: 00357793 andi a5,a0,3
9508: 00050713 mv a4,a0
950c: 04079c63 bnez a5,9564 <strlen+0x60>
9510: 7f7f86b7 lui a3,0x7f7f8
9514: f7f68693 addi a3,a3,-129 # 7f7f7f7f <_stack+0x7f7b7f7f>
9518: fff00593 li a1,-1
951c: 00072603 lw a2,0(a4)
9520: 00470713 addi a4,a4,4
9524: 00d677b3 and a5,a2,a3
9528: 00d787b3 add a5,a5,a3
952c: 00c7e7b3 or a5,a5,a2
9530: 00d7e7b3 or a5,a5,a3
9534: feb784e3 beq a5,a1,951c <strlen+0x18>
9538: ffc74683 lbu a3,-4(a4)
953c: 40a707b3 sub a5,a4,a0
9540: 04068463 beqz a3,9588 <strlen+0x84>
9544: ffd74683 lbu a3,-3(a4)
9548: 02068c63 beqz a3,9580 <strlen+0x7c>
954c: ffe74503 lbu a0,-2(a4)
9550: 00a03533 snez a0,a0
9554: 00f50533 add a0,a0,a5
9558: ffe50513 addi a0,a0,-2
955c: 00008067 ret
9560: fa0688e3 beqz a3,9510 <strlen+0xc>
9564: 00074783 lbu a5,0(a4)
9568: 00170713 addi a4,a4,1
956c: 00377693 andi a3,a4,3
9570: fe0798e3 bnez a5,9560 <strlen+0x5c>
9574: 40a70733 sub a4,a4,a0
9578: fff70513 addi a0,a4,-1
957c: 00008067 ret
9580: ffd78513 addi a0,a5,-3
9584: 00008067 ret
9588: ffc78513 addi a0,a5,-4
958c: 00008067 ret
```
### data hazard
and we take a look at this part
```
951c: 00072603 lw a2,0(a4)
9520: 00470713 addi a4,a4,4
9524: 00d677b3 and a5,a2,a3
9528: 00d787b3 add a5,a5,a3
952c: 00c7e7b3 or a5,a5,a2
9530: 00d7e7b3 or a5,a5,a3
9534: feb784e3 beq a5,a1,951c <strlen+0x18>
```
in this part we could see how SRV32 solve data hazar
here are the signals

### load-use hazard
```
00008dd8 <_sbrk_r>:
8dd8: ff010113 addi sp,sp,-16
8ddc: 00812423 sw s0,8(sp)
8de0: 00050413 mv s0,a0
8de4: 00058513 mv a0,a1
8de8: 00019797 auipc a5,0x19
8dec: a407a023 sw zero,-1472(a5) # 21828 <errno>
8df0: 00112623 sw ra,12(sp)
8df4: 520070ef jal ra,10314 <_sbrk>
8df8: fff00793 li a5,-1
8dfc: 00f50a63 beq a0,a5,8e10 <_sbrk_r+0x38>
8e00: 00c12083 lw ra,12(sp)
8e04: 00812403 lw s0,8(sp)
8e08: 01010113 addi sp,sp,16
8e0c: 00008067 ret
8e10: 00019797 auipc a5,0x19
8e14: a187a783 lw a5,-1512(a5) # 21828 <errno>
8e18: fe0784e3 beqz a5,8e00 <_sbrk_r+0x28>
8e1c: 00c12083 lw ra,12(sp)
8e20: 00f42023 sw a5,0(s0)
8e24: 00812403 lw s0,8(sp)
8e28: 01010113 addi sp,sp,16
8e2c: 00008067 ret
```
we take a look at
```
8df8: fff00793 li a5,-1
8dfc: 00f50a63 beq a0,a5,8e10 <_sbrk_r+0x38>
```

### Branch Penalty

**The BSS segment usually refers to a memory area used to store uninitialized or initialized to 0 global variables and static variables in the program. The feature is readable and writable, and the BSS segment will be automatically cleared to 0 before the program is executed.**
```
00000020 <_bss_clear>:
20: 0002a023 sw zero,0(t0)
24: 00428293 addi t0,t0,4
28: fe62ece3 bltu t0,t1,20 <_bss_clear>
2c: 00040117 auipc sp,0x40
30: fd410113 addi sp,sp,-44 # 40000 <_stack>
34: 04c000ef jal ra,80 <main>
38: 2c41006f j 102fc <exit>
```
#### as the picture we could see
ex_flush=1
then it took 2 cycles
#### after that
wb_flush=1
then it took 2 cycle too.
next observ the code and code address
From the following waveform and part of the code, when branch is taken, the two instructions after the branch instruction will be flushed.
85ps:fetch_pc=0x20:sw zero,0(t0)
next_pc=0000002C:auipc sp,0x40
if_pc=0x30:addi sp,sp,-44 # 40000 <_stack>
ex_pc=0x2c:auipc sp,0x40
*wb_pc=0x28:bltu t0,t1,20 <_bss_clear>(kill this instruction)
ex_flush=1
wb_flush=0
we could see in 93ps
instruction would be treated like below
```
sw zero,0(t0)
addi t0,t0,4
bltu t0,t1,20 <_bss_clear>
X auipc sp,0x40
X addi sp,sp,-44 # 40000 <_stack>
```
by the way the signal of
```
branch_taken=1
```
### Software Optimizations
```
#include<stdio.h>
#include<stdlib.h>
int main(){
int arr[]={99,32,3,56,0,2,56,99};
int size=8;
int a= maxProfit(arr,size);
printf("%d\n",a);
}
int maxProfit(int*p,int t)
{int i=1,r=0;
for(;i^t;++i) r+=p[i-1]<p[i]?p[i]-p[i-1]:0;
return r;}
```

I tried to reduce more cycles or used time,but,sadly I'm stll not found how to deal with it,so I try to use less code with my C code.
## choose assignment2
Problem Description:
I pick up[楊淳皓](https://hackmd.io/@Vgwl_uixQFasIvsDbsFlvA/risc-v-hw2)'s code to do my thrid homework
```
#include <stdio.h>
int searchInsert(int *nums, int numsSize, int target) {
int left = 0;
int right = numsSize - 1;
while (left <= right)
{
int mid = (left + right) / 2;
if (target == nums[mid])
return mid;
else if (target < nums[mid])
right = mid - 1;
else
left = mid + 1;
}
return left;
}
int main()
{
int data[] = {1, 3, 5, 6};
int size = 4;
int tar1 = 5, tar2 = 2, tar3 = 7;
int index1 = searchInsert(data, size, tar1);
int index2 = searchInsert(data, size, tar2);
int index3 = searchInsert(data, size, tar3);
printf("The target1 insert position is %d\n", index1);
printf("The target2 insert position is %d\n", index2);
printf("The target3 insert position is %d\n", index3);
return 0;
}
```
result

#### assembly code
```
0000003c <searchInsert>:
3c: fff58593 addi a1,a1,-1
40: 00050693 mv a3,a0
44: 0405c263 bltz a1,88 <searchInsert+0x4c>
48: 00000713 li a4,0
4c: 00b70533 add a0,a4,a1
50: 40155513 srai a0,a0,0x1
54: 00251793 slli a5,a0,0x2
58: 00f687b3 add a5,a3,a5
5c: 0007a783 lw a5,0(a5)
60: 00c78a63 beq a5,a2,74 <searchInsert+0x38>
64: 00f65a63 bge a2,a5,78 <searchInsert+0x3c>
68: fff50593 addi a1,a0,-1
6c: fee5d0e3 bge a1,a4,4c <searchInsert+0x10>
70: 00070513 mv a0,a4
74: 00008067 ret
78: 00150713 addi a4,a0,1
7c: fce5d8e3 bge a1,a4,4c <searchInsert+0x10>
80: 00070513 mv a0,a4
84: ff1ff06f j 74 <searchInsert+0x38>
88: 00000513 li a0,0
8c: 00008067 ret
```
### Forwarding
#### load use hazard
```
00000090 <main>:
90: 000207b7 lui a5,0x20
94: 0b478793 addi a5,a5,180 # 200b4 <environ+0x28>
98: 0007a603 lw a2,0(a5)
9c: 0047a683 lw a3,4(a5)
a0: 0087a703 lw a4,8(a5)
a4: 00c7a783 lw a5,12(a5)
a8: fe010113 addi sp,sp,-32
ac: 00c12023 sw a2,0(sp)
b0: 00d12223 sw a3,4(sp)
b4: 00112e23 sw ra,28(sp)
b8: 00e12423 sw a4,8(sp)
bc: 00f12623 sw a5,12(sp)
c0: 00300693 li a3,3
c4: 00000593 li a1,0
c8: 00500613 li a2,5
cc: 00b687b3 add a5,a3,a1
d0: 4017d793 srai a5,a5,0x1
d4: 00279713 slli a4,a5,0x2
d8: 01070713 addi a4,a4,16
dc: 00270733 add a4,a4,sp
e0: ff072703 lw a4,-16(a4)
e4: 04c70863 beq a4,a2,134 <main+0xa4>
e8: 02e65463 bge a2,a4,110 <main+0x80>
ec: fff78693 addi a3,a5,-1
f0: fcb6dee3 bge a3,a1,cc <main+0x3c>
f4: 00020537 lui a0,0x20
f8: 09050513 addi a0,a0,144 # 20090 <environ+0x4>
fc: 080000ef jal ra,17c <printf>
100: 01c12083 lw ra,28(sp)
104: 00000513 li a0,0
108: 02010113 addi sp,sp,32
10c: 00008067 ret
110: 00178593 addi a1,a5,1
114: feb6c0e3 blt a3,a1,f4 <main+0x64>
118: 00b687b3 add a5,a3,a1
11c: 4017d793 srai a5,a5,0x1
120: 00279713 slli a4,a5,0x2
124: 01070713 addi a4,a4,16
128: 00270733 add a4,a4,sp
12c: ff072703 lw a4,-16(a4)
130: fac71ce3 bne a4,a2,e8 <main+0x58>
134: 00078593 mv a1,a5
138: fbdff06f j f4 <main+0x64>
```
take a look at
we could know
signal
```
reg_rdata1=0003FE30
reg_rdata2=00020090
```
is the sourse for ALU
```
12c: ff072703 lw a4,-16(a4)
130: fac71ce3 bne a4,a2,e8 <main+0x58>
```

#### data hazard
```
00000020 <_bss_clear>:
20: 0002a023 sw zero,0(t0)
24: 00428293 addi t0,t0,4
28: fe62ece3 bltu t0,t1,20 <_bss_clear>
2c: 00040117 auipc sp,0x40
30: fd410113 addi sp,sp,-44 # 40000 <_stack>
34: 05c000ef jal ra,90 <main>
38: 3541006f j 1038c <exit>
```
let's look at this
```
24: 00428293 addi t0,t0,4
28: fe62ece3 bltu t0,t1,20 <_bss_clear>
```

### flush
```
20: 0002a023 sw zero,0(t0)
24: 00428293 addi t0,t0,4
28: fe62ece3 bltu t0,t1,20
```
then we look at this
signals from this

turned to this

### Software Optimizations
I kenw SRV32 is a 3-stage pipeline CPU
so I rewrite the C code
And SRV32 can deal with data hazard with forward
I also try to take advantage of this.
```
int searchInsert(int* nums, int numsSize, int target)
{
for(int i = 0; i<numsSize; i++)
{
if(nums[i] == target)
return i;
else if(nums[i] > target)
return i;
}
return numsSize;
}
int main()
{
int data[] = {1, 3, 5, 6};
int size = 4;
int tar1 = 5, tar2 = 2, tar3 = 7;
int index1 = searchInsert(data, size, tar1);
int index2 = searchInsert(data, size, tar2);
int index3 = searchInsert(data, size, tar3);
printf("The target1 insert position is %d\n", index1);
printf("The target2 insert position is %d\n", index2);
printf("The target3 insert position is %d\n", index3);
return 0;
}
```
result

# how SRV32 work?
:::spoiler {state="close"} `next_pc` determination in `srv32/rtl/riscv.v`
flush:
```
always @* begin
branch_taken = !ex_flush;
next_pc = fetch_pc + `IF_NEXT_PC;
ex_ill_branch = 1'b0;
case(1'b1)
ex_jal : next_pc = ex_pc + ex_imm;
ex_jalr : next_pc = alu_op1 + ex_imm;
ex_branch: begin
case(ex_alu_op)
OP_BEQ : begin
next_pc = (result_subs[32: 0] == 'd0) ?
ex_pc + ex_imm : fetch_pc + `IF_NEXT_PC;
if (result_subs[32: 0] != 'd0) branch_taken = 1'b0;
end
OP_BNE : begin
next_pc = (result_subs[32: 0] != 'd0) ?
ex_pc + ex_imm : fetch_pc + `IF_NEXT_PC;
if (result_subs[32: 0] == 'd0) branch_taken = 1'b0;
end
OP_BLT : begin
next_pc = result_subs[32] ?
ex_pc + ex_imm : fetch_pc + `IF_NEXT_PC;
if (!result_subs[32]) branch_taken = 1'b0;
end
OP_BGE : begin
next_pc = !result_subs[32] ?
ex_pc + ex_imm : fetch_pc + `IF_NEXT_PC;
if (result_subs[32]) branch_taken = 1'b0;
end
OP_BLTU: begin
next_pc = result_subu[32] ?
ex_pc + ex_imm : fetch_pc + `IF_NEXT_PC;
if (!result_subu[32]) branch_taken = 1'b0;
end
OP_BGEU: begin
next_pc = !result_subu[32] ?
ex_pc + ex_imm : fetch_pc + `IF_NEXT_PC;
if (result_subu[32]) branch_taken = 1'b0;
end
default: begin
next_pc = fetch_pc;
ex_ill_branch = 1'b1;
end
endcase
end
default : begin
next_pc = fetch_pc + `IF_NEXT_PC;
branch_taken = 1'b0;
end
endcase
end
```
:::
:::spoiler {state="close"} `load-use hazard` solution in `srv32/rtl/riscv.v`
```
always @(posedge clk or negedge resetb) begin
if (!resetb)
ex_mem2reg <= 1'b0;
else if (inst[`OPCODE] == OP_LOAD)
ex_mem2reg <= 1'b1;
else if (ex_mem2reg && dmem_rvalid)
ex_mem2reg <= 1'b0;
end
always @(posedge clk or negedge resetb) begin
if (!resetb) begin
wb_result <= 32'h0;
wb_alu2reg <= 1'b0;
wb_dst_sel <= 5'h0;
wb_branch <= 1'b0;
wb_branch_nxt <= 1'b0;
wb_mem2reg <= 1'b0;
wb_raddr <= 2'h0;
wb_alu_op <= 3'h0;
end else if (!ex_stall) begin
wb_result <= ex_result;
wb_alu2reg <= ex_alu || ex_lui || ex_auipc || ex_jal || ex_jalr ||
ex_csr ||
`ifdef RV32M_ENABLED
ex_mul ||
`endif
(ex_mem2reg && !ex_ld_align_excp);
wb_dst_sel <= ex_dst_sel;
wb_branch <= branch_taken || ex_trap;
wb_branch_nxt <= wb_branch;
wb_mem2reg <= ex_mem2reg;
wb_raddr <= dmem_raddr[1:0];
wb_alu_op <= ex_alu_op;
end
end
always @* begin
case(wb_alu_op)
OP_LB : begin
case(wb_raddr[1:0])
2'b00: wb_rdata[31: 0] = {{24{dmem_rdata[7]}},
dmem_rdata[ 7: 0]};
2'b01: wb_rdata[31: 0] = {{24{dmem_rdata[15]}},
dmem_rdata[15: 8]};
2'b10: wb_rdata[31: 0] = {{24{dmem_rdata[23]}},
dmem_rdata[23:16]};
2'b11: wb_rdata[31: 0] = {{24{dmem_rdata[31]}},
dmem_rdata[31:24]};
endcase
end
OP_LH : begin
wb_rdata = (wb_raddr[1]) ?
{{16{dmem_rdata[31]}}, dmem_rdata[31:16]} :
{{16{dmem_rdata[15]}}, dmem_rdata[15: 0]};
end
OP_LW : begin
wb_rdata = dmem_rdata;
end
OP_LBU : begin
case(wb_raddr[1:0])
2'b00: wb_rdata[31: 0] = {24'h0, dmem_rdata[7:0]};
2'b01: wb_rdata[31: 0] = {24'h0, dmem_rdata[15:8]};
2'b10: wb_rdata[31: 0] = {24'h0, dmem_rdata[23:16]};
2'b11: wb_rdata[31: 0] = {24'h0, dmem_rdata[31:24]};
endcase
end
OP_LHU : begin
wb_rdata = (wb_raddr[1]) ?
{16'h0, dmem_rdata[31:16]} :
{16'h0, dmem_rdata[15: 0]};
end
default: begin
wb_rdata = 32'h0;
end
endcase
end
assign reg_rdata1[31: 0] = (ex_src1_sel == 5'h0) ? 32'h0 :
(!wb_flush && wb_alu2reg &&
(wb_dst_sel == ex_src1_sel)) ? // register forwarding
(wb_mem2reg ? wb_rdata : wb_result) : // load instruction?
regs[ex_src1_sel];
assign reg_rdata2[31: 0] = (ex_src2_sel == 5'h0) ? 32'h0 :
(!wb_flush && wb_alu2reg &&
(wb_dst_sel == ex_src2_sel)) ? // register forwarding
(wb_mem2reg ? wb_rdata : wb_result) : // load instruction?
regs[ex_src2_sel];
```