owned this note
owned this note
Published
Linked with GitHub
# Lab3: SoftCPU
###### tags: `RISC-V` `computer architure 2021`
## Environment build(Ubuntu)
According to the requirements of Lab3, I downloaded [srv32](https://github.com/sysprog21/srv32), which is a RISC-V 3-stage pipeline processor. The following are the steps to build the environment.
1. Install RISC-V toolchains from source.
```shell=
$ sudo apt install autoconf automake autotools-dev curl gawk git \
build-essential bison flex texinfo gperf libtool patchutils bc git \
libmpc-dev libmpfr-dev libgmp-dev gawk zlib1g-dev libexpat1-dev
$ git clone --recursive https://github.com/riscv/riscv-gnu-toolchain
$ cd riscv-gnu-toolchain
$ mkdir -p build && cd build
$ ../configure --prefix=/opt/riscv --enable-multilib
$ make -j$(nproc)
```
2. Install the dependent packages.
```shell=
$ sudo apt install build-essential ccache
```
3. Download [srv32](https://github.com/sysprog21/srv32).
```shell=
$ git clone https://github.com/sysprog21/srv32.git
make
```
4. Download GNU Toolchain for RISC-V.
```shell=
$ cd /tmp
$ wget https://github.com/xpack-dev-tools/riscv-none-embed-gcc-xpack/releases/download/v10.2.0-1.1/xpack-riscv-none-embed-gcc-10.2.0-1.1-linux-x64.tar.gz
$ tar zxvf xpack-riscv-none-embed-gcc-10.2.0-1.1-linux-x64.tar.gz
$ cp -af xpack-riscv-none-embed-gcc-10.2.0-1.1 $HOME/riscv-none-embed-gcc
$ cd $HOME/riscv-none-embed-gcc
$ echo "export PATH=`pwd`/bin:$PATH" > setenv
$ cd $HOME
$ source riscv-none-embed-gcc/setenv
$ riscv-none-embed-gcc -v
gcc version 10.1.0 (xPack GNU RISC-V Embedded GCC, 64-bit)
```
5. Install GTKWave.
```shell=
$ sudo apt install gtkwave
```
## Requirement 1
Because I rewrote the C++ code in assembly language in [Lab1](https://hackmd.io/bt0v-sK8TfaEPWzxQn4pBA), there is no C version of the Pascal's triangle problem. Here, I rewrite it in C, based on the assembly code I wrote at that time.
### C code
```c=
#include<stdio.h>
/*
 * Exchange the contents of two 34-element arrays.
 * Elements are swapped pairwise, from index 0 up to index 33.
 */
void swap(int res[34], int temp[34]) {
    int idx = 0;
    while (idx < 34) {
        int held = res[idx];   /* keep res[idx] while it is overwritten */
        res[idx] = temp[idx];
        temp[idx] = held;
        idx++;
    }
}
/*
 * Build row `rowIndex` of Pascal's triangle and print it as "[a,b,...]".
 * Rows are built one at a time in `temp` from the previous row in `result`,
 * then the two buffers are exchanged with swap().
 */
int main() {
    int result[34] = {0};   /* the most recently completed row */
    int temp[34] = {0};     /* scratch buffer for the row being built */
    int rowIndex = 5;

    /* Only rows 0..33 fit into the 34-element buffers. */
    if (rowIndex < 0 || rowIndex > 33) {
        printf("Input is out of range!\n");
        return 0;
    }

    switch (rowIndex) {
    case 0:
        result[0] = 1;
        break;
    case 1:
        result[0] = 1;
        result[1] = 1;
        break;
    default:
        /* Start from row 1 and derive rows 2..rowIndex. */
        result[0] = 1;
        result[1] = 1;
        for (int row = 2; row <= rowIndex; row++) {
            int col = 1;
            temp[0] = 1;
            while (col < row) {
                /* each inner entry is the sum of the two entries above it */
                temp[col] = result[col - 1] + result[col];
                col++;
            }
            temp[row] = 1;
            swap(result, temp);
        }
        break;
    }

    printf("[%d", result[0]);
    int col = 1;
    while (col <= rowIndex) {
        printf(",%d", result[col]);
        col++;
    }
    printf("]\n");
    return 0;
}
```
### C code after modifying
Next, I change every `int` variable in the C code to `volatile int` and merge some repetitive parts of the code to make it more concise.
```c=
#include<stdio.h>
// Exchange the contents of the two 34-element arrays, one index at a time
// (index 0 first, index 33 last). The arrays, the temporary and the loop
// counter are all declared volatile.
void swap(volatile int res[34], volatile int temp[34]) {
volatile int a ; // holds res[i] while res[i] is overwritten
for ( volatile int i = 0 ; i < 34 ; i ++ ) {
a = res[i] ;
res[i] = temp[i] ;
temp[i] = a ;
} // for
} // swap
// Build row `rowIndex` of Pascal's triangle and print it as "[a,b,...]".
// Each new row is built in `temp` from the previous row in `result`, then the
// two arrays are exchanged with swap().
int main() {
volatile int result[34] = {0} ; // the most recently completed row
volatile int temp[34] = {0} ; // scratch buffer for the row being built
volatile int rowIndex = 5 ;
// Seed row 1 unconditionally. For rowIndex 0 or 1 the loop below does not
// run, and the print loop only emits indices 0..rowIndex.
result[0] = 1 ;
result[1] = 1 ;
if ( rowIndex < 0 || rowIndex > 33 ) { // error
printf("Input is out of range!\n") ;
return 0 ;
} // if
else { // 0 <= rowIndex <= 33
for ( volatile int i = 2 ; i <= rowIndex ; i ++ ) {
temp[0] = 1 ;
// each inner entry is the sum of the two entries above it
for ( volatile int j = 1 ; j < i ; j ++ ) {
temp[j] = result[j-1] + result[j] ;
} // for
temp[i] = 1 ;
swap(result, temp) ; // result now holds row i
} // for
} // else
printf("[%d", result[0]) ;
for ( volatile int k = 1 ; k <= rowIndex ; k ++ ) {
printf(",%d", result[k]) ;
} // for
printf("]\n") ;
} // main()
```
### Run program
1. Put the source code into ~/srv32/sw/lab3.
```shell=
~/srv32/sw$cd lab3
~/srv32/sw/lab3$ls
lab3.c
```
2. Copy the Makefile in ~/srv32/sw/hello to ~/srv32/sw/lab3
```shell=
~/srv32/sw/lab3$ls
lab3.c Makefile
```
3. Modify the Makefile as follows.
```clike=
# Build rules for the lab3 program.
# NOTE(review): CC, OBJCOPY, OBJDUMP, READELF and the initial CFLAGS/LDFLAGS
# are not set in this file; presumably they come from
# ../common/Makefile.common -- confirm there.
include ../common/Makefile.common
EXE = .elf
SRC = lab3.c
CFLAGS += -L../common
LDFLAGS += -T ../common/default.ld
TARGET = lab3
OUTPUT = $(TARGET)$(EXE)
.PHONY: all clean
all: $(TARGET)
# Compile and link $(OUTPUT), then derive from it: the .text image (imem.bin),
# the .data image (dmem.bin), a full binary image (memory.bin), a disassembly
# listing ($(TARGET).dis) and a readelf dump ($(TARGET).symbol).
$(TARGET): $(SRC)
$(CC) $(CFLAGS) -o $(OUTPUT) $(SRC) $(LDFLAGS)
$(OBJCOPY) -j .text -O binary $(OUTPUT) imem.bin
$(OBJCOPY) -j .data -O binary $(OUTPUT) dmem.bin
$(OBJCOPY) -O binary $(OUTPUT) memory.bin
$(OBJDUMP) -d $(OUTPUT) > $(TARGET).dis
$(READELF) -a $(OUTPUT) > $(TARGET).symbol
# Remove the files generated by the build rule above.
clean:
$(RM) *.o $(OUTPUT) $(TARGET).dis $(TARGET).symbol [id]mem.bin memory.bin
```
4. Run the program.
```shell=
$ cd ~/srv32
$ make lab3
make[1]: Entering directory '/home/chewinggum/srv32/sw'
make -C common
make[2]: Entering directory '/home/chewinggum/srv32/sw/common'
make[2]: Nothing to be done for 'all'.
make[2]: Leaving directory '/home/chewinggum/srv32/sw/common'
make[2]: Entering directory '/home/chewinggum/srv32/sw/lab3'
riscv-none-embed-gcc -O3 -Wall -march=rv32im -mabi=ilp32 -nostartfiles -nostdlib -L../common -o lab3.elf lab3.c -lc -lm -lgcc -lsys -T ../common/default.ld
riscv-none-embed-objcopy -j .text -O binary lab3.elf imem.bin
riscv-none-embed-objcopy -j .data -O binary lab3.elf dmem.bin
riscv-none-embed-objcopy -O binary lab3.elf memory.bin
riscv-none-embed-objdump -d lab3.elf > lab3.dis
riscv-none-embed-readelf -a lab3.elf > lab3.symbol
make[2]: Leaving directory '/home/chewinggum/srv32/sw/lab3'
make[1]: Leaving directory '/home/chewinggum/srv32/sw'
make[1]: Entering directory '/home/chewinggum/srv32/sim'
[1,5,10,10,5,1]
Excuting 8493 instructions, 10123 cycles, 1.191 CPI
Program terminate
- ../rtl/../testbench/testbench.v:418: Verilog $finish
Simulation statistics
=====================
Simulation time : 0.087 s
Simulation cycles: 10134
Simulation speed : 0.116483 MHz
make[1]: Leaving directory '/home/chewinggum/srv32/sim'
make[1]: Entering directory '/home/chewinggum/srv32/tools'
./rvsim --memsize 128 -l trace.log ../sw/lab3/lab3.elf
[1,5,10,10,5,1]
Excuting 8493 instructions, 10123 cycles, 1.192 CPI
Program terminate
Simulation statistics
=====================
Simulation time : 0.003 s
Simulation cycles: 10123
Simulation speed : 4.011 MHz
make[1]: Leaving directory '/home/chewinggum/srv32/tools'
Compare the trace between RTL and ISS simulator
=== Simulation passed ===
```
## Requirement 2
Observe the waveform using GTKWave.
```shell=
$gtkwave
```
:::spoiler Disassembly generated by the srv32 build (lab3.dis)
```clike=
0000003c <swap>:
3c: ff010113 addi sp,sp,-16
40: 00012623 sw zero,12(sp)
44: 00c12703 lw a4,12(sp)
48: 02100793 li a5,33
4c: 06e7c263 blt a5,a4,b0 <swap+0x74>
50: 02100693 li a3,33
54: 00c12783 lw a5,12(sp)
58: 00279793 slli a5,a5,0x2
5c: 00f507b3 add a5,a0,a5
60: 0007a783 lw a5,0(a5)
64: 00f12423 sw a5,8(sp)
68: 00c12703 lw a4,12(sp)
6c: 00c12783 lw a5,12(sp)
70: 00271713 slli a4,a4,0x2
74: 00e58733 add a4,a1,a4
78: 00072703 lw a4,0(a4)
7c: 00279793 slli a5,a5,0x2
80: 00f507b3 add a5,a0,a5
84: 00e7a023 sw a4,0(a5)
88: 00c12783 lw a5,12(sp)
8c: 00812703 lw a4,8(sp)
90: 00279793 slli a5,a5,0x2
94: 00f587b3 add a5,a1,a5
98: 00e7a023 sw a4,0(a5)
9c: 00c12783 lw a5,12(sp)
a0: 00178793 addi a5,a5,1
a4: 00f12623 sw a5,12(sp)
a8: 00c12783 lw a5,12(sp)
ac: faf6d4e3 bge a3,a5,54 <swap+0x18>
b0: 01010113 addi sp,sp,16
b4: 00008067 ret
000000b8 <main>:
b8: ed010113 addi sp,sp,-304
bc: 08800613 li a2,136
c0: 00000593 li a1,0
c4: 01010513 addi a0,sp,16
c8: 12112623 sw ra,300(sp)
cc: 12812423 sw s0,296(sp)
d0: 12912223 sw s1,292(sp)
d4: 194000ef jal ra,268 <memset>
d8: 08800613 li a2,136
dc: 00000593 li a1,0
e0: 09810513 addi a0,sp,152
e4: 184000ef jal ra,268 <memset>
e8: 00500793 li a5,5
ec: 00f12223 sw a5,4(sp)
f0: 00100793 li a5,1
f4: 00f12823 sw a5,16(sp)
f8: 00f12a23 sw a5,20(sp)
fc: 00412703 lw a4,4(sp)
100: 14074c63 bltz a4,258 <main+0x1a0>
104: 00412683 lw a3,4(sp)
108: 02100713 li a4,33
10c: 14d74663 blt a4,a3,258 <main+0x1a0>
110: 00412703 lw a4,4(sp)
114: 00200613 li a2,2
118: 00100513 li a0,1
11c: 02100593 li a1,33
120: 0ce7d863 bge a5,a4,1f0 <main+0x138>
124: 08a12c23 sw a0,152(sp)
128: 00100713 li a4,1
12c: fff70793 addi a5,a4,-1
130: 12010813 addi a6,sp,288
134: 00271693 slli a3,a4,0x2
138: 00279793 slli a5,a5,0x2
13c: 00d806b3 add a3,a6,a3
140: 00f807b3 add a5,a6,a5
144: ef07a783 lw a5,-272(a5)
148: ef06a803 lw a6,-272(a3)
14c: 00170713 addi a4,a4,1
150: 010787b3 add a5,a5,a6
154: f6f6ac23 sw a5,-136(a3)
158: fcc71ae3 bne a4,a2,12c <main+0x74>
15c: 00261793 slli a5,a2,0x2
160: 12010713 addi a4,sp,288
164: 00f707b3 add a5,a4,a5
168: f6a7ac23 sw a0,-136(a5)
16c: 00012623 sw zero,12(sp)
170: 00c12783 lw a5,12(sp)
174: 06f5c863 blt a1,a5,1e4 <main+0x12c>
178: 00c12783 lw a5,12(sp)
17c: 01010713 addi a4,sp,16
180: 09810693 addi a3,sp,152
184: 00279793 slli a5,a5,0x2
188: 00f707b3 add a5,a4,a5
18c: 0007a783 lw a5,0(a5)
190: 00f12423 sw a5,8(sp)
194: 00c12703 lw a4,12(sp)
198: 00c12783 lw a5,12(sp)
19c: 00271713 slli a4,a4,0x2
1a0: 00e68733 add a4,a3,a4
1a4: 00072703 lw a4,0(a4)
1a8: 01010693 addi a3,sp,16
1ac: 00279793 slli a5,a5,0x2
1b0: 00f687b3 add a5,a3,a5
1b4: 00e7a023 sw a4,0(a5)
1b8: 00c12783 lw a5,12(sp)
1bc: 00812703 lw a4,8(sp)
1c0: 09810693 addi a3,sp,152
1c4: 00279793 slli a5,a5,0x2
1c8: 00f687b3 add a5,a3,a5
1cc: 00e7a023 sw a4,0(a5)
1d0: 00c12783 lw a5,12(sp)
1d4: 00178793 addi a5,a5,1
1d8: 00f12623 sw a5,12(sp)
1dc: 00c12783 lw a5,12(sp)
1e0: f8f5dce3 bge a1,a5,178 <main+0xc0>
1e4: 00412783 lw a5,4(sp)
1e8: 00160613 addi a2,a2,1
1ec: f2c7dce3 bge a5,a2,124 <main+0x6c>
1f0: 01012583 lw a1,16(sp)
1f4: 00020537 lui a0,0x20
1f8: 05450513 addi a0,a0,84 # 20054 <__malloc_trim_threshold+0x1c>
1fc: 188000ef jal ra,384 <printf>
200: 00412783 lw a5,4(sp)
204: 02f05863 blez a5,234 <main+0x17c>
208: 00100413 li s0,1
20c: 000204b7 lui s1,0x20
210: 00241793 slli a5,s0,0x2
214: 12010713 addi a4,sp,288
218: 00f707b3 add a5,a4,a5
21c: ef07a583 lw a1,-272(a5)
220: 05848513 addi a0,s1,88 # 20058 <__malloc_trim_threshold+0x20>
224: 00140413 addi s0,s0,1
228: 15c000ef jal ra,384 <printf>
22c: 00412783 lw a5,4(sp)
230: fe87d0e3 bge a5,s0,210 <main+0x158>
234: 00020537 lui a0,0x20
238: 05c50513 addi a0,a0,92 # 2005c <__malloc_trim_threshold+0x24>
23c: 264000ef jal ra,4a0 <puts>
240: 12c12083 lw ra,300(sp)
244: 12812403 lw s0,296(sp)
248: 12412483 lw s1,292(sp)
24c: 00000513 li a0,0
250: 13010113 addi sp,sp,304
254: 00008067 ret
258: 00020537 lui a0,0x20
25c: 03c50513 addi a0,a0,60 # 2003c <__malloc_trim_threshold+0x4>
260: 240000ef jal ra,4a0 <puts>
264: fddff06f j 240 <main+0x188>
```
:::
- srv32 is a three-stage pipelined RISC-V processor; its stages are IF/ID, EXE and WB.
- When a `branch` is taken in the EXE stage, the instructions that have already been fetched into the pipeline must be flushed.
- A taken branch therefore costs a penalty of two stall cycles.
| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
| -------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- |
| |IF/ID | EXE | WB |
|jal||IF/ID|EXE|WB|
||||nop|nop|nop|
|||||nop|nop|nop|
||||||IF/ID|EXE|WB|
From the following waveform and part of the code, when `branch` is taken, the two instructions after the `branch` instruction will be flushed.

```clike=
178: lw a5,12(sp)
...
1e0: bge a1,a5,178
1e4: lw a5,4(sp)
1e8: addi a2,a2,1
```
| address | instruction | 1 | 2 | 3 | 4|5|6|
| -------- | -------- | -------- |-------- |-------- |-------- |-------- |-------- |
| 1e0 | bge a1,a5,178 | IF/ID |EXE|WB|
| 1e4 | lw a5,4(sp) | |nop|nop|nop|
| 1e8 | addi a2,a2,1 | ||nop|nop|nop|
| 178 | lw a5,12(sp) | |||IF/ID|EXE|WB|
## Requirement 3
- Every function call is a taken jump, so it incurs the `branch` penalty of 2 stall cycles.
- In order to reduce the number of cycles required to run the program, I put the body of the `swap` function directly into the `main` function.
### The code after optimization
```c=
#include<stdio.h>
// Build row `rowIndex` of Pascal's triangle and print it as "[a,b,...]".
// Same algorithm as the previous version, but the exchange of `result` and
// `temp` is written inline instead of calling swap().
int main() {
volatile int result[34] = {0} ; // the most recently completed row
volatile int temp[34] = {0} ; // scratch buffer for the row being built
volatile int rowIndex = 5 ;
// Seed row 1 unconditionally. For rowIndex 0 or 1 the loop below does not
// run, and the print loop only emits indices 0..rowIndex.
result[0] = 1 ;
result[1] = 1 ;
if ( rowIndex < 0 || rowIndex > 33 ) { // error
printf("Input is out of range!\n") ;
return 0 ;
} // if
else { // 0 <= rowIndex <= 33
for ( volatile int i = 2 ; i <= rowIndex ; i ++ ) {
temp[0] = 1 ;
// each inner entry is the sum of the two entries above it
for ( volatile int j = 1 ; j < i ; j ++ ) {
temp[j] = result[j-1] + result[j] ;
} // for
temp[i] = 1 ;
// inlined swap: exchange all 34 entries of result and temp
volatile int a ;
for ( volatile int k = 0 ; k < 34 ; k ++ ) {
a = result[k] ;
result[k] = temp[k] ;
temp[k] = a ;
} // for
} // for
} // else
printf("[%d", result[0]) ;
for ( volatile int k = 1 ; k <= rowIndex ; k ++ ) {
printf(",%d", result[k]) ;
} // for
printf("]\n") ;
} // main()
```
### Run program
```shell=
make[1]: Entering directory '/home/chewinggum/srv32/sw'
make -C common
make[2]: Entering directory '/home/chewinggum/srv32/sw/common'
make[2]: Nothing to be done for 'all'.
make[2]: Leaving directory '/home/chewinggum/srv32/sw/common'
make[2]: Entering directory '/home/chewinggum/srv32/sw/lab3'
riscv-none-embed-gcc -O3 -Wall -march=rv32im -mabi=ilp32 -nostartfiles -nostdlib -L../common -o lab3.elf lab3.c -lc -lm -lgcc -lsys -T ../common/default.ld
riscv-none-embed-objcopy -j .text -O binary lab3.elf imem.bin
riscv-none-embed-objcopy -j .data -O binary lab3.elf dmem.bin
riscv-none-embed-objcopy -O binary lab3.elf memory.bin
riscv-none-embed-objdump -d lab3.elf > lab3.dis
riscv-none-embed-readelf -a lab3.elf > lab3.symbol
make[2]: Leaving directory '/home/chewinggum/srv32/sw/lab3'
make[1]: Leaving directory '/home/chewinggum/srv32/sw'
make[1]: Entering directory '/home/chewinggum/srv32/sim'
[1,5,10,10,5,1]
Excuting 8381 instructions, 10011 cycles, 1.194 CPI
Program terminate
- ../rtl/../testbench/testbench.v:418: Verilog $finish
Simulation statistics
=====================
Simulation time : 0.087 s
Simulation cycles: 10022
Simulation speed : 0.115195 MHz
make[1]: Leaving directory '/home/chewinggum/srv32/sim'
make[1]: Entering directory '/home/chewinggum/srv32/tools'
./rvsim --memsize 128 -l trace.log ../sw/lab3/lab3.elf
[1,5,10,10,5,1]
Excuting 8381 instructions, 10011 cycles, 1.194 CPI
Program terminate
Simulation statistics
=====================
Simulation time : 0.003 s
Simulation cycles: 10011
Simulation speed : 3.613 MHz
make[1]: Leaving directory '/home/chewinggum/srv32/tools'
Compare the trace between RTL and ISS simulator
=== Simulation passed ===
```
- This reduces the cycle count by 10123 - 10011 = 112 cycles.
### Extra part
- I found that the loop exchanging the two arrays does a lot of unnecessary work: it always runs 34 iterations, no matter how many entries are actually in use.
- I changed the loop bound so that the number of iterations depends on the size of the current row (`k <= i`) instead.
- The modified code is as follows.
```c=
#include<stdio.h>
// Build row `rowIndex` of Pascal's triangle and print it as "[a,b,...]".
// The exchange of `result` and `temp` is written inline and limited to the
// entries of the current row.
int main() {
volatile int result[34] = {0} ; // the most recently completed row
volatile int temp[34] = {0} ; // scratch buffer for the row being built
volatile int rowIndex = 5 ;
// Seed row 1 unconditionally. For rowIndex 0 or 1 the loop below does not
// run, and the print loop only emits indices 0..rowIndex.
result[0] = 1 ;
result[1] = 1 ;
if ( rowIndex < 0 || rowIndex > 33 ) { // error
printf("Input is out of range!\n") ;
return 0 ;
} // if
else { // 0 <= rowIndex <= 33
for ( volatile int i = 2 ; i <= rowIndex ; i ++ ) {
temp[0] = 1 ;
// each inner entry is the sum of the two entries above it
for ( volatile int j = 1 ; j < i ; j ++ ) {
temp[j] = result[j-1] + result[j] ;
} // for
temp[i] = 1 ;
// row i was written to temp[0..i], so only those entries are exchanged
volatile int a ;
for ( volatile int k = 0 ; k <= i ; k ++ ) {
a = result[k] ;
result[k] = temp[k] ;
temp[k] = a ;
} // for
} // for
} // else
printf("[%d", result[0]) ;
for ( volatile int k = 1 ; k <= rowIndex ; k ++ ) {
printf(",%d", result[k]) ;
} // for
printf("]\n") ;
} // main()
```
- Result
```shell=
make[1]: Entering directory '/home/chewinggum/srv32/sw'
make -C common
make[2]: Entering directory '/home/chewinggum/srv32/sw/common'
make[2]: Nothing to be done for 'all'.
make[2]: Leaving directory '/home/chewinggum/srv32/sw/common'
make[2]: Entering directory '/home/chewinggum/srv32/sw/lab3'
riscv-none-embed-gcc -O3 -Wall -march=rv32im -mabi=ilp32 -nostartfiles -nostdlib -L../common -o lab3.elf lab3.c -lc -lm -lgcc -lsys -T ../common/default.ld
riscv-none-embed-objcopy -j .text -O binary lab3.elf imem.bin
riscv-none-embed-objcopy -j .data -O binary lab3.elf dmem.bin
riscv-none-embed-objcopy -O binary lab3.elf memory.bin
riscv-none-embed-objdump -d lab3.elf > lab3.dis
riscv-none-embed-readelf -a lab3.elf > lab3.symbol
make[2]: Leaving directory '/home/chewinggum/srv32/sw/lab3'
make[1]: Leaving directory '/home/chewinggum/srv32/sw'
make[1]: Entering directory '/home/chewinggum/srv32/sim'
[1,5,10,10,5,1]
Excuting 5452 instructions, 6846 cycles, 1.255 CPI
Program terminate
- ../rtl/../testbench/testbench.v:418: Verilog $finish
Simulation statistics
=====================
Simulation time : 0.07 s
Simulation cycles: 6857
Simulation speed : 0.0979571 MHz
make[1]: Leaving directory '/home/chewinggum/srv32/sim'
make[1]: Entering directory '/home/chewinggum/srv32/tools'
./rvsim --memsize 128 -l trace.log ../sw/lab3/lab3.elf
[1,5,10,10,5,1]
Excuting 5452 instructions, 6846 cycles, 1.256 CPI
Program terminate
Simulation statistics
=====================
Simulation time : 0.003 s
Simulation cycles: 6846
Simulation speed : 2.677 MHz
make[1]: Leaving directory '/home/chewinggum/srv32/tools'
Compare the trace between RTL and ISS simulator
=== Simulation passed ===
```
- This reduces the cycle count by a further 10011 - 6846 = 3165 cycles.