2022 Assignment3: SoftCPU
contribute by <chiangkd
>
Requirement follow Assignment3: SoftCPU
Source code
I choose the program from 鄒崴丞-Length of Last Word and Leetcode 77. Combinations
C code(Modified from 鄒崴丞)
.org 0
.global _start
/* newlib system calls */
.set SYSEXIT, 93
.set SYSWRITE, 64
.data
str1: .string "Hello World"
.set str1len, .-str1
str2: .string "I am a student "
.set str2len, .-str2
str3: .string "a"
.set str3len, .-str3
fstr: .string "For string \""
.set flen, .-fstr
answrstr: .string "\", length of last word : "
.set answrlen, .-answrstr
newline: .string "\n"
.set nllen, .-newline
.text
_start:
la a0, str1
jal ra, lengthoflastword
add a2, x0, a0
la a0, str1
li a1, str1len
jal print_result
la a0, str2
jal ra, lengthoflastword
add a2, x0, a0
la a0, str2
li a1, str2len
jal print_result
la a0, str3
jal ra, lengthoflastword
add a2, x0, a0
la a0, str3
li a1, str3len
jal print_result
li a0, 0
li a7, SYSEXIT
ecall
nochar:
li a0, 0
ret
lengthoflastword:
lb t0, 0(a0)
beq t0, x0, nochar
a2, a0, 3
add a1, x0, a0
bne a2, x0, unaligned # jump if the address is unaligned
starttraverse:
lui a3, 0x7F7F8
addi a3, a3, -129
addi a4, x0, -1
findend:
lw t0, 0(a1)
lw t1, 4(a1)
lw t2, 8(a1)
lw t3, 12(a1)
a2, t0, t1
t4, t2, t3
a2, a2, t4
addi a1, a1, 16
a5, a2, a3
add a5, a5, a3
or a5, a5, a3
beq a5, a4, findend
a5, t0, a3
add a5, a5, a3
or a5, a5, a3
bne a5, a4, back12
a5, t1, a3
add a5, a5, a3
or a5, a5, a3
bne a5, a4, back8
a5, t2, a3
add a5, a5, a3
or a5, a5, a3
bne a5, a4, back4
findbyte:
lbu a2, -4(a1)
sub a3, a1, a0
beq a2, x0, sub4
lbu a2, -3(a1)
beq a2, x0, sub3
lbu a2, -2(a1)
snez a2, a2
add a2, a2, a3
addi a1, a2, -2
j findword
back4:
addi a1, a1, -4
j findbyte
back8:
addi a1, a1, -8
j findbyte
back12:
addi a1, a1, -12
j findbyte
checkalign:
beq a3, x0, starttraverse
unaligned:
lbu a2, 0(a1)
addi a1, a1, 1
a3, a1, 3
bne a2, x0, checkalign
sub a1, a1, a0
addi a1, a1, -1
j findword
sub4:
addi a1, a3, -4
j findword
sub3:
addi a1, a3, -3
j findword
findword:
li a2, 32
add a0, a1, a0
addi a1, a1, 1
loop1:
beq a1, x0, return
addi a0, a0, -1
addi a1, a1, -1
lb t0, 0(a0)
beq t0, a2, loop1
add a3, x0, a0
add a0, x0, x0
loop2:
beq a1, x0, return
addi a1, a1, -1
addi a3, a3, -1
addi a0, a0, 1
lb t0, 0(a3)
bne t0, a2, loop2
return:
ret
print_result:
add s0, x0, a0
add s1, x0, a1
add s2, x0, a2
li a7, SYSWRITE
li a0, 1
la a1, fstr
la a2, flen
ecall
li a7, SYSWRITE
li a0, 1
add a1, x0, s0
add a2, x0, s1
ecall
li a7, SYSWRITE
li a0, 1
la a1, answrstr
li a2, answrlen
ecall
li a7, SYSWRITE
addi a1, s2, 48
addi sp, sp, -4
sw a1, 0(sp)
li a0, 1
addi a1, sp, 0
li a2, 4
ecall
addi sp, sp, 4
li a7, SYSWRITE
li a0, 1
la a1, newline
li a2, nllen
ecall
ret
- There are serval version in 鄒崴丞-Length of Last Word, I take the version with SWAR(4 register at one time), It seems has the best performance (CSR cycle) with longer string.
Given two integers n
and k
, return all possible combinations of k
numbers chosen from the range [1, n]
.
Constraints:
Leetcode Solution
From description, we know that the MAXLEN about the answer will be , so that's why I give it a constant value and after caculation, just realloc
it.
Improvement
In LeetCode, there are many problems take the argument int *returnSize
and int ** returnColumnSizes
, the first one is about output array's length, and the second one is about the each column size in the output array.
But, in my problem, the column size will always equal to k
, which is an argument about combine
function, and the returnSize will equal to as mention above.
So, I remove returnSize
and returnColumnSizes
to simplfy the question. And also, use structure to reduce argument number
However, when I do make
with this program, get an error message
which is defined in testbench.v
, means out of memory range (DRAM + IRAM), so I shrink the MAXLENGTH
to make it run properly.
Tweaked programs for srv32
- You should validate the results in your program(s).
- You should program in RISC-V assembly for the sake of further optimizations. Hint: You may modify a customized
Makefile
for building C and assembly source files from scratch.
Modify Makefile
Simply copy Makefile
from existing hello
example and just modifiy the target and associated dependency name
And do some modification to let the assembly run on srv32
printf
system call
- stored
ra
.data
str1: .string "Hello World"
str2: .string "I am a student "
str3: .string "a"
space: .string " "
fstr: .string "For string \""
answrstr: .string "\", length of last word : "
newline: .string "\n"
iformat: .string "%d"
.text
.global main
main:
addi sp, sp, -4
sw ra, 0(sp)
la a0, str1
jal ra, lengthoflastword
add a2, x0, a0
la a0, str1
jal print_result
la a0, str2
jal ra, lengthoflastword
add a2, x0, a0
la a0, str2
jal print_result
la a0, str3
jal ra, lengthoflastword
add a2, x0, a0
la a0, str3
jal print_result
lw ra, 0(sp)
addi sp, sp, 4
li a0, 0
ret
lengthoflastword:
lb t0, 0(a0)
beq t0, x0, return
andi a2, a0, 3
add a1, x0, a0
bne a2, x0, unaligned
starttraverse:
lui a3, 0x7F7F8
addi a3, a3, -129
addi a4, x0, -1
findend:
lw t0, 0(a1)
lw t1, 4(a1)
lw t2, 8(a1)
lw t3, 12(a1)
and a2, t0, t1
and t4, t2, t3
and a2, a2, t4
addi a1, a1, 16
and a5, a2, a3
add a5, a5, a3
or a5, a5, a3
beq a5, a4, findend
and a5, t0, a3
add a5, a5, a3
or a5, a5, a3
bne a5, a4, back12
and a5, t1, a3
add a5, a5, a3
or a5, a5, a3
bne a5, a4, back8
and a5, t2, a3
add a5, a5, a3
or a5, a5, a3
bne a5, a4, back4
findbyte:
lbu a2, -4(a1)
sub a3, a1, a0
beq a2, x0, sub4
lbu a2, -3(a1)
beq a2, x0, sub3
lbu a2, -2(a1)
snez a2, a2
add a2, a2, a3
addi a1, a2, -2
j findword
back4:
addi a1, a1, -4
j findbyte
back8:
addi a1, a1, -8
j findbyte
back12:
addi a1, a1, -12
j findbyte
checkalign:
beq a3, x0, starttraverse
unaligned:
lbu a2, 0(a1)
addi a1, a1, 1
andi a3, a1, 3
bne a2, x0, checkalign
sub a1, a1, a0
addi a1, a1, -1
j findword
sub4:
addi a1, a3, -4
j findword
sub3:
addi a1, a3, -3
j findword
findword:
li a2, 32
add a0, a1, a0
addi a1, a1, 1
loop1:
beq a1, x0, return
addi a0, a0, -1
addi a1, a1, -1
lb t0, 0(a0)
beq t0, a2, loop1
add a3, x0, a0
add a0, x0, x0
loop2:
beq a1, x0, return
addi a1, a1, -1
addi a3, a3, -1
addi a0, a0, 1
lb t0, 0(a3)
bne t0, a2, loop2
return:
ret
print_result:
addi sp, sp, -4
sw ra, 0(sp)
add s0, x0, a0
add s1, x0, a1
add s2, x0, a2
la a0, fstr
call printf
add a0, x0, s0
call printf
la a0, answrstr
call printf
addi a1, s2, 0
addi sp, sp, -4
sw a1, 0(sp)
la a0, iformat
lw a1, 0(sp)
call printf
addi sp, sp, 4
la a0, newline
call printf
lw ra, 0(sp)
addi sp, sp, 4
ret
Result
sim
rvsim
- The answer is correct
- Executing 12991 instructions
- 17619 cycles
- 1.356 CPI
|
sim |
rvsim |
Simulation time |
0.075s |
0.03s |
Simulation cycles |
17252 |
17241 |
Simulation speed |
0.230027 MHz |
5.760 MHz |
From the C code mentioned above, write the assembly which can properly run on srv32
Hand-write Assembly
.data
n: .word 20
lbracket: .string "["
rbracket: .string "]"
comma: .string ","
backspace: .string "\b"
newline: .string "\n"
iformat: .string "%d,"
xformat: .string "%x\n"
.text
.global main
main:
li t1, 0
addi sp, sp, -4
sw ra, 0(sp)
# push sp
lw t0, n
slli t0, t0, 2
sub sp, sp, t0
mv a0, sp # a0 = &cond
li t0, 4 # cond.n = 4
sw t0, 0(a0)
li t0, 3 # cond.k = 3
sw t0, 4(a0)
sw x0, 8(a0) # cond.returnSize = 0
call combine
# a0 = res
lw a1, 4(a3) # a1 = cond.k
lw a2, 8(a3) # a2 = cond.returnSize
call printAns
# pop sp
lw t0, n
slli t0, t0, 2
add sp, sp, t0
# restore ra
lw ra, 0(sp)
addi sp, sp, 4
# exit
li a0, 0
ret
memcpy:
# a0 dest
# a1 src
# a2 n
#slli t5, a2, 0x2
li t3, 0 # i = 0
mcpyloop:
lw t4, 0(a1)
sw t4, 0(a0)
addi a0, a0, 4
addi a1, a1, 4
addi t3, t3, 4
bne t3, a2, mcpyloop
ret
GetLineSize:
# a0 = res
# a1 = line
# a2 = lineSize
# a3 = cond
# a4 = idx
addi sp, sp, -12
sw ra, 0(sp)
sw a2, 4(sp) # lineSize
sw a4, 8(sp) # idx
lw s1, 4(a3) # s1 = cond->k
bne a2, s1, GLSrecursive # if (lineSize != cond->k)
PutLineIn:
# a0 = res
# a1 = line
# a2 = lineSize
# a3 = cond
# a4 = idx
addi sp, sp, -24
sw a0, 0(sp)
sw a1, 4(sp)
sw a2, 8(sp)
sw a3, 12(sp)
sw a4, 16(sp)
lw t2, 20(sp)
add t1, t2, t1
add a0, a0, t1 # a0 = res[cond->returnSize]
lw t2, 4(a3) # t2 = cond->k
slli a2, t2, 0x2 # a2 = sizoef(int) * cond->k
sw a2, 20(sp) # store offset # Here can do some improvement
call memcpy
lw a0, 0(sp) # res
lw a1, 4(sp) # line
lw a2, 8(sp) # lineSize
lw a3, 12(sp) # cond
lw a4, 16(sp) # idx
addi sp, sp, 24
addi t5, t5, 1 # cond->returnSize++
sw t5, 8(a3) # store returnSize back
lw ra, 0(sp)
addi sp, sp, 12
ret
GLSrecursive:
mv s2, a2 # s1 = lineSize
mv s3, a4 # int i = idx
lw t0, 0(a3) # t0 = cond->n
blt t0, s3, Normalreturn
#addi t0, t0, 1
GLSloop:
slli t3, s2, 0x2 # t3 = line[lineSize] offset
add t4, a1, t3 # t4 = line[lineSize] address
sw s3, 0(t4) # line[lineSize] = i
# a2, a3 must put in function with plus 1
addi a2, s2, 1 # lineSize + 1
addi a4, s3, 1 # i + 1
call GetLineSize
addi s3, s3, 1 # i++
sw s3, 8(sp)
bge t0, s3, GLSloop
Normalreturn:
lw ra, 0(sp)
addi sp, sp, 12
lw a2, 4(sp)
lw a4, 8(sp)
mv s2, a2
mv s3, a4
ret
combine:
# a0 = &cond
# 0(a0) = cond->n
# 4(a0) = cond->k
# 8(a0) = cond->returnSize
addi sp, sp, -24
sw ra, 0(sp)
mv a3, a0 # a3 = cond
addi a0, a0, -2000 # a0 = address of **res
addi a1, a0, -2000 # a1 = address of line[cond->k]
li a2, 0 # a2 = lineSize
li a4, 1 # a4 = 1
sw a0, 4(sp)
sw a1, 8(sp)
sw a2, 12(sp)
sw a3, 16(sp)
sw a4, 20(sp)
call GetLineSize
lw ra, 0(sp)
lw a0, 4(sp) # res
lw a1, 8(sp) # line
lw a2, 12(sp) # lineSize
lw a3, 16(sp) # cond
lw a4, 20(sp) # idx
addi sp, sp, 24
ret
printAns:
# a0 = res
# a1 = k (cond.k)
# a2 = returnSize (cond.returnSize)
mv s2, a1 # return Size = 3
mv s3, a2 # k = 4
addi sp, sp, -4
sw ra, 0(sp)
mv s1, a0 # s1 = res
la a0, lbracket
call printf
li s4, 0 # int i = 0
li s5, 0 # int j = 0
printloop1:
la a0, lbracket
call printf
li s4, 0 # int j = 0
printloop2:
lw a1, 0(s1) # res[i]
addi s1, s1, 0x4 # res[i][j] address offset
la a0, iformat
call printf
addi s4, s4, 1 # i++
bne s4, s2, printloop2 # j < returnColumnSizes
la a0, backspace
call printf
la a0, rbracket
call printf
la a0, comma
call printf
addi s5, s5, 1 # i++
bne s5, s3, printloop1 # i < returnSize
la a0, backspace
call printf
la a0, rbracket
call printf
la a0, newline
call printf
lw ra, 0(sp)
addi sp, sp, 4
ret
- There is a little difficult for me to wrtie recursive code in assembly, the difficult point is how stack pointer move, once the
sp
get wrong position, the return address(ra
) will totally mess up and the program will not run properly.
Take input n = 4, k = 3 for example, The procedure about how Combinations(4,3)
work as below.
The redline is actually the order of function execute, and in every backward arrow and vertical arrow, that is, the red arrow which is no overlapped with black arrow will pop the stack with specific number of bytes coresspond to k
value, and the overlapping situation will push the stack.
Function Entry |
sp |
|
|
|
|
|
|
|
|
|
|
|
Function Out |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Result
sim
rvsim
- The answer is correct
- Executing 18343 instructions
- 24105 cycles
- 1.314 CPI
|
sim |
rvsim |
Simulation time |
0.134s |
0.004s |
Simulation cycles |
24123 |
24112 |
Simulation speed |
0.180022 MHz |
5.725 MHz |
Analysize srv32 RV32 core
Hint by Lab3: srv32, srv32 is a 3-stage pipeline architecture with IF/ID, EX, WB stages and supports full forwarding, means that RAW data hazard can be resolved without stalling the processor

Compare the disassembly and the waveform

First check the waveform, at initial imem
fetch the instruction at 00000000
, which is corresponded to 00010297
stands for auipc t0, 0x10
, and inst[31:0]
not decode by instr_decoder yet (00000013
stands for addi x0, x0, 0
, which is NOP
)

Next, IF/ID in the same stage (same clock period), which can obtain rdata
and inst
with same value.
Check hazard

Since srv32 has fully bypassing, there is not stall generated by data hazards
Check branch

Take first branch occur for example, at 78 ps (the line in the figure), raddr
and rdata
equals to FE62ECE3
means bltu t0, t1, 20
at IF/ID stage, and one clock cycle after at the EX stage, branch will be taken. Also a clock cycle after wb_flush
will trigger and flush 2 cycle, then reload the correct instruction after branching.
Disassembly
Instruction |
c1 |
c2 |
c3 |
c4 |
c5 |
c6 |
c7 |
bltu t0, t1, 20 <_bss_clear> |
IF/ID |
EX |
WB |
|
|
|
|
auipc sp, 0x40 |
|
IF/IDNOP |
EXNOP |
WBNOP |
|
|
|
addi sp, sp, -44 |
|
|
IF/IDNOP |
EXNOP |
WBNOP |
|
|
sw zero, 0(t0) |
|
|
|
IF/ID |
EX |
WB |
|
addi t0, t0, 4 |
|
|
|
|
IF/ID |
EX |
WB |


- From c1 to c5 is correspond to the figure above from 77 ps to 97 ps (5 clock cycle)
Notice that at 4293 ps, 0xFEC296E3
was executed which is correspond to bne t0, a2, loop2
. In first testcase, this branch is taken totally four times, the number of the loop execute is relate to the last word's length. Here can do loop unrolling to reduce branch panelty.

At 4053ps, 120000EF
was executed, which is call combine
. Get into combine
function and run the algorithm and cause branch panelty with 2 cycle panelty.

There is similar behavior (branch and flush) if branch is taken. So, the branch in my program can be do some optimization to reduce branch panelty.
Propose the software optimizations
Original loop2
which is relate the length of last word.
Loop unrolling 2 times.
Loop unrolling 4 times.
loop2:
beq a1, x0, return # if encounter the last position, return
addi a1, a1, -1
addi a3, a3, -1
addi a0, a0, 1 # s++
lb t0, 0(a3) # t0 = s[i]
beq t0, a2, return
beq a1, x0, return # if encounter the last position, return
addi a1, a1, -1
addi a3, a3, -1
addi a0, a0, 1 # s++
lb t0, 0(a3) # t0 = s[i]
beq a1, x0, return # if encounter the last position, return
addi a1, a1, -1
addi a3, a3, -1
addi a0, a0, 1 # s++
lb t0, 0(a3) # t0 = s[i]
beq a1, x0, return # if encounter the last position, return
addi a1, a1, -1
addi a3, a3, -1
addi a0, a0, 1 # s++
lb t0, 0(a3) # t0 = s[i]
bne t0, a2, loop2
|
Original |
Unrolling 2 |
Unrolling 4 |
Instructions |
12991 |
12980 |
12986 |
Cycles |
17619 |
17596 |
17598 |
CPI |
1.356 |
1.356 |
1.355 |
- Notice that the performance is no grouth with the numbers of unrolling, that's because the in the testcase above, the last word length is not so long so the branch will no take so much.
If change the testcase to
|
Original |
Unrolling 2 |
Unrolling 4 |
Instructions |
15428 |
15382 |
15358 |
Cycles |
20926 |
20830 |
20782 |
CPI |
1.356 |
1.354 |
1.353 |
- As the length of last word be longer, the improvement of loop unrolling is clear.
To reduce the branch panelty, unrolling the printloop2
loop by 4 times
printloop2:
beq s4, s2, loop2out
lw a1, 0(s1)
addi s1, s1, 0x4
la a0, iformat
call printf
addi s4, s4, 1
beq s4, s2, loop2out
lw a1, 0(s1)
addi s1, s1, 0x4
la a0, iformat
call printf
addi s4, s4, 1
beq s4, s2, loop2out
lw a1, 0(s1)
addi s1, s1, 0x4
la a0, iformat
call printf
addi s4, s4, 1
beq s4, s2, loop2out
lw a1, 0(s1)
addi s1, s1, 0x4
la a0, iformat
call printf
addi s4, s4, 1
beq s4, s2, loop2out
lw a1, 0(s1)
addi s1, s1, 0x4
la a0, iformat
call printf
addi s4, s4, 1
beq s4, s2, loop2out
lw a1, 0(s1)
addi s1, s1, 0x4
la a0, iformat
call printf
addi s4, s4, 1
j printloop2
- There is the same if I unroll 4, 8, 16 times. because in my testcase,
loop2
handle the size of column (cond->k
), which is 3, so if want to unroll much more, need to take another testcase.
Furthermore, make function printAns
, combine
, memcpy
, to inline functon, such as:
- Also can reduce branch panelty
|
Original |
Unrolling printAns loop |
Unrolling and inline function |
Instructions |
18350 |
18354 |
18342 |
Cycles |
24112 |
24108 |
24072 |
CPI |
1.314 |
1.314 |
1.312 |
How srv32 work with Verilator
If do make hello
, it will go find the correspond foled under /sw
and do a series operations.
line 1
do the compilation and generate hello.elf
line 2
copy .text
section in hello.elf
and generate imem.bin
line 3
copy .data
section in hello.elf
and generate dmem.bin
line 4
create an output file in binary
line 5
disassemble .text
section (executable sections)
line 6
disassemble with -h -l -S -s -r -d -V -A -I
and next see /sim/Makefile
- parse argument from
filelist.txt
filelist.txt
Do make
under /sim
will generate executable file sim
and a bunch of file which will move to sim_cc
Then ./sim
can run the executable file which is done before (generate imem
and dmem
).
If simply run make hello.run
under /sim
, it will copy imem.bin
and dmem.bin
under /sw/hello
and run the program.
sim_main.cpp
The while loop will run until verilator end, and can see three signal(resetb, stall, clk
) in GTKWave

And use chrono to caculate time.
Reference Link