owned this note
owned this note
Published
Linked with GitHub
# Assignment2: RISC-V Toolchain
contributed by < `dck9661` >
# Bubble Sort
## Pick up a assembly program from Assignment1
* choose [bubble sort](https://github.com/Shengyuu/Assignment_computer_arch) assembly program from [林聖堯](https://hackmd.io/@_01X9rimQmWH33Djf8QhoA/HJ1QiTzYB).
```cpp=
main:
#create stack
addi sp,sp,-48
#save s0
sw s0,44(sp)
#update s0
addi s0,sp,48
#init arr[] to memory
li a5,3
sw a5,-48(s0)
li a5,5
sw a5,-44(s0)
li a5,1
sw a5,-40(s0)
li a5,2
sw a5,-36(s0)
li a5,4
sw a5,-32(s0)
#init i to memory
sw zero,-20(s0)
j .L2
.L6:
#init j to memory
sw zero,-24(s0)
j .L3
.L5:
#load arr[ j ] to a4
lw a5,-24(s0)
slli a5,a5,2
addi a4,s0,-16
add a5,a4,a5
lw a4,-32(a5)
#load arr[j+1] a5
lw a5,-24(s0)
addi a5,a5,1
slli a5,a5,2
addi a3,s0,-16
add a5,a3,a5
lw a5,-32(a5)
#if arr[j + 1] > arr[j]
bge a5,a4,.L4
# arr[j+1] < arr[j]
#load arr[j] a5
lw a5,-24(s0)
slli a5,a5,2
addi a4,s0,-16
add a5,a4,a5
lw a5,-32(a5)
#store arr[j] to tmp
sw a5,-28(s0)
#load arr[j+1] to a4
lw a5,-24(s0)
addi a5,a5,1
slli a5,a5,2
addi a4,s0,-16
add a5,a4,a5
lw a4,-32(a5)
#arr[j] = arr[j+1]
lw a5,-24(s0)
slli a5,a5,2
addi a3,s0,-16
add a5,a3,a5
sw a4,-32(a5)
#store tmp to arr[j+1]
lw a5,-24(s0)
addi a5,a5,1
slli a5,a5,2
addi a4,s0,-16
add a5,a4,a5
lw a4,-28(s0)
sw a4,-32(a5)
.L4:
#j++
lw a5,-24(s0)
addi a5,a5,1
sw a5,-24(s0)
.L3:
#check (j - i) < 4
li a4,4
lw a5,-20(s0)
sub a5,a4,a5
lw a4,-24(s0)
blt a4,a5,.L5
#i++
lw a5,-20(s0)
addi a5,a5,1
sw a5,-20(s0)
.L2:
#check i < 4
lw a4,-20(s0)
li a5,3
bge a5,a4,.L6
#return
li a5,0
mv a0,a5
lw s0,44(sp)
addi sp,sp,48
jr ra
```
* bubble sort C program (this is provided by [林聖堯](https://hackmd.io/@_01X9rimQmWH33Djf8QhoA/HJ1QiTzYB) )
```cpp=
#include <stdio.h>
int main()
{
int arr[5];
arr[0] = 3;
arr[1] = 5;
arr[2] = 1;
arr[3] = 2;
arr[4] = 4;
int tmp;
for(int i = 0; i < 4; i++){
for(int j = 0; j < 4 - i; j++){
if(arr[j] > arr[j + 1]){
tmp = arr[j];
arr[j] = arr[j + 1];
arr[j + 1] = tmp;
}
}
}
return 0;
}
```
## Rewrite assembly programs into C implementation
This bubble sort version is implemented by myself.
```cpp=
#include <stdio.h>
typedef enum { false = 0, true = !false } bool;
int main()
{
int arr[5];
arr[0] = 3;
arr[1] = 5;
arr[2] = 1;
arr[3] = 2;
arr[4] = 4;
int tmp;
for(int i = 0; i < 4; i++){
bool flag = false;
for(int j = 0; j < 4 - i; j++){
if(arr[j] > arr[j + 1]){
flag = true;
tmp = arr[j];
arr[j] = arr[j + 1];
arr[j + 1] = tmp;
}
}
if(!flag) break;
}
return 0;
}
```
Rewrite bubble sort to execute with [rv32emu]((https://github.com/sysprog21/rv32emu)).
```cpp=
typedef enum { false = 0, true = !false } bool;
void _start()
{
int arr[5];
arr[0] = 1;
arr[1] = 2;
arr[2] = 3;
arr[3] = 4;
arr[4] = 5;
int tmp;
volatile int* tx = (volatile int*) 0x40002000;
for(int i = 0; i < 4; i++){
bool flag = false;
for(int j = 0; j < 4 - i; j++){
if(arr[j] > arr[j + 1]){
flag = true;
tmp = arr[j];
arr[j] = arr[j + 1];
arr[j + 1] = tmp;
}
}
if(!flag) break;
}
*tx = arr[4];
}
```
Objdump above c porgram
* In the beginning, it can't work on emulator because sp register initial value is 0, so sp-32 will make sp value be 0xffffffe0(-32). In emulator, ram size is not big enough to store address 0xfffffe0. As a result, exception occur when the address over its limit.
```cpp=
00010054 <_start>:
10054: fe010113 addi sp,sp,-32
10058: 00100793 li a5,1
1005c: 00f12623 sw a5,12(sp)
10060: 00200793 li a5,2
10064: 00f12823 sw a5,16(sp)
10068: 00300793 li a5,3
1006c: 00f12a23 sw a5,20(sp)
10070: 00400793 li a5,4
10074: 00f12c23 sw a5,24(sp)
10078: 00500793 li a5,5
1007c: 00f12e23 sw a5,28(sp)
10080: 00400713 li a4,4
10084: 00c10793 addi a5,sp,12
10088: 00000693 li a3,0
1008c: 0200006f j 100ac <_start+0x58>
10090: 0007a603 lw a2,0(a5)
10094: 0047a583 lw a1,4(a5)
10098: 00168693 addi a3,a3,1
1009c: 00c5d663 bge a1,a2,100a8 <_start+0x54>
100a0: 00b7a023 sw a1,0(a5)
100a4: 00c7a223 sw a2,4(a5)
100a8: 00478793 addi a5,a5,4
100ac: fee6c2e3 blt a3,a4,10090 <_start+0x3c>
100b0: fff70713 addi a4,a4,-1
100b4: fc0718e3 bnez a4,10084 <_start+0x30>
100b8: 01c12703 lw a4,28(sp)
100bc: 400027b7 lui a5,0x40002
100c0: 00e7a023 sw a4,0(a5) # 40002000 <__global_pointer$+0x3fff0734>
100c4: 02010113 addi sp,sp,32
100c8: 00008067 ret
```
How to fix this program
1. Don't use sp register which means modify our c program.
2. Modify sp initial value in emulator.
In these case, I modify the emulator to make my code can work normally.
* When open the ELF file,I give the reg2(sp) the highest value in memory which means ram_start + RAM_SIZE.
```cpp=
/* for compliance test */
if (strcmp(name, "_start") == 0) {
ram_start = sym.st_value;
printf("ram_start:0x%08x\n",ram_start);
reg[2] = ram_start + RAM_SIZE;
}
```
* After fix this problem, the program can run correctly in emulator. We can open debug mode to see if the instruction work properly.
```cpp=
ram_start:0x00010054
begin_signature: 0x00000000
end_signature: 0x00000000
start: 0x00010054
ignoring section at address 0x00000000
codesize: 0x00000078 (120)
codesize with offset: 204
[00010054]=fe010113, mtime: 17a2e930e4c5, mtimecmp: 0
>>> ADDI
[00010058]=00200793, mtime: 17a2e930e539, mtimecmp: 0
>>> ADDI
[0001005c]=00f12623, mtime: 17a2e930e59a, mtimecmp: 0
>>> SW
[00010060]=00100793, mtime: 17a2e930e601, mtimecmp: 0
>>> ADDI
[00010064]=00f12823, mtime: 17a2e930e669, mtimecmp: 0
>>> SW
[00010068]=00500793, mtime: 17a2e930e6cd, mtimecmp: 0
>>> ADDI
.
.
.
.
[000100b8]=01c12703, mtime: 17a2e9310831, mtimecmp: 0
>>> LW
[000100bc]=400027b7, mtime: 17a2e931087f, mtimecmp: 0
>>> LUI
[000100c0]=00e7a023, mtime: 17a2e93108cb, mtimecmp: 0
>>> SW
addr:0x40002000
value: 5
[000100c4]=02010113, mtime: 17a2e931094a, mtimecmp: 0
>>> ADDI
[000100c8]=00008067, mtime: 17a2e9310994, mtimecmp: 0
>>> JALR
[00000000]=00000001, mtime: 17a2e93109e4, mtimecmp: 0
raise_exception: illegal instruction 0x2 0x1
done interp 6e int=0 mstatus=0 prv=3
```
## Compare the assembly listing between handwritten and compiler optimized one.
-O2
* The different between handwritten and -O2 optimized is handwritten version use more sw,lw to get the data, and it use many jump instruction which can be combined by bge or blt.
```cpp=
00010054 <_start>:
10054: fe010113 addi sp,sp,-32
10058: 00200793 li a5,2
1005c: 00f12623 sw a5,12(sp)
10060: 00100793 li a5,1
10064: 00f12823 sw a5,16(sp)
10068: 00500793 li a5,5
1006c: 00f12a23 sw a5,20(sp)
10070: 00400793 li a5,4
10074: 00f12c23 sw a5,24(sp)
10078: 00300793 li a5,3
1007c: 00f12e23 sw a5,28(sp)
10080: 00400593 li a1,4 // i = 4
10084: 00000713 li a4,0 //i = 0
10088: 00c10793 addi a5,sp,12 //a5 = 12(sp)
1008c: 02b75263 bge a4,a1,100b0 <_start+0x5c>
10090: 0007a683 lw a3,0(a5) //first element
10094: 0047a603 lw a2,4(a5) //second element
10098: 00170713 addi a4,a4,1
1009c: 00d65663 bge a2,a3,100a8 <_start+0x54>
100a0: 00c7a023 sw a2,0(a5)
100a4: 00d7a223 sw a3,4(a5)
100a8: 00478793 addi a5,a5,4 //next element address
100ac: feb742e3 blt a4,a1,10090 <_start+0x3c>
100b0: fff58593 addi a1,a1,-1
100b4: fc0598e3 bnez a1,10084 <_start+0x30>
100b8: 01c12703 lw a4,28(sp)
100bc: 400027b7 lui a5,0x40002
100c0: 00e7a023 sw a4,0(a5) # 40002000 <__global_pointer$+0x3fff0734>
100c4: 02010113 addi sp,sp,32
100c8: 00008067 ret
```
-Os
* Os version are almost the same with the -O2 version.The only different is at PC = 1008c. In -Os version, it jump first and then do the blt. In -O2 version, it do the bge instruction directly.
```cpp=
00010054 <_start>:
10054: fe010113 addi sp,sp,-32
10058: 00200793 li a5,2
1005c: 00f12623 sw a5,12(sp)
10060: 00100793 li a5,1
10064: 00f12823 sw a5,16(sp)
10068: 00500793 li a5,5
1006c: 00f12a23 sw a5,20(sp)
10070: 00400793 li a5,4
10074: 00f12c23 sw a5,24(sp)
10078: 00300793 li a5,3
1007c: 00f12e23 sw a5,28(sp)
10080: 00400713 li a4,4
10084: 00c10793 addi a5,sp,12
10088: 00000693 li a3,0
1008c: 0200006f j 100ac <_start+0x58>
10090: 0007a603 lw a2,0(a5)
10094: 0047a583 lw a1,4(a5)
10098: 00168693 addi a3,a3,1
1009c: 00c5d663 bge a1,a2,100a8 <_start+0x54>
100a0: 00b7a023 sw a1,0(a5)
100a4: 00c7a223 sw a2,4(a5)
100a8: 00478793 addi a5,a5,4
100ac: fee6c2e3 blt a3,a4,10090 <_start+0x3c>
100b0: fff70713 addi a4,a4,-1
100b4: fc0718e3 bnez a4,10084 <_start+0x30>
100b8: 01c12703 lw a4,28(sp)
100bc: 400027b7 lui a5,0x40002
100c0: 00e7a023 sw a4,0(a5) # 40002000 <__global_pointer$+0x3fff0734>
100c4: 02010113 addi sp,sp,32
100c8: 00008067 ret
```
-O3
* In -O3 version we can see that the compiler store the sorting result in register a4, so the code size is very small.
```cpp=
00010054 <_start>:
10054: 00500713 li a4,5
10058: 400027b7 lui a5,0x40002
1005c: fe010113 addi sp,sp,-32
10060: 00e7a023 sw a4,0(a5) # 40002000 <__global_pointer$+0x3fff0794>
10064: 02010113 addi sp,sp,32
10068: 00008067 ret
```
## Statistics of emu-rv32i results
* true counter will increase when the branch condition is true.
* false counter will increase when the branch condition is false which means its nextPC is PC + 4.
* forward counter means nextPC > PC, which represent no matter nextPC is PC + 4, or nextPC is PC + (+imm) forward counter will increase.
* backward counter is on opposite of forward counter.
```cpp=
Instructions Stat:
LUI = 1
JAL = 4
JALR = 1
BNE = 4
BLT = 14
BGE = 10
LW = 21
SW = 14
ADDI = 40
LI* = 10
Five Most Frequent:
1) ADDI = 40 (36.36%)
2) LW = 21 (19.09%)
3) BLT = 14 (12.73%)
4) SW = 14 (12.73%)
5) BGE = 10 (9.09%)
Memory Reading Area 10054...20053
Memory Writing Area 20040...40002003
>>> Execution time: 960148 ns
>>> Instruction count: 110 (IPS=114565)
>>> Jumps: 24 (21.82%) - 10 forwards, 14 backwards
>>> Branching T=19 (67.86%) F=9 (32.14%)
```
# BinarySearch
## Pick up a assembly program from Assignment1
* choose [BinarySearch](https://github.com/murumura/risc-v) assembly program from [黃偉宸](https://hackmd.io/7oykWP_1SCKVAqAAZVKdUA?view).
```cpp=
# s0: base address of array
# s1: array size
# s2: the search element
# return index at $a5,if search element not found , put -1 in a5
.data
success1: .string "search success "
success2: .string " at index "
fail_string1: .string "search element "
fail_string2: .string " does not exist!"
array: .word 1, 4, 5, 7, 9, 12, 15, 17, 20, 21, 30
array_size: .word 11
search_element:.word 9
.text
main:
# binarySearch(int arr[], int l, int r, int x)
addi t0, zero, 0 #int l
la t1, array_size
lw t1, 0(t1) #int r
addi t1, t1, -1
add s10,t1,zero # s10 used to check if search is go out of bound of array
addi a5, zero, -1
la s2, search_element
lw s2, 0(s2) #s2 stands for search value
la s0, array # load the base address to $s0
jal ra, binary_search # Jump-and-link to the 'binarySearch' label
bltz a5, Fail #check if found or not
j exit
binary_search:
# a2 stand for mid = (l + r)/2
add a2, t0, t1
srli a2, a2, 1
#check if mid > array_size
blt s10,a2,Fail
#check if mid < 0
bltz a2,Fail
# check if array[mid]== search_element
add t2, a2, zero
slli t2, t2, 2 # t2=t2*4
add t2, t2, s0
lw t2, 0(t2)
beq t2, s2, Find
# check if to == t1 and still not equal
beq t0,t1,Fail
#not equal then adjust l,r depends on the value of array[mid]
blt s2, t2, less
greater: #elseif target > array[mid] : l = mid + 1
addi t0,a2,1
j binary_search
less: # @if target<array[mid] : r = mid-1
addi t1,a2,-1
j binary_search
ret
Fail:
addi a5,zero,-1
la a1, fail_string1
li a0, 4
ecall
mv a1, s2
li a0, 1
ecall
la a1, fail_string2
li a0, 4
ecall
j exit
Find:
add a5,a2,zero
la a1, success1
li a0, 4
ecall
mv a1, s2
li a0, 1
ecall
la a1, success2
li a0, 4
ecall
mv a1, a5
li a0, 1
ecall
j exit
exit:
```
## Rewrite assembly programs into C implementation
Rewrite binarySearch to execute with [rv32emu]((https://github.com/sysprog21/rv32emu)).
```cpp=
void _start() __attribute__ ((section(".text.startup")));
int binarySearch(int arr[], int l, int r, int x)
{
if (r >= l) {
int mid = l + (r - l) / 2;
// If the element is present at the middle itself
if (arr[mid] == x)
return mid;
// If element is smaller than mid, then it can only
// be present in left subarray
if (arr[mid] > x)
return binarySearch(arr, l, mid - 1, x);
// Else the element can only be present in right
// subarray
return binarySearch(arr, mid + 1, r, x);
}
// We reach here when element is not present in array
return -1;
}
void _start()
{
int arr[5];
arr[0] = 2;
arr[1] = 3;
arr[2] = 4;
arr[3] = 10;
arr[4] = 40;
int n = 5; //sizeof(arr)/sizeof(arr[0]);
int x = 10;
int result = binarySearch(arr, 0, n-1, x);
volatile int* tx = (volatile int*) 0x40002000;
*tx = result;
}
```
Objdump above c porgram
```cpp=
Disassembly of section .text:
00010054 <_start>:
10054: fd010113 addi sp,sp,-48
10058: 00200793 li a5,2
1005c: 00f12623 sw a5,12(sp)
10060: 00300793 li a5,3
10064: 00f12823 sw a5,16(sp)
10068: 00400793 li a5,4
1006c: 00f12a23 sw a5,20(sp)
10070: 00a00793 li a5,10
10074: 00f12c23 sw a5,24(sp)
10078: 00c10513 addi a0,sp,12
1007c: 02800793 li a5,40
10080: 00a00693 li a3,10
10084: 00400613 li a2,4
10088: 00000593 li a1,0
1008c: 02112623 sw ra,44(sp)
10090: 00f12e23 sw a5,28(sp)
10094: 018000ef jal ra,100ac <binarySearch>
10098: 02c12083 lw ra,44(sp)
1009c: 400027b7 lui a5,0x40002
100a0: 00a7a023 sw a0,0(a5) # 40002000 <__global_pointer$+0x3fff0714>
100a4: 03010113 addi sp,sp,48
100a8: 00008067 ret
000100ac <binarySearch>:
100ac: 02b64a63 blt a2,a1,100e0 <binarySearch+0x34>
100b0: 40b607b3 sub a5,a2,a1
100b4: 4017d793 srai a5,a5,0x1
100b8: 00b787b3 add a5,a5,a1
100bc: 00279713 slli a4,a5,0x2
100c0: 00e50733 add a4,a0,a4
100c4: 00072703 lw a4,0(a4)
100c8: 00d70e63 beq a4,a3,100e4 <binarySearch+0x38>
100cc: 00e6d663 bge a3,a4,100d8 <binarySearch+0x2c>
100d0: fff78613 addi a2,a5,-1
100d4: fd9ff06f j 100ac <binarySearch>
100d8: 00178593 addi a1,a5,1
100dc: fd1ff06f j 100ac <binarySearch>
100e0: fff00793 li a5,-1
100e4: 00078513 mv a0,a5
100e8: 00008067 ret
```
In the beginning, it can't work on emulator because instruction doesn't load to the ram so first PC can't get right instruction.
```cpp=
ram_start:0x00010094
begin_signature: 0x00000000
end_signature: 0x00000000
start: 0x00010094
ignoring section at address 0x00010054
ignoring section at address 0x00000000
codesize: 0x00000001 (1)
codesize with offset: 149
[00010094]=00000000, mtime: 1d6f79f9cc1e, mtimecmp: 0
raise_exception: illegal instruction 0x2 0x0
done interp 1 int=0 mstatus=0 prv=3
/* scan for program */
while ((scn = elf_nextscn(elf, scn)) != NULL) {
gelf_getshdr(scn, &shdr);
if (shdr.sh_type == SHT_PROGBITS) {
Elf_Data *data = elf_getdata(scn, NULL);
if (shdr.sh_addr >= ram_start) {
for (size_t i = 0; i < shdr.sh_size; i++) {
ram_curr = shdr.sh_addr + i - ram_start;
if(ram_curr >= RAM_SIZE)
{
#ifdef DEBUG_OUTPUT
printf("memory pointer outside of range 0x%08x (section at address 0x%08x)\n", ram_curr, (uint32_t) shdr.sh_addr);
#endif
/* break; */
}
else
{
ram[ram_curr] = ((uint8_t *)data->d_buf)[i];
if(ram_curr > ram_last) ram_last = ram_curr;
}
}
} else {
#ifdef DEBUG_OUTPUT
printf("ignoring section at address 0x%08x\n", (uint32_t) shdr.sh_addr);
#endif
}
}
}
```
```cpp=
00010054 <binarySearch>:
10054: 02b64a63 blt a2,a1,10088 <binarySearch+0x34>
10058: 40b607b3 sub a5,a2,a1
1005c: 4017d793 srai a5,a5,0x1
10060: 00b787b3 add a5,a5,a1
10064: 00279713 slli a4,a5,0x2
10068: 00e50733 add a4,a0,a4
1006c: 00072703 lw a4,0(a4)
10070: 00d70e63 beq a4,a3,1008c <binarySearch+0x38>
10074: 00e6d663 bge a3,a4,10080 <binarySearch+0x2c>
10078: fff78613 addi a2,a5,-1
1007c: fd9ff06f j 10054 <binarySearch>
10080: 00178593 addi a1,a5,1
10084: fd1ff06f j 10054 <binarySearch>
10088: fff00793 li a5,-1
1008c: 00078513 mv a0,a5
10090: 00008067 ret
00010094 <_start>:
10094: fd010113 addi sp,sp,-48
10098: 00200793 li a5,2
1009c: 00f12623 sw a5,12(sp)
100a0: 00300793 li a5,3
100a4: 00f12823 sw a5,16(sp)
100a8: 00400793 li a5,4
100ac: 00f12a23 sw a5,20(sp)
100b0: 00a00793 li a5,10
100b4: 00f12c23 sw a5,24(sp)
100b8: 00c10513 addi a0,sp,12
100bc: 02800793 li a5,40
100c0: 00a00693 li a3,10
100c4: 00400613 li a2,4
100c8: 00000593 li a1,0
100cc: 02112623 sw ra,44(sp)
100d0: 00f12e23 sw a5,28(sp)
100d4: f81ff0ef jal ra,10054 <binarySearch>
100d8: 02c12083 lw ra,44(sp)
100dc: 400027b7 lui a5,0x40002
100e0: 00a7a023 sw a0,0(a5) # 40002000 <__global_pointer$+0x3fff0714>
100e4: 03010113 addi sp,sp,48
100e8: 00008067 ret
```
The first PC need start from <_start>, so if we start from <binarySearch>, it will get fault. To fix this problem, I use ```riscv-none-embed-gcc -Wl,-verbose``` to see the default link script.
use ``` void _start() __attribute__ ((section(".text.startup")));``` to force start function to execute first then the emulator can work properly.
```cpp=
.text :
{
*(.text.unlikely .text.*_unlikely .text.unlikely.*)
*(.text.exit .text.exit.*)
*(.text.startup .text.startup.*)
*(.text.hot .text.hot.*)
*(.text .stub .text.* .gnu.linkonce.t.*)
/* .gnu.warning sections are handled specially by elf32.em. */
*(.gnu.warning)
}
```
## Statistics of emu-rv32i results
```cpp=
Instructions Stat:
LUI = 1
JAL = 2
JALR = 2
BEQ = 2
BLT = 2
BGE = 1
LW = 3
SW = 7
ADDI = 13
SLLI = 2
SRAI = 2
ADD = 4
SUB = 2
LI* = 8
Five Most Frequent:
1) ADDI = 13 (29.55%)
2) LI* = 8 (18.18%)
3) SW = 7 (15.91%)
4) ADD = 4 (9.09%)
5) LW = 3 (6.82%)
Memory Reading Area 10054...20053
Memory Writing Area 20030...40002003
>>> Execution time: 356054 ns
>>> Instruction count: 44 (IPS=123576)
>>> Jumps: 6 (13.64%) - 3 forwards, 3 backwards
>>> Branching T=2 (40.00%) F=3 (60.00%)
```