# Assignment2: RISC-V Toolchain contributed by < `dck9661` > # Bubble Sort ## Pick up a assembly program from Assignment1 * choose [bubble sort](https://github.com/Shengyuu/Assignment_computer_arch) assembly program from [林聖堯](https://hackmd.io/@_01X9rimQmWH33Djf8QhoA/HJ1QiTzYB). ```cpp= main: #create stack addi sp,sp,-48 #save s0 sw s0,44(sp) #update s0 addi s0,sp,48 #init arr[] to memory li a5,3 sw a5,-48(s0) li a5,5 sw a5,-44(s0) li a5,1 sw a5,-40(s0) li a5,2 sw a5,-36(s0) li a5,4 sw a5,-32(s0) #init i to memory sw zero,-20(s0) j .L2 .L6: #init j to memory sw zero,-24(s0) j .L3 .L5: #load arr[ j ] to a4 lw a5,-24(s0) slli a5,a5,2 addi a4,s0,-16 add a5,a4,a5 lw a4,-32(a5) #load arr[j+1] a5 lw a5,-24(s0) addi a5,a5,1 slli a5,a5,2 addi a3,s0,-16 add a5,a3,a5 lw a5,-32(a5) #if arr[j + 1] > arr[j] bge a5,a4,.L4 # arr[j+1] < arr[j] #load arr[j] a5 lw a5,-24(s0) slli a5,a5,2 addi a4,s0,-16 add a5,a4,a5 lw a5,-32(a5) #store arr[j] to tmp sw a5,-28(s0) #load arr[j+1] to a4 lw a5,-24(s0) addi a5,a5,1 slli a5,a5,2 addi a4,s0,-16 add a5,a4,a5 lw a4,-32(a5) #arr[j] = arr[j+1] lw a5,-24(s0) slli a5,a5,2 addi a3,s0,-16 add a5,a3,a5 sw a4,-32(a5) #store tmp to arr[j+1] lw a5,-24(s0) addi a5,a5,1 slli a5,a5,2 addi a4,s0,-16 add a5,a4,a5 lw a4,-28(s0) sw a4,-32(a5) .L4: #j++ lw a5,-24(s0) addi a5,a5,1 sw a5,-24(s0) .L3: #check (j - i) < 4 li a4,4 lw a5,-20(s0) sub a5,a4,a5 lw a4,-24(s0) blt a4,a5,.L5 #i++ lw a5,-20(s0) addi a5,a5,1 sw a5,-20(s0) .L2: #check i < 4 lw a4,-20(s0) li a5,3 bge a5,a4,.L6 #return li a5,0 mv a0,a5 lw s0,44(sp) addi sp,sp,48 jr ra ``` * bubble sort C program (this is provided by [林聖堯](https://hackmd.io/@_01X9rimQmWH33Djf8QhoA/HJ1QiTzYB) ) ```cpp= #include <stdio.h> int main() { int arr[5]; arr[0] = 3; arr[1] = 5; arr[2] = 1; arr[3] = 2; arr[4] = 4; int tmp; for(int i = 0; i < 4; i++){ for(int j = 0; j < 4 - i; j++){ if(arr[j] > arr[j + 1]){ tmp = arr[j]; arr[j] = arr[j + 1]; arr[j + 1] = tmp; } } } return 0; } ``` ## Rewrite assembly programs into C implementation This bubble sort version is implemented by myself. ```cpp= #include <stdio.h> typedef enum { false = 0, true = !false } bool; int main() { int arr[5]; arr[0] = 3; arr[1] = 5; arr[2] = 1; arr[3] = 2; arr[4] = 4; int tmp; for(int i = 0; i < 4; i++){ bool flag = false; for(int j = 0; j < 4 - i; j++){ if(arr[j] > arr[j + 1]){ flag = true; tmp = arr[j]; arr[j] = arr[j + 1]; arr[j + 1] = tmp; } } if(!flag) break; } return 0; } ``` Rewrite bubble sort to execute with [rv32emu]((https://github.com/sysprog21/rv32emu)). ```cpp= typedef enum { false = 0, true = !false } bool; void _start() { int arr[5]; arr[0] = 1; arr[1] = 2; arr[2] = 3; arr[3] = 4; arr[4] = 5; int tmp; volatile int* tx = (volatile int*) 0x40002000; for(int i = 0; i < 4; i++){ bool flag = false; for(int j = 0; j < 4 - i; j++){ if(arr[j] > arr[j + 1]){ flag = true; tmp = arr[j]; arr[j] = arr[j + 1]; arr[j + 1] = tmp; } } if(!flag) break; } *tx = arr[4]; } ``` Objdump above c porgram * In the beginning, it can't work on emulator because sp register initial value is 0, so sp-32 will make sp value be 0xffffffe0(-32). In emulator, ram size is not big enough to store address 0xfffffe0. As a result, exception occur when the address over its limit. ```cpp= 00010054 <_start>: 10054: fe010113 addi sp,sp,-32 10058: 00100793 li a5,1 1005c: 00f12623 sw a5,12(sp) 10060: 00200793 li a5,2 10064: 00f12823 sw a5,16(sp) 10068: 00300793 li a5,3 1006c: 00f12a23 sw a5,20(sp) 10070: 00400793 li a5,4 10074: 00f12c23 sw a5,24(sp) 10078: 00500793 li a5,5 1007c: 00f12e23 sw a5,28(sp) 10080: 00400713 li a4,4 10084: 00c10793 addi a5,sp,12 10088: 00000693 li a3,0 1008c: 0200006f j 100ac <_start+0x58> 10090: 0007a603 lw a2,0(a5) 10094: 0047a583 lw a1,4(a5) 10098: 00168693 addi a3,a3,1 1009c: 00c5d663 bge a1,a2,100a8 <_start+0x54> 100a0: 00b7a023 sw a1,0(a5) 100a4: 00c7a223 sw a2,4(a5) 100a8: 00478793 addi a5,a5,4 100ac: fee6c2e3 blt a3,a4,10090 <_start+0x3c> 100b0: fff70713 addi a4,a4,-1 100b4: fc0718e3 bnez a4,10084 <_start+0x30> 100b8: 01c12703 lw a4,28(sp) 100bc: 400027b7 lui a5,0x40002 100c0: 00e7a023 sw a4,0(a5) # 40002000 <__global_pointer$+0x3fff0734> 100c4: 02010113 addi sp,sp,32 100c8: 00008067 ret ``` How to fix this program 1. Don't use sp register which means modify our c program. 2. Modify sp initial value in emulator. In these case, I modify the emulator to make my code can work normally. * When open the ELF file,I give the reg2(sp) the highest value in memory which means ram_start + RAM_SIZE. ```cpp= /* for compliance test */ if (strcmp(name, "_start") == 0) { ram_start = sym.st_value; printf("ram_start:0x%08x\n",ram_start); reg[2] = ram_start + RAM_SIZE; } ``` * After fix this problem, the program can run correctly in emulator. We can open debug mode to see if the instruction work properly. ```cpp= ram_start:0x00010054 begin_signature: 0x00000000 end_signature: 0x00000000 start: 0x00010054 ignoring section at address 0x00000000 codesize: 0x00000078 (120) codesize with offset: 204 [00010054]=fe010113, mtime: 17a2e930e4c5, mtimecmp: 0 >>> ADDI [00010058]=00200793, mtime: 17a2e930e539, mtimecmp: 0 >>> ADDI [0001005c]=00f12623, mtime: 17a2e930e59a, mtimecmp: 0 >>> SW [00010060]=00100793, mtime: 17a2e930e601, mtimecmp: 0 >>> ADDI [00010064]=00f12823, mtime: 17a2e930e669, mtimecmp: 0 >>> SW [00010068]=00500793, mtime: 17a2e930e6cd, mtimecmp: 0 >>> ADDI . . . . [000100b8]=01c12703, mtime: 17a2e9310831, mtimecmp: 0 >>> LW [000100bc]=400027b7, mtime: 17a2e931087f, mtimecmp: 0 >>> LUI [000100c0]=00e7a023, mtime: 17a2e93108cb, mtimecmp: 0 >>> SW addr:0x40002000 value: 5 [000100c4]=02010113, mtime: 17a2e931094a, mtimecmp: 0 >>> ADDI [000100c8]=00008067, mtime: 17a2e9310994, mtimecmp: 0 >>> JALR [00000000]=00000001, mtime: 17a2e93109e4, mtimecmp: 0 raise_exception: illegal instruction 0x2 0x1 done interp 6e int=0 mstatus=0 prv=3 ``` ## Compare the assembly listing between handwritten and compiler optimized one. -O2 * The different between handwritten and -O2 optimized is handwritten version use more sw,lw to get the data, and it use many jump instruction which can be combined by bge or blt. ```cpp= 00010054 <_start>: 10054: fe010113 addi sp,sp,-32 10058: 00200793 li a5,2 1005c: 00f12623 sw a5,12(sp) 10060: 00100793 li a5,1 10064: 00f12823 sw a5,16(sp) 10068: 00500793 li a5,5 1006c: 00f12a23 sw a5,20(sp) 10070: 00400793 li a5,4 10074: 00f12c23 sw a5,24(sp) 10078: 00300793 li a5,3 1007c: 00f12e23 sw a5,28(sp) 10080: 00400593 li a1,4 // i = 4 10084: 00000713 li a4,0 //i = 0 10088: 00c10793 addi a5,sp,12 //a5 = 12(sp) 1008c: 02b75263 bge a4,a1,100b0 <_start+0x5c> 10090: 0007a683 lw a3,0(a5) //first element 10094: 0047a603 lw a2,4(a5) //second element 10098: 00170713 addi a4,a4,1 1009c: 00d65663 bge a2,a3,100a8 <_start+0x54> 100a0: 00c7a023 sw a2,0(a5) 100a4: 00d7a223 sw a3,4(a5) 100a8: 00478793 addi a5,a5,4 //next element address 100ac: feb742e3 blt a4,a1,10090 <_start+0x3c> 100b0: fff58593 addi a1,a1,-1 100b4: fc0598e3 bnez a1,10084 <_start+0x30> 100b8: 01c12703 lw a4,28(sp) 100bc: 400027b7 lui a5,0x40002 100c0: 00e7a023 sw a4,0(a5) # 40002000 <__global_pointer$+0x3fff0734> 100c4: 02010113 addi sp,sp,32 100c8: 00008067 ret ``` -Os * Os version are almost the same with the -O2 version.The only different is at PC = 1008c. In -Os version, it jump first and then do the blt. In -O2 version, it do the bge instruction directly. ```cpp= 00010054 <_start>: 10054: fe010113 addi sp,sp,-32 10058: 00200793 li a5,2 1005c: 00f12623 sw a5,12(sp) 10060: 00100793 li a5,1 10064: 00f12823 sw a5,16(sp) 10068: 00500793 li a5,5 1006c: 00f12a23 sw a5,20(sp) 10070: 00400793 li a5,4 10074: 00f12c23 sw a5,24(sp) 10078: 00300793 li a5,3 1007c: 00f12e23 sw a5,28(sp) 10080: 00400713 li a4,4 10084: 00c10793 addi a5,sp,12 10088: 00000693 li a3,0 1008c: 0200006f j 100ac <_start+0x58> 10090: 0007a603 lw a2,0(a5) 10094: 0047a583 lw a1,4(a5) 10098: 00168693 addi a3,a3,1 1009c: 00c5d663 bge a1,a2,100a8 <_start+0x54> 100a0: 00b7a023 sw a1,0(a5) 100a4: 00c7a223 sw a2,4(a5) 100a8: 00478793 addi a5,a5,4 100ac: fee6c2e3 blt a3,a4,10090 <_start+0x3c> 100b0: fff70713 addi a4,a4,-1 100b4: fc0718e3 bnez a4,10084 <_start+0x30> 100b8: 01c12703 lw a4,28(sp) 100bc: 400027b7 lui a5,0x40002 100c0: 00e7a023 sw a4,0(a5) # 40002000 <__global_pointer$+0x3fff0734> 100c4: 02010113 addi sp,sp,32 100c8: 00008067 ret ``` -O3 * In -O3 version we can see that the compiler store the sorting result in register a4, so the code size is very small. ```cpp= 00010054 <_start>: 10054: 00500713 li a4,5 10058: 400027b7 lui a5,0x40002 1005c: fe010113 addi sp,sp,-32 10060: 00e7a023 sw a4,0(a5) # 40002000 <__global_pointer$+0x3fff0794> 10064: 02010113 addi sp,sp,32 10068: 00008067 ret ``` ## Statistics of emu-rv32i results * true counter will increase when the branch condition is true. * false counter will increase when the branch condition is false which means its nextPC is PC + 4. * forward counter means nextPC > PC, which represent no matter nextPC is PC + 4, or nextPC is PC + (+imm) forward counter will increase. * backward counter is on opposite of forward counter. ```cpp= Instructions Stat: LUI = 1 JAL = 4 JALR = 1 BNE = 4 BLT = 14 BGE = 10 LW = 21 SW = 14 ADDI = 40 LI* = 10 Five Most Frequent: 1) ADDI = 40 (36.36%) 2) LW = 21 (19.09%) 3) BLT = 14 (12.73%) 4) SW = 14 (12.73%) 5) BGE = 10 (9.09%) Memory Reading Area 10054...20053 Memory Writing Area 20040...40002003 >>> Execution time: 960148 ns >>> Instruction count: 110 (IPS=114565) >>> Jumps: 24 (21.82%) - 10 forwards, 14 backwards >>> Branching T=19 (67.86%) F=9 (32.14%) ``` # BinarySearch ## Pick up a assembly program from Assignment1 * choose [BinarySearch](https://github.com/murumura/risc-v) assembly program from [黃偉宸](https://hackmd.io/7oykWP_1SCKVAqAAZVKdUA?view). ```cpp= # s0: base address of array # s1: array size # s2: the search element # return index at $a5,if search element not found , put -1 in a5 .data success1: .string "search success " success2: .string " at index " fail_string1: .string "search element " fail_string2: .string " does not exist!" array: .word 1, 4, 5, 7, 9, 12, 15, 17, 20, 21, 30 array_size: .word 11 search_element:.word 9 .text main: # binarySearch(int arr[], int l, int r, int x) addi t0, zero, 0 #int l la t1, array_size lw t1, 0(t1) #int r addi t1, t1, -1 add s10,t1,zero # s10 used to check if search is go out of bound of array addi a5, zero, -1 la s2, search_element lw s2, 0(s2) #s2 stands for search value la s0, array # load the base address to $s0 jal ra, binary_search # Jump-and-link to the 'binarySearch' label bltz a5, Fail #check if found or not j exit binary_search: # a2 stand for mid = (l + r)/2 add a2, t0, t1 srli a2, a2, 1 #check if mid > array_size blt s10,a2,Fail #check if mid < 0 bltz a2,Fail # check if array[mid]== search_element add t2, a2, zero slli t2, t2, 2 # t2=t2*4 add t2, t2, s0 lw t2, 0(t2) beq t2, s2, Find # check if to == t1 and still not equal beq t0,t1,Fail #not equal then adjust l,r depends on the value of array[mid] blt s2, t2, less greater: #elseif target > array[mid] : l = mid + 1 addi t0,a2,1 j binary_search less: # @if target<array[mid] : r = mid-1 addi t1,a2,-1 j binary_search ret Fail: addi a5,zero,-1 la a1, fail_string1 li a0, 4 ecall mv a1, s2 li a0, 1 ecall la a1, fail_string2 li a0, 4 ecall j exit Find: add a5,a2,zero la a1, success1 li a0, 4 ecall mv a1, s2 li a0, 1 ecall la a1, success2 li a0, 4 ecall mv a1, a5 li a0, 1 ecall j exit exit: ``` ## Rewrite assembly programs into C implementation Rewrite binarySearch to execute with [rv32emu]((https://github.com/sysprog21/rv32emu)). ```cpp= void _start() __attribute__ ((section(".text.startup"))); int binarySearch(int arr[], int l, int r, int x) { if (r >= l) { int mid = l + (r - l) / 2; // If the element is present at the middle itself if (arr[mid] == x) return mid; // If element is smaller than mid, then it can only // be present in left subarray if (arr[mid] > x) return binarySearch(arr, l, mid - 1, x); // Else the element can only be present in right // subarray return binarySearch(arr, mid + 1, r, x); } // We reach here when element is not present in array return -1; } void _start() { int arr[5]; arr[0] = 2; arr[1] = 3; arr[2] = 4; arr[3] = 10; arr[4] = 40; int n = 5; //sizeof(arr)/sizeof(arr[0]); int x = 10; int result = binarySearch(arr, 0, n-1, x); volatile int* tx = (volatile int*) 0x40002000; *tx = result; } ``` Objdump above c porgram ```cpp= Disassembly of section .text: 00010054 <_start>: 10054: fd010113 addi sp,sp,-48 10058: 00200793 li a5,2 1005c: 00f12623 sw a5,12(sp) 10060: 00300793 li a5,3 10064: 00f12823 sw a5,16(sp) 10068: 00400793 li a5,4 1006c: 00f12a23 sw a5,20(sp) 10070: 00a00793 li a5,10 10074: 00f12c23 sw a5,24(sp) 10078: 00c10513 addi a0,sp,12 1007c: 02800793 li a5,40 10080: 00a00693 li a3,10 10084: 00400613 li a2,4 10088: 00000593 li a1,0 1008c: 02112623 sw ra,44(sp) 10090: 00f12e23 sw a5,28(sp) 10094: 018000ef jal ra,100ac <binarySearch> 10098: 02c12083 lw ra,44(sp) 1009c: 400027b7 lui a5,0x40002 100a0: 00a7a023 sw a0,0(a5) # 40002000 <__global_pointer$+0x3fff0714> 100a4: 03010113 addi sp,sp,48 100a8: 00008067 ret 000100ac <binarySearch>: 100ac: 02b64a63 blt a2,a1,100e0 <binarySearch+0x34> 100b0: 40b607b3 sub a5,a2,a1 100b4: 4017d793 srai a5,a5,0x1 100b8: 00b787b3 add a5,a5,a1 100bc: 00279713 slli a4,a5,0x2 100c0: 00e50733 add a4,a0,a4 100c4: 00072703 lw a4,0(a4) 100c8: 00d70e63 beq a4,a3,100e4 <binarySearch+0x38> 100cc: 00e6d663 bge a3,a4,100d8 <binarySearch+0x2c> 100d0: fff78613 addi a2,a5,-1 100d4: fd9ff06f j 100ac <binarySearch> 100d8: 00178593 addi a1,a5,1 100dc: fd1ff06f j 100ac <binarySearch> 100e0: fff00793 li a5,-1 100e4: 00078513 mv a0,a5 100e8: 00008067 ret ``` In the beginning, it can't work on emulator because instruction doesn't load to the ram so first PC can't get right instruction. ```cpp= ram_start:0x00010094 begin_signature: 0x00000000 end_signature: 0x00000000 start: 0x00010094 ignoring section at address 0x00010054 ignoring section at address 0x00000000 codesize: 0x00000001 (1) codesize with offset: 149 [00010094]=00000000, mtime: 1d6f79f9cc1e, mtimecmp: 0 raise_exception: illegal instruction 0x2 0x0 done interp 1 int=0 mstatus=0 prv=3 /* scan for program */ while ((scn = elf_nextscn(elf, scn)) != NULL) { gelf_getshdr(scn, &shdr); if (shdr.sh_type == SHT_PROGBITS) { Elf_Data *data = elf_getdata(scn, NULL); if (shdr.sh_addr >= ram_start) { for (size_t i = 0; i < shdr.sh_size; i++) { ram_curr = shdr.sh_addr + i - ram_start; if(ram_curr >= RAM_SIZE) { #ifdef DEBUG_OUTPUT printf("memory pointer outside of range 0x%08x (section at address 0x%08x)\n", ram_curr, (uint32_t) shdr.sh_addr); #endif /* break; */ } else { ram[ram_curr] = ((uint8_t *)data->d_buf)[i]; if(ram_curr > ram_last) ram_last = ram_curr; } } } else { #ifdef DEBUG_OUTPUT printf("ignoring section at address 0x%08x\n", (uint32_t) shdr.sh_addr); #endif } } } ``` ```cpp= 00010054 <binarySearch>: 10054: 02b64a63 blt a2,a1,10088 <binarySearch+0x34> 10058: 40b607b3 sub a5,a2,a1 1005c: 4017d793 srai a5,a5,0x1 10060: 00b787b3 add a5,a5,a1 10064: 00279713 slli a4,a5,0x2 10068: 00e50733 add a4,a0,a4 1006c: 00072703 lw a4,0(a4) 10070: 00d70e63 beq a4,a3,1008c <binarySearch+0x38> 10074: 00e6d663 bge a3,a4,10080 <binarySearch+0x2c> 10078: fff78613 addi a2,a5,-1 1007c: fd9ff06f j 10054 <binarySearch> 10080: 00178593 addi a1,a5,1 10084: fd1ff06f j 10054 <binarySearch> 10088: fff00793 li a5,-1 1008c: 00078513 mv a0,a5 10090: 00008067 ret 00010094 <_start>: 10094: fd010113 addi sp,sp,-48 10098: 00200793 li a5,2 1009c: 00f12623 sw a5,12(sp) 100a0: 00300793 li a5,3 100a4: 00f12823 sw a5,16(sp) 100a8: 00400793 li a5,4 100ac: 00f12a23 sw a5,20(sp) 100b0: 00a00793 li a5,10 100b4: 00f12c23 sw a5,24(sp) 100b8: 00c10513 addi a0,sp,12 100bc: 02800793 li a5,40 100c0: 00a00693 li a3,10 100c4: 00400613 li a2,4 100c8: 00000593 li a1,0 100cc: 02112623 sw ra,44(sp) 100d0: 00f12e23 sw a5,28(sp) 100d4: f81ff0ef jal ra,10054 <binarySearch> 100d8: 02c12083 lw ra,44(sp) 100dc: 400027b7 lui a5,0x40002 100e0: 00a7a023 sw a0,0(a5) # 40002000 <__global_pointer$+0x3fff0714> 100e4: 03010113 addi sp,sp,48 100e8: 00008067 ret ``` The first PC need start from <_start>, so if we start from <binarySearch>, it will get fault. To fix this problem, I use ```riscv-none-embed-gcc -Wl,-verbose``` to see the default link script. use ``` void _start() __attribute__ ((section(".text.startup")));``` to force start function to execute first then the emulator can work properly. ```cpp= .text : { *(.text.unlikely .text.*_unlikely .text.unlikely.*) *(.text.exit .text.exit.*) *(.text.startup .text.startup.*) *(.text.hot .text.hot.*) *(.text .stub .text.* .gnu.linkonce.t.*) /* .gnu.warning sections are handled specially by elf32.em. */ *(.gnu.warning) } ``` ## Statistics of emu-rv32i results ```cpp= Instructions Stat: LUI = 1 JAL = 2 JALR = 2 BEQ = 2 BLT = 2 BGE = 1 LW = 3 SW = 7 ADDI = 13 SLLI = 2 SRAI = 2 ADD = 4 SUB = 2 LI* = 8 Five Most Frequent: 1) ADDI = 13 (29.55%) 2) LI* = 8 (18.18%) 3) SW = 7 (15.91%) 4) ADD = 4 (9.09%) 5) LW = 3 (6.82%) Memory Reading Area 10054...20053 Memory Writing Area 20030...40002003 >>> Execution time: 356054 ns >>> Instruction count: 44 (IPS=123576) >>> Jumps: 6 (13.64%) - 3 forwards, 3 backwards >>> Branching T=2 (40.00%) F=3 (60.00%) ```