# Assignment2: RISC-V Toolchain ## 1.pick up two assembly programs from [Assignment1: RISC-V Assembly and Instruction Pipeline](https://hackmd.io/ZHsZ-7HBTvqtH8SoDHxQoQ) and rewrite into C implementations which can execute with [rv32emu](https://github.com/sysprog21/rv32emu). ### Bit Reverse Giving an input number and reverse every bit of the number. #### C code for emu-rv32 ```cpp= void _start() { volatile char* tx = (volatile char*) 0x40002000; unsigned int example = 0x12345678; unsigned int mask[5]; mask[4] = 0xffff0000; mask[3] = 0xff00ff00; mask[2] = 0xf0f0f0f0; mask[1] = 0xcccccccc; mask[0] = 0xaaaaaaaa; unsigned int tmp; for(int i = 4; i >= 0 ; i--){ tmp = example & mask[i]; tmp >>= (1U << i); mask[i] >>= (1U << i); example &= mask[i]; example <<= (1U << i); example |= tmp; } for(int i = 0; i < 8; i++){ tmp = ((example & 0xf0000000) >> 28); *tx = tmp > 9 ? tmp - 10 + 'a' : tmp + '0'; example <<= 4; } } ``` In first for loop every bits in the number `example` are reversed. In the first cycle of the loop the first 16 bits and the last 16 bits in the variable `example` are swapped, in the second cycle of the loop the first 8 bits and the last 8 bits of first 16 bits in the variable `example` will be swapped, the same as last 16 bits in the variable `example`. The variable `example` will be cut in half and be swapped in each slices every cycle of the loop. ![](https://i.imgur.com/qFPWQak.png) In second for loop the variable `example` will be sent to `tx` as hexadecimal type, and print the outcome in serial port. #### Run with emu-rv32 ``` $ make check ./emu-rv32i bit_reverse 1e6a2c48 ``` Binary number of 0X12345678: `00010010001101000101011001111000` Binary number of 0X1e6a2c48: `00011110011010100010110001001000` #### counter Because there are no branch type instruction, the value of `counter_true` and `counter_false` is zero. ![](https://i.imgur.com/51CYalb.png) #### Assembly Code Compairation ##### C code (v1) ```cpp= #include<stdio.h> int main() { unsigned int example = 0x12345678; unsigned int shift[5]; shift[4] = 0xffff0000; shift[3] = 0xff00ff00; shift[2] = 0xf0f0f0f0; shift[1] = 0xcccccccc; shift[0] = 0xaaaaaaaa; unsigned int tmp; for(int i = 4; i >= 0 ; i--){ tmp = example & shift[i]; tmp >>= (1U << i); shift[i] >>= (1U << i); example &= shift[i]; example <<= (1U << i); example |= tmp; } printf("%08X\n", example); return 0; } ``` Compile command ``` $ riscv32-unknown-elf-gcc -march=rv32i -mabi=ilp32 -O3 -nostdlib -S bit_reverse.c ``` Assembly code ```cpp .file "bit_reverse.c" .option nopic .attribute arch, "rv32i2p0" .attribute unaligned_access, 0 .attribute stack_align, 16 .text .section .rodata.str1.4,"aMS",@progbits,1 .align 2 .LC0: .string "%08X\n" .section .text.startup,"ax",@progbits .align 2 .globl main .type main, @function main: li a1,510275584 lui a0,%hi(.LC0) addi sp,sp,-16 addi a1,a1,-952 addi a0,a0,%lo(.LC0) sw ra,12(sp) call printf lw ra,12(sp) li a0,0 addi sp,sp,16 jr ra .size main, .-main .ident "GCC: (GNU) 9.2.0" ``` After compilation with `O3` we can find there are almost `load` and `store` instruction in the `elf` file, because the constant computation part is reduced by optimization of compilation, and it is hard to read. ##### C code v2 In this part, i replace the constant number `example` from `argv[1]` to see how compiler optimize the program. ```cpp= #include<stdio.h> int main(int argc, char **argv) { // unsigned int example = 0x12345678; unsigned int example = atoi(argv[1]); unsigned int shift[5]; shift[4] = 0xffff0000; shift[3] = 0xff00ff00; shift[2] = 0xf0f0f0f0; shift[1] = 0xcccccccc; shift[0] = 0xaaaaaaaa; unsigned int tmp; for(int i = 4; i >= 0 ; i--){ tmp = example & shift[i]; tmp >>= (1U << i); shift[i] >>= (1U << i); example &= shift[i]; example <<= (1U << i); example |= tmp; } printf("%08X\n", example); return 0; } ``` Compile command ``` $ riscv32-unknown-elf-gcc -march=rv32i -mabi=ilp32 -O3 -nostdlib -S bit_reverse.c ``` Assembly code ```cpp .file "bit_reverse.c" .option nopic .attribute arch, "rv32i2p0" .attribute unaligned_access, 0 .attribute stack_align, 16 .text .section .rodata.str1.4,"aMS",@progbits,1 .align 2 .LC0: .string "%08X\n" .section .text.startup,"ax",@progbits .align 2 .globl main .type main, @function main: lw a0,4(a1) addi sp,sp,-16 sw ra,12(sp) call atoi srli a4,a0,16 slli a0,a0,16 or a4,a4,a0 li a3,16711680 srli a5,a4,8 addi a3,a3,255 and a5,a5,a3 li a3,-16711680 addi a3,a3,-256 slli a4,a4,8 and a4,a4,a3 or a5,a5,a4 li a3,252645376 srli a4,a5,4 addi a3,a3,-241 and a4,a4,a3 li a3,-252645376 addi a3,a3,240 slli a5,a5,4 and a5,a5,a3 or a4,a4,a5 li a3,858992640 srli a5,a4,2 addi a3,a3,819 and a5,a5,a3 li a3,-858992640 addi a3,a3,-820 slli a4,a4,2 and a4,a4,a3 or a5,a5,a4 li a4,1431654400 srli a1,a5,1 addi a4,a4,1365 and a1,a1,a4 li a4,-1431654400 addi a4,a4,-1366 slli a5,a5,1 and a5,a5,a4 lui a0,%hi(.LC0) or a1,a1,a5 addi a0,a0,%lo(.LC0) call printf lw ra,12(sp) li a0,0 addi sp,sp,16 jr ra .size main, .-main .ident "GCC: (GNU) 9.2.0" ``` We can see if the target input is not a constant number, the compiler will not computing the output in the first and store it. ### Prime Number Find the prime number between 1~20 #### C code for emu-rv32 ```cpp= void _start() { const char * error = "Illegle input!\n"; const char * IsPrime = " is prime!\n"; volatile char* tx = (volatile char*) 0x40002000; int prime = 1;//prime or not //check 1 to 20 is prime or not for(int i = 0; i < 20; i++){ int last = i/2; // 0 and 1 is not defined if(i <= 1){ /*Print illegle input*/ while(*error){ *tx = *error; error++; } prime = 0; } // if there is some number j that (i%j) equals to 0 // then i is not a prime number for(int j = 2; j <= last; j++){ if(!(i % j)){ prime = 0; } } //print the prime number if(prime){ //Convert int to char if(i >= 10){ *tx = (i / 10) + '0'; *tx = (i - 10) + '0'; } else{ *tx = i + '0'; } //Print the result char * tmp = IsPrime; while(*tmp){ *tx = *tmp; tmp++; } } prime = 1; } } ``` The variable `i` means our input number ranging from 1 to 20, and if we can find some number (variable `j`) satisfied the equation `(i%j)==0` we can say this number `i` is not a prime number, or it is prime. #### Run with emu-rv32 ``` $ make check ./emu-rv32i test1 Illegle input! 2 is prime! 3 is prime! 5 is prime! 7 is prime! 11 is prime! 13 is prime! 17 is prime! 19 is prime! >>> Execution time: 696634 ns >>> Instruction count: 881 (IPS=1264652) >>> Jumps: 171 (19.41%) - 35 forwards, 136 backwards >>> Branching T=165 (51.72%) F=154 (48.28%) ``` There are alomst a half `branch true`, maybe we can rearrange the assembly code to reduce `breanch true` so that reducing the `stall` in the cpu. :::warning ### Operation support problem The `rv32i` architecture doesn't support `div` and `mod` operation which used by finding `prime number`, hence compiling the `prime.c` with `-march=rv32im` argument to support those operation. Furthermore commenting the `#define STRICT_RV32I` in the emu-rv32i.c to make emu-rv32 support `div` and `mod` operation. ### Stack support problem Even though i don't use function call in the `prime.c` and compiling it with `O3`, the assembly code after compilation also includes using of `stack pointer`. However, the `sp` register in the `emu-rv32.c` doesn't be initialized which resulting in error when excution. To solve this problem, we have to initialize the register `sp` in the main function of `emu-rv32.c`, giving `reg[2] = ram_start + RAM_SIZE` which is the highest ram address. ::: #### Assembly code compairation C code ```cpp= #include<stdio.h> int main() { int flag = 1; for(int i = 0; i < 20; i++){ int last = i/2; if(i <= 1){ /*Print illegle input*/ printf("Illegle input\n"); } for(int j = 2; j <= last; j++){ if(!(i % j)) flag = 0; } if(flag){ printf("%d\n", i); } flag = 1; } } ``` Compile command ``` $ riscv32-unknown-elf-gcc -march=rv32im -mabi=ilp32 -O3 -nostdlib prime.c -o prime ``` Assembly code ```cpp= .file "mod.c" .option nopic .attribute arch, "rv32i2p0" .attribute unaligned_access, 0 .attribute stack_align, 16 .text .section .rodata.str1.4,"aMS",@progbits,1 .align 2 .LC0: .string "Illegle input" .globl __umodsi3 .globl __modsi3 .align 2 .LC1: .string "%d\n" .section .text.startup,"ax",@progbits .align 2 .globl main .type main, @function main: addi sp,sp,-48 sw s0,40(sp) sw s3,28(sp) sw s4,24(sp) sw s5,20(sp) sw s6,16(sp) sw s7,12(sp) sw s8,8(sp) sw s9,4(sp) sw ra,44(sp) sw s1,36(sp) sw s2,32(sp) li s0,0 li s3,1 lui s9,%hi(.LC0) lui s6,%hi(.LC1) li s4,2 li s5,4 li s7,9 li s8,18 .L13: srai s1,s0,1 ble s0,s3,.L41 .L2: ble s1,s3,.L3 andi s2,s0,1 bne s2,zero,.L42 beq s1,s4,.L12 li a1,3 mv a0,s0 call __modsi3 li a5,3 beq a0,zero,.L6 .L43: beq s1,a5,.L7 andi a5,s0,3 beq a5,zero,.L8 .L44: beq s1,s5,.L7 li a1,5 mv a0,s0 call __modsi3 li a5,5 beq a0,zero,.L9 .L45: beq s1,a5,.L7 li a1,6 mv a0,s0 call __modsi3 li a5,6 beq a0,zero,.L10 .L46: beq s1,a5,.L7 li a1,7 mv a0,s0 call __modsi3 li a5,7 beq a0,zero,.L11 .L47: beq s1,a5,.L7 .L14: andi a5,s0,7 beq a5,zero,.L12 bne s1,s7,.L7 beq s0,s8,.L12 .L7: beq s2,zero,.L12 .L3: mv a1,s0 addi a0,s6,%lo(.LC1) call printf .L12: addi s0,s0,1 li a5,20 bne s0,a5,.L13 lw ra,44(sp) lw s0,40(sp) lw s1,36(sp) lw s2,32(sp) lw s3,28(sp) lw s4,24(sp) lw s5,20(sp) lw s6,16(sp) lw s7,12(sp) lw s8,8(sp) lw s9,4(sp) li a0,0 addi sp,sp,48 jr ra .L41: addi a0,s9,%lo(.LC0) call puts j .L2 .L42: beq s1,s4,.L3 li a1,3 mv a0,s0 call __modsi3 li a5,3 bne a0,zero,.L43 .L6: li s2,0 beq s1,a5,.L12 andi a5,s0,3 bne a5,zero,.L44 .L8: li s2,0 beq s1,s5,.L12 li a1,5 mv a0,s0 call __modsi3 li a5,5 bne a0,zero,.L45 .L9: li s2,0 beq s1,a5,.L12 li a1,6 mv a0,s0 call __modsi3 li a5,6 bne a0,zero,.L46 .L10: li s2,0 beq s1,a5,.L12 li a1,7 mv a0,s0 call __modsi3 li a5,7 bne a0,zero,.L47 .L11: li s2,0 bne s1,a5,.L14 j .L12 .size main, .-main .ident "GCC: (GNU) 9.2.0" ``` ### Counter #### Defination ![](https://i.imgur.com/8NSbNhK.png) - jump_counter - Counting the times of `jump` in the program including `branch true` - backward_counter - Counting the times of `jump backward` in the program, it means the target address is smaller than pc - forward_counter - Counting the times of `jump forward` in the program, it means the target address is bigger than pc - true_counter - Counting how many times `branch true` in the program - false_counter - Counting how many times `branch false` in the program #### Jump instruction ![](https://i.imgur.com/uwCzNFt.png) #### Branch instruction ![](https://i.imgur.com/jGr1c8S.png) ###### tags: `Computer archetecture` `Assignment`