Assignment2: GNU Toolchain

# Assignment2: GNU Toolchain contributed by [spadee27357](https://github.com/spadee27357) ## Question selection I chose the [Implement Binarization by count leading zero](https://hackmd.io/@edenlin/CompArchi_HW1) from [林勁羽](https://github.com/JinYu1225/2023_Computer_Architecture/tree/main/Lab1) The reason I chose this topic is that in image recognition tasks, it's common to encounter the use of binarization, a technique for image processing. This method is incredibly practical across various fields. ## Original Code **C Code** ```c #include <stdint.h> #include <stdio.h> uint32_t count_leading_zeros_32(uint32_t x) { x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); x -= ((x >> 1) & 0x55555555); x = ((x >> 2) & 0x33333333) + (x & 0x33333333); x = ((x >> 4) + x) & 0x0f0f0f0f; x += (x >> 8); x += (x >> 16); return (32 - (x & 0x3f)); // change 0x7f to 0x3f } int main(){ // pixel test // 8-bit color depth for black and white photo uint32_t picture[5] = {20,80,128,150,231}; uint32_t threshold = 128; uint32_t *pixel = &picture; for(int i = 0; i < 5; i++){ uint32_t sub = threshold - *(pixel+i); printf("%d, ",i); printf("before = %d, ",*(pixel+i)); sub = count_leading_zeros_32(sub); if(sub) *(pixel+i) = 0; else *(pixel+i) = 255; printf("after = %d\n",*(pixel+i)); } return 0; } ``` **Assembly code** ``` # input the image from "test" # output the binariztion result to "result" # "threshold" can be change from 0 to 255 # each value of "test" can be change from 0 to 255 # if the number of "test" being change, # the number of "result" and the "s1" parameter in "main" should be change as well .data test: .byte 20, 80, 128, 150, 231 result: .byte 0, 0, 0, 0, 0 threshold: .byte 0x80 mask1: .word 0x55555555 mask2: .word 0x33333333 mask4: .word 0x0f0f0f0f str1: .string " " str2: .string "\nThe Binarization Result:\n" .text main: # a1 = address of pixel # a2 = address of result # a3 = threshold la a1, test # load address of pixel la a2, result # load address of result la a3, threshold # load address of threshold lbu a3, 0(a3) # load value of threshold BINARIZATION: li s1, 5 # count of haven't done pixel B_LOOP: lbu a4, 0(a1) # load value of pixel sub a0, a3, a4 # threshold - pixel jal ra, CLZ # jump to CLZ bne a0, x0, FLOOR # branch to FLOOR if CLZ = 0 CEIL: li a4, 255 j STORE_MOVE # jump to STORE_MOVE FLOOR: li a4, 0 STORE_MOVE: sb a4, 0(a2) addi s1, s1, -1 # total count -1 addi a1, a1, 1 # go to next address of pixel addi a2, a2, 1 # go to next address of result bne s1, x0, B_LOOP li s1, 5 # make count for RESULT_CHECK sub a2, a2, s1 # reset address of result la a0, str2 # start to print the result li a7, 4 ecall RESULT_CHECK: lbu a0, 0(a2) # load result to a0 jal ra, PRINT_INT addi s1, s1, -1 # count print result addi a2, a2, 1 # move to next address of result bne s1, x0, RESULT_CHECK j EXIT CLZ: # a0: x = the number to be counted leading zero # t0: shifted x # t1: x shift & mask # t2: mask PAD1: srli t0, a0, 1 # t0 = x >> 1 or a0, a0, t0 # x |= x >> 1 srli t0, a0, 2 # t0 = x >> 2 or a0, a0, t0 # x |= x >> 2 srli t0, a0, 4 # t0 = x >> 4 or a0, a0, t0 # x |= x >> 4 srli t0, a0, 8 # t0 = x >> 8 or a0, a0, t0 # x |= x >> 8 srli t0, a0, 16 # t0 = x >> 16 or a0, a0, t0 # x |= x >> 16 POPCNT: lw t2, mask1 # load mask1 to t2 srli t0, a0, 1 # t0 = x >> 1 and t1, t0, t2 # t1 = (x >> 1) & mask1 sub a0, a0, t1 # x -= ((x >> 1) & mask1) lw t2, mask2 # load mask2 to t2 srli t0, a0, 2 # t0 = x >> 2 and t1, t0, t2 # (x >> 2) & mask2 and a0, a0, t2 # x & mask2 add a0, t1, a0 # ((x >> 2) & mask2) + (x & mask2) srli t0, a0, 4 # t0 = x >> 4 add a0, a0, t0 # x + (x >> 4) lw t2, mask4 # load mask4 to t2 and a0, a0, t2 # ((x >> 4) + x) & mask4 srli t0, a0, 8 # t0 = x >> 8 add a0, a0, t0 # x += (x >> 8) srli t0, a0, 16 # t0 = x >> 16 add a0, a0, t0 # x += (x >> 16) andi t0, a0, 0x3f # t0 = x & 0x3f li a0, 32 # a0 = 64 sub a0, a0, t0 # 64 - (x & 0x3f) addi sp, sp, -4 # make space for ra in stack sw ra, 0(sp) # push ra into stack jal ra, PRINT_INT lw ra, 0(sp) # pull ra out of stack addi sp, sp, 4 # free the space create in stack for ra ret PRINT_INT: li a7, 1 # print the value of input a0 in int ecall addi sp, sp, -4 sw a0, 0(sp) la a0, str1 li a7, 4 ecall lw a0, 0(sp) addi sp, sp, 4 ret EXIT: li a7, 10 # Halts the simulator ecall ``` ## Modified Code **C Code** ```C #include <stdint.h> #include <stdio.h> #include <time.h> #include <string.h> extern uint64_t get_cycles(); extern uint64_t get_instret(); uint32_t count_leading_zeros_32(uint32_t x) { x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); x -= ((x >> 1) & 0x55555555); x = ((x >> 2) & 0x33333333) + (x & 0x33333333); x = ((x >> 4) + x) & 0x0f0f0f0f; x += (x >> 8); x += (x >> 16); return (32 - (x & 0x3f)); } int main() { /* measure cycles */ uint64_t instret = get_instret(); uint64_t oldcount = get_cycles(); uint32_t picture[5] = {20, 80, 128, 150, 231}; uint32_t threshold = 230; for (int i = 0; i < 5; i++) { printf("%d, before = %lu, ", i, picture[i]); picture[i] = (count_leading_zeros_32(threshold - picture[i]) != 0) ? 0 : 255; printf("after = %lu\n", picture[i]); } uint64_t cyclecount = get_cycles() - oldcount; printf("cycle count: %u\n", (unsigned int) cyclecount); printf("instret: %x\n", (unsigned) (instret & 0xffffffff)); return 0; } ``` **Assembly code** ``` .data picture: .word 20, 80, 128, 150, 231 threshold: .word 230 str1: .string "%d, before = " str2: .string ", after = " str3: .string "\n" .text .globl main main: la a1, picture li a4, 5 la a2, threshold lw a2, 0(a2) li t3, 0 LOOP: lw a5, 0(a1) mv a3, a5 sub a0, a2, a5 jal ra, count_leading_zeros li a3, 32 sub a0, a3, a0 bnez a0, SET_BLACK li a3, 255 j STORE SET_BLACK: li a3, 0 STORE: sw a3, 0(a1) mv a0, t3 call PRINT_INT la a0, str1 li a7, 4 ecall mv a0, a5 call PRINT_INT la a0, str2 li a7, 4 ecall mv a0, a3 call PRINT_INT la a0, str3 li a7, 4 ecall addi a1, a1, 4 addi t3, t3, 1 blt t3, a4, LOOP li a7, 10 ecall count_leading_zeros: srli t1, a0, 1 or a0, a0, t1 srli t1, a0, 2 or a0, a0, t1 srli t1, a0, 4 or a0, a0, t1 srli t1, a0, 8 or a0, a0, t1 srli t1, a0, 16 or a0, a0, t1 # Start of population count li t2, 0x55555555 srli t1, a0, 1 and t1, t1, t2 sub a0, a0, t1 li t2, 0x33333333 srli t1, a0, 2 and t1, t1, t2 and a0, a0, t2 add a0, a0, t1 srli t1, a0, 4 add a0, a0, t1 li t2, 0x0f0f0f0f and a0, a0, t2 srli t1, a0, 8 add a0, a0, t1 srli t1, a0, 16 add a0, a0, t1 andi a0, a0, 0x3f li t1, 64 sub a0, t1, a0 ret PRINT_INT: li a7, 1 ecall ret ``` ## Compile and execute the code **Performance measurement** Compile the C code to ELF ``` riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -Wall -c -o getcycles.o getcycles.s riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -Wall -c -o getinstret.o getinstret.s riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -O0 -Wall -c -o main.o main.c riscv-none-elf-gcc -O0 -march=rv32i -mabi=ilp32 -o main.elf getcycles.o getinstret.o new.o ``` Result ``` ~/rv32emu/build/rv32emu main.elf ``` ``` 0, before = 20, after = 0 1, before = 80, after = 0 2, before = 128, after = 0 3, before = 150, after = 0 4, before = 231, after = 255 cycle count: 14731 instret: 2c5 inferior exit code 0 ``` Display the ELF Size ``` riscv-none-elf-size main.elf ``` ``` text data bss dec hex filename 51752 1876 1528 55156 d774 main.elf ``` Display the ELF file header ``` riscv-none-elf-readelf -h main.elf ``` ``` ELF Header: Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 Class: ELF32 Data: 2's complement, little endian Version: 1 (current) OS/ABI: UNIX - System V ABI Version: 0 Type: EXEC (Executable file) Machine: RISC-V Version: 0x1 Entry point address: 0x100c2 Start of program headers: 52 (bytes into file) Start of section headers: 69428 (bytes into file) Flags: 0x1, RVC, soft-float ABI Size of this header: 52 (bytes) Size of program headers: 32 (bytes) Number of program headers: 3 Size of section headers: 40 (bytes) Number of section headers: 15 Section header string table index: 14 ``` **GCC O0, O1, O2, O3, Os, Ofast optimization** We can accurately determine the efficiency of O0, O1, O2, O3, and Os from the table below. ***Optimization Levels*** -O0: No optimization. This is the default level, meaning the compiler will compile quickly but perform no optimizations. This is typically used for debugging since it ensures the most direct correspondence between source code and generated machine code. -O1: Basic optimization. This level attempts to reduce code size and execution time without significantly increasing compilation time. -O2: Further optimization. Enhances execution efficiency and code size optimization without performing unsafe optimizations. This may make debugging more difficult. -O3: Enables more optimization techniques, such as further inlining expansion and vectorization optimizations. This may increase the size of the code (sometimes referred to as "code bloat"). -Os: Focuses on reducing the size of the executable. This optimization tries to minimize code size without significantly impacting execution performance. -Ofast: In addition to all the optimizations of -O3, it includes some optimizations that might change program behavior, such as ignoring accuracy and standard conformity. ``` riscv-none-elf-size main.elf ``` | | -O0 | -O1 | -O2 | -O3 | -Os | -Ofast | | ----------- | ----- | ----- | ----- | ----- | ----- | ------ | | text | 51752 | 51480 | 51480 | 51664 | 51446 | 51664 | | data | 1876 | 1876 | 1876 | 1876 | 1876 | 1876 | | bss | 1528 | 1528 | 1528 | 1528 | 1528 | 1528 | | dec | 55156 | 54884 | 54884 | 55068 | 54850 | 55068 | | Cycle Count | 14731 | 14395 | 14395 | 14351 | 14436 | 14351 | | Instret | 2c5 | 2cc | 2cc | 2cf | 2cc | 2cf | ***Table Comparison*** text: We can see that the size of the text section is largest at the -O0 level, indicating no optimization has been applied. From -O1 to -Ofast, there is almost no significant change in size, which could be due to the optimizations being primarily focused on performance rather than size, or the code itself has limited scope for further size reduction. data and bss: These sections usually contain static and global variables. Their size remains unchanged across different optimization levels, suggesting that the configuration of these variables is consistent regardless of the optimization level. dec: Overall, there isn’t much variation in size, showing that the main focus of optimization is on performance rather than size. Cycle Count & Instret: These are performance metrics. Moving from -O0 to -O1, we observe a notable improvement in performance, with a reduction in cycle counts. This indicates that even basic optimizations significantly impact performance. However, the performance gains from -O1 to -Ofast are not as substantial. This might be due to certain parts of the code reaching an optimization bottleneck, or the performance of the program itself is already nearing the hardware limits. ## Adapt the original assembly code **Compare Modified Assembly Code** ``` riscv-none-elf-as -march=rv32i -mabi=ilp32 -o Hand.o Hand.s riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -o Hand.elf Hand.o ``` **ELF Header** ``` riscv-none-elf-readelf -h Hand.elf ``` ``` ELF Header: Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 Class: ELF32 Data: 2's complement, little endian Version: 1 (current) OS/ABI: UNIX - System V ABI Version: 0 Type: EXEC (Executable file) Machine: RISC-V Version: 0x1 Entry point address: 0x100d8 Start of program headers: 52 (bytes into file) Start of section headers: 16492 (bytes into file) Flags: 0x0 Size of this header: 52 (bytes) Size of program headers: 32 (bytes) Number of program headers: 3 Size of section headers: 40 (bytes) Number of section headers: 14 Section header string table index: 13 ``` **ELF size** Similarly, we execute it in the terminal to obtain the result. ``` james@james-VirtualBox:~/rv32emu/programs/HW2$ riscv-none-elf-size Hand.elf text data bss dec hex filename 8876 1420 840 11136 2b80 Hand.elf ``` We can make a comparison: the performance of the modified assembly code is better than the direct translation from C to ELF format. | | text | data | bss | dec | Cycle Count | Instret | | ----- | ----- | ---- | --- | --- | ----------- | ------- | | -O0 | 51752 | 1876 | 1528 | 55156 | 14731 | 2c5 | | -O1 | 51480 | 1876 | 1528 | 54884 | 14395 | 2cc | | -O2 | 51480 | 1876 | 1528 | 54884 | 14395 | 2cc | | -O3 | 51664 | 1876 | 1528 | 55068 | 14351 | 2cf | | -Os | 51446 | 1876 | 1528 | 54850 | 14436 | 2cc | | -Ofast | 51664 | 1876 | 1528 | 55068 | 14351 | 2cf | |Modified| 6168 | 1420 | 840 | 8428 | 559 | 390 | ![image.png](https://hackmd.io/_uploads/rJgTpzl7a.png) :::warning You shall use RDCYCLE/RDCYCLEH instruction for the statistics of your program’s execution. :notes: jserv :::