# Assignment2: RISC-V Toolchain contributed by < [terry23304](https://github.com/terry23304) > ## Choose a question Problem: I choose [Implement Binarization by count leading zero](https://hackmd.io/@edenlin/CompArchi_HW1) from `edenlin` ## Add perfcounter to C code ```c #include <stdint.h> #include <stdio.h> #include <stdint.h> extern uint64_t get_cycles(); extern uint64_t get_instret(); uint32_t count_leading_zeros_32(uint32_t x) { x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); x -= ((x >> 1) & 0x55555555); x = ((x >> 2) & 0x33333333) + (x & 0x33333333); x = ((x >> 4) + x) & 0x0f0f0f0f; x += (x >> 8); x += (x >> 16); return (32 - (x & 0x3f)); // change 0x7f to 0x3f } int main() { // pixel test // 8-bit color depth for black and white photo uint32_t picture[5] = {20,80,128,150,231}; uint32_t threshold = 128; uint32_t *pixel = &picture; uint64_t instret = get_instret(); uint64_t oldcount = get_cycles(); for (int i = 0; i < 5; i++) { uint32_t sub = threshold - *(pixel+i); printf("%d, ",i); printf("before = %ld, ",*(pixel+i)); sub = count_leading_zeros_32(sub); if(sub) *(pixel+i) = 0; else *(pixel+i) = 255; printf("after = %ld\n",*(pixel+i)); } uint64_t cyclecount = get_cycles() - oldcount; printf("cycle count: %u\n", (unsigned int) cyclecount); printf("instret: %x\n", (unsigned) (instret & 0xffffffff)); return 0; } ``` ### Makefile ```shell .PHONY: clean include ../../mk/toolchain.mk CFLAGS = -march=rv32i_zicsr_zifencei -mabi=ilp32 -O0 -Wall OBJS = \ getcycles.o \ getinstret.o \ main.o BIN = main.elf %.o: %.S $(CROSS_COMPILE)gcc $(CFLAGS) -c -o $@ $< %.o: %.c $(CROSS_COMPILE)gcc $(CFLAGS) -c -o $@ $< all: $(BIN) $(BIN): $(OBJS) $(CROSS_COMPILE)gcc -o $@ $^ clean: $(RM) $(BIN) $(OBJS) ``` ### use make to compile C code to elf file ```shell riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -O0 -Wall -c -o main.o main.c riscv-none-elf-gcc -o main.elf getcycles.o getinstret.o main.o ``` ## Observe different compilation options 
result ### -O0 Optimized Assembly Code ```c 0001016c <count_leading_zeros_32>: 1016c: fe010113 add sp,sp,-32 10170: 00812e23 sw s0,28(sp) 10174: 02010413 add s0,sp,32 10178: fea42623 sw a0,-20(s0) 1017c: fec42783 lw a5,-20(s0) 10180: 0017d793 srl a5,a5,0x1 10184: fec42703 lw a4,-20(s0) 10188: 00f767b3 or a5,a4,a5 1018c: fef42623 sw a5,-20(s0) 10190: fec42783 lw a5,-20(s0) 10194: 0027d793 srl a5,a5,0x2 10198: fec42703 lw a4,-20(s0) 1019c: 00f767b3 or a5,a4,a5 101a0: fef42623 sw a5,-20(s0) 101a4: fec42783 lw a5,-20(s0) 101a8: 0047d793 srl a5,a5,0x4 101ac: fec42703 lw a4,-20(s0) 101b0: 00f767b3 or a5,a4,a5 101b4: fef42623 sw a5,-20(s0) 101b8: fec42783 lw a5,-20(s0) 101bc: 0087d793 srl a5,a5,0x8 101c0: fec42703 lw a4,-20(s0) 101c4: 00f767b3 or a5,a4,a5 101c8: fef42623 sw a5,-20(s0) 101cc: fec42783 lw a5,-20(s0) 101d0: 0107d793 srl a5,a5,0x10 101d4: fec42703 lw a4,-20(s0) 101d8: 00f767b3 or a5,a4,a5 101dc: fef42623 sw a5,-20(s0) 101e0: fec42783 lw a5,-20(s0) 101e4: 0017d713 srl a4,a5,0x1 101e8: 555557b7 lui a5,0x55555 101ec: 55578793 add a5,a5,1365 # 55555555 <__BSS_END__+0x55537805> 101f0: 00f777b3 and a5,a4,a5 101f4: fec42703 lw a4,-20(s0) 101f8: 40f707b3 sub a5,a4,a5 101fc: fef42623 sw a5,-20(s0) 10200: fec42783 lw a5,-20(s0) 10204: 0027d713 srl a4,a5,0x2 10208: 333337b7 lui a5,0x33333 1020c: 33378793 add a5,a5,819 # 33333333 <__BSS_END__+0x333155e3> 10210: 00f77733 and a4,a4,a5 10214: fec42683 lw a3,-20(s0) 10218: 333337b7 lui a5,0x33333 1021c: 33378793 add a5,a5,819 # 33333333 <__BSS_END__+0x333155e3> 10220: 00f6f7b3 and a5,a3,a5 10224: 00f707b3 add a5,a4,a5 10228: fef42623 sw a5,-20(s0) 1022c: fec42783 lw a5,-20(s0) 10230: 0047d713 srl a4,a5,0x4 10234: fec42783 lw a5,-20(s0) 10238: 00f70733 add a4,a4,a5 1023c: 0f0f17b7 lui a5,0xf0f1 10240: f0f78793 add a5,a5,-241 # f0f0f0f <__BSS_END__+0xf0d31bf> 10244: 00f777b3 and a5,a4,a5 10248: fef42623 sw a5,-20(s0) 1024c: fec42783 lw a5,-20(s0) 10250: 0087d793 srl a5,a5,0x8 10254: fec42703 lw a4,-20(s0) 10258: 00f707b3 add 
a5,a4,a5 1025c: fef42623 sw a5,-20(s0) 10260: fec42783 lw a5,-20(s0) 10264: 0107d793 srl a5,a5,0x10 10268: fec42703 lw a4,-20(s0) 1026c: 00f707b3 add a5,a4,a5 10270: fef42623 sw a5,-20(s0) 10274: fec42783 lw a5,-20(s0) 10278: 03f7f793 and a5,a5,63 1027c: 02000713 li a4,32 10280: 40f707b3 sub a5,a4,a5 10284: 00078513 mv a0,a5 10288: 01c12403 lw s0,28(sp) 1028c: 02010113 add sp,sp,32 10290: 00008067 ret 00010294 <main>: 10294: fb010113 add sp,sp,-80 10298: 04112623 sw ra,76(sp) 1029c: 04812423 sw s0,72(sp) 102a0: 05010413 add s0,sp,80 102a4: 0001c7b7 lui a5,0x1c 102a8: d2478793 add a5,a5,-732 # 1bd24 <__clzsi2+0xbe> 102ac: 0007a583 lw a1,0(a5) 102b0: 0047a603 lw a2,4(a5) 102b4: 0087a683 lw a3,8(a5) 102b8: 00c7a703 lw a4,12(a5) 102bc: 0107a783 lw a5,16(a5) 102c0: fab42823 sw a1,-80(s0) 102c4: fac42a23 sw a2,-76(s0) 102c8: fad42c23 sw a3,-72(s0) 102cc: fae42e23 sw a4,-68(s0) 102d0: fcf42023 sw a5,-64(s0) 102d4: 08000793 li a5,128 102d8: fef42423 sw a5,-24(s0) 102dc: fb040793 add a5,s0,-80 102e0: fef42223 sw a5,-28(s0) 102e4: e75ff0ef jal 10158 <get_instret> 102e8: fca42c23 sw a0,-40(s0) 102ec: fcb42e23 sw a1,-36(s0) 102f0: e55ff0ef jal 10144 <get_cycles> 102f4: fca42823 sw a0,-48(s0) 102f8: fcb42a23 sw a1,-44(s0) 102fc: fe042623 sw zero,-20(s0) 10300: 0cc0006f j 103cc <main+0x138> 10304: fec42783 lw a5,-20(s0) 10308: 00279793 sll a5,a5,0x2 1030c: fe442703 lw a4,-28(s0) 10310: 00f707b3 add a5,a4,a5 10314: 0007a783 lw a5,0(a5) 10318: fe842703 lw a4,-24(s0) 1031c: 40f707b3 sub a5,a4,a5 10320: fcf42223 sw a5,-60(s0) 10324: fec42583 lw a1,-20(s0) 10328: 0001c7b7 lui a5,0x1c 1032c: cd878513 add a0,a5,-808 # 1bcd8 <__clzsi2+0x72> 10330: 4f8000ef jal 10828 <printf> 10334: fec42783 lw a5,-20(s0) 10338: 00279793 sll a5,a5,0x2 1033c: fe442703 lw a4,-28(s0) 10340: 00f707b3 add a5,a4,a5 10344: 0007a783 lw a5,0(a5) 10348: 00078593 mv a1,a5 1034c: 0001c7b7 lui a5,0x1c 10350: ce078513 add a0,a5,-800 # 1bce0 <__clzsi2+0x7a> 10354: 4d4000ef jal 10828 <printf> 10358: fc442503 lw 
a0,-60(s0) 1035c: e11ff0ef jal 1016c <count_leading_zeros_32> 10360: fca42223 sw a0,-60(s0) 10364: fc442783 lw a5,-60(s0) 10368: 00078e63 beqz a5,10384 <main+0xf0> 1036c: fec42783 lw a5,-20(s0) 10370: 00279793 sll a5,a5,0x2 10374: fe442703 lw a4,-28(s0) 10378: 00f707b3 add a5,a4,a5 1037c: 0007a023 sw zero,0(a5) 10380: 01c0006f j 1039c <main+0x108> 10384: fec42783 lw a5,-20(s0) 10388: 00279793 sll a5,a5,0x2 1038c: fe442703 lw a4,-28(s0) 10390: 00f707b3 add a5,a4,a5 10394: 0ff00713 li a4,255 10398: 00e7a023 sw a4,0(a5) 1039c: fec42783 lw a5,-20(s0) 103a0: 00279793 sll a5,a5,0x2 103a4: fe442703 lw a4,-28(s0) 103a8: 00f707b3 add a5,a4,a5 103ac: 0007a783 lw a5,0(a5) 103b0: 00078593 mv a1,a5 103b4: 0001c7b7 lui a5,0x1c 103b8: cf078513 add a0,a5,-784 # 1bcf0 <__clzsi2+0x8a> 103bc: 46c000ef jal 10828 <printf> 103c0: fec42783 lw a5,-20(s0) 103c4: 00178793 add a5,a5,1 103c8: fef42623 sw a5,-20(s0) 103cc: fec42703 lw a4,-20(s0) 103d0: 00400793 li a5,4 103d4: f2e7d8e3 bge a5,a4,10304 <main+0x70> 103d8: d6dff0ef jal 10144 <get_cycles> 103dc: 00050613 mv a2,a0 103e0: 00058693 mv a3,a1 103e4: fd042503 lw a0,-48(s0) 103e8: fd442583 lw a1,-44(s0) 103ec: 40a60733 sub a4,a2,a0 103f0: 00070813 mv a6,a4 103f4: 01063833 sltu a6,a2,a6 103f8: 40b687b3 sub a5,a3,a1 103fc: 410786b3 sub a3,a5,a6 10400: 00068793 mv a5,a3 10404: fce42423 sw a4,-56(s0) 10408: fcf42623 sw a5,-52(s0) 1040c: fc842783 lw a5,-56(s0) 10410: 00078593 mv a1,a5 10414: 0001c7b7 lui a5,0x1c 10418: d0078513 add a0,a5,-768 # 1bd00 <__clzsi2+0x9a> 1041c: 40c000ef jal 10828 <printf> 10420: fd842783 lw a5,-40(s0) 10424: 00078593 mv a1,a5 10428: 0001c7b7 lui a5,0x1c 1042c: d1478513 add a0,a5,-748 # 1bd14 <__clzsi2+0xae> 10430: 3f8000ef jal 10828 <printf> 10434: 00000793 li a5,0 10438: 00078513 mv a0,a5 1043c: 04c12083 lw ra,76(sp) 10440: 04812403 lw s0,72(sp) 10444: 05010113 add sp,sp,80 10448: 00008067 ret ``` There are 17448 lines in O0 optimization, therefore I choose `count_leading_zero` and `main` to compare. 
#### elf size `riscv64-unknown-elf-size ./main.elf` ``` text data bss dec hex filename 51804 1876 1528 55208 d7a8 ./main.elf ``` #### execute elf file ``` 0, before = 20, after = 0 1, before = 80, after = 0 2, before = 128, after = 0 3, before = 150, after = 255 4, before = 231, after = 255 cycle count: 16582 instret: 2d7 inferior exit code 0 ``` ### -O1 Optimized Assembly Code #### elf size `riscv64-unknown-elf-size ./main.elf` ``` text data bss dec hex filename 51520 1876 1528 54924 d68c ./main.elf ``` #### execute elf file ``` 0, before = 20, after = 0 1, before = 80, after = 0 2, before = 128, after = 0 3, before = 150, after = 255 4, before = 231, after = 255 cycle count: 16243 instret: 2dc inferior exit code 0 ``` ### -O2 Optimized Assembly Code #### elf size `riscv64-unknown-elf-size ./main.elf` ``` text data bss dec hex filename 51520 1876 1528 54924 d68c ./main.elf ``` #### execute elf file ``` 0, before = 20, after = 0 1, before = 80, after = 0 2, before = 128, after = 0 3, before = 150, after = 255 4, before = 231, after = 255 cycle count: 16243 instret: 2dc inferior exit code 0 ``` ### -O3 Optimized Assembly Code #### elf size `riscv64-unknown-elf-size ./main.elf` ``` text data bss dec hex filename 51688 1876 1528 55092 d734 ./main.elf ``` #### execute elf file ``` 0, before = 20, after = 0 1, before = 80, after = 0 2, before = 128, after = 0 3, before = 150, after = 255 4, before = 231, after = 255 cycle count: 16200 instret: 2dd inferior exit code 0 ``` ### -Os Optimized Assembly Code #### elf size `riscv64-unknown-elf-size ./main.elf` ``` text data bss dec hex filename 51486 1876 1528 54890 d66a ./main.elf ``` #### execute elf file ``` 0, before = 20, after = 0 1, before = 80, after = 0 2, before = 128, after = 0 3, before = 150, after = 255 4, before = 231, after = 255 cycle count: 16241 instret: 306 inferior exit code 0 ``` ### -Ofast Optimized Assembly Code #### elf size `riscv64-unknown-elf-size ./main.elf` ``` text data bss dec hex filename 
51688 1876 1528 55092 d734 ./main.elf ``` #### execute elf file ``` 0, before = 20, after = 0 1, before = 80, after = 0 2, before = 128, after = 0 3, before = 150, after = 255 4, before = 231, after = 255 cycle count: 16200 instret: 2dd inferior exit code 0 ``` ### conclusion **CSR** | Name | cycle count | | -------- | -------- | | O0 | 16582 | | O1 | 16243 | | O2 | 16243 | | O3 | 16200 | | Os | 16241 | | Ofast | 16200 | - Optimization level O0 uses fewer lines of code and fewer registers than O2. - O1 and O2 give the same result: identical ELF size, cycle count, and instret value. - At optimization level O3, the compiler uses a greater number of registers, resulting in a larger code size than O2. However, it achieves the lowest cycle count (16200, the same as Ofast) and offers the highest level of performance. The program size is large because C library functions declared in `stdlib.h` and `stdio.h` (such as `printf`) are linked into the executable, which adds many instructions that the program itself never needs. ## Rewrite code I think there is no need to use CLZ to check whether the input pixel is greater than the threshold. Therefore, I modified the code to determine the output value from the sign bit of `threshold - pixel`, making it either 0 or 255. 
```c #include <stdint.h> #include <stdio.h> #include <stdint.h> extern uint64_t get_cycles(); extern uint64_t get_instret(); int main() { // pixel test // 8-bit color depth for black and white photo uint32_t picture[5] = {20,80,128,150,231}; uint32_t threshold = 128; uint32_t *pixel = &picture; uint64_t instret = get_instret(); uint64_t oldcount = get_cycles(); for (int i = 0; i < 5; i++) { uint32_t sub = threshold - *(pixel+i); printf("%d, ",i); printf("before = %ld, ",*(pixel+i)); if (sub & 0x80000000) *(pixel + i) = 255; else *(pixel + i) = 0; printf("after = %ld\n",*(pixel+i)); } uint64_t cyclecount = get_cycles() - oldcount; printf("cycle count: %u\n", (unsigned int) cyclecount); printf("instret: %x\n", (unsigned) (instret & 0xffffffff)); return 0; } ``` **elf size** ``` text data bss dec hex filename 51404 1876 1528 54808 d618 ./main.elf ``` **cycle count** `cycle count: 16062` :::warning TODO: Revise the handwritten RISC-V assembly code. :notes: jserv ::: ## Handwritten Assembly Link-time error: ``` (.text+0x38): undefined reference to `main' collect2: error: ld returned 1 exit status ``` Fix: add the following directives above the `main` label so that the symbol is global and the linker can resolve it. ``` .globl main .type main, @function ``` Run-time error: ``` unknown syscall 4 Segmentation fault (core dumped) ``` The error is triggered by the following code, which issues system call number 4: ``` la a0, str2 li a7, 4 ecall ``` Modification: replace it with the `SYSWRITE` (64) system call, and terminate the program with the `SYSEXIT` (93) system call: ``` .set SYSEXIT, 93 .set SYSWRITE, 64 li a0, 1 la a1, str2 li a2, 26 li a7, SYSWRITE ecall li a7, SYSEXIT ecall ``` After these modifications, the assembly code runs properly on rv32emu.