Assignment2: GNU Toolchain

# Assignment2: GNU Toolchain contributed by < [`hungyuhang`](https://github.com/hungyuhang) > ## Problem Selection For this assignment, I choose the program [Find Leftmost 0-byte using CLZ](https://hackmd.io/@cychen/computer_architecture_hw1) by [陳川曜](https://github.com/hugo0406). And the reason to pick this implementation, is because there are a lot of binary operations in this implementation, and I want to practice more binary operations so that I can be more familiar with it. ## Program Adaptation ### Assembly Code To adapt the assembly from Ripes to rv32emu, most of the work involves changing the code for system calls. For example, in Ripes there is a system call to print integer. But in rv32emu, it only supports system call to print string. And compared to rv32emu, Ripes ignores some syntax errors, meaning that you might encounter some syntax errors when assembling using `riscv-none-elf-as`, even though the same code runs on Ripes perfectly. In oredr to adapt the code properly, it is important to understand the assembler directives, linker script, and Makefile. Here are the resources that I find it useful: - [GNU assembler](https://ftp.gnu.org/old-gnu/Manuals/gas-2.9.1/html_node/as_toc.html) - [GNU linker](https://ftp.gnu.org/old-gnu/Manuals/ld-2.9.1/html_mono/ld.html) - [Makefile](https://jasonblog.github.io/note/gunmake/shen_ru_xue_xi_makeming_ling_he_makefile.html) And here is the first version of the adaptation: ```clike= # RISC-V assembly program. .org 0 # Provide program starting address to linker .global _start /* newlib system calls */ .set SYSEXIT, 93 .set SYSWRITE, 64 .data test1: .dword 0x1122334455007700 test2: .dword 0x0123456789abcdef test3: .dword 0x1100220033445566 str1: .string "The Leftmost 0-byte is " .set str1_size, .-str1 endl: .string "\n" .set endl_size, .-endl .text _start: addi sp, sp, -4 # create space for buffer of print function la t2, test1 # t2 points to the first test case li t3, 3 # number of test cases loop: lw a0, 0(t2) # a0:test_half_right lw a1, 4(t2) # a1:test_half_left jal zbytel mv t4, a0 # save the result to t4 #print li a0, 1 # 1 = standard output (stdout) la a1, str1 # load address of str1 la a2, str1_size # length of str1 li a7, SYSWRITE # "write" syscall ecall addi t4, t4, 48 # change number to ascii digit sw t4, 0(sp) # push the result onto the stack li a0, 1 # 1 = standard output (stdout) mv a1, sp # pass the stack address to a1 li a2, 1 # length of value digit li a7, SYSWRITE # "write" syscall ecall li a0, 1 # 1 = standard output (stdout) la a1, endl # load address of endl la a2, endl_size # length of endl li a7, SYSWRITE # "write" syscall ecall addi t2, t2, 8 # move to the next test case addi t3, t3, -1 # test case counter-- bne t3, x0, loop # counter=0,break addi sp, sp, 4 add a0, x0, 0 # Use 0 as the return code li a7, SYSEXIT # "exit" syscall ecall zbytel: addi sp,sp,-4 #push sw ra,0(sp) mv s0,a0 #s0:test_half_right mv s1,a1 #s1:test_half_left #y = (x & 0x7F7F7F7F7F7F7F7F)+ 0x7F7F7F7F7F7F7F7F li t0,0x7f7f7f7f and s2,s0,t0 add s2,s2,t0 #y = ~(y | x |0x7F7F7F7F7F7F7F7F) or s2,s2,s0 or s2,s2,t0 xori s2,s2,-1 #s2:y_half_right and s3,s1,t0 add s3,s3,t0 or s3,s3,s0 or s3,s3,t0 xori s3,s3,-1 #s3:y_half_left mv a0,s2 mv a1,s3 jal clz lw ra,0(sp) addi sp,sp,4 #pop srli a0,a0,3 #clz(y)>>3 jr ra clz: #x |= (x >> 1) andi t1,a1,0x1 srli s4,a1,1 srli s5,a0,1 slli t1,t1,31 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 2) andi t1,a1,0x3 srli s4,a1,2 srli s5,a0,2 slli t1,t1,30 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 4) andi t1,a1,0xf srli s4,a1,4 srli s5,a0,4 slli t1,t1,28 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 8) andi t1,a1,0xff srli s4,a1,8 srli s5,a0,8 slli t1,t1,24 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 16) li t1,0xffff and t1,a1,t1 srli s4,a1,16 srli s5,a0,16 slli t1,t1,16 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 32) mv s5,a1 and s4,a1,x0 or a1,s4,a1 or a0,s5,a0 # x -= ((x >> 1) & 0x5555555555555555) andi t1,a1,0x1 srli s4,a1,1 srli s5,a0,1 slli t1,t1,31 or s5,s5,t1 li t1,0x55555555 and s4,s4,t1 and s5,s5,t1 sub a1,a1,s4 sub a0,a0,s5 #x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333) andi t1,a1,0x3 srli s4,a1,2 srli s5,a0,2 slli t1,t1,30 or s5,s5,t1 li t1,0x33333333 and s4,s4,t1 and s5,s5,t1 and a1,a1,t1 and a0,a0,t1 add a1,a1,s4 add a0,a0,s5 #x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f; andi t1,a1,0xf srli s4,a1,4 srli s5,a0,4 slli t1,t1,28 or s5,s5,t1 add s4,s4,a1 add s5,s5,a0 li t1,0x0f0f0f0f and a1,s4,t1 and a0,s5,t1 #x += (x >> 8) andi t1,a1,0xff srli s4,a1,8 srli s5,a0,8 slli t1,t1,24 or s5,s5,t1 add a1,a1,s4 add a0,a0,s5 #x += (x >> 16) li t1,0xffff and t1,t1,a1 srli s4,a1,16 srli s5,a0,16 slli t1,t1,16 or s5,s5,t1 add a1,a1,s4 add a0,a0,s5 #x += (x >> 32) mv s5,a1 and s4,a1,x0 add a1,a1,s4 add a0,a0,s5 #64 - (x & 0x7f) andi a0,a0,0x7f li t1,64 sub a0,t1,a0 jr ra ``` The code runs properly, and the output is shown below: ``` hungyuhang@hungyuhang-VirtualBox:~/rv32emu/tests/hw2$ ~/rv32emu/build/rv32emu hw2_asm.elf The Leftmost 0-byte is 5 The Leftmost 0-byte is 8 The Leftmost 0-byte is 1 inferior exit code 0 ``` ### C Code To make the flow of the C code more similar to the assembly version, I modify function `main()`. More specifically, in the original version, `zbytel()` and `printf()` were callde 3 times and 6 times respectively without looping. And I wrap the function calls within a loop as shown below, just like the assembly version. And here is the modified code: ```clike= #include <stdio.h> #include <stdint.h> // test data, the result should be {5, 8, 1} uint64_t a[3] = {0x1122334455007700, 0x0123456789abcdef, 0x1100220033445566}; uint16_t count_leading_zeros(uint64_t x) { x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); x |= (x >> 32); /* count ones (population count) */ x -= ((x >> 1) & 0x5555555555555555); x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333); x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f; x += (x >> 8); x += (x >> 16); x += (x >> 32); return (64 - (x & 0x7f)); } uint16_t zbytel(uint64_t x) { uint64_t y; uint16_t n; y = (x & 0x7F7F7F7F7F7F7F7F)+ 0x7F7F7F7F7F7F7F7F; // convert each 0-byte to 0x80 y = ~(y | x |0x7F7F7F7F7F7F7F7F); //and each nonzero byte to 0x00 n = count_leading_zeros(y) >> 3 ; // use number of leading zeros return n; // n = 0 ... 8 , 8 if x has no 0-byte. } int main( int argc, char *argv[ ] ){ uint16_t zbl; for (int i = 0; i < 3; i++) { zbl = zbytel(a[i]); //printf("%llx\n",a[i]); printf("The Leftmost 0-byte is "); printf("%d\n",zbl); } return 0; } ``` The code compiled successfully with optimization flag set to `-O0`: ``` riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O0 -c -o hw2_c.o hw2_c.c riscv-none-elf-gcc -o hw2_c.elf hw2_c.o ``` And runs without error: ``` The Leftmost 0-byte is 5 The Leftmost 0-byte is 8 The Leftmost 0-byte is 1 inferior exit code 0 ``` To check the content of the generated elf file, I use the command below: ``` riscv-none-elf-objdump -D hw2_c.elf > output.txt ``` But the size of the elf generated from the c code is 69.9kB, which is large compared to the size of the elf file (5.5kB) generated from the assembly version. And to reduce the elf size, I replace the printf function with inline asm. At first, i rewrite the code from ```clike= int main( int argc, char *argv[ ] ){ uint16_t zbl; for (int i = 0; i < 3; i++) { zbl = zbytel(a[i]); printf("The Leftmost 0-byte is "); printf("%d\n",zbl); } return 0; } ``` to ```clike= int main( int argc, char *argv[ ] ){ uint16_t zbl; char str1[] = "The Leftmost 0-byte is "; char endl = '\n'; for (int i = 0; i < 3; i++) { zbl = zbytel(a[i]); zbl += 48; // change to ascii __asm__ ("li a0, 1 # 1 = standard output (stdout)\n" "la a1, %0 # load address of str1\n" "li a2, 24 # length of str1\n" "li a7, 64 # write syscall\n" "ecall\n" "li a0, 1 # 1 = standard output (stdout)\n" "la a1, %1 # pass the stack address to a1\n" "li a2, 1 # length of value digit\n" "li a7, 64 # write syscall\n" "ecall\n" "li a0, 1 # 1 = standard output (stdout)\n" "la a1, %2 # load address of endl\n" "li a2, 2 # length of endl\n" "li a7, 64 # write syscall\n" "ecall\n" : : "irm" (str1), // 0 "irm" (&zbl), // 1 "irm" (&endl) // 2 : ); } return 0; } ``` But here is the compiler output: ``` hungyuhang@hungyuhang-VirtualBox:~/rv32emu/tests/hw2$ make riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O0 -c -o hw2_c.o hw2_c.c riscv-none-elf-gcc -o hw2_c.elf hw2_c.o /home/hungyuhang/riscv-none-elf-gcc/bin/../lib/gcc/riscv-none-elf/13.2.0/../../../../riscv-none-elf/bin/ld: hw2_c.o: in function `.L7': hw2_c.c:(.text+0x610): undefined reference to `a5' /home/hungyuhang/riscv-none-elf-gcc/bin/../lib/gcc/riscv-none-elf/13.2.0/../../../../riscv-none-elf/bin/ld: hw2_c.o: in function `.L0 ': hw2_c.c:(.text+0x628): undefined reference to `a4' /home/hungyuhang/riscv-none-elf-gcc/bin/../lib/gcc/riscv-none-elf/13.2.0/../../../../riscv-none-elf/bin/ld: hw2_c.c:(.text+0x640): undefined reference to `a3' collect2: error: ld returned 1 exit status make: *** [Makefile:18: hw2_c.elf] Error 1 ``` And I can't find out the cause of the compilation error, so I changed the code to: ```clike= int main( int argc, char *argv[ ] ){ uint16_t zbl; char str1[] = "The Leftmost 0-byte is "; char endl = '\n'; for (int i = 0; i < 3; i++) { zbl = zbytel(a[i]); zbl += 48; // change to ascii register char *p1 asm ("a1") = str1; asm ("li a0, 1\n" // 1 = standard output (stdout) "li a2, 24\n" // length of str1 "li a7, 64\n" // write syscall "ecall" :::); p1 = (char *)(&zbl); asm ("li a0, 1\n" // 1 = standard output (stdout) "li a2, 1\n" // length of value digit "li a7, 64\n" // write syscall "ecall\n" :::); p1 = &endl; asm ("li a0, 1\n" // 1 = standard output (stdout) "li a2, 2\n" // length of endl "li a7, 64\n" // write syscall "ecall" :::); } return 0; } ``` And the code works fine. After remove the `printf()` function call by asm inline, the elf size changes to 16.2kB, which is a big improvement compared to the original 69.9kB. And after have a discussion with [倪英智](https://github.com/ollieni), I find the problem why the first version of the asm inline code cannot compile properly. It turns out that the second parameter of `la` in line 12, 18, and 24 should be a label, but I put a register in the second parameter, for example: `la a1, %2`. And after I changed the code to `mv a1, %2`, the code compiled successfully. ## Comparison [Here](https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html) is a reference of gcc optimization options. ### -O0 and Handwritten First, I compare the size of the handwritten version with the `-O0` version. I use `riscv-none-elf-size` to check the compiled code size, and here are the results: - Handwritten ``` text data bss dec hex filename 680 0 0 680 2a8 hw2_asm.elf ``` - -O0 ``` text data bss dec hex filename 7520 1396 840 9756 261c hw2_c.elf ``` I use `riscv-none-elf-objdump` to check the elf of the -O0 version, and find there are some library functions (ex. lseek) in the disassembly. I think is because I use `stdint.h` in the C code, that makes it need to use some library functions. and this causes the elf size of the `-O0` version to be larger compared to the handwritten version. ### -O1 I changed the compiler optimization from `-O0` to `-O1`, but the compiled code did not execute properly. I check the main function of the elf file using `riscv-none-elf-objdump`: ``` 00010314 <main>: 10314: fe010113 add sp,sp,-32 10318: 00300793 li a5,3 1031c: 00410613 add a2,sp,4 10320: 01e10693 add a3,sp,30 10324: 00310713 add a4,sp,3 10328: 00100513 li a0,1 1032c: 00060593 mv a1,a2 10330: 01800613 li a2,24 10334: 04000893 li a7,64 10338: 00000073 ecall 1033c: 00100513 li a0,1 10340: 00068593 mv a1,a3 10344: 00100613 li a2,1 10348: 04000893 li a7,64 1034c: 00000073 ecall 10350: 00100513 li a0,1 10354: 00070593 mv a1,a4 10358: 00100613 li a2,1 1035c: 04000893 li a7,64 10360: 00000073 ecall 10364: fff78793 add a5,a5,-1 10368: fc0790e3 bnez a5,10328 <main+0x14> 1036c: 00000513 li a0,0 10370: 02010113 add sp,sp,32 10374: 00008067 ret ``` In the disassembly above, we can see that the code loops 3 times, but within the loop it neither call the `zbytel` function, nor load the address of the string to print onto the stack (`a2`, `a3`, and `a4` points to somewhere on the stack). To fix this, I add the `volatile` quafilier to the declaration of the variables: ```clike= int main( int argc, char *argv[ ] ){ volatile uint16_t zbl; volatile char str1[] = "The Leftmost 0-byte is "; volatile char endl = '\n'; for (int i = 0; i < 3; i++) { zbl = zbytel(a[i]); zbl += 48; // change to ascii asm ("li a0, 1 # 1 = standard output (stdout)\n" "mv a1, %0 # load address of str1\n" "li a2, 24 # length of str1\n" "li a7, 64 # write syscall\n" "ecall\n" "li a0, 1 # 1 = standard output (stdout)\n" "mv a1, %1 # pass the stack address to a1\n" "li a2, 1 # length of value digit\n" "li a7, 64 # write syscall\n" "ecall\n" "li a0, 1 # 1 = standard output (stdout)\n" "mv a1, %2 # load address of endl\n" "li a2, 1 # length of endl\n" "li a7, 64 # write syscall\n" "ecall\n" : : "irm" (str1), // %0 "irm" (&zbl), // %1 "irm" (&endl) // %2 : ); } return 0; } ``` This prevents the compiler to do false optimization. After the modification, the code works normally. After modify the code, I use `riscv-none-elf-size` to check the compiled code size: ``` text data bss dec hex filename 6576 1404 840 8820 2274 hw2_c.elf ``` The total size (8820) slightly decreased compared to the `-O0` version (9756). ### -O2 After changing the code to `-O2`, the code works normally. I check the objdump of the elf file, and find that the compiler merges the code of `zbytel()` into `main()`. As a result, `main()` calls `count_leading_zeros()` directly, which decrease the depth of function calls. And the compiled code size obtained using `riscv-none-elf-size` is shown below: ``` text data bss dec hex filename 6652 1404 840 8896 22c0 hw2_c.elf ``` Compared to the -O1 version, the compiled code size is about the same. ### -O3 But after I change the optimization to `-O3`, the compiled code failed to print properly. After inspecting the disassembly of the elf file, it turns out that the compiler failed to consider the register dependency within the asm() block. For example: ``` 102d4: 00100513 li a0,1 102d8: 00058593 mv a1,a1 102dc: 01800613 li a2,24 102e0: 04000893 li a7,64 102e4: 00000073 ecall 102e8: 00100513 li a0,1 102ec: 00050593 mv a1,a0 102f0: 00100613 li a2,1 102f4: 04000893 li a7,64 102f8: 00000073 ecall 102fc: 00100513 li a0,1 10300: 00080593 mv a1,a6 10304: 00100613 li a2,1 10308: 04000893 li a7,64 1030c: 00000073 ecall ``` At address `0x102ec`, the code should moves the address of the output number (the index of the leftmost zero byte) to register `a1`, but in fact, the code always put 1 in `a1`, which is clearly an error. Trying to fix the error, I changed the asm inline part of the C code to: ```clike=12 register volatile char *p1 asm ("a1") = str1; asm ("li a0, 1\n"// 1 = standard output (stdout) "li a2, 24\n"// length of str1 "li a7, 64\n"// write syscall "ecall" :::); p1 = (char *)(&zbl); asm ("li a0, 1\n"// 1 = standard output (stdout) "li a2, 1\n"// length of value digit "li a7, 64\n"// write syscall "ecall\n" :::); p1 = &endl; asm ("li a0, 1\n"// 1 = standard output (stdout) "li a2, 2\n"// length of endl "li a7, 64\n"// write syscall "ecall" :::); ``` But the output changes to print nothing. Using `objdump` to check the disassembly: ``` 10178: 00100513 li a0,1 1017c: 01800613 li a2,24 10180: 04000893 li a7,64 10184: 00000073 ecall 10188: 00100513 li a0,1 1018c: 00100613 li a2,1 10190: 04000893 li a7,64 10194: 00000073 ecall 10198: 00100513 li a0,1 1019c: 00200613 li a2,2 101a0: 04000893 li a7,64 101a4: 00000073 ecall ``` It did not even move any value to register `a1`. I think the compiler optimize `p1` out, since from the compiler's view, `p1` is not used in the entire code. Despite the fact that my code cannot work work with `-O3`, from the output of `objdump`, I can see that the compiler with `-O3` option uses loop unrolling for code optimization. Compiled code size of `-O3` is shown below: ``` text data bss dec hex filename 8236 1404 840 10480 28f0 hw2_c.elf ``` The size of the `-O3` compiled code is larger then the `-O2` compiled code. ### -Os From the output below, we can see the compiled code size is larger then when compiled using the `-O2` option. ``` text data bss dec hex filename 6866 1404 840 9110 2396 hw2_c.elf ``` ### -Ofast When using `-Ofast` optimization, the same issue when I use `-O3` optimization appears. While inspecting the output of the `objdump`, I find that the compiler eliminate all function calls and loops, and make all code go into the main function with loop unrolling. And below is the size of the compiled code when using `-Ofast`: ``` text data bss dec hex filename 8236 1404 840 10480 28f0 hw2_c.elf ``` ## Performance Measurement ### Add Performance Counter I use the code in [perfcounter](https://github.com/sysprog21/rv32emu/tree/master/tests/perfcounter) to get the execution clock cycle of the program. The code utilizes the CSR register, details of CSR register can be found in chapter 10 of [The RISC-V Instruction Set Manual](https://drive.google.com/file/d/1s0lZxUZaa7eV_O0_WsZzaurFLLww7ou5/view) For simplicity, I only measure the clock cycles that function `zbytel` and `clz` takes. #### Assembly Code First I add the clock cycle measurement fuction in the assembly version. To add the feature, I change the assembler flags ``` ASFLAGS = -march=rv32i -mabi=ilp32 ``` to the flags below: ``` ASFLAGS = -march=rv32i_zicsr_zifencei -mabi=ilp32 ``` And link `getcycles.S` and `getinstret.S` into my program: ``` riscv-none-elf-ld -o hw2_asm.elf -T hw2.ld --oformat=elf32-littleriscv getcycles.o getinstret.o hw2_asm.o ``` To enable the program to print out the clock cycles, I implement a function to print out the value in a register. Because `rv32emu` only supports syscall that prints out a string, it takes me some time to implement the code. The output result is: ``` The Leftmost 0-byte is 5 Cycle count: 0x000000000000009D Instructions retired: 0x000000000000009F The Leftmost 0-byte is 8 Cycle count: 0x000000000000009D Instructions retired: 0x000000000000009F The Leftmost 0-byte is 1 Cycle count: 0x000000000000009D Instructions retired: 0x000000000000009F inferior exit code 0 ``` >0x9D = 157 >0x9F = 159 I print the cycle count in hex because it takes less code to implement. #### C Code To test the `-O3` and `-Ofast`, I change the printing code from asm inline back to calling `printf()` directly. Since we are measuring the section that does not print out data, this modification does not affect performance measured. The code below is the main function after modification: ```clike= int main( int argc, char *argv[ ] ){ uint16_t zbl; for (int i = 0; i < 3; i++) { uint64_t oldcount = get_cycles(); uint64_t oldinstret = get_instret(); zbl = zbytel(a[i]); uint64_t cyclecount = get_cycles() - oldcount; uint64_t instret = get_instret() - oldinstret; printf("The Leftmost 0-byte is "); printf("%d\n",zbl); printf("Cycle count: %u\n", (unsigned int) cyclecount); printf("Instructions retired: %u\n\n", (unsigned) instret); } return 0; } ``` And after bringing back the `printf()` function, I can use the `-O3` optimization without error. I think it is because the optimizer knows what the printf function is doing, so it can optimize the code properly. ### Comapre the Clock Cycles of Different Optimizations The table below shows the result of the measurement: | version | cycle count | instret | |---------|-------------|---------| |handwritten|157|159| |-O0|371|381| |-O1|134|134| |-O2|122|122| |-O3|107|107| |-Os|135|135| |-Ofast|107|107| ## Assembly Code Optimization ### Reduce Function Calls To improve the handwritten version of the assembly code, I tried to modify the `zbytel` function. First I eliminate the function call to `clz` function by insert the code of `clz` directly into `zbytel`: ```clike= zbytel: addi sp,sp,-28 #push sw ra,0(sp) sw s0,4(sp) sw s1,8(sp) sw s2,12(sp) sw s3,16(sp) sw s4,20(sp) sw s5,24(sp) mv s0,a0 #s0:test_half_right mv s1,a1 #s1:test_half_left #y = (x & 0x7F7F7F7F7F7F7F7F)+ 0x7F7F7F7F7F7F7F7F li t0,0x7f7f7f7f and s2,s0,t0 add s2,s2,t0 #y = ~(y | x |0x7F7F7F7F7F7F7F7F) or s2,s2,s0 or s2,s2,t0 xori s2,s2,-1 #s2:y_half_right and s3,s1,t0 add s3,s3,t0 or s3,s3,s0 or s3,s3,t0 xori s3,s3,-1 #s3:y_half_left mv a0,s2 mv a1,s3 # start of clz #x |= (x >> 1) andi t1,a1,0x1 srli s4,a1,1 srli s5,a0,1 slli t1,t1,31 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 2) andi t1,a1,0x3 srli s4,a1,2 srli s5,a0,2 slli t1,t1,30 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 ...code truncated #x += (x >> 32) mv s5,a1 and s4,a1,x0 add a1,a1,s4 add a0,a0,s5 #64 - (x & 0x7f) andi a0,a0,0x7f li t1,64 sub a0,t1,a0 # end of clz lw ra,0(sp) lw s0,4(sp) lw s1,8(sp) lw s2,12(sp) lw s3,16(sp) lw s4,20(sp) lw s5,24(sp) addi sp,sp,28 #pop srli a0,a0,3 #clz(y)>>3 jr ra ``` And the output after modification: ``` The Leftmost 0-byte is 5 Cycle count: 0x0000000000000099 Instructions retired: 0x000000000000009B The Leftmost 0-byte is 8 Cycle count: 0x0000000000000099 Instructions retired: 0x000000000000009B The Leftmost 0-byte is 1 Cycle count: 0x0000000000000099 Instructions retired: 0x000000000000009B inferior exit code 0 ``` >0x99 = 153 >0x9B = 155 Which reduce 4 cycle counts. ### Reduce Stack Usage Take a look at the code, we can see that the register used by `zbytel` are almost saved register, and this makes the function needs to push/pop a lot of registers at the start/end of the function. And this is the modified version of the code: ```clike= zbytel: addi sp,sp,-4 #push sw ra,0(sp) #y = (x & 0x7F7F7F7F7F7F7F7F)+ 0x7F7F7F7F7F7F7F7F li t0,0x7f7f7f7f and t2,a0,t0 add t2,t2,t0 #y = ~(y | x |0x7F7F7F7F7F7F7F7F) or t2,t2,a0 or t2,t2,t0 xori t2,t2,-1 #t2:y_half_right and t3,a1,t0 add t3,t3,t0 or t3,t3,a0 or t3,t3,t0 xori t3,t3,-1 #t3:y_half_left mv a0,t2 mv a1,t3 # start of clz #x |= (x >> 1) andi t1,a1,0x1 srli t4,a1,1 srli t5,a0,1 slli t1,t1,31 or t5,t5,t1 or a1,t4,a1 or a0,t5,a0 #x |= (x >> 2) andi t1,a1,0x3 srli t4,a1,2 srli t5,a0,2 slli t1,t1,30 or t5,t5,t1 or a1,t4,a1 or a0,t5,a0 ...code truncated #x += (x >> 32) mv t5,a1 and t4,a1,x0 add a1,a1,t4 add a0,a0,t5 #64 - (x & 0x7f) andi a0,a0,0x7f li t1,64 sub a0,t1,a0 # end of clz lw ra,0(sp) addi sp,sp,4 #pop srli a0,a0,3 #clz(y)>>3 jr ra ``` Compared to the previous version, The `zbytel` function only pushes `ra` onto the stack. And this is the output: ``` The Leftmost 0-byte is 5 Cycle count: 0x000000000000008B Instructions retired: 0x000000000000008D The Leftmost 0-byte is 8 Cycle count: 0x000000000000008B Instructions retired: 0x000000000000008D The Leftmost 0-byte is 1 Cycle count: 0x000000000000008B Instructions retired: 0x000000000000008D inferior exit code 0 ``` >0x8B = 139 >0x8D = 141 This optimization reduces 14 cycles compared to the previous version.