# Assignment2: GNU Toolchain contributed by < [Daniel-0224](https://github.com/Daniel-0224) > ## Problem selet I choose the problem ***Output different number with same binary format by using counting leading zeros*** from classmate, [陳金諄](https://github.com/david965154/Computer_Architecture_hw1). I believe I can enhance his assembly code. This classmate relies heavily on loops and branches to implement clz, which consumes a significant number of clock cycles. ### The origin code ```c #include <stdint.h> #include <stdio.h> uint16_t count_leading_zeros(uint32_t x) { x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); /* count ones (population count) */ x -= ((x >> 1) & 0x55555555 ); x = ((x >> 2) & 0x33333333) + (x & 0x33333333 ); x = ((x >> 4) + x) & 0x0f0f0f0f; x += (x >> 8); x += (x >> 16); return (32 - (x & 0x7f)); } int main(){ uint32_t x,y; scanf("%u %u",&x, &y); x = count_leading_zeros(x); y = count_leading_zeros(y); printf("%u",(x>y)?(x-y):(y-x)); return 0; } ``` ## Compile and excecute the code ### Performance measurement Here, I use "getcycles" to calculate the total program's cycle count. * This's the code after modifying: ```c #include <stdint.h> #include <stdio.h> extern uint64_t get_cycles(); uint16_t count_leading_zeros(uint32_t x) { x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); /* count ones (population count) */ x -= ((x >> 1) & 0x55555555 ); x = ((x >> 2) & 0x33333333) + (x & 0x33333333 ); x = ((x >> 4) + x) & 0x0f0f0f0f; x += (x >> 8); x += (x >> 16); return (32 - (x & 0x7f)); } int main(){ uint64_t oldcount = get_cycles(); uint32_t x1,y1,x2,y2,x3,y3; x1 = 16, y1 = 12, x2 = 25, y2 = 800, x3 = 956, y3 = 85; x1 = count_leading_zeros(x1); y1 = count_leading_zeros(y1); printf("%lu\n",(x1>y1)?(x1-y1):(y1-x1)); x2 = count_leading_zeros(x2); y2 = count_leading_zeros(y2); printf("%lu\n",(x2>y2)?(x2-y2):(y2-x2)); x3 = count_leading_zeros(x3); y3 = count_leading_zeros(y3); printf("%lu\n",(x3>y3)?(x3-y3):(y3-x3)); uint64_t cyclecount = get_cycles() - oldcount; printf("cycle count: %u\n", (unsigned int) cyclecount); return 0; } ``` I used an external function, "get_cycles()," which requires me to link "main.o" and "getcycles.o" during the GCC compilation. * This's the instruction: ```c $ riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -Wall -c -o getcycles.o getcycles.s $ riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -Wall -S -o main.s main.c $ riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -Wall -c -o main.o main.s $ riscv-none-elf-gcc -o main.elf getcycles.o main.o ``` * Result ```c tseng@tseng-virtual-machine:~/rv32emu/tests/hw2$ ~/rv32emu/build/rv32emu main.elf 1 5 3 cycle count: 3477 inferior exit code 0 ``` * Program size ```c tseng@tseng-virtual-machine:~/rv32emu/tests/hw2$ riscv-none-elf-size main.elf text data bss dec hex filename 51732 1876 1528 55136 d760 main.elf ``` ## Optimal Assembly Code ### -O1 Optimized Assembly Code * Result ```c tseng@tseng-virtual-machine:~/rv32emu/tests/hw2$ ~/rv32emu/build/rv32emu main1.elf 1 5 3 cycle count: 2939 inferior exit code 0 ``` * Program size ```c tseng@tseng-virtual-machine:~/rv32emu/tests/hw2$ riscv-none-elf-size main1.elf text data bss dec hex filename 51228 1876 1528 54632 d568 main1.elf ``` ### -O2 Optimized Assembly Code ```c tseng@tseng-virtual-machine:~/rv32emu/tests/hw2$ ~/rv32emu/build/rv32emu main2.elf 1 5 3 cycle count: 2939 inferior exit code 0 ``` * Program size ```c tseng@tseng-virtual-machine:~/rv32emu/tests/hw2$ riscv-none-elf-size main2.elf text data bss dec hex filename 51228 1876 1528 54632 d568 main2.elf ``` ### -O3 Optimized Assembly Code ```c tseng@tseng-virtual-machine:~/rv32emu/tests/hw2$ ~/rv32emu/build/rv32emu main3.elf 1 5 3 cycle count: 2939 inferior exit code 0 ``` * Program size ```c tseng@tseng-virtual-machine:~/rv32emu/tests/hw2$ riscv-none-elf-size main3.elf text data bss dec hex filename 51228 1876 1528 54632 d568 main3.elf ``` ### -Ofast Optimized Assembly Code ```c tseng@tseng-virtual-machine:~/rv32emu/tests/hw2$ ~/rv32emu/build/rv32emu main_fast.elf 1 5 3 cycle count: 2939 inferior exit code 0 ``` * Program size ```c tseng@tseng-virtual-machine:~/rv32emu/tests/hw2$ riscv-none-elf-size main_fast.elf text data bss dec hex filename 51228 1876 1528 54632 d568 main_fast.elf ``` ## Modified the handwrite assembly code In classmate's code, he relies heavily on loops and branches to implement clz. So, I rewrite this part to improve performance. * The original code: ```c func: addi t1, t1, 1 loop: sra t2, a0, t1 or a0, a0, t2 #if t1 == 16 co addi t3, t1, -16 beq t3, x0, co #t1 = t1*2 slli t1, t1, 1 jal x0, loop co: #x -= ((x >> 1) & 0x55555555 ); srai t1, a0, 1 lui t4, 0x55555 ori t4, t4, 0x555 and t1, t1, t4 sub a0, a0, t1 #x = ((x >> 2) & 0x33333333) + (x & 0x33333333 ); srai t1, a0, 2 lui t4, 0x33333 ori t4, t4, 0x333 and t1, t1, t4 and a0, a0, t4 add a0, a0, t1 # x = ((x >> 4) + x) & 0x0f0f0f0f; srai t1, a0, 4 add a0, a0, t1 lui t4, 0x0f0f0 ori t4, t4, 0x787 addi t4, t4, 0x788 and a0, a0, t4 #x += (x >> 8); srai t1, a0, 8 add a0, a0, t1 #x += (x >> 16); srai t1, a0, 16 add a0, a0, t1 #return (32 - (x & 0x7f)); andi a0, a0, 0x7f addi t3, t3, 32 sub a0, t3, a0 ret ``` * My rewrite: Even though you have a fullybypass, every time you take branch, a "nop" is needed.Reducing the use of loops and branches has resulted in a decrease in the number of "nop" instructions. ```c count_leading_zeros: srli t0, a0, 1 # t0: x >> 1 or a0, a0, t0 # a0: x |= (x >> 1) srli t0, a0, 2 # t0: x >> 2 or a0, a0, t0 srli t0, a0, 4 # t0: x >> 4 or a0, a0, t0 srli t0, a0, 8 # t0: x >> 8 or a0, a0, t0 srli t0, a0, 16 # t0: x >> 16 or a0, a0, t0 count_ones: srai t0, a0, 1 # t0: x >> 1 lui t1, 0x55555 # t1: 0x55555555 ori t1, t1, 0x555 and t0, t0, t1 # t0: (x >> 1) & 0x55555555 sub a0, a0, t0 srai t0, a0, 2 # t0: x >> 2 lui t1, 0x33333 # t1: 0x33333333 ori t1, t1, 0x333 and t0, t0, t1 # t0: (x >> 2) & 0x33333333 and t4, a0, t1 # t4: x & 0x33333333 add a0, t0, t4 srai t0, a0, 4 # t0: x >> 4 add t0, t0, a0 # t0: (x >> 4) + x lui t1, 0x0f0f0 # t1: 0x0f0f0f0f ori t1, t1, 0x787 addi t1, t1, 0x788 and a0, t0, t1 # a0: ((x >> 4) + x) & 0x0f0f0f0f srai t0, a0, 8 # t0: x >> 8 add a0, a0, t0 srai t0, a0, 16 # t0: x >> 16 add a0, a0, t0 andi t0, a0, 0x7f # t0: x & 0x7f addi t1, x0, 32 # t1: 32 sub a0, t1, t0 # a0: return (32 - (x & 0x7f)) ret ``` * Berfor rewrite ![](https://hackmd.io/_uploads/HJoNdupfp.png) * After rewrite ![](https://hackmd.io/_uploads/r1VmOu6z6.png) ## Conclusion We can clearly see that I have significantly reduced both the number of instructions and the cycle count by rewriting the handwrite assembly code. | | original | -O1 | -O2 | -O3 | -Ofast | | ----- |:--------:| ---- | ---- | ---- |:------:| | cycle | 3477 | 2939 | 2939 | 2939 | 2939 |