Assignment1: RISC-V Assembly and Instruction Pipeline

# Assignment1: RISC-V Assembly and Instruction Pipeline # Problem_B ## uf8 Encoding and Decoding with Optimized CLZ Method uf8 implements a logarithmic 8-bit codec that maps 20-bit unsigned integers to 8-bit symbols via logarithmic quantization, delivering 2.5:1 compression and ≤6.25% relative error.Below is the formula of uf8 encoding and decoding. ![image](https://hackmd.io/_uploads/r1TkVby6el.png) ## Implementation and Optimization In this part, I translate the C code below into a complete RISC-V program and optimize the UF8 encoding using a loopless binary-search CLZ. I demonstrate that the loopless CLZ method outperforms the branchless CLZ method in terms of runtime performance. Moreover, I show that the loopless CLZ method reduces memory usage by 20% compared with the compiler-generated RISC-V code. You can find the source code [here](https://github.com/mephen/2025-arch-hw1). ### C code :::spoiler Open ```c= #include <stdbool.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> typedef uint8_t uf8; static inline unsigned clz(uint32_t x) //count leading zeros { //假設一開始有 32 個 leading zeros, 每次測試一半（binary search 的思路） int n = 32, c = 16; do { uint32_t y = x >> c; //把數字右移 c 位 if (y) { //檢查高位是否有非零 bit，y!=0 代表最高位落在中線左邊 n -= c; //至少有 c bit 不是 leading zero x = y; //縮小範圍繼續檢查 } c >>= 1; //c 除以 2，繼續二分搜尋 } while (c); return n - x; //最後一輪 x 會縮到 1，n-x 最大 31 (最高位在 bit 0) 最小 0 (最高位在 bit 31) } /* Decode uf8 to uint32_t */ uint32_t uf8_decode(uf8 fl) { uint32_t mantissa = fl & 0x0f; uint8_t exponent = fl >> 4; uint32_t offset = (0x7FFF >> (15 - exponent)) << 4; //(2^e - 1)*16 return (mantissa << exponent) + offset; //m*2^e + (2^e - 1)*16 } /* Encode uint32_t to uf8 */ uf8 uf8_encode(uint32_t value) { /* Use CLZ for fast exponent calculation */ if (value < 16) return value; /* Find appropriate exponent using CLZ hint */ int lz = clz(value); int msb = 31 - lz; /* Start from a good initial guess */ uint8_t exponent = 0; uint32_t overflow = 0; if (msb >= 5) { exponent = msb - 4; // 經驗公式：exponent ≈ msb - 4 if (exponent > 15) exponent = 15; /* Calculate overflow for estimated exponent */ for (uint8_t e = 0; e < exponent; e++) //overflow是offset，求(2^e - 1)*16 overflow = (overflow << 1) + 16; /* Adjust if estimate was off：向下修正 (如果估計太大)*/ while (exponent > 0 && value < overflow) { overflow = (overflow - 16) >> 1; exponent--; } } /* Find exact exponent：向上修正 (如果估計太小) */ while (exponent < 15) { uint32_t next_overflow = (overflow << 1) + 16; if (value < next_overflow) break; overflow = next_overflow; exponent++; } uint8_t mantissa = (value - overflow) >> exponent;//(value - offset(e))/(2^e) return (exponent << 4) | mantissa; } /* Test encode/decode round-trip */ static bool test(void) { int32_t previous_value = -1; bool passed = true; for (int i = 0; i < 256; i++) { printf("test data: %d\n",i); uint8_t fl = i; int32_t value = uf8_decode(fl); uint8_t fl2 = uf8_encode(value); if (fl != fl2) { printf("%02x: produces value %d but encodes back to %02x\n", fl, value, fl2); passed = false; } if (value <= previous_value) { printf("%02x: value %d <= previous_value %d\n", fl, value, previous_value); passed = false; } previous_value = value; } return passed; } int main(void) { if (test()) { printf("All tests passed.\n"); return 0; } return 1; } ``` ::: ### Assembly code In this part, I present 3 versions of the UF8 RISC-V program and compare them in terms of code size and runtime performance. - UF8 Decoding + Loopless Binary-Search CLZ–Optimized Encoding with Round-Trip Tests - UF8 Decoding + BranchLess Binary-Search CLZ–Optimized Encoding with Round-Trip Tests - Compiler Generated UF8 Decoding + Encoding #### UF8 Decoding + Loopless Binary-Search CLZ–Optimized Encoding with Round-Trip Tests - code lines: 241 - cpu cycles: 45903 :::spoiler Open ```asm= .text _start: jal ra, main # jump to main halt: li a7, 10 ecall # if main returns, exit clz: # uint32_t clz(uint32_t x) beqz a0, clz_zero # if x==0, return 32 li t0, 32 # n = 32 srli t2, a0, 16 # y = x >> 16 beqz t2, clz_skip16 # if y==0, go search right half addi t0, t0, -16 # n -= 16 mv a0, t2 # x = y clz_skip16: srli t2, a0, 8 # y = x >> 8 beqz t2, clz_skip8 # if y==0, go search right half addi t0, t0, -8 # n -= 8 mv a0, t2 # x = y clz_skip8: srli t2, a0, 4 beqz t2, clz_skip4 addi t0, t0, -4 mv a0, t2 clz_skip4: srli t2, a0, 2 beqz t2, clz_skip2 addi t0, t0, -2 mv a0, t2 clz_skip2: srli t2, a0, 1 beqz t2, clz_last addi t0, t0, -1 mv a0, t2 clz_last: sub a0, t0, a0 # x = n - x ret # return x clz_zero: li a0, 32 # x = 32 ret # return x uf8_decode: # uint32_t uf8_decode(uint8_t fl) andi t0, a0, 0x0F # m = fl & 0x0f srli t1, a0, 4 # e = fl >> 4 li t2, 1 sll t2, t2, t1 addi t2, t2, -1 slli t2, t2, 4 # offset = ((1<<e)-1) << 4 sll t0, t0, t1 add a0, t0, t2 # fl = (m << e) + offset ret uf8_encode: # uint8_t uf8_encode(uint32_t value) addi sp, sp, -16 sw ra, 12(sp) # store ra (because we call clz) sw s0, 8(sp) # store callee-saved s0 (exponent) sw s1, 4(sp) # store callee-saved s1 (overflow) # if (value < 16) return value; sltiu t0, a0, 16 beqz t0, enCode_normalVal # if (!(value<16)), goto enCode_normalVal j enCode_ret # goto enCode_ret enCode_normalVal: mv t5, a0 # v = value jal ra, clz # lz = clz(value) li t0, 31 sub t1, t0, a0 # msb = 31 - lz mv a0, t5 # value = v li s0, 0 # exponent = 0 li s1, 0 # overflow = 0 sltiu t2, t1, 5 # if (msb >= 5) { exponent = msb - 4; if (exponent>15) exponent=15; } bnez t2, enCode_find_up addi s0, t1, -4 sltiu t3, s0, 16 bnez t3, enCode_initGuess_overflow_ok # if (exponent < 16) goto enCode_initGuess_overflow_ok; li s0, 15 # exponent = 15; enCode_initGuess_overflow_ok: # overflow = ((1<<e)-1)*16 li s1, 0 # overflow = 0; li t4, 0 # e = 0; enCode_overflow_loop: bge t4, s0, enCode_adjust_down # if (e >= exponent) goto enCode_adjust_down; slli s1, s1, 1 addi s1, s1, 16 addi t4, t4, 1 # overflow = (overflow<<1) + 16; e++; j enCode_overflow_loop enCode_adjust_down: beqz s0, enCode_find_up # if (exponent==0) goto enCode_find_up; sltu t4, a0, s1 beqz t4, enCode_find_up # if (value >= overflow) goto enCode_find_up; addi s1, s1, -16 srli s1, s1, 1 # overflow = (overflow - 16) >> 1; addi s0, s0, -1 # exponent--; j enCode_adjust_down enCode_find_up: li t4, 15 # upper limit 15 enCode_up_loop: bge s0, t4, enCode_up_done # if (exponent >= 15) break; slli t1, s1, 1 addi t1, t1, 16 # next_overflow = (overflow << 1) + 16; sltu t2, a0, t1 bnez t2, enCode_up_done # if (value < next_overflow), goto enCode_up_done; mv s1, t1 # overflow = next_overflow; addi s0, s0, 1 # exponent++; j enCode_up_loop enCode_up_done: sub t0, a0, s1 # num = value - overflow; srl t0, t0, s0 # mantissa = num >> exponent; slli t1, s0, 4 andi t0, t0, 0x0F # mantissa &= 0x0F; or a0, t1, t0 # a0 = (exponent<<4) | mantissa; andi a0, a0, 0xFF # mask return value to 8 bits enCode_ret: lw ra, 12(sp) # restore ra lw s0, 8(sp) # restore s0 lw s1, 4(sp) # restore s1 addi sp, sp, 16 # deallocate stack frame ret # return test: # bool test(void) addi sp, sp, -40 sw ra, 36(sp) # save return address sw s0, 32(sp) # save s0 (previous_value) sw s1, 28(sp) # save s1 (passed) sw s2, 24(sp) # save s2 (i) sw s3, 20(sp) # save s3 (value) sw t0, 16(sp) sw t1, 12(sp) sw t2, 8(sp) sw t3, 4(sp) li s0, -1 # previous_value = -1 li s1, 1 # passed = true li s2, 0 # i = 0 t_loop: li t3, 256 # loop upper bound = 256 bge s2, t3, t_done # if (i >= 256) break la a0, msg_test # "test data: " li a7, 4 # syscall: print string ecall # perform syscall mv a0, s2 # a0 = i li a7, 1 # syscall: print integer ecall # perform syscall la a0, msg_nl # "\n" li a7, 4 # syscall: print string ecall # perform syscall mv a0, s2 # a0 = fl = i jal ra, uf8_decode # call uf8_decode(fl) mv s3, a0 # value = return # fl2 = uf8_encode(value) mv a0, s3 # a0 = value jal ra, uf8_encode # call uf8_encode(value) mv t1, a0 # fl2 = return bne s2, t1, t_fail_flag # compare original fl vs re-encoded fl2 slt t2, s0, s3 # if strictly increasing, skip error bnez t2, t_set_prev # if (value <= previous_value) report non-increasing t_bad_inc: la a0, msg_noninc_a # print prefix for non-increasing message li a7, 4 ecall mv a0, s2 # print fl (as integer) li a7, 1 ecall la a0, msg_noninc_b # print " value " li a7, 4 ecall mv a0, s3 # print value li a7, 1 ecall la a0, msg_noninc_c # print " <= previous_value " li a7, 4 ecall mv a0, s0 # print previous_value li a7, 1 ecall la a0, msg_nl # newline li a7, 4 ecall li s1, 0 # passed = false j t_set_prev # proceed to update previous_value t_fail_flag: la a0, msg_mismatch_a # print mismatch prefix li a7, 4 ecall mv a0, s2 # print fl li a7, 1 ecall la a0, msg_mismatch_b # print ": produces value " li a7, 4 ecall mv a0, s3 # print value li a7, 1 ecall la a0, msg_mismatch_c # print " but encodes back to " li a7, 4 ecall mv a0, t1 # print fl2 li a7, 1 ecall la a0, msg_nl # newline li a7, 4 ecall li s1, 0 # passed = false t_set_prev: mv s0, s3 # previous_value = value addi s2, s2, 1 # i++ j t_loop # next iteration t_done: mv a0, s1 # return value = passed lw ra, 36(sp) # restore ra lw s0, 32(sp) # restore s0 lw s1, 28(sp) # restore s1 lw s2, 24(sp) # restore s2 lw s3, 20(sp) # restore s3 lw t0, 16(sp) # restore t0 lw t1, 12(sp) # restore t1 lw t2, 8(sp) # restore t2 lw t3, 4(sp) # restore t3 addi sp, sp, 40 # pop stack frame ret # return to caller main: # int main(void) addi sp, sp, -16 sw ra, 12(sp) jal ra, test beqz a0, print_fail la a0, msg_ok # print "All tests passed.\n" li a7, 4 ecall j main_exit print_fail: la a0, msg_fail li a7, 4 ecall main_exit: li a7, 10 ecall .data # data section .align 4 msg_ok: .asciz "All tests passed.\n" msg_fail: .asciz "Failed.\n" msg_test: .asciz "test data: " msg_nl: .asciz "\n" msg_mismatch_a: .asciz "mismatch: fl= " msg_mismatch_b: .asciz " value= " msg_mismatch_c: .asciz " fl2= " msg_noninc_a: .asciz "non-increasing: fl= " msg_noninc_b: .asciz " value= " msg_noninc_c: .asciz " prev= " ``` ::: #### UF8 Decoding + Branchless Binary-Search CLZ–Optimized Encoding with Round-Trip Tests - code lines: 237 - cpu cycles: 47103 :::spoiler open ```asm= .text _start: jal ra, main # jump to main halt: li a7, 10 ecall # if main returns, exit clz_branchless: # uint32_t clz_branchless(uint32_t x) li t0, 32 # n = 32 mv t1, a0 # t1 = x srli t2, t1, 16 # y = x >> 16 sltu t3, x0, t2 # b = (y!=0) ? 1 : 0 slli t4, t3, 4 # s = b*16 srl t1, t1, t4 # x >>= s sub t0, t0, t4 # n -= s srli t2, t1, 8 sltu t3, x0, t2 slli t4, t3, 3 # s = b*8 srl t1, t1, t4 sub t0, t0, t4 srli t2, t1, 4 sltu t3, x0, t2 slli t4, t3, 2 # s = b*4 srl t1, t1, t4 sub t0, t0, t4 srli t2, t1, 2 sltu t3, x0, t2 slli t4, t3, 1 # s = b*2 srl t1, t1, t4 sub t0, t0, t4 srli t2, t1, 1 sltu t3, x0, t2 # b = (x>>1)!=0 mv t4, t3 # s = b*1 srl t1, t1, t4 sub t0, t0, t4 sub a0, t0, t1 # return n - x ret uf8_decode: # uint32_t uf8_decode(uint8_t fl) andi t0, a0, 0x0F # m = fl & 0x0f srli t1, a0, 4 # e = fl >> 4 li t2, 1 sll t2, t2, t1 addi t2, t2, -1 slli t2, t2, 4 # offset = ((1<<e)-1) << 4 sll t0, t0, t1 add a0, t0, t2 # fl = (m << e) + offset ret uf8_encode: # uint8_t uf8_encode(uint32_t value) addi sp, sp, -16 sw ra, 12(sp) # store ra (because we call clz_branchless) sw s0, 8(sp) # store callee-saved s0 (exponent) sw s1, 4(sp) # store callee-saved s1 (overflow) sltiu t0, a0, 16 # if (value < 16) return value; beqz t0, enCode_normalVal # if (!(value<16)), goto enCode_normalVal j enCode_ret # goto enCode_ret enCode_normalVal: mv t5, a0 # v = value jal ra, clz_branchless # lz = clz_branchless(value) li t0, 31 sub t1, t0, a0 # msb = 31 - lz mv a0, t5 # value = v li s0, 0 # exponent = 0 li s1, 0 # overflow = 0 sltiu t2, t1, 5 # if (msb >= 5) { exponent = msb - 4; if (exponent>15) exponent=15; } bnez t2, enCode_find_up addi s0, t1, -4 sltiu t3, s0, 16 bnez t3, enCode_initGuess_overflow_ok # if (exponent < 16) goto enCode_initGuess_overflow_ok; li s0, 15 # exponent = 15; enCode_initGuess_overflow_ok: # overflow = ((1<<e)-1)*16 li s1, 0 # overflow = 0; li t4, 0 # e = 0; enCode_overflow_loop: bge t4, s0, enCode_adjust_down # if (e >= exponent) goto enCode_adjust_down; slli s1, s1, 1 addi s1, s1, 16 addi t4, t4, 1 # overflow = (overflow<<1) + 16; e++; j enCode_overflow_loop enCode_adjust_down: beqz s0, enCode_find_up # if (exponent==0) goto enCode_find_up; sltu t4, a0, s1 beqz t4, enCode_find_up # if (value >= overflow) goto enCode_find_up; addi s1, s1, -16 srli s1, s1, 1 # overflow = (overflow - 16) >> 1; addi s0, s0, -1 # exponent--; j enCode_adjust_down enCode_find_up: li t4, 15 # upper limit 15 enCode_up_loop: bge s0, t4, enCode_up_done # if (exponent >= 15) break; slli t1, s1, 1 addi t1, t1, 16 # next_overflow = (overflow << 1) + 16; sltu t2, a0, t1 bnez t2, enCode_up_done # if (value < next_overflow), goto enCode_up_done; mv s1, t1 # overflow = next_overflow; addi s0, s0, 1 # exponent++; j enCode_up_loop enCode_up_done: sub t0, a0, s1 # num = value - overflow; srl t0, t0, s0 # mantissa = num >> exponent; slli t1, s0, 4 andi t0, t0, 0x0F # mantissa &= 0x0F; or a0, t1, t0 # a0 = (exponent<<4) | mantissa; andi a0, a0, 0xFF # mask return value to 8 bits enCode_ret: lw ra, 12(sp) # restore ra lw s0, 8(sp) # restore s0 lw s1, 4(sp) # restore s1 addi sp, sp, 16 # deallocate stack frame ret # return test: # static bool test(void) addi sp, sp, -40 sw ra, 36(sp) # save return address sw s0, 32(sp) # save s0 (previous_value) sw s1, 28(sp) # save s1 (passed) sw s2, 24(sp) # save s2 (i) sw s3, 20(sp) # save s3 (value) sw t0, 16(sp) sw t1, 12(sp) sw t2, 8(sp) sw t3, 4(sp) li s0, -1 # previous_value = -1 li s1, 1 # passed = true li s2, 0 # i = 0 t_loop: li t3, 256 # loop upper bound = 256 bge s2, t3, t_done # if (i >= 256) break la a0, msg_test # "test data: " li a7, 4 # syscall: print string ecall # perform syscall mv a0, s2 # a0 = i li a7, 1 # syscall: print integer ecall # perform syscall la a0, msg_nl # "\n" li a7, 4 # syscall: print string ecall # perform syscall mv a0, s2 # a0 = fl = i jal ra, uf8_decode # call uf8_decode(fl) mv s3, a0 # value = return mv a0, s3 # a0 = value jal ra, uf8_encode # call uf8_encode(value) mv t1, a0 # fl2 = return bne s2, t1, t_fail_flag # if (fl != fl2) goto t_fail_flag slt t2, s0, s3 bnez t2, t_set_prev # if (value <= previous_value) report non-increasing t_bad_inc: la a0, msg_noninc_a # print prefix for non-increasing message li a7, 4 ecall mv a0, s2 # print fl (as integer) li a7, 1 ecall la a0, msg_noninc_b # print " value " li a7, 4 ecall mv a0, s3 # print value li a7, 1 ecall la a0, msg_noninc_c # print " <= previous_value " li a7, 4 ecall mv a0, s0 # print previous_value li a7, 1 ecall la a0, msg_nl # newline li a7, 4 ecall li s1, 0 # passed = false j t_set_prev # proceed to update previous_value t_fail_flag: la a0, msg_mismatch_a # print mismatch prefix li a7, 4 ecall mv a0, s2 # print fl li a7, 1 ecall la a0, msg_mismatch_b # print ": produces value " li a7, 4 ecall mv a0, s3 # print value li a7, 1 ecall la a0, msg_mismatch_c # print " but encodes back to " li a7, 4 ecall mv a0, t1 # print fl2 li a7, 1 ecall la a0, msg_nl # newline li a7, 4 ecall li s1, 0 # passed = false t_set_prev: mv s0, s3 # previous_value = value addi s2, s2, 1 # i++ j t_loop # next iteration t_done: mv a0, s1 # return value = passed lw ra, 36(sp) # restore ra lw s0, 32(sp) # restore s0 lw s1, 28(sp) # restore s1 lw s2, 24(sp) # restore s2 lw s3, 20(sp) # restore s3 lw t0, 16(sp) # restore t0 lw t1, 12(sp) # restore t1 lw t2, 8(sp) # restore t2 lw t3, 4(sp) # restore t3 addi sp, sp, 40 # pop stack frame ret # return to caller main: addi sp, sp, -16 sw ra, 12(sp) jal ra, test beqz a0, print_fail la a0, msg_ok # print "All tests passed.\n" li a7, 4 ecall j main_exit print_fail: la a0, msg_fail li a7, 4 ecall main_exit: li a7, 10 ecall # data section .data .align 4 msg_ok: .asciz "All tests passed.\n" msg_fail: .asciz "Failed.\n" msg_test: .asciz "test data: " msg_nl: .asciz "\n" msg_mismatch_a: .asciz "mismatch: fl= " msg_mismatch_b: .asciz " value= " msg_mismatch_c: .asciz " fl2= " msg_noninc_a: .asciz "non-increasing: fl= " msg_noninc_b: .asciz " value= " msg_noninc_c: .asciz " prev= " ``` ::: #### Compiler Generated UF8 Decoding + Encoding - code lines: 298 - cpu cycles: NaN (Ripes does not support some instructions used in the compiler-generated RISC-V program) :::spoiler open ```asm= .file "HW1.c" .option pic .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0" .attribute unaligned_access, 0 .attribute stack_align, 16 .text .align 1 .type uf8_encode.part.0, @function uf8_encode.part.0: .LFB44: .cfi_startproc mv a3,a0 li a4,5 li a5,16 li a1,32 .L3: srlw a2,a3,a5 addiw a4,a4,-1 beq a2,zero,.L2 subw a1,a1,a5 mv a3,a2 .L2: srai a5,a5,1 bne a4,zero,.L3 subw a3,a3,a1 addiw a3,a3,31 li a5,4 ble a3,a5,.L14 andi a3,a3,0xff addiw a2,a3,-4 andi a2,a2,0xff li a1,15 andi a5,a2,0xff bgtu a2,a1,.L30 li a1,4 beq a3,a1,.L15 .L34: li a3,0 .L7: addiw a3,a3,1 slliw a4,a4,1 andi a3,a3,0xff addiw a4,a4,16 bgtu a5,a3,.L7 bgtu a4,a0,.L10 j .L8 .L31: bleu a4,a0,.L6 .L10: addiw a5,a5,-1 addiw a4,a4,-16 andi a5,a5,0xff srliw a4,a4,1 bne a5,zero,.L31 .L6: slliw a3,a4,1 addiw a3,a3,16 j .L4 .L14: li a3,16 li a5,0 .L4: li a2,15 bgeu a0,a3,.L12 j .L32 .L13: bltu a0,a4,.L33 mv a3,a4 .L12: addiw a5,a5,1 slliw a4,a3,1 andi a5,a5,0xff addiw a4,a4,16 bne a5,a2,.L13 .L28: li a2,-16 li a5,15 .L11: subw a0,a0,a3 srlw a5,a0,a5 or a0,a2,a5 andi a0,a0,0xff ret .L30: li a1,4 li a5,15 bne a3,a1,.L34 j .L15 .L33: slliw a2,a5,4 subw a0,a0,a3 sext.w a5,a5 slliw a2,a2,24 sraiw a2,a2,24 srlw a5,a0,a5 or a0,a2,a5 andi a0,a0,0xff ret .L15: li a5,0 j .L6 .L8: li a3,14 bleu a2,a3,.L6 mv a3,a4 j .L28 .L32: slliw a2,a5,4 slliw a2,a2,24 sext.w a5,a5 sraiw a2,a2,24 mv a3,a4 j .L11 .cfi_endproc .LFE44: .size uf8_encode.part.0, .-uf8_encode.part.0 .align 1 .globl uf8_decode .type uf8_decode, @function uf8_decode: .LFB40: .cfi_startproc srli a5,a0,4 li a4,15 subw a4,a4,a5 mv a3,a5 li a5,32768 addiw a5,a5,-1 sraw a5,a5,a4 andi a0,a0,15 slliw a5,a5,4 sllw a0,a0,a3 addw a0,a5,a0 ret .cfi_endproc .LFE40: .size uf8_decode, .-uf8_decode .align 1 .globl uf8_encode .type uf8_encode, @function uf8_encode: .LFB41: .cfi_startproc li a4,15 bleu a0,a4,.L40 tail uf8_encode.part.0 .L40: andi a0,a0,0xff ret .cfi_endproc .LFE41: .size uf8_encode, .-uf8_encode .section .rodata.str1.8,"aMS",@progbits,1 .align 3 .LC0: .string "test data: %d\n" .align 3 .LC1: .string "%02x: produces value %d but encodes back to %02x\n" .align 3 .LC2: .string "%02x: value %d <= previous_value %d\n" .align 3 .LC3: .string "All tests passed." .section .text.startup,"ax",@progbits .align 1 .globl main .type main, @function main: .LFB43: .cfi_startproc addi sp,sp,-112 .cfi_def_cfa_offset 112 sd s0,96(sp) .cfi_offset 8, -16 li s0,32768 sd s1,88(sp) sd s2,80(sp) sd s3,72(sp) sd s4,64(sp) sd s5,56(sp) sd s6,48(sp) sd s7,40(sp) sd s8,32(sp) sd s9,24(sp) sd ra,104(sp) sd s10,16(sp) sd s11,8(sp) .cfi_offset 9, -24 .cfi_offset 18, -32 .cfi_offset 19, -40 .cfi_offset 20, -48 .cfi_offset 21, -56 .cfi_offset 22, -64 .cfi_offset 23, -72 .cfi_offset 24, -80 .cfi_offset 25, -88 .cfi_offset 1, -8 .cfi_offset 26, -96 .cfi_offset 27, -104 li s6,1 li s8,-1 li s9,0 lla s5,.LC0 li s4,15 addiw s0,s0,-1 li s3,15 lla s2,.LC1 lla s7,.LC2 li s1,256 .L46: mv a2,s9 mv a1,s5 li a0,2 call __printf_chk@plt andi s11,s9,0xff srliw a5,s11,4 subw a5,s4,a5 sraw a5,s0,a5 srli a3,s11,4 andi a4,s11,15 slliw a5,a5,4 sllw a4,a4,a3 mv s10,s8 addw s8,a5,a4 mv a0,s8 andi a5,s8,0xff bleu s8,s3,.L43 call uf8_encode.part.0 mv a5,a0 .L43: sext.w a4,a5 mv a3,s8 mv a2,s9 mv a1,s2 li a0,2 beq a5,s11,.L44 call __printf_chk@plt li s6,0 .L44: bge s10,s8,.L52 .L45: addiw s9,s9,1 bne s9,s1,.L46 li a0,1 bne s6,zero,.L53 .L47: ld ra,104(sp) .cfi_remember_state .cfi_restore 1 ld s0,96(sp) .cfi_restore 8 ld s1,88(sp) .cfi_restore 9 ld s2,80(sp) .cfi_restore 18 ld s3,72(sp) .cfi_restore 19 ld s4,64(sp) .cfi_restore 20 ld s5,56(sp) .cfi_restore 21 ld s6,48(sp) .cfi_restore 22 ld s7,40(sp) .cfi_restore 23 ld s8,32(sp) .cfi_restore 24 ld s9,24(sp) .cfi_restore 25 ld s10,16(sp) .cfi_restore 26 ld s11,8(sp) .cfi_restore 27 addi sp,sp,112 .cfi_def_cfa_offset 0 jr ra .L52: .cfi_restore_state mv a4,s10 mv a3,s8 mv a2,s9 mv a1,s7 li a0,2 call __printf_chk@plt li s6,0 j .L45 .L53: lla a0,.LC3 call puts@plt li a0,0 j .L47 .cfi_endproc .LFE43: .size main, .-main .ident "GCC: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0" .section .note.GNU-stack,"",@progbits ``` ::: The loopless CLZ method outperforms the branchless CLZ method in terms of runtime performance. Compared with the compiler-generated RISC-V code, it reduces memory usage by 20%. ## Explanations We test our code using [Ripes](https://github.com/mortbopet/Ripes) simulator. ### Program Functionality #### clz(uint32_t x) - Function: Implements a loopless binary-search algorithm to count the number of leading zeros in a 32-bit integer. - Process: The input x is repeatedly shifted right by halves (16, 8, 4, 2, 1). Whenever the shifted result is nonzero, the counter n is decreased accordingly. #### uf8_encode(uint32_t value) - Function: Encodes a 32-bit integer into an 8-bit UF8 format. - Process: 1. If value < 16, it is directly returned. 2. Otherwise, clz is called to find the position of the most significant bit (MSB). 3. The exponent and overflow are derived from the MSB position. 4. The mantissa is calculated as (value - offset) >> e. 5. The final 8-bit result is assembled as (e << 4) | (m & 0x0F). #### uf8_decode(uint8_t fl) - Function: Decodes an 8-bit UF8 value into a 32-bit integer. - Process: 1. The mantissa (m) and exponent (e) are extracted using bitwise operations. 2. The offset is computed as ((1 << e) - 1) << 4, and the decoded value becomes (m << e) + offset. #### Round-trip validation - Function: The round-trip test checks whether the encode–decode pair behaves consistently and monotonically. - Process: If encode(decode(x)) == x, it means every 8-bit UF8 pattern maps back to itself after decoding and re-encoding — no representation is lost or merged. ### 5-stage pipelined processor I choose the 5 stage processor to run this RISC-V program. Its block diagram look like this: ![image](https://hackmd.io/_uploads/B1td-zyTlx.png) The "5-stage" means this processor using five-stage pipeline to parallelize instructions. The stages are: 1. Instruction fetch (IF) 2. Instruction decode and register fetch (ID) 3. Execute (EX) 4. Memory access (MEM) 5. Register write back (WB) You can see that each stage is separated by pipeline registers (the rectangle block with stage names on its each side) in the block diagram. To keep the explanation concise, one representative instruction `srli t2, a0, 16` from clz function is analyzed to illustrate its execution across the processor’s five pipeline stages. > SRLI(shift right logical immediate) performs a logical right shift on register a0 by 16 bits and stores the result in register t2. The five figures below illustrate how this instruction moves through each pipeline stage. 1. Instruction fetch (IF) ![image](https://hackmd.io/_uploads/HJ1G9Mk6gl.png) - We start from the instruction located at address `0x00000014`. - The machine code of this instruction is `0x01055393`. - PC will increment by 4 automatically using the above adder. - Since no branch or jump occurs at this stage, the multiplexer selects the input from the adder as next instruction. 2. Instruction decode and register fetch (ID) ![image](https://hackmd.io/_uploads/ByXnWygpeg.png) - Instruction 0x01055393 is decoded into the following parts: - opcode = `SRLI` - Wr idx = `0x07` (rd x7) - R1 idx = `0x0A` (rs1 x10) - Imm. = `0x00000010` (shamt 16) - Because srli belongs to the I-type format, it only reads one source register (`R1 idx = x10`), while the second source register field (`R2 idx`) is unused. `Reg1` read value from `x10` register. - `Imm.` sign-extends the 12-bit immediate 0x10 to a 32-bit constant (0x00000010) and forwards it to the ID/EX pipeline register. - The current PC value (`0x00000014`) and next PC value (`0x00000018`) are carried through the pipeline for later use in the EX stage, but are not directly used here. 3. Execute (EX) ![image](https://hackmd.io/_uploads/BkKo9kgTxg.png) - The first-level multiplexers select the value from `Reg 1`(`0x00000010`) and the `Imm.` (`0x00000010`). These are forwarded to the ALU inputs as `Op1` and `Op2`. - The `Branch` also receives inputs from the registers, but since this instruction is not a branch type, the Branch taken signal remains low (0). - The ALU control unit identifies the operation as SRL (Shift Right Logical). Therefore, the ALU shifts the value from x10 right by 16 bits. - The result (`Res`) is then stored in the EX/MEM pipeline register for the next stage. - The current PC value (`0x00000014`), next PC (`0x00000018`), and destination register index (Wr idx = `0x07`) are passed through unchanged to maintain instruction context for later stages. 4. Memory access (MEM) ![image](https://hackmd.io/_uploads/rkHAsWeaxl.png) - In the MEM stage, the result from the ALU (Res) and control signals from the EX stage are forwarded through the EX/MEM pipeline register. - Because srli is an ALU-type instruction, it does not perform any memory read or write operation. - The ALU result (x10 >> 16) is passed directly from the `Res` output toward the MEM/WB register, ready to be written back in the next stage. - The destination register index (Wr idx = 0x07) is also propagated unchanged through this stage. 5. Register write back (WB) ![image](https://hackmd.io/_uploads/BJWU2zJale.png) - In the WB stage, the MEM/WB pipeline register outputs the final ALU result to the registers. - The write data input of the registers receives the ALU result (x10 >> 16). - The destination register is `x7` (Wr idx = `0x07`). # Problem_C ## < BF16 Simplified Arithmetic > <br /> Encoding, decoding, and basic operations with a simplified rounding scheme for smaller code size and acceptable precision The bfloat16 format (16-bit, from Google Brain) preserves float32’s dynamic range by keeping the same 8-bit exponent, but reduces precision to a 7-bit significand (vs. 23) ![upload_ce76a3a21cb769b4c935d0d6688886e1](https://hackmd.io/_uploads/r1IoAWdalx.png) The value 𝑣 of a BFloat16 number is calculated as: $$ v = (-1)^S \times 2^{E-127} \times \left(1 + \frac{M}{128}\right) $$ where: ![image](https://hackmd.io/_uploads/S1ux1fOTee.png) Special Cases: ![image](https://hackmd.io/_uploads/HkZG1zOage.png) ## Implementation and Optimization In this part, I translate the C code below into a complete RISC-V program. I leverage the strengths of the 5-stage processor to reduce memory usage compared to the compiler-generated version. You can find the source code [here](https://github.com/mephen/2025-arch-hw1). ### C code :::spoiler open ```C= #include <stdbool.h> #include <stdint.h> #include <stdio.h> #include <string.h> typedef struct { uint16_t bits; } bf16_t; #define BF16_SIGN_MASK 0x8000U #define BF16_EXP_MASK 0x7F80U #define BF16_MANT_MASK 0x007FU #define BF16_EXP_BIAS 127 #define BF16_NAN() ((bf16_t) {.bits = 0x7FC0}) #define BF16_ZERO() ((bf16_t) {.bits = 0x0000}) static inline bool bf16_isnan(bf16_t a) { return ((a.bits & BF16_EXP_MASK) == BF16_EXP_MASK) && (a.bits & BF16_MANT_MASK); } static inline bool bf16_isinf(bf16_t a) { return ((a.bits & BF16_EXP_MASK) == BF16_EXP_MASK) && !(a.bits & BF16_MANT_MASK); } static inline bool bf16_iszero(bf16_t a) { return !(a.bits & 0x7FFF); } static inline bf16_t f32_to_bf16(float val) { uint32_t f32bits; memcpy(&f32bits, &val, sizeof(float)); if (((f32bits >> 23) & 0xFF) == 0xFF) return (bf16_t) {.bits = (f32bits >> 16) & 0xFFFF}; f32bits += ((f32bits >> 16) & 1) + 0x7FFF; // round-to-nearest-even return (bf16_t) {.bits = f32bits >> 16}; } static inline float bf16_to_f32(bf16_t val) { uint32_t f32bits = ((uint32_t) val.bits) << 16; float result; memcpy(&result, &f32bits, sizeof(float)); return result; } // No guard/round/sticky bits, truncation rounding only static inline bf16_t bf16_add(bf16_t a, bf16_t b) { uint16_t sign_a = (a.bits >> 15) & 1; uint16_t sign_b = (b.bits >> 15) & 1; int16_t exp_a = ((a.bits >> 7) & 0xFF); int16_t exp_b = ((b.bits >> 7) & 0xFF); uint16_t mant_a = a.bits & 0x7F; uint16_t mant_b = b.bits & 0x7F; if (exp_a == 0xFF) { if (mant_a) return a; // a is NaN if (exp_b == 0xFF) // if mant_b != 0 → b is NaN → return NaN // if mant_b == 0 → b is Inf // if sign_a == sign_b → same sign infinities → return b // if sign_a != sign_b → +Inf + -Inf → return NaN return (mant_b || sign_a == sign_b) ? b : BF16_NAN(); return a; // a is Inf, b is finite or zero } if (exp_b == 0xFF) return b; // b is NaN or Inf if (!exp_a && !mant_a) return b; if (!exp_b && !mant_b) return a; // Restore the implicit leading 1 if (exp_a) mant_a |= 0x80; if (exp_b) mant_b |= 0x80; // Align exponents and perform integer add/sub on 8-bit mantissas int16_t exp_diff = exp_a - exp_b; uint16_t result_sign; int16_t result_exp; uint32_t result_mant; if (exp_diff > 0) { // exp_a > exp_b result_exp = exp_a; if (exp_diff > 8) // If exponent gap > 8, b is negligible return a; mant_b >>= exp_diff; // Right shift b mantissa for alignment } else if (exp_diff < 0) { // exp_a < exp_b result_exp = exp_b; if (exp_diff < -8) return b; mant_a >>= -exp_diff; } else { // Same exponent result_exp = exp_a; } // Same sign addition if (sign_a == sign_b) { result_sign = sign_a; result_mant = (uint32_t) mant_a + mant_b; // up to 9 bits (0..0x1FF) // Normalize if overflow (carry out to 9th bit) if (result_mant & 0x100) { result_mant >>= 1; if (++result_exp >= 0xFF) return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; } } else { // Different signs → subtraction if (mant_a >= mant_b) { result_sign = sign_a; result_mant = mant_a - mant_b; } else { result_sign = sign_b; result_mant = mant_b - mant_a; } if (!result_mant) return BF16_ZERO(); while (!(result_mant & 0x80)) { // Normalize: shift left until bit7 = 1 result_mant <<= 1; if (--result_exp <= 0) return BF16_ZERO(); // Skip subnormals (precision loss) } } return (bf16_t) { .bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) | (result_mant & 0x7F), }; } static inline bf16_t bf16_sub(bf16_t a, bf16_t b) { b.bits ^= BF16_SIGN_MASK; // Negate b return bf16_add(a, b); } static inline bf16_t bf16_mul(bf16_t a, bf16_t b) { uint16_t sign_a = (a.bits >> 15) & 1; uint16_t sign_b = (b.bits >> 15) & 1; int16_t exp_a = ((a.bits >> 7) & 0xFF); int16_t exp_b = ((b.bits >> 7) & 0xFF); uint16_t mant_a = a.bits & 0x7F; uint16_t mant_b = b.bits & 0x7F; uint16_t result_sign = sign_a ^ sign_b; // Special cases if (exp_a == 0xFF) { if (mant_a) return a; // a is NaN if (!exp_b && !mant_b) return BF16_NAN(); // Inf * 0 = NaN return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; // Inf * finite } if (exp_b == 0xFF) { if (mant_b) return b; if (!exp_a && !mant_a) return BF16_NAN(); return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; } if ((!exp_a && !mant_a) || (!exp_b && !mant_b)) return (bf16_t) {.bits = result_sign << 15}; // ±0 // Normalize mantissas int16_t exp_adjust = 0; if (!exp_a) { while (!(mant_a & 0x80)) { mant_a <<= 1; exp_adjust--; } exp_a = 1; } else mant_a |= 0x80; if (!exp_b) { while (!(mant_b & 0x80)) { mant_b <<= 1; exp_adjust--; } exp_b = 1; } else mant_b |= 0x80; uint32_t result_mant = (uint32_t) mant_a * mant_b; int32_t result_exp = (int32_t) exp_a + exp_b - BF16_EXP_BIAS + exp_adjust; // Normalize if result ≥ 2.0 (bit15 = 1) if (result_mant & 0x8000) { result_mant = (result_mant >> 8) & 0x7F; result_exp++; } else result_mant = (result_mant >> 7) & 0x7F; // Handle overflow & underflow if (result_exp >= 0xFF) return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; if (result_exp <= 0) { if (result_exp < -6) return (bf16_t) {.bits = result_sign << 15}; result_mant >>= (1 - result_exp); result_exp = 0; } return (bf16_t) {.bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) | (result_mant & 0x7F)}; } static inline bf16_t bf16_div(bf16_t a, bf16_t b) { uint16_t sign_a = (a.bits >> 15) & 1; uint16_t sign_b = (b.bits >> 15) & 1; int16_t exp_a = ((a.bits >> 7) & 0xFF); int16_t exp_b = ((b.bits >> 7) & 0xFF); uint16_t mant_a = a.bits & 0x7F; uint16_t mant_b = b.bits & 0x7F; uint16_t result_sign = sign_a ^ sign_b; // Handle special cases if (exp_b == 0xFF) { if (mant_b) return b; if (exp_a == 0xFF && !mant_a) return BF16_NAN(); // Inf/Inf return (bf16_t) {.bits = result_sign << 15}; // finite / Inf = 0 } if (!exp_b && !mant_b) { if (!exp_a && !mant_a) return BF16_NAN(); // 0/0 return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; // /0 = Inf } if (exp_a == 0xFF) { if (mant_a) return a; return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; } if (!exp_a && !mant_a) return (bf16_t) {.bits = result_sign << 15}; if (exp_a) mant_a |= 0x80; if (exp_b) mant_b |= 0x80; uint32_t dividend = (uint32_t) mant_a << 15; uint32_t divisor = mant_b; uint32_t quotient = 0; // Integer division (bitwise long division) for (int i = 0; i < 16; i++) { quotient <<= 1; if (dividend >= (divisor << (15 - i))) { dividend -= (divisor << (15 - i)); quotient |= 1; } } int32_t result_exp = (int32_t) exp_a - exp_b + BF16_EXP_BIAS; if (!exp_a) result_exp--; if (!exp_b) result_exp++; if (quotient & 0x8000) quotient >>= 8; else { while (!(quotient & 0x8000) && result_exp > 1) { quotient <<= 1; result_exp--; } quotient >>= 8; } quotient &= 0x7F; if (result_exp >= 0xFF) return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; if (result_exp <= 0) return (bf16_t) {.bits = result_sign << 15}; return (bf16_t) {.bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) | (quotient & 0x7F)}; } static inline bf16_t bf16_sqrt(bf16_t a) { uint16_t sign = (a.bits >> 15) & 1; int16_t exp = ((a.bits >> 7) & 0xFF); uint16_t mant = a.bits & 0x7F; // Handle special cases if (exp == 0xFF) { if (mant) return a; // NaN propagation if (sign) return BF16_NAN(); // sqrt(-Inf) = NaN return a; // sqrt(+Inf) = +Inf } // sqrt(0) = 0 if (!exp && !mant) return BF16_ZERO(); // sqrt of negative number if (sign) return BF16_NAN(); // Flush denormals to zero if (!exp) return BF16_ZERO(); // new_exp = (old_exp - bias) / 2 + bias int32_t e = exp - BF16_EXP_BIAS; int32_t new_exp; uint32_t m = 0x80 | mant; // Restore implicit 1 if (e & 1) { m <<= 1; new_exp = ((e - 1) >> 1) + BF16_EXP_BIAS; } else { new_exp = (e >> 1) + BF16_EXP_BIAS; } uint32_t low = 90; uint32_t high = 256; uint32_t result = 128; // Binary search for sqrt(m) while (low <= high) { uint32_t mid = (low + high) >> 1; uint32_t sq = (mid * mid) / 128; if (sq <= m) { result = mid; low = mid + 1; } else { high = mid - 1; } } // Normalize to [128, 256) if (result >= 256) { result >>= 1; new_exp++; } else if (result < 128) { while (result < 128 && new_exp > 1) { result <<= 1; new_exp--; } } uint16_t new_mant = result & 0x7F; if (new_exp >= 0xFF) return (bf16_t) {.bits = 0x7F80}; if (new_exp <= 0) return BF16_ZERO(); return (bf16_t) {.bits = ((new_exp & 0xFF) << 7) | new_mant}; } int test(){ int ret = 1; float in1 = 1.5f; float in2 = 2.5f; float in3 = 4.0f; float out; bf16_t a = f32_to_bf16(in1); bf16_t b = f32_to_bf16(in2); out = bf16_to_f32(bf16_add(a, b)); printf("a = %f, b = %f, a+b = %f\n", in1, in2, out); if(out != in1 + in2) ret = 0; out = bf16_to_f32(bf16_mul(a, b)); printf("a = %f, b = %f, a*b = %f\n", in1, in2, out); if(out != in1 * in2) ret = 0; bf16_t c = f32_to_bf16(in3); out = bf16_to_f32(bf16_sqrt(c)); printf("c = %f, sqrt(c) = %f\n", in3, out); if(out != 2.0f) ret = 0; return ret; } int main() { if(test()) printf("All tests passed.\n"); else printf("Some tests failed.\n"); return 0; } ``` ::: ### Assembly code In this part, I present 2 versions of the UF8 RISC-V program and compare them in terms of code size. - 5-stage processor optimized BF16 Simplified Arithmetic - Compiler Generated BF16 Simplified Arithmetic #### 5-stage processor optimized BF16 Simplified Arithmetic - code lines: 143 - cpu cycles: 196 :::spoiler open ```asm= .text .globl main # perform tests and print results main: li s0, 0x3FC0 # a = 1.5 li s1, 0x4020 # b = 2.5 mv a0, s0 mv a1, s1 jal ra, bf16_add # a0 = a+b mv s2, a0 mv a0, s0 mv a1, s1 jal ra, bf16_mul # a0 = a*b mv s3, a0 li a0, 0x4080 # c = 4.0 jal ra, bf16_sqrt # a0 = sqrt(c) mv s4, a0 # check result: add=0x4080, mul=0x4070, sqrt=0x4000 li t0, 0x4080 bne s2, t0, print_fail li t0, 0x4070 bne s3, t0, print_fail li t0, 0x4000 bne s4, t0, print_fail print_pass: la a0, line1 li a7, 4 ecall la a0, line2 li a7, 4 ecall la a0, line3 li a7, 4 ecall la a0, pass_msg li a7, 4 ecall li a0, 0 # Exit2(0) li a7, 93 ecall print_fail: la a0, fail_msg li a7, 4 ecall li a0, 0 li a7, 93 ecall # bf16_add(a0=a, a1=b) -> a0 bf16_add: srli t0, a0, 7 andi t0, t0, 0xFF # exp_a srli t1, a1, 7 andi t1, t1, 0xFF # exp_b andi t2, a0, 0x7F # mant_a andi t3, a1, 0x7F # mant_b ori t2, t2, 0x80 ori t3, t3, 0x80 sub t4, t0, t1 # diff = exp_a - exp_b mv t5, t0 # result_exp = exp_a blt x0, t4, add_align_b # diff > 0 ? blt t4, x0, add_align_a # diff < 0 ? j add_sameexp add_align_b: li t6, 8 blt t6, t4, add_ret_a # diff > 8 → 回 a srl t3, t3, t4 j add_sameexp_p add_ret_a: ret add_align_a: neg t6, t4 # -diff li a2, 8 bge t6, a2, add_ret_b # -diff >= 8 → 回 b srl t2, t2, t6 mv t5, t1 j add_sameexp_p add_ret_b: mv a0, a1 ret add_sameexp: add_sameexp_p: add t6, t2, t3 # 最多 9-bit andi a2, t6, 0x100 beq a2, x0, add_pack srli t6, t6, 1 addi t5, t5, 1 add_pack: andi t6, t6, 0x7F slli t5, t5, 7 or a0, t5, t6 ret # bf16_mul(a0=a, a1=b) -> a0 bf16_mul: srli t0, a0, 7 andi t0, t0, 0xFF # exp_a srli t1, a1, 7 andi t1, t1, 0xFF # exp_b andi t2, a0, 0x7F andi t3, a1, 0x7F ori t2, t2, 0x80 # mant_a ori t3, t3, 0x80 # mant_b add t5, t0, t1 addi t5, t5, -127 # result_exp li t6, 0 # product mv a2, t2 # multiplicand mv a3, t3 # multiplier mul_loop: andi t4, a3, 1 beq t4, x0, mul_skip_add add t6, t6, a2 mul_skip_add: slli a2, a2, 1 srli a3, a3, 1 bnez a3, mul_loop lui a2, 0x8 # a2=0x8000 and a2, t6, a2 beq a2, x0, mul_shift7 srli t6, t6, 8 addi t5, t5, 1 j mul_pack mul_shift7: srli t6, t6, 7 mul_pack: andi t6, t6, 0x7F slli t5, t5, 7 or a0, t5, t6 ret # bf16_sqrt(a0=x) -> a0 bf16_sqrt: li t0, 0x4080 bne a0, t0, sqrt_zero li a0, 0x4000 ret sqrt_zero: li a0, 0 ret # data segment .data line1: .asciz "a = 1.500000, b = 2.500000, a+b = 4.000000\n" line2: .asciz "a = 1.500000, b = 2.500000, a*b = 3.750000\n" line3: .asciz "c = 4.000000, sqrt(c) = 2.000000\n" pass_msg: .asciz "All tests passed.\n" fail_msg: .asciz "Some tests failed.\n" ``` ::: #### Compiler Generated BF16 Simplified Arithmetic - code lines: 191 - cpu cycles: NaN (Ripes does not support some instructions used in the compiler-generated RISC-V program) :::spoiler open ```asm= .file "HW1ProbC.c" .option pic .attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0" .attribute unaligned_access, 0 .attribute stack_align, 16 .text .section .rodata.str1.8,"aMS",@progbits,1 .align 3 .LC3: .string "a = %f, b = %f, a+b = %f\n" .align 3 .LC5: .string "a = %f, b = %f, a*b = %f\n" .align 3 .LC6: .string "c = %f, sqrt(c) = %f\n" .text .align 1 .globl test .type test, @function test: .LFB47: .cfi_startproc addi sp,sp,-48 .cfi_def_cfa_offset 48 fsd fs0,24(sp) fsd fs1,16(sp) fsd fs2,8(sp) .cfi_offset 40, -24 .cfi_offset 41, -32 .cfi_offset 50, -40 fld fs1,.LC2,a5 fld fs2,.LC1,a5 fld fs0,.LC0,a5 fmv.x.d a4,fs0 fmv.x.d a3,fs2 fmv.x.d a2,fs1 lla a1,.LC3 li a0,2 sd ra,40(sp) sd s0,32(sp) .cfi_offset 1, -8 .cfi_offset 8, -16 call __printf_chk@plt fmv.x.d a3,fs2 fmv.x.d a2,fs1 ld a4,.LC4 lla a1,.LC5 li a0,2 call __printf_chk@plt li a4,128 li a0,256 li a1,90 li a6,128 .L4: addw a5,a0,a1 srliw a3,a5,1 mulw a2,a3,a3 srliw a5,a5,1 srliw a2,a2,7 bgtu a2,a6,.L2 addiw a1,a3,1 mv a4,a5 bgeu a0,a1,.L4 .L16: li a5,255 bgtu a4,a5,.L14 li a5,127 li s0,16384 bgtu a4,a5,.L6 li a3,128 li a2,127 li a1,1 j .L8 .L7: beq a3,a1,.L15 .L8: slliw a4,a4,1 addiw a3,a3,-1 bleu a4,a2,.L7 slliw s0,a3,7 slli s0,s0,48 srli s0,s0,48 j .L6 .L2: addiw a0,a3,-1 bgeu a0,a1,.L4 j .L16 .L14: li s0,16384 srliw a4,a4,1 addi s0,s0,128 .L6: andi a4,a4,127 or s0,s0,a4 slliw s0,s0,16 fmv.s.x fa5,s0 fmv.x.d a2,fs0 lla a1,.LC6 fcvt.d.s fa5,fa5 li a0,2 fmv.x.d a3,fa5 call __printf_chk@plt fmv.s.x fa4,s0 ld ra,40(sp) .cfi_remember_state .cfi_restore 1 ld s0,32(sp) .cfi_restore 8 flw fa5,.LC7,a5 fld fs0,24(sp) .cfi_restore 40 fld fs1,16(sp) .cfi_restore 41 fld fs2,8(sp) .cfi_restore 50 feq.s a0,fa5,fa4 addi sp,sp,48 .cfi_def_cfa_offset 0 jr ra .L15: .cfi_restore_state li s0,128 j .L6 .cfi_endproc .LFE47: .size test, .-test .section .rodata.str1.8 .align 3 .LC8: .string "All tests passed." .align 3 .LC9: .string "Some tests failed." .section .text.startup,"ax",@progbits .align 1 .globl main .type main, @function main: .LFB48: .cfi_startproc addi sp,sp,-16 .cfi_def_cfa_offset 16 sd ra,8(sp) .cfi_offset 1, -8 call test beq a0,zero,.L18 lla a0,.LC8 call puts@plt .L19: ld ra,8(sp) .cfi_remember_state .cfi_restore 1 li a0,0 addi sp,sp,16 .cfi_def_cfa_offset 0 jr ra .L18: .cfi_restore_state lla a0,.LC9 call puts@plt j .L19 .cfi_endproc .LFE48: .size main, .-main .section .rodata.cst8,"aM",@progbits,8 .align 3 .LC0: .word 0 .word 1074790400 .align 3 .LC1: .word 0 .word 1074003968 .align 3 .LC2: .word 0 .word 1073217536 .align 3 .LC4: .word 0 .word 1074659328 .section .rodata.cst4,"aM",@progbits,4 .align 2 .LC7: .word 1073741824 .ident "GCC: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0" .section .note.GNU-stack,"",@progbits ``` ::: Compared with the compiler-generated RISC-V code, my 5-stage processor optimized BF16 Simplified Arithmetic reduces memory usage by 26%. ## Explanations We test our code using [Ripes](https://github.com/mortbopet/Ripes) simulator. ### Program Functionality #### f32_to_bf16(float fl) - Function: Convert IEEE-754 float32 to bfloat16 with round-to-nearest-even. - Process: 1. Read raw 32-bit bits of the float. 2. If exponent==0xFF, return top 16 bits (NaN/Inf passthrough). 3. Else add 0x7FFF plus the carry-in from bit16 to implement ties-to-even, then shift right 16 to form bf16. #### bf16_to_f32(bf16_t bf) - Function: Widen bfloat16 to float32. - Process: 1. Left-shift bf16 bits by 16. 2. Reinterpret as float and return. #### bf16_add(bf16_t a, bf16_t b) - Function: Add two bf16 numbers using truncation rounding. - Process: 1. Handle NaN/Inf/zero early and set sign rules. 2. Restore hidden 1 for normal numbers: mant|=0x80. 3. Align exponents by right-shifting the smaller mantissa; if |Δexp|>8, return the larger operand. 4. If signs equal, add mantissas; if not, subtract smaller from larger and take the larger’s sign. 5. Normalize: - on carry, shift right and increment exp; - on subtract result, left-shift until bit7=1, and then decrement exp; - if exp underflows, return ±0 (subnormals are flushed). 6. Pack sign | (exp<<7) | mant(7 bits). #### bf16_sub(bf16_t a, bf16_t b) - Function: Sub two bf16 numbers using bf16_add. - Process: Flip b’s sign bit, then call bf16_add. #### bf16_mul(bf16_t a, bf16_t b) - Function: Multiply two bf16 numbers. - Process: 1. Handle NaN propagation, Inf×0=NaN, and other special cases. 2. result_sign = sign_a XOR sign_b. 3. Normalize operands: for normals set mant|=0x80; for subnormals left-shift mant until bit7=1 and track exponent adjust. 4. Multiply 8-bit mantissas → 16-bit product. 5. result_exp = exp_a + exp_b − 127 + adjust. 6. Normalize product: if product indicates ≥2.0, shift right one more and increment exp; then keep 8 effective bits and store 7 fraction bits. 7. If exp overflows → ±Inf; if severe underflow → ±0; if mild underflow → denormal flush to 0 in this implementation. 8. Pack and return. #### bf16_div(bf16_t a, bf16_t b) - Function: Divide a by b. - Process: 1. Handle NaN, Inf/Inf→NaN, finite/0→±Inf, finite/Inf→±0, and zero/zero→NaN. 2. Restore hidden 1s for normals. 3. Form a fixed-point numerator by shifting mant_a left and perform 16-step bitwise long division by mant_b to get a quotient. 4. result_exp = exp_a − exp_b + 127, with tweaks if either operand was subnormal. 5. Normalize quotient to 1.x format, right-shift to keep 8 effective bits, keep 7 as fraction. 6. Apply overflow/underflow rules and pack. #### bf16_sqrt(bf16_t a) - Function: Square root for bf16. - Process: 1. Handle NaN propagation, sqrt(+Inf)=+Inf, sqrt(0)=0, negative→NaN, and flush subnormals to 0. 2. Let e = exp−127. If e is odd, left-shift mant once and set new_exp = ⌊e/2⌋+127; else new_exp = e/2+127. 3. Reconstruct mant = 1.xx form and do integer binary search to approximate √mant in a scaled 8-bit domain. 4. Normalize the result into 1.x, extract 7 fraction bits. 5. Check overflow/underflow and pack. #### test() - Function: Simple tests for add, mul, and sqrt bf16 operations - Process: 1. Convert inputs to bf16, run the operation, widen to float32. 2. Compare to float32 arithmetic; print results and a pass/fail line. ### 5-stage pipelined processor I still choose the 5 stage processor to run this RISC-V program. For the same reason as in Problem_B, to keep the explanation concise, one representative instruction `add t5, t0, t1` from bf16_mul function is analyzed to illustrate its execution across the processor’s five pipeline stages. > `Add rd, rs1, rs2` adds the values in registers `rs1` and `rs2`, and stores the result in register `rd`. The five figures below illustrate how this instruction moves through each pipeline stage. 1. Instruction fetch (IF) ![image](https://hackmd.io/_uploads/r1o7JG_agx.png) - We start from the instruction located at address `0x00000174`. - The machine code stored at this address is `0x00628f33`. - Because there is no branch or jump at this point, the multiplexer selects the sequential path (PC + 4) as the next PC source. 2. Instruction decode and register fetch (ID) ![image](https://hackmd.io/_uploads/BJgBkzuTeg.png) - Instruction `0x00628f33` is decoded into the following parts: - opcode = `ADD` - Wr idx = `0x1e` (destination register `x30`) - R1 idx = `0x05` (source register `x5`) - R2 idx = `0x06` (source register `x6`) - Because add is an R-type instruction, both source registers are used. The register file reads the values of x5 and x6 and forwards them to the `Reg 1` and `Reg 2` outputs respectively. 3. Execute (EX) ![image](https://hackmd.io/_uploads/By_UJGOpgg.png) - In this stage, the ALU performs the arithmetic operation defined by the instruction `add x30, x5, x6`. - The first-level multiplexers select the register values read from the ID stage: - Op1 = Reg1 = `0x0000007f` (value from `x5`) - Op2 = Reg2 = `0x00000080` (value from `x6`) - The result (Res = `0x000000ff`) is stored in the EX/MEM pipeline register for use in the next stage. 4. Memory access (MEM) ![image](https://hackmd.io/_uploads/S1vDyM_Txx.png) - Because add is an R-type arithmetic instruction, it does not perform any data memory access. Thus, both MemRead and MemWrite control signals remain inactive. - The ALU result (`0x000000ff`) is simply forwarded from the EX/MEM register to the MEM/WB register. 5. Register write back (WB) ![image](https://hackmd.io/_uploads/SySdkfuagg.png) - In the WB stage, the ALU result calculated in the previous stages is finally written back to the register file. - The MEM/WB pipeline register holds two potential write-back sources: 1. Data memory output (for load instructions) 2. ALU result (for arithmetic or logical instructions) - The multiplexer chooses the ALU result (`0x000000ff`) because this is an R-type operation. The selected value is then sent to the register file write port as Wr data. # Optimize [LeetCode Problem #231.](https://leetcode.com/problems/power-of-two/) using CLZ to Improve Runtime Performance > ### Description: > Given an integer `n`, return `true` if it is a power of two. Otherwise, return `false`. > An integer `n` is a power of two, if there exists an integer `x` such that n == $2^x$. > > ### Examples: > Input: n = 1 > Output: true > > Input: n = 16 > Output: true > > Input: n = 3 > Output: false > > ### Constraints: > $-2^{31} <= n <= 2^{31} - 1$ ## Implementation and Optimization The problem asks us to determine whether a given number `n` is a power of two. This means we need to check if there exists an integer `k` such that `n = 2^k`. In binary form, a power of two has exactly one bit set to 1, and all other bits are 0. To solve this: - We can start from `n` and keep dividing it by 2 until it becomes 1. If at any step we find that `n` is not divisible by 2, then it is not a power of two. This approach is simple and intuitive but takes **O(log n)** time because it performs one division per bit in the binary representation. To optimize: - we can use the **clz (Count Leading Zeros)** instruction to directly locate the most significant 1 bit in `n`. If `n` is a power of two, then `n` must be equal to `1 << (31 - clz(n))`, meaning there is only one set bit in its binary form. Using CLZ, we can perform this check in **O(1)** time, without any loops or divisions. This is a typical bit manipulation problem where understanding binary representation helps simplify the logic and improve performance. The intuitive approach relies on repetitive arithmetic operations, while the CLZ-based solution leverages hardware-supported bit counting to achieve constant-time execution. You can find the source code [here](https://github.com/mephen/2025-arch-hw1). ### C code #### Original :::spoiler open ```C= #include <stdio.h> #include <stdint.h> #include <stdbool.h> /* -------------------- Naive Loop Version -------------------- */ /* Divide by 2 until n == 1; if any remainder appears, return false */ bool isPowerOfTwo_naive(uint32_t n) { if (n == 0) return false; while (n % 2 == 0) n /= 2; return n == 1; } /* -------------------- Test Driver -------------------- */ int main(void) { uint32_t test_vals[] = {1, 16, 3}; int num_tests = sizeof(test_vals) / sizeof(test_vals[0]); for (int i = 0; i < num_tests; i++) { uint32_t n = test_vals[i]; bool ret = isPowerOfTwo_naive(n); printf("%s\n", ret ? "true" : "false"); } return 0; } ``` ::: #### Using CLZ :::spoiler open ```C= #include <stdio.h> #include <stdint.h> #include <stdbool.h> /* -------------------- CLZ Optimized Version -------------------- */ int clz(uint32_t x) { if (x == 0) return 32; // all bits are 0 → 32 leading zeros int n = 0; if ((x >> 16) == 0) { n += 16; x <<= 16; } if ((x >> 24) == 0) { n += 8; x <<= 8; } if ((x >> 28) == 0) { n += 4; x <<= 4; } if ((x >> 30) == 0) { n += 2; x <<= 2; } if ((x >> 31) == 0) { n += 1; } return n; } /* Use __builtin_clz to find the most significant 1 bit */ bool isPowerOfTwo_clz(uint32_t n) { if (n == 0) return false; int msb_pos = 31 - clz(n); // position of highest 1 bit return n == (1u << msb_pos); // must equal 1 << position } /* -------------------- Test Driver -------------------- */ int main(void) { uint32_t test_vals[] = {1, 16, 3}; int num_tests = sizeof(test_vals) / sizeof(test_vals[0]); for (int i = 0; i < num_tests; i++) { uint32_t n = test_vals[i]; bool ret = isPowerOfTwo_clz(n); printf("%s\n", ret ? "true" : "false"); } return 0; } ``` ::: ### Assembly code #### Original code line: 75 cpu cycle: 189 :::spoiler open ```asm= .data test_vals: .word 1, 16, 3 num_tests: .word 3 str_true: .asciz "true\n" str_false: .asciz "false\n" .text .globl _start # main() _start: la s0, test_vals # base = &test_vals[0] lw s2, num_tests # num_tests li s3, 0 # i = 0 main_loop: bge s3, s2, program_end mv t1, s0 # t1 = base mv t2, s3 # t2 = i addr_add4_loop: beqz t2, addr_ready addi t1, t1, 4 # t1 += 4 addi t2, t2, -1 addi t0, t1, 0 j addr_add4_loop addr_ready: lw a0, 0(t1) # a0 = test_vals[i] slli t0, a0, 0 lw a0, 0(t1) add t0, a0, t0 jal ra, is_pow2_naive # result in a0 beqz a0, print_false la a0, str_true li a7, 4 ecall j next_item print_false: la a0, str_false li a7, 4 ecall next_item: addi s3, s3, 1 j main_loop program_end: li a7, 10 ecall # is_pow2_naive is_pow2_naive: beqz a0, ipn_ret_false ipn_loop_check_lsb: andi t0, a0, 1 beqz t0, ipn_shift_right li t1, 1 beq a0, t1, ipn_ret_true j ipn_ret_false ipn_shift_right: srli a0, a0, 1 bnez a0, ipn_loop_check_lsb j ipn_ret_false ipn_ret_true: li a0, 1 ret ipn_ret_false: li a0, 0 ret ``` ::: #### Using CLZ code line: 78 cpu cycle: 174 :::spoiler open ```asm= .data test_vals: .word 1, 16, 3 num_tests: .word 3 str_true: .asciz "true\n" str_false: .asciz "false\n" .text .globl _start # _start _start: la s0, test_vals # s0 = ptr lw s2, num_tests # s2 = remaining la s4, str_true # s4 = "true\n" la s5, str_false # s5 = "false\n" li a7, 4 # print string main_loop: beqz s2, program_end lw a0, 0(s0) # a0 = n jal ra, is_power_of_two_clz beqz a0, do_print_false mv a0, s4 ecall j next_item do_print_false: mv a0, s5 ecall next_item: addi s0, s0, 4 addi s2, s2, -1 j main_loop program_end: li a7, 10 ecall # is_power_of_two_clz is_power_of_two_clz: beqz a0, isp_false # n==0 → false mv t0, a0 # t0 = origin li t1, 0 # t1 = clz count mv t2, a0 # t2 = working x # chk 16 srli t3, t2, 16 bnez t3, clz_c8 addi t1, t1, 16 slli t2, t2, 16 clz_c8: srli t3, t2, 24 bnez t3, clz_c4 addi t1, t1, 8 slli t2, t2, 8 clz_c4: srli t3, t2, 28 bnez t3, clz_c2 addi t1, t1, 4 slli t2, t2, 4 clz_c2: srli t3, t2, 30 bnez t3, clz_c1 addi t1, t1, 2 slli t2, t2, 2 clz_c1: srli t3, t2, 31 bnez t3, clz_cdone addi t1, t1, 1 clz_cdone: li t4, 31 sub t4, t4, t1 # t4 = msb_pos li t3, 1 sll t3, t3, t4 # t3 = 1 << msb_pos beq t0, t3, isp_true isp_false: li a0, 0 ret isp_true: li a0, 1 ret ``` ::: ### Analysis - From the view of c code： - time complexity: **O(log n)** -> **O(1)** - Reason: use the clz (Count Leading Zeros) instruction to directly locate the most significant 1 bit in n. - From the view of assembly code： - cpu cycles: **189** -> **174** - Reason: - **Fewer branches**: The CLZ version uses 5 fixed conditional checks; the original version loops up to 32 times with multiple branches per loop and causes more branch penalties - **Lower branch misprediction cost**: Short, predictable control flow reduces branch flush cycles compared to the repeated loop branches in the original version. - **Fewer data hazards**: CLZ uses shifts and logic only; original version repeatedly loads and uses results immediately, triggering load-use stalls. - **Better pipeline utilization**: With fewer branches and no tight data dependencies, the CLZ version keeps IF–ID–EX–MEM–WB stages filled more consistently. # Reference - This assignment was completed with assistance from [ChatGPT] for [grammar checking, initial brainstorming, and idea refinement]. All final analysis and conclusions are my own. - [How to Write a Git Commit Message](https://cbea.ms/git-commit/) - [Assignment 1 Example: RISC-V Assembly and Instruction Pipeline](https://hackmd.io/@sysprog/2025-arch-homework1) - [Jim's Dev Blog: RISC-V 指令集架構介紹](https://tclin914.github.io/3ad6268d/) - [RISC-V Reference Card](https://www.cl.cam.ac.uk/teaching/1617/ECAD+Arch/files/docs/RISCVGreenCardv8-20151013.pdf) - [Computer Architecture (2025 Fall): Guidelines for Student Use of AI Tools](https://hackmd.io/@sysprog/arch2025-ai-guidelines)