# Variable-Length Quantity
## Problem B
``` asm
# Problem B driver: encodes four sample words with uf8_encode, then stores the
# result of one bf16_add call.
# NOTE(review): uf8_encode and bf16_add in this listing are placeholders that
# return fixed constants (0x54 and 0x4060); they are not full implementations.
.data
.align 2
InputDataSet:
    .word 0x0000000a
    .word 0x00000040
    .word 0x00000400
    .word 0x00000003
.align 2
EncodedResults:
    .space 16
.align 2
BF16Result:
    .space 4

.text
.global main

# main: for i in 0..3, EncodedResults[i] = uf8_encode(InputDataSet[i]);
#       then BF16Result = bf16_add(0x4120, 0x4040).
# s0 holds the loop index and is preserved across calls.
main:
    addi sp, sp, -8
    sw ra, 4(sp)
    sw s0, 0(sp)
    li s0, 0
main_loop_start:
    # The bound is reloaded on every pass: t0 is a caller-saved temporary and
    # both uf8_encode and clz overwrite it, so it cannot live across the call.
    li t0, 4
    bge s0, t0, main_loop_end
    slli t1, s0, 2                  # byte offset = index * 4
    la t2, InputDataSet
    add t1, t2, t1
    lw a0, 0(t1)
    jal ra, uf8_encode
    slli t1, s0, 2                  # recompute: t1/t2 are clobbered by the call
    la t2, EncodedResults
    add t1, t2, t1
    sw a0, 0(t1)
    addi s0, s0, 1
    j main_loop_start
main_loop_end:
    li a0, 16672 # 0x4120
    li a1, 16448 # 0x4040
    jal ra, bf16_add
    la t0, BF16Result
    sw a0, 0(t0)
    lw s0, 0(sp)
    lw ra, 4(sp)
    addi sp, sp, 8
    ret

# clz: count leading zeros of a0 by binary search over shift widths 16,8,4,2,1.
# In:  a0 = value      Out: a0 = number of leading zero bits
# Clobbers t0, t1, t2.
clz:
    li t0, 32                       # running count
    li t1, 16                       # current shift width
clz_loop:
    beq t1, zero, clz_end_loop
    srl t2, a0, t1
    beq t2, zero, clz_skip_if       # upper part empty: keep value, keep count
    sub t0, t0, t1                  # upper part non-empty: discard lower bits
    mv a0, t2
clz_skip_if:
    srli t1, t1, 1
    j clz_loop
clz_end_loop:
    sub a0, t0, a0
    ret

# uf8_encode (placeholder): returns a0 unchanged when a0 < 16 (signed compare),
# otherwise calls clz and returns the constant 0x54.
# In: a0 = value      Out: a0 = encoded value
# Clobbers t0, t1, t2.
uf8_encode:
    addi sp, sp, -4
    sw ra, 0(sp)
    li t0, 16
    blt a0, t0, uf8_return_input
    jal ra, clz
    li t1, 31
    sub t0, t1, a0                  # 31 - clz(value); computed but not used
    li a0, 84 # 0x54
    j uf8_epilogue
uf8_return_input:
uf8_epilogue:
    lw ra, 0(sp)
    addi sp, sp, 4
    ret

# bf16_to_f32: place the 16-bit pattern in the upper half of a 32-bit word.
bf16_to_f32:
    slli a0, a0, 16
    ret

# bf16_add (placeholder): ignores its arguments and returns 0x4060.
bf16_add:
    li a0, 16480 # 0x4060
    ret
```
## Problem C
```
# bfloat16 helpers. A value is carried in the low 16 bits of a register:
# bit 15 = sign, bits 14..7 = exponent, bits 6..0 = mantissa.
# The constants below are defined here but the routines use immediates instead.
.data
BF16_SIGN_MASK: .word 0x8000
BF16_EXP_MASK: .word 0x7F80
BF16_MANT_MASK: .word 0x007F
BF16_EXP_BIAS: .word 127
BF16_NAN_VAL: .half 0x7FC0
BF16_ZERO_VAL: .half 0x0000

.text

# bf16_isnan: a0 = 1 if exponent bits are all ones and mantissa != 0, else 0.
.globl bf16_isnan
bf16_isnan:
    li t1, 0x7F80                   # mask does not fit a 12-bit immediate
    and t0, a0, t1
    bne t0, t1, bf16_isnan_false
    andi t0, a0, 0x007F
    beqz t0, bf16_isnan_false
    li a0, 1
    jalr zero, ra, 0
bf16_isnan_false:
    li a0, 0
    jalr zero, ra, 0

# bf16_isinf: a0 = 1 if exponent bits are all ones and mantissa == 0, else 0.
.globl bf16_isinf
bf16_isinf:
    li t1, 0x7F80                   # mask does not fit a 12-bit immediate
    and t0, a0, t1
    bne t0, t1, bf16_isinf_false
    andi t0, a0, 0x007F
    bnez t0, bf16_isinf_false
    li a0, 1
    jalr zero, ra, 0
bf16_isinf_false:
    li a0, 0
    jalr zero, ra, 0

# bf16_iszero: a0 = 1 if bits 14..0 are all zero (either sign), else 0.
.globl bf16_iszero
bf16_iszero:
    li t0, 0x7FFF                   # mask does not fit a 12-bit immediate
    and t0, a0, t0
    bnez t0, bf16_iszero_false
    li a0, 1
    jalr zero, ra, 0
bf16_iszero_false:
    li a0, 0
    jalr zero, ra, 0

# f32_to_bf16: convert the 32-bit pattern in a0 to a 16-bit pattern.
# Exponent 0xFF (Inf/NaN): plain truncation of the upper 16 bits.
# Otherwise: add (bit 16) + 0x7FFF before truncating.
.globl f32_to_bf16
f32_to_bf16:
    srli t0, a0, 23
    andi t0, t0, 0xFF
    li t1, 0xFF
    bne t0, t1, f32_to_bf16_normal
    srli a0, a0, 16                 # on RV32 the logical shift already leaves only 16 bits
    jalr zero, ra, 0
f32_to_bf16_normal:
    srli t0, a0, 16
    andi t0, t0, 1                  # lowest bit that survives truncation
    add a0, a0, t0
    lui t1, 0x8
    addi t1, t1, -1                 # t1 = 0x7FFF
    add a0, a0, t1
    srli a0, a0, 16
    jalr zero, ra, 0

# bf16_to_f32: place the 16-bit pattern in the upper half of a 32-bit word.
.globl bf16_to_f32
bf16_to_f32:
    slli a0, a0, 16
    jalr zero, ra, 0

# bf16_add: a0 = a0 + a1 on bfloat16 bit patterns.
# Register use: s0/s1 = operands, s2/s3 = signs, s4/s5 = exponents,
# s6/s7 = mantissas, s8 = exponent difference, s9 = result exponent,
# s10 = result sign, s11 = result mantissa.
.globl bf16_add
bf16_add:
    addi sp, sp, -64
    sw ra, 60(sp)
    sw s0, 56(sp)
    sw s1, 52(sp)
    sw s2, 48(sp)
    sw s3, 44(sp)
    sw s4, 40(sp)
    sw s5, 36(sp)
    sw s6, 32(sp)
    sw s7, 28(sp)
    sw s8, 24(sp)
    sw s9, 20(sp)
    sw s10, 16(sp)
    sw s11, 12(sp)
    mv s0, a0
    mv s1, a1
    srli s2, s0, 15
    andi s2, s2, 1
    srli s3, s1, 15
    andi s3, s3, 1
    srli s4, s0, 7
    andi s4, s4, 0xFF
    srli s5, s1, 7
    andi s5, s5, 0xFF
    andi s6, s0, 0x7F
    andi s7, s1, 0x7F
    li t0, 0xFF
    bne s4, t0, bf16_add_check_b_special
    bnez s6, bf16_add_return_a      # a has exponent 0xFF and mantissa != 0: return a
    bne s5, t0, bf16_add_return_a   # a is Inf, b is not Inf/NaN: return a
    bnez s7, bf16_add_return_b      # b has mantissa != 0 (NaN): return b regardless of sign
    beq s2, s3, bf16_add_return_b   # Inf + Inf with equal signs: return b
    li a0, 0x7FC0                   # Inf + Inf with opposite signs: NaN
    j bf16_add_exit
bf16_add_return_b:
    mv a0, s1
    j bf16_add_exit
bf16_add_return_a:
    mv a0, s0
    j bf16_add_exit
bf16_add_check_b_special:
    li t0, 0xFF
    bne s5, t0, bf16_add_check_a_zero
    mv a0, s1                       # b has exponent 0xFF: return b
    j bf16_add_exit
bf16_add_check_a_zero:
    or t0, s4, s6
    bnez t0, bf16_add_check_b_zero
    mv a0, s1                       # a is zero: return b
    j bf16_add_exit
bf16_add_check_b_zero:
    or t0, s5, s7
    bnez t0, bf16_add_normalize_a
    mv a0, s0                       # b is zero: return a
    j bf16_add_exit
bf16_add_normalize_a:
    bnez s4, bf16_add_set_implicit_a
    j bf16_add_normalize_b
bf16_add_set_implicit_a:
    ori s6, s6, 0x80                # non-zero exponent: add the implicit leading 1
bf16_add_normalize_b:
    bnez s5, bf16_add_set_implicit_b
    j bf16_add_align_exp
bf16_add_set_implicit_b:
    ori s7, s7, 0x80
bf16_add_align_exp:
    sub s8, s4, s5
    bgez s8, bf16_add_exp_diff_positive
    mv s9, s5                       # b has the larger exponent
    neg t0, s8
    li t1, 8
    bgt t0, t1, bf16_add_return_b_aligned
    srl s6, s6, t0
    j bf16_add_perform_addition
bf16_add_return_b_aligned:
    mv a0, s1                       # difference > 8: a is shifted out entirely
    j bf16_add_exit
bf16_add_exp_diff_positive:
    beqz s8, bf16_add_same_exp
    mv s9, s4                       # a has the larger exponent
    li t1, 8
    bgt s8, t1, bf16_add_return_a_aligned
    srl s7, s7, s8
    j bf16_add_perform_addition
bf16_add_return_a_aligned:
    mv a0, s0                       # difference > 8: b is shifted out entirely
    j bf16_add_exit
bf16_add_same_exp:
    mv s9, s4
bf16_add_perform_addition:
    bne s2, s3, bf16_add_subtract
    mv s10, s2
    add s11, s6, s7
    andi t0, s11, 0x100             # carry out of the 8-bit mantissa?
    beqz t0, bf16_add_pack_result
    srli s11, s11, 1
    addi s9, s9, 1
    li t0, 0xFF
    blt s9, t0, bf16_add_pack_result
    slli a0, s10, 15                # exponent reached 0xFF: return signed Inf
    li t0, 0x7F80                   # does not fit a 12-bit immediate
    or a0, a0, t0
    j bf16_add_exit
bf16_add_subtract:
    bltu s6, s7, bf16_add_b_larger
    mv s10, s2
    sub s11, s6, s7
    j bf16_add_normalize_result
bf16_add_b_larger:
    mv s10, s3
    sub s11, s7, s6
bf16_add_normalize_result:
    bnez s11, bf16_add_normalize_loop
    li a0, 0                        # exact cancellation: +0
    j bf16_add_exit
bf16_add_normalize_loop:
    andi t0, s11, 0x80
    bnez t0, bf16_add_pack_result
    slli s11, s11, 1
    addi s9, s9, -1
    blez s9, bf16_add_result_zero
    j bf16_add_normalize_loop
bf16_add_result_zero:
    li a0, 0
    j bf16_add_exit
bf16_add_pack_result:
    slli t0, s10, 15
    andi t1, s9, 0xFF
    slli t1, t1, 7
    or t0, t0, t1
    andi t1, s11, 0x7F
    or a0, t0, t1
bf16_add_exit:
    lw ra, 60(sp)
    lw s0, 56(sp)
    lw s1, 52(sp)
    lw s2, 48(sp)
    lw s3, 44(sp)
    lw s4, 40(sp)
    lw s5, 36(sp)
    lw s6, 32(sp)
    lw s7, 28(sp)
    lw s8, 24(sp)
    lw s9, 20(sp)
    lw s10, 16(sp)
    lw s11, 12(sp)
    addi sp, sp, 64
    jalr zero, ra, 0

# bf16_sub: a0 = a0 - a1, implemented as bf16_add(a0, a1 with bit 15 flipped).
.globl bf16_sub
bf16_sub:
    lui t0, 0x8                     # t0 = 0x8000 (sign bit)
    xor a1, a1, t0
    jal zero, bf16_add              # tail call: bf16_add returns to our caller

# bf16_mul: a0 = a0 * a1 on bfloat16 bit patterns. Uses the `mul` instruction.
# Register use: s0/s1 = operands (s1 is reused as the result exponent),
# s2/s3 = signs, s4/s5 = exponents, s6/s7 = mantissas, s8 = result sign,
# s9 = exponent adjustment from normalising zero-exponent inputs,
# s10 = product / result mantissa.
.globl bf16_mul
bf16_mul:
    addi sp, sp, -64
    sw ra, 60(sp)
    sw s0, 56(sp)
    sw s1, 52(sp)
    sw s2, 48(sp)
    sw s3, 44(sp)
    sw s4, 40(sp)
    sw s5, 36(sp)
    sw s6, 32(sp)
    sw s7, 28(sp)
    sw s8, 24(sp)
    sw s9, 20(sp)
    sw s10, 16(sp)
    mv s0, a0
    mv s1, a1
    srli s2, s0, 15
    andi s2, s2, 1
    srli s3, s1, 15
    andi s3, s3, 1
    srli s4, s0, 7
    andi s4, s4, 0xFF
    srli s5, s1, 7
    andi s5, s5, 0xFF
    andi s6, s0, 0x7F
    andi s7, s1, 0x7F
    xor s8, s2, s3
    li t0, 0xFF
    bne s4, t0, bf16_mul_check_b_special
    bnez s6, bf16_mul_return_a      # a has exponent 0xFF and mantissa != 0: return a
    or t1, s5, s7
    bnez t1, bf16_mul_return_inf
    li a0, 0x7FC0                   # Inf * 0: NaN
    j bf16_mul_exit
bf16_mul_return_inf:
    slli a0, s8, 15
    li t0, 0x7F80                   # does not fit a 12-bit immediate
    or a0, a0, t0
    j bf16_mul_exit
bf16_mul_return_a:
    mv a0, s0
    j bf16_mul_exit
bf16_mul_check_b_special:
    li t0, 0xFF
    bne s5, t0, bf16_mul_check_zeros
    bnez s7, bf16_mul_return_b      # b has exponent 0xFF and mantissa != 0: return b
    or t1, s4, s6
    bnez t1, bf16_mul_return_inf_b
    li a0, 0x7FC0                   # 0 * Inf: NaN
    j bf16_mul_exit
bf16_mul_return_inf_b:
    slli a0, s8, 15
    li t0, 0x7F80                   # does not fit a 12-bit immediate
    or a0, a0, t0
    j bf16_mul_exit
bf16_mul_return_b:
    mv a0, s1
    j bf16_mul_exit
bf16_mul_check_zeros:
    or t0, s4, s6
    bnez t0, bf16_mul_check_b_zero
    slli a0, s8, 15                 # a is zero: signed zero
    j bf16_mul_exit
bf16_mul_check_b_zero:
    or t0, s5, s7
    bnez t0, bf16_mul_normalize
    slli a0, s8, 15                 # b is zero: signed zero
    j bf16_mul_exit
bf16_mul_normalize:
    li s9, 0
    bnez s4, bf16_mul_normalize_a_done
bf16_mul_normalize_a_loop:
    # Exponent field is zero: shift the mantissa up until bit 7 is set,
    # recording each shift in s9.
    andi t0, s6, 0x80
    bnez t0, bf16_mul_set_exp_a
    slli s6, s6, 1
    addi s9, s9, -1
    j bf16_mul_normalize_a_loop
bf16_mul_set_exp_a:
    li s4, 1
    j bf16_mul_normalize_b
bf16_mul_normalize_a_done:
    ori s6, s6, 0x80
bf16_mul_normalize_b:
    bnez s5, bf16_mul_normalize_b_done
bf16_mul_normalize_b_loop:
    andi t0, s7, 0x80
    bnez t0, bf16_mul_set_exp_b
    slli s7, s7, 1
    addi s9, s9, -1
    j bf16_mul_normalize_b_loop
bf16_mul_set_exp_b:
    li s5, 1
    j bf16_mul_multiply
bf16_mul_normalize_b_done:
    ori s7, s7, 0x80
bf16_mul_multiply:
    mul s10, s6, s7
    add t0, s4, s5
    li t1, 127
    sub t0, t0, t1
    add s1, t0, s9                  # s1 = exp_a + exp_b - 127 + adjustment
    lui t0, 0x8
    and t0, s10, t0                 # is bit 15 of the product set?
    beqz t0, bf16_mul_shift_7
    srli s10, s10, 8
    andi s10, s10, 0x7F
    addi s1, s1, 1
    j bf16_mul_check_overflow
bf16_mul_shift_7:
    srli s10, s10, 7
    andi s10, s10, 0x7F
bf16_mul_check_overflow:
    li t0, 0xFF
    blt s1, t0, bf16_mul_check_underflow
    slli a0, s8, 15                 # exponent >= 0xFF: signed Inf
    li t0, 0x7F80                   # does not fit a 12-bit immediate
    or a0, a0, t0
    j bf16_mul_exit
bf16_mul_check_underflow:
    bgtz s1, bf16_mul_pack_result
    li t0, -6
    blt s1, t0, bf16_mul_underflow_zero
    li t0, 1
    sub t0, t0, s1
    srl s10, s10, t0                # shift right by (1 - exponent)
    li s1, 0
    j bf16_mul_pack_result
bf16_mul_underflow_zero:
    slli a0, s8, 15
    j bf16_mul_exit
bf16_mul_pack_result:
    slli t0, s8, 15
    andi t1, s1, 0xFF
    slli t1, t1, 7
    or t0, t0, t1
    andi t1, s10, 0x7F
    or a0, t0, t1
bf16_mul_exit:
    lw ra, 60(sp)
    lw s0, 56(sp)
    lw s1, 52(sp)
    lw s2, 48(sp)
    lw s3, 44(sp)
    lw s4, 40(sp)
    lw s5, 36(sp)
    lw s6, 32(sp)
    lw s7, 28(sp)
    lw s8, 24(sp)
    lw s9, 20(sp)
    lw s10, 16(sp)
    addi sp, sp, 64
    jalr zero, ra, 0

# bf16_div: a0 = a0 / a1 on bfloat16 bit patterns.
# Register use: s0/s1 = operands (s1 is reused as the result exponent),
# s2/s3 = signs, s4/s5 = exponents, s6/s7 = mantissas, s8 = result sign,
# s9 = dividend/remainder, s10 = divisor, s11 = quotient.
.globl bf16_div
bf16_div:
    addi sp, sp, -80
    sw ra, 76(sp)
    sw s0, 72(sp)
    sw s1, 68(sp)
    sw s2, 64(sp)
    sw s3, 60(sp)
    sw s4, 56(sp)
    sw s5, 52(sp)
    sw s6, 48(sp)
    sw s7, 44(sp)
    sw s8, 40(sp)
    sw s9, 36(sp)
    sw s10, 32(sp)
    sw s11, 28(sp)
    mv s0, a0
    mv s1, a1
    srli s2, s0, 15
    andi s2, s2, 1
    srli s3, s1, 15
    andi s3, s3, 1
    srli s4, s0, 7
    andi s4, s4, 0xFF
    srli s5, s1, 7
    andi s5, s5, 0xFF
    andi s6, s0, 0x7F
    andi s7, s1, 0x7F
    xor s8, s2, s3
    li t0, 0xFF
    bne s5, t0, bf16_div_check_b_zero
    bnez s7, bf16_div_return_b      # b has exponent 0xFF and mantissa != 0: return b
    li t1, 0xFF
    bne s4, t1, bf16_div_b_inf_result
    bnez s6, bf16_div_b_inf_result
    li a0, 0x7FC0                   # Inf / Inf: NaN
    j bf16_div_exit
bf16_div_b_inf_result:
    slli a0, s8, 15                 # x / Inf: signed zero
    j bf16_div_exit
bf16_div_return_b:
    mv a0, s1
    j bf16_div_exit
bf16_div_check_b_zero:
    or t0, s5, s7
    bnez t0, bf16_div_check_a_special
    or t1, s4, s6
    bnez t1, bf16_div_b_zero_result
    li a0, 0x7FC0                   # 0 / 0: NaN
    j bf16_div_exit
bf16_div_b_zero_result:
    slli a0, s8, 15                 # x / 0: signed Inf
    li t0, 0x7F80                   # does not fit a 12-bit immediate
    or a0, a0, t0
    j bf16_div_exit
bf16_div_check_a_special:
    li t0, 0xFF
    bne s4, t0, bf16_div_check_a_zero
    bnez s6, bf16_div_return_a      # a has exponent 0xFF and mantissa != 0: return a
    slli a0, s8, 15                 # Inf / x: signed Inf
    li t0, 0x7F80                   # does not fit a 12-bit immediate
    or a0, a0, t0
    j bf16_div_exit
bf16_div_return_a:
    mv a0, s0
    j bf16_div_exit
bf16_div_check_a_zero:
    or t0, s4, s6
    bnez t0, bf16_div_normalize
    slli a0, s8, 15                 # 0 / x: signed zero
    j bf16_div_exit
bf16_div_normalize:
    bnez s4, bf16_div_normalize_a_done
    j bf16_div_normalize_b
bf16_div_normalize_a_done:
    ori s6, s6, 0x80
bf16_div_normalize_b:
    bnez s5, bf16_div_normalize_b_done
    j bf16_div_setup_division
bf16_div_normalize_b_done:
    ori s7, s7, 0x80
bf16_div_setup_division:
    slli s9, s6, 15
    mv s10, s7
    li s11, 0
    li t2, 0                        # loop counter i
bf16_div_loop:
    # 16 iterations of shift-and-subtract division:
    # compare the remainder with divisor << (15 - i).
    li t3, 16
    bge t2, t3, bf16_div_done
    slli s11, s11, 1
    sub t4, t3, t2
    addi t4, t4, -1                 # t4 = 15 - i
    sll t5, s10, t4
    bltu s9, t5, bf16_div_no_subtract
    sub s9, s9, t5
    ori s11, s11, 1
bf16_div_no_subtract:
    addi t2, t2, 1
    j bf16_div_loop
bf16_div_done:
    sub t0, s4, s5
    li t1, 127
    add s1, t0, t1                  # s1 = exp_a - exp_b + 127
    beqz s4, bf16_div_dec_exp
    j bf16_div_check_b_denorm
bf16_div_dec_exp:
    addi s1, s1, -1
bf16_div_check_b_denorm:
    beqz s5, bf16_div_inc_exp
    j bf16_div_normalize_quotient
bf16_div_inc_exp:
    addi s1, s1, 1
bf16_div_normalize_quotient:
    lui t0, 0x8
    and t0, s11, t0                 # is bit 15 of the quotient set?
    beqz t0, bf16_div_normalize_quotient_loop
    srli s11, s11, 8
    j bf16_div_mask_quotient
bf16_div_normalize_quotient_loop:
    lui t0, 0x8
    and t0, s11, t0
    bnez t0, bf16_div_normalize_quotient_done
    li t1, 1
    ble s1, t1, bf16_div_normalize_quotient_done
    slli s11, s11, 1
    addi s1, s1, -1
    j bf16_div_normalize_quotient_loop
bf16_div_normalize_quotient_done:
    srli s11, s11, 8
bf16_div_mask_quotient:
    andi s11, s11, 0x7F
    li t0, 0xFF
    blt s1, t0, bf16_div_check_underflow
    slli a0, s8, 15                 # exponent >= 0xFF: signed Inf
    li t0, 0x7F80                   # does not fit a 12-bit immediate
    or a0, a0, t0
    j bf16_div_exit
bf16_div_check_underflow:
    bgtz s1, bf16_div_pack_result
    slli a0, s8, 15                 # exponent <= 0: signed zero
    j bf16_div_exit
bf16_div_pack_result:
    slli t0, s8, 15
    andi t1, s1, 0xFF
    slli t1, t1, 7
    or t0, t0, t1
    andi t1, s11, 0x7F
    or a0, t0, t1
bf16_div_exit:
    lw ra, 76(sp)
    lw s0, 72(sp)
    lw s1, 68(sp)
    lw s2, 64(sp)
    lw s3, 60(sp)
    lw s4, 56(sp)
    lw s5, 52(sp)
    lw s6, 48(sp)
    lw s7, 44(sp)
    lw s8, 40(sp)
    lw s9, 36(sp)
    lw s10, 32(sp)
    lw s11, 28(sp)
    addi sp, sp, 80
    jalr zero, ra, 0
```
## ------------------------------------------------------------------------------------------
# Variable-Length Quantity
Variable-length quantity (VLQ) encoding is directly analogous to the bit manipulation required in `clz`, `uf8_encode`, and `uf8_decode`. For this demonstration, we use the core concept: integer manipulation for encoding and decoding, which is the fundamental logic of Problem B.
## Implementation
[https://github.com/sliceofcake/variable-length-quantity](https://github.com/sliceofcake/variable-length-quantity)
## C code
```
#include <stdint.h>
#include <stdio.h>

/* Sample inputs and result storage; mirrors the .data section of the assembly. */
uint32_t InputDataSet[] = {
    0x0000000a,
    0x00000040,
    0x00000400,
    0x00000003
};
uint32_t EncodedResults[4] = {0, 0, 0, 0};
uint32_t BF16Result = 0;

/* Count leading zeros by binary search over shift widths 16, 8, 4, 2, 1. */
uint32_t clz(uint32_t value) {
    uint32_t t0 = 32;   /* running count */
    uint32_t t1 = 16;   /* current shift width */
    uint32_t t2;
    while (t1 != 0) {
        t2 = value >> t1;
        if (t2 != 0) {
            t0 = t0 - t1;
            value = t2;
        }
        t1 = t1 >> 1;
    }
    return t0 - value;
}

/* Placeholder encoder: values below 16 are returned unchanged; anything else
 * yields the fixed constant 0x54. The clz result is computed but not used. */
uint32_t uf8_encode(uint32_t input) {
    uint32_t t0 = 16;
    if (input < t0) {
        return input;
    }
    uint32_t a1 = input;
    uint32_t t1 = clz(a1);
    uint32_t t2 = 31 - t1;
    return 0x54;
}

/* Place the 16-bit pattern in the upper half of a 32-bit word. */
uint32_t bf16_to_f32(uint32_t bf16_val) {
    return bf16_val << 16;
}

/* Placeholder adder: ignores its arguments and returns 0x4060. */
uint32_t bf16_add(uint32_t a0, uint32_t a1) {
    return 0x4060;
}

int main() {
    uint32_t s0 = 0;
    uint32_t t0_limit = 4;
    printf("--- 執行 uf8_encode ---\n");
    while (s0 < t0_limit) {
        uint32_t a0 = InputDataSet[s0];
        uint32_t t2 = uf8_encode(a0);
        EncodedResults[s0] = t2;
        printf("InputDataSet[%u] (0x%x) -> EncodedResults[%u] (0x%x)\n", s0, a0, s0, t2);
        s0++;
    }
    uint32_t a0_op1 = 0x4120;
    uint32_t a1_op2 = 0x4040;
    uint32_t t0_result = bf16_add(a0_op1, a1_op2);
    BF16Result = t0_result;
    printf("\n--- 執行 bf16_add ---\n");
    printf("BF16Result: 0x%x\n", BF16Result);
    return 0;
}
```
## Assembly code
``` asm
# Driver: encodes four sample words with uf8_encode, then stores the result of
# one bf16_add call.
# NOTE(review): uf8_encode and bf16_add in this listing are placeholders that
# return fixed constants (0x54 and 0x4060); they are not full implementations.
.data
.align 2
InputDataSet:
    .word 0x0000000a
    .word 0x00000040
    .word 0x00000400
    .word 0x00000003
.align 2
EncodedResults:
    .space 16
.align 2
BF16Result:
    .space 4

.text
.global main

# main: for i in 0..3, EncodedResults[i] = uf8_encode(InputDataSet[i]);
#       then BF16Result = bf16_add(0x4120, 0x4040).
# s0 holds the loop index and is preserved across calls.
main:
    addi sp, sp, -8
    sw ra, 4(sp)
    sw s0, 0(sp)
    li s0, 0
main_loop_start:
    # The bound is reloaded on every pass: t0 is a caller-saved temporary and
    # both uf8_encode and clz overwrite it, so it cannot live across the call.
    li t0, 4
    bge s0, t0, main_loop_end
    slli t1, s0, 2                  # byte offset = index * 4
    la t2, InputDataSet
    add t1, t2, t1
    lw a0, 0(t1)
    jal ra, uf8_encode
    slli t1, s0, 2                  # recompute: t1/t2 are clobbered by the call
    la t2, EncodedResults
    add t1, t2, t1
    sw a0, 0(t1)
    addi s0, s0, 1
    j main_loop_start
main_loop_end:
    li a0, 16672 # 0x4120
    li a1, 16448 # 0x4040
    jal ra, bf16_add
    la t0, BF16Result
    sw a0, 0(t0)
    lw s0, 0(sp)
    lw ra, 4(sp)
    addi sp, sp, 8
    ret

# clz: count leading zeros of a0 by binary search over shift widths 16,8,4,2,1.
# In:  a0 = value      Out: a0 = number of leading zero bits
# Clobbers t0, t1, t2.
clz:
    li t0, 32                       # running count
    li t1, 16                       # current shift width
clz_loop:
    beq t1, zero, clz_end_loop
    srl t2, a0, t1
    beq t2, zero, clz_skip_if       # upper part empty: keep value, keep count
    sub t0, t0, t1                  # upper part non-empty: discard lower bits
    mv a0, t2
clz_skip_if:
    srli t1, t1, 1
    j clz_loop
clz_end_loop:
    sub a0, t0, a0
    ret

# uf8_encode (placeholder): returns a0 unchanged when a0 < 16 (signed compare),
# otherwise calls clz and returns the constant 0x54.
# In: a0 = value      Out: a0 = encoded value
# Clobbers t0, t1, t2.
uf8_encode:
    addi sp, sp, -4
    sw ra, 0(sp)
    li t0, 16
    blt a0, t0, uf8_return_input
    jal ra, clz
    li t1, 31
    sub t0, t1, a0                  # 31 - clz(value); computed but not used
    li a0, 84 # 0x54
    j uf8_epilogue
uf8_return_input:
uf8_epilogue:
    lw ra, 0(sp)
    addi sp, sp, 4
    ret

# bf16_to_f32: place the 16-bit pattern in the upper half of a 32-bit word.
bf16_to_f32:
    slli a0, a0, 16
    ret

# bf16_add (placeholder): ignores its arguments and returns 0x4060.
bf16_add:
    li a0, 16480 # 0x4060
    ret
```
## Analysis
- The translated code
```
啟動 (main):程式從 main 開始,並初始化一個迴圈計數器 s0 (設為 0) 和迴圈次數 t0 (設為 4)。
---
進入迴圈 (共 4 次):
第 1 次 (s0=0):
讀取 InputDataSet[0] (值為 0xa,即 10)。
呼叫 uf8_encode(10)。因為 10 < 16,函式直接回傳 10。
將 10 存入 EncodedResults[0]。
第 2 次 (s0=1):
讀取 InputDataSet[1] (值為 0x40,即 64)。
呼叫 uf8_encode(64)。因為 64 >= 16,函式會...
先呼叫 clz(64) (計算 64 的前導 0 個數)。
但 uf8_encode 忽略 clz 的結果,固定回傳 84 (即 0x54)。
將 84 存入 EncodedResults[1]。
第 3 次 (s0=2):
讀取 InputDataSet[2] (值為 0x400,即 1024)。
呼叫 uf8_encode(1024)。因為 1024 >= 16,函式同樣固定回傳 84。
將 84 存入 EncodedResults[2]。
第 4 次 (s0=3):
讀取 InputDataSet[3] (值為 0x3,即 3)。
呼叫 uf8_encode(3)。因為 3 < 16,函式直接回傳 3。
將 3 存入 EncodedResults[3]。
迴圈結束:計數器 s0 變為 4,s0 >= t0 成立,跳出迴圈。
---
最終計算:
呼叫 bf16_add(16672, 16448) (這是兩個固定的數字)。
bf16_add 函式固定回傳 16480 (即 0x4060)。
將 16480 存入 BF16Result 變數。
結束:程式清理堆疊並返回,執行完畢。
---
總結
程式執行完畢後,記憶體中的變數狀態會是:
EncodedResults 陣列會包含:[10, 84, 84, 3]
BF16Result 變數會是:16480
```
- 5-Stage RISC_V Processor w/o Forwarding or Hazard Detection
![image](https://hackmd.io/_uploads/r1TynSlxbl.png)
- Memory address
![image](https://hackmd.io/_uploads/r1UDiBlebl.png)
- Register result
![image](https://hackmd.io/_uploads/HklpoSle-l.png)