# Variable-Length Quantity
## Problem B
```asm
.data
# Four 32-bit input words; main passes them to uf8_encode one at a time.
.align 2
InputDataSet:
.word 0x0000000a
.word 0x00000040
.word 0x00000400
.word 0x00000003
# 16 bytes reserved: main stores one 32-bit uf8_encode result per input word.
.align 2
EncodedResults:
.space 16
# 4 bytes reserved: main stores the value returned by bf16_add here.
.align 2
BF16Result:
.space 4
.text
.global main
# main: encode every word of InputDataSet with uf8_encode, store the results
# in EncodedResults, then store bf16_add(0x4120, 0x4040) in BF16Result.
#
# Out:   a0 = 0
# Uses:  s0 = loop index, s1 = element count (both saved and restored)
# Stack: 16 bytes, so sp stays 16-byte aligned across the calls
main:
    addi sp, sp, -16
    sw   ra, 12(sp)
    sw   s0, 8(sp)
    sw   s1, 4(sp)
    li   s0, 0                  # s0 = i
    # The bound must live in a callee-saved register: uf8_encode and clz
    # overwrite t0, so a bound kept in t0 is garbage after the first call.
    li   s1, 4                  # s1 = number of input words
main_loop_start:
    bge  s0, s1, main_loop_end
    slli t1, s0, 2              # byte offset = i * 4
    la   t2, InputDataSet
    add  t1, t2, t1
    lw   a0, 0(t1)              # a0 = InputDataSet[i]
    jal  ra, uf8_encode
    slli t1, s0, 2              # recompute: t1/t2 do not survive the call
    la   t2, EncodedResults
    add  t1, t2, t1
    sw   a0, 0(t1)              # EncodedResults[i] = encoded value
    addi s0, s0, 1
    j    main_loop_start
main_loop_end:
    li   a0, 0x4120             # first bf16 operand
    li   a1, 0x4040             # second bf16 operand
    jal  ra, bf16_add
    la   t0, BF16Result
    sw   a0, 0(t0)
    li   a0, 0                  # return 0, like the C reference main
    lw   s1, 4(sp)
    lw   s0, 8(sp)
    lw   ra, 12(sp)
    addi sp, sp, 16
    ret
# clz: count leading zero bits of a 32-bit value by binary search.
# In:    a0 = value
# Out:   a0 = number of leading zeros (32 when the input is 0)
# Uses:  t0 = running count, t1 = current shift (16, 8, 4, 2, 1), t2 = scratch
clz:
    li   t0, 32
    li   t1, 16
clz_probe:
    srl  t2, a0, t1             # t2 = bits above the current shift
    beqz t2, clz_halve          # nothing up there: keep the low part
    sub  t0, t0, t1             # the top t1 bits are not all zero
    mv   a0, t2                 # continue the search in the upper part
clz_halve:
    srli t1, t1, 1
    bnez t1, clz_probe          # shift starts non-zero, so test at the bottom
    sub  a0, t0, a0             # a0 is 0 or 1 here: final adjustment
    ret
# uf8_encode: placeholder encoder.
# In:    a0 = unsigned 32-bit value
# Out:   a0 = the input itself when it is below 16, otherwise the constant 0x54
# Calls: clz (its result is only used to derive the MSB index, which the
#        placeholder does not consume yet)
# Stack: 16 bytes, so sp stays 16-byte aligned across the call
uf8_encode:
    addi sp, sp, -16
    sw   ra, 12(sp)
    li   t0, 16
    # Unsigned compare, as in the C reference (uint32_t): with a signed blt,
    # inputs >= 0x80000000 would look negative and be returned unchanged.
    bltu a0, t0, uf8_epilogue   # small values are their own encoding
    jal  ra, clz
    li   t1, 31
    sub  t0, t1, a0             # t0 = index of the most significant set bit
    li   a0, 0x54               # fixed placeholder result
uf8_epilogue:
    lw   ra, 12(sp)
    addi sp, sp, 16
    ret
# bf16_to_f32: widen a bf16 bit pattern to the float32 bit pattern.
# In:  a0 = bf16 bits in the low half-word
# Out: a0 = input shifted left by 16
bf16_to_f32:
    slli a0, a0, 16
    jr   ra
# bf16_add: placeholder; ignores both operands.
# In:  a0, a1 = bf16 operands (unused)
# Out: a0 = constant 0x4060
bf16_add:
    li   a0, 0x4060
    jr   ra
```
## Problem C
```
.data
# bfloat16 field masks and special values, kept as data.
# NOTE(review): none of the routines visible in this listing load from these
# labels; they embed the same values as immediates.
BF16_SIGN_MASK: .word 0x8000 # bit 15: sign
BF16_EXP_MASK: .word 0x7F80 # bits 14..7: exponent
BF16_MANT_MASK: .word 0x007F # bits 6..0: mantissa
BF16_EXP_BIAS: .word 127 # exponent bias
BF16_NAN_VAL: .half 0x7FC0 # NaN pattern returned by the arithmetic routines
BF16_ZERO_VAL: .half 0x0000 # positive zero
.text
.globl bf16_isnan
# bf16_isnan: test whether a bf16 bit pattern is a NaN.
# In:    a0 = bf16 bits (only bits 14..0 are examined)
# Out:   a0 = 1 if the exponent is all ones and the mantissa is non-zero, else 0
# Uses:  t0, t1
bf16_isnan:
    # 0x7F80 / 0x7FFF do not fit the 12-bit signed immediate of andi,
    # so the masks are materialised with li and applied with and.
    li   t0, 0x7FFF
    and  t0, a0, t0             # drop the sign bit
    li   t1, 0x7F80             # magnitude of infinity
    sltu a0, t1, t0             # NaN <=> magnitude strictly above infinity
    ret
.globl bf16_isinf
# bf16_isinf: test whether a bf16 bit pattern is +/- infinity.
# In:    a0 = bf16 bits (only bits 14..0 are examined)
# Out:   a0 = 1 if the exponent is all ones and the mantissa is zero, else 0
# Uses:  t0, t1
bf16_isinf:
    # The masks exceed the 12-bit signed immediate of andi, hence li + and.
    li   t0, 0x7FFF
    and  t0, a0, t0             # drop the sign bit
    li   t1, 0x7F80             # magnitude of infinity
    xor  t0, t0, t1
    seqz a0, t0                 # 1 only on an exact match
    ret
.globl bf16_iszero
# bf16_iszero: test whether a bf16 bit pattern is +0 or -0.
# In:    a0 = bf16 bits (only bits 14..0 are examined)
# Out:   a0 = 1 if exponent and mantissa are both zero, else 0
# Uses:  t0
bf16_iszero:
    # 0x7FFF exceeds the 12-bit signed immediate of andi, hence li + and.
    li   t0, 0x7FFF
    and  t0, a0, t0             # drop the sign bit
    seqz a0, t0
    ret
.globl f32_to_bf16
# f32_to_bf16: convert a float32 bit pattern to bf16 (RV32).
# In:    a0 = float32 bits
# Out:   a0 = bf16 bits in the low half-word, upper half zero
# Uses:  t0, t1
# Finite values are rounded to nearest, ties to even. Infinity and NaN are
# truncated; a NaN is additionally given the quiet bit so that a payload held
# only in the discarded low 16 bits cannot turn it into infinity.
f32_to_bf16:
    srli t0, a0, 23
    andi t0, t0, 0xFF           # t0 = float32 exponent field
    li   t1, 0xFF
    bne  t0, t1, f32_to_bf16_normal
    slli t1, a0, 9              # t1 != 0 <=> float32 mantissa is non-zero (NaN)
    srli a0, a0, 16             # the logical shift already clears bits 31..16
    beqz t1, f32_to_bf16_done   # infinity: nothing else to do
    ori  a0, a0, 0x40           # NaN: force a non-zero bf16 mantissa
f32_to_bf16_done:
    ret
f32_to_bf16_normal:
    srli t0, a0, 16
    andi t0, t0, 1              # t0 = lowest bit that survives (tie breaker)
    add  a0, a0, t0
    li   t1, 0x7FFF             # just under half of the discarded range
    add  a0, a0, t1
    srli a0, a0, 16
    ret
.globl bf16_to_f32
# bf16_to_f32: widen a bf16 bit pattern to the float32 bit pattern.
# In:  a0 = bf16 bits in the low half-word
# Out: a0 = input shifted left by 16
bf16_to_f32:
    slli a0, a0, 16
    ret
.globl bf16_add
# bf16_add: add two bf16 values given as bit patterns.
# In:    a0 = a, a1 = b
# Out:   a0 = a + b as bf16 bits
# Uses:  t0, t1; s0-s11 are saved and restored (64-byte frame)
#
# Register roles after unpacking:
#   s0/s1 = raw a/b        s2/s3 = sign_a/sign_b   s4/s5 = exp_a/exp_b
#   s6/s7 = mant_a/mant_b  s8 = exp_a - exp_b      s9 = result exponent
#   s10 = result sign      s11 = result mantissa
#
# Bits shifted out during alignment are discarded (no rounding).
bf16_add:
    addi sp, sp, -64
    sw   ra, 60(sp)
    sw   s0, 56(sp)
    sw   s1, 52(sp)
    sw   s2, 48(sp)
    sw   s3, 44(sp)
    sw   s4, 40(sp)
    sw   s5, 36(sp)
    sw   s6, 32(sp)
    sw   s7, 28(sp)
    sw   s8, 24(sp)
    sw   s9, 20(sp)
    sw   s10, 16(sp)
    sw   s11, 12(sp)

    # ---- unpack both operands ----
    mv   s0, a0
    mv   s1, a1
    srli s2, s0, 15
    andi s2, s2, 1
    srli s3, s1, 15
    andi s3, s3, 1
    srli s4, s0, 7
    andi s4, s4, 0xFF
    srli s5, s1, 7
    andi s5, s5, 0xFF
    andi s6, s0, 0x7F
    andi s7, s1, 0x7F

    # ---- a is Inf or NaN ----
    li   t0, 0xFF
    bne  s4, t0, bf16_add_check_b_special
    bnez s6, bf16_add_return_a          # a is NaN
    bne  s5, t0, bf16_add_return_a      # Inf + finite
    bnez s7, bf16_add_return_b          # Inf + NaN: hand back b's NaN
    beq  s2, s3, bf16_add_return_b      # Inf + Inf, same sign
    li   a0, 0x7FC0                     # Inf + (-Inf)
    j    bf16_add_exit
bf16_add_return_b:
    mv   a0, s1
    j    bf16_add_exit
bf16_add_return_a:
    mv   a0, s0
    j    bf16_add_exit

    # ---- b is Inf or NaN (t0 still holds 0xFF) ----
bf16_add_check_b_special:
    beq  s5, t0, bf16_add_return_b

    # ---- either operand is zero ----
    or   t0, s4, s6
    beqz t0, bf16_add_return_b
    or   t0, s5, s7
    beqz t0, bf16_add_return_a

    # ---- restore the implicit leading 1 of normal numbers ----
    beqz s4, bf16_add_normalize_b
    ori  s6, s6, 0x80
bf16_add_normalize_b:
    beqz s5, bf16_add_align_exp
    ori  s7, s7, 0x80

    # ---- align the operand with the smaller exponent ----
bf16_add_align_exp:
    sub  s8, s4, s5
    bgez s8, bf16_add_exp_diff_positive
    mv   s9, s5                         # b has the larger exponent
    neg  t0, s8
    li   t1, 8
    bgt  t0, t1, bf16_add_return_b      # a is too small to matter
    srl  s6, s6, t0
    j    bf16_add_perform_addition
bf16_add_exp_diff_positive:
    mv   s9, s4                         # also covers equal exponents
    li   t1, 8
    bgt  s8, t1, bf16_add_return_a      # b is too small to matter
    srl  s7, s7, s8                     # shift by 0 when exponents match

    # ---- same sign: add magnitudes ----
bf16_add_perform_addition:
    bne  s2, s3, bf16_add_subtract
    mv   s10, s2
    add  s11, s6, s7
    andi t0, s11, 0x100
    beqz t0, bf16_add_pack_result
    srli s11, s11, 1                    # carry out: renormalise
    addi s9, s9, 1
    li   t0, 0xFF
    blt  s9, t0, bf16_add_pack_result
    # Exponent overflow -> signed infinity. 0x7F80 does not fit the 12-bit
    # immediate of ori, so it is loaded into a register first.
    slli a0, s10, 15
    li   t1, 0x7F80
    or   a0, a0, t1
    j    bf16_add_exit

    # ---- opposite signs: subtract the smaller magnitude ----
bf16_add_subtract:
    bltu s6, s7, bf16_add_b_larger
    mv   s10, s2
    sub  s11, s6, s7
    j    bf16_add_normalize_result
bf16_add_b_larger:
    mv   s10, s3
    sub  s11, s7, s6
bf16_add_normalize_result:
    beqz s11, bf16_add_result_zero      # exact cancellation
bf16_add_normalize_loop:
    andi t0, s11, 0x80
    bnez t0, bf16_add_pack_result       # leading 1 is back in place
    slli s11, s11, 1
    addi s9, s9, -1
    bgtz s9, bf16_add_normalize_loop
bf16_add_result_zero:                   # cancelled out or exponent ran out
    li   a0, 0
    j    bf16_add_exit

    # ---- assemble sign | exponent | mantissa ----
bf16_add_pack_result:
    slli t0, s10, 15
    andi t1, s9, 0xFF
    slli t1, t1, 7
    or   t0, t0, t1
    andi t1, s11, 0x7F
    or   a0, t0, t1
bf16_add_exit:
    lw   ra, 60(sp)
    lw   s0, 56(sp)
    lw   s1, 52(sp)
    lw   s2, 48(sp)
    lw   s3, 44(sp)
    lw   s4, 40(sp)
    lw   s5, 36(sp)
    lw   s6, 32(sp)
    lw   s7, 28(sp)
    lw   s8, 24(sp)
    lw   s9, 20(sp)
    lw   s10, 16(sp)
    lw   s11, 12(sp)
    addi sp, sp, 64
    ret
.globl bf16_sub
# bf16_sub: a - b, computed as a + (-b).
# In:    a0 = a, a1 = b (bf16 bits)
# Out:   a0 = result produced by bf16_add
# Uses:  t0; a1 is modified
bf16_sub:
    li   t0, 0x8000             # bf16 sign bit
    xor  a1, a1, t0             # negate b
    j    bf16_add               # tail call: bf16_add returns to our caller
.globl bf16_mul
# bf16_mul: multiply two bf16 values given as bit patterns.
# In:    a0 = a, a1 = b
# Out:   a0 = a * b as bf16 bits (mantissa truncated, no rounding)
# Uses:  t0, t1; s0-s10 are saved and restored (64-byte frame)
# Needs: the `mul` instruction
#
# Register roles after unpacking:
#   s0/s1 = raw a/b (s1 is later reused as the result exponent)
#   s2/s3 = sign_a/sign_b   s4/s5 = exp_a/exp_b   s6/s7 = mant_a/mant_b
#   s8 = result sign        s9 = exponent adjustment from subnormal inputs
#   s10 = result mantissa
bf16_mul:
    addi sp, sp, -64
    sw   ra, 60(sp)
    sw   s0, 56(sp)
    sw   s1, 52(sp)
    sw   s2, 48(sp)
    sw   s3, 44(sp)
    sw   s4, 40(sp)
    sw   s5, 36(sp)
    sw   s6, 32(sp)
    sw   s7, 28(sp)
    sw   s8, 24(sp)
    sw   s9, 20(sp)
    sw   s10, 16(sp)

    # ---- unpack both operands ----
    mv   s0, a0
    mv   s1, a1
    srli s2, s0, 15
    andi s2, s2, 1
    srli s3, s1, 15
    andi s3, s3, 1
    srli s4, s0, 7
    andi s4, s4, 0xFF
    srli s5, s1, 7
    andi s5, s5, 0xFF
    andi s6, s0, 0x7F
    andi s7, s1, 0x7F
    xor  s8, s2, s3

    # ---- a is Inf or NaN ----
    li   t0, 0xFF
    bne  s4, t0, bf16_mul_check_b_special
    bnez s6, bf16_mul_return_a          # a is NaN
    bne  s5, t0, bf16_mul_a_is_inf
    bnez s7, bf16_mul_return_b          # Inf * NaN is NaN, not Inf
bf16_mul_a_is_inf:
    or   t1, s5, s7
    bnez t1, bf16_mul_return_inf        # Inf * non-zero
    j    bf16_mul_return_nan            # Inf * 0

    # ---- b is Inf or NaN (t0 still holds 0xFF) ----
bf16_mul_check_b_special:
    bne  s5, t0, bf16_mul_check_zeros
    bnez s7, bf16_mul_return_b          # b is NaN
    or   t1, s4, s6
    bnez t1, bf16_mul_return_inf        # non-zero * Inf
bf16_mul_return_nan:                    # 0 * Inf falls through to here
    li   a0, 0x7FC0
    j    bf16_mul_exit
bf16_mul_return_inf:
    # 0x7F80 does not fit the 12-bit immediate of ori; go through a register.
    slli a0, s8, 15
    li   t1, 0x7F80
    or   a0, a0, t1
    j    bf16_mul_exit
bf16_mul_return_a:
    mv   a0, s0
    j    bf16_mul_exit
bf16_mul_return_b:
    mv   a0, s1
    j    bf16_mul_exit
bf16_mul_return_zero:
    slli a0, s8, 15                     # signed zero
    j    bf16_mul_exit

    # ---- either operand is zero ----
bf16_mul_check_zeros:
    or   t0, s4, s6
    beqz t0, bf16_mul_return_zero
    or   t0, s5, s7
    beqz t0, bf16_mul_return_zero

    # ---- bring both mantissas to the form 1.xxxxxxx (bit 7 set) ----
    li   s9, 0
    bnez s4, bf16_mul_normalize_a_done
bf16_mul_normalize_a_loop:              # subnormal a: mantissa is non-zero here
    andi t0, s6, 0x80
    bnez t0, bf16_mul_set_exp_a
    slli s6, s6, 1
    addi s9, s9, -1
    j    bf16_mul_normalize_a_loop
bf16_mul_set_exp_a:
    li   s4, 1
    j    bf16_mul_normalize_b
bf16_mul_normalize_a_done:
    ori  s6, s6, 0x80
bf16_mul_normalize_b:
    bnez s5, bf16_mul_normalize_b_done
bf16_mul_normalize_b_loop:              # subnormal b: mantissa is non-zero here
    andi t0, s7, 0x80
    bnez t0, bf16_mul_set_exp_b
    slli s7, s7, 1
    addi s9, s9, -1
    j    bf16_mul_normalize_b_loop
bf16_mul_set_exp_b:
    li   s5, 1
    j    bf16_mul_multiply
bf16_mul_normalize_b_done:
    ori  s7, s7, 0x80

    # ---- multiply mantissas, add exponents ----
bf16_mul_multiply:
    mul  s10, s6, s7                    # 8 x 8 -> up to 16 significant bits
    add  s1, s4, s5
    addi s1, s1, -127                   # remove one bias
    add  s1, s1, s9
    li   t0, 0x8000
    and  t0, s10, t0
    beqz t0, bf16_mul_shift_7
    srli s10, s10, 8                    # product in [2, 4): renormalise
    addi s1, s1, 1
    j    bf16_mul_check_overflow
bf16_mul_shift_7:
    srli s10, s10, 7                    # product in [1, 2)
    # s10 is now 8 bits wide with the leading 1 in bit 7. It is deliberately
    # NOT masked to 7 bits yet: the subnormal path below must shift the
    # leading 1 into the stored mantissa, otherwise the result flushes to 0.

    # ---- range checks ----
bf16_mul_check_overflow:
    li   t0, 0xFF
    bge  s1, t0, bf16_mul_return_inf
    bgtz s1, bf16_mul_pack_result
    li   t0, -6
    blt  s1, t0, bf16_mul_return_zero   # too small even for a subnormal
    li   t0, 1
    sub  t0, t0, s1                     # shift count 1..7
    srl  s10, s10, t0
    li   s1, 0                          # subnormal result

    # ---- assemble sign | exponent | mantissa ----
bf16_mul_pack_result:
    slli t0, s8, 15
    andi t1, s1, 0xFF
    slli t1, t1, 7
    or   t0, t0, t1
    andi t1, s10, 0x7F                  # drops the leading 1 of normal results
    or   a0, t0, t1
bf16_mul_exit:
    lw   ra, 60(sp)
    lw   s0, 56(sp)
    lw   s1, 52(sp)
    lw   s2, 48(sp)
    lw   s3, 44(sp)
    lw   s4, 40(sp)
    lw   s5, 36(sp)
    lw   s6, 32(sp)
    lw   s7, 28(sp)
    lw   s8, 24(sp)
    lw   s9, 20(sp)
    lw   s10, 16(sp)
    addi sp, sp, 64
    ret
.globl bf16_div
# bf16_div: divide two bf16 values given as bit patterns.
# In:    a0 = a (dividend), a1 = b (divisor)
# Out:   a0 = a / b as bf16 bits (quotient truncated, no rounding)
# Uses:  t0, t1, t4, t5; s0-s11 are saved and restored (80-byte frame)
#
# Register roles after unpacking:
#   s0/s1 = raw a/b (s1 is later reused as the result exponent)
#   s2/s3 = sign_a/sign_b   s4/s5 = exp_a/exp_b   s6/s7 = mant_a/mant_b
#   s8 = result sign        s9 = running remainder   s10 = divisor
#   s11 = quotient
bf16_div:
    addi sp, sp, -80
    sw   ra, 76(sp)
    sw   s0, 72(sp)
    sw   s1, 68(sp)
    sw   s2, 64(sp)
    sw   s3, 60(sp)
    sw   s4, 56(sp)
    sw   s5, 52(sp)
    sw   s6, 48(sp)
    sw   s7, 44(sp)
    sw   s8, 40(sp)
    sw   s9, 36(sp)
    sw   s10, 32(sp)
    sw   s11, 28(sp)

    # ---- unpack both operands ----
    mv   s0, a0
    mv   s1, a1
    srli s2, s0, 15
    andi s2, s2, 1
    srli s3, s1, 15
    andi s3, s3, 1
    srli s4, s0, 7
    andi s4, s4, 0xFF
    srli s5, s1, 7
    andi s5, s5, 0xFF
    andi s6, s0, 0x7F
    andi s7, s1, 0x7F
    xor  s8, s2, s3

    # ---- b is Inf or NaN ----
    li   t0, 0xFF
    bne  s5, t0, bf16_div_b_finite
    bnez s7, bf16_div_return_b          # b is NaN
    bne  s4, t0, bf16_div_return_zero   # finite / Inf
    bnez s6, bf16_div_return_a          # NaN / Inf is NaN, not zero
    j    bf16_div_return_nan            # Inf / Inf

    # ---- b is finite; a is Inf or NaN (t0 still holds 0xFF) ----
bf16_div_b_finite:
    bne  s4, t0, bf16_div_check_b_zero
    bnez s6, bf16_div_return_a          # NaN / x is NaN, also when x is 0
    j    bf16_div_return_inf            # Inf / finite

    # ---- both finite: division by zero ----
bf16_div_check_b_zero:
    or   t0, s5, s7
    bnez t0, bf16_div_check_a_zero
    or   t1, s4, s6
    bnez t1, bf16_div_return_inf        # non-zero / 0
bf16_div_return_nan:                    # 0 / 0 falls through to here
    li   a0, 0x7FC0
    j    bf16_div_exit
bf16_div_return_inf:
    # 0x7F80 does not fit the 12-bit immediate of ori; go through a register.
    slli a0, s8, 15
    li   t1, 0x7F80
    or   a0, a0, t1
    j    bf16_div_exit
bf16_div_return_zero:
    slli a0, s8, 15                     # signed zero
    j    bf16_div_exit
bf16_div_return_a:
    mv   a0, s0
    j    bf16_div_exit
bf16_div_return_b:
    mv   a0, s1
    j    bf16_div_exit

    # ---- zero dividend ----
bf16_div_check_a_zero:
    or   t0, s4, s6
    beqz t0, bf16_div_return_zero

    # ---- restore the implicit leading 1 of normal numbers ----
    beqz s4, bf16_div_normalize_b
    ori  s6, s6, 0x80
bf16_div_normalize_b:
    beqz s5, bf16_div_setup_division
    ori  s7, s7, 0x80

    # ---- 16-step restoring division of (mant_a << 15) by mant_b ----
bf16_div_setup_division:
    slli s9, s6, 15
    mv   s10, s7
    li   s11, 0
    li   t4, 15                         # bit position, 15 down to 0
bf16_div_loop:
    slli s11, s11, 1
    sll  t5, s10, t4                    # divisor aligned to this quotient bit
    bltu s9, t5, bf16_div_no_subtract
    sub  s9, s9, t5
    ori  s11, s11, 1
bf16_div_no_subtract:
    addi t4, t4, -1
    bgez t4, bf16_div_loop

    # ---- result exponent ----
    sub  s1, s4, s5
    addi s1, s1, 127                    # re-apply the bias
    bnez s4, bf16_div_check_b_denorm
    addi s1, s1, -1                     # subnormal dividend
bf16_div_check_b_denorm:
    bnez s5, bf16_div_normalize_quotient
    addi s1, s1, 1                      # subnormal divisor

    # ---- shift the quotient until bit 15 is set (or the exponent hits 1) ----
bf16_div_normalize_quotient:
    li   t0, 0x8000
bf16_div_normalize_quotient_loop:
    and  t1, s11, t0
    bnez t1, bf16_div_normalize_quotient_done
    li   t1, 1
    ble  s1, t1, bf16_div_normalize_quotient_done
    slli s11, s11, 1
    addi s1, s1, -1
    j    bf16_div_normalize_quotient_loop
bf16_div_normalize_quotient_done:
    srli s11, s11, 8
    andi s11, s11, 0x7F

    # ---- range checks ----
    li   t0, 0xFF
    bge  s1, t0, bf16_div_return_inf
    blez s1, bf16_div_return_zero

    # ---- assemble sign | exponent | mantissa ----
    slli t0, s8, 15
    andi t1, s1, 0xFF
    slli t1, t1, 7
    or   t0, t0, t1
    andi t1, s11, 0x7F
    or   a0, t0, t1
bf16_div_exit:
    lw   ra, 76(sp)
    lw   s0, 72(sp)
    lw   s1, 68(sp)
    lw   s2, 64(sp)
    lw   s3, 60(sp)
    lw   s4, 56(sp)
    lw   s5, 52(sp)
    lw   s6, 48(sp)
    lw   s7, 44(sp)
    lw   s8, 40(sp)
    lw   s9, 36(sp)
    lw   s10, 32(sp)
    lw   s11, 28(sp)
    addi sp, sp, 80
    ret
```
## ------------------------------------------------------------------------------------------
# Variable-Length Quantity
Variable-length quantity encoding relies on the same kind of integer bit manipulation that clz, uf8_encode, and uf8_decode require.
This demonstration uses only that core concept, integer manipulation for encoding and decoding, which is the fundamental logic of Problem B.
## Implementation
[https://github.com/sliceofcake/variable-length-quantity](https://github.com/sliceofcake/variable-length-quantity)
## C code
```
#include <stdint.h>
#include <stdio.h>
/* Input words passed to uf8_encode by main, one per loop iteration. */
uint32_t InputDataSet[] = {
0x0000000a,
0x00000040,
0x00000400,
0x00000003
};
/* One uf8_encode result per input word, written by main. */
uint32_t EncodedResults[4] = {0, 0, 0, 0};
/* Value returned by bf16_add, written by main. */
uint32_t BF16Result = 0;
/*
 * Count the leading zero bits of a 32-bit value by binary search.
 * Returns 32 when value is 0.
 */
uint32_t clz(uint32_t value) {
    uint32_t count = 32;

    /* Probe the upper half of the remaining window: 16, 8, 4, 2, 1 bits. */
    for (uint32_t shift = 16; shift != 0; shift >>= 1) {
        uint32_t upper = value >> shift;
        if (upper != 0) {
            count -= shift;
            value = upper;
        }
    }

    /* value is 0 or 1 here; subtracting it applies the final adjustment. */
    return count - value;
}
/*
 * Placeholder encoder: values below 16 are returned unchanged, every other
 * value yields the fixed code 0x54. The most-significant-bit index is
 * computed but not consumed yet.
 */
uint32_t uf8_encode(uint32_t input) {
    if (input >= 16) {
        uint32_t leading_zeros = clz(input);
        uint32_t msb_index = 31 - leading_zeros;
        (void)msb_index; /* not used by the placeholder result */
        return 0x54;
    }
    return input;
}
/* Widen a bf16 bit pattern to a float32 bit pattern (shift left by 16). */
uint32_t bf16_to_f32(uint32_t bf16_val) {
    uint32_t widened = bf16_val << 16;
    return widened;
}
/* Placeholder: ignores both operands and returns the fixed value 0x4060. */
uint32_t bf16_add(uint32_t a0, uint32_t a1) {
    (void)a0;
    (void)a1;
    return 0x4060u;
}
/*
 * Encode every word of InputDataSet into EncodedResults, printing each pair,
 * then store and print the result of bf16_add(0x4120, 0x4040).
 */
int main() {
    const uint32_t count = 4;

    printf("--- 執行 uf8_encode ---\n");
    for (uint32_t i = 0; i < count; i++) {
        uint32_t raw = InputDataSet[i];
        uint32_t encoded = uf8_encode(raw);
        EncodedResults[i] = encoded;
        printf("InputDataSet[%u] (0x%x) -> EncodedResults[%u] (0x%x)\n", i, raw, i, encoded);
    }

    BF16Result = bf16_add(0x4120, 0x4040);
    printf("\n--- 執行 bf16_add ---\n");
    printf("BF16Result: 0x%x\n", BF16Result);
    return 0;
}
```
## Assembly code
```asm
.data
# Four 32-bit input words; main passes them to uf8_encode one at a time.
.align 2
InputDataSet:
.word 0x0000000a
.word 0x00000040
.word 0x00000400
.word 0x00000003
# 16 bytes reserved: main stores one 32-bit uf8_encode result per input word.
.align 2
EncodedResults:
.space 16
# 4 bytes reserved: main stores the value returned by bf16_add here.
.align 2
BF16Result:
.space 4
.text
.global main
# main: encode every word of InputDataSet with uf8_encode, store the results
# in EncodedResults, then store bf16_add(0x4120, 0x4040) in BF16Result.
#
# Out:   a0 = 0
# Uses:  s0 = loop index, s1 = element count (both saved and restored)
# Stack: 16 bytes, so sp stays 16-byte aligned across the calls
main:
    addi sp, sp, -16
    sw   ra, 12(sp)
    sw   s0, 8(sp)
    sw   s1, 4(sp)
    li   s0, 0                  # s0 = i
    # The bound must live in a callee-saved register: uf8_encode and clz
    # overwrite t0, so a bound kept in t0 is garbage after the first call.
    li   s1, 4                  # s1 = number of input words
main_loop_start:
    bge  s0, s1, main_loop_end
    slli t1, s0, 2              # byte offset = i * 4
    la   t2, InputDataSet
    add  t1, t2, t1
    lw   a0, 0(t1)              # a0 = InputDataSet[i]
    jal  ra, uf8_encode
    slli t1, s0, 2              # recompute: t1/t2 do not survive the call
    la   t2, EncodedResults
    add  t1, t2, t1
    sw   a0, 0(t1)              # EncodedResults[i] = encoded value
    addi s0, s0, 1
    j    main_loop_start
main_loop_end:
    li   a0, 0x4120             # first bf16 operand
    li   a1, 0x4040             # second bf16 operand
    jal  ra, bf16_add
    la   t0, BF16Result
    sw   a0, 0(t0)
    li   a0, 0                  # return 0, like the C reference main
    lw   s1, 4(sp)
    lw   s0, 8(sp)
    lw   ra, 12(sp)
    addi sp, sp, 16
    ret
# clz: count leading zero bits of a 32-bit value by binary search.
# In:    a0 = value
# Out:   a0 = number of leading zeros (32 when the input is 0)
# Uses:  t0 = running count, t1 = current shift (16, 8, 4, 2, 1), t2 = scratch
clz:
    li   t0, 32
    li   t1, 16
clz_probe:
    srl  t2, a0, t1             # t2 = bits above the current shift
    beqz t2, clz_halve          # nothing up there: keep the low part
    sub  t0, t0, t1             # the top t1 bits are not all zero
    mv   a0, t2                 # continue the search in the upper part
clz_halve:
    srli t1, t1, 1
    bnez t1, clz_probe          # shift starts non-zero, so test at the bottom
    sub  a0, t0, a0             # a0 is 0 or 1 here: final adjustment
    ret
# uf8_encode: placeholder encoder.
# In:    a0 = unsigned 32-bit value
# Out:   a0 = the input itself when it is below 16, otherwise the constant 0x54
# Calls: clz (its result is only used to derive the MSB index, which the
#        placeholder does not consume yet)
# Stack: 16 bytes, so sp stays 16-byte aligned across the call
uf8_encode:
    addi sp, sp, -16
    sw   ra, 12(sp)
    li   t0, 16
    # Unsigned compare, as in the C reference (uint32_t): with a signed blt,
    # inputs >= 0x80000000 would look negative and be returned unchanged.
    bltu a0, t0, uf8_epilogue   # small values are their own encoding
    jal  ra, clz
    li   t1, 31
    sub  t0, t1, a0             # t0 = index of the most significant set bit
    li   a0, 0x54               # fixed placeholder result
uf8_epilogue:
    lw   ra, 12(sp)
    addi sp, sp, 16
    ret
# bf16_to_f32: widen a bf16 bit pattern to the float32 bit pattern.
# In:  a0 = bf16 bits in the low half-word
# Out: a0 = input shifted left by 16
bf16_to_f32:
    slli a0, a0, 16
    jr   ra
# bf16_add: placeholder; ignores both operands.
# In:  a0, a1 = bf16 operands (unused)
# Out: a0 = constant 0x4060
bf16_add:
    li   a0, 0x4060
    jr   ra
```
## Analysis
- The translated code
```
啟動 (main):程式從 main 開始,並初始化一個迴圈計數器 s0 (設為 0) 和迴圈次數 t0 (設為 4)。
---
進入迴圈 (共 4 次):
第 1 次 (s0=0):
讀取 InputDataSet[0] (值為 0xa,即 10)。
呼叫 uf8_encode(10)。因為 10 < 16,函式直接回傳 10。
將 10 存入 EncodedResults[0]。
第 2 次 (s0=1):
讀取 InputDataSet[1] (值為 0x40,即 64)。
呼叫 uf8_encode(64)。因為 64 >= 16,函式會...
先呼叫 clz(64) (計算 64 的前導 0 個數)。
但 uf8_encode 忽略 clz 的結果,固定回傳 84 (即 0x54)。
將 84 存入 EncodedResults[1]。
第 3 次 (s0=2):
讀取 InputDataSet[2] (值為 0x400,即 1024)。
呼叫 uf8_encode(1024)。因為 1024 >= 16,函式同樣固定回傳 84。
將 84 存入 EncodedResults[2]。
第 4 次 (s0=3):
讀取 InputDataSet[3] (值為 0x3,即 3)。
呼叫 uf8_encode(3)。因為 3 < 16,函式直接回傳 3。
將 3 存入 EncodedResults[3]。
迴圈結束:計數器 s0 變為 4,s0 >= t0 成立,跳出迴圈。
---
最終計算:
呼叫 bf16_add(16672, 16448) (這是兩個固定的數字)。
bf16_add 函式固定回傳 16480 (即 0x4060)。
將 16480 存入 BF16Result 變數。
結束:程式清理堆疊並返回,執行完畢。
---
總結
程式執行完畢後,記憶體中的變數狀態會是:
EncodedResults 陣列會包含:[10, 84, 84, 3]
BF16Result 變數會是:16480
```
- 5-Stage RISC-V Processor w/o Forwarding or Hazard Detection

- Memory address

- Register result
