# CA Homework 1
## problem B
I translated the C functions clz, uf8_decode, and uf8_encode into RISC-V assembly and built an automated test that exhaustively evaluates all inputs from 0 to 255.
### clz
<details>
<summary><b>clz</b></summary>
```c
static inline unsigned clz(uint32_t x)
{
    int n = 32, c = 16;
    do {
        uint32_t y = x >> c;
        if (y) {
            n -= c;
            x = y;
        }
        c >>= 1;
    } while (c);
    return n - x;
}
```
</details>
### Assembly code
```
.text
.globl main
main:
li a0, 0x0000F000 # test 0x0000F000 (expect 16)
jal ra, clz # call clz
done:
j done
clz:
li t0, 32 # n = 32
li t1, 16 # c = 16
clz_loop:
srl t2, a0, t1 # y = x >> c
beq t2, zero, clz_skip # if (!y) skip
sub t0, t0, t1 # n -= c
mv a0, t2 # x = y
clz_skip:
srli t1, t1, 1 # c >>= 1
bne t1, zero, clz_loop # while (c)
sub a0, t0, a0 # return n - x
ret
```
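As a sanity check, tracing the test input x = 0x0000F000 by hand: c=16 gives y=0 (skip); c=8 gives y=0xF0, so n=24, x=0xF0; c=4 gives y=0xF, so n=20, x=0xF; c=2 gives y=3, so n=18, x=3; c=1 gives y=1, so n=17, x=1. The function returns n - x = 17 - 1 = 16, the correct leading-zero count for a value whose highest set bit is bit 15.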
### uf8_decode
```
.text
.globl main
main:
# test 0x7C
li a0, 0x7C
jal ra, uf8_decode
# expect a0 = 3568 (0xDF0)
done:
j done
uf8_decode:
andi t0, a0, 0x0f # t0 = mantissa
srli t1, a0, 4 # t1 = exponent
li t2, 15
sub t2, t2, t1 # t2 = 15 - exponent
li t3, 0x7FFF
srl t3, t3, t2
slli t3, t3, 4 # t3 = offset
sll t0, t0, t1 # t0 = mantissa << exponent
add a0, t0, t3 # a0 = (mantissa << exponent) + offset
ret
```
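Reading the assembly back into C gives the following sketch of the decoder (the helper name uf8_decode_ref and the uint8_t/uint32_t types are my own choices, not from the original source). The offset term (0x7FFF >> (15 - exponent)) << 4 equals (2^exponent - 1) * 16:

```c
#include <stdint.h>

static uint32_t uf8_decode_ref(uint8_t fl)
{
    uint32_t mantissa = fl & 0x0F;   /* low 4 bits */
    uint32_t exponent = fl >> 4;     /* high 4 bits */
    /* offset = (2^exponent - 1) * 16 */
    uint32_t offset = (0x7FFFu >> (15 - exponent)) << 4;
    return (mantissa << exponent) + offset;
}
```

For the test input 0x7C: mantissa = 12, exponent = 7, offset = 0x7F0 = 2032, so the result is (12 << 7) + 2032 = 3568 = 0xDF0, matching the comment in main.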
### main
```
.text
.globl main
main:
# test function
jal ra, test # a0: 1 = pass, 0 = fail
done:
li a7, 10
ecall
test:
addi sp, sp, -20
sw ra, 16(sp) #return address
sw s0, 12(sp) #s0 previous_value
sw s1, 8(sp) #s1 passed flag
sw s2, 4(sp) #s2 loop counter
sw s3, 0(sp) #s3 result of decode
li s0, -1
li s1, 1
mv s2, zero
test_loop_start:
li t0, 256
bge s2, t0, test_loop_end
mv a0, s2
jal ra, uf8_decode
mv s3, a0
jal ra, uf8_encode
beq s2, a0, check_monotonicity
j test_set_fail
check_monotonicity:
bgt s3, s0, update_and_continue
j test_set_fail
update_and_continue:
mv s0, s3
addi s2, s2, 1
j test_loop_start
test_set_fail:
mv s1, zero
j update_and_continue
test_loop_end:
mv a0, s1
lw s3, 0(sp)
lw s2, 4(sp)
lw s1, 8(sp)
lw s0, 12(sp)
lw ra, 16(sp)
addi sp, sp, 20
ret
uf8_encode:
addi sp, sp, -16
sw ra, 12(sp)
sw s0, 8(sp)
sw s1, 4(sp)
sw s2, 0(sp)
li t0, 16
bge a0, t0, handle_large_value_enc
j epilogue_encode
handle_large_value_enc:
mv s0, a0
jal ra, clz
li t0, 31
sub t1, t0, a0
mv s1, zero
mv s2, zero
li t0, 5
blt t1, t0, find_exact_exponent
addi t0, t1, -4
mv s1, t0
li t0, 15
ble s1, t0, calculate_overflow_loop
mv s1, t0
calculate_overflow_loop:
mv t2, zero
for_loop_start:
bge t2, s1, for_loop_end
slli s2, s2, 1
addi s2, s2, 16
addi t2, t2, 1
j for_loop_start
for_loop_end:
adjust_exponent_loop:
ble s1, zero, find_exact_exponent
bge s0, s2, find_exact_exponent
addi s2, s2, -16
srli s2, s2, 1
addi s1, s1, -1
j adjust_exponent_loop
find_exact_exponent:
exact_exponent_loop:
li t0, 15
bge s1, t0, exact_exponent_end
slli t3, s2, 1
addi t3, t3, 16
blt s0, t3, exact_exponent_end
mv s2, t3
addi s1, s1, 1
j exact_exponent_loop
exact_exponent_end:
sub t0, s0, s2
srl t0, t0, s1
slli a0, s1, 4
or a0, a0, t0
epilogue_encode:
lw s2, 0(sp)
lw s1, 4(sp)
lw s0, 8(sp)
lw ra, 12(sp)
addi sp, sp, 16
ret
uf8_decode:
andi t0, a0, 0x0f # t0 = mantissa = fl & 0x0f
srli t1, a0, 4 # t1 = exponent = fl >> 4
li t2, 15 # t2 = 15
sub t2, t2, t1 # t2 = 15 - exponent
li t3, 0x7FFF # t3 = 0x7FFF
srl t3, t3, t2 # t3 = 0x7FFF >> t2
slli t3, t3, 4 # t3 = offset = (result << 4)
sll t0, t0, t1 # t0 = mantissa << exponent
add a0, t0, t3 # a0 = (mantissa << exponent) + offset
ret
clz:
mv t3, a0 # t3 = x
li t0, 32 # t0 = n = 32
li t1, 16 # t1 = c = 16
clz_loop:
srl t2, t3, t1 # t2 = y = x >> c
beq t2, zero, clz_skip # if (y == 0) then skip
sub t0, t0, t1 # n -= c
mv t3, t2 # x = y
clz_skip:
srli t1, t1, 1 # c >>= 1
bne t1, zero, clz_loop # while (c != 0)
sub a0, t0, t3 # return n - x
ret
```
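uf8_encode has no C listing above, so here is a reconstruction obtained by reading the assembly back into C (a sketch; names like value, overflow, and exponent mirror the register comments, and clz is the function from the top of this write-up):

```c
#include <stdint.h>

unsigned clz(uint32_t x);   /* as defined in problem B above */

static uint8_t uf8_encode_ref(uint32_t value)
{
    if (value < 16)                      /* small values encode as themselves */
        return (uint8_t)value;

    int msb = 31 - clz(value);           /* position of the highest set bit */
    int exponent = 0;
    uint32_t overflow = 0;

    if (msb >= 5) {                      /* initial guess: exponent = msb - 4 */
        exponent = msb - 4;
        if (exponent > 15)
            exponent = 15;
        for (int i = 0; i < exponent; i++)
            overflow = (overflow << 1) + 16;
        while (exponent > 0 && value < overflow) {   /* guessed too high */
            overflow = (overflow - 16) >> 1;
            exponent--;
        }
    }
    while (exponent < 15) {              /* refine upward to the exact exponent */
        uint32_t next = (overflow << 1) + 16;
        if (value < next)
            break;
        overflow = next;
        exponent++;
    }
    uint32_t mantissa = (value - overflow) >> exponent;
    return (uint8_t)((exponent << 4) | mantissa);
}
```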
### Analysis
Pasting the code above into the editor shows that Ripes does not execute it literally. Instead, it expands each pseudo-instruction into an equivalent base instruction and renames registers from their ABI names to the sequential x-register names.
The translated code looks like:
```
00000000 <main>:
0: 008000ef jal x1 8 <test>
00000004 <done>:
4: 0000006f jal x0 0 <done>
00000008 <test>:
8: fec10113 addi x2 x2 -20
c: 00112823 sw x1 16 x2
10: 00812623 sw x8 12 x2
14: 00912423 sw x9 8 x2
18: 01212223 sw x18 4 x2
1c: 01312023 sw x19 0 x2
20: fff00413 addi x8 x0 -1
24: 00100493 addi x9 x0 1
28: 00000913 addi x18 x0 0
0000002c <test_loop_start>:
2c: 10000293 addi x5 x0 256
30: 02595c63 bge x18 x5 56 <test_loop_end>
34: 00090513 addi x10 x18 0
38: 11c000ef jal x1 284 <uf8_decode>
3c: 00050993 addi x19 x10 0
40: 048000ef jal x1 72 <uf8_encode>
44: 00a90463 beq x18 x10 8 <check_monotonicity>
48: 0180006f jal x0 24 <test_set_fail>
0000004c <check_monotonicity>:
4c: 01344463 blt x8 x19 8 <update_and_continue>
50: 0100006f jal x0 16 <test_set_fail>
00000054 <update_and_continue>:
54: 00098413 addi x8 x19 0
58: 00190913 addi x18 x18 1
5c: fd1ff06f jal x0 -48 <test_loop_start>
00000060 <test_set_fail>:
60: 00000493 addi x9 x0 0
64: ff1ff06f jal x0 -16 <update_and_continue>
00000068 <test_loop_end>:
68: 00048513 addi x10 x9 0
6c: 00012983 lw x19 0 x2
70: 00412903 lw x18 4 x2
74: 00812483 lw x9 8 x2
78: 00c12403 lw x8 12 x2
7c: 01012083 lw x1 16 x2
80: 01410113 addi x2 x2 20
84: 00008067 jalr x0 x1 0
00000088 <uf8_encode>:
88: ff010113 addi x2 x2 -16
8c: 00112623 sw x1 12 x2
90: 00812423 sw x8 8 x2
94: 00912223 sw x9 4 x2
98: 01212023 sw x18 0 x2
9c: 01000293 addi x5 x0 16
a0: 00555463 bge x10 x5 8 <handle_large_value_enc>
a4: 0980006f jal x0 152 <epilogue_encode>
000000a8 <handle_large_value_enc>:
a8: 00050413 addi x8 x10 0
ac: 0d4000ef jal x1 212 <clz>
b0: 01f00293 addi x5 x0 31
b4: 40a28333 sub x6 x5 x10
b8: 00000493 addi x9 x0 0
bc: 00000913 addi x18 x0 0
c0: 00500293 addi x5 x0 5
c4: 04534463 blt x6 x5 72 <find_exact_exponent>
c8: ffc30293 addi x5 x6 -4
cc: 00028493 addi x9 x5 0
d0: 00f00293 addi x5 x0 15
d4: 0092d463 bge x5 x9 8 <calculate_overflow_loop>
d8: 00028493 addi x9 x5 0
000000dc <calculate_overflow_loop>:
dc: 00000393 addi x7 x0 0
000000e0 <for_loop_start>:
e0: 0093da63 bge x7 x9 20 <for_loop_end>
e4: 00191913 slli x18 x18 1
e8: 01090913 addi x18 x18 16
ec: 00138393 addi x7 x7 1
f0: ff1ff06f jal x0 -16 <for_loop_start>
000000f4 <for_loop_end>:
f4: 00905c63 bge x0 x9 24 <find_exact_exponent>
f8: 01245a63 bge x8 x18 20 <find_exact_exponent>
fc: ff090913 addi x18 x18 -16
100: 00195913 srli x18 x18 1
104: fff48493 addi x9 x9 -1
108: fedff06f jal x0 -20 <for_loop_end>
0000010c <find_exact_exponent>:
10c: 00f00293 addi x5 x0 15
110: 0054de63 bge x9 x5 28 <exact_exponent_end>
114: 00191e13 slli x28 x18 1
118: 010e0e13 addi x28 x28 16
11c: 01c44863 blt x8 x28 16 <exact_exponent_end>
120: 000e0913 addi x18 x28 0
124: 00148493 addi x9 x9 1
128: fe5ff06f jal x0 -28 <find_exact_exponent>
0000012c <exact_exponent_end>:
12c: 412402b3 sub x5 x8 x18
130: 0092d2b3 srl x5 x5 x9
134: 00449513 slli x10 x9 4
138: 00556533 or x10 x10 x5
0000013c <epilogue_encode>:
13c: 00012903 lw x18 0 x2
140: 00412483 lw x9 4 x2
144: 00812403 lw x8 8 x2
148: 00c12083 lw x1 12 x2
14c: 01010113 addi x2 x2 16
150: 00008067 jalr x0 x1 0
00000154 <uf8_decode>:
154: 00f57293 andi x5 x10 15
158: 00455313 srli x6 x10 4
15c: 00f00393 addi x7 x0 15
160: 406383b3 sub x7 x7 x6
164: 00008e37 lui x28 0x8
168: fffe0e13 addi x28 x28 -1
16c: 007e5e33 srl x28 x28 x7
170: 004e1e13 slli x28 x28 4
174: 006292b3 sll x5 x5 x6
178: 01c28533 add x10 x5 x28
17c: 00008067 jalr x0 x1 0
00000180 <clz>:
180: 00050e13 addi x28 x10 0
184: 02000293 addi x5 x0 32
188: 01000313 addi x6 x0 16
0000018c <clz_loop>:
18c: 006e53b3 srl x7 x28 x6
190: 00038663 beq x7 x0 12 <clz_skip>
194: 406282b3 sub x5 x5 x6
198: 00038e13 addi x28 x7 0
0000019c <clz_skip>:
19c: 00135313 srli x6 x6 1
1a0: fe0316e3 bne x6 x0 -20 <clz_loop>
1a4: 41c28533 sub x10 x5 x28
1a8: 00008067 jalr x0 x1 0
```
### 5-stage pipelined processor

### Instruction Fetch (IF)
Fetch the next instruction at PC (e.g., jal ra, test, or the loop heads test_loop_start and clz_loop).
The code has many branches and jumps: bge s2, t0, test_loop_end, beq t2, zero, clz_skip, jal x0, … (unconditional jump), and jalr x0, x1, 0 (ret).
IF speculatively fetches the fall-through path; once EX resolves a branch or jump, any misfetched instructions are flushed (control hazard → bubbles).
### Instruction Decode & Register Fetch (ID)
Examples:
- Loop control reads: bge s2, t0, … reads s2 and t0.
- Argument setup before calls: mv a0, s2 (expanded to addi a0, s2, 0) for uf8_decode/uf8_encode.
- Stack-frame prologues: sw ra, 16(sp) and sw s0, 12(sp) preserve live registers.

Data hazards: back-to-back use of values returned from functions (e.g., the clz result consumed immediately inside uf8_encode) depends on forwarding; without it, the core inserts stalls.
### Execute (EX)
Representative operations:
- clz (count leading zeros): binary search via shifts, srl t2, t3, t1 with c stepping 16→8→4→2→1; branch on zero with beq t2, zero, clz_skip; maintain n -= c; return via sub a0, t0, t3. Branches resolve in EX, so this tight loop produces frequent control hazards.
- uf8_decode: field split with andi t0, a0, 0x0f (mantissa) and srli t1, a0, 4 (exponent); build 0x7FFF via lui+addi, then srl/slli to form the offset; compute (mantissa << exp) + offset.
- uf8_encode: estimate msb = 31 - clz(value); guess exponent ≈ msb - 4 (clamped to 15); build overflow by the loop overflow = (overflow << 1) + 16; adjust down while value < overflow using (overflow - 16) >> 1, exponent--; refine up while value ≥ (overflow << 1) + 16, updating overflow and exponent++; finally compute mantissa = (value - overflow) >> exponent and pack (exponent << 4) | mantissa.

Branch resolution: blt/bge/beq decide in EX; IF may need to discard prefetched instructions.
### Memory Access (MEM)
Role: data memory loads and stores. In this code, MEM is mostly stack traffic at function boundaries.
Typical instructions:
sw ra/s0/… on entry; lw on exit to restore. The algorithm itself keeps most working data in registers, so MEM is dominated by prologue/epilogue traffic.
Note: a lw followed immediately by a consumer incurs a load-use stall; even with forwarding, the loaded value is only available at the end of MEM, so one bubble is unavoidable.
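For example, if lw s3, 0(sp) in the epilogue were immediately followed by an instruction that reads s3, the classic 5-stage design would still need one bubble: the loaded value becomes available one stage after the consumer's EX needs it.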
### Register Write Back (WB)
Function returns: clz, uf8_decode, uf8_encode write results to a0; callers often consume them in the next instruction (RAW hazard → rely on forwarding).
Loop variables: addi s2, s2, 1 (i++) is written back and then read right away by the loop test bge s2, t0 (with t0 = 256).

This routine (test) validates the round-trip correctness and monotonicity of a custom 8-bit "uf8" number format by exhaustively iterating all 256 encodings. For each fl ∈ [0, 255], it:
- decodes fl → value = uf8_decode(fl),
- re-encodes value → fl2 = uf8_encode(value),
- checks that fl2 == fl (round-trip), and
- enforces that value is strictly increasing over fl (monotonicity).

If any check fails, the boolean flag passed is cleared (the loop still runs to completion). The function returns 1 (pass) or 0 (fail) in a0.
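In C, the routine corresponds roughly to the following sketch (previous mirrors s0, passed mirrors s1, and the prototypes are assumed signatures for the routines implemented above):

```c
unsigned uf8_decode(unsigned char fl);
unsigned char uf8_encode(unsigned value);

static int test(void)
{
    int previous = -1;   /* s0: last decoded value */
    int passed = 1;      /* s1: cleared on any failure */

    for (int fl = 0; fl < 256; fl++) {   /* s2: loop counter */
        int value = uf8_decode(fl);      /* s3: decode result */
        if (uf8_encode(value) != fl)     /* round-trip check */
            passed = 0;
        else if (value <= previous)      /* monotonicity check */
            passed = 0;
        previous = value;                /* continue even after a failure */
    }
    return passed;
}
```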

## problem C
### static inline bool bf16_isnan(bf16_t a)
```
.text
.globl main
main:
li a0, 0x7FC0 # quiet NaN
jal ra, bf16_isnan
mv s0, a0 # expect 1
li a0, 0x7F80 # +Inf
jal ra, bf16_isnan
mv s1, a0 # expect 0
done:
j done
bf16_isnan:
li t0, 0x7F80
and t1, a0, t0
bne t1, t0, is_not_nan
li t0, 0x007F
and t1, a0, t0
sltu a0, zero, t1
ret
is_not_nan:
li a0, 0
ret
```
### static inline bool bf16_isinf(bf16_t a)
```
.text
.globl main
main:
li a0, 0x7F80 # +Inf
jal ra, bf16_isinf
mv s0, a0 # expect 1
li a0, 0x7FC0 # quiet NaN
jal ra, bf16_isinf
mv s1, a0 # expect 0
done:
j done
bf16_isinf:
li t0, 0x7F80
and t1, a0, t0
bne t1, t0, is_not_inf
li t0, 0x007F
and t1, a0, t0
seqz a0, t1
ret
is_not_inf:
li a0, 0
ret
```
### static inline bool bf16_iszero(bf16_t a)
```
.text
.globl main
main:
li a0, 0x8000 # -0.0
jal ra, bf16_iszero
mv s0, a0 # expect 1
li a0, 0x3F80 # 1.0
jal ra, bf16_iszero
mv s1, a0 # expect 0
done:
j done
bf16_iszero:
li t0, 0x7FFF
and t1, a0, t0
seqz a0, t1
ret
```
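All three predicates share the same bit tests, which read back into C as follows (a sketch, assuming bf16_t is a raw uint16_t bit pattern):

```c
#include <stdbool.h>
#include <stdint.h>

typedef uint16_t bf16_t;   /* raw bf16 bit pattern */

static inline bool bf16_isnan(bf16_t a)   /* exponent all ones, mantissa != 0 */
{
    return ((a & 0x7F80) == 0x7F80) && ((a & 0x007F) != 0);
}

static inline bool bf16_isinf(bf16_t a)   /* exponent all ones, mantissa == 0 */
{
    return ((a & 0x7F80) == 0x7F80) && ((a & 0x007F) == 0);
}

static inline bool bf16_iszero(bf16_t a)  /* everything but the sign is 0 */
{
    return (a & 0x7FFF) == 0;
}
```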
### static inline bf16_t f32_to_bf16(float val)
```
.text
.globl main
main:
li a0, 0x7F800000 # +Inf (f32 bits)
jal ra, f32_to_bf16
mv s0, a0 # expect 0x7F80
li a0, 0x40000000 # 2.0f
jal ra, f32_to_bf16
mv s1, a0 # expect 0x4000
li a0, 0x40490FDB # pi as f32
jal ra, f32_to_bf16
mv s2, a0 # expect 0x4049
done:
j done
f32_to_bf16:
srli t0, a0, 23
andi t0, t0, 0xFF
li t1, 0xFF
bne t0, t1, handle_normal
handle_special:
srli a0, a0, 16
ret
handle_normal:
srli t0, a0, 16
andi t0, t0, 1
li t1, 0x7FFF
add t0, t0, t1
add a0, a0, t0
srli a0, a0, 16
ret
```
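The conversion implements round-to-nearest-even: it adds 0x7FFF plus the bit that will become the new LSB, then truncates. A C sketch of what the assembly computes (memcpy stands in for the raw bits already sitting in a0; the helper name is mine):

```c
#include <stdint.h>
#include <string.h>

static inline uint16_t f32_to_bf16_ref(float val)
{
    uint32_t u;
    memcpy(&u, &val, sizeof u);          /* reinterpret the float's bits */

    if (((u >> 23) & 0xFF) == 0xFF)      /* Inf/NaN: truncate, no rounding */
        return (uint16_t)(u >> 16);

    /* round to nearest, ties to even: add 0x7FFF plus the surviving LSB */
    u += ((u >> 16) & 1) + 0x7FFFu;
    return (uint16_t)(u >> 16);
}
```

For 0x40490FDB (pi), the low half 0x0FDB plus the 0x8000 rounding term does not carry into the upper half, so the result truncates to 0x4049.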
### static inline float bf16_to_f32(bf16_t val)
```
.text
.globl main
main:
li a0, 0x4000 # 2.0 (bf16)
jal ra, bf16_to_f32
mv s0, a0 # expect 0x40000000
li a0, 0x7F80 # +Inf
jal ra, bf16_to_f32
mv s1, a0 # expect 0x7F800000
mv s1, a0
done:
j done
bf16_to_f32:
slli a0, a0, 16
ret
```
### static inline bf16_t bf16_add(bf16_t a, bf16_t b)
```
.text
.globl main
main:
li a0, 0x3F80 # a = 1.0
li a1, 0x4000 # b = 2.0
jal ra, bf16_add
mv s0, a0 # expect 0x4040 (3.0)
li a0, 0x4040 # a = 3.0
li a1, 0xBF80 # b = -1.0
jal ra, bf16_add
mv s1, a0 # expect 0x4000 (2.0)
li a0, 0x3FC0 # a = 1.5
li a1, 0x3FC0 # b = 1.5
jal ra, bf16_add
mv s2, a0 # expect 0x4040 (3.0)
done:
j done
bf16_add:
addi sp, sp, -48
sw ra, 44(sp)
sw s0, 40(sp) # s0: a
sw s1, 36(sp) # s1: b
sw s2, 32(sp) # s2: sign_a
sw s3, 28(sp) # s3: sign_b
sw s4, 24(sp) # s4: exp_a
sw s5, 20(sp) # s5: exp_b
sw s6, 16(sp) # s6: mant_a
sw s7, 12(sp) # s7: mant_b
sw s8, 8(sp) # s8: result_sign
sw s9, 4(sp) # s9: result_exp
sw s10, 0(sp) # s10: result_mant
mv s0, a0
mv s1, a1
srli s2, s0, 15 # s2 = sign_a
andi s2, s2, 1
srli s3, s1, 15 # s3 = sign_b
andi s3, s3, 1
srli s4, s0, 7 # s4 = exp_a
andi s4, s4, 0xFF
srli s5, s1, 7 # s5 = exp_b
andi s5, s5, 0xFF
andi s6, s0, 0x7F # s6 = mant_a
andi s7, s1, 0x7F # s7 = mant_b
li t0, 0xFF
bne s4, t0, check_b_special # exp_a != 0xFF -> a is not special
bne s6, zero, return_a # a is NaN
bne s5, t0, return_a # a is Inf, b is finite
bne s7, zero, return_b # b is NaN
beq s2, s3, return_b # Inf + Inf, same sign
li a0, 0x7FC0 # Inf + (-Inf) = NaN
j epilogue
check_b_special:
bne s5, t0, check_zeros # exp_b != 0xFF -> b is not special
bne s7, zero, return_b # b is NaN
j return_b # b is Inf
check_zeros:
or t0, s4, s6
beq t0, zero, return_b
or t0, s5, s7
beq t0, zero, return_a
li t0, 0x80
bne s4, zero, add_implicit_a
j check_implicit_b
add_implicit_a:
or s6, s6, t0
check_implicit_b:
bne s5, zero, add_implicit_b
j align_exponents
add_implicit_b:
or s7, s7, t0
align_exponents:
sub t0, s4, s5
bgtz t0, a_exp_larger
bltz t0, b_exp_larger
mv s9, s4
j add_sub_mantissas
a_exp_larger:
mv s9, s4
li t1, 8
bgt t0, t1, return_a
srl s7, s7, t0
j add_sub_mantissas
b_exp_larger:
mv s9, s5
neg t0, t0
li t1, 8
bgt t0, t1, return_b
srl s6, s6, t0
add_sub_mantissas:
bne s2, s3, different_signs
same_signs:
mv s8, s2
add s10, s6, s7
andi t0, s10, 0x100
beq t0, zero, pack_result
srli s10, s10, 1
addi s9, s9, 1
li t0, 0xFF
bge s9, t0, return_infinity
j pack_result
different_signs:
bge s6, s7, a_mant_larger
mv s8, s3
sub s10, s7, s6
j normalize_sub_result
a_mant_larger:
mv s8, s2
sub s10, s6, s7
normalize_sub_result:
beq s10, zero, return_pos_zero
normalize_loop:
andi t0, s10, 0x80
bne t0, zero, pack_result
slli s10, s10, 1
addi s9, s9, -1
blez s9, return_pos_zero
j normalize_loop
pack_result:
slli t0, s8, 15
andi s9, s9, 0xFF
slli t1, s9, 7
andi t2, s10, 0x7F
or a0, t0, t1
or a0, a0, t2
j epilogue
return_a:
mv a0, s0
j epilogue
return_b:
mv a0, s1
j epilogue
return_pos_zero:
li a0, 0
j epilogue
return_infinity:
slli t0, s8, 15
li t1, 0x7F80
or a0, t0, t1
j epilogue
epilogue:
lw s10, 0(sp)
lw s9, 4(sp)
lw s8, 8(sp)
lw s7, 12(sp)
lw s6, 16(sp)
lw s5, 20(sp)
lw s4, 24(sp)
lw s3, 28(sp)
lw s2, 32(sp)
lw s1, 36(sp)
lw s0, 40(sp)
lw ra, 44(sp)
addi sp, sp, 48
ret
```
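Ripes expands this in the same way as in problem B; the translated bf16_add program looks like: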
```
00000000 <main>:
0: 00004537 lui x10 0x4
4: f8050513 addi x10 x10 -128
8: 000045b7 lui x11 0x4
c: 03c000ef jal x1 60 <bf16_add>
10: 00050413 addi x8 x10 0
14: 00004537 lui x10 0x4
18: 04050513 addi x10 x10 64
1c: 0000c5b7 lui x11 0xc
20: f8058593 addi x11 x11 -128
24: 024000ef jal x1 36 <bf16_add>
28: 00050493 addi x9 x10 0
2c: 00004537 lui x10 0x4
30: fc050513 addi x10 x10 -64
34: 000045b7 lui x11 0x4
38: fc058593 addi x11 x11 -64
3c: 00c000ef jal x1 12 <bf16_add>
40: 00050913 addi x18 x10 0
00000044 <done>:
44: 0000006f jal x0 0 <done>
00000048 <bf16_add>:
48: fd010113 addi x2 x2 -48
4c: 02112623 sw x1 44 x2
50: 02812423 sw x8 40 x2
54: 02912223 sw x9 36 x2
58: 03212023 sw x18 32 x2
5c: 01312e23 sw x19 28 x2
60: 01412c23 sw x20 24 x2
64: 01512a23 sw x21 20 x2
68: 01612823 sw x22 16 x2
6c: 01712623 sw x23 12 x2
70: 01812423 sw x24 8 x2
74: 01912223 sw x25 4 x2
78: 01a12023 sw x26 0 x2
7c: 00050413 addi x8 x10 0
80: 00058493 addi x9 x11 0
84: 00f45913 srli x18 x8 15
88: 00197913 andi x18 x18 1
8c: 00f4d993 srli x19 x9 15
90: 0019f993 andi x19 x19 1
94: 00745a13 srli x20 x8 7
98: 0ffa7a13 andi x20 x20 255
9c: 0074da93 srli x21 x9 7
a0: 0ffafa93 andi x21 x21 255
a4: 07f47b13 andi x22 x8 127
a8: 07f4fb93 andi x23 x9 127
ac: 0ff00293 addi x5 x0 255
b0: 025a1063 bne x20 x5 32 <check_b_special>
b4: 100b1463 bne x22 x0 264 <return_a>
b8: 105a9263 bne x21 x5 260 <return_a>
bc: 100b9463 bne x23 x0 264 <return_b>
c0: 11390263 beq x18 x19 260 <return_b>
c4: 00008537 lui x10 0x8
c8: fc050513 addi x10 x10 -64
cc: 11c0006f jal x0 284 <epilogue>
000000d0 <check_b_special>:
d0: 005a9663 bne x21 x5 12 <check_zeros>
d4: 0e0b9863 bne x23 x0 240 <return_b>
d8: 0ec0006f jal x0 236 <return_b>
000000dc <check_zeros>:
dc: 016a62b3 or x5 x20 x22
e0: 0e028263 beq x5 x0 228 <return_b>
e4: 017ae2b3 or x5 x21 x23
e8: 0c028a63 beq x5 x0 212 <return_a>
ec: 08000293 addi x5 x0 128
f0: 000a1463 bne x20 x0 8 <add_implicit_a>
f4: 0080006f jal x0 8 <check_implicit_b>
000000f8 <add_implicit_a>:
f8: 005b6b33 or x22 x22 x5
000000fc <check_implicit_b>:
fc: 000a9463 bne x21 x0 8 <add_implicit_b>
100: 0080006f jal x0 8 <align_exponents>
00000104 <add_implicit_b>:
104: 005bebb3 or x23 x23 x5
00000108 <align_exponents>:
108: 415a02b3 sub x5 x20 x21
10c: 00504863 blt x0 x5 16 <a_exp_larger>
110: 0202c063 blt x5 x0 32 <b_exp_larger>
114: 000a0c93 addi x25 x20 0
118: 02c0006f jal x0 44 <add_sub_mantissas>
0000011c <a_exp_larger>:
11c: 000a0c93 addi x25 x20 0
120: 00800313 addi x6 x0 8
124: 08534c63 blt x6 x5 152 <return_a>
128: 005bdbb3 srl x23 x23 x5
12c: 0180006f jal x0 24 <add_sub_mantissas>
00000130 <b_exp_larger>:
130: 000a8c93 addi x25 x21 0
134: 405002b3 sub x5 x0 x5
138: 00800313 addi x6 x0 8
13c: 08534463 blt x6 x5 136 <return_b>
140: 005b5b33 srl x22 x22 x5
00000144 <add_sub_mantissas>:
144: 03391463 bne x18 x19 40 <different_signs>
00000148 <same_signs>:
148: 00090c13 addi x24 x18 0
14c: 017b0d33 add x26 x22 x23
150: 100d7293 andi x5 x26 256
154: 04028663 beq x5 x0 76 <pack_result>
158: 001d5d13 srli x26 x26 1
15c: 001c8c93 addi x25 x25 1
160: 0ff00293 addi x5 x0 255
164: 065cd863 bge x25 x5 112 <return_infinity>
168: 0380006f jal x0 56 <pack_result>
0000016c <different_signs>:
16c: 017b5863 bge x22 x23 16 <a_mant_larger>
170: 00098c13 addi x24 x19 0
174: 416b8d33 sub x26 x23 x22
178: 00c0006f jal x0 12 <normalize_sub_result>
0000017c <a_mant_larger>:
17c: 00090c13 addi x24 x18 0
180: 417b0d33 sub x26 x22 x23
00000184 <normalize_sub_result>:
184: 040d0463 beq x26 x0 72 <return_pos_zero>
00000188 <normalize_loop>:
188: 080d7293 andi x5 x26 128
18c: 00029a63 bne x5 x0 20 <pack_result>
190: 001d1d13 slli x26 x26 1
194: fffc8c93 addi x25 x25 -1
198: 03905a63 bge x0 x25 52 <return_pos_zero>
19c: fedff06f jal x0 -20 <normalize_loop>
000001a0 <pack_result>:
1a0: 00fc1293 slli x5 x24 15
1a4: 0ffcfc93 andi x25 x25 255
1a8: 007c9313 slli x6 x25 7
1ac: 07fd7393 andi x7 x26 127
1b0: 0062e533 or x10 x5 x6
1b4: 00756533 or x10 x10 x7
1b8: 0300006f jal x0 48 <epilogue>
000001bc <return_a>:
1bc: 00040513 addi x10 x8 0
1c0: 0280006f jal x0 40 <epilogue>
000001c4 <return_b>:
1c4: 00048513 addi x10 x9 0
1c8: 0200006f jal x0 32 <epilogue>
000001cc <return_pos_zero>:
1cc: 00000513 addi x10 x0 0
1d0: 0180006f jal x0 24 <epilogue>
000001d4 <return_infinity>:
1d4: 00fc1293 slli x5 x24 15
1d8: 00008337 lui x6 0x8
1dc: f8030313 addi x6 x6 -128
1e0: 0062e533 or x10 x5 x6
1e4: 0040006f jal x0 4 <epilogue>
000001e8 <epilogue>:
1e8: 00012d03 lw x26 0 x2
1ec: 00412c83 lw x25 4 x2
1f0: 00812c03 lw x24 8 x2
1f4: 00c12b83 lw x23 12 x2
1f8: 01012b03 lw x22 16 x2
1fc: 01412a83 lw x21 20 x2
200: 01812a03 lw x20 24 x2
204: 01c12983 lw x19 28 x2
208: 02012903 lw x18 32 x2
20c: 02412483 lw x9 36 x2
210: 02812403 lw x8 40 x2
214: 02c12083 lw x1 44 x2
218: 03010113 addi x2 x2 48
21c: 00008067 jalr x0 x1 0
```
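Stripping away the special cases, the add/subtract data path reads back into C as the following sketch (names mirror the register comments in the prologue; the NaN/Inf/zero branches at the top of bf16_add are omitted for brevity):

```c
#include <stdint.h>

typedef uint16_t bf16_t;

static bf16_t bf16_add_core(bf16_t a, bf16_t b)
{
    unsigned sign_a = (a >> 15) & 1, sign_b = (b >> 15) & 1;  /* s2, s3 */
    int exp_a = (a >> 7) & 0xFF, exp_b = (b >> 7) & 0xFF;     /* s4, s5 */
    unsigned mant_a = a & 0x7F, mant_b = b & 0x7F;            /* s6, s7 */

    if (exp_a) mant_a |= 0x80;     /* implicit leading 1 for normal numbers */
    if (exp_b) mant_b |= 0x80;

    int exp = exp_a, diff = exp_a - exp_b;
    if (diff > 0) {                /* align b's mantissa to a's exponent */
        if (diff > 8) return a;    /* b is too small to matter */
        mant_b >>= diff;
    } else if (diff < 0) {
        exp = exp_b;
        if (-diff > 8) return b;
        mant_a >>= -diff;
    }

    unsigned sign, mant;
    if (sign_a == sign_b) {        /* same signs: add mantissas */
        sign = sign_a;
        mant = mant_a + mant_b;
        if (mant & 0x100) {        /* carry out of bit 7: renormalize */
            mant >>= 1;
            if (++exp >= 0xFF)     /* exponent overflow -> signed Inf */
                return (bf16_t)((sign << 15) | 0x7F80);
        }
    } else {                       /* different signs: subtract the smaller */
        if (mant_a >= mant_b) { sign = sign_a; mant = mant_a - mant_b; }
        else                  { sign = sign_b; mant = mant_b - mant_a; }
        if (mant == 0) return 0;   /* exact cancellation -> +0 */
        while (!(mant & 0x80)) {   /* shift the leading 1 back to bit 7 */
            mant <<= 1;
            if (--exp <= 0) return 0;  /* underflow -> +0, as in the asm */
        }
    }
    return (bf16_t)((sign << 15) | ((exp & 0xFF) << 7) | (mant & 0x7F));
}
```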

### static inline bf16_t bf16_sub(bf16_t a, bf16_t b)
```
.text
.globl main
main:
li a0, 0x4040 # a = 3.0
li a1, 0x3F80 # b = 1.0
jal ra, bf16_sub
mv s0, a0 # 0x4000
li a0, 0x3F80 # a = 1.0
li a1, 0x4040 # b = 3.0
jal ra, bf16_sub
mv s1, a0 # 0xC000
li a0, 0x40A0 # a = 5.0
li a1, 0x40A0 # b = 5.0
jal ra, bf16_sub
mv s2, a0 # 0x0000
done:
j done
bf16_sub:
li t0, 0x8000
xor a1, a1, t0
j bf16_add
```
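bf16_sub needs no arithmetic of its own: it flips b's sign bit with the xor and falls into bf16_add, so a - b is computed as a + (-b). Using j (rather than jal) makes this a tail call, so bf16_add's ret returns directly to bf16_sub's caller.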
### static inline bf16_t bf16_mul(bf16_t a, bf16_t b)
```
.text
.globl main
.globl bf16_mul
main:
li a0, 0x4000 # a = 2.0 (bf16)
li a1, 0x4040 # b = 3.0 (bf16)
jal ra, bf16_mul # a0 <- result
mv a2, a0 # 0x40C0
#li a0, 0xC000 # a = -2.0
#li a1, 0x4040 # b = 3.0
#jal ra, bf16_mul
#mv a3, a0 # 0xC0C0
#li a0, 0x7F80 # a = +Inf
#li a1, 0x40A0 # b = 5.0
#jal ra, bf16_mul
#mv a4, a0 # 0x7F80
done:
j done
bf16_mul:
srli t0, a0, 15 # sign_a
andi t0, t0, 1
srli t1, a1, 15 # sign_b
andi t1, t1, 1
xor t6, t0, t1 # result_sign -> t6
srli a2, a0, 7 # exp_a
andi a2, a2, 0xFF
srli a3, a1, 7 # exp_b
andi a3, a3, 0xFF
andi a4, a0, 0x7F # mant_a
andi a5, a1, 0x7F # mant_b
li t2, 0xFF
bne a2, t2, 1f
bnez a4, .ret_a
or t3, a3, a5
beqz t3, .ret_qnan
j .ret_inf
1: bne a3, t2, 2f
bnez a5, .ret_b
or t3, a2, a4
beqz t3, .ret_qnan
j .ret_inf
2:
or t3, a2, a4
beqz t3, .ret_zero
or t3, a3, a5
beqz t3, .ret_zero
li a6, 0
bnez a2, 3f
0: andi t3, a4, 0x80
bnez t3, 4f
slli a4, a4, 1
addi a6, a6, -1
j 0b
4: li a2, 1
3: ori a4, a4, 0x80
bnez a3, 5f
6: andi t3, a5, 0x80
bnez t3, 7f
slli a5, a5, 1
addi a6, a6, -1
j 6b
7: li a3, 1
5: ori a5, a5, 0x80
add a7, a2, a3
addi t3, x0, 127
sub a7, a7, t3
add a7, a7, a6
mv t0, a4
mv t1, a5
li t4, 0
li t5, 8
mul_loop:
andi t2, t1, 1
beqz t2, 8f
add t4, t4, t0
8: slli t0, t0, 1
srli t1, t1, 1
addi t5, t5, -1
bnez t5, mul_loop
li t0, 0x8000
and t1, t4, t0
beqz t1, 9f
srli t4, t4, 8
addi a7, a7, 1
j 10f
9: srli t4, t4, 7
10: andi t4, t4, 0x7F
li t0, 0xFF
bge a7, t0, .ret_inf
blez a7, 11f
j pack
11: li t0, -6
blt a7, t0, .ret_zero
li t1, 1
sub t1, t1, a7
srl t4, t4, t1
li a7, 0
pack:
slli t0, t6, 15
slli t1, a7, 7
or a0, t0, t1
or a0, a0, t4
ret
.ret_a:
ret
.ret_b:
mv a0, a1
ret
.ret_inf:
li a0, 0x7F80
slli t0, t6, 15
or a0, a0, t0
ret
.ret_qnan:
li a0, 0x7FC0
ret
.ret_zero:
slli a0, t6, 15
ret
```
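The mul_loop works around the absence of the M extension's mul instruction on the base RV32I target with an 8-step shift-and-add multiply; 8 steps suffice because mant_b, including the implicit bit, fits in 8 bits. A C sketch (helper name mine):

```c
#include <stdint.h>

/* Shift-and-add multiply of two 8-bit mantissas (implicit bit set),
   producing a product of up to 16 bits. */
static uint32_t mul_mant8(uint32_t a, uint32_t b)
{
    uint32_t product = 0;            /* t4 */
    for (int i = 0; i < 8; i++) {    /* t5 counts down from 8 */
        if (b & 1)                   /* low bit of the multiplier set? */
            product += a;            /* add the shifted multiplicand */
        a <<= 1;
        b >>= 1;
    }
    return product;
}
```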
### static inline bf16_t bf16_div(bf16_t a, bf16_t b)
```
.text
.globl bf16_div
.globl main
main:
li a0, 0x40C0 # a = 6.0 (bf16)
li a1, 0x4040 # b = 3.0 (bf16)
jal ra, bf16_div # a0 <- result
mv a2, a0 # 0x4000
done:
j done
bf16_div:
srli t0, a0, 15 # sign_a
andi t0, t0, 1
srli t1, a1, 15 # sign_b
andi t1, t1, 1
xor t6, t0, t1 # result_sign -> t6
srli a2, a0, 7 # exp_a
andi a2, a2, 0xFF
srli a3, a1, 7 # exp_b
andi a3, a3, 0xFF
andi a4, a0, 0x7F # mant_a
andi a5, a1, 0x7F # mant_b
li t2, 0xFF
bne a3, t2, 1f
bnez a5, .ret_b
# b = Inf
beq a2, t2, .ret_bf16_nan
j .ret_signed_zero
1:
or t0, a3, a5
bnez t0, 2f
or t1, a2, a4
beqz t1, .ret_bf16_nan
j .ret_signed_inf
2:
bne a2, t2, 3f
bnez a4, .ret_a
j .ret_signed_inf
3:
or t0, a2, a4
beqz t0, .ret_signed_zero
bnez a2, 4f
j 5f
4: ori a4, a4, 0x80
5:
bnez a3, 6f
j 7f
6: ori a5, a5, 0x80
7:
slli t4, a4, 15 # dividend
add t5, x0, a5 # divisor
li t2, 0 # quotient
li t3, 0 # i
add a7, a2, x0
sub a7, a7, a3
addi a7, a7, 127
bnez a2, 8f
addi a7, a7, -1 # !exp_a → --
8: bnez a3, 9f
addi a7, a7, 1 # !exp_b → ++
9:
div_loop:
slli t2, t2, 1 # quotient <<= 1
li t0, 15
sub t0, t0, t3 # k = 15 - i
sll t1, t5, t0 # tmp = divisor << k
sltu t0, t4, t1 # dividend < tmp ?
bne t0, x0, 10f
sub t4, t4, t1 # dividend -= tmp
ori t2, t2, 1 # quotient |= 1
10:
addi t3, t3, 1
li t0, 16
blt t3, t0, div_loop
li t0, 0x8000
and t1, t2, t0
bnez t1, 11f
norm_loop:
and t1, t2, t0
bnez t1, 12f
li t1, 1
ble a7, t1, 12f
slli t2, t2, 1
addi a7, a7, -1
j norm_loop
11: srli t2, t2, 8
j 13f
12: srli t2, t2, 8
13: andi t2, t2, 0x7F
li t0, 0xFF
bge a7, t0, .ret_signed_inf
blez a7, .ret_signed_zero
slli t0, t6, 15 # sign<<15
slli t1, a7, 7 # (exp&0xFF)<<7
or a0, t0, t1
or a0, a0, t2 # | mant(7 bits)
ret
.ret_a:
ret
.ret_b:
mv a0, a1
ret
.ret_bf16_nan:
li a0, 0x7FC0
ret
.ret_signed_inf:
li a0, 0x7F80
slli t0, t6, 15
or a0, a0, t0
ret
.ret_signed_zero:
slli a0, t6, 15
ret
```
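The div_loop is a 16-step restoring long division: it computes a 16-bit quotient of (mant_a << 15) / mant_b one bit at a time, from the most significant bit down, instead of using the M extension's div. A C sketch (helper name mine):

```c
#include <stdint.h>

/* Restoring division of the 8-bit mantissas (implicit bit set):
   returns the 16-bit quotient of (mant_a << 15) / mant_b. */
static uint32_t div_mant16(uint32_t mant_a, uint32_t mant_b)
{
    uint32_t dividend = mant_a << 15;        /* t4 */
    uint32_t quotient = 0;                   /* t2 */

    for (int i = 0; i < 16; i++) {           /* t3 */
        quotient <<= 1;
        uint32_t tmp = mant_b << (15 - i);   /* divisor aligned to this bit */
        if (dividend >= tmp) {
            dividend -= tmp;                 /* subtract when it fits... */
            quotient |= 1;                   /* ...and set the quotient bit */
        }
    }
    return quotient;   /* bit 15 is set exactly when mant_a >= mant_b */
}
```

Because bit 15 of the quotient is set whenever mant_a >= mant_b, the subsequent norm_loop needs at most a few left shifts to bring the leading 1 into place before the final right shift by 8.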