# Assignment 1: RISC-V Assembly and Instruction Pipeline
contributed by < [kaihsiang092](https://github.com/kaihsiang092/ca2025-quizzes.git) >
## Problem B : uf8
* [Assembly program for Problem B.](https://github.com/kaihsiang092/ca2025-quizzes/blob/main/q1-uf8.s)
Problem B mainly focuses on compressing a 20-bit unsigned integer into an 8-bit code using **logarithmic quantization** with a **2.5:1 compression ratio**. This method reduces storage requirements while introducing only minor errors, making it suitable for applications where **range is more important than precision**, such as **temperature sensing, distance sensing, and computer graphics**.
The following sections describe the **decoding** and **encoding** methods.
### C code : uf8_decode
```c=
* Decode uf8 to uint32_t */
uint32_t uf8_decode(uf8 fl)
{
uint32_t mantissa = fl & 0x0f;
uint8_t exponent = fl >> 4;
uint32_t offset = (0x7FFF >> (15 - exponent)) << 4;
return (mantissa << exponent) + offset;
}
```
1. uint32_t mantissa = fl & 0x0f;—takes the lower 4 bits of fl as the mantissa.
2. uint8_t exponent = fl >> 4;—takes the upper 4 bits of fl as the exponent.
3. (0x7FFF >> (15 - exponent)) << 4 calculates the starting value of each interval, which corresponds to the formula ${offset}(e) = (2^e - 1) \cdot 16$
Finally, the function returns the decoded value, which is the sum of the interval’s starting value and the offset.
### C code : uf8_encode
1. If the value is less than 16, no compression is needed — return the original value directly.
2. Use CLZ (Count Leading Zeros) to find the position of the most significant bit (MSB), which determines which power-of-two range the value belongs to.
3. Subtracting 4 is an empirical adjustment to make the value distribution smoother. If the result exceeds 15 (the maximum representable by 4 bits), set it to 15.
4. Use a loop to calculate the starting value (offset) of the corresponding exponent range. This represents the minimum value that can be expressed in that range.
5. If the current overflow is already greater than the input value, it means the exponent was overestimated — decrease the exponent and adjust overflow accordingly.
6. Starting from the current overflow, gradually test the next range to see if it’s still less than the value, until the range that just contains the value is found.
7. Within that range, extract the fine-grained offset and right-shift it according to the exponent.
8. The upper 4 bits represent the exponent, and the lower 4 bits represent the mantissa — together forming the final 8-bit uf8 encoded value.
```c=
/* Encode uint32_t to uf8 */
uf8 uf8_encode(uint32_t value)
{
/* Use CLZ for fast exponent calculation */
if (value < 16)
return value;
/* Find appropriate exponent using CLZ hint */
int lz = clz(value);
int msb = 31 - lz;
/* Start from a good initial guess */
uint8_t exponent = 0;
uint32_t overflow = 0;
if (msb >= 5) {
/* Estimate exponent - the formula is empirical */
exponent = msb - 4;
if (exponent > 15)
exponent = 15;
/* Calculate overflow for estimated exponent */
for (uint8_t e = 0; e < exponent; e++)
overflow = (overflow << 1) + 16;
/* Adjust if estimate was off */
while (exponent > 0 && value < overflow) {
overflow = (overflow - 16) >> 1;
exponent--;
}
}
/* Find exact exponent */
while (exponent < 15) {
uint32_t next_overflow = (overflow << 1) + 16;
if (value < next_overflow)
break;
overflow = next_overflow;
exponent++;
}
uint8_t mantissa = (value - overflow) >> exponent;
return (exponent << 4) | mantissa;
}
```
### assembly code
```c=
.data
str1: .string ": produces value "
str2: .string " but encodes back to "
str3: .string ": value "
str4: .string " <= previous_value "
str5: .string "All tests passed.\n"
str6: .string " tests failed.\n"
.text
.globl main
main:
jal ra, test
beq a0, x0, fail
la a0, str5 # print str5
li a7, 4
ecall
li a7, 93 # ecall: exit
li a0, 0 # exit code is 0,
ecall
fail:
la a0, str6 # print str6
li a7, 4
ecall
li a7, 93 # ecall: exit
li a0, 1 # exit code is 1
ecall
# clz(a0 = x) -> a0 = result
clz:
li t0, 32 # n = 32
li t1, 16 # c = 16
clz_loop:
srl t2, a0, t1 # y = x >> c
beq t2, x0, clz_skip # if (y == 0) skip
sub t0, t0, t1 # n -= c
add a0, t2, x0 # x = y
clz_skip:
srli t1, t1, 1 # c >>= 1
bne t1, x0, clz_loop # while(c)
sub a0, t0, a0 # return n - x
jr ra
# a0 = fl (uint8)
# return a0 = value (uint32)
uf8_decode:
andi t0, a0, 0x0F # t0 = mantissa,mantissa = fl & 0x0f;
srli t1, a0, 4 # t1 = exponent,exponent = fl >> 4;
li t2, 15
sub t2, t2, t1 # 15 - exponent
li t3, 0x7FFF
srl t3, t3, t2 # 0x7FFF >> (15 - exponent)
slli t3, t3, 4 # t3 = offset
sll t2, t0, t1 # mantissa << exponent
add a0, t2, t3 # a0 = (mantissa << exponent) + offset
jr ra # jump to ra
uf8_encode:
# if (value < 16) return value;
addi sp, sp, -4
sw ra, 0(sp)
add t6, a0, x0 # t6 = value
addi t0, x0, 16
blt a0, t0, return1
jal ra, clz
# msb = 31 - lz
addi t1, x0, 31
sub t1, t1, a0 # t1 = msb = 31 - lz;
addi t2, x0, 0 # t2 = exponent = 0;
addi t3, x0, 0 # t3 = overflow = 0;
addi t4, t4, 5
blt t1, t4, exact_exponent
addi t2, t1, -4 # exponent = msb - 4;
addi t4, x0, 15
bge t4, t2, Calculate_overflow # if (exponent <= 15) Calculate overflow for estimated exponent
li t2, 15
Calculate_overflow:
addi t4, x0, 0 # e = 0
for1:
slli t5, t3, 1
addi t3, t5, 16
addi t4, t4, 1
blt t4, t2, for1
while1:
blez t2, exact_exponent # if (exponent <= 0) break;
bge t6, t3, exact_exponent # if (value >= overflow) break;
addi t5, t3, -16 # (overflow - 16)
srli t3, t5, 1 # t3 = overflow = (overflow - 16) >> 1;
addi t2, t2, -1
j while1
exact_exponent:
addi t5, x0, 15
while2:
bge t2, t5, return0
slli t4, t3, 1 # (overflow << 1)
addi t4, t4, 16 # t4 = next_overflow = (overflow << 1) + 16;
blt t6, t4, return0 # if (value < next_overflow) break;
add t3, t4, x0
addi t2, t2, 1
j while2
return0:
sub t1, t6, t3 # (value - overflow)
srl t1, t1, t2 # t1 = mantissa = (value - overflow) >> exponent;
slli t0, t2, 4 # (exponent << 4)
or a0, t0, t1 # return (exponent << 4) | mantissa;
return1:
lw ra, 0(sp)
addi sp, sp, 4
jr ra # jump to ra
test:
addi sp, sp, -4
sw ra, 0(sp)
addi s0, x0, -1 # s0 = previous_value
addi s1, x0, 1 # s1 = passed
addi s2, x0, 0 # fl = 0
addi s3, x0,256 # fl = 256
for2:
add a0, s2, x0
jal ra uf8_decode
add s4, a0, x0 # s4 = value
add a0, s4, x0
jal ra uf8_encode
add s5, a0, x0 # s5 = fl2
if1:
beq s2, s5, if2
mv a0, s2
li a7, 34
ecall
la a0, str1 # print str1
li a7, 4
ecall
mv a0, s4 # print value
li a7, 1
ecall
la a0, str2 # print str2
li a7, 4
ecall
mv a0, s5
li a7, 34
ecall
li s1, 0 # passed = false
if2:
blt s0, s4, afterif
mv a0, s2 # print f1
li a7, 34
ecall
la a0, str3 # print str3
li a7, 4
ecall
mv a0, s4 # print value
li a7, 1
ecall
la a0, str4 # print str4
li a7, 4
ecall
mv a0, s0 # prepare to print previous_value
li a7, 34
ecall
li s1, 0 # passed = false
afterif:
mv s0, s4 # previous_value = value;
addi s2, s2, 1
blt s2, s3, for2
mv a0, s1 # return passed
lw ra, 0(sp)
addi sp, sp, 4
jr ra # jump to ra
```
### Result

### Performance
Cycles : 44397

### 5-stage pipelined processor
1. **Single-cycle processor**
Every instruction completes in a single clock cycle—the fetch, decode, execute, memory access, and write-back all happen within the same tick.
2. **5-stage pipelined processor**
The execution of each instruction is split into five stages (IF, ID, EX, MEM, WB), allowing multiple instructions to be in different stages simultaneously. Next, we’ll visualize and explain each of these five stages.
#### Instruction Fetch (IF) :

* **PC:** Holds the address of the current instruction; shown as `PC = 0x00000000`. After each instruction executes, the PC updates to the address of the “next” instruction.
* **Adder (+4):** Adds 4 to the current PC and outputs `0x00000004`, which is the sequential next instruction address.
* **MUX:** The lower-left MUX selects the next PC. Normally it chooses `PC + 4` (sequential execution). For jumps/branches taken, it selects the branch/jump target.
In this figure, because the instruction is `jal x1, 316`, the next PC becomes `PC + 316`.
* **Instruction Memory:** Uses the current PC to fetch the instruction. With `addr = 0x00000000`, the fetched word is `0x13C000EF`, which encodes `jal x1, 316`.
* **Compressed Decoder:** Ripes checks whether the instruction is a 16-bit RISC-V compressed instruction. If so, it expands it to an equivalent 32-bit instruction here.
`jal` is a standard 32-bit instruction, so input and output are the same: `0x13C000EF`.
* **IF/ID Pipeline Register:** Latches the fetched instruction and its PC for the next clock cycle, passing them to the ID stage.
With **enable** lit green and **clear** red (not asserted), the instruction is flowing normally into the ID stage.
#### Instruction Decode / Register Fetch (ID)

* **IF/ID Pipeline Register:** Supplies the PC (address of the current instruction) and the instruction `0x13C000EF` (`jal x1, 316`). This instruction word is forwarded to the **Decode** unit and the **Immediate Generator**.
* **Decode:** Extracts the opcode (lowest 7 bits) to identify the instruction type (here: JAL), as well as `rd`, `rs1`, `rs2` (register indices), and `funct3`/`funct7` (which select ALU behaviors for other instruction types).
* **Registers:** `R1 idx` and `R2 idx` would read source register values, but `jal` doesn’t use source registers, so the inputs are `0x00000000`. The outputs `Reg1` and `Reg2` are also `0x00000000` since this instruction has no operand reads.
* **Immediate Generator:** Extracts the immediate field from the instruction bits; here it outputs `0x0000013C`, the jump offset (316).
* **ID/EX Pipeline Register:** Latches these decoded fields for the next cycle and passes them to the EX stage. The **enable** light being green indicates the register is currently writing/propagating values normally (ID stage is proceeding).
#### Execute(EX)

* **ALU:** Executes arithmetic (ADD/SUB), bitwise ops (AND/OR/XOR), comparisons (SLT/SLTU), and address calculations (branch/jump targets).
Inputs here are the PC (`0x00000000`) and the immediate (`0x0000013C`).
Output `Res = 0x00000000 + 0x0000013C = 0x0000013C`, which is the **jump target address**. This value is sent back to the IF stage on the next cycle to update the PC.
* **ALU input selectors:** The upper MUX selects the PC (`0x00000000`); the lower MUX selects the immediate (`0x0000013C`). Together they steer the ALU to compute `PC + immediate`.
* **Branch Unit:** Determines whether a branch condition is taken (for `beq`, `bne`, `blt`, `bge`, etc.). For `jal`, no comparison is needed—the jump always happens—so **branch taken** is asserted (true).
Once the jump/branch is confirmed in EX, the target address (`0x0000013C`) is forwarded to the IF stage, which uses it on the next cycle to update the PC.
#### Memory access (MEM)

* **Data Memory:** The `jal` instruction does not access data memory. In this cycle, the module is simply passed through with no actual operation performed.
#### Register write back (WB)

* **MUX:** The three-input MUX selects the data source for write-back. The green light indicates the currently selected path is active.
* The rightmost output value `0x00000004` is sent back to the Register File and written to the destination register **x1 (ra)**. This is one of `jal`’s key functions: it stores the address of the next instruction (**PC + 4**) into **x1** as the return address.

After execution, **x1 = 0x00000004** — because `jal` writes **PC + 4** (the return address) into **x1 (ra)**.
## Problem C : bfloat16Square Root operation
* [Assembly program for Problem C.](https://github.com/kaihsiang092/ca2025-quizzes/blob/main/q1-bfloat16.s)
### bfloat16 data
### bf16_isnan、bf16_isinf、bf16_iszero
#### c code
```c=
static inline bool bf16_isnan(bf16_t a)
{
return ((a.bits & BF16_EXP_MASK) == BF16_EXP_MASK) &&
(a.bits & BF16_MANT_MASK);
}
static inline bool bf16_isinf(bf16_t a)
{
return ((a.bits & BF16_EXP_MASK) == BF16_EXP_MASK) &&
!(a.bits & BF16_MANT_MASK);
}
static inline bool bf16_iszero(bf16_t a)
{
return !(a.bits & 0x7FFF);
}
```
#### assembly code
```c=
bf16_isnan:
addi t5, a0, 0 # save a.bits
# t0 = EXP_MASK = 0x7F80 (build via 0xFF << 7)
addi t0, x0, 255
slli t0, t0, 7
and t1, t5, t0 # t1 = a & EXP_MASK
xor t2, t1, t0 # t2 = (a & EXP_MASK) ^ EXP_MASK
sltiu a0, t2, 1 # a0 = (t2 == 0) ? 1 : 0 (exp_all_ones)
# t3 = a & MANT_MASK (0x007F)
andi t3, t5, 127
sltu t4, x0, t3 # t4 = (t3 != 0) ? 1 : 0 (mant_nonzero)
and a0, a0, t4 # a0 = exp_all_ones && mant_nonzero
jalr x0, ra, 0
bf16_isinf:
addi t5, a0, 0 # save a.bits
# t0 = EXP_MASK = 0x7F80
addi t0, x0, 255
slli t0, t0, 7
and t1, t5, t0 # t1 = a & EXP_MASK
xor t2, t1, t0 # t2 = (a & EXP_MASK) ^ EXP_MASK
sltiu a0, t2, 1 # a0 = (t2 == 0) ? 1 : 0 (exp_all_ones)
# t3 = a & MANT_MASK
andi t3, t5, 127
sltiu t4, t3, 1 # t4 = (t3 == 0) ? 1 : 0 (mant_zero)
and a0, a0, t4 # a0 = exp_all_ones && mant_zero
jalr x0, ra, 0
bf16_iszero:
addi t5, a0, 0 # save a.bits
# t0 = 0x7FFF = (0x7F << 8) | 0xFF
addi t0, x0, 127
slli t0, t0, 8
ori t0, t0, 255
and t1, t5, t0 # t1 = a & 0x7FFF
sltiu a0, t1, 1 # a0 = (t1 == 0) ? 1 : 0
jalr x0, ra, 0
```
### f32_to_bf16
#### C code
```c=
static inline bf16_t f32_to_bf16(float val)
{
uint32_t f32bits;
memcpy(&f32bits, &val, sizeof(float));
if (((f32bits >> 23) & 0xFF) == 0xFF)
return (bf16_t) {.bits = (f32bits >> 16) & 0xFFFF};
f32bits += ((f32bits >> 16) & 1) + 0x7FFF;
return (bf16_t) {.bits = f32bits >> 16};
}
```
#### Assembly code
```c=
f32_to_bf16:
# === 取出 exponent (8 bits) ===
srli t0, a0, 23 # t0 = f32bits >> 23
andi t0, t0, 0xFF # t0 = exponent (8 bits)
li t1, 0xFF
beq t0, t1, nan_inf # if exponent == 都是1 → NaN/Inf 處理
# === 一般數值的 round-to-nearest-even ===
srli t1, a0, 16 # t1 = f32bits >> 16
andi t1, t1, 1 # t1 = (f32bits >> 16) & 1 (LSB for tie-to-even)
li t2, 0x7FFF
add t1, t1, t2 # t1 = ((f32bits >>16)&1) + 0x7FFF
add a0, a0, t1 # f32bits += t1
# 取高16位作為 bf16.bits
srli a0, a0, 16 # a0 = f32bits >> 16
li t3, 0xFFFF # t3 = 0xFFFF
and a0, a0, t3 # 保留低16位(確保乾淨)
jr ra
nan_inf:
srli a0, a0, 16 # a0 = f32bits >> 16
li t3, 0xFFFF # t3 = 0xFFFF
and a0, a0, t3 # (f32bits >> 16) & 0xFFFF
jr ra
```
### bf16_to_f32
#### C code
```c=
static inline float bf16_to_f32(bf16_t val)
{
uint32_t f32bits = ((uint32_t) val.bits) << 16;
float result;
memcpy(&result, &f32bits, sizeof(float));
return result;
}
```
#### Assembly code
```c=
bf16_to_f32:
slli a0, a0, 16 #f32bits = ((uint32_t) val.bits) << 16
jr ra
```
### bf16_add
#### C code
```c=
static inline bf16_t bf16_add(bf16_t a, bf16_t b)
{
uint16_t sign_a = (a.bits >> 15) & 1;
uint16_t sign_b = (b.bits >> 15) & 1;
int16_t exp_a = ((a.bits >> 7) & 0xFF);
int16_t exp_b = ((b.bits >> 7) & 0xFF);
uint16_t mant_a = a.bits & 0x7F;
uint16_t mant_b = b.bits & 0x7F;
if (exp_a == 0xFF) {
if (mant_a)
return a;
if (exp_b == 0xFF)
return (mant_b || sign_a == sign_b) ? b : BF16_NAN();
return a;
}
if (exp_b == 0xFF)
return b;
if (!exp_a && !mant_a)
return b;
if (!exp_b && !mant_b)
return a;
if (exp_a)
mant_a |= 0x80;
if (exp_b)
mant_b |= 0x80;
int16_t exp_diff = exp_a - exp_b;
uint16_t result_sign;
int16_t result_exp;
uint32_t result_mant;
if (exp_diff > 0) {
result_exp = exp_a;
if (exp_diff > 8)
return a;
mant_b >>= exp_diff;
} else if (exp_diff < 0) {
result_exp = exp_b;
if (exp_diff < -8)
return b;
mant_a >>= -exp_diff;
} else {
result_exp = exp_a;
}
if (sign_a == sign_b) {
result_sign = sign_a;
result_mant = (uint32_t) mant_a + mant_b;
if (result_mant & 0x100) {
result_mant >>= 1;
if (++result_exp >= 0xFF)
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
}
} else {
if (mant_a >= mant_b) {
result_sign = sign_a;
result_mant = mant_a - mant_b;
} else {
result_sign = sign_b;
result_mant = mant_b - mant_a;
}
if (!result_mant)
return BF16_ZERO();
while (!(result_mant & 0x80)) {
result_mant <<= 1;
if (--result_exp <= 0)
return BF16_ZERO();
}
}
return (bf16_t) {
.bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) |
(result_mant & 0x7F),
};
}
```
#### Assembly code
```c=
bf16_add:
# a0 = a.bits (low 16)
# a1 = b.bits (low 16)
# 使用暫存器:
# t0=sign_a, t1=sign_b, t2=exp_a, t3=exp_b, t4=mant_a, t5=mant_b, t6=temp
# a2=result_sign, a3=result_exp, a4=result_mant, a5=exp_diff
srli t0, a0, 15 # a.bits >> 15
andi t0, t0, 1 # t0=sign_a
srli t1, a1, 15 # b.bits >> 15
andi t1, t1, 1 # t1=sign_b
srli t2, a0, 7 # a.bits >> 7
andi t2, t2, 0xFF # t2=exp_a
srli t3, a1, 7 # b.bits >> 7
andi t3, t3, 0xFF # t3=exp_b
andi t4, a0, 0x7F # t4=mant_a
andi t5, a1, 0x7F # t5=mant_b
addi t6, x0, 0xFF
bne t2, t6, if_1 # if exp_a != 0xFF -> 跳到if_1
if_1:
bne t3, t6, if_2
addi a0, a1, 0 # if (exp_b == 0xFF) return b;
jr ra
if_2: # if (!exp_a && !mant_a) return b;
bne t2, x0, if_3
bne t4, x0, if_3
addi a0, a1, 0
jr ra
if_3: # if (!exp_b && !mant_b) return a;
bne t3, x0, if_4
bne t5, x0, if_4
jr ra
return_b:
addi a0, a1, 0
jr ra
return_a:
jr ra
if_4: # if (exp_a) mant_a |= 0x80;
beq t2, x0, if_5
ori t4, t4, 0x80
if_5: # if (exp_b) mant_b |= 0x80;
beq t3, x0, if_6
ori t5, t5, 0x80
if_6:
sub a5, t2, t3 # exp_diff = exp_a - exp_b
# if exp_diff > 0
slt t6, x0, a5 # t6 = (0 < exp_diff)
beq t6, x0, else_if # 若不是 >0,去檢查 <0
addi a3, t2, 0 # result_exp = exp_a
addi t6, x0, 8
blt t6, a5, return_a # if 8 < exp_diff -> 直接回 a
srl t5, t5, a5 # mant_b >>= exp_diff
jal x0, if_8
else_if: # else if (exp_diff < 0)
slt t6, a5, x0 # t6 = (exp_diff < 0)
beq t6, x0, else # 若不是 <0 -> 相等
addi a3, t3, 0 # result_exp = exp_b
sub t6, x0, a5 # t6 = -exp_diff = exp_b - exp_a
addi a4, x0, 8
blt a4, t6, return_b # if (-exp_diff) > 8 -> 回 b
srl t4, t4, t6 # mant_a >>= (-exp_diff)
jal x0, if_8
else: # exp_diff == 0
add a3, t2, x0 # result_exp = exp_a
if_8:
xor t6, t0, t1
bne t6, x0, DIFF_SIGN # 若異號
add a2, t0, x0 # result_sign = sign_a
add a4, t4, t5 # result_mant = mant_a + mant_b
andi t6, a4, 0x100 # 檢查進位 (bit8)
beq t6, x0, PACK
srli a4, a4, 1
addi a3, a3, 1 # exp++
addi t6, x0, 0xFF
bge a3, t6, RET_INF # 溢出 -> 回 ±Inf
jal x0, PACK
RET_INF:
slli a0, a2, 15 # a0 = (result_sign << 15)
addi t6, x0, 0xFF
slli t6, t6, 7 # 0x7F80 = 0xFF<<7
or a0, a0, t6
jr ra
DIFF_SIGN:
sltu t6, t4, t5 # t6=1 if mant_a < mant_b
bne t6, x0, USE_B
addi a2, t0, 0 # result_sign = sign_a
sub a4, t4, t5 # mant = a - b
jal x0, AFTER_SUB
USE_B:
addi a2, t1, 0 # result_sign = sign_b
sub a4, t5, t4 # mant = b - a
AFTER_SUB:
beq a4, x0, RET_ZERO
NORM_LOOP:
andi t6, a4, 0x80
bne t6, x0, PACK # 若已具 hidden bit -> 組裝
slli a4, a4, 1 # 左規格化
addi a3, a3, -1 # exp--
# 若 exp <= 0 -> 下溢為 0
slt t6, x0, a3 # t6 = (0 < exp)
bne t6, x0, NORM_LOOP
jal x0, RET_ZERO
PACK:
andi t6, a3, 255
slli t6, t6, 7 # (exp&0xFF)<<7
slli a0, a2, 15 # sign<<15
or a0, a0, t6
andi t6, a4, 0x7F
or a0, a0, t6
jr ra
RET_ZERO:
addi a0, x0, 0
jr ra
```
### bf16_sub
#### C code
```c=
static inline bf16_t bf16_sub(bf16_t a, bf16_t b)
{
b.bits ^= BF16_SIGN_MASK;
return bf16_add(a, b);
}
```
#### Assembly code
```c=
bf16_sub:
lui t0, 0x8 # t0 = 0x00008000
xor a1, a1, t0
# return bf16_add(a, b)
jal ra, bf16_add # 呼叫 bf16_add(a0, a1)
jalr x0, ra, 0 # return
```
### bf16_mul
#### C code
```c=
static inline bf16_t bf16_mul(bf16_t a, bf16_t b)
{
uint16_t sign_a = (a.bits >> 15) & 1;
uint16_t sign_b = (b.bits >> 15) & 1;
int16_t exp_a = ((a.bits >> 7) & 0xFF);
int16_t exp_b = ((b.bits >> 7) & 0xFF);
uint16_t mant_a = a.bits & 0x7F;
uint16_t mant_b = b.bits & 0x7F;
uint16_t result_sign = sign_a ^ sign_b;
if (exp_a == 0xFF) {
if (mant_a)
return a;
if (!exp_b && !mant_b)
return BF16_NAN();
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
}
if (exp_b == 0xFF) {
if (mant_b)
return b;
if (!exp_a && !mant_a)
return BF16_NAN();
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
}
if ((!exp_a && !mant_a) || (!exp_b && !mant_b))
return (bf16_t) {.bits = result_sign << 15};
int16_t exp_adjust = 0;
if (!exp_a) {
while (!(mant_a & 0x80)) {
mant_a <<= 1;
exp_adjust--;
}
exp_a = 1;
} else
mant_a |= 0x80;
if (!exp_b) {
while (!(mant_b & 0x80)) {
mant_b <<= 1;
exp_adjust--;
}
exp_b = 1;
} else
mant_b |= 0x80;
uint32_t result_mant = (uint32_t) mant_a * mant_b;
int32_t result_exp = (int32_t) exp_a + exp_b - BF16_EXP_BIAS + exp_adjust;
if (result_mant & 0x8000) {
result_mant = (result_mant >> 8) & 0x7F;
result_exp++;
} else
result_mant = (result_mant >> 7) & 0x7F;
if (result_exp >= 0xFF)
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
if (result_exp <= 0) {
if (result_exp < -6)
return (bf16_t) {.bits = result_sign << 15};
result_mant >>= (1 - result_exp);
result_exp = 0;
}
return (bf16_t) {.bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) |
(result_mant & 0x7F)};
}
```
#### Assembly code
```c=
bf16_mul:
srli t0, a0, 15 # sign_a
andi t0, t0, 1
srli t1, a1, 15 # sign_b
andi t1, t1, 1
srli t2, a0, 7 # exp_a
andi t2, t2, 0xFF
srli t3, a1, 7 # exp_b
andi t3, t3, 0xFF
andi t4, a0, 0x7F # mant_a (7 bits)
andi t5, a1, 0x7F # mant_b (7 bits)
xor a2, t0, t1 # result_sign = sign_a ^ sign_b
addi t6, x0, 255
bne t2, t6, 1
bne t4, x0, RET_A # a 是 NaN → 回 a
# a 是 ±Inf
# 若 b 是 0(exp_b=0 且 mant_b=0)→ NaN
bne t3, x0, RET_INF2 # b 不是次正規/0 → ±Inf
beq t5, x0, RET_NAN # b 的 mant==0 且 exp_b==0 → NaN
# b 是 subnormal(exp_b=0 但 mant_b!=0),仍視為有限 → ±Inf
jal x0, RET_INF2
1: # --- b 的 exponent == 0xFF ---
bne t3, t6, 2f
bne t5, x0, RET_B # b 是 NaN → 回 b
# b 是 ±Inf
bne t2, x0, RET_INF2 # a 非 0 → ±Inf
beq t4, x0, RET_NAN # a==0 → NaN
jal x0, RET_INF2
2: # --- 任一為 0 → 回 ±0(符號=result_sign) ---
# if a == 0
bne t2, x0, 3f
beq t4, x0, RET_SIGNED_ZERO
3:
# if b == 0
bne t3, x0, 4f
beq t5, x0, RET_SIGNED_ZERO
4: # --- exp_adjust = 0 ---
addi a5, x0, 0
# --- 正規化/補 hidden bit:處理 a ---
beq t2, x0, 5f
ori t4, t4, 0x80 # mant_a |= 0x80
jal x0, 6f
5:
# a 是次正規:左移直到有 hidden bit,exp_adjust--
# do { if (mant_a & 0x80) break; mant_a<<=1; exp_adjust--; } while(1)
andi t6, t4, 0x80
bne t6, x0, 555
55:
slli t4, t4, 1
addi a5, a5, -1
andi t6, t4, 0x80
beq t6, x0, 55
555:
addi t2, x0, 1 # exp_a = 1
6: # --- 處理 b ---
beq t3, x0, 7f
ori t5, t5, 0x80 # mant_b |= 0x80
jal x0, 8f
7:
# b 是次正規
andi t6, t5, 0x80
bne t6, x0, 777
77:
slli t5, t5, 1
addi a5, a5, -1
andi t6, t5, 0x80
beq t6, x0, 77
777:
addi t3, x0, 1 # exp_b = 1
8:
# --- 8x8 乘法(位移加法),result_mant=a4 ---
addi a4, x0, 0 # product = 0
addi t6, x0, 8 # 8 次迭代
9: # 迴圈: 若 (mant_b & 1) product += mant_a
andi t0, t5, 1
beq t0, x0, 10f
add a4, a4, t4
10:
slli t4, t4, 1 # mant_a <<= 1
srli t5, t5, 1 # mant_b >>= 1
addi t6, t6, -1
bne t6, x0, 9b
add a3, t2, t3
addi a3, a3, -127
add a3, a3, a5
lui t6, 0x8 # t6 = 0x00008000
and t0, a4, t6
beq t0, x0, 11f
srli a4, a4, 8
andi a4, a4, 0x7F
addi a3, a3, 1
jal x0, 12f
11: # 無 bit15
srli a4, a4, 7
andi a4, a4, 0x7F
12:
# --- 溢出: result_exp >= 0xFF → ±Inf ---
addi t6, x0, 255
bge a3, t6, RET_INF2
# --- 下溢: result_exp <= 0 ---
beq a3, x0, 13f
slt t0, a3, x0 # t0=1 if a3<0
beq t0, x0, 14f # a3>0 → 正常數
13: # result_exp <= 0
addi t6, x0, -6
blt a3, t6, RET_SIGNED_ZERO # 太小 → ±0
# result_mant >>= (1 - result_exp); result_exp=0
addi t0, x0, 1
sub t0, t0, a3 # t0 = 1 - result_exp
srl a4, a4, t0
addi a3, x0, 0
14:
# --- 組裝正常/次正規結果 ---
# bits = (sign<<15) | ((exp&0xFF)<<7) | (mant&0x7F)
andi t6, a3, 255
slli t6, t6, 7
slli a0, a2, 15
or a0, a0, t6
andi t6, a4, 0x7F
or a0, a0, t6
jalr x0, ra, 0
RET_A: # 回 a
addi a0, a0, 0
jalr x0, ra, 0
RET_B: # 回 b
addi a0, a1, 0
jalr x0, ra, 0
RET_INF2: # 回 ±Inf: (sign<<15)|(0xFF<<7)
slli a0, a2, 15
addi t6, x0, 255
slli t6, t6, 7 # t6 = 0x7F80
or a0, a0, t6
jalr x0, ra, 0
RET_NAN: # 回 qNaN: 0x7FC0
addi t6, x0, 255
slli t6, t6, 7 # 0x7F80
ori a0, t6, 0x40 # 0x7FC0
jalr x0, ra, 0
RET_SIGNED_ZERO: # 回 ±0: (result_sign<<15)
slli a0, a2, 15
jalr x0, ra, 0
```
### bf16_div
#### c code
```c=
static inline bf16_t bf16_div(bf16_t a, bf16_t b)
{
uint16_t sign_a = (a.bits >> 15) & 1;
uint16_t sign_b = (b.bits >> 15) & 1;
int16_t exp_a = ((a.bits >> 7) & 0xFF);
int16_t exp_b = ((b.bits >> 7) & 0xFF);
uint16_t mant_a = a.bits & 0x7F;
uint16_t mant_b = b.bits & 0x7F;
uint16_t result_sign = sign_a ^ sign_b;
if (exp_b == 0xFF) {
if (mant_b)
return b;
/* Inf/Inf = NaN */
if (exp_a == 0xFF && !mant_a)
return BF16_NAN();
return (bf16_t) {.bits = result_sign << 15};
}
if (!exp_b && !mant_b) {
if (!exp_a && !mant_a)
return BF16_NAN();
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
}
if (exp_a == 0xFF) {
if (mant_a)
return a;
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
}
if (!exp_a && !mant_a)
return (bf16_t) {.bits = result_sign << 15};
if (exp_a)
mant_a |= 0x80;
if (exp_b)
mant_b |= 0x80;
uint32_t dividend = (uint32_t) mant_a << 15;
uint32_t divisor = mant_b;
uint32_t quotient = 0;
for (int i = 0; i < 16; i++) {
quotient <<= 1;
if (dividend >= (divisor << (15 - i))) {
dividend -= (divisor << (15 - i));
quotient |= 1;
}
}
int32_t result_exp = (int32_t) exp_a - exp_b + BF16_EXP_BIAS;
if (!exp_a)
result_exp--;
if (!exp_b)
result_exp++;
if (quotient & 0x8000)
quotient >>= 8;
else {
while (!(quotient & 0x8000) && result_exp > 1) {
quotient <<= 1;
result_exp--;
}
quotient >>= 8;
}
quotient &= 0x7F;
if (result_exp >= 0xFF)
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
if (result_exp <= 0)
return (bf16_t) {.bits = result_sign << 15};
return (bf16_t) {.bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) |
(quotient & 0x7F)};
}
```
#### assembly code
```c=
bf16_div:
srli t0, a0, 15 # sign_a
andi t0, t0, 1
srli t1, a1, 15 # sign_b
andi t1, t1, 1
srli t2, a0, 7 # exp_a
andi t2, t2, 0xFF
srli t3, a1, 7 # exp_b
andi t3, t3, 0xFF
andi t4, a0, 0x7F # mant_a
andi t5, a1, 0x7F # mant_b
# result_sign = sign_a ^ sign_b
xor a2, t0, t1
addi t6, x0, 255
bne t3, t6, 1f # if (exp_b != 0xFF) 跳過
bne t5, x0, RET_B3 # mant_b!=0 → NaN/NaN payload,回 b
# b 是 ±Inf;若 a 是 +Inf/-Inf(且非 NaN)→ NaN;否則 a/Inf = ±0
bne t2, t6, RET_SIGNED_ZERO3
beq t4, x0, RET_NAN3 # a 是 ±Inf(mant_a==0)→ NaN
jal x0, RET_SIGNED_ZERO3
1:
bne t3, x0, 2f
bne t5, x0, 2f # exp_b==0 && mant_b==0 → b==0
# 若 a==0 → NaN;否則 → ±Inf
bne t2, x0, RET_INF3
beq t4, x0, RET_NAN3
jal x0, RET_INF3
2:
bne t2, t6, 3f # if exp_a != 0xFF 跳過
bne t4, x0, RET_A3 # a 是 NaN → 回 a
# a 是 ±Inf → 回 ±Inf
jal x0, RET_INF3
3:
bne t2, x0, 4f
beq t4, x0, RET_SIGNED_ZERO
4:
beq t2, x0, 5f
ori t4, t4, 0x80 # mant_a |= 0x80
5:
beq t3, x0, 6f
ori t5, t5, 0x80 # mant_b |= 0x80
6:
slli a4, t4, 15 # dividend = (uint32)mant_a << 15
addi a5, x0, 0 # quotient = 0
# divisor_shift = mant_b << 15 (用 t6 當 cur = divisor << 15)
slli t6, t5, 15
addi t0, x0, 16 # t0 = 16 次
L_div_loop:
slli a5, a5, 1 # quotient <<= 1
# if (dividend >= cur) { dividend -= cur; quotient |= 1; }
sltu t1, a4, t6 # t1=1 if dividend < cur(無號比較)
bne t1, x0, L_no_sub
sub a4, a4, t6
ori a5, a5, 1
L_no_sub:
srli t6, t6, 1 # cur >>= 1
addi t0, t0, -1
bne t0, x0, L_div_loop
sub a3, t2, t3
addi a3, a3, 127
# if (!exp_a) result_exp--;
beq t2, x0, L_dec_exp_a
jal x0, L_chk_exp_b
L_dec_exp_a:
addi a3, a3, -1
L_chk_exp_b:
# if (!exp_b) result_exp++;
beq t3, x0, L_inc_exp_b
jal x0, L_norm_q
L_inc_exp_b:
addi a3, a3, 1
L_norm_q:
# if (quotient & 0x8000) quotient >>= 8;
lui t0, 0x8 # t0 = 0x00008000
and t1, a5, t0
beq t1, x0, L_shift_left_phase
srli a5, a5, 8
jal x0, L_pack_mant
L_shift_left_phase:
# while (!(quotient & 0x8000) && result_exp > 1) { quotient <<= 1; result_exp--; }
L_norm_loop:
and t1, a5, t0
bne t1, x0, L_after_left_norm
addi t1, x0, 1
slt t1, t1, a3 # t1=1 if (1 < result_exp) 即 result_exp > 1
beq t1, x0, L_after_left_norm
slli a5, a5, 1
addi a3, a3, -1
jal x0, L_norm_loop
L_after_left_norm:
srli a5, a5, 8
L_pack_mant:
andi a5, a5, 0x7F # mantissa = quotient & 0x7F
addi t0, x0, 255
bge a3, t0, RET_INF3 # result_exp >= 0xFF → ±Inf
# if (result_exp <= 0) → ±0
beq a3, x0, RET_SIGNED_ZERO3
slt t1, a3, x0 # a3 < 0 ?
bne t1, x0, RET_SIGNED_ZERO3
# bits = (sign<<15) | ((exp&0xFF)<<7) | (mant&0x7F)
andi t0, a3, 255
slli t0, t0, 7
slli a0, a2, 15
or a0, a0, t0
or a0, a0, a5
jalr x0, ra 0
RET_A3: # 回 a
addi a0, a0, 0
jalr x0, ra 0
RET_B3: # 回 b
addi a0, a1, 0
jalr x0, ra 0
RET_INF3: # 回 ±Inf: (sign<<15)|(0xFF<<7)
slli a0, a2, 15
addi t0, x0, 255
slli t0, t0, 7 # t0 = 0x7F80
or a0, a0, t0
jalr x0, ra 0
RET_NAN3: # 回 qNaN: 0x7FC0
addi t0, x0, 255
slli t0, t0, 7 # 0x7F80
ori a0, t0, 0x40 # 0x7FC0
jalr x0, ra 0
RET_SIGNED_ZERO3: # 回 ±0: (sign<<15)
slli a0, a2, 15
jalr x0, ra 0
```
### bf16_sqrt
#### c code
```c=
static inline bf16_t bf16_sqrt(bf16_t a)
{
uint16_t sign = (a.bits >> 15) & 1;
int16_t exp = ((a.bits >> 7) & 0xFF);
uint16_t mant = a.bits & 0x7F;
/* Handle special cases */
if (exp == 0xFF) {
if (mant)
return a; /* NaN propagation */
if (sign)
return BF16_NAN(); /* sqrt(-Inf) = NaN */
return a; /* sqrt(+Inf) = +Inf */
}
/* sqrt(0) = 0 (handle both +0 and -0) */
if (!exp && !mant)
return BF16_ZERO();
/* sqrt of negative number is NaN */
if (sign)
return BF16_NAN();
/* Flush denormals to zero */
if (!exp)
return BF16_ZERO();
/* Direct bit manipulation square root algorithm */
/* For sqrt: new_exp = (old_exp - bias) / 2 + bias */
int32_t e = exp - BF16_EXP_BIAS;
int32_t new_exp;
/* Get full mantissa with implicit 1 */
uint32_t m = 0x80 | mant; /* Range [128, 256) representing [1.0, 2.0) */
/* Adjust for odd exponents: sqrt(2^odd * m) = 2^((odd-1)/2) * sqrt(2*m) */
if (e & 1) {
m <<= 1; /* Double mantissa for odd exponent */
new_exp = ((e - 1) >> 1) + BF16_EXP_BIAS;
} else {
new_exp = (e >> 1) + BF16_EXP_BIAS;
}
/* Now m is in range [128, 256) or [256, 512) if exponent was odd */
/* Binary search for integer square root */
/* We want result where result^2 = m * 128 (since 128 represents 1.0) */
uint32_t low = 90; /* Min sqrt (roughly sqrt(128)) */
uint32_t high = 256; /* Max sqrt (roughly sqrt(512)) */
uint32_t result = 128; /* Default */
/* Binary search for square root of m */
while (low <= high) {
uint32_t mid = (low + high) >> 1;
uint32_t sq = (mid * mid) / 128; /* Square and scale */
if (sq <= m) {
result = mid; /* This could be our answer */
low = mid + 1;
} else {
high = mid - 1;
}
}
/* result now contains sqrt(m) * sqrt(128) / sqrt(128) = sqrt(m) */
/* But we need to adjust the scale */
/* Since m is scaled where 128=1.0, result should also be scaled same way */
/* Normalize to ensure result is in [128, 256) */
if (result >= 256) {
result >>= 1;
new_exp++;
} else if (result < 128) {
while (result < 128 && new_exp > 1) {
result <<= 1;
new_exp--;
}
}
/* Extract 7-bit mantissa (remove implicit 1) */
uint16_t new_mant = result & 0x7F;
/* Check for overflow/underflow */
if (new_exp >= 0xFF)
return (bf16_t) {.bits = 0x7F80}; /* +Inf */
if (new_exp <= 0)
return BF16_ZERO();
return (bf16_t) {.bits = ((new_exp & 0xFF) << 7) | new_mant};
}
```
#### assembly code
```c=
bf16_sqrt:
# 取 sign/exp/mant
srli t0, a0, 15 # sign
andi t0, t0, 1
srli t1, a0, 7 # exp
andi t1, t1, 0xFF
andi t2, a0, 0x7F # mant
addi t3, x0, 255
bne t1, t3, SKIP_INF_NAN
bne t2, x0, RET_A4
bne t0, x0, RET_NAN4 # sign!=0 → NaN
jalr x0, ra, 0 # +Inf 直接回 a
SKIP_INF_NAN:
# sqrt(0)=0 :(exp==0 && mant==0)
bne t1, x0, SKIP_ZERO
bne t2, x0, SKIP_ZERO
addi a0, x0, 0
jalr x0, ra, 0
SKIP_ZERO:
# 陰數(有限)→ NaN
beq t0, x0, SKIP_NEG
jal x0, RET_NAN4
SKIP_NEG:
# 次正規 flush-to-zero: exp==0 → 0
bne t1, x0, CONTINUE1
addi a0, x0, 0
jalr x0, ra, 0
CONTINUE1:
# e = exp - 127 (暫存在 a3)
addi a3, t1, -127
# m = 0x80 | mant (a2)
ori a2, t2, 0x80
# 若 e 為奇數:m <<= 1; new_exp = ((e-1)>>1) + 127
andi t3, a3, 1
beq t3, x0, EVEN_EXP
slli a2, a2, 1 # m <<= 1
addi t3, a3, -1
srai t3, t3, 1
addi a3, t3, 127 # new_exp
jal x0, INIT_BSEARCH
EVEN_EXP:
srai t3, a3, 1
addi a3, t3, 127 # new_exp
INIT_BSEARCH:
# Binary search: low=90, high=256, result=128
addi t4, x0, 90
addi t5, x0, 256
addi t6, x0, 128
BSEARCH_LOOP:
# while (low <= high) { ... }
slt t3, t5, t4 # t3=1 if high<low
bne t3, x0, BSEARCH_DONE
# mid = (low + high) >> 1 (用 t6 當 mid)
add t6, t4, t5
srli t6, t6, 1
# prod = mid * mid (a4=prod, a5=multiplier,t3=multiplicand)
addi a4, x0, 0 # prod=0
addi a5, t6, 0 # multiplier = mid
addi t3, t6, 0 # multiplicand = mid
addi t0, x0, 0 # (scratch)
MUL_LOOP:
andi t0, a5, 1
beq t0, x0, MUL_SKIP_ADD
add a4, a4, t3 # prod += multiplicand
MUL_SKIP_ADD:
slli t3, t3, 1 # multiplicand <<= 1
srli a5, a5, 1 # multiplier >>= 1
bne a5, x0, MUL_LOOP
# sq = (prod) >> 7 (a4 作為 sq)
srli a4, a4, 7
# if (sq <= m) { result = mid; low = mid + 1; } else { high = mid - 1; }
# 判斷:m < sq ? (無號比較)
sltu t0, a2, a4 # t0=1 if m < sq
bne t0, x0, TAKE_RIGHT # m<sq → high=mid-1
# sq <= m
addi t6, t6, 0 # result = mid(維持在 t6)
addi t4, t6, 1 # low = mid + 1
jal x0, BSEARCH_LOOP
TAKE_RIGHT:
addi t5, t6, -1 # high = mid - 1
jal x0, BSEARCH_LOOP
BSEARCH_DONE:
# result 現在在 t6
# 規格化:若 result >= 256 → result>>=1; new_exp++
addi t0, x0, 256
slt t1, t6, t0 # t1=1 if result < 256
bne t1, x0, CHECK_LOW
srli t6, t6, 1
addi a3, a3, 1
jal x0, AFTER_NORM
CHECK_LOW:
# 若 result < 128,且 new_exp > 1:左移直到 >=128 或 new_exp==1
addi t0, x0, 128
NORM_LOOP_LOW:
slt t1, t6, t0 # t1=1 if result < 128
beq t1, x0, AFTER_NORM
addi t1, x0, 1
slt t1, t1, a3 # t1=1 if 1 < new_exp
beq t1, x0, AFTER_NORM
slli t6, t6, 1
addi a3, a3, -1
jal x0, NORM_LOOP_LOW
AFTER_NORM:
# new_mant = result & 0x7F
andi t6, t6, 0x7F
# 溢出 / 下溢 檢查
addi t0, x0, 255
bge a3, t0, RET_POS_INF # new_exp >= 0xFF → +Inf
beq a3, x0, RET_ZERO4 # new_exp <= 0 → 0
slt t1, a3, x0
bne t1, x0, RET_ZERO4
# 組裝(正號):bits = ((new_exp&0xFF)<<7) | new_mant
andi t0, a3, 255
slli t0, t0, 7
or a0, t0, t6
jalr x0, ra, 0
# ========= 多個返回路徑 =========
RET_A4: # 回 a(NaN 傳播等)
addi a0, a0, 0
jalr x0, ra, 0
RET_ZERO4: # 回 0
addi a0, x0, 0
jalr x0, ra, 0
RET_POS_INF: # 回 +Inf : 0x7F80
addi t0, x0, 255
slli t0, t0, 7 # 0x7F80
addi a0, t0, 0
jalr x0, ra, 0
RET_NAN4: # 回 qNaN : 0x7FC0
addi t0, x0, 255
slli t0, t0, 7 # 0x7F80
ori a0, t0, 0x40 # 0x7FC0
jalr x0, ra, 0
```
### bf16_eq
#### c code
```c=
static inline bool bf16_eq(bf16_t a, bf16_t b)
{
if (bf16_isnan(a) || bf16_isnan(b))
return false;
if (bf16_iszero(a) && bf16_iszero(b))
return true;
return a.bits == b.bits;
}
```
#### assembly code
```c=
bf16_eq:
srli t0, a0, 7 # t0 = exp_a
andi t0, t0, 255
addi t1, x0, 255
bne t0, t1, CheckIsNanB # if exp_a != 0xFF -> 不是 NaN
andi t2, a0, 0x7F # mant_a
beq t2, x0, CheckIsNanB # mant_a == 0 -> Inf,不是 NaN
addi a0, x0, 0 # a 是 NaN -> false
jalr x0, ra, 0
CheckIsNanB:
srli t0, a1, 7 # t0 = exp_b
andi t0, t0, 255
addi t1, x0, 255
bne t0, t1, CheckBothZero
andi t2, a1, 0x7F # mant_b
beq t2, x0, CheckBothZero
addi a0, x0, 0 # b 是 NaN -> false
jalr x0, ra, 0
CheckBothZero:
lui t3, 0x8 # t3 = 0x00008000
addi t3, t3, -1 # t3 = 0x00007FFF
# (a.bits & 0x7FFF) == 0 ?
and t0, a0, t3
bne t0, x0, CompareBits
# (b.bits & 0x7FFF) == 0 ?
and t1, a1, t3
bne t1, x0, CompareBits
# 兩個都是 ±0 -> true
addi a0, x0, 1
jalr x0, ra, 0
CompareBits:
xor t0, a0, a1
bne t0, x0, ReturnFalse # 若不同 -> false
addi a0, x0, 1 # 相同 -> true
jalr x0, ra, 0
ReturnFalse:
addi a0, x0, 0
jalr x0, ra, 0
```
### bf16_lt
#### c code
```c=
static inline bool bf16_lt(bf16_t a, bf16_t b)
{
if (bf16_isnan(a) || bf16_isnan(b))
return false;
if (bf16_iszero(a) && bf16_iszero(b))
return false;
bool sign_a = (a.bits >> 15) & 1, sign_b = (b.bits >> 15) & 1;
if (sign_a != sign_b)
return sign_a > sign_b;
return sign_a ? a.bits > b.bits : a.bits < b.bits;
}
```
#### assembly code
```c=
bf16_lt:
srli t0, a0, 7 # t0 = exp_a
andi t0, t0, 255
addi t1, x0, 255
bne t0, t1, CheckIsNanB2
andi t2, a0, 0x7F # mant_a
beq t2, x0, CheckIsNanB2 # mant==0 → Inf,不是 NaN
addi a0, x0, 0 # a 是 NaN → false
jalr x0, ra, 0
CheckIsNanB2:
srli t0, a1, 7 # t0 = exp_b
andi t0, t0, 255
addi t1, x0, 255
bne t0, t1, CheckBothZero
andi t2, a1, 0x7F # mant_b
beq t2, x0, CheckBothZero
addi a0, x0, 0 # b 是 NaN → false
jalr x0, ra, 0
CheckBothZero2:
# 構造 0x7FFF 掩碼
lui t3, 0x8 # t3 = 0x00008000
addi t3, t3, -1 # t3 = 0x00007FFF
and t0, a0, t3 # t0 = a.bits & 0x7FFF
bne t0, x0, TakeSigns
and t1, a1, t3 # t1 = b.bits & 0x7FFF
bne t1, x0, TakeSigns
# 兩者皆為 ±0 → a<b 為假
addi a0, x0, 0
jalr x0, ra, 0
TakeSigns:
srli t0, a0, 15 # sign_a
andi t0, t0, 1
srli t1, a1, 15 # sign_b
andi t1, t1, 1
# 若符號不同:回傳 (sign_a > sign_b)
xor t2, t0, t1 # t2 = sign_a ^ sign_b
beq t2, x0, SameSign
# 符號不同時:a< b 當且僅當 sign_a=1 且 sign_b=0
# 亦即直接回傳 sign_a
addi a0, t0, 0
jalr x0, ra, 0
SameSign:
# 若為負號:用位元序反向(a.bits > b.bits → a<b)
bne t0, x0, BothNegative
# 兩者皆為正:a.bits < b.bits → a<b
sltu t2, a0, a1 # t2 = (a0 < a1) ? 1:0
addi a0, t2, 0
jalr x0, ra, 0
BothNegative:
# 兩者皆為負:a.bits > b.bits → a<b
sltu t2, a1, a0 # t2 = (a1 < a0) ? 1:0 <=> a0 > a1
addi a0, t2, 0
jalr x0, ra, 0
```
### bf16_gt
#### c code
```c=
static inline bool bf16_gt(bf16_t a, bf16_t b)
{
return bf16_lt(b, a);
}
```
#### assembly code
```c=
bf16_gt:
addi t0, a0, 0 # t0 = a
addi a0, a1, 0 # a0 = b
addi a1, t0, 0 # a1 = a
jal ra, bf16_lt # call bf16_lt(b, a)
jalr x0, ra, 0 # return
```