---
title: 'Assignment 1: RISC-V Assembly and Instruction Pipeline'
---
# Assignment 1: RISC-V Assembly and Instruction Pipeline
*Source: [Quiz1 solution (HackMD)](https://hackmd.io/@sysprog/arch2025-quiz1-sol)*
## UF8
### Overview
- **uf8** encodes **20‑bit unsigned integers** (range **0–1,015,792**) into **8 bits** via **logarithmic quantization**.
- Target characteristics: about **2.5:1 compression** with **worst‑case relative error ≤ 6.25%** (typical ≈ **3%**).
### Decoding
Given an encoded byte **b**:
- **Formula:** $D(b) = m\cdot 2^{e} + (2^{e} - 1)\cdot 16$
- where $e = \lfloor b/16 \rfloor$ and $m = b \bmod 16$
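
For example, $b = \texttt{0x35} = 53$ gives $e = 3$ and $m = 5$, so $D(b) = 5\cdot 2^{3} + (2^{3}-1)\cdot 16 = 40 + 112 = 152$.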
### Encoding
For a value **v**:
- If **v < 16**, encode **exactly**: $E(v) = v$.
- Otherwise,
- let $\text{offset}(e) = (2^{e} - 1)\cdot 16$ and choose the largest exponent $e$ such that $\text{offset}(e) \le v$
- **Formula:** $E(v) = 16e + \left\lfloor \dfrac{v - \text{offset}(e)}{2^{e}} \right\rfloor$
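
For example, $v = 152$: $\text{offset}(3) = 112 \le 152 < \text{offset}(4) = 240$ gives $e = 3$, so $E(152) = 48 + \lfloor (152 - 112)/2^{3} \rfloor = 48 + 5 = 53 = \texttt{0x35}$, the inverse of the decoding example above.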
### C implementation
::: spoiler code
``` c=
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
typedef uint8_t uf8;
static inline unsigned clz(uint32_t x)
{
int n = 32, c = 16;
do {
uint32_t y = x >> c;
if (y) {
n -= c;
x = y;
}
c >>= 1;
} while (c);
return n - x;
}
/* Decode uf8 to uint32_t */
uint32_t uf8_decode(uf8 fl)
{
uint32_t mantissa = fl & 0x0f;
uint8_t exponent = fl >> 4;
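/* (0x7FFF >> (15 - exponent)) == (1 << exponent) - 1, so offset = ((1 << exponent) - 1) * 16 */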
uint32_t offset = (0x7FFF >> (15 - exponent)) << 4;
return (mantissa << exponent) + offset;
}
/* Encode uint32_t to uf8 */
uf8 uf8_encode(uint32_t value)
{
/* Use CLZ for fast exponent calculation */
if (value < 16)
return value;
/* Find appropriate exponent using CLZ hint */
int lz = clz(value);
int msb = 31 - lz;
/* Start from a good initial guess */
uint8_t exponent = 0;
uint32_t overflow = 0;
if (msb >= 5) {
/* Estimate exponent - the formula is empirical */
exponent = msb - 4;
if (exponent > 15)
exponent = 15;
/* Calculate overflow for estimated exponent */
for (uint8_t e = 0; e < exponent; e++)
overflow = (overflow << 1) + 16;
/* Adjust if estimate was off */
while (exponent > 0 && value < overflow) {
overflow = (overflow - 16) >> 1;
exponent--;
}
}
/* Find exact exponent */
while (exponent < 15) {
uint32_t next_overflow = (overflow << 1) + 16;
if (value < next_overflow)
break;
overflow = next_overflow;
exponent++;
}
uint8_t mantissa = (value - overflow) >> exponent;
return (exponent << 4) | mantissa;
}
/* Test encode/decode round-trip */
static bool test(void)
{
int32_t previous_value = -1;
bool passed = true;
for (int i = 0; i < 256; i++) {
uint8_t fl = i;
int32_t value = uf8_decode(fl);
uint8_t fl2 = uf8_encode(value);
if (fl != fl2) {
printf("%02x: produces value %d but encodes back to %02x\n", fl,
value, fl2);
passed = false;
}
if (value <= previous_value) {
printf("%02x: value %d <= previous_value %d\n", fl, value,
previous_value);
passed = false;
}
previous_value = value;
}
return passed;
}
int main(void)
{
if (test()) {
printf("All tests passed.\n");
return 0;
}
return 1;
}
```
:::
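The `msb - 4` initial guess works because for $e \ge 1$ the decoded values with exponent $e$ span roughly $[2^{e+4}, 2^{e+5})$ (4 mantissa bits plus the offset), so the MSB of the value sits near $e + 4$; the two correction loops then settle the exact exponent within a step or two.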
### RV32I implementation
#### CLZ
**AI Assistance notification**
>This RV32I implementation is assisted by ChatGPT.
:::spoiler code
```c=
static inline unsigned clz(uint32_t x)
{
unsigned n, c, y;
asm volatile(
"li %0, 32 \n\t" // n = 32
"li %2, 16 \n\t" // c = 16
"1: \n\t"
"srl %3, %1, %2 \n\t" // y = x >> c
"beqz %3, 2f \n\t"
"sub %0, %0, %2 \n\t" // n -= c
"mv %1, %3 \n\t" // x = y
"2: \n\t"
"srli %2, %2, 1 \n\t" // c >>= 1
"bnez %2, 1b \n\t"
"sub %0, %0, %1 \n\t" // n -= x (x is 0 or 1)
: "=&r"(n), "+&r"(x), "=&r"(c), "=&r"(y)
:
: "cc"
);
return n;
}
```
:::
This implementation follows the CLZ behavior defined previously.
>To measure the improvement in code size and run time relative to the original C implementation, and to keep the comparison fair, this and the following UF8 routines are written with **asm volatile**.
#### uf8_decode
::: spoiler code
``` c=
static inline uint32_t uf8_decode(uf8 fl)
{
uint32_t r, mant, exp, off;
asm volatile(
"andi %1, %4, 0x0f \n\t" // mant = fl & 0x0f
"srli %2, %4, 4 \n\t" // exp = fl >> 4
// off = ((1 << exp) - 1) << 4
"li %3, 1 \n\t" // off = 1
"sll %3, %3, %2 \n\t" // off <<= exp
"addi %3, %3, -1 \n\t" // off = (1<<exp) - 1
"slli %3, %3, 4 \n\t" // off <<= 4
// r = (mant << exp) + off
"sll %0, %1, %2 \n\t" // r = mant << exp
"add %0, %0, %3 \n\t" // r += off
: "=&r"(r), "=&r"(mant), "=&r"(exp), "=&r"(off)
: "r"(fl)
: "cc"
);
return r;
}
```
:::
#### uf8_encode
::: spoiler code
``` c=
static inline uf8 uf8_encode(uint32_t value)
{
// --- preserved C part ---
if (value < 16)
return (uf8)value;
int lz = clz(value);
// --- assembly for the rest ---
uint32_t out, e, ovf, msb, t0, t1;
asm volatile(
// msb = 31 - lz
"li %3, 31 \n\t"
"sub %3, %3, %7 \n\t"
// e = 0; ovf = 0;
"mv %1, x0 \n\t"
"mv %2, x0 \n\t"
// if (msb >= 5) { e = msb - 4; if (e > 15) e = 15; for(i=0;i<e;i++) ovf=(ovf<<1)+16; while(e>0 && value<ovf){ovf=(ovf-16)>>1; e--;}}
"li %4, 5 \n\t"
"bltu %3, %4, 1f \n\t" // if (msb < 5) skip block
"addi %1, %3, -4 \n\t" // e = msb - 4
"li %4, 15 \n\t"
"bltu %4, %1, 2f \n\t" // if (15 < e) clamp
"j 3f \n\t"
"2: \n\t"
"mv %1, %4 \n\t" // e = 15
"3: \n\t"
// for (i=0; i<e; i++) ovf = (ovf<<1) + 16;
"mv %5, x0 \n\t" // i = 0 (t1)
"4: \n\t"
"beq %5, %1, 5f \n\t"
"slli %2, %2, 1 \n\t"
"addi %2, %2, 16 \n\t"
"addi %5, %5, 1 \n\t"
"j 4b \n\t"
"5: \n\t"
// while (e > 0 && value < ovf) { ovf = (ovf - 16) >> 1; e--; }
"beqz %1, 1f \n\t"
"6: \n\t"
"bltu %6, %2, 7f \n\t" // if (value < ovf) adjust
"j 1f \n\t"
"7: \n\t"
"addi %2, %2, -16 \n\t"
"srli %2, %2, 1 \n\t"
"addi %1, %1, -1 \n\t"
"bnez %1, 6b \n\t"
// } // end if (msb >= 5)
"1: \n\t"
// while (e < 15) { next=(ovf<<1)+16; if (value < next) break; ovf=next; e++; }
"li %4, 15 \n\t"
"bgeu %1, %4, 8f \n\t"
"9: \n\t"
"slli %5, %2, 1 \n\t" // next in t1
"addi %5, %5, 16 \n\t"
"bltu %6, %5, 8f \n\t" // break if value < next
"mv %2, %5 \n\t" // ovf = next
"addi %1, %1, 1 \n\t" // e++
"bltu %1, %4, 9b \n\t"
"8: \n\t"
// mant = (value - ovf) >> e; out = ((e<<4)|mant) & 0xFF
"sub %5, %6, %2 \n\t"
"srl %5, %5, %1 \n\t"
"slli %0, %1, 4 \n\t"
"or %0, %0, %5 \n\t"
"andi %0, %0, 0xff \n\t"
: "=&r"(out), // %0
"=&r"(e), // %1
"=&r"(ovf), // %2
"=&r"(msb), // %3
"=&r"(t0), // %4
"=&r"(t1) // %5
: "r"(value), // %6
"r"(lz) // %7
: "cc"
);
return (uf8)out; // narrowed to 8 bits; ABI-safe
}
```
:::
#### Performance
>Note that the test and main programs are preserved as pure C code to guarantee fairness.
|Baseline (pure C)|C mixed with RV32I|
|--|--|
|||
## UF8 optimization
### CLZ
>Notification:
Since the CLZ baseline (divide and conquer) already provides acceptable performance, this optimization does not stick to the RV32I ISA, in order to exploit the full potential of the 5-stage pipelined CPU.
#### baseline
``` c=
static inline unsigned clz(uint32_t x)
{
int n = 32, c = 16;
do {
uint32_t y = x >> c;
if (y) {
n -= c;
x = y;
}
c >>= 1;
} while (c);
return n - x;
}
```
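For example, clz(1): every shift `x >> c` is zero, so `n` stays 32 and `x` stays 1, and the final `return n - x` yields 31; for x = 0 the subtraction never fires and the function returns 32.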
This implementation is conceptually similar to a binary search.
There are two optimization schemes.
#### CLZ solution 1
Solution 1 is based on a **De Bruijn multiply (branchless, tiny table)**.
Implementation:
``` c=
static inline unsigned clz(uint32_t x) {
static const unsigned char lut[32] = {
0, 9, 1,10,13,21, 2,29,11,14,16,18,22,25, 3,30,
8,12,20,28,15,17,24, 7,19,27,23, 6,26, 5, 4,31
};
if (!x) return 32;
uint32_t v = x; // saturate below MSB to 1s
v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16;
unsigned msb = lut[(v * 0x07C4ACDDu) >> 27];
return 31u - msb;
}
```
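As a quick sanity check, the branchless version can be compared against a bit-by-bit reference. The harness below is a minimal sketch (not part of the assignment code) that assumes the `clz` above is in scope in the same file:
``` c=
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Portable reference: count leading zeros one bit at a time. */
static unsigned clz_ref(uint32_t x)
{
    if (!x)
        return 32;
    unsigned n = 0;
    while (!(x & 0x80000000u)) {
        x <<= 1;
        n++;
    }
    return n;
}

int main(void)
{
    const uint32_t cases[] = {0u, 1u,          2u,          15u,
                              16u, 0x7fffffffu, 0x80000000u, 0xffffffffu};
    for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
        assert(clz(cases[i]) == clz_ref(cases[i]));
    printf("De Bruijn clz matches the reference on all cases.\n");
    return 0;
}
```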
:::spoiler RV32I
``` c=
static inline unsigned clz(uint32_t x) {
static const unsigned char lut[32] = {
0, 9, 1,10,13,21, 2,29,11,14,16,18,22,25, 3,30,
8,12,20,28,15,17,24, 7,19,27,23, 6,26, 5, 4,31
};
unsigned res, v, tmp, prod, idx, msb;
asm volatile(
// if (!x) return 32;
"beqz %6, 0f \n\t"
// v = x; // saturate below MSB to 1s
"mv %1, %6 \n\t"
"srli %2, %1, 1 \n\t" "or %1, %1, %2 \n\t"
"srli %2, %1, 2 \n\t" "or %1, %1, %2 \n\t"
"srli %2, %1, 4 \n\t" "or %1, %1, %2 \n\t"
"srli %2, %1, 8 \n\t" "or %1, %1, %2 \n\t"
"srli %2, %1, 16 \n\t" "or %1, %1, %2 \n\t"
// prod = v * 0x07C4ACDDu (shift-add, modulo 2^32)
// Bits set in 0x07C4ACDD: 0,2,3,4,6,7,10,11,13,15,18,22,23,24,25,26
"mv %3, %1 \n\t" // prod = (v << 0)
"slli %2, %1, 2 \n\t" "add %3, %3, %2 \n\t"
"slli %2, %1, 3 \n\t" "add %3, %3, %2 \n\t"
"slli %2, %1, 4 \n\t" "add %3, %3, %2 \n\t"
"slli %2, %1, 6 \n\t" "add %3, %3, %2 \n\t"
"slli %2, %1, 7 \n\t" "add %3, %3, %2 \n\t"
"slli %2, %1, 10 \n\t" "add %3, %3, %2 \n\t"
"slli %2, %1, 11 \n\t" "add %3, %3, %2 \n\t"
"slli %2, %1, 13 \n\t" "add %3, %3, %2 \n\t"
"slli %2, %1, 15 \n\t" "add %3, %3, %2 \n\t"
"slli %2, %1, 18 \n\t" "add %3, %3, %2 \n\t"
"slli %2, %1, 22 \n\t" "add %3, %3, %2 \n\t"
"slli %2, %1, 23 \n\t" "add %3, %3, %2 \n\t"
"slli %2, %1, 24 \n\t" "add %3, %3, %2 \n\t"
"slli %2, %1, 25 \n\t" "add %3, %3, %2 \n\t"
"slli %2, %1, 26 \n\t" "add %3, %3, %2 \n\t"
// idx = (prod >> 27)
"srli %4, %3, 27 \n\t"
// msb = lut[idx]
"add %5, %7, %4 \n\t"
"lbu %5, 0(%5) \n\t"
// res = 31 - msb
"li %0, 31 \n\t"
"sub %0, %0, %5 \n\t"
"j 1f \n\t"
// x == 0 case
"0: \n\t"
"li %0, 32 \n\t"
"1: \n\t"
: "=&r"(res), // %0
"=&r"(v), // %1
"=&r"(tmp), // %2 (temp for shifts)
"=&r"(prod), // %3 (product)
"=&r"(idx), // %4
"=&r"(msb) // %5
: "r"(x), // %6
"r"(lut) // %7 (base ptr)
: "cc", "memory"
);
return res;
}
```
:::
The multiply in the RV32I version is resolved with a **shift-and-add** sequence.
:::spoiler RV32I/M
```c=
#include <stdint.h>
// result type is unsigned (32-bit)
static inline unsigned clz(uint32_t x) {
static const unsigned char lut[32] = {
0, 9, 1,10,13,21, 2,29,11,14,16,18,22,25, 3,30,
8,12,20,28,15,17,24, 7,19,27,23, 6,26, 5, 4,31
};
unsigned res, v, tmp, idx, msb;
asm volatile(
// if (!x) return 32;
"beqz %5, 0f \n\t"
// v = x; saturate below MSB to 1s:
"mv %1, %5 \n\t"
"srli %2, %1, 1 \n\t" "or %1, %1, %2 \n\t"
"srli %2, %1, 2 \n\t" "or %1, %1, %2 \n\t"
"srli %2, %1, 4 \n\t" "or %1, %1, %2 \n\t"
"srli %2, %1, 8 \n\t" "or %1, %1, %2 \n\t"
"srli %2, %1, 16 \n\t" "or %1, %1, %2 \n\t"
// idx = (v * 0x07C4ACDDu) >> 27; (needs M-extension)
"li %2, 0x07C4ACDD \n\t"
"mul %3, %1, %2 \n\t"
"srli %3, %3, 27 \n\t"
// msb = lut[idx]; (byte table)
"add %4, %6, %3 \n\t"
"lbu %4, 0(%4) \n\t"
// res = 31u - msb;
"li %0, 31 \n\t"
"sub %0, %0, %4 \n\t"
"j 1f \n\t"
// x == 0 case
"0: \n\t"
"li %0, 32 \n\t"
"1: \n\t"
: "=&r"(res), // %0
"=&r"(v), // %1
"=&r"(tmp), // %2 (temps/const)
"=&r"(idx), // %3
"=&r"(msb) // %4
: "r"(x), // %5
"r"(lut) // %6 base pointer to LUT
: "cc", "memory"
);
return res;
}
```
:::
Note that since RV32M is unlocked here, we use **mul** to resolve the multiplication.
#### CLZ solution 2
Solution 2 is based on the GCC built-in **__builtin_clz**.
However, according to https://gcc.gnu.org/onlinedocs/gcc/Bit-Operation-Builtins.html
> Built-in Function: int __builtin_clz (unsigned int x)
Returns the number of leading 0-bits in x, starting at the most significant bit position. If x is 0, the result is undefined.

Hence we add a **branch** to handle the special case of x == 0.
Implementation:
``` c=
static inline __attribute__((always_inline))
unsigned clz(uint32_t x) {
return x ? __builtin_clz(x) : 32u; // UB-free
}
```
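Note that on plain RV32I (without the Zbb extension) GCC typically lowers `__builtin_clz` to a call into the libgcc helper `__clzsi2`, while with Zbb enabled it can emit a single `clz` instruction; the branch above only guards the x == 0 case.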
Evaluation
> Note that the UF8 encode/decode routines are all implemented in the mixed C/RV32I scheme,
while the evaluation and main functions are preserved as pure C.
| Baseline | De Bruijn RV32I | De Bruijn RV32I/M | GCC built-in |
|:--|:--|:--|:--|
|||||
It is actually amusing that the **De Bruijn** solution restricted to the RV32I instruction set falls behind the baseline.
On the other hand, once **RV32M** is unlocked, the **De Bruijn** solution outperforms the baseline.
And, without a doubt, the **GCC built-in** solution has the best performance.
## BF16
### Overview
**bfloat16** is a 16‑bit floating‑point format that keeps the **8‑bit exponent** of FP32 (same dynamic range) but uses a **7‑bit fraction** (+1 sign bit). It trades precision for speed and memory.
| Field | Bits | Description |
|---|---:|---|
| Sign | 1 | 0 = +, 1 = − |
| Exponent | 8 | Bias = **127** (same as FP32) |
| Fraction (mantissa) | 7 | Implicit leading 1 for normals is not stored |
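
For example, the FP32 value of $\pi$ has bit pattern `0x40490FDB`; keeping the top 16 bits gives the BF16 value `0x4049` (and since the discarded low half `0x0FDB` is below `0x8000`, round-to-nearest-even also yields `0x4049`).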
**AI Assistance notification**
> Partial work in this note was assisted by **ChatGPT** (OpenAI). The BF16 implementation benefited from design discussion and an evaluation-program scheme.
| Contributor | Contribution |
|---|---|
| Synte-Peng (author of this note) | Implementation of BF16 helper & arithmetic functions from scratch |
| ChatGPT (OpenAI) | Designed & proposed evaluation program scheme |
### Helper function implementations
#### bf16_isnan
:::spoiler code
``` c=
#
#
#
# input: a0 = x, return 1 if x is NaN else 0
#
#
bf16_isnan: #leverage s0 , s1
addi sp , sp , -32
sw s0 , 4(sp)
sw s1 , 8(sp)
lw s0 , BF16_EXP_MASK
lw s1 , BF16_MANT_MASK
and s1 , s1 , a0
and a0 , a0 , s0
sub a0 , a0 , s0
beq a0 , zero , full_exp
addi a0 , zero , 0
cont_compute_nan:
bne s1 , zero , nan_mant
addi s1 , zero , 0
cont_compute_nan1:
and a0 , s1 , a0
lw s0 , 4(sp)
lw s1 , 8(sp)
addi sp , sp , 32
ret
full_exp:
addi a0 , zero , 1
beq zero , zero , cont_compute_nan
nan_mant:
addi s1 , zero , 1
beq zero , zero , cont_compute_nan1
```
:::
#### bf16_isinf
:::spoiler code
```c=
#
#
#
# input: a0 = x, return 1 if x is inf else 0
#
#
bf16_isinf: #leverage s0
addi sp , sp , -8
sw s0 , 4(sp)
lw s0 , BF16_EXP_MASK
slli a0 , a0 , 17
srli a0 , a0 , 17
beq a0 , s0 , isinf
and a0 , a0 , zero
cont_compute_inf:
lw s0 , 4(sp)
addi sp , sp , 8
ret
isinf:
ori a0 , zero ,1
beq zero , zero , cont_compute_inf
```
:::
#### bf16_iszero
:::spoiler code
```c=
#
#
#
# input: a0 = x, return 1 if x is zero else 0
#
#
bf16_iszero:
slli a0 , a0 , 17
srli a0 , a0 , 17
sltu a0 , zero , a0
xori a0 , a0 , 1
ret
```
:::
#### f32_to_bf16
:::spoiler code
``` c=
#
#
# convert input a0 from f32 float representation into bf16
#
#
f32_to_bf16: #leverage s0 , s1 , s2
addi sp , sp , -16
sw s0 , 12(sp)
sw s1 , 8(sp)
sw s2 , 4(sp)
or s0 , a0 , zero
srli s2 , s0 , 23
andi s2 , s2 , 0xff
ori s1 , zero , 0xff
beq s2 , s1 , f32_to_bf16_is_inf_or_nan
lui s1 , 0x7fff
srli s1 , s1 , 12
srli s0 , s0 , 16
andi s0 , s0 , 1
add s0 , s1 , s0
add a0 , a0 , s0
srli a0 , a0 , 16
cont_convert:
lw s0 , 12(sp)
lw s1 , 8(sp)
lw s2 , 4(sp)
addi sp , sp , 16
ret
f32_to_bf16_is_inf_or_nan:
lui s0 , 0xffff
srli s0 , s0 , 12
srli a0 , a0 , 16
and a0 , a0 , s0
beq zero , zero , cont_convert
```
:::
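The assembly above implements round-to-nearest-even via a bias trick; here is a minimal C sketch of the same logic, for reference only (the name `f32_bits_to_bf16` is hypothetical and not part of the assignment code):
``` c=
#include <stdint.h>

/* Round-to-nearest-even f32 -> bf16 on raw bits (C equivalent of the asm above). */
static inline uint16_t f32_bits_to_bf16(uint32_t f)
{
    if (((f >> 23) & 0xff) == 0xff)      /* Inf or NaN: truncate, keep payload */
        return (uint16_t)(f >> 16);
    /* 0x7FFF rounds up when the discarded half exceeds 0x8000; adding the
     * current LSB of the kept half breaks exact ties toward even. */
    uint32_t bias = 0x7fffu + ((f >> 16) & 1u);
    return (uint16_t)((f + bias) >> 16);
}
```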
#### bf16_to_f32
:::spoiler code
```c=
#
#
# convert input a0 from bf16 representation into f32 float
#
#
bf16_to_f32:
slli a0 , a0 , 16
ret
```
:::
### Arithmetic utilities
#### bf16_add
:::spoiler code
```c=
#
#
# return a0 = a0 + a1 , where all values are represented in bf16
#
#
bf16_add: # s0 : input 1 , s1 : input 2 , s2 : hold 1
addi sp , sp , -128
sw ra , 4(sp)
sw s0 , 8(sp)
sw s1 , 12(sp)
sw s2 , 16(sp)
sw s3 , 20(sp) # if logic handling / free variable
# save the remaining registers up front so every exit path can restore them safely
sw s4 , 24(sp) # a_sign
sw s5 , 28(sp) # b_sign
sw s6 , 32(sp) # a_exp
sw s7 , 36(sp) # b_exp
sw s8 , 40(sp) # a_mant
sw s9 , 44(sp) # b_mant
sw s10 , 48(sp) # free to use
sw s11 , 52(sp) # free to use
sw t0 , 56(sp) # exp_diff
sw t1 , 60(sp) # result_sign
sw t2 , 64(sp) # result_exp
sw t3 , 68(sp) # result_mant
or s0 , a0 , zero
or s1 , a1 , zero
addi s2 , zero , 1
jal bf16_isnan
beq a0 , s2 , add_a_nan
or a0 , a1 , zero
jal bf16_isnan
beq a0 , s2 , add_a_nan
or a0 , s0 , zero
jal bf16_isinf
beq a0 , s2 , add_a_inf # check if a is inf
or a0 , s1 , zero # Place input:2 into a0
jal bf16_isinf
beq a0 , s2 , add_b_inf # check if b is inf
or a0 , s0 , zero
jal bf16_iszero
beq a0 , s2 , add_a_zero
or s3 , s1 , zero # SWAP s0 , s1
or s1 , s0 , zero
or s0 , s3 , zero
or a0 , s0 , zero
jal bf16_iszero
beq a0 , s2 , add_a_zero
# (s4..s11 and t0..t3 already saved in the prologue)
srli s4 , s0 , 15
srli s5 , s1 , 15
srli s6 , s0 , 7
srli s7 , s1 , 7
andi s6 , s6 , 0xff
andi s7 , s7 , 0xff
andi s8 , s0 , 0x7f
andi s9 , s1 , 0x7f
slt s3 , s2 , s6
bne zero , s3 , add_a_normalized
cont_add1:
slt s3 , s2 , s7
bne zero , s3 , add_b_normalized
cont_add2:
sub t0 , s6 , s7
bge t0 , zero , result_exp_is_a
addi s10 , zero , -8
bge s10 , t0 , b_too_large_add
or t2 , s7 , zero
xori s10 , t0 , -1
addi s10 , s10 , 1
srl s8 , s8 , s10
cont_add3:
beq s4 , s5 , add_same_sign_handling
bge s8 , s9 , result_sign_is_a # different signs : larger mantissa decides
# result sign is b
or t1 , zero , s5
sub t3 , s9 , s8
cont_add5:
beq t3 , zero , after_add_is_zero
ori s11 , zero , 0x80
final_add_normalization:
and s3 , t3 , s11
beq s3 , s11, cont_add4
slli t3 , t3 , 1
addi t2 , t2 , -1
bge zero , t2 , after_add_is_zero # equivalent to denormalized case
beq zero , zero , final_add_normalization
cont_add4: # final return stage
slli t1 , t1 , 15
slli t2 , t2 , 7
andi t3 , t3 , 0x7f
or a0 , t1 , zero
or a0 , a0 , t2
or a0 , a0 , t3
complete:
lw ra , 4(sp)
lw s0 , 8(sp)
lw s1 , 12(sp)
lw s2 , 16(sp)
lw s3 , 20(sp)
lw s4 , 24(sp)
lw s5 , 28(sp)
lw s6 , 32(sp)
lw s7 , 36(sp)
lw s8 , 40(sp)
lw s9 , 44(sp)
lw s10 , 48(sp)
lw s11 , 52(sp)
lw t0 , 56(sp)
lw t1 , 60(sp)
lw t2 , 64(sp)
lw t3 , 68(sp)
addi sp , sp , 128
ret
add_a_nan:
lw a0 , BF16_NAN
beq zero , zero , complete
add_a_inf:
or a0 , s1 , zero
jal bf16_isinf
beq a0 , s2 , add_2_inf
or a0 , s0 , zero
beq zero , zero , complete
add_2_inf:
addi s3 , zero , 0x80
slli s3 , s3 , 8
xor s1 , s1 , s3
beq s0 , s1 , inf_minus_inf
or a0 , zero , s0
beq zero , zero , complete
inf_minus_inf:
lw s3 , BF16_NAN
or a0 , zero , s3
beq zero , zero , complete
add_b_inf:
or a0 , s1 , zero
beq zero , zero , complete
add_a_zero:
or a0 , s1 , zero
beq zero , zero , complete
add_a_normalized:
ori s8 , s8 , 0x80
beq zero , zero , cont_add1
add_b_normalized:
ori s9 , s9 , 0x80
beq zero , zero , cont_add2
result_exp_is_a:
ori s10 , zero , 9
bge t0 , s10 , a_too_large_add # exp_diff > 8 : b is negligible
srl s9 , s9 , t0
or t2 , zero , s6
beq zero , zero , cont_add3
a_too_large_add:
or a0 , s0 , zero
beq zero , zero , complete
b_too_large_add:
or a0 , s1 , zero
beq zero , zero , complete
add_same_sign_handling:
or t1 , s4 , zero
add t3 , s8 , s9
andi s10 , t3 , 0x100
srli s10, s10 , 8
bne zero , s10 , normalize_after_add
beq zero , zero , cont_add5
normalize_after_add:
srli t3 , t3 , 1
addi t2 , t2 , 1
ori s10 , zero , 0xff
bge t2 , s10 , add_to_inf
beq zero , zero , cont_add5
add_to_inf:
lui s10 , 0x7f80
srli s10 , s10 , 12
slli t1 , t1 , 15
or a0 , t1 , s10
beq zero , zero , complete
result_sign_is_a:
or t1 , zero , s4
sub t3 , s8 , s9
beq zero , zero , cont_add5
after_add_is_zero:
or a0 , zero , zero
beq zero , zero , complete
```
:::
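In short, `bf16_add` handles NaN/Inf/zero operands first (note the operand-swap trick so one `add_a_zero` path covers both zero cases), unpacks sign/exponent/mantissa and restores the implicit leading 1 for normals, aligns the smaller operand by shifting its mantissa right by the exponent difference (returning the larger operand outright when the difference exceeds 8), adds or subtracts mantissas depending on the signs, then renormalizes and repacks, flushing to zero on underflow and saturating to infinity on overflow.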
#### bf16_sub
::: spoiler code
```c=
#
#
# leverage idea of a - b = a + (-b)
#
#
bf16_sub:
addi sp , sp , -12
sw ra , 8(sp)
sw s0 , 4(sp)
lw s0 , BF16_SIGN_MASK
xor a1 , a1 , s0
jal bf16_add
lw ra , 8(sp)
lw s0 , 4(sp)
addi sp , sp , 12
ret
:::
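In C terms this is simply `return bf16_add(a, b ^ 0x8000);`: flip b's sign bit (BF16_SIGN_MASK) and reuse the adder.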
#### bf16_mul
::: spoiler code
``` c=
#
#
# a0 = a0 * a1 , where all values are in bf16 representation
# add-shift multiplication algorithm is applied
#
#
bf16_mul: # s0 : a , s1 : b , s2 : 1
addi sp , sp , -128
sw ra , 4(sp)
sw s0 , 8(sp)
sw s1 , 12(sp)
sw s2 , 16(sp)
sw s3 , 20(sp) # if logic handling / free variable
sw s4 , 24(sp) # a_sign
sw s5 , 28(sp) # b_sign
or s0 , a0 , zero
or s1 , a1 , zero
addi s2 , zero , 1
srli s4 , s0 , 15
srli s5 , s1 , 15
xor s4 , s5 , s4 # s4 serve for result sign , s5 for exp_adjust
jal bf16_isnan
beq s2 , a0 , mul_a_nan
or a0 , s0 , zero
jal bf16_isinf
beq s2 , a0 , mul_a_inf
or a0 , s1 , zero
jal bf16_isnan
beq s2 , a0 , mul_b_nan
or a0 , s1 , zero
jal bf16_isinf
beq s2 , a0 , mul_b_inf
# s4 already holds the result sign (a_sign ^ b_sign) ; the saves were moved to the prologue
srli s6 , s0 , 7
srli s7 , s1 , 7
andi s6 , s6 , 0xff
andi s7 , s7 , 0xff
andi s8 , s0 , 0x7f
andi s9 , s1 , 0x7f
or s5 , zero , zero # s5 serve for exp adjust
or s3 , zero , zero # if logic handling
normalize_mant_a_mul:
add s3 , s6 , zero
slt s3 , s3 , s2
ori s10 , zero , 0x7f
beq zero , s3 , normalize_mant_b_mul
normalizing_a_mul:
# loop
andi s3 , s8 , 0x80
slt s3 , s3 , s10
bne s2 , s3 , a_normalized_mul
slli s8 , s8 , 1
addi s5 ,s5 , -1
beq zero , zero , normalizing_a_mul
a_normalized_mul:
addi s6 , zero , 1
normalize_mant_b_mul:
ori s8 , s8 , 0x80
add s3 , s7 , zero
slt s3 , s3 , s2
beq zero , s3 , normalize_mant_b_complete_mul
normalizing_b_mul:
# loop
andi s3 , s9 , 0x80
slt s3 , s3 , s10
bne s2 , s3 , b_normalized_mul
slli s9 , s9 , 1
addi s5 , s5 , -1
beq zero , zero , normalizing_b_mul
b_normalized_mul:
addi s7 , zero , 1
normalize_mant_b_complete_mul:
ori s9 ,s9 , 0x80
# multiplication loop
ori s3 , zero , 8
or t0 , zero , s9
slli s8 , s8 , 8
start_mant_mul:
beq zero , s3 , mul_mant_complete
addi s3 , s3 , -1
andi s10 , t0 , 1
bne s2 , s10 , partial_addition_finished
add t0 , t0 ,s8
partial_addition_finished:
srli t0 , t0 , 1
beq zero , zero , start_mant_mul
mul_mant_complete:
srli s8 , s8 , 16
add t1 , s6 , s7
addi t1 , t1 , -127
add t1 , t1 , s5
lui s10 , 0x00008
and s10 , s10 , t0
slt s3 , s10 , s2
bne zero , s3 , no_adjustment_after_mant_mul
srli t0 , t0 , 8
andi t0 , t0 , 0x7f
addi t1 ,t1 , 1
beq zero , zero , exp_adjustment_complete
no_adjustment_after_mant_mul:
srli t0 , t0 ,7
andi t0 , t0 , 0x7f
exp_adjustment_complete:
# mul to inf checking
addi s3 , zero , 0xff
bge t1 , s3 , mul_to_overflow
bge zero , t1 , check_mul_to_denormalized # exp <= 0 : denormal range
result_normalization_complete:
or a0 , zero , t0
andi t1 , t1 , 0xff
slli t1 , t1 , 7
or a0 , a0 , t1
slli s4 , s4 , 15
or a0 , a0 , s4
mul_final:
lw ra , 4(sp)
lw s0 , 8(sp)
lw s1 , 12(sp)
lw s2 , 16(sp)
lw s3 , 20(sp)
lw s4 , 24(sp)
lw s5 , 28(sp)
lw s6 , 32(sp)
lw s7 , 36(sp)
lw s8 , 40(sp)
lw s9 , 44(sp)
lw s10 , 48(sp)
lw s11 , 52(sp)
lw t0 , 56(sp)
lw t1 , 60(sp)
addi sp , sp , 128
ret
mul_a_nan:
or a0 , s0 , zero
beq zero , zero , mul_final
mul_a_inf:
slli a0 , s1 , 17
srli a0 , a0 , 17
beq zero , a0 , mul_a_to_nan # inf * 0 -> NaN
slli a0 , s4 , 15 # result sign in place
slli s0 , s0 , 17
srli s0 , s0 , 17 # strip a's own sign
or a0 , a0 , s0 # inf with the result sign
beq zero , zero , mul_final
mul_a_to_nan:
lw a0 , BF16_NAN
beq zero , zero, mul_final
mul_b_nan:
or a0 , s1 , zero
beq zero , zero , mul_final
mul_b_inf:
slli a0 , s0 , 17
srli a0 , a0 , 17
beq zero , a0 , mul_a_to_nan # 0 * inf -> NaN
slli a0 , s4 , 15 # result sign in place
slli s1 , s1 , 17
srli s1 , s1 , 17 # strip b's own sign
or a0 , a0 , s1 # inf with the result sign
beq zero , zero , mul_final
mul_to_overflow:
lw a0 , BF16_EXP_MASK
slli s4 , s4 , 15
or a0 , a0 ,s4
beq zero , zero , mul_final
check_mul_to_denormalized:
addi s3 , zero , -6
bge s3 , t1 , mul_to_denormalized
sub s2 , s2 , t1 # shift = 1 - exp
srl t0 , t0 , s2 # denormalize : shift mantissa right
or t1 , zero , zero # result exp = 0
beq zero , zero , result_normalization_complete
mul_to_denormalized:
or a0 , zero , s4
slli a0 , a0 , 15
beq zero , zero , mul_final
```
:::
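The mantissa product itself is an 8-iteration shift-and-add loop: the multiplier sits in the low bits of `t0` while `s8` holds the multiplicand pre-shifted left by 8, so each step conditionally adds and shifts right once; afterwards the exponent is computed as `a_exp + b_exp - 127` (plus any normalization adjustment) and the result is renormalized, saturated to infinity, or flushed toward the denormal/zero range.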
#### bf16_div
:::spoiler code
```c=
#
#
# a0 = a0 / a1 , which all in bf16 representation
# shift-and-subtract (restoring) division algorithm is applied
#
#
bf16_div:
addi sp , sp , -128
sw ra , 4(sp)
sw s0 , 8(sp)
sw s1 , 12(sp)
sw s2 , 16(sp)
sw s3 , 20(sp) # if logic handling / free variable
or s0 , zero , a0
or s1 , zero , a1
sw s4 , 24(sp) # a_sign
sw s5 , 28(sp) # b_sign
# save the remaining registers up front so every exit path can restore them safely
sw s6 , 32(sp) # a_exp
sw s7 , 36(sp) # b_exp
sw s8 , 40(sp) # a_mant
sw s9 , 44(sp) # b_mant
sw s10 , 48(sp) # free to use
sw s11 , 52(sp) # free to use
sw t0 , 56(sp) # serve for dividend
sw t1 , 60(sp) # serve for divisor
sw t2 , 64(sp) # serve for quotient
sw t3 , 68(sp) # free to use
srli s4 , s0 , 15
srli s5 , s1 , 15
xor s4 , s5 , s4 # s4 serve for result sign , s5 for exp_adjust
addi s2 , zero , 1
jal bf16_isnan
beq a0 , s2 , div_to_nan
or a0 , s1 , zero
jal bf16_isnan
beq a0 , s2 , div_to_nan
or a0 , s1 , zero
jal bf16_iszero
beq a0 , s2 , div_b_zero
or a0 , s1 , zero
jal bf16_isinf
beq a0 , s2 , div_b_inf
or a0 , s0 , zero
jal bf16_isinf
beq a0 , s2 , div_to_inf
or a0 , s0 , zero
jal bf16_iszero
beq a0 , s2 , div_to_zero
srli s6 , s0 , 7
srli s7 , s1 , 7
andi s6 , s6 , 0xff
andi s7 , s7 , 0xff
andi s8 , s0 , 0x7f
andi s9 , s1 , 0x7f
beq s6 , zero , a_mant_normalized
ori s8 , s8 , 0x80
a_mant_normalized:
beq s7 , zero , b_mant_normalized
ori s9 , s9 , 0x80
b_mant_normalized:
slli t0 , s8 , 15
or t1 , s9 , zero
or t2 , zero , zero
addi s10 , zero , 16
start_division:
beq s10 , zero , division_complete
slli t2 ,t2 ,1
addi s10 , s10 , -1
sll s11 , t1 , s10
sub t3 , s11 , t0
slt s3 , zero , t3
bne zero , s3 , no_subtraction
sub t0 , t0 , s11
ori t2 , t2 , 1
no_subtraction:
beq zero , zero , start_division
division_complete:
sub t1 , s6 , s7
addi t1 , t1 , 127 # t1 serve for result exp
bne zero , s6 , div_1_finished
addi t1 , t1 , -1
div_1_finished:
bne zero , s7 , div_by_1_finished
addi t1 , t1 , 1
div_by_1_finished:
lui s10 , 0x00008
and s3 , s10 , t2
bne zero , s3 , quotient_normalized
quotient_normalize_start:
and s3 , s10 , t2
slt s3 , s3 , s2
slt s11 , s2 , t1
and s3 , s3 , s11 # keep shifting only while exp > 1
beq zero , s3 , quotient_normalized
slli t2 , t2 ,1
addi t1 , t1 , -1
beq zero , zero , quotient_normalize_start
quotient_normalize_complete:
andi t2 , t2 , 0x7f
ori s3 , zero , 0xff
bge t1 , s3 , div_to_inf
bge zero , t1 , div_to_zero
slli a0 , s4 , 15
andi t1 , t1 , 0xff
slli t1 , t1 , 7
or a0 , a0 , t1
andi t2 , t2 , 0x7f
or a0 , a0 , t2
div_final:
lw ra , 4(sp)
lw s0 , 8(sp)
lw s1 , 12(sp)
lw s2 , 16(sp)
lw s3 , 20(sp)
lw s4 , 24(sp)
lw s5 , 28(sp)
lw s6 , 32(sp)
lw s7 , 36(sp)
lw s8 , 40(sp)
lw s9 , 44(sp)
lw s10 , 48(sp)
lw s11 , 52(sp)
lw t0 , 56(sp)
lw t1 , 60(sp)
lw t2 , 64(sp)
lw t3 , 68(sp)
addi sp , sp , 128
ret
div_to_nan:
lw a0 , BF16_NAN
beq zero , zero , div_final
div_b_zero:
or a0 , s0 , zero
jal bf16_iszero
beq a0 , s2 , div_to_nan
div_to_inf:
or a0 , s4 , zero
slli a0 , a0 , 15
lw s3 , BF16_EXP_MASK
or a0 , a0 , s3
beq zero , zero , div_final
div_b_inf:
or a0 , s0 , zero
jal bf16_isinf
beq s2 , a0 , div_to_nan
div_to_zero:
or a0 , zero , zero
beq zero , zero , div_final
quotient_normalized:
srli t2 , t2 , 8
beq zero , zero , quotient_normalize_complete
```
:::
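The quotient loop is a 16-iteration restoring (shift-and-subtract) division: the dividend mantissa is pre-shifted left by 15, and each step compares it against the divisor shifted to the current bit position, subtracting and setting the quotient bit when it fits; the exponent is then `a_exp - b_exp + 127`, followed by quotient normalization and overflow/underflow checks.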
#### bf16_sqrt
:::spoiler code
```c=
#
#
# a0 = a0 ^ 0.5
# binary search for root square is applied.
#
#
bf16_sqrt:
addi sp , sp , -128
sw ra , 4(sp)
sw s0 , 8(sp)
sw s1 , 12(sp)
sw s2 , 16(sp)
sw s3 , 20(sp)
sw s4 , 24(sp)
sw s5 , 28(sp)
# save the remaining registers up front so every exit path can restore them safely
sw s6 , 32(sp) # low
sw s7 , 36(sp) # high
sw s8 , 40(sp) # mid
sw s9 , 44(sp) # sq
sw s10, 48(sp) # result
sw s11, 52(sp) # free to use
sw t0 , 56(sp) # free to use
or s0 , a0 , zero # s0 reserve a0
srli s1 , a0 , 15 # s1 reserve sign
andi s1 , s1 , 1
srli s2 , s0 , 7 # s2 reserve exp
andi s2 , s2 , 0xff
andi s3 , a0 , 0x7f # s3 reserve mant
addi s4 , zero , 1 # s4 hold 1
jal bf16_iszero
beq s4 , a0 , sqrt_zero # handling input zero
bne zero , s1 , sqrt_nan # negative input
ori s5 , zero , 0xff # s5 : free to use
beq s2 , s5 , sqrt_special_case # handling inf or nan
beq zero , s2 , sqrt_zero # handling denormalized
addi s2 , s2 , -127
ori s3 , s3 , 0x80
andi s5 , s2 , 1
beq zero , s5 , sqrt_exp_complete
slli s3 , s3 , 1
addi s2 , s2 , -1
sqrt_exp_complete:
srai s2 , s2 , 1
addi s2 , s2 , 127
addi s6 , zero , 90
addi s7 , zero , 256
addi s10 , zero , 128
start_binary_search_for_root:
blt s7 , s6 , search_finished
add s8 , s6 , s7
srli s8 , s8 , 1
# prepare for unsigned multiplication
addi s5 , zero , 9 # 9 iterations : mid may be 256 , which needs 9 bits
or s11 , s8 , zero
slli s8 , s8 , 9
start_18bit_mul: # 9-bit x 9-bit shift-add multiply
beq s5 , zero , bit16_mul_completed
addi s5 , s5 , -1
andi t0 , s11 , 1
beq zero , t0 , partial_add_completed
add s11 , s11 , s8
partial_add_completed:
srli s11 , s11 , 1
beq zero , zero , start_18bit_mul
bit16_mul_completed:
srli s8 , s8 , 9
srli s9 , s11 , 7
bge s3 , s9 , update_result
addi s7 , s8 , -1
beq zero , zero , start_binary_search_for_root
update_result:
or s10 , zero , s8
addi s6 , s8 , 1
beq zero , zero , start_binary_search_for_root
search_finished:
ori s5 , zero , 256
blt s10 , s5 , sqrt_even_exp
srli s10 , s10 , 1
addi s2 , s2 ,1
beq zero , zero , normalize_sqrt_result_complete
sqrt_even_exp:
ori s5 , zero , 128
bge s5 , s10 , normalize_sqrt_result_complete
slt s11 , s10 , s5
slt t0 , s4 , s2
and s11 , s11 , t0
bne s4 , s11 , normalize_sqrt_result_complete
slli s10 , s10 , 1
addi s2 , s2 , -1
normalize_sqrt_result_complete:
bge zero , s2 , sqrt_zero
andi s10 , s10 , 0x7f
or a0 , zero , s10
andi s2 , s2 , 0xff
slli s2 , s2 , 7
or a0 , a0 , s2
sqrt_complete:
lw ra , 4(sp)
lw s0 , 8(sp)
lw s1 , 12(sp)
lw s2 , 16(sp)
lw s3 , 20(sp)
lw s4 , 24(sp)
lw s5 , 28(sp)
lw s6 , 32(sp)
lw s7 , 36(sp)
lw s8 , 40(sp)
lw s9 , 44(sp)
lw s10, 48(sp)
lw s11, 52(sp)
lw t0 , 56(sp)
addi sp , sp , 128
ret
sqrt_special_case: # inf or nan input : return as-is
or a0 , s0 , zero
beq zero , zero , sqrt_complete
sqrt_nan:
lw a0 , BF16_NAN
beq zero , zero , sqrt_complete
sqrt_zero:
lw a0 , BF16_ZERO
beq zero , zero , sqrt_complete
```
:::
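The algorithm first makes the unbiased exponent even (shifting the mantissa left once when it is odd) so the exponent can simply be halved, then binary-searches the integer `mid` in [90, 256] whose square, scaled as `mid^2 / 128`, best matches the 128-scaled mantissa; the 9-bit squaring inside the loop reuses the same shift-and-add multiplier idea as `bf16_mul`.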
## BF16 Evaluation
**AI Assistance notification**
> Partial work in this note was assisted by **ChatGPT** (OpenAI).
### Overview
A tiny, fixed-size test harness for BF16 primitives (Ripes-friendly).
It validates:
- **Arithmetic**: `bf16_add`, `bf16_sub`, `bf16_mul`, `bf16_div` on **3 fixed cases**
- **Sqrt + Conversion**: `bf16_sqrt` on 3 inputs and **BF16 round-trip**
`f32_to_bf16(bf16_to_f32(x_bf16)) == x_bf16`
---
### Test Inputs (Arithmetic)
**3 (a,b) pairs** used for all four arithmetic ops:
| Case | A (bf16) | B (bf16) | Notes |
|---:|:---:|:---:|---|
| 10 | `0x7F7F` | `0x7F7F` | max-finite vs max-finite |
| 15 | `0xBF2A` | `0xC1BB` | negatives |
| 16 | `0xC283` | `0xBF2A` | negatives |
#### Expected Results (Arithmetic)
| Case | ADD | SUB | MUL | DIV |
|---:|:---:|:---:|:---:|:---:|
| 10 | `0x7F80` | `0x0000` | `0x7F80` | `0x3F80` |
| 15 | `0xC1C0` | `0x41B6` | `0x4178` | `0x3CE8` |
| 16 | `0xC284` | `0xC282` | `0x422D` | `0x42C5` |
---
### SQRT & BF16 Roundtrip
**Inputs (bf16):** `0x1234`, `0x5678`, `0x789A`
**Expected `bf16_sqrt` outputs:** `0x28D6`, `0x4AFC`, `0x5C0C`
**Round-trip property (bf16 only):**
f32_to_bf16( bf16_to_f32(x_bf16) ) == x_bf16
This guarantees the conversion pair is consistent for BF16-representable values.
*(Note: the F32→BF16→F32 roundtrip check was removed by design.)*
---
### Data Layout (word-aligned, fixed-size)
- input
- `TEST_A[3]` — bf16 words: `0x7F7F, 0xBF2A, 0xC283`
- `TEST_B[3]` — bf16 words: `0x7F7F, 0xC1BB, 0xBF2A`
- corresponding output
- `EXP_ADD[3]` — bf16 words: `0x7F80, 0xC1C0, 0xC284`
- `EXP_SUB[3]` — bf16 words: `0x0000, 0x41B6, 0xC282`
- `EXP_MUL[3]` — bf16 words: `0x7F80, 0x4178, 0x422D`
- `EXP_DIV[3]` — bf16 words: `0x3F80, 0x3CE8, 0x42C5`
- `SQRT_IN[3]` — bf16 words: `0x1234, 0x5678, 0x789A`
- `EXP_SQRT[3]` — bf16 words: `0x28D6, 0x4AFC, 0x5C0C`
---
### Execution Flow
#### `test_arith()`
For each `i in {0,1,2}`:
1. Load `a = TEST_A[i]`, `b = TEST_B[i]`.
2. Compute `add/sub/mul/div` via `bf16_*` routines.
3. Compare each result to `EXP_*[i]` via `exp_cmp16`.
4. Return **1** if all 12 checks pass; otherwise **0**.
#### `test_sqrt_and_convert()`
For each `i in {0,1,2}`:
1. Compute `bf16_sqrt(SQRT_IN[i])` and compare to `EXP_SQRT[i]`.
2. Check **BF16 roundtrip** `f32_to_bf16(bf16_to_f32(SQRT_IN[i])) == SQRT_IN[i]`.
3. Return **1** if all checks pass; otherwise **0**.
#### `main`
- Runs both routines and ANDs the results.
- Ripes syscalls print exactly one line:
- `"All tests passed.\n"` when all checks pass.
- `"Some tests failed.\n"` otherwise.
- Exits with code **0** on pass, **1** on fail.
---
### Comparator (bit-exact for bf16)
`exp_cmp16(result, expected)` uses the zero-test idiom:
```asm
sub t0, a0, a1
sltiu a0, t0, 1 # a0 = 1 if equal, else 0
```
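`sltiu a0, t0, 1` sets a0 to 1 exactly when t0 == 0 (it is the `seqz` pseudo-instruction), so the pair implements a branch-free, bit-exact compare.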
### Evaluation program implementation
:::spoiler code
```c=
.data
# ===== 3 arithmetic test pairs: CASE 10, CASE 15, CASE 16 =====
TEST_A:
.word 0x7F7F # CASE 10
.word 0xBF2A # CASE 15
.word 0xC283 # CASE 16
TEST_B:
.word 0x7F7F # CASE 10
.word 0xC1BB # CASE 15
.word 0xBF2A # CASE 16
# ===== Expected results (exact bit patterns) =====
# ADD
EXP_ADD:
.word 0x7F80 # CASE 10
.word 0xC1C0 # CASE 15
.word 0xC284 # CASE 16
# SUB
EXP_SUB:
.word 0x0000 # CASE 10
.word 0x41B6 # CASE 15
.word 0xC282 # CASE 16
# MUL
EXP_MUL:
.word 0x7F80 # CASE 10
.word 0x4178 # CASE 15
.word 0x422D # CASE 16
# DIV
EXP_DIV:
.word 0x3F80 # CASE 10
.word 0x3CE8 # CASE 15
.word 0x42C5 # CASE 16
# ===== 3 sqrt inputs (bf16) and expected sqrt(bf16) =====
SQRT_IN:
.word 0x1234, 0x5678, 0x789A
EXP_SQRT:
.word 0x28D6, 0x4AFC, 0x5C0C
# ===== Messages =====
MSG_ALL: .asciz "All tests passed.\n"
MSG_FAILS: .asciz "Some tests failed.\n"
.text
.globl main
# -----------------------------------------------------------------------------
# main: run both groups and print one line
# -----------------------------------------------------------------------------
main:
jal ra, test_arith
mv s0, a0
jal ra, test_sqrt_and_convert
and s0, s0, a0
bnez s0, AllPass
la a0, MSG_FAILS
li a7, 4
ecall
li a7, 10
li a0, 1
ecall
AllPass:
la a0, MSG_ALL
li a7, 4
ecall
li a7, 10
li a0, 0
ecall
# -----------------------------------------------------------------------------
# exp_cmp16(a0=result16, a1=expected16) -> a0 = 1 pass, 0 fail
# bit-exact compare (finite, ±Inf, subnormals, ±0 with sign)
# -----------------------------------------------------------------------------
exp_cmp16:
sub t0, a0, a1
sltiu a0, t0, 1
ret
# -----------------------------------------------------------------------------
# exp_cmp32(a0=result32, a1=expected32) -> a0 = 1 pass, 0 fail
# bit-exact compare for f32 (raw bits in integer reg)
# -----------------------------------------------------------------------------
exp_cmp32:
sub t0, a0, a1
sltiu a0, t0, 1
ret
# -----------------------------------------------------------------------------
# test_arith() -> a0 = 1 if all ADD/SUB/MUL/DIV pass for 3 pairs, else 0
# -----------------------------------------------------------------------------
.globl test_arith
test_arith:
addi sp, sp, -4
sw ra, 0(sp)
la s0, TEST_A
la s1, TEST_B
la s2, EXP_ADD
la s3, EXP_SUB
la s4, EXP_MUL
la s5, EXP_DIV
li s6, 1 # all_pass = 1
li t6, 0 # i = 0
L_arith_loop:
li t5, 3
beq t6, t5, L_arith_done
# load a,b
lw a0, 0(s0)
lw a1, 0(s1)
# --- ADD ---
jal ra, bf16_add
lw a1, 0(s2)
jal ra, exp_cmp16
bnez a0, 1f
li s6, 0
1:
# --- SUB ---
lw a0, 0(s0)
lw a1, 0(s1)
jal ra, bf16_sub
lw a1, 0(s3)
jal ra, exp_cmp16
bnez a0, 2f
li s6, 0
2:
# --- MUL ---
lw a0, 0(s0)
lw a1, 0(s1)
jal ra, bf16_mul
lw a1, 0(s4)
jal ra, exp_cmp16
bnez a0, 3f
li s6, 0
3:
# --- DIV ---
lw a0, 0(s0)
lw a1, 0(s1)
jal ra, bf16_div
lw a1, 0(s5)
jal ra, exp_cmp16
bnez a0, 4f
li s6, 0
4:
# advance to next case
addi s0, s0, 4
addi s1, s1, 4
addi s2, s2, 4
addi s3, s3, 4
addi s4, s4, 4
addi s5, s5, 4
addi t6, t6, 1
j L_arith_loop
L_arith_done:
mv a0, s6
lw ra, 0(sp)
addi sp, sp, 4
ret
# -----------------------------------------------------------------------------
# test_sqrt_and_convert() -> a0 = 1 if all pass, else 0
# For each x in SQRT_IN:
# (1) bf16_sqrt(x) == EXP_SQRT[i] (bf16 compare)
# (2) f32_to_bf16(bf16_to_f32(x)) == x (bf16 round-trip)
# -----------------------------------------------------------------------------
.globl test_sqrt_and_convert
test_sqrt_and_convert:
addi sp, sp, -8
sw ra, 4(sp)
sw s0, 0(sp)
la s0, SQRT_IN
la s1, EXP_SQRT
li s2, 1 # all_pass = 1
li t6, 0
L_sc_loop:
li t5, 3
beq t6, t5, L_sc_done
# ---- SQRT check ----
lw a0, 0(s0) # x_bf16
jal ra, bf16_sqrt # -> a0 = sqrt_bf16
lw a1, 0(s1) # expected sqrt
jal ra, exp_cmp16
bnez a0, 10f
li s2, 0
10:
# ---- BF16 roundtrip: f32_to_bf16(bf16_to_f32(x)) == x ----
lw a0, 0(s0) # x_bf16
jal ra, bf16_to_f32 # -> x_f32_bits
jal ra, f32_to_bf16 # -> back_bf16
lw a1, 0(s0) # original x_bf16
jal ra, exp_cmp16
bnez a0, 11f
li s2, 0
11:
# next
addi s0, s0, 4
addi s1, s1, 4
addi t6, t6, 1
j L_sc_loop
L_sc_done:
mv a0, s2
lw s0, 0(sp)
lw ra, 4(sp)
addi sp, sp, 8
ret
```
:::
### Evaluation result

### Performance evaluation

## Assignment use case: CLZ
[Leetcode 201. Bitwise AND of Numbers Range](https://leetcode.com/problems/bitwise-and-of-numbers-range/)
|Baseline|__builtin_clz|
|--|--|
|||
Since we already examined the performance of **__builtin_clz** previously, we adopt that solution here to evaluate the improvement in both runtime and memory.
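As a sketch of how CLZ applies to this problem (assuming the LeetCode signature and the problem's constraint 0 <= left <= right): the AND of every integer in [left, right] keeps only the common high-order prefix of the two endpoints, and `__builtin_clz` of `left ^ right` locates where that prefix ends.
``` c=
#include <stdint.h>

int rangeBitwiseAnd(int left, int right)
{
    if (left == right)
        return left;
    /* Highest differing bit: every bit at or below it flips somewhere
     * inside [left, right], so it cannot survive the AND. */
    unsigned shift = 32u - (unsigned)__builtin_clz((unsigned)(left ^ right));
    return left & right & (int)~((1u << shift) - 1u);
}
```
For example, left = 5 (0b101) and right = 7 (0b111) differ first at bit 1, so the mask keeps only bit 2 and the result is 4.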