# Assignment 1: RISC-V Assembly and Instruction Pipeline
contributed by < [kuohanwen](https://github.com/kuohanwen/ca2025-quizzes) >
[Homework1 Requirment](https://hackmd.io/@sysprog/2025-arch-homework1)
## Problem B
Refer to [Quiz1 of Computer Architecture (2025 Fall) Problem B](https://hackmd.io/@sysprog/arch2025-quiz1-sol)
### Description of Problem
Use an 8-bit `uf8` format (4-bit exponent + 4-bit mantissa) to approximately represent 20-bit unsigned integers, satisfying the relative error is **≤ 1/16**.
---
### C Code of Decoding (8bits → 32bits)
```c=
// Decode an 8-bit uf8 code [eeee mmmm] into a 32-bit unsigned integer.
// The decoded value = (m << e) + 16 * (2^e − 1)
uint32_t uf8_decode(uf8 fl)
{
uint32_t mantissa = fl & 0x0f;
// Extract the 4-bit mantissa (bits [3:0]).
uint8_t exponent = fl >> 4;
// Extract the 4-bit exponent (bits [7:4]).
uint32_t offset = (0x7FFF >> (15 - exponent)) << 4;
// Compute offset(e) = 16 * (2^15 − 1)* (2^-(15-e)) ~= 16 * (2^e − 1)
return (mantissa << exponent) + offset;
// Final decoded integer: mantissa * 2^e + offset(e)
}
```
Decoding Method
$$
D(b) = m \cdot 2^e + (2^e - 1) \cdot 16
$$
Input:8-bit uf8 value b
Output:20-bit unsigned integer D(b)
### C Code of Encoding (32bits → 8bits)
```c=
// Encode a non-negative 32-bit integer into an 8-bit uf8 code [eeee mmmm]
uf8 uf8_encode(uint32_t value)
{
// Small values (0..15) are exact in uf8 (e=0) can return directly
if (value < 16)
return value;
/* Fast exponent e hint using CLZ:
lz = leading zeros in value ; msb = 31 - lz~= floor(log2(value)) */
int lz = clz(value);
int msb = 31 - lz;
/* Give a initial guess
overflow will hold offset(e) = 16*(2^e - 1). */
uint8_t exponent = 0;
uint32_t overflow = 0;
/* for values ≥ 32
the number is large enough that using the log₂-based approximation
becomes worthwhile
16~31 start with exponent = 0 and overflow = 0 */
if (msb >= 5) {
// Empirical initial guess: e ≈ msb - 4
exponent = msb - 4;
// e=0~15
if (exponent > 15)
exponent = 15;
/* Compute offset(e) iteratively:
16*(2^e - 1)=16*(2^(e - 1) - 1) * 2 + 16
offset(e) = offset(e-1)*2 + 16*/
for (uint8_t e = 0; e < exponent; e++)
overflow = (overflow << 1) + 16;
/* If the estimate overshot (value < offset(e)),
step e down while undoing the recurrence:
offset(e-1) = (offset(e) - 16)/2
result will be (value > offset(e))*/
while (exponent > 0 && value < overflow) {
overflow = (overflow - 16) >> 1;
exponent--;
}
}
/* Move e upward until value < offset(e+1)
solve the condition that value >... >offset(e+1)> offset(e)
16~31 will be revised to exponent = 1 and overflow = 16
Then result will be offset(e) ≤ value < offset(e+1)*/
while (exponent < 15) {
uint32_t next_overflow = (overflow << 1) + 16;
if (value < next_overflow)
break;
overflow = next_overflow;
exponent++;
}
// m = floor((value - offset(e)) / 2^e)
uint8_t mantissa = (value - overflow) >> exponent;
// b = [eeee mmmm]
return (exponent << 4) | mantissa;
}
```
Encoding Method
$$
E(v) = \begin{cases}
v, & \text{if } v < 16 \\
16e + \lfloor(v - \text{offset}(e))/2^e\rfloor, & \text{otherwise}
\end{cases}
$$
where $$\text{offset}(e) = (2^e - 1) \cdot 16$$
Input:20-bit unsigned integer v
Output:8-bit uf8 value E(v)
### Full RISC-V Assembly Code
```c=
.data
# offset(e) = 16*(2^e - 1), e=0..15
offset_tbl:
.word 0,16,48,112,240,496,1008,2032
.word 4080,8176,16368,32752,65520,131056,262128,524272
msg1: .asciz ": decodes to "
msg2: .asciz " but re-encodes as "
msg3: .asciz ": decoded is "
msg4: .asciz " <= previous value "
msg5: .asciz "All tests passed.\n"
msg6: .asciz "At least one test failed.\n"
newline: .asciz "\n"
.align 2
.text
.globl main
main:
jal ra, test # start test
beq a0, x0, Not_pass # failed
la a0, msg5 # print msg5 when passing
li a7, 4
ecall
li a7, 10 # ecall: exit
li a0, 0 # exit code 0 (success)
ecall
Not_pass:
la a0, msg6 # print msg6 when not passing
li a7, 4
ecall
li a7, 10 # ecall: exit
li a0, 1 # exit code 1 (failure)
ecall
test:
addi sp, sp, -32
sw ra, 28(sp)
sw s0, 24(sp)
sw s1, 20(sp)
sw s2, 16(sp)
sw s3, 12(sp)
sw s4, 8(sp)
sw s5, 4(sp)
addi s0, x0, -1 # previous_value
li s1, 1 # passed = true
li s2, 0 # fl (0..255)
li s3, 256 # loop end
For_2:
add a0, s2, x0 # a0 = fl
jal ra, uf8_decode
add s4, a0, x0 # value
add a0, s4, x0 # a0 = value
jal ra, uf8_encode
add s5, a0, x0 # fl2
# (A) round-trip check
test_if_1:
beq s2, s5, test_if_2
add a0, s2, x0 # print fl (hex)
li a7, 34
ecall
la a0, msg1
li a7, 4
ecall
add a0, s4, x0 # print value (decimal)
li a7, 1
ecall
la a0, msg2
li a7, 4
ecall
add a0, s5, x0 # print fl2 (hex)
li a7, 34
ecall
la a0, newline
li a7, 4
ecall
li s1, 0 # passed = false
# (B) strict monotonic increase
test_if_2:
blt s0, s4, after_if
add a0, s2, x0
li a7, 34
ecall
la a0, msg3
li a7, 4
ecall
add a0, s4, x0
li a7, 1
ecall
la a0, msg4
li a7, 4
ecall
add a0, s0, x0
li a7, 34
ecall
la a0, newline
li a7, 4
ecall
li s1, 0
after_if:
add s0, s4, x0
addi s2, s2, 1
blt s2, s3, For_2
add a0, s1, x0 # return passed (1/0)
lw ra, 28(sp)
lw s0, 24(sp)
lw s1, 20(sp)
lw s2, 16(sp)
lw s3, 12(sp)
lw s4, 8(sp)
lw s5, 4(sp)
addi sp, sp, 32
jr ra
#CLZ (fixed 16/8/4/2/1; clz(0)=32)
CLZ:
beq a0, x0, CLZ_ZERO
li t0, 0 # bitlen-1 accumulator
srli t1, a0, 16
beq t1, x0, CLZ_1
addi t0, t0, 16
add a0, t1, x0
CLZ_1:
srli t1, a0, 8
beq t1, x0, CLZ_2
addi t0, t0, 8
add a0, t1, x0
CLZ_2:
srli t1, a0, 4
beq t1, x0, CLZ_3
addi t0, t0, 4
add a0, t1, x0
CLZ_3:
srli t1, a0, 2
beq t1, x0, CLZ_4
addi t0, t0, 2
add a0, t1, x0
CLZ_4:
srli t1, a0, 1
beq t1, x0, CLZ_5
addi t0, t0, 1
CLZ_5:
li t1, 31
sub a0, t1, t0 # clz = 31 - (bitlen-1)
jr ra
CLZ_ZERO:
li a0, 32
jr ra
# uf8_decode
# a0 = uf8 code -> a0 = decoded integer
uf8_decode:
andi t0, a0, 0x0F # m
srli t1, a0, 4 # e
la t2, offset_tbl
slli t3, t1, 2 # index = e * 4 (word offset)
add t2, t2, t3
lw t2, 0(t2) # t2 = offset(e)
sll t0, t0, t1 # m << e
add a0, t0, t2 # value = offset + (m << e)
jr ra
# uf8_encode
# a0 = value -> a0 = uf8 code
uf8_encode:
addi sp, sp, -4
sw ra, 0(sp)
add t6, a0, x0 # t6 = value
# Small values: exact (e=0)
li t0, 16
blt t6, t0, UF8ENC_RET # a0 already holds value
# e = floor_log2( (value + 16) >> 4 ) = 31 - clz(t)
addi t1, t6, 16
srli t1, t1, 4
add a0, t1, x0
jal ra, CLZ
li t2, 31
sub t2, t2, a0 # t2 = e
# clamp e to 15
li t0, 15
blt t2, t0, 1f
li t2, 15
1:
li t3, 1 # t3 = pow2
add t1, t2, x0 # t1 = e (loop counter)
2:
beq t1, x0, 3f
slli t3, t3, 1 # pow2 <<= 1
addi t1, t1, -1
j 2b
3:
# off = ((1 << e) - 1) << 4
addi t3, t3, -1
slli t3, t3, 4 # t3 = off
# m = (value - off) >> e (single shift; then saturate)
sub t4, t6, t3
srl t4, t4, t2
li t0, 15
blt t4, t0, 4f
li t4, 15
4:
slli t5, t2, 4
or a0, t5, t4 # a0 = (e<<4) | m
UF8ENC_RET:
lw ra, 0(sp)
addi sp, sp, 4
jr ra
```
### Explain of RISC-V Assembly Code
##### <span style="background-color:#cce5ff; color:#004085; font-weight:bold; padding:2px 6px; border-radius:4px;">main</span>
main is the program entry that orchestrates the run: it jumps to test, which returns a pass/fail flag in a0 (1 = pass, 0 = fail). After the call, main inspects a0; if it is non-zero, it prints msg5 (“All tests passed.\n”) using the RARS/Ripes print-string syscall (place the string address in a0, set a7=4, then ecall) and terminates with the exit syscall (a7=10) using exit code 0. If a0 is zero, it prints msg6 (“At least one test failed.\n”) in the same way and exits with code 1. Throughout, a0 serves as both the test result and the syscall argument register, per the calling convention.
##### <span style="background-color:#cce5ff; color:#004085; font-weight:bold; padding:2px 6px; border-radius:4px;">Test</span>
test verifies the uf8 codec end-to-end over all 256 byte codes. It first creates a 32-byte stack frame and saves ra and the callee-saved registers it will use. It then initializes its state: s0 = -1 (previous decoded value), s1 = 1 (passed flag), s2 = 0 (current uf8 code, 0…255), and s3 = 256 (loop limit). For each fl = s2 it calls uf8_decode to get value (s4), then calls uf8_encode(value) to get fl2 (s5). Two checks are performed:
(A)<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Round-trip</span>
if fl2 != fl, it prints a diagnostic line showing fl (hex), the decoded value (decimal), and the re-encoded fl2 (hex), and clears s1 to 0.
(B)<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Strict monotonicity</span>
if value <= previous_value (s0), it prints a diagnostic line fl: value <=' previous_value (with fl and previous_value in hex) and clears s1.
After the checks it updates previous_value = value, increments fl, and repeats until fl == 256. At exit it returns the pass flag in a0 (a0 = s1, 1 = all tests passed, 0 = at least one failure), restores ra and s0..s5, pops the stack frame, and jr ra.
##### <span style="background-color:#cce5ff; color:#004085; font-weight:bold; padding:2px 6px; border-radius:4px;">CLZ</span>— Count Leading Zeros
1.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Purpose</span>
Returns the number of consecutive zero bits before the most-significant 1 in a 32-bit unsigned integer. By convention clz(0) = 32. The routine is used by uf8_encode to compute
2.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Inputs / Outputs</span>
Input: a0 — 32-bit unsigned value.
Output: a0 — integer in [0,32], the count of leading zeros.
Registers: t0 (accumulator for “bitlen−1”), t1 (temporary shifts). ra is not modified (leaf function).
3.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Algorithm</span>
(1)If a0 == 0, return 32 immediately.
(2)Initialize t0 = 0 (will accumulate the index of the highest set bit).
(3)Probe blocks of bits from MSB to LSB with fixed right shifts:
Test upper 16 bits: t1 = a0 >> 16.
If non-zero, the MSB lies in that half → add 16 to t0, replace a0 = t1.
Else, keep a0 as is (MSB lies in the lower half).
Repeat with shifts 8, 4, 2, 1 using the same rule, each time adding the block size to t0 if the probed part is non-zero and narrowing a0 to that sub-range.
(4)At the end, t0 equals (bitlen − 1), so return 31 - t0 in a0.
4.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Benefit</span>
(1)RV32I-only: uses srli, beq, addi, add, li, sub, jr—no M/F/D extensions.
(2)Constant time: always performs at most 5 probes; no data-dependent loops → predictable timing.
(3)Efficient: far fewer branches than naive 32-iteration scans, and avoids multiplication/division.
(4)Time Complexity:O(1) — exactly five conditional tests after the zero check.
5.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Edge cases & assumptions</span>
(1)Input is treated as unsigned; if callers pass a signed value, it is interpreted bitwise (two’s complement).
(2)clz(0) is explicitly defined as 32 to avoid a special “no MSB” case.
(3)Because it’s a leaf routine, it does not touch the stack or ra.
6.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Example</span>
a0 = 0x0010_0000
a0 != 0 → do not return 32.
Initialize t0 = 0 (will accumulate the index of the highest set bit).
Probe from MSB to LSB with fixed right shifts:
Shift 16: t1 = a0 >> 16 = 0x0000_0010 (non-zero).
⇒ MSB is in this half → t0 += 16 → t0 = 16; set a0 = t1 = 0x10.
Shift 8: t1 = a0 >> 8 = 0x0000_0000 (zero).
⇒ Stay in lower half → keep a0 = 0x10; t0 unchanged (16).
Shift 4: t1 = a0 >> 4 = 0x0000_0001 (non-zero).
⇒ Add 4 → t0 = 20; set a0 = t1 = 0x1.
Shift 2: t1 = a0 >> 2 = 0x0000_0000 (zero).
⇒ Keep a0 = 0x1; t0 unchanged (20).
Shift 1: t1 = a0 >> 1 = 0x0000_0000 (zero).
⇒ Keep a0 = 0x1; t0 unchanged (20).
Finish: t0 = 20 (bitlen − 1), so return a0 = 31 − t0 = 31 − 20 = 11.
Result: clz(0x0010_0000) = 11 (there are 11 leading zeros in the 32-bit representation).
##### <span style="background-color:#cce5ff; color:#004085; font-weight:bold; padding:2px 6px; border-radius:4px;">uf8_decode</span>
1.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Purpose</span>
Convert an 8-bit uf8 code [eeee|mmmm] (4-bit exponent e, 4-bit mantissa m) into its 32-bit unsigned representative.
2.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">How it works</span>
Extract e = b >> 4 and m = b & 0x0F.
Load the interval start from a lookup table: off = offset_tbl[e] where offset_tbl[e] = 16*(2^e − 1).
Return offset[e] + (m << e); i.e.,
$$
D(b) = m \cdot 2^e + (2^e - 1) \cdot 16
$$
3.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Input / Output</span>
Input: b (0–255) — an 8-bit uf8 code.
Output: 32-bit unsigned integer D(b).
4.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Why the lookup table helps</span>
(1)Fixed and tiny cost (O(1))
offset(e) has only 16 entries (e = 0..15), so the table is 16 × 4B = 64 bytes. One indexed load + one add + one shift finishes the job. Latency is stable with no data-dependent loops.
(2)Stable performance
Compared with recomputing offset(e) as ((1<<e) - 1) << 4 or (0x7FFF >> (15 - e)) << 4, the lookup avoids extra “variable shifts + sub/bit-twiddling” sequences. On embedded/pipelined cores this means fewer instructions, shorter dependency chains, and tighter timing dispersion—great for real-time systems.
(3)Avoids overflow and boundary pitfalls
Computing ((1<<e) - 1) << 4 at runtime can hit shift-width limits or immediate encoding quirks when porting to different word sizes or variants. With a lookup, all boundaries are pre-fixed in the table, so you don’t encounter those runtime edge cases.
5.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Example</span>
b = 0x1A → e=1, m=10, offset(1)=16, m<<e=20 → result 36.
##### <span style="background-color:#cce5ff; color:#004085; font-weight:bold; padding:2px 6px; border-radius:4px;">uf8_encode</span>
1.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Objective</span>
Encode a 20-bit unsigned integer value into an 8-bit uf8 code [eeee|mmmm]
2.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">How it works</span>
(1)Small values passthrough: if value < 16, return value exactly (e=0).
(2)Find the exponent: compute t = (value + 16) >> 4, then e = 31 − clz(t); clamp e to 15.
(3)Build the interval start: construct pow2 = (1 << e) via a tiny loop, then off = (pow2 − 1) << 4.
(4)Compute the mantissa: m = (value − off) >> e, saturate to 15 if needed.
(5)Pack and return: (e << 4) | m in a0.
3.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Benefit</span>
(1)Monotonic and round-trip consistent: codes increase strictly with value; for any code b, encode(decode(b)) == b
(2)predictable error: 0..15 are exact; beyond that the step is 2^e, giving a worst-case relative error ≤ 1/16 (6.25%).
(3)Robust with saturation: clamping e and m to 15 ensures every input maps to a valid 8-bit code,there is no undefined behavior.
4.<span style="background-color:#d4edda; color:#155724; font-weight:bold; padding:2px 6px; border-radius:4px;">Example</span>
Input: value = 36
Compute t = (value + 16) >> 4 = (36 + 16) >> 4 = 52 >> 4 = 3.
Exponent via CLZ: clz(3) = 30 ⇒ e = 31 − 30 = 1 (clamp not needed).
Build pow2 = (1 << e) via the small loop: start at 1, shift once ⇒ pow2 = 2.
Interval start: off = (pow2 − 1) << 4 = (2 − 1) << 4 = 1 << 4 = 16.
Mantissa: m = (value − off) >> e = (36 − 16) >> 1 = 20 >> 1 = 10.
Pack: uf8 = (e << 4) | m = (1 << 4) | 10 = 0x10 | 0x0A = 0x1A.
### Experiment and Result
##### <span style="background-color:#cce5ff; color:#004085; font-weight:bold; padding:2px 6px; border-radius:4px;">Improved Approach</span>
Cycle count=35402
CPI=1.46


##### <span style="background-color:#cce5ff; color:#004085; font-weight:bold; padding:2px 6px; border-radius:4px;">Original Approach</span>
Cycle count=41802
CPI=1.49


### Improvement
By using `lookup table approach`,cycles reduced about (41802-35402)/41802=15% and instructions reduced about (28062-24222)/28062=13%
## Problem C
Refer to [Quiz1 of Computer Architecture (2025 Fall) Problem C](https://hackmd.io/@sysprog/arch2025-quiz1-sol)
### Description of Problem
Implement addition, subtraction, multiplication, division and square root
bfloat16 format
```
┌─────────┬──────────────┬──────────────┐
│Sign (1) │ Exponent (8) │ Mantissa (7) │
└─────────┴──────────────┴──────────────┘
15 14 7 6 0
S: Sign bit (0 = positive, 1 = negative)
E: Exponent bits (8 bits, bias = 127)
M: Mantissa/fraction bits (7 bits)
```
For regular numbers that are not special cases (like zero, infinity, NaN or Denormals), their value is given by the formula:
$$
v = (-1)^S \times 2^{E-127} \times \left(1 + \frac{M}{128}\right)
$$
bfloat16 preserves the wide numeric range of float32 by keeping the same 8-bit exponent, but it reduces precision because its mantissa has only 7 bits instead of 23.
### Special Cases
:::spoiler Assembly Code
```c=
.text
.globl bf16_isnan
bf16_isnan:
# a0: bf16 bits, return 1 if NaN, else 0
li t0, 0x7FFF # clear sign
and t1, a0, t0 # t1 = |x|
li t2, 0x7F80 # |Inf|
sltu a0, t2, t1 # a0 = (|x| > |Inf|)
ret
.globl bf16_isinf
bf16_isinf:
# a0: bf16 bits, return 1 if ±Inf, else 0
li t0, 0x7FFF
and t1, a0, t0 # t1 = |x|
li t2, 0x7F80 # |Inf|
xor t1, t1, t2
seqz a0, t1 # a0 = 1 if equal
ret
.globl bf16_iszero
bf16_iszero:
# a0: bf16 bits, return 1 if ±0, else 0
li t0, 0x7FFF
and t1, a0, t0 # drop sign
seqz a0, t1
ret
.globl f32_to_bf16
f32_to_bf16:
# a0: f32 bits, return bf16 bits in a0
srli t0, a0, 23
andi t0, t0, 0xFF # exponent
addi t0, t0, -255 # exponent - 0xFF
bnez t0, f32_unspecial
# exponent == 0xFF: Inf or NaN, just truncate
srli a0, a0, 16
ret
f32_unspecial:
# round to nearest even when truncating to bf16
srli t0, a0, 16
andi t0, t0, 1 # bit16 (LSB of discarded part)
li t1, 0x7FFF
add t0, t0, t1 # 0x7FFF or 0x8000
add a0, a0, t0
srli a0, a0, 16
ret
.globl bf16_to_f32
bf16_to_f32:
# a0: bf16 bits, return f32 bits
slli a0, a0, 16
ret
.globl BF16_NAN
BF16_NAN:
# return quiet NaN (bf16)
li a0, 0x7FC0
ret
.globl BF16_ZERO
BF16_ZERO:
# return +0 (bf16)
li a0, 0x0000
ret
```
:::
### Implementation Addition
#### Test Case
| Test # | Operation | Input A | Input B | BF16 A | BF16 B | Expected Output C | BF16 C |
|--------|-----------|---------|---------|--------|--------|--------|--------|
| 1 | Add | 1.0 | 2.0 | 0x3F80 | 0x4000 | 3.0 | 0x4040 |
| 2 | Add | -1.5 | 0.5 | 0xBFC0 | 0x3F00 | -1.0 | 0xBF80 |
| 3 | Add | -Inf | 5.0 | 0xFF80 | 0x40A0 | -Inf | 0xFF80 |
:::spoiler Assembly Code
```c=
.data
newline: .string "\n"
pass_msg: .asciz "Test Passed\n"
fail_msg: .asciz "Test Failed\n"
.text
.globl main
main:
addi sp, sp, -4
sw ra, 0(sp)
test1:
# Test 1.0 + 2.0 = 3.0
li a0, 0x3F80 # Input A = 1.0
li a1, 0x4000 # Input B = 2.0
jal ra, bf16_add # Output C = A + B
li t1, 0x4040 # Expected Output C = 3.0
bne a0, t1, test1_fail
jal ra, print_pass
j test2
test1_fail:
jal ra, print_fail
j test2
test2:
# Test -1.5 + 0.5 = -1.0
li a0, 0xBFC0 # Input A = -1.5
li a1, 0x3F00 # Input B = 0.5
jal ra, bf16_add # Output C = A + B
li t1, 0xBF80 # Expected Output C = -1.0
bne a0, t1, test2_fail
jal ra, print_pass
j test3
test2_fail:
jal ra, print_fail
j test3
test3:
# Test -Inf + 5.0 = -Inf
li a0, 0xFF80 # Input A = -Inf
li a1, 0x40A0 # Input B = 5.0
jal ra, bf16_add # Output C = A + B
li t1, 0xFF80 # Expected Output C = -Inf
bne a0, t1, test3_fail
jal ra, print_pass
j tests_done
test3_fail:
jal ra, print_fail
j tests_done
print_pass:
la a0, pass_msg
li a7, 4 # syscall: print string
ecall
jr ra
print_fail:
la a0, fail_msg
li a7, 4 # syscall: print string
ecall
jr ra
tests_done:
lw ra, 0(sp)
addi sp, sp, 4
li a7, 10 # syscall: exit
ecall
.globl bf16_add
bf16_add:
# extract sign, exponent, mantissa
srli t0, a0, 15 # t0 = sign_a (bit 15)
srli t1, a1, 15 # t1 = sign_b
srli t2, a0, 7
andi t2, t2, 0xFF # t2 = exp_a (8 bits)
srli t3, a1, 7
andi t3, t3, 0xFF # t3 = exp_b
andi t4, a0, 0x7F # t4 = mant_a (7 bits)
andi t5, a1, 0x7F # t5 = mant_b (7 bits)
li t6, 0xFF
bne t2, t6, check_exp_b
exp_a_checkall:
bnez t4, ret_a # mant_a != 0 → a is NaN, return a
bne t3, t6, ret_a # a is Inf, b is finite → return a
bnez t5, return_b1 # b mantissa != 0 → b is NaN
bne t0, t1, return_nan # +Inf + -Inf → NaN
return_b1:
mv a0, a1 # b is NaN or same-sign Inf
ret
return_nan:
li a0, 0x7FC0 # canonical NaN
ret_a:
ret
check_exp_b:
beq t3, t6, return_b2
j check_0_a
return_b2:
mv a0, a1 # b is NaN or Inf
ret
check_0_a:
bnez t2, check_0_b # exp_a != 0 → not zero
bnez t4, check_0_b # mant_a != 0 → not zero
mv a0, a1 # a is ±0 → result = b
ret
check_0_b:
bnez t3, norm_a
bnez t5, norm_a
ret # b is ±0 → result = a (a0)
norm_a:
beqz t2, norm_b # exp_a == 0 → subnormal
ori t4, t4, 0x80 # mant_a |= 1 << 7 (restore hidden bit)
norm_b:
beqz t3, end_check1
ori t5, t5, 0x80 # mant_b |= 1 << 7 (restore hidden bit)
end_check1:
addi sp, sp, -20
sw s0, 16(sp)
sw s1, 12(sp)
sw s2, 8(sp)
sw s3, 4(sp)
sw s4, 0(sp)
sub s0, t2, t3 # s0 = exp_diff = exp_a - exp_b
blez s0, diff_neg # exp_a <= exp_b
mv s2, t2 # result_exp = exp_a
li t6, 8
bgt s0, t6, return_a # if exp_diff > 8 → B too small
srl t5, t5, s0 # shift mant_b
j exp_done
diff_neg:
bgez s0, diff_else # exp_diff == 0
mv s2, t3 # result_exp = exp_b
li t6, -8
bge s0, t6, shift_a # if exp_diff >= -8 → shift A
j return_b3 # else A too small → result ≈ B
shift_a:
neg s4, s0 # s4 = -exp_diff
srl t4, t4, s4 # shift mant_a
j exp_done
diff_else: # exp_diff == 0
mv s2, t2
j exp_done
return_a:
# a0 is already A → return A directly
j bf16_epilogue
return_b3:
# result = B
mv a0, a1
j bf16_epilogue
exp_done:
bne t0, t1, diff_sign # sign differ → subtraction
same_sign:
mv s1, t0 # result_sign
add s3, t4, t5 # result_mant = mant_a + mant_b
andi t6, s3, 0x100 # overflow into bit 8?
beqz t6, norm_end
srli s3, s3, 1 # shift mantissa right
addi s2, s2, 1 # exponent++
li t6, 0xFF
bge s2, t6, overflow_inf
j norm_end
overflow_inf:
# a0 = ±Inf
slli a0, s1, 15 # sign
li t6, 0x7F80 # Inf exponent (exp=0xFF, mant=0)
or a0, a0, t6
j bf16_epilogue
diff_sign:
bge t4, t5, manta_gt_b
mv s1, t1 # result_sign = sign_b
sub s3, t5, t4 # mant_b - mant_a
j mant_result
manta_gt_b:
mv s1, t0 # result_sign = sign_a
sub s3, t4, t5 # mant_a - mant_b
mant_result:
beqz s3, return_zero # exact zero
norm_loop:
andi t6, s3, 0x80
bnez t6, norm_end
slli s3, s3, 1
addi s2, s2, -1
blez s2, return_zero
j norm_loop
norm_end:
# reconstruct BF16: sign | exponent | mantissa
slli a0, s1, 15 # sign
andi t0, s2, 0xFF
slli t0, t0, 7 # exponent
or a0, a0, t0
andi t0, s3, 0x7F # mantissa
or a0, a0, t0
j bf16_epilogue
return_zero:
li a0, 0x0000 # +0
bf16_epilogue:
lw s0, 16(sp)
lw s1, 12(sp)
lw s2, 8(sp)
lw s3, 4(sp)
lw s4, 0(sp)
addi sp, sp, 20
ret
```
:::
#### Result

### Implementation Subtraction
#### Test Case
| Test # | Operation | Input A | Input B | BF16 A | BF16 B | Expected Output C | BF16 C |
|--------|-----------|---------|---------|--------|--------|--------|--------|
| 1 | Sub | 3.0 | 1.0 | 0x4040 | 0x3F80 | 2.0 | 0x4000 |
| 2 | Sub | 1.5 | 2.0 | 0x3FC0 | 0x4000 | -0.5 | 0xBF00 |
| 3 | Sub | +Inf | 2.5 | 0x7F80 | 0x4020 | +Inf | 0x7F80 |
:::spoiler Assembly Code
```c=
.data
newline: .string "\n"
pass_msg: .asciz "Test Passed\n"
fail_msg: .asciz "Test Failed\n"
.text
.globl main
main:
addi sp, sp, -4
sw ra, 0(sp)
test1:
# Test 3.0 - 1.0 = 2.0
li a0, 0x4040 # Input A = 3.0
li a1, 0x3F80 # Input B = 1.0
jal ra, bf16_sub # Output C = A - B
li t1, 0x4000 # Expected Output C = 2.0
bne a0, t1, test1_fail
jal ra, print_pass
j test2
test1_fail:
jal ra, print_fail
j test2
test2:
# Test 1.5 - 2.0 = -0.5
li a0, 0x3FC0 # Input A = 1.5
li a1, 0x4000 # Input B = 2.0
jal ra, bf16_sub # Output C = A - B
li t1, 0xBF00 # Expected Output C = -0.5
bne a0, t1, test2_fail
jal ra, print_pass
j test3
test2_fail:
jal ra, print_fail
j test3
test3:
# Test +Inf - 2.5 = +Inf
li a0, 0x7F80 # Input A = +Inf
li a1, 0x4020 # Input B = 2.5
jal ra, bf16_sub # Output C = A - B
li t1, 0x7F80 # Expected Output C = +Inf
bne a0, t1, test3_fail
jal ra, print_pass
j tests_done
test3_fail:
jal ra, print_fail
j tests_done
print_pass:
la a0, pass_msg
li a7, 4 # syscall: print string
ecall
jr ra
print_fail:
la a0, fail_msg
li a7, 4 # syscall: print string
ecall
jr ra
tests_done:
lw ra, 0(sp)
addi sp, sp, 4
li a7, 10 # syscall: exit
ecall
# BF16 subtraction wrapper: C = A - B
# Implemented as: A + (-B), reusing bf16_add
.globl bf16_sub
bf16_sub:
li t6, 0x8000 # mask for sign bit (bit 15)
xor a1, a1, t6 # flip sign of B → B = -B
j bf16_add # tail call bf16_add (A + (-B))
# Original BF16 addition: C = A + B
.globl bf16_add
bf16_add:
# extract sign, exponent, mantissa
srli t0, a0, 15 # t0 = sign_a (bit 15)
srli t1, a1, 15 # t1 = sign_b
srli t2, a0, 7
andi t2, t2, 0xFF # t2 = exp_a (8 bits)
srli t3, a1, 7
andi t3, t3, 0xFF # t3 = exp_b
andi t4, a0, 0x7F # t4 = mant_a (7 bits)
andi t5, a1, 0x7F # t5 = mant_b (7 bits)
li t6, 0xFF
bne t2, t6, check_exp_b
exp_a_checkall:
bnez t4, ret_a # mant_a != 0 → a is NaN, return a
bne t3, t6, ret_a # a is Inf, b is finite → return a
bnez t5, return_b1 # b mantissa != 0 → b is NaN
bne t0, t1, return_nan # +Inf + -Inf → NaN
return_b1:
mv a0, a1 # b is NaN or same-sign Inf
ret
return_nan:
li a0, 0x7FC0 # canonical NaN
ret_a:
ret
check_exp_b:
beq t3, t6, return_b2
j check_0_a
return_b2:
mv a0, a1 # b is NaN or Inf
ret
check_0_a:
bnez t2, check_0_b # exp_a != 0 → not zero
bnez t4, check_0_b # mant_a != 0 → not zero
mv a0, a1 # a is ±0 → result = b
ret
check_0_b:
bnez t3, norm_a
bnez t5, norm_a
ret # b is ±0 → result = a (a0)
norm_a:
beqz t2, norm_b # exp_a == 0 → subnormal
ori t4, t4, 0x80 # mant_a |= 1 << 7
norm_b:
beqz t3, end_check1
ori t5, t5, 0x80
end_check1:
addi sp, sp, -20
sw s0, 16(sp)
sw s1, 12(sp)
sw s2, 8(sp)
sw s3, 4(sp)
sw s4, 0(sp)
sub s0, t2, t3 # s0 = exp_diff = exp_a - exp_b
blez s0, diff_neg # exp_a <= exp_b
mv s2, t2 # result_exp = exp_a
li t6, 8
bgt s0, t6, return_a # if exp_diff > 8 → B too small
srl t5, t5, s0 # shift mant_b
j exp_done
diff_neg:
bgez s0, diff_else # exp_diff == 0
mv s2, t3 # result_exp = exp_b
li t6, -8
bge s0, t6, shift_a # if exp_diff >= -8 → shift A
j return_b3 # else A too small → result ≈ B
shift_a:
neg s4, s0 # s4 = -exp_diff
srl t4, t4, s4 # shift mant_a
j exp_done
diff_else: # exp_diff == 0
mv s2, t2
j exp_done
return_a:
# a0 is already A → return A directly
j bf16_epilogue
return_b3:
# result = B
mv a0, a1
j bf16_epilogue
exp_done:
bne t0, t1, diff_sign # sign differ → subtraction
same_sign:
mv s1, t0 # result_sign
add s3, t4, t5 # result_mant
andi t6, s3, 0x100 # overflow into bit 8?
beqz t6, norm_end
srli s3, s3, 1 # shift mantissa
addi s2, s2, 1 # exponent++
li t6, 0xFF
bge s2, t6, overflow_inf
j norm_end
overflow_inf:
# a0 = ±Inf
slli a0, s1, 15 # sign
li t6, 0x7F80 # Inf exponent (exp=0xFF, mant=0)
or a0, a0, t6
j bf16_epilogue
diff_sign:
bge t4, t5, manta_gt_b
mv s1, t1 # result_sign = sign_b
sub s3, t5, t4 # mant_b - mant_a
j mant_result
manta_gt_b:
mv s1, t0 # result_sign = sign_a
sub s3, t4, t5 # mant_a - mant_b
mant_result:
beqz s3, return_zero # exact zero
norm_loop:
andi t6, s3, 0x80
bnez t6, norm_end
slli s3, s3, 1
addi s2, s2, -1
blez s2, return_zero
j norm_loop
norm_end:
# reconstruct BF16: sign | exponent | mantissa
slli a0, s1, 15 # sign
andi t0, s2, 0xFF
slli t0, t0, 7 # exponent
or a0, a0, t0
andi t0, s3, 0x7F # mantissa
or a0, a0, t0
j bf16_epilogue
return_zero:
li a0, 0x0000 # +0
bf16_epilogue:
lw s0, 16(sp)
lw s1, 12(sp)
lw s2, 8(sp)
lw s3, 4(sp)
lw s4, 0(sp)
addi sp, sp, 20
ret
```
:::
#### Result

### Implementation Multiplication
#### Test Case
| Test # | Operation | Input A | Input B | BF16 A | BF16 B | Expected Output C | BF16 C |
|--------|-----------|---------|---------|--------|--------|--------|--------|
| 1 | Mul | 3.0 | 2.5 | 0x4040 | 0x4020 | 7.5 | 0x40F0 |
| 2 | Mul | 0.0 | +Inf | 0x0000 | 0x7F80 | NAN | 0x7FC0 |
| 3 | Mul | 2.0 | -1.5 | 0x4000 | 0xBFC0 | -3.0 | 0xC040 |
:::spoiler Assembly Code
```c=
.globl main
.data
pass_msg: .asciz "PASS\n"
fail_msg: .asciz "FAIL\n"
.text
main:
addi sp, sp, -4
sw ra, 0(sp)
# Test 1: 3.0 × 2.5 = 7.5
test1:
li a0, 0x4040 # Input A = 3.0
li a1, 0x4020 # Input B = 2.5
jal ra, bf16_mul # Output C = A * B
li t1, 0x40F0 # Expected Output C = 7.5
bne a0, t1, fail1
jal ra, print_pass
j test2
fail1:
jal ra, print_fail
# Test 2: 0.0 × Inf = NaN
test2:
li a0, 0x0000 # Input A = 0.0
li a1, 0x7F80 # Input B = +Inf
jal ra, bf16_mul # Output C = A * B
li t1, 0x7FC0 # Expected Output C = NAN
bne a0, t1, fail2
jal ra, print_pass
j test3
fail2:
jal ra, print_fail
# Test 3: 2.0 × -1.5 = -3.0
test3:
li a0, 0x4000 # Input A = 2.0
li a1, 0xBFC0 # Input B = -1.5
jal ra, bf16_mul # Output C = A * B
li t1, 0xC040 # Expected Output C = -3.0
bne a0, t1, fail3
jal ra, print_pass
j done
fail3:
jal ra, print_fail
print_pass:
la a0, pass_msg
li a7, 4
ecall
ret
print_fail:
la a0, fail_msg
li a7, 4
ecall
ret
done:
lw ra, 0(sp)
addi sp, sp, 4
li a7, 10
ecall
.globl bf16_mul
bf16_mul:
addi sp, sp, -16
sw s0, 0(sp)
sw s1, 4(sp)
sw s2, 8(sp)
sw ra, 12(sp)
# Extract sign, exponent, mantissa
srli t0, a0, 15
andi t0, t0, 1
srli t1, a1, 15
andi t1, t1, 1
xor s0, t0, t1 # result sign
srli t2, a0, 7 # exp A
andi t2, t2, 0xFF
srli t3, a1, 7 # exp B
andi t3, t3, 0xFF
andi t4, a0, 0x7F # mant A
andi t5, a1, 0x7F # mant B
# Step 1 — NaN Handling (highest priority)
li t6, 0xFF
# A is NaN?
beq t2, t6, check_nan_a
# B is NaN?
beq t3, t6, check_nan_b
j check_inf # no NaN → go Inf check
check_nan_a:
bnez t4, make_nan
j check_inf # A=Inf but not NaN
check_nan_b:
bnez t5, make_nan
j check_inf
# Step 2 — Inf Handling
check_inf:
li t6, 0xFF
# A = Inf
beq t2, t6, handle_inf_a
# B = Inf
beq t3, t6, handle_inf_b
j check_zero
handle_inf_a:
# A = Inf
beqz t3, make_nan # Inf × 0 = NaN
j make_inf
handle_inf_b:
# B = Inf
beqz t2, make_nan # 0 × Inf = NaN
j make_inf
# Step 3 — Zero Handling
check_zero:
# A == 0 → 0
beqz t2, check_a_mant
j check_b_zero
check_a_mant:
beqz t4, return_zero
check_b_zero:
# B == 0 → 0
beqz t3, check_b_mant
j normalize_inputs
check_b_mant:
beqz t5, return_zero
# NaN / Inf / Zero Makers
make_nan:
li a0, 0x7FC0
j mul_done
make_inf:
slli a0, s0, 15
li t6, 0x7F80
or a0, a0, t6
j mul_done
return_zero:
slli a0, s0, 15
j mul_done
# Normalize A & B
normalize_inputs:
mv s1, zero # exp_adjust = 0
# Normalize A
beqz t2, norm_a_sub
ori t4, t4, 0x80
j norm_b
norm_a_sub:
beqz t4, return_zero
norm_a_loop:
andi t6, t4, 0x80
bnez t6, norm_a_end
slli t4, t4, 1
addi s1, s1, -1
j norm_a_loop
norm_a_end:
li t2, 1
# Normalize B
norm_b:
beqz t3, norm_b_sub
ori t5, t5, 0x80
j do_mul
norm_b_sub:
beqz t5, return_zero
norm_b_loop:
andi t6, t5, 0x80
bnez t6, norm_b_end
slli t5, t5, 1
addi s1, s1, -1
j norm_b_loop
norm_b_end:
li t3, 1
# Mantissa Multiply
do_mul:
mul s2, t4, t5
add t6, t2, t3
addi t6, t6, -127
add t6, t6, s1
mv s1, t6
# Normalize multiplication result
li t6, 0x8000
and t6, s2, t6
beqz t6, mul_norm_low
srli s2, s2, 8
andi s2, s2, 0x7F
addi s1, s1, 1
j check_exp
mul_norm_low:
srli s2, s2, 7
andi s2, s2, 0x7F
# Exponent Overflow / Underflow
check_exp:
li t6, 255
bge s1, t6, make_inf
blez s1, return_zero
# Assemble Final BF16
final_output:
slli a0, s0, 15
andi s1, s1, 0xFF
slli s1, s1, 7
or a0, a0, s1
or a0, a0, s2
j mul_done
# Epilogue
mul_done:
lw s0, 0(sp)
lw s1, 4(sp)
lw s2, 8(sp)
lw ra, 12(sp)
addi sp, sp, 16
ret
```
:::
#### Result

### Implementation Division
#### Test Case
| Test # | Operation | Input A | Input B | BF16 A | BF16 B | Expected Output C | BF16 C |
|--------|-----------|---------|---------|--------|--------|--------|--------|
| 1 | Div | 6.0 | 2.0 | 0x40C0 | 0x4000 | 3.0 | 0x4040 |
| 2 | Div | -2.0 | 4.0 | 0xC000 | 0x4080 | -0.5 | 0xBF00 |
| 3 | Div | -Inf | -5.0 | 0xFF80 | 0xC0A0 | +Inf | 0x7F80 |
:::spoiler Assembly Code
```c=
.data
pass_msg: .asciz "PASS\n"
fail_msg: .asciz "FAIL\n"
.text
.globl main
main:
addi sp, sp, -4
sw ra, 0(sp)
# Test 1: 6.0 / 2.0 = 3.0
test1:
li a0, 0x40C0 # Input A = 6.0
li a1, 0x4000 # Input B = 2.0
jal ra, bf16_div # Output C = A / B
li t1, 0x4040 # Expected Output C = 3.0
bne a0, t1, fail1
jal ra, print_pass
j test2
fail1:
jal ra, print_fail
# Test 2: -2.0 / 4.0 = -0.5
test2:
li a0, 0xC000 # Input A = -2.0
li a1, 0x4080 # Input B = 4.0
jal ra, bf16_div # Output C = A / B
li t1, 0xBF00 # Expected Output C = -0.5
bne a0, t1, fail2
jal ra, print_pass
j test3
fail2:
jal ra, print_fail
# Test 3: -Inf / -5.0 = +Inf
test3:
li a0, 0xFF80 # Input A = -Inf
li a1, 0xC0A0 # Input B = -5.0
jal ra, bf16_div # Output C = A / B
li t1, 0x7F80 # Expected Output C = +Inf
bne a0, t1, fail3
jal ra, print_pass
j done
fail3:
jal ra, print_fail
done:
lw ra, 0(sp)
addi sp, sp, 4
li a7, 10
ecall
print_pass:
la a0, pass_msg
li a7, 4
ecall
jr ra
print_fail:
la a0, fail_msg
li a7, 4
ecall
jr ra
.globl bf16_div
bf16_div:
# Extract sign, exponent, mantissa
srli t0, a0, 15 # sign_a
srli t1, a1, 15 # sign_b
xor a2, t0, t1 # result_sign = a.sign ^ b.sign
srli t2, a0, 7 # exp_a
andi t2, t2, 0xFF
srli t3, a1, 7 # exp_b
andi t3, t3, 0xFF
andi t4, a0, 0x7F # mant_a
andi t5, a1, 0x7F # mant_b
# Special cases : NaN, Inf, Zero
li t6, 0xFF
# B is NaN or Inf
beq t3, t6, check_B_inf_nan
# B is Zero → A/0 = Inf or -Inf
beq t3, x0, check_B_zero
j check_A_inf_nan
# B is NaN or Inf
check_B_inf_nan:
bnez t5, make_nan # B mantissa != 0 → NaN
# B = Inf
beq t2, t6, make_nan # Inf / Inf = NaN
# finite / Inf = Zero
j make_zero
# B == Zero
check_B_zero:
beq t5, x0, make_inf # A / 0 → ±Inf
j continue # subnormal zero → go normalize
# A is NaN or Inf
check_A_inf_nan:
beq t2, t6, A_inf_or_nan
j continue
A_inf_or_nan:
bnez t4, make_nan # NaN
# A is Inf
j make_inf
# Continue : Normalize A & B mantissas
continue:
# Normalize A
li s1, 0 # exp_adjust = 0
beq t2, x0, normA_sub
ori t4, t4, 0x80 # add hidden 1
j normB
normA_sub:
beq t4, x0, make_zero # A = 0
normA_loop:
andi t6, t4, 0x80
bnez t6, normA_done
slli t4, t4, 1
addi s1, s1, -1
j normA_loop
normA_done:
li t2, 1
# Normalize B
normB:
beq t3, x0, normB_sub
ori t5, t5, 0x80
j divide_start
normB_sub:
beq t5, x0, make_zero # B=0 case already handled above
normB_loop:
andi t6, t5, 0x80
bnez t6, normB_done
slli t5, t5, 1
addi s1, s1, -1
j normB_loop
normB_done:
li t3, 1
# Long Division 16-bit
divide_start:
slli a4, t4, 15 # dividend <<= 15
li a5, 0 # quotient = 0
mv t6, t5 # divisor working copy
slli t6, t6, 15
li t0, 16
div_loop:
slli a5, a5, 1 # shift quotient left
sltu t1, a4, t6 # if dividend < divisor
bne t1, x0, div_skip
sub a4, a4, t6 # dividend -= divisor
ori a5, a5, 1 # quotient bit = 1
div_skip:
srli t6, t6, 1 # divisor >>= 1
addi t0, t0, -1
bne t0, x0, div_loop
# Compute exponent
sub a3, t2, t3
addi a3, a3, 127
beq t2, x0, decA
j adjustB
decA:
addi a3, a3, -1
adjustB:
beq t3, x0, incB
j normalize_q
incB:
addi a3, a3, 1
# Normalize quotient (mantissa)
normalize_q:
li t0, 0x8000
and t1, a5, t0
bnez t1, shift8
# Shift left until top bit=1
norm_q_loop:
and t1, a5, t0
bnez t1, shift8
slli a5, a5, 1
addi a3, a3, -1
j norm_q_loop
shift8:
srli a5, a5, 8
# Assemble final BF16
andi a5, a5, 0x7F
li t0, 255
bge a3, t0, make_inf
ble a3, x0, make_zero
andi t0, a3, 255
slli t0, t0, 7
slli a0, a2, 15
or a0, a0, t0
or a0, a0, a5
jr ra
# Special return paths
make_zero:
slli a0, a2, 15
jr ra
make_inf:
slli a0, a2, 15
li t0, 0x7F80
or a0, a0, t0
jr ra
make_nan:
li a0, 0x7FC0
jr ra
```
:::
#### Result

### Implementation Square Root
#### Test Case
| Test # | Operation | Input A | Input B | BF16 A | BF16 B | Expected Output C | BF16 C |
|--------|-----------|---------|---------|--------|--------|--------|--------|
| 1 | Sqrt | 4.0 | | 0x4080 | | 2.0 | 0x4000 |
| 2 | Sqrt | +Inf | | 0x7F80 | | +Inf | 0x7F80 |
| 3 | Sqrt | +0.0 | | 0x0000 | | +0.0 | 0x0000 |
:::spoiler Assembly Code
```c=
.data
pass_msg: .asciz "Test Passed\n"
fail_msg: .asciz "Test Failed\n"
.text
.globl main
.globl bf16_sqrt
main:
addi sp, sp, -4
sw ra, 0(sp)
test1:
li a0, 0x4080 # Input A = 4.0
jal ra, bf16_sqrt # Output C = sqrt(A)
li t1, 0x4000 # Expected Output C = 2.0
bne a0, t1, test1_fail
jal ra, print_pass
j test2
test1_fail:
jal ra, print_fail
j test2
test2:
li a0, 0x7F80 # Input A = +Inf
jal ra, bf16_sqrt # Output C = sqrt(A)
li t1, 0x7F80 # Expected Output C = +Inf
bne a0, t1, test2_fail
jal ra, print_pass
j test3
test2_fail:
jal ra, print_fail
j test3
test3:
li a0, 0x0000 # Input A = +0.0
jal ra, bf16_sqrt # Output C = sqrt(A)
li t1, 0x0000 # Expected Output C = +0.0
bne a0, t1, test3_fail
jal ra, print_pass
j program_end
test3_fail:
jal ra, print_fail
j program_end
program_end:
lw ra, 0(sp)
addi sp, sp, 4
li a7, 10 # syscall: exit (RARS)
ecall
print_pass:
la a0, pass_msg
li a7, 4 # print_string
ecall
ret
print_fail:
la a0, fail_msg
li a7, 4 # print_string
ecall
ret
bf16_sqrt:
addi sp, sp, -32
sw ra, 28(sp)
sw s0, 24(sp)
sw s1, 20(sp)
sw s2, 16(sp)
sw s3, 12(sp)
sw s4, 8(sp)
sw s5, 4(sp)
sw s6, 0(sp)
srli t0, a0, 15 # Shift right 15
andi t0, t0, 1 # Extract sign bit
srli t1, a0, 7 # Shift right 7
andi t1, t1, 0xFF # Extract exponent (8 bits)
andi t2, a0, 0x7F # Extract mantissa (7 bits)
li t3, 0xFF
bne t1, t3, check_zero # if exp != 0xFF → not Inf/NaN
bnez t2, return_a # exp=0xFF, mant!=0 → NaN, just return a (NaN propagation)
bnez t0, return_nan # exp=0xFF, mant=0, sign!=0 → -Inf → NaN
j return_a # exp=0xFF, mant=0, sign=0 → +Inf, return a
check_zero: # Handle zero
or t3, t1, t2 # if exp==0 && mant==0 → zero
bnez t3, check_negative
j return_zero
check_negative: # Handle negative / denormals
bnez t0, return_nan # negative number → NaN
bnez t1, compute_sqrt # if exp!=0 → normalized → go sqrt
j return_zero # denormals are flushed to zero
compute_sqrt:
addi s0, t1, -127 # s0 = unbiased exponent E
ori s1, t2, 0x80 # s1 = mantissa with implicit leading 1 (1.xxx)
andi t3, s0, 1 # t3 = E & 1 (check odd/even)
beqz t3, even_exp
# odd exponent: sqrt(2^E * M) = 2^{(E-1)/2} * sqrt(2*M)
slli s1, s1, 1 # mantissa * 2
addi t4, s0, -1
srai t4, t4, 1
addi s2, t4, 127 # s2 = output exponent (biased)
j binary_search
even_exp:
# even exponent: sqrt(2^E * M) = 2^{E/2} * sqrt(M)
srai t4, s0, 1
addi s2, t4, 127 # s2 = output exponent (biased)
binary_search:
# Search integer y in [90,256] such that (y^2 >> 7) ≈ s1
li s3, 90 # low
li s4, 256 # high
li s5, 128 # best (initial guess)
search_loop:
bgt s3, s4, search_done
add t3, s3, s4
srli t3, t3, 1 # mid = (low + high) / 2
mv a1, t3
mv a2, t3
jal ra, multiply # a0 = mid * mid
mv t4, a0
srli t4, t4, 7 # compare (mid^2 >> 7) with s1
bgt t4, s1, search_high # too large → move high
mv s5, t3 # accept mid as current best
addi s3, t3, 1 # low = mid + 1
j search_loop
search_high:
addi s4, t3, -1 # high = mid - 1
j search_loop
search_done:
li t3, 256
blt s5, t3, check_low # if best < 256 → normal case
srli s5, s5, 1 # if best >= 256 → renormalize
addi s2, s2, 1
j extract_mant
check_low:
li t3, 128
bge s5, t3, extract_mant # already normalized (>=128)
norm_loop: # ensure mantissa in [128,255]
li t3, 128
bge s5, t3, extract_mant
li t3, 1
ble s2, t3, extract_mant # avoid exponent underflow
slli s5, s5, 1
addi s2, s2, -1
j norm_loop
extract_mant:
andi s6, s5, 0x7F # take low 7 bits as mantissa
li t3, 0xFF
bge s2, t3, return_inf # exponent overflow → +Inf
blez s2, return_zero # exponent underflow / zero
andi t3, s2, 0xFF
slli t3, t3, 7
or a0, t3, s6 # pack exponent + mantissa (sign=0)
j cleanup
return_zero:
li a0, 0x0000
j cleanup
return_nan:
li a0, 0x7FC0 # canonical quiet NaN
j cleanup
return_inf:
li a0, 0x7F80 # +Inf
j cleanup
return_a:
# a0 already has original input (Inf / NaN propagation etc.)
cleanup:
lw s6, 0(sp)
lw s5, 4(sp)
lw s4, 8(sp)
lw s3, 12(sp)
lw s2, 16(sp)
lw s1, 20(sp)
lw s0, 24(sp)
lw ra, 28(sp)
addi sp, sp, 32
ret
# multiply(a1, a2) → a0 = a1 * a2 (unsigned, shift-add)
multiply:
li a0, 0
beqz a2, mult_done
mult_loop:
andi t0, a2, 1
beqz t0, mult_skip
add a0, a0, a1
mult_skip:
slli a1, a1, 1
srli a2, a2, 1
bnez a2, mult_loop
mult_done:
ret
```
:::
#### Result

## 5-Stage Pipeline
### Introduction
Modern RISC processors use a **five-stage pipeline** to execute instructions, the following figure is an example of a five-stage RISC-V pipeline. It splits executing one instruction into 5 consecutive stages so that multiple instructions can be in different stages at the same time, improving overall throughput. In the ideal case, the pipeline can complete nearly one instruction per cycle.

1️⃣ **IF – Instruction Fetch**
- Use the current PC (Program Counter) to read the instruction from instruction memory.
- Store the instruction into the IF/ID pipeline register and compute the next PC.
2️⃣ **ID – Instruction Decode**
- Decode the instruction type (R-type, I-type, load/store, branch, …).
- Read the source registers rs1 and rs2 from the register file.
- Generate control signals such as RegWrite, MemRead, MemWrite, ALUOp, and ALUSrc.
- For I-type and load/store instructions, generate a sign-extended immediate.
3️⃣ **EX – Execute**
- The ALU performs operations such as addition/subtraction, shift, and comparison according to the control signals.
- For load/store instructions, compute the memory address in this stage (base register + offset).
- For branch instructions, compare rs1 and rs2 here to decide whether to branch, and produce the branch target address.
4️⃣ **MEM – Memory Access**
- For **read** instructions such as `lw` / `lh` / `lb`: use the address computed in EX to read from data memory in this stage.
- For **write** instructions such as `sw` / `sh` / `sb`: write the data from a register into data memory in this stage.
- Instructions that do not access memory simply pass the ALU result to the next pipeline register in this stage.
5️⃣ **WB – Write Back**
- Write the ALU result or data read from memory back to the destination register rd.
- Whether the processor writes back is controlled by the RegWrite signal. The write-back data source is selected by a MUX, choosing between the ALU result and the data read from memory.
### How instructions are executed in the Ripes simulator
`addi sp, sp, 32` means adding 32 to sp (stack pointer) and writing the result back to sp. In the context of a function’s stack frame, this typically releases 32 bytes of stack space by moving the stack pointer back to its original position, reclaiming the temporary space allocated on the stack.
<span style="background-color:#6f42c1; color:#ffffff; font-weight:bold; padding:2px 6px; border-radius:4px;">Stage 1 - IF</span>

- In the figure, the instruction memory address is 0x00000000, meaning the processor uses PC = `0x00000000` to fetch the instruction from instruction memory.
- The MUX determines whether the PC should be incremented by 4 or 2. Since the fetched addi is a 32-bit instruction, the MUX selects +4, so the sequential next PC candidate is `0x00000000 + 0x00000004 = 0x00000004`.
- Because `addi` does not change control flow, the processor follows the sequential path instead of a branch or jump target. As a result, the next PC is 0x00000004, the same as the sequential next PC candidate.
- `0x02010113` is the 32-bit machine code of a RISC-V instruction. When decoded, it corresponds to `addi x2, x2, 32`. In RISC-V, x2 is the sp, so this is also written as `addi sp, sp, 32`.
<span style="background-color:#6f42c1; color:#ffffff; font-weight:bold; padding:2px 6px; border-radius:4px;">Stage 2 - ID</span>

| Field | Bit range | Value (bin) | Value (hex) | Meaning |
|----------|-----------|----------------|-----------------|---------|
| imm[11:0] | [31:20] | 000000100000 | 0x020 | immediate = 32 |
| rs1 | [19:15] | 00010 | 0x02 | rs1 = x2 (sp) |
| funct3 | [14:12] | 000 | 0x0 | funct3 = 000 → ADDI |
| rd | [11:7] | 00010 | 0x02 | rd = x2 (sp) |
| opcode | [6:0] | 0010011 | 0x13 | OP-IMM (addi/andi/ori/...) |
1. Decode
- After the machine code `0x02010113` enters the Decode block, it is recognized as opcode = ADDI.
- This means the instruction is an I-type add-immediate instruction.
2. Registers
- In the Registers block:
- `1 idx = 0x02` indicates that rs1 reads x2.
- `2 idx = 0x00` indicates that the other read port reads x0. `addi` does not actually require rs2, but the hardware typically still outputs a value for this port.
- The values read from the register file are:
- `Reg1 = 0x7ffffff0`: the current value of x2.
- `Reg2 = 0x00000000`: read from x0, so it is 0.
3. Imm (Immediate Generator)
- For `addi`, the immediate is 32. After sign extension, it is shown as `0x00000020`.
4. ID/EX Pipeline Register
- The ID/EX pipeline register latches the values prepared in the ID stage and forwards them to the EX stage:
- Reg1: `0x7ffffff0`
- Reg2: `0x00000000`
- Imm : `0x00000020`
- This allows the EX stage to use the ALU to compute x2 + 32.
<span style="background-color:#6f42c1; color:#ffffff; font-weight:bold; padding:2px 6px; border-radius:4px;">Stage 3 - EX</span>

1) Values coming from the ID/EX (IDEX) register
- `0x7ffffff0`: Reg1, which is the current value of x2.
- `0x00000020`: Imm, the immediate value 32.
- `0x00000000`: Reg2. This `addi` does not use rs2, so this value is typically 0.
- `0x00000000` / `0x00000004`: PC and PC+4. These are mainly used for branch/jump address calculations and are not used by this `addi`.
- `0x02` represents register number 2, which is x2.
2) How the ALU operands are selected
- Operand 1 (upper ALU input) selects Reg1 = `0x7ffffff0`.
- Operand 2 (under ALU input)is selected by a MUX. For `addi`, the control signal selects the immediate (Imm) instead of Reg2, so Operand 2 = `0x00000020`.
3) What operation the ALU performs
- `addi` uses the ALU to perform addition:
`0x7ffffff0 + 0x00000020 = 0x80000010`.
4) Why the Branch block indicates not taken
- `addi` is not a branch or jump instruction, so no control-flow change occurs.
- Therefore, **Branch taken = 0**, and the PC continues along the sequential path (PC+4).
5) What is latched into the EX/MEM pipeline register
- The EX/MEM pipeline register latches the ALU result `0x80000010` and forwards it to the next stage.
- It also carries the destination register information (rd = x2) and the required control signals for the following stages.
<span style="background-color:#6f42c1; color:#ffffff; font-weight:bold; padding:2px 6px; border-radius:4px;">Stage 4 - MEM</span>

This figure shows the Memory Access stage while executing `addi x2, x2, 32`.
Since `addi` does not access data memory, the memory hardware is idle and the datapath simply forwards values toward the WB stage.
- `0x80000010` (Addr into Data memory) : this is the ALU result computed in the EX stage. For `addi`, it is the computed value to be written back to x2, not a real memory address to be used. It still appears on the address input because the datapath wiring is fixed.
- Wr En is red (disabled) : this indicates **MemWrite = 0**, so no store occurs and data memory is not updated.
- `Data in = 0x00000000` : this would be the data written to memory for a store instruction, but since Wr En is disabled, this value is not used here.
- `Read out = 0x00000000` : this output is meaningful for load instructions. For `addi` (MemRead = 0), it typically shows a default value and does not indicate an actual memory read.
- `0x02` : this is the destination register index rd = x2, carried forward so the WB stage knows where to write the result.
- `0x00000004` : this is the sequential next PC related value (PC+4) that may also be carried along the pipeline, although it is not directly used by `addi` in MEM/WB.
<span style="background-color:#6f42c1; color:#ffffff; font-weight:bold; padding:2px 6px; border-radius:4px;">Stage 5 - WB</span>

This figure shows the Write Back stage for the instruction `addi x2, x2, 32`. At this stage, the processor writes the computed result back to register x2.
- `0x80000010` : this is the ALU result produced in the EX stage. In WB, it is used as the data to be written back to the register file.
- The green MUX selection : the write-back MUX selects the source of the data written to the register file.
- For arithmetic instructions like `addi`, it selects the ALU result `0x80000010`.
- `0x00000000` corresponds to memory read data, which is used for load instructions and is not used by `addi` here.
- `0x02` : this is the destination register index rd = x2, so the value `0x80000010` will be written into x2, which is the stack pointer.
- `0x00000004` : this is the sequential next PC value carried along the pipeline. It does not affect the write-back of `addi`, but it may be useful for debugging for other instructions.