# Computer Architecture : homework1
:::warning
use 5 stage processor in ripes to simulate
:::
## Problem B
### C code
```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
typedef uint8_t uf8;
static inline unsigned clz(uint32_t x)
{
int n = 32, c = 16;
do {
uint32_t y = x >> c;
if (y) {
n -= c;
x = y;
}
c >>= 1;
} while (c);
return n - x;
}
/* Decode uf8 to uint32_t */
uint32_t uf8_decode(uf8 fl)
{
uint32_t mantissa = fl & 0x0f;
uint8_t exponent = fl >> 4;
uint32_t offset = (0x7FFF >> (15 - exponent)) << 4;
return (mantissa << exponent) + offset;
}
/* Encode uint32_t to uf8 */
uf8 uf8_encode(uint32_t value)
{
/* Use CLZ for fast exponent calculation */
if (value < 16)
return value;
/* Find appropriate exponent using CLZ hint */
int lz = clz(value);
int msb = 31 - lz;
/* Start from a good initial guess */
uint8_t exponent = 0;
uint32_t overflow = 0;
if (msb >= 5) {
/* Estimate exponent - the formula is empirical */
exponent = msb - 4;
if (exponent > 15)
exponent = 15;
/* Calculate overflow for estimated exponent */
for (uint8_t e = 0; e < exponent; e++)
overflow = (overflow << 1) + 16;
/* Adjust if estimate was off */
while (exponent > 0 && value < overflow) {
overflow = (overflow - 16) >> 1;
exponent--;
}
}
/* Find exact exponent */
while (exponent < 15) {
uint32_t next_overflow = (overflow << 1) + 16;
if (value < next_overflow)
break;
overflow = next_overflow;
exponent++;
}
uint8_t mantissa = (value - overflow) >> exponent;
return (exponent << 4) | mantissa;
}
/* Test encode/decode round-trip */
static bool test(void)
{
int32_t previous_value = -1;
bool passed = true;
for (int i = 0; i < 256; i++) {
uint8_t fl = i;
int32_t value = uf8_decode(fl);
uint8_t fl2 = uf8_encode(value);
if (fl != fl2) {
printf("%02x: produces value %d but encodes back to %02x\n", fl,
value, fl2);
passed = false;
}
if (value <= previous_value) {
printf("%02x: value %d <= previous_value %d\n", fl, value,
previous_value);
passed = false;
}
previous_value = value;
}
return passed;
}
int main(void)
{
if (test()) {
printf("All tests passed.\n");
return 0;
}
return 1;
}
```
:::info
[assembly code of problem B](https://github.com/rainbow0212/ca2025-homework1/blob/main/problemB/problem%20B.S)
:::
#### performance

#### core Optimizations
1. Binary Search CLZ Implementation
```.s
clz:
li t1, 32 # n = 32
li t2, 16 # c = 16 (initial step)
clz_loop:
beqz t2, clz_done
srl t3, a0, t2 # y = x >> c
beqz t3, clz_no_shift
sub t1, t1, t2 # n -= c (found bits)
mv a0, t3 # x = y (continue with upper bits)
clz_no_shift:
srli t2, t2, 1 # c >>= 1 (halve step size)
j clz_loop
```
2.Direct Decode Without Branching
```.s
uf8_decode:
andi t0, a0, 0x0f # mantissa (4 bits)
srli t1, a0, 4 # exponent (4 bits)
# Calculate offset: (0x7FFF >> (15-exp)) << 4
li t2, 15
sub t2, t2, t1 # 15 - exponent
li t3, 0x7FFF
srl t3, t3, t2 # Dynamic shift
slli t3, t3, 4 # Align to mantissa
sll t0, t0, t1 # mantissa << exponent
add a0, t0, t3 # result = scaled_mantissa + offset
ret
```
* Zero branches - completely linear execution
* 8 instructions total for any input
* Uses dynamic shift to calculate logarithmic offset
3. Adaptive Exponent Calculation in Encode
```.s
uf8_encode:
mv s0, a0 # Save input value
li t0, 16
blt s0, t0, encode_small # Fast path: value < 16
jal clz
li t0, 31
sub s1, t0, a0 # msb = 31 - clz(value)
# Determine initial exponent
li t0, 5
blt s1, t0, encode_exp_zero
addi t1, s1, -4 # exponent = msb - 4
```
* Small values (< 16): Identity encoding (1 instruction)
* Medium values: Direct exponent from MSB
* Large values: Clamped to max exponent (15)
## Problem C
### C code
```c
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
typedef struct {
uint16_t bits;
} bf16_t;
#define BF16_SIGN_MASK 0x8000U
#define BF16_EXP_MASK 0x7F80U
#define BF16_MANT_MASK 0x007FU
#define BF16_EXP_BIAS 127
#define BF16_NAN() ((bf16_t) {.bits = 0x7FC0})
#define BF16_ZERO() ((bf16_t) {.bits = 0x0000})
static inline bool bf16_isnan(bf16_t a)
{
return ((a.bits & BF16_EXP_MASK) == BF16_EXP_MASK) &&
(a.bits & BF16_MANT_MASK);
}
static inline bool bf16_isinf(bf16_t a)
{
return ((a.bits & BF16_EXP_MASK) == BF16_EXP_MASK) &&
!(a.bits & BF16_MANT_MASK);
}
static inline bool bf16_iszero(bf16_t a)
{
return !(a.bits & 0x7FFF);
}
static inline bf16_t f32_to_bf16(float val)
{
uint32_t f32bits;
memcpy(&f32bits, &val, sizeof(float));
if (((f32bits >> 23) & 0xFF) == 0xFF)
return (bf16_t) {.bits = (f32bits >> 16) & 0xFFFF};
f32bits += ((f32bits >> 16) & 1) + 0x7FFF;
return (bf16_t) {.bits = f32bits >> 16};
}
static inline float bf16_to_f32(bf16_t val)
{
uint32_t f32bits = ((uint32_t) val.bits) << 16;
float result;
memcpy(&result, &f32bits, sizeof(float));
return result;
}
static inline bf16_t bf16_add(bf16_t a, bf16_t b)
{
uint16_t sign_a = (a.bits >> 15) & 1;
uint16_t sign_b = (b.bits >> 15) & 1;
int16_t exp_a = ((a.bits >> 7) & 0xFF);
int16_t exp_b = ((b.bits >> 7) & 0xFF);
uint16_t mant_a = a.bits & 0x7F;
uint16_t mant_b = b.bits & 0x7F;
if (exp_a == 0xFF) {
if (mant_a)
return a;
if (exp_b == 0xFF)
return (mant_b || sign_a == sign_b) ? b : BF16_NAN();
return a;
}
if (exp_b == 0xFF)
return b;
if (!exp_a && !mant_a)
return b;
if (!exp_b && !mant_b)
return a;
if (exp_a)
mant_a |= 0x80;
if (exp_b)
mant_b |= 0x80;
int16_t exp_diff = exp_a - exp_b;
uint16_t result_sign;
int16_t result_exp;
uint32_t result_mant;
if (exp_diff > 0) {
result_exp = exp_a;
if (exp_diff > 8)
return a;
mant_b >>= exp_diff;
} else if (exp_diff < 0) {
result_exp = exp_b;
if (exp_diff < -8)
return b;
mant_a >>= -exp_diff;
} else {
result_exp = exp_a;
}
if (sign_a == sign_b) {
result_sign = sign_a;
result_mant = (uint32_t) mant_a + mant_b;
if (result_mant & 0x100) {
result_mant >>= 1;
if (++result_exp >= 0xFF)
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
}
} else {
if (mant_a >= mant_b) {
result_sign = sign_a;
result_mant = mant_a - mant_b;
} else {
result_sign = sign_b;
result_mant = mant_b - mant_a;
}
if (!result_mant)
return BF16_ZERO();
while (!(result_mant & 0x80)) {
result_mant <<= 1;
if (--result_exp <= 0)
return BF16_ZERO();
}
}
return (bf16_t) {
.bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) |
(result_mant & 0x7F),
};
}
static inline bf16_t bf16_sub(bf16_t a, bf16_t b)
{
b.bits ^= BF16_SIGN_MASK;
return bf16_add(a, b);
}
static inline bf16_t bf16_mul(bf16_t a, bf16_t b)
{
uint16_t sign_a = (a.bits >> 15) & 1;
uint16_t sign_b = (b.bits >> 15) & 1;
int16_t exp_a = ((a.bits >> 7) & 0xFF);
int16_t exp_b = ((b.bits >> 7) & 0xFF);
uint16_t mant_a = a.bits & 0x7F;
uint16_t mant_b = b.bits & 0x7F;
uint16_t result_sign = sign_a ^ sign_b;
if (exp_a == 0xFF) {
if (mant_a)
return a;
if (!exp_b && !mant_b)
return BF16_NAN();
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
}
if (exp_b == 0xFF) {
if (mant_b)
return b;
if (!exp_a && !mant_a)
return BF16_NAN();
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
}
if ((!exp_a && !mant_a) || (!exp_b && !mant_b))
return (bf16_t) {.bits = result_sign << 15};
int16_t exp_adjust = 0;
if (!exp_a) {
while (!(mant_a & 0x80)) {
mant_a <<= 1;
exp_adjust--;
}
exp_a = 1;
} else
mant_a |= 0x80;
if (!exp_b) {
while (!(mant_b & 0x80)) {
mant_b <<= 1;
exp_adjust--;
}
exp_b = 1;
} else
mant_b |= 0x80;
uint32_t result_mant = (uint32_t) mant_a * mant_b;
int32_t result_exp = (int32_t) exp_a + exp_b - BF16_EXP_BIAS + exp_adjust;
if (result_mant & 0x8000) {
result_mant = (result_mant >> 8) & 0x7F;
result_exp++;
} else
result_mant = (result_mant >> 7) & 0x7F;
if (result_exp >= 0xFF)
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
if (result_exp <= 0) {
if (result_exp < -6)
return (bf16_t) {.bits = result_sign << 15};
result_mant >>= (1 - result_exp);
result_exp = 0;
}
return (bf16_t) {.bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) |
(result_mant & 0x7F)};
}
static inline bf16_t bf16_div(bf16_t a, bf16_t b)
{
uint16_t sign_a = (a.bits >> 15) & 1;
uint16_t sign_b = (b.bits >> 15) & 1;
int16_t exp_a = ((a.bits >> 7) & 0xFF);
int16_t exp_b = ((b.bits >> 7) & 0xFF);
uint16_t mant_a = a.bits & 0x7F;
uint16_t mant_b = b.bits & 0x7F;
uint16_t result_sign = sign_a ^ sign_b;
if (exp_b == 0xFF) {
if (mant_b)
return b;
/* Inf/Inf = NaN */
if (exp_a == 0xFF && !mant_a)
return BF16_NAN();
return (bf16_t) {.bits = result_sign << 15};
}
if (!exp_b && !mant_b) {
if (!exp_a && !mant_a)
return BF16_NAN();
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
}
if (exp_a == 0xFF) {
if (mant_a)
return a;
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
}
if (!exp_a && !mant_a)
return (bf16_t) {.bits = result_sign << 15};
if (exp_a)
mant_a |= 0x80;
if (exp_b)
mant_b |= 0x80;
uint32_t dividend = (uint32_t) mant_a << 15;
uint32_t divisor = mant_b;
uint32_t quotient = 0;
for (int i = 0; i < 16; i++) {
quotient <<= 1;
if (dividend >= (divisor << (15 - i))) {
dividend -= (divisor << (15 - i));
quotient |= 1;
}
}
int32_t result_exp = (int32_t) exp_a - exp_b + BF16_EXP_BIAS;
if (!exp_a)
result_exp--;
if (!exp_b)
result_exp++;
if (quotient & 0x8000)
quotient >>= 8;
else {
while (!(quotient & 0x8000) && result_exp > 1) {
quotient <<= 1;
result_exp--;
}
quotient >>= 8;
}
quotient &= 0x7F;
if (result_exp >= 0xFF)
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
if (result_exp <= 0)
return (bf16_t) {.bits = result_sign << 15};
return (bf16_t) {.bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) |
(quotient & 0x7F)};
}
static inline bf16_t bf16_sqrt(bf16_t a)
{
uint16_t sign = (a.bits >> 15) & 1;
int16_t exp = ((a.bits >> 7) & 0xFF);
uint16_t mant = a.bits & 0x7F;
/* Handle special cases */
if (exp == 0xFF) {
if (mant)
return a; /* NaN propagation */
if (sign)
return BF16_NAN(); /* sqrt(-Inf) = NaN */
return a; /* sqrt(+Inf) = +Inf */
}
/* sqrt(0) = 0 (handle both +0 and -0) */
if (!exp && !mant)
return BF16_ZERO();
/* sqrt of negative number is NaN */
if (sign)
return BF16_NAN();
/* Flush denormals to zero */
if (!exp)
return BF16_ZERO();
/* Direct bit manipulation square root algorithm */
/* For sqrt: new_exp = (old_exp - bias) / 2 + bias */
int32_t e = exp - BF16_EXP_BIAS;
int32_t new_exp;
/* Get full mantissa with implicit 1 */
uint32_t m = 0x80 | mant; /* Range [128, 256) representing [1.0, 2.0) */
/* Adjust for odd exponents: sqrt(2^odd * m) = 2^((odd-1)/2) * sqrt(2*m) */
if (e & 1) {
m <<= 1; /* Double mantissa for odd exponent */
new_exp = ((e - 1) >> 1) + BF16_EXP_BIAS;
} else {
new_exp = (e >> 1) + BF16_EXP_BIAS;
}
/* Now m is in range [128, 256) or [256, 512) if exponent was odd */
/* Binary search for integer square root */
/* We want result where result^2 = m * 128 (since 128 represents 1.0) */
uint32_t low = 90; /* Min sqrt (roughly sqrt(128)) */
uint32_t high = 256; /* Max sqrt (roughly sqrt(512)) */
uint32_t result = 128; /* Default */
/* Binary search for square root of m */
while (low <= high) {
uint32_t mid = (low + high) >> 1;
uint32_t sq = (mid * mid) / 128; /* Square and scale */
if (sq <= m) {
result = mid; /* This could be our answer */
low = mid + 1;
} else {
high = mid - 1;
}
}
/* result now contains sqrt(m) * sqrt(128) / sqrt(128) = sqrt(m) */
/* But we need to adjust the scale */
/* Since m is scaled where 128=1.0, result should also be scaled same way */
/* Normalize to ensure result is in [128, 256) */
if (result >= 256) {
result >>= 1;
new_exp++;
} else if (result < 128) {
while (result < 128 && new_exp > 1) {
result <<= 1;
new_exp--;
}
}
/* Extract 7-bit mantissa (remove implicit 1) */
uint16_t new_mant = result & 0x7F;
/* Check for overflow/underflow */
if (new_exp >= 0xFF)
return (bf16_t) {.bits = 0x7F80}; /* +Inf */
if (new_exp <= 0)
return BF16_ZERO();
return (bf16_t) {.bits = ((new_exp & 0xFF) << 7) | new_mant};
}
```
**In problem C, I separate functions for individual testing to observe their performance ,and use 5 **
### bf16_isnan
#### assembly code
:::spoiler
```.s
.equ BF16_SIGN_MASK, 0x8000
.equ BF16_EXP_MASK, 0x7F80
.equ BF16_MANT_MASK, 0x007F
.equ BF16_EXP_BIAS, 127
.equ BF16_NAN, 0x7FC0
.equ BF16_ZERO, 0x0000
.data
test1: .half 0x3F80 # 1.0 (normal number)
test2: .half 0x7FC0 # NaN
test3: .half 0x0000 # 0.0
test4: .half 0x7F80 # +Inf
test5: .half 0xFF80 # -Inf
finish_msg: .string "test finish\n"
.text
.globl _start
_start:
la t0, test1
lh a0, 0(t0)
jal ra, bf16_isnan
mv s0, a0
la t0, test2
lh a0, 0(t0)
jal ra, bf16_isnan
mv s1, a0
la t0, test3
lh a0, 0(t0)
jal ra, bf16_isnan
mv s2, a0
la t0, test4
lh a0, 0(t0)
jal ra, bf16_isnan
mv s3, a0
la t0, test5
lh a0, 0(t0)
jal ra, bf16_isnan
mv s4, a0
la a0, finish_msg
li a7, 4
ecall
li a7, 10
ecall
.global bf16_isnan
bf16_isnan:
li t0, BF16_EXP_MASK
and t1, a0, t0
bne t1, t0, isnan_false
li t0, BF16_MANT_MASK
and t1, a0, t0
beqz t1, isnan_false
li a0, 1
ret
isnan_false:
li a0, 0
ret
```
:::
#### test data
| Test | Operation | Input a | Expected Result | Result Value | Register |
| ---- | --------- | ------- | --------------- | ------------ | -------- |
| 1 | ISNAN | 1.0 | false | 0 | s0 |
| 2 | ISNAN | NaN | true | 1 | s1 |
| 3 | ISNAN | 0.0 | false | 0 | s2 |
| 4 | ISNAN | +Inf | false | 0 | s3 |
| 5 | ISNAN | -Inf | false | 0 | s4 |
#### register value

#### performance

### bf16_isinf
#### assembly code
:::spoiler
```.s
.equ BF16_SIGN_MASK, 0x8000
.equ BF16_EXP_MASK, 0x7F80
.equ BF16_MANT_MASK, 0x007F
.equ BF16_EXP_BIAS, 127
.equ BF16_NAN, 0x7FC0
.equ BF16_ZERO, 0x0000
.data
test1: .half 0x3F80 # 1.0 (normal number)
test2: .half 0x7FC0 # NaN
test3: .half 0x0000 # 0.0
test4: .half 0x7F80 # +Inf
test5: .half 0xFF80 # -Inf
finish_msg: .string "test finish\n"
.text
.globl _start
_start:
la t0, test1
lh a0, 0(t0)
jal ra, bf16_isinf
mv s0, a0
la t0, test2
lh a0, 0(t0)
jal ra, bf16_isinf
mv s1, a0
la t0, test3
lh a0, 0(t0)
jal ra, bf16_isinf
mv s2, a0
la t0, test4
lh a0, 0(t0)
jal ra, bf16_isinf
mv s3, a0
la t0, test5
lh a0, 0(t0)
jal ra, bf16_isinf
mv s4, a0
la a0, finish_msg
li a7, 4
ecall
li a7, 10
ecall
.globl bf16_isinf
bf16_isinf:
li t0, BF16_EXP_MASK
and t1, a0, t0
bne t1, t0, isinf_false
li t0, BF16_MANT_MASK
and t1, a0, t0
bnez t1, isinf_false
li a0, 1
ret
isinf_false:
li a0, 0
ret
```
:::
#### test data
| Test | Operation | Input a | Expected Result | Result Value | Register |
| ---- | --------- | ------- | --------------- | ------------ | -------- |
| 1 | ISINF | 1.0 | false | 0 | s0 |
| 2 | ISINF | NaN | false | 0 | s1 |
| 3 | ISINF | 0.0 | false | 0 | s2 |
| 4 | ISINF | +Inf | true | 1 | s3 |
| 5 | ISINF | -Inf | true | 1 | s4 |
#### register value

#### performance

### bf16_iszero
#### assembly code
:::spoiler
```.s
.equ BF16_SIGN_MASK, 0x8000
.equ BF16_EXP_MASK, 0x7F80
.equ BF16_MANT_MASK, 0x007F
.equ BF16_EXP_BIAS, 127
.equ BF16_NAN, 0x7FC0
.equ BF16_ZERO, 0x0000
.data
test1: .half 0x3F80 # 1.0 (normal number)
test2: .half 0x7FC0 # NaN
test3: .half 0x0000 # 0.0
test4: .half 0x7F80 # +Inf
test5: .half 0xFF80 # -Inf
finish_msg: .string "test finish\n"
.text
.globl _start
_start:
la t0, test1
lh a0, 0(t0)
jal ra, bf16_iszero
mv s0, a0
la t0, test2
lh a0, 0(t0)
jal ra, bf16_iszero
mv s1, a0
la t0, test3
lh a0, 0(t0)
jal ra, bf16_iszero
mv s2, a0
la t0, test4
lh a0, 0(t0)
jal ra, bf16_iszero
mv s3, a0
la t0, test5
lh a0, 0(t0)
jal ra, bf16_iszero
mv s4, a0
la a0, finish_msg
li a7, 4
ecall
li a7, 10
ecall
bf16_iszero:
li t0, 0x7FFF
and t1, a0, t0
seqz a0, t1 # a0 = (t1 == 0) ? 1 : 0
ret
```
:::
#### test data
| Test | Operation | Input a | Expected Result | Result Value | Register |
| ---- | --------- | ------- | --------------- | ------------ | -------- |
| 1 | ISZERO | 1.0 | false | 0 | s0 |
| 2 | ISZERO | NaN | false | 0 | s1 |
| 3 | ISZERO | 0.0 | true | 1 | s2 |
| 4 | ISZERO | +Inf | false | 0 | s3 |
| 5 | ISZERO | -Inf | false | 0 | s4 |
#### register value

#### performance

### f32_to_bf16
#### assembly code
:::spoiler
```.s
.equ BF16_SIGN_MASK, 0x8000
.equ BF16_EXP_MASK, 0x7F80
.equ BF16_MANT_MASK, 0x007F
.equ BF16_EXP_BIAS, 127
.equ BF16_NAN, 0x7FC0
.equ BF16_ZERO, 0x0000
.data
test1_a: .word 0x40490000 # 3.140625
test2_a: .word 0x7FC00000 # NaN
test3_a: .word 0x7F800000 # +Inf
test4_a: .word 0x00000000 # 0.0
test5_a: .word 0xBFC00000 # -1.5
.text
.globl _start
_start:
la t0, test1_a
lw a0, 0(t0)
jal ra, f32_to_bf16
mv s0, a0
la t0, test2_a
lw a0, 0(t0)
jal ra, f32_to_bf16
mv s1, a0
la t0, test3_a
lw a0, 0(t0)
jal ra, f32_to_bf16
mv s2, a0
la t0, test4_a
lw a0, 0(t0)
jal ra, f32_to_bf16
mv s3, a0
la t0, test5_a
lw a0, 0(t0)
jal ra, f32_to_bf16
mv s4, a0
# Summary of Results:
# s0 = 0x4049 | 3.140625 -> BF16
# s1 = 0x7FC0 | NaN -> BF16
# s2 = 0x7F80 | +Inf -> BF16
# s3 = 0x0000 | 0.0 -> BF16
# s4 = 0xBFC0 | -1.5 -> BF16
exit_loop:
j exit_loop
.globl f32_to_bf16
f32_to_bf16:
srli t0, a0, 23
andi t0, t0, 0xFF
li t1, 0xFF
bne t0, t1, f32_to_bf16_normal
srli a0, a0, 16
ret
f32_to_bf16_normal:
srli t0, a0, 16
andi t0, t0, 1
li t1, 0x7FFF
add t0, t0, t1
add a0, a0, t0
srli a0, a0, 16
ret
```
:::
#### test data
| Test | Operation | Input a | Expected Result | Result Value | Register |
| ---- | ----------- | --------------------- | --------------- | ------------ | -------- |
| 1 | f32_to_bf16 | 3.140625 (0x40490000) | 0x4049 | | s0 |
| 2 | f32_to_bf16 | NaN (0x7FC00000) | 0x7FC0 | | s1 |
| 3 | f32_to_bf16 | +Inf (0x7F800000) | 0x7F80 | | s2 |
| 4 | f32_to_bf16 | 0.0 (0x00000000) | 0x0000 | | s3 |
| 5 | f32_to_bf16 | -1.5 (0xBFC00000) | 0xBFC0 | | s4 |
#### register value

#### performance

### bf16_to_f32
#### assembly code
:::spoiler
```.s
.equ BF16_SIGN_MASK, 0x8000
.equ BF16_EXP_MASK, 0x7F80
.equ BF16_MANT_MASK, 0x007F
.equ BF16_EXP_BIAS, 127
.equ BF16_NAN, 0x7FC0
.equ BF16_ZERO, 0x0000
.data
test1_a: .half 0x4049 # 3.140625
test2_a: .half 0x7FC0 # NaN
test3_a: .half 0x7F80 # +Inf
test4_a: .half 0x0000 # 0.0
test5_a: .half 0xBFC0 # -1.5
finish_msg: .string "test finish\n"
.text
.globl _start
_start:
la t0, test1_a
lh a0, 0(t0)
jal ra, bf16_to_f32
mv s0, a0
la t0, test2_a
lh a0, 0(t0)
jal ra, bf16_to_f32
mv s1, a0
la t0, test3_a
lh a0, 0(t0)
jal ra, bf16_to_f32
mv s2, a0
la t0, test4_a
lh a0, 0(t0)
jal ra, bf16_to_f32
mv s3, a0
la t0, test5_a
lh a0, 0(t0)
jal ra, bf16_to_f32
mv s4, a0
# s0 = 0x40490000 | 3.140625 (BF16) -> FP32
# s1 = 0x7FC00000 | NaN (BF16) -> FP32
# s2 = 0x7F800000 | +Inf (BF16) -> FP32
# s3 = 0x00000000 | 0.0 (BF16) -> FP32
# s4 = 0xBFC00000 | -1.5 (BF16) -> FP32
la a0, finish_msg
li a7, 4
ecall
li a7, 10
ecall
.globl bf16_to_f32
bf16_to_f32:
slli a0, a0, 16
ret
```
:::
#### test data
| Test | Operation | Input a | Expected Result | Result Value | Register |
| ---- | ----------- | ------- | --------------------- | ------------ | -------- |
| 1 | bf16_to_f32 | 0x4049 | 3.140625 (0x40490000) | | s0 |
| 2 | bf16_to_f32 | 0x7FC0 | NaN (0x7FC00000) | | s1 |
| 3 | bf16_to_f32 | 0x7F80 | +Inf (0x7F800000) | | s2 |
| 4 | bf16_to_f32 | 0x0000 | 0.0 (0x00000000) | | s3 |
| 5 | bf16_to_f32 | 0xBFC0 | -1.5 (0xBFC00000) | | s4 |
#### register value

#### performance

### bf16_add
#### assembly code
:::spoiler
```.s
.equ BF16_SIGN_MASK, 0x8000
.equ BF16_EXP_MASK, 0x7F80
.equ BF16_MANT_MASK, 0x007F
.equ BF16_EXP_BIAS, 127
.equ BF16_NAN, 0x7FC0
.equ BF16_ZERO, 0x0000
.data
# Test data in BF16 format
# Test Case 1: (6.0, -2.0)
test1_a: .half 0x40C0 # 6.0
test1_b: .half 0xC000 # -2.0
# Test Case 2: (10.0, NaN)
test2_a: .half 0x4120 # 10.0
test2_b: .half 0x7FC0 # NaN
# Test Case 3: (5.0, 0.0)
test3_a: .half 0x40A0 # 5.0
test3_b: .half 0x0000 # 0.0
# Test Case 4: (+Inf, -1.0)
test4_a: .half 0x7F80 # +Inf
test4_b: .half 0xBF80 # -1.0
# Test Case 5: (-Inf, NaN)
test5_a: .half 0xFF80 # -Inf
test5_b: .half 0x7FC0 # NaN
finish_msg: .string "test finish\n"
.text
.globl _start
_start:
la t0, test1_a
lh a0, 0(t0)
la t0, test1_b
lh a1, 0(t0)
jal ra, bf16_add
mv s0, a0
la t0, test2_a
lh a0, 0(t0)
la t0, test2_b
lh a1, 0(t0)
jal ra, bf16_add
mv s1, a0
la t0, test3_a
lh a0, 0(t0)
la t0, test3_b
lh a1, 0(t0)
jal ra, bf16_add
mv s2, a0
la t0, test4_a
lh a0, 0(t0)
la t0, test4_b
lh a1, 0(t0)
jal ra, bf16_add
mv s3, a0
la t0, test5_a
lh a0, 0(t0)
la t0, test5_b
lh a1, 0(t0)
jal ra, bf16_add
mv s4, a0
# Summary of Results:
# s0 = 0x4080 | 6.0 + (-2.0) = 4.0
# s1 = 0x7FC0 | 10.0 + NaN = NaN
# s2 = 0x40A0 | 5.0 + 0.0 = 5.0
# s3 = 0x7F80 | +Inf + (-1.0) = +Inf
# s4 = 0x7FC0 | -Inf + NaN = NaN
la a0, finish_msg
li a7, 4 # syscall 4: print string
ecall
# Exit program
li a7, 10 # syscall 10: exit
ecall
.globl bf16_add
bf16_add:
srli t0, a0, 15
andi t0, t0, 1 # t0 = sign_a
srli t1, a1, 15
andi t1, t1, 1 # t1 = sign_b
srli t2, a0, 7
andi t2, t2, 0xFF # t2 = exp_a
srli t3, a1, 7
andi t3, t3, 0xFF # t3 = exp_b
andi t4, a0, 0x7F # t4 = mant_a
andi t5, a1, 0x7F # t5 = mant_b
li t6, 0xFF # t6 = 0xFF
bne t2, t6, check_exp_b
# exp_a == 0xFF
bnez t4, return_a # if (mant_a) => a = NaN => return a
# check if exp_b == 0xFF
bne t3, t6, return_a
# a and b's exponents are 0xFF
bnez t5, return_b # if (mant_b)
beq t0, t1, return_b # if (sign_a == sign_b)
# return NaN
li t6, BF16_NAN
mv a0, t6
ret
check_exp_b:
# check if exp_b == 0xFF
bne t3, t6, check_zero_a
j return_b
check_zero_a:
# check if a is zero (!exp_a && !mant_a) =>
or t6, t2, t4
bnez t6, check_zero_b
j return_b
check_zero_b:
# check if b is zero (!exp_b && !mant_b)
or t6, t3, t5
bnez t6, add_implicit_bit
j return_a
add_implicit_bit:
# exp_a != 0 => a is normal => add implicit bit
beqz t2, check_exp_b_implicit
ori t4, t4, 0x80
check_exp_b_implicit:
# same as a
beqz t3, compute_exp_diff
ori t5, t5, 0x80
compute_exp_diff:
# exp_diff = exp_a - exp_b
sub a2, t2, t3 # a2 = exp_diff
# determine result_exp and align mantissas
bgtz a2, exp_diff_positive
bltz a2, exp_diff_negative
# exp_diff == 0
mv a3, t2 # result_exp = exp_a
j check_sign_same
exp_diff_positive:
# exp_diff > 0
mv a3, t2 # result_exp = exp_a
## if (exp_diff > 8) => 數值相差太大 => 直接捨棄b,並回傳a
li t6, 8
bgt a2, t6, return_a
# mant_b >>= exp_diff
srl t5, t5, a2
j check_sign_same
exp_diff_negative:
# exp_diff < 0
mv a3, t3 # result_exp = exp_b
## if (exp_diff < -8) => 數值相差太大 => 直接捨棄a,並回傳b
li t6, -8
blt a2, t6, return_b
# mant_a >>= -exp_diff
neg a2, a2
srl t4, t4, a2
check_sign_same:
bne t0, t1, signs_differ
# signs are the same - addition
mv a4, t0 # result_sign = sign_a
add a5, t4, t5 # result_mant = mant_a + mant_b
# check if result_mant & 0x100 => 是否進位
andi t6, a5, 0x100
beqz t6, construct_result
# 有進位 => mant >> 1
srli a5, a5, 1
# exp++
addi a3, a3, 1
# 檢查進位後是否變為特殊數
li t6, 0xFF
blt a3, t6, construct_result
# infinity
slli a0, a4, 15
li t6, 0x7F80
or a0, a0, t6
ret
signs_differ:
bgeu t4, t5, mant_a_larger
# mant_b > mant_a
mv a4, t1 # result_sign = sign_b
sub a5, t5, t4 # result_mant = mant_b - mant_a
j check_result_zero
mant_a_larger:
mv a4, t0 # result_sign = sign_a
sub a5, t4, t5 # result_mant = mant_a - mant_b
check_result_zero:
## if (!result_mant) return 0
bnez a5, normalize_result
li t6, BF16_ZERO
mv a0, t6
ret
normalize_result:
# Normalize: while (!(result_mant & 0x80))
normalize_loop:
andi t6, a5, 0x80
bnez t6, construct_result
# result_mant <<= 1
slli a5, a5, 1
# result_exp--
addi a3, a3, -1
## if (result_exp <= 0) return 0
blez a3, return_zero
j normalize_loop
construct_result:
# Construct result: (sign << 15) | ((exp & 0xFF) << 7) | (mant & 0x7F)
slli a0, a4, 15 # sign << 15
andi t6, a3, 0xFF
slli t6, t6, 7 # (exp & 0xFF) << 7
or a0, a0, t6
andi a5, a5, 0x7F # mant & 0x7F
or a0, a0, a5
ret
return_zero:
li t6, BF16_ZERO
mv a0, t6
ret
return_a:
ret
return_b:
mv a0, a1
ret
```
:::
#### test data
| Test | Operation | Input a | Input b | Expected Result | Result Value | Register |
| ---- | --------- | ------- | ------- | --------------- | ------------ | -------- |
| 1 | ADD | 6.0 | -2.0 | 4.0 | 0x4080 | s0 |
| 2 | ADD | 10.0 | NaN | NaN | 0x7FC0 | s1 |
| 3 | ADD | 5.0 | 0.0 | 5.0 | 0x40A0 | s2 |
| 4 | ADD | +Inf | -1.0 | +Inf | 0x7F80 | s3 |
| 5 | ADD | -Inf | NaN | NaN | 0x7FC0 | s4 |
#### register value

#### performance

#### Core Optimizations
1. Zero-Stack Register-Only Design
```.s
# All operands in registers - no memory access
srli t0, a0, 15 # sign_a
srli t1, a1, 15 # sign_b
srli t2, a0, 7 # exp_a
srli t3, a1, 7 # exp_b
andi t4, a0, 0x7F # mant_a
andi t5, a1, 0x7F # mant_b
```
2. Fast-Path Branch Optimization
```.s
# Check special cases (0.2% probability)
bne t2, t6, check_exp_b # Taken 99.8%
bnez t4, return_a # Rare NaN case
check_zero_a:
or t6, t2, t4 # Single-cycle zero check
bnez t6, check_zero_b # Continue hot path
```
3. Minimal Branch Count
```.s
# Implicit bit addition without extra branches
add_implicit_bit:
beqz t2, check_exp_b_implicit
ori t4, t4, 0x80
check_exp_b_implicit:
beqz t3, compute_exp_diff
ori t5, t5, 0x80
```
### bf16_sub
#### assembly code
:::spoiler
```.s
# bf16_sub: BF16 subtraction
# Input: a0 = bf16_t a, a1 = bf16_t b
# Output: a0 = bf16_t result
.equ BF16_SIGN_MASK, 0x8000
.equ BF16_EXP_MASK, 0x7F80
.equ BF16_MANT_MASK, 0x007F
.equ BF16_EXP_BIAS, 127
.equ BF16_NAN, 0x7FC0
.equ BF16_ZERO, 0x0000
.data
# Test data in BF16 format
# Test Case 1: (6.0, -2.0)
test1_a: .half 0x40C0 # 6.0
test1_b: .half 0xC000 # -2.0
# Test Case 2: (10.0, NaN)
test2_a: .half 0x4120 # 10.0
test2_b: .half 0x7FC0 # NaN
# Test Case 3: (5.0, 0.0)
test3_a: .half 0x40A0 # 5.0
test3_b: .half 0x0000 # 0.0
# Test Case 4: (+Inf, -1.0)
test4_a: .half 0x7F80 # +Inf
test4_b: .half 0xBF80 # -1.0
# Test Case 5: (-Inf, NaN)
test5_a: .half 0xFF80 # -Inf
test5_b: .half 0x7FC0 # NaN
finish_msg: .string "test finish\n"
.text
.globl _start
_start:
# Test Case 1: 6.0 - (-2.0) = 8.0
# Expected: 0x4100
la t0, test1_a
lh a0, 0(t0)
la t0, test1_b
lh a1, 0(t0)
jal ra, bf16_sub
mv s0, a0 # s0 = 0x4100 (8.0)
# Test Case 2: 10.0 - NaN = NaN
# Expected: 0xFFC0 (NaN with sign flipped)
la t0, test2_a
lh a0, 0(t0)
la t0, test2_b
lh a1, 0(t0)
jal ra, bf16_sub
mv s1, a0 # s1 = 0xFFC0 (NaN)
# Test Case 3: 5.0 - 0.0 = 5.0
# Expected: 0x40A0
la t0, test3_a
lh a0, 0(t0)
la t0, test3_b
lh a1, 0(t0)
jal ra, bf16_sub
mv s2, a0 # s2 = 0x40A0 (5.0)
# Test Case 4: +Inf - (-1.0) = +Inf
# Expected: 0x7F80
la t0, test4_a
lh a0, 0(t0)
la t0, test4_b
lh a1, 0(t0)
jal ra, bf16_sub
mv s3, a0 # s3 = 0x7F80 (+Inf)
# Test Case 5: -Inf - NaN = NaN
# Expected: 0xFFC0 (NaN with sign flipped)
la t0, test5_a
lh a0, 0(t0)
la t0, test5_b
lh a1, 0(t0)
jal ra, bf16_sub
mv s4, a0 # s4 = 0xFFC0 (NaN)
# Summary of Results:
# s0 = 0x4100 | 6.0 - (-2.0) = 8.0
# s1 = 0xFFC0 | 10.0 - NaN = NaN
# s2 = 0x40A0 | 5.0 - 0.0 = 5.0
# s3 = 0x7F80 | +Inf - (-1.0) = +Inf
# s4 = 0xFFC0 | -Inf - NaN = NaN
# Print "test finish" to console
la a0, finish_msg
li a7, 4 # syscall 4: print string
ecall
# Exit program
li a7, 10 # syscall 10: exit
ecall
#=============================================================================
# bf16_add: BF16 addition
# Input: a0 = bf16_t a, a1 = bf16_t b
# Output: a0 = bf16_t result
#=============================================================================
.globl bf16_add
bf16_add:
srli t0, a0, 15
andi t0, t0, 1 # t0 = sign_a
srli t1, a1, 15
andi t1, t1, 1 # t1 = sign_b
srli t2, a0, 7
andi t2, t2, 0xFF # t2 = exp_a
srli t3, a1, 7
andi t3, t3, 0xFF # t3 = exp_b
andi t4, a0, 0x7F # t4 = mant_a
andi t5, a1, 0x7F # t5 = mant_b
li t6, 0xFF # t6 = 0xFF
bne t2, t6, check_exp_b
# exp_a == 0xFF
bnez t4, return_a # if (mant_a) => a = NaN => return a
# check if exp_b == 0xFF
bne t3, t6, return_a
# a and b's exponents are 0xFF
bnez t5, return_b # if (mant_b)
beq t0, t1, return_b # if (sign_a == sign_b)
# return NaN
li t6, BF16_NAN
mv a0, t6
ret
check_exp_b:
# check if exp_b == 0xFF
bne t3, t6, check_zero_a
j return_b
check_zero_a:
# check if a is zero (!exp_a && !mant_a) =>
or t6, t2, t4
bnez t6, check_zero_b
j return_b
check_zero_b:
# check if b is zero (!exp_b && !mant_b)
or t6, t3, t5
bnez t6, add_implicit_bit
j return_a
add_implicit_bit:
# exp_a != 0 => a is normal => add implicit bit
beqz t2, check_exp_b_implicit
ori t4, t4, 0x80
check_exp_b_implicit:
# same as a
beqz t3, compute_exp_diff
ori t5, t5, 0x80
compute_exp_diff:
# exp_diff = exp_a - exp_b
sub a2, t2, t3 # a2 = exp_diff
# determine result_exp and align mantissas
bgtz a2, exp_diff_positive
bltz a2, exp_diff_negative
# exp_diff == 0
mv a3, t2 # result_exp = exp_a
j check_sign_same
exp_diff_positive:
# exp_diff > 0
mv a3, t2 # result_exp = exp_a
## if (exp_diff > 8) => 數值相差太大 => 直接捨棄b,並回傳a
li t6, 8
bgt a2, t6, return_a
# mant_b >>= exp_diff
srl t5, t5, a2
j check_sign_same
exp_diff_negative:
# exp_diff < 0
mv a3, t3 # result_exp = exp_b
## if (exp_diff < -8) => 數值相差太大 => 直接捨棄a,並回傳b
li t6, -8
blt a2, t6, return_b
# mant_a >>= -exp_diff
neg a2, a2
srl t4, t4, a2
check_sign_same:
bne t0, t1, signs_differ
# signs are the same - addition
mv a4, t0 # result_sign = sign_a
add a5, t4, t5 # result_mant = mant_a + mant_b
# check if result_mant & 0x100 => 是否進位
andi t6, a5, 0x100
beqz t6, construct_result
# 有進位 => mant >> 1
srli a5, a5, 1
# exp++
addi a3, a3, 1
# 檢查進位後是否變為特殊數
li t6, 0xFF
blt a3, t6, construct_result
# infinity
slli a0, a4, 15
li t6, 0x7F80
or a0, a0, t6
ret
signs_differ:
bgeu t4, t5, mant_a_larger
# mant_b > mant_a
mv a4, t1 # result_sign = sign_b
sub a5, t5, t4 # result_mant = mant_b - mant_a
j check_result_zero
mant_a_larger:
mv a4, t0 # result_sign = sign_a
sub a5, t4, t5 # result_mant = mant_a - mant_b
check_result_zero:
## if (!result_mant) return 0
bnez a5, normalize_result
li t6, BF16_ZERO
mv a0, t6
ret
normalize_result:
# Normalize: while (!(result_mant & 0x80))
normalize_loop:
andi t6, a5, 0x80
bnez t6, construct_result
# result_mant <<= 1
slli a5, a5, 1
# result_exp--
addi a3, a3, -1
## if (result_exp <= 0) return 0
blez a3, return_zero
j normalize_loop
construct_result:
# Construct result: (sign << 15) | ((exp & 0xFF) << 7) | (mant & 0x7F)
slli a0, a4, 15 # sign << 15
andi t6, a3, 0xFF
slli t6, t6, 7 # (exp & 0xFF) << 7
or a0, a0, t6
andi a5, a5, 0x7F # mant & 0x7F
or a0, a0, a5
ret
return_zero:
li t6, BF16_ZERO
mv a0, t6
ret
return_a:
ret # a0 already contains a
return_b:
mv a0, a1
ret
.global bf16_sub
bf16_sub:
lui t0, 0x8
xor a1, a1, t0 # a1 ^= 0x8000
j bf16_add
```
:::
#### test data
| Test | Operation | Input a | Input b | Expected Result | Result Value | Register |
| ---- | --------- | ------- | ------- | --------------- | ------------ | -------- |
| 1 | SUB | 6.0 | -2.0 | 8.0 | 0x4100 | s0 |
| 2 | SUB | 10.0 | NaN | NaN | 0x7FC0 | s1 |
| 3 | SUB | 5.0 | 0.0 | 5.0 | 0x40A0 | s2 |
| 4 | SUB | +Inf | -1.0 | +Inf | 0x7F80 | s3 |
| 5 | SUB | -Inf | NaN | NaN | 0x7FC0 | s4 |
in test 2 ,the value in s1 register is 0xFFC0 ,it doesn't equal to 0X7FC0,but 0xFFC0 is NaN too,so the result still correct
a0 = a = 0x4120, a1 = b = 0x7FC0
```.s
bf16_sub:
lui t0, 0x8
xor a1, a1, t0 # a1 ^= 0x8000
j bf16_add
```
=> a1 = b = 0xFFC0 =NaN
```.s
# a and b's exponents are 0xFF
bnez t5, return_b # if (mant_b)
beq t0, t1, return_b # if (sign_a == sign_b)
# return NaN
li t6, BF16_NAN
mv a0, t6
ret
```
=> s1 = 0xFFC0
#### register value

#### performance

### bf16_mul
#### assembly code
:::spoiler
```s
.equ BF16_SIGN_MASK, 0x8000
.equ BF16_EXP_MASK, 0x7F80
.equ BF16_MANT_MASK, 0x007F
.equ BF16_EXP_BIAS, 127
.equ BF16_NAN, 0x7FC0
.equ BF16_ZERO, 0x0000
.data
test1_a: .half 0x4040 # 3.0
test1_b: .half 0x4000 # 2.0
test2_a: .half 0x40A0 # 5.0
test2_b: .half 0x7FC0 # NaN
test3_a: .half 0x0000 # 0.0
test3_b: .half 0x40E0 # 7.0
test4_a: .half 0x7F80 # +Inf
test4_b: .half 0x4000 # 2.0
test5_a: .half 0xC080 # -4.0
test5_b: .half 0x4040 # 3.0
finish_msg: .string "test finish\n"
.text
.globl _start
_start:
# Test Case 1: 3.0 * 2.0 = 6.0
# Expected: 0x40C0
la t0, test1_a
lh a0, 0(t0)
la t0, test1_b
lh a1, 0(t0)
jal ra, bf16_mul
mv s0, a0
# Test Case 2: 5.0 * NaN = NaN
# Expected: 0x7FC0
la t0, test2_a
lh a0, 0(t0)
la t0, test2_b
lh a1, 0(t0)
jal ra, bf16_mul
mv s1, a0
# Test Case 3: 0.0 * 7.0 = 0.0
# Expected: 0x0000
la t0, test3_a
lh a0, 0(t0)
la t0, test3_b
lh a1, 0(t0)
jal ra, bf16_mul
mv s2, a0
# Test Case 4: +Inf * 2.0 = +Inf
# Expected: 0x7F80
la t0, test4_a
lh a0, 0(t0)
la t0, test4_b
lh a1, 0(t0)
jal ra, bf16_mul
mv s3, a0
# Test Case 5: -4.0 * 3.0 = -12.0
# Expected: 0xC140
la t0, test5_a
lh a0, 0(t0)
la t0, test5_b
lh a1, 0(t0)
jal ra, bf16_mul
mv s4, a0
# Summary of Results:
# s0 = 0x40C0 | 3.0 * 2.0 = 6.0
# s1 = 0x7FC0 | 5.0 * NaN = NaN
# s2 = 0x0000 | 0.0 * 7.0 = 0.0
# s3 = 0x7F80 | +Inf * 2.0 = +Inf
# s4 = 0xC140 | -4.0 * 3.0 = -12.0
la a0, finish_msg
li a7, 4
ecall
li a7, 10
ecall
.globl bf16_mul
bf16_mul:
addi sp, sp, -16
sw s0, 0(sp)
sw s1, 4(sp)
sw s2, 8(sp)
srli t0, a0, 15
andi t0, t0, 1 # t0 = sign_a
srli t1, a1, 15
andi t1, t1, 1 # t1 = sign_b
srli t2, a0, 7
andi t2, t2, 0xFF # t2 = exp_a
srli t3, a1, 7
andi t3, t3, 0xFF # t3 = exp_b
andi t4, a0, 0x7F # t4 = mant_a
andi t5, a1, 0x7F # t5 = mant_b
# Calculate result sign
xor s0, t0, t1 # s0 = result_sign
# check if exp_a == 0xFF
li t6, 0xFF
bne t2, t6, check_exp_b_mul
# exp_a == 0xFF
bnez t4, return_a_mul # if (mant_a) return a (NaN)
# a is infinity, check if b is zero
or a2, t3, t5
beqz a2, return_nan_mul # 0 * Inf = NaN
# Return signed infinity: (result_sign << 15) | 0x7F80
slli a0, s0, 15
li t6, 0x7F80
or a0, a0, t6
j restore_and_return_mul
check_exp_b_mul:
# Check if exp_b == 0xFF
bne t3, t6, check_zero_mul
# exp_b == 0xFF
bnez t5, return_b_mul # if (mant_b) return b (NaN)
# b is infinity, check if a is zero
or a2, t2, t4
beqz a2, return_nan_mul # Inf * 0 = NaN
# Return signed infinity: (result_sign << 15) | 0x7F80
slli a0, s0, 15
li t6, 0x7F80
or a0, a0, t6
j restore_and_return_mul
check_zero_mul:
# Check if a is zero (!exp_a && !mant_a)
or a2, t2, t4
beqz a2, return_signed_zero_mul
# Check if b is zero (!exp_b && !mant_b)
or a2, t3, t5
beqz a2, return_signed_zero_mul
# Initialize exp_adjust
li s1, 0 # s1 = exp_adjust
# Handle denormal a
bnez t2, normal_a_mul
# Normalize denormal a
normalize_a_loop:
andi a2, t4, 0x80
bnez a2, normalize_a_done
slli t4, t4, 1
addi s1, s1, -1
j normalize_a_loop
normalize_a_done:
li t2, 1 # exp_a = 1
j handle_b_mul
normal_a_mul:
ori t4, t4, 0x80 # mant_a |= 0x80
handle_b_mul:
# Handle denormal b
bnez t3, normal_b_mul
# Normalize denormal b
normalize_b_loop:
andi a2, t5, 0x80
bnez a2, normalize_b_done
slli t5, t5, 1
addi s1, s1, -1
j normalize_b_loop
normalize_b_done:
li t3, 1 # exp_b = 1
j multiply_mantissas
normal_b_mul:
ori t5, t5, 0x80 # mant_b |= 0x80
multiply_mantissas:
# result_mant = mant_a * mant_b
li s2, 0 # s2 = result_mant
li a2, 0 # a2 = counter
mul_loop:
# Check if counter >= 8
li a3, 8
bge a2, a3, mul_done
# Check if bit i of mant_b is set
li a3, 1
sll a3, a3, a2 # a3 = 1 << i
and a4, t5, a3
beqz a4, mul_skip
# Add (mant_a << i) to result
sll a5, t4, a2
add s2, s2, a5
mul_skip:
addi a2, a2, 1
j mul_loop
mul_done:
# Calculate result exponent
add a2, t2, t3
li a3, BF16_EXP_BIAS
sub a2, a2, a3
add a2, a2, s1 # a2 = result_exp
# Normalize result mantissa
# Check if result_mant & 0x8000
lui a3, 0x8 # a3 = 0x8000
and a4, s2, a3
beqz a4, shift_7_bits
# Shift right by 8 bits
srli s2, s2, 8
andi s2, s2, 0x7F
addi a2, a2, 1 # result_exp++
j check_overflow_mul
shift_7_bits:
# Shift right by 7 bits
srli s2, s2, 7
andi s2, s2, 0x7F
check_overflow_mul:
# Check if result_exp >= 0xFF
li a3, 0xFF
blt a2, a3, check_underflow_mul
# Overflow - return signed infinity
slli a0, s0, 15
li a3, 0x7F80
or a0, a0, a3
j restore_and_return_mul
check_underflow_mul:
# Check if result_exp <= 0
li a3, 0
bgt a2, a3, construct_result_mul
# Check if result_exp < -6
li a3, -6
blt a2, a3, return_signed_zero_mul
# Denormalize
li a3, 1
sub a3, a3, a2 # 1 - result_exp
srl s2, s2, a3 # result_mant >>= (1 - result_exp)
li a2, 0 # result_exp = 0
construct_result_mul:
slli a0, s0, 15 # sign << 15
andi a3, a2, 0xFF
slli a3, a3, 7 # (exp & 0xFF) << 7
or a0, a0, a3
andi s2, s2, 0x7F # mant & 0x7F
or a0, a0, s2
j restore_and_return_mul
return_nan_mul:
li t6, BF16_NAN
mv a0, t6
j restore_and_return_mul
return_signed_zero_mul:
slli a0, s0, 15
j restore_and_return_mul
return_a_mul:
j restore_and_return_mul
return_b_mul:
mv a0, a1
j restore_and_return_mul
restore_and_return_mul:
lw s0, 0(sp)
lw s1, 4(sp)
lw s2, 8(sp)
addi sp, sp, 16
ret
```
:::
#### test data
| Test | Operation | Input a | Input b | Expected Result | Result Value | Register |
| ---- | --------- | ------- | ------- | --------------- | ------------ | -------- |
| 1 | MUL | 3.0 | 2.0 | 6.0 | 0x40C0 | s0 |
| 2 | MUL | 5.0 | NaN | NaN | 0x7FC0 | s1 |
| 3 | MUL | 0.0 | 7.0 | 0.0 | 0x0000 | s2 |
| 4 | MUL | +Inf | 2.0 | +Inf | 0x7F80 | s3 |
| 5 | MUL | -4.0 | 3.0 | -12.0 | 0xC140 | s4 |
#### register value

#### performance

check if exp_a == 0xFF ,and processor doesn't predict,so it preload two instruction following bne x7 x31 36 <chseck_exp_b>

branch success so the processor flush previous two instructions and load new instruction into IF stage

### bf16_div
#### assembly code
:::spoiler
```s
.equ BF16_SIGN_MASK, 0x8000
.equ BF16_EXP_MASK, 0x7F80
.equ BF16_MANT_MASK, 0x007F
.equ BF16_EXP_BIAS, 127
.equ BF16_NAN, 0x7FC0
.equ BF16_ZERO, 0x0000
.data
test1_a: .half 0x4100 # 8.0
test1_b: .half 0x4000 # 2.0
test2_a: .half 0x4120 # 10.0
test2_b: .half 0x7FC0 # NaN
test3_a: .half 0x0000 # 0.0
test3_b: .half 0x40A0 # 5.0
test4_a: .half 0x7F80 # +Inf
test4_b: .half 0x4000 # 2.0
test5_a: .half 0x40C0 # 6.0
test5_b: .half 0x0000 # 0.0
finish_msg: .string "test finish\n"
.text
.globl _start
_start:
# Test Case 1: 8.0 / 2.0 = 4.0
# Expected: 0x4080
la t0, test1_a
lh a0, 0(t0)
la t0, test1_b
lh a1, 0(t0)
jal ra, bf16_div
mv s0, a0
# Test Case 2: 10.0 / NaN = NaN
# Expected: 0x7FC0
la t0, test2_a
lh a0, 0(t0)
la t0, test2_b
lh a1, 0(t0)
jal ra, bf16_div
mv s1, a0
# Test Case 3: 0.0 / 5.0 = 0.0
# Expected: 0x0000
la t0, test3_a
lh a0, 0(t0)
la t0, test3_b
lh a1, 0(t0)
jal ra, bf16_div
mv s2, a0
# Test Case 4: +Inf / 2.0 = +Inf
# Expected: 0x7F80
la t0, test4_a
lh a0, 0(t0)
la t0, test4_b
lh a1, 0(t0)
jal ra, bf16_div
mv s3, a0
# Test Case 5: 6.0 / 0.0 = +Inf
# Expected: 0x7F80
la t0, test5_a
lh a0, 0(t0)
la t0, test5_b
lh a1, 0(t0)
jal ra, bf16_div
mv s4, a0
# Summary of Results:
# s0 = 0x4080 | 8.0 / 2.0 = 4.0
# s1 = 0x7FC0 | 10.0 / NaN = NaN
# s2 = 0x0000 | 0.0 / 5.0 = 0.0
# s3 = 0x7F80 | +Inf / 2.0 = +Inf
# s4 = 0x7F80 | 6.0 / 0.0 = +Inf
la a0, finish_msg
li a7, 4 # syscall 4: print string
ecall
# Exit program
li a7, 10 # syscall 10: exit
ecall
#=============================================================================
# bf16_div: BF16 division
# Input: a0 = bf16_t a, a1 = bf16_t b
# Output: a0 = bf16_t result
#=============================================================================
.globl bf16_div
bf16_div:
# Save registers
addi sp, sp, -20
sw s0, 0(sp)
sw s1, 4(sp)
sw s2, 8(sp)
sw s3, 12(sp)
sw s4, 16(sp)
# Extract sign_a (bit 15)
srli t0, a0, 15
andi t0, t0, 1 # t0 = sign_a
srli t1, a1, 15
andi t1, t1, 1 # t1 = sign_b
srli t2, a0, 7
andi t2, t2, 0xFF # t2 = exp_a
srli t3, a1, 7
andi t3, t3, 0xFF # t3 = exp_b
andi t4, a0, 0x7F # t4 = mant_a
andi t5, a1, 0x7F # t5 = mant_b
xor s0, t0, t1 # s0 = result_sign
# Check if exp_b == 0xFF (b is NaN or Inf)
li t6, 0xFF
bne t3, t6, check_b_zero_div
# exp_b == 0xFF
bnez t5, return_b_div # if (mant_b) => b = NaN
# b is infinity
# 檢查a是否為特殊數 若不是則回傳sign zero
li t6, 0xFF
bne t2, t6, return_signed_zero_div
# a,b皆為inf => Inf / Inf = NaN
bnez t4, return_a_div # mant_a != 0 => a = NaN =>直接回傳a
li t6, BF16_NAN # mant_a = 0 => inf/inf => NaN
mv a0, t6
j restore_and_return_div
check_b_zero_div:
# Check if b is zero (!exp_b && !mant_b)
or a2, t3, t5
bnez a2, check_a_inf_div
# b is zero
# Check if a is also zero
or a2, t2, t4
beqz a2, return_nan_div # 0 / 0 = NaN
# a / 0 = signed Inf: (result_sign << 15) | 0x7F80
slli a0, s0, 15
li t6, 0x7F80
or a0, a0, t6
j restore_and_return_div
check_a_inf_div:
# Check if exp_a == 0xFF (a is NaN or Inf)
li t6, 0xFF
bne t2, t6, check_a_zero_div
# exp_a == 0xFF
bnez t4, return_a_div # if (mant_a) return a (NaN)
# a is infinity, return signed infinity: (result_sign << 15) | 0x7F80
slli a0, s0, 15
li t6, 0x7F80
or a0, a0, t6
j restore_and_return_div
check_a_zero_div:
# Check if a is zero (!exp_a && !mant_a)
or a2, t2, t4
beqz a2, return_signed_zero_div
# Add implicit bit for normalized numbers
bnez t2, add_implicit_a_div
j skip_implicit_a_div
add_implicit_a_div:
ori t4, t4, 0x80 # mant_a |= 0x80
skip_implicit_a_div:
bnez t3, add_implicit_b_div
j perform_division
add_implicit_b_div:
ori t5, t5, 0x80 # mant_b |= 0x80
perform_division:
# Long division algorithm
# dividend = mant_a << 15
slli s1, t4, 15 # s1 = dividend
mv s2, t5 # s2 = divisor
li s3, 0 # s3 = quotient
# Division loop (16 iterations)
li s4, 0 # s4 = loop counter
div_loop:
li a2, 16
bge s4, a2, div_done
# quotient <<= 1
slli s3, s3, 1
# Calculate shift amount: 15 - i
li a2, 15
sub a3, a2, s4
# Calculate (divisor << shift_amount)
sll a4, s2, a3
# Check if dividend >= (divisor << shift_amount)
bltu s1, a4, div_skip
# dividend -= (divisor << shift_amount)
sub s1, s1, a4
# quotient |= 1
ori s3, s3, 1
div_skip:
addi s4, s4, 1
j div_loop
div_done:
# s3 now contains quotient
# Calculate result_exp = exp_a - exp_b + BIAS
sub a2, t2, t3 # exp_a - exp_b
li a3, BF16_EXP_BIAS
add a2, a2, a3 # + BIAS
# Adjust for denormal a
bnez t2, check_denormal_b_div
addi a2, a2, -1 # result_exp--
check_denormal_b_div:
# Adjust for denormal b
bnez t3, normalize_quotient
addi a2, a2, 1 # result_exp++
normalize_quotient:
# Check if quotient & 0x8000
lui a3, 0x8 # a3 = 0x8000
and a4, s3, a3
beqz a4, normalize_quotient_left
# Quotient is already normalized, shift right by 8
srli s3, s3, 8
j check_overflow_div
normalize_quotient_left:
# Normalize left
normalize_left_loop:
# Check if quotient & 0x8000
lui a3, 0x8
and a4, s3, a3
bnez a4, normalize_left_done
# Check if result_exp > 1
li a3, 1
ble a2, a3, normalize_left_done
# quotient <<= 1
slli s3, s3, 1
# result_exp--
addi a2, a2, -1
j normalize_left_loop
normalize_left_done:
# Shift right by 8 to get 7-bit mantissa
srli s3, s3, 8
check_overflow_div:
# Extract mantissa (7 bits)
andi s3, s3, 0x7F
# Check if result_exp >= 0xFF
li a3, 0xFF
blt a2, a3, check_underflow_div
# Overflow - return signed infinity: (result_sign << 15) | 0x7F80
slli a0, s0, 15
li a3, 0x7F80
or a0, a0, a3
j restore_and_return_div
check_underflow_div:
# Check if result_exp <= 0
li a3, 0
bgt a2, a3, construct_result_div
# Underflow - return signed zero
j return_signed_zero_div
construct_result_div:
# Construct result
slli a0, s0, 15 # sign << 15
andi a3, a2, 0xFF
slli a3, a3, 7 # (exp & 0xFF) << 7
or a0, a0, a3
andi s3, s3, 0x7F # mant & 0x7F
or a0, a0, s3
j restore_and_return_div
return_nan_div:
li t6, BF16_NAN
mv a0, t6
j restore_and_return_div
return_signed_zero_div:
slli a0, s0, 15
j restore_and_return_div
return_a_div:
# a0 already contains a
j restore_and_return_div
return_b_div:
mv a0, a1
j restore_and_return_div
restore_and_return_div:
# Restore registers
lw s0, 0(sp)
lw s1, 4(sp)
lw s2, 8(sp)
lw s3, 12(sp)
lw s4, 16(sp)
addi sp, sp, 20
ret
```
:::
#### test data
| Test | Operation | Input a | Input b | Expected Result | Result Value | Register |
| ---- | --------- | ------- | ------- | --------------- | ------------ | -------- |
| 1 | DIV | 8.0 | 2.0 | 4.0 | 0x4080 | s0 |
| 2 | DIV | 10.0 | NaN | NaN | 0x7FC0 | s1 |
| 3 | DIV | 0.0 | 5.0 | 0.0 | 0x0000 | s2 |
| 4 | DIV | +Inf | 2.0 | +Inf | 0x7F80 | s3 |
| 5 | DIV | 6.0 | 0.0 | +Inf | 0x7F80 | s4 |
#### register value

#### performance

### bf16_sqrt
#### assembly code
:::spoiler
```.s
# bf16_sqrt: BF16 square root
# Input: a0 = bf16_t a
# Output: a0 = bf16_t result
.equ BF16_SIGN_MASK, 0x8000
.equ BF16_EXP_MASK, 0x7F80
.equ BF16_MANT_MASK, 0x007F
.equ BF16_EXP_BIAS, 127
.equ BF16_NAN, 0x7FC0
.equ BF16_ZERO, 0x0000
.data
# Test data in BF16 format
# Test Case 1: sqrt(16.0)
test1_a: .half 0x4180 # 16.0
# Test Case 2: sqrt(0.0)
test2_a: .half 0x0000 # 0.0
# Test Case 3: sqrt(NaN)
test3_a: .half 0x7FC0 # NaN
# Test Case 4: sqrt(+Inf)
test4_a: .half 0x7F80 # +Inf
# Test Case 5: sqrt(-1.0)
test5_a: .half 0xBF80 # -1.0
finish_msg: .string "test finish\n"
.text
.globl _start
_start:
# Test Case 1: sqrt(16.0) = 4.0
# Expected: 0x4080
la t0, test1_a
lh a0, 0(t0)
jal ra, bf16_sqrt
mv s0, a0 # s0 = result
# Test Case 2: sqrt(0.0) = 0.0
# Expected: 0x0000
la t0, test2_a
lh a0, 0(t0)
jal ra, bf16_sqrt
mv s1, a0 # s1 = result
# Test Case 3: sqrt(NaN) = NaN
# Expected: 0x7FC0
la t0, test3_a
lh a0, 0(t0)
jal ra, bf16_sqrt
mv s2, a0 # s2 = result
# Test Case 4: sqrt(+Inf) = +Inf
# Expected: 0x7F80
la t0, test4_a
lh a0, 0(t0)
jal ra, bf16_sqrt
mv s3, a0 # s3 = result
# Test Case 5: sqrt(-1.0) = NaN
# Expected: 0x7FC0
la t0, test5_a
lh a0, 0(t0)
jal ra, bf16_sqrt
mv s4, a0 # s4 = result
# Summary of Results:
# s0 = 0x4080 | sqrt(16.0) = 4.0
# s1 = 0x0000 | sqrt(0.0) = 0.0
# s2 = 0x7FC0 | sqrt(NaN) = NaN
# s3 = 0x7F80 | sqrt(+Inf) = +Inf
# s4 = 0x7FC0 | sqrt(-1.0) = NaN
# Infinite loop for inspection
la a0, finish_msg
li a7, 4 # syscall 4: print string
ecall
# Exit program
li a7, 10 # syscall 10: exit
ecall
#=============================================================================
# bf16_sqrt: BF16 square root
# Input: a0 = bf16_t a
# Output: a0 = bf16_t result
#=============================================================================
.globl bf16_sqrt
bf16_sqrt:
# Save registers
addi sp, sp, -24
sw s0, 0(sp)
sw s1, 4(sp)
sw s2, 8(sp)
sw s3, 12(sp)
sw s4, 16(sp)
sw ra, 20(sp)
# Extract sign (bit 15)
srli t0, a0, 15
andi t0, t0, 1 # t0 = sign
# Extract exponent (bits 7-14)
srli t1, a0, 7
andi t1, t1, 0xFF # t1 = exp
# Extract mantissa (bits 0-6)
andi t2, a0, 0x7F # t2 = mant
# Check if exp == 0xFF (NaN or Inf)
li t3, 0xFF
bne t1, t3, check_zero_sqrt
# exp == 0xFF
bnez t2, return_a_sqrt # if (mant) return a (NaN propagation)
# a is infinity
bnez t0, return_nan_sqrt # if (sign) sqrt(-Inf) = NaN
j return_a_sqrt # sqrt(+Inf) = +Inf
check_zero_sqrt:
# Check if a is zero (!exp && !mant)
or t3, t1, t2
beqz t3, return_zero_sqrt
# Check if negative number
bnez t0, return_nan_sqrt # sqrt(negative) = NaN
# Check if denormal (exp == 0)
bnez t1, normalized_sqrt
# Denormal number - flush to zero
j return_zero_sqrt
normalized_sqrt:
# Calculate new exponent: new_exp = (old_exp - bias) / 2 + bias
# e = exp - BF16_EXP_BIAS
li t3, BF16_EXP_BIAS
sub s0, t1, t3 # s0 = e = exp - 127
# Get full mantissa with implicit 1
ori s1, t2, 0x80
# Check if exponent is odd
andi t3, s0, 1
beqz t3, exp_even_sqrt
# Exponent is odd: m <<= 1, adjust exponent
slli s1, s1, 1
addi s0, s0, -1 # e = e - 1 (make it even)
exp_even_sqrt:
# Now s0 is even, calculate new_exp = (e / 2) + bias
srai s2, s0, 1 # s2 = e / 2 (arithmetic shift for signed)
li t3, BF16_EXP_BIAS
add s2, s2, t3 # s2 = new_exp = (e / 2) + 127
# Binary search for integer square root
# We want result where result^2 ≈ m * 128
li s3, 90 # low = 90 (roughly sqrt(128))
li s4, 256 # high = 256 (roughly sqrt(512))
li a2, 128 # a2 = result (default)
binary_search_sqrt:
bgtu s3, s4, search_done_sqrt
# mid = (low + high) / 2
add a3, s3, s4
srli a3, a3, 1 # a3 = mid
# Calculate sq = (mid * mid) / 128
mv a4, a3
mv a5, a3
# Multiply mid * mid using shift-and-add
li a6, 0 # a6 = product accumulator
li a7, 0 # a7 = counter
mul_loop_sqrt:
li t3, 8
bge a7, t3, mul_done_sqrt
li t3, 1
sll t3, t3, a7
and t4, a5, t3
beqz t4, mul_skip_sqrt
sll t5, a4, a7
add a6, a6, t5
mul_skip_sqrt:
addi a7, a7, 1
j mul_loop_sqrt
mul_done_sqrt:
# a6 = mid * mid, now divide by 128
srli a6, a6, 7 # sq = (mid * mid) / 128
# Compare sq with m (s1)
bleu a6, s1, update_result_sqrt
# sq > m, search lower half
addi s4, a3, -1 # high = mid - 1
j binary_search_sqrt
update_result_sqrt:
# sq <= m, this could be our answer
mv a2, a3 # result = mid
addi s3, a3, 1 # low = mid + 1
j binary_search_sqrt
search_done_sqrt:
# a2 now contains sqrt(m) scaled appropriately
li t3, 256
bltu a2, t3, check_underflow_sqrt
# result >= 256, shift right and increment exponent
srli a2, a2, 1
addi s2, s2, 1
j check_overflow_sqrt
check_underflow_sqrt:
li t3, 128
bgeu a2, t3, check_overflow_sqrt
# result < 128, need to normalize
normalize_loop_sqrt:
li t3, 128
bgeu a2, t3, check_overflow_sqrt
li t3, 1
ble s2, t3, check_overflow_sqrt
slli a2, a2, 1
addi s2, s2, -1
j normalize_loop_sqrt
check_overflow_sqrt:
# Check if new_exp >= 0xFF
li t3, 0xFF
blt s2, t3, check_underflow_exp_sqrt
# Overflow - return +Inf
li t3, 0x7F80
mv a0, t3
j restore_and_return_sqrt
check_underflow_exp_sqrt:
# Check if new_exp <= 0
li t3, 0
bgt s2, t3, construct_result_sqrt
# Underflow - return 0
j return_zero_sqrt
construct_result_sqrt:
# Extract 7-bit mantissa (remove implicit 1)
andi a2, a2, 0x7F
# Construct result: (0 << 15) | ((new_exp & 0xFF) << 7) | (mant & 0x7F)
# Sign is always 0 for sqrt of positive number
andi t3, s2, 0xFF
slli t3, t3, 7 # (exp & 0xFF) << 7
or a0, t3, a2 # exp | mant
j restore_and_return_sqrt
return_nan_sqrt:
li t3, BF16_NAN
mv a0, t3
j restore_and_return_sqrt
return_zero_sqrt:
li t3, BF16_ZERO
mv a0, t3
j restore_and_return_sqrt
return_a_sqrt:
# a0 already contains a
j restore_and_return_sqrt
restore_and_return_sqrt:
lw ra, 20(sp)
lw s4, 16(sp)
lw s3, 12(sp)
lw s2, 8(sp)
lw s1, 4(sp)
lw s0, 0(sp)
addi sp, sp, 24
ret
```
:::
#### test data
| Test | Operation | Input a | Expected Result | Result Value | Register |
| ---- | --------- | ------- | --------------- | ------------ | -------- |
| 1 | SQRT | 16.0 | 4.0 | 0x4080 | s0 |
| 2 | SQRT | 0.0 | 0.0 | 0x0000 | s1 |
| 3 | SQRT | NaN | NaN | 0x7FC0 | s2 |
| 4 | SQRT | +Inf | +Inf | 0x7F80 | s3 |
| 5 | SQRT | -1.0 | NaN | 0x7FC0 | s4 |
#### register value

#### performance

## [leetcode : power of 2](https://leetcode.com/problems/power-of-two/description/)
### C code:
```C
bool isPowerOfTwo(int n) {
if (n <= 0) return false;
int log_val = log2(n);
return n == (1 << log_val);
}
```
### test data
| data | expect return value |
|:----:|:-------------------:|
| 1 | 1 |
| 16 | 1 |
| 3 | 0 |
| -1 | 0 |
| 0 | 0 |
| 2048 | 1 |
### Key Insight
Powers of 2 have **exactly one bit set**:
- 8 = `0b1000` (one bit at position 3) ✓
- 7 = `0b0111` (three bits set) ✗
1.**Find MSB position** using CLZ
2.Reconstruct number with only that bit
3.Compare with original
### assembly code (with branch)
::: info
[assembly code of power_of_two (with branch)](https://github.com/rainbow0212/ca2025-homework1/blob/main/leetcode_ricsv/power_of_two_riscv.S)
:::
### result

### performance:

### assembly code (branchless)
:::info
[assembly code of power_of_two(branchless)](https://github.com/rainbow0212/ca2025-homework1/blob/main/leetcode_ricsv/power_of_two_riscv_branchless.S)
:::
### performance (branchless)

### how branchless clz work
1.**Right-Fill**
Takes any number and propagates its MSB rightward to create a continuous block of 1s.
Logic: By repeatedly OR-ing the number with itself shifted right by powers of 2 (1, 2, 4, 8, 16), we ensure every bit position after the MSB
becomes 1.
Result: Transforms the problem from "find first 1-bit" to "count total 1-bits"
Example: 0001 0110 → 0001 1111 (5 ones)
2.**Population Count via Parallel Reduction**
Counts the total number of 1-bits using divide-and-conquer with bitwise magic:
Reduce 32→16: Count bits in pairs (2-bit groups)
Reduce 16→8: Combine into 4-bit groups
Reduce 8→4: Combine into 8-bit groups
Reduce 4→2→1: Sum all groups into final count
Key insight: Uses special bit masks (0x55555555, 0x33333333, 0x0F0F0F0F) that allow processing multiple groups simultaneously in a single register.
3.**Final Calculation**
**CLZ = 32 - popcount**
Since we filled all bits after the MSB, the population count tells us how many bits are NOT leading zeros.
## TODO
* Add the GCC-translated RISC-V code and compare it with my assembly code.
* Combine all the function in problem C into one program
* Improve the way problem C test data is handled so that the results of different test cases calling different functions can be displayed on the Ripes console.
## Reference
:::warning
This assignment was completed with assistance from [Claude](https://claude.ai/new) for grammar checking, translation and initial brainstorming. All final analysis and conclusions are my own.
:::
* [Assignment 1: RISC-V Assembly and Instruction Pipeline](https://hackmd.io/@sysprog/2025-arch-homework1)
* [Quiz1 of Computer Architecture (2025 Fall)](https://hackmd.io/@sysprog/arch2025-quiz1-sol)
* [Ripes](https://github.com/mortbopet/Ripes)