# Computer Architecture : homework1 :::warning use 5 stage processor in ripes to simulate ::: ## Problem B ### C code ```c #include <stdbool.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> typedef uint8_t uf8; static inline unsigned clz(uint32_t x) { int n = 32, c = 16; do { uint32_t y = x >> c; if (y) { n -= c; x = y; } c >>= 1; } while (c); return n - x; } /* Decode uf8 to uint32_t */ uint32_t uf8_decode(uf8 fl) { uint32_t mantissa = fl & 0x0f; uint8_t exponent = fl >> 4; uint32_t offset = (0x7FFF >> (15 - exponent)) << 4; return (mantissa << exponent) + offset; } /* Encode uint32_t to uf8 */ uf8 uf8_encode(uint32_t value) { /* Use CLZ for fast exponent calculation */ if (value < 16) return value; /* Find appropriate exponent using CLZ hint */ int lz = clz(value); int msb = 31 - lz; /* Start from a good initial guess */ uint8_t exponent = 0; uint32_t overflow = 0; if (msb >= 5) { /* Estimate exponent - the formula is empirical */ exponent = msb - 4; if (exponent > 15) exponent = 15; /* Calculate overflow for estimated exponent */ for (uint8_t e = 0; e < exponent; e++) overflow = (overflow << 1) + 16; /* Adjust if estimate was off */ while (exponent > 0 && value < overflow) { overflow = (overflow - 16) >> 1; exponent--; } } /* Find exact exponent */ while (exponent < 15) { uint32_t next_overflow = (overflow << 1) + 16; if (value < next_overflow) break; overflow = next_overflow; exponent++; } uint8_t mantissa = (value - overflow) >> exponent; return (exponent << 4) | mantissa; } /* Test encode/decode round-trip */ static bool test(void) { int32_t previous_value = -1; bool passed = true; for (int i = 0; i < 256; i++) { uint8_t fl = i; int32_t value = uf8_decode(fl); uint8_t fl2 = uf8_encode(value); if (fl != fl2) { printf("%02x: produces value %d but encodes back to %02x\n", fl, value, fl2); passed = false; } if (value <= previous_value) { printf("%02x: value %d <= previous_value %d\n", fl, value, previous_value); passed = false; } previous_value = value; } return passed; } int main(void) { if (test()) { printf("All tests passed.\n"); return 0; } return 1; } ``` :::info [assembly code of problem B](https://github.com/rainbow0212/ca2025-homework1/blob/main/problemB/problem%20B.S) ::: #### performance ![problemB per](https://hackmd.io/_uploads/HkLKAxK6ee.png) #### core Optimizations 1. Binary Search CLZ Implementation ```.s clz: li t1, 32 # n = 32 li t2, 16 # c = 16 (initial step) clz_loop: beqz t2, clz_done srl t3, a0, t2 # y = x >> c beqz t3, clz_no_shift sub t1, t1, t2 # n -= c (found bits) mv a0, t3 # x = y (continue with upper bits) clz_no_shift: srli t2, t2, 1 # c >>= 1 (halve step size) j clz_loop ``` 2.Direct Decode Without Branching ```.s uf8_decode: andi t0, a0, 0x0f # mantissa (4 bits) srli t1, a0, 4 # exponent (4 bits) # Calculate offset: (0x7FFF >> (15-exp)) << 4 li t2, 15 sub t2, t2, t1 # 15 - exponent li t3, 0x7FFF srl t3, t3, t2 # Dynamic shift slli t3, t3, 4 # Align to mantissa sll t0, t0, t1 # mantissa << exponent add a0, t0, t3 # result = scaled_mantissa + offset ret ``` * Zero branches - completely linear execution * 8 instructions total for any input * Uses dynamic shift to calculate logarithmic offset 3. Adaptive Exponent Calculation in Encode ```.s uf8_encode: mv s0, a0 # Save input value li t0, 16 blt s0, t0, encode_small # Fast path: value < 16 jal clz li t0, 31 sub s1, t0, a0 # msb = 31 - clz(value) # Determine initial exponent li t0, 5 blt s1, t0, encode_exp_zero addi t1, s1, -4 # exponent = msb - 4 ``` * Small values (< 16): Identity encoding (1 instruction) * Medium values: Direct exponent from MSB * Large values: Clamped to max exponent (15) ## Problem C ### C code ```c #include <stdbool.h> #include <stdint.h> #include <string.h> typedef struct { uint16_t bits; } bf16_t; #define BF16_SIGN_MASK 0x8000U #define BF16_EXP_MASK 0x7F80U #define BF16_MANT_MASK 0x007FU #define BF16_EXP_BIAS 127 #define BF16_NAN() ((bf16_t) {.bits = 0x7FC0}) #define BF16_ZERO() ((bf16_t) {.bits = 0x0000}) static inline bool bf16_isnan(bf16_t a) { return ((a.bits & BF16_EXP_MASK) == BF16_EXP_MASK) && (a.bits & BF16_MANT_MASK); } static inline bool bf16_isinf(bf16_t a) { return ((a.bits & BF16_EXP_MASK) == BF16_EXP_MASK) && !(a.bits & BF16_MANT_MASK); } static inline bool bf16_iszero(bf16_t a) { return !(a.bits & 0x7FFF); } static inline bf16_t f32_to_bf16(float val) { uint32_t f32bits; memcpy(&f32bits, &val, sizeof(float)); if (((f32bits >> 23) & 0xFF) == 0xFF) return (bf16_t) {.bits = (f32bits >> 16) & 0xFFFF}; f32bits += ((f32bits >> 16) & 1) + 0x7FFF; return (bf16_t) {.bits = f32bits >> 16}; } static inline float bf16_to_f32(bf16_t val) { uint32_t f32bits = ((uint32_t) val.bits) << 16; float result; memcpy(&result, &f32bits, sizeof(float)); return result; } static inline bf16_t bf16_add(bf16_t a, bf16_t b) { uint16_t sign_a = (a.bits >> 15) & 1; uint16_t sign_b = (b.bits >> 15) & 1; int16_t exp_a = ((a.bits >> 7) & 0xFF); int16_t exp_b = ((b.bits >> 7) & 0xFF); uint16_t mant_a = a.bits & 0x7F; uint16_t mant_b = b.bits & 0x7F; if (exp_a == 0xFF) { if (mant_a) return a; if (exp_b == 0xFF) return (mant_b || sign_a == sign_b) ? b : BF16_NAN(); return a; } if (exp_b == 0xFF) return b; if (!exp_a && !mant_a) return b; if (!exp_b && !mant_b) return a; if (exp_a) mant_a |= 0x80; if (exp_b) mant_b |= 0x80; int16_t exp_diff = exp_a - exp_b; uint16_t result_sign; int16_t result_exp; uint32_t result_mant; if (exp_diff > 0) { result_exp = exp_a; if (exp_diff > 8) return a; mant_b >>= exp_diff; } else if (exp_diff < 0) { result_exp = exp_b; if (exp_diff < -8) return b; mant_a >>= -exp_diff; } else { result_exp = exp_a; } if (sign_a == sign_b) { result_sign = sign_a; result_mant = (uint32_t) mant_a + mant_b; if (result_mant & 0x100) { result_mant >>= 1; if (++result_exp >= 0xFF) return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; } } else { if (mant_a >= mant_b) { result_sign = sign_a; result_mant = mant_a - mant_b; } else { result_sign = sign_b; result_mant = mant_b - mant_a; } if (!result_mant) return BF16_ZERO(); while (!(result_mant & 0x80)) { result_mant <<= 1; if (--result_exp <= 0) return BF16_ZERO(); } } return (bf16_t) { .bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) | (result_mant & 0x7F), }; } static inline bf16_t bf16_sub(bf16_t a, bf16_t b) { b.bits ^= BF16_SIGN_MASK; return bf16_add(a, b); } static inline bf16_t bf16_mul(bf16_t a, bf16_t b) { uint16_t sign_a = (a.bits >> 15) & 1; uint16_t sign_b = (b.bits >> 15) & 1; int16_t exp_a = ((a.bits >> 7) & 0xFF); int16_t exp_b = ((b.bits >> 7) & 0xFF); uint16_t mant_a = a.bits & 0x7F; uint16_t mant_b = b.bits & 0x7F; uint16_t result_sign = sign_a ^ sign_b; if (exp_a == 0xFF) { if (mant_a) return a; if (!exp_b && !mant_b) return BF16_NAN(); return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; } if (exp_b == 0xFF) { if (mant_b) return b; if (!exp_a && !mant_a) return BF16_NAN(); return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; } if ((!exp_a && !mant_a) || (!exp_b && !mant_b)) return (bf16_t) {.bits = result_sign << 15}; int16_t exp_adjust = 0; if (!exp_a) { while (!(mant_a & 0x80)) { mant_a <<= 1; exp_adjust--; } exp_a = 1; } else mant_a |= 0x80; if (!exp_b) { while (!(mant_b & 0x80)) { mant_b <<= 1; exp_adjust--; } exp_b = 1; } else mant_b |= 0x80; uint32_t result_mant = (uint32_t) mant_a * mant_b; int32_t result_exp = (int32_t) exp_a + exp_b - BF16_EXP_BIAS + exp_adjust; if (result_mant & 0x8000) { result_mant = (result_mant >> 8) & 0x7F; result_exp++; } else result_mant = (result_mant >> 7) & 0x7F; if (result_exp >= 0xFF) return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; if (result_exp <= 0) { if (result_exp < -6) return (bf16_t) {.bits = result_sign << 15}; result_mant >>= (1 - result_exp); result_exp = 0; } return (bf16_t) {.bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) | (result_mant & 0x7F)}; } static inline bf16_t bf16_div(bf16_t a, bf16_t b) { uint16_t sign_a = (a.bits >> 15) & 1; uint16_t sign_b = (b.bits >> 15) & 1; int16_t exp_a = ((a.bits >> 7) & 0xFF); int16_t exp_b = ((b.bits >> 7) & 0xFF); uint16_t mant_a = a.bits & 0x7F; uint16_t mant_b = b.bits & 0x7F; uint16_t result_sign = sign_a ^ sign_b; if (exp_b == 0xFF) { if (mant_b) return b; /* Inf/Inf = NaN */ if (exp_a == 0xFF && !mant_a) return BF16_NAN(); return (bf16_t) {.bits = result_sign << 15}; } if (!exp_b && !mant_b) { if (!exp_a && !mant_a) return BF16_NAN(); return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; } if (exp_a == 0xFF) { if (mant_a) return a; return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; } if (!exp_a && !mant_a) return (bf16_t) {.bits = result_sign << 15}; if (exp_a) mant_a |= 0x80; if (exp_b) mant_b |= 0x80; uint32_t dividend = (uint32_t) mant_a << 15; uint32_t divisor = mant_b; uint32_t quotient = 0; for (int i = 0; i < 16; i++) { quotient <<= 1; if (dividend >= (divisor << (15 - i))) { dividend -= (divisor << (15 - i)); quotient |= 1; } } int32_t result_exp = (int32_t) exp_a - exp_b + BF16_EXP_BIAS; if (!exp_a) result_exp--; if (!exp_b) result_exp++; if (quotient & 0x8000) quotient >>= 8; else { while (!(quotient & 0x8000) && result_exp > 1) { quotient <<= 1; result_exp--; } quotient >>= 8; } quotient &= 0x7F; if (result_exp >= 0xFF) return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; if (result_exp <= 0) return (bf16_t) {.bits = result_sign << 15}; return (bf16_t) {.bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) | (quotient & 0x7F)}; } static inline bf16_t bf16_sqrt(bf16_t a) { uint16_t sign = (a.bits >> 15) & 1; int16_t exp = ((a.bits >> 7) & 0xFF); uint16_t mant = a.bits & 0x7F; /* Handle special cases */ if (exp == 0xFF) { if (mant) return a; /* NaN propagation */ if (sign) return BF16_NAN(); /* sqrt(-Inf) = NaN */ return a; /* sqrt(+Inf) = +Inf */ } /* sqrt(0) = 0 (handle both +0 and -0) */ if (!exp && !mant) return BF16_ZERO(); /* sqrt of negative number is NaN */ if (sign) return BF16_NAN(); /* Flush denormals to zero */ if (!exp) return BF16_ZERO(); /* Direct bit manipulation square root algorithm */ /* For sqrt: new_exp = (old_exp - bias) / 2 + bias */ int32_t e = exp - BF16_EXP_BIAS; int32_t new_exp; /* Get full mantissa with implicit 1 */ uint32_t m = 0x80 | mant; /* Range [128, 256) representing [1.0, 2.0) */ /* Adjust for odd exponents: sqrt(2^odd * m) = 2^((odd-1)/2) * sqrt(2*m) */ if (e & 1) { m <<= 1; /* Double mantissa for odd exponent */ new_exp = ((e - 1) >> 1) + BF16_EXP_BIAS; } else { new_exp = (e >> 1) + BF16_EXP_BIAS; } /* Now m is in range [128, 256) or [256, 512) if exponent was odd */ /* Binary search for integer square root */ /* We want result where result^2 = m * 128 (since 128 represents 1.0) */ uint32_t low = 90; /* Min sqrt (roughly sqrt(128)) */ uint32_t high = 256; /* Max sqrt (roughly sqrt(512)) */ uint32_t result = 128; /* Default */ /* Binary search for square root of m */ while (low <= high) { uint32_t mid = (low + high) >> 1; uint32_t sq = (mid * mid) / 128; /* Square and scale */ if (sq <= m) { result = mid; /* This could be our answer */ low = mid + 1; } else { high = mid - 1; } } /* result now contains sqrt(m) * sqrt(128) / sqrt(128) = sqrt(m) */ /* But we need to adjust the scale */ /* Since m is scaled where 128=1.0, result should also be scaled same way */ /* Normalize to ensure result is in [128, 256) */ if (result >= 256) { result >>= 1; new_exp++; } else if (result < 128) { while (result < 128 && new_exp > 1) { result <<= 1; new_exp--; } } /* Extract 7-bit mantissa (remove implicit 1) */ uint16_t new_mant = result & 0x7F; /* Check for overflow/underflow */ if (new_exp >= 0xFF) return (bf16_t) {.bits = 0x7F80}; /* +Inf */ if (new_exp <= 0) return BF16_ZERO(); return (bf16_t) {.bits = ((new_exp & 0xFF) << 7) | new_mant}; } ``` **In problem C, I separate functions for individual testing to observe their performance ,and use 5 ** ### bf16_isnan #### assembly code :::spoiler ```.s .equ BF16_SIGN_MASK, 0x8000 .equ BF16_EXP_MASK, 0x7F80 .equ BF16_MANT_MASK, 0x007F .equ BF16_EXP_BIAS, 127 .equ BF16_NAN, 0x7FC0 .equ BF16_ZERO, 0x0000 .data test1: .half 0x3F80 # 1.0 (normal number) test2: .half 0x7FC0 # NaN test3: .half 0x0000 # 0.0 test4: .half 0x7F80 # +Inf test5: .half 0xFF80 # -Inf finish_msg: .string "test finish\n" .text .globl _start _start: la t0, test1 lh a0, 0(t0) jal ra, bf16_isnan mv s0, a0 la t0, test2 lh a0, 0(t0) jal ra, bf16_isnan mv s1, a0 la t0, test3 lh a0, 0(t0) jal ra, bf16_isnan mv s2, a0 la t0, test4 lh a0, 0(t0) jal ra, bf16_isnan mv s3, a0 la t0, test5 lh a0, 0(t0) jal ra, bf16_isnan mv s4, a0 la a0, finish_msg li a7, 4 ecall li a7, 10 ecall .global bf16_isnan bf16_isnan: li t0, BF16_EXP_MASK and t1, a0, t0 bne t1, t0, isnan_false li t0, BF16_MANT_MASK and t1, a0, t0 beqz t1, isnan_false li a0, 1 ret isnan_false: li a0, 0 ret ``` ::: #### test data | Test | Operation | Input a | Expected Result | Result Value | Register | | ---- | --------- | ------- | --------------- | ------------ | -------- | | 1 | ISNAN | 1.0 | false | 0 | s0 | | 2 | ISNAN | NaN | true | 1 | s1 | | 3 | ISNAN | 0.0 | false | 0 | s2 | | 4 | ISNAN | +Inf | false | 0 | s3 | | 5 | ISNAN | -Inf | false | 0 | s4 | #### register value ![isnan_reg](https://hackmd.io/_uploads/Ske_cktalx.png) #### performance ![isnan_clock](https://hackmd.io/_uploads/Bk6h5yYage.png) ### bf16_isinf #### assembly code :::spoiler ```.s .equ BF16_SIGN_MASK, 0x8000 .equ BF16_EXP_MASK, 0x7F80 .equ BF16_MANT_MASK, 0x007F .equ BF16_EXP_BIAS, 127 .equ BF16_NAN, 0x7FC0 .equ BF16_ZERO, 0x0000 .data test1: .half 0x3F80 # 1.0 (normal number) test2: .half 0x7FC0 # NaN test3: .half 0x0000 # 0.0 test4: .half 0x7F80 # +Inf test5: .half 0xFF80 # -Inf finish_msg: .string "test finish\n" .text .globl _start _start: la t0, test1 lh a0, 0(t0) jal ra, bf16_isinf mv s0, a0 la t0, test2 lh a0, 0(t0) jal ra, bf16_isinf mv s1, a0 la t0, test3 lh a0, 0(t0) jal ra, bf16_isinf mv s2, a0 la t0, test4 lh a0, 0(t0) jal ra, bf16_isinf mv s3, a0 la t0, test5 lh a0, 0(t0) jal ra, bf16_isinf mv s4, a0 la a0, finish_msg li a7, 4 ecall li a7, 10 ecall .globl bf16_isinf bf16_isinf: li t0, BF16_EXP_MASK and t1, a0, t0 bne t1, t0, isinf_false li t0, BF16_MANT_MASK and t1, a0, t0 bnez t1, isinf_false li a0, 1 ret isinf_false: li a0, 0 ret ``` ::: #### test data | Test | Operation | Input a | Expected Result | Result Value | Register | | ---- | --------- | ------- | --------------- | ------------ | -------- | | 1 | ISINF | 1.0 | false | 0 | s0 | | 2 | ISINF | NaN | false | 0 | s1 | | 3 | ISINF | 0.0 | false | 0 | s2 | | 4 | ISINF | +Inf | true | 1 | s3 | | 5 | ISINF | -Inf | true | 1 | s4 | #### register value ![reg](https://hackmd.io/_uploads/r1crJxtage.png) #### performance ![per](https://hackmd.io/_uploads/S1MIyxt6ee.png) ### bf16_iszero #### assembly code :::spoiler ```.s .equ BF16_SIGN_MASK, 0x8000 .equ BF16_EXP_MASK, 0x7F80 .equ BF16_MANT_MASK, 0x007F .equ BF16_EXP_BIAS, 127 .equ BF16_NAN, 0x7FC0 .equ BF16_ZERO, 0x0000 .data test1: .half 0x3F80 # 1.0 (normal number) test2: .half 0x7FC0 # NaN test3: .half 0x0000 # 0.0 test4: .half 0x7F80 # +Inf test5: .half 0xFF80 # -Inf finish_msg: .string "test finish\n" .text .globl _start _start: la t0, test1 lh a0, 0(t0) jal ra, bf16_iszero mv s0, a0 la t0, test2 lh a0, 0(t0) jal ra, bf16_iszero mv s1, a0 la t0, test3 lh a0, 0(t0) jal ra, bf16_iszero mv s2, a0 la t0, test4 lh a0, 0(t0) jal ra, bf16_iszero mv s3, a0 la t0, test5 lh a0, 0(t0) jal ra, bf16_iszero mv s4, a0 la a0, finish_msg li a7, 4 ecall li a7, 10 ecall bf16_iszero: li t0, 0x7FFF and t1, a0, t0 seqz a0, t1 # a0 = (t1 == 0) ? 1 : 0 ret ``` ::: #### test data | Test | Operation | Input a | Expected Result | Result Value | Register | | ---- | --------- | ------- | --------------- | ------------ | -------- | | 1 | ISZERO | 1.0 | false | 0 | s0 | | 2 | ISZERO | NaN | false | 0 | s1 | | 3 | ISZERO | 0.0 | true | 1 | s2 | | 4 | ISZERO | +Inf | false | 0 | s3 | | 5 | ISZERO | -Inf | false | 0 | s4 | #### register value ![reg](https://hackmd.io/_uploads/r12mkxKTle.png) #### performance ![performance](https://hackmd.io/_uploads/HkUEkxKpgg.png) ### f32_to_bf16 #### assembly code :::spoiler ```.s .equ BF16_SIGN_MASK, 0x8000 .equ BF16_EXP_MASK, 0x7F80 .equ BF16_MANT_MASK, 0x007F .equ BF16_EXP_BIAS, 127 .equ BF16_NAN, 0x7FC0 .equ BF16_ZERO, 0x0000 .data test1_a: .word 0x40490000 # 3.140625 test2_a: .word 0x7FC00000 # NaN test3_a: .word 0x7F800000 # +Inf test4_a: .word 0x00000000 # 0.0 test5_a: .word 0xBFC00000 # -1.5 .text .globl _start _start: la t0, test1_a lw a0, 0(t0) jal ra, f32_to_bf16 mv s0, a0 la t0, test2_a lw a0, 0(t0) jal ra, f32_to_bf16 mv s1, a0 la t0, test3_a lw a0, 0(t0) jal ra, f32_to_bf16 mv s2, a0 la t0, test4_a lw a0, 0(t0) jal ra, f32_to_bf16 mv s3, a0 la t0, test5_a lw a0, 0(t0) jal ra, f32_to_bf16 mv s4, a0 # Summary of Results: # s0 = 0x4049 | 3.140625 -> BF16 # s1 = 0x7FC0 | NaN -> BF16 # s2 = 0x7F80 | +Inf -> BF16 # s3 = 0x0000 | 0.0 -> BF16 # s4 = 0xBFC0 | -1.5 -> BF16 exit_loop: j exit_loop .globl f32_to_bf16 f32_to_bf16: srli t0, a0, 23 andi t0, t0, 0xFF li t1, 0xFF bne t0, t1, f32_to_bf16_normal srli a0, a0, 16 ret f32_to_bf16_normal: srli t0, a0, 16 andi t0, t0, 1 li t1, 0x7FFF add t0, t0, t1 add a0, a0, t0 srli a0, a0, 16 ret ``` ::: #### test data | Test | Operation | Input a | Expected Result | Result Value | Register | | ---- | ----------- | --------------------- | --------------- | ------------ | -------- | | 1 | f32_to_bf16 | 3.140625 (0x40490000) | 0x4049 | | s0 | | 2 | f32_to_bf16 | NaN (0x7FC00000) | 0x7FC0 | | s1 | | 3 | f32_to_bf16 | +Inf (0x7F800000) | 0x7F80 | | s2 | | 4 | f32_to_bf16 | 0.0 (0x00000000) | 0x0000 | | s3 | | 5 | f32_to_bf16 | -1.5 (0xBFC00000) | 0xBFC0 | | s4 | #### register value ![reg](https://hackmd.io/_uploads/SywMygY6eg.png) #### performance ![per](https://hackmd.io/_uploads/HyuWkgYpxl.png) ### bf16_to_f32 #### assembly code :::spoiler ```.s .equ BF16_SIGN_MASK, 0x8000 .equ BF16_EXP_MASK, 0x7F80 .equ BF16_MANT_MASK, 0x007F .equ BF16_EXP_BIAS, 127 .equ BF16_NAN, 0x7FC0 .equ BF16_ZERO, 0x0000 .data test1_a: .half 0x4049 # 3.140625 test2_a: .half 0x7FC0 # NaN test3_a: .half 0x7F80 # +Inf test4_a: .half 0x0000 # 0.0 test5_a: .half 0xBFC0 # -1.5 finish_msg: .string "test finish\n" .text .globl _start _start: la t0, test1_a lh a0, 0(t0) jal ra, bf16_to_f32 mv s0, a0 la t0, test2_a lh a0, 0(t0) jal ra, bf16_to_f32 mv s1, a0 la t0, test3_a lh a0, 0(t0) jal ra, bf16_to_f32 mv s2, a0 la t0, test4_a lh a0, 0(t0) jal ra, bf16_to_f32 mv s3, a0 la t0, test5_a lh a0, 0(t0) jal ra, bf16_to_f32 mv s4, a0 # s0 = 0x40490000 | 3.140625 (BF16) -> FP32 # s1 = 0x7FC00000 | NaN (BF16) -> FP32 # s2 = 0x7F800000 | +Inf (BF16) -> FP32 # s3 = 0x00000000 | 0.0 (BF16) -> FP32 # s4 = 0xBFC00000 | -1.5 (BF16) -> FP32 la a0, finish_msg li a7, 4 ecall li a7, 10 ecall .globl bf16_to_f32 bf16_to_f32: slli a0, a0, 16 ret ``` ::: #### test data | Test | Operation | Input a | Expected Result | Result Value | Register | | ---- | ----------- | ------- | --------------------- | ------------ | -------- | | 1 | bf16_to_f32 | 0x4049 | 3.140625 (0x40490000) | | s0 | | 2 | bf16_to_f32 | 0x7FC0 | NaN (0x7FC00000) | | s1 | | 3 | bf16_to_f32 | 0x7F80 | +Inf (0x7F800000) | | s2 | | 4 | bf16_to_f32 | 0x0000 | 0.0 (0x00000000) | | s3 | | 5 | bf16_to_f32 | 0xBFC0 | -1.5 (0xBFC00000) | | s4 | #### register value ![16to32reg](https://hackmd.io/_uploads/rk64glF6xl.png) #### performance ![16to32per](https://hackmd.io/_uploads/SyZLxgYaex.png) ### bf16_add #### assembly code :::spoiler ```.s .equ BF16_SIGN_MASK, 0x8000 .equ BF16_EXP_MASK, 0x7F80 .equ BF16_MANT_MASK, 0x007F .equ BF16_EXP_BIAS, 127 .equ BF16_NAN, 0x7FC0 .equ BF16_ZERO, 0x0000 .data # Test data in BF16 format # Test Case 1: (6.0, -2.0) test1_a: .half 0x40C0 # 6.0 test1_b: .half 0xC000 # -2.0 # Test Case 2: (10.0, NaN) test2_a: .half 0x4120 # 10.0 test2_b: .half 0x7FC0 # NaN # Test Case 3: (5.0, 0.0) test3_a: .half 0x40A0 # 5.0 test3_b: .half 0x0000 # 0.0 # Test Case 4: (+Inf, -1.0) test4_a: .half 0x7F80 # +Inf test4_b: .half 0xBF80 # -1.0 # Test Case 5: (-Inf, NaN) test5_a: .half 0xFF80 # -Inf test5_b: .half 0x7FC0 # NaN finish_msg: .string "test finish\n" .text .globl _start _start: la t0, test1_a lh a0, 0(t0) la t0, test1_b lh a1, 0(t0) jal ra, bf16_add mv s0, a0 la t0, test2_a lh a0, 0(t0) la t0, test2_b lh a1, 0(t0) jal ra, bf16_add mv s1, a0 la t0, test3_a lh a0, 0(t0) la t0, test3_b lh a1, 0(t0) jal ra, bf16_add mv s2, a0 la t0, test4_a lh a0, 0(t0) la t0, test4_b lh a1, 0(t0) jal ra, bf16_add mv s3, a0 la t0, test5_a lh a0, 0(t0) la t0, test5_b lh a1, 0(t0) jal ra, bf16_add mv s4, a0 # Summary of Results: # s0 = 0x4080 | 6.0 + (-2.0) = 4.0 # s1 = 0x7FC0 | 10.0 + NaN = NaN # s2 = 0x40A0 | 5.0 + 0.0 = 5.0 # s3 = 0x7F80 | +Inf + (-1.0) = +Inf # s4 = 0x7FC0 | -Inf + NaN = NaN la a0, finish_msg li a7, 4 # syscall 4: print string ecall # Exit program li a7, 10 # syscall 10: exit ecall .globl bf16_add bf16_add: srli t0, a0, 15 andi t0, t0, 1 # t0 = sign_a srli t1, a1, 15 andi t1, t1, 1 # t1 = sign_b srli t2, a0, 7 andi t2, t2, 0xFF # t2 = exp_a srli t3, a1, 7 andi t3, t3, 0xFF # t3 = exp_b andi t4, a0, 0x7F # t4 = mant_a andi t5, a1, 0x7F # t5 = mant_b li t6, 0xFF # t6 = 0xFF bne t2, t6, check_exp_b # exp_a == 0xFF bnez t4, return_a # if (mant_a) => a = NaN => return a # check if exp_b == 0xFF bne t3, t6, return_a # a and b's exponents are 0xFF bnez t5, return_b # if (mant_b) beq t0, t1, return_b # if (sign_a == sign_b) # return NaN li t6, BF16_NAN mv a0, t6 ret check_exp_b: # check if exp_b == 0xFF bne t3, t6, check_zero_a j return_b check_zero_a: # check if a is zero (!exp_a && !mant_a) => or t6, t2, t4 bnez t6, check_zero_b j return_b check_zero_b: # check if b is zero (!exp_b && !mant_b) or t6, t3, t5 bnez t6, add_implicit_bit j return_a add_implicit_bit: # exp_a != 0 => a is normal => add implicit bit beqz t2, check_exp_b_implicit ori t4, t4, 0x80 check_exp_b_implicit: # same as a beqz t3, compute_exp_diff ori t5, t5, 0x80 compute_exp_diff: # exp_diff = exp_a - exp_b sub a2, t2, t3 # a2 = exp_diff # determine result_exp and align mantissas bgtz a2, exp_diff_positive bltz a2, exp_diff_negative # exp_diff == 0 mv a3, t2 # result_exp = exp_a j check_sign_same exp_diff_positive: # exp_diff > 0 mv a3, t2 # result_exp = exp_a ## if (exp_diff > 8) => 數值相差太大 => 直接捨棄b,並回傳a li t6, 8 bgt a2, t6, return_a # mant_b >>= exp_diff srl t5, t5, a2 j check_sign_same exp_diff_negative: # exp_diff < 0 mv a3, t3 # result_exp = exp_b ## if (exp_diff < -8) => 數值相差太大 => 直接捨棄a,並回傳b li t6, -8 blt a2, t6, return_b # mant_a >>= -exp_diff neg a2, a2 srl t4, t4, a2 check_sign_same: bne t0, t1, signs_differ # signs are the same - addition mv a4, t0 # result_sign = sign_a add a5, t4, t5 # result_mant = mant_a + mant_b # check if result_mant & 0x100 => 是否進位 andi t6, a5, 0x100 beqz t6, construct_result # 有進位 => mant >> 1 srli a5, a5, 1 # exp++ addi a3, a3, 1 # 檢查進位後是否變為特殊數 li t6, 0xFF blt a3, t6, construct_result # infinity slli a0, a4, 15 li t6, 0x7F80 or a0, a0, t6 ret signs_differ: bgeu t4, t5, mant_a_larger # mant_b > mant_a mv a4, t1 # result_sign = sign_b sub a5, t5, t4 # result_mant = mant_b - mant_a j check_result_zero mant_a_larger: mv a4, t0 # result_sign = sign_a sub a5, t4, t5 # result_mant = mant_a - mant_b check_result_zero: ## if (!result_mant) return 0 bnez a5, normalize_result li t6, BF16_ZERO mv a0, t6 ret normalize_result: # Normalize: while (!(result_mant & 0x80)) normalize_loop: andi t6, a5, 0x80 bnez t6, construct_result # result_mant <<= 1 slli a5, a5, 1 # result_exp-- addi a3, a3, -1 ## if (result_exp <= 0) return 0 blez a3, return_zero j normalize_loop construct_result: # Construct result: (sign << 15) | ((exp & 0xFF) << 7) | (mant & 0x7F) slli a0, a4, 15 # sign << 15 andi t6, a3, 0xFF slli t6, t6, 7 # (exp & 0xFF) << 7 or a0, a0, t6 andi a5, a5, 0x7F # mant & 0x7F or a0, a0, a5 ret return_zero: li t6, BF16_ZERO mv a0, t6 ret return_a: ret return_b: mv a0, a1 ret ``` ::: #### test data | Test | Operation | Input a | Input b | Expected Result | Result Value | Register | | ---- | --------- | ------- | ------- | --------------- | ------------ | -------- | | 1 | ADD | 6.0 | -2.0 | 4.0 | 0x4080 | s0 | | 2 | ADD | 10.0 | NaN | NaN | 0x7FC0 | s1 | | 3 | ADD | 5.0 | 0.0 | 5.0 | 0x40A0 | s2 | | 4 | ADD | +Inf | -1.0 | +Inf | 0x7F80 | s3 | | 5 | ADD | -Inf | NaN | NaN | 0x7FC0 | s4 | #### register value ![addreg](https://hackmd.io/_uploads/B1If-lKaex.png) #### performance ![addper](https://hackmd.io/_uploads/Bk5QZeFpee.png) #### Core Optimizations 1. Zero-Stack Register-Only Design ```.s # All operands in registers - no memory access srli t0, a0, 15 # sign_a srli t1, a1, 15 # sign_b srli t2, a0, 7 # exp_a srli t3, a1, 7 # exp_b andi t4, a0, 0x7F # mant_a andi t5, a1, 0x7F # mant_b ``` 2. Fast-Path Branch Optimization ```.s # Check special cases (0.2% probability) bne t2, t6, check_exp_b # Taken 99.8% bnez t4, return_a # Rare NaN case check_zero_a: or t6, t2, t4 # Single-cycle zero check bnez t6, check_zero_b # Continue hot path ``` 3. Minimal Branch Count ```.s # Implicit bit addition without extra branches add_implicit_bit: beqz t2, check_exp_b_implicit ori t4, t4, 0x80 check_exp_b_implicit: beqz t3, compute_exp_diff ori t5, t5, 0x80 ``` ### bf16_sub #### assembly code :::spoiler ```.s # bf16_sub: BF16 subtraction # Input: a0 = bf16_t a, a1 = bf16_t b # Output: a0 = bf16_t result .equ BF16_SIGN_MASK, 0x8000 .equ BF16_EXP_MASK, 0x7F80 .equ BF16_MANT_MASK, 0x007F .equ BF16_EXP_BIAS, 127 .equ BF16_NAN, 0x7FC0 .equ BF16_ZERO, 0x0000 .data # Test data in BF16 format # Test Case 1: (6.0, -2.0) test1_a: .half 0x40C0 # 6.0 test1_b: .half 0xC000 # -2.0 # Test Case 2: (10.0, NaN) test2_a: .half 0x4120 # 10.0 test2_b: .half 0x7FC0 # NaN # Test Case 3: (5.0, 0.0) test3_a: .half 0x40A0 # 5.0 test3_b: .half 0x0000 # 0.0 # Test Case 4: (+Inf, -1.0) test4_a: .half 0x7F80 # +Inf test4_b: .half 0xBF80 # -1.0 # Test Case 5: (-Inf, NaN) test5_a: .half 0xFF80 # -Inf test5_b: .half 0x7FC0 # NaN finish_msg: .string "test finish\n" .text .globl _start _start: # Test Case 1: 6.0 - (-2.0) = 8.0 # Expected: 0x4100 la t0, test1_a lh a0, 0(t0) la t0, test1_b lh a1, 0(t0) jal ra, bf16_sub mv s0, a0 # s0 = 0x4100 (8.0) # Test Case 2: 10.0 - NaN = NaN # Expected: 0xFFC0 (NaN with sign flipped) la t0, test2_a lh a0, 0(t0) la t0, test2_b lh a1, 0(t0) jal ra, bf16_sub mv s1, a0 # s1 = 0xFFC0 (NaN) # Test Case 3: 5.0 - 0.0 = 5.0 # Expected: 0x40A0 la t0, test3_a lh a0, 0(t0) la t0, test3_b lh a1, 0(t0) jal ra, bf16_sub mv s2, a0 # s2 = 0x40A0 (5.0) # Test Case 4: +Inf - (-1.0) = +Inf # Expected: 0x7F80 la t0, test4_a lh a0, 0(t0) la t0, test4_b lh a1, 0(t0) jal ra, bf16_sub mv s3, a0 # s3 = 0x7F80 (+Inf) # Test Case 5: -Inf - NaN = NaN # Expected: 0xFFC0 (NaN with sign flipped) la t0, test5_a lh a0, 0(t0) la t0, test5_b lh a1, 0(t0) jal ra, bf16_sub mv s4, a0 # s4 = 0xFFC0 (NaN) # Summary of Results: # s0 = 0x4100 | 6.0 - (-2.0) = 8.0 # s1 = 0xFFC0 | 10.0 - NaN = NaN # s2 = 0x40A0 | 5.0 - 0.0 = 5.0 # s3 = 0x7F80 | +Inf - (-1.0) = +Inf # s4 = 0xFFC0 | -Inf - NaN = NaN # Print "test finish" to console la a0, finish_msg li a7, 4 # syscall 4: print string ecall # Exit program li a7, 10 # syscall 10: exit ecall #============================================================================= # bf16_add: BF16 addition # Input: a0 = bf16_t a, a1 = bf16_t b # Output: a0 = bf16_t result #============================================================================= .globl bf16_add bf16_add: srli t0, a0, 15 andi t0, t0, 1 # t0 = sign_a srli t1, a1, 15 andi t1, t1, 1 # t1 = sign_b srli t2, a0, 7 andi t2, t2, 0xFF # t2 = exp_a srli t3, a1, 7 andi t3, t3, 0xFF # t3 = exp_b andi t4, a0, 0x7F # t4 = mant_a andi t5, a1, 0x7F # t5 = mant_b li t6, 0xFF # t6 = 0xFF bne t2, t6, check_exp_b # exp_a == 0xFF bnez t4, return_a # if (mant_a) => a = NaN => return a # check if exp_b == 0xFF bne t3, t6, return_a # a and b's exponents are 0xFF bnez t5, return_b # if (mant_b) beq t0, t1, return_b # if (sign_a == sign_b) # return NaN li t6, BF16_NAN mv a0, t6 ret check_exp_b: # check if exp_b == 0xFF bne t3, t6, check_zero_a j return_b check_zero_a: # check if a is zero (!exp_a && !mant_a) => or t6, t2, t4 bnez t6, check_zero_b j return_b check_zero_b: # check if b is zero (!exp_b && !mant_b) or t6, t3, t5 bnez t6, add_implicit_bit j return_a add_implicit_bit: # exp_a != 0 => a is normal => add implicit bit beqz t2, check_exp_b_implicit ori t4, t4, 0x80 check_exp_b_implicit: # same as a beqz t3, compute_exp_diff ori t5, t5, 0x80 compute_exp_diff: # exp_diff = exp_a - exp_b sub a2, t2, t3 # a2 = exp_diff # determine result_exp and align mantissas bgtz a2, exp_diff_positive bltz a2, exp_diff_negative # exp_diff == 0 mv a3, t2 # result_exp = exp_a j check_sign_same exp_diff_positive: # exp_diff > 0 mv a3, t2 # result_exp = exp_a ## if (exp_diff > 8) => 數值相差太大 => 直接捨棄b,並回傳a li t6, 8 bgt a2, t6, return_a # mant_b >>= exp_diff srl t5, t5, a2 j check_sign_same exp_diff_negative: # exp_diff < 0 mv a3, t3 # result_exp = exp_b ## if (exp_diff < -8) => 數值相差太大 => 直接捨棄a,並回傳b li t6, -8 blt a2, t6, return_b # mant_a >>= -exp_diff neg a2, a2 srl t4, t4, a2 check_sign_same: bne t0, t1, signs_differ # signs are the same - addition mv a4, t0 # result_sign = sign_a add a5, t4, t5 # result_mant = mant_a + mant_b # check if result_mant & 0x100 => 是否進位 andi t6, a5, 0x100 beqz t6, construct_result # 有進位 => mant >> 1 srli a5, a5, 1 # exp++ addi a3, a3, 1 # 檢查進位後是否變為特殊數 li t6, 0xFF blt a3, t6, construct_result # infinity slli a0, a4, 15 li t6, 0x7F80 or a0, a0, t6 ret signs_differ: bgeu t4, t5, mant_a_larger # mant_b > mant_a mv a4, t1 # result_sign = sign_b sub a5, t5, t4 # result_mant = mant_b - mant_a j check_result_zero mant_a_larger: mv a4, t0 # result_sign = sign_a sub a5, t4, t5 # result_mant = mant_a - mant_b check_result_zero: ## if (!result_mant) return 0 bnez a5, normalize_result li t6, BF16_ZERO mv a0, t6 ret normalize_result: # Normalize: while (!(result_mant & 0x80)) normalize_loop: andi t6, a5, 0x80 bnez t6, construct_result # result_mant <<= 1 slli a5, a5, 1 # result_exp-- addi a3, a3, -1 ## if (result_exp <= 0) return 0 blez a3, return_zero j normalize_loop construct_result: # Construct result: (sign << 15) | ((exp & 0xFF) << 7) | (mant & 0x7F) slli a0, a4, 15 # sign << 15 andi t6, a3, 0xFF slli t6, t6, 7 # (exp & 0xFF) << 7 or a0, a0, t6 andi a5, a5, 0x7F # mant & 0x7F or a0, a0, a5 ret return_zero: li t6, BF16_ZERO mv a0, t6 ret return_a: ret # a0 already contains a return_b: mv a0, a1 ret .global bf16_sub bf16_sub: lui t0, 0x8 xor a1, a1, t0 # a1 ^= 0x8000 j bf16_add ``` ::: #### test data | Test | Operation | Input a | Input b | Expected Result | Result Value | Register | | ---- | --------- | ------- | ------- | --------------- | ------------ | -------- | | 1 | SUB | 6.0 | -2.0 | 8.0 | 0x4100 | s0 | | 2 | SUB | 10.0 | NaN | NaN | 0x7FC0 | s1 | | 3 | SUB | 5.0 | 0.0 | 5.0 | 0x40A0 | s2 | | 4 | SUB | +Inf | -1.0 | +Inf | 0x7F80 | s3 | | 5 | SUB | -Inf | NaN | NaN | 0x7FC0 | s4 | in test 2 ,the value in s1 register is 0xFFC0 ,it doesn't equal to 0X7FC0,but 0xFFC0 is NaN too,so the result still correct a0 = a = 0x4120, a1 = b = 0x7FC0 ```.s bf16_sub: lui t0, 0x8 xor a1, a1, t0 # a1 ^= 0x8000 j bf16_add ``` => a1 = b = 0xFFC0 =NaN ```.s # a and b's exponents are 0xFF bnez t5, return_b # if (mant_b) beq t0, t1, return_b # if (sign_a == sign_b) # return NaN li t6, BF16_NAN mv a0, t6 ret ``` => s1 = 0xFFC0 #### register value ![subreg](https://hackmd.io/_uploads/H1vN7lFpxe.png) #### performance ![subper](https://hackmd.io/_uploads/HJqU7eYpel.png) ### bf16_mul #### assembly code :::spoiler ```s .equ BF16_SIGN_MASK, 0x8000 .equ BF16_EXP_MASK, 0x7F80 .equ BF16_MANT_MASK, 0x007F .equ BF16_EXP_BIAS, 127 .equ BF16_NAN, 0x7FC0 .equ BF16_ZERO, 0x0000 .data test1_a: .half 0x4040 # 3.0 test1_b: .half 0x4000 # 2.0 test2_a: .half 0x40A0 # 5.0 test2_b: .half 0x7FC0 # NaN test3_a: .half 0x0000 # 0.0 test3_b: .half 0x40E0 # 7.0 test4_a: .half 0x7F80 # +Inf test4_b: .half 0x4000 # 2.0 test5_a: .half 0xC080 # -4.0 test5_b: .half 0x4040 # 3.0 finish_msg: .string "test finish\n" .text .globl _start _start: # Test Case 1: 3.0 * 2.0 = 6.0 # Expected: 0x40C0 la t0, test1_a lh a0, 0(t0) la t0, test1_b lh a1, 0(t0) jal ra, bf16_mul mv s0, a0 # Test Case 2: 5.0 * NaN = NaN # Expected: 0x7FC0 la t0, test2_a lh a0, 0(t0) la t0, test2_b lh a1, 0(t0) jal ra, bf16_mul mv s1, a0 # Test Case 3: 0.0 * 7.0 = 0.0 # Expected: 0x0000 la t0, test3_a lh a0, 0(t0) la t0, test3_b lh a1, 0(t0) jal ra, bf16_mul mv s2, a0 # Test Case 4: +Inf * 2.0 = +Inf # Expected: 0x7F80 la t0, test4_a lh a0, 0(t0) la t0, test4_b lh a1, 0(t0) jal ra, bf16_mul mv s3, a0 # Test Case 5: -4.0 * 3.0 = -12.0 # Expected: 0xC140 la t0, test5_a lh a0, 0(t0) la t0, test5_b lh a1, 0(t0) jal ra, bf16_mul mv s4, a0 # Summary of Results: # s0 = 0x40C0 | 3.0 * 2.0 = 6.0 # s1 = 0x7FC0 | 5.0 * NaN = NaN # s2 = 0x0000 | 0.0 * 7.0 = 0.0 # s3 = 0x7F80 | +Inf * 2.0 = +Inf # s4 = 0xC140 | -4.0 * 3.0 = -12.0 la a0, finish_msg li a7, 4 ecall li a7, 10 ecall .globl bf16_mul bf16_mul: addi sp, sp, -16 sw s0, 0(sp) sw s1, 4(sp) sw s2, 8(sp) srli t0, a0, 15 andi t0, t0, 1 # t0 = sign_a srli t1, a1, 15 andi t1, t1, 1 # t1 = sign_b srli t2, a0, 7 andi t2, t2, 0xFF # t2 = exp_a srli t3, a1, 7 andi t3, t3, 0xFF # t3 = exp_b andi t4, a0, 0x7F # t4 = mant_a andi t5, a1, 0x7F # t5 = mant_b # Calculate result sign xor s0, t0, t1 # s0 = result_sign # check if exp_a == 0xFF li t6, 0xFF bne t2, t6, check_exp_b_mul # exp_a == 0xFF bnez t4, return_a_mul # if (mant_a) return a (NaN) # a is infinity, check if b is zero or a2, t3, t5 beqz a2, return_nan_mul # 0 * Inf = NaN # Return signed infinity: (result_sign << 15) | 0x7F80 slli a0, s0, 15 li t6, 0x7F80 or a0, a0, t6 j restore_and_return_mul check_exp_b_mul: # Check if exp_b == 0xFF bne t3, t6, check_zero_mul # exp_b == 0xFF bnez t5, return_b_mul # if (mant_b) return b (NaN) # b is infinity, check if a is zero or a2, t2, t4 beqz a2, return_nan_mul # Inf * 0 = NaN # Return signed infinity: (result_sign << 15) | 0x7F80 slli a0, s0, 15 li t6, 0x7F80 or a0, a0, t6 j restore_and_return_mul check_zero_mul: # Check if a is zero (!exp_a && !mant_a) or a2, t2, t4 beqz a2, return_signed_zero_mul # Check if b is zero (!exp_b && !mant_b) or a2, t3, t5 beqz a2, return_signed_zero_mul # Initialize exp_adjust li s1, 0 # s1 = exp_adjust # Handle denormal a bnez t2, normal_a_mul # Normalize denormal a normalize_a_loop: andi a2, t4, 0x80 bnez a2, normalize_a_done slli t4, t4, 1 addi s1, s1, -1 j normalize_a_loop normalize_a_done: li t2, 1 # exp_a = 1 j handle_b_mul normal_a_mul: ori t4, t4, 0x80 # mant_a |= 0x80 handle_b_mul: # Handle denormal b bnez t3, normal_b_mul # Normalize denormal b normalize_b_loop: andi a2, t5, 0x80 bnez a2, normalize_b_done slli t5, t5, 1 addi s1, s1, -1 j normalize_b_loop normalize_b_done: li t3, 1 # exp_b = 1 j multiply_mantissas normal_b_mul: ori t5, t5, 0x80 # mant_b |= 0x80 multiply_mantissas: # result_mant = mant_a * mant_b li s2, 0 # s2 = result_mant li a2, 0 # a2 = counter mul_loop: # Check if counter >= 8 li a3, 8 bge a2, a3, mul_done # Check if bit i of mant_b is set li a3, 1 sll a3, a3, a2 # a3 = 1 << i and a4, t5, a3 beqz a4, mul_skip # Add (mant_a << i) to result sll a5, t4, a2 add s2, s2, a5 mul_skip: addi a2, a2, 1 j mul_loop mul_done: # Calculate result exponent add a2, t2, t3 li a3, BF16_EXP_BIAS sub a2, a2, a3 add a2, a2, s1 # a2 = result_exp # Normalize result mantissa # Check if result_mant & 0x8000 lui a3, 0x8 # a3 = 0x8000 and a4, s2, a3 beqz a4, shift_7_bits # Shift right by 8 bits srli s2, s2, 8 andi s2, s2, 0x7F addi a2, a2, 1 # result_exp++ j check_overflow_mul shift_7_bits: # Shift right by 7 bits srli s2, s2, 7 andi s2, s2, 0x7F check_overflow_mul: # Check if result_exp >= 0xFF li a3, 0xFF blt a2, a3, check_underflow_mul # Overflow - return signed infinity slli a0, s0, 15 li a3, 0x7F80 or a0, a0, a3 j restore_and_return_mul check_underflow_mul: # Check if result_exp <= 0 li a3, 0 bgt a2, a3, construct_result_mul # Check if result_exp < -6 li a3, -6 blt a2, a3, return_signed_zero_mul # Denormalize li a3, 1 sub a3, a3, a2 # 1 - result_exp srl s2, s2, a3 # result_mant >>= (1 - result_exp) li a2, 0 # result_exp = 0 construct_result_mul: slli a0, s0, 15 # sign << 15 andi a3, a2, 0xFF slli a3, a3, 7 # (exp & 0xFF) << 7 or a0, a0, a3 andi s2, s2, 0x7F # mant & 0x7F or a0, a0, s2 j restore_and_return_mul return_nan_mul: li t6, BF16_NAN mv a0, t6 j restore_and_return_mul return_signed_zero_mul: slli a0, s0, 15 j restore_and_return_mul return_a_mul: j restore_and_return_mul return_b_mul: mv a0, a1 j restore_and_return_mul restore_and_return_mul: lw s0, 0(sp) lw s1, 4(sp) lw s2, 8(sp) addi sp, sp, 16 ret ``` ::: #### test data | Test | Operation | Input a | Input b | Expected Result | Result Value | Register | | ---- | --------- | ------- | ------- | --------------- | ------------ | -------- | | 1 | MUL | 3.0 | 2.0 | 6.0 | 0x40C0 | s0 | | 2 | MUL | 5.0 | NaN | NaN | 0x7FC0 | s1 | | 3 | MUL | 0.0 | 7.0 | 0.0 | 0x0000 | s2 | | 4 | MUL | +Inf | 2.0 | +Inf | 0x7F80 | s3 | | 5 | MUL | -4.0 | 3.0 | -12.0 | 0xC140 | s4 | #### register value ![mulreg](https://hackmd.io/_uploads/By2djlFplx.png) #### performance ![mulper](https://hackmd.io/_uploads/BkDFigtagl.png) check if exp_a == 0xFF ,and processor doesn't predict,so it preload two instruction following bne x7 x31 36 <chseck_exp_b> ![before NAN](https://hackmd.io/_uploads/HyMPeYqagg.png) branch success so the processor flush previous two instructions and load new instruction into IF stage ![branch predict fail](https://hackmd.io/_uploads/SknwxF9ael.png) ### bf16_div #### assembly code :::spoiler ```s .equ BF16_SIGN_MASK, 0x8000 .equ BF16_EXP_MASK, 0x7F80 .equ BF16_MANT_MASK, 0x007F .equ BF16_EXP_BIAS, 127 .equ BF16_NAN, 0x7FC0 .equ BF16_ZERO, 0x0000 .data test1_a: .half 0x4100 # 8.0 test1_b: .half 0x4000 # 2.0 test2_a: .half 0x4120 # 10.0 test2_b: .half 0x7FC0 # NaN test3_a: .half 0x0000 # 0.0 test3_b: .half 0x40A0 # 5.0 test4_a: .half 0x7F80 # +Inf test4_b: .half 0x4000 # 2.0 test5_a: .half 0x40C0 # 6.0 test5_b: .half 0x0000 # 0.0 finish_msg: .string "test finish\n" .text .globl _start _start: # Test Case 1: 8.0 / 2.0 = 4.0 # Expected: 0x4080 la t0, test1_a lh a0, 0(t0) la t0, test1_b lh a1, 0(t0) jal ra, bf16_div mv s0, a0 # Test Case 2: 10.0 / NaN = NaN # Expected: 0x7FC0 la t0, test2_a lh a0, 0(t0) la t0, test2_b lh a1, 0(t0) jal ra, bf16_div mv s1, a0 # Test Case 3: 0.0 / 5.0 = 0.0 # Expected: 0x0000 la t0, test3_a lh a0, 0(t0) la t0, test3_b lh a1, 0(t0) jal ra, bf16_div mv s2, a0 # Test Case 4: +Inf / 2.0 = +Inf # Expected: 0x7F80 la t0, test4_a lh a0, 0(t0) la t0, test4_b lh a1, 0(t0) jal ra, bf16_div mv s3, a0 # Test Case 5: 6.0 / 0.0 = +Inf # Expected: 0x7F80 la t0, test5_a lh a0, 0(t0) la t0, test5_b lh a1, 0(t0) jal ra, bf16_div mv s4, a0 # Summary of Results: # s0 = 0x4080 | 8.0 / 2.0 = 4.0 # s1 = 0x7FC0 | 10.0 / NaN = NaN # s2 = 0x0000 | 0.0 / 5.0 = 0.0 # s3 = 0x7F80 | +Inf / 2.0 = +Inf # s4 = 0x7F80 | 6.0 / 0.0 = +Inf la a0, finish_msg li a7, 4 # syscall 4: print string ecall # Exit program li a7, 10 # syscall 10: exit ecall #============================================================================= # bf16_div: BF16 division # Input: a0 = bf16_t a, a1 = bf16_t b # Output: a0 = bf16_t result #============================================================================= .globl bf16_div bf16_div: # Save registers addi sp, sp, -20 sw s0, 0(sp) sw s1, 4(sp) sw s2, 8(sp) sw s3, 12(sp) sw s4, 16(sp) # Extract sign_a (bit 15) srli t0, a0, 15 andi t0, t0, 1 # t0 = sign_a srli t1, a1, 15 andi t1, t1, 1 # t1 = sign_b srli t2, a0, 7 andi t2, t2, 0xFF # t2 = exp_a srli t3, a1, 7 andi t3, t3, 0xFF # t3 = exp_b andi t4, a0, 0x7F # t4 = mant_a andi t5, a1, 0x7F # t5 = mant_b xor s0, t0, t1 # s0 = result_sign # Check if exp_b == 0xFF (b is NaN or Inf) li t6, 0xFF bne t3, t6, check_b_zero_div # exp_b == 0xFF bnez t5, return_b_div # if (mant_b) => b = NaN # b is infinity # 檢查a是否為特殊數 若不是則回傳sign zero li t6, 0xFF bne t2, t6, return_signed_zero_div # a,b皆為inf => Inf / Inf = NaN bnez t4, return_a_div # mant_a != 0 => a = NaN =>直接回傳a li t6, BF16_NAN # mant_a = 0 => inf/inf => NaN mv a0, t6 j restore_and_return_div check_b_zero_div: # Check if b is zero (!exp_b && !mant_b) or a2, t3, t5 bnez a2, check_a_inf_div # b is zero # Check if a is also zero or a2, t2, t4 beqz a2, return_nan_div # 0 / 0 = NaN # a / 0 = signed Inf: (result_sign << 15) | 0x7F80 slli a0, s0, 15 li t6, 0x7F80 or a0, a0, t6 j restore_and_return_div check_a_inf_div: # Check if exp_a == 0xFF (a is NaN or Inf) li t6, 0xFF bne t2, t6, check_a_zero_div # exp_a == 0xFF bnez t4, return_a_div # if (mant_a) return a (NaN) # a is infinity, return signed infinity: (result_sign << 15) | 0x7F80 slli a0, s0, 15 li t6, 0x7F80 or a0, a0, t6 j restore_and_return_div check_a_zero_div: # Check if a is zero (!exp_a && !mant_a) or a2, t2, t4 beqz a2, return_signed_zero_div # Add implicit bit for normalized numbers bnez t2, add_implicit_a_div j skip_implicit_a_div add_implicit_a_div: ori t4, t4, 0x80 # mant_a |= 0x80 skip_implicit_a_div: bnez t3, add_implicit_b_div j perform_division add_implicit_b_div: ori t5, t5, 0x80 # mant_b |= 0x80 perform_division: # Long division algorithm # dividend = mant_a << 15 slli s1, t4, 15 # s1 = dividend mv s2, t5 # s2 = divisor li s3, 0 # s3 = quotient # Division loop (16 iterations) li s4, 0 # s4 = loop counter div_loop: li a2, 16 bge s4, a2, div_done # quotient <<= 1 slli s3, s3, 1 # Calculate shift amount: 15 - i li a2, 15 sub a3, a2, s4 # Calculate (divisor << shift_amount) sll a4, s2, a3 # Check if dividend >= (divisor << shift_amount) bltu s1, a4, div_skip # dividend -= (divisor << shift_amount) sub s1, s1, a4 # quotient |= 1 ori s3, s3, 1 div_skip: addi s4, s4, 1 j div_loop div_done: # s3 now contains quotient # Calculate result_exp = exp_a - exp_b + BIAS sub a2, t2, t3 # exp_a - exp_b li a3, BF16_EXP_BIAS add a2, a2, a3 # + BIAS # Adjust for denormal a bnez t2, check_denormal_b_div addi a2, a2, -1 # result_exp-- check_denormal_b_div: # Adjust for denormal b bnez t3, normalize_quotient addi a2, a2, 1 # result_exp++ normalize_quotient: # Check if quotient & 0x8000 lui a3, 0x8 # a3 = 0x8000 and a4, s3, a3 beqz a4, normalize_quotient_left # Quotient is already normalized, shift right by 8 srli s3, s3, 8 j check_overflow_div normalize_quotient_left: # Normalize left normalize_left_loop: # Check if quotient & 0x8000 lui a3, 0x8 and a4, s3, a3 bnez a4, normalize_left_done # Check if result_exp > 1 li a3, 1 ble a2, a3, normalize_left_done # quotient <<= 1 slli s3, s3, 1 # result_exp-- addi a2, a2, -1 j normalize_left_loop normalize_left_done: # Shift right by 8 to get 7-bit mantissa srli s3, s3, 8 check_overflow_div: # Extract mantissa (7 bits) andi s3, s3, 0x7F # Check if result_exp >= 0xFF li a3, 0xFF blt a2, a3, check_underflow_div # Overflow - return signed infinity: (result_sign << 15) | 0x7F80 slli a0, s0, 15 li a3, 0x7F80 or a0, a0, a3 j restore_and_return_div check_underflow_div: # Check if result_exp <= 0 li a3, 0 bgt a2, a3, construct_result_div # Underflow - return signed zero j return_signed_zero_div construct_result_div: # Construct result slli a0, s0, 15 # sign << 15 andi a3, a2, 0xFF slli a3, a3, 7 # (exp & 0xFF) << 7 or a0, a0, a3 andi s3, s3, 0x7F # mant & 0x7F or a0, a0, s3 j restore_and_return_div return_nan_div: li t6, BF16_NAN mv a0, t6 j restore_and_return_div return_signed_zero_div: slli a0, s0, 15 j restore_and_return_div return_a_div: # a0 already contains a j restore_and_return_div return_b_div: mv a0, a1 j restore_and_return_div restore_and_return_div: # Restore registers lw s0, 0(sp) lw s1, 4(sp) lw s2, 8(sp) lw s3, 12(sp) lw s4, 16(sp) addi sp, sp, 20 ret ``` ::: #### test data | Test | Operation | Input a | Input b | Expected Result | Result Value | Register | | ---- | --------- | ------- | ------- | --------------- | ------------ | -------- | | 1 | DIV | 8.0 | 2.0 | 4.0 | 0x4080 | s0 | | 2 | DIV | 10.0 | NaN | NaN | 0x7FC0 | s1 | | 3 | DIV | 0.0 | 5.0 | 0.0 | 0x0000 | s2 | | 4 | DIV | +Inf | 2.0 | +Inf | 0x7F80 | s3 | | 5 | DIV | 6.0 | 0.0 | +Inf | 0x7F80 | s4 | #### register value ![divreg](https://hackmd.io/_uploads/H1WT4xFTxe.png) #### performance ![divper](https://hackmd.io/_uploads/SJ80VxYpxx.png) ### bf16_sqrt #### assembly code :::spoiler ```.s # bf16_sqrt: BF16 square root # Input: a0 = bf16_t a # Output: a0 = bf16_t result .equ BF16_SIGN_MASK, 0x8000 .equ BF16_EXP_MASK, 0x7F80 .equ BF16_MANT_MASK, 0x007F .equ BF16_EXP_BIAS, 127 .equ BF16_NAN, 0x7FC0 .equ BF16_ZERO, 0x0000 .data # Test data in BF16 format # Test Case 1: sqrt(16.0) test1_a: .half 0x4180 # 16.0 # Test Case 2: sqrt(0.0) test2_a: .half 0x0000 # 0.0 # Test Case 3: sqrt(NaN) test3_a: .half 0x7FC0 # NaN # Test Case 4: sqrt(+Inf) test4_a: .half 0x7F80 # +Inf # Test Case 5: sqrt(-1.0) test5_a: .half 0xBF80 # -1.0 finish_msg: .string "test finish\n" .text .globl _start _start: # Test Case 1: sqrt(16.0) = 4.0 # Expected: 0x4080 la t0, test1_a lh a0, 0(t0) jal ra, bf16_sqrt mv s0, a0 # s0 = result # Test Case 2: sqrt(0.0) = 0.0 # Expected: 0x0000 la t0, test2_a lh a0, 0(t0) jal ra, bf16_sqrt mv s1, a0 # s1 = result # Test Case 3: sqrt(NaN) = NaN # Expected: 0x7FC0 la t0, test3_a lh a0, 0(t0) jal ra, bf16_sqrt mv s2, a0 # s2 = result # Test Case 4: sqrt(+Inf) = +Inf # Expected: 0x7F80 la t0, test4_a lh a0, 0(t0) jal ra, bf16_sqrt mv s3, a0 # s3 = result # Test Case 5: sqrt(-1.0) = NaN # Expected: 0x7FC0 la t0, test5_a lh a0, 0(t0) jal ra, bf16_sqrt mv s4, a0 # s4 = result # Summary of Results: # s0 = 0x4080 | sqrt(16.0) = 4.0 # s1 = 0x0000 | sqrt(0.0) = 0.0 # s2 = 0x7FC0 | sqrt(NaN) = NaN # s3 = 0x7F80 | sqrt(+Inf) = +Inf # s4 = 0x7FC0 | sqrt(-1.0) = NaN # Infinite loop for inspection la a0, finish_msg li a7, 4 # syscall 4: print string ecall # Exit program li a7, 10 # syscall 10: exit ecall #============================================================================= # bf16_sqrt: BF16 square root # Input: a0 = bf16_t a # Output: a0 = bf16_t result #============================================================================= .globl bf16_sqrt bf16_sqrt: # Save registers addi sp, sp, -24 sw s0, 0(sp) sw s1, 4(sp) sw s2, 8(sp) sw s3, 12(sp) sw s4, 16(sp) sw ra, 20(sp) # Extract sign (bit 15) srli t0, a0, 15 andi t0, t0, 1 # t0 = sign # Extract exponent (bits 7-14) srli t1, a0, 7 andi t1, t1, 0xFF # t1 = exp # Extract mantissa (bits 0-6) andi t2, a0, 0x7F # t2 = mant # Check if exp == 0xFF (NaN or Inf) li t3, 0xFF bne t1, t3, check_zero_sqrt # exp == 0xFF bnez t2, return_a_sqrt # if (mant) return a (NaN propagation) # a is infinity bnez t0, return_nan_sqrt # if (sign) sqrt(-Inf) = NaN j return_a_sqrt # sqrt(+Inf) = +Inf check_zero_sqrt: # Check if a is zero (!exp && !mant) or t3, t1, t2 beqz t3, return_zero_sqrt # Check if negative number bnez t0, return_nan_sqrt # sqrt(negative) = NaN # Check if denormal (exp == 0) bnez t1, normalized_sqrt # Denormal number - flush to zero j return_zero_sqrt normalized_sqrt: # Calculate new exponent: new_exp = (old_exp - bias) / 2 + bias # e = exp - BF16_EXP_BIAS li t3, BF16_EXP_BIAS sub s0, t1, t3 # s0 = e = exp - 127 # Get full mantissa with implicit 1 ori s1, t2, 0x80 # Check if exponent is odd andi t3, s0, 1 beqz t3, exp_even_sqrt # Exponent is odd: m <<= 1, adjust exponent slli s1, s1, 1 addi s0, s0, -1 # e = e - 1 (make it even) exp_even_sqrt: # Now s0 is even, calculate new_exp = (e / 2) + bias srai s2, s0, 1 # s2 = e / 2 (arithmetic shift for signed) li t3, BF16_EXP_BIAS add s2, s2, t3 # s2 = new_exp = (e / 2) + 127 # Binary search for integer square root # We want result where result^2 ≈ m * 128 li s3, 90 # low = 90 (roughly sqrt(128)) li s4, 256 # high = 256 (roughly sqrt(512)) li a2, 128 # a2 = result (default) binary_search_sqrt: bgtu s3, s4, search_done_sqrt # mid = (low + high) / 2 add a3, s3, s4 srli a3, a3, 1 # a3 = mid # Calculate sq = (mid * mid) / 128 mv a4, a3 mv a5, a3 # Multiply mid * mid using shift-and-add li a6, 0 # a6 = product accumulator li a7, 0 # a7 = counter mul_loop_sqrt: li t3, 8 bge a7, t3, mul_done_sqrt li t3, 1 sll t3, t3, a7 and t4, a5, t3 beqz t4, mul_skip_sqrt sll t5, a4, a7 add a6, a6, t5 mul_skip_sqrt: addi a7, a7, 1 j mul_loop_sqrt mul_done_sqrt: # a6 = mid * mid, now divide by 128 srli a6, a6, 7 # sq = (mid * mid) / 128 # Compare sq with m (s1) bleu a6, s1, update_result_sqrt # sq > m, search lower half addi s4, a3, -1 # high = mid - 1 j binary_search_sqrt update_result_sqrt: # sq <= m, this could be our answer mv a2, a3 # result = mid addi s3, a3, 1 # low = mid + 1 j binary_search_sqrt search_done_sqrt: # a2 now contains sqrt(m) scaled appropriately li t3, 256 bltu a2, t3, check_underflow_sqrt # result >= 256, shift right and increment exponent srli a2, a2, 1 addi s2, s2, 1 j check_overflow_sqrt check_underflow_sqrt: li t3, 128 bgeu a2, t3, check_overflow_sqrt # result < 128, need to normalize normalize_loop_sqrt: li t3, 128 bgeu a2, t3, check_overflow_sqrt li t3, 1 ble s2, t3, check_overflow_sqrt slli a2, a2, 1 addi s2, s2, -1 j normalize_loop_sqrt check_overflow_sqrt: # Check if new_exp >= 0xFF li t3, 0xFF blt s2, t3, check_underflow_exp_sqrt # Overflow - return +Inf li t3, 0x7F80 mv a0, t3 j restore_and_return_sqrt check_underflow_exp_sqrt: # Check if new_exp <= 0 li t3, 0 bgt s2, t3, construct_result_sqrt # Underflow - return 0 j return_zero_sqrt construct_result_sqrt: # Extract 7-bit mantissa (remove implicit 1) andi a2, a2, 0x7F # Construct result: (0 << 15) | ((new_exp & 0xFF) << 7) | (mant & 0x7F) # Sign is always 0 for sqrt of positive number andi t3, s2, 0xFF slli t3, t3, 7 # (exp & 0xFF) << 7 or a0, t3, a2 # exp | mant j restore_and_return_sqrt return_nan_sqrt: li t3, BF16_NAN mv a0, t3 j restore_and_return_sqrt return_zero_sqrt: li t3, BF16_ZERO mv a0, t3 j restore_and_return_sqrt return_a_sqrt: # a0 already contains a j restore_and_return_sqrt restore_and_return_sqrt: lw ra, 20(sp) lw s4, 16(sp) lw s3, 12(sp) lw s2, 8(sp) lw s1, 4(sp) lw s0, 0(sp) addi sp, sp, 24 ret ``` ::: #### test data | Test | Operation | Input a | Expected Result | Result Value | Register | | ---- | --------- | ------- | --------------- | ------------ | -------- | | 1 | SQRT | 16.0 | 4.0 | 0x4080 | s0 | | 2 | SQRT | 0.0 | 0.0 | 0x0000 | s1 | | 3 | SQRT | NaN | NaN | 0x7FC0 | s2 | | 4 | SQRT | +Inf | +Inf | 0x7F80 | s3 | | 5 | SQRT | -1.0 | NaN | 0x7FC0 | s4 | #### register value ![sqrtreg](https://hackmd.io/_uploads/r1jLBgK6gl.png) #### performance ![sqrtper](https://hackmd.io/_uploads/SJquHgKpgg.png) ## [leetcode : power of 2](https://leetcode.com/problems/power-of-two/description/) ### C code: ```C bool isPowerOfTwo(int n) { if (n <= 0) return false; int log_val = log2(n); return n == (1 << log_val); } ``` ### test data | data | expect return value | |:----:|:-------------------:| | 1 | 1 | | 16 | 1 | | 3 | 0 | | -1 | 0 | | 0 | 0 | | 2048 | 1 | ### Key Insight Powers of 2 have **exactly one bit set**: - 8 = `0b1000` (one bit at position 3) ✓ - 7 = `0b0111` (three bits set) ✗ 1.**Find MSB position** using CLZ 2.Reconstruct number with only that bit 3.Compare with original ### assembly code (with branch) ::: info [assembly code of power_of_two (with branch)](https://github.com/rainbow0212/ca2025-homework1/blob/main/leetcode_ricsv/power_of_two_riscv.S) ::: ### result ![leetcode_risv_result](https://hackmd.io/_uploads/BJ97nGtagg.png) ### performance: ![螢幕擷取畫面 2025-10-12 201714](https://hackmd.io/_uploads/rJgAw2GYTex.png) ### assembly code (branchless) :::info [assembly code of power_of_two(branchless)](https://github.com/rainbow0212/ca2025-homework1/blob/main/leetcode_ricsv/power_of_two_riscv_branchless.S) ::: ### performance (branchless) ![螢幕擷取畫面 2025-10-13 152442](https://hackmd.io/_uploads/HyaWp7qaxl.png) ### how branchless clz work 1.**Right-Fill** Takes any number and propagates its MSB rightward to create a continuous block of 1s. Logic: By repeatedly OR-ing the number with itself shifted right by powers of 2 (1, 2, 4, 8, 16), we ensure every bit position after the MSB becomes 1. Result: Transforms the problem from "find first 1-bit" to "count total 1-bits" Example: 0001 0110 → 0001 1111 (5 ones) 2.**Population Count via Parallel Reduction** Counts the total number of 1-bits using divide-and-conquer with bitwise magic: Reduce 32→16: Count bits in pairs (2-bit groups) Reduce 16→8: Combine into 4-bit groups Reduce 8→4: Combine into 8-bit groups Reduce 4→2→1: Sum all groups into final count Key insight: Uses special bit masks (0x55555555, 0x33333333, 0x0F0F0F0F) that allow processing multiple groups simultaneously in a single register. 3.**Final Calculation** **CLZ = 32 - popcount** Since we filled all bits after the MSB, the population count tells us how many bits are NOT leading zeros. ## TODO * Add the GCC-translated RISC-V code and compare it with my assembly code. * Combine all the function in problem C into one program * Improve the way problem C test data is handled so that the results of different test cases calling different functions can be displayed on the Ripes console. ## Reference :::warning This assignment was completed with assistance from [Claude](https://claude.ai/new) for grammar checking, translation and initial brainstorming. All final analysis and conclusions are my own. ::: * [Assignment 1: RISC-V Assembly and Instruction Pipeline](https://hackmd.io/@sysprog/2025-arch-homework1) * [Quiz1 of Computer Architecture (2025 Fall)](https://hackmd.io/@sysprog/arch2025-quiz1-sol) * [Ripes](https://github.com/mortbopet/Ripes)