Assignment1: RISC-V Assembly and Instruction Pipeline

# Assignment1: RISC-V Assembly and Instruction Pipeline contributed by [< rara0857 >](https://github.com/rara0857/ca2025-quizzes) ## Problem B ### Descripition Assume uf8 implements a logarithmic 8-bit codec that maps 20-bit unsigned integers $([0,1{,}015{,}792])$ to 8-bit symbols via logarithmic quantization, delivering 2.5:1 compression and ≤6.25% relative error. **UF8_Decode:** $$ D(b) = m \cdot 2^e + (2^e - 1) \cdot 16 $$ **UF8_Encode** $$ E(v) = \begin{cases} v, & \text{if } v < 16 \\ 16e + \lfloor (v - \text{offset}(e)) / 2^e \rfloor, & \text{otherwise} \end{cases} $$ --- ### Implementation During encoding, we need to determine which power-of-two interval the input value belongs to, i.e., compute an integer $\log_2$. Since RISC-V has no native instruction for this, we use CLZ (Count Leading Zeros) to locate the highest set bit, and obtain the exponent by $e = 31 - \text{CLZ}(x)$. To find CLZ, we can repeatedly shifting the value right (16 → 8 → 4 → 2 → 1) until the highest ‘1’ bit is detected. **C Code** ```c= typedef uint8_t uf8; static inline unsigned clz(uint32_t x) { int n = 32, c = 16; do { uint32_t y = x >> c; if (y) { n -= c; x = y; } c >>= 1; } while (c); return n - x; } ``` **Assembly Code** ```s= clz: li t0, 32 # n li t1, 16 # c clz_loop: srl t2, a0, t1 # uint32_t y = x >> c beqz t2, clz_end # if (y) sub t0, t0, t1 # n -= c mv a0, t2 # x = y clz_end: srli t1, t1, 1 bnez t1, clz_loop sub a0, t0, a0 ret ``` ### **CLZ Optimization** In this version's implementation, the `clz` routine is optimized by **removing all looping and branching**. Instead of relying on a conditional loop, it performs a fixed sequence of right shifts (16→8→4→2→1) using `snez` masks. This approach eliminates loop overhead and unpredictable branch delays, achieving **constant-time execution** with slightly larger code size. ```s= clz: li t0, 32 # n = 32 # 16-bit srli t2, a0, 16 # y = x >> 16 snez t3, t2 # y != 0 slli t1, t3, 4 # t1 = 16 or 0 sub t0, t0, t1 # n -= t1 srl a0, a0, t1 # x >>= t1 # 8-bit srli t2, a0, 8 snez t3, t2 slli t1, t3, 3 # t1 = 8 or 0 sub t0, t0, t1 srl a0, a0, t1 # 4-bit srli t2, a0, 4 snez t3, t2 slli t1, t3, 2 # t1 = 4 or 0 sub t0, t0, t1 srl a0, a0, t1 # 2-bit srli t2, a0, 2 snez t3, t2 slli t1, t3, 1 # t1 = 2 or 0 sub t0, t0, t1 srl a0, a0, t1 # 1-bit srli t2, a0, 1 snez t3, t2 sub t0, t0, t3 # n -= 1 srl a0, a0, t3 sub a0, t0, a0 # return n - x ret ``` ### C Code :::spoiler C code ```c= #include <stdbool.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> typedef uint8_t uf8; static inline unsigned clz(uint32_t x) { int n = 32, c = 16; do { uint32_t y = x >> c; if (y) { n -= c; x = y; } c >>= 1; } while (c); return n - x; } /* Decode uf8 to uint32_t */ uint32_t uf8_decode(uf8 fl) { uint32_t mantissa = fl & 0x0f; uint8_t exponent = fl >> 4; uint32_t offset = (0x7FFF >> (15 - exponent)) << 4; return (mantissa << exponent) + offset; } /* Encode uint32_t to uf8 */ uf8 uf8_encode(uint32_t value) { /* Use CLZ for fast exponent calculation */ if (value < 16) return value; /* Find appropriate exponent using CLZ hint */ int lz = clz(value); int msb = 31 - lz; /* Start from a good initial guess */ uint8_t exponent = 0; uint32_t overflow = 0; if (msb >= 5) { /* Estimate exponent - the formula is empirical */ exponent = msb - 4; if (exponent > 15) exponent = 15; /* Calculate overflow for estimated exponent */ for (uint8_t e = 0; e < exponent; e++) overflow = (overflow << 1) + 16; /* Adjust if estimate was off */ while (exponent > 0 && value < overflow) { overflow = (overflow - 16) >> 1; exponent--; } } /* Find exact exponent */ while (exponent < 15) { uint32_t next_overflow = (overflow << 1) + 16; if (value < next_overflow) break; overflow = next_overflow; exponent++; } uint8_t mantissa = (value - overflow) >> exponent; return (exponent << 4) | mantissa; } /* Test encode/decode round-trip */ static bool test(void) { int test_cases[] = {0, 50, 100, 150, 200, 255}; int num=6; for (int i = 0; i < 6; i++) { uint8_t fl = test_cases[i]; int32_t value = uf8_decode(fl); uint8_t fl2 = uf8_encode(value); printf("test case: %d, decode -> %d, encode -> %d", fl, value, fl2); if (fl != fl2) { printf(", Fail\n"); } else { printf(", Pass\n"); } } } int main(void) { test(); } ``` ::: ### Assembly Code :::spoiler Assembly Code ```s= .data case: .word 0, 50, 100, 150, 200, 255 count: .word 6 msg1: .asciz "test case: " msg2: .asciz ", decode -> " msg3: .asciz ", encode -> " msg4: .asciz ", Pass\n" msg5: .asciz ", Fail\n" .text main: la s0, case lw s1, count li s2, 0 test_loop: bge s2, s1, exit lw s3, 0(s0) la a0, msg1 li a7, 4 ecall mv a0, s3 li a7, 1 ecall la a0, msg2 li a7, 4 ecall mv a0, s3 jal ra, uf8_decode mv s4, a0 mv a0, s4 li a7, 1 ecall la a0, msg3 li a7, 4 ecall mv a0, s4 jal ra, uf8_encode li a7, 1 ecall bne a0, s3, failed la a0, msg4 li a7, 4 ecall addi s0, s0, 4 addi s2, s2, 1 j test_loop failed: la a0, msg5 li a7, 4 ecall addi s0, s0, 4 addi s2, s2, 1 j test_loop exit: li a7, 10 ecall ### Functions clz: li t0, 32 # n = 32 # 16-bit srli t2, a0, 16 # y = x >> 16 snez t3, t2 # y != 0 slli t1, t3, 4 # t1 = 16 or 0 sub t0, t0, t1 # n -= t1 srl a0, a0, t1 # x >>= t1 # 8-bit srli t2, a0, 8 snez t3, t2 slli t1, t3, 3 # t1 = 8 or 0 sub t0, t0, t1 srl a0, a0, t1 # 4-bit srli t2, a0, 4 snez t3, t2 slli t1, t3, 2 # t1 = 4 or 0 sub t0, t0, t1 srl a0, a0, t1 # 2-bit srli t2, a0, 2 snez t3, t2 slli t1, t3, 1 # t1 = 2 or 0 sub t0, t0, t1 srl a0, a0, t1 # 1-bit srli t2, a0, 1 snez t3, t2 sub t0, t0, t3 # n -= 1 srl a0, a0, t3 sub a0, t0, a0 # return n - x ret uf8_decode: andi t0, a0, 0x0f srli t1, a0, 4 li t2, 15 sub t2, t2, t1 li t3, 0x7FFF srl t3, t3, t2 slli t3, t3, 4 sll t0, t0, t1 add a0, t0, t3 ret uf8_encode: addi sp, sp, -8 sw ra, 4(sp) sw s0, 0(sp) mv s0, a0 # if (value < 16) return value li t0, 16 blt s0, t0, encode_ret # lz = clz(value) # msb = 31 - lz jal ra, clz li t0, 31 sub t0, t0, a0 # t0 = msb li t1, 0 # exponent li t2, 0 # overflow # if (msb >= 5) li t3, 5 blt t0, t3, find_exact_exponent # exponent = msb - 4 # if (exponent > 15) exponent = 15 addi t1, t0, -4 li t3, 15 li t3, 15 ble t1, t3, done1 mv t1, t3 done1: li t3, 0 # e = 0 # for (uint8_t e = 0; e < exponent; e++) # overflow = (overflow << 1) + 16 for_loop: bge t3, t1, adjust slli t2, t2, 1 addi t2, t2, 16 addi t3, t3, 1 j for_loop # while (exponent > 0 && value < overflow) adjust: blez t1, find_exact_exponent bge s0, t2, find_exact_exponent addi t2, t2, -16 srli t2, t2, 1 addi t1, t1, -1 j adjust find_exact_exponent: li t3, 15 bge t1, t3, done2 # while (exponent < 15) slli t4, t2, 1 addi t4, t4, 16 blt s0, t4, done2 mv t2, t4 addi t1, t1, 1 j find_exact_exponent # mantissa = (value - overflow) >> exponent # return (exponent << 4) | mantissa; done2: sub t3, s0, t2 srl t3, t3, t1 slli t1, t1, 4 or a0, t1, t3 j end encode_ret: mv a0, s0 end: lw s0, 0(sp) lw ra, 4(sp) addi sp, sp, 8 ret ``` ::: ### Analysis Test with 6 test case `0,50,100,150,200,255` **Console output** ![image](https://hackmd.io/_uploads/rklF7WfYpll.png =40%x) **Comparison** | | C Code | ASM | CLZ opt. | |:----------------:|:---------:|:--------:|:--------:| | **Cycles** | 27466 | 1216 | 1141 | | **Instrs. Ret.** | 21055 | 836 | 827 | | **CPI** | 1.3 | 1.45 | 1.38 | | **IPC** | 0.767 | 0.688 | 0.725 | | **Clock rate** | 23.40 KHz | 1.27 KHz | 2.54 KHz | ## Problem C ### Description The bfloat16 format (16-bit, from Google Brain) preserves float32’s dynamic range by keeping the same 8-bit exponent, but reduces precision to a 7-bit significand (vs. 23) **Bit Layout** ┌─────────┬──────────────┬──────────────┐ │Sign (1) │ Exponent (8) │ Mantissa (7) │ └─────────┴──────────────┴──────────────┘ 15 14 7 6 0 S: Sign bit (0 = positive, 1 = negative) E: Exponent bits (8 bits, bias = 127) M: Mantissa/fraction bits (7 bits) **Value Representation** The value 𝑣 of a BFloat16 number is calculated as: $$ v = (-1)^S \times 2^{E-127} \times \left(1 + \frac{M}{128}\right) $$ where: - $S \in \{0, 1\}$ — sign bit - $E \in [1, 254]$ — biased exponent - $M \in [0, 127]$ — mantissa value **Special Cases** * Zero:E = 0, $M = 0 \Rightarrow v = (-1)^S \times 0$ * Infinity:E = $255, M = 0 \Rightarrow v = (-1)^S \times \infty$ * NaN:E = 255, $M \neq 0 \Rightarrow v = \text{NaN}$ * Denormals: Not supported (flush to zero) --- ### Implementation To finish the code, the following components are required: #### Basic Checks - `bf16_isnan`, `bf16_isinf`, `bf16_iszero`: Detect NaN, ±Inf, ±0. #### Format Conversion - `f32_to_bf16(float val)`: Converts 32-bit float → bfloat16 with rounding (RNE) to reduce precision loss. - `bf16_to_f32(bf16_t val)`: Converts bfloat16 → 32-bit float by padding mantissa bits with zeros. #### Arithmetic Operations - `bf16_add(a, b)`: Addition - `bf16_sub(a, b)`: Subtraction - `bf16_mul(a, b)`: Multiplication - `bf16_div(a, b)`: Division - `bf16_sqrt(a)`: Square root via exponent halving and mantissa normalization #### Comparison Functions - `bf16_eq`, `bf16_lt`, `bf16_gt`: Comparison functions (any NaN → false). ### C Code :::spoiler C Code ```c= #include <stdbool.h> #include <stdint.h> #include <string.h> typedef struct { uint16_t bits; } bf16_t; #define BF16_SIGN_MASK 0x8000U #define BF16_EXP_MASK 0x7F80U #define BF16_MANT_MASK 0x007FU #define BF16_EXP_BIAS 127 #define BF16_NAN() ((bf16_t) {.bits = 0x7FC0}) #define BF16_ZERO() ((bf16_t) {.bits = 0x0000}) static inline bool bf16_isnan(bf16_t a) { return ((a.bits & BF16_EXP_MASK) == BF16_EXP_MASK) && (a.bits & BF16_MANT_MASK); } static inline bool bf16_isinf(bf16_t a) { return ((a.bits & BF16_EXP_MASK) == BF16_EXP_MASK) && !(a.bits & BF16_MANT_MASK); } static inline bool bf16_iszero(bf16_t a) { return !(a.bits & 0x7FFF); } static inline bf16_t f32_to_bf16(float val) { uint32_t f32bits; memcpy(&f32bits, &val, sizeof(float)); if (((f32bits >> 23) & 0xFF) == 0xFF) return (bf16_t) {.bits = (f32bits >> 16) & 0xFFFF}; f32bits += ((f32bits >> 16) & 1) + 0x7FFF; return (bf16_t) {.bits = f32bits >> 16}; } static inline float bf16_to_f32(bf16_t val) { uint32_t f32bits = ((uint32_t) val.bits) << 16; float result; memcpy(&result, &f32bits, sizeof(float)); return result; } static inline bf16_t bf16_add(bf16_t a, bf16_t b) { uint16_t sign_a = (a.bits >> 15) & 1; uint16_t sign_b = (b.bits >> 15) & 1; int16_t exp_a = ((a.bits >> 7) & 0xFF); int16_t exp_b = ((b.bits >> 7) & 0xFF); uint16_t mant_a = a.bits & 0x7F; uint16_t mant_b = b.bits & 0x7F; if (exp_a == 0xFF) { if (mant_a) return a; if (exp_b == 0xFF) return (mant_b || sign_a == sign_b) ? b : BF16_NAN(); return a; } if (exp_b == 0xFF) return b; if (!exp_a && !mant_a) return b; if (!exp_b && !mant_b) return a; if (exp_a) mant_a |= 0x80; if (exp_b) mant_b |= 0x80; int16_t exp_diff = exp_a - exp_b; uint16_t result_sign; int16_t result_exp; uint32_t result_mant; if (exp_diff > 0) { result_exp = exp_a; if (exp_diff > 8) return a; mant_b >>= exp_diff; } else if (exp_diff < 0) { result_exp = exp_b; if (exp_diff < -8) return b; mant_a >>= -exp_diff; } else { result_exp = exp_a; } if (sign_a == sign_b) { result_sign = sign_a; result_mant = (uint32_t) mant_a + mant_b; if (result_mant & 0x100) { result_mant >>= 1; if (++result_exp >= 0xFF) return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; } } else { if (mant_a >= mant_b) { result_sign = sign_a; result_mant = mant_a - mant_b; } else { result_sign = sign_b; result_mant = mant_b - mant_a; } if (!result_mant) return BF16_ZERO(); while (!(result_mant & 0x80)) { result_mant <<= 1; if (--result_exp <= 0) return BF16_ZERO(); } } return (bf16_t) { .bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) | (result_mant & 0x7F), }; } static inline bf16_t bf16_sub(bf16_t a, bf16_t b) { b.bits ^= BF16_SIGN_MASK; return bf16_add(a, b); } static inline bf16_t bf16_mul(bf16_t a, bf16_t b) { uint16_t sign_a = (a.bits >> 15) & 1; uint16_t sign_b = (b.bits >> 15) & 1; int16_t exp_a = ((a.bits >> 7) & 0xFF); int16_t exp_b = ((b.bits >> 7) & 0xFF); uint16_t mant_a = a.bits & 0x7F; uint16_t mant_b = b.bits & 0x7F; uint16_t result_sign = sign_a ^ sign_b; if (exp_a == 0xFF) { if (mant_a) return a; if (!exp_b && !mant_b) return BF16_NAN(); return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; } if (exp_b == 0xFF) { if (mant_b) return b; if (!exp_a && !mant_a) return BF16_NAN(); return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; } if ((!exp_a && !mant_a) || (!exp_b && !mant_b)) return (bf16_t) {.bits = result_sign << 15}; int16_t exp_adjust = 0; if (!exp_a) { while (!(mant_a & 0x80)) { mant_a <<= 1; exp_adjust--; } exp_a = 1; } else mant_a |= 0x80; if (!exp_b) { while (!(mant_b & 0x80)) { mant_b <<= 1; exp_adjust--; } exp_b = 1; } else mant_b |= 0x80; uint32_t result_mant = (uint32_t) mant_a * mant_b; int32_t result_exp = (int32_t) exp_a + exp_b - BF16_EXP_BIAS + exp_adjust; if (result_mant & 0x8000) { result_mant = (result_mant >> 8) & 0x7F; result_exp++; } else result_mant = (result_mant >> 7) & 0x7F; if (result_exp >= 0xFF) return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; if (result_exp <= 0) { if (result_exp < -6) return (bf16_t) {.bits = result_sign << 15}; result_mant >>= (1 - result_exp); result_exp = 0; } return (bf16_t) {.bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) | (result_mant & 0x7F)}; } static inline bf16_t bf16_div(bf16_t a, bf16_t b) { uint16_t sign_a = (a.bits >> 15) & 1; uint16_t sign_b = (b.bits >> 15) & 1; int16_t exp_a = ((a.bits >> 7) & 0xFF); int16_t exp_b = ((b.bits >> 7) & 0xFF); uint16_t mant_a = a.bits & 0x7F; uint16_t mant_b = b.bits & 0x7F; uint16_t result_sign = sign_a ^ sign_b; if (exp_b == 0xFF) { if (mant_b) return b; /* Inf/Inf = NaN */ if (exp_a == 0xFF && !mant_a) return BF16_NAN(); return (bf16_t) {.bits = result_sign << 15}; } if (!exp_b && !mant_b) { if (!exp_a && !mant_a) return BF16_NAN(); return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; } if (exp_a == 0xFF) { if (mant_a) return a; return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; } if (!exp_a && !mant_a) return (bf16_t) {.bits = result_sign << 15}; if (exp_a) mant_a |= 0x80; if (exp_b) mant_b |= 0x80; uint32_t dividend = (uint32_t) mant_a << 15; uint32_t divisor = mant_b; uint32_t quotient = 0; for (int i = 0; i < 16; i++) { quotient <<= 1; if (dividend >= (divisor << (15 - i))) { dividend -= (divisor << (15 - i)); quotient |= 1; } } int32_t result_exp = (int32_t) exp_a - exp_b + BF16_EXP_BIAS; if (!exp_a) result_exp--; if (!exp_b) result_exp++; if (quotient & 0x8000) quotient >>= 8; else { while (!(quotient & 0x8000) && result_exp > 1) { quotient <<= 1; result_exp--; } quotient >>= 8; } quotient &= 0x7F; if (result_exp >= 0xFF) return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; if (result_exp <= 0) return (bf16_t) {.bits = result_sign << 15}; return (bf16_t) { .bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) | (quotient & 0x7F), }; } static inline bf16_t bf16_sqrt(bf16_t a) { uint16_t sign = (a.bits >> 15) & 1; int16_t exp = ((a.bits >> 7) & 0xFF); uint16_t mant = a.bits & 0x7F; /* Handle special cases */ if (exp == 0xFF) { if (mant) return a; /* NaN propagation */ if (sign) return BF16_NAN(); /* sqrt(-Inf) = NaN */ return a; /* sqrt(+Inf) = +Inf */ } /* sqrt(0) = 0 (handle both +0 and -0) */ if (!exp && !mant) return BF16_ZERO(); /* sqrt of negative number is NaN */ if (sign) return BF16_NAN(); /* Flush denormals to zero */ if (!exp) return BF16_ZERO(); /* Direct bit manipulation square root algorithm */ /* For sqrt: new_exp = (old_exp - bias) / 2 + bias */ int32_t e = exp - BF16_EXP_BIAS; int32_t new_exp; /* Get full mantissa with implicit 1 */ uint32_t m = 0x80 | mant; /* Range [128, 256) representing [1.0, 2.0) */ /* Adjust for odd exponents: sqrt(2^odd * m) = 2^((odd-1)/2) * sqrt(2*m) */ if (e & 1) { m <<= 1; /* Double mantissa for odd exponent */ new_exp = ((e - 1) >> 1) + BF16_EXP_BIAS; } else { new_exp = (e >> 1) + BF16_EXP_BIAS; } /* Now m is in range [128, 256) or [256, 512) if exponent was odd */ /* Binary search for integer square root */ /* We want result where result^2 = m * 128 (since 128 represents 1.0) */ uint32_t low = 90; /* Min sqrt (roughly sqrt(128)) */ uint32_t high = 256; /* Max sqrt (roughly sqrt(512)) */ uint32_t result = 128; /* Default */ /* Binary search for square root of m */ while (low <= high) { uint32_t mid = (low + high) >> 1; uint32_t sq = (mid * mid) / 128; /* Square and scale */ if (sq <= m) { result = mid; /* This could be our answer */ low = mid + 1; } else { high = mid - 1; } } /* result now contains sqrt(m) * sqrt(128) / sqrt(128) = sqrt(m) */ /* But we need to adjust the scale */ /* Since m is scaled where 128=1.0, result should also be scaled same way */ /* Normalize to ensure result is in [128, 256) */ if (result >= 256) { result >>= 1; new_exp++; } else if (result < 128) { while (result < 128 && new_exp > 1) { result <<= 1; new_exp--; } } /* Extract 7-bit mantissa (remove implicit 1) */ uint16_t new_mant = result & 0x7F; /* Check for overflow/underflow */ if (new_exp >= 0xFF) return (bf16_t) {.bits = 0x7F80}; /* +Inf */ if (new_exp <= 0) return BF16_ZERO(); return (bf16_t) {.bits = ((new_exp & 0xFF) << 7) | new_mant}; } static inline bool bf16_eq(bf16_t a, bf16_t b) { if (bf16_isnan(a) || bf16_isnan(b)) return false; if (bf16_iszero(a) && bf16_iszero(b)) return true; return a.bits == b.bits; } static inline bool bf16_lt(bf16_t a, bf16_t b) { if (bf16_isnan(a) || bf16_isnan(b)) return false; if (bf16_iszero(a) && bf16_iszero(b)) return false; bool sign_a = (a.bits >> 15) & 1, sign_b = (b.bits >> 15) & 1; if (sign_a != sign_b) return sign_a > sign_b; return sign_a ? a.bits > b.bits : a.bits < b.bits; } static inline bool bf16_gt(bf16_t a, bf16_t b) { return bf16_lt(b, a); } ``` ::: ### Assembly Code :::spoiler Assembly Code ```s= .data BF16_SIGN_MASK: .word 0x8000 BF16_EXP_MASK: .word 0x7F80 BF16_MANT_MASK: .word 0x007F BF16_EXP_BIAS: .word 127 BF16_NAN: .word 0x7FC0 BF16_ZERO: .word 0x0000 test: .word 0x3F80 # 1.0 .word 0x4000 # 2.0 .word 0xC000 # -2.0 .word 0x0000 # +0.0 .word 0x4080 # 4.0 .text .globl main main: la t0, test lw a2, 0(t0) # 1.0 lw a3, 4(t0) # 2.0 lw a4, 16(t0) # 4.0 lw a5, 8(t0) # -2.0 # 2 + 3 = 5 li a0, 0x4000 # 2.0 li a1, 0x4040 # 3.0 jal ra, bf16_add jal ra, print_hex # expect 0x40A0 (5.0) # 2 - 3 = -1 li a0, 0x4000 # 2.0 li a1, 0x4040 # 3.0 jal ra, bf16_sub jal ra, print_hex # expect 0xBF80 (-1.0) # 2 * 3 = 6 li a0, 0x4000 # 2.0 li a1, 0x4040 # 3.0 jal ra, bf16_mul jal ra, print_hex # expect 0x40C0 (6.0) # 4 / 2 = 2 li a0, 0x4080 # 4.0 li a1, 0x4000 # 2.0 jal ra, bf16_div jal ra, print_hex # expect 0x4000 (2.0) # sqrt(4) = 2 li a0, 0x4080 # 4.0 jal ra, bf16_sqrt jal ra, print_hex # expect 0x4000 (2.0) # end li a7, 10 ecall print_hex: addi sp, sp, -16 sw ra, 12(sp) sw a0, 8(sp) li a7, 11 li a0, 48 # '0' = 48 ecall li a0, 120 # 'x' = 120 ecall lw a0, 8(sp) li t0, 4 li t1, 12 print_hex_loop: beqz t0, print_hex_done lw t2, 8(sp) srl t2, t2, t1 andi t2, t2, 0xF li t3, 10 blt t2, t3, print_hex_digit addi t2, t2, -10 addi t2, t2, 65 # 'A' = 65 j print_hex_char print_hex_digit: addi t2, t2, 48 # '0' = 48 print_hex_char: mv a0, t2 li a7, 11 ecall addi t0, t0, -1 addi t1, t1, -4 j print_hex_loop print_hex_done: li a7, 11 li a0, 10 # '\n' = 10 ecall lw ra, 12(sp) addi sp, sp, 16 ret bf16_isnan: la t0,BF16_EXP_MASK lw t0,0(t0) la t1,BF16_MANT_MASK lw t1,0(t1) and t2, a0, t0 xor t2, t2, t0 # (a.bits & BF16_EXP_MASK) == BF16_EXP_MASK sltiu t4, t2, 1 # cond1 and t3, a0, t1 # a.bits & BF16_MANT_MASK snez t5, t3 # !cond2 and a0, t4, t5 # check if (cond1 & !cond2) ==1 ret bf16_isinf: la t0,BF16_EXP_MASK lw t0,0(t0) la t1,BF16_MANT_MASK lw t1,0(t1) and t2, a0, t0 xor t2, t2, t0 # (a.bits & BF16_EXP_MASK) == BF16_EXP_MASK sltiu t4, t2, 1 # cond1 and t3, a0, t1 # a.bits & BF16_MANT_MASK sltiu t5, t3, 1 # cond2 and a0, t4, t5 # check if (cond1 & cond2) ==1 ret bf16_iszero: li t0,0x7fff and t1,a0,t0 sltiu a0,t1,1 ret f32_to_bf16: #(f32bits >> 23) & 0xFF srli t0, a0, 23 andi t0, t0, 0xFF li t1, 0xFF bne t0, t1, f32_round # return (bf16_t) {.bits = (f32bits >> 16) & 0xFFFF} srli a0, a0, 16 ret f32_round: # f32bits += ((f32bits >> 16) & 1) + 0x7FFF srli t0, a0, 16 andi t0, t0, 1 li t1, 0x7FFF add t0, t0, t1 add a0, a0, t0 # return (bf16_t) {.bits = f32bits >> 16} srli a0, a0, 16 ret bf16_to_f32: # uint32_t f32bits = ((uint32_t) val.bits) << 16 slli a0, a0, 16 ret bf16_add: srli s1, a0, 15 # sign_a srli s2, a1, 15 # sign_b srli s3, a0, 7 andi s3, s3, 0xFF # exp_a srli s4, a1, 7 andi s4, s4, 0xFF # exp_b andi s5, a0, 0x7F # mant_a andi s6, a1, 0x7F # mant_b li t0, 0xFF beq s3, t0, add_sp # a==NaN/inf beq s4, t0, add_sp # b==NaN/inf or t0, s3, s5 # a==0, return b beqz t0, add_ret_b or t1, s4, s6 # b==0, return a beqz t1, add_return li t0, 0x80 or s5, s5, t0 # mant_a |= 0x80 or s6, s6, t0 # mant_b |= 0x80 sub s7, s3, s4 # s7 = exp_diff mv s8, s3 # result_exp = exp_a blez s7, add_align srl s6, s6, s7 # align mant_b j add_sign_check add_align: mv s8, s4 # result_exp = exp_b neg s7, s7 srl s5, s5, s7 # align mant_a add_sign_check: beq s1, s2, add_add bge s5, s6, add_sub_a_minus_b mv s9, s2 # result_sign = sign_b sub s10, s6, s5 # result_mant = mant_b - mant_a j add_norm add_sub_a_minus_b: mv s9, s1 # result_sign = sign_a sub s10, s5, s6 # result_mant = mant_a - mant_a add_norm: li a0, 0 beqz s10, add_return # if result is zero, return zero add_norm_loop: li t0, 0x80 and t1, s10, t0 bnez t1, add_pack slli s10, s10, 1 addi s8, s8, -1 blez s8, add_return # return zero if exp underflows j add_norm_loop add_add: mv s9, s1 # result_sign = sign_a add s10, s5, s6 # result_mant andi t0, s10, 0x100 beqz t0, add_pack # Mantissa overflow srli s10, s10, 1 addi s8, s8, 1 li t0, 0xFF bge s8, t0, add_return_inf add_pack: slli s9, s9, 15 andi s8, s8, 0xFF slli s8, s8, 7 andi s10, s10, 0x7F or a0, s9, s8 or a0, a0, s10 j add_return add_sp: bnez s5, add_return # a==NaN, return a li t0, 0xFF bne s4, t0, add_return # b!=NaN/inf, return a bnez s6, add_ret_b # b==NaN, return b beq s1, s2, add_ret_b # inf+inf, return b li a0, 0x7FC0 # +inf + -inf, return NaN j add_return add_ret_b: mv a0, a1 j add_return add_return_inf: slli a0, s9, 15 li t0, 0x7F80 or a0, a0, t0 add_return: ret bf16_sub: li t0, 0x8000 xor a1, a1, t0 j bf16_add bf16_mul: addi sp, sp, -40 sw s0, 0(sp) sw s1, 4(sp) sw s2, 8(sp) sw s3, 12(sp) sw s4, 16(sp) sw s5, 20(sp) sw s6, 24(sp) sw s7, 28(sp) sw s8, 32(sp) sw s9, 36(sp) # a.bits -> sign, exp, mant srli s0, a0, 15 # s0 = sign_a srli s2, a0, 7 andi s2, s2, 0xFF # s2 = exp_a andi s4, a0, 0x7F # s4 = mant_a # b.bits -> sign, exp, mant srli s1, a1, 15 # s1 = sign_b srli s3, a1, 7 andi s3, s3, 0xFF # s3 = exp_b andi s5, a1, 0x7F # s5 = mant_b xor s6, s0, s1 # s6 = result_sign # handle a == NaN or Inf li t0, 0xFF bne s2, t0, mul_check_b_sp bnez s4, mul_ret_a # a is NaN, return a or t0, s3, s5 # if (b == 0) bnez t0, mul_ret_inf # Inf * Num = Inf la t0, BF16_NAN lw a0, 0(t0) # Inf * 0 = NaN j mul_ret mul_check_b_sp: # handle b == NaN or Inf li t0, 0xFF bne s3, t0, mul_check_zero bnez s5, mul_ret_b # b is NaN, return b or t0, s2, s4 # if (a == 0) bnez t0, mul_ret_inf # Num * Inf = Inf la t0, BF16_NAN lw a0, 0(t0) # 0 * Inf = NaN j mul_ret mul_check_zero: # handle a == 0 or b == 0 or t0, s2, s4 beqz t0, mul_ret_zero # if a==0 or t0, s3, s5 beqz t0, mul_ret_zero # if b==0 # normalize denormals li s9, 0 # s9 = exp_adjust bnez s2, mul_norm_a_done mul_norm_a_loop: # normalize a andi t0, s4, 0x80 bnez t0, mul_norm_a_end slli s4, s4, 1 addi s9, s9, -1 j mul_norm_a_loop mul_norm_a_end: li s2, 1 mul_norm_a_done: ori s4, s4, 0x80 # mant_a |= implicit 1 bnez s3, mul_norm_b_done mul_norm_b_loop: # normalize b andi t0, s5, 0x80 bnez t0, mul_norm_b_end slli s5, s5, 1 addi s9, s9, -1 j mul_norm_b_loop mul_norm_b_end: li s3, 1 mul_norm_b_done: ori s5, s5, 0x80 # mant_b |= implicit 1 # main calculation mul s8, s4, s5 # s8 = result_mant la t0, BF16_EXP_BIAS lw t0, 0(t0) add s7, s2, s3 # s7 = result_exp sub s7, s7, t0 add s7, s7, s9 # normalize result li t0, 0x8000 and t0, s8, t0 beqz t0, mul_norm_res_else srli s8, s8, 8 andi s8, s8, 0x7F addi s7, s7, 1 j mul_check_overflow mul_norm_res_else: srli s8, s8, 7 andi s8, s8, 0x7F mul_check_overflow: # check overflow / underflow li t0, 0xFF bge s7, t0, mul_ret_inf blez s7, mul_underflow j mul_pack mul_underflow: li t0, -6 blt s7, t0, mul_ret_zero li t0, 1 sub t0, t0, s7 srl s8, s8, t0 li s7, 0 mul_pack: # pack result bits slli s6, s6, 15 andi s7, s7, 0xFF slli s7, s7, 7 andi s8, s8, 0x7F or a0, s6, s7 or a0, a0, s8 j mul_ret mul_ret_a: mv a0, a0 j mul_ret mul_ret_b: mv a0, a1 j mul_ret mul_ret_zero: slli a0, s6, 15 j mul_ret mul_ret_inf: slli a0, s6, 15 li t0, 0x7F80 or a0, a0, t0 mul_ret: # restore s0-s9 lw s0, 0(sp) lw s1, 4(sp) lw s2, 8(sp) lw s3, 12(sp) lw s4, 16(sp) lw s5, 20(sp) lw s6, 24(sp) lw s7, 28(sp) lw s8, 32(sp) lw s9, 36(sp) addi sp, sp, 40 ret bf16_div: # --- Prologue: Save registers --- addi sp, sp, -44 sw ra, 0(sp) sw s0, 4(sp) sw s1, 8(sp) sw s2, 12(sp) sw s3, 16(sp) sw s4, 20(sp) sw s5, 24(sp) sw s6, 28(sp) sw s7, 32(sp) sw s8, 36(sp) sw s9, 40(sp) addi sp, sp, -4 sw s10, 0(sp) # Save s10 srli s0, a0, 15 # s0 = sign_a srli s2, a0, 7 andi s2, s2, 0xFF # s2 = exp_a andi s4, a0, 0x7F # s4 = mant_a srli s1, a1, 15 # s1 = sign_b srli s3, a1, 7 andi s3, s3, 0xFF # s3 = exp_b andi s5, a1, 0x7F # s5 = mant_b xor s6, s0, s1 # s6 = result_sign # Special Case # if (exp_b == 0xFF),b is Inf or NaN li t0, 0xFF bne s3, t0, div_check_div_by_zero bnez s5, div_ret_b # if (mant_b) return b (NaN) # b is Inf li t0, 0xFF beq s2, t0, div_check_inf_div_inf # Check if a is also Inf # a is a normal number, Num / Inf = 0 j div_ret_zero div_check_inf_div_inf: bnez s4, div_ret_a # a is NaN, return a # Inf / Inf = NaN j div_ret_nan div_check_div_by_zero: # if (!exp_b && !mant_b) -> b is 0 or t0, s3, s5 bnez t0, div_check_a_sp # b is 0 or t0, s2, s4 # Check if a is also 0 beqz t0, div_ret_nan # 0 / 0 = NaN # Num / 0 = Inf j div_ret_inf div_check_a_sp: # if (exp_a == 0xFF) -> a is Inf or NaN li t0, 0xFF bne s2, t0, div_check_a_zero bnez s4, div_ret_a # a is NaN, return a # a is Inf, Inf / Num = Inf j div_ret_inf div_check_a_zero: # if (!exp_a && !mant_a) -> a is 0 or t0, s2, s4 bnez t0, div_pre_calc # a is 0, 0 / Num = 0 j div_ret_zero div_pre_calc: # --- Add implicit leading bit for normal numbers --- bnez s2, div_norm_a_done # a is denormal, mant_a has no implicit 1 j div_norm_b_check div_norm_a_done: ori s4, s4, 0x80 div_norm_b_check: bnez s3, div_norm_b_done # b is denormal j div_main_calc div_norm_b_done: ori s5, s5, 0x80 div_main_calc: # --- Core Division Loop (Restoring Division) --- # uint32_t dividend = (uint32_t) mant_a << 15; slli s8, s4, 15 # s8 = dividend mv s9, s5 # s9 = divisor li s10, 0 # s10 = quotient li t0, 0 # t0 = loop counter i div_loop_start: slli s10, s10, 1 # quotient <<= 1 # Calculate shifted divisor: divisor << (15 - i) li t1, 15 sub t1, t1, t0 sll t2, s9, t1 # t2 = divisor << (15 - i) # if (dividend >= shifted_divisor) bltu s8, t2, div_loop_skip_sub sub s8, s8, t2 # dividend -= shifted_divisor ori s10, s10, 1 # quotient |= 1 div_loop_skip_sub: addi t0, t0, 1 li t1, 16 blt t0, t1, div_loop_start # --- Exponent Calculation --- la t0, BF16_EXP_BIAS lw t0, 0(t0) sub s7, s2, s3 # s7 = result_exp = exp_a - exp_b add s7, s7, t0 # s7 += BIAS beqz s2, div_exp_adj_a_done addi s7, s7, -1 # if a was denormal div_exp_adj_a_done: beqz s3, div_exp_adj_b_done addi s7, s7, 1 # if b was denormal div_exp_adj_b_done: # --- Normalize Quotient --- li t0, 0x8000 and t0, s10, t0 bnez t0, div_norm_q_shift_8 # if (quotient & 0x8000) div_norm_q_loop: li t0, 0x8000 and t0, s10, t0 bnez t0, div_norm_q_shift_8 # if quotient is normalized li t1, 1 ble s7, t1, div_norm_q_shift_8 # if exp <= 1, stop shifting slli s10, s10, 1 addi s7, s7, -1 j div_norm_q_loop div_norm_q_shift_8: srli s10, s10, 8 andi s10, s10, 0x7F # quotient &= 0x7F # --- Check Overflow / Underflow --- li t0, 0xFF bge s7, t0, div_ret_inf # Overflow -> Inf blez s7, div_ret_zero # Underflow -> 0 (simplified from C code) div_pack: slli s6, s6, 15 # result_sign andi s7, s7, 0xFF slli s7, s7, 7 # result_exp andi s10, s10, 0x7F # result_mant or a0, s6, s7 or a0, a0, s10 j div_ret div_ret_a: mv a0, a0 j div_ret div_ret_b: mv a0, a1 j div_ret div_ret_nan: la t0, BF16_NAN lw a0, 0(t0) j div_ret div_ret_zero: slli a0, s6, 15 j div_ret div_ret_inf: slli a0, s6, 15 li t0, 0x7F80 or a0, a0, t0 div_ret: # --- Epilogue: Restore registers --- lw s10, 0(sp) addi sp, sp, 4 lw ra, 0(sp) lw s0, 4(sp) lw s1, 8(sp) lw s2, 12(sp) lw s3, 16(sp) lw s4, 20(sp) lw s5, 24(sp) lw s6, 28(sp) lw s7, 32(sp) lw s8, 36(sp) lw s9, 40(sp) addi sp, sp, 44 ret bf16_sqrt: # save s0-s7 addi sp, sp, -32 sw s0, 0(sp) sw s1, 4(sp) sw s2, 8(sp) sw s3, 12(sp) sw s4, 16(sp) sw s5, 20(sp) sw s6, 24(sp) sw s7, 28(sp) # decompose a srli s0, a0, 15 # s0 = sign srli s1, a0, 7 andi s1, s1, 0xFF # s1 = exp andi s2, a0, 0x7F # s2 = mant # handle special cases li t0, 0xFF bne s1, t0, sqrt_check_zero bnez s2, sqrt_ret_a # a is NaN bnez s0, sqrt_ret_nan # sqrt(-Inf) j sqrt_ret_a # sqrt(+Inf) sqrt_check_zero: or t0, s1, s2 bnez t0, sqrt_check_neg la a0, BF16_ZERO lw a0, 0(t0) # sqrt(0) j sqrt_ret sqrt_check_neg: bnez s0, sqrt_ret_nan # sqrt(negative) beqz s1, sqrt_ret_zero # denormal -> 0 # calculate exponent la t0, BF16_EXP_BIAS lw t0, 0(t0) sub s3, s1, t0 # s3 = e ori s4, s2, 0x80 # s4 = m andi t1, s3, 1 beqz t1, sqrt_exp_even # e is odd slli s4, s4, 1 addi s3, s3, -1 srli s3, s3, 1 add s7, s3, t0 # s7 = new_exp j sqrt_core sqrt_exp_even: # e is even srli s3, s3, 1 add s7, s3, t0 # s7 = new_exp sqrt_core: # binary search li s5, 90 # low li s6, 256 # high li s2, 128 # result sqrt_loop: ble s6, s5, sqrt_norm add t0, s5, s6 # mid srli t0, t0, 1 mul t1, t0, t0 # sq = mid * mid srli t1, t1, 7 # sq /= 128 ble t1, s4, sqrt_update_low addi s6, t0, -1 # high = mid - 1 j sqrt_loop sqrt_update_low: mv s2, t0 # result = mid addi s5, t0, 1 # low = mid + 1 j sqrt_loop sqrt_norm: # normalize result li t0, 256 blt s2, t0, sqrt_norm_check_low srli s2, s2, 1 addi s7, s7, 1 sqrt_norm_check_low: li t0, 128 bge s2, t0, sqrt_norm_end li t0, 1 ble s7, t0, sqrt_norm_end slli s2, s2, 1 addi s7, s7, -1 j sqrt_norm_check_low sqrt_norm_end: andi s2, s2, 0x7F # new_mant # check overflow / underflow li t0, 0xFF bge s7, t0, sqrt_ret_inf blez s7, sqrt_ret_zero sqrt_pack: # pack result andi s7, s7, 0xFF slli s7, s7, 7 or a0, s7, s2 j sqrt_ret sqrt_ret_a: mv a0, a0 j sqrt_ret sqrt_ret_nan: la t0, BF16_NAN lw a0, 0(t0) j sqrt_ret sqrt_ret_zero: la t0, BF16_ZERO lw a0, 0(t0) j sqrt_ret sqrt_ret_inf: li a0, 0x7F80 sqrt_ret: # restore s0-s7 lw s0, 0(sp) lw s1, 4(sp) lw s2, 8(sp) lw s3, 12(sp) lw s4, 16(sp) lw s5, 20(sp) lw s6, 24(sp) lw s7, 28(sp) addi sp, sp, 32 ret bf16_eq: addi sp, sp, -16 sw ra, 12(sp) sw a0, 8(sp) # a sw a1, 4(sp) # b # check if (bf16_isnan(a) || bf16_isnan(b)) jal ra, bf16_isnan bnez a0, eq_false # isnan(b), return false lw a0, 4(sp) # b jal ra, bf16_isnan bnez a0, eq_false # isnan(b), return false # check if (bf16_iszero(a) && iszero(b)) lw a0, 8(sp) # a jal ra, bf16_iszero beqz a0, eq_cmp_bits lw a0, 4(sp) # b jal ra, bf16_iszero beqz a0, eq_cmp_bits # a==0 && b==0, return true li a0, 1 j eq_end eq_cmp_bits: # return a.bits == b.bits; lw t0, 8(sp) # a.bits lw t1, 4(sp) # b.bits xor a0, t0, t1 # a0 = a.bits ^ b.bits sltiu a0, a0, 1 # a0 == 0 return true, else false j eq_end eq_false: li a0, 0 # false eq_end: lw ra, 12(sp) addi sp, sp, 16 ret bf16_lt: # store return addr & a,b addi sp, sp, -16 sw ra, 12(sp) sw a0, 8(sp) # a sw a1, 4(sp) # b # isnan(a) jal ra, bf16_isnan bnez a0, lt_false # isnan(b) lw a0, 4(sp) jal ra, bf16_isnan bnez a0, lt_false # check if (iszero(a) && iszero(b)) lw a0, 8(sp) jal ra, bf16_iszero beqz a0, lt_cmp_sign lw a0, 4(sp) # b jal ra, bf16_iszero bnez a0, lt_false lt_cmp_sign: lw t0, 8(sp) # a.bits lw t1, 4(sp) # b.bits srli t2, t0, 15 # sign_a = (a.bits >> 15) & 1 srli t3, t1, 15 # sign_b = (b.bits >> 15) & 1 beq t2, t3, lt_eq sltu a0, t3, t2 # sign_a > sign_b j lt_end lt_eq: bnez t2, lt_both_negative # return a.bits < b.bits sltu a0, t0, t1 j lt_end lt_both_negative: # return a.bits > b.bits sltu a0, t1, t0 j lt_end lt_false: li a0, 0 # false lt_end: lw ra, 12(sp) addi sp, sp, 16 ret bf16_gt: # swap (a,b), call bf16_lt mv t0, a0 mv a0, a1 mv a1, t0 j bf16_lt ``` ::: ### Analysis **Comparison** | | C Code | ASM | |:----------------:|:------:|:-------:| | **Cycles** | | 1247 | | **Instrs. Ret.** | | 873 | | **CPI** | | 1.43 | | **IPC** | | 0.7 | | **Clock rate** | | 1.43KHz | ## [LeetCode 1611. Minimum One Bit Operations to Make Integers Zero](https://leetcode.com/problems/minimum-one-bit-operations-to-make-integers-zero/description/) ### Description Given an integer `n`, you must transform it into `0` using the following operations any number of times: - Change the rightmost (0th) bit in the binary representation of `n`. - Change the `i`th bit in the binary representation of `n` **if** the `(i−1)`th bit is set to `1` and the `(i−2)`th through `0`th bits are set to `0`. Return the *minimum number of operations* to transform `n` into `0`. --- **Example 1:** > Input: n = 3 > Output: 2 > Explanation: > The binary representation of 3 is `"11"`. > `"11"` → `"01"` with the 2nd operation since the 0th bit is 1. > `"01"` → `"00"` with the 1st operation. --- **Example 2:** > Input: n = 6 > Output: 4 > Explanation: > The binary representation of 6 is `"110"`. > `"110"` → `"010"` with the 2nd operation since the 1st bit is 1 and 0th through 0th bits are 0. > `"010"` → `"011"` with the 1st operation. > `"011"` → `"001"` with the 2nd operation since the 0th bit is 1. > `"001"` → `"000"` with the 1st operation. --- **Constraints** * `0 <= n <= 10⁹` --- ### Implementation For a given integer $n$, each operation flips a single bit following the rule of Gray code transition. Thus, the minimum number of operations equals the **Gray code value decoded to binary**. The Gray code encoding is: $$ g = n \oplus (n \gg 1) $$ and decoding (the reverse process) is: $$ f(n) = n \oplus (n \gg 1) \oplus (n \gg 2) \oplus (n \gg 3) \oplus \dots $$ Iteratively, this can be computed by: ```c= while (n) { res ^= n; n >>= 1; } ``` Alternatively, the recursive form based on the most significant bit (MSB) is: $$ f(0) = 0, \quad f(n) = (1 \ll (k+1)) - 1 - f(n \oplus (1 \ll k)) $$ where $k = \text{MSB position of } n$. To find $k$ efficiently, `clz` (Count Leading Zeros) is used: $$ k = 31 - \text{clz}(n) $$ This avoids the $O(\log n)$ loop that shifts `n` until zero, reducing the total complexity from $O((\log n)^2)$ to $O(\log n)$. ### C Code ```c= #include <stdio.h> #include <stdint.h> #include <stdio.h> #include <stdint.h> static inline unsigned clz(uint32_t x) { int n = 32, c = 16; do { uint32_t y = x >> c; if (y) { n -= c; x = y; } c >>= 1; } while (c); return n - x; } int minimumOneBitOperations(int n) { if (!n) return 0; int k = 31 - (int)clz((uint32_t)n); return ((1 << (k + 1)) - 1) - minimumOneBitOperations(n ^ (1 << k)); } int main(void) { printf("%d\n", minimumOneBitOperations(1000000)); printf("%d\n", minimumOneBitOperations(2000000)); printf("%d\n", minimumOneBitOperations(3000000)); } ``` ### Assembly Code :::spoiler Assembly Code ```s= .data case: .word 1000000,2000000,3000000 count: .word 3 .text .globl main main: la s0, case lw t1, count loop: beqz t1, exit lw a0, 0(s0) addi sp, sp, -16 sw ra, 12(sp) sw s0, 8(sp) sw t1, 4(sp) jal ra, minimumOneBitOperations lw t1, 4(sp) lw s0, 8(sp) lw ra, 12(sp) addi sp, sp, 16 li a7, 1 ecall li a7, 11 li a0, 10 ecall addi s0, s0, 4 addi t1, t1, -1 j loop exit: li a7, 10 ecall minimumOneBitOperations: beq a0, zero, mbo_ret0 addi sp, sp, -20 sw ra, 16(sp) sw s0, 12(sp) sw a0, 8(sp) # n # k = 31 - clz(n) jal ra, clz # a0 = clz(n) lw t0, 8(sp) # t0 = original n li t1, 31 sub t2, t1, a0 # t2 = k # s0 = ((1 << (k+1)) - 1) li t3, 1 addi t4, t2, 1 sll t3, t3, t4 # t3 = 1<<(k+1) addi t3, t3, -1 # t3 = (1<<(k+1))-1 mv s0, t3 # a0 = n ^ (1<<k) li t5, 1 sll t5, t5, t2 # t5 = 1<<k xor a0, t0, t5 # a0 = n ^ (1<<k) jal ra, minimumOneBitOperations sub a0, s0, a0 lw s0, 12(sp) lw ra, 16(sp) addi sp, sp, 20 ret mbo_ret0: li a0, 0 ret clz: li t0, 32 # n = 32 li t1, 16 # c = 16 clz_loop: srl t2, a0, t1 # y = x >> c beqz t2, clz_skip sub t0, t0, t1 # n -= c mv a0, t2 # x = y clz_skip: srli t1, t1, 1 bnez t1, clz_loop sub a0, t0, a0 # return n - x ret ``` ::: --- Of course, we can optimize it with the **branchless CLZ** version already implemented in Problem B. :::spoiler Assembly Code with branchless CLZ ```s= .data case: .word 1000000,2000000,3000000 count: .word 3 .text .globl main main: la s0, case lw t1, count loop: beqz t1, exit lw a0, 0(s0) addi sp, sp, -16 sw ra, 12(sp) sw s0, 8(sp) sw t1, 4(sp) jal ra, minimumOneBitOperations lw t1, 4(sp) lw s0, 8(sp) lw ra, 12(sp) addi sp, sp, 16 li a7, 1 ecall li a7, 11 li a0, 10 ecall addi s0, s0, 4 addi t1, t1, -1 j loop exit: li a7, 10 ecall minimumOneBitOperations: beq a0, zero, mbo_ret0 addi sp, sp, -20 sw ra, 16(sp) sw s0, 12(sp) sw a0, 8(sp) # n # k = 31 - clz(n) jal ra, clz # a0 = clz(n) lw t0, 8(sp) # t0 = original n li t1, 31 sub t2, t1, a0 # t2 = k # s0 = ((1 << (k+1)) - 1) li t3, 1 addi t4, t2, 1 sll t3, t3, t4 # t3 = 1<<(k+1) addi t3, t3, -1 # t3 = (1<<(k+1))-1 mv s0, t3 # a0 = n ^ (1<<k) li t5, 1 sll t5, t5, t2 # t5 = 1<<k xor a0, t0, t5 # a0 = n ^ (1<<k) jal ra, minimumOneBitOperations sub a0, s0, a0 lw s0, 12(sp) lw ra, 16(sp) addi sp, sp, 20 ret mbo_ret0: li a0, 0 ret clz: li t0, 32 # n = 32 # 16-bit srli t2, a0, 16 # y = x >> 16 snez t3, t2 # y != 0 slli t1, t3, 4 # t1 = 16 or 0 sub t0, t0, t1 # n -= t1 srl a0, a0, t1 # x >>= t1 # 8-bit srli t2, a0, 8 snez t3, t2 slli t1, t3, 3 # t1 = 8 or 0 sub t0, t0, t1 srl a0, a0, t1 # 4-bit srli t2, a0, 4 snez t3, t2 slli t1, t3, 2 # t1 = 4 or 0 sub t0, t0, t1 srl a0, a0, t1 # 2-bit srli t2, a0, 2 snez t3, t2 slli t1, t3, 1 # t1 = 2 or 0 sub t0, t0, t1 srl a0, a0, t1 # 1-bit srli t2, a0, 1 snez t3, t2 sub t0, t0, t3 # n -= 1 srl a0, a0, t3 sub a0, t0, a0 # return n - x ret ``` ::: ### Analysis Test with 3 test case `1000000,2000000,3000000` **Console Output** ![image](https://hackmd.io/_uploads/r1xX1r9Txg.png =40%x) **Comparision** | | C Code | ASM | CLZ opt. | |:----------------:|:---------:|:--------:|:--------:| | **Cycles** | 12011 | 1870 | 1510 | | **Instrs. Ret.** | 9003 | 1317 | 1273 | | **CPI** | 1.33 | 1.42 | 1.19 | | **IPC** | 0.75 | 0.704 | 0.843 | | **Clock rate** | 22.68 KHz | 1.71 KHz | 2.90 KHz | ## 5 stage RISC-V Processor We test our code using [Ripes](https://ripes.me/) simulator. RISC-V processor have five stage : **IF, ID, EX, MEM, WB** ![image](https://hackmd.io/_uploads/HkwHu29Jyx.png =60%x) | Stage | Full Name | |:-----:|:------------------:| | IF | Instruction Fetch | | ID | Instruction Decode | | EX | Execution | | MEM | Data Memory Access | | WB | Write Back | --- ## Reference [Quiz1 of Computer Architecture (2025 Fall)](https://hackmd.io/@sysprog/2025-arch-homework1) [LeetCode 1611 - Minimum One Bit Operations to Make Integers Zero](https://leetcode.com/problems/minimum-one-bit-operations-to-make-integers-zero/description/) [Markdown syntax](https://hackmd.io/@mrcoding/ryZE7k8cN)