Assignment1: RISC-V Assembly and Instruction Pipeline

# Assignment1: RISC-V Assembly and Instruction Pipeline contributed by <`戴仁杰`> repo: https://github.com/dozingmoon/ca2025-quizzes Disclaimer: I used AI tools such as ChatGPT for better understanding of algorithm or architecture descriptions. # Problem B ## C Code :::spoiler `q1/q1-uf8.c` ```clike= #include <stdbool.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> typedef uint8_t uf8; static inline unsigned clz(uint32_t x) { int n = 32, c = 16; do { uint32_t y = x >> c; if (y) { n -= c; x = y; } c >>= 1; } while (c); return n - x; } /* Decode uf8 to uint32_t */ uint32_t uf8_decode(uf8 fl) { uint32_t mantissa = fl & 0x0f; uint8_t exponent = fl >> 4; uint32_t offset = (0x7FFF >> (15 - exponent)) << 4; return (mantissa << exponent) + offset; } /* Encode uint32_t to uf8 */ uf8 uf8_encode(uint32_t value) { /* Use CLZ for fast exponent calculation */ if (value < 16) return value; /* Find appropriate exponent using CLZ hint */ int lz = clz(value); int msb = 31 - lz; /* Start from a good initial guess */ uint8_t exponent = 0; uint32_t overflow = 0; if (msb >= 5) { /* Estimate exponent - the formula is empirical */ exponent = msb - 4; if (exponent > 15) exponent = 15; /* Calculate overflow for estimated exponent */ for (uint8_t e = 0; e < exponent; e++) overflow = (overflow << 1) + 16; /* Adjust if estimate was off */ while (exponent > 0 && value < overflow) { overflow = (overflow - 16) >> 1; exponent--; } } /* Find exact exponent */ while (exponent < 15) { uint32_t next_overflow = (overflow << 1) + 16; if (value < next_overflow) break; overflow = next_overflow; exponent++; } uint8_t mantissa = (value - overflow) >> exponent; return (exponent << 4) | mantissa; } /* Test encode/decode round-trip */ static bool test(void) { int32_t previous_value = -1; bool passed = true; for (int i = 0; i < 256; i++) { uint8_t fl = i; int32_t value = uf8_decode(fl); uint8_t fl2 = uf8_encode(value); if (fl != fl2) { printf("%02x: produces value %d but encodes back to %02x\n", fl, value, fl2); passed = false; } if (value <= previous_value) { printf("%02x: value %d <= previous_value %d\n", fl, value, previous_value); passed = false; } previous_value = value; } return passed; } int main(void) { if (test()) { printf("All tests passed.\n"); return 0; } return 1; } ``` ::: ## Assembly :::spoiler `hw1-uf8.s` ```asm= .data str1: .string ": produces value " str2: .string " but encodes back to " str3: .string ": value " str4: .string " >= previous_value " strnl: .string "\n" strps: .string "All tests passed" .text ################# # main # ################# main: addi sp,sp,-4 sw ra,0(sp) jal test # a1: passed beq a1,x0,NotPassed # Test is passed li a7,4 la a0, strps ecall li a5,0 j exit_main NotPassed: li a5,1 ret exit_main: mv a0,a5 lw ra,0(sp) addi sp,sp,4 li a7, 10 # exit system call ecall ################# # CLZ # ################# clz: # x: a0, return: a1 li t1,32 # t1 = n = 32 li t2,16 # t2 = c = 16 clz_loop: srl t3,a0,t2 # t3=y=x>>c beq t3,x0,clz_else sub t1,t1,t2 mv t0,t3 clz_else: srai t2,t2,1 beq t2,x0,clz_loop sub a1,t1,t0 ret ################# # uf8_decode # ################# uf8_decode: # fl: a0, return: a1 andi t0,a0,0x0f # mantissa srli t1,a0,4 # exponent li t2,15 sub t2,t2,t1 li t3,0x7fff srl t2,t3,t2 slli t2,t2,4 # offset sll a0,t0,t1 add a1,a0,t2 ret ################# # uf8_encode # ################# uf8_encode: # value: a0, ret: a1 srli t0,a0,4 # will be zero if a0 < 16 bne t0,x0,ValGe16 ret ValGe16: addi sp,sp,-4 sw ra,0(sp) jal clz # lz: a1 lw ra,0(sp) addi sp,sp,4 li t0, 31 sub t0,t0,a1 # msb: t0 li t1,0 # exp: t1 li t2,0 # ovf: t2 addi t3,t0,-5 bltz t3, ExpLoopCond # if msb >= 5 # estimate exponent addi t1,t0,-4 li t3,15 ble t1,t3,ExpLe15 mv t1,t3 ExpLe15: li t3,0 OvfLoopCond: bge t3,t1, AdjLoopCond slli t2,t2,1 addi t2,t2,16 addi t3,t3,1 j OvfLoopCond AdjLoopCond: sltu t3,a0,t2 # t3: value < overflow sltu t4,x0,t1 # t4: 0 < exponent and t3,t3,t4 bgeu x0,t3,ExpLoopCond # break when cond is 0 addi t2,t2,-16 srli t2,t2,1 addi t1,t1,-1 j AdjLoopCond ExpLoopCond: li t3,15 bgeu t1,t3,ExpLoopExit slli t3,t2,1 addi t3,t3,16 # t3: next_overflow bltu a0,t3,ExpLoopExit mv t2,t3 addi t1,t1,1 j ExpLoopCond ExpLoopExit: sub t3,a0,t2 srl t3,t3,t1 slli t1,t1,4 or a1,t1,t3 ret ################# # test # ################# test: li t5,0xffffffff # t0: previous_value li t6, 1 # t1: passed li s0, 255 # s0: fl/i TestLoopCond: blt s0,x0,TestLoopExit addi sp,sp,-4 sw ra,0(sp) mv a0,s0 jal uf8_decode # a1: value mv a0,a1 jal uf8_encode mv s1, a0 # s1: value; a1: fl2 lw ra,0(sp) addi sp,sp,4 beq s0,a1,fl_equal_fl2 # Print if fl != fl2 li a7, 34 # Print hex mv a0,s0 # fl ecall li a7,4 la a0,str1 ecall li a7,34 mv a0,s1 # value ecall li a7,4 la a0,str2 ecall li a7,34 mv a0,a1 # fl2 ecall li a7,4 la a0,strnl ecall mv t6,x0 fl_equal_fl2: bgtu t5,s1,prev_gt_val # Print if previous_value <= value li a7, 34 # Print hex mv a0,s0 # fl ecall li a7,4 la a0,str3 ecall li a7,34 mv a0,s1 # value ecall li a7,4 la a0,str4 ecall li a7,34 mv a0,t5 # previous_value ecall li a7,4 la a0,strnl ecall mv t6,x0 prev_gt_val: mv t5,s1 addi s0,s0,-1 j TestLoopCond TestLoopExit: mv a1,t6 ret ``` ::: # Problem C ## C Code :::spoiler `q1/q1-bf16.c` ```clike= #include <stdbool.h> #include <stdint.h> #include <string.h> typedef struct { uint16_t bits; } bf16_t; #define BF16_SIGN_MASK 0x8000U #define BF16_EXP_MASK 0x7F80U #define BF16_MANT_MASK 0x007FU #define BF16_EXP_BIAS 127 #define BF16_NAN() ((bf16_t) {.bits = 0x7FC0}) #define BF16_ZERO() ((bf16_t) {.bits = 0x0000}) static inline bool bf16_isnan(bf16_t a) { return ((a.bits & BF16_EXP_MASK) == BF16_EXP_MASK) && (a.bits & BF16_MANT_MASK); } static inline bool bf16_isinf(bf16_t a) { return ((a.bits & BF16_EXP_MASK) == BF16_EXP_MASK) && !(a.bits & BF16_MANT_MASK); } static inline bool bf16_iszero(bf16_t a) { return !(a.bits & 0x7FFF); } static inline bf16_t f32_to_bf16(float val) { uint32_t f32bits; memcpy(&f32bits, &val, sizeof(float)); if (((f32bits >> 23) & 0xFF) == 0xFF) return (bf16_t) {.bits = (f32bits >> 16) & 0xFFFF}; f32bits += ((f32bits >> 16) & 1) + 0x7FFF; return (bf16_t) {.bits = f32bits >> 16}; } static inline float bf16_to_f32(bf16_t val) { uint32_t f32bits = ((uint32_t) val.bits) << 16; float result; memcpy(&result, &f32bits, sizeof(float)); return result; } static inline bf16_t bf16_add(bf16_t a, bf16_t b) { uint16_t sign_a = (a.bits >> 15) & 1; uint16_t sign_b = (b.bits >> 15) & 1; int16_t exp_a = ((a.bits >> 7) & 0xFF); int16_t exp_b = ((b.bits >> 7) & 0xFF); uint16_t mant_a = a.bits & 0x7F; uint16_t mant_b = b.bits & 0x7F; if (exp_a == 0xFF) { if (mant_a) return a; if (exp_b == 0xFF) return (mant_b || sign_a == sign_b) ? b : BF16_NAN(); return a; } if (exp_b == 0xFF) return b; if (!exp_a && !mant_a) return b; if (!exp_b && !mant_b) return a; if (exp_a) mant_a |= 0x80; if (exp_b) mant_b |= 0x80; int16_t exp_diff = exp_a - exp_b; uint16_t result_sign; int16_t result_exp; uint32_t result_mant; if (exp_diff > 0) { result_exp = exp_a; if (exp_diff > 8) return a; mant_b >>= exp_diff; } else if (exp_diff < 0) { result_exp = exp_b; if (exp_diff < -8) return b; mant_a >>= -exp_diff; } else { result_exp = exp_a; } if (sign_a == sign_b) { result_sign = sign_a; result_mant = (uint32_t) mant_a + mant_b; if (result_mant & 0x100) { result_mant >>= 1; if (++result_exp >= 0xFF) return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; } } else { if (mant_a >= mant_b) { result_sign = sign_a; result_mant = mant_a - mant_b; } else { result_sign = sign_b; result_mant = mant_b - mant_a; } if (!result_mant) return BF16_ZERO(); while (!(result_mant & 0x80)) { result_mant <<= 1; if (--result_exp <= 0) return BF16_ZERO(); } } return (bf16_t) { .bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) | (result_mant & 0x7F), }; } static inline bf16_t bf16_sub(bf16_t a, bf16_t b) { b.bits ^= BF16_SIGN_MASK; return bf16_add(a, b); } static inline bf16_t bf16_mul(bf16_t a, bf16_t b) { uint16_t sign_a = (a.bits >> 15) & 1; uint16_t sign_b = (b.bits >> 15) & 1; int16_t exp_a = ((a.bits >> 7) & 0xFF); int16_t exp_b = ((b.bits >> 7) & 0xFF); uint16_t mant_a = a.bits & 0x7F; uint16_t mant_b = b.bits & 0x7F; uint16_t result_sign = sign_a ^ sign_b; if (exp_a == 0xFF) { if (mant_a) return a; if (!exp_b && !mant_b) return BF16_NAN(); return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; } if (exp_b == 0xFF) { if (mant_b) return b; if (!exp_a && !mant_a) return BF16_NAN(); return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; } if ((!exp_a && !mant_a) || (!exp_b && !mant_b)) return (bf16_t) {.bits = result_sign << 15}; int16_t exp_adjust = 0; if (!exp_a) { while (!(mant_a & 0x80)) { mant_a <<= 1; exp_adjust--; } exp_a = 1; } else mant_a |= 0x80; if (!exp_b) { while (!(mant_b & 0x80)) { mant_b <<= 1; exp_adjust--; } exp_b = 1; } else mant_b |= 0x80; uint32_t result_mant = (uint32_t) mant_a * mant_b; int32_t result_exp = (int32_t) exp_a + exp_b - BF16_EXP_BIAS + exp_adjust; if (result_mant & 0x8000) { result_mant = (result_mant >> 8) & 0x7F; result_exp++; } else result_mant = (result_mant >> 7) & 0x7F; if (result_exp >= 0xFF) return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; if (result_exp <= 0) { if (result_exp < -6) return (bf16_t) {.bits = result_sign << 15}; result_mant >>= (1 - result_exp); result_exp = 0; } return (bf16_t) {.bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) | (result_mant & 0x7F)}; } static inline bf16_t bf16_div(bf16_t a, bf16_t b) { uint16_t sign_a = (a.bits >> 15) & 1; uint16_t sign_b = (b.bits >> 15) & 1; int16_t exp_a = ((a.bits >> 7) & 0xFF); int16_t exp_b = ((b.bits >> 7) & 0xFF); uint16_t mant_a = a.bits & 0x7F; uint16_t mant_b = b.bits & 0x7F; uint16_t result_sign = sign_a ^ sign_b; if (exp_b == 0xFF) { if (mant_b) return b; /* Inf/Inf = NaN */ if (exp_a == 0xFF && !mant_a) return BF16_NAN(); return (bf16_t) {.bits = result_sign << 15}; } if (!exp_b && !mant_b) { if (!exp_a && !mant_a) return BF16_NAN(); return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; } if (exp_a == 0xFF) { if (mant_a) return a; return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; } if (!exp_a && !mant_a) return (bf16_t) {.bits = result_sign << 15}; if (exp_a) mant_a |= 0x80; if (exp_b) mant_b |= 0x80; uint32_t dividend = (uint32_t) mant_a << 15; uint32_t divisor = mant_b; uint32_t quotient = 0; for (int i = 0; i < 16; i++) { quotient <<= 1; if (dividend >= (divisor << (15 - i))) { dividend -= (divisor << (15 - i)); quotient |= 1; } } int32_t result_exp = (int32_t) exp_a - exp_b + BF16_EXP_BIAS; if (!exp_a) result_exp--; if (!exp_b) result_exp++; if (quotient & 0x8000) quotient >>= 8; else { while (!(quotient & 0x8000) && result_exp > 1) { quotient <<= 1; result_exp--; } quotient >>= 8; } quotient &= 0x7F; if (result_exp >= 0xFF) return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; if (result_exp <= 0) return (bf16_t) {.bits = result_sign << 15}; return (bf16_t) { .bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) | (quotient & 0x7F), }; } static inline bf16_t bf16_sqrt(bf16_t a) { uint16_t sign = (a.bits >> 15) & 1; int16_t exp = ((a.bits >> 7) & 0xFF); uint16_t mant = a.bits & 0x7F; /* Handle special cases */ if (exp == 0xFF) { if (mant) return a; /* NaN propagation */ if (sign) return BF16_NAN(); /* sqrt(-Inf) = NaN */ return a; /* sqrt(+Inf) = +Inf */ } /* sqrt(0) = 0 (handle both +0 and -0) */ if (!exp && !mant) return BF16_ZERO(); /* sqrt of negative number is NaN */ if (sign) return BF16_NAN(); /* Flush denormals to zero */ if (!exp) return BF16_ZERO(); /* Direct bit manipulation square root algorithm */ /* For sqrt: new_exp = (old_exp - bias) / 2 + bias */ int32_t e = exp - BF16_EXP_BIAS; int32_t new_exp; /* Get full mantissa with implicit 1 */ uint32_t m = 0x80 | mant; /* Range [128, 256) representing [1.0, 2.0) */ /* Adjust for odd exponents: sqrt(2^odd * m) = 2^((odd-1)/2) * sqrt(2*m) */ if (e & 1) { m <<= 1; /* Double mantissa for odd exponent */ new_exp = ((e - 1) >> 1) + BF16_EXP_BIAS; } else { new_exp = (e >> 1) + BF16_EXP_BIAS; } /* Now m is in range [128, 256) or [256, 512) if exponent was odd */ /* Binary search for integer square root */ /* We want result where result^2 = m * 128 (since 128 represents 1.0) */ uint32_t low = 90; /* Min sqrt (roughly sqrt(128)) */ uint32_t high = 256; /* Max sqrt (roughly sqrt(512)) */ uint32_t result = 128; /* Default */ /* Binary search for square root of m */ while (low <= high) { uint32_t mid = (low + high) >> 1; uint32_t sq = (mid * mid) / 128; /* Square and scale */ if (sq <= m) { result = mid; /* This could be our answer */ low = mid + 1; } else { high = mid - 1; } } /* result now contains sqrt(m) * sqrt(128) / sqrt(128) = sqrt(m) */ /* But we need to adjust the scale */ /* Since m is scaled where 128=1.0, result should also be scaled same way */ /* Normalize to ensure result is in [128, 256) */ if (result >= 256) { result >>= 1; new_exp++; } else if (result < 128) { while (result < 128 && new_exp > 1) { result <<= 1; new_exp--; } } /* Extract 7-bit mantissa (remove implicit 1) */ uint16_t new_mant = result & 0x7F; /* Check for overflow/underflow */ if (new_exp >= 0xFF) return (bf16_t) {.bits = 0x7F80}; /* +Inf */ if (new_exp <= 0) return BF16_ZERO(); return (bf16_t) {.bits = ((new_exp & 0xFF) << 7) | new_mant}; } ``` ::: ## Assembly :::spoiler `hw1/hw1-bf16.s` ```asm= .data zero_data: .half 0x0000, 0x8000 nan_data: .half 0x7fff,0xffff,0x7f81, 0xff81 inf_data: .half 0x7f80,0xff80 f32_data: .word 0x3f8f86c2 # 1.1213 .word 0x41336873 # 11.213 bf16_data: .half 0x3f90,0x4133 # 1.1213, 11.213 .half 0x4133,0xbf90 # 11.213, -1.1213 .half 0xbf90,0x4133 # -1.1213, 11.213 .half 0xc133,0xbf90 # -11.213, -1.1213 gold_data_add: .word 0x4145,0x4121,0x4121,0xc145 gold_data_sub: .word 0xc121,0x4145,0xc145,0xc121 gold_data_mul: .word 0x4149,0xc149,0xc149,0x4149 gold_data_div: .word 0x3dcd,0xc11f,0xbdcd,0x411f gold_data_sqrt: .word 0x3f88,0x4056,0x7fc0,0x7fc0 str_plus: .string " + " str_min: .string " - " str_mul: .string " * " str_div: .string " / " str_sqrt: .string " ^0.5 " str_equal: .string " is " str_not: .string " not " str_nl: .string "\n" str_passed: .string ", passed!\n" str_failed: .string ", failed!\n" .text main: jal test_add jal test_sub jal test_mul jal test_div jal test_sqrt li a7,10 ecall ################# # test_add # ################# test_add: addi sp,sp,-4 sw ra,0(sp) la s0,bf16_data la s1,gold_data_add li s2,4 # data counter test_add_loop: addi s2,s2,-1 bltz s2,test_add_exit slli t0,s2,2 add t1,s0,t0 add t2,s1,t0 lhu a1,0(t1) # op a lhu a2,2(t1) # op b lhu s3,0(t2) # answer addi sp,sp,-20 sw ra,0(sp) sw s0,4(sp) sw s1,8(sp) sw s2,12(sp) sw s3,16(sp) jal bf16_add lw ra,0(sp) lw s0,4(sp) lw s1,8(sp) lw s2,12(sp) lw s3,16(sp) addi sp,sp,20 mv s4,a0 # result: s4 mv a0, a1 li a7, 34 ecall la a0, str_plus li a7,4 ecall mv a0, a2 li a7,34 ecall la a0, str_equal li a7,4 ecall mv a0, s3 # ans li a7,34 ecall beq s3,s4,test_add_passed la a0, str_not li a7,4 ecall mv a0,s4 # wrong result li a7,34 ecall la a0,str_failed li a7,4 ecall j test_add_loop test_add_passed: la a0,str_passed li a7,4 ecall j test_add_loop test_add_exit: lw ra,0(sp) addi sp,sp,4 ret ################# # test_sub # ################# test_sub: addi sp,sp,-4 sw ra,0(sp) la s0,bf16_data la s1,gold_data_sub li s2,4 # data counter test_sub_loop: addi s2,s2,-1 bltz s2,test_sub_exit slli t0,s2,2 add t1,s0,t0 add t2,s1,t0 lhu a1,0(t1) # op a lhu a2,2(t1) # op b lhu s3,0(t2) # answer addi sp,sp,-20 sw ra,0(sp) sw s0,4(sp) sw s1,8(sp) sw s2,12(sp) sw s3,16(sp) jal bf16_sub lw ra,0(sp) lw s0,4(sp) lw s1,8(sp) lw s2,12(sp) lw s3,16(sp) addi sp,sp,20 mv s4,a0 # result: s4 mv a0, a1 li a7, 34 ecall la a0, str_min li a7,4 ecall mv a0, a2 li a7,34 ecall la a0, str_equal li a7,4 ecall mv a0, s3 # ans li a7,34 ecall beq s3,s4,test_sub_passed la a0, str_not li a7,4 ecall mv a0,s4 # wrong result li a7,34 ecall la a0,str_failed li a7,4 ecall j test_sub_loop test_sub_passed: la a0,str_passed li a7,4 ecall j test_sub_loop test_sub_exit: lw ra,0(sp) addi sp,sp,4 ret ################# # test_mul # ################# test_mul: addi sp,sp,-4 sw ra,0(sp) la s0,bf16_data la s1,gold_data_mul li s2,4 # data counter test_mul_loop: addi s2,s2,-1 bltz s2,test_mul_exit slli t0,s2,2 add t1,s0,t0 add t2,s1,t0 lhu a1,0(t1) # op a lhu a2,2(t1) # op b lhu s3,0(t2) # answer addi sp,sp,-20 sw ra,0(sp) sw s0,4(sp) sw s1,8(sp) sw s2,12(sp) sw s3,16(sp) jal bf16_mul lw ra,0(sp) lw s0,4(sp) lw s1,8(sp) lw s2,12(sp) lw s3,16(sp) addi sp,sp,20 mv s4,a0 # result: s4 mv a0, a1 li a7, 34 ecall la a0, str_mul li a7,4 ecall mv a0, a2 li a7,34 ecall la a0, str_equal li a7,4 ecall mv a0, s3 # ans li a7,34 ecall beq s3,s4,test_mul_passed la a0, str_not li a7,4 ecall mv a0,s4 # wrong result li a7,34 ecall la a0,str_failed li a7,4 ecall j test_mul_loop test_mul_passed: la a0,str_passed li a7,4 ecall j test_mul_loop test_mul_exit: lw ra,0(sp) addi sp,sp,4 ret ################# # test_div # ################# test_div: addi sp,sp,-4 sw ra,0(sp) la s0,bf16_data la s1,gold_data_div li s2,4 # data counter test_div_loop: addi s2,s2,-1 bltz s2,test_div_exit slli t0,s2,2 add t1,s0,t0 add t2,s1,t0 lhu a1,0(t1) # op a lhu a2,2(t1) # op b lhu s3,0(t2) # answer addi sp,sp,-20 sw ra,0(sp) sw s0,4(sp) sw s1,8(sp) sw s2,12(sp) sw s3,16(sp) jal bf16_div lw ra,0(sp) lw s0,4(sp) lw s1,8(sp) lw s2,12(sp) lw s3,16(sp) addi sp,sp,20 mv s4,a0 # result: s4 mv a0, a1 li a7, 34 ecall la a0, str_div li a7,4 ecall mv a0, a2 li a7,34 ecall la a0, str_equal li a7,4 ecall mv a0, s3 # ans li a7,34 ecall beq s3,s4,test_div_passed la a0, str_not li a7,4 ecall mv a0,s4 # wrong result li a7,34 ecall la a0,str_failed li a7,4 ecall j test_div_loop test_div_passed: la a0,str_passed li a7,4 ecall j test_div_loop test_div_exit: lw ra,0(sp) addi sp,sp,4 ret ################# # test_sqrt # ################# test_sqrt: addi sp,sp,-4 sw ra,0(sp) la s0,bf16_data la s1,gold_data_sqrt li s2,4 # data counter test_sqrt_loop: addi s2,s2,-1 bltz s2,test_sqrt_exit slli t0,s2,2 add t1,s0,t0 add t2,s1,t0 lhu a1,0(t1) # op a lhu s3,0(t2) # answer addi sp,sp,-20 sw ra,0(sp) sw s0,4(sp) sw s1,8(sp) sw s2,12(sp) sw s3,16(sp) jal bf16_sqrt lw ra,0(sp) lw s0,4(sp) lw s1,8(sp) lw s2,12(sp) lw s3,16(sp) addi sp,sp,20 mv s4,a0 # result: s4 mv a0, a1 li a7, 34 ecall la a0, str_sqrt li a7,4 ecall la a0, str_equal li a7,4 ecall mv a0, s3 # ans li a7,34 ecall beq s3,s4,test_sqrt_passed la a0, str_not li a7,4 ecall mv a0,s4 # wrong result li a7,34 ecall la a0,str_failed li a7,4 ecall j test_sqrt_loop test_sqrt_passed: la a0,str_passed li a7,4 ecall j test_sqrt_loop test_sqrt_exit: lw ra,0(sp) addi sp,sp,4 ret ################# # bf16_isnan # ################# bf16_isnan: li s1,0x7F80 # BF16_EXP_MASK and t0,a0,s1 # a & exp_mask bne t0,s1,bf16_isnan_false # check mantissa part is non-zero andi t0,a0,0x7F beq t0,x0,bf16_isnan_false li a1,1 ret bf16_isnan_false: li a1,0 ret ################# # bf16_isinf # ################# bf16_isinf: li s1,0x7F80 # BF16_EXP_MASK and t0,a0,s1 # a & exp_mask bne t0,s1,bf16_isinf_false # check mantissa part is zero andi t0,a0,0x7F bne t0,x0,bf16_isinf_false li a1,1 ret bf16_isinf_false: li a1,0 ret ################# # bf16_iszero # ################# bf16_iszero: li s2,0x7FFF # mask exp and mant and t0,a0,s2 bne t0,x0,bf16_iszero_false li a1,1 ret bf16_iszero_false: li a1,0 ret ################# # f32_to_bf16 # ################# f32_to_bf16: li s2,0x7FFF # mask exp and mant mv t0,a1 # t0: f32bits srli t1,t0,23 andi t1,t1,0xFF addi t1,t1,-0xFF bne t1,x0,f32_to_bf16_else srli t1,t0,16 li t0,0xFFFF and a0,t1,t0 ret f32_to_bf16_else: srli t1,t0,16 andi t1,t1,1 add t1,t1,s2 add t0,t0,t1 srli a0,t0,16 ret ################# # bf16_to_f32 # ################# bf16_to_f32: slli a0,a1,16 ret ################# # bf16_add # ################# bf16_add: # a1: a, a2:b srli s0,a1,15 andi s0,s0,1 # s0: sign_a srli s1,a2,15 andi s1,s1,1 # s1: sign_b srli s2,a1,7 andi s2,s2,0xFF # s2: exp_a srli s3,a2,7 andi s3,s3,0xFF # s3: exp_b andi s4,a1,0x7F # s4: mant_a andi s5,a2,0x7F # s5: mant_b li t0,0xFF # t0: 0xFF bne s2,t0,exp_a_max_n beq s4,x0,ret_a bne s3,t0,ret_a bnez s5,ret_b bne s0,s1,ret_nan exp_a_max_n: beq s3,t0,ret_b bne s2,x0,a_nonzero beq s4,x0,ret_b a_nonzero: bne s3,x0,b_nonzero beq s5,x0,ret_a b_nonzero: beq s2,x0,a_denorm ori s4,s4,0x80 a_denorm: beq s3,x0,b_denorm ori s5,s5,0x80 b_denorm: sub t1,s2,s3 # exp_diff: t1 ble t1,x0,exp_diff_neg mv t3,s2 # result_exp: t3 beq t1,x0,res_exp_exit # exp_diff is 0 addi t4,t1,-8 bgt t4,x0,ret_a srl s5,s5,t1 j res_exp_exit exp_diff_neg: mv t3,s3 # result_exp: t3 addi t4,t1,8 blt t4,x0,ret_b sub t4,x0,t1 srl s4,s4,t4 res_exp_exit: bne s0,s1,sign_is_diff mv t2,s0 # result_sign = t2 add t4,s4,s5 # result_mant = t4 andi t5,t3,0x100 beq t5,x0,ret_encoded # return if no carry srli t4,t4,1 addi t3,t3,1 bge t3,t0,ret_inf j ret_encoded sign_is_diff: blt s4,s5,b_lt_a mv t2,s0 sub t4,s4,s5 j b_lt_a_exit b_lt_a: mv t2,s1 sub t4,s5,s4 b_lt_a_exit: beq t4,x0,ret_zero denorm_loop: andi t5,t4,0x80 bne t5,x0,ret_encoded slli t4,t4,1 addi t3,t3,-1 blez t3,ret_zero j denorm_loop ################# # bf16_sub # ################# bf16_sub: # a1: a, a2:b li t0, 0x8000 # BF16_SIGN_MASK xor a2,a2,t0 j bf16_add ################# # bf16_mul # ################# bf16_mul: # a1: a, a2:b srli s0,a1,15 andi s0,s0,1 # s0: sign_a srli s1,a2,15 andi s1,s1,1 # s1: sign_b srli s2,a1,7 andi s2,s2,0xFF # s2: exp_a srli s3,a2,7 andi s3,s3,0xFF # s3: exp_b andi s4,a1,0x7F # s4: mant_a andi s5,a2,0x7F # s5: mant_b xor t2,s0,s1 # res_sign: t2 li t0,0xFF # t0: 0xFF bne s2,t0,mul_exp_a_max_n bnez s4,ret_a bnez s3,ret_inf beqz s5,ret_nan j ret_inf mul_exp_a_max_n: bne s3,t0,mul_exp_b_max_n bnez s5,ret_b bnez s2,ret_inf beqz s4,ret_nan j ret_inf mul_exp_b_max_n: li t1,0 # exp_adj: t1 mant_a_norm_loop: bnez s2,exp_a_nonzero ori t5,s4,0x80 bnez t5,mant_a_norm slli s4,s4,1 addi t1,t1,-1 j mant_a_norm_loop mant_a_norm: li s2,1 j mant_b_norm_loop exp_a_nonzero: ori s4,s4,0x80 mant_b_norm_loop: bnez s3,exp_b_nonzero ori t5,s5,0x80 bnez t5,mant_b_norm slli s5,s5,1 addi t1,t1,-1 j mant_b_norm_exit mant_b_norm: li s3,1 j mant_b_norm_exit exp_b_nonzero: ori s5,s5,0x80 mant_b_norm_exit: # Multiply li t4,0 mult_loop: beqz s5,mult_exit add t4,t4,s4 addi s5,s5,-1 j mult_loop mult_exit: add t3,s2,s3 addi t3,t3,-127 add t3,t3,t1 # result_exp li t5,0x8000 and t5,t4,t5 beqz t5,result_mant_adj_n srli t4,t4,8 andi t4,t4,0x7F addi t3,t3,1 j check_result_exp result_mant_adj_n: srli t4,t4,7 andi t4,t4,0x7F check_result_exp: blt t3,t0,result_inf_n j ret_inf result_inf_n: bgtz t3,ret_encoded addi t5,t3,6 bltz t5,ret_zero li t5,1 sub t5,t5,t3 srl t4,t4,t5 li t3,0 j ret_encoded ################# # bf16_div # ################# bf16_div: # a1: a, a2:b srli s0,a1,15 andi s0,s0,1 # s0: sign_a srli s1,a2,15 andi s1,s1,1 # s1: sign_b srli s2,a1,7 andi s2,s2,0xFF # s2: exp_a srli s3,a2,7 andi s3,s3,0xFF # s3: exp_b andi s4,a1,0x7F # s4: mant_a andi s5,a2,0x7F # s5: mant_b xor t2,s0,s1 # result_sign: t2 li t0,0xFF # t0: 0xFF bne s3,t0,div_exp_b_max_n bnez s5,ret_b bne s2,t0,ret_zero beqz s4,ret_nan j ret_zero div_exp_b_max_n: bnez s3,div_b_zero_n bnez s5,div_b_zero_n bnez s2,ret_inf beqz s4,ret_nan # 0/0 is nan j ret_inf # inf otherwise div_b_zero_n: bne s2,t0,div_exp_a_max_n bnez s4,ret_a j ret_inf div_exp_a_max_n: # check if a is zero bnez s2,div_a_zero_n beqz s4,ret_zero div_a_zero_n: beqz s2,div_a_denorm ori s4,s4,0x80 div_a_denorm: beqz s3,div_b_denorm ori s5,s5,0x80 div_b_denorm: slli s4,s4,15 # s4: dividend # s5: divisor li s6,0 # s6: quotient li t4,15 # t4: i div_loop: bltz t4,div_loop_exit slli s6,s6,1 sll t5,s5,t4 addi t4,t4,-1 blt s4,t5,div_loop sub s4,s4,t5 ori s6,s6,1 j div_loop div_loop_exit: sub t3,s2,s3 addi t3,t3,127 bnez s2,div_exp_a_zero_n addi s3,s3,-1 div_exp_a_zero_n: bnez s3,div_exp_b_zero_n addi s3,s3,1 div_exp_b_zero_n: li t4,0x8000 and t5,s6,t4 beqz t5,quot_loop srli s6,s6,8 j quot_exit quot_loop: and t5,s6,t4 bnez t5,quot_loop_exit addi t5,t3,-1 blez t5,quot_loop_exit slli s6,s6,1 addi t3,t3,-1 j quot_loop quot_loop_exit: srli s6,s6,8 quot_exit: andi s6,s6,0x7F mv t4,s6 # quotient as mantissa bge t3,t0,ret_inf blez t3,ret_zero j ret_encoded ################# # bf16_sqrt # ################# bf16_sqrt: # a1: a srli s0,a1,15 andi s0,s0,1 # s0: sign srli s2,a1,7 andi s2,s2,0xFF # s2: exp andi s4,a1,0x7F # s4: mant li t0,0xFF # t0: 0xFF bne s2,t0,sqrt_exp_max_n bnez s4,ret_a bnez s0,ret_nan j ret_a sqrt_exp_max_n: bnez s2,sqrt_a_zero_n beqz s4,ret_zero sqrt_a_zero_n: bnez s0,ret_nan beqz s2,ret_zero addi t1,s2,-127 # t1: e, t2: new_exp ori t3,s4,0x80 # t3: m andi t4,t1,1 beqz t4,sqrt_even_exp slli t3,t3,1 addi t2,t1,-1 srli t2,t2,1 addi t2,t2,127 j sqrt_adjexp_exit sqrt_even_exp: srli t2,t1,1 addi t2,t2,127 sqrt_adjexp_exit: li a2,90 # low li a3,256 # high li a4,128 # result bs_loop: bgt a2,a3, bs_exit add t4,a2,a3 srli t4,t4,1 # t4:mid # Multiply: t5: prod li t5,0 mv t6,t4 sqrt_mult_loop: beqz t6,sqrt_mult_exit add t5,t5,t4 addi t6,t6,-1 j sqrt_mult_loop sqrt_mult_exit: srli t5,t5,7 # sq: t5 bgt t5,t3,bs_else mv a4,t4 addi a2,t4,1 j bs_loop bs_else: addi a3,t4,-1 j bs_loop bs_exit: li t4,256 blt a4,t4,sqrt_result_norm_else srli a4,a4,1 addi t3,t3,1 j sqrt_result_norm_exit sqrt_result_norm_else: li t4,128 sqrt_result_norm_loop: bge a4,t4,sqrt_result_norm_exit li t5,1 ble t2,t5,sqrt_result_norm_exit slli a4,a4,1 addi t2,t2,-1 j sqrt_result_norm_loop sqrt_result_norm_exit: mv t5,t2 # new_exp li t2,0 # sign andi t4,a4,0x7F # new_mant bge t5,t0,ret_inf blez t5,ret_zero # ret_encoded andi t2,t5,0xFF slli t2,t2,7 or a0,t4,t2 ret ################# # Helper # ################# ret_a: mv a0,a1 ret ret_b: mv a0,a2 ret ret_nan: li a0,0x7FC0 ret ret_inf: slli t2,t2,15 li t1,0x7F80 or a0,t1,t2 ret ret_zero: slli t1,t1,15 ori a0,t1,0 ret ret_encoded: # t2: sign, t3: exp, t4: mant slli t2,t2,15 and t3,t3,t0 slli t3,t3,7 andi t4,t4,0x7F or a0,t2,t2 or a0,a0,t3 or a0,a0,t4 ret ``` ::: # 🚩 Applications: Fast Inverse Square Root with BFloat16 ## `Q_rsqrt()` Algorithm ### ✳️ Introduction The `q_rsqrt()` algorithm computes an **approximate reciprocal square root**: $y \approx \frac{1}{\sqrt{x}}$ for a given input $x$, often in **fixed-point or reduced-precision formats** (e.g., BF16 or Q-format). This method is widely used in graphics, neural networks, and signal processing due to its **high performance** and **deterministic iteration count**. --- ### 🧩 Mathematical Basis Given $x > 0$, the reciprocal square root can be expressed as: $y = \frac{1}{\sqrt{x}} = x^{-\frac{1}{2}}$ For efficient computation, we can **linearize around an initial approximation**: $y_0 \approx \text{initial guess for } \frac{1}{\sqrt{x}}$ Then, using **Newton–Raphson iteration**, we refine the estimate: $y_{n+1} = y_n \cdot \left( \frac{3}{2} - \frac{x}{2} y_n^2 \right)$ - $y_n$= current estimate - $x$= input value - Each iteration **squares $y_n$**, multiplies by $x$, subtracts from $\frac{3}{2}$, then multiplies by $y_n$to converge faster. This converges **quadratically**, meaning the error roughly **squares** at each step. --- ### 🔢 Initial Approximation A crucial step is choosing an efficient **initial guess**: $y_0 \approx \text{magic number} - \frac{1}{2} \cdot \text{bit-level representation of } x$ - In floating-point, this is often implemented by **bit reinterpretation** and subtraction (the “fast inverse square root trick” popularized by Quake III). - In fixed-point BF16 or Q-format, a **lookup table** or **linear approximation** is often used for simplicity. The initial approximation ensures **1–2 iterations** of Newton–Raphson achieve sufficient precision. --- ### 🧠 Algorithmic Steps (Mathematical Form) Let $x$ be the input, and $y$ the reciprocal square root output: 1. **Compute initial guess $y_0$** based on a linear approximation of $x$ in the chosen numeric format. 2. **Iteratively refine** using Newton–Raphson: $y_{k+1} = y_k \cdot \left( 1.5 - 0.5 \cdot x \cdot y_k^2 \right)$ 3. **Terminate** after fixed iterations (typically 1–3 for BF16), producing $y \approx 1/\sqrt{x}$. --- ### 🔢 BF16 / Fixed-Point Interpretation In **BF16 or Q-format**, multiplications and subtractions are performed with **reduced-precision arithmetic**. Key considerations: 1. **Overflow prevention:** Ensure intermediate products fit into wider registers. 2. **Scaling:** Fixed-point numbers must be **properly scaled** for fractional bits. 3. **Iteration count:** Typically 1 iteration suffices for BF16, since the format limits achievable precision (~0.5% error). Mathematically, the algorithm **mirrors the floating-point Newton–Raphson formula**, but all operations are constrained to the **fixed-point representation**. --- ### 📈 Summary > 💬 *In essence:* > The `q_rsqrt()` algorithm efficiently approximates $\frac{1}{\sqrt{x}}$ by combining a **good initial guess** with **1–2 iterations of Newton–Raphson**, producing a result suitable for **low-precision or real-time arithmetic units**. > Its quadratic convergence and simple arithmetic make it ideal for **BF16 or Q-format hardware pipelines**, where predictable latency and minimal cycles are critical. ## 🛠️ Implementation ### C Code with FP32 ```clike= float Q_rsqrt( float number ){ long i; float x2, y; const float threehalfs = 1.5F; x2 = number * 0.5F; y = number; i = * ( long * ) & y; i = 0x5f3759df - ( i >> 1 ); y = * ( float * ) &i; y = y * ( threehalfs - ( x2 * y * y ) ); return y; } ``` ### Modified C Code with BF16 ```clike= bf16_t Q_rsqrt_bf16( bf16_t number ){ uint16_t i; bf16_t x2, y; bf16_t threehalfs = f32_to_bf16(1.5); x2 = bf16_mul(number,f32_to_bf16(0.5)); y = number; i = * ( uint16_t * ) & (y.bits); i = 0x5f37 - ( i >> 1 ); y = (bf16_t) { .bits = *(&i) }; y = bf16_mul(y, bf16_sub(threehalfs, bf16_mul(x2, bf16_mul(y,y)))); return y; } ``` ### Modified Assembly Code with BF16 ```asm= .data str_0: .string "Inverse square root of " str_1: .string ", BF16: " str_2: .string ", ans: " str_3: .string ", elapsed cycles: " str_nl: .string "\n" test_data: .word 0x379591a2 .word 0x3f800000 .word 0x40c80000 .word 0x42c80000 .word 0x48a623a4 gold_data: .word 0x436CD2C2 .word 0x3f800000 .word 0x3ecccccd .word 0x3dcccccd .word 0x3ae0b3f9 .text main: li s0,4 # counter la s3,test_data # test_data la s4,gold_data # gold_data loop: bltz s0,exit slli t0,s0,2 add t1,s3,t0 lw t1,0(t1) # test_data add s1,s4,t0 lw s1,0(s1) # gold_data la a0,str_0 li a7, 4 ecall mv a0,t1 li a7,2 ecall # BF16: la a0,str_1 li a7, 4 ecall # set start time li a7,31 ecall mv s2,a1 mv a1,t1 # test_data addi sp,sp,-20 sw s0,0(sp) # counter sw s1,4(sp) # gold_data sw s2,8(sp) # #cycle sw s3,12(sp) sw s4,16(sp) jal f32_to_bf16 mv a1,a0 jal Q_sqrt_bf16 mv a1,a0 jal bf16_to_f32 lw s0,0(sp) lw s1,4(sp) lw s2,8(sp) lw s3,12(sp) lw s4,16(sp) addi sp,sp,20 li a7,2 # a0 is result ecall # set end time li a7,31 ecall sub s2,a0,s2 la a0,str_2 li a7,4 ecall mv a0,s1 # gold data li a7,2 ecall # elapsed cycles la a0,str_3 li a7,4 ecall mv a0,s2 li a7,1 ecall la a0,str_nl li a7,4 ecall addi s0,s0,-1 j loop exit: li a7,10 ecall Q_sqrt_bf16: # number: a1 addi sp,sp,-4 sw ra,0(sp) li a2,0x3f00 # 0.5 jal bf16_mul mv a3,a0 # x2: a3 li t1,0x5f37 srli a1,a1,1 sub a4,t1,a1 # y: a4 mv a2,a4 mv a1,a4 jal bf16_mul # bf16_mul(y,y): a0 mv a2,a0 mv a1,a3 jal bf16_mul # bf16_mul(x2,...) mv a2,a0 li a1,0x3fc0 # a1: threehalfs=1.5 jal bf16_sub mv a2,a0 mv a1,a4 jal bf16_mul # bf16_mul(y,...) lw ra,0(sp) addi sp,sp,4 ret ``` ### Test Results ![image](https://hackmd.io/_uploads/ByzrTFqpxl.png) ![Screenshot 2025-10-13 212012](https://hackmd.io/_uploads/HyZ2Hi5Tex.png) ## Algorithm with Shift-Add Multiplication ### ✳️ Introduction Shift–Add Multiplication is a **bitwise arithmetic method** for performing integer or fixed-point multiplication using only **addition** and **bit shifting**. It is conceptually derived from the **binary expansion of the multiplier** and forms the mathematical foundation of hardware multipliers (e.g., array multipliers, Booth multipliers, and iterative ALUs). --- ### 🧩 Mathematical Basis Let the two **unsigned integers** be: $\[ A = \text{multiplicand}, \q B = \text{multiplier}$ where $A, B \in \mathbb{N}$. The binary representation of $B$ can be written as: $B = \sum_{i=0}^{n-1} b_i \cdot 2^i, \quad b_i \in \{0,1\}$ Then, the product $P = A \times B$ can be expanded as: $P = A \times \left(\sum_{i=0}^{n-1} b_i \cdot 2^i\right) = \sum_{i=0}^{n-1} (A \cdot b_i) \cdot 2^i$ Since $b_i$ is either 0 or 1, this simplifies to a **conditional accumulation**: $P = \sum_{i=0}^{n-1} p_i ,$ $p_i= \begin{cases} A \cdot 2^i, & \text{if } b_i = 1 \\ 0, & \text{if } b_i = 0 \end{cases}$ Thus, multiplication reduces to: 1. **Shift** the multiplicand $A$ left by $i$ bits. 2. **Add** it to the running sum if the corresponding bit $b_i$ is 1. --- ### ⚖️ Properties | Property | Description | |-----------|--------------| | **Complexity** | $O(n)$, where $n$ is the number of bits | | **Operations** | Only requires addition and bit-shift (no direct multiply) | | **Determinism** | Fixed iteration count; predictable latency | | **Hardware Parallelism** | Easily parallelizable for array multipliers | | **Accuracy** | Exact for integer arithmetic; in fixed-point, rounding may occur | --- > 💬 **In summary:** > The Shift–Add multiplication method leverages the binary structure of numbers to express multiplication as a series of conditional shifts and additions. It trades raw instruction complexity for predictable, uniform operations — a crucial property for low-power, fixed-latency arithmetic units, especially in BF16 or other reduced-precision numeric systems. ## 🚀 Implementation ### Modified Assembly Code with BF16 We modify the original substract-accumulating multiplication: ```asm= ... # Multiply: t4 = t4 * s5 li t4,0 mult_loop: beqz s5,mult_exit add t4,t4,s4 addi s5,s5,-1 j mult_loop mult_exit: ... ``` to bitwise shift-add multiplication, since: - Multiplicand and multiplier are **unsigned**. - Multiplicand and multiplier are **at most 16 bits**. - It takes $\Theta(16)$ instead of $O(\textrm{value of multiplier})$, which is constant time in **this architecture**. ```asm= ... # Multiply: t4 = t4 * s5 li t4, 0 # result = 0 li t0, 16 # 16 bits to process mult_loop: andi t5,s5,1 # check LSB of multiplier beqz t5,skip_add # if 0, skip addition add t4,t4,s4 # if LSB=1, add multiplicand to result skip_add: srli s5,s5,1 # shift multiplier right by 1 slli s4,s4,1 # shift multiplicand left by 1 addi t0,t0,-1 bnez t0,mult_loop mult_exit: ... ``` ### Test Results ![image](https://hackmd.io/_uploads/rJA93t5Tex.png) ![Screenshot 2025-10-13 221038](https://hackmd.io/_uploads/BJMorjqTlx.png) # 🧮 Evaluation The evaluation compares the proposed **fast multiplication–based reciprocal square root (`q_rsqrt`)** with the **traditional multiply–accumulate implementation**, both in **BF16 precision**, using **FP32** results as the reference. | Test Data | Error (to FP32) | Cycles – Traditional Mul. | Cycles – Fast Mul. | |------------|----------------|---------------------------|--------------------| | 340253.125 | 0.577% | 5281 | 1010 | | 100 | 0.586% | 4975 | 1010 | | 6.25 | 0.586% | 4975 | 1010 | | 1 | 0.000% | 4869 | 1000 | | 0.00001783 | 0.075% | 4651 | 1002 | --- ### 💡 Commentary The **fast multiplication method** achieves a **4.9×–5.2× speedup** over the traditional multiply–accumulate approach, lowering the average cycle count from ~5k to ~1k across all test cases. Despite this large performance gain, the **numerical accuracy remains within 0.6%** of the FP32 reference — well within the tolerance expected for BF16 precision arithmetic. Key observations: - For well-conditioned inputs (e.g., `x = 1`), the fast method achieves **exact FP32 parity (0.000% error)**. - Larger or smaller magnitudes show small, consistent rounding errors due to BF16’s reduced mantissa precision. - Performance remains **deterministic** (fixed iteration count), making it suitable for **real-time or low-power applications**. --- ### ⚙️ Summary | Metric | Observation | |--------|--------------| | **Speedup** | ~5× faster | | **Accuracy Loss** | ≤ 0.6% | | **Determinism** | Fixed cycle count (~1000) | | **Use Case** | Embedded / real-time FP pipelines, BF16 ALU verification | > ✅ *Conclusion:* > The bitwise shift–add–based fast multiplier preserves accuracy while dramatically improving throughput, making it a practical choice for hardware or firmware-level BF16 math units. # 📈Analysis ## ⚙️ 5-Stage Pipelined Processor in Ripes Ripes provides multiple CPU models that can execute the same RISC-V assembly program: - **Single-cycle processor**: Each instruction completes all operations (fetch, decode, execute, memory access, write-back) within a single clock cycle. - **5-stage pipelined processor**: Splits instruction execution into five stages (IF, ID, EX, MEM, WB), allowing multiple instructions to execute simultaneously in different stages. This section visualizes and explains each stage. --- ![image](https://hackmd.io/_uploads/Bk1gzccTgg.png) 🔹 Instruction Fetch (IF) - **PC**: Stores the current instruction address (e.g., `PC = 0x00000000`). After each instruction, it updates to the next address. - **Adder (+4)**: Computes `PC + 4 = 0x00000004` for the next instruction. - **MUX**: Selects the next PC value. - Normally chooses `PC + 4` (sequential execution). - For jumps/branches, selects the target address instead. - Example: for `jal x1, 316`, the next PC becomes `PC + 316`. - **Instruction Memory**: Fetches the instruction at the current PC. Here, `addr = 0x00000000` → instruction `0x13C000EF` (`jal x1, 316`). - **Compressed Decoder**: Checks if the instruction is a 16-bit compressed form and expands it to 32 bits if necessary. (`jal` here is a 32-bit normal instruction, so it stays the same.) - **IF/ID Pipeline Register**: Stores the instruction and PC for use in the next cycle by the ID stage. (Enable = green, Clear = red → instruction flows normally to ID.) --- ![image](https://hackmd.io/_uploads/Hy4fzcqagx.png) 🔹 Instruction Decode / Register Fetch (ID) - **IF/ID Register**: Supplies the current instruction (`0x13C000EF`) and PC. - **Decode Unit**: Extracts opcode, rd, rs1, rs2, funct3, and funct7 fields to identify the instruction type (here: JAL). - **Register File**: Reads the source registers. Since `jal` doesn’t use sources, Reg1 and Reg2 = `0x00000000`. - **Immediate Generator**: Extracts the immediate field → `0x0000013C` (jump offset = 316). - **ID/EX Pipeline Register**: Holds decoded values for the next clock cycle (passed to EX). --- ![image](https://hackmd.io/_uploads/HyP7f9cpll.png) 🔹 Execute (EX) - **ALU**: Performs arithmetic, logic, and address calculations. - Input: `PC = 0x00000000`, `Imm = 0x0000013C` - Output: `Res = 0x0000013C` → Jump Target Address - **ALU Input Selectors (MUXes)**: Upper MUX selects PC, lower MUX selects immediate → performs `PC + Imm`. - **Branch Unit**: Determines if a branch is taken (`beq`, `bne`, `blt`, `bge`, etc.). For `jal`, the jump always occurs → `Branch taken = true`. The jump target (0x0000013C) is sent back to the IF stage to update PC. --- ![image](https://hackmd.io/_uploads/SJLVfqqpgl.png) 🔹 Memory Access (MEM) - **Data Memory**: The `jal` instruction doesn’t access memory, so this stage simply passes through without changes. --- ![image](https://hackmd.io/_uploads/HyySzcqpxg.png) 🔹 Write Back (WB) - **MUX (Write-back Selector)**: Chooses which data source will be written to the register file. - In this case: `PC + 4 = 0x00000004`. - **Register File**: Writes `0x00000004` to the destination register `x1 (ra)`. - **Result**: After execution, `x1 = 0x00000004`, which stores the return address (next instruction address). --- 💡 **Summary** This 5-stage pipeline allows multiple instructions to be processed simultaneously in different stages, improving CPU throughput and overall efficiency. # Reference [Inverse Square Root](https://ece.uwaterloo.ca/~dwharder/aads/Algorithms/Inverse_square_root/)