# Assignment1: RISC-V Assembly and Instruction Pipeline
contributed by <`戴仁杰`>
repo: https://github.com/dozingmoon/ca2025-quizzes
Disclaimer: I used AI tools such as ChatGPT for better understanding of algorithm or architecture descriptions.
# Problem B
## C Code
:::spoiler `q1/q1-uf8.c`
```clike=
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
typedef uint8_t uf8;
static inline unsigned clz(uint32_t x)
{
int n = 32, c = 16;
do {
uint32_t y = x >> c;
if (y) {
n -= c;
x = y;
}
c >>= 1;
} while (c);
return n - x;
}
/* Decode uf8 to uint32_t */
uint32_t uf8_decode(uf8 fl)
{
uint32_t mantissa = fl & 0x0f;
uint8_t exponent = fl >> 4;
uint32_t offset = (0x7FFF >> (15 - exponent)) << 4;
return (mantissa << exponent) + offset;
}
/* Encode uint32_t to uf8 */
uf8 uf8_encode(uint32_t value)
{
/* Use CLZ for fast exponent calculation */
if (value < 16)
return value;
/* Find appropriate exponent using CLZ hint */
int lz = clz(value);
int msb = 31 - lz;
/* Start from a good initial guess */
uint8_t exponent = 0;
uint32_t overflow = 0;
if (msb >= 5) {
/* Estimate exponent - the formula is empirical */
exponent = msb - 4;
if (exponent > 15)
exponent = 15;
/* Calculate overflow for estimated exponent */
for (uint8_t e = 0; e < exponent; e++)
overflow = (overflow << 1) + 16;
/* Adjust if estimate was off */
while (exponent > 0 && value < overflow) {
overflow = (overflow - 16) >> 1;
exponent--;
}
}
/* Find exact exponent */
while (exponent < 15) {
uint32_t next_overflow = (overflow << 1) + 16;
if (value < next_overflow)
break;
overflow = next_overflow;
exponent++;
}
uint8_t mantissa = (value - overflow) >> exponent;
return (exponent << 4) | mantissa;
}
/* Test encode/decode round-trip */
static bool test(void)
{
int32_t previous_value = -1;
bool passed = true;
for (int i = 0; i < 256; i++) {
uint8_t fl = i;
int32_t value = uf8_decode(fl);
uint8_t fl2 = uf8_encode(value);
if (fl != fl2) {
printf("%02x: produces value %d but encodes back to %02x\n", fl,
value, fl2);
passed = false;
}
if (value <= previous_value) {
printf("%02x: value %d <= previous_value %d\n", fl, value,
previous_value);
passed = false;
}
previous_value = value;
}
return passed;
}
int main(void)
{
if (test()) {
printf("All tests passed.\n");
return 0;
}
return 1;
}
```
:::
## Assembly
:::spoiler `hw1-uf8.s`
```asm=
.data
str1: .string ": produces value "
str2: .string " but encodes back to "
str3: .string ": value "
str4: .string " >= previous_value "
strnl: .string "\n"
strps: .string "All tests passed"
.text
#################
# main #
#################
main:
addi sp,sp,-4
sw ra,0(sp)
jal test # a1: passed
beq a1,x0,NotPassed
# Test is passed
li a7,4
la a0, strps
ecall
li a5,0
j exit_main
NotPassed:
li a5,1
ret
exit_main:
mv a0,a5
lw ra,0(sp)
addi sp,sp,4
li a7, 10 # exit system call
ecall
#################
# CLZ #
#################
clz: # x: a0, return: a1
li t1,32 # t1 = n = 32
li t2,16 # t2 = c = 16
clz_loop:
srl t3,a0,t2 # t3=y=x>>c
beq t3,x0,clz_else
sub t1,t1,t2
mv t0,t3
clz_else:
srai t2,t2,1
beq t2,x0,clz_loop
sub a1,t1,t0
ret
#################
# uf8_decode #
#################
uf8_decode: # fl: a0, return: a1
andi t0,a0,0x0f # mantissa
srli t1,a0,4 # exponent
li t2,15
sub t2,t2,t1
li t3,0x7fff
srl t2,t3,t2
slli t2,t2,4 # offset
sll a0,t0,t1
add a1,a0,t2
ret
#################
# uf8_encode #
#################
uf8_encode: # value: a0, ret: a1
srli t0,a0,4 # will be zero if a0 < 16
bne t0,x0,ValGe16
ret
ValGe16:
addi sp,sp,-4
sw ra,0(sp)
jal clz # lz: a1
lw ra,0(sp)
addi sp,sp,4
li t0, 31
sub t0,t0,a1 # msb: t0
li t1,0 # exp: t1
li t2,0 # ovf: t2
addi t3,t0,-5
bltz t3, ExpLoopCond
# if msb >= 5
# estimate exponent
addi t1,t0,-4
li t3,15
ble t1,t3,ExpLe15
mv t1,t3
ExpLe15:
li t3,0
OvfLoopCond:
bge t3,t1, AdjLoopCond
slli t2,t2,1
addi t2,t2,16
addi t3,t3,1
j OvfLoopCond
AdjLoopCond:
sltu t3,a0,t2 # t3: value < overflow
sltu t4,x0,t1 # t4: 0 < exponent
and t3,t3,t4
bgeu x0,t3,ExpLoopCond # break when cond is 0
addi t2,t2,-16
srli t2,t2,1
addi t1,t1,-1
j AdjLoopCond
ExpLoopCond:
li t3,15
bgeu t1,t3,ExpLoopExit
slli t3,t2,1
addi t3,t3,16 # t3: next_overflow
bltu a0,t3,ExpLoopExit
mv t2,t3
addi t1,t1,1
j ExpLoopCond
ExpLoopExit:
sub t3,a0,t2
srl t3,t3,t1
slli t1,t1,4
or a1,t1,t3
ret
#################
# test #
#################
test:
li t5,0xffffffff # t0: previous_value
li t6, 1 # t1: passed
li s0, 255 # s0: fl/i
TestLoopCond:
blt s0,x0,TestLoopExit
addi sp,sp,-4
sw ra,0(sp)
mv a0,s0
jal uf8_decode # a1: value
mv a0,a1
jal uf8_encode
mv s1, a0 # s1: value; a1: fl2
lw ra,0(sp)
addi sp,sp,4
beq s0,a1,fl_equal_fl2
# Print if fl != fl2
li a7, 34 # Print hex
mv a0,s0 # fl
ecall
li a7,4
la a0,str1
ecall
li a7,34
mv a0,s1 # value
ecall
li a7,4
la a0,str2
ecall
li a7,34
mv a0,a1 # fl2
ecall
li a7,4
la a0,strnl
ecall
mv t6,x0
fl_equal_fl2:
bgtu t5,s1,prev_gt_val
# Print if previous_value <= value
li a7, 34 # Print hex
mv a0,s0 # fl
ecall
li a7,4
la a0,str3
ecall
li a7,34
mv a0,s1 # value
ecall
li a7,4
la a0,str4
ecall
li a7,34
mv a0,t5 # previous_value
ecall
li a7,4
la a0,strnl
ecall
mv t6,x0
prev_gt_val:
mv t5,s1
addi s0,s0,-1
j TestLoopCond
TestLoopExit:
mv a1,t6
ret
```
:::
# Problem C
## C Code
:::spoiler `q1/q1-bf16.c`
```clike=
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
typedef struct {
uint16_t bits;
} bf16_t;
#define BF16_SIGN_MASK 0x8000U
#define BF16_EXP_MASK 0x7F80U
#define BF16_MANT_MASK 0x007FU
#define BF16_EXP_BIAS 127
#define BF16_NAN() ((bf16_t) {.bits = 0x7FC0})
#define BF16_ZERO() ((bf16_t) {.bits = 0x0000})
static inline bool bf16_isnan(bf16_t a)
{
return ((a.bits & BF16_EXP_MASK) == BF16_EXP_MASK) &&
(a.bits & BF16_MANT_MASK);
}
static inline bool bf16_isinf(bf16_t a)
{
return ((a.bits & BF16_EXP_MASK) == BF16_EXP_MASK) &&
!(a.bits & BF16_MANT_MASK);
}
static inline bool bf16_iszero(bf16_t a)
{
return !(a.bits & 0x7FFF);
}
static inline bf16_t f32_to_bf16(float val)
{
uint32_t f32bits;
memcpy(&f32bits, &val, sizeof(float));
if (((f32bits >> 23) & 0xFF) == 0xFF)
return (bf16_t) {.bits = (f32bits >> 16) & 0xFFFF};
f32bits += ((f32bits >> 16) & 1) + 0x7FFF;
return (bf16_t) {.bits = f32bits >> 16};
}
static inline float bf16_to_f32(bf16_t val)
{
uint32_t f32bits = ((uint32_t) val.bits) << 16;
float result;
memcpy(&result, &f32bits, sizeof(float));
return result;
}
static inline bf16_t bf16_add(bf16_t a, bf16_t b)
{
uint16_t sign_a = (a.bits >> 15) & 1;
uint16_t sign_b = (b.bits >> 15) & 1;
int16_t exp_a = ((a.bits >> 7) & 0xFF);
int16_t exp_b = ((b.bits >> 7) & 0xFF);
uint16_t mant_a = a.bits & 0x7F;
uint16_t mant_b = b.bits & 0x7F;
if (exp_a == 0xFF) {
if (mant_a)
return a;
if (exp_b == 0xFF)
return (mant_b || sign_a == sign_b) ? b : BF16_NAN();
return a;
}
if (exp_b == 0xFF)
return b;
if (!exp_a && !mant_a)
return b;
if (!exp_b && !mant_b)
return a;
if (exp_a)
mant_a |= 0x80;
if (exp_b)
mant_b |= 0x80;
int16_t exp_diff = exp_a - exp_b;
uint16_t result_sign;
int16_t result_exp;
uint32_t result_mant;
if (exp_diff > 0) {
result_exp = exp_a;
if (exp_diff > 8)
return a;
mant_b >>= exp_diff;
} else if (exp_diff < 0) {
result_exp = exp_b;
if (exp_diff < -8)
return b;
mant_a >>= -exp_diff;
} else {
result_exp = exp_a;
}
if (sign_a == sign_b) {
result_sign = sign_a;
result_mant = (uint32_t) mant_a + mant_b;
if (result_mant & 0x100) {
result_mant >>= 1;
if (++result_exp >= 0xFF)
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
}
} else {
if (mant_a >= mant_b) {
result_sign = sign_a;
result_mant = mant_a - mant_b;
} else {
result_sign = sign_b;
result_mant = mant_b - mant_a;
}
if (!result_mant)
return BF16_ZERO();
while (!(result_mant & 0x80)) {
result_mant <<= 1;
if (--result_exp <= 0)
return BF16_ZERO();
}
}
return (bf16_t) {
.bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) |
(result_mant & 0x7F),
};
}
static inline bf16_t bf16_sub(bf16_t a, bf16_t b)
{
b.bits ^= BF16_SIGN_MASK;
return bf16_add(a, b);
}
static inline bf16_t bf16_mul(bf16_t a, bf16_t b)
{
uint16_t sign_a = (a.bits >> 15) & 1;
uint16_t sign_b = (b.bits >> 15) & 1;
int16_t exp_a = ((a.bits >> 7) & 0xFF);
int16_t exp_b = ((b.bits >> 7) & 0xFF);
uint16_t mant_a = a.bits & 0x7F;
uint16_t mant_b = b.bits & 0x7F;
uint16_t result_sign = sign_a ^ sign_b;
if (exp_a == 0xFF) {
if (mant_a)
return a;
if (!exp_b && !mant_b)
return BF16_NAN();
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
}
if (exp_b == 0xFF) {
if (mant_b)
return b;
if (!exp_a && !mant_a)
return BF16_NAN();
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
}
if ((!exp_a && !mant_a) || (!exp_b && !mant_b))
return (bf16_t) {.bits = result_sign << 15};
int16_t exp_adjust = 0;
if (!exp_a) {
while (!(mant_a & 0x80)) {
mant_a <<= 1;
exp_adjust--;
}
exp_a = 1;
} else
mant_a |= 0x80;
if (!exp_b) {
while (!(mant_b & 0x80)) {
mant_b <<= 1;
exp_adjust--;
}
exp_b = 1;
} else
mant_b |= 0x80;
uint32_t result_mant = (uint32_t) mant_a * mant_b;
int32_t result_exp = (int32_t) exp_a + exp_b - BF16_EXP_BIAS + exp_adjust;
if (result_mant & 0x8000) {
result_mant = (result_mant >> 8) & 0x7F;
result_exp++;
} else
result_mant = (result_mant >> 7) & 0x7F;
if (result_exp >= 0xFF)
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
if (result_exp <= 0) {
if (result_exp < -6)
return (bf16_t) {.bits = result_sign << 15};
result_mant >>= (1 - result_exp);
result_exp = 0;
}
return (bf16_t) {.bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) |
(result_mant & 0x7F)};
}
static inline bf16_t bf16_div(bf16_t a, bf16_t b)
{
uint16_t sign_a = (a.bits >> 15) & 1;
uint16_t sign_b = (b.bits >> 15) & 1;
int16_t exp_a = ((a.bits >> 7) & 0xFF);
int16_t exp_b = ((b.bits >> 7) & 0xFF);
uint16_t mant_a = a.bits & 0x7F;
uint16_t mant_b = b.bits & 0x7F;
uint16_t result_sign = sign_a ^ sign_b;
if (exp_b == 0xFF) {
if (mant_b)
return b;
/* Inf/Inf = NaN */
if (exp_a == 0xFF && !mant_a)
return BF16_NAN();
return (bf16_t) {.bits = result_sign << 15};
}
if (!exp_b && !mant_b) {
if (!exp_a && !mant_a)
return BF16_NAN();
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
}
if (exp_a == 0xFF) {
if (mant_a)
return a;
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
}
if (!exp_a && !mant_a)
return (bf16_t) {.bits = result_sign << 15};
if (exp_a)
mant_a |= 0x80;
if (exp_b)
mant_b |= 0x80;
uint32_t dividend = (uint32_t) mant_a << 15;
uint32_t divisor = mant_b;
uint32_t quotient = 0;
for (int i = 0; i < 16; i++) {
quotient <<= 1;
if (dividend >= (divisor << (15 - i))) {
dividend -= (divisor << (15 - i));
quotient |= 1;
}
}
int32_t result_exp = (int32_t) exp_a - exp_b + BF16_EXP_BIAS;
if (!exp_a)
result_exp--;
if (!exp_b)
result_exp++;
if (quotient & 0x8000)
quotient >>= 8;
else {
while (!(quotient & 0x8000) && result_exp > 1) {
quotient <<= 1;
result_exp--;
}
quotient >>= 8;
}
quotient &= 0x7F;
if (result_exp >= 0xFF)
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
if (result_exp <= 0)
return (bf16_t) {.bits = result_sign << 15};
return (bf16_t) {
.bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) |
(quotient & 0x7F),
};
}
static inline bf16_t bf16_sqrt(bf16_t a)
{
uint16_t sign = (a.bits >> 15) & 1;
int16_t exp = ((a.bits >> 7) & 0xFF);
uint16_t mant = a.bits & 0x7F;
/* Handle special cases */
if (exp == 0xFF) {
if (mant)
return a; /* NaN propagation */
if (sign)
return BF16_NAN(); /* sqrt(-Inf) = NaN */
return a; /* sqrt(+Inf) = +Inf */
}
/* sqrt(0) = 0 (handle both +0 and -0) */
if (!exp && !mant)
return BF16_ZERO();
/* sqrt of negative number is NaN */
if (sign)
return BF16_NAN();
/* Flush denormals to zero */
if (!exp)
return BF16_ZERO();
/* Direct bit manipulation square root algorithm */
/* For sqrt: new_exp = (old_exp - bias) / 2 + bias */
int32_t e = exp - BF16_EXP_BIAS;
int32_t new_exp;
/* Get full mantissa with implicit 1 */
uint32_t m = 0x80 | mant; /* Range [128, 256) representing [1.0, 2.0) */
/* Adjust for odd exponents: sqrt(2^odd * m) = 2^((odd-1)/2) * sqrt(2*m) */
if (e & 1) {
m <<= 1; /* Double mantissa for odd exponent */
new_exp = ((e - 1) >> 1) + BF16_EXP_BIAS;
} else {
new_exp = (e >> 1) + BF16_EXP_BIAS;
}
/* Now m is in range [128, 256) or [256, 512) if exponent was odd */
/* Binary search for integer square root */
/* We want result where result^2 = m * 128 (since 128 represents 1.0) */
uint32_t low = 90; /* Min sqrt (roughly sqrt(128)) */
uint32_t high = 256; /* Max sqrt (roughly sqrt(512)) */
uint32_t result = 128; /* Default */
/* Binary search for square root of m */
while (low <= high) {
uint32_t mid = (low + high) >> 1;
uint32_t sq = (mid * mid) / 128; /* Square and scale */
if (sq <= m) {
result = mid; /* This could be our answer */
low = mid + 1;
} else {
high = mid - 1;
}
}
/* result now contains sqrt(m) * sqrt(128) / sqrt(128) = sqrt(m) */
/* But we need to adjust the scale */
/* Since m is scaled where 128=1.0, result should also be scaled same way */
/* Normalize to ensure result is in [128, 256) */
if (result >= 256) {
result >>= 1;
new_exp++;
} else if (result < 128) {
while (result < 128 && new_exp > 1) {
result <<= 1;
new_exp--;
}
}
/* Extract 7-bit mantissa (remove implicit 1) */
uint16_t new_mant = result & 0x7F;
/* Check for overflow/underflow */
if (new_exp >= 0xFF)
return (bf16_t) {.bits = 0x7F80}; /* +Inf */
if (new_exp <= 0)
return BF16_ZERO();
return (bf16_t) {.bits = ((new_exp & 0xFF) << 7) | new_mant};
}
```
:::
## Assembly
:::spoiler `hw1/hw1-bf16.s`
```asm=
.data
zero_data:
.half 0x0000, 0x8000
nan_data:
.half 0x7fff,0xffff,0x7f81, 0xff81
inf_data:
.half 0x7f80,0xff80
f32_data:
.word 0x3f8f86c2 # 1.1213
.word 0x41336873 # 11.213
bf16_data:
.half 0x3f90,0x4133 # 1.1213, 11.213
.half 0x4133,0xbf90 # 11.213, -1.1213
.half 0xbf90,0x4133 # -1.1213, 11.213
.half 0xc133,0xbf90 # -11.213, -1.1213
gold_data_add:
.word 0x4145,0x4121,0x4121,0xc145
gold_data_sub:
.word 0xc121,0x4145,0xc145,0xc121
gold_data_mul:
.word 0x4149,0xc149,0xc149,0x4149
gold_data_div:
.word 0x3dcd,0xc11f,0xbdcd,0x411f
gold_data_sqrt:
.word 0x3f88,0x4056,0x7fc0,0x7fc0
str_plus: .string " + "
str_min: .string " - "
str_mul: .string " * "
str_div: .string " / "
str_sqrt: .string " ^0.5 "
str_equal: .string " is "
str_not: .string " not "
str_nl: .string "\n"
str_passed: .string ", passed!\n"
str_failed: .string ", failed!\n"
.text
main:
jal test_add
jal test_sub
jal test_mul
jal test_div
jal test_sqrt
li a7,10
ecall
#################
# test_add #
#################
test_add:
addi sp,sp,-4
sw ra,0(sp)
la s0,bf16_data
la s1,gold_data_add
li s2,4 # data counter
test_add_loop:
addi s2,s2,-1
bltz s2,test_add_exit
slli t0,s2,2
add t1,s0,t0
add t2,s1,t0
lhu a1,0(t1) # op a
lhu a2,2(t1) # op b
lhu s3,0(t2) # answer
addi sp,sp,-20
sw ra,0(sp)
sw s0,4(sp)
sw s1,8(sp)
sw s2,12(sp)
sw s3,16(sp)
jal bf16_add
lw ra,0(sp)
lw s0,4(sp)
lw s1,8(sp)
lw s2,12(sp)
lw s3,16(sp)
addi sp,sp,20
mv s4,a0 # result: s4
mv a0, a1
li a7, 34
ecall
la a0, str_plus
li a7,4
ecall
mv a0, a2
li a7,34
ecall
la a0, str_equal
li a7,4
ecall
mv a0, s3 # ans
li a7,34
ecall
beq s3,s4,test_add_passed
la a0, str_not
li a7,4
ecall
mv a0,s4 # wrong result
li a7,34
ecall
la a0,str_failed
li a7,4
ecall
j test_add_loop
test_add_passed:
la a0,str_passed
li a7,4
ecall
j test_add_loop
test_add_exit:
lw ra,0(sp)
addi sp,sp,4
ret
#################
# test_sub #
#################
test_sub:
addi sp,sp,-4
sw ra,0(sp)
la s0,bf16_data
la s1,gold_data_sub
li s2,4 # data counter
test_sub_loop:
addi s2,s2,-1
bltz s2,test_sub_exit
slli t0,s2,2
add t1,s0,t0
add t2,s1,t0
lhu a1,0(t1) # op a
lhu a2,2(t1) # op b
lhu s3,0(t2) # answer
addi sp,sp,-20
sw ra,0(sp)
sw s0,4(sp)
sw s1,8(sp)
sw s2,12(sp)
sw s3,16(sp)
jal bf16_sub
lw ra,0(sp)
lw s0,4(sp)
lw s1,8(sp)
lw s2,12(sp)
lw s3,16(sp)
addi sp,sp,20
mv s4,a0 # result: s4
mv a0, a1
li a7, 34
ecall
la a0, str_min
li a7,4
ecall
mv a0, a2
li a7,34
ecall
la a0, str_equal
li a7,4
ecall
mv a0, s3 # ans
li a7,34
ecall
beq s3,s4,test_sub_passed
la a0, str_not
li a7,4
ecall
mv a0,s4 # wrong result
li a7,34
ecall
la a0,str_failed
li a7,4
ecall
j test_sub_loop
test_sub_passed:
la a0,str_passed
li a7,4
ecall
j test_sub_loop
test_sub_exit:
lw ra,0(sp)
addi sp,sp,4
ret
#################
# test_mul #
#################
test_mul:
addi sp,sp,-4
sw ra,0(sp)
la s0,bf16_data
la s1,gold_data_mul
li s2,4 # data counter
test_mul_loop:
addi s2,s2,-1
bltz s2,test_mul_exit
slli t0,s2,2
add t1,s0,t0
add t2,s1,t0
lhu a1,0(t1) # op a
lhu a2,2(t1) # op b
lhu s3,0(t2) # answer
addi sp,sp,-20
sw ra,0(sp)
sw s0,4(sp)
sw s1,8(sp)
sw s2,12(sp)
sw s3,16(sp)
jal bf16_mul
lw ra,0(sp)
lw s0,4(sp)
lw s1,8(sp)
lw s2,12(sp)
lw s3,16(sp)
addi sp,sp,20
mv s4,a0 # result: s4
mv a0, a1
li a7, 34
ecall
la a0, str_mul
li a7,4
ecall
mv a0, a2
li a7,34
ecall
la a0, str_equal
li a7,4
ecall
mv a0, s3 # ans
li a7,34
ecall
beq s3,s4,test_mul_passed
la a0, str_not
li a7,4
ecall
mv a0,s4 # wrong result
li a7,34
ecall
la a0,str_failed
li a7,4
ecall
j test_mul_loop
test_mul_passed:
la a0,str_passed
li a7,4
ecall
j test_mul_loop
test_mul_exit:
lw ra,0(sp)
addi sp,sp,4
ret
#################
# test_div #
#################
test_div:
addi sp,sp,-4
sw ra,0(sp)
la s0,bf16_data
la s1,gold_data_div
li s2,4 # data counter
test_div_loop:
addi s2,s2,-1
bltz s2,test_div_exit
slli t0,s2,2
add t1,s0,t0
add t2,s1,t0
lhu a1,0(t1) # op a
lhu a2,2(t1) # op b
lhu s3,0(t2) # answer
addi sp,sp,-20
sw ra,0(sp)
sw s0,4(sp)
sw s1,8(sp)
sw s2,12(sp)
sw s3,16(sp)
jal bf16_div
lw ra,0(sp)
lw s0,4(sp)
lw s1,8(sp)
lw s2,12(sp)
lw s3,16(sp)
addi sp,sp,20
mv s4,a0 # result: s4
mv a0, a1
li a7, 34
ecall
la a0, str_div
li a7,4
ecall
mv a0, a2
li a7,34
ecall
la a0, str_equal
li a7,4
ecall
mv a0, s3 # ans
li a7,34
ecall
beq s3,s4,test_div_passed
la a0, str_not
li a7,4
ecall
mv a0,s4 # wrong result
li a7,34
ecall
la a0,str_failed
li a7,4
ecall
j test_div_loop
test_div_passed:
la a0,str_passed
li a7,4
ecall
j test_div_loop
test_div_exit:
lw ra,0(sp)
addi sp,sp,4
ret
#################
# test_sqrt #
#################
test_sqrt:
addi sp,sp,-4
sw ra,0(sp)
la s0,bf16_data
la s1,gold_data_sqrt
li s2,4 # data counter
test_sqrt_loop:
addi s2,s2,-1
bltz s2,test_sqrt_exit
slli t0,s2,2
add t1,s0,t0
add t2,s1,t0
lhu a1,0(t1) # op a
lhu s3,0(t2) # answer
addi sp,sp,-20
sw ra,0(sp)
sw s0,4(sp)
sw s1,8(sp)
sw s2,12(sp)
sw s3,16(sp)
jal bf16_sqrt
lw ra,0(sp)
lw s0,4(sp)
lw s1,8(sp)
lw s2,12(sp)
lw s3,16(sp)
addi sp,sp,20
mv s4,a0 # result: s4
mv a0, a1
li a7, 34
ecall
la a0, str_sqrt
li a7,4
ecall
la a0, str_equal
li a7,4
ecall
mv a0, s3 # ans
li a7,34
ecall
beq s3,s4,test_sqrt_passed
la a0, str_not
li a7,4
ecall
mv a0,s4 # wrong result
li a7,34
ecall
la a0,str_failed
li a7,4
ecall
j test_sqrt_loop
test_sqrt_passed:
la a0,str_passed
li a7,4
ecall
j test_sqrt_loop
test_sqrt_exit:
lw ra,0(sp)
addi sp,sp,4
ret
#################
# bf16_isnan #
#################
bf16_isnan:
li s1,0x7F80 # BF16_EXP_MASK
and t0,a0,s1 # a & exp_mask
bne t0,s1,bf16_isnan_false
# check mantissa part is non-zero
andi t0,a0,0x7F
beq t0,x0,bf16_isnan_false
li a1,1
ret
bf16_isnan_false:
li a1,0
ret
#################
# bf16_isinf #
#################
bf16_isinf:
li s1,0x7F80 # BF16_EXP_MASK
and t0,a0,s1 # a & exp_mask
bne t0,s1,bf16_isinf_false
# check mantissa part is zero
andi t0,a0,0x7F
bne t0,x0,bf16_isinf_false
li a1,1
ret
bf16_isinf_false:
li a1,0
ret
#################
# bf16_iszero #
#################
bf16_iszero:
li s2,0x7FFF # mask exp and mant
and t0,a0,s2
bne t0,x0,bf16_iszero_false
li a1,1
ret
bf16_iszero_false:
li a1,0
ret
#################
# f32_to_bf16 #
#################
f32_to_bf16:
li s2,0x7FFF # mask exp and mant
mv t0,a1 # t0: f32bits
srli t1,t0,23
andi t1,t1,0xFF
addi t1,t1,-0xFF
bne t1,x0,f32_to_bf16_else
srli t1,t0,16
li t0,0xFFFF
and a0,t1,t0
ret
f32_to_bf16_else:
srli t1,t0,16
andi t1,t1,1
add t1,t1,s2
add t0,t0,t1
srli a0,t0,16
ret
#################
# bf16_to_f32 #
#################
bf16_to_f32:
slli a0,a1,16
ret
#################
# bf16_add #
#################
bf16_add: # a1: a, a2:b
srli s0,a1,15
andi s0,s0,1 # s0: sign_a
srli s1,a2,15
andi s1,s1,1 # s1: sign_b
srli s2,a1,7
andi s2,s2,0xFF # s2: exp_a
srli s3,a2,7
andi s3,s3,0xFF # s3: exp_b
andi s4,a1,0x7F # s4: mant_a
andi s5,a2,0x7F # s5: mant_b
li t0,0xFF # t0: 0xFF
bne s2,t0,exp_a_max_n
beq s4,x0,ret_a
bne s3,t0,ret_a
bnez s5,ret_b
bne s0,s1,ret_nan
exp_a_max_n:
beq s3,t0,ret_b
bne s2,x0,a_nonzero
beq s4,x0,ret_b
a_nonzero:
bne s3,x0,b_nonzero
beq s5,x0,ret_a
b_nonzero:
beq s2,x0,a_denorm
ori s4,s4,0x80
a_denorm:
beq s3,x0,b_denorm
ori s5,s5,0x80
b_denorm:
sub t1,s2,s3 # exp_diff: t1
ble t1,x0,exp_diff_neg
mv t3,s2 # result_exp: t3
beq t1,x0,res_exp_exit # exp_diff is 0
addi t4,t1,-8
bgt t4,x0,ret_a
srl s5,s5,t1
j res_exp_exit
exp_diff_neg:
mv t3,s3 # result_exp: t3
addi t4,t1,8
blt t4,x0,ret_b
sub t4,x0,t1
srl s4,s4,t4
res_exp_exit:
bne s0,s1,sign_is_diff
mv t2,s0 # result_sign = t2
add t4,s4,s5 # result_mant = t4
andi t5,t3,0x100
beq t5,x0,ret_encoded # return if no carry
srli t4,t4,1
addi t3,t3,1
bge t3,t0,ret_inf
j ret_encoded
sign_is_diff:
blt s4,s5,b_lt_a
mv t2,s0
sub t4,s4,s5
j b_lt_a_exit
b_lt_a:
mv t2,s1
sub t4,s5,s4
b_lt_a_exit:
beq t4,x0,ret_zero
denorm_loop:
andi t5,t4,0x80
bne t5,x0,ret_encoded
slli t4,t4,1
addi t3,t3,-1
blez t3,ret_zero
j denorm_loop
#################
# bf16_sub #
#################
bf16_sub: # a1: a, a2:b
li t0, 0x8000 # BF16_SIGN_MASK
xor a2,a2,t0
j bf16_add
#################
# bf16_mul #
#################
bf16_mul: # a1: a, a2:b
srli s0,a1,15
andi s0,s0,1 # s0: sign_a
srli s1,a2,15
andi s1,s1,1 # s1: sign_b
srli s2,a1,7
andi s2,s2,0xFF # s2: exp_a
srli s3,a2,7
andi s3,s3,0xFF # s3: exp_b
andi s4,a1,0x7F # s4: mant_a
andi s5,a2,0x7F # s5: mant_b
xor t2,s0,s1 # res_sign: t2
li t0,0xFF # t0: 0xFF
bne s2,t0,mul_exp_a_max_n
bnez s4,ret_a
bnez s3,ret_inf
beqz s5,ret_nan
j ret_inf
mul_exp_a_max_n:
bne s3,t0,mul_exp_b_max_n
bnez s5,ret_b
bnez s2,ret_inf
beqz s4,ret_nan
j ret_inf
mul_exp_b_max_n:
li t1,0 # exp_adj: t1
mant_a_norm_loop:
bnez s2,exp_a_nonzero
ori t5,s4,0x80
bnez t5,mant_a_norm
slli s4,s4,1
addi t1,t1,-1
j mant_a_norm_loop
mant_a_norm:
li s2,1
j mant_b_norm_loop
exp_a_nonzero:
ori s4,s4,0x80
mant_b_norm_loop:
bnez s3,exp_b_nonzero
ori t5,s5,0x80
bnez t5,mant_b_norm
slli s5,s5,1
addi t1,t1,-1
j mant_b_norm_exit
mant_b_norm:
li s3,1
j mant_b_norm_exit
exp_b_nonzero:
ori s5,s5,0x80
mant_b_norm_exit:
# Multiply
li t4,0
mult_loop:
beqz s5,mult_exit
add t4,t4,s4
addi s5,s5,-1
j mult_loop
mult_exit:
add t3,s2,s3
addi t3,t3,-127
add t3,t3,t1 # result_exp
li t5,0x8000
and t5,t4,t5
beqz t5,result_mant_adj_n
srli t4,t4,8
andi t4,t4,0x7F
addi t3,t3,1
j check_result_exp
result_mant_adj_n:
srli t4,t4,7
andi t4,t4,0x7F
check_result_exp:
blt t3,t0,result_inf_n
j ret_inf
result_inf_n:
bgtz t3,ret_encoded
addi t5,t3,6
bltz t5,ret_zero
li t5,1
sub t5,t5,t3
srl t4,t4,t5
li t3,0
j ret_encoded
#################
# bf16_div #
#################
bf16_div: # a1: a, a2:b
srli s0,a1,15
andi s0,s0,1 # s0: sign_a
srli s1,a2,15
andi s1,s1,1 # s1: sign_b
srli s2,a1,7
andi s2,s2,0xFF # s2: exp_a
srli s3,a2,7
andi s3,s3,0xFF # s3: exp_b
andi s4,a1,0x7F # s4: mant_a
andi s5,a2,0x7F # s5: mant_b
xor t2,s0,s1 # result_sign: t2
li t0,0xFF # t0: 0xFF
bne s3,t0,div_exp_b_max_n
bnez s5,ret_b
bne s2,t0,ret_zero
beqz s4,ret_nan
j ret_zero
div_exp_b_max_n:
bnez s3,div_b_zero_n
bnez s5,div_b_zero_n
bnez s2,ret_inf
beqz s4,ret_nan # 0/0 is nan
j ret_inf # inf otherwise
div_b_zero_n:
bne s2,t0,div_exp_a_max_n
bnez s4,ret_a
j ret_inf
div_exp_a_max_n: # check if a is zero
bnez s2,div_a_zero_n
beqz s4,ret_zero
div_a_zero_n:
beqz s2,div_a_denorm
ori s4,s4,0x80
div_a_denorm:
beqz s3,div_b_denorm
ori s5,s5,0x80
div_b_denorm:
slli s4,s4,15 # s4: dividend
# s5: divisor
li s6,0 # s6: quotient
li t4,15 # t4: i
div_loop:
bltz t4,div_loop_exit
slli s6,s6,1
sll t5,s5,t4
addi t4,t4,-1
blt s4,t5,div_loop
sub s4,s4,t5
ori s6,s6,1
j div_loop
div_loop_exit:
sub t3,s2,s3
addi t3,t3,127
bnez s2,div_exp_a_zero_n
addi s3,s3,-1
div_exp_a_zero_n:
bnez s3,div_exp_b_zero_n
addi s3,s3,1
div_exp_b_zero_n:
li t4,0x8000
and t5,s6,t4
beqz t5,quot_loop
srli s6,s6,8
j quot_exit
quot_loop:
and t5,s6,t4
bnez t5,quot_loop_exit
addi t5,t3,-1
blez t5,quot_loop_exit
slli s6,s6,1
addi t3,t3,-1
j quot_loop
quot_loop_exit:
srli s6,s6,8
quot_exit:
andi s6,s6,0x7F
mv t4,s6 # quotient as mantissa
bge t3,t0,ret_inf
blez t3,ret_zero
j ret_encoded
#################
# bf16_sqrt #
#################
bf16_sqrt: # a1: a
srli s0,a1,15
andi s0,s0,1 # s0: sign
srli s2,a1,7
andi s2,s2,0xFF # s2: exp
andi s4,a1,0x7F # s4: mant
li t0,0xFF # t0: 0xFF
bne s2,t0,sqrt_exp_max_n
bnez s4,ret_a
bnez s0,ret_nan
j ret_a
sqrt_exp_max_n:
bnez s2,sqrt_a_zero_n
beqz s4,ret_zero
sqrt_a_zero_n:
bnez s0,ret_nan
beqz s2,ret_zero
addi t1,s2,-127 # t1: e, t2: new_exp
ori t3,s4,0x80 # t3: m
andi t4,t1,1
beqz t4,sqrt_even_exp
slli t3,t3,1
addi t2,t1,-1
srli t2,t2,1
addi t2,t2,127
j sqrt_adjexp_exit
sqrt_even_exp:
srli t2,t1,1
addi t2,t2,127
sqrt_adjexp_exit:
li a2,90 # low
li a3,256 # high
li a4,128 # result
bs_loop:
bgt a2,a3, bs_exit
add t4,a2,a3
srli t4,t4,1 # t4:mid
# Multiply: t5: prod
li t5,0
mv t6,t4
sqrt_mult_loop:
beqz t6,sqrt_mult_exit
add t5,t5,t4
addi t6,t6,-1
j sqrt_mult_loop
sqrt_mult_exit:
srli t5,t5,7 # sq: t5
bgt t5,t3,bs_else
mv a4,t4
addi a2,t4,1
j bs_loop
bs_else:
addi a3,t4,-1
j bs_loop
bs_exit:
li t4,256
blt a4,t4,sqrt_result_norm_else
srli a4,a4,1
addi t3,t3,1
j sqrt_result_norm_exit
sqrt_result_norm_else:
li t4,128
sqrt_result_norm_loop:
bge a4,t4,sqrt_result_norm_exit
li t5,1
ble t2,t5,sqrt_result_norm_exit
slli a4,a4,1
addi t2,t2,-1
j sqrt_result_norm_loop
sqrt_result_norm_exit:
mv t5,t2 # new_exp
li t2,0 # sign
andi t4,a4,0x7F # new_mant
bge t5,t0,ret_inf
blez t5,ret_zero
# ret_encoded
andi t2,t5,0xFF
slli t2,t2,7
or a0,t4,t2
ret
#################
# Helper #
#################
ret_a:
mv a0,a1
ret
ret_b:
mv a0,a2
ret
ret_nan:
li a0,0x7FC0
ret
ret_inf:
slli t2,t2,15
li t1,0x7F80
or a0,t1,t2
ret
ret_zero:
slli t1,t1,15
ori a0,t1,0
ret
ret_encoded: # t2: sign, t3: exp, t4: mant
slli t2,t2,15
and t3,t3,t0
slli t3,t3,7
andi t4,t4,0x7F
or a0,t2,t2
or a0,a0,t3
or a0,a0,t4
ret
```
:::
# 🚩 Applications: Fast Inverse Square Root with BFloat16
## `Q_rsqrt()` Algorithm
### ✳️ Introduction
The `q_rsqrt()` algorithm computes an **approximate reciprocal square root**:
$y \approx \frac{1}{\sqrt{x}}$ for a given input $x$, often in **fixed-point or reduced-precision formats** (e.g., BF16 or Q-format). This method is widely used in graphics, neural networks, and signal processing due to its **high performance** and **deterministic iteration count**.
---
### 🧩 Mathematical Basis
Given $x > 0$, the reciprocal square root can be expressed as:
$y = \frac{1}{\sqrt{x}} = x^{-\frac{1}{2}}$
For efficient computation, we can **linearize around an initial approximation**:
$y_0 \approx \text{initial guess for } \frac{1}{\sqrt{x}}$
Then, using **Newton–Raphson iteration**, we refine the estimate:
$y_{n+1} = y_n \cdot \left( \frac{3}{2} - \frac{x}{2} y_n^2 \right)$
- $y_n$= current estimate
- $x$= input value
- Each iteration **squares $y_n$**, multiplies by $x$, subtracts from $\frac{3}{2}$, then multiplies by $y_n$to converge faster.
This converges **quadratically**, meaning the error roughly **squares** at each step.
---
### 🔢 Initial Approximation
A crucial step is choosing an efficient **initial guess**:
$y_0 \approx \text{magic number} - \frac{1}{2} \cdot \text{bit-level representation of } x$
- In floating-point, this is often implemented by **bit reinterpretation** and subtraction (the “fast inverse square root trick” popularized by Quake III).
- In fixed-point BF16 or Q-format, a **lookup table** or **linear approximation** is often used for simplicity.
The initial approximation ensures **1–2 iterations** of Newton–Raphson achieve sufficient precision.
---
### 🧠 Algorithmic Steps (Mathematical Form)
Let $x$ be the input, and $y$ the reciprocal square root output:
1. **Compute initial guess $y_0$** based on a linear approximation of $x$ in the chosen numeric format.
2. **Iteratively refine** using Newton–Raphson:
$y_{k+1} = y_k \cdot \left( 1.5 - 0.5 \cdot x \cdot y_k^2 \right)$
3. **Terminate** after fixed iterations (typically 1–3 for BF16), producing $y \approx 1/\sqrt{x}$.
---
### 🔢 BF16 / Fixed-Point Interpretation
In **BF16 or Q-format**, multiplications and subtractions are performed with **reduced-precision arithmetic**. Key considerations:
1. **Overflow prevention:** Ensure intermediate products fit into wider registers.
2. **Scaling:** Fixed-point numbers must be **properly scaled** for fractional bits.
3. **Iteration count:** Typically 1 iteration suffices for BF16, since the format limits achievable precision (~0.5% error).
Mathematically, the algorithm **mirrors the floating-point Newton–Raphson formula**, but all operations are constrained to the **fixed-point representation**.
---
### 📈 Summary
> 💬 *In essence:*
> The `q_rsqrt()` algorithm efficiently approximates $\frac{1}{\sqrt{x}}$ by combining a **good initial guess** with **1–2 iterations of Newton–Raphson**, producing a result suitable for **low-precision or real-time arithmetic units**.
> Its quadratic convergence and simple arithmetic make it ideal for **BF16 or Q-format hardware pipelines**, where predictable latency and minimal cycles are critical.
## 🛠️ Implementation
### C Code with FP32
```clike=
float Q_rsqrt( float number ){
long i;
float x2, y;
const float threehalfs = 1.5F;
x2 = number * 0.5F;
y = number;
i = * ( long * ) & y;
i = 0x5f3759df - ( i >> 1 );
y = * ( float * ) &i;
y = y * ( threehalfs - ( x2 * y * y ) );
return y;
}
```
### Modified C Code with BF16
```clike=
bf16_t Q_rsqrt_bf16( bf16_t number ){
uint16_t i;
bf16_t x2, y;
bf16_t threehalfs = f32_to_bf16(1.5);
x2 = bf16_mul(number,f32_to_bf16(0.5));
y = number;
i = * ( uint16_t * ) & (y.bits);
i = 0x5f37 - ( i >> 1 );
y = (bf16_t) { .bits = *(&i) };
y = bf16_mul(y, bf16_sub(threehalfs, bf16_mul(x2, bf16_mul(y,y))));
return y;
}
```
### Modified Assembly Code with BF16
```asm=
.data
str_0: .string "Inverse square root of "
str_1: .string ", BF16: "
str_2: .string ", ans: "
str_3: .string ", elapsed cycles: "
str_nl: .string "\n"
test_data:
.word 0x379591a2
.word 0x3f800000
.word 0x40c80000
.word 0x42c80000
.word 0x48a623a4
gold_data:
.word 0x436CD2C2
.word 0x3f800000
.word 0x3ecccccd
.word 0x3dcccccd
.word 0x3ae0b3f9
.text
main:
li s0,4 # counter
la s3,test_data # test_data
la s4,gold_data # gold_data
loop:
bltz s0,exit
slli t0,s0,2
add t1,s3,t0
lw t1,0(t1) # test_data
add s1,s4,t0
lw s1,0(s1) # gold_data
la a0,str_0
li a7, 4
ecall
mv a0,t1
li a7,2
ecall
# BF16:
la a0,str_1
li a7, 4
ecall
# set start time
li a7,31
ecall
mv s2,a1
mv a1,t1 # test_data
addi sp,sp,-20
sw s0,0(sp) # counter
sw s1,4(sp) # gold_data
sw s2,8(sp) # #cycle
sw s3,12(sp)
sw s4,16(sp)
jal f32_to_bf16
mv a1,a0
jal Q_sqrt_bf16
mv a1,a0
jal bf16_to_f32
lw s0,0(sp)
lw s1,4(sp)
lw s2,8(sp)
lw s3,12(sp)
lw s4,16(sp)
addi sp,sp,20
li a7,2 # a0 is result
ecall
# set end time
li a7,31
ecall
sub s2,a0,s2
la a0,str_2
li a7,4
ecall
mv a0,s1 # gold data
li a7,2
ecall
# elapsed cycles
la a0,str_3
li a7,4
ecall
mv a0,s2
li a7,1
ecall
la a0,str_nl
li a7,4
ecall
addi s0,s0,-1
j loop
exit:
li a7,10
ecall
Q_sqrt_bf16: # number: a1
addi sp,sp,-4
sw ra,0(sp)
li a2,0x3f00 # 0.5
jal bf16_mul
mv a3,a0 # x2: a3
li t1,0x5f37
srli a1,a1,1
sub a4,t1,a1 # y: a4
mv a2,a4
mv a1,a4
jal bf16_mul # bf16_mul(y,y): a0
mv a2,a0
mv a1,a3
jal bf16_mul # bf16_mul(x2,...)
mv a2,a0
li a1,0x3fc0 # a1: threehalfs=1.5
jal bf16_sub
mv a2,a0
mv a1,a4
jal bf16_mul # bf16_mul(y,...)
lw ra,0(sp)
addi sp,sp,4
ret
```
### Test Results


## Algorithm with Shift-Add Multiplication
### ✳️ Introduction
Shift–Add Multiplication is a **bitwise arithmetic method** for performing integer or fixed-point multiplication using only **addition** and **bit shifting**.
It is conceptually derived from the **binary expansion of the multiplier** and forms the mathematical foundation of hardware multipliers (e.g., array multipliers, Booth multipliers, and iterative ALUs).
---
### 🧩 Mathematical Basis
Let the two **unsigned integers** be:
$\[
A = \text{multiplicand}, \q
B = \text{multiplier}$
where $A, B \in \mathbb{N}$.
The binary representation of $B$ can be written as:
$B = \sum_{i=0}^{n-1} b_i \cdot 2^i, \quad b_i \in \{0,1\}$
Then, the product $P = A \times B$ can be expanded as:
$P = A \times \left(\sum_{i=0}^{n-1} b_i \cdot 2^i\right)
= \sum_{i=0}^{n-1} (A \cdot b_i) \cdot 2^i$
Since $b_i$ is either 0 or 1, this simplifies to a **conditional accumulation**:
$P = \sum_{i=0}^{n-1} p_i ,$ $p_i=
\begin{cases}
A \cdot 2^i, & \text{if } b_i = 1 \\
0, & \text{if } b_i = 0
\end{cases}$
Thus, multiplication reduces to:
1. **Shift** the multiplicand $A$ left by $i$ bits.
2. **Add** it to the running sum if the corresponding bit $b_i$ is 1.
---
### ⚖️ Properties
| Property | Description |
|-----------|--------------|
| **Complexity** | $O(n)$, where $n$ is the number of bits |
| **Operations** | Only requires addition and bit-shift (no direct multiply) |
| **Determinism** | Fixed iteration count; predictable latency |
| **Hardware Parallelism** | Easily parallelizable for array multipliers |
| **Accuracy** | Exact for integer arithmetic; in fixed-point, rounding may occur |
---
> 💬 **In summary:**
> The Shift–Add multiplication method leverages the binary structure of numbers to express multiplication as a series of conditional shifts and additions. It trades raw instruction complexity for predictable, uniform operations — a crucial property for low-power, fixed-latency arithmetic units, especially in BF16 or other reduced-precision numeric systems.
## 🚀 Implementation
### Modified Assembly Code with BF16
We modify the original substract-accumulating multiplication:
```asm=
...
# Multiply: t4 = t4 * s5
li t4,0
mult_loop:
beqz s5,mult_exit
add t4,t4,s4
addi s5,s5,-1
j mult_loop
mult_exit:
...
```
to bitwise shift-add multiplication, since:
- Multiplicand and multiplier are **unsigned**.
- Multiplicand and multiplier are **at most 16 bits**.
- It takes $\Theta(16)$ instead of $O(\textrm{value of multiplier})$, which is constant time in **this architecture**.
```asm=
...
# Multiply: t4 = t4 * s5
li t4, 0 # result = 0
li t0, 16 # 16 bits to process
mult_loop:
andi t5,s5,1 # check LSB of multiplier
beqz t5,skip_add # if 0, skip addition
add t4,t4,s4 # if LSB=1, add multiplicand to result
skip_add:
srli s5,s5,1 # shift multiplier right by 1
slli s4,s4,1 # shift multiplicand left by 1
addi t0,t0,-1
bnez t0,mult_loop
mult_exit:
...
```
### Test Results


# 🧮 Evaluation
The evaluation compares the proposed **fast multiplication–based reciprocal square root (`q_rsqrt`)** with the **traditional multiply–accumulate implementation**, both in **BF16 precision**, using **FP32** results as the reference.
| Test Data | Error (to FP32) | Cycles – Traditional Mul. | Cycles – Fast Mul. |
|------------|----------------|---------------------------|--------------------|
| 340253.125 | 0.577% | 5281 | 1010 |
| 100 | 0.586% | 4975 | 1010 |
| 6.25 | 0.586% | 4975 | 1010 |
| 1 | 0.000% | 4869 | 1000 |
| 0.00001783 | 0.075% | 4651 | 1002 |
---
### 💡 Commentary
The **fast multiplication method** achieves a **4.9×–5.2× speedup** over the traditional multiply–accumulate approach, lowering the average cycle count from ~5k to ~1k across all test cases.
Despite this large performance gain, the **numerical accuracy remains within 0.6%** of the FP32 reference — well within the tolerance expected for BF16 precision arithmetic.
Key observations:
- For well-conditioned inputs (e.g., `x = 1`), the fast method achieves **exact FP32 parity (0.000% error)**.
- Larger or smaller magnitudes show small, consistent rounding errors due to BF16’s reduced mantissa precision.
- Performance remains **deterministic** (fixed iteration count), making it suitable for **real-time or low-power applications**.
---
### ⚙️ Summary
| Metric | Observation |
|--------|--------------|
| **Speedup** | ~5× faster |
| **Accuracy Loss** | ≤ 0.6% |
| **Determinism** | Fixed cycle count (~1000) |
| **Use Case** | Embedded / real-time FP pipelines, BF16 ALU verification |
> ✅ *Conclusion:*
> The bitwise shift–add–based fast multiplier preserves accuracy while dramatically improving throughput, making it a practical choice for hardware or firmware-level BF16 math units.
# 📈Analysis
## ⚙️ 5-Stage Pipelined Processor in Ripes
Ripes provides multiple CPU models that can execute the same RISC-V assembly program:
- **Single-cycle processor**:
Each instruction completes all operations (fetch, decode, execute, memory access, write-back) within a single clock cycle.
- **5-stage pipelined processor**:
Splits instruction execution into five stages (IF, ID, EX, MEM, WB), allowing multiple instructions to execute simultaneously in different stages.
This section visualizes and explains each stage.
---

🔹 Instruction Fetch (IF)
- **PC**: Stores the current instruction address (e.g., `PC = 0x00000000`).
After each instruction, it updates to the next address.
- **Adder (+4)**: Computes `PC + 4 = 0x00000004` for the next instruction.
- **MUX**: Selects the next PC value.
- Normally chooses `PC + 4` (sequential execution).
- For jumps/branches, selects the target address instead.
- Example: for `jal x1, 316`, the next PC becomes `PC + 316`.
- **Instruction Memory**: Fetches the instruction at the current PC.
Here, `addr = 0x00000000` → instruction `0x13C000EF` (`jal x1, 316`).
- **Compressed Decoder**:
Checks if the instruction is a 16-bit compressed form and expands it to 32 bits if necessary.
(`jal` here is a 32-bit normal instruction, so it stays the same.)
- **IF/ID Pipeline Register**:
Stores the instruction and PC for use in the next cycle by the ID stage.
(Enable = green, Clear = red → instruction flows normally to ID.)
---

🔹 Instruction Decode / Register Fetch (ID)
- **IF/ID Register**: Supplies the current instruction (`0x13C000EF`) and PC.
- **Decode Unit**:
Extracts opcode, rd, rs1, rs2, funct3, and funct7 fields to identify the instruction type (here: JAL).
- **Register File**:
Reads the source registers. Since `jal` doesn’t use sources, Reg1 and Reg2 = `0x00000000`.
- **Immediate Generator**:
Extracts the immediate field → `0x0000013C` (jump offset = 316).
- **ID/EX Pipeline Register**:
Holds decoded values for the next clock cycle (passed to EX).
---

🔹 Execute (EX)
- **ALU**:
Performs arithmetic, logic, and address calculations.
- Input: `PC = 0x00000000`, `Imm = 0x0000013C`
- Output: `Res = 0x0000013C` → Jump Target Address
- **ALU Input Selectors (MUXes)**:
Upper MUX selects PC, lower MUX selects immediate → performs `PC + Imm`.
- **Branch Unit**:
Determines if a branch is taken (`beq`, `bne`, `blt`, `bge`, etc.).
For `jal`, the jump always occurs → `Branch taken = true`.
The jump target (0x0000013C) is sent back to the IF stage to update PC.
---

🔹 Memory Access (MEM)
- **Data Memory**:
The `jal` instruction doesn’t access memory, so this stage simply passes through without changes.
---

🔹 Write Back (WB)
- **MUX (Write-back Selector)**:
Chooses which data source will be written to the register file.
- In this case: `PC + 4 = 0x00000004`.
- **Register File**:
Writes `0x00000004` to the destination register `x1 (ra)`.
- **Result**:
After execution, `x1 = 0x00000004`, which stores the return address (next instruction address).
---
💡 **Summary**
This 5-stage pipeline allows multiple instructions to be processed simultaneously in different stages, improving CPU throughput and overall efficiency.
# Reference
[Inverse Square Root](https://ece.uwaterloo.ca/~dwharder/aads/Algorithms/Inverse_square_root/)