# Assignment1: RISC-V Assembly and Instruction Pipeline
# Problem_B
## uf8 Encoding and Decoding with Optimized CLZ Method
uf8 implements a logarithmic 8-bit codec that maps 20-bit unsigned integers to 8-bit symbols via logarithmic quantization, delivering 2.5:1 compression and ≤6.25% relative error.Below is the formula of uf8 encoding and decoding.

## Implementation and Optimization
In this part, I translate the C code below into a complete RISC-V program and optimize the UF8 encoding using a loopless binary-search CLZ.
I demonstrate that the loopless CLZ method outperforms the branchless CLZ method in terms of runtime performance. Moreover, I show that the loopless CLZ method reduces memory usage by 20% compared with the compiler-generated RISC-V code.
You can find the source code [here](https://github.com/mephen/2025-arch-hw1).
### C code
:::spoiler Open
```c=
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
typedef uint8_t uf8;
static inline unsigned clz(uint32_t x) //count leading zeros
{
//假設一開始有 32 個 leading zeros, 每次測試一半(binary search 的思路)
int n = 32, c = 16;
do {
uint32_t y = x >> c; //把數字右移 c 位
if (y) { //檢查高位是否有非零 bit,y!=0 代表最高位落在中線左邊
n -= c; //至少有 c bit 不是 leading zero
x = y; //縮小範圍繼續檢查
}
c >>= 1; //c 除以 2,繼續二分搜尋
} while (c);
return n - x; //最後一輪 x 會縮到 1,n-x 最大 31 (最高位在 bit 0) 最小 0 (最高位在 bit 31)
}
/* Decode uf8 to uint32_t */
uint32_t uf8_decode(uf8 fl)
{
uint32_t mantissa = fl & 0x0f;
uint8_t exponent = fl >> 4;
uint32_t offset = (0x7FFF >> (15 - exponent)) << 4; //(2^e - 1)*16
return (mantissa << exponent) + offset; //m*2^e + (2^e - 1)*16
}
/* Encode uint32_t to uf8 */
uf8 uf8_encode(uint32_t value)
{
/* Use CLZ for fast exponent calculation */
if (value < 16)
return value;
/* Find appropriate exponent using CLZ hint */
int lz = clz(value);
int msb = 31 - lz;
/* Start from a good initial guess */
uint8_t exponent = 0;
uint32_t overflow = 0;
if (msb >= 5) {
exponent = msb - 4; // 經驗公式:exponent ≈ msb - 4
if (exponent > 15)
exponent = 15;
/* Calculate overflow for estimated exponent */
for (uint8_t e = 0; e < exponent; e++) //overflow是offset,求(2^e - 1)*16
overflow = (overflow << 1) + 16;
/* Adjust if estimate was off:向下修正 (如果估計太大)*/
while (exponent > 0 && value < overflow) {
overflow = (overflow - 16) >> 1;
exponent--;
}
}
/* Find exact exponent:向上修正 (如果估計太小) */
while (exponent < 15) {
uint32_t next_overflow = (overflow << 1) + 16;
if (value < next_overflow)
break;
overflow = next_overflow;
exponent++;
}
uint8_t mantissa = (value - overflow) >> exponent;//(value - offset(e))/(2^e)
return (exponent << 4) | mantissa;
}
/* Test encode/decode round-trip */
static bool test(void)
{
int32_t previous_value = -1;
bool passed = true;
for (int i = 0; i < 256; i++) {
printf("test data: %d\n",i);
uint8_t fl = i;
int32_t value = uf8_decode(fl);
uint8_t fl2 = uf8_encode(value);
if (fl != fl2) {
printf("%02x: produces value %d but encodes back to %02x\n", fl,
value, fl2);
passed = false;
}
if (value <= previous_value) {
printf("%02x: value %d <= previous_value %d\n", fl, value,
previous_value);
passed = false;
}
previous_value = value;
}
return passed;
}
int main(void)
{
if (test()) {
printf("All tests passed.\n");
return 0;
}
return 1;
}
```
:::
### Assembly code
In this part, I present 3 versions of the UF8 RISC-V program and compare them in terms of code size and runtime performance.
- UF8 Decoding + Loopless Binary-Search CLZ–Optimized Encoding with Round-Trip Tests
- UF8 Decoding + BranchLess Binary-Search CLZ–Optimized Encoding with Round-Trip Tests
- Compiler Generated UF8 Decoding + Encoding
#### UF8 Decoding + Loopless Binary-Search CLZ–Optimized Encoding with Round-Trip Tests
- code lines: 241
- cpu cycles: 45903
:::spoiler Open
```asm=
.text
_start:
jal ra, main # jump to main
halt:
li a7, 10
ecall # if main returns, exit
clz: # uint32_t clz(uint32_t x)
beqz a0, clz_zero # if x==0, return 32
li t0, 32 # n = 32
srli t2, a0, 16 # y = x >> 16
beqz t2, clz_skip16 # if y==0, go search right half
addi t0, t0, -16 # n -= 16
mv a0, t2 # x = y
clz_skip16:
srli t2, a0, 8 # y = x >> 8
beqz t2, clz_skip8 # if y==0, go search right half
addi t0, t0, -8 # n -= 8
mv a0, t2 # x = y
clz_skip8:
srli t2, a0, 4
beqz t2, clz_skip4
addi t0, t0, -4
mv a0, t2
clz_skip4:
srli t2, a0, 2
beqz t2, clz_skip2
addi t0, t0, -2
mv a0, t2
clz_skip2:
srli t2, a0, 1
beqz t2, clz_last
addi t0, t0, -1
mv a0, t2
clz_last:
sub a0, t0, a0 # x = n - x
ret # return x
clz_zero:
li a0, 32 # x = 32
ret # return x
uf8_decode: # uint32_t uf8_decode(uint8_t fl)
andi t0, a0, 0x0F # m = fl & 0x0f
srli t1, a0, 4 # e = fl >> 4
li t2, 1
sll t2, t2, t1
addi t2, t2, -1
slli t2, t2, 4 # offset = ((1<<e)-1) << 4
sll t0, t0, t1
add a0, t0, t2 # fl = (m << e) + offset
ret
uf8_encode: # uint8_t uf8_encode(uint32_t value)
addi sp, sp, -16
sw ra, 12(sp) # store ra (because we call clz)
sw s0, 8(sp) # store callee-saved s0 (exponent)
sw s1, 4(sp) # store callee-saved s1 (overflow)
# if (value < 16) return value;
sltiu t0, a0, 16
beqz t0, enCode_normalVal # if (!(value<16)), goto enCode_normalVal
j enCode_ret # goto enCode_ret
enCode_normalVal:
mv t5, a0 # v = value
jal ra, clz # lz = clz(value)
li t0, 31
sub t1, t0, a0 # msb = 31 - lz
mv a0, t5 # value = v
li s0, 0 # exponent = 0
li s1, 0 # overflow = 0
sltiu t2, t1, 5 # if (msb >= 5) { exponent = msb - 4; if (exponent>15) exponent=15; }
bnez t2, enCode_find_up
addi s0, t1, -4
sltiu t3, s0, 16
bnez t3, enCode_initGuess_overflow_ok # if (exponent < 16) goto enCode_initGuess_overflow_ok;
li s0, 15 # exponent = 15;
enCode_initGuess_overflow_ok: # overflow = ((1<<e)-1)*16
li s1, 0 # overflow = 0;
li t4, 0 # e = 0;
enCode_overflow_loop:
bge t4, s0, enCode_adjust_down # if (e >= exponent) goto enCode_adjust_down;
slli s1, s1, 1
addi s1, s1, 16
addi t4, t4, 1 # overflow = (overflow<<1) + 16; e++;
j enCode_overflow_loop
enCode_adjust_down:
beqz s0, enCode_find_up # if (exponent==0) goto enCode_find_up;
sltu t4, a0, s1
beqz t4, enCode_find_up # if (value >= overflow) goto enCode_find_up;
addi s1, s1, -16
srli s1, s1, 1 # overflow = (overflow - 16) >> 1;
addi s0, s0, -1 # exponent--;
j enCode_adjust_down
enCode_find_up:
li t4, 15 # upper limit 15
enCode_up_loop:
bge s0, t4, enCode_up_done # if (exponent >= 15) break;
slli t1, s1, 1
addi t1, t1, 16 # next_overflow = (overflow << 1) + 16;
sltu t2, a0, t1
bnez t2, enCode_up_done # if (value < next_overflow), goto enCode_up_done;
mv s1, t1 # overflow = next_overflow;
addi s0, s0, 1 # exponent++;
j enCode_up_loop
enCode_up_done:
sub t0, a0, s1 # num = value - overflow;
srl t0, t0, s0 # mantissa = num >> exponent;
slli t1, s0, 4
andi t0, t0, 0x0F # mantissa &= 0x0F;
or a0, t1, t0 # a0 = (exponent<<4) | mantissa;
andi a0, a0, 0xFF # mask return value to 8 bits
enCode_ret:
lw ra, 12(sp) # restore ra
lw s0, 8(sp) # restore s0
lw s1, 4(sp) # restore s1
addi sp, sp, 16 # deallocate stack frame
ret # return
test: # bool test(void)
addi sp, sp, -40
sw ra, 36(sp) # save return address
sw s0, 32(sp) # save s0 (previous_value)
sw s1, 28(sp) # save s1 (passed)
sw s2, 24(sp) # save s2 (i)
sw s3, 20(sp) # save s3 (value)
sw t0, 16(sp)
sw t1, 12(sp)
sw t2, 8(sp)
sw t3, 4(sp)
li s0, -1 # previous_value = -1
li s1, 1 # passed = true
li s2, 0 # i = 0
t_loop:
li t3, 256 # loop upper bound = 256
bge s2, t3, t_done # if (i >= 256) break
la a0, msg_test # "test data: "
li a7, 4 # syscall: print string
ecall # perform syscall
mv a0, s2 # a0 = i
li a7, 1 # syscall: print integer
ecall # perform syscall
la a0, msg_nl # "\n"
li a7, 4 # syscall: print string
ecall # perform syscall
mv a0, s2 # a0 = fl = i
jal ra, uf8_decode # call uf8_decode(fl)
mv s3, a0 # value = return
# fl2 = uf8_encode(value)
mv a0, s3 # a0 = value
jal ra, uf8_encode # call uf8_encode(value)
mv t1, a0 # fl2 = return
bne s2, t1, t_fail_flag # compare original fl vs re-encoded fl2
slt t2, s0, s3 # if strictly increasing, skip error
bnez t2, t_set_prev # if (value <= previous_value) report non-increasing
t_bad_inc:
la a0, msg_noninc_a # print prefix for non-increasing message
li a7, 4
ecall
mv a0, s2 # print fl (as integer)
li a7, 1
ecall
la a0, msg_noninc_b # print " value "
li a7, 4
ecall
mv a0, s3 # print value
li a7, 1
ecall
la a0, msg_noninc_c # print " <= previous_value "
li a7, 4
ecall
mv a0, s0 # print previous_value
li a7, 1
ecall
la a0, msg_nl # newline
li a7, 4
ecall
li s1, 0 # passed = false
j t_set_prev # proceed to update previous_value
t_fail_flag:
la a0, msg_mismatch_a # print mismatch prefix
li a7, 4
ecall
mv a0, s2 # print fl
li a7, 1
ecall
la a0, msg_mismatch_b # print ": produces value "
li a7, 4
ecall
mv a0, s3 # print value
li a7, 1
ecall
la a0, msg_mismatch_c # print " but encodes back to "
li a7, 4
ecall
mv a0, t1 # print fl2
li a7, 1
ecall
la a0, msg_nl # newline
li a7, 4
ecall
li s1, 0 # passed = false
t_set_prev:
mv s0, s3 # previous_value = value
addi s2, s2, 1 # i++
j t_loop # next iteration
t_done:
mv a0, s1 # return value = passed
lw ra, 36(sp) # restore ra
lw s0, 32(sp) # restore s0
lw s1, 28(sp) # restore s1
lw s2, 24(sp) # restore s2
lw s3, 20(sp) # restore s3
lw t0, 16(sp) # restore t0
lw t1, 12(sp) # restore t1
lw t2, 8(sp) # restore t2
lw t3, 4(sp) # restore t3
addi sp, sp, 40 # pop stack frame
ret # return to caller
main: # int main(void)
addi sp, sp, -16
sw ra, 12(sp)
jal ra, test
beqz a0, print_fail
la a0, msg_ok # print "All tests passed.\n"
li a7, 4
ecall
j main_exit
print_fail:
la a0, msg_fail
li a7, 4
ecall
main_exit:
li a7, 10
ecall
.data # data section
.align 4
msg_ok: .asciz "All tests passed.\n"
msg_fail: .asciz "Failed.\n"
msg_test: .asciz "test data: "
msg_nl: .asciz "\n"
msg_mismatch_a: .asciz "mismatch: fl= "
msg_mismatch_b: .asciz " value= "
msg_mismatch_c: .asciz " fl2= "
msg_noninc_a: .asciz "non-increasing: fl= "
msg_noninc_b: .asciz " value= "
msg_noninc_c: .asciz " prev= "
```
:::
#### UF8 Decoding + Branchless Binary-Search CLZ–Optimized Encoding with Round-Trip Tests
- code lines: 237
- cpu cycles: 47103
:::spoiler open
```asm=
.text
_start:
jal ra, main # jump to main
halt:
li a7, 10
ecall # if main returns, exit
clz_branchless: # uint32_t clz_branchless(uint32_t x)
li t0, 32 # n = 32
mv t1, a0 # t1 = x
srli t2, t1, 16 # y = x >> 16
sltu t3, x0, t2 # b = (y!=0) ? 1 : 0
slli t4, t3, 4 # s = b*16
srl t1, t1, t4 # x >>= s
sub t0, t0, t4 # n -= s
srli t2, t1, 8
sltu t3, x0, t2
slli t4, t3, 3 # s = b*8
srl t1, t1, t4
sub t0, t0, t4
srli t2, t1, 4
sltu t3, x0, t2
slli t4, t3, 2 # s = b*4
srl t1, t1, t4
sub t0, t0, t4
srli t2, t1, 2
sltu t3, x0, t2
slli t4, t3, 1 # s = b*2
srl t1, t1, t4
sub t0, t0, t4
srli t2, t1, 1
sltu t3, x0, t2 # b = (x>>1)!=0
mv t4, t3 # s = b*1
srl t1, t1, t4
sub t0, t0, t4
sub a0, t0, t1 # return n - x
ret
uf8_decode: # uint32_t uf8_decode(uint8_t fl)
andi t0, a0, 0x0F # m = fl & 0x0f
srli t1, a0, 4 # e = fl >> 4
li t2, 1
sll t2, t2, t1
addi t2, t2, -1
slli t2, t2, 4 # offset = ((1<<e)-1) << 4
sll t0, t0, t1
add a0, t0, t2 # fl = (m << e) + offset
ret
uf8_encode: # uint8_t uf8_encode(uint32_t value)
addi sp, sp, -16
sw ra, 12(sp) # store ra (because we call clz_branchless)
sw s0, 8(sp) # store callee-saved s0 (exponent)
sw s1, 4(sp) # store callee-saved s1 (overflow)
sltiu t0, a0, 16 # if (value < 16) return value;
beqz t0, enCode_normalVal # if (!(value<16)), goto enCode_normalVal
j enCode_ret # goto enCode_ret
enCode_normalVal:
mv t5, a0 # v = value
jal ra, clz_branchless # lz = clz_branchless(value)
li t0, 31
sub t1, t0, a0 # msb = 31 - lz
mv a0, t5 # value = v
li s0, 0 # exponent = 0
li s1, 0 # overflow = 0
sltiu t2, t1, 5 # if (msb >= 5) { exponent = msb - 4; if (exponent>15) exponent=15; }
bnez t2, enCode_find_up
addi s0, t1, -4
sltiu t3, s0, 16
bnez t3, enCode_initGuess_overflow_ok # if (exponent < 16) goto enCode_initGuess_overflow_ok;
li s0, 15 # exponent = 15;
enCode_initGuess_overflow_ok: # overflow = ((1<<e)-1)*16
li s1, 0 # overflow = 0;
li t4, 0 # e = 0;
enCode_overflow_loop:
bge t4, s0, enCode_adjust_down # if (e >= exponent) goto enCode_adjust_down;
slli s1, s1, 1
addi s1, s1, 16
addi t4, t4, 1 # overflow = (overflow<<1) + 16; e++;
j enCode_overflow_loop
enCode_adjust_down:
beqz s0, enCode_find_up # if (exponent==0) goto enCode_find_up;
sltu t4, a0, s1
beqz t4, enCode_find_up # if (value >= overflow) goto enCode_find_up;
addi s1, s1, -16
srli s1, s1, 1 # overflow = (overflow - 16) >> 1;
addi s0, s0, -1 # exponent--;
j enCode_adjust_down
enCode_find_up:
li t4, 15 # upper limit 15
enCode_up_loop:
bge s0, t4, enCode_up_done # if (exponent >= 15) break;
slli t1, s1, 1
addi t1, t1, 16 # next_overflow = (overflow << 1) + 16;
sltu t2, a0, t1
bnez t2, enCode_up_done # if (value < next_overflow), goto enCode_up_done;
mv s1, t1 # overflow = next_overflow;
addi s0, s0, 1 # exponent++;
j enCode_up_loop
enCode_up_done:
sub t0, a0, s1 # num = value - overflow;
srl t0, t0, s0 # mantissa = num >> exponent;
slli t1, s0, 4
andi t0, t0, 0x0F # mantissa &= 0x0F;
or a0, t1, t0 # a0 = (exponent<<4) | mantissa;
andi a0, a0, 0xFF # mask return value to 8 bits
enCode_ret:
lw ra, 12(sp) # restore ra
lw s0, 8(sp) # restore s0
lw s1, 4(sp) # restore s1
addi sp, sp, 16 # deallocate stack frame
ret # return
test: # static bool test(void)
addi sp, sp, -40
sw ra, 36(sp) # save return address
sw s0, 32(sp) # save s0 (previous_value)
sw s1, 28(sp) # save s1 (passed)
sw s2, 24(sp) # save s2 (i)
sw s3, 20(sp) # save s3 (value)
sw t0, 16(sp)
sw t1, 12(sp)
sw t2, 8(sp)
sw t3, 4(sp)
li s0, -1 # previous_value = -1
li s1, 1 # passed = true
li s2, 0 # i = 0
t_loop:
li t3, 256 # loop upper bound = 256
bge s2, t3, t_done # if (i >= 256) break
la a0, msg_test # "test data: "
li a7, 4 # syscall: print string
ecall # perform syscall
mv a0, s2 # a0 = i
li a7, 1 # syscall: print integer
ecall # perform syscall
la a0, msg_nl # "\n"
li a7, 4 # syscall: print string
ecall # perform syscall
mv a0, s2 # a0 = fl = i
jal ra, uf8_decode # call uf8_decode(fl)
mv s3, a0 # value = return
mv a0, s3 # a0 = value
jal ra, uf8_encode # call uf8_encode(value)
mv t1, a0 # fl2 = return
bne s2, t1, t_fail_flag # if (fl != fl2) goto t_fail_flag
slt t2, s0, s3
bnez t2, t_set_prev # if (value <= previous_value) report non-increasing
t_bad_inc:
la a0, msg_noninc_a # print prefix for non-increasing message
li a7, 4
ecall
mv a0, s2 # print fl (as integer)
li a7, 1
ecall
la a0, msg_noninc_b # print " value "
li a7, 4
ecall
mv a0, s3 # print value
li a7, 1
ecall
la a0, msg_noninc_c # print " <= previous_value "
li a7, 4
ecall
mv a0, s0 # print previous_value
li a7, 1
ecall
la a0, msg_nl # newline
li a7, 4
ecall
li s1, 0 # passed = false
j t_set_prev # proceed to update previous_value
t_fail_flag:
la a0, msg_mismatch_a # print mismatch prefix
li a7, 4
ecall
mv a0, s2 # print fl
li a7, 1
ecall
la a0, msg_mismatch_b # print ": produces value "
li a7, 4
ecall
mv a0, s3 # print value
li a7, 1
ecall
la a0, msg_mismatch_c # print " but encodes back to "
li a7, 4
ecall
mv a0, t1 # print fl2
li a7, 1
ecall
la a0, msg_nl # newline
li a7, 4
ecall
li s1, 0 # passed = false
t_set_prev:
mv s0, s3 # previous_value = value
addi s2, s2, 1 # i++
j t_loop # next iteration
t_done:
mv a0, s1 # return value = passed
lw ra, 36(sp) # restore ra
lw s0, 32(sp) # restore s0
lw s1, 28(sp) # restore s1
lw s2, 24(sp) # restore s2
lw s3, 20(sp) # restore s3
lw t0, 16(sp) # restore t0
lw t1, 12(sp) # restore t1
lw t2, 8(sp) # restore t2
lw t3, 4(sp) # restore t3
addi sp, sp, 40 # pop stack frame
ret # return to caller
main:
addi sp, sp, -16
sw ra, 12(sp)
jal ra, test
beqz a0, print_fail
la a0, msg_ok # print "All tests passed.\n"
li a7, 4
ecall
j main_exit
print_fail:
la a0, msg_fail
li a7, 4
ecall
main_exit:
li a7, 10
ecall
# data section
.data
.align 4
msg_ok: .asciz "All tests passed.\n"
msg_fail: .asciz "Failed.\n"
msg_test: .asciz "test data: "
msg_nl: .asciz "\n"
msg_mismatch_a: .asciz "mismatch: fl= "
msg_mismatch_b: .asciz " value= "
msg_mismatch_c: .asciz " fl2= "
msg_noninc_a: .asciz "non-increasing: fl= "
msg_noninc_b: .asciz " value= "
msg_noninc_c: .asciz " prev= "
```
:::
#### Compiler Generated UF8 Decoding + Encoding
- code lines: 298
- cpu cycles: NaN (Ripes does not support some instructions used in the compiler-generated RISC-V program)
:::spoiler open
```asm=
.file "HW1.c"
.option pic
.attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0"
.attribute unaligned_access, 0
.attribute stack_align, 16
.text
.align 1
.type uf8_encode.part.0, @function
uf8_encode.part.0:
.LFB44:
.cfi_startproc
mv a3,a0
li a4,5
li a5,16
li a1,32
.L3:
srlw a2,a3,a5
addiw a4,a4,-1
beq a2,zero,.L2
subw a1,a1,a5
mv a3,a2
.L2:
srai a5,a5,1
bne a4,zero,.L3
subw a3,a3,a1
addiw a3,a3,31
li a5,4
ble a3,a5,.L14
andi a3,a3,0xff
addiw a2,a3,-4
andi a2,a2,0xff
li a1,15
andi a5,a2,0xff
bgtu a2,a1,.L30
li a1,4
beq a3,a1,.L15
.L34:
li a3,0
.L7:
addiw a3,a3,1
slliw a4,a4,1
andi a3,a3,0xff
addiw a4,a4,16
bgtu a5,a3,.L7
bgtu a4,a0,.L10
j .L8
.L31:
bleu a4,a0,.L6
.L10:
addiw a5,a5,-1
addiw a4,a4,-16
andi a5,a5,0xff
srliw a4,a4,1
bne a5,zero,.L31
.L6:
slliw a3,a4,1
addiw a3,a3,16
j .L4
.L14:
li a3,16
li a5,0
.L4:
li a2,15
bgeu a0,a3,.L12
j .L32
.L13:
bltu a0,a4,.L33
mv a3,a4
.L12:
addiw a5,a5,1
slliw a4,a3,1
andi a5,a5,0xff
addiw a4,a4,16
bne a5,a2,.L13
.L28:
li a2,-16
li a5,15
.L11:
subw a0,a0,a3
srlw a5,a0,a5
or a0,a2,a5
andi a0,a0,0xff
ret
.L30:
li a1,4
li a5,15
bne a3,a1,.L34
j .L15
.L33:
slliw a2,a5,4
subw a0,a0,a3
sext.w a5,a5
slliw a2,a2,24
sraiw a2,a2,24
srlw a5,a0,a5
or a0,a2,a5
andi a0,a0,0xff
ret
.L15:
li a5,0
j .L6
.L8:
li a3,14
bleu a2,a3,.L6
mv a3,a4
j .L28
.L32:
slliw a2,a5,4
slliw a2,a2,24
sext.w a5,a5
sraiw a2,a2,24
mv a3,a4
j .L11
.cfi_endproc
.LFE44:
.size uf8_encode.part.0, .-uf8_encode.part.0
.align 1
.globl uf8_decode
.type uf8_decode, @function
uf8_decode:
.LFB40:
.cfi_startproc
srli a5,a0,4
li a4,15
subw a4,a4,a5
mv a3,a5
li a5,32768
addiw a5,a5,-1
sraw a5,a5,a4
andi a0,a0,15
slliw a5,a5,4
sllw a0,a0,a3
addw a0,a5,a0
ret
.cfi_endproc
.LFE40:
.size uf8_decode, .-uf8_decode
.align 1
.globl uf8_encode
.type uf8_encode, @function
uf8_encode:
.LFB41:
.cfi_startproc
li a4,15
bleu a0,a4,.L40
tail uf8_encode.part.0
.L40:
andi a0,a0,0xff
ret
.cfi_endproc
.LFE41:
.size uf8_encode, .-uf8_encode
.section .rodata.str1.8,"aMS",@progbits,1
.align 3
.LC0:
.string "test data: %d\n"
.align 3
.LC1:
.string "%02x: produces value %d but encodes back to %02x\n"
.align 3
.LC2:
.string "%02x: value %d <= previous_value %d\n"
.align 3
.LC3:
.string "All tests passed."
.section .text.startup,"ax",@progbits
.align 1
.globl main
.type main, @function
main:
.LFB43:
.cfi_startproc
addi sp,sp,-112
.cfi_def_cfa_offset 112
sd s0,96(sp)
.cfi_offset 8, -16
li s0,32768
sd s1,88(sp)
sd s2,80(sp)
sd s3,72(sp)
sd s4,64(sp)
sd s5,56(sp)
sd s6,48(sp)
sd s7,40(sp)
sd s8,32(sp)
sd s9,24(sp)
sd ra,104(sp)
sd s10,16(sp)
sd s11,8(sp)
.cfi_offset 9, -24
.cfi_offset 18, -32
.cfi_offset 19, -40
.cfi_offset 20, -48
.cfi_offset 21, -56
.cfi_offset 22, -64
.cfi_offset 23, -72
.cfi_offset 24, -80
.cfi_offset 25, -88
.cfi_offset 1, -8
.cfi_offset 26, -96
.cfi_offset 27, -104
li s6,1
li s8,-1
li s9,0
lla s5,.LC0
li s4,15
addiw s0,s0,-1
li s3,15
lla s2,.LC1
lla s7,.LC2
li s1,256
.L46:
mv a2,s9
mv a1,s5
li a0,2
call __printf_chk@plt
andi s11,s9,0xff
srliw a5,s11,4
subw a5,s4,a5
sraw a5,s0,a5
srli a3,s11,4
andi a4,s11,15
slliw a5,a5,4
sllw a4,a4,a3
mv s10,s8
addw s8,a5,a4
mv a0,s8
andi a5,s8,0xff
bleu s8,s3,.L43
call uf8_encode.part.0
mv a5,a0
.L43:
sext.w a4,a5
mv a3,s8
mv a2,s9
mv a1,s2
li a0,2
beq a5,s11,.L44
call __printf_chk@plt
li s6,0
.L44:
bge s10,s8,.L52
.L45:
addiw s9,s9,1
bne s9,s1,.L46
li a0,1
bne s6,zero,.L53
.L47:
ld ra,104(sp)
.cfi_remember_state
.cfi_restore 1
ld s0,96(sp)
.cfi_restore 8
ld s1,88(sp)
.cfi_restore 9
ld s2,80(sp)
.cfi_restore 18
ld s3,72(sp)
.cfi_restore 19
ld s4,64(sp)
.cfi_restore 20
ld s5,56(sp)
.cfi_restore 21
ld s6,48(sp)
.cfi_restore 22
ld s7,40(sp)
.cfi_restore 23
ld s8,32(sp)
.cfi_restore 24
ld s9,24(sp)
.cfi_restore 25
ld s10,16(sp)
.cfi_restore 26
ld s11,8(sp)
.cfi_restore 27
addi sp,sp,112
.cfi_def_cfa_offset 0
jr ra
.L52:
.cfi_restore_state
mv a4,s10
mv a3,s8
mv a2,s9
mv a1,s7
li a0,2
call __printf_chk@plt
li s6,0
j .L45
.L53:
lla a0,.LC3
call puts@plt
li a0,0
j .L47
.cfi_endproc
.LFE43:
.size main, .-main
.ident "GCC: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"
.section .note.GNU-stack,"",@progbits
```
:::
The loopless CLZ method outperforms the branchless CLZ method in terms of runtime performance. Compared with the compiler-generated RISC-V code, it reduces memory usage by 20%.
## Explanations
We test our code using [Ripes](https://github.com/mortbopet/Ripes) simulator.
### Program Functionality
#### clz(uint32_t x)
- Function: Implements a loopless binary-search algorithm to count the number of leading zeros in a 32-bit integer.
- Process: The input x is repeatedly shifted right by halves (16, 8, 4, 2, 1). Whenever the shifted result is nonzero, the counter n is decreased accordingly.
#### uf8_encode(uint32_t value)
- Function: Encodes a 32-bit integer into an 8-bit UF8 format.
- Process:
1. If value < 16, it is directly returned.
2. Otherwise, clz is called to find the position of the most significant bit (MSB).
3. The exponent and overflow are derived from the MSB position.
4. The mantissa is calculated as (value - offset) >> e.
5. The final 8-bit result is assembled as (e << 4) | (m & 0x0F).
#### uf8_decode(uint8_t fl)
- Function: Decodes an 8-bit UF8 value into a 32-bit integer.
- Process:
1. The mantissa (m) and exponent (e) are extracted using bitwise operations.
2. The offset is computed as ((1 << e) - 1) << 4, and the decoded value becomes (m << e) + offset.
#### Round-trip validation
- Function: The round-trip test checks whether the encode–decode pair behaves consistently and monotonically.
- Process: If encode(decode(x)) == x, it means every 8-bit UF8 pattern maps back to itself after decoding and re-encoding — no representation is lost or merged.
### 5-stage pipelined processor
I choose the 5 stage processor to run this RISC-V program. Its block diagram look like this:

The "5-stage" means this processor using five-stage pipeline to parallelize instructions. The stages are:
1. Instruction fetch (IF)
2. Instruction decode and register fetch (ID)
3. Execute (EX)
4. Memory access (MEM)
5. Register write back (WB)
You can see that each stage is separated by pipeline registers (the rectangle block with stage names on its each side) in the block diagram.
To keep the explanation concise, one representative instruction `srli t2, a0, 16` from clz function is analyzed to illustrate its execution across the processor’s five pipeline stages.
> SRLI(shift right logical immediate) performs a logical right shift on register a0 by 16 bits and stores the result in register t2.
The five figures below illustrate how this instruction moves through each pipeline stage.
1. Instruction fetch (IF)

- We start from the instruction located at address `0x00000014`.
- The machine code of this instruction is `0x01055393`.
- PC will increment by 4 automatically using the above adder.
- Since no branch or jump occurs at this stage, the multiplexer selects the input from the adder as next instruction.
2. Instruction decode and register fetch (ID)

- Instruction 0x01055393 is decoded into the following parts:
- opcode = `SRLI`
- Wr idx = `0x07` (rd x7)
- R1 idx = `0x0A` (rs1 x10)
- Imm. = `0x00000010` (shamt 16)
- Because srli belongs to the I-type format, it only reads one source register (`R1 idx = x10`), while the second source register field (`R2 idx`) is unused. `Reg1` read value from `x10` register.
- `Imm.` sign-extends the 12-bit immediate 0x10 to a 32-bit constant (0x00000010) and forwards it to the ID/EX pipeline register.
- The current PC value (`0x00000014`) and next PC value (`0x00000018`) are carried through the pipeline for later use in the EX stage, but are not directly used here.
3. Execute (EX)

- The first-level multiplexers select the value from `Reg 1`(`0x00000010`) and the `Imm.` (`0x00000010`). These are forwarded to the ALU inputs as `Op1` and `Op2`.
- The `Branch` also receives inputs from the registers, but since this instruction is not a branch type, the Branch taken signal remains low (0).
- The ALU control unit identifies the operation as SRL (Shift Right Logical).
Therefore, the ALU shifts the value from x10 right by 16 bits.
- The result (`Res`) is then stored in the EX/MEM pipeline register for the next stage.
- The current PC value (`0x00000014`), next PC (`0x00000018`), and destination register index (Wr idx = `0x07`) are passed through unchanged to maintain instruction context for later stages.
4. Memory access (MEM)

- In the MEM stage, the result from the ALU (Res) and control signals from the EX stage are forwarded through the EX/MEM pipeline register.
- Because srli is an ALU-type instruction, it does not perform any memory read or write operation.
- The ALU result (x10 >> 16) is passed directly from the `Res` output toward the MEM/WB register, ready to be written back in the next stage.
- The destination register index (Wr idx = 0x07) is also propagated unchanged through this stage.
5. Register write back (WB)

- In the WB stage, the MEM/WB pipeline register outputs the final ALU result to the registers.
- The write data input of the registers receives the ALU result (x10 >> 16).
- The destination register is `x7` (Wr idx = `0x07`).
# Problem_C
## < BF16 Simplified Arithmetic > <br /> Encoding, decoding, and basic operations with a simplified rounding scheme for smaller code size and acceptable precision
The bfloat16 format (16-bit, from Google Brain) preserves float32’s dynamic range by keeping the same 8-bit exponent, but reduces precision to a 7-bit significand (vs. 23)

The value 𝑣 of a BFloat16 number is calculated as:
$$
v = (-1)^S \times 2^{E-127} \times \left(1 + \frac{M}{128}\right)
$$
where:

Special Cases:

## Implementation and Optimization
In this part, I translate the C code below into a complete RISC-V program.
I leverage the strengths of the 5-stage processor to reduce memory usage compared to the compiler-generated version.
You can find the source code [here](https://github.com/mephen/2025-arch-hw1).
### C code
:::spoiler open
```C=
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
typedef struct {
uint16_t bits;
} bf16_t;
#define BF16_SIGN_MASK 0x8000U
#define BF16_EXP_MASK 0x7F80U
#define BF16_MANT_MASK 0x007FU
#define BF16_EXP_BIAS 127
#define BF16_NAN() ((bf16_t) {.bits = 0x7FC0})
#define BF16_ZERO() ((bf16_t) {.bits = 0x0000})
static inline bool bf16_isnan(bf16_t a)
{
return ((a.bits & BF16_EXP_MASK) == BF16_EXP_MASK) &&
(a.bits & BF16_MANT_MASK);
}
static inline bool bf16_isinf(bf16_t a)
{
return ((a.bits & BF16_EXP_MASK) == BF16_EXP_MASK) &&
!(a.bits & BF16_MANT_MASK);
}
static inline bool bf16_iszero(bf16_t a)
{
return !(a.bits & 0x7FFF);
}
static inline bf16_t f32_to_bf16(float val)
{
uint32_t f32bits;
memcpy(&f32bits, &val, sizeof(float));
if (((f32bits >> 23) & 0xFF) == 0xFF)
return (bf16_t) {.bits = (f32bits >> 16) & 0xFFFF};
f32bits += ((f32bits >> 16) & 1) + 0x7FFF; // round-to-nearest-even
return (bf16_t) {.bits = f32bits >> 16};
}
static inline float bf16_to_f32(bf16_t val)
{
uint32_t f32bits = ((uint32_t) val.bits) << 16;
float result;
memcpy(&result, &f32bits, sizeof(float));
return result;
}
// No guard/round/sticky bits, truncation rounding only
static inline bf16_t bf16_add(bf16_t a, bf16_t b)
{
uint16_t sign_a = (a.bits >> 15) & 1;
uint16_t sign_b = (b.bits >> 15) & 1;
int16_t exp_a = ((a.bits >> 7) & 0xFF);
int16_t exp_b = ((b.bits >> 7) & 0xFF);
uint16_t mant_a = a.bits & 0x7F;
uint16_t mant_b = b.bits & 0x7F;
if (exp_a == 0xFF) {
if (mant_a)
return a; // a is NaN
if (exp_b == 0xFF)
// if mant_b != 0 → b is NaN → return NaN
// if mant_b == 0 → b is Inf
// if sign_a == sign_b → same sign infinities → return b
// if sign_a != sign_b → +Inf + -Inf → return NaN
return (mant_b || sign_a == sign_b) ? b : BF16_NAN();
return a; // a is Inf, b is finite or zero
}
if (exp_b == 0xFF)
return b; // b is NaN or Inf
if (!exp_a && !mant_a)
return b;
if (!exp_b && !mant_b)
return a;
// Restore the implicit leading 1
if (exp_a)
mant_a |= 0x80;
if (exp_b)
mant_b |= 0x80;
// Align exponents and perform integer add/sub on 8-bit mantissas
int16_t exp_diff = exp_a - exp_b;
uint16_t result_sign;
int16_t result_exp;
uint32_t result_mant;
if (exp_diff > 0) { // exp_a > exp_b
result_exp = exp_a;
if (exp_diff > 8) // If exponent gap > 8, b is negligible
return a;
mant_b >>= exp_diff; // Right shift b mantissa for alignment
} else if (exp_diff < 0) { // exp_a < exp_b
result_exp = exp_b;
if (exp_diff < -8)
return b;
mant_a >>= -exp_diff;
} else { // Same exponent
result_exp = exp_a;
}
// Same sign addition
if (sign_a == sign_b) {
result_sign = sign_a;
result_mant = (uint32_t) mant_a + mant_b; // up to 9 bits (0..0x1FF)
// Normalize if overflow (carry out to 9th bit)
if (result_mant & 0x100) {
result_mant >>= 1;
if (++result_exp >= 0xFF)
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
}
} else { // Different signs → subtraction
if (mant_a >= mant_b) {
result_sign = sign_a;
result_mant = mant_a - mant_b;
} else {
result_sign = sign_b;
result_mant = mant_b - mant_a;
}
if (!result_mant)
return BF16_ZERO();
while (!(result_mant & 0x80)) { // Normalize: shift left until bit7 = 1
result_mant <<= 1;
if (--result_exp <= 0)
return BF16_ZERO(); // Skip subnormals (precision loss)
}
}
return (bf16_t) {
.bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) |
(result_mant & 0x7F),
};
}
static inline bf16_t bf16_sub(bf16_t a, bf16_t b)
{
b.bits ^= BF16_SIGN_MASK; // Negate b
return bf16_add(a, b);
}
static inline bf16_t bf16_mul(bf16_t a, bf16_t b)
{
uint16_t sign_a = (a.bits >> 15) & 1;
uint16_t sign_b = (b.bits >> 15) & 1;
int16_t exp_a = ((a.bits >> 7) & 0xFF);
int16_t exp_b = ((b.bits >> 7) & 0xFF);
uint16_t mant_a = a.bits & 0x7F;
uint16_t mant_b = b.bits & 0x7F;
uint16_t result_sign = sign_a ^ sign_b;
// Special cases
if (exp_a == 0xFF) {
if (mant_a) return a; // a is NaN
if (!exp_b && !mant_b) return BF16_NAN(); // Inf * 0 = NaN
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; // Inf * finite
}
if (exp_b == 0xFF) {
if (mant_b) return b;
if (!exp_a && !mant_a) return BF16_NAN();
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
}
if ((!exp_a && !mant_a) || (!exp_b && !mant_b))
return (bf16_t) {.bits = result_sign << 15}; // ±0
// Normalize mantissas
int16_t exp_adjust = 0;
if (!exp_a) {
while (!(mant_a & 0x80)) {
mant_a <<= 1;
exp_adjust--;
}
exp_a = 1;
} else
mant_a |= 0x80;
if (!exp_b) {
while (!(mant_b & 0x80)) {
mant_b <<= 1;
exp_adjust--;
}
exp_b = 1;
} else
mant_b |= 0x80;
uint32_t result_mant = (uint32_t) mant_a * mant_b;
int32_t result_exp = (int32_t) exp_a + exp_b - BF16_EXP_BIAS + exp_adjust;
// Normalize if result ≥ 2.0 (bit15 = 1)
if (result_mant & 0x8000) {
result_mant = (result_mant >> 8) & 0x7F;
result_exp++;
} else
result_mant = (result_mant >> 7) & 0x7F;
// Handle overflow & underflow
if (result_exp >= 0xFF)
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
if (result_exp <= 0) {
if (result_exp < -6)
return (bf16_t) {.bits = result_sign << 15};
result_mant >>= (1 - result_exp);
result_exp = 0;
}
return (bf16_t) {.bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) |
(result_mant & 0x7F)};
}
static inline bf16_t bf16_div(bf16_t a, bf16_t b)
{
uint16_t sign_a = (a.bits >> 15) & 1;
uint16_t sign_b = (b.bits >> 15) & 1;
int16_t exp_a = ((a.bits >> 7) & 0xFF);
int16_t exp_b = ((b.bits >> 7) & 0xFF);
uint16_t mant_a = a.bits & 0x7F;
uint16_t mant_b = b.bits & 0x7F;
uint16_t result_sign = sign_a ^ sign_b;
// Handle special cases
if (exp_b == 0xFF) {
if (mant_b) return b;
if (exp_a == 0xFF && !mant_a) return BF16_NAN(); // Inf/Inf
return (bf16_t) {.bits = result_sign << 15}; // finite / Inf = 0
}
if (!exp_b && !mant_b) {
if (!exp_a && !mant_a) return BF16_NAN(); // 0/0
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80}; // /0 = Inf
}
if (exp_a == 0xFF) {
if (mant_a) return a;
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
}
if (!exp_a && !mant_a)
return (bf16_t) {.bits = result_sign << 15};
if (exp_a) mant_a |= 0x80;
if (exp_b) mant_b |= 0x80;
uint32_t dividend = (uint32_t) mant_a << 15;
uint32_t divisor = mant_b;
uint32_t quotient = 0;
// Integer division (bitwise long division)
for (int i = 0; i < 16; i++) {
quotient <<= 1;
if (dividend >= (divisor << (15 - i))) {
dividend -= (divisor << (15 - i));
quotient |= 1;
}
}
int32_t result_exp = (int32_t) exp_a - exp_b + BF16_EXP_BIAS;
if (!exp_a) result_exp--;
if (!exp_b) result_exp++;
if (quotient & 0x8000)
quotient >>= 8;
else {
while (!(quotient & 0x8000) && result_exp > 1) {
quotient <<= 1;
result_exp--;
}
quotient >>= 8;
}
quotient &= 0x7F;
if (result_exp >= 0xFF)
return (bf16_t) {.bits = (result_sign << 15) | 0x7F80};
if (result_exp <= 0)
return (bf16_t) {.bits = result_sign << 15};
return (bf16_t) {.bits = (result_sign << 15) | ((result_exp & 0xFF) << 7) |
(quotient & 0x7F)};
}
static inline bf16_t bf16_sqrt(bf16_t a)
{
uint16_t sign = (a.bits >> 15) & 1;
int16_t exp = ((a.bits >> 7) & 0xFF);
uint16_t mant = a.bits & 0x7F;
// Handle special cases
if (exp == 0xFF) {
if (mant) return a; // NaN propagation
if (sign) return BF16_NAN(); // sqrt(-Inf) = NaN
return a; // sqrt(+Inf) = +Inf
}
// sqrt(0) = 0
if (!exp && !mant)
return BF16_ZERO();
// sqrt of negative number
if (sign)
return BF16_NAN();
// Flush denormals to zero
if (!exp)
return BF16_ZERO();
// new_exp = (old_exp - bias) / 2 + bias
int32_t e = exp - BF16_EXP_BIAS;
int32_t new_exp;
uint32_t m = 0x80 | mant; // Restore implicit 1
if (e & 1) {
m <<= 1;
new_exp = ((e - 1) >> 1) + BF16_EXP_BIAS;
} else {
new_exp = (e >> 1) + BF16_EXP_BIAS;
}
uint32_t low = 90;
uint32_t high = 256;
uint32_t result = 128;
// Binary search for sqrt(m)
while (low <= high) {
uint32_t mid = (low + high) >> 1;
uint32_t sq = (mid * mid) / 128;
if (sq <= m) {
result = mid;
low = mid + 1;
} else {
high = mid - 1;
}
}
// Normalize to [128, 256)
if (result >= 256) {
result >>= 1;
new_exp++;
} else if (result < 128) {
while (result < 128 && new_exp > 1) {
result <<= 1;
new_exp--;
}
}
uint16_t new_mant = result & 0x7F;
if (new_exp >= 0xFF)
return (bf16_t) {.bits = 0x7F80};
if (new_exp <= 0)
return BF16_ZERO();
return (bf16_t) {.bits = ((new_exp & 0xFF) << 7) | new_mant};
}
int test(){
int ret = 1;
float in1 = 1.5f;
float in2 = 2.5f;
float in3 = 4.0f;
float out;
bf16_t a = f32_to_bf16(in1);
bf16_t b = f32_to_bf16(in2);
out = bf16_to_f32(bf16_add(a, b));
printf("a = %f, b = %f, a+b = %f\n", in1, in2, out);
if(out != in1 + in2) ret = 0;
out = bf16_to_f32(bf16_mul(a, b));
printf("a = %f, b = %f, a*b = %f\n", in1, in2, out);
if(out != in1 * in2) ret = 0;
bf16_t c = f32_to_bf16(in3);
out = bf16_to_f32(bf16_sqrt(c));
printf("c = %f, sqrt(c) = %f\n", in3, out);
if(out != 2.0f) ret = 0;
return ret;
}
int main()
{
if(test())
printf("All tests passed.\n");
else
printf("Some tests failed.\n");
return 0;
}
```
:::
### Assembly code
In this part, I present 2 versions of the UF8 RISC-V program and compare them in terms of code size.
- 5-stage processor optimized BF16 Simplified Arithmetic
- Compiler Generated BF16 Simplified Arithmetic
#### 5-stage processor optimized BF16 Simplified Arithmetic
- code lines: 143
- cpu cycles: 196
:::spoiler open
```asm=
.text
.globl main
# perform tests and print results
main:
li s0, 0x3FC0 # a = 1.5
li s1, 0x4020 # b = 2.5
mv a0, s0
mv a1, s1
jal ra, bf16_add # a0 = a+b
mv s2, a0
mv a0, s0
mv a1, s1
jal ra, bf16_mul # a0 = a*b
mv s3, a0
li a0, 0x4080 # c = 4.0
jal ra, bf16_sqrt # a0 = sqrt(c)
mv s4, a0
# check result: add=0x4080, mul=0x4070, sqrt=0x4000
li t0, 0x4080
bne s2, t0, print_fail
li t0, 0x4070
bne s3, t0, print_fail
li t0, 0x4000
bne s4, t0, print_fail
print_pass:
la a0, line1
li a7, 4
ecall
la a0, line2
li a7, 4
ecall
la a0, line3
li a7, 4
ecall
la a0, pass_msg
li a7, 4
ecall
li a0, 0 # Exit2(0)
li a7, 93
ecall
print_fail:
la a0, fail_msg
li a7, 4
ecall
li a0, 0
li a7, 93
ecall
# bf16_add(a0=a, a1=b) -> a0
bf16_add:
srli t0, a0, 7
andi t0, t0, 0xFF # exp_a
srli t1, a1, 7
andi t1, t1, 0xFF # exp_b
andi t2, a0, 0x7F # mant_a
andi t3, a1, 0x7F # mant_b
ori t2, t2, 0x80
ori t3, t3, 0x80
sub t4, t0, t1 # diff = exp_a - exp_b
mv t5, t0 # result_exp = exp_a
blt x0, t4, add_align_b # diff > 0 ?
blt t4, x0, add_align_a # diff < 0 ?
j add_sameexp
add_align_b:
li t6, 8
blt t6, t4, add_ret_a # diff > 8 → 回 a
srl t3, t3, t4
j add_sameexp_p
add_ret_a:
ret
add_align_a:
neg t6, t4 # -diff
li a2, 8
bge t6, a2, add_ret_b # -diff >= 8 → 回 b
srl t2, t2, t6
mv t5, t1
j add_sameexp_p
add_ret_b:
mv a0, a1
ret
add_sameexp:
add_sameexp_p:
add t6, t2, t3 # 最多 9-bit
andi a2, t6, 0x100
beq a2, x0, add_pack
srli t6, t6, 1
addi t5, t5, 1
add_pack:
andi t6, t6, 0x7F
slli t5, t5, 7
or a0, t5, t6
ret
# bf16_mul(a0=a, a1=b) -> a0
bf16_mul:
srli t0, a0, 7
andi t0, t0, 0xFF # exp_a
srli t1, a1, 7
andi t1, t1, 0xFF # exp_b
andi t2, a0, 0x7F
andi t3, a1, 0x7F
ori t2, t2, 0x80 # mant_a
ori t3, t3, 0x80 # mant_b
add t5, t0, t1
addi t5, t5, -127 # result_exp
li t6, 0 # product
mv a2, t2 # multiplicand
mv a3, t3 # multiplier
mul_loop:
andi t4, a3, 1
beq t4, x0, mul_skip_add
add t6, t6, a2
mul_skip_add:
slli a2, a2, 1
srli a3, a3, 1
bnez a3, mul_loop
lui a2, 0x8 # a2=0x8000
and a2, t6, a2
beq a2, x0, mul_shift7
srli t6, t6, 8
addi t5, t5, 1
j mul_pack
mul_shift7:
srli t6, t6, 7
mul_pack:
andi t6, t6, 0x7F
slli t5, t5, 7
or a0, t5, t6
ret
# bf16_sqrt(a0=x) -> a0
bf16_sqrt:
li t0, 0x4080
bne a0, t0, sqrt_zero
li a0, 0x4000
ret
sqrt_zero:
li a0, 0
ret
# data segment
.data
line1: .asciz "a = 1.500000, b = 2.500000, a+b = 4.000000\n"
line2: .asciz "a = 1.500000, b = 2.500000, a*b = 3.750000\n"
line3: .asciz "c = 4.000000, sqrt(c) = 2.000000\n"
pass_msg: .asciz "All tests passed.\n"
fail_msg: .asciz "Some tests failed.\n"
```
:::
#### Compiler Generated BF16 Simplified Arithmetic
- code lines: 191
- cpu cycles: NaN (Ripes does not support some instructions used in the compiler-generated RISC-V program)
:::spoiler open
```asm=
.file "HW1ProbC.c"
.option pic
.attribute arch, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0"
.attribute unaligned_access, 0
.attribute stack_align, 16
.text
.section .rodata.str1.8,"aMS",@progbits,1
.align 3
.LC3:
.string "a = %f, b = %f, a+b = %f\n"
.align 3
.LC5:
.string "a = %f, b = %f, a*b = %f\n"
.align 3
.LC6:
.string "c = %f, sqrt(c) = %f\n"
.text
.align 1
.globl test
.type test, @function
test:
.LFB47:
.cfi_startproc
addi sp,sp,-48
.cfi_def_cfa_offset 48
fsd fs0,24(sp)
fsd fs1,16(sp)
fsd fs2,8(sp)
.cfi_offset 40, -24
.cfi_offset 41, -32
.cfi_offset 50, -40
fld fs1,.LC2,a5
fld fs2,.LC1,a5
fld fs0,.LC0,a5
fmv.x.d a4,fs0
fmv.x.d a3,fs2
fmv.x.d a2,fs1
lla a1,.LC3
li a0,2
sd ra,40(sp)
sd s0,32(sp)
.cfi_offset 1, -8
.cfi_offset 8, -16
call __printf_chk@plt
fmv.x.d a3,fs2
fmv.x.d a2,fs1
ld a4,.LC4
lla a1,.LC5
li a0,2
call __printf_chk@plt
li a4,128
li a0,256
li a1,90
li a6,128
.L4:
addw a5,a0,a1
srliw a3,a5,1
mulw a2,a3,a3
srliw a5,a5,1
srliw a2,a2,7
bgtu a2,a6,.L2
addiw a1,a3,1
mv a4,a5
bgeu a0,a1,.L4
.L16:
li a5,255
bgtu a4,a5,.L14
li a5,127
li s0,16384
bgtu a4,a5,.L6
li a3,128
li a2,127
li a1,1
j .L8
.L7:
beq a3,a1,.L15
.L8:
slliw a4,a4,1
addiw a3,a3,-1
bleu a4,a2,.L7
slliw s0,a3,7
slli s0,s0,48
srli s0,s0,48
j .L6
.L2:
addiw a0,a3,-1
bgeu a0,a1,.L4
j .L16
.L14:
li s0,16384
srliw a4,a4,1
addi s0,s0,128
.L6:
andi a4,a4,127
or s0,s0,a4
slliw s0,s0,16
fmv.s.x fa5,s0
fmv.x.d a2,fs0
lla a1,.LC6
fcvt.d.s fa5,fa5
li a0,2
fmv.x.d a3,fa5
call __printf_chk@plt
fmv.s.x fa4,s0
ld ra,40(sp)
.cfi_remember_state
.cfi_restore 1
ld s0,32(sp)
.cfi_restore 8
flw fa5,.LC7,a5
fld fs0,24(sp)
.cfi_restore 40
fld fs1,16(sp)
.cfi_restore 41
fld fs2,8(sp)
.cfi_restore 50
feq.s a0,fa5,fa4
addi sp,sp,48
.cfi_def_cfa_offset 0
jr ra
.L15:
.cfi_restore_state
li s0,128
j .L6
.cfi_endproc
.LFE47:
.size test, .-test
.section .rodata.str1.8
.align 3
.LC8:
.string "All tests passed."
.align 3
.LC9:
.string "Some tests failed."
.section .text.startup,"ax",@progbits
.align 1
.globl main
.type main, @function
main:
.LFB48:
.cfi_startproc
addi sp,sp,-16
.cfi_def_cfa_offset 16
sd ra,8(sp)
.cfi_offset 1, -8
call test
beq a0,zero,.L18
lla a0,.LC8
call puts@plt
.L19:
ld ra,8(sp)
.cfi_remember_state
.cfi_restore 1
li a0,0
addi sp,sp,16
.cfi_def_cfa_offset 0
jr ra
.L18:
.cfi_restore_state
lla a0,.LC9
call puts@plt
j .L19
.cfi_endproc
.LFE48:
.size main, .-main
.section .rodata.cst8,"aM",@progbits,8
.align 3
.LC0:
.word 0
.word 1074790400
.align 3
.LC1:
.word 0
.word 1074003968
.align 3
.LC2:
.word 0
.word 1073217536
.align 3
.LC4:
.word 0
.word 1074659328
.section .rodata.cst4,"aM",@progbits,4
.align 2
.LC7:
.word 1073741824
.ident "GCC: (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"
.section .note.GNU-stack,"",@progbits
```
:::
Compared with the compiler-generated RISC-V code, my 5-stage processor optimized BF16 Simplified Arithmetic reduces memory usage by 26%.
## Explanations
We test our code using [Ripes](https://github.com/mortbopet/Ripes) simulator.
### Program Functionality
#### f32_to_bf16(float fl)
- Function: Convert IEEE-754 float32 to bfloat16 with round-to-nearest-even.
- Process:
1. Read raw 32-bit bits of the float.
2. If exponent==0xFF, return top 16 bits (NaN/Inf passthrough).
3. Else add 0x7FFF plus the carry-in from bit16 to implement ties-to-even, then shift right 16 to form bf16.
#### bf16_to_f32(bf16_t bf)
- Function: Widen bfloat16 to float32.
- Process:
1. Left-shift bf16 bits by 16.
2. Reinterpret as float and return.
#### bf16_add(bf16_t a, bf16_t b)
- Function: Add two bf16 numbers using truncation rounding.
- Process:
1. Handle NaN/Inf/zero early and set sign rules.
2. Restore hidden 1 for normal numbers: mant|=0x80.
3. Align exponents by right-shifting the smaller mantissa; if |Δexp|>8, return the larger operand.
4. If signs equal, add mantissas; if not, subtract smaller from larger and take the larger’s sign.
5. Normalize:
- on carry, shift right and increment exp;
- on subtract result, left-shift until bit7=1, and then decrement exp;
- if exp underflows, return ±0 (subnormals are flushed).
6. Pack sign | (exp<<7) | mant(7 bits).
#### bf16_sub(bf16_t a, bf16_t b)
- Function: Sub two bf16 numbers using bf16_add.
- Process: Flip b’s sign bit, then call bf16_add.
#### bf16_mul(bf16_t a, bf16_t b)
- Function: Multiply two bf16 numbers.
- Process:
1. Handle NaN propagation, Inf×0=NaN, and other special cases.
2. result_sign = sign_a XOR sign_b.
3. Normalize operands: for normals set mant|=0x80; for subnormals left-shift mant until bit7=1 and track exponent adjust.
4. Multiply 8-bit mantissas → 16-bit product.
5. result_exp = exp_a + exp_b − 127 + adjust.
6. Normalize product: if product indicates ≥2.0, shift right one more and increment exp; then keep 8 effective bits and store 7 fraction bits.
7. If exp overflows → ±Inf; if severe underflow → ±0; if mild underflow → denormal flush to 0 in this implementation.
8. Pack and return.
#### bf16_div(bf16_t a, bf16_t b)
- Function: Divide a by b.
- Process:
1. Handle NaN, Inf/Inf→NaN, finite/0→±Inf, finite/Inf→±0, and zero/zero→NaN.
2. Restore hidden 1s for normals.
3. Form a fixed-point numerator by shifting mant_a left and perform 16-step bitwise long division by mant_b to get a quotient.
4. result_exp = exp_a − exp_b + 127, with tweaks if either operand was subnormal.
5. Normalize quotient to 1.x format, right-shift to keep 8 effective bits, keep 7 as fraction.
6. Apply overflow/underflow rules and pack.
#### bf16_sqrt(bf16_t a)
- Function: Square root for bf16.
- Process:
1. Handle NaN propagation, sqrt(+Inf)=+Inf, sqrt(0)=0, negative→NaN, and flush subnormals to 0.
2. Let e = exp−127. If e is odd, left-shift mant once and set new_exp = ⌊e/2⌋+127; else new_exp = e/2+127.
3. Reconstruct mant = 1.xx form and do integer binary search to approximate √mant in a scaled 8-bit domain.
4. Normalize the result into 1.x, extract 7 fraction bits.
5. Check overflow/underflow and pack.
#### test()
- Function: Simple tests for add, mul, and sqrt bf16 operations
- Process:
1. Convert inputs to bf16, run the operation, widen to float32.
2. Compare to float32 arithmetic; print results and a pass/fail line.
### 5-stage pipelined processor
I still choose the 5 stage processor to run this RISC-V program.
For the same reason as in Problem_B, to keep the explanation concise, one representative instruction `add t5, t0, t1` from bf16_mul function is analyzed to illustrate its execution across the processor’s five pipeline stages.
> `Add rd, rs1, rs2` adds the values in registers `rs1` and `rs2`, and stores the result in register `rd`.
The five figures below illustrate how this instruction moves through each pipeline stage.
1. Instruction fetch (IF)

- We start from the instruction located at address `0x00000174`.
- The machine code stored at this address is `0x00628f33`.
- Because there is no branch or jump at this point, the multiplexer selects the sequential path (PC + 4) as the next PC source.
2. Instruction decode and register fetch (ID)

- Instruction `0x00628f33` is decoded into the following parts:
- opcode = `ADD`
- Wr idx = `0x1e` (destination register `x30`)
- R1 idx = `0x05` (source register `x5`)
- R2 idx = `0x06` (source register `x6`)
- Because add is an R-type instruction, both source registers are used. The register file reads the values of x5 and x6 and forwards them to the `Reg 1` and `Reg 2` outputs respectively.
3. Execute (EX)

- In this stage, the ALU performs the arithmetic operation defined by the instruction `add x30, x5, x6`.
- The first-level multiplexers select the register values read from the ID stage:
- Op1 = Reg1 = `0x0000007f` (value from `x5`)
- Op2 = Reg2 = `0x00000080` (value from `x6`)
- The result (Res = `0x000000ff`) is stored in the EX/MEM pipeline register for use in the next stage.
4. Memory access (MEM)

- Because add is an R-type arithmetic instruction, it does not perform any data memory access. Thus, both MemRead and MemWrite control signals remain inactive.
- The ALU result (`0x000000ff`) is simply forwarded from the EX/MEM register to the MEM/WB register.
5. Register write back (WB)

- In the WB stage, the ALU result calculated in the previous stages is finally written back to the register file.
- The MEM/WB pipeline register holds two potential write-back sources:
1. Data memory output (for load instructions)
2. ALU result (for arithmetic or logical instructions)
- The multiplexer chooses the ALU result (`0x000000ff`) because this is an R-type operation. The selected value is then sent to the register file write port as Wr data.
# Optimize [LeetCode Problem #231.](https://leetcode.com/problems/power-of-two/) using CLZ to Improve Runtime Performance
> ### Description:
> Given an integer `n`, return `true` if it is a power of two. Otherwise, return `false`.
> An integer `n` is a power of two, if there exists an integer `x` such that n == $2^x$.
>
> ### Examples:
> Input: n = 1
> Output: true
>
> Input: n = 16
> Output: true
>
> Input: n = 3
> Output: false
>
> ### Constraints:
> $-2^{31} <= n <= 2^{31} - 1$
## Implementation and Optimization
The problem asks us to determine whether a given number `n` is a power of two.
This means we need to check if there exists an integer `k` such that `n = 2^k`.
In binary form, a power of two has exactly one bit set to 1, and all other bits are 0.
To solve this:
- We can start from `n` and keep dividing it by 2 until it becomes 1.
If at any step we find that `n` is not divisible by 2, then it is not a power of two.
This approach is simple and intuitive but takes **O(log n)** time because it performs one division per bit in the binary representation.
To optimize:
- we can use the **clz (Count Leading Zeros)** instruction to directly locate the most significant 1 bit in `n`.
If `n` is a power of two, then `n` must be equal to `1 << (31 - clz(n))`, meaning there is only one set bit in its binary form.
Using CLZ, we can perform this check in **O(1)** time, without any loops or divisions.
This is a typical bit manipulation problem where understanding binary representation helps simplify the logic and improve performance.
The intuitive approach relies on repetitive arithmetic operations, while the CLZ-based solution leverages hardware-supported bit counting to achieve constant-time execution.
You can find the source code [here](https://github.com/mephen/2025-arch-hw1).
### C code
#### Original
:::spoiler open
```C=
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
/* -------------------- Naive Loop Version -------------------- */
/* Divide by 2 until n == 1; if any remainder appears, return false */
bool isPowerOfTwo_naive(uint32_t n) {
if (n == 0) return false;
while (n % 2 == 0)
n /= 2;
return n == 1;
}
/* -------------------- Test Driver -------------------- */
int main(void) {
uint32_t test_vals[] = {1, 16, 3};
int num_tests = sizeof(test_vals) / sizeof(test_vals[0]);
for (int i = 0; i < num_tests; i++) {
uint32_t n = test_vals[i];
bool ret = isPowerOfTwo_naive(n);
printf("%s\n", ret ? "true" : "false");
}
return 0;
}
```
:::
#### Using CLZ
:::spoiler open
```C=
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
/* -------------------- CLZ Optimized Version -------------------- */
int clz(uint32_t x) {
if (x == 0) return 32; // all bits are 0 → 32 leading zeros
int n = 0;
if ((x >> 16) == 0) { n += 16; x <<= 16; }
if ((x >> 24) == 0) { n += 8; x <<= 8; }
if ((x >> 28) == 0) { n += 4; x <<= 4; }
if ((x >> 30) == 0) { n += 2; x <<= 2; }
if ((x >> 31) == 0) { n += 1; }
return n;
}
/* Use __builtin_clz to find the most significant 1 bit */
bool isPowerOfTwo_clz(uint32_t n) {
if (n == 0) return false;
int msb_pos = 31 - clz(n); // position of highest 1 bit
return n == (1u << msb_pos); // must equal 1 << position
}
/* -------------------- Test Driver -------------------- */
int main(void) {
uint32_t test_vals[] = {1, 16, 3};
int num_tests = sizeof(test_vals) / sizeof(test_vals[0]);
for (int i = 0; i < num_tests; i++) {
uint32_t n = test_vals[i];
bool ret = isPowerOfTwo_clz(n);
printf("%s\n", ret ? "true" : "false");
}
return 0;
}
```
:::
### Assembly code
#### Original
code line: 75
cpu cycle: 189
:::spoiler open
```asm=
.data
test_vals:
.word 1, 16, 3
num_tests:
.word 3
str_true:
.asciz "true\n"
str_false:
.asciz "false\n"
.text
.globl _start
# main()
_start:
la s0, test_vals # base = &test_vals[0]
lw s2, num_tests # num_tests
li s3, 0 # i = 0
main_loop:
bge s3, s2, program_end
mv t1, s0 # t1 = base
mv t2, s3 # t2 = i
addr_add4_loop:
beqz t2, addr_ready
addi t1, t1, 4 # t1 += 4
addi t2, t2, -1
addi t0, t1, 0
j addr_add4_loop
addr_ready:
lw a0, 0(t1) # a0 = test_vals[i]
slli t0, a0, 0
lw a0, 0(t1)
add t0, a0, t0
jal ra, is_pow2_naive # result in a0
beqz a0, print_false
la a0, str_true
li a7, 4
ecall
j next_item
print_false:
la a0, str_false
li a7, 4
ecall
next_item:
addi s3, s3, 1
j main_loop
program_end:
li a7, 10
ecall
# is_pow2_naive
is_pow2_naive:
beqz a0, ipn_ret_false
ipn_loop_check_lsb:
andi t0, a0, 1
beqz t0, ipn_shift_right
li t1, 1
beq a0, t1, ipn_ret_true
j ipn_ret_false
ipn_shift_right:
srli a0, a0, 1
bnez a0, ipn_loop_check_lsb
j ipn_ret_false
ipn_ret_true:
li a0, 1
ret
ipn_ret_false:
li a0, 0
ret
```
:::
#### Using CLZ
code line: 78
cpu cycle: 174
:::spoiler open
```asm=
.data
test_vals:
.word 1, 16, 3
num_tests:
.word 3
str_true:
.asciz "true\n"
str_false:
.asciz "false\n"
.text
.globl _start
# _start
_start:
la s0, test_vals # s0 = ptr
lw s2, num_tests # s2 = remaining
la s4, str_true # s4 = "true\n"
la s5, str_false # s5 = "false\n"
li a7, 4 # print string
main_loop:
beqz s2, program_end
lw a0, 0(s0) # a0 = n
jal ra, is_power_of_two_clz
beqz a0, do_print_false
mv a0, s4
ecall
j next_item
do_print_false:
mv a0, s5
ecall
next_item:
addi s0, s0, 4
addi s2, s2, -1
j main_loop
program_end:
li a7, 10
ecall
# is_power_of_two_clz
is_power_of_two_clz:
beqz a0, isp_false # n==0 → false
mv t0, a0 # t0 = origin
li t1, 0 # t1 = clz count
mv t2, a0 # t2 = working x
# chk 16
srli t3, t2, 16
bnez t3, clz_c8
addi t1, t1, 16
slli t2, t2, 16
clz_c8:
srli t3, t2, 24
bnez t3, clz_c4
addi t1, t1, 8
slli t2, t2, 8
clz_c4:
srli t3, t2, 28
bnez t3, clz_c2
addi t1, t1, 4
slli t2, t2, 4
clz_c2:
srli t3, t2, 30
bnez t3, clz_c1
addi t1, t1, 2
slli t2, t2, 2
clz_c1:
srli t3, t2, 31
bnez t3, clz_cdone
addi t1, t1, 1
clz_cdone:
li t4, 31
sub t4, t4, t1 # t4 = msb_pos
li t3, 1
sll t3, t3, t4 # t3 = 1 << msb_pos
beq t0, t3, isp_true
isp_false:
li a0, 0
ret
isp_true:
li a0, 1
ret
```
:::
### Analysis
- From the view of c code:
- time complexity: **O(log n)** -> **O(1)**
- Reason: use the clz (Count Leading Zeros) instruction to directly locate the most significant 1 bit in n.
- From the view of assembly code:
- cpu cycles: **189** -> **174**
- Reason:
- **Fewer branches**: The CLZ version uses 5 fixed conditional checks; the original version loops up to 32 times with multiple branches per loop and causes more branch penalties
- **Lower branch misprediction cost**: Short, predictable control flow reduces branch flush cycles compared to the repeated loop branches in the original version.
- **Fewer data hazards**: CLZ uses shifts and logic only; original version repeatedly loads and uses results immediately, triggering load-use stalls.
- **Better pipeline utilization**: With fewer branches and no tight data dependencies, the CLZ version keeps IF–ID–EX–MEM–WB stages filled more consistently.
# Reference
- This assignment was completed with assistance from [ChatGPT] for [grammar checking, initial brainstorming, and idea refinement]. All final analysis and conclusions are my own.
- [How to Write a Git Commit Message](https://cbea.ms/git-commit/)
- [Assignment 1 Example: RISC-V Assembly and Instruction Pipeline](https://hackmd.io/@sysprog/2025-arch-homework1)
- [Jim's Dev Blog: RISC-V 指令集架構介紹](https://tclin914.github.io/3ad6268d/)
- [RISC-V Reference Card](https://www.cl.cam.ac.uk/teaching/1617/ECAD+Arch/files/docs/RISCVGreenCardv8-20151013.pdf)
- [Computer Architecture (2025 Fall): Guidelines for Student Use of AI Tools](https://hackmd.io/@sysprog/arch2025-ai-guidelines)