# Assignment1: RISC-V Assembly and Instruction Pipeline
## Quiz1 Problem B
### c code
#### bf16 to fp32
``` c
static inline float bf16_to_fp32(bf16_t h)
{
union {
float f;
uint32_t i;
} u = {.i = (uint32_t)h.bits << 16};
return u.f;
}
```
#### fp32 to bf16
``` c
static inline bf16_t fp32_to_bf16(float s)
{
bf16_t h;
union {
float f;
uint32_t i;
} u = {.f = s};
if ((u.i & 0x7fffffff) > 0x7f800000) { /* NaN */
h.bits = (u.i >> 16) | 64; /* force to quiet */
return h;
}
h.bits = (u.i + (0x7fff + ((u.i >> 0x10) & 1))) >> 0x10;
return h;
}
```
:::danger
Don't paste code snip without comprehensive discussions.
:::
### RISC-V Assembly Code
#### bf16 to fp32
:::danger
Always write in English!
:::
```c
.text
.globl main
.data
input_data:
.half 0x3f80, 0xc120, 0x7fc0
# 測試資料 1,-10,NaN
answer:
.word 0x3f800000, 0xc1200000, 0x7fc00000 # 正確答案
newline:
.string "\n" # 換行符號
correct:
.string "the conversion is right\n" # 正確
wrong:
.string "the conversion is wrong\n" # 錯誤
.text
main:
la t0, input_data # 載入 input_data 的位址
la t1, answer # 載入 answer 的位址
li t2, 3 # 設定迴圈次數
li a0, 1
loop:
lh t3, 0(t0)
jal ra, bf16_to_fp32
lw t5, 0(t1)
beq t4, t5, correct_output # 如果 t4 == t5, 則跳到 correct_output
# 如果轉換錯誤,輸出錯誤訊息
la a0, wrong
li a7, 4
ecall
j continue_loop # 繼續下一個迴圈
correct_output:
la a0, correct
li a7, 4
ecall
continue_loop:
# 輸出換行符號
la a0, newline
li a7, 4
ecall
addi t0, t0, 2 # 移到下一個 16-bit
addi t1, t1, 4 # 移到下一個 32-bit
addi t2, t2, -1 # 迴圈減 1
bnez t2, loop # 若 t2 != 0 繼續
# 結束程式
li a7, 10
ecall
bf16_to_fp32:
slli t4, t3, 16
ret
```
#### fp32 to bf16
```c
.text
.globl main
.data
input_data:
.word 0x411FFFFF, 0xBF800000, 0x7FC00000 # 測試資料 9.999999,-1,NaN
answer:
.word 0x4120, 0xBF80, 0x7FC0 # 正確答案
newline:
.string "\n" # 換行符號
correct:
.string "the conversion is right\n" # 正確輸出
wrong:
.string "the conversion is wrong\n" # 錯誤輸出
.text
main:
la t0, input_data # 載入 input_data 的地址
la t1, answer # 載入 answer 的地址
li t2, 3 # 設定迴圈次數 (3個fp32資料)
li a0, 1
loop:
lw t3, 0(t0)
jal ra, fp32_to_bf16 # 呼叫 fp32_to_bf16 函數轉換 t3 為 bf16, 結果存入 a1
back:
lw t4, 0(t1) # 從 t1 地址讀取正確的 16-bit (bf16) 答案
beq a1, t4, correct_output # 如果 a1 == t4, 則跳到 correct_output
# 如果轉換錯誤,輸出錯誤訊息
la a0, wrong
li a7, 4
ecall
j continue_loop # 繼續下一個迴圈
correct_output:
la a0, correct
li a7, 4
ecall
continue_loop:
# 輸出換行符號
la a0, newline
li a7, 4
ecall
addi t0, t0, 4 # 移到下一個 32-bit
addi t1, t1, 4
addi t2, t2, -1 # 迴圈減 1
bnez t2, loop # 若 t2 != 0 繼續
# 結束程式
li a7, 10
ecall
fp32_to_bf16:
li s3, 0
li s0, 0x7fffffff
and s1, t3, s0
li s2, 0x7f800000
slt s3, s1, s2 #檢查是否為NaN
beq s3, x0, NaN
jal ra, NotNaN
NotNaN:
srli s2, t3, 16
andi s2, s2, 1 # 取得最低有效位元
li s3, 0x00007fff # 處理四捨五入
add s2, s2, s3
add t3, t3, s2
srli a1, t3, 16 # 取高 16 位作為 bfloat16
j back
NaN:
srli t3, t3, 16
li s3, 0x0040
or a1, t3, s3 # NaN處理
j back
```
:::danger
Use fewer instructions.
:::
## Use-Case from Leet code
### Problem 70. Climbing Stairs
In this problem, one is tasked with determining the number of distinct ways to reach the top of a staircase with n steps, given that each time one can climb either 1 or 2 steps. This problem can be directly mapped to the Fibonacci sequence, where the number of ways to reach the n-th step is equivalent to the sum of the ways to reach the previous two steps.
#### Motivation
This structure naturally lends itself to a dynamic programming solution, as we can store and reuse previously computed results to avoid redundant calculations and optimize performance.
DP relies heavily on storing intermediate values, which can become memory-intensive when the number of steps grows large. To optimize this, I decided to use bf16 to reduce memory consumption
### Solution in C Code
``` c
int climbStairs(int n) {
if(n==1)return 1;
if(n==2)return 2;
int* arr=(int*)malloc((n)*sizeof(int));
arr[0]=1;
arr[1]=2;
int i;
for(i=2;i<n;i++){
arr[i]=arr[i-1]+arr[i-2];
}
return arr[n-1];
}
```
### RISC-V Version without BF16
:::danger
Always write in English!
:::
```c
.data
fibarray: .word 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0 # 宣告一個包含5個元素的整數陣列,初始值為0
n:.word 14
str:
.string "輸出n為7,10,14的結果\n" # 換行符號
newline:
.string "\n" # 換行符號
.text
.globl main
main:
#先把array位置載入register
la t0,fibarray
#把陣列前兩個值放入記憶體中
li t1,1
li t2,2
sw t1, 0(t0)
sw t2, 4(t0)
addi t3,t0,8 #紀錄陣列的位置 fibarray+8
#取得n的值
la t4,n
lw t5,0(t4)
#初始化i的值記錄迴圈次數
li t6,2
loop:
bge t6,t5, End
#計算fib(n-1)+fib(n-2)
lw t1,-4(t3)
lw t2,-8(t3)
add a0,t1,t2
#儲存計算結果
sw a0,0(t3)
#i++與調整陣列位置
addi t6,t6,1
addi t3,t3,4
j loop
End:
# 輸出 結果
la t0, fibarray # 重新載入 fibarray 基址
la a0, str
li a7, 4
ecall
#調整陣列位置來輸出答案
lw a0, 24(t0)
li a7, 1
ecall
la a0, newline
li a7, 4
ecall
lw a0, 36(t0)
li a7, 1
ecall
la a0, newline
li a7, 4
ecall
lw a0, 52(t0)
li a7, 1
ecall
li a7, 10
ecall
```
### RISC-V Version with BF16
```c
.data
fibarray: .half 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0 #
n:.word 14
.text
.globl main
main:
#先把array位置載入register
la t0,fibarray
#把陣列前兩個值放入記憶體中
li t1,0x3F80
li t2,0x4000
sh t1, 0(t0)
sh t2, 2(t0)
addi t3,t0,4 #紀錄陣列的位置 fibarray+4
#取得n的值
la t4,n
lw t5,0(t4)
#初始化i的值記錄迴圈次數
li t6,2
loop:
bge t6,t5, End
#計算fib(n-1)+fib(n-2)
lh t1,-2(t3)
lh t2,-4(t3)
j bf16_add
#add a0,t1,t2
#儲存計算結果
after_add:
sw a0,0(t3)
#i++與調整陣列位置
addi t6,t6,1
addi t3,t3,2
j loop
End:
li a7, 10 # 設定 a7 為 10 (系統呼叫 10 用來結束程式)
ecall # 呼叫系統結束程式
bf16_add:
# 取出指數和尾數
slli s4, t1, 17
srli s4, s4, 24
slli s5, t2, 17
srli s5, s5, 24
li s6, 0x007F # 得到尾數的 mask
and s7, t1, s6
and s8, t2, s6
# 加入隱含的 1 位
li s9, 0x80
or s7, s7, s9 # 尾數 a 加入隱含位
or s8, s8, s9 # 尾數 b 加入隱含位
# 比較指數大小並對齊
bge s4, s5, align_exp0
mv s9, t1 # a 和 b 交換,確保 t1(a) 的指數較大
mv t1, t2
mv t2, s9
mv s9, s4 # 交換指數
mv s4, s5
mv s5, s9
mv s9, s7 # 交換尾數
mv s7, s8
mv s8, s9
align_exp0:
sub s11, s4, s5 # 計算指數差
srl s8, s8, s11 # 將尾數右移,對齊指數
# 尾數相加
add s7, s7, s8
li s11, 0x100
bge s7, s11, carry # 尾數 > 0x100 發生進位
j finish_add
carry:
# 處理尾數進位
addi s4, s4, 1 # 指數 + 1
srli s7, s7, 1 # 尾數右移 1
j finish_add
finish_add:
slli s9, s4, 7
slli s7, s7, 25
srli s7, s7, 25
or a0, s9, s7
j after_add
```
### Analysis
#### The memory usage without bf16

#### The memory usage with bf16

Using bfloat16 (bf16) can significantly reduce memory requirements. This results in a 50% reduction in memory usage.
#### Execution info without bf16

#### Execution info with bf16

However, there are some the additional overhead required for working with bf16. Since most hardware does not natively support bf16 arithmetic, it becomes necessary to implement custom floating-point addition and conversion operations. This will increase the number of cycles required to execute the program, leading to a longer runtime.
This trade-off between reduced memory consumption and increased cycle count needs to be carefully considered when optimizing algorithms for performance and efficiency.