# Assignment1: RISC-V Assembly and Instruction Pipeline ## Quiz1 Problem B ### c code #### bf16 to fp32 ``` c static inline float bf16_to_fp32(bf16_t h) { union { float f; uint32_t i; } u = {.i = (uint32_t)h.bits << 16}; return u.f; } ``` #### fp32 to bf16 ``` c static inline bf16_t fp32_to_bf16(float s) { bf16_t h; union { float f; uint32_t i; } u = {.f = s}; if ((u.i & 0x7fffffff) > 0x7f800000) { /* NaN */ h.bits = (u.i >> 16) | 64; /* force to quiet */ return h; } h.bits = (u.i + (0x7fff + ((u.i >> 0x10) & 1))) >> 0x10; return h; } ``` :::danger Don't paste code snip without comprehensive discussions. ::: ### RISC-V Assembly Code #### bf16 to fp32 :::danger Always write in English! ::: ```c .text .globl main .data input_data: .half 0x3f80, 0xc120, 0x7fc0 # 測試資料 1,-10,NaN answer: .word 0x3f800000, 0xc1200000, 0x7fc00000 # 正確答案 newline: .string "\n" # 換行符號 correct: .string "the conversion is right\n" # 正確 wrong: .string "the conversion is wrong\n" # 錯誤 .text main: la t0, input_data # 載入 input_data 的位址 la t1, answer # 載入 answer 的位址 li t2, 3 # 設定迴圈次數 li a0, 1 loop: lh t3, 0(t0) jal ra, bf16_to_fp32 lw t5, 0(t1) beq t4, t5, correct_output # 如果 t4 == t5, 則跳到 correct_output # 如果轉換錯誤,輸出錯誤訊息 la a0, wrong li a7, 4 ecall j continue_loop # 繼續下一個迴圈 correct_output: la a0, correct li a7, 4 ecall continue_loop: # 輸出換行符號 la a0, newline li a7, 4 ecall addi t0, t0, 2 # 移到下一個 16-bit addi t1, t1, 4 # 移到下一個 32-bit addi t2, t2, -1 # 迴圈減 1 bnez t2, loop # 若 t2 != 0 繼續 # 結束程式 li a7, 10 ecall bf16_to_fp32: slli t4, t3, 16 ret ``` #### fp32 to bf16 ```c .text .globl main .data input_data: .word 0x411FFFFF, 0xBF800000, 0x7FC00000 # 測試資料 9.999999,-1,NaN answer: .word 0x4120, 0xBF80, 0x7FC0 # 正確答案 newline: .string "\n" # 換行符號 correct: .string "the conversion is right\n" # 正確輸出 wrong: .string "the conversion is wrong\n" # 錯誤輸出 .text main: la t0, input_data # 載入 input_data 的地址 la t1, answer # 載入 answer 的地址 li t2, 3 # 設定迴圈次數 (3個fp32資料) li a0, 1 loop: lw t3, 0(t0) jal ra, fp32_to_bf16 # 呼叫 fp32_to_bf16 函數轉換 t3 為 bf16, 結果存入 a1 back: lw t4, 0(t1) # 從 t1 地址讀取正確的 16-bit (bf16) 答案 beq a1, t4, correct_output # 如果 a1 == t4, 則跳到 correct_output # 如果轉換錯誤,輸出錯誤訊息 la a0, wrong li a7, 4 ecall j continue_loop # 繼續下一個迴圈 correct_output: la a0, correct li a7, 4 ecall continue_loop: # 輸出換行符號 la a0, newline li a7, 4 ecall addi t0, t0, 4 # 移到下一個 32-bit addi t1, t1, 4 addi t2, t2, -1 # 迴圈減 1 bnez t2, loop # 若 t2 != 0 繼續 # 結束程式 li a7, 10 ecall fp32_to_bf16: li s3, 0 li s0, 0x7fffffff and s1, t3, s0 li s2, 0x7f800000 slt s3, s1, s2 #檢查是否為NaN beq s3, x0, NaN jal ra, NotNaN NotNaN: srli s2, t3, 16 andi s2, s2, 1 # 取得最低有效位元 li s3, 0x00007fff # 處理四捨五入 add s2, s2, s3 add t3, t3, s2 srli a1, t3, 16 # 取高 16 位作為 bfloat16 j back NaN: srli t3, t3, 16 li s3, 0x0040 or a1, t3, s3 # NaN處理 j back ``` :::danger Use fewer instructions. ::: ## Use-Case from Leet code ### Problem 70. Climbing Stairs In this problem, one is tasked with determining the number of distinct ways to reach the top of a staircase with n steps, given that each time one can climb either 1 or 2 steps. This problem can be directly mapped to the Fibonacci sequence, where the number of ways to reach the n-th step is equivalent to the sum of the ways to reach the previous two steps. #### Motivation This structure naturally lends itself to a dynamic programming solution, as we can store and reuse previously computed results to avoid redundant calculations and optimize performance. DP relies heavily on storing intermediate values, which can become memory-intensive when the number of steps grows large. To optimize this, I decided to use bf16 to reduce memory consumption ### Solution in C Code ``` c int climbStairs(int n) { if(n==1)return 1; if(n==2)return 2; int* arr=(int*)malloc((n)*sizeof(int)); arr[0]=1; arr[1]=2; int i; for(i=2;i<n;i++){ arr[i]=arr[i-1]+arr[i-2]; } return arr[n-1]; } ``` ### RISC-V Version without BF16 :::danger Always write in English! ::: ```c .data fibarray: .word 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0 # 宣告一個包含5個元素的整數陣列,初始值為0 n:.word 14 str: .string "輸出n為7,10,14的結果\n" # 換行符號 newline: .string "\n" # 換行符號 .text .globl main main: #先把array位置載入register la t0,fibarray #把陣列前兩個值放入記憶體中 li t1,1 li t2,2 sw t1, 0(t0) sw t2, 4(t0) addi t3,t0,8 #紀錄陣列的位置 fibarray+8 #取得n的值 la t4,n lw t5,0(t4) #初始化i的值記錄迴圈次數 li t6,2 loop: bge t6,t5, End #計算fib(n-1)+fib(n-2) lw t1,-4(t3) lw t2,-8(t3) add a0,t1,t2 #儲存計算結果 sw a0,0(t3) #i++與調整陣列位置 addi t6,t6,1 addi t3,t3,4 j loop End: # 輸出 結果 la t0, fibarray # 重新載入 fibarray 基址 la a0, str li a7, 4 ecall #調整陣列位置來輸出答案 lw a0, 24(t0) li a7, 1 ecall la a0, newline li a7, 4 ecall lw a0, 36(t0) li a7, 1 ecall la a0, newline li a7, 4 ecall lw a0, 52(t0) li a7, 1 ecall li a7, 10 ecall ``` ### RISC-V Version with BF16 ```c .data fibarray: .half 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0 # n:.word 14 .text .globl main main: #先把array位置載入register la t0,fibarray #把陣列前兩個值放入記憶體中 li t1,0x3F80 li t2,0x4000 sh t1, 0(t0) sh t2, 2(t0) addi t3,t0,4 #紀錄陣列的位置 fibarray+4 #取得n的值 la t4,n lw t5,0(t4) #初始化i的值記錄迴圈次數 li t6,2 loop: bge t6,t5, End #計算fib(n-1)+fib(n-2) lh t1,-2(t3) lh t2,-4(t3) j bf16_add #add a0,t1,t2 #儲存計算結果 after_add: sw a0,0(t3) #i++與調整陣列位置 addi t6,t6,1 addi t3,t3,2 j loop End: li a7, 10 # 設定 a7 為 10 (系統呼叫 10 用來結束程式) ecall # 呼叫系統結束程式 bf16_add: # 取出指數和尾數 slli s4, t1, 17 srli s4, s4, 24 slli s5, t2, 17 srli s5, s5, 24 li s6, 0x007F # 得到尾數的 mask and s7, t1, s6 and s8, t2, s6 # 加入隱含的 1 位 li s9, 0x80 or s7, s7, s9 # 尾數 a 加入隱含位 or s8, s8, s9 # 尾數 b 加入隱含位 # 比較指數大小並對齊 bge s4, s5, align_exp0 mv s9, t1 # a 和 b 交換,確保 t1(a) 的指數較大 mv t1, t2 mv t2, s9 mv s9, s4 # 交換指數 mv s4, s5 mv s5, s9 mv s9, s7 # 交換尾數 mv s7, s8 mv s8, s9 align_exp0: sub s11, s4, s5 # 計算指數差 srl s8, s8, s11 # 將尾數右移,對齊指數 # 尾數相加 add s7, s7, s8 li s11, 0x100 bge s7, s11, carry # 尾數 > 0x100 發生進位 j finish_add carry: # 處理尾數進位 addi s4, s4, 1 # 指數 + 1 srli s7, s7, 1 # 尾數右移 1 j finish_add finish_add: slli s9, s4, 7 slli s7, s7, 25 srli s7, s7, 25 or a0, s9, s7 j after_add ``` ### Analysis #### The memory usage without bf16 ![image](https://hackmd.io/_uploads/BJakSwcykg.png) #### The memory usage with bf16 ![image](https://hackmd.io/_uploads/BJv2Vw5yJe.png) Using bfloat16 (bf16) can significantly reduce memory requirements. This results in a 50% reduction in memory usage. #### Execution info without bf16 ![image](https://hackmd.io/_uploads/SJ5QID9k1e.png) #### Execution info with bf16 ![image](https://hackmd.io/_uploads/BJThSPcyJe.png) However, there are some the additional overhead required for working with bf16. Since most hardware does not natively support bf16 arithmetic, it becomes necessary to implement custom floating-point addition and conversion operations. This will increase the number of cycles required to execute the program, leading to a longer runtime. This trade-off between reduced memory consumption and increased cycle count needs to be carefully considered when optimizing algorithms for performance and efficiency.