Assignment1: RISC-V Assembly and Instruction Pipeline

# Assignment1: RISC-V Assembly and Instruction Pipeline contributed by < [`郭晏愷`](https://github.com/kdnvt/Palindrome-linked-list) > ## Problem B in Quiz 1 The bfloat16 format is a 16-bit floating-point representation, designed to provide a wide dynamic range by using a floating radix point. It is a shortened version of the 32-bit IEEE 754 single-precision format (binary32), aimed at accelerating machine learning. ### C Code ```c typedef struct { uint16_t bits; } bf16_t; static inline float bf16_to_fp32(bf16_t h) { union { float f; uint32_t i; } u = {.i = (uint32_t)h.bits << 16}; return u.f; } static inline bf16_t fp32_to_bf16(float s) { bf16_t h; union { float f; uint32_t i; } u = {.f = s}; if ((u.i & 0x7fffffff) > 0x7f800000) { /* NaN */ h.bits = (u.i >> 16) | 64; /* force to quiet */ return h; } h.bits = (u.i + (0x7fff + ((u.i >> 0x10) & 1))) >> 0x10; return h; } int main(){ float test_data[3] = {-1.5f, 123.456f, -0.0f}; bf16_t expected_bf16[3] = {0xBFC0, 0x42F7, 0x8000}; uint32_t expected_fp32[3] = {0xBFC00000, 0x42F70000, 0x80000000}; int num = sizeof(test_data) / sizeof(test_data[0]); for (int i = 0; i < num; i++) { bf16_t bf16_value = fp32_to_bf16(test_data[i]); if (bf16_value.bits == expected_bf16[i].bits) { printf("BF16 conversion for test_data[%d] is correct\n", i); } else { printf("BF16 conversion for test_data[%d] is incorrect\n", i); } float recovered_fp32 = bf16_to_fp32(bf16_value); if (*(uint32_t *)&recovered_fp32 == expected_fp32[i]) { printf("FP32 recovery for test_data[%d] is correct\n", i); } else { printf("FP32 recovery for test_data[%d] is incorrect\n", i); } } return 0; } ``` ### Assembly Code ```asm .data test_data: .word 0xBFC00000 # -1.5 (float) .word 0x42F6E979 # 123.456 (float) .word 0x80000000 # -0.0 (float) expected_bf16: .half 0xBFC0 # Expected bfloat16 values .half 0x42F7 .half 0x8000 expected_fp32: .word 0xBFC00000 # Expected float32 recovered values .word 0x42F70000 .word 0x80000000 bf16correct: .string "bf16_Correct\n" fp32correct: .string "fp32_Correct\n" bf16incorrect: .string "bf16_Incorrect\n" fp32incorrect: .string "fp32_Incorrect\n" .text .global main main: # Initialize loop counter la s0, test_data la s1, expected_bf16 la s2, expected_fp32 li s3, 0 # i = 0 li s4, 3 # num = 3 (size of the test_data array) li s5, 0xffff0000 loop: bge s3, s4, end_loop # if i >= num, exit loop lw a0, 0(s0) # Load float value (FP32) into a0 lh a1, 0(s1) sub a1, a1, s5 lw a2, 0(s2) jal ra, fp32_to_bf16 # Call function beq a0, a1, bf16_correct # Load test_data[i+1] addi s0, s0, 4 # Address of test_data[i] addi s1, s1, 2 # Address of expected_bf16[i] addi s2, s2, 4 # Address of expected_fp32[i] bne a0, a1, print_bf16_incorrect j next_iteration fp32_to_bf16: li t0, 0x7fffffff li t6, 0 and t1, t6, t0 li t0, 0x7f800000 bgt t1, t0, handle_nan _1: li t0, 0x7fff srli t1, a0, 16 andi t1, t1, 1 add t1, t1, t0 add t1, t1, a0 srli t1, t1, 16 mv a0, t1 ret handle_nan: srli t1, a0, 16 ori t1, t1, 64 mv a0, t1 j _1 bf16_correct: jal ra, print_bf16_correct jal ra, bf16_to_fp32 beq a0, a2, fp32_correct bne a0, a2, print_fp32_incorrect j next_iteration bf16_to_fp32: # a0 contains the input bf16 (h.bits) addi a0, a1, 0 slli t0, a0, 16 mv a0, t0 ret fp32_correct: la a0, fp32correct li a7, 4 ecall j next_iteration next_iteration: addi s3, s3, 1 # i++ j loop # Go to the next iteration end_loop: li a7, 10 li a0, 0 ecall # Print routines (implement later) print_bf16_correct: la a0, bf16correct li a7, 4 ecall ret print_bf16_incorrect: la a0, bf16incorrect li a7, 4 ecall j next_iteration print_fp32_incorrect: la a0, fp32incorrect li a7, 4 ecall j next_iteration ``` ## Add Binary ([LeetCode 67](https://leetcode.com/problems/add-binary/description/)) Description: >Given two binary strings a and b, return their sum as a binary string. Example 1: > Input: a = "11", b = "1" > Output: "100" Example 2: > Input: a = "1010", b = "1011" > Output: "10101" Constraints: > 1 <= a.length, b.length <= 104 > a and b consist only of '0' or '1' characters. > Each string does not contain leading zeros except for the zero itself. ## Solution ### C Code ```c #include <stdio.h> #include <stdlib.h> #include <string.h> char* addBinary(char* a, char* b) { int la = strlen(a), lb = strlen(b); int lc = la > lb ? la : lb; char* c = (char*)malloc(lc+1); c[lc] = '\0'; int i, carry = 0; for(i = 0 ; i < lc ; i++){ int bit_a = (la - 1 - i >= 0) ? a[la - 1 - i] - '0' : 0; int bit_b = (lb - 1 - i >= 0) ? b[lb - 1 - i] - '0' : 0; int sum = bit_a + bit_b + carry; carry = sum / 2; c[lc - 1 - i] = (sum % 2) + '0'; } if(carry == 1){ c = (char*)realloc(c,lc+2); for(i = lc ; i >= 0 ; i--){ c[i+1] = c[i]; } c[0] = '1'; } return c; } int main() { char a[] = "11"; char b[] = "1"; char* result = addBinary(a, b); printf("Result of %s +%s = %s\n", a, b, result); free(result); return 0; } ``` ### Assembly Code ```asm .data a_str: .string "11" b_str: .string "1" len_a: .word 0 len_b: .word 0 len_c: .word 0 carry: .word 0 i: .word 0 temp_a: .word 0 temp_b: .word 0 .text .global main main: # Cal length of a and b la t0, a_str # Load address a_str jal ra, strlen # Calculate the length of a_str mv s0 t0 # Move t0 to s0 la s1, len_a # Load address of len_a sw a0, 0(s1) # Store word of len_a(s1) la t0, b_str # Load address b_str to t0 jal ra, strlen # Calculate the length of b_str la s1, len_b # Load address of len_b sw a0, 0(s2) # Store word of len_b(s2) # len_c = max(len_a, len_b) blt s1, s2, set_len_b # If len_a < len_b, jump to set_len_b j set_len_c set_len_b: mv s3, s2 # len_c = len_b set_len_c: mv s3, s1 # len_c = len_a # Initial for loop la t1, i # i(t1) la t2, carry # carry(t2) li t1, 0 # i = 0 li t2, 0 # carry = 0 loop: bge t1, s3, end_loop # If i >= len_c, exit loop # 計算 a 的位元值 li t3, 1 la t4, temp_a sub t4, s1, t1 # t4 = len_a - i sub t4, t4, t3 # t4 = t4 - 1 blt t4, x0, skip_a_bit # If t4 < 0, skip_a_bit lb t5, 0(s0) # Get ASCII of a_str[t4] li t3, 48 sub t5, t5, t3 # Transfer ASCII to 0 or 1 sw t5, 0(t4) # Refresh temp_a j process_b skip_a_bit: li t5, 0 # If i >= len_c, exit loop sw t5, 0(t4) # Refresh temp_a process_b: # 計算 b 的位元值 la t6, temp_b # Load address temp_b to t6 sub t6, s2, t1 # t6 = len_b - i sub t6, t6, t3 # t6 = t6 - 1 blt t6, x0, skip_b_bit # If t6 < 0, skip_b_bit lb a1, 0(t0) # Get ASCII of b_str[t6] sub a1, a1, t3 # Transfer ASCII to 0 or 1 sw a1, 0(t6) # Refresh temp_b j process_sum skip_b_bit: li a1, 0 # 如果超出範圍，則設置 bit_b = 0 sw a1, 0(t6) # 暫存 b 的位元 process_sum: # 加法並處理進位 add a3, t4, t6 # sum = bit_a + bit_b add a3, a3, t2 # sum = sum + carry li t3, 2 div t2, a3, t3 # carry = sum / 2 rem a3, a3, t3 # sum = sum % 2 addi a3, a3, 48 # 將數字轉回 ASCII # 更新迴圈索引 i addi t1, t1, 1 j loop # 繼續迴圈 end_loop: # 如果有進位，則需要擴展結果字符串 beqz t2, end_program # 如果沒有進位，直接結束 li a4, 49 # '1' 的 ASCII sb a4, 0(a3) # 將 '1' 插入結果字串的最前面 end_program: # 結束程式 mv a0, a3 li a7, 10 # 系統調用退出 ecall # strlen 函數 strlen: mv t1, t0 # 載入字串地址 li t2, 0 # 初始化長度計數 strlen_loop: lb t3, 0(t1) # 讀取字元 beq t3, x0, strlen_done # 如果字元為 '\0'，結束 addi t2, t2, 1 # 長度 +1 addi t1, t1, 1 # 指針向後移動 j strlen_loop strlen_done: mv a0, t2 # 返回長度 ret ```