# Assignment1: RISC-V Assembly and Instruction Pipeline
contributed by < [`郭晏愷`](https://github.com/kdnvt/Palindrome-linked-list) >
## Problem B in Quiz 1
The bfloat16 format is a 16-bit floating-point representation, designed to provide a wide dynamic range by using a floating radix point. It is a shortened version of the 32-bit IEEE 754 single-precision format (binary32), aimed at accelerating machine learning.
### C Code
```c
typedef struct {
uint16_t bits;
} bf16_t;
static inline float bf16_to_fp32(bf16_t h)
{
union {
float f;
uint32_t i;
} u = {.i = (uint32_t)h.bits << 16};
return u.f;
}
static inline bf16_t fp32_to_bf16(float s)
{
bf16_t h;
union {
float f;
uint32_t i;
} u = {.f = s};
if ((u.i & 0x7fffffff) > 0x7f800000) { /* NaN */
h.bits = (u.i >> 16) | 64; /* force to quiet */
return h;
}
h.bits = (u.i + (0x7fff + ((u.i >> 0x10) & 1))) >> 0x10;
return h;
}
int main(){
float test_data[3] = {-1.5f, 123.456f, -0.0f};
bf16_t expected_bf16[3] = {0xBFC0, 0x42F7, 0x8000};
uint32_t expected_fp32[3] = {0xBFC00000, 0x42F70000, 0x80000000};
int num = sizeof(test_data) / sizeof(test_data[0]);
for (int i = 0; i < num; i++) {
bf16_t bf16_value = fp32_to_bf16(test_data[i]);
if (bf16_value.bits == expected_bf16[i].bits) {
printf("BF16 conversion for test_data[%d] is correct\n", i);
} else {
printf("BF16 conversion for test_data[%d] is incorrect\n", i);
}
float recovered_fp32 = bf16_to_fp32(bf16_value);
if (*(uint32_t *)&recovered_fp32 == expected_fp32[i]) {
printf("FP32 recovery for test_data[%d] is correct\n", i);
} else {
printf("FP32 recovery for test_data[%d] is incorrect\n", i);
}
}
return 0;
}
```
### Assembly Code
```asm
.data
test_data:
.word 0xBFC00000 # -1.5 (float)
.word 0x42F6E979 # 123.456 (float)
.word 0x80000000 # -0.0 (float)
expected_bf16:
.half 0xBFC0 # Expected bfloat16 values
.half 0x42F7
.half 0x8000
expected_fp32:
.word 0xBFC00000 # Expected float32 recovered values
.word 0x42F70000
.word 0x80000000
bf16correct:
.string "bf16_Correct\n"
fp32correct:
.string "fp32_Correct\n"
bf16incorrect:
.string "bf16_Incorrect\n"
fp32incorrect:
.string "fp32_Incorrect\n"
.text
.global main
main:
# Initialize loop counter
la s0, test_data
la s1, expected_bf16
la s2, expected_fp32
li s3, 0 # i = 0
li s4, 3 # num = 3 (size of the test_data array)
li s5, 0xffff0000
loop:
bge s3, s4, end_loop # if i >= num, exit loop
lw a0, 0(s0) # Load float value (FP32) into a0
lh a1, 0(s1)
sub a1, a1, s5
lw a2, 0(s2)
jal ra, fp32_to_bf16 # Call function
beq a0, a1, bf16_correct
# Load test_data[i+1]
addi s0, s0, 4 # Address of test_data[i]
addi s1, s1, 2 # Address of expected_bf16[i]
addi s2, s2, 4 # Address of expected_fp32[i]
bne a0, a1, print_bf16_incorrect
j next_iteration
fp32_to_bf16:
li t0, 0x7fffffff
li t6, 0
and t1, t6, t0
li t0, 0x7f800000
bgt t1, t0, handle_nan
_1:
li t0, 0x7fff
srli t1, a0, 16
andi t1, t1, 1
add t1, t1, t0
add t1, t1, a0
srli t1, t1, 16
mv a0, t1
ret
handle_nan:
srli t1, a0, 16
ori t1, t1, 64
mv a0, t1
j _1
bf16_correct:
jal ra, print_bf16_correct
jal ra, bf16_to_fp32
beq a0, a2, fp32_correct
bne a0, a2, print_fp32_incorrect
j next_iteration
bf16_to_fp32:
# a0 contains the input bf16 (h.bits)
addi a0, a1, 0
slli t0, a0, 16
mv a0, t0
ret
fp32_correct:
la a0, fp32correct
li a7, 4
ecall
j next_iteration
next_iteration:
addi s3, s3, 1 # i++
j loop # Go to the next iteration
end_loop:
li a7, 10
li a0, 0
ecall
# Print routines (implement later)
print_bf16_correct:
la a0, bf16correct
li a7, 4
ecall
ret
print_bf16_incorrect:
la a0, bf16incorrect
li a7, 4
ecall
j next_iteration
print_fp32_incorrect:
la a0, fp32incorrect
li a7, 4
ecall
j next_iteration
```
## Add Binary ([LeetCode 67](https://leetcode.com/problems/add-binary/description/))
Description:
>Given two binary strings a and b, return their sum as a binary string.
Example 1:
> Input: a = "11", b = "1"
> Output: "100"
Example 2:
> Input: a = "1010", b = "1011"
> Output: "10101"
Constraints:
> 1 <= a.length, b.length <= 104
> a and b consist only of '0' or '1' characters.
> Each string does not contain leading zeros except for the zero itself.
## Solution
### C Code
```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
char* addBinary(char* a, char* b) {
int la = strlen(a), lb = strlen(b);
int lc = la > lb ? la : lb;
char* c = (char*)malloc(lc+1);
c[lc] = '\0';
int i, carry = 0;
for(i = 0 ; i < lc ; i++){
int bit_a = (la - 1 - i >= 0) ?
a[la - 1 - i] - '0' : 0;
int bit_b = (lb - 1 - i >= 0) ?
b[lb - 1 - i] - '0' : 0;
int sum = bit_a + bit_b + carry;
carry = sum / 2;
c[lc - 1 - i] = (sum % 2) + '0';
}
if(carry == 1){
c = (char*)realloc(c,lc+2);
for(i = lc ; i >= 0 ; i--){
c[i+1] = c[i];
}
c[0] = '1';
}
return c;
}
int main() {
char a[] = "11";
char b[] = "1";
char* result = addBinary(a, b);
printf("Result of %s +%s = %s\n", a, b, result);
free(result);
return 0;
}
```
### Assembly Code
```asm
.data
a_str:
.string "11"
b_str:
.string "1"
len_a:
.word 0
len_b:
.word 0
len_c:
.word 0
carry:
.word 0
i:
.word 0
temp_a:
.word 0
temp_b:
.word 0
.text
.global main
main:
# Cal length of a and b
la t0, a_str # Load address a_str
jal ra, strlen # Calculate the length of a_str
mv s0 t0 # Move t0 to s0
la s1, len_a # Load address of len_a
sw a0, 0(s1) # Store word of len_a(s1)
la t0, b_str # Load address b_str to t0
jal ra, strlen # Calculate the length of b_str
la s1, len_b # Load address of len_b
sw a0, 0(s2) # Store word of len_b(s2)
# len_c = max(len_a, len_b)
blt s1, s2, set_len_b # If len_a < len_b, jump to set_len_b
j set_len_c
set_len_b:
mv s3, s2 # len_c = len_b
set_len_c:
mv s3, s1 # len_c = len_a
# Initial for loop
la t1, i # i(t1)
la t2, carry # carry(t2)
li t1, 0 # i = 0
li t2, 0 # carry = 0
loop:
bge t1, s3, end_loop # If i >= len_c, exit loop
# 計算 a 的位元值
li t3, 1
la t4, temp_a
sub t4, s1, t1 # t4 = len_a - i
sub t4, t4, t3 # t4 = t4 - 1
blt t4, x0, skip_a_bit # If t4 < 0, skip_a_bit
lb t5, 0(s0) # Get ASCII of a_str[t4]
li t3, 48
sub t5, t5, t3 # Transfer ASCII to 0 or 1
sw t5, 0(t4) # Refresh temp_a
j process_b
skip_a_bit:
li t5, 0 # If i >= len_c, exit loop
sw t5, 0(t4) # Refresh temp_a
process_b:
# 計算 b 的位元值
la t6, temp_b # Load address temp_b to t6
sub t6, s2, t1 # t6 = len_b - i
sub t6, t6, t3 # t6 = t6 - 1
blt t6, x0, skip_b_bit # If t6 < 0, skip_b_bit
lb a1, 0(t0) # Get ASCII of b_str[t6]
sub a1, a1, t3 # Transfer ASCII to 0 or 1
sw a1, 0(t6) # Refresh temp_b
j process_sum
skip_b_bit:
li a1, 0 # 如果超出範圍,則設置 bit_b = 0
sw a1, 0(t6) # 暫存 b 的位元
process_sum:
# 加法並處理進位
add a3, t4, t6 # sum = bit_a + bit_b
add a3, a3, t2 # sum = sum + carry
li t3, 2
div t2, a3, t3 # carry = sum / 2
rem a3, a3, t3 # sum = sum % 2
addi a3, a3, 48 # 將數字轉回 ASCII
# 更新迴圈索引 i
addi t1, t1, 1
j loop # 繼續迴圈
end_loop:
# 如果有進位,則需要擴展結果字符串
beqz t2, end_program # 如果沒有進位,直接結束
li a4, 49 # '1' 的 ASCII
sb a4, 0(a3) # 將 '1' 插入結果字串的最前面
end_program:
# 結束程式
mv a0, a3
li a7, 10 # 系統調用退出
ecall
# strlen 函數
strlen:
mv t1, t0 # 載入字串地址
li t2, 0 # 初始化長度計數
strlen_loop:
lb t3, 0(t1) # 讀取字元
beq t3, x0, strlen_done # 如果字元為 '\0',結束
addi t2, t2, 1 # 長度 +1
addi t1, t1, 1 # 指針向後移動
j strlen_loop
strlen_done:
mv a0, t2 # 返回長度
ret
```