owned this note
owned this note
Published
Linked with GitHub
# Assignment1: RISC-V Assembly and Instruction Pipeline
contributed by < 侯廷錡 >
## 1. Problem C from [Quiz1](https://hackmd.io/@sysprog/arch2024-quiz1-sol)
### 1.1 Problem Description
The purpose of this code is to convert a 16-bit half-precision floating-point number (fp16) into a 32-bit single-precision floating-point number (fp32) using only bit manipulation. It avoids any floating-point arithmetic operations by directly handling the sign, exponent, and mantissa fields of the floating-point numbers. The code also handles special cases like zero, NaN (Not a Number), and infinity.
### 1.2 C Code Implementation
``` c #include <stdint.h>
static inline int my_clz(uint32_t x) {
int count = 0;
for (int i = 31; i >= 0; --i) {
if (x & (1U << i))
break;
count++;
}
return count;
}
static inline uint32_t fp16_to_fp32(uint16_t h) {
// Shift the 16-bit half-precision floating point number to the upper half of 32-bit
const uint32_t w = (uint32_t) h << 16;
// Isolate the sign bit
const uint32_t sign = w & UINT32_C(0x80000000);
// Extract the exponent and mantissa
const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
// Calculate renormalization shift for denormalized numbers
uint32_t renorm_shift = my_clz(nonsign);
renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0;
// Check if it's NaN or infinity
const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000);
// Check if it's zero
const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31;
// Perform the conversion to fp32
return sign | ((((nonsign << renorm_shift >> 3) +
((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask);
}
```
### 1.3 Translation to RISC-V Assembly
``` .data
num: .word 0xAB00 # Half-precision float (16-bit value) for testing
.text
.globl main
main:
lw a0, num # Load num (16-bit half-precision float)
jal ra, fp16_to_fp32 # Call fp16_to_fp32 to convert to single precision
mv a1, a0 # Move result to a1 for printing
jal ra, print_result # Call print to print result
li a7, 93 # Exit the program
ecall # System call to exit
# Function: fp16_to_fp32
fp16_to_fp32:
# Shift the 16-bit half-precision float to upper half of 32-bit word
mv t0, a0 # h = num
slli t0, t0, 16 # w = h << 16
# Isolate the sign bit
li t1, 0x80000000 # Load mask to isolate sign bit
and t1, t0, t1 # t1 = sign = w & 0x80000000
# Extract the mantissa and exponent (nonsign)
li t2, 0x7FFFFFFF # Load mask to extract nonsign bits
and t2, t0, t2 # t2 = nonsign = w & 0x7FFFFFFF
# Call my_clz to count leading zeros (renorm_shift)
mv a0, t2 # Pass nonsign as argument to my_clz
jal ra, my_clz # Call my_clz
mv t3, a0 # t3 = renorm_shift
# Adjust renorm_shift: renorm_shift = (renorm_shift > 5) ? renorm_shift - 5 : 0
addi t3, t3, -5 # renorm_shift - 5
blt t3, zero, skip_adjust
li t3, 0 # If renorm_shift < 0, set renorm_shift = 0
skip_adjust:
# Handle NaN and infinity
li t4, 0x04000000 # Prepare mask for NaN/infinity check
add t4, t2, t4 # nonsign + 0x04000000
srli t4, t4, 8 # >> 8
li t5, 0x7F800000 # Mask for inf_nan_mask
and t4, t4, t5 # inf_nan_mask = (nonsign + 0x04000000) & 0x7F800000
# Check if zero
addi t5, t2, -1 # nonsign - 1
srli t5, t5, 31 # zero_mask = (nonsign - 1) >> 31
li t6, 0xFFFFFFFF # Mask to complement zero_mask
xor t5, t5, t6 # ~zero_mask
# Perform the conversion
sll t2, t2, t3 # nonsign << renorm_shift
srli t2, t2, 3 # nonsign >> 3
li t6, 0x70 # Bias difference (0x70)
sub t6, t6, t3 # 0x70 - renorm_shift
slli t6, t6, 23 # (0x70 - renorm_shift) << 23
add t2, t2, t6 # Adjust exponent and mantissa
or t2, t2, t4 # | inf_nan_mask
and t2, t2, t5 # & ~zero_mask
or a0, t1, t2 # | sign
ret # Return result
# Function: my_clz (Count Leading Zeros)
my_clz:
mv t5, a0 # Load input x into t5 (nonsign)
li t0, 0 # count = 0
li t1, 31 # i = 31 (start from the most significant bit)
# Special case: if nonsign == 0, return 32
beq t5, zero, clz_done_zero
clz_loop:
sll t6, t2, t1 # t6 = 1 << i (create mask to check the i-th bit)
and t6, t5, t6 # t6 = x & (1 << i)
bnez t6, clz_done # If bit is 1, break the loop
addi t0, t0, 1 # Increment count (count++)
addi t1, t1, -1 # Decrement i (i--)
bgez t1, clz_loop # Continue loop if i >= 0
clz_done:
mv a0, t0 # Return the count of leading zeros in a0
ret # Return to caller
clz_done_zero:
li a0, 32 # If nonsign == 0, return 32 (all bits are zeros)
ret
# Function: print_result (Syscall to print integer)
print_result:
mv a0, a1 # Move result to a0 for printing
li a7, 1 # Syscall number for print integer
ecall # Make syscall
ret # Return to main
```
### 1.4 Initial RISC-V Assembly Program
Complete RISC-V assembly code for the initial version.
### 1.5 Code Optimization
Iterative improvements in the assembly code.
Strategies used to reduce code size and improve runtime performance.
### 1.6 Test Cases and Validation
Explanation of the predefined test data.
Description of how internal validation is implemented.
### 1.7 Simulation in Ripes
Explanation of how the RISC-V assembly code runs in Ripes.
Visualization and explanation of signals in different stages (IF, ID, IE, MEM, WB).
## 2. LeetCode or Open-Source Problem
### 2.1 Problem Description
You are given two binary strings a and b, representing two non-negative integers. Your task is to add these two binary numbers and return their sum as a binary string.
The binary strings a and b may have different lengths, and the sum should be represented without leading zeros (unless the sum is zero).
You cannot use built-in functions to directly convert the binary strings into integers and then back to binary strings. The addition must be performed by iterating through the binary digits, handling carry manually.
**Example 1:**
**Input:** `a = "1010", b = "1011"`
**Output:**`"10101"`
**Example 2:**
**Input:** `a = "11", b = "1"`
**Output:** `"100"`
Constraints:
- $1 \leq \text{a.length}, \text{b.length} \leq 10^4$
- a and b consist only of '0' and '1' characters.
- Both a and b are valid binary strings, and neither string contains leading zeros except the string "0".
### 2.2 C program
This algorithm adds binary digits from the rightmost side, managing carry as it processes each bit.Missing bits are treated as 0. The result is built by storing the sum modulo 2, updating the carry, and continuing until all bits and the carry are processed. Leading zeros are removed, and the final result is returned.
The time complexity is O(max(n, m)), where `n` and `m` are the lengths of strings a and b, as the algorithm processes each bit of the longer string.
```c
char* addBinary(char* a, char* b) {
int i=strlen(a), j=strlen(b);
int aux=0,k=fmax(i, j)+2;
char* result = (char*) malloc (sizeof(char) * k);
result[--k] = '\0';
i--; j--;
while(--k >= 0){
aux += i >= 0 ? a[i--] - '0': 0;
aux += j >= 0 ? b[j--] - '0': 0;
result[k] = aux % 2 + '0';
aux /= 2;
}
if(result[0] == '0') return result+1;
return result;
}
```
### 2.3 RISC-V Assembly Code Implementation
```
.data
a_str: .string "100001" # String a
b_str: .string "101" # String b
result: .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 # Reserve space for result
len_a: .word 0 # To store length of string a
len_b: .word 0 # To store length of string b
carry: .word 0 # Carry variable
i: .word 0 # Loop index i
j: .word 0 # Loop index j
k: .word 0 # Result index
newline: .byte 10 # Newline character (ASCII code 10)
.text
.globl main
main:
# Calculate length of string a
la t0, a_str # Load address of string a
li t1, 0 # Initialize counter for length
strlen_a_loop:
lb t2, 0(t0) # Load byte from string a
beq t2, zero, strlen_a_done # If byte is 0 (end of string), break loop
addi t1, t1, 1 # Increment length counter
addi t0, t0, 1 # Move to the next byte in string a
j strlen_a_loop # Repeat loop
strlen_a_done:
la t0, len_a # Load address of len_a
sw t1, 0(t0) # Store length of string a in len_a
# Calculate length of string b
la t0, b_str # Load address of string b
li t1, 0 # Initialize counter for length
strlen_b_loop:
lb t2, 0(t0) # Load byte from string b
beq t2, zero, strlen_b_done # If byte is 0 (end of string), break loop
addi t1, t1, 1 # Increment length counter
addi t0, t0, 1 # Move to the next byte in string b
j strlen_b_loop # Repeat loop
strlen_b_done:
la t0, len_b # Load address of len_b
sw t1, 0(t0) # Store length of string b in len_b
# Initialize i = len_a - 1
la t0, len_a
lw t1, 0(t0) # Load length of string a
addi t1, t1, -1 # i = len_a - 1
la t0, i
sw t1, 0(t0) # Store i
# Initialize j = len_b - 1
la t0, len_b
lw t1, 0(t0) # Load length of string b
addi t1, t1, -1 # j = len_b - 1
la t0, j
sw t1, 0(t0) # Store j
# Initialize k (index for result) to 0
li t1, 0 # k = 0
la t0, k
sw t1, 0(t0)
# Initialize carry to 0
li t1, 0
la t0, carry
sw t1, 0(t0)
# Binary addition loop
add_loop:
# Load indices and carry
la t0, i
lw t1, 0(t0) # t1 = i
la t0, j
lw t2, 0(t0) # t2 = j
la t0, carry
lw t3, 0(t0) # t3 = carry
# If i >= 0, add a[i] - '0'
blt t1, zero, skip_a
la t0, a_str # Load base address of a
add t0, t0, t1 # Calculate a[i] address
lb t4, 0(t0) # Load byte a[i]
addi t4, t4, -48 # Convert from ASCII to integer ('0' = 48)
add t3, t3, t4 # sum += a[i]
addi t1, t1, -1 # i--
la t0, i
sw t1, 0(t0) # Update i
skip_a:
# If j >= 0, add b[j] - '0'
blt t2, zero, skip_b
la t0, b_str # Load base address of b
add t0, t0, t2 # Calculate b[j] address
lb t4, 0(t0) # Load byte b[j]
addi t4, t4, -48 # Convert from ASCII to integer ('0' = 48)
add t3, t3, t4 # sum += b[j]
addi t2, t2, -1 # j--
la t0, j
sw t2, 0(t0) # Update j
skip_b:
# Store result bit (result[k] = sum % 2)
la t0, result # Load base address of result
la t5, k
lw t4, 0(t5) # Load current index k
andi t6, t3, 1 # t6 = sum % 2 (sum & 1)
addi t6, t6, 48 # Convert to ASCII ('0' or '1')
add t0, t0, t4 # Calculate result[k] address
sb t6, 0(t0) # Store result bit at result[k]
addi t4, t4, 1 # k++
sw t4, 0(t5) # Update k
# Update carry (carry = sum / 2)
srli t3, t3, 1 # t3 = sum >> 1 (carry)
la t0, carry
sw t3, 0(t0) # Store carry
# Continue loop while i >= 0, j >= 0, or carry != 0
la t0, i
lw t1, 0(t0)
la t0, j
lw t2, 0(t0)
la t0, carry
lw t3, 0(t0)
bge t1, zero, add_loop
bge t2, zero, add_loop
bnez t3, add_loop
# Done with addition, reverse result to print from left to right
# Adjust k
la t0, k
lw t1, 0(t0) # t1 = k
addi t1, t1, -1 # t1 = k - 1 (adjust index)
sw t1, 0(t0) # Update k
reverse_loop:
blt t1, zero, print_result
la t0, result # Load base address of result
add t0, t0, t1 # Address of result[k]
lb t2, 0(t0) # Load byte result[k]
# Prepare parameters for sys_write
li a0, 1 # File descriptor (stdout)
mv a1, t0 # Pointer to buffer (character to print)
li a2, 1 # Number of bytes to write
li a7, 64 # Syscall number for write
ecall # Make system call
addi t1, t1, -1 # k--
j reverse_loop
print_result:
# Optionally print a newline
la a1, newline # Address of newline character
li a0, 1 # File descriptor (stdout)
li a2, 1 # Number of bytes to write
li a7, 64 # Syscall number for write
ecall
# Exit the program
li a7, 93 # Syscall number for exit
li a0, 0 # Exit code 0
ecall
```
### 2.4 Test Cases and Validation
Predefined test data and validation process for this problem.
### 2.5 Optimization Strategies
1. Register Optimization:
* Uses registers to store variables, avoiding frequent memory access. Variables like len_a, len_b, and carry are stored directly in registers, which speeds up access and reduces memory latency, improving overall performance.
2. Pointer Arithmetic:
* By using pointers to directly manipulate the characters in the strings, the code eliminates the need for index calculations. This allows for direct access to string characters without having to compute their positions, simplifying the logic and reducing unnecessary addition operations.
3 .Efficient Loop Control:
* The loop condition is simplified to check whether the pointers have reached the start of the strings and whether there is a carry left to process. This reduces the need for additional branch instructions such as blt, bge, and makes the program flow more streamlined and faster.
4. Optimized Result Handling:
* The result is built directly in the correct order during the calculation, eliminating the need to reverse the result after the addition is complete. This removes extra loops and instructions, improving efficiency in handling the result.
5. Single Syscall for Output:
* Instead of making multiple ecall syscalls to output each character one by one, Version 2 uses a single ecall to output the entire result at once. This drastically reduces the number of syscalls, which are expensive
operations, and thus enhances the program’s overall efficiency.
6. Stack Management:
* Uses stack management by saving and restoring important registers (like ra and s0) at the start and end of the function. This ensures the system’s state is restored after the program finishes, providing stability, especially when the program involves nested subroutines or multiple function calls.
```
.data
a_str: .string "100001" # String a
b_str: .string "101" # String b
result_buffer: .zero 20 # Reserve space for result (20 bytes initialized to zero)
newline: .byte 10 # Newline character (ASCII code for '\n')
.text
.globl main
main:
# Function Prologue
addi sp, sp, -16 # Allocate stack space
sw ra, 12(sp) # Save return address
sw s0, 8(sp) # Save s0 (if needed)
# Load addresses of a_str and b_str into s0 and s1
la s0, a_str # s0 = address of a_str
la s1, b_str # s1 = address of b_str
# Calculate lengths of a_str and b_str
# Using s2 for len_a, s3 for len_b
mv t0, s0 # t0 = s0 (pointer to a_str)
li s2, 0 # s2 = len_a = 0
calc_len_a:
lb t1, 0(t0) # t1 = *(t0)
beq t1, zero, len_a_done # If t1 == 0 (null terminator), end loop
addi s2, s2, 1 # len_a++
addi t0, t0, 1 # t0++
j calc_len_a
len_a_done:
mv t0, s1 # t0 = s1 (pointer to b_str)
li s3, 0 # s3 = len_b = 0
calc_len_b:
lb t1, 0(t0) # t1 = *(t0)
beq t1, zero, len_b_done # If t1 == 0 (null terminator), end loop
addi s3, s3, 1 # len_b++
addi t0, t0, 1 # t0++
j calc_len_b
len_b_done:
# Initialize pointers and indices
add s4, s0, s2 # s4 = end_ptr_a = s0 + len_a
add s5, s1, s3 # s5 = end_ptr_b = s1 + len_b
la s6, result_buffer # s6 = result_ptr (will build result in reverse)
li s7, 0 # s7 = carry = 0
# Binary addition loop
binary_add_loop:
# Initialize t1 = carry (sum of bits)
mv t1, s7 # t1 = s7 (carry)
# If s4 > s0 (still bits in a)
ble s4, s0, check_b_a # If s4 <= s0, skip loading from a
addi s4, s4, -1 # s4--
lb t2, 0(s4) # t2 = *(s4)
addi t2, t2, -48 # Convert ASCII to integer
add t1, t1, t2 # t1 += t2
check_b_a:
# If s5 > s1 (still bits in b)
ble s5, s1, compute_sum # If s5 <= s1, skip loading from b
addi s5, s5, -1 # s5--
lb t2, 0(s5) # t2 = *(s5)
addi t2, t2, -48 # Convert ASCII to integer
add t1, t1, t2 # t1 += t2
compute_sum:
# Compute result bit and new carry
andi t2, t1, 1 # t2 = t1 & 1 (result bit)
srli s7, t1, 1 # s7 = t1 >> 1 (new carry)
addi t2, t2, 48 # Convert to ASCII ('0' or '1')
addi s6, s6, -1 # s6--
sb t2, 0(s6) # Store result bit
# Check if we should continue
# Continue if s4 > s0 or s5 > s1 or s7 != 0
bgt s4, s0, binary_add_loop
bgt s5, s1, binary_add_loop
bne s7, zero, binary_add_loop
# Result is in result buffer starting at s6
# Calculate length of result
la t0, result_buffer
addi t1, t0, 20 # t1 = result_buffer + 20 (end of result buffer)
sub t2, t1, s6 # t2 = t1 - s6 (length of result)
# Prepare to write result to stdout
li a0, 1 # a0 = file descriptor (stdout)
mv a1, s6 # a1 = buffer pointer (start of result)
mv a2, t2 # a2 = length of result
li a7, 64 # a7 = syscall number for write
ecall # Make syscall
# Print newline character
la a1, newline # a1 = address of newline character
li a2, 1 # a2 = length = 1
li a0, 1 # a0 = file descriptor (stdout)
li a7, 64 # a7 = syscall number for write
ecall
# Function Epilogue
lw ra, 12(sp) # Restore return address
lw s0, 8(sp) # Restore s0
addi sp, sp, 16 # Deallocate stack space
# Exit program
li a7, 93 # syscall number for exit
li a0, 0 # exit code 0
ecall
```