# Assignment1: RISC-V Assembly and Instruction Pipeline
contributed by < nelson0720j >
[Assignment](https://hackmd.io/@sysprog/2024-arch-homework1)
## problem C
### C code
```clike!
#include <stdint.h>

static inline float fabsf(float x) {
    uint32_t i = *(uint32_t *)&x;   // Read the bits of the float into an integer
    i &= 0x7FFFFFFF;                // Clear the sign bit to get the absolute value
    x = *(float *)&i;               // Write the modified bits back into the float
    return x;
}

static inline int my_clz(uint32_t x) {
    int count = 0;
    for (int i = 31; i >= 0; --i) {
        if (x & (1U << i))
            break;
        count++;
    }
    return count;
}

static inline uint32_t fp16_to_fp32(uint16_t h) {
    const uint32_t w = (uint32_t) h << 16;
    const uint32_t sign = w & UINT32_C(0x80000000);
    const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
    uint32_t renorm_shift = my_clz(nonsign);
    renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0;
    const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000);
    const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31;  // all ones when nonsign == 0
    return sign | ((((nonsign << renorm_shift >> 3) +
                     ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask);
}
```
### [problemC assembly code](https://github.com/nelson0720j/Assignment1-RISC-V-Assembly-and-Instruction-Pipeline/blob/0416d70488d31876e454a38ec89dd61e152592f4/problemC.s)
```asm!
main:
# Call test function
jal ra, test # Call test function, result will be in a0
# Display result
li a7, 1 # Prepare for ecall (print integer)
ecall # Print the result (true/false) in a0
li a7, 10 # Exit program
ecall
# Test function to compare actual and expected result
test:
addi sp, sp, -16 # Reserve stack space
sw ra, 12(sp) # Save return address
sw t0, 8(sp) # Save temp register
# Test case: fp16_to_fp32
li a0, 0x3C00 # Load test value
jal ra, fp16_to_fp32 # Call fp16_to_fp32
li t0, 0x3F800000 # Expected result
bne a0, t0, fail # If result not equal to expected, jump to fail
# Test case: my_clz
li a0, 0x00001000 # Load test value
jal ra, my_clz # Call my_clz
li t0, 19 # Expected result
bne a0, t0, fail # If result not equal to expected, jump to fail
# Test case: fabsf
li a0, 0x80000000
jal ra, fabsf
li t0, 0x00000000
bne a0, t0, fail
# If all three tests pass
li a0, 1 # Return true (1)
j end_test
fail:
li a0, 0 # Return false (0)
end_test:
lw ra, 12(sp) # Restore return address
lw t0, 8(sp) # Restore temp register
addi sp, sp, 16 # Restore stack space
ret
fabsf:
addi sp, sp, -8
sw ra, 4(sp)
sw s0, 0(sp)
li s0, 0x7FFFFFFF
and a0, a0, s0
lw s0, 0(sp)
lw ra, 4(sp)
addi sp, sp, 8
ret
fp16_to_fp32:
addi sp, sp, -32
sw ra, 28(sp)
sw s0, 24(sp)
sw s1, 20(sp)
sw s2, 16(sp)
sw s3, 12(sp)
sw s4, 8(sp)
sw s5, 4(sp)
sw s6, 0(sp)
mv s0, a0 # s0 store input h
slli s1, s0, 16 # s1 store w
li t0, 0x80000000
and s2, s1, t0 # s2 store sign
li t0, 0x7FFFFFFF
and s3, s1, t0 # s3 store nonsign
mv a0, s3
jal ra, my_clz # get clz count in a0
mv s4, a0 # s4 store my_clz result
li s5, 6 # s5 = 6: threshold for renorm_shift (renorm_shift > 5 in the C code)
blt s4, s5, set_zero
addi s4, s4, -5
j go
set_zero:
li s4, 0
go:
li t1, 0x04000000
add s5, s3, t1
srli s5, s5, 8
li t1, 0x7F800000
and s5, s5, t1 # s5 now holds inf_nan_mask
addi s6, s3, -1
srai s6, s6, 31 # s6 = zero_mask: all ones when nonsign == 0 (needs an arithmetic shift)
sll s3, s3, s4 # nonsign << renorm_shift
srli s3, s3, 3 # s3 store (nonsign << renorm_shift >> 3)
li t1, 0x70
sub t1, t1, s4 # (0x70 - renorm_shift)
slli t1, t1, 23 # t1 store ((0x70 - renorm_shift) << 23)
add s3, s3, t1
or s3, s3, s5
not t1, s6
and s3, s3, t1
or s3, s3, s2
mv a0, s3
lw ra, 28(sp)
lw s0, 24(sp)
lw s1, 20(sp)
lw s2, 16(sp)
lw s3, 12(sp)
lw s4, 8(sp)
lw s5, 4(sp)
lw s6, 0(sp)
addi sp, sp, 32
ret
# Function to count leading zeros (CLZ)
my_clz:
addi sp, sp, -24
sw ra, 20(sp)
sw s0, 16(sp)
sw s1, 12(sp)
sw s2, 8(sp)
sw s3, 4(sp)
sw s4, 0(sp)
li s0, 31 # s0 = i
li s1, 0 # s1 = count
loop:
li s4, 0x1
sll s2, s4, s0
and s3, a0, s2
bne s3, x0, done
addi s1, s1, 1
addi s0, s0, -1
bge s0, x0, loop
done:
mv a0, s1
lw ra, 20(sp)
lw s0, 16(sp)
lw s1, 12(sp)
lw s2, 8(sp)
lw s3, 4(sp)
lw s4, 0(sp)
addi sp, sp, 24
ret
```
`main` calls `test`, which checks `fp16_to_fp32`, `my_clz`, and `fabsf` against expected results and returns 1 if every case matches, 0 otherwise.
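For reference, the same three checks can be reproduced on the host with a small C harness (a sketch of mine, compiled together with the C functions above; it is not part of the submitted files):
```clike!
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    // 0x3C00 is 1.0 in binary16; its binary32 bit pattern is 0x3F800000
    assert(fp16_to_fp32(0x3C00) == UINT32_C(0x3F800000));

    // 0x00001000 has bit 12 as its highest set bit, so 31 - 12 = 19 leading zeros
    assert(my_clz(0x00001000) == 19);

    // 0x80000000 is the bit pattern of -0.0f; clearing the sign bit gives all zeros
    uint32_t in = UINT32_C(0x80000000), out;
    float f;
    memcpy(&f, &in, sizeof f);
    f = fabsf(f);
    memcpy(&out, &f, sizeof out);
    assert(out == 0);

    puts("all tests passed");
    return 0;
}
```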
#### result
| Execution info | Value |
|-----------------|---------|
| Cycles | 356 |
| Instrs. retired | 278 |
| CPI | 1.28 |
| IPC | 0.781 |
| Clock rate | 0 Hz |
## clz
```asm!
.data
test_value:
.word 0x00001000 # Test value (4096, 0x00001000)
.text
.global _start
_start:
# Load the test value
la a0, test_value # Load the address of the variable into a0
lw a0, 0(a0) # Load the value from the address into a0
# Call my_clz function
jal ra, my_clz # Jump and link to my_clz, result will be in a0
# Print the result
li a7, 1 # Prepare for ecall to print integer
ecall # Print the result in a0
# Exit the program
li a7, 10 # Prepare for ecall to exit
ecall
# Function to count leading zeros (CLZ)
my_clz:
addi sp, sp, -24 # Reserve stack space
sw ra, 20(sp) # Save return address
sw s0, 16(sp) # Save s0
sw s1, 12(sp) # Save s1
sw s2, 8(sp) # Save s2
sw s3, 4(sp) # Save s3
sw s4, 0(sp) # Save s4
li s0, 31 # Initialize s0 = 31 (index)
li s1, 0 # Initialize s1 = 0 (count)
loop:
li s4, 0x1 # Load 1 into s4
sll s2, s4, s0 # Shift left 1 by s0 bits
and s3, a0, s2 # Perform bitwise AND with a0 and s2
bne s3, x0, done # If the result is non-zero, branch to done
addi s1, s1, 1 # Increment the count
addi s0, s0, -1 # Decrement the index
bge s0, x0, loop # If s0 >= 0, continue looping
done:
mv a0, s1 # Move the count (leading zeros) to a0
lw ra, 20(sp) # Restore return address
lw s0, 16(sp) # Restore s0
lw s1, 12(sp) # Restore s1
lw s2, 8(sp) # Restore s2
lw s3, 4(sp) # Restore s3
lw s4, 0(sp) # Restore s4
addi sp, sp, 24 # Restore stack space
ret # Return from function
```
#### result
| Execution info | Value |
|-----------------|-------|
| Cycles | 215 |
| Instrs. retired | 163 |
| CPI | 1.32 |
| IPC | 0.758 |
| Clock rate | 0 Hz |
## [190. Reverse Bits](https://leetcode.com/problems/reverse-bits/description/)
Reverse bits of a given 32 bits unsigned integer.
Example:
Input: n = `00000010100101000001111010011100`
Output: `00111001011110000010100101000000`
Explanation: The input binary string `00000010100101000001111010011100` represents the unsigned integer 43261596, so we return 964176192, whose binary representation is `00111001011110000010100101000000`.
### [ver1](https://github.com/nelson0720j/Assignment1-RISC-V-Assembly-and-Instruction-Pipeline/blob/0416d70488d31876e454a38ec89dd61e152592f4/leetcode_ver1.s)
* C code
```clike!
uint32_t reverseBits(uint32_t n) {
uint32_t result = 0;
for(int i = 1; i <= 32; i++) {
result <<= 1;
result |= (n&1);
n >>= 1;
}
return result;
}
```
1. Start with `result = 0`, which will hold the reversed bits.
2. Loop 32 times:
    * Shift `result` left to make space for the next bit.
    * Extract the last bit of `n` and add it to `result`.
    * Shift `n` right to process the next bit.
3. Return the reversed `result` after all bits have been processed.
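Before porting it to assembly, the loop can be sanity-checked on the host against the LeetCode example (a self-contained sketch of mine, repeating the function above):
```clike!
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

uint32_t reverseBits(uint32_t n) {
    uint32_t result = 0;
    for (int i = 1; i <= 32; i++) {
        result <<= 1;
        result |= (n & 1);
        n >>= 1;
    }
    return result;
}

int main(void) {
    // LeetCode example: 0b00000010100101000001111010011100 = 43261596
    assert(reverseBits(43261596u) == 964176192u);
    printf("%u\n", reverseBits(43261596u));  // prints 964176192
    return 0;
}
```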
* assembly code
```asm!
.data
input_val:
.word 0b00000000000000000000000010011100 # Example input value
.text
.global _start
_start:
# Load the input value
la a0, input_val # Load address of input_val into a0
lw a0, 0(a0) # Load the value at input_val into a0 (n)
# Call reverseBits function
jal ra, reverseBits # Call reverseBits, result will be in a0
# Print the result
li a7, 1 # Syscall code for print integer
ecall # Print the result in a0
# Exit the program
li a7, 10 # Syscall code for exit
ecall
# reverseBits function
reverseBits:
addi sp, sp, -16 # Reserve stack space
sw ra, 12(sp) # Save return address
sw t0, 8(sp) # Save t0
sw t1, 4(sp) # Save t1
sw t2, 0(sp) # Save t2
li t0, 0 # Initialize result = 0
li t1, 32 # Initialize counter = 32
loop:
beqz t1, done # If counter == 0, exit loop
slli t0, t0, 1 # result <<= 1
andi t2, a0, 1 # Extract the least significant bit (n & 1)
or t0, t0, t2 # result |= (n & 1)
srli a0, a0, 1 # n >>= 1
addi t1, t1, -1 # Decrement counter
j loop # Repeat the loop
done:
mv a0, t0 # Move the result to a0 (return value)
lw ra, 12(sp) # Restore return address
lw t0, 8(sp) # Restore t0
lw t1, 4(sp) # Restore t1
lw t2, 0(sp) # Restore t2
addi sp, sp, 16 # Restore stack space
ret # Return from function
```
Next, I used CLZ to improve my code.
### [ver2](https://github.com/nelson0720j/Assignment1-RISC-V-Assembly-and-Instruction-Pipeline/blob/0416d70488d31876e454a38ec89dd61e152592f4/leetcode_ver2.s)
* Improved C code using CLZ
```clike!
uint32_t reverseBits(uint32_t n) {
uint32_t result = 0;
// Count the leading zeros in the binary representation of n
int leadingZeros = __builtin_clz(n);
int effectiveBits = 32 - leadingZeros;
// Only process the effective bits and reverse them
for (int i = 0; i < effectiveBits; i++) {
result <<= 1; // Shift result to the left to make space for the next bit
result |= (n & 1); // OR the least significant bit of n with result
n >>= 1; // Shift n to the right to process the next bit
}
// After reversing the bits, shift the result to the left to account for the leading zeros
result <<= leadingZeros;
return result;
}
```
Knowing the number of leading zeros lets us skip them: instead of reversing all 32 bits, the loop only iterates over the significant bits (`effectiveBits`), which avoids wasted work on the leading zeros.
After the significant bits are reversed, the result is shifted left by the number of leading zeros so the reversed pattern lands in the correct position within the 32-bit word.
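A quick host-side trace of this idea on the second test input used later (`0x9C`), as an illustrative sketch of mine:
```clike!
#include <assert.h>
#include <stdint.h>

int main(void) {
    uint32_t n = 0x9C;                      // 0b10011100
    int leadingZeros = __builtin_clz(n);    // 24 (GCC/Clang builtin, as in the C code above)
    int effectiveBits = 32 - leadingZeros;  // 8: only the low byte is significant

    uint32_t result = 0;
    for (int i = 0; i < effectiveBits; i++) {
        result = (result << 1) | (n & 1);
        n >>= 1;
    }
    assert(result == 0x39);        // the 8 significant bits, reversed

    result <<= leadingZeros;       // move them to the top of the 32-bit word
    assert(result == 0x39000000);
    return 0;
}
```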
* assembly code
```asm!
.data
input_val:
.word 0b00000000000000000000000010011100 # Example input value
.text
.global _start
_start:
# Load the input value
la a0, input_val # Load address of input_val into a0
lw a0, 0(a0) # Load the value at input_val into a0 (n)
# Call reverseBits function
jal ra, reverseBits # Call reverseBits, result will be in a0
# Print the result
li a7, 1 # Syscall code for print integer
ecall # Print the result in a0
# Exit the program
li a7, 10 # Syscall code for exit
ecall
# reverseBits function
reverseBits:
# Prologue: Save registers
addi sp, sp, -24 # Reserve stack space
sw ra, 20(sp) # Save return address
sw s0, 16(sp) # Save s0 (n)
sw s1, 12(sp) # Save s1 (leadingZeros)
sw s2, 8(sp) # Save s2 (result)
sw s3, 4(sp) # Save s3 (effectiveBits)
sw s4, 0(sp) # Save s4 (temporary)
mv s0, a0 # Save n in s0
# Call my_clz to get leading zeros
jal ra, my_clz # Input is a0 (n), output is a0 (leadingZeros)
mv s1, a0 # Store leadingZeros in s1
li s2, 32
sub s3, s2, s1 # s3 = effectiveBits = 32 - leadingZeros
li s2, 0 # Initialize result = 0
mv a0, s0 # Restore n in a0
beqz s3, reverse_done # If effectiveBits == 0, skip loop
reverse_loop:
slli s2, s2, 1 # result <<= 1
andi s4, a0, 1 # s4 = n & 1
or s2, s2, s4 # result |= (n & 1)
srli a0, a0, 1 # n >>= 1
addi s3, s3, -1 # Decrement effectiveBits
bnez s3, reverse_loop # If effectiveBits != 0, continue loop
reverse_done:
# Shift result left by leadingZeros
mv a0, s2 # Move result to a0
beqz s1, reverse_end # If leadingZeros == 0, skip shift
sll a0, a0, s1 # result <<= leadingZeros
reverse_end:
# Epilogue: Restore registers
lw ra, 20(sp) # Restore return address
lw s0, 16(sp) # Restore s0 (n)
lw s1, 12(sp) # Restore s1 (leadingZeros)
lw s2, 8(sp) # Restore s2 (result)
lw s3, 4(sp) # Restore s3 (effectiveBits)
lw s4, 0(sp) # Restore s4 (temporary)
addi sp, sp, 24 # Restore stack space
ret # Return from function
# Function to count leading zeros (CLZ)
my_clz:
# Prologue: Save registers
addi sp, sp, -24 # Reserve stack space
sw ra, 20(sp) # Save return address
sw s0, 16(sp) # Save s0 (index)
sw s1, 12(sp) # Save s1 (count)
sw s2, 8(sp) # Save s2 (temporary)
sw s3, 4(sp) # Save s3 (temporary)
sw s4, 0(sp) # Save s4 (temporary)
li s0, 31 # Initialize s0 = 31 (bit index)
li s1, 0 # Initialize s1 = 0 (leading zeros count)
clz_loop:
li s4, 0x1 # Load 1 into s4
sll s2, s4, s0 # s2 = 1 << s0
and s3, a0, s2 # s3 = a0 & s2 (test bit at position s0)
bne s3, x0, clz_done # If bit is 1, break loop
addi s1, s1, 1 # Increment leading zeros count
addi s0, s0, -1 # Decrement bit index
bge s0, x0, clz_loop # If s0 >= 0, continue loop
clz_done:
mv a0, s1 # Move leading zeros count to a0
# Epilogue: Restore registers
lw ra, 20(sp) # Restore return address
lw s0, 16(sp) # Restore s0 (index)
lw s1, 12(sp) # Restore s1 (count)
lw s2, 8(sp) # Restore s2 (temporary)
lw s3, 4(sp) # Restore s3 (temporary)
lw s4, 0(sp) # Restore s4 (temporary)
addi sp, sp, 24 # Restore stack space
ret # Return from function
```
Then I tried reducing the number of loop iterations in the assembly to see whether fewer branch and loop-overhead instructions improve performance.
### [ver3](https://github.com/nelson0720j/Assignment1-RISC-V-Assembly-and-Instruction-Pipeline/blob/0416d70488d31876e454a38ec89dd61e152592f4/leetcode_ver3.s)
```asm!
# improved reverseBits function
reverseBits:
# Save registers
addi sp, sp, -28
sw ra, 24(sp)
sw s0, 20(sp)
sw s1, 16(sp)
sw s2, 12(sp)
sw s3, 8(sp)
sw s4, 4(sp)
sw s5, 0(sp)
mv s0, a0 # s0 = n
# Call the optimized my_clz
jal ra, my_clz
mv s1, a0 # s1 = leadingZeros
li s2, 32
sub s3, s2, s1 # s3 = effectiveBits = 32 - leadingZeros
li s2, 0 # s2 = result = 0
mv a0, s0 # Restore a0 = n
beqz s3, reverse_done # If effective bits are zero, skip the loop
# Calculate the number of iterations (process 4 bits at a time)
addi s5, s3, 3
srli s5, s5, 2 # s5 = ceil(effectiveBits / 4)
# The loop reverses 4 * iterations bits, so compute the final shift amount here
slli s1, s5, 2 # s1 = 4 * iterations = bits actually processed
li t1, 32
sub s1, t1, s1 # s1 = 32 - bits processed (final left-shift amount)
reverse_loop:
beqz s5, reverse_done # If iteration count is zero, exit the loop
slli s2, s2, 4 # result <<= 4
andi s4, a0, 0xF # s4 = n & 0xF
# Get the reversed 4 bits from the lookup table
la t0, reverse_table
add t0, t0, s4
lb s4, 0(t0) # s4 = reverse_table[n & 0xF]
or s2, s2, s4 # result |= reversed bits
srli a0, a0, 4 # n >>= 4
addi s5, s5, -1 # Decrement iteration count
j reverse_loop
reverse_done:
# Shift result left so the reversed bits land at the top of the word
mv a0, s2
beqz s1, reverse_end
sll a0, a0, s1 # result <<= (32 - bits processed)
reverse_end:
# Restore registers
lw ra, 24(sp)
lw s0, 20(sp)
lw s1, 16(sp)
lw s2, 12(sp)
lw s3, 8(sp)
lw s4, 4(sp)
lw s5, 0(sp)
addi sp, sp, 28
ret
```
ver3 uses a lookup table (`reverse_table`, a 16-entry nibble-reversal table defined in the `.data` section of the linked file) to reverse 4 bits per iteration instead of one.
Processing 4 bits at a time cuts the iteration count to roughly a quarter, so there are fewer loop-overhead and branch instructions, which helps pipeline utilization and reduces branch mispredictions.
The gain is largest when the input has many significant bits; with only a few significant bits, the overhead of the table lookup may not give a net benefit over the simple per-bit loop.
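For reference, here is a C-level sketch of the same nibble-at-a-time idea (my own illustration; `reverse_table` and `reverseBits_nibble` are stand-ins and are not copied from the linked file). Note that the final shift is `32 - 4 * iterations` rather than `leadingZeros`, because rounding the significant bits up to a whole nibble may reverse up to three extra zero bits:
```clike!
#include <assert.h>
#include <stdint.h>

// reverse_table[x] is the 4-bit reversal of x (e.g. 0b0001 -> 0b1000)
static const uint8_t reverse_table[16] = {
    0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
    0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF
};

uint32_t reverseBits_nibble(uint32_t n) {
    if (n == 0)
        return 0;                              // __builtin_clz(0) is undefined
    int leadingZeros = __builtin_clz(n);
    int effectiveBits = 32 - leadingZeros;
    int iterations = (effectiveBits + 3) / 4;  // whole nibbles to process

    uint32_t result = 0;
    for (int i = 0; i < iterations; i++) {
        result = (result << 4) | reverse_table[n & 0xF];
        n >>= 4;
    }
    // 4 * iterations bits were reversed, so shift the rest of the way up
    return result << (32 - 4 * iterations);
}

int main(void) {
    assert(reverseBits_nibble(0x0000009C) == 0x39000000);  // second test input
    assert(reverseBits_nibble(0x02941E9C) == 964176192u);  // LeetCode example
    return 0;
}
```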
### Analysis
* ver1: original bit-by-bit loop
* ver2: ver1 with CLZ
* ver3: ver2 with fewer iterations and a lookup table
-----
Test with `0b00000010100101000001111010011100`:
| Execution info | ver1 | ver2 | ver3 |
|---------------------|------------|-----------| --------- |
| Cycles | 325 | 334 | 233 |
| Instrs. retired | 248 | 254 | 180 |
| CPI | 1.31 | 1.31 | 1.29 |
| IPC | 0.761 | 0.76 | 0.773 |
This input does not have enough leading zeros for CLZ to pay off, so ver2 is no faster than ver1, while ver3's 4-bit lookup still cuts the cycle count substantially.
----
Test with `0b00000000000000000000000010011100`:
| Execution info | ver1 | ver2 | ver3 |
|---------------------|------------|------------|------------|
| Cycles | 325 | 352 | 325 |
| Instrs. retired | 247 | 272 | 251 |
| CPI | 1.32 | 1.29 | 1.29 |
| IPC | 0.76 | 0.773 | 0.772 |
When the input has many leading zeros, the CLZ-based ver2 lowers CPI (1.32 → 1.29) and raises IPC, though the software `my_clz` loop adds iterations of its own, so total cycles do not improve here (325 → 352).
With so few significant bits, ver3 performs the table lookup only twice, so its overhead roughly cancels the savings: it needs the same 325 cycles as ver1, and its IPC (0.772) is marginally below ver2's (0.773).
## Conclusion
From the experiments, we can conclude the following:
* **CLZ (Count Leading Zeros)** improves performance when there are many leading zeros in the input by reducing the number of iterations. However, when there are few leading zeros, it adds overhead, resulting in slower performance.
* **Reducing iterations and using a lookup table**, as in ver3, significantly improves performance when there are fewer leading zeros, reducing both the cycle count and number of instructions executed. However, it provides less benefit when the CLZ is already effective.
In essence, CLZ works well for inputs with many leading zeros, while reducing iterations and using a lookup table is more efficient for inputs with fewer leading zeros.