# Assignment1: RISC-V Assembly and Instruction Pipeline
contributed by <[ysesst8902](https://github.com/ysesst8902/Computer_Architecture_hw1)>
###### tags: `RISC-V` `computer architecture 2024`
## Problem C
Problem C converts a 16-bit floating-point number into a 32-bit floating-point number, and the conversion relies on a count-leading-zeros function.
We will use two important functions:
- clz (count leading zeros)
- fp16_to_fp32 (half- to single-precision conversion)
## Implementation of counting leading zeros
:::info
The `__builtin_clz` function is a GCC built-in function that counts the number of leading zero bits in the binary representation of an unsigned integer, starting from the most significant bit (MSB). It returns an integer representing how many zero bits precede the first 1 in the binary representation of the number.
:::
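For example (a quick standalone check, not part of the assignment code), the following prints the leading-zero counts GCC reports for a couple of 32-bit values:
```c
#include <stdio.h>

int main(void) {
    /* 0x00010000 has its highest set bit at index 16, so 15 zero bits precede it */
    printf("%d\n", __builtin_clz(0x00010000u)); /* prints 15 */
    printf("%d\n", __builtin_clz(1u));          /* prints 31 */
    /* note: __builtin_clz(0) is undefined, while my_clz below returns 32 for 0 */
    return 0;
}
```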
For a first version I implement it in the simplest way: checking bit by bit from the most significant bit, counting the 0s until the first 1 is found.
### Original C code in clz
```c
#include <stdint.h>

static inline int my_clz(uint32_t x) {
    int count = 0;
    /* scan from the MSB (bit 31) down to bit 0 */
    for (int i = 31; i >= 0; --i) {
        if (x & (1U << i))
            break;
        count++;
    }
    return count;
}
```
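As a quick sanity check (a sketch only, not part of the assignment code), `my_clz` can be compared against `__builtin_clz` on a few nonzero values; the function is re-declared here so the snippet compiles on its own:
```c
#include <stdint.h>
#include <stdio.h>

static inline int my_clz(uint32_t x) {
    int count = 0;
    for (int i = 31; i >= 0; --i) {
        if (x & (1U << i))
            break;
        count++;
    }
    return count;
}

int main(void) {
    /* 0 is skipped because __builtin_clz(0) is undefined */
    uint32_t tests[] = {1u, 0x3FFFu, 0x03C00000u, 0xFFFFFFFFu};
    for (int i = 0; i < 4; ++i)
        printf("my_clz(0x%08X) = %d, __builtin_clz = %d\n",
               (unsigned)tests[i], my_clz(tests[i]), __builtin_clz(tests[i]));
    return 0;
}
```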
### Assembly code (RISC-V)
```asm
clz:
addi sp, sp, -20
sw ra, 0(sp)
sw s0, 4(sp)
sw s1, 8(sp)
sw s2, 12(sp)
sw s3, 16(sp)
beq a0, x0, ifInputZero
addi s0, x0, 0
li s1, 31
count_zero:
li s2, 1
sll s2, s2, s1
and s3, a0, s2
bne s3, x0, count_zero_End
addi s0, s0, 1
addi s1, s1, -1
bne s1, x0, count_zero
ifInputZero:
li s0, 32
count_zero_End:
mv a0, s0
lw ra, 0(sp)
lw s0, 4(sp)
lw s1, 8(sp)
lw s2, 12(sp)
lw s3, 16(sp)
addi sp, sp, 20
jr ra
```
## Implementation of the fp16_to_fp32 function
:::info
Converts a 16-bit floating-point number in IEEE half-precision format (bit representation) to a 32-bit floating-point number in IEEE single-precision format (bit representation).
:::
### Original C code in fp16_to_fp32
```c
#include <stdint.h>

static inline uint32_t fp16_to_fp32(uint16_t h) {
    const uint32_t w = (uint32_t) h << 16;
    const uint32_t sign = w & UINT32_C(0x80000000);
    const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
    /* renormalization shift for subnormal inputs */
    uint32_t renorm_shift = my_clz(nonsign);
    renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0;
    /* all-ones exponent field when the input is Inf/NaN, otherwise 0 */
    const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) &
                                 INT32_C(0x7F800000);
    /* all-ones when the input is +/-0, otherwise 0 */
    const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31;
    return sign | ((((nonsign << renorm_shift >> 3) +
                     ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask);
}
```
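As a usage sketch (assuming `my_clz` and `fp16_to_fp32` from the listings above are pasted into the same file), the returned bit pattern can be reinterpreted as a `float` with `memcpy`; for instance, the half-precision encoding 0x3C00 (1.0) should map to the single-precision encoding 0x3F800000:
```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* assumes my_clz() and fp16_to_fp32() from the listings above are in this file */
int main(void) {
    uint16_t h = 0x3C00;              /* 1.0 in IEEE half precision */
    uint32_t bits = fp16_to_fp32(h);  /* expected: 0x3F800000 */
    float f;
    memcpy(&f, &bits, sizeof f);      /* reinterpret the bit pattern as a float */
    printf("0x%04X -> 0x%08X (%f)\n", h, (unsigned)bits, f);
    return 0;
}
```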
### Assembly code (RISC-V)
```asm
main:
    li   a0, 0x3C0
    jal  ra, fp16_to_fp32
    li   a7, 10
    ecall

clz:
    addi sp, sp, -20
    sw   ra, 0(sp)
    sw   s0, 4(sp)
    sw   s1, 8(sp)
    sw   s2, 12(sp)
    sw   s3, 16(sp)
    beq  a0, x0, ifInputZero        # clz(0) = 32
    addi s0, x0, 0                  # s0 = count
    li   s1, 31                     # s1 = bit index i
count_zero:
    li   s2, 1
    sll  s2, s2, s1                 # s2 = 1 << i
    and  s3, a0, s2
    bne  s3, x0, count_zero_End     # first 1 found
    addi s0, s0, 1                  # count++
    addi s1, s1, -1                 # i--
    bge  s1, x0, count_zero         # keep scanning down to bit 0
ifInputZero:
    li   s0, 32
count_zero_End:
    mv   a0, s0                     # return count in a0
    lw   ra, 0(sp)
    lw   s0, 4(sp)
    lw   s1, 8(sp)
    lw   s2, 12(sp)
    lw   s3, 16(sp)
    addi sp, sp, 20
    jr   ra

fp16_to_fp32:
    addi sp, sp, -16
    sw   ra, 0(sp)
    slli t0, a0, 16                 # t0 = w = h << 16
    li   t1, 0x7FFFFFFF
    and  t3, t0, t1                 # t3 = nonsign (exponent and mantissa)
    sw   t0, 4(sp)
    sw   t3, 8(sp)
    sw   a0, 12(sp)
    mv   a0, t3
    jal  ra, clz
    mv   t4, a0                     # record clz(nonsign) in t4
    lw   t0, 4(sp)
    lw   t3, 8(sp)
    lw   a0, 12(sp)
    li   t2, 0x80000000
    and  t2, t0, t2                 # t2 = sign = w & 0x80000000
    li   t1, 5
    bgt  t4, t1, overflow           # renorm_shift = clz > 5 ? clz - 5 : 0
    li   t4, 0                      # renorm_shift = 0
    j    isOverflowEnd
overflow:
    addi t4, t4, -5                 # renorm_shift = clz - 5
isOverflowEnd:
    li   t1, 0x04000000
    add  t5, t1, t3
    srai t5, t5, 8                  # arithmetic shift, matches (int32_t)... >> 8
    li   t1, 0x7F800000
    and  t5, t5, t1                 # t5 = inf_nan_mask
    addi t1, t3, -1
    srai t1, t1, 31                 # t1 = zero_mask (all ones when nonsign == 0)
    sll  t3, t3, t4
    srli t3, t3, 3                  # (nonsign << renorm_shift) >> 3
    li   t6, 0x70
    sub  t4, t6, t4
    slli t4, t4, 23                 # (0x70 - renorm_shift) << 23
    add  t3, t3, t4                 # ((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23))
    or   t3, t3, t5                 # apply inf_nan_mask
    not  t1, t1
    and  t3, t3, t1                 # clear everything when the input is zero
    or   t3, t3, t2                 # restore the sign bit
    mv   a0, t3
    lw   ra, 0(sp)
    addi sp, sp, 16
    jr   ra
```
## Overview
* `main`: Initializes the input in `a0` and calls the `fp16_to_fp32` function to process it.
* `clz`: Counts the leading zeros of `a0` and returns the result in `a0`.
* `fp16_to_fp32`: Converts a 16-bit floating-point number (passed in `a0`) to a 32-bit floating-point number and returns its bit pattern in `a0`.
### Performance
- Cycles: 138
- Instrs. retired: 107
- CPI: 1.27
- IPC: 0.775
---
## Optimization
The performance of fp16_to_fp32 can be increased by improving the clz function:
- The new version uses bit shifts to dramatically reduce the number of comparisons: instead of checking bit by bit, it checks blocks of 16, 8, 4, 2, and 1 bits at a time, which removes redundant work.
- This divide-and-conquer approach quickly narrows down the number of leading zeros, especially when the input has many leading zeros, improving efficiency.
### C code in new_clz
```c
unsigned int new_clz(unsigned int x) {
    unsigned int count = 32;
    if (x == 0)
        return count;
    count = 0;
    if ((x >> 16) == 0) {   /* high 16 bits are all zero */
        count += 16;
        x <<= 16;
    }
    if ((x >> 24) == 0) {   /* next 8 bits are all zero */
        count += 8;
        x <<= 8;
    }
    if ((x >> 28) == 0) {   /* next 4 bits */
        count += 4;
        x <<= 4;
    }
    if ((x >> 30) == 0) {   /* next 2 bits */
        count += 2;
        x <<= 2;
    }
    if ((x >> 31) == 0)     /* final bit */
        count += 1;
    return count;
}
```
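Before measuring it, `new_clz` can be checked against the same test vectors the assembly test program below uses (`0x1`, `0xFFFFFFFF`, `0x3FFF` with expected results 31, 0, 18). This is only a sketch and assumes `new_clz` from the listing above is in the same file:
```c
#include <stdio.h>

unsigned int new_clz(unsigned int x);  /* defined in the listing above */

int main(void) {
    unsigned int testcase[] = {0x1, 0xFFFFFFFF, 0x3FFF};
    unsigned int answer[]   = {31, 0, 18};
    for (int i = 0; i < 3; ++i) {
        unsigned int got = new_clz(testcase[i]);
        printf("new_clz(0x%X) = %u -> %s\n", testcase[i], got,
               got == answer[i] ? "true" : "false");
    }
    return 0;
}
```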
### Assembly code (RISC-V)
```asm
.data
testcase: .word 0x1, 0xFFFFFFFF, 0x3FFF
answer:   .word 31, 0, 18
true:     .string "true"
false:    .string "false"
newline:  .string "\n"

.text
main:
    li   t2, 3                      # number of test cases
    la   t0, testcase
    la   t1, answer
testLoop:
    lw   a0, 0(t0)
    jal  ra, clz_optimized
    lw   t3, 0(t1)
    beq  a0, t3, setTrue
    la   a0, false
    j    print_result
setTrue:
    la   a0, true
print_result:
    jal  ra, printTrueFalse
    addi t0, t0, 4
    addi t1, t1, 4
    addi t2, t2, -1
    bnez t2, testLoop
    li   a7, 10
    ecall

printTrueFalse:
    li   a7, 4                      # print string syscall
    ecall
    la   a0, newline
    ecall
    jr   ra

clz_optimized:
    addi sp, sp, -12
    sw   ra, 0(sp)
    sw   s0, 4(sp)
    sw   s1, 8(sp)
    li   s0, 32                     # clz(0) = 32
    beq  a0, x0, ifInputZero
    li   s0, 0
check_high_16:
    srli s1, a0, 16
    beq  s1, x0, check_high_16_0
    srli a0, a0, 16                 # keep only the high 16 bits
    j    check_high_24
check_high_16_0:
    addi s0, s0, 16                 # high 16 bits are all zero
check_high_24:
    srli s1, a0, 8
    beq  s1, x0, check_high_24_0
    srli a0, a0, 8
    j    check_high_28
check_high_24_0:
    addi s0, s0, 8
check_high_28:
    srli s1, a0, 4
    beq  s1, x0, check_high_28_0
    srli a0, a0, 4
    j    check_high_30
check_high_28_0:
    addi s0, s0, 4
check_high_30:
    srli s1, a0, 2
    beq  s1, x0, check_high_30_0
    srli a0, a0, 2
    j    check_final
check_high_30_0:
    addi s0, s0, 2
check_final:
    srli s1, a0, 1
    beq  s1, x0, final_add1
    j    ifInputZero
final_add1:
    addi s0, s0, 1
ifInputZero:                        # common return path
    mv   a0, s0
    lw   ra, 0(sp)
    lw   s0, 4(sp)
    lw   s1, 8(sp)
    addi sp, sp, 12
    jr   ra
```
### Performance (3 inputs)
| Metric | old_clz | new_clz |
| -------- | -------- | -------- |
| Cycles | 618 | 236 |
| Instrs. retired | 460 | 148 |
| CPI | 1.34 | 1.59 |
| IPC | 0.744 | 0.627 |
The new new_clz outperforms the original my_clz by counting leading zeros with far fewer comparisons and bit shifts, which drastically lowers the cycle count and the number of retired instructions. Although the new version's CPI rose and its IPC dropped slightly, these minor changes do not detract from the significant overall performance gain.
## [LeetCode: 2595. Number of Even and Odd Bits](https://leetcode.com/problems/number-of-even-and-odd-bits/description/)
You are given a positive integer n.
Let even denote the number of even indices in the binary representation of n with value 1.
Let odd denote the number of odd indices in the binary representation of n with value 1.
Note that bits are indexed from right to left in the binary representation of a number.
Return the array `[even, odd]`.
Example 1:
- Input: n = 50
- Output: [1,2]
- Explanation: The binary representation of 50 is 110010. It contains 1 on indices 1, 4, and 5.

Example 2:
- Input: n = 2
- Output: [0,1]
- Explanation: The binary representation of 2 is 10. It contains 1 only on index 1.
Because this problem only counts the 1 bits at odd and even indices, the leading zeros do not matter, so I use clz to start the scan at the highest set bit.
### C program
```c
#include <stdlib.h>

unsigned int clz_optimized(unsigned int x) {
    unsigned int count = 32;
    if (x == 0)
        return count;
    count = 0;
    if ((x >> 16) == 0) {
        count += 16;
        x <<= 16;
    }
    if ((x >> 24) == 0) {
        count += 8;
        x <<= 8;
    }
    if ((x >> 28) == 0) {
        count += 4;
        x <<= 4;
    }
    if ((x >> 30) == 0) {
        count += 2;
        x <<= 2;
    }
    if ((x >> 31) == 0)
        count += 1;
    return count;
}

int* evenOddBit(int n, int* returnSize) {
    int *result = malloc(sizeof(int) * 2);
    int clz = 31 - clz_optimized(n);   /* index of the highest set bit */
    int odd = 0;
    int even = 0;
    for (; clz >= 0; --clz) {
        if (n & (1U << clz)) {
            if (clz % 2)
                odd++;
            else
                even++;
        }
    }
    result[0] = even;
    result[1] = odd;
    *returnSize = 2;
    return result;
}
```
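A small driver (not part of the LeetCode submission, which only requires `evenOddBit`) can reproduce Example 1, where n = 50 should give [1, 2]:
```c
#include <stdio.h>
#include <stdlib.h>

int* evenOddBit(int n, int* returnSize);  /* defined in the listing above */

int main(void) {
    int returnSize;
    int *result = evenOddBit(50, &returnSize);  /* Example 1: expect [1, 2] */
    printf("[%d, %d]\n", result[0], result[1]);
    free(result);
    return 0;
}
```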
### Assembly code (RISC-V)
```asm
main:
    li   a0, 0x2
    jal  ra, EvenOddbit
    li   a7, 10
    ecall

clz_optimized:
    addi sp, sp, -12
    sw   ra, 0(sp)
    sw   s0, 4(sp)
    sw   s1, 8(sp)
    li   s0, 32                     # clz(0) = 32
    beq  a0, x0, ifInputZero
    li   s0, 0
check_high_16:
    srli s1, a0, 16
    beq  s1, x0, check_high_16_0
    srli a0, a0, 16
    j    check_high_24
check_high_16_0:
    addi s0, s0, 16
check_high_24:
    srli s1, a0, 8
    beq  s1, x0, check_high_24_0
    srli a0, a0, 8
    j    check_high_28
check_high_24_0:
    addi s0, s0, 8
check_high_28:
    srli s1, a0, 4
    beq  s1, x0, check_high_28_0
    srli a0, a0, 4
    j    check_high_30
check_high_28_0:
    addi s0, s0, 4
check_high_30:
    srli s1, a0, 2
    beq  s1, x0, check_high_30_0
    srli a0, a0, 2
    j    check_final
check_high_30_0:
    addi s0, s0, 2
check_final:
    srli s1, a0, 1
    beq  s1, x0, final_add1
    j    ifInputZero
final_add1:
    addi s0, s0, 1
ifInputZero:                        # common return path
    mv   a0, s0
    lw   ra, 0(sp)
    lw   s0, 4(sp)
    lw   s1, 8(sp)
    addi sp, sp, 12
    jr   ra

EvenOddbit:
    addi sp, sp, -20
    sw   ra, 0(sp)
    sw   s0, 4(sp)
    sw   s1, 8(sp)
    sw   s2, 12(sp)
    sw   s3, 16(sp)
    mv   s3, a0                     # s3 = n
    # calculate the start index
    jal  ra, clz_optimized          # a0 = clz_optimized(n)
    li   s0, 31
    sub  s0, s0, a0                 # s0 = 31 - clz_optimized(n)
    li   s1, 0                      # odd  = 0
    li   s2, 0                      # even = 0
loop:
    blt  s0, x0, end_loop           # stop when the index drops below 0
    li   t1, 1
    sll  t1, t1, s0                 # t1 = 1 << s0
    and  t1, t1, s3                 # test bit s0 of n
    beq  t1, x0, next               # bit not set, skip
    andi t2, s0, 1                  # check whether the index is odd or even
    bnez t2, odd_case               # if odd, jump to odd_case
    addi s2, s2, 1                  # even++
    j    next
odd_case:
    addi s1, s1, 1                  # odd++
next:
    addi s0, s0, -1                 # index--
    j    loop
end_loop:
    mv   a0, s2                     # return even in a0
    mv   a1, s1                     # return odd in a1
    lw   ra, 0(sp)
    lw   s0, 4(sp)
    lw   s1, 8(sp)
    lw   s2, 12(sp)
    lw   s3, 16(sp)
    addi sp, sp, 20
    jr   ra
```