# Assignment1: RISC-V Assembly and Instruction Pipeline
contributed by < [`李尚宸`](https://github.com/TreeLand1101/NCKU-Computer_Architecture/tree/main/assignment1) >
## Quiz1 Problem C (Counting Leading Zeros)
:::danger
Choose one problem (A, B, or C) from [Quiz1](https://hackmd.io/@sysprog/arch2024-quiz1-sol), translate it from C code to a complete RISC-V assembly program, and include the relevant test data.
:::
### C programs
#### fabsf.c
```c
#include <stdio.h>
#include <stdint.h>
static inline float fabsf(float x) {
uint32_t i = *(uint32_t *)&x;
i &= 0x7FFFFFFF;
x = *(float *)&i;
return x;
}
int main() {
float float_nums[] = {3.14f, -3.14f, 0.0f, -0.0f, 1.0f, -1.0f, 123.99f, -123.99f};
int n = sizeof(float_nums) / sizeof(float_nums[0]);
for (int i = 0; i < n; i++) {
printf("%f\n", fabsf(float_nums[i]));
}
return 0;
}
```
#### my_clz.c
```c
#include <stdio.h>
#include <stdint.h>
static inline int my_clz(uint32_t x) {
int count = 0;
for (int i = 31; i >= 0; --i) {
if (x & (1U << i))
break;
count++;
}
return count;
}
int main() {
/*
Test case 0: Input 0, binary: 00000000 00000000 00000000 00000000, Expected: 32
Test case 1: Input 1, binary: 00000000 00000000 00000000 00000001, Expected: 31
Test case 2: Input 1024, binary: 00000000 00000000 00000100 00000000, Expected: 21
*/
int test_case[] = {0, 1, 1024};
for (int i = 0; i < 3; ++i) {
printf("%d\n", my_clz(test_case[i]));
}
return 0;
}
```

:::danger
Show the full C source code corresponding to the original problem set.
:::
#### fp16_to_fp32.c
```c
#include <stdio.h>
#include <stdint.h>
static inline uint32_t fp16_to_fp32(uint16_t h) {
const uint32_t w = (uint32_t)h << 16;
const uint32_t sign = w & UINT32_C(0x80000000);
const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
uint32_t renorm_shift = __builtin_clz(nonsign);
renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0;
const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000);
const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31;
return sign | ((((nonsign << renorm_shift >> 3) +
((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask);
}
void test_fp16_to_fp32(uint16_t h) {
uint32_t result = fp16_to_fp32(h);
printf("fp16: 0x%04X, fp32: 0x%08X\n", h, result);
}
int main() {
uint16_t test_cases[] = {
0x3C00, // 1.0 in half-precision
0x4000, // 2.0 in half-precision
0x4040, // 3.0 in half-precision
0x0000, // +0.0 in half-precision
0x8000, // -0.0 in half-precision
0x7C00, // +Inf in half-precision
0xFC00, // -Inf in half-precision
0x7E00, // NaN in half-precision
0xB800, // -2.0 in half-precision
0x0400 // Very small positive number in half-precision
};
int num_tests = sizeof(test_cases) / sizeof(test_cases[0]);
for (int i = 0; i < num_tests; i++) {
test_fp16_to_fp32(test_cases[i]);
}
return 0;
}
```
### Assembly programs
#### fabsf.s
```c
.data
.align 4
float_nums:
.word 0x4048F5C3 # 3.14f (IEEE 754 representation)
.word 0xC048F5C3 # -3.14f
.word 0x00000000 # 0.0f
.word 0x80000000 # -0.0f
.word 0x3F800000 # 1.0f
.word 0xBF800000 # -1.0f
.word 0x42F7E666 # 123.99f
.word 0xC2F7E666 # -123.99f
newline:
.asciz "\n"
.text
.globl main
main:
la t0, float_nums
li t1, 8 # number of elements in float_nums
loop:
lw t2, 0(t0) # float_nums[i]
li t3, 0x7FFFFFFF # clear the sign bit
and t2, t2, t3 # float_nums[i] = float_nums[i] & 0x7FFFFFFF
jal ra, print_float
addi t0, t0, 4 # Next element
addi t1, t1, -1 # t1--
bnez t1, loop # If (t1 != 0), continue the loop
li a0, 0 # Exit status code
li a7, 93 # System call for exit
ecall
print_float:
li a7, 2
mv a0, t2
ecall
li a7, 4
la a0, newline
ecall
ret
```
#### my_clz.s
```c
.data
num_test_cases:
.word 3
test_case_array:
.word 0, 1, 1024
newline:
.asciz "\n"
.text
.globl main
main:
la t0, num_test_cases
lw t1, 0(t0)
la t0, test_case_array
li t2, 0
loop_main:
beq t1, t2, end_main # if (i == number) of test cases, exit loop
lw a0, 0(t0) # test_case[i]
jal ra, my_clz
li a7, 1 # System call code for print integer
ecall
la a0, newline
li a7, 4 # System call code for print string
ecall
addi t0, t0, 4 # Next test_case
addi t2, t2, 1 # i++
j loop_main
end_main:
li a7, 10 # System call code for exit
ecall
my_clz:
li t3, 31 # i = 31
li t4, 0 # count = 0
loop_my_clz:
li t5, 1 # t5 = 1
sll t5, t5, t3 # t5 = 1 << i
and t5, a0, t5 # t5 = x & (1 << i)
bnez t5, end_my_clz # if ((x & (1 << i)) != 0), exit loop
addi t4, t4, 1 # count++
addi t3, t3, -1 # i--
bltz t3, end_my_clz # if (i < 0), exit loop
j loop_my_clz
end_my_clz:
mv a0, t4 # return count
ret
```
:::danger
Use fewer instructions.
:::

#### fp16_to_fp32.s
```c
```
### Summary of Registers Used
#### `my_clz.s`
- `main` function:
- `t0`: Pointers for `num_test_cases` and `test_case_array`.
- `t1`: Number of test cases.
- `t2`:`i`
- `a7`: System call register.
- `my_clz` function:
- `a0`: Argument register used for passing the current test case and returning the result
- `t3`: `i`
- `t4`: `count`
- `t5`: `x & (1U << i)`
### Loop Unrolling
#### `my_clz_loop_unrolling.s`
- We perform loop unrolling in the `main` function
```c
.data
num_test_cases:
.word 3
test_case_array:
.word 0, 1, 1024
newline:
.asciz "\n"
.text
.globl main
main:
lw t1, 3
la t0, test_case_array
li t2, 0
test_case0:
lw a0, 0(t0)
jal ra, my_clz
li a7, 1 # System call code for print integer
ecall
la a0, newline
li a7, 4 # System call code for print string
ecall
addi t0, t0, 4 # Next test_case
addi t2, t2, 1 # i++
test_case1:
lw a0, 0(t0)
jal ra, my_clz
li a7, 1
ecall
la a0, newline
li a7, 4
ecall
addi t0, t0, 4
addi t2, t2, 1
test_case2:
lw a0, 0(t0)
jal ra, my_clz
li a7, 1
ecall
la a0, newline
li a7, 4
ecall
end_main:
li a7, 10 # System call code for exit
ecall
my_clz:
li t3, 31 # i = 31
li t4, 0 # count = 0
loop_my_clz:
li t5, 1 # t5 = 1
sll t5, t5, t3 # t5 = 1 << i
and t5, a0, t5 # t5 = x & (1 << i)
bnez t5, end_my_clz # if ((x & (1 << i)) != 0), exit loop
addi t4, t4, 1 # count++
addi t3, t3, -1 # i--
bltz t3, end_my_clz # if (i < 0), exit loop
j loop_my_clz
end_my_clz:
mv a0, t4 # return count
ret
```

### A Comparison of C, Non-Loop Unrolling, and Loop Unrolling Implementations
#### `my_clz`
| | C | Non-Loop Unrolling Assembly | Loop Unrolling Assembly |
| - | - | - | - |
| Cycles | 7469 | 946 | 928 |
| Instrs. retired | 5603 | 736 | 726 |
| CPI | 1.33 | 1.29 | 1.28 |
| IPC | 0.75 | 0.778 | 0.782 |
| Clock rate | 6.48 KHz | 59.17 Hz | 1.33 KHz |
---
## [Leetcode 268. Missing Number](https://leetcode.com/problems/missing-number/)
### Problem
Given an array nums containing n distinct numbers in the range [0, n], return the only number in the range that is missing from the array.
#### Example 1:
Input: nums = [3,0,1]
Output: 2
Explanation: n = 3 since there are 3 numbers, so all numbers are in the range [0,3]. 2 is the missing number in the range since it does not appear in nums.
#### Example 2:
Input: nums = [0,1]
Output: 2
Explanation: n = 2 since there are 2 numbers, so all numbers are in the range [0,2]. 2 is the missing number in the range since it does not appear in nums.
#### Example 3:
Input: nums = [9,6,4,2,3,5,7,0,1]
Output: 8
Explanation: n = 9 since there are 9 numbers, so all numbers are in the range [0,9]. 8 is the missing number in the range since it does not appear in nums.
### C Program
```c
#include <stdio.h>
int missing_number(int* nums, int n) {
int mask = 0, i = 0;
for (i = 0; i < n; i++) {
mask = mask ^ i ^ nums[i];
}
return mask ^ i;
}
int main() {
int nums0[] = {3, 0, 1};
int nums1[] = {0, 1};
int nums2[] = {9, 6, 4, 2, 3, 5, 7, 0, 1};
printf("%d\n", missing_number(nums0, sizeof(nums0) / sizeof(nums0[0]))); // Expected missing number is 2
printf("%d\n", missing_number(nums1, sizeof(nums1) / sizeof(nums1[0]))); // Expected missing number is 2
printf("%d\n", missing_number(nums2, sizeof(nums2) / sizeof(nums2[0]))); // Expected missing number is 8
return 0;
}
```

:::danger
You should automate the testing procedures as much as possible, meaning there's no need to manually check each program output. Instead, implement internal validation to handle the result checking.
:::
### Assembly Program
```c
.data
nums0:
.word 3, 0, 1
nums1:
.word 0, 1
nums2:
.word 9, 6, 4, 2, 3, 5, 7, 0, 1
len_nums0:
.word 3
len_nums1:
.word 2
len_nums2:
.word 9
newline:
.asciz "\n"
.text
.globl main
main:
# missing number of num0
la a0, nums0
lw a1, len_nums0
jal ra, missing_number
li a7, 1 # System call code for print integer
ecall
la a0, newline
li a7, 4 # System call code for print string
ecall
# missing number of num1
la a0, nums1
lw a1, len_nums1
jal ra, missing_number
li a7, 1
ecall
la a0, newline
li a7, 4
ecall
# missing number of num2
la a0, nums2
lw a1, len_nums2
jal ra, missing_number
li a7, 1
ecall
la a0, newline
li a7, 4
ecall
end_main:
li a7, 10 # System call code for exit
ecall
missing_number:
li t0, 0 # mask = 0
li t1, 0 # i = 0
loop_missing_number:
beq t1, a1, end_missing_number # if (i == n), exit loop
lw t2, 0(a0) # Load nums[i]
xor t0, t0, t1 # mask = mask ^ i
xor t0, t0, t2 # mask = mask ^ nums[i]
addi a0, a0, 4 # Next element
addi t1, t1, 1 # i++
j loop_missing_number
end_missing_number:
xor a0, t0, t1 # mask ^ n
ret
```
:::danger
Use fewer instructions.
:::

### Summary of Registers Used
- `main` function:
- `a0`: Pointer to the current array (`nums1`, `nums2`, `nums3`).
- `a1`: Length of the current array (`len_nums1`, `len_nums2`, `len_nums3`).
- `a7`: System call register.
- `missing_number` function:
- `t0`: `mask`
- `t1`: `i`
- `t2`: `nums[i]`
### Loop Unrolling
- We perform loop unrolling in the `missing_number` function
```c
.data
nums0:
.word 3, 0, 1
nums1:
.word 0, 1
nums2:
.word 9, 6, 4, 2, 3, 5, 7, 0, 1
len_nums0:
.word 3
len_nums1:
.word 2
len_nums2:
.word 9
newline:
.asciz "\n"
.text
.globl main
main:
nums0_missing_number:
la a0, nums0
li t0, 0 # mask = 0
li t1, 0 # i = 0
lw t2, 0(a0) # Load nums0[0]
xor t0, t0, t1 # mask = mask ^ i
xor t0, t0, t2 # mask = mask ^ nums0[0]
addi a0, a0, 4 # Next element
addi t1, t1, 1 # i++
lw t2, 0(a0) # Load nums0[1]
xor t0, t0, t1 # mask = mask ^ i
xor t0, t0, t2 # mask = mask ^ nums0[1]
addi a0, a0, 4 # Next element
addi t1, t1, 1 # i++
lw t2, 0(a0) # Load nums0[2]
xor t0, t0, t1 # mask = mask ^ i
xor t0, t0, t2 # mask = mask ^ nums0[2]
addi t1, t1, 1 # i++
xor a0, t0, t1 # mask ^ n
li a7, 1 # System call code for print integer
ecall
la a0, newline
li a7, 4 # System call code for print string
ecall
nums1_missing_number:
la a0, nums1
li t0, 0 # mask = 0
li t1, 0 # i = 0
lw t2, 0(a0) # Load nums1[0]
xor t0, t0, t1 # mask = mask ^ i
xor t0, t0, t2 # mask = mask ^ nums1[0]
addi a0, a0, 4 # Next element
addi t1, t1, 1 # i++
lw t2, 0(a0) # Load nums1[1]
xor t0, t0, t1 # mask = mask ^ i
xor t0, t0, t2 # mask = mask ^ nums1[1]
addi t1, t1, 1 # i++
xor a0, t0, t1 # mask ^ n
li a7, 1
ecall
la a0, newline
li a7, 4
ecall
nums2_missing_number:
la a0, nums2
li t0, 0 # mask = 0
li t1, 0 # i = 0
lw t2, 0(a0) # Load nums2[0]
xor t0, t0, t1 # mask = mask ^ i
xor t0, t0, t2 # mask = mask ^ nums2[0]
addi a0, a0, 4 # Next element
addi t1, t1, 1 # i++
lw t2, 0(a0) # Load nums2[1]
xor t0, t0, t1 # mask = mask ^ i
xor t0, t0, t2 # mask = mask ^ nums2[1]
addi a0, a0, 4 # Next element
addi t1, t1, 1 # i++
lw t2, 0(a0) # Load nums2[2]
xor t0, t0, t1 # mask = mask ^ i
xor t0, t0, t2 # mask = mask ^ nums2[2]
addi a0, a0, 4 # Next element
addi t1, t1, 1 # i++
lw t2, 0(a0) # Load nums2[3]
xor t0, t0, t1 # mask = mask ^ i
xor t0, t0, t2 # mask = mask ^ nums2[3]
addi a0, a0, 4 # Next element
addi t1, t1, 1 # i++
lw t2, 0(a0) # Load nums2[4]
xor t0, t0, t1 # mask = mask ^ i
xor t0, t0, t2 # mask = mask ^ nums2[4]
addi a0, a0, 4 # Next element
addi t1, t1, 1 # i++
lw t2, 0(a0) # Load nums2[5]
xor t0, t0, t1 # mask = mask ^ i
xor t0, t0, t2 # mask = mask ^ nums2[5]
addi a0, a0, 4 # Next element
addi t1, t1, 1 # i++
lw t2, 0(a0) # Load nums2[6]
xor t0, t0, t1 # mask = mask ^ i
xor t0, t0, t2 # mask = mask ^ nums2[6]
addi a0, a0, 4 # Next element
addi t1, t1, 1 # i++
lw t2, 0(a0) # Load nums2[7]
xor t0, t0, t1 # mask = mask ^ i
xor t0, t0, t2 # mask = mask ^ nums2[7]
addi a0, a0, 4 # Next element
addi t1, t1, 1 # i++
lw t2, 0(a0) # Load nums2[8]
xor t0, t0, t1 # mask = mask ^ i
xor t0, t0, t2 # mask = mask ^ nums2[8]
addi t1, t1, 1 # i++
xor a0, t0, t1 # mask ^ n
li a7, 1
ecall
la a0, newline
li a7, 4
ecall
end_main:
li a7, 10 # System call code for exit
ecall
```

### A Comparison of C, Non-Loop Unrolling, and Loop Unrolling Implementations
| | C | Non-Loop Unrolling Assembly | Loop Unrolling Assembly |
| - | - | - | - |
| Cycles | 5112 | 212 | 120 |
| Instrs. retired | 3847 | 148 | 102 |
| CPI | 1.33 | 1.43 | 1.18 |
| IPC | 0.753 | 0.698 | 0.85 |
| Clock rate | 7.83 KHz | 27.25 Hz | 77.42 KHz |
---
## Hazard
1. **Data Hazards**:
- **RAW (Read After Write)**: An instruction reads data before a previous instruction writes to it.
- **Example**:
```c
add x1, x2, x3 # Writes to x1
sub x4, x1, x5 # Reads x1 before write completes
```
- **Solution**: Data forwarding, pipeline stalling.
- **WAR (Write After Read)**: An instruction writes data before a previous one reads it.
- **Example**:
```c
sub x1, x2, x3 # Reads x3
add x3, x4, x5 # Writes to x3 before the read occurs
```
- **Solution**: Reordering instructions, or register renaming.
- **WAW (Write After Write)**: Two instructions write to the same register in the wrong order, causing data loss.
- **Example**:
```c
add x1, x2, x3 # Writes to x1
sub x1, x4, x5 # Overwrites x1 before the first write completes
```
- **Solution**: Pipeline stall, register renaming.
2. **Control Hazards**:
- Caused by branch instructions that change the instruction flow. Mispredicting can delay execution.
- **Solution**: Branch prediction, delayed branching.
3. **Structural Hazards**:
- Occur when multiple instructions require the same hardware resource at the same time.
- **Solution**: Adding more resources (e.g., multiple ALUs), or using multiple pipelines.
## Analysis
### Ripes Simulator
We use [Ripes](https://github.com/mortbopet/Ripes), a graphical processor simulator and assembly editor for the RISC-V.
There are five stages in the Ripes pipeline.
* `IF`: Instruction Fetch
* `ID`: Instruction Decode & Register Read
* `EX`: Execution or Address Calculation
* `MEM`: Data Memory Access
* `WB`: Write Back

### Case 1: my_clz.s
- **Data Hazard**
```c
loop_my_clz:
li t5, 1 # t5 = 1
sll t5, t5, t3 # t5 = 1 << i
```
- There is a RAW hazard between the `li t5, 1` instruction and the following instructions (`sll`).

- Data forwarding ensures that the value of `t5` is immediately available to the sll instruction, without having to wait for it to propagate through the entire pipeline and write back to the register file.
- **Control Hazard**
- Before `jal x1 48 <my_clz>` execution


- After `jal x1 48 <my_clz>` execution


### Case 2: missing_number.s
- **Data Hazard**
```c
xor t0, t0, t1 # mask = mask ^ i
xor t0, t0, t2 # mask = mask ^ nums[i]
```

- There is a RAW hazard between two `xor` instruction.

- This example, like the one above in my_clz.s, uses data forwarding to avoid data hazards.
- **Control Hazard**
- Before `jal x1 124 <missing_number>` execution


- After `jal x1 124 <missing_number>` execution


- Due to branch instructions that alter the flow of execution, `nop(flush)` instructions are inserted here.