# Homework2: RISC-V Toolchain
###### tags: `2022 computer architecture`
[TOC]
## Original code [Contains Duplicate](https://leetcode.com/problems/contains-duplicate/)
* I choose the Problem **Contain Duplicate** from [莊集](https://hackmd.io/@y8jRQNyoRe6WG-qekloIlA/Sk0PXEDzj) since it contains complete heap sort implementation written in assembly code, and I want to compare the code size and execution speed with the the one with optimization.
* c code written from [莊集](https://hackmd.io/@y8jRQNyoRe6WG-qekloIlA/Sk0PXEDzj)
```cpp=
#include<stdio.h>
#include<stdbool.h>
void swap(int *x, int *y){
int temp = *x;
*x = *y;
*y = temp;
}
void heapify(int *nums, int i, int numsSize){
int currValue = nums[i];
int j = i * 2 + 1; // left child
int parent;
while (j < numsSize){
if (j < (numsSize - 1)){
if (nums[j] < nums[j + 1]){
j = j + 1;
}
}
if (currValue >= nums[j])
break;
else{
if (j % 2 == 0)
parent = j / 2 - 1;
else
parent = j / 2;
nums[parent] = nums[j];
j = j * 2 + 1;
}
}
if (j % 2 == 0)
parent = j / 2 - 1;
else
parent = j / 2;
nums[parent] = currValue;
}
void heapSort(int *nums, int numsSize){
for (int i = numsSize / 2 - 1; i >= 0; i--){
heapify(nums, i, numsSize);
}
for (int i = numsSize - 1; i >= 0; i--){
swap(&nums[0], &nums[i]);
heapify(nums, 0, i);
}
}
bool containsDuplicate(int* nums, int numsSize){
heapSort(nums, numsSize);
for (int i = 0; i < numsSize - 1; i++){
if (nums[i] == nums[i + 1])
return true;
}
return false;
}
int main(){
int question1[] = {1, 4, 5, 6, 2, 3};
bool answer1 = false;
printf("Question1 ");
if (answer1 == containsDuplicate(question1, (sizeof(question1)/sizeof(question1[0]))))
printf("Accepted\n");
else
printf("Failed\n");
int question2[] = {3, 2, 1, 4, 2, 1};
bool answer2 = true;
printf("Question2 ");
if (answer2 == containsDuplicate(question2, (sizeof(question2)/sizeof(question2[0]))))
printf("Accepted\n");
else
printf("Failed\n");
int question3[] = {-1, -2, 3, 4, -2, -1};
bool answer3 = true;
printf("Question3 ");
if (answer3 == containsDuplicate(question3, (sizeof(question3)/sizeof(question3[0]))))
printf("Accepted\n");
else
printf("Failed\n");
return 0;
}
```
I'm trying to execute the assembly code written by 莊集 via rv32emu, while it remains some problems to be solved.
```shell=
##con.S comes from the assembly code written from 莊集
$ riscv-none-elf-as -R -march=rv32i -mabi=ilp32 -o con.o con.S
$ riscv-none-elf-ld con.elf -T con.lds --oformat=elf32-littleriscv con.o
```

## C Code Compilation and Analysis
### -O0 Assembly Code
* [-O0 Assembly Code](https://hackmd.io/oRIaSeU-Ts-1u_kB6fGm_g#-O0-assembly-code)
* **observation**
* heaplify
* LOC: 103
* sx Registers : s0
* tx Registers : None
* ax Resgisters: a0, a1, a2, a3, a4, a5
* Others : sp
* Stack Used : 48
* Number of load/save: 48
* Branch Number: 11
* heapSort
* LOC: 49
* s registers : s0
* t registers : None
* a resgisters: a0, a1, a2, a3, a4, a5
* others : sp,ra
* stack used : 48
* Number of load/save: 25
* Branch Number: 8
* containDuplicate
* LOC: 39
* s registers : 21
* t registers : None
* a resgisters: a0, a1, a2, a3, a4, a5
* others : SP, RA
* stack used : 48
* Number of load/save: 21
* Branch Number: 6
### -O1 Assembly Code
* [-O1 Assembly Code](https://hackmd.io/oRIaSeU-Ts-1u_kB6fGm_g?view#-O1-Assembly-Code)
* **observation**
* heaplify
* LOC: 50
* sx Registers : None
* tx Registers : None
* ax Resgisters: a0, a1, a2, a3, a4, a5, a6
* Others : None
* Stack Used : 0
* Number of load/save: 4
* Branch Number: 9
* heapSort
* LOC: 44
* s registers : s0, s1, s2, s3
* t registers : None
* a resgisters: a0, a1, a2, a3, a4, a5
* others : sp,ra
* stack used : 32
* Number of load/save: 17
* Branch Number: 7
* containDuplicate
* LOC: 29
* s registers : s0, s1
* t registers : None
* a resgisters: a0, a1, a2, a3, a4, a5
* others : sp, ra
* stack used : 16
* Number of load/save: 13
* Branch Number: 7
### -O2 Assembly Code
* [-O2 Assembly Code](https://hackmd.io/oRIaSeU-Ts-1u_kB6fGm_g?view#-O2-Assembly-Code)
* **observation**
* heaplify
* LOC: 48
* sx Registers : None
* tx Registers : t1, t3, t4
* ax Resgisters: a0, a1, a2, a3, a4, a5, a6, a7
* Others : None
* Stack Used : 0
* Number of load/save: 6
* Branch Number: 7
* heapSort
* LOC: 44
* s registers : s0, s1, s2, s3
* t registers : None
* a resgisters: a0, a1, a2, a3, a4, a5
* others : sp,ra
* stack used : 32
* Number of load/save: 17
* Branch Number: 7
* containDuplicate
* LOC: 33
* s registers : s0, s1
* t registers : None
* a resgisters: a0, a1, a2, a3, a4, a5
* others : sp, ra
* stack used : 16
* Number of load/save: 13
* Branch Number: 7
### -O3 Assembly Code
* [-O3 Assembly Code](https://hackmd.io/oRIaSeU-Ts-1u_kB6fGm_g?view#-O3-Assembly-Code)
* **observation**
* heaplify
* LOC: 48
* sx Registers : None
* tx Registers : t1, t3, t4
* ax Resgisters: a0, a1, a2, a3, a4, a5, a6, a7
* Others : None
* Stack Used : 0
* Number of load/save: 6
* Branch Number: 8
* heapSort
* LOC: 95
* s registers : s0, s1, s2, s3
* t registers : t1, t3, t4
* a resgisters: a0, a1, a2, a3, a4, a5, a6, a7
* others : sp,ra
* stack used : 32
* Number of load/save: 31
* Branch Number: 19
* containDuplicate
* LOC: 33
* s registers : s0, s1
* t registers : None
* a resgisters: a0, a1, a2, a3, a4, a5
* others : sp, ra
* stack used : 16
* Number of load/save: 13
* Branch Number: 7
### -Os Assembly Code
* [-Os Assembly Code](https://hackmd.io/oRIaSeU-Ts-1u_kB6fGm_g?view#-Os-Assembly-Code)
* **observation**
* heaplify
* LOC: 41
* sx Registers : None
* tx Registers : None
* ax Resgisters: a0, a1, a2, a3, a4, a5, a6, a7
* Others : None
* Stack Used : 0
* Number of load/save: 6
* Branch Number: 7
* heapSort
* LOC: 39
* s registers : s0, s1, s2
* t registers : None
* a resgisters: a0, a1, a2, a3, a4, a5, a6, a7
* others : sp,ra
* stack used : 16
* Number of load/save: 13
* Branch Number: 7
* containDuplicate
* LOC: 26
* s registers : s0, s1
* t registers : None
* a resgisters: a0, a1, a2, a3, a4, a5
* others : sp, ra
* stack used : 16
* Number of load/save: 11
* Branch Number: 6
### -Ofast Assembly Code
* [-Ofast Assembly Code](https://hackmd.io/oRIaSeU-Ts-1u_kB6fGm_g?view#-O3-Assembly-Code)
* **observation**
* heaplify
* LOC: 48
* sx Registers : None
* tx Registers : t1, t3, t4
* ax Resgisters: a0, a1, a2, a3, a4, a5, a6, a7
* Others : None
* Stack Used : 0
* Number of load/save: 6
* Branch Number: 8
* heapSort
* LOC: 95
* s registers : s0, s1, s2, s3
* t registers : t1, t3, t4
* a resgisters: a0, a1, a2, a3, a4, a5, a6, a7
* others : sp,ra
* stack used : 32
* Number of load/save: 31
* Branch Number: 20
* containDuplicate
* LOC: 33
* s registers : s0, s1
* t registers : None
* a resgisters: a0, a1, a2, a3, a4, a5
* others : sp, ra
* stack used : 16
* Number of load/save: 12
* Branch Number: 7
### Summary
We could observe that in O3 and Ofast optimization choices, it uses the tx registers for construction in the **heap sort** stage ; although the size of the code(LOC) increase tremendously, the number of CSR cycle has decreased. In conclusion, in this example, if you mind the code size more, choose **-Os**; however, if the speed means a lot more to you, choose **-O3** or **-Ofast**.
| Subject | -O0 | -O1 | -O2 | -O3 | -Os | -Ofast |
|:----------------:|:--------:|:-----:|:-----:|:-----:| ----- | ---- |
| LOC | 191 | 123 | 125 | 176 | 106 | 176 |
| Used Registers | 9 | 13 | 17 | 17 | 13 | 17 |
| CSR cycle count | 6482 | 4862 |4940 | 4714 | 4984 | 4714 |
* result of riscv-none-elf-size
| Subject | -O0 | -O1 | -O2 | -O3 | -Os | -Ofast |
|:----------------:|:--------:|:-----:|:-----:|:-----:| ----- | ---- |
| text| 76204 | 75792 | 75792 | 75996 | 75616 | 75996 |
| data | 2816 | 2816 | 2816 | 2816 | 2816 | |
| bss | 812 | 812 | 812 | 812 | 812 | 812 |
| dec | 79832 | 79420 |79420 | 79624 | 79244 | 79624 |
| hex | 137d8 | 1363c |1363c | 13708 | 1358c | 13708 |
## Optimization of the assembly code
### Loop unrolling
* The original execution information

* After duplicate the **containsDuplicateWhile** loop for three times
```assembly=
containsDuplicateWhile:
bge s3, a1, containsDuplicateFalse # while j < numsSize then do the comparaion with nums[i] and nums[i + 1]
slli s4, s2, 2 # let s4 be offset of nums + 4 * i, which means nums[i]
add s4, a0, s4 # s4 = nums + 4 * i
lw s4, 0(s4) # s4 = nums[i]
slli s5, s3, 2 # let s4 be offset of nums + 4 * j, which means nums[j] == nums[i + 1]
add s5, a0, s5 # s4 = nums + 4 * i
lw s5, 0(s5) # s4 = nums[j] = nums[i + 1]
beq s4, s5, containsDuplicateTrue # True if there is identical elements sit together
addi s2, s2, 1 # i++
addi s3, s3, 1 # j++
j containsDuplicateWhile
```

* Unroll the **swayAndHeapifyWhile** for two times
```assembly=
swayAndHeapifyWhile:
bge s2, zero, swapAndHeapifySkip # while i >= 0 then do swap and heapify
lw ra, 0(sp) # restore ra
lw s0, 4(sp) # restore s0
lw s1, 8(sp) # restore s1
lw s2, 12(sp) # restore s2
lw s3, 16(sp) # restore s3
lw s4, 20(sp) # restore s4
addi sp, sp, 24
jr ra
swapAndHeapifySkip:
addi s3, s0, 0 # let s3 be 0 and get nums[0]
slli s4, s2, 2 # let s4 be i' and get nums[i']
add s4, s0, s4 # s4 = nums + 4 * i'
## call swap
mv a0, s3
mv a1, s4
jal swap # swap(&nums[0], &nums[i])
## call heapify
mv a0, s0
mv a1, s2
jal beforeHeapSort
addi s2, s2, -1 # i(s2)--
```

### Change the instruction order
```assembly=
## original
beq t3, zero, innerEvenAssign # else if (j % 2 == 1) then parent = j / 2
j innerOddAssign
innerEvenAssign:
srli t3, t4, 1 # let t3 be parent = j / 2 - 1
addi t3, t3, -1
j innerOddAssignSkip
innerOddAssign:
srli t3, t4, 1 # let t3 be parent = j / 2
innerOddAssignSkip:
slli t3, t3, 2 # t3 = parent * 4
```
```assembly=
##After switching the order of instruction
beq t3, zero, innerEvenAssign # else if (j % 2 == 1) then parent = j / 2
innerOddAssign:
srli t3, t4, 1 # let t3 be parent = j / 2
j innerOddAssignSkip
innerEvenAssign:
srli t3, t4, 1 # let t3 be parent = j / 2 - 1
addi t3, t3, -1
innerOddAssignSkip:
slli t3, t3, 2 # t3 = parent * 4
```
### Loop unrolling after modifying the instruction order
* unroll **heapifyWhile** loop
```assembly=
heapifyWhile:
bge t4, a2, assignCurrVal # while j < numsSize do adjust parent node
addi t3, a2, -1 # if j < (numsSize - 1) && nums[j] < nums[j + 1] then j++
slt t3, t4, t3 # if j < numsSize - 1 then keep doing
beq t3, zero, heapifySkip
addi s11, t4, 1 # let s11 be dereference of *(nums + (j + 1))
slli s11, s11, 2 # s11 = 4 * (j + 1)
add s11, a0, s11 # s11 = nums + 4 * (j + 1)
lw s11, 0(s11) # s11 = *(nums + 4(j + 1)) = nums[j + 1]
slli s10, t4, 2 # s10 = 4 * j
add s10, a0, s10 # s10 = (num + 4j)
lw s10, 0(s10) # s10 = *(num + 4j) = nums[j]
slt t3, s10, s11 # if nums[j] < nums[j + 1] ?
beq t3, zero, heapifySkip # if nums[j] < nums[j + 1] then j++
addi t4, t4, 1 # j++
heapifySkip:
slli s10, t4, 2 # let s10 = 4 * j
add s10, a0, s10 # s10 = (nums + 4j)
lw s10, 0(s10) # s10 = *(nums + 4j)
slt s10, t5, s10 # if (currValue >= nums[j]) then break
beq s10, zero, assignCurrVal
andi t3, t4, 1 # if (j % 2 == 0) then parent = j / 2 - 1
beq t3, zero, innerEvenAssign # else if (j % 2 == 1) then parent = j / 2
innerOddAssign:
srli t3, t4, 1 # let t3 be parent = j / 2
j innerOddAssignSkip
innerEvenAssign:
srli t3, t4, 1 # let t3 be parent = j / 2 - 1
addi t3, t3, -1
innerOddAssignSkip:
slli t3, t3, 2 # t3 = parent * 4
add t3, a0, t3 # t3 = (nums + 4 * parent)
slli s11, t4, 2 # let s11 = 4 * j
add s11, a0, s11 # s11 = nums + 4j
lw s11, 0(s11) # s11 = nums[j]
sw s11, 0(t3) # nums[parent] = nums[j]
slli t4, t4, 1 # j = j * 2 + 1
addi t4, t4, 1
j heapifyWhile
```

* unroll **printArrayWhile** for 2 times
```assembly=
printArrayWhile:
blt t0, a1, printArraySkip # while i < numsSize, then do print array
mv a0, s0 # restore nums head addr
mv a1, s1 # restore numsSize
lw ra, 0(sp) # restore ra
lw s0, 4(sp) # restore s0
lw s1, 8(sp) # restore s1
addi sp, sp, 12
jr ra # return if i !< numsSize
printArraySkip:
lw a0, 0(t1) # a0 = t1[i]
addi a7, zero, 1 # print number
ecall
addi t0, t0, 1 # i++
addi t1, t1, 4 # t1 += 4, means iterate to next element in int array
j printArrayWhile
```
