# 2022 Assignment1: RISC-V Assembly and Instruction Pipeline
###### tags: `Computer Architecture`
contribyted by <[`chiangkd`](https://github.com/chiangkd)>
**Requirements follow the assignment below:**
[Assignment1: RISC-V Assembly and Instruction Pipeline](https://hackmd.io/@sysprog/2022-arch-homework1)
## Introduction
The problem I selected is [Leetcode 82. Remove Duplicates from Sorted List II](https://leetcode.com/problems/remove-duplicates-from-sorted-list-ii/)
**Description**
>Given the `head` of a sorted linked list, *delete all nodes that have duplicate numbers, leaving only distinct numbers from the original list. Return the linked list **sorted** as well.*
**Example:**

## C program
### Function of problem
The function below is my solution of leetcode probelm.
```c
struct ListNode *deleteDuplicates(struct ListNode *head)
{
if (!head)
return NULL;
int head_dup = 0;
while (head->next && head->val == head->next->val) {
head_dup = 1;
head = remNodeandFree(head);
}
struct ListNode *c = head;
for (struct ListNode *n = head->next; n; n = n->next) {
if (n->next && n->val == n->next->val) {
struct ListNode *npt = n->next;
struct ListNode *freenode = n;
while (npt && n->val == npt->val) {
npt = remNodeandFree(npt);
n->next = npt;
if (npt && npt->next && npt->val == npt->next->val) {
n = npt;
npt = npt->next;
free(freenode);
}
}
n = remNodeandFree(n);
}
c->next = n;
c = n;
if (!n)
break;
}
if(head_dup)
head = remNodeandFree(head);
return head;
}
```
### Full program
In order to run the program on the local platform, implementing serval functions to make the program more readable.
```c
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#define LEN1 10
#define LEN2 5
#define LEN3 3
struct ListNode
{
int val;
struct ListNode *next;
};
struct ListNode* remNodeandFree(struct ListNode *node) // free node without re-link
{
struct ListNode *c = node;
node = node->next;
free(c);
return node;
}
/* iteration method */
struct ListNode *deleteDuplicates(struct ListNode *head)
{
if (!head)
return NULL;
int head_dup = 0;
while (head->next && head->val == head->next->val) {
head_dup = 1;
head = remNodeandFree(head);
}
struct ListNode *c = head;
for (struct ListNode *n = head->next; n; n = n->next) {
if (n->next && n->val == n->next->val) {
struct ListNode *npt = n->next;
struct ListNode *freenode = n;
while (npt && n->val == npt->val) {
npt = remNodeandFree(npt);
n->next = npt;
if (npt && npt->next && npt->val == npt->next->val) {
n = npt;
npt = npt->next;
free(freenode);
}
}
n = remNodeandFree(n);
}
c->next = n;
c = n;
if (!n)
break;
}
if(head_dup)
head = remNodeandFree(head);
return head;
}
struct ListNode* initial_list(int *arr, int arr_len)
{
struct ListNode *head = malloc(sizeof(struct ListNode));
head->val = arr[0];
struct ListNode *c = head;
for (int i = 0; i < arr_len - 1; i++) {
struct ListNode *next = malloc(sizeof(struct ListNode));
next->val = arr[i + 1];
c->next = next;
c = c->next;
}
c->next = NULL;
return head;
}
void print_list(struct ListNode *head)
{
while (head) {
printf("%d ", head->val);
head = head->next;
}
printf("\n");
}
void free_list(struct ListNode *head)
{
while (head) {
struct ListNode *freenode = head;
head = head->next;
free(freenode);
}
return;
}
void runTestcase(struct ListNode* testcase)
{
printf("Before delete: \t");
print_list(testcase);
testcase = deleteDuplicates(testcase);
printf("After delete: \t");
print_list(testcase);
free_list(testcase);
}
int main()
{
int arr1[LEN1] = {1, 2, 3, 3, 4, 4, 5, 6, 6, 7};
int arr2[LEN2] = {1, 1, 1, 2, 3};
int arr3[LEN3] = {1, 2, 2};
struct ListNode *testcase1 = initial_list(arr1, LEN1);
struct ListNode *testcase2 = initial_list(arr2, LEN2);
struct ListNode *testcase3 = initial_list(arr3, LEN3);
runTestcase(testcase1);
runTestcase(testcase2);
runTestcase(testcase3);
return 0;
}
```
- `ListNode` is the structure definition for singly-linked list.
- `remNodeandFree` remove one node and free the node, but this function doesn't contain the re-link before and after the removed node.
- `initial_list` give an array and the length of the array, it will return a head of the linked-list with each element in the array in order.
- `print_list` print the entire list.
- `free_list` free the entire list.
- `runTestcase` include serval functions mentioned above and give a demonstration of the program.
## Assembly code
```
.data
len1: .word 10
len2: .word 5
len3: .word 14
test1: .word 1 1 1 2 3 4 4 5 6 6
test2: .word 2 2 2 3 5
test3: .word 1 2 2 2 2 2 5 7 7 7 7 7 7 8
bef_msg: .string "Before delete:\n "
aft_msg: .string "After delete:\n "
divider: .string "==================\n "
.text
main:
la a0, divider
li a7, 4
ecall
################# test case 1 #################
la a0, test1 # a0 = arr
lw a1, len1 # a1 = arr_len
jal ra, initialize_list # a0 = return value = struct ListNode head
jal ra, runtestcase
la a0, divider
li a7, 4
ecall
################# test case 2 #################
la a0, test2 # a0 = arr
lw a1, len2 # a1 = arr_len
jal ra, initialize_list
jal ra, runtestcase
la a0, divider
li a7, 4
ecall
################# test case 3 #################
la a0, test3 # a0 = arr
lw a1, len3 # a1 = arr_len
jal ra, initialize_list
jal ra, runtestcase
###############################################
la a0, divider
li a7, 4
ecall
j exit
runtestcase:
# prologue
addi sp, sp, -8
sw ra, 0(sp)
sw a0, 4(sp)
# print messsage
# a0 need to handle the print msg for printf system call,
# so just temporary mv list head to t1
mv t1, a0 # t1 = list head
la a0, bef_msg
li a7, 4
ecall
# print end
jal ra, print_list
la a0, aft_msg
li a7, 4
ecall
lw t1, 4(sp) # t1 = list head
jal ra, deleteDuplicates
mv t1, s2 # t1 = list head
jal ra, print_list
# epilogue
lw ra, 0(sp) # back to main
lw a0, 4(sp)
addi sp, sp, 8
jr ra
initialize_list:
# prologue
addi sp, sp, -8 # push stack pointer to the bottom
sw ra, 0(sp)
# body
addi t0, a1, -1 # t0 = arr_len - 1
mv t1, a0 # t1 = arr
lw t2, 0(t1) # load array value
li s0, 0x20000000 # s0 will handle the address of each list node
sw s0, 4(sp) # store list head to stack
jal ra malloc # struct ListNode *head = malloc
sw t2, 0(s0) # head->val = arr[0];
mv t3, s0 # struct ListNode *c = head;
addi s0, s0, 8
li s1, 0 # i == 0
loop:
jal ra malloc # struct ListNode *next = malloc
addi t1, t1, 4 # push array by 4 (int)
lw t2, 0(t1) # arr[i+1]
sw t2, 0(s0) # next->val = arr[i+1]
sw s0, 4(t3) # c -> next = next
addi t3, t3, 8 # push heap 8 byte
addi s1, s1, 1 # i++
addi s0,s0, 8 # push head,4 bytes for integer, 4 bytes for address
bne s1, t0 loop # for loop condition
#addi s0,s0,4 # End position
sw x0, 4(t3)
lw ra, 0(sp) # load return address
lw a0, 4(sp) # load list head address
addi sp, sp, 8
jr ra
print_list:
lw a0, 0(t1) # load value to a0
li a7, 1 # system call: print int
ecall
li a0, 9 # tab
li a7, 11 # print char
ecall
lw t1, 4(t1) # t1 = t1->next
bne t1, x0 print_list
li a0, 10 # next line
li a7, 11 # print char
ecall
jr ra
deleteDuplicates:
addi sp, sp, -4
sw ra 0(sp) # stored return address
beq t1, x0, ifnohead # if(!head), t1 = list head
li t6, 0 #head_dup = 0
lw t2, 4(t1) # node->next
# while (head->next && head ->val == head->next->val) {
bne t2,x0 nnexist # node->next exist
remdupnode:
lw t2 4(t1) # t2 = n = head -> next
mv a0, t1 # a0 = c = head
# Now head duplicate is removed
# for(struct ListNode *n = head-> next; n; n = n->next)
# now use s2 - s11 to handle remain work
mv s2, t1 # struct ListNode *c = head
lw s3, 4(s2) # s3 = n = head->next
###############################################
# s2 = list head #
# s3 = c #
# s4 = n #
# s5 = npt #
# use t register to handle all the condition #
###############################################
mv s3, s2 # s3 = c = head
lw t1, 4(s2) # n = head->next
mv s4, t1
forloop:
lw t2 4(s4) # t2 = n->next
bne t2, x0, if1cond1 # n->next exist
if1end:
sw s4, 4(s3) # c->next = n
mv s3, s4 # c = n
lw s4, 4(s4) # n = n->next
bne s4, x0, forloop # n exist
j forloopexit
if1cond1:
lw t3, 0(s4) # n->val
lw t4, 0(t2) # n->next->val
beq t3, t4, if1cond2
j if1end
if1cond2:
lw s5, 4(s4) # npt = n->next
whilecond1:
bne s5, x0, whilecond1pass # npt exist
whilenot:
lw s4 4(s4) # n = n->next
j if1end
whilecond1pass:
lw t2, 0(s4) # n->val
lw t3, 0(s5) # npt->val
beq t2, t3, whilecond2
j whilenot
whilecond2:
lw s5, 4(s5) # npt = npt->next
sw s5, 4(s4) # n->next = npt
if2cond1:
bne s5, x0, if2cond1pass
j whilecond1
if2cond1pass:
lw t2, 4(s5) # t2 = npt->next
bne t2, x0, if2cond2
j whilecond1
if2cond2:
lw t3, 0(s5) # t3 = npt->val
lw t4, 0(t2) # t4 = npt->next->val
beq t3, t4 if2cond3
j whilecond1
if2cond3:
mv s4, s5 # n = npt
lw s5, 4(s5) # npt = npt->next
j whilecond1
forloopexit:
bne t6, x0, remhead
enddup:
# epilogue
lw ra 0(sp)
addi sp, sp, 4
jr ra
remhead:
lw s2 4(s2)
j enddup
nnexist:
lw t3, 0(t1) # node->val
lw t4, 0(t2) # node->next->val
beq t3, t4 remnode # node->val == node->next->val
j remdupnode
remnode:
# t2 = removed node
li t6, 1 # head_dup = 1
mv t3, t2 # struct ListNode *c = node
lw t2, 4(t2) # node = node->next
sw t2, 4(t1) # store to heap
bne t2, x0, nnexist # node exist
j remdupnode
ifnohead:
jr ra # return NULL
malloc:
li a7, 214 # brk system call
ecall
jr ra
exit:
li a7, 10 # end the program
ecall
```
:::warning
**TODO:** Try to optimize the code.
:::
**Output**

### Execution info

### Explanation
First of all, I defined three different testcases(array) which is predefined in data section (included array head and array length), and then called the `initialize_list` function associate with input arrgument (`int *arr` and `int arr_len`). the function will return list head.
In `initialize_list` function, I defined my list head which is construct with input array value at the address at `0x20000000`, the value how I decided this value is about [memory layout](https://www.geeksforgeeks.org/memory-layout-of-c-program/), the **heap section** is much more above than data section, In Ripes, data section is start from `0x10000000`, so I choose `0x20000000` to prevent from section overlapping.

After calling `initialize_list`, the value in testcase(aarray), will be stored in heap(which I defined it at `0x20000000`), associate with its next point address in order. Take `testcase1` for example, which value is `[1 1 1 2 3 4 4 5 6 6]` with `arr_len` = 10
**ListNode structure define**
```c
struct ListNode
{
int val;
struct ListNode *next;
};
```
As the structure defined above, each structure will consume 8 bytes (2 words, 4 bytes for integer value, 4 bytes for next point address) memory, so I just simply align it at specific memory address in order.

- the start value `1` is stored at `0x20000000`, and the next for byte (`0x20000004`) will stored the information about the next node's address, which value is `0x20000008`, do it until go through entire array, and put `NULL` pointer at the end of the list(at `0x2000004c`).
After constructing the list, given the list head information in `runtestcase` function, which has same definition in C program, it will return the list head with "deleted duplicated" list.
In Ripes, I didn't implement `free` function. I just overwrite the value in same address with terminate condition (NULL pointer) when I ran antother testcase after running a testcase.
### Usefull System calls defined in Ripes
| Func. (a7) | Name | Description |
| ---------- | ----------- | --------------------------------------------------------- |
| 1 | PrintInt | integer to print |
| 4 | PrintString | address of the string |
| 214 | brk | sets the end of the data segment to the specified address |
## Analysis the pipelineing
- Pipeline improves performance by exploiting instruction level parallelism
- 5-stage pipeline for RISC-V: **IF, ID, EX, MEM, WB**
- Executes multiple instruction in parallel
- Each instruction has the same latency

>CS61C Lecture 14
The above is the 5-stage pipeline RISC-V datapath, The simply implementation is just put some register in between all the stages, these just hold values which allow us to on a clock tick control when things will be allowed to move on into the next stage. You can see similar structure showed in Ripes.

Each stages represent:
- IF (Instruction Fetch)
- 32-bit instruction is fetched from the instruction memory.
- PC is also read out of instruction memory, at the same time a caclulation is done to determin the next PC (plus 4 or take the result of a branch/jump)
- ID (Instruction Decode)
- Decode the instruction and according to the instruction(opcode), decide to use the immediate or not.
- EX (Execution)
- Actual computation occurs, consists of an ALU
- MEM (Memory Access)
- Read / Write to memory
- WB (Write back)
- Write the result(which result will write back is decided by MUX) back.

>[Wikipedia](https://en.wikipedia.org/wiki/Classic_RISC_pipeline)
There are many MUX to control each unit's behavior, it about the different control signal when input different type of instruction(I, R, S, SB, etc.)

>CS61C Lecture 13

>CS61C Lecture 13
The picture above shows the instructions correspond to different sequence of control signal.
### Explain the code in pipeline

>CS61C Lecture 08
Take the code on the top of `main` function, simply explain the procedure about **printf** behavior
```
main:
la a0, divider
li a7, 4
ecall
```
The code above will output a divid line(`==================`), which is predefined in data section
```
00000000 <main>:
0: 10000517 auipc x10 0x10000
4: 0a150513 addi x10 x10 161
8: 00400893 addi x17 x0 4
c: 00000073 ecall
```
The pseudo instruction `la` will be be translate into `auipc`(U-format) and `addi` (I-format)
1.`auipc` at **IF** stage

- The address of instruction `auipc` `0x00000000` is the output of PC unit and will be sent into instruction(Instr.) memory, and the output of Instr. memory is `0x10000517`, which is `0b0001 0000 0000 0000 0000 0101 0001 0111` in binary, from the definition of bit field we know that:
- imm\[31:12\] = `0001 0000 0000 0000 0000`, which is the base address of data section
- rd = `01010`, which represent 10 in decimal
- opcode = `0010111`, which represent **U-type**, [Greencard](https://inst.eecs.berkeley.edu/~cs61c/fa18/img/riscvcard.pdf)
2. `addi` at **IF** stage `auipc` at **ID** stage

- At **ID** stage (`auipc`)
- Decode output **AUIPC** and **0x0a**(10 in decimal) which is correspond to above discussion.
- Notice that the **IDEX** register output `0xdeadbeef`, which is originally used to mark newly allocated areas of memory that had not yet been initialized, which is called [Magic number](https://en.wikipedia.org/wiki/Magic_number_(programming))
- At **IF** stage (`addi`)
- PC output `0x00000004` for the address of instruction `addi`
- Instr. memory output `0x0a150513`, `0b0000 1010 0001 0101 0000 0101 0001 0011` in binary, and for I-type bit field definition, we know that:
- imm\[11:0\] = `0000 1010 0001` = 161, immediate, which is the offset of the "diveder" address compared to base address (`0x10000000`)
- rs1 = `01010` = 10, source register
- funct3 = `000`, associate with opcodem, represent `addi`
- rd = `01010` = 10, represent destination register
- opcode = `0010011`, which represent **I-type**
3. `auipc` at **EX** stage, `addi` at **ID** stage

- At **EX** stage (`auipc`)
- From the control signal mention above, `auipc` generate
- **ASel:** PC
- **BSel:** Imm
- **ALUSel:** Add
- So the ALU output is **PC**(`0x00000000`) **add** **Imm**(`0x10000000`) = `0x10000000`
- At **ID** stage (`addi`)
- Decode output **ADDI** and `0x0a` for destination register.
- Notice that Decode output to RegFile is `0x0a` and `0x01`, which is inst\[19:15\] and inst\[24:20\] respectively, but it is **not use**
4. `auipc` at **MEM** stage, `addi` at **EX** stage

- At **MEM** stage (`auipc`)
- **EX/MEM** output `0x1000000` send back to ALU input which is called **forwarding** in order to resolve **Data Hazard**
- At **EX** stage (`addi`)
- The ALU add immediate(`0x000000a1`) and base address(`0x10000000`) which is caculated by ALU in previous cycle, called **forwarding**.
- ALU output become `0x100000a1`
5. `auipc` at **WB** stage, `addi` at **MEM** stage

- At **WB** stage (`auipc`)
- From the control signal mentioned above, **WBSel** is chosen to **ALU**, and **WrEn** will set to 1
- At **MEM** stage (`addi`)
- nothing happen, just pass the result `0x100000a1` into next stage.
6. `auipc` already write back, `addi` at **WB** stage

- After **WB** stage (`auipc`)
- the `a0` register is now `0x10000000`
- At **WB** stage (`addi`)
- same as `auipc`, **WBSel** is selected to **ALU**, and **WrEn** is set to 1.
7. `addi` already write back.
- Now `a0` is `0x100000a1`, which is the address of "**divider**"
`li a7, 4` has similar procedure with `addi x10 x10 161` because `li` is an pseudo instruction that can simply interpret to `addi`, which is `addi x17, x0, 4`
And for `ecall`, at **IF** and **ID** stage is nothing special, same as above, fetch the instruction from **PC**, decode the instruction to let the machine know that it is `ecall`.
But at the **EX** stage, `ecall` need to wait `a0` and `a7` **write back**, for it will utilize these two registers to do the system call. So there are two **NOP** implement for **stall**, just wait the information write back.

## Pipeline Hazards
### Structural Hazards
The structural hazards mean the required resource is busy, means the resource needed in multiple stages, it occurs when **two or more instruction in the pipeline compete for access to a single physical resource**
There're several solutions:
1. Taking turns using resource, means that some instructions have to **stall**
2. Simply add more hardware to machine to handle more instruction, (simple and intuitive, but it will raise the cost and the hardware complexity)
3. Double pumping, means that do something on the **rising edge** and do something on the **falling edge** rather just do one thing in single cycle.
The most common structural hazards in RISC-V is about memory access, take the example which is mentioned in [CS61C Lecture 14](https://inst.eecs.berkeley.edu/~cs61c/su20/)

>CS61C Lecture 14
The figure above shows that memory unit is used in both **IF** and **MEM** stages about two different instruction. If double pumping is implemented, that's fine. But if not, there must add other hardware to handle it or just stallation.
### Data Hazards
Data Hazards is that the next instruction need the value depends on the previous calculation, if just access it before the previous instruction finish **Write Back** stage, that will cause data hazards.
- RAW (Read after Write)
- WAR (Write after Read)
- WAW (Write after Write)
And it may cause [race condition](https://en.wikipedia.org/wiki/Race_condition).
There is an example I have mentioned above in my pseudo instruction.
```
auipc x0, 0x10000
addi x10, x10, 161
```
Before `auipc` **write back** to the `x0`, `addi` need `x10` value to caculate the value about `x10 + 161`, and it will need **stall** if there isn't any technique to handle the problem.
However, as mentioned before, **forwarding** is an implementation in RISC-V, **forwarding** means that **forward** the result as soon as it available, even though it's not stored in RegFile yet.

It needs additional datapath to send the ALU output directly to the begin of this stage, that's, be the **next EX stage input**, so actually the value isn't stored in RegFile yet, just simply used the value after the calculation.
### `ecall` analysis
```
addi x17 x0 4
ecall
```
There is an unavoidable stall occurs in `ecall` in Ripes, the `ecall` instruction need `a0` stand for the `printf` command, and `a7` stand for `printf` function index. `ecall` does nothing in **EX** stage, all this instruction is access the `a0` and `a7` to know how system call need to do. So `ecall` need the previous instruction **WB** to the RegFile.

If `a0` and `a7` are not use for next two instructions, this problem can be solved by **code scheduling**, means insert two independent isntruction between `addi` and `ecall`, which don't influence the `ecall` and associated registers.
```
addi x17 x0 4
NOP # something unrelated instruction
NOP # something unrelated instruction
ecall
```
### Control Hazards
Talking about Branch instruction (`beq, bne, ...`), the branch outcome will determine the next instruction **Fetch**, that is, need to wait the branch outcome to ensure fetch the correct instrcution (PC + 4 or jump to some where), **stall** is an intuitive solution.
**Kill Instruction after Branch if <font color=#ff00>Taken</font>**

>CS61C Lecture 14
The picture shows that if branch is **not be taken**, `sub` and `or` are **IF** and **ID** in pipeline, but if branch **taken**, it PC will jump to label and the `sub` and `or` will convert to `NOP`, two instructions are affected by an incorrect branch.

>CS61C Lecture 14
There is another way to sovlve this problem - **Branch Prediction**
Branch Prediction is actually implemented in the normal RISC-V pipeline. In the correct case, there's not stall or `NOP`, if prediction is incorrect, discard all the instruction which is before the correct branch outcom. Actually, it is better on **average than stalling**
Take my code in `initial_list`, there is a conditional branch `bne s1, t0 loop` to judge the for loop keep going or break out.
```
initialize_list:
# prologue
addi sp, sp, -8 # push stack pointer to the bottom
sw ra, 0(sp)
# body
addi t0, a1, -1 # t0 = arr_len - 1
mv t1, a0 # t1 = arr
lw t2, 0(t1) # load array value
li s0, 0x20000000 # s0 will handle the address of each list node
sw s0, 4(sp) # store list head to stack
jal ra malloc # struct ListNode *head = malloc
sw t2, 0(s0) # head->val = arr[0];
mv t3, s0 # struct ListNode *c = head;
addi s0, s0, 8
li s1, 0 # i == 0
loop:
jal ra malloc # struct ListNode *next = malloc
addi t1, t1, 4 # push array by 4 (int)
lw t2, 0(t1) # arr[i+1]
sw t2, 0(s0) # next->val = arr[i+1]
sw s0, 4(t3) # c -> next = next
addi t3, t3, 8 # push heap 8 byte
addi s1, s1, 1 # i++
addi s0,s0, 8 # push head,4 bytes for integer, 4 bytes for address
bne s1, t0 loop # for loop condition
#addi s0,s0,4 # End position
sw x0, 4(t3)
lw ra, 0(sp) # load return address
lw a0, 4(sp) # load list head address
addi sp, sp, 8
jr ra
```

focusing on the pipeline that `bne` is on the **EX** stage, now the correct branch outcome is not generated yet, the the next 2 instruction `sw x0, 4(t3)` and `lw ra, 0(sp)` is on the **ID** and **IF** stage respectively.

Unfortunetly, the `bne` outcome is **True** that PC jump back to the label `loop`, and the next 2 instruction which has been fetched and decoded are **get flushed (convert to NOP)**

## Reference Link
[CS61C Discussion 3 – RISC-V](https://inst.eecs.berkeley.edu/~cs61c/fa17/disc/3/Disc3_Sol.pdf)
[RISC-V - Iterate through a linked list and apply a function to each node](https://stackoverflow.com/questions/68263927/risc-v-iterate-through-a-linked-list-and-apply-a-function-to-each-node)
[RISC-V Assembly Programmer's Manual](https://github.com/riscv-non-isa/riscv-asm-manual/blob/master/riscv-asm.md)