# Assignment1: RISC-V Assembly and Instruction Pipeline
<!-- contributed by < [`kaeteyaruyo`](https://github.com/kaeteyaruyo) > -->
###### tags: `RISC-V` `computer architecture 2022`
## Add Two Numbers
You are given two non-empty linked lists representing two non-negative integers. The digits are stored in reverse order, and each of their nodes contains a single digit. Add the two numbers and return the sum as a linked list.
You may assume the two numbers do not contain any leading zero, except the number 0 itself.
## Implementation
You can find the source code [here](https://github.com/Haser0305/ComputerArchitecture/tree/main/Homework/hw1). Feel free to fork and modify it.
### CPP code
I add these 2 lists to current carry. If 1 or both of these 2 list is empty, it will create a new node to keep doing the while loop until both lists are at the end.
```clike=
#include "stdio.h"
struct list_node {
int val;
list_node* next;
list_node() : val(0), next(nullptr) {
}
list_node(int x) : val(x), next(nullptr) {
}
list_node(int x, list_node* next) : val(x), next(next) {
}
};
list_node* add_two_numbers(list_node* l1, list_node* l2) {
list_node* head = l1;
int carry = 0;
int temp;
while (true) {
temp = l1->val;
l1->val = (l1->val + l2->val + carry) % 10;
carry = (temp + l2->val + carry) / 10;
if (l1->next == nullptr && l2->next != nullptr) {
l1->next = new list_node();
} else if (l1->next != nullptr && l2->next == nullptr) {
l2->next = new list_node();
}
if (l1->next == nullptr && l2->next == nullptr) {
break;
} else {
l1 = l1->next;
l2 = l2->next;
}
}
if (carry != 0) {
l1 = new list_node(1);
}
return head;
}
```
### Assembly code
#### Test data
##### Test data 1
> l1 : 8, 2, 3
> l2 : 3, 2, 6, 9
##### Test data 2
> l1 : 1, 2, 3, 4
> l2 : 3, 2, 1
##### Test data 3
> l1 : 1, 2, 3
> l2 : 3, 2, 1, 1
```clike=
.data
newline: .string "\n"
.text
init:
# the following around 140 lines just initial data
# test data 1
mv t0, gp
li t1, 8
li t2, 2
li t3, 4
sw t1, 0(t0)
addi t0, t0, 8 # move to next node base address
sw t0, -4(t0) # sw current addr to the previouse pointer
sw t2, 0(t0) # sw the new value to current node
# follow above again
addi t0, t0, 8
sw t0, -4(t0)
sw t3, 0(t0)
# if need more test data, just repeat above 3 line of code, and replace the temp register
# create l2 linked list
li t1, 3
li t2, 2
li t3, 6
li t4, 9
addi t0, t0, 8
sw zero, -4(t0) # previouse node is last one, so point to null.
sw t1, 0(t0)
addi t0, t0, 8
sw t0, -4(t0)
sw t2, 0(t0)
addi t0, t0, 8
sw t0, -4(t0)
sw t3, 0(t0)
addi t0, t0, 8
sw t0, -4(t0)
sw t4, 0(t0)
addi t0, t0, 8
sw zero, -4(t0) # put zero at the last node pointer
jal ra, main
la a0, newline
li a7, 4
ecall
# test data 2
mv t0, gp
li t1, 1
li t2, 2
li t3, 3
li t4, 4
sw t1, 0(t0)
addi t0, t0, 8
sw t0, -4(t0)
sw t2, 0(t0)
addi t0, t0, 8
sw t0, -4(t0)
sw t3, 0(t0)
addi t0, t0, 8
sw t0, -4(t0)
sw t4, 0(t0)
li t1, 3
li t2, 2
li t3, 1
addi t0, t0, 8
sw zero, -4(t0)
sw t1, 0(t0)
addi t0, t0, 8
sw t0, -4(t0)
sw t2, 0(t0)
addi t0, t0, 8
sw t0, -4(t0)
sw t3, 0(t0)
addi t0, t0, 8
sw zero, -4(t0)
jal ra, main
la a0, newline
li a7, 4
ecall
# test data 3
mv t0, gp
li t1, 1
li t2, 2
li t3, 3
sw t1, 0(t0)
addi t0, t0, 8
sw t0, -4(t0)
sw t2, 0(t0)
addi t0, t0, 8
sw t0, -4(t0)
sw t3, 0(t0)
li t1, 3
li t2, 2
li t3, 1
li t4, 1
addi t0, t0, 8
sw zero, -4(t0)
sw t1, 0(t0)
addi t0, t0, 8
sw t0, -4(t0)
sw t2, 0(t0)
addi t0, t0, 8
sw t0, -4(t0)
sw t3, 0(t0)
addi t0, t0, 8
sw t0, -4(t0)
sw t4, 0(t0)
addi t0, t0, 8
sw zero, -4(t0)
jal ra, main
j exit
main:
mv s3, gp # s3 = l1
mv s5, ra # store ra
jal ra, count_l1_len # update s2 to the len of l1
mv ra, s5 # recover ra
slli t0, s2, 3 # *8 to get the addr of l2
add s4, gp, t0 # s4 => base addr of l2
li s1, 0 # s1 = carry
mv t3, s3 # do not modify s3, so use t3 in the future
mv t4, s4 # as above
while:
lw t0, 0(t3) # laod l1 node
lw t1, 0(t4) # load l2 node
add t0, t0, t1 # l1->val += l2->val
add t0, t0, s1 # li->val += carry
li s1, 0 # carry is used, so update to 0
li t2, 10 # t2 in while is for checking the sum is greather then 10 or not
blt t0, t2, no_carry # no carry, skip 2 following instructions
addi s1, s1, 1 # has carry, so s1 should be set 1
addi t0, t0, -10 # update to the units digit
no_carry:
sw t0, 0(t3) # store the result in the node
lw t0, 4(t3) # load l1 next node addr
lw t1, 4(t4) # load l2 next node addr
beq zero, t0, l1Null # if the pointer is zero, this is the last node in l1
beq zero, t1, l2Null # same
addi t3, t3, 8 # both list have next node, so put the addr of next node in t3
addi t4, t4, 8 # same
j while
createNode:
# a0 is an input value. put the address of this node in a0
add t0, gp, s0
sw a0, 0(t0)
add a0, gp, s0
addi s0, s0, 8
jr ra
l1Null:
beq zero, t1, noMoreNode # l2 is also null, just need to check the carry
# l2 is not null
sw t1, 4(t3) # link last node in l1 to next l2 node
mv a0, t1 # need to check whether the following nodes need to add carry or create a new node
bgt s1, zero, checkCarry
mv a1, s3
j printAll
l2Null:
beq zero, t0, noMoreNode # l1 is also null
# l1 is not null
mv a0, t0 # l2 does not need to do anything more. just check l1
bgt s1, zero, checkCarry
mv a1, s3
j printAll
checkCarry:
# a0 is the base address that begin to check
lw t0, 0(a0)
li t1, 10 # t1 = 10
checkCarryLoop:
add t0, t0, s1 # value + carry
blt t0, t1, exitCheckCarry # no carry anymore
addi t0, t0, -10 # remove the unnecessary digit
sw t0, 0(a0) # store the result
lw t2, 4(a0) # read the next node addr
beqz t2, exitCheckCarry # if this is the last node, go to exitCheckCarry
mv a0, t2
j checkCarryLoop
exitCheckCarry:
bnez s1, createNodeForCarry # if the carry is not 0, create a new node for it
sw t0, 0(a0) # store the last result in the latest node
mv a1, gp # print all values begin at l1
j printAll
createNodeForCarry:
addi t2, a0, 8
sw t2, 4(a0)
sw zero, 4(t2)
sw s1, 0(t2)
mv a1, gp # prepare an arg for printAll
j printAll
noMoreNode:
# if both list do not have next node. just make sure to create a new node for carry
mv a1, gp
mv a0, s1
bne s1, zero, createNode # has carry
j printAll
count_l1_len:
# no input, update s2
li s2, 0
mv t1, s3
addi t1, t1, 4
lw, t0, 0(t1) # t0 = nextNodeAddress
cllLoop:
beq t0, zero, exitCllLoop
addi s2, s2, 1
addi t1, t1, 8
lw t0, 0(t1) # t0 = nextNodeAddress
j cllLoop
exitCllLoop:
addi s2, s2, 1
jr ra
printAll:
# a1 is the base address
li a7, 1 # set the system call to printInt
lw a0, 0(a1) # the system call will print int in a0
ecall
li a7, 4 # just like above
la a0, newline
ecall
lw t1, 4(a1) # check l1 is at the end or not
beq t1, zero, exitPrintAll
lw a1, 4(a1)
j printAll
exitPrintAll:
jr ra
exit:
li a7, 10
ecall
```
## Machine Code
```
00000000 <init>:
0: 00018293 addi x5 x3 0
4: 00800313 addi x6 x0 8
8: 00200393 addi x7 x0 2
c: 00400e13 addi x28 x0 4
10: 0062a023 sw x6 0 x5
14: 00828293 addi x5 x5 8
18: fe52ae23 sw x5 -4 x5
1c: 0072a023 sw x7 0 x5
20: 00828293 addi x5 x5 8
24: fe52ae23 sw x5 -4 x5
28: 01c2a023 sw x28 0 x5
2c: 00300313 addi x6 x0 3
30: 00200393 addi x7 x0 2
34: 00600e13 addi x28 x0 6
38: 00900e93 addi x29 x0 9
3c: 00828293 addi x5 x5 8
40: fe02ae23 sw x0 -4 x5
44: 0062a023 sw x6 0 x5
48: 00828293 addi x5 x5 8
4c: fe52ae23 sw x5 -4 x5
50: 0072a023 sw x7 0 x5
54: 00828293 addi x5 x5 8
58: fe52ae23 sw x5 -4 x5
5c: 01c2a023 sw x28 0 x5
60: 00828293 addi x5 x5 8
64: fe52ae23 sw x5 -4 x5
68: 01d2a023 sw x29 0 x5
6c: 00828293 addi x5 x5 8
70: fe02ae23 sw x0 -4 x5
74: 118000ef jal x1 280 <main>
78: 10000517 auipc x10 0x10000
7c: f8850513 addi x10 x10 -120
80: 00400893 addi x17 x0 4
84: 00000073 ecall
88: 00018293 addi x5 x3 0
8c: 00100313 addi x6 x0 1
90: 00200393 addi x7 x0 2
94: 00300e13 addi x28 x0 3
98: 00400e93 addi x29 x0 4
9c: 0062a023 sw x6 0 x5
a0: 00828293 addi x5 x5 8
a4: fe52ae23 sw x5 -4 x5
a8: 0072a023 sw x7 0 x5
ac: 00828293 addi x5 x5 8
b0: fe52ae23 sw x5 -4 x5
b4: 01c2a023 sw x28 0 x5
b8: 00828293 addi x5 x5 8
bc: fe52ae23 sw x5 -4 x5
c0: 01d2a023 sw x29 0 x5
c4: 00300313 addi x6 x0 3
c8: 00200393 addi x7 x0 2
cc: 00100e13 addi x28 x0 1
d0: 00828293 addi x5 x5 8
d4: fe02ae23 sw x0 -4 x5
d8: 0062a023 sw x6 0 x5
dc: 00828293 addi x5 x5 8
e0: fe52ae23 sw x5 -4 x5
e4: 0072a023 sw x7 0 x5
e8: 00828293 addi x5 x5 8
ec: fe52ae23 sw x5 -4 x5
f0: 01c2a023 sw x28 0 x5
f4: 00828293 addi x5 x5 8
f8: fe02ae23 sw x0 -4 x5
fc: 090000ef jal x1 144 <main>
100: 10000517 auipc x10 0x10000
104: f0050513 addi x10 x10 -256
108: 00400893 addi x17 x0 4
10c: 00000073 ecall
110: 00018293 addi x5 x3 0
114: 00100313 addi x6 x0 1
118: 00200393 addi x7 x0 2
11c: 00300e13 addi x28 x0 3
120: 0062a023 sw x6 0 x5
124: 00828293 addi x5 x5 8
128: fe52ae23 sw x5 -4 x5
12c: 0072a023 sw x7 0 x5
130: 00828293 addi x5 x5 8
134: fe52ae23 sw x5 -4 x5
138: 01c2a023 sw x28 0 x5
13c: 00300313 addi x6 x0 3
140: 00200393 addi x7 x0 2
144: 00100e13 addi x28 x0 1
148: 00100e93 addi x29 x0 1
14c: 00828293 addi x5 x5 8
150: fe02ae23 sw x0 -4 x5
154: 0062a023 sw x6 0 x5
158: 00828293 addi x5 x5 8
15c: fe52ae23 sw x5 -4 x5
160: 0072a023 sw x7 0 x5
164: 00828293 addi x5 x5 8
168: fe52ae23 sw x5 -4 x5
16c: 01c2a023 sw x28 0 x5
170: 00828293 addi x5 x5 8
174: fe52ae23 sw x5 -4 x5
178: 01d2a023 sw x29 0 x5
17c: 00828293 addi x5 x5 8
180: fe02ae23 sw x0 -4 x5
184: 008000ef jal x1 8 <main>
188: 1680006f jal x0 360 <exit>
0000018c <main>:
18c: 00018993 addi x19 x3 0
190: 00008a93 addi x21 x1 0
194: 100000ef jal x1 256 <count_l1_len>
198: 000a8093 addi x1 x21 0
19c: 00391293 slli x5 x18 3
1a0: 00518a33 add x20 x3 x5
1a4: 00000493 addi x9 x0 0
1a8: 00098e13 addi x28 x19 0
1ac: 000a0e93 addi x29 x20 0
000001b0 <while>:
1b0: 000e2283 lw x5 0 x28
1b4: 000ea303 lw x6 0 x29
1b8: 006282b3 add x5 x5 x6
1bc: 009282b3 add x5 x5 x9
1c0: 00000493 addi x9 x0 0
1c4: 00a00393 addi x7 x0 10
1c8: 0072c663 blt x5 x7 12 <no_carry>
1cc: 00148493 addi x9 x9 1
1d0: ff628293 addi x5 x5 -10
000001d4 <no_carry>:
1d4: 005e2023 sw x5 0 x28
1d8: 004e2283 lw x5 4 x28
1dc: 004ea303 lw x6 4 x29
1e0: 02500463 beq x0 x5 40 <l1Null>
1e4: 02600e63 beq x0 x6 60 <l2Null>
1e8: 008e0e13 addi x28 x28 8
1ec: 008e8e93 addi x29 x29 8
1f0: fc1ff06f jal x0 -64 <while>
000001f4 <createNode>:
1f4: 008182b3 add x5 x3 x8
1f8: 00a2a023 sw x10 0 x5
1fc: 00818533 add x10 x3 x8
200: 00840413 addi x8 x8 8
204: 00008067 jalr x0 x1 0
00000208 <l1Null>:
208: 06600e63 beq x0 x6 124 <noMoreNode>
20c: 006e2223 sw x6 4 x28
210: 00030513 addi x10 x6 0
214: 02904063 blt x0 x9 32 <checkCarry>
218: 00098593 addi x11 x19 0
21c: 0a40006f jal x0 164 <printAll>
00000220 <l2Null>:
220: 06500263 beq x0 x5 100 <noMoreNode>
224: 00028513 addi x10 x5 0
228: 00904663 blt x0 x9 12 <checkCarry>
22c: 00098593 addi x11 x19 0
230: 0900006f jal x0 144 <printAll>
00000234 <checkCarry>:
234: 00052283 lw x5 0 x10
238: 00a00313 addi x6 x0 10
0000023c <checkCarryLoop>:
23c: 009282b3 add x5 x5 x9
240: 0062ce63 blt x5 x6 28 <exitCheckCarry>
244: ff628293 addi x5 x5 -10
248: 00552023 sw x5 0 x10
24c: 00452383 lw x7 4 x10
250: 00038663 beq x7 x0 12 <exitCheckCarry>
254: 00038513 addi x10 x7 0
258: fe5ff06f jal x0 -28 <checkCarryLoop>
0000025c <exitCheckCarry>:
25c: 00049863 bne x9 x0 16 <createNodeForCarry>
260: 00552023 sw x5 0 x10
264: 00018593 addi x11 x3 0
268: 0580006f jal x0 88 <printAll>
0000026c <createNodeForCarry>:
26c: 00850393 addi x7 x10 8
270: 00752223 sw x7 4 x10
274: 0003a223 sw x0 4 x7
278: 0093a023 sw x9 0 x7
27c: 00018593 addi x11 x3 0
280: 0400006f jal x0 64 <printAll>
00000284 <noMoreNode>:
284: 00018593 addi x11 x3 0
288: 00048513 addi x10 x9 0
28c: f60494e3 bne x9 x0 -152 <createNode>
290: 0300006f jal x0 48 <printAll>
00000294 <count_l1_len>:
294: 00000913 addi x18 x0 0
298: 00098313 addi x6 x19 0
29c: 00430313 addi x6 x6 4
2a0: 00032283 lw x5 0 x6
000002a4 <cllLoop>:
2a4: 00028a63 beq x5 x0 20 <exitCllLoop>
2a8: 00190913 addi x18 x18 1
2ac: 00830313 addi x6 x6 8
2b0: 00032283 lw x5 0 x6
2b4: ff1ff06f jal x0 -16 <cllLoop>
000002b8 <exitCllLoop>:
2b8: 00190913 addi x18 x18 1
2bc: 00008067 jalr x0 x1 0
000002c0 <printAll>:
2c0: 00100893 addi x17 x0 1
2c4: 0005a503 lw x10 0 x11
2c8: 00000073 ecall
2cc: 00400893 addi x17 x0 4
2d0: 10000517 auipc x10 0x10000
2d4: d3050513 addi x10 x10 -720
2d8: 00000073 ecall
2dc: 0045a303 lw x6 4 x11
2e0: 00030663 beq x6 x0 12 <exitPrintAll>
2e4: 0045a583 lw x11 4 x11
2e8: fd9ff06f jal x0 -40 <printAll>
000002ec <exitPrintAll>:
2ec: 00008067 jalr x0 x1 0
000002f0 <exit>:
2f0: 00a00893 addi x17 x0 10
2f4: 00000073 ecall
```
## Memory
In the beginning, we put data inthe register and store into memory start from gp(`0x10000000`).
The memory around gp will like below picture at finished state.

## Analysis
### 5-stage pipelined processor
There are several processors can be choose in ripes simulator. In the following, we always use the **5-stage processor** to analyze.
Its block diagram looks like below:

We can identify there are 5 stages in the above image. They are
1. Instruction Fetch (IF)
2. Instruction Decode (ID)
3. Execute (EX)
4. Memory (MEM)
5. Write Back (WB)
We will discuss each of the 5 stages.
#### Instruction Fetch (IF)
In the IF stage, CPU will fetch the instruction from **instr. cache**.
For example, see the following image.

In this moment, the line 14th. instruction is in the IF stage. We can know the address of the instruction is `0x18`, and the coressponding machine code is `0xfe52ae23`.
Let's go back to the first instruction `mv t0, gp`, we can notice that the 1st instruction at the right side is different to it. That because the instruction `mv` is a pseudo instruction for helping the developer to code easier.
#### Instruction Decode (ID)

CPU decode the instruction in this stage. The immediate decode to `0xfffffffc`, it is 2's complement of -4. We can see the R1 idx will get the address `0x100000008`, it can be show directly in the GPR field in ripes.
The alias of name `x5` register is **t0**, so we can find out the value in the register t0 from the image, which in the IF stage, is `gp + 8`.
#### Execute (EX)

It's looks very complex in this image. The sw instruction will calculate the correct address which the value of the **rd register** to store.
If we see the r1_out of the previous stage is `0x10000000`. It will be cahnged to `0x10000008` after going through 2 multiplexer as an input Op1 of ALU. It because the data forwarding, we will talk about this later.
There are 4 multiplexer in this stage. The 2 upper are decide the input Op1, and others decide Op2. They are controled by the hazard detection to choose which data should be passed to ALU.
The result could be:
* output of previous stage
* the WB stage output
* the ALU result
* immediate
#### Memory (MEM)
In this stage, CPU will read or write to memory.
We can see the memory is changed after sw finish this stage. This instruction will put the address of `gp + 8` to `gp + 4`
> before update the memory
> 
> after update the memory
> 
> Notice.
> The load-use hazard may occurred here. We'll talk more about this later.
> This can only be solved by stalls or reorder the instruction.
#### Write Back (WB)

The result get from previous will be written back to register.
### Hazard
#### Control hazard
The control hazard occurred if a jump or branch decide to go to another address, so the prior stages need to be discarded.
The following pictures show this situation occurred in line 49.
> the instruction `jal` decide jumping to`main`
> 
> the instruction `auipc` and `addi` be flushed.
> 
#### Load-use hazard
Load-use hazard happens in **load instruction** followed by any instruction that needs the result of the memory load.
This hazard happened at the line 161.

> Insert a nop between `add` and `lw`
> 
In this code, it will cause a nop. If i reorder the code to exchange the instruction `li` and `add`, it will not occurred load-use hazerd here.
The code will be like this:
```clike=159
lw t0, 0(t3) # laod l1 node
lw t1, 0(t4) # load l2 node
li s1, 0 # move these 2 line code before add, then we do not need nop here
li t2, 10
add t0, t0, t1 # l1->val += l2->val
add t0, t0, s1 # li->val += carry
```
#### Forwarding
If data hazard occurred, most of these can be resolved by forwarding, which is when the result of the EX or MEM stage is sent to the EX stage for a following instruction to use.
> line 13 forwarding the register `x5` to instruction `sw`
> 
Note. Forwarding can not solve load-use hazard.
### Execution info
| | result |
| -------- | -------- |
| cycles | 261 |
| instrs. retired | 181 |
| CPI | 1.44 |
| IPC | 0.693 |
## Reference
* [Add Two Numbers - LeetCode](https://leetcode.com/problems/add-two-numbers)
* [RISC-V: Branch 指令](https://ithelp.ithome.com.tw/articles/10273738)
* [Lecture 09: RISC-V Pipeline Implementation](https://passlab.github.io/CSE564/notes/lecture09_RISCV_Impl_pipeline.pdf)
* [RISC-V Pipelining and Hazards](https://robotics.shanghaitech.edu.cn/courses/ca/19s/notes/Discussion8_2.pdf)