# Assignment1: RISC-V Assembly and Instruction Pipeline
contributed by < [`CSIE523`](https://github.com/CSIE523/2023_Computer_Architecture_FALL/tree/main/hw1) >
## Find first set using CLZ
> Description: Given a 64-bit integer, find the index or position of the bit first set to one counting from the least significant bit position.
>
>Example:
>Input: 325478 (0x000000000004F766)
>Output: 2
## Solution
### Idea for problem solving
The main idea for solving this problem is the algorithm from Wikipedia.
```
ffs(x) = w - clz(x & -x).
# w is total bits in x.
# -x is the 2's complement of x.
```
### C program
```c
#include<stdio.h>
#include<stdint.h>
uint64_t twoscomple(uint64_t x){
x ^= 0xffffffffffffffff;
return x + 1;
}
uint16_t count_leading_zeros(uint64_t x){
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
x |= (x >> 32);
/* count ones (population count) */
x -= ((x >> 1) & 0x5555555555555555 );
x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333);
x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f;
x += (x >> 8);
x += (x >> 16);
x += (x >> 32);
return (64 - (x & 0x7f));
}
int main(){
uint64_t input;
scanf("%llu", &input);
printf("Find first set is %d.\n", 64 - count_leading_zeros(input & twoscomple(input)));
}
```
### Assembly code
```c
.data
test_data1: .word 0x00101110 0x01001000
test_data2: .word 0x11000008 0x00000000
test_data3: .word 0x00000000 0x00000000
const_1: .word 0x55555555
const_2: .word 0x33333333
const_3: .word 0x0f0f0f0f
str: .string "\nThe first set of test_data is "
# s1 : unsigned int 64 high 32 bits
# s2 : unsigned int 64 low 32 bits
# s3 : 64 or 32
# s4 : test_data select
# t1 : unsigned int 64 high 32 bits (tmp)
# t2 : unsigned int 64 low 32 bits (tmp)
# t3 : init=1, *2 until x = 32
.text
data_select:
jal ra, load_data1
jal ra, load_data2
jal ra, load_data3
j exit
load_data1:
la t1, test_data1
j main
load_data2:
la t1, test_data2
j main
load_data3:
la t1, test_data3
main:
lw s1, 0(t1)
lw s2, 4(t1)
lw t3, 0(t1)
lw t4, 4(t1)
addi sp, sp, -4
sw ra, 0(sp)
jal ra, twos_comple
and s1, s1, t3
and s2, s2, t4
addi t1, s1, 0 # replace tmp
addi t2, s2, 0 # replace tmp
addi s3, x0, 0x20
addi t4, x0, 0x20
addi t3, x0, 1
shift_with_or_from1to16:
sub s3, s3, t3
sll t0, t1, s3
srl t1, t1, t3
srl t2, t2, t3
or t2, t2, t0
or s1, s1, t1
or s2, s2, t2
addi t1, s1, 0 # replace tmp
addi t2, s2, 0 # replace tmp
slli t3, t3, 1
addi s3, x0, 0x20
bne t3, t4, shift_with_or_from1to16
# x |= (x >> 32);
or s2, s1, t2
addi t1, s1, 0 # replace tmp
addi t2, s2, 0 # replace tmp
count_ones:
# x -= ((x >> 1) & 0x5555555555555555);
slli t0, t1, 31
srli t1, t1, 1
srli t2, t2, 1
or t2, t2, t0
lw t3, const_1
and t1, t1, t3
and t2, t2, t3
sub s2, s2, t2
sltu t0, s2, t2
sub s1, s1, t1
sub s1, s1, t0
addi t1, s1, 0 # replace tmp
addi t2, s2, 0 # replace tmp
# x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333);
addi t4, t1, 0
addi t5, t2, 0
slli t0, t1, 30
srli t1, t1, 2
srli t2, t2, 2
or t2, t2, t0
lw t3, const_2
and t1, t1, t3
and t2, t2, t3
and t4, t4, t3
and t5, t5, t3
add t2, t2, t5
sltu t0, t2, t5
add t1, t1, t4
add t1, t1, t0
addi s1, t1, 0
addi s2, t2, 0
# x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f;
slli t0, t1, 28
srli t1, t1, 4
srli t2, t2, 4
or t2, t2, t0
add t2, t2, s2
sltu t0, t2, s2
add t1, t1, s1
add t1, t1, t0
lw t3, const_3
and t1, t1, t3
and t2, t2, t3
addi s1, t1, 0
addi s2, t2, 0
# x += (x >> 8);
slli t0, t1, 24
srli t1, t1, 8
srli t2, t2, 8
or t2, t2, t0
add t2, t2, s2
sltu t0, t2, s2
add t1, t1, s1
add t1, t1, t0
addi s1, t1, 0
addi s2, t2, 0
# x += (x >> 16);
slli t0, t1, 16
srli t1, t1, 16
srli t2, t2, 16
or t2, t2, t0
add t2, t2, s2
sltu t0, t2, s2
add t1, t1, s1
add t1, t1, t0
addi s1, t1, 0
addi s2, t2, 0
# x += (x >> 32);
add t2, t2, s1
sltu t0, t2, s2
add t1, t1, t0
addi s1, t1, 0
addi s2, t2, 0
# (64 - (x & 0x7f));
addi s3, x0, 0x40
andi s2, s2, 0x7f
sub s3, s3, s2
# 64 - return value
addi t3, x0, 0x40
sub s3, t3, s3
jal ra, printResult
lw ra, 0(sp)
addi sp, sp, 4
jr ra
twos_comple:
li t5, -1
xor t3, t3, t5 # high
xor t4, t4, t5 # low
addi t5, x0, 1
add t4, t4, t5
sltu t0, t4, t5
add t3, t3, t0
jr ra
printResult:
# print string
la a0, str
addi a7, zero, 4
ecall
# print ffs
li a7, 1
mv a0, s3
ecall
jr ra
exit:
li a7, 10
ecall
```
There are 3 test datas in this program:
* `test_data1` : `0x0010111001001000`
**First set = 13**
* `test_data2` : `0x1100000800000000`
**First set = 36**
* `test_data3` : `0x0000000000000000`
**First set = 0**
## Experiment Result

## Analysis
Using 5-stage pipelined processor in [Ripes](https://github.com/mortbopet/Ripes).
```
00000000 <data_select>:
0: 010000ef jal x1 16 <load_data1>
4: 018000ef jal x1 24 <load_data2>
8: 020000ef jal x1 32 <load_data3>
c: 2180006f jal x0 536 <exit>
00000010 <load_data1>:
10: 10000317 auipc x6 0x10000
14: ff030313 addi x6 x6 -16
18: 0180006f jal x0 24 <main>
0000001c <load_data2>:
1c: 10000317 auipc x6 0x10000
20: fec30313 addi x6 x6 -20
24: 00c0006f jal x0 12 <main>
00000028 <load_data3>:
28: 10000317 auipc x6 0x10000
2c: fe830313 addi x6 x6 -24
00000030 <main>:
30: 00032483 lw x9 0 x6
34: 00432903 lw x18 4 x6
38: 00032e03 lw x28 0 x6
3c: 00432e83 lw x29 4 x6
40: ffc10113 addi x2 x2 -4
44: 00112023 sw x1 0 x2
48: 19c000ef jal x1 412 <twos_comple>
4c: 01c4f4b3 and x9 x9 x28
50: 01d97933 and x18 x18 x29
54: 00048313 addi x6 x9 0
58: 00090393 addi x7 x18 0
5c: 02000993 addi x19 x0 32
60: 02000e93 addi x29 x0 32
64: 00100e13 addi x28 x0 1
00000068 <shift_with_or_from1to16>:
68: 41c989b3 sub x19 x19 x28
6c: 013312b3 sll x5 x6 x19
70: 01c35333 srl x6 x6 x28
74: 01c3d3b3 srl x7 x7 x28
78: 0053e3b3 or x7 x7 x5
7c: 0064e4b3 or x9 x9 x6
80: 00796933 or x18 x18 x7
84: 00048313 addi x6 x9 0
88: 00090393 addi x7 x18 0
8c: 001e1e13 slli x28 x28 1
90: 02000993 addi x19 x0 32
94: fdde1ae3 bne x28 x29 -44 <shift_with_or_from1to16>
98: 0074e933 or x18 x9 x7
9c: 00048313 addi x6 x9 0
a0: 00090393 addi x7 x18 0
000000a4 <count_ones>:
a4: 01f31293 slli x5 x6 31
a8: 00135313 srli x6 x6 1
ac: 0013d393 srli x7 x7 1
b0: 0053e3b3 or x7 x7 x5
b4: 10000e17 auipc x28 0x10000
b8: f64e2e03 lw x28 -156 x28
bc: 01c37333 and x6 x6 x28
c0: 01c3f3b3 and x7 x7 x28
c4: 40790933 sub x18 x18 x7
c8: 007932b3 sltu x5 x18 x7
cc: 406484b3 sub x9 x9 x6
d0: 405484b3 sub x9 x9 x5
d4: 00048313 addi x6 x9 0
d8: 00090393 addi x7 x18 0
dc: 00030e93 addi x29 x6 0
e0: 00038f13 addi x30 x7 0
e4: 01e31293 slli x5 x6 30
e8: 00235313 srli x6 x6 2
ec: 0023d393 srli x7 x7 2
f0: 0053e3b3 or x7 x7 x5
f4: 10000e17 auipc x28 0x10000
f8: f28e2e03 lw x28 -216 x28
fc: 01c37333 and x6 x6 x28
100: 01c3f3b3 and x7 x7 x28
104: 01cefeb3 and x29 x29 x28
108: 01cf7f33 and x30 x30 x28
10c: 01e383b3 add x7 x7 x30
110: 01e3b2b3 sltu x5 x7 x30
114: 01d30333 add x6 x6 x29
118: 00530333 add x6 x6 x5
11c: 00030493 addi x9 x6 0
120: 00038913 addi x18 x7 0
124: 01c31293 slli x5 x6 28
128: 00435313 srli x6 x6 4
12c: 0043d393 srli x7 x7 4
130: 0053e3b3 or x7 x7 x5
134: 012383b3 add x7 x7 x18
138: 0123b2b3 sltu x5 x7 x18
13c: 00930333 add x6 x6 x9
140: 00530333 add x6 x6 x5
144: 10000e17 auipc x28 0x10000
148: edce2e03 lw x28 -292 x28
14c: 01c37333 and x6 x6 x28
150: 01c3f3b3 and x7 x7 x28
154: 00030493 addi x9 x6 0
158: 00038913 addi x18 x7 0
15c: 01831293 slli x5 x6 24
160: 00835313 srli x6 x6 8
164: 0083d393 srli x7 x7 8
168: 0053e3b3 or x7 x7 x5
16c: 012383b3 add x7 x7 x18
170: 0123b2b3 sltu x5 x7 x18
174: 00930333 add x6 x6 x9
178: 00530333 add x6 x6 x5
17c: 00030493 addi x9 x6 0
180: 00038913 addi x18 x7 0
184: 01031293 slli x5 x6 16
188: 01035313 srli x6 x6 16
18c: 0103d393 srli x7 x7 16
190: 0053e3b3 or x7 x7 x5
194: 012383b3 add x7 x7 x18
198: 0123b2b3 sltu x5 x7 x18
19c: 00930333 add x6 x6 x9
1a0: 00530333 add x6 x6 x5
1a4: 00030493 addi x9 x6 0
1a8: 00038913 addi x18 x7 0
1ac: 009383b3 add x7 x7 x9
1b0: 0123b2b3 sltu x5 x7 x18
1b4: 00530333 add x6 x6 x5
1b8: 00030493 addi x9 x6 0
1bc: 00038913 addi x18 x7 0
1c0: 04000993 addi x19 x0 64
1c4: 07f97913 andi x18 x18 127
1c8: 412989b3 sub x19 x19 x18
1cc: 04000e13 addi x28 x0 64
1d0: 413e09b3 sub x19 x28 x19
1d4: 030000ef jal x1 48 <printResult>
1d8: 00012083 lw x1 0 x2
1dc: 00410113 addi x2 x2 4
1e0: 00008067 jalr x0 x1 0
000001e4 <twos_comple>:
1e4: fff00f13 addi x30 x0 -1
1e8: 01ee4e33 xor x28 x28 x30
1ec: 01eeceb3 xor x29 x29 x30
1f0: 00100f13 addi x30 x0 1
1f4: 01ee8eb3 add x29 x29 x30
1f8: 01eeb2b3 sltu x5 x29 x30
1fc: 005e0e33 add x28 x28 x5
200: 00008067 jalr x0 x1 0
00000204 <printResult>:
204: 10000517 auipc x10 0x10000
208: e2050513 addi x10 x10 -480
20c: 00400893 addi x17 x0 4
210: 00000073 ecall
214: 00100893 addi x17 x0 1
218: 00098513 addi x10 x19 0
21c: 00000073 ecall
220: 00008067 jalr x0 x1 0
00000224 <exit>:
224: 00a00893 addi x17 x0 10
228: 00000073 ecall
```
Let's take `6c: 013312b3 sll x5 x6 x19` for example in this 5-stage pipelined section.
### IF

* Program Counter gives next instruction address to instruction memory.
* Program Counter updates its value in order to fetch new instruction.
* Here the next instruction is `0x013312b3` in instruction memory address `0x6c`.
### ID

* In ID stage, it decodes `0x013312b3` fetched from previous stage.
* Translate `0x013312b3` into binary `0000000 10011 00110 001 00101 0110011`.
funct7 = `0000000`.
rs2 = `10011`.
rs1 = `00110`
funct3 = `001`
rd = `00101`
opcode = `0110011`
* In [RISC-V instruction manual](https://riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf), shift left logic is in the format below.

* It's decoded as `sll x5, x6, x19` in instruction format.
### EX

* In EX stage, it shifts the value in `x6` register left by the value in `x19` register with no branch.
### MEM

* In MEM stage, it doesn't perform any write or read operations.
### WB

* In WB stage, write shift left result back to `x5` register.
## Assembly code Optimization
### First version
The result of my first version of assembly code is presented in the experiment result above without any optimization.
### Second version (Loop unrolling)
In the first version of assembly code, I simplified the following C code using a loop and then translated it into assembly code.
```c
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
for loop:
for(i=1;i<=16;i*=2)
x |= (x >> i);
```
```c
shift_with_or_from1to16:
sub s3, s3, t3
sll t0, t1, s3
srl t1, t1, t3
srl t2, t2, t3
or t2, t2, t0
or s1, s1, t1
or s2, s2, t2
addi t1, s1, 0 # replace tmp
addi t2, s2, 0 # replace tmp
slli t3, t3, 1
addi s3, x0, 0x20
bne t3, t4, shift_with_or_from1to16
# x |= (x >> 32);
or s2, s1, t2
addi t1, s1, 0 # replace tmp
addi t2, s2, 0 # replace tmp
```
With loop unrolling
```c
shift_with_or_loop_unrolling:
# x |= (x >> 1);
slli t0, t1, 31
srli t1, t1, 1
srli t2, t2, 1
or t2, t2, t0
or s1, s1, t1
or s2, s2, t2
addi t1, s1, 0 # replace tmp
addi t2, s2, 0 # replace tmp
# x |= (x >> 2);
slli t0, t1, 30
srli t1, t1, 2
srli t2, t2, 2
or t2, t2, t0
or s1, s1, t1
or s2, s2, t2
addi t1, s1, 0 # replace tmp
addi t2, s2, 0 # replace tmp
# x |= (x >> 4);
slli t0, t1, 28
srli t1, t1, 4
srli t2, t2, 4
or t2, t2, t0
or s1, s1, t1
or s2, s2, t2
addi t1, s1, 0 # replace tmp
addi t2, s2, 0 # replace tmp
# x |= (x >> 8);
slli t0, t1, 24
srli t1, t1, 8
srli t2, t2, 8
or t2, t2, t0
or s1, s1, t1
or s2, s2, t2
addi t1, s1, 0 # replace tmp
addi t2, s2, 0 # replace tmp
# x |= (x >> 16);
slli t0, t1, 16
srli t1, t1, 16
srli t2, t2, 16
or t2, t2, t0
or s1, s1, t1
or s2, s2, t2
addi t1, s1, 0 # replace tmp
addi t2, s2, 0 # replace tmp
# x |= (x >> 32);
or s2, s1, t2
addi t1, s1, 0 # replace tmp
addi t2, s2, 0 # replace tmp
```
### Third version (Redundant instructions eliminated)
After loop unrolling, we can observe that there are many instructions repeated.
```c
# x |= (x >> 1);
slli t0, t1, 31
srli t1, t1, 1
srli t2, t2, 1
or t2, t2, t0
or s1, s1, t1
or s2, s2, t2
addi t1, s1, 0 # replace tmp
addi t2, s2, 0 # replace tmp
```
I used the `t1` and `t2` registers to temporarily store the right-shifted value of x, and later combined it with the original value of x using an OR operation, storing the result in `s1` and `s2`. Since `t1` and `t2` are registers specifically used for logical operations, before further computations, the values in `t1` and `t2` will need to be replaced by the values in `s1` and `s2`.
Therefore, I eliminate some redundant assembly code. For example, I simplified assembly code above. I directly use the values in `s1` and `s2` for right shift logical operations, instead of using `t1` and `t2`. In this way, it doesn't need to replace the values of `t1` and `t2` anymore.
```c
# x |= (x >> 1);
slli t0, s1, 31
srli t1, s1, 1
srli t2, s2, 1
or t2, t2, t0
or s1, s1, t1
or s2, s2, t2
```
### Result Comparion
Lastly, the comparison of the three different execution infos is as follows.
* First verison

* Second version

* Third version

## Reference
[Find first set - Wikipedia](https://en.wikipedia.org/wiki/Find_first_set)