# Assignment1: RISC-V Assembly and Instruction Pipeline
contributed by [spadee27357](https://github.com/spadee27357)
## Determine number size range by CLZ
## C code
```c=
#include <stdio.h>
#include <stdint.h>
/*
 * Count the number of leading zero bits in a 64-bit value.
 *
 * x: value to inspect.
 * Returns: number of zero bits above the most significant set bit,
 *          in the range 0..64 (64 when x == 0).
 */
uint16_t count_leading_zeros(uint64_t x) {
    if (x == 0) return 64;

    /* Smear the highest set bit into every lower position, so that
     * popcount(x) == index of the highest set bit + 1. */
    x |= (x >> 1);
    x |= (x >> 2);
    x |= (x >> 4);
    x |= (x >> 8);
    x |= (x >> 16);
    x |= (x >> 32);

    /* Parallel population count: 2-bit, 4-bit, then 8-bit partial sums. */
    x -= ((x >> 1) & 0x5555555555555555ULL);
    x = ((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL);
    x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0fULL;

    /* Fold the eight byte counts into the lowest byte. */
    x += (x >> 8);
    x += (x >> 16);
    x += (x >> 32);

    /* Only the low 7 bits hold the total (0..64); the upper bytes still
     * contain partial sums and must be masked off before subtracting. */
    return (uint16_t)(64 - (x & 0x7f));
}
/*
 * Print the power-of-two range that `num` falls into, derived from its
 * leading-zero count. A value of 0 is reported on its own line instead.
 */
void determine_size(uint64_t num) {
    const uint16_t zeros = count_leading_zeros(num);

    if (zeros != 64) {
        /* Number of bits needed to represent num. */
        const uint16_t bit_position = 64 - zeros;
        printf("The number is roughly between 2^%d and 2^%d.\n", bit_position - 1, bit_position);
    } else {
        /* 64 leading zeros means no bit is set. */
        printf("The number is 0.\n");
    }
}
/*
 * Read one unsigned integer from standard input and report its size range.
 * Returns 0 on success, 1 if no valid number could be read.
 */
int main() {
    /* Read into unsigned long long so the "%llu" conversion matches the
     * argument type exactly; uint64_t is not guaranteed to be that type. */
    unsigned long long input;

    printf("Enter a number: ");
    if (scanf("%llu", &input) != 1) {
        /* Without this check the value would be used uninitialised. */
        fprintf(stderr, "Invalid input: expected an unsigned integer.\n");
        return 1;
    }

    determine_size((uint64_t)input);
    return 0;
}
```

## Assembly code
`input1`, `input2`, and `input3` are the test inputs.
```=
# Static data: three 32-bit test inputs and the fragments of the output line.
.data
# Test values passed one at a time to determine_size by main.
input1: .word 16
input2: .word 33
input3: .word 254
# Output fragments; a full line is str1 <n> str2 <low exp> str3 <high exp> str4.
str1: .string "The number "
str2: .string " is roughly between 2^"
str3: .string " and 2^"
str4: .string "\n"
.text
# main: run determine_size once for each of the three test inputs, then exit.
main:
    # Test case 1
    la      t0, input1              # t0 = &input1
    lw      a0, 0(t0)               # a0 = input1 (32-bit word)
    jal     ra, determine_size

    # Test case 2
    la      t0, input2              # t0 = &input2
    lw      a0, 0(t0)               # a0 = input2 (32-bit word)
    jal     ra, determine_size

    # Test case 3
    la      t0, input3              # t0 = &input3
    lw      a0, 0(t0)               # a0 = input3 (32-bit word)
    jal     ra, determine_size

    # Terminate the program (environment call 10).
    li      a7, 10
    ecall
# determine_size
#   Prints "The number <n> is roughly between 2^<k-1> and 2^<k>\n", where
#   k = 64 - count_leading_zeros(n). When n has no set bit (k == 0) only
#   "The number <n>\n" is printed, because no power-of-two range applies.
#   In:       a0 = number to classify
#   Out:      none (text is written through ecall 1 = print integer and
#             ecall 4 = print string)
#   Clobbers: a0, a7, t0, plus whatever count_leading_zeros clobbers
#   Stack:    16-byte frame holding ra and s0 (keeps sp 16-byte aligned)
determine_size:
    addi    sp, sp, -16
    sw      ra, 12(sp)              # non-leaf: ra is overwritten by jal below
    sw      s0, 8(sp)               # s0 is callee-saved; preserve it for the caller
    mv      s0, a0                  # s0 = n, survives the ecalls and the call

    # "The number <n>"
    la      a0, str1
    li      a7, 4
    ecall
    mv      a0, s0
    li      a7, 1
    ecall

    # k = 64 - leading zeros = position of the highest set bit, plus one
    mv      a0, s0
    jal     ra, count_leading_zeros
    li      t0, 64
    sub     s0, t0, a0              # s0 = k (n itself is no longer needed)

    # n == 0 gives k == 0: skip the range text instead of printing a
    # half-finished sentence.
    beqz    s0, determine_size_end

    # " is roughly between 2^<k-1>"
    la      a0, str2
    li      a7, 4
    ecall
    addi    a0, s0, -1
    li      a7, 1
    ecall

    # " and 2^<k>"
    la      a0, str3
    li      a7, 4
    ecall
    mv      a0, s0
    li      a7, 1
    ecall

determine_size_end:
    # Every path ends the output line.
    la      a0, str4
    li      a7, 4
    ecall

    lw      s0, 8(sp)
    lw      ra, 12(sp)
    addi    sp, sp, 16
    ret
# count_leading_zeros
#   In:       a0 = value to inspect (handled as a 32-bit word: the smear
#             stops at a shift of 16 and all masks are 32 bits wide)
#   Out:      a0 = 64 - (number of bits from the highest set bit down to
#             bit 0), so an input of 0 yields 64
#   Clobbers: t0, t1
count_leading_zeros:
    # Smear the highest set bit into every lower bit position.
    srli    t0, a0, 1
    or      a0, a0, t0
    srli    t0, a0, 2
    or      a0, a0, t0
    srli    t0, a0, 4
    or      a0, a0, t0
    srli    t0, a0, 8
    or      a0, a0, t0
    srli    t0, a0, 16
    or      a0, a0, t0

    # Population count, step 1: 2-bit sums.  x -= (x >> 1) & 0x55555555
    srli    t0, a0, 1
    li      t1, 0x55555555
    and     t0, t0, t1
    sub     a0, a0, t0

    # Step 2: 4-bit sums.  x = (x & m) + ((x >> 2) & m), m = 0x33333333
    li      t1, 0x33333333
    and     t0, a0, t1              # t0 = x & m
    srli    a0, a0, 2
    and     a0, a0, t1              # a0 = (x >> 2) & m
    add     a0, a0, t0

    # Step 3: 8-bit sums.  x = (x + (x >> 4)) & 0x0f0f0f0f
    srli    t0, a0, 4
    add     a0, a0, t0
    li      t1, 0x0f0f0f0f
    and     a0, a0, t1

    # Step 4: fold the four byte counts into the lowest byte.
    srli    t0, a0, 8
    add     a0, a0, t0
    srli    t0, a0, 16
    add     a0, a0, t0

    # Result = 64 - (x & 0x3f); the mask drops the partial sums in the
    # upper bytes.
    andi    a0, a0, 0x3f
    sub     a0, zero, a0            # a0 = -count
    addi    a0, a0, 64              # a0 = 64 - count
    ret
```

## Analysis
The code was tested using the [Ripes](https://github.com/mortbopet/Ripes) simulator. The disassembled program is shown below.
```
00000000 <main>:
0: 10000517 auipc x10 0x10000
4: 00050513 addi x10 x10 0
8: 00052503 lw x10 0 x10
c: 02c000ef jal x1 44 <determine_size>
10: 10000517 auipc x10 0x10000
14: ff450513 addi x10 x10 -12
18: 00052503 lw x10 0 x10
1c: 01c000ef jal x1 28 <determine_size>
20: 10000517 auipc x10 0x10000
24: fe850513 addi x10 x10 -24
28: 00052503 lw x10 0 x10
2c: 00c000ef jal x1 12 <determine_size>
30: 00a00893 addi x17 x0 10
34: 00000073 ecall
00000038 <determine_size>:
38: ffc10113 addi x2 x2 -4
3c: 00112023 sw x1 0 x2
40: 00050593 addi x11 x10 0
44: 10000517 auipc x10 0x10000
48: fc850513 addi x10 x10 -56
4c: 00400893 addi x17 x0 4
50: 00000073 ecall
54: 00058513 addi x10 x11 0
58: 00100893 addi x17 x0 1
5c: 00000073 ecall
60: 07c000ef jal x1 124 <count_leading_zeros>
64: 00012083 lw x1 0 x2
68: 00410113 addi x2 x2 4
6c: 00050e13 addi x28 x10 0
70: 10000517 auipc x10 0x10000
74: fa850513 addi x10 x10 -88
78: 00400893 addi x17 x0 4
7c: 00000073 ecall
80: 04000313 addi x6 x0 64
84: 41c30333 sub x6 x6 x28
88: 00030393 addi x7 x6 0
8c: 04030063 beq x6 x0 64 <print_zero>
90: fff30513 addi x10 x6 -1
94: 00100893 addi x17 x0 1
98: 00000073 ecall
9c: 10000517 auipc x10 0x10000
a0: f9350513 addi x10 x10 -109
a4: 00400893 addi x17 x0 4
a8: 00000073 ecall
ac: 00038513 addi x10 x7 0
b0: 00100893 addi x17 x0 1
b4: 00000073 ecall
b8: 10000517 auipc x10 0x10000
bc: f7f50513 addi x10 x10 -129
c0: 00400893 addi x17 x0 4
c4: 00000073 ecall
c8: 00008067 jalr x0 x1 0
000000cc <print_zero>:
cc: 00000513 addi x10 x0 0
d0: 00100893 addi x17 x0 1
d4: 00000073 ecall
d8: 00008067 jalr x0 x1 0
000000dc <count_leading_zeros>:
dc: 00155313 srli x6 x10 1
e0: 00656533 or x10 x10 x6
e4: 00255313 srli x6 x10 2
e8: 00656533 or x10 x10 x6
ec: 00455313 srli x6 x10 4
f0: 00656533 or x10 x10 x6
f4: 00855313 srli x6 x10 8
f8: 00656533 or x10 x10 x6
fc: 01055313 srli x6 x10 16
100: 00656533 or x10 x10 x6
104: 555553b7 lui x7 0x55555
108: 55538393 addi x7 x7 1365
10c: 00155313 srli x6 x10 1
110: 00737333 and x6 x6 x7
114: 40650533 sub x10 x10 x6
118: 333333b7 lui x7 0x33333
11c: 33338393 addi x7 x7 819
120: 00255313 srli x6 x10 2
124: 00737333 and x6 x6 x7
128: 00757533 and x10 x10 x7
12c: 00650533 add x10 x10 x6
130: 00455313 srli x6 x10 4
134: 00650533 add x10 x10 x6
138: 0f0f13b7 lui x7 0xf0f1
13c: f0f38393 addi x7 x7 -241
140: 00757533 and x10 x10 x7
144: 00855313 srli x6 x10 8
148: 00650533 add x10 x10 x6
14c: 01055313 srli x6 x10 16
150: 00650533 add x10 x10 x6
154: 03f57513 andi x10 x10 63
158: 04000313 addi x6 x0 64
15c: 40a30533 sub x10 x6 x10
160: 00008067 jalr x0 x1 0
```
## pipeline
### IF stage

#### Read the current value of the Program Counter (PC):
The Program Counter (PC) is a specialized register in the CPU that holds the memory address of the instruction to be fetched and executed next.
#### Access the Instruction Memory:
Using the address from the PC, the processor retrieves the instruction stored at that location in the instruction memory. The instruction memory is a section of memory dedicated to storing program instructions. For our example, the instruction at the given address is addi x10, x10, 0.
#### Fetch the Instruction:
The processor fetches (or reads) the addi x10, x10, 0 instruction from the instruction memory into the processor's pipeline for further processing.
#### Update the Program Counter (PC):
After fetching the instruction, the PC is updated to point to the next instruction in the sequence.
### ID stage

R1 is x10(0x0a), R2 is 0(0x00)
Reg1 is 0x00000000
Reg2 is 0x00000000
#### Instruction Decoding:
During this stage, the instruction addi x10, x10, 0 fetched from the IF is decoded. This means the CPU interprets the operation of this instruction and identifies which registers it needs to work with and what immediate value it needs (in this case, the immediate value is 0).
#### Register File Reading:
As the addi instruction needs to read a value from x10, the CPU accesses its register file during this phase to obtain the current value of x10. This value will subsequently be used in the addition operation.
### EX stage

In the EX stage, we can observe that the ALU adds its two inputs together, so the ALU output is 0x10000000.
### MEM stage

In the MEM stage, the `addi` instruction performs no read or write operations on the data memory.
### WB stage

In the WB stage, the new value 0x10000000 is written into the register x10.