# Assignment1: RISC-V Assembly and Instruction Pipeline
contributed by < [`Hsiang0528`](https://github.com/Hsiang0528/Computer-Architecture/tree/main/hw1) >
## Calculate the log2 Integer part of a log2 int64 number by CLZ
> Description: Given an number, return int(log2(num)).
>
> Constraints: The number is an int64 number
## Solution
For any int64 number with leading_zeros CLZ,its Integer part of log2 is 63 - CLZ
### C program
#### First version
```c
#include <stdint.h>
uint16_t count_leading_zeros(uint64_t x)
{
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
x |= (x >> 32);
x -= ((x >> 1) & 0x5555555555555555 );
x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333);
x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f;
x += (x >> 8);
x += (x >> 16);
return (64 - (x & 0x7f));
}
int int_log2(uint16_t clz)
{
return 63 - clz;
}
```
#### Second version -- with loop
```c
uint16_t int_log2_num(uint64_t x)
{
for(int i = 1;i < 32;i <<= 1){
x |= (x >> i);
}
x |= (x >> 32);
x -= ((x >> 1) & 0x5555555555555555);
x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333);
x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f;
x += (x >> 8);
x += (x >> 16);
x += (x >> 32);
return ((x & 0x7f) - 1);
}
```
### Assembly
```c
.data
num0: .word 0x11111111 0x00000011 # big endian
num1: .word 0x00000000 0x00000010 # big endian num =16
num2: .word 0x00000000 0x00000021 # big endian num =33
str: .string "\nint(log2(num"
str0: .string ")):"
.text
main:
# times
li a2,0
li a3,1
jal ra,func
func:
blt, a2,a3,load_num0
beq a2,a3,load_num1
bgt a2,a3,load_num2
load_num0:
la a0, num0
j load_num
load_num1:
la a0, num1
j load_num
load_num2:
la a0, num2
jal ra,load_num
j exit
exit:
li a7, 10 # exit
ecall
load_num:
# load num
lw t0,0(a0)
lw t1,4(a0)
li t5,1
li t6,31
# x = x | x >> 1,2,4,8,16
loop:
sll t2,t0,t6
srl t3,t0,t5
srl t4,t1,t5
or t4,t4,t2
or t0,t0,t3
or t1,t1,t4
sub t6,t6,t5
slli t5,t5,1
bge t6,t5,loop
# x = x | (x >> 32);
or t1,t1,t0
# x -= ((x >> 1) & 0x5555555555555555);
# y = (x >> 1) & 0x5555555555555555;
slli t2,t0,31
srli t3,t0,1
srli t4,t1,1
or t4,t4,t2
li t5,0x55555555
and t3,t3,t5
and t4,t4,t5
# x -= y
sub t0,t0,t3
sub t1,t1,t4
# x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333);
# y = (x >> 2) & 0x3333333333333333;
slli t2,t0,30
srli t3,t0,2
srli t4,t1,2
or t4,t4,t2
li t5,0x33333333
and t3,t3,t5
and t4,t4,t5
# a = x & 0x3333333333333333;
and a0,t0,t5
and a1,t1,t5
add t0,t3,a0
add t1,t4,a1
# x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f;
# t = x >> 4
slli t2,t0,28
srli t3,t0,4
srli t4,t1,4
or t4,t4,t2
# x = x + t
add t0,t0,t3
add t1,t1,t4
# x = x & 0x0f0f0f0f0f0f0f0f;
li t5,0x0f0f0f0f
and t0,t0,t5
and t1,t1,t5
# x = x + (x >> 8);
# t = x >> 8
slli t2,t0,24
srli t3,t0,8
srli t4,t1,8
or t4,t4,t2
# x = x + t
add t0,t0,t3
add t1,t1,t4
# x = x + (x >> 16);
# t = x >> 16
slli t2,t0,16
srli t3,t0,16
srli t4,t1,16
or t4,t4,t2
# x = x + t
add t0,t0,t3
add t1,t1,t4
# x += (x >> 32);
add t1,t1,t0
# get int(log2(num))
andi t1,t1,0x7f
addi t0,t1,-1
print:
la a0, str
li a7, 4
ecall
mv a0,a2
li a7,1
ecall
la a0, str0
li a7, 4
ecall
mv a0,t0
li a7,1
ecall
times:
addi a2,a2,1
ret
```
There are 3 test data in this program:
* num0: .0x1111111100000011 int part : 60
* num1: .0x0000000000000010 int part : 4
* num2: .0x0000000000000021 int part : 5
#### experiment results

#### Potential data hazard

This diagram is the state before stalling. `jal x1 4 <func>` is in the EXE stage, while `blt x12 x13 12` is in the ID stage.

Next state, `jal` is in the MEM stage, while `blt` is in the IF satge.



Next state, `blt` is in the EXE satge,`beq`,`blt` is repalced by `nop` and `auipc x10 0x10000` is in the IF stage(because of `jal`).
## Assembly optimization
### Reduce instruction
before
```c
andi t2,t0,1
slli t2,t2,31
```
after
```c
slli t2,t0,31
```
-------------------------
before
```c
# return 64 - (x & 0x7f)
andi t1,t1,0x7f
li t0,64
# t0 = leading_zeros
sub t0,t0,t1
int_log_num_:
# int(log(num)) = 63 - leading zeros
li t1,63
sub t0,t1,t0
```
after
```c
# get int(log2(num))
andi t1,t1,0x7f
addi t0,t1,-1
```