# Assignment2: RISC-V Toolchain
contributed by [david965154](https://github.com/david965154/Computer_Architecture_hw2)
Rewrite:[Implement priority encoder using CLZ](https://hackmd.io/@NIYINGCHIH/BkvgNE3e6) by 倪英智
## Original Code
- [ ] Assembly code
```c
.data:
nextline: .string "\n"
input_size: .word 4
data_1: .word 0b00000000000000000000000000000001
data_2: .word 0b00000000000000000000000000000011
data_3: .word 0b00000000000000000000000000000100
data_4: .word 0b00000000000000000000000000001001
main:
addi sp, sp, -20
# push four pointers of test data onto the stack
lw t0, data_1
sw t0, 0(sp)
lw t0, data_2
sw t0, 4(sp)
lw t0, data_3
sw t0, 8(sp)
lw t0, data_4
sw t0, 12(sp)
addi s0, x0, 4 # s0 is the iteration times(4 test case)
addi s1, x0, 0 # s1 is counter
addi s2, sp, 0 # s2 initial at data_1
# Load the constants A01, A02, and mask 0x0f0f0f0f
li a5, 0x55555555
li a6, 0x33333333
li t1, 0x0f0f0f0f
loop:
lw a0, 0(s2) #load data into a0
addi s2, s2, 4 # s2 move to next data
addi s1, s1, 1 # counter++
clz:
# OR the input value with itself shifted to the right by 1, 2, 4, 8, 16, and 16 bits (total 32).
srli a1, a0, 1
or a0, a0, a1
srli a1, a0, 2
or a0, a0, a1
srli a1, a0, 4
or a0, a0, a1
srli a1, a0, 8
or a0, a0, a1
srli a1, a0, 16
or a0, a0, a1
srli a1, a0, 16
srli a2, a1, 16
or a0, a0, a2
# Count ones (population count).
srli a1, a0, 1
and a4, a1, a5
sub a0, a0, a4
and a1, a0, a6
srli a2, a0, 2
and a3, a6, a2
add a0, a3, a1
srli a1, a0, 4
add a0, a1, a0
and a0, a0, t1
srli a1, a0, 8
add a0,a1,a0
srli a1, a0, 16
add a0, a1, a0
srli a1, a0, 16
srli a2, a1, 16
add a0, a0, a2
# Return the number of zeros and count the priority
andi a0, a0, 0x7f # andi a0, a0, 0x7f
addi a1, x0, 1
sub a0, a0, a1
print:
li a7, 35
ecall
lw a0, nextline
li, a7, 11
ecall
bne s1, s0, loop
beq s1, s0, end
end:
li a7, 10 # Exit system call
ecall
```
- [ ] C code
```c
#include <stdio.h>
#include <stdint.h>
uint16_t count_leading_zeros(uint64_t x, int input_size)
{
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
x |= (x >> 32);
/* count ones (population count) */
x -= ((x >> 1) & 0x5555555555555555 );
x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333 );
x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f;
x += (x >> 8);
x += (x >> 16);
x += (x >> 32);
return (input_size - (x & 0x7f));
}
int priority_encoder_4bit(int leading_zero, int input_size) {
if(leading_zero<0)
{
printf("wrong input");
return 0;
}
else
{
return (input_size -1 -(la));
}
}
int main() {
int input;
int input_size;
int leading_zero = 0;
int priority = 0;
printf("Enter a number : ");
scanf("%4d", &input);
printf("Enter a input size : ");
scanf("%4d", &input_size);
leading_zero = count_leading_zeros(input,input_size);
printf("%d\n",leading_zero);
priority = priority_encoder_4bit(leading_zero, input_size);
printf("%d\n",priority);
return 0;
}
```
## Manual optimization
**Version 1**
Adjustment:
1. **Algorithm** :Keep shift right logical until the value remain 1 and find the shifted bits.
`time complexity : n/2 => worse`
2. **Print** : rv32emu cannot print as like rv32i, all the output should translate to ascii code, here is my solution:
Suppose here have a 4 bits number, it will divide to 10^3 and add 48 to print, next it's turn to divide to 10^2 and do same thing in the following. Until all the bit was printed
3. **Perfcounter** : To compare the performance between the algorithm optimization
4. **Remove the trivial code** : Through the observation, there have some codes it's no need, so I remove them from original code.
**Assembly code:**
```
.org 0
# Provide program starting address to linker
.data
data_0: .word 0b00000000000000000000000000000001
data_1: .word 0b00000000000000000000000010000001
data_2: .word 0b00000000000000001000000000000000
data_3: .word 0b00000000100000000001010000000100
data_4: .word 0b11111111111111111111111111111111
nextline: .ascii "\n"
.set str_size, .-nextline
buffer: .byte 0
.text
.global main
main:
jal ra, get_cycles
mv t1, a0
addi sp, sp, -20
# push four pointers of test data onto the stack
lw t0, data_0
sw t0, 0(sp)
lw t0, data_1
sw t0, 4(sp)
lw t0, data_2
sw t0, 8(sp)
lw t0, data_3
sw t0, 12(sp)
lw t0, data_4
sw t0, 16(sp)
addi s0, x0, 4 # s0 is the iteration times(4 test case)
addi s1, x0, 0 # s1 is counter
addi s2, sp, 0 # s2 initial at (0)sp
li a5, 0
loop:
lw a1, 0(s2) #load data into a0
addi a2, x0, 2
addi a0, x0, 0
addi s2, s2, 4 # s2 move to next data
addi s1, s1, 1 # counter++
blt a1, a5, ulimitcase
blt a1, a2, print
clz:
srli a1, a1, 1
addi a0, a0, 1
bge a1, a2, clz
jal det
ulimitcase:
li a0, 31
det:
addi a3, x0, 10
addi a4, x0 , 0
blt a0, a3, print
jal ra, stp
# a0: number(0<=x<10)
print:
addi a0, a0, 48 # convert to ascii
la t0, buffer
sb a0, 0(t0)
li a0, 1
la a1, buffer
li a2, 1
li a7, 64
ecall
la a2, str_size
lw a2, 0(a2)
addi a0, x0, 1
la a1, nextline
li a7, 64
ecall
addi a3, x0, 0
bne s1, s0, loop
beq s1, s0, end
stp:
addi a0, a0, -10
addi a4, a4, 1
bge a0, a3, stp
printn:
# input & output
# a0: divided
# a4: quotient
addi a4, a4, 48
addi sp, sp, -4
la t0, buffer #
sb a4, 0(t0) # save quotientto buffer
sw a0, 0(sp) # divided save in stack
addi a0, x0, 1
la a1, buffer
addi a2, x0, 1
li a7, 64
ecall
lw a0, 0(sp) # lw divided in stack
addi sp, sp, 4
addi a4, x0, 0 # reset quotient
ret
thstp:
addi a0, a0, -1000
addi a4, a4, 1
bge a0, a3, thstp
jal ra, printn # a0 < 1000
jal ra, h
hstp:
addi a0, a0, -100
addi a4, a4, 1
bge a0, a3, hstp
jal ra, printn
jal ra, t
tstp:
addi a0, a0, -10
addi a4, a4, 1
bge a0, a3, tstp
jal ra, printn
jal ra, o
end:
jal ra, get_cycles
sub a0, a0, t1
th:
# a0: cycle num divided
addi a3, x0, 1000 # a3 = divisor
addi a4, x0, 0 # a4 = quotient
bge a0, a3, thstp # a0 >= 1000
jal ra, printn
h:
addi a3, x0, 100
bge a0, a3, hstp
jal ra, printn
t:
addi a3, x0, 10
bge a0, a3, tstp
jal ra, printn
o:
addi a0, a0, 48
la t0, buffer
sb a0, 0(t0)
li a0, 1
la a1, buffer
addi a2, x0, 1
li a7, 64
ecall
li a7, 93 # Exit system call
ecall
get_cycles:
csrr a1, cycleh
csrr a0, cycle
csrr a2, cycleh
bne a1, a2, get_cycles
ret
```
**Version 2**
Adjustment:
1. **Algorithm** : Still shift to right logical, but in more efficient way. First, shift right 16 bits, the three caseit will need to handle. Equal to 1, the answer is 16. Bigger than 1, the right shift bits need to add `16/2=8`. Smaller than 1, the right shift bits need to subtract `16/2=8`. Just separate the range of 32 bits into two parts to find the position.
`time complexity : log2 => better`
```
.org 0
# Provide program starting address to linker
.data
data_0: .word 0b00000000000000000000000000000001
data_1: .word 0b00000000000000000000000010000001
data_2: .word 0b00000000000000001000000000000000
data_3: .word 0b00000000100000000001010000000100
data_4: .word 0b11111111111111111111111111111111
nextline: .ascii "\n"
.set str_size, .-nextline
buffer: .byte 0
.text
.global main
main:
jal ra, get_cycles
mv t1, a0
addi sp, sp, -20
# push four pointers of test data onto the stack
lw t0, data_0
sw t0, 0(sp)
lw t0, data_1
sw t0, 4(sp)
lw t0, data_2
sw t0, 8(sp)
lw t0, data_3
sw t0, 12(sp)
lw t0, data_4
sw t0, 16(sp)
addi s0, x0, 4 # s0 is the iteration times(4 test case)
addi s1, x0, 0 # s1 is counter
addi s2, sp, 0 # s2 initial at (0)sp
li a5, 2
li a6, 0
loop:
lw a1, 0(s2) #load data into a0
li a0, 16
li t2, 16
addi s2, s2, 4 # s2 move to next data
addi s1, s1, 1 # counter++
blt a1, a6, ulimitcase
blt a1, a5, llimitcase
jal clz
right:
sub a0, a0, t2
jal clz
left:
add a0, a0, t2
clz:
srl a2, a1, a0
srl t2, t2, 1
beq a2, x0, right
bge a2, a5, left
jal det
llimitcase:
li a0, 0
jal print
ulimitcase:
li a0, 31
det:
addi a3, x0, 10
addi a4, x0 , 0
blt a0, a3, print
jal ra, stp
# a0: number(0<=x<10)
print:
addi a0, a0, 48 # convert to ascii
la t0, buffer
sb a0, 0(t0)
li a0, 1
la a1, buffer
li a2, 1
li a7, 64
ecall
la a2, str_size
lw a2, 0(a2)
addi a0, x0, 1
la a1, nextline
li a7, 64
ecall
addi a3, x0, 0
bne s1, s0, loop
beq s1, s0, end
stp:
addi a0, a0, -10
addi a4, a4, 1
bge a0, a3, stp
printn:
# input & output
# a0: divided
# a4: quotient
addi a4, a4, 48
addi sp, sp, -4
la t0, buffer #
sb a4, 0(t0) # save quotientto buffer
sw a0, 0(sp) # divided save in stack
addi a0, x0, 1
la a1, buffer
addi a2, x0, 1
li a7, 64
ecall
lw a0, 0(sp) # lw divided in stack
addi sp, sp, 4
addi a4, x0, 0 # reset quotient
ret
thstp:
addi a0, a0, -1000
addi a4, a4, 1
bge a0, a3, thstp
jal ra, printn # a0 < 1000
jal ra, h
hstp:
addi a0, a0, -100
addi a4, a4, 1
bge a0, a3, hstp
jal ra, printn
jal ra, t
tstp:
addi a0, a0, -10
addi a4, a4, 1
bge a0, a3, tstp
jal ra, printn
jal ra, o
end:
jal ra, get_cycles
sub a0, a0, t1
th:
# a0: cycle num divided
addi a3, x0, 1000 # a3 = divisor
addi a4, x0, 0 # a4 = quotient
bge a0, a3, thstp # a0 >= 1000
jal ra, printn
h:
addi a3, x0, 100
bge a0, a3, hstp
jal ra, printn
t:
addi a3, x0, 10
bge a0, a3, tstp
jal ra, printn
o:
addi a0, a0, 48
la t0, buffer
sb a0, 0(t0)
li a0, 1
la a1, buffer
addi a2, x0, 1
li a7, 64
ecall
li a7, 93 # Exit system call
ecall
get_cycles:
csrr a1, cycleh
csrr a0, cycle
csrr a2, cycleh
bne a1, a2, get_cycles
ret
```
Compare:
**Testcase:**
To get the average cycles in different case, I use 5 cases as below.
```
0. data_0: 0b00000000000000000000000000000001
1. data_1: 0b00000000000000000000000010000001
2. data_2: 0b00000000000000001000000000000000
3. data_3: 0b00000000100000000001010000000100
4. data_4: 0b11111111111111111111111111111111
```
| code | ori | opt_1 | opt_2 |
|:------:|:---:|:-----:|:-----:|
| cycles | 393 | 377 | 320 |
The testcase 0 and testcase 4 are very unbalance to original code, because of the MSB in testcase is 1. Alse represent the value is negative so the code need to handle this exception, I decide to detect the numberwill output 31 once the value is negtive. In the other word, the number will skip the algorithm. Next is number 0 or 1, these two number is friendly to opt_1, the algorithm only need to shift right a few bits to get answer. Same testcase in opt_2, the algorithm cannot handle the right shift 0:
```
s:right shift bits
a:adjust value
s:16 a:8
s:8 a:4
s:4 a:2
s:2 a:1
s:1 a:0
s:1 a:0
...
```
But I need to show the average cycle it need, so I still use 5 testcase as above.
## **Analyse the ELF**
To know the different between compiler optimization O0, O1, O2, O3, Os, and Ofast, we diaassemble the .elf file with the instruction below:
```
riscv-none-elf-objdump -D codename.elf
```
Through the campare between opt_1 and opt_2, it's clearly to choose opt_2 as target to optimize.
Here is its correspond C code:
```c
#include <stdio.h>
#include <stdint.h>
uint16_t count_leading_zeros(uint64_t x)
{
int pace = 16;
int adj = 16;
int y;
while(x>1){
y = x >> pace;
adj>>=1;
if(y>1){
pace+=adj;
}
else if(y<1){
pace-=adj;
}
else if(y==1){
return (64-pace);
}
}
return 64;
}
int main() {
int input;
scanf("%d", &input);
printf("%d\n",64 - count_leading_zeros(input));
return 0;
}
```
Because of the size of file is too large, I put them in [github](https://github.com/david965154/Computer_Architecture_hw2/tree/main/object2asssembly)
In observation of the output, there have lots of trivial instrustion that it's no need in this code, because it's need to include all function in `stdlib.h` and `stdio.h`.
If we can replace the printf in `stdio.h` package with the assembly `printf` we can reduce no matter code size or execute time.
Now, to know the instruction size output from disassemble, we use the following instruction:
```shell
riscv-none-elf-size codename.elf
```
| | -O0 | -O1 | -O2 | -O3 | -Os | -Ofast |
| ---- | ----- | ----- | ----- | ----- | ----- | ------ |
| text | 51818 | 51478 | 51426 | 51426 | 51458 | 51426 |
| data | 1872 | 1872 | 1872 | 1872 | 1872 | 1872 |
| bss | 1528 | 1528 | 1528 | 1528 | 1528 | 1528 |
:::warning
You shall use RDCYCLE/RDCYCLEH instruction for the statistics of your program’s execution.
:notes: jserv
:::