# Assignment2: GNU Toolchain
contributed by [spadee27357](https://github.com/spadee27357)
## Question selection
I chose the [Implement Binarization by count leading zero](https://hackmd.io/@edenlin/CompArchi_HW1) from [林勁羽](https://github.com/JinYu1225/2023_Computer_Architecture/tree/main/Lab1)
The reason I chose this topic is that in image recognition tasks, it's common to encounter the use of binarization, a technique for image processing. This method is incredibly practical across various fields.
## Original Code
**C Code**
```c
#include <stdint.h>
#include <stdio.h>
uint32_t count_leading_zeros_32(uint32_t x)
{
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
x -= ((x >> 1) & 0x55555555);
x = ((x >> 2) & 0x33333333) + (x & 0x33333333);
x = ((x >> 4) + x) & 0x0f0f0f0f;
x += (x >> 8);
x += (x >> 16);
return (32 - (x & 0x3f)); // change 0x7f to 0x3f
}
int main(){
// pixel test
// 8-bit color depth for black and white photo
uint32_t picture[5] = {20,80,128,150,231};
uint32_t threshold = 128;
uint32_t *pixel = &picture;
for(int i = 0; i < 5; i++){
uint32_t sub = threshold - *(pixel+i);
printf("%d, ",i);
printf("before = %d, ",*(pixel+i));
sub = count_leading_zeros_32(sub);
if(sub)
*(pixel+i) = 0;
else
*(pixel+i) = 255;
printf("after = %d\n",*(pixel+i));
}
return 0;
}
```
**Assembly code**
```
# input the image from "test"
# output the binariztion result to "result"
# "threshold" can be change from 0 to 255
# each value of "test" can be change from 0 to 255
# if the number of "test" being change,
# the number of "result" and the "s1" parameter in "main" should be change as well
.data
test: .byte 20, 80, 128, 150, 231
result: .byte 0, 0, 0, 0, 0
threshold: .byte 0x80
mask1: .word 0x55555555
mask2: .word 0x33333333
mask4: .word 0x0f0f0f0f
str1: .string " "
str2: .string "\nThe Binarization Result:\n"
.text
main:
# a1 = address of pixel
# a2 = address of result
# a3 = threshold
la a1, test # load address of pixel
la a2, result # load address of result
la a3, threshold # load address of threshold
lbu a3, 0(a3) # load value of threshold
BINARIZATION:
li s1, 5 # count of haven't done pixel
B_LOOP:
lbu a4, 0(a1) # load value of pixel
sub a0, a3, a4 # threshold - pixel
jal ra, CLZ # jump to CLZ
bne a0, x0, FLOOR # branch to FLOOR if CLZ = 0
CEIL:
li a4, 255
j STORE_MOVE # jump to STORE_MOVE
FLOOR:
li a4, 0
STORE_MOVE:
sb a4, 0(a2)
addi s1, s1, -1 # total count -1
addi a1, a1, 1 # go to next address of pixel
addi a2, a2, 1 # go to next address of result
bne s1, x0, B_LOOP
li s1, 5 # make count for RESULT_CHECK
sub a2, a2, s1 # reset address of result
la a0, str2 # start to print the result
li a7, 4
ecall
RESULT_CHECK:
lbu a0, 0(a2) # load result to a0
jal ra, PRINT_INT
addi s1, s1, -1 # count print result
addi a2, a2, 1 # move to next address of result
bne s1, x0, RESULT_CHECK
j EXIT
CLZ:
# a0: x = the number to be counted leading zero
# t0: shifted x
# t1: x shift & mask
# t2: mask
PAD1:
srli t0, a0, 1 # t0 = x >> 1
or a0, a0, t0 # x |= x >> 1
srli t0, a0, 2 # t0 = x >> 2
or a0, a0, t0 # x |= x >> 2
srli t0, a0, 4 # t0 = x >> 4
or a0, a0, t0 # x |= x >> 4
srli t0, a0, 8 # t0 = x >> 8
or a0, a0, t0 # x |= x >> 8
srli t0, a0, 16 # t0 = x >> 16
or a0, a0, t0 # x |= x >> 16
POPCNT:
lw t2, mask1 # load mask1 to t2
srli t0, a0, 1 # t0 = x >> 1
and t1, t0, t2 # t1 = (x >> 1) & mask1
sub a0, a0, t1 # x -= ((x >> 1) & mask1)
lw t2, mask2 # load mask2 to t2
srli t0, a0, 2 # t0 = x >> 2
and t1, t0, t2 # (x >> 2) & mask2
and a0, a0, t2 # x & mask2
add a0, t1, a0 # ((x >> 2) & mask2) + (x & mask2)
srli t0, a0, 4 # t0 = x >> 4
add a0, a0, t0 # x + (x >> 4)
lw t2, mask4 # load mask4 to t2
and a0, a0, t2 # ((x >> 4) + x) & mask4
srli t0, a0, 8 # t0 = x >> 8
add a0, a0, t0 # x += (x >> 8)
srli t0, a0, 16 # t0 = x >> 16
add a0, a0, t0 # x += (x >> 16)
andi t0, a0, 0x3f # t0 = x & 0x3f
li a0, 32 # a0 = 64
sub a0, a0, t0 # 64 - (x & 0x3f)
addi sp, sp, -4 # make space for ra in stack
sw ra, 0(sp) # push ra into stack
jal ra, PRINT_INT
lw ra, 0(sp) # pull ra out of stack
addi sp, sp, 4 # free the space create in stack for ra
ret
PRINT_INT:
li a7, 1 # print the value of input a0 in int
ecall
addi sp, sp, -4
sw a0, 0(sp)
la a0, str1
li a7, 4
ecall
lw a0, 0(sp)
addi sp, sp, 4
ret
EXIT:
li a7, 10 # Halts the simulator
ecall
```
## Modified Code
**C Code**
```C
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <string.h>
extern uint64_t get_cycles();
extern uint64_t get_instret();
uint32_t count_leading_zeros_32(uint32_t x) {
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
x -= ((x >> 1) & 0x55555555);
x = ((x >> 2) & 0x33333333) + (x & 0x33333333);
x = ((x >> 4) + x) & 0x0f0f0f0f;
x += (x >> 8);
x += (x >> 16);
return (32 - (x & 0x3f));
}
int main() {
/* measure cycles */
uint64_t instret = get_instret();
uint64_t oldcount = get_cycles();
uint32_t picture[5] = {20, 80, 128, 150, 231};
uint32_t threshold = 230;
for (int i = 0; i < 5; i++) {
printf("%d, before = %lu, ", i, picture[i]);
picture[i] = (count_leading_zeros_32(threshold - picture[i]) != 0) ? 0 : 255;
printf("after = %lu\n", picture[i]);
}
uint64_t cyclecount = get_cycles() - oldcount;
printf("cycle count: %u\n", (unsigned int) cyclecount);
printf("instret: %x\n", (unsigned) (instret & 0xffffffff));
return 0;
}
```
**Assembly code**
```
.data
picture: .word 20, 80, 128, 150, 231
threshold: .word 230
str1: .string "%d, before = "
str2: .string ", after = "
str3: .string "\n"
.text
.globl main
main:
la a1, picture
li a4, 5
la a2, threshold
lw a2, 0(a2)
li t3, 0
LOOP:
lw a5, 0(a1)
mv a3, a5
sub a0, a2, a5
jal ra, count_leading_zeros
li a3, 32
sub a0, a3, a0
bnez a0, SET_BLACK
li a3, 255
j STORE
SET_BLACK:
li a3, 0
STORE:
sw a3, 0(a1)
mv a0, t3
call PRINT_INT
la a0, str1
li a7, 4
ecall
mv a0, a5
call PRINT_INT
la a0, str2
li a7, 4
ecall
mv a0, a3
call PRINT_INT
la a0, str3
li a7, 4
ecall
addi a1, a1, 4
addi t3, t3, 1
blt t3, a4, LOOP
li a7, 10
ecall
count_leading_zeros:
srli t1, a0, 1
or a0, a0, t1
srli t1, a0, 2
or a0, a0, t1
srli t1, a0, 4
or a0, a0, t1
srli t1, a0, 8
or a0, a0, t1
srli t1, a0, 16
or a0, a0, t1
# Start of population count
li t2, 0x55555555
srli t1, a0, 1
and t1, t1, t2
sub a0, a0, t1
li t2, 0x33333333
srli t1, a0, 2
and t1, t1, t2
and a0, a0, t2
add a0, a0, t1
srli t1, a0, 4
add a0, a0, t1
li t2, 0x0f0f0f0f
and a0, a0, t2
srli t1, a0, 8
add a0, a0, t1
srli t1, a0, 16
add a0, a0, t1
andi a0, a0, 0x3f
li t1, 64
sub a0, t1, a0
ret
PRINT_INT:
li a7, 1
ecall
ret
```
## Compile and execute the code
**Performance measurement**
Compile the C code to ELF
```
riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -Wall -c -o getcycles.o getcycles.s
riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -Wall -c -o getinstret.o getinstret.s
riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -O0 -Wall -c -o main.o main.c
riscv-none-elf-gcc -O0 -march=rv32i -mabi=ilp32 -o main.elf getcycles.o getinstret.o new.o
```
Result
```
~/rv32emu/build/rv32emu main.elf
```
```
0, before = 20, after = 0
1, before = 80, after = 0
2, before = 128, after = 0
3, before = 150, after = 0
4, before = 231, after = 255
cycle count: 14731
instret: 2c5
inferior exit code 0
```
Display the ELF Size
```
riscv-none-elf-size main.elf
```
```
text data bss dec hex filename
51752 1876 1528 55156 d774 main.elf
```
Display the ELF file header
```
riscv-none-elf-readelf -h main.elf
```
```
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x100c2
Start of program headers: 52 (bytes into file)
Start of section headers: 69428 (bytes into file)
Flags: 0x1, RVC, soft-float ABI
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
```
**GCC O0, O1, O2, O3, Os, Ofast optimization**
We can accurately determine the efficiency of O0, O1, O2, O3, and Os from the table below.
***Optimization Levels***
-O0: No optimization. This is the default level, meaning the compiler will compile quickly but perform no optimizations. This is typically used for debugging since it ensures the most direct correspondence between source code and generated machine code.
-O1: Basic optimization. This level attempts to reduce code size and execution time without significantly increasing compilation time.
-O2: Further optimization. Enhances execution efficiency and code size optimization without performing unsafe optimizations. This may make debugging more difficult.
-O3: Enables more optimization techniques, such as further inlining expansion and vectorization optimizations. This may increase the size of the code (sometimes referred to as "code bloat").
-Os: Focuses on reducing the size of the executable. This optimization tries to minimize code size without significantly impacting execution performance.
-Ofast: In addition to all the optimizations of -O3, it includes some optimizations that might change program behavior, such as ignoring accuracy and standard conformity.
```
riscv-none-elf-size main.elf
```
| | -O0 | -O1 | -O2 | -O3 | -Os | -Ofast |
| ----------- | ----- | ----- | ----- | ----- | ----- | ------ |
| text | 51752 | 51480 | 51480 | 51664 | 51446 | 51664 |
| data | 1876 | 1876 | 1876 | 1876 | 1876 | 1876 |
| bss | 1528 | 1528 | 1528 | 1528 | 1528 | 1528 |
| dec | 55156 | 54884 | 54884 | 55068 | 54850 | 55068 |
| Cycle Count | 14731 | 14395 | 14395 | 14351 | 14436 | 14351 |
| Instret | 2c5 | 2cc | 2cc | 2cf | 2cc | 2cf |
***Table Comparison***
text:
We can see that the size of the text section is largest at the -O0 level, indicating no optimization has been applied. From -O1 to -Ofast, there is almost no significant change in size, which could be due to the optimizations being primarily focused on performance rather than size, or the code itself has limited scope for further size reduction.
data and bss:
These sections usually contain static and global variables. Their size remains unchanged across different optimization levels, suggesting that the configuration of these variables is consistent regardless of the optimization level.
dec:
Overall, there isn’t much variation in size, showing that the main focus of optimization is on performance rather than size.
Cycle Count & Instret:
These are performance metrics. Moving from -O0 to -O1, we observe a notable improvement in performance, with a reduction in cycle counts. This indicates that even basic optimizations significantly impact performance. However, the performance gains from -O1 to -Ofast are not as substantial. This might be due to certain parts of the code reaching an optimization bottleneck, or the performance of the program itself is already nearing the hardware limits.
## Adapt the original assembly code
**Compare Modified Assembly Code**
```
riscv-none-elf-as -march=rv32i -mabi=ilp32 -o Hand.o Hand.s
riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -o Hand.elf Hand.o
```
**ELF Header**
```
riscv-none-elf-readelf -h Hand.elf
```
```
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x100d8
Start of program headers: 52 (bytes into file)
Start of section headers: 16492 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 14
Section header string table index: 13
```
**ELF size**
Similarly, we execute it in the terminal to obtain the result.
```
james@james-VirtualBox:~/rv32emu/programs/HW2$ riscv-none-elf-size Hand.elf
text data bss dec hex filename
8876 1420 840 11136 2b80 Hand.elf
```
We can make a comparison: the performance of the modified assembly code is better than the direct translation from C to ELF format.
| | text | data | bss | dec | Cycle Count | Instret |
| ----- | ----- | ---- | --- | --- | ----------- | ------- |
| -O0 | 51752 | 1876 | 1528 | 55156 | 14731 | 2c5 |
| -O1 | 51480 | 1876 | 1528 | 54884 | 14395 | 2cc |
| -O2 | 51480 | 1876 | 1528 | 54884 | 14395 | 2cc |
| -O3 | 51664 | 1876 | 1528 | 55068 | 14351 | 2cf |
| -Os | 51446 | 1876 | 1528 | 54850 | 14436 | 2cc |
| -Ofast | 51664 | 1876 | 1528 | 55068 | 14351 | 2cf |
|Modified| 6168 | 1420 | 840 | 8428 | 559 | 390 |

:::warning
You shall use RDCYCLE/RDCYCLEH instruction for the statistics of your program’s execution.
:notes: jserv
:::