# Assignment2: RISC-V Toolchain
contributed by < [terry23304](https://github.com/terry23304) >
## Choose a question
Problem: I chose [Implement Binarization by count leading zero](https://hackmd.io/@edenlin/CompArchi_HW1) by `edenlin`.
## Add perfcounter to C code
```c
#include <stdint.h>
#include <stdio.h>
#include <stdint.h>
extern uint64_t get_cycles();
extern uint64_t get_instret();
/*
 * Count the leading zero bits of a 32-bit word.
 *
 * Returns the number of zero bits above the most significant set bit of x;
 * the result is 32 when x is 0.
 */
uint32_t count_leading_zeros_32(uint32_t x)
{
    /* Propagate the highest set bit into every position below it
     * (shift amounts 1, 2, 4, 8, 16). */
    for (unsigned shift = 1; shift < 32; shift <<= 1)
        x |= x >> shift;

    /* Population count of the smeared word, summed in 2-, 4- and 8-bit
     * fields and then folded into the low byte. */
    uint32_t ones = x - ((x >> 1) & 0x55555555);
    ones = (ones & 0x33333333) + ((ones >> 2) & 0x33333333);
    ones = (ones + (ones >> 4)) & 0x0f0f0f0f;
    ones += ones >> 8;
    ones += ones >> 16;

    /* The count is at most 32, so six bits (0x3f) are enough to hold it. */
    return 32 - (ones & 0x3f);
}
/*
 * Binarize a five-pixel sample against a threshold using
 * count_leading_zeros_32(), then print the cycles and retired instructions
 * spent between the two counter samples.
 *
 * Note that the measured region contains the printf() calls made inside the
 * loop, so the reported counts include the cost of printing.
 */
int main(void)
{
    // pixel test
    // 8-bit color depth for black and white photo
    uint32_t picture[5] = {20, 80, 128, 150, 231};
    const int count = (int) (sizeof picture / sizeof picture[0]);
    uint32_t threshold = 128;
    /* The array decays to a pointer to its first element. `&picture` has
     * type uint32_t (*)[5], which is not compatible with uint32_t *. */
    uint32_t *pixel = picture;

    uint64_t oldinstret = get_instret();
    uint64_t oldcount = get_cycles();
    for (int i = 0; i < count; i++) {
        /* Unsigned subtraction wraps, so bit 31 of `sub` is set exactly when
         * the pixel is greater than the threshold (for these 8-bit values). */
        uint32_t sub = threshold - pixel[i];
        printf("%d, ", i);
        printf("before = %lu, ", (unsigned long) pixel[i]);
        /* A leading-zero count of 0 means bit 31 was set. */
        sub = count_leading_zeros_32(sub);
        if (sub) {
            pixel[i] = 0;
        } else {
            pixel[i] = 255;
        }
        printf("after = %lu\n", (unsigned long) pixel[i]);
    }
    uint64_t cyclecount = get_cycles() - oldcount;
    /* Report the difference, like the cycle count, rather than the raw
     * counter value sampled before the loop. */
    uint64_t instret = get_instret() - oldinstret;

    printf("cycle count: %u\n", (unsigned int) cyclecount);
    printf("instret: %x\n", (unsigned) (instret & 0xffffffff));
    return 0;
}
```
### Makefile
```shell
# Builds main.elf from main.c plus the getcycles/getinstret assembly helpers.
# Neither `all` nor `clean` names a real file, so both are declared phony.
.PHONY: all clean

# NOTE(review): CROSS_COMPILE is presumably defined by toolchain.mk - confirm.
include ../../mk/toolchain.mk

# RV32I with the Zicsr and Zifencei extensions, ILP32 ABI, no optimization.
CFLAGS = -march=rv32i_zicsr_zifencei -mabi=ilp32 -O0 -Wall

OBJS = \
	getcycles.o \
	getinstret.o \
	main.o
BIN = main.elf

# Recipe lines below must start with a TAB character.
%.o: %.S
	$(CROSS_COMPILE)gcc $(CFLAGS) -c -o $@ $<

%.o: %.c
	$(CROSS_COMPILE)gcc $(CFLAGS) -c -o $@ $<

all: $(BIN)

# Link every object file into the final ELF.
$(BIN): $(OBJS)
	$(CROSS_COMPILE)gcc -o $@ $^

clean:
	$(RM) $(BIN) $(OBJS)
```
### Use `make` to compile the C code into an ELF file
```shell
riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -O0 -Wall -c -o main.o main.c
riscv-none-elf-gcc -o main.elf getcycles.o getinstret.o main.o
```
## Observe different compilation options result
### -O0 Optimized Assembly Code
```c
0001016c <count_leading_zeros_32>:
1016c: fe010113 add sp,sp,-32
10170: 00812e23 sw s0,28(sp)
10174: 02010413 add s0,sp,32
10178: fea42623 sw a0,-20(s0)
1017c: fec42783 lw a5,-20(s0)
10180: 0017d793 srl a5,a5,0x1
10184: fec42703 lw a4,-20(s0)
10188: 00f767b3 or a5,a4,a5
1018c: fef42623 sw a5,-20(s0)
10190: fec42783 lw a5,-20(s0)
10194: 0027d793 srl a5,a5,0x2
10198: fec42703 lw a4,-20(s0)
1019c: 00f767b3 or a5,a4,a5
101a0: fef42623 sw a5,-20(s0)
101a4: fec42783 lw a5,-20(s0)
101a8: 0047d793 srl a5,a5,0x4
101ac: fec42703 lw a4,-20(s0)
101b0: 00f767b3 or a5,a4,a5
101b4: fef42623 sw a5,-20(s0)
101b8: fec42783 lw a5,-20(s0)
101bc: 0087d793 srl a5,a5,0x8
101c0: fec42703 lw a4,-20(s0)
101c4: 00f767b3 or a5,a4,a5
101c8: fef42623 sw a5,-20(s0)
101cc: fec42783 lw a5,-20(s0)
101d0: 0107d793 srl a5,a5,0x10
101d4: fec42703 lw a4,-20(s0)
101d8: 00f767b3 or a5,a4,a5
101dc: fef42623 sw a5,-20(s0)
101e0: fec42783 lw a5,-20(s0)
101e4: 0017d713 srl a4,a5,0x1
101e8: 555557b7 lui a5,0x55555
101ec: 55578793 add a5,a5,1365 # 55555555 <__BSS_END__+0x55537805>
101f0: 00f777b3 and a5,a4,a5
101f4: fec42703 lw a4,-20(s0)
101f8: 40f707b3 sub a5,a4,a5
101fc: fef42623 sw a5,-20(s0)
10200: fec42783 lw a5,-20(s0)
10204: 0027d713 srl a4,a5,0x2
10208: 333337b7 lui a5,0x33333
1020c: 33378793 add a5,a5,819 # 33333333 <__BSS_END__+0x333155e3>
10210: 00f77733 and a4,a4,a5
10214: fec42683 lw a3,-20(s0)
10218: 333337b7 lui a5,0x33333
1021c: 33378793 add a5,a5,819 # 33333333 <__BSS_END__+0x333155e3>
10220: 00f6f7b3 and a5,a3,a5
10224: 00f707b3 add a5,a4,a5
10228: fef42623 sw a5,-20(s0)
1022c: fec42783 lw a5,-20(s0)
10230: 0047d713 srl a4,a5,0x4
10234: fec42783 lw a5,-20(s0)
10238: 00f70733 add a4,a4,a5
1023c: 0f0f17b7 lui a5,0xf0f1
10240: f0f78793 add a5,a5,-241 # f0f0f0f <__BSS_END__+0xf0d31bf>
10244: 00f777b3 and a5,a4,a5
10248: fef42623 sw a5,-20(s0)
1024c: fec42783 lw a5,-20(s0)
10250: 0087d793 srl a5,a5,0x8
10254: fec42703 lw a4,-20(s0)
10258: 00f707b3 add a5,a4,a5
1025c: fef42623 sw a5,-20(s0)
10260: fec42783 lw a5,-20(s0)
10264: 0107d793 srl a5,a5,0x10
10268: fec42703 lw a4,-20(s0)
1026c: 00f707b3 add a5,a4,a5
10270: fef42623 sw a5,-20(s0)
10274: fec42783 lw a5,-20(s0)
10278: 03f7f793 and a5,a5,63
1027c: 02000713 li a4,32
10280: 40f707b3 sub a5,a4,a5
10284: 00078513 mv a0,a5
10288: 01c12403 lw s0,28(sp)
1028c: 02010113 add sp,sp,32
10290: 00008067 ret
00010294 <main>:
10294: fb010113 add sp,sp,-80
10298: 04112623 sw ra,76(sp)
1029c: 04812423 sw s0,72(sp)
102a0: 05010413 add s0,sp,80
102a4: 0001c7b7 lui a5,0x1c
102a8: d2478793 add a5,a5,-732 # 1bd24 <__clzsi2+0xbe>
102ac: 0007a583 lw a1,0(a5)
102b0: 0047a603 lw a2,4(a5)
102b4: 0087a683 lw a3,8(a5)
102b8: 00c7a703 lw a4,12(a5)
102bc: 0107a783 lw a5,16(a5)
102c0: fab42823 sw a1,-80(s0)
102c4: fac42a23 sw a2,-76(s0)
102c8: fad42c23 sw a3,-72(s0)
102cc: fae42e23 sw a4,-68(s0)
102d0: fcf42023 sw a5,-64(s0)
102d4: 08000793 li a5,128
102d8: fef42423 sw a5,-24(s0)
102dc: fb040793 add a5,s0,-80
102e0: fef42223 sw a5,-28(s0)
102e4: e75ff0ef jal 10158 <get_instret>
102e8: fca42c23 sw a0,-40(s0)
102ec: fcb42e23 sw a1,-36(s0)
102f0: e55ff0ef jal 10144 <get_cycles>
102f4: fca42823 sw a0,-48(s0)
102f8: fcb42a23 sw a1,-44(s0)
102fc: fe042623 sw zero,-20(s0)
10300: 0cc0006f j 103cc <main+0x138>
10304: fec42783 lw a5,-20(s0)
10308: 00279793 sll a5,a5,0x2
1030c: fe442703 lw a4,-28(s0)
10310: 00f707b3 add a5,a4,a5
10314: 0007a783 lw a5,0(a5)
10318: fe842703 lw a4,-24(s0)
1031c: 40f707b3 sub a5,a4,a5
10320: fcf42223 sw a5,-60(s0)
10324: fec42583 lw a1,-20(s0)
10328: 0001c7b7 lui a5,0x1c
1032c: cd878513 add a0,a5,-808 # 1bcd8 <__clzsi2+0x72>
10330: 4f8000ef jal 10828 <printf>
10334: fec42783 lw a5,-20(s0)
10338: 00279793 sll a5,a5,0x2
1033c: fe442703 lw a4,-28(s0)
10340: 00f707b3 add a5,a4,a5
10344: 0007a783 lw a5,0(a5)
10348: 00078593 mv a1,a5
1034c: 0001c7b7 lui a5,0x1c
10350: ce078513 add a0,a5,-800 # 1bce0 <__clzsi2+0x7a>
10354: 4d4000ef jal 10828 <printf>
10358: fc442503 lw a0,-60(s0)
1035c: e11ff0ef jal 1016c <count_leading_zeros_32>
10360: fca42223 sw a0,-60(s0)
10364: fc442783 lw a5,-60(s0)
10368: 00078e63 beqz a5,10384 <main+0xf0>
1036c: fec42783 lw a5,-20(s0)
10370: 00279793 sll a5,a5,0x2
10374: fe442703 lw a4,-28(s0)
10378: 00f707b3 add a5,a4,a5
1037c: 0007a023 sw zero,0(a5)
10380: 01c0006f j 1039c <main+0x108>
10384: fec42783 lw a5,-20(s0)
10388: 00279793 sll a5,a5,0x2
1038c: fe442703 lw a4,-28(s0)
10390: 00f707b3 add a5,a4,a5
10394: 0ff00713 li a4,255
10398: 00e7a023 sw a4,0(a5)
1039c: fec42783 lw a5,-20(s0)
103a0: 00279793 sll a5,a5,0x2
103a4: fe442703 lw a4,-28(s0)
103a8: 00f707b3 add a5,a4,a5
103ac: 0007a783 lw a5,0(a5)
103b0: 00078593 mv a1,a5
103b4: 0001c7b7 lui a5,0x1c
103b8: cf078513 add a0,a5,-784 # 1bcf0 <__clzsi2+0x8a>
103bc: 46c000ef jal 10828 <printf>
103c0: fec42783 lw a5,-20(s0)
103c4: 00178793 add a5,a5,1
103c8: fef42623 sw a5,-20(s0)
103cc: fec42703 lw a4,-20(s0)
103d0: 00400793 li a5,4
103d4: f2e7d8e3 bge a5,a4,10304 <main+0x70>
103d8: d6dff0ef jal 10144 <get_cycles>
103dc: 00050613 mv a2,a0
103e0: 00058693 mv a3,a1
103e4: fd042503 lw a0,-48(s0)
103e8: fd442583 lw a1,-44(s0)
103ec: 40a60733 sub a4,a2,a0
103f0: 00070813 mv a6,a4
103f4: 01063833 sltu a6,a2,a6
103f8: 40b687b3 sub a5,a3,a1
103fc: 410786b3 sub a3,a5,a6
10400: 00068793 mv a5,a3
10404: fce42423 sw a4,-56(s0)
10408: fcf42623 sw a5,-52(s0)
1040c: fc842783 lw a5,-56(s0)
10410: 00078593 mv a1,a5
10414: 0001c7b7 lui a5,0x1c
10418: d0078513 add a0,a5,-768 # 1bd00 <__clzsi2+0x9a>
1041c: 40c000ef jal 10828 <printf>
10420: fd842783 lw a5,-40(s0)
10424: 00078593 mv a1,a5
10428: 0001c7b7 lui a5,0x1c
1042c: d1478513 add a0,a5,-748 # 1bd14 <__clzsi2+0xae>
10430: 3f8000ef jal 10828 <printf>
10434: 00000793 li a5,0
10438: 00078513 mv a0,a5
1043c: 04c12083 lw ra,76(sp)
10440: 04812403 lw s0,72(sp)
10444: 05010113 add sp,sp,80
10448: 00008067 ret
```
The `-O0` disassembly has 17448 lines, so only `count_leading_zeros_32` and `main` are shown here for comparison.
#### elf size
`riscv64-unknown-elf-size ./main.elf`
```
text data bss dec hex filename
51804 1876 1528 55208 d7a8 ./main.elf
```
#### execute elf file
```
0, before = 20, after = 0
1, before = 80, after = 0
2, before = 128, after = 0
3, before = 150, after = 255
4, before = 231, after = 255
cycle count: 16582
instret: 2d7
inferior exit code 0
```
### -O1 Optimized Assembly Code
#### elf size
`riscv64-unknown-elf-size ./main.elf`
```
text data bss dec hex filename
51520 1876 1528 54924 d68c ./main.elf
```
#### execute elf file
```
0, before = 20, after = 0
1, before = 80, after = 0
2, before = 128, after = 0
3, before = 150, after = 255
4, before = 231, after = 255
cycle count: 16243
instret: 2dc
inferior exit code 0
```
### -O2 Optimized Assembly Code
#### elf size
`riscv64-unknown-elf-size ./main.elf`
```
text data bss dec hex filename
51520 1876 1528 54924 d68c ./main.elf
```
#### execute elf file
```
0, before = 20, after = 0
1, before = 80, after = 0
2, before = 128, after = 0
3, before = 150, after = 255
4, before = 231, after = 255
cycle count: 16243
instret: 2dc
inferior exit code 0
```
### -O3 Optimized Assembly Code
#### elf size
`riscv64-unknown-elf-size ./main.elf`
```
text data bss dec hex filename
51688 1876 1528 55092 d734 ./main.elf
```
#### execute elf file
```
0, before = 20, after = 0
1, before = 80, after = 0
2, before = 128, after = 0
3, before = 150, after = 255
4, before = 231, after = 255
cycle count: 16200
instret: 2dd
inferior exit code 0
```
### -Os Optimized Assembly Code
#### elf size
`riscv64-unknown-elf-size ./main.elf`
```
text data bss dec hex filename
51486 1876 1528 54890 d66a ./main.elf
```
#### execute elf file
```
0, before = 20, after = 0
1, before = 80, after = 0
2, before = 128, after = 0
3, before = 150, after = 255
4, before = 231, after = 255
cycle count: 16241
instret: 306
inferior exit code 0
```
### -Ofast Optimized Assembly Code
#### elf size
`riscv64-unknown-elf-size ./main.elf`
```
text data bss dec hex filename
51688 1876 1528 55092 d734 ./main.elf
```
#### execute elf file
```
0, before = 20, after = 0
1, before = 80, after = 0
2, before = 128, after = 0
3, before = 150, after = 255
4, before = 231, after = 255
cycle count: 16200
instret: 2dd
inferior exit code 0
```
### conclusion
**CSR**
| Name | cycle count |
| -------- | -------- |
| O0 | 16582 |
| O1 | 16243 |
| O2 | 16243 |
| O3 | 16200 |
| Os | 16241 |
| Ofast | 16200 |
- At optimization level O0, the generated code uses fewer lines of code and fewer registers than at O2.
- O1 and O2 give the same result: identical ELF size, cycle count, and instret value.
- At optimization level O3, the compiler utilizes a greater number of registers, resulting in a larger code size compared to O2. However, it minimizes the cycle count and offers the highest level of performance.
The program size is large because the C library functions declared in `stdlib.h` and `stdio.h` (such as `printf`) are linked into the ELF file, which adds numerous instructions that are unrelated to the algorithm itself.
## Rewrite code
I think there's no need to use CLZ to check whether a pixel is greater than the threshold. Therefore, I modified the code to decide the output from the sign bit (bit 31) of `threshold - pixel`: the pixel becomes 255 when that bit is set and 0 otherwise.
```c
#include <stdint.h>
#include <stdio.h>
#include <stdint.h>
extern uint64_t get_cycles();
extern uint64_t get_instret();
/*
 * Binarize a five-pixel sample against a threshold by testing the top bit of
 * (threshold - pixel), then print the cycles and retired instructions spent
 * between the two counter samples.
 *
 * Note that the measured region contains the printf() calls made inside the
 * loop, so the reported counts include the cost of printing.
 */
int main(void)
{
    // pixel test
    // 8-bit color depth for black and white photo
    uint32_t picture[5] = {20, 80, 128, 150, 231};
    const int count = (int) (sizeof picture / sizeof picture[0]);
    uint32_t threshold = 128;
    /* The array decays to a pointer to its first element. `&picture` has
     * type uint32_t (*)[5], which is not compatible with uint32_t *. */
    uint32_t *pixel = picture;

    uint64_t oldinstret = get_instret();
    uint64_t oldcount = get_cycles();
    for (int i = 0; i < count; i++) {
        /* Unsigned subtraction wraps, so bit 31 of `sub` is set exactly when
         * the pixel is greater than the threshold (for these 8-bit values). */
        uint32_t sub = threshold - pixel[i];
        printf("%d, ", i);
        printf("before = %lu, ", (unsigned long) pixel[i]);
        if (sub & 0x80000000) {
            pixel[i] = 255;
        } else {
            pixel[i] = 0;
        }
        printf("after = %lu\n", (unsigned long) pixel[i]);
    }
    uint64_t cyclecount = get_cycles() - oldcount;
    /* Report the difference, like the cycle count, rather than the raw
     * counter value sampled before the loop. */
    uint64_t instret = get_instret() - oldinstret;

    printf("cycle count: %u\n", (unsigned int) cyclecount);
    printf("instret: %x\n", (unsigned) (instret & 0xffffffff));
    return 0;
}
```
**elf size**
```
text data bss dec hex filename
51404 1876 1528 54808 d618 ./main.elf
```
**cycle count**
`cycle count: 16062`
:::warning
TODO: Revise the handwritten RISC-V assembly code.
:notes: jserv
:::
## Handwritten Assembly
Error:
```
(.text+0x38): undefined reference to `main'
collect2: error: ld returned 1 exit status
```
Add the following directives above the `main` label so that the linker can find the symbol.
```
.globl main
.type main, @function
```
Error:
```
unknown syscall 4
Segmentation fault (core dumped)
```
```
la a0, str2
li a7, 4
ecall
```
Modifications:
```
# System call numbers passed in a7.
.set SYSEXIT, 93
.set SYSWRITE, 64
# write: a0 = file descriptor 1, a1 = address of str2, a2 = byte count.
# NOTE(review): 26 must equal the length of str2, which is not shown here.
li a0, 1
la a1, str2
li a2, 26
li a7, SYSWRITE
ecall
# exit: reload a0 with status 0 first. This assumes the write call leaves its
# return value in a0 (the usual syscall convention), which would otherwise
# become the exit code.
li a0, 0
li a7, SYSEXIT
ecall
```
After the modification, the assembly code can run on rv32emu properly.