owned this note
owned this note
Published
Linked with GitHub
# Bubble Sort
C implementations which can execute with rv32emu.
```clike=
void _start()
{
volatile char* tx = (volatile char*)0x40002000;
volatile int arr[5] = {3,5,1,2,4};
volatile int tmp;
volatile int i=0;
for(int i=0; i<4; i++){
for(int j=0; j<4-i; j++){
if(arr[j] > arr[j+1]){
tmp = arr[j];
arr[j] = arr[j+1];
arr[j+1] = tmp;
}
}
}
while(i<5){
*tx = '0' + arr[i];
i++;
}
}
```
## Compare the assembly code between handwritten and compiler optimization one
Handwritten by <`Shengyuu`>
```cmake=
main:
#create stack
addi sp,sp,-48
#save s0
sw s0,44(sp)
#update s0
addi s0,sp,48
#init arr[] to memory
li a5,3
sw a5,-48(s0)
li a5,5
sw a5,-44(s0)
li a5,1
sw a5,-40(s0)
li a5,2
sw a5,-36(s0)
li a5,4
sw a5,-32(s0)
#init i to memory
sw zero,-20(s0)
j .L2
.L6:
#init j to memory
sw zero,-24(s0)
j .L3
.L5:
#load arr[ j ] to a4
lw a5,-24(s0)
slli a5,a5,2
addi a4,s0,-16
add a5,a4,a5
lw a4,-32(a5)
#load arr[j+1] a5
lw a5,-24(s0)
addi a5,a5,1
slli a5,a5,2
addi a3,s0,-16
add a5,a3,a5
lw a5,-32(a5)
#if arr[j + 1] > arr[j]
bge a5,a4,.L4
# arr[j+1] < arr[j]
#load arr[j] a5
lw a5,-24(s0)
slli a5,a5,2
addi a4,s0,-16
add a5,a4,a5
lw a5,-32(a5)
#store arr[j] to tmp
sw a5,-28(s0)
#load arr[j+1] to a4
lw a5,-24(s0)
addi a5,a5,1
slli a5,a5,2
addi a4,s0,-16
add a5,a4,a5
lw a4,-32(a5)
#arr[j] = arr[j+1]
lw a5,-24(s0)
slli a5,a5,2
addi a3,s0,-16
add a5,a3,a5
sw a4,-32(a5)
#store tmp to arr[j+1]
lw a5,-24(s0)
addi a5,a5,1
slli a5,a5,2
addi a4,s0,-16
add a5,a4,a5
lw a4,-28(s0)
sw a4,-32(a5)
.L4:
#j++
lw a5,-24(s0)
addi a5,a5,1
sw a5,-24(s0)
.L3:
#check (j - i) < 4
li a4,4
lw a5,-20(s0)
sub a5,a4,a5
lw a4,-24(s0)
blt a4,a5,.L5
#i++
lw a5,-20(s0)
addi a5,a5,1
sw a5,-20(s0)
.L2:
#check i < 4
lw a4,-20(s0)
li a5,3
bge a5,a4,.L6
#return
li a5,0
mv a0,a5
lw s0,44(sp)
addi sp,sp,48
jr ra
```
Compiler optimized:
1. First, we compare the results by different `RV32I_CFLAGS` from `-Os` to `O3`
* -Os:
```clike
./emu-rv32i bubbleSort
12345
>>> Execution time: 49216 ns
>>> Instruction count: 487 (IPS=9895156)
>>> Jumps: 31 (6.37%) - 11 forwards, 20 backwards
>>> Branching T=24 (68.57%) F=11 (31.43%)
```
* -O1:
```clike
./emu-rv32i bubbleSort
12345
>>> Execution time: 44262 ns
>>> Instruction count: 487 (IPS=11002665)
>>> Jumps: 31 (6.37%) - 11 forwards, 20 backwards
>>> Branching T=24 (68.57%) F=11 (31.43%)
```
* -O2:
```clike
./emu-rv32i bubbleSort
12345
>>> Execution time: 40263 ns
>>> Instruction count: 487 (IPS=12095472)
>>> Jumps: 31 (6.37%) - 11 forwards, 20 backwards
>>> Branching T=24 (68.57%) F=11 (31.43%)
```
* -O3:
```clike
./emu-rv32i bubbleSort
12345
>>> Execution time: 135208 ns
>>> Instruction count: 487 (IPS=3601857)
>>> Jumps: 31 (6.37%) - 11 forwards, 20 backwards
>>> Branching T=24 (68.57%) F=11 (31.43%)
```
2. Second, we trace the assembly code optimized by -O3
```cmake=
00010054 <_start>:
10054: fc010113 addi sp,sp,-64
10058: 02812e23 sw s0,60(sp)
1005c: 04010413 addi s0,sp,64
10060: 400027b7 lui a5,0x40002
10064: fef42223 sw a5,-28(s0)
10068: 000107b7 lui a5,0x10
1006c: 1d47a583 lw a1,468(a5) # 101d4 <_start+0x180>
10070: 1d478713 addi a4,a5,468
10074: 00472603 lw a2,4(a4)
10078: 1d478713 addi a4,a5,468
1007c: 00872683 lw a3,8(a4)
10080: 1d478713 addi a4,a5,468
10084: 00c72703 lw a4,12(a4)
10088: 1d478793 addi a5,a5,468
1008c: 0107a783 lw a5,16(a5)
10090: fcb42823 sw a1,-48(s0)
10094: fcc42a23 sw a2,-44(s0)
10098: fcd42c23 sw a3,-40(s0)
1009c: fce42e23 sw a4,-36(s0)
100a0: fef42023 sw a5,-32(s0)
100a4: fc042423 sw zero,-56(s0)
100a8: fe042623 sw zero,-20(s0)
100ac: 0c80006f j 10174 <_start+0x120>
100b0: fe042423 sw zero,-24(s0)
100b4: 0a00006f j 10154 <_start+0x100>
100b8: fe842783 lw a5,-24(s0)
100bc: 00279793 slli a5,a5,0x2
100c0: ff040713 addi a4,s0,-16
100c4: 00f707b3 add a5,a4,a5
100c8: fe07a703 lw a4,-32(a5)
100cc: fe842783 lw a5,-24(s0)
100d0: 00178793 addi a5,a5,1
100d4: 00279793 slli a5,a5,0x2
100d8: ff040693 addi a3,s0,-16
100dc: 00f687b3 add a5,a3,a5
100e0: fe07a783 lw a5,-32(a5)
100e4: 06e7d263 bge a5,a4,10148 <_start+0xf4>
100e8: fe842783 lw a5,-24(s0)
100ec: 00279793 slli a5,a5,0x2
100f0: ff040713 addi a4,s0,-16
100f4: 00f707b3 add a5,a4,a5
100f8: fe07a783 lw a5,-32(a5)
100fc: fcf42623 sw a5,-52(s0)
10100: fe842783 lw a5,-24(s0)
10104: 00178793 addi a5,a5,1
10108: 00279793 slli a5,a5,0x2
1010c: ff040713 addi a4,s0,-16
10110: 00f707b3 add a5,a4,a5
10114: fe07a703 lw a4,-32(a5)
10118: fe842783 lw a5,-24(s0)
1011c: 00279793 slli a5,a5,0x2
10120: ff040693 addi a3,s0,-16
10124: 00f687b3 add a5,a3,a5
10128: fee7a023 sw a4,-32(a5)
1012c: fe842783 lw a5,-24(s0)
10130: 00178793 addi a5,a5,1
10134: fcc42703 lw a4,-52(s0)
10138: 00279793 slli a5,a5,0x2
1013c: ff040693 addi a3,s0,-16
10140: 00f687b3 add a5,a3,a5
10144: fee7a023 sw a4,-32(a5)
10148: fe842783 lw a5,-24(s0)
1014c: 00178793 addi a5,a5,1
10150: fef42423 sw a5,-24(s0)
10154: 00400713 li a4,4
10158: fec42783 lw a5,-20(s0)
1015c: 40f707b3 sub a5,a4,a5
10160: fe842703 lw a4,-24(s0)
10164: f4f74ae3 blt a4,a5,100b8 <_start+0x64>
10168: fec42783 lw a5,-20(s0)
1016c: 00178793 addi a5,a5,1
10170: fef42623 sw a5,-20(s0)
10174: fec42703 lw a4,-20(s0)
10178: 00300793 li a5,3
1017c: f2e7dae3 bge a5,a4,100b0 <_start+0x5c>
10180: 0380006f j 101b8 <_start+0x164>
10184: fc842783 lw a5,-56(s0)
10188: 00279793 slli a5,a5,0x2
1018c: ff040713 addi a4,s0,-16
10190: 00f707b3 add a5,a4,a5
10194: fe07a783 lw a5,-32(a5)
10198: 0ff7f793 andi a5,a5,255
1019c: 03078793 addi a5,a5,48
101a0: 0ff7f713 andi a4,a5,255
101a4: fe442783 lw a5,-28(s0)
101a8: 00e78023 sb a4,0(a5)
101ac: fc842783 lw a5,-56(s0)
101b0: 00178793 addi a5,a5,1
101b4: fcf42423 sw a5,-56(s0)
101b8: fc842703 lw a4,-56(s0)
101bc: 00400793 li a5,4
101c0: fce7d2e3 bge a5,a4,10184 <_start+0x130>
101c4: 00000013 nop
101c8: 03c12403 lw s0,60(sp)
101cc: 04010113 addi sp,sp,64
101d0: 00008067 ret
```
## Statistics of execution flow
Branch :
* True counter :
Count the number of successful branch jumps.
Ex : beq s0, s1, `L1`
If the value in register `s0` and `s1` are same, then jump to `L1`. At this time, true counter will add one.
* False counter :
Count the number of failure branch jumps.
Using the example above, if the value inregister `s0` and `s1` are different, then execute the next instruction below `beq`
Jump :
* forward :
forward jump means that the next PC to be executed will be larger than the current PC.
* backward :
backward jump means that the next PC to be executed will be smaller than the current PC.
###### tags: `RISC-V`