# Assignment2: GNU Toolchain
contributed by <[`KuanYuan0530`](https://github.com/KuanYuan0530)>
## Program selection
Topic: [`Approximating a bfloat number using binary search`](https://hackmd.io/@JY7VQTBOSN-L5160WDDdmQ/SJw8t_6ea) by [`coding-ray`](https://github.com/timothyliu0912/computer_architecture/tree/main/Assignment1) (劉庭聿)
### Original C code
```c
#include <stdio.h>
/*
 * Round a single-precision value to bfloat16 precision.
 *
 * The result is still returned as a float, but only the sign bit, the 8
 * exponent bits and the top 7 mantissa bits can be non-zero (the low 16 bits
 * are cleared).
 *
 * x: value to convert.
 * Returns x unchanged when it is +/-0 or when its exponent field is all ones
 * (infinity / NaN); otherwise x rounded as described below.
 *
 * NOTE: like the previous version, this assumes `unsigned int` is 32 bits
 * wide and that `float` is IEEE-754 binary32.
 */
float fp32_to_bf16(float x)
{
    /*
     * Access the bit pattern through a union.  The previous version read and
     * wrote the float through an `int *`, which violates the strict-aliasing
     * rule (undefined behaviour); unsigned arithmetic also keeps the masks
     * with bit 31 set free of signed-conversion surprises.
     */
    union {
        float f;
        unsigned int u;
    } in, half_ulp, out;

    in.f = x;
    const unsigned int exp = in.u & 0x7F800000u;
    const unsigned int man = in.u & 0x007FFFFFu;

    /* +/-0: nothing to round. */
    if (exp == 0 && man == 0)
        return x;
    /* Exponent all ones (infinity or NaN): pass through untouched. */
    if (exp == 0x7F800000u)
        return x;

    /*
     * Keep only sign and exponent of x (i.e. +/-2^e), then divide by 256 to
     * obtain +/-2^(e-8): half of the weight of the lowest mantissa bit that
     * survives the truncation below.  For subnormal inputs (exp == 0) this
     * term is +/-0, so they are simply truncated.
     */
    half_ulp.f = x;
    half_ulp.u &= 0xFF800000u;
    half_ulp.f /= 0x100;

    /* Add the half step, then drop the low 16 bits. */
    out.f = x + half_ulp.f;
    out.u &= 0xFFFF0000u;
    return out.f;
}
/*
 * Search the interval [low, high] for `target` by repeated bisection.
 *
 * All three arguments are first passed through fp32_to_bf16().  Each step
 * computes the arithmetic midpoint in float precision and compares it with
 * the converted target.
 *
 * low, high: bounds of the search interval.
 * target:    value to look for.
 * Returns the midpoint that compared equal to the converted target, or -1
 * when no midpoint ever matches.
 */
float binary_search(float low, float high, float target) {
    low = fp32_to_bf16(low);
    high = fp32_to_bf16(high);
    target = fp32_to_bf16(target);

    while (low <= high) {
        float mid = low + (high - low) / 2;

        if (mid == target) {
            return mid;
        }

        /*
         * The bounds are moved to `mid` itself (not past it), so `low <= high`
         * never becomes false on its own.  Once the midpoint coincides with
         * one of the bounds, the state can no longer change and the previous
         * version looped forever (e.g. when target lies outside the
         * interval).  Report "not found" instead.
         */
        if (mid == low || mid == high) {
            break;
        }

        if (mid < target) {
            low = mid;
        }
        else {
            high = mid;
        }
    }
    return -1;
}
/*
 * Driver: runs the three sample searches and prints each result on its own
 * line using the "%f" format.
 */
int main(){
    /* One row per test case: lower bound, upper bound, value searched for. */
    const struct {
        float lb;
        float ub;
        float x;
    } test_cases[] = {
        { 0.001, 10.0,  0.1   },
        { 10.0,  256.0, 100.0 },
        { 0.01,  1.0,   0.563 },
    };
    const int count = (int) (sizeof test_cases / sizeof test_cases[0]);

    for (int i = 0; i < count; i++) {
        float found = binary_search(test_cases[i].lb, test_cases[i].ub, test_cases[i].x);
        printf("%f\n", found);
    }
    return 0;
}
```
## Implementation
### Using GNU Toolchain
#### Optimization by compiler
* gcc
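> `-O0` ~ `-O3` : Select the optimization level (`-O0` disables most optimizations; higher levels optimize more aggressively)
> `-Os` : Optimize for code size
> `-Ofast` : Optimize for speed; enables `-O3` plus optimizations that disregard strict standards compliance (e.g. `-ffast-math`)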
```shell
$ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O1 Source2.c -o hw2_o1.elf
```
#### Disassemble
* objdump
> `-d` : Display the assembler mnemonics for the machine instructions
```shell
$ riscv-none-elf-objdump -d hw2_o1.elf
```
#### Show ELF information
* readelf
> `-h` : Display the ELF file header
```shell
$ riscv-none-elf-readelf -h hw2_o1.elf
```
#### List section sizes
* size
```shell
$ riscv-none-elf-size hw2_o1.elf
```
### Result
#### O0 optimization
:::spoiler {state="open"}**objdump**
```
00010134 <fp32_to_bf16>:
10134: fc010113 addi sp,sp,-64
10138: 02112e23 sw ra,60(sp)
1013c: 02812c23 sw s0,56(sp)
10140: 04010413 addi s0,sp,64
10144: fca42623 sw a0,-52(s0)
10148: fcc42783 lw a5,-52(s0)
1014c: fcf42e23 sw a5,-36(s0)
10150: fdc40793 addi a5,s0,-36
10154: fef42623 sw a5,-20(s0)
10158: fec42783 lw a5,-20(s0)
1015c: 0007a783 lw a5,0(a5)
10160: 00078713 mv a4,a5
10164: 7f8007b7 lui a5,0x7f800
10168: 00f777b3 and a5,a4,a5
1016c: fef42423 sw a5,-24(s0)
10170: fec42783 lw a5,-20(s0)
10174: 0007a783 lw a5,0(a5) # 7f800000 <__BSS_END__+0x7f7de5c0>
10178: 00078713 mv a4,a5
1017c: 008007b7 lui a5,0x800
10180: fff78793 addi a5,a5,-1 # 7fffff <__BSS_END__+0x7de5bf>
10184: 00f777b3 and a5,a4,a5
10188: fef42223 sw a5,-28(s0)
1018c: fe842783 lw a5,-24(s0)
10190: 00079a63 bnez a5,101a4 <fp32_to_bf16+0x70>
10194: fe442783 lw a5,-28(s0)
10198: 00079663 bnez a5,101a4 <fp32_to_bf16+0x70>
1019c: fcc42783 lw a5,-52(s0)
101a0: 0900006f j 10230 <fp32_to_bf16+0xfc>
101a4: fe842703 lw a4,-24(s0)
101a8: 7f8007b7 lui a5,0x7f800
101ac: 00f71663 bne a4,a5,101b8 <fp32_to_bf16+0x84>
101b0: fcc42783 lw a5,-52(s0)
101b4: 07c0006f j 10230 <fp32_to_bf16+0xfc>
101b8: fcc42783 lw a5,-52(s0)
101bc: fcf42c23 sw a5,-40(s0)
101c0: fd840793 addi a5,s0,-40
101c4: fef42023 sw a5,-32(s0)
101c8: fe042783 lw a5,-32(s0)
101cc: 0007a703 lw a4,0(a5) # 7f800000 <__BSS_END__+0x7f7de5c0>
101d0: ff8007b7 lui a5,0xff800
101d4: 00f77733 and a4,a4,a5
101d8: fe042783 lw a5,-32(s0)
101dc: 00e7a023 sw a4,0(a5) # ff800000 <__BSS_END__+0xff7de5c0>
101e0: fd842703 lw a4,-40(s0)
101e4: 0001f7b7 lui a5,0x1f
101e8: 7a47a583 lw a1,1956(a5) # 1f7a4 <_exit+0xa>
101ec: 00070513 mv a0,a4
101f0: 61a000ef jal ra,1080a <__divsf3>
101f4: 00050793 mv a5,a0
101f8: fcf42c23 sw a5,-40(s0)
101fc: fd842783 lw a5,-40(s0)
10200: fcc42583 lw a1,-52(s0)
10204: 00078513 mv a0,a5
10208: 26c000ef jal ra,10474 <__addsf3>
1020c: 00050793 mv a5,a0
10210: fcf42e23 sw a5,-36(s0)
10214: fec42783 lw a5,-20(s0)
10218: 0007a703 lw a4,0(a5)
1021c: ffff07b7 lui a5,0xffff0
10220: 00f77733 and a4,a4,a5
10224: fec42783 lw a5,-20(s0)
10228: 00e7a023 sw a4,0(a5) # ffff0000 <__BSS_END__+0xfffce5c0>
1022c: fdc42783 lw a5,-36(s0)
10230: 00078513 mv a0,a5
10234: 03c12083 lw ra,60(sp)
10238: 03812403 lw s0,56(sp)
1023c: 04010113 addi sp,sp,64
10240: 00008067 ret
00010244 <binary_search>:
10244: fd010113 addi sp,sp,-48
10248: 02112623 sw ra,44(sp)
1024c: 02812423 sw s0,40(sp)
10250: 03010413 addi s0,sp,48
10254: fca42e23 sw a0,-36(s0)
10258: fcb42c23 sw a1,-40(s0)
1025c: fcc42a23 sw a2,-44(s0)
10260: fdc42503 lw a0,-36(s0)
10264: ed1ff0ef jal ra,10134 <fp32_to_bf16>
10268: fca42e23 sw a0,-36(s0)
1026c: fd842503 lw a0,-40(s0)
10270: ec5ff0ef jal ra,10134 <fp32_to_bf16>
10274: fca42c23 sw a0,-40(s0)
10278: fd442503 lw a0,-44(s0)
1027c: eb9ff0ef jal ra,10134 <fp32_to_bf16>
10280: fca42a23 sw a0,-44(s0)
10284: 0840006f j 10308 <binary_search+0xc4>
10288: fdc42583 lw a1,-36(s0)
1028c: fd842503 lw a0,-40(s0)
10290: 0f5000ef jal ra,10b84 <__subsf3>
10294: 00050793 mv a5,a0
10298: 00078713 mv a4,a5
1029c: 0001f7b7 lui a5,0x1f
102a0: 7a87a583 lw a1,1960(a5) # 1f7a8 <_exit+0xe>
102a4: 00070513 mv a0,a4
102a8: 562000ef jal ra,1080a <__divsf3>
102ac: 00050793 mv a5,a0
102b0: 00078593 mv a1,a5
102b4: fdc42503 lw a0,-36(s0)
102b8: 1bc000ef jal ra,10474 <__addsf3>
102bc: 00050793 mv a5,a0
102c0: fef42623 sw a5,-20(s0)
102c4: fd442583 lw a1,-44(s0)
102c8: fec42503 lw a0,-20(s0)
102cc: 7e2000ef jal ra,10aae <__eqsf2>
102d0: 00050793 mv a5,a0
102d4: 00079663 bnez a5,102e0 <binary_search+0x9c>
102d8: fec42783 lw a5,-20(s0)
102dc: 0480006f j 10324 <binary_search+0xe0>
102e0: fd442583 lw a1,-44(s0)
102e4: fec42503 lw a0,-20(s0)
102e8: 01b000ef jal ra,10b02 <__lesf2>
102ec: 00050793 mv a5,a0
102f0: 0007d863 bgez a5,10300 <binary_search+0xbc>
102f4: fec42783 lw a5,-20(s0)
102f8: fcf42e23 sw a5,-36(s0)
102fc: 00c0006f j 10308 <binary_search+0xc4>
10300: fec42783 lw a5,-20(s0)
10304: fcf42c23 sw a5,-40(s0)
10308: fd842583 lw a1,-40(s0)
1030c: fdc42503 lw a0,-36(s0)
10310: 7f2000ef jal ra,10b02 <__lesf2>
10314: 00050793 mv a5,a0
10318: f6f058e3 blez a5,10288 <binary_search+0x44>
1031c: 0001f7b7 lui a5,0x1f
10320: 7ac7a783 lw a5,1964(a5) # 1f7ac <_exit+0x12>
10324: 00078513 mv a0,a5
10328: 02c12083 lw ra,44(sp)
1032c: 02812403 lw s0,40(sp)
10330: 03010113 addi sp,sp,48
10334: 00008067 ret
00010338 <main>:
10338: fc010113 addi sp,sp,-64
1033c: 02112e23 sw ra,60(sp)
10340: 02812c23 sw s0,56(sp)
10344: 04010413 addi s0,sp,64
10348: 0001f7b7 lui a5,0x1f
1034c: 7b07a783 lw a5,1968(a5) # 1f7b0 <_exit+0x16>
10350: fef42623 sw a5,-20(s0)
10354: 0001f7b7 lui a5,0x1f
10358: 7b47a783 lw a5,1972(a5) # 1f7b4 <_exit+0x1a>
1035c: fef42423 sw a5,-24(s0)
10360: 0001f7b7 lui a5,0x1f
10364: 7b87a783 lw a5,1976(a5) # 1f7b8 <_exit+0x1e>
10368: fef42223 sw a5,-28(s0)
1036c: 0001f7b7 lui a5,0x1f
10370: 7bc7a783 lw a5,1980(a5) # 1f7bc <_exit+0x22>
10374: fef42023 sw a5,-32(s0)
10378: 0001f7b7 lui a5,0x1f
1037c: 7a47a783 lw a5,1956(a5) # 1f7a4 <_exit+0xa>
10380: fcf42e23 sw a5,-36(s0)
10384: 0001f7b7 lui a5,0x1f
10388: 7b47a783 lw a5,1972(a5) # 1f7b4 <_exit+0x1a>
1038c: fcf42c23 sw a5,-40(s0)
10390: 0001f7b7 lui a5,0x1f
10394: 7c07a783 lw a5,1984(a5) # 1f7c0 <_exit+0x26>
10398: fcf42a23 sw a5,-44(s0)
1039c: 0001f7b7 lui a5,0x1f
103a0: 7c47a783 lw a5,1988(a5) # 1f7c4 <_exit+0x2a>
103a4: fcf42823 sw a5,-48(s0)
103a8: 0001f7b7 lui a5,0x1f
103ac: 7c87a783 lw a5,1992(a5) # 1f7c8 <_exit+0x2e>
103b0: fcf42623 sw a5,-52(s0)
103b4: fec42603 lw a2,-20(s0)
103b8: fe842583 lw a1,-24(s0)
103bc: fe442503 lw a0,-28(s0)
103c0: e85ff0ef jal ra,10244 <binary_search>
103c4: 00050793 mv a5,a0
103c8: 00078513 mv a0,a5
103cc: 341000ef jal ra,10f0c <__extendsfdf2>
103d0: 00050713 mv a4,a0
103d4: 00058793 mv a5,a1
103d8: 00070613 mv a2,a4
103dc: 00078693 mv a3,a5
103e0: 0001f7b7 lui a5,0x1f
103e4: 7a078513 addi a0,a5,1952 # 1f7a0 <_exit+0x6>
103e8: 577000ef jal ra,1115e <printf>
103ec: fe042603 lw a2,-32(s0)
103f0: fdc42583 lw a1,-36(s0)
103f4: fd842503 lw a0,-40(s0)
103f8: e4dff0ef jal ra,10244 <binary_search>
103fc: 00050793 mv a5,a0
10400: 00078513 mv a0,a5
10404: 309000ef jal ra,10f0c <__extendsfdf2>
10408: 00050713 mv a4,a0
1040c: 00058793 mv a5,a1
10410: 00070613 mv a2,a4
10414: 00078693 mv a3,a5
10418: 0001f7b7 lui a5,0x1f
1041c: 7a078513 addi a0,a5,1952 # 1f7a0 <_exit+0x6>
10420: 53f000ef jal ra,1115e <printf>
10424: fd442603 lw a2,-44(s0)
10428: fd042583 lw a1,-48(s0)
1042c: fcc42503 lw a0,-52(s0)
10430: e15ff0ef jal ra,10244 <binary_search>
10434: 00050793 mv a5,a0
10438: 00078513 mv a0,a5
1043c: 2d1000ef jal ra,10f0c <__extendsfdf2>
10440: 00050713 mv a4,a0
10444: 00058793 mv a5,a1
10448: 00070613 mv a2,a4
1044c: 00078693 mv a3,a5
10450: 0001f7b7 lui a5,0x1f
10454: 7a078513 addi a0,a5,1952 # 1f7a0 <_exit+0x6>
10458: 507000ef jal ra,1115e <printf>
1045c: 00000793 li a5,0
10460: 00078513 mv a0,a5
10464: 03c12083 lw ra,60(sp)
10468: 03812403 lw s0,56(sp)
1046c: 04010113 addi sp,sp,64
10470: 00008067 ret
```
:::
:::spoiler {state="open"}**readelf**
```
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x100a6
Start of program headers: 52 (bytes into file)
Start of section headers: 89784 (bytes into file)
Flags: 0x1, RVC, soft-float ABI
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 2
Size of section headers: 40 (bytes)
Number of section headers: 17
Section header string table index: 16
```
:::
:::spoiler {state="open"}**size**
```
text data bss dec hex filename
67196 2512 92 69800 110a8 hw2_o0.elf
```
:::
#### O1 optimization
:::spoiler **objdump**
```
00010134 <fp32_to_bf16>:
10134: ff010113 addi sp,sp,-16
10138: 00112623 sw ra,12(sp)
1013c: 00812423 sw s0,8(sp)
10140: 00050413 mv s0,a0
10144: 00151793 slli a5,a0,0x1
10148: 02078863 beqz a5,10178 <fp32_to_bf16+0x44>
1014c: 7f8007b7 lui a5,0x7f800
10150: 00a7f733 and a4,a5,a0
10154: 02f70263 beq a4,a5,10178 <fp32_to_bf16+0x44>
10158: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18>
1015c: ff800537 lui a0,0xff800
10160: 00857533 and a0,a0,s0
10164: 67e000ef jal ra,107e2 <__mulsf3>
10168: 00040593 mv a1,s0
1016c: 188000ef jal ra,102f4 <__addsf3>
10170: ffff07b7 lui a5,0xffff0
10174: 00a7f533 and a0,a5,a0
10178: 00c12083 lw ra,12(sp)
1017c: 00812403 lw s0,8(sp)
10180: 01010113 addi sp,sp,16
10184: 00008067 ret
00010188 <binary_search>:
10188: fe010113 addi sp,sp,-32
1018c: 00112e23 sw ra,28(sp)
10190: 00812c23 sw s0,24(sp)
10194: 00912a23 sw s1,20(sp)
10198: 01212823 sw s2,16(sp)
1019c: 01312623 sw s3,12(sp)
101a0: 01412423 sw s4,8(sp)
101a4: 00058913 mv s2,a1
101a8: 00060413 mv s0,a2
101ac: f89ff0ef jal ra,10134 <fp32_to_bf16>
101b0: 00050493 mv s1,a0
101b4: 00090513 mv a0,s2
101b8: f7dff0ef jal ra,10134 <fp32_to_bf16>
101bc: 00050913 mv s2,a0
101c0: 00040513 mv a0,s0
101c4: f71ff0ef jal ra,10134 <fp32_to_bf16>
101c8: 00050993 mv s3,a0
101cc: 1c01aa03 lw s4,448(gp) # 21f50 <__SDATA_BEGIN__+0x20>
101d0: 00090593 mv a1,s2
101d4: 00048513 mv a0,s1
101d8: 588000ef jal ra,10760 <__lesf2>
101dc: 04a04a63 bgtz a0,10230 <binary_search+0xa8>
101e0: 00048593 mv a1,s1
101e4: 00090513 mv a0,s2
101e8: 083000ef jal ra,10a6a <__subsf3>
101ec: 000a0593 mv a1,s4
101f0: 5f2000ef jal ra,107e2 <__mulsf3>
101f4: 00048593 mv a1,s1
101f8: 0fc000ef jal ra,102f4 <__addsf3>
101fc: 00050413 mv s0,a0
10200: 00050593 mv a1,a0
10204: 00098513 mv a0,s3
10208: 482000ef jal ra,1068a <__eqsf2>
1020c: 02050463 beqz a0,10234 <binary_search+0xac>
10210: 00040593 mv a1,s0
10214: 00098513 mv a0,s3
10218: 4c6000ef jal ra,106de <__gesf2>
1021c: 00a05663 blez a0,10228 <binary_search+0xa0>
10220: 00040493 mv s1,s0
10224: fadff06f j 101d0 <binary_search+0x48>
10228: 00040913 mv s2,s0
1022c: fa5ff06f j 101d0 <binary_search+0x48>
10230: 1bc1a403 lw s0,444(gp) # 21f4c <__SDATA_BEGIN__+0x1c>
10234: 00040513 mv a0,s0
10238: 01c12083 lw ra,28(sp)
1023c: 01812403 lw s0,24(sp)
10240: 01412483 lw s1,20(sp)
10244: 01012903 lw s2,16(sp)
10248: 00c12983 lw s3,12(sp)
1024c: 00812a03 lw s4,8(sp)
10250: 02010113 addi sp,sp,32
10254: 00008067 ret
00010258 <main>:
10258: ff010113 addi sp,sp,-16
1025c: 00112623 sw ra,12(sp)
10260: 00812423 sw s0,8(sp)
10264: 00912223 sw s1,4(sp)
10268: 1c81a483 lw s1,456(gp) # 21f58 <__SDATA_BEGIN__+0x28>
1026c: 1c41a603 lw a2,452(gp) # 21f54 <__SDATA_BEGIN__+0x24>
10270: 00048593 mv a1,s1
10274: 1cc1a503 lw a0,460(gp) # 21f5c <__SDATA_BEGIN__+0x2c>
10278: f11ff0ef jal ra,10188 <binary_search>
1027c: 377000ef jal ra,10df2 <__extendsfdf2>
10280: 00050613 mv a2,a0
10284: 00058693 mv a3,a1
10288: 0001f437 lui s0,0x1f
1028c: 69040513 addi a0,s0,1680 # 1f690 <_exit+0xe>
10290: 5b5000ef jal ra,11044 <printf>
10294: 1d01a603 lw a2,464(gp) # 21f60 <__SDATA_BEGIN__+0x30>
10298: 1d41a583 lw a1,468(gp) # 21f64 <__SDATA_BEGIN__+0x34>
1029c: 00048513 mv a0,s1
102a0: ee9ff0ef jal ra,10188 <binary_search>
102a4: 34f000ef jal ra,10df2 <__extendsfdf2>
102a8: 00050613 mv a2,a0
102ac: 00058693 mv a3,a1
102b0: 69040513 addi a0,s0,1680
102b4: 591000ef jal ra,11044 <printf>
102b8: 1d81a603 lw a2,472(gp) # 21f68 <__SDATA_BEGIN__+0x38>
102bc: 1dc1a583 lw a1,476(gp) # 21f6c <__SDATA_BEGIN__+0x3c>
102c0: 1e01a503 lw a0,480(gp) # 21f70 <__SDATA_BEGIN__+0x40>
102c4: ec5ff0ef jal ra,10188 <binary_search>
102c8: 32b000ef jal ra,10df2 <__extendsfdf2>
102cc: 00050613 mv a2,a0
102d0: 00058693 mv a3,a1
102d4: 69040513 addi a0,s0,1680
102d8: 56d000ef jal ra,11044 <printf>
102dc: 00000513 li a0,0
102e0: 00c12083 lw ra,12(sp)
102e4: 00812403 lw s0,8(sp)
102e8: 00412483 lw s1,4(sp)
102ec: 01010113 addi sp,sp,16
102f0: 00008067 ret
```
:::
:::spoiler **readelf**
```
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x100a6
Start of program headers: 52 (bytes into file)
Start of section headers: 87244 (bytes into file)
Flags: 0x1, RVC, soft-float ABI
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 2
Size of section headers: 40 (bytes)
Number of section headers: 17
Section header string table index: 16
```
:::
:::spoiler {state="open"}**size**
```
text data bss dec hex filename
66820 2556 92 69468 10f5c hw2_o1.elf
```
:::
#### O2 optimization
:::spoiler **objdump**
```
000101d0 <fp32_to_bf16>:
101d0: ff010113 addi sp,sp,-16
101d4: 00812423 sw s0,8(sp)
101d8: 00112623 sw ra,12(sp)
101dc: 00151793 slli a5,a0,0x1
101e0: 00050413 mv s0,a0
101e4: 02078863 beqz a5,10214 <fp32_to_bf16+0x44>
101e8: 7f8007b7 lui a5,0x7f800
101ec: 00a7f733 and a4,a5,a0
101f0: 02f70263 beq a4,a5,10214 <fp32_to_bf16+0x44>
101f4: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18>
101f8: ff800537 lui a0,0xff800
101fc: 00857533 and a0,a0,s0
10200: 5ec000ef jal ra,107ec <__mulsf3>
10204: 00040593 mv a1,s0
10208: 178000ef jal ra,10380 <__addsf3>
1020c: ffff0437 lui s0,0xffff0
10210: 00a47433 and s0,s0,a0
10214: 00c12083 lw ra,12(sp)
10218: 00040513 mv a0,s0
1021c: 00812403 lw s0,8(sp)
10220: 01010113 addi sp,sp,16
10224: 00008067 ret
00010228 <binary_search>:
10228: fe010113 addi sp,sp,-32
1022c: 00912a23 sw s1,20(sp)
10230: 01212823 sw s2,16(sp)
10234: 01312623 sw s3,12(sp)
10238: 00112e23 sw ra,28(sp)
1023c: 00812c23 sw s0,24(sp)
10240: 01412423 sw s4,8(sp)
10244: 00151793 slli a5,a0,0x1
10248: 00050493 mv s1,a0
1024c: 00058913 mv s2,a1
10250: 00060993 mv s3,a2
10254: 02078863 beqz a5,10284 <binary_search+0x5c>
10258: 7f8007b7 lui a5,0x7f800
1025c: 00a7f6b3 and a3,a5,a0
10260: 02f68263 beq a3,a5,10284 <binary_search+0x5c>
10264: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18>
10268: ff800537 lui a0,0xff800
1026c: 00a4f533 and a0,s1,a0
10270: 57c000ef jal ra,107ec <__mulsf3>
10274: 00048593 mv a1,s1
10278: 108000ef jal ra,10380 <__addsf3>
1027c: ffff04b7 lui s1,0xffff0
10280: 00a4f4b3 and s1,s1,a0
10284: 00191793 slli a5,s2,0x1
10288: 02078863 beqz a5,102b8 <binary_search+0x90>
1028c: 7f8007b7 lui a5,0x7f800
10290: 0127f6b3 and a3,a5,s2
10294: 02f68263 beq a3,a5,102b8 <binary_search+0x90>
10298: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18>
1029c: ff800537 lui a0,0xff800
102a0: 00a97533 and a0,s2,a0
102a4: 548000ef jal ra,107ec <__mulsf3>
102a8: 00090593 mv a1,s2
102ac: 0d4000ef jal ra,10380 <__addsf3>
102b0: ffff0937 lui s2,0xffff0
102b4: 00a97933 and s2,s2,a0
102b8: 00199793 slli a5,s3,0x1
102bc: 02078863 beqz a5,102ec <binary_search+0xc4>
102c0: 7f8007b7 lui a5,0x7f800
102c4: 0137f6b3 and a3,a5,s3
102c8: 02f68263 beq a3,a5,102ec <binary_search+0xc4>
102cc: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18>
102d0: ff800537 lui a0,0xff800
102d4: 00a9f533 and a0,s3,a0
102d8: 514000ef jal ra,107ec <__mulsf3>
102dc: 00098593 mv a1,s3
102e0: 0a0000ef jal ra,10380 <__addsf3>
102e4: ffff09b7 lui s3,0xffff0
102e8: 00a9f9b3 and s3,s3,a0
102ec: 1c01aa03 lw s4,448(gp) # 21f50 <__SDATA_BEGIN__+0x20>
102f0: 0440006f j 10334 <binary_search+0x10c>
102f4: 780000ef jal ra,10a74 <__subsf3>
102f8: 000a0593 mv a1,s4
102fc: 4f0000ef jal ra,107ec <__mulsf3>
10300: 00048593 mv a1,s1
10304: 07c000ef jal ra,10380 <__addsf3>
10308: 00098593 mv a1,s3
1030c: 00050413 mv s0,a0
10310: 406000ef jal ra,10716 <__eqsf2>
10314: 00050793 mv a5,a0
10318: 00098593 mv a1,s3
1031c: 00040513 mv a0,s0
10320: 00040713 mv a4,s0
10324: 02078863 beqz a5,10354 <binary_search+0x12c>
10328: 442000ef jal ra,1076a <__lesf2>
1032c: 04055663 bgez a0,10378 <binary_search+0x150>
10330: 00040493 mv s1,s0
10334: 00090593 mv a1,s2
10338: 00048513 mv a0,s1
1033c: 42e000ef jal ra,1076a <__lesf2>
10340: 00050793 mv a5,a0
10344: 00048593 mv a1,s1
10348: 00090513 mv a0,s2
1034c: faf054e3 blez a5,102f4 <binary_search+0xcc>
10350: 1bc1a703 lw a4,444(gp) # 21f4c <__SDATA_BEGIN__+0x1c>
10354: 01c12083 lw ra,28(sp)
10358: 01812403 lw s0,24(sp)
1035c: 01412483 lw s1,20(sp)
10360: 01012903 lw s2,16(sp)
10364: 00c12983 lw s3,12(sp)
10368: 00812a03 lw s4,8(sp)
1036c: 00070513 mv a0,a4
10370: 02010113 addi sp,sp,32
10374: 00008067 ret
10378: 00040913 mv s2,s0
1037c: fb9ff06f j 10334 <binary_search+0x10c>
00010094 <main>:
10094: ff010113 addi sp,sp,-16
10098: 00912223 sw s1,4(sp)
1009c: 1c81a483 lw s1,456(gp) # 21f58 <__SDATA_BEGIN__+0x28>
100a0: 1c41a603 lw a2,452(gp) # 21f54 <__SDATA_BEGIN__+0x24>
100a4: 1cc1a503 lw a0,460(gp) # 21f5c <__SDATA_BEGIN__+0x2c>
100a8: 00048593 mv a1,s1
100ac: 00112623 sw ra,12(sp)
100b0: 00812423 sw s0,8(sp)
100b4: 174000ef jal ra,10228 <binary_search>
100b8: 545000ef jal ra,10dfc <__extendsfdf2>
100bc: 0001f437 lui s0,0x1f
100c0: 00058693 mv a3,a1
100c4: 00050613 mv a2,a0
100c8: 69040513 addi a0,s0,1680 # 1f690 <_exit+0x4>
100cc: 783000ef jal ra,1104e <printf>
100d0: 1d01a603 lw a2,464(gp) # 21f60 <__SDATA_BEGIN__+0x30>
100d4: 1d41a583 lw a1,468(gp) # 21f64 <__SDATA_BEGIN__+0x34>
100d8: 00048513 mv a0,s1
100dc: 14c000ef jal ra,10228 <binary_search>
100e0: 51d000ef jal ra,10dfc <__extendsfdf2>
100e4: 00058693 mv a3,a1
100e8: 00050613 mv a2,a0
100ec: 69040513 addi a0,s0,1680
100f0: 75f000ef jal ra,1104e <printf>
100f4: 1d81a603 lw a2,472(gp) # 21f68 <__SDATA_BEGIN__+0x38>
100f8: 1dc1a583 lw a1,476(gp) # 21f6c <__SDATA_BEGIN__+0x3c>
100fc: 1e01a503 lw a0,480(gp) # 21f70 <__SDATA_BEGIN__+0x40>
10100: 128000ef jal ra,10228 <binary_search>
10104: 4f9000ef jal ra,10dfc <__extendsfdf2>
10108: 00050613 mv a2,a0
1010c: 00058693 mv a3,a1
10110: 69040513 addi a0,s0,1680
10114: 73b000ef jal ra,1104e <printf>
10118: 00c12083 lw ra,12(sp)
1011c: 00812403 lw s0,8(sp)
10120: 00412483 lw s1,4(sp)
10124: 00000513 li a0,0
10128: 01010113 addi sp,sp,16
1012c: 00008067 ret
```
:::
:::spoiler **readelf**
```
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x10144
Start of program headers: 52 (bytes into file)
Start of section headers: 87140 (bytes into file)
Flags: 0x1, RVC, soft-float ABI
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 2
Size of section headers: 40 (bytes)
Number of section headers: 17
Section header string table index: 16
```
:::
:::spoiler {state="open"}**size**
```
text data bss dec hex filename
66830 2556 92 69478 10f66 hw2_o2.elf
```
:::
#### O3 optimization
:::spoiler **objdump**
```
000101d0 <fp32_to_bf16>:
101d0: ff010113 addi sp,sp,-16
101d4: 00812423 sw s0,8(sp)
101d8: 00112623 sw ra,12(sp)
101dc: 00151793 slli a5,a0,0x1
101e0: 00050413 mv s0,a0
101e4: 02078863 beqz a5,10214 <fp32_to_bf16+0x44>
101e8: 7f8007b7 lui a5,0x7f800
101ec: 00a7f733 and a4,a5,a0
101f0: 02f70263 beq a4,a5,10214 <fp32_to_bf16+0x44>
101f4: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18>
101f8: ff800537 lui a0,0xff800
101fc: 00857533 and a0,a0,s0
10200: 5ec000ef jal ra,107ec <__mulsf3>
10204: 00040593 mv a1,s0
10208: 178000ef jal ra,10380 <__addsf3>
1020c: ffff0437 lui s0,0xffff0
10210: 00a47433 and s0,s0,a0
10214: 00c12083 lw ra,12(sp)
10218: 00040513 mv a0,s0
1021c: 00812403 lw s0,8(sp)
10220: 01010113 addi sp,sp,16
10224: 00008067 ret
00010228 <binary_search>:
10228: fe010113 addi sp,sp,-32
1022c: 00912a23 sw s1,20(sp)
10230: 01212823 sw s2,16(sp)
10234: 01312623 sw s3,12(sp)
10238: 00112e23 sw ra,28(sp)
1023c: 00812c23 sw s0,24(sp)
10240: 01412423 sw s4,8(sp)
10244: 00151793 slli a5,a0,0x1
10248: 00050493 mv s1,a0
1024c: 00058913 mv s2,a1
10250: 00060993 mv s3,a2
10254: 02078863 beqz a5,10284 <binary_search+0x5c>
10258: 7f8007b7 lui a5,0x7f800
1025c: 00a7f6b3 and a3,a5,a0
10260: 02f68263 beq a3,a5,10284 <binary_search+0x5c>
10264: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18>
10268: ff800537 lui a0,0xff800
1026c: 00a4f533 and a0,s1,a0
10270: 57c000ef jal ra,107ec <__mulsf3>
10274: 00048593 mv a1,s1
10278: 108000ef jal ra,10380 <__addsf3>
1027c: ffff04b7 lui s1,0xffff0
10280: 00a4f4b3 and s1,s1,a0
10284: 00191793 slli a5,s2,0x1
10288: 02078863 beqz a5,102b8 <binary_search+0x90>
1028c: 7f8007b7 lui a5,0x7f800
10290: 0127f6b3 and a3,a5,s2
10294: 02f68263 beq a3,a5,102b8 <binary_search+0x90>
10298: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18>
1029c: ff800537 lui a0,0xff800
102a0: 00a97533 and a0,s2,a0
102a4: 548000ef jal ra,107ec <__mulsf3>
102a8: 00090593 mv a1,s2
102ac: 0d4000ef jal ra,10380 <__addsf3>
102b0: ffff0937 lui s2,0xffff0
102b4: 00a97933 and s2,s2,a0
102b8: 00199793 slli a5,s3,0x1
102bc: 02078863 beqz a5,102ec <binary_search+0xc4>
102c0: 7f8007b7 lui a5,0x7f800
102c4: 0137f6b3 and a3,a5,s3
102c8: 02f68263 beq a3,a5,102ec <binary_search+0xc4>
102cc: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18>
102d0: ff800537 lui a0,0xff800
102d4: 00a9f533 and a0,s3,a0
102d8: 514000ef jal ra,107ec <__mulsf3>
102dc: 00098593 mv a1,s3
102e0: 0a0000ef jal ra,10380 <__addsf3>
102e4: ffff09b7 lui s3,0xffff0
102e8: 00a9f9b3 and s3,s3,a0
102ec: 1c01aa03 lw s4,448(gp) # 21f50 <__SDATA_BEGIN__+0x20>
102f0: 0440006f j 10334 <binary_search+0x10c>
102f4: 780000ef jal ra,10a74 <__subsf3>
102f8: 000a0593 mv a1,s4
102fc: 4f0000ef jal ra,107ec <__mulsf3>
10300: 00048593 mv a1,s1
10304: 07c000ef jal ra,10380 <__addsf3>
10308: 00098593 mv a1,s3
1030c: 00050413 mv s0,a0
10310: 406000ef jal ra,10716 <__eqsf2>
10314: 00050793 mv a5,a0
10318: 00098593 mv a1,s3
1031c: 00040513 mv a0,s0
10320: 00040713 mv a4,s0
10324: 02078863 beqz a5,10354 <binary_search+0x12c>
10328: 442000ef jal ra,1076a <__lesf2>
1032c: 04055663 bgez a0,10378 <binary_search+0x150>
10330: 00040493 mv s1,s0
10334: 00090593 mv a1,s2
10338: 00048513 mv a0,s1
1033c: 42e000ef jal ra,1076a <__lesf2>
10340: 00050793 mv a5,a0
10344: 00048593 mv a1,s1
10348: 00090513 mv a0,s2
1034c: faf054e3 blez a5,102f4 <binary_search+0xcc>
10350: 1bc1a703 lw a4,444(gp) # 21f4c <__SDATA_BEGIN__+0x1c>
10354: 01c12083 lw ra,28(sp)
10358: 01812403 lw s0,24(sp)
1035c: 01412483 lw s1,20(sp)
10360: 01012903 lw s2,16(sp)
10364: 00c12983 lw s3,12(sp)
10368: 00812a03 lw s4,8(sp)
1036c: 00070513 mv a0,a4
10370: 02010113 addi sp,sp,32
10374: 00008067 ret
10378: 00040913 mv s2,s0
1037c: fb9ff06f j 10334 <binary_search+0x10c>
00010094 <main>:
10094: ff010113 addi sp,sp,-16
10098: 00912223 sw s1,4(sp)
1009c: 1c81a483 lw s1,456(gp) # 21f58 <__SDATA_BEGIN__+0x28>
100a0: 1c41a603 lw a2,452(gp) # 21f54 <__SDATA_BEGIN__+0x24>
100a4: 1cc1a503 lw a0,460(gp) # 21f5c <__SDATA_BEGIN__+0x2c>
100a8: 00048593 mv a1,s1
100ac: 00112623 sw ra,12(sp)
100b0: 00812423 sw s0,8(sp)
100b4: 174000ef jal ra,10228 <binary_search>
100b8: 545000ef jal ra,10dfc <__extendsfdf2>
100bc: 0001f437 lui s0,0x1f
100c0: 00058693 mv a3,a1
100c4: 00050613 mv a2,a0
100c8: 69040513 addi a0,s0,1680 # 1f690 <_exit+0x4>
100cc: 783000ef jal ra,1104e <printf>
100d0: 1d01a603 lw a2,464(gp) # 21f60 <__SDATA_BEGIN__+0x30>
100d4: 1d41a583 lw a1,468(gp) # 21f64 <__SDATA_BEGIN__+0x34>
100d8: 00048513 mv a0,s1
100dc: 14c000ef jal ra,10228 <binary_search>
100e0: 51d000ef jal ra,10dfc <__extendsfdf2>
100e4: 00058693 mv a3,a1
100e8: 00050613 mv a2,a0
100ec: 69040513 addi a0,s0,1680
100f0: 75f000ef jal ra,1104e <printf>
100f4: 1d81a603 lw a2,472(gp) # 21f68 <__SDATA_BEGIN__+0x38>
100f8: 1dc1a583 lw a1,476(gp) # 21f6c <__SDATA_BEGIN__+0x3c>
100fc: 1e01a503 lw a0,480(gp) # 21f70 <__SDATA_BEGIN__+0x40>
10100: 128000ef jal ra,10228 <binary_search>
10104: 4f9000ef jal ra,10dfc <__extendsfdf2>
10108: 00050613 mv a2,a0
1010c: 00058693 mv a3,a1
10110: 69040513 addi a0,s0,1680
10114: 73b000ef jal ra,1104e <printf>
10118: 00c12083 lw ra,12(sp)
1011c: 00812403 lw s0,8(sp)
10120: 00412483 lw s1,4(sp)
10124: 00000513 li a0,0
10128: 01010113 addi sp,sp,16
1012c: 00008067 ret
```
:::
:::spoiler **readelf**
```
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x10144
Start of program headers: 52 (bytes into file)
Start of section headers: 87140 (bytes into file)
Flags: 0x1, RVC, soft-float ABI
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 2
Size of section headers: 40 (bytes)
Number of section headers: 17
Section header string table index: 16
```
:::
:::spoiler {state="open"}**size**
```
text data bss dec hex filename
66830 2556 92 69478 10f66 hw2_o3.elf
```
:::
#### Os optimization
:::spoiler **objdump**
```
000101d0 <fp32_to_bf16>:
101d0: ff010113 addi sp,sp,-16
101d4: 7f8007b7 lui a5,0x7f800
101d8: 00812423 sw s0,8(sp)
101dc: 00112623 sw ra,12(sp)
101e0: 00a7f733 and a4,a5,a0
101e4: 00050413 mv s0,a0
101e8: 02071063 bnez a4,10208 <fp32_to_bf16+0x38>
101ec: 00951793 slli a5,a0,0x9
101f0: 00079e63 bnez a5,1020c <fp32_to_bf16+0x3c>
101f4: 00c12083 lw ra,12(sp)
101f8: 00040513 mv a0,s0
101fc: 00812403 lw s0,8(sp)
10200: 01010113 addi sp,sp,16
10204: 00008067 ret
10208: fef706e3 beq a4,a5,101f4 <fp32_to_bf16+0x24>
1020c: 1b81a583 lw a1,440(gp) # 21f58 <__SDATA_BEGIN__+0x18>
10210: ff8007b7 lui a5,0xff800
10214: 00f57533 and a0,a0,a5
10218: 5e2000ef jal ra,107fa <__mulsf3>
1021c: 00040593 mv a1,s0
10220: 0ec000ef jal ra,1030c <__addsf3>
10224: ffff0437 lui s0,0xffff0
10228: 00a47433 and s0,s0,a0
1022c: fc9ff06f j 101f4 <fp32_to_bf16+0x24>
00010230 <binary_search>:
10230: fe010113 addi sp,sp,-32
10234: 00112e23 sw ra,28(sp)
10238: 00812c23 sw s0,24(sp)
1023c: 00912a23 sw s1,20(sp)
10240: 00060413 mv s0,a2
10244: 01212823 sw s2,16(sp)
10248: 01312623 sw s3,12(sp)
1024c: 00058913 mv s2,a1
10250: 01512223 sw s5,4(sp)
10254: 01412423 sw s4,8(sp)
10258: f79ff0ef jal ra,101d0 <fp32_to_bf16>
1025c: 00050493 mv s1,a0
10260: 00090513 mv a0,s2
10264: f6dff0ef jal ra,101d0 <fp32_to_bf16>
10268: 00050913 mv s2,a0
1026c: 00040513 mv a0,s0
10270: f61ff0ef jal ra,101d0 <fp32_to_bf16>
10274: 1c01aa83 lw s5,448(gp) # 21f60 <__SDATA_BEGIN__+0x20>
10278: 00050993 mv s3,a0
1027c: 00090593 mv a1,s2
10280: 00048513 mv a0,s1
10284: 4f4000ef jal ra,10778 <__lesf2>
10288: 02a05863 blez a0,102b8 <binary_search+0x88>
1028c: 1bc1aa03 lw s4,444(gp) # 21f5c <__SDATA_BEGIN__+0x1c>
10290: 01c12083 lw ra,28(sp)
10294: 01812403 lw s0,24(sp)
10298: 01412483 lw s1,20(sp)
1029c: 01012903 lw s2,16(sp)
102a0: 00c12983 lw s3,12(sp)
102a4: 00412a83 lw s5,4(sp)
102a8: 000a0513 mv a0,s4
102ac: 00812a03 lw s4,8(sp)
102b0: 02010113 addi sp,sp,32
102b4: 00008067 ret
102b8: 00048593 mv a1,s1
102bc: 00090513 mv a0,s2
102c0: 7c2000ef jal ra,10a82 <__subsf3>
102c4: 000a8593 mv a1,s5
102c8: 532000ef jal ra,107fa <__mulsf3>
102cc: 00048593 mv a1,s1
102d0: 03c000ef jal ra,1030c <__addsf3>
102d4: 00050413 mv s0,a0
102d8: 00050a13 mv s4,a0
102dc: 00050593 mv a1,a0
102e0: 00098513 mv a0,s3
102e4: 3be000ef jal ra,106a2 <__eqsf2>
102e8: fa0504e3 beqz a0,10290 <binary_search+0x60>
102ec: 00040593 mv a1,s0
102f0: 00098513 mv a0,s3
102f4: 402000ef jal ra,106f6 <__gesf2>
102f8: 00a04663 bgtz a0,10304 <binary_search+0xd4>
102fc: 00040913 mv s2,s0
10300: f7dff06f j 1027c <binary_search+0x4c>
10304: 00040493 mv s1,s0
10308: f75ff06f j 1027c <binary_search+0x4c>
00010094 <main>:
10094: ff010113 addi sp,sp,-16
10098: 00912223 sw s1,4(sp)
1009c: 1c81a483 lw s1,456(gp) # 21f68 <__SDATA_BEGIN__+0x28>
100a0: 1c41a603 lw a2,452(gp) # 21f64 <__SDATA_BEGIN__+0x24>
100a4: 1cc1a503 lw a0,460(gp) # 21f6c <__SDATA_BEGIN__+0x2c>
100a8: 00048593 mv a1,s1
100ac: 00112623 sw ra,12(sp)
100b0: 00812423 sw s0,8(sp)
100b4: 17c000ef jal ra,10230 <binary_search>
100b8: 553000ef jal ra,10e0a <__extendsfdf2>
100bc: 0001f437 lui s0,0x1f
100c0: 00058693 mv a3,a1
100c4: 00050613 mv a2,a0
100c8: 6a040513 addi a0,s0,1696 # 1f6a0 <_exit+0x6>
100cc: 791000ef jal ra,1105c <printf>
100d0: 1d01a603 lw a2,464(gp) # 21f70 <__SDATA_BEGIN__+0x30>
100d4: 1d41a583 lw a1,468(gp) # 21f74 <__SDATA_BEGIN__+0x34>
100d8: 00048513 mv a0,s1
100dc: 154000ef jal ra,10230 <binary_search>
100e0: 52b000ef jal ra,10e0a <__extendsfdf2>
100e4: 00058693 mv a3,a1
100e8: 00050613 mv a2,a0
100ec: 6a040513 addi a0,s0,1696
100f0: 76d000ef jal ra,1105c <printf>
100f4: 1d81a603 lw a2,472(gp) # 21f78 <__SDATA_BEGIN__+0x38>
100f8: 1dc1a583 lw a1,476(gp) # 21f7c <__SDATA_BEGIN__+0x3c>
100fc: 1e01a503 lw a0,480(gp) # 21f80 <__SDATA_BEGIN__+0x40>
10100: 130000ef jal ra,10230 <binary_search>
10104: 507000ef jal ra,10e0a <__extendsfdf2>
10108: 00050613 mv a2,a0
1010c: 00058693 mv a3,a1
10110: 6a040513 addi a0,s0,1696
10114: 749000ef jal ra,1105c <printf>
10118: 00c12083 lw ra,12(sp)
1011c: 00812403 lw s0,8(sp)
10120: 00412483 lw s1,4(sp)
10124: 00000513 li a0,0
10128: 01010113 addi sp,sp,16
1012c: 00008067 ret
```
:::
:::spoiler **readelf**
```
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x10144
Start of program headers: 52 (bytes into file)
Start of section headers: 87260 (bytes into file)
Flags: 0x1, RVC, soft-float ABI
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 2
Size of section headers: 40 (bytes)
Number of section headers: 17
Section header string table index: 16
```
:::
:::spoiler {state="open"}**size**
```
text data bss dec hex filename
66844 2556 92 69492 10f74 hw2_os.elf
```
:::
#### Ofast optimization
:::spoiler **objdump**
```
000101d0 <fp32_to_bf16>:
101d0: ff010113 addi sp,sp,-16
101d4: 00812423 sw s0,8(sp)
101d8: 00112623 sw ra,12(sp)
101dc: 00151793 slli a5,a0,0x1
101e0: 00050413 mv s0,a0
101e4: 02078863 beqz a5,10214 <fp32_to_bf16+0x44>
101e8: 7f8007b7 lui a5,0x7f800
101ec: 00a7f733 and a4,a5,a0
101f0: 02f70263 beq a4,a5,10214 <fp32_to_bf16+0x44>
101f4: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18>
101f8: ff800537 lui a0,0xff800
101fc: 00857533 and a0,a0,s0
10200: 5e8000ef jal ra,107e8 <__mulsf3>
10204: 00040593 mv a1,s0
10208: 174000ef jal ra,1037c <__addsf3>
1020c: ffff0437 lui s0,0xffff0
10210: 00a47433 and s0,s0,a0
10214: 00c12083 lw ra,12(sp)
10218: 00040513 mv a0,s0
1021c: 00812403 lw s0,8(sp)
10220: 01010113 addi sp,sp,16
10224: 00008067 ret
00010228 <binary_search>:
10228: fe010113 addi sp,sp,-32
1022c: 00912a23 sw s1,20(sp)
10230: 01212823 sw s2,16(sp)
10234: 01412423 sw s4,8(sp)
10238: 00112e23 sw ra,28(sp)
1023c: 00812c23 sw s0,24(sp)
10240: 01312623 sw s3,12(sp)
10244: 00151793 slli a5,a0,0x1
10248: 00050493 mv s1,a0
1024c: 00058a13 mv s4,a1
10250: 00060913 mv s2,a2
10254: 02078863 beqz a5,10284 <binary_search+0x5c>
10258: 7f8007b7 lui a5,0x7f800
1025c: 00a7f6b3 and a3,a5,a0
10260: 02f68263 beq a3,a5,10284 <binary_search+0x5c>
10264: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18>
10268: ff800537 lui a0,0xff800
1026c: 00a4f533 and a0,s1,a0
10270: 578000ef jal ra,107e8 <__mulsf3>
10274: 00048593 mv a1,s1
10278: 104000ef jal ra,1037c <__addsf3>
1027c: ffff04b7 lui s1,0xffff0
10280: 00a4f4b3 and s1,s1,a0
10284: 001a1793 slli a5,s4,0x1
10288: 02078863 beqz a5,102b8 <binary_search+0x90>
1028c: 7f8007b7 lui a5,0x7f800
10290: 0147f6b3 and a3,a5,s4
10294: 02f68263 beq a3,a5,102b8 <binary_search+0x90>
10298: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18>
1029c: ff800537 lui a0,0xff800
102a0: 00aa7533 and a0,s4,a0
102a4: 544000ef jal ra,107e8 <__mulsf3>
102a8: 000a0593 mv a1,s4
102ac: 0d0000ef jal ra,1037c <__addsf3>
102b0: ffff0a37 lui s4,0xffff0
102b4: 00aa7a33 and s4,s4,a0
102b8: 00191793 slli a5,s2,0x1
102bc: 02078863 beqz a5,102ec <binary_search+0xc4>
102c0: 7f8007b7 lui a5,0x7f800
102c4: 0127f733 and a4,a5,s2
102c8: 02f70263 beq a4,a5,102ec <binary_search+0xc4>
102cc: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18>
102d0: ff800537 lui a0,0xff800
102d4: 01257533 and a0,a0,s2
102d8: 510000ef jal ra,107e8 <__mulsf3>
102dc: 00090593 mv a1,s2
102e0: 09c000ef jal ra,1037c <__addsf3>
102e4: ffff0937 lui s2,0xffff0
102e8: 00a97933 and s2,s2,a0
102ec: 1c01a983 lw s3,448(gp) # 21f50 <__SDATA_BEGIN__+0x20>
102f0: 0400006f j 10330 <binary_search+0x108>
102f4: 77c000ef jal ra,10a70 <__subsf3>
102f8: 00098593 mv a1,s3
102fc: 4ec000ef jal ra,107e8 <__mulsf3>
10300: 00048593 mv a1,s1
10304: 078000ef jal ra,1037c <__addsf3>
10308: 00090593 mv a1,s2
1030c: 00050413 mv s0,a0
10310: 402000ef jal ra,10712 <__eqsf2>
10314: 00050793 mv a5,a0
10318: 00090593 mv a1,s2
1031c: 00040513 mv a0,s0
10320: 02078863 beqz a5,10350 <binary_search+0x128>
10324: 442000ef jal ra,10766 <__lesf2>
10328: 04055663 bgez a0,10374 <binary_search+0x14c>
1032c: 00040493 mv s1,s0
10330: 000a0593 mv a1,s4
10334: 00048513 mv a0,s1
10338: 42e000ef jal ra,10766 <__lesf2>
1033c: 00050793 mv a5,a0
10340: 00048593 mv a1,s1
10344: 000a0513 mv a0,s4
10348: faf056e3 blez a5,102f4 <binary_search+0xcc>
1034c: 1bc1a903 lw s2,444(gp) # 21f4c <__SDATA_BEGIN__+0x1c>
10350: 01c12083 lw ra,28(sp)
10354: 01812403 lw s0,24(sp)
10358: 01412483 lw s1,20(sp)
1035c: 00c12983 lw s3,12(sp)
10360: 00812a03 lw s4,8(sp)
10364: 00090513 mv a0,s2
10368: 01012903 lw s2,16(sp)
1036c: 02010113 addi sp,sp,32
10370: 00008067 ret
10374: 00040a13 mv s4,s0
10378: fb9ff06f j 10330 <binary_search+0x108>
00010094 <main>:
10094: ff010113 addi sp,sp,-16
10098: 00912223 sw s1,4(sp)
1009c: 1c81a483 lw s1,456(gp) # 21f58 <__SDATA_BEGIN__+0x28>
100a0: 1c41a603 lw a2,452(gp) # 21f54 <__SDATA_BEGIN__+0x24>
100a4: 1cc1a503 lw a0,460(gp) # 21f5c <__SDATA_BEGIN__+0x2c>
100a8: 00048593 mv a1,s1
100ac: 00112623 sw ra,12(sp)
100b0: 00812423 sw s0,8(sp)
100b4: 174000ef jal ra,10228 <binary_search>
100b8: 541000ef jal ra,10df8 <__extendsfdf2>
100bc: 0001f437 lui s0,0x1f
100c0: 00058693 mv a3,a1
100c4: 00050613 mv a2,a0
100c8: 69040513 addi a0,s0,1680 # 1f690 <_exit+0x8>
100cc: 77f000ef jal ra,1104a <printf>
100d0: 1d01a603 lw a2,464(gp) # 21f60 <__SDATA_BEGIN__+0x30>
100d4: 1d41a583 lw a1,468(gp) # 21f64 <__SDATA_BEGIN__+0x34>
100d8: 00048513 mv a0,s1
100dc: 14c000ef jal ra,10228 <binary_search>
100e0: 519000ef jal ra,10df8 <__extendsfdf2>
100e4: 00058693 mv a3,a1
100e8: 00050613 mv a2,a0
100ec: 69040513 addi a0,s0,1680
100f0: 75b000ef jal ra,1104a <printf>
100f4: 1d81a603 lw a2,472(gp) # 21f68 <__SDATA_BEGIN__+0x38>
100f8: 1dc1a583 lw a1,476(gp) # 21f6c <__SDATA_BEGIN__+0x3c>
100fc: 1e01a503 lw a0,480(gp) # 21f70 <__SDATA_BEGIN__+0x40>
10100: 128000ef jal ra,10228 <binary_search>
10104: 4f5000ef jal ra,10df8 <__extendsfdf2>
10108: 00050613 mv a2,a0
1010c: 00058693 mv a3,a1
10110: 69040513 addi a0,s0,1680
10114: 737000ef jal ra,1104a <printf>
10118: 00c12083 lw ra,12(sp)
1011c: 00812403 lw s0,8(sp)
10120: 00412483 lw s1,4(sp)
10124: 00000513 li a0,0
10128: 01010113 addi sp,sp,16
1012c: 00008067 ret
```
:::
:::spoiler **readelf**
```
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x10144
Start of program headers: 52 (bytes into file)
Start of section headers: 87140 (bytes into file)
Flags: 0x1, RVC, soft-float ABI
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 2
Size of section headers: 40 (bytes)
Number of section headers: 17
Section header string table index: 16
```
:::
:::spoiler {state="open"}**size**
```
text data bss dec hex filename
66826 2556 92 69474 10f62 hw2_ofast.elf
```
:::
## Environment
Reference: [Lab2: RISC-V RV32I[MA] emulator with ELF support](https://hackmd.io/3tO0gfoqT7upOzVpFZlFBw?view)
### GNU Toolchain
Download GNU Toolchain
```shell
$ cd /tmp
$ wget https://github.com/xpack-dev-tools/riscv-none-elf-gcc-xpack/releases/download/v13.2.0-2/xpack-riscv-none-elf-gcc-13.2.0-2-linux-x64.tar.gz
$ tar zxvf xpack-riscv-none-elf-gcc-13.2.0-2-linux-x64.tar.gz
$ cp -af xpack-riscv-none-elf-gcc-13.2.0-2 $HOME/riscv-none-elf-gcc
```
**Configure $PATH**
``` code=
$ gedit ~/.bashrc
# add riscv-none-elf-gcc into PATH
export PATH="$PATH:/home/chen/riscv-none-elf-gcc/bin"
# add rv32emu into PATH
export PATH="$PATH:/home/chen/rv32emu/bin"
$ source ~/.bashrc
```
Check
```shell
$ riscv-none-elf-gcc -v
```
### rv32emu
```shell
# install required packages
$ sudo apt update && sudo apt install -y libsdl2-dev libsdl2-mixer-dev make
# for "make *** [Makefile:142: build/map.o] Error 127" error
$ sudo apt-get install build-essential
$ git clone https://github.com/sysprog21/rv32emu
$ cd rv32emu
$ make
$ make check
```
:::warning
You shall use RDCYCLE/RDCYCLEH instruction for the statistics of your program’s execution.
:notes: jserv
:::
## RDCYCLE
Take `getcycles.S` from the [rv32emu repo](https://github.com/sysprog21/rv32emu/blob/master/tests/perfcounter/getcycles.S) and add it to the current folder.
* getcycles.S
```c
/* get_cycles: read the cycle counter CSRs, leaving `cycle` in a0 and
   `cycleh` in a1. The C caller declares the result as uint64_t. */
.text
.globl get_cycles
.align 2
get_cycles:
/* Sample the high word first, then the low word. */
csrr a1, cycleh
csrr a0, cycle
/* Sample the high word again; if it changed between the two reads, the
   a1/a0 pair may be inconsistent, so start over. */
csrr a2, cycleh
bne a1, a2, get_cycles
ret
/* Record the symbol size (current location minus the label). */
.size get_cycles,.-get_cycles
```
Call `get_cycles()` in the C program to count the elapsed cycles.
* hw2_cycle.c
```diff!
#include <stdio.h>
+#include <stdint.h>
+extern uint64_t get_cycles();
/*
 * fp32_to_bf16 - reduce a float to bfloat16 precision, returned as a float.
 *
 * x: value to convert.
 *
 * Returns x unchanged when its exponent and mantissa bits are all zero, or
 * when its exponent bits are all ones. Otherwise returns x plus a rounding
 * term, with the low 16 bits of the result cleared.
 */
float fp32_to_bf16(float x)
{
float y = x;
// NOTE(review): accessing a float object through an int pointer breaks the
// strict-aliasing rule; memcpy or a union would be the portable form.
int *p = (int *) &y;
// Isolate the 8 exponent bits and the 23 mantissa bits (sign bit excluded).
unsigned int exp = *p & 0x7F800000;
unsigned int man = *p & 0x007FFFFF;
// Exponent and mantissa both zero: nothing to round.
if (exp == 0 && man == 0)
return x;
// Exponent all ones: pass the value through untouched.
if (exp == 0x7F800000)
return x;
float r = x;
int *pr = (int *) &r;
// Keep only sign and exponent, so r has x's sign and exponent with a zero mantissa.
*pr &= 0xFF800000;
// Dividing by 256 makes r half the weight of the lowest mantissa bit that
// survives the 0xFFFF0000 mask below (bit 16).
r /= 0x100;
// Add the half-step so the truncation below rounds instead of chopping.
y = x + r;
// p aliases y: clear the low 16 bits, leaving sign, exponent and 7 mantissa bits.
*p &= 0xFFFF0000;
return y;
}
/*
 * binary_search - bisect [low, high] until the midpoint equals target.
 *
 * low, high: bounds of the search interval.
 * target:    value to look for.
 *
 * All three arguments are first passed through fp32_to_bf16(). Returns the
 * midpoint that compares equal to the converted target, or -1 when the loop
 * exits without a match.
 */
float binary_search(float low, float high, float target) {
low = fp32_to_bf16(low);
high = fp32_to_bf16(high);
target = fp32_to_bf16(target);
// NOTE(review): the bounds are set to mid itself rather than moved past it,
// so the interval never becomes empty; if mid never compares equal to target
// this loop looks non-terminating. Confirm for inputs other than the test cases.
while (low <= high) {
// Midpoint computed as low plus half the width of the interval.
float mid = low + (high - low) / 2;
// Exact float equality: mid itself is not converted with fp32_to_bf16().
if (mid == target) {
return mid;
}
if (mid < target) {
low = mid;
}
else {
high = mid;
}
}
return -1;
}
/*
 * main - run three binary_search() test cases, print each result, then print
 * the number of cycles elapsed between the two get_cycles() samples.
 * Always returns 0.
 */
int main(){
// First sample is taken before the test data is set up, so the measured span
// covers the three searches and their printf calls.
+ uint64_t oldcount = get_cycles();
// Each test case is a target (x) with an upper bound (ub) and lower bound (lb).
float test_case1_x= 0.1;
float test_case1_ub = 10.0;
float test_case1_lb = 0.001;
float test_case2_x= 100.0;
float test_case2_ub = 256.0;
float test_case2_lb = 10.0;
float test_case3_x= 0.563;
float test_case3_ub = 1.0;
float test_case3_lb = 0.01;
// Arguments are passed in (low, high, target) order.
printf("%f\n",binary_search(test_case1_lb, test_case1_ub, test_case1_x));
printf("%f\n",binary_search(test_case2_lb, test_case2_ub, test_case2_x));
printf("%f\n",binary_search(test_case3_lb, test_case3_ub, test_case3_x));
+ uint64_t cyclecount = get_cycles() - oldcount;
// The cast to unsigned int drops the upper half of the 64-bit difference
// (the build commands use -mabi=ilp32).
// NOTE(review): "cyle" is presumably a typo for "cycle"; it matches the
// recorded program output, so it is left unchanged here.
+ printf("cyle count: %u\n", (unsigned int)cyclecount);
return 0;
}
```
* Assemble `getcycles.S` into an object file
Convert `getcycles.S` to `getcycles.o`
```shell
$ riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -Wall -c -o getcycles.o getcycles.S
```
* Compile `hw2_cycle.c` into an object file with O1 optimization
Convert `hw2_cycle.c` to `hw2_cycle_o1.o`
```shell
$ riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -O1 -Wall -c -o hw2_cycle_o1.o hw2_cycle.c
```
* Link `hw2_cycle_o1.o` and `getcycles.o` into `hw2_cycle_o1.elf`
```shell
$ riscv-none-elf-gcc -o hw2_cycle_o1.elf getcycles.o hw2_cycle_o1.o
```
* Use `rv32emu` to run the elf file
```shell
$ cd /rv32emu/tests/ca_hw2
$ ../../build/rv32emu hw2_cycle_o1.elf
```
### Result
* Result of O1 optimization
```
0.100098
100.000000
0.562500
cyle count: 43593
inferior exit code 0
```
* Cycle counts at different optimization levels
| | O0 | O1 | O2 | O3 | Os | Ofast |
| :---------: | :---: | :---: | :---: | :---: | :---: | :---: |
| cycle count | 44371 | 43593 | 43620 | 43620 | 43684 | 43538 |
:::warning
Show me the handwritten RISC-V assembly code.
:notes: jserv
:::