# Assignment2: GNU Toolchain contributed by <[`KuanYuan0530`](https://github.com/KuanYuan0530)> ## Program selection Topic: [`Approximating a bfloat number using binary search`](https://hackmd.io/@JY7VQTBOSN-L5160WDDdmQ/SJw8t_6ea) by [`coding-ray`](https://github.com/timothyliu0912/computer_architecture/tree/main/Assignment1) (劉庭聿) ### Original C code ```c #include <stdio.h> float fp32_to_bf16(float x) { float y = x; int *p = (int *) &y; unsigned int exp = *p & 0x7F800000; unsigned int man = *p & 0x007FFFFF; if (exp == 0 && man == 0) return x; if (exp == 0x7F800000) return x; float r = x; int *pr = (int *) &r; *pr &= 0xFF800000; r /= 0x100; y = x + r; *p &= 0xFFFF0000; return y; } float binary_search(float low, float high, float target) { low = fp32_to_bf16(low); high = fp32_to_bf16(high); target = fp32_to_bf16(target); while (low <= high) { float mid = low + (high - low) / 2; if (mid == target) { return mid; } if (mid < target) { low = mid; } else { high = mid; } } return -1; } int main(){ float test_case1_x= 0.1; float test_case1_ub = 10.0; float test_case1_lb = 0.001; float test_case2_x= 100.0; float test_case2_ub = 256.0; float test_case2_lb = 10.0; float test_case3_x= 0.563; float test_case3_ub = 1.0; float test_case3_lb = 0.01; printf("%f\n",binary_search(test_case1_lb, test_case1_ub, test_case1_x)); printf("%f\n",binary_search(test_case2_lb, test_case2_ub, test_case2_x)); printf("%f\n",binary_search(test_case3_lb, test_case3_ub, test_case3_x)); return 0; } ``` ## Implementation ### Using GNU Toolchain #### Optimization by compiler * gcc > `-O0~-O3` : Set the optimization level > `-Os` : optimize for size > `-Ofast` : optimize for speed ```shell $ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O1 Source2.c -o hw2_o1.elf ``` #### Disassemble * objdump > `-d` : Display the assembler mnemonics for the machine instructions ```shell $ riscv-none-elf-objdump -d hw2_o1.elf ``` #### Show ELF information * readelf > `-h` : Display the ELF file header ```shell $ 
riscv-none-elf-readelf -h hw2_o1.elf ``` #### Lists the section sizes * size ```shell $ riscv-none-elf-size hw2_o1.elf ``` ### Result #### O0 optimization :::spoiler {state="open"}**objdump** ``` 00010134 <fp32_to_bf16>: 10134: fc010113 addi sp,sp,-64 10138: 02112e23 sw ra,60(sp) 1013c: 02812c23 sw s0,56(sp) 10140: 04010413 addi s0,sp,64 10144: fca42623 sw a0,-52(s0) 10148: fcc42783 lw a5,-52(s0) 1014c: fcf42e23 sw a5,-36(s0) 10150: fdc40793 addi a5,s0,-36 10154: fef42623 sw a5,-20(s0) 10158: fec42783 lw a5,-20(s0) 1015c: 0007a783 lw a5,0(a5) 10160: 00078713 mv a4,a5 10164: 7f8007b7 lui a5,0x7f800 10168: 00f777b3 and a5,a4,a5 1016c: fef42423 sw a5,-24(s0) 10170: fec42783 lw a5,-20(s0) 10174: 0007a783 lw a5,0(a5) # 7f800000 <__BSS_END__+0x7f7de5c0> 10178: 00078713 mv a4,a5 1017c: 008007b7 lui a5,0x800 10180: fff78793 addi a5,a5,-1 # 7fffff <__BSS_END__+0x7de5bf> 10184: 00f777b3 and a5,a4,a5 10188: fef42223 sw a5,-28(s0) 1018c: fe842783 lw a5,-24(s0) 10190: 00079a63 bnez a5,101a4 <fp32_to_bf16+0x70> 10194: fe442783 lw a5,-28(s0) 10198: 00079663 bnez a5,101a4 <fp32_to_bf16+0x70> 1019c: fcc42783 lw a5,-52(s0) 101a0: 0900006f j 10230 <fp32_to_bf16+0xfc> 101a4: fe842703 lw a4,-24(s0) 101a8: 7f8007b7 lui a5,0x7f800 101ac: 00f71663 bne a4,a5,101b8 <fp32_to_bf16+0x84> 101b0: fcc42783 lw a5,-52(s0) 101b4: 07c0006f j 10230 <fp32_to_bf16+0xfc> 101b8: fcc42783 lw a5,-52(s0) 101bc: fcf42c23 sw a5,-40(s0) 101c0: fd840793 addi a5,s0,-40 101c4: fef42023 sw a5,-32(s0) 101c8: fe042783 lw a5,-32(s0) 101cc: 0007a703 lw a4,0(a5) # 7f800000 <__BSS_END__+0x7f7de5c0> 101d0: ff8007b7 lui a5,0xff800 101d4: 00f77733 and a4,a4,a5 101d8: fe042783 lw a5,-32(s0) 101dc: 00e7a023 sw a4,0(a5) # ff800000 <__BSS_END__+0xff7de5c0> 101e0: fd842703 lw a4,-40(s0) 101e4: 0001f7b7 lui a5,0x1f 101e8: 7a47a583 lw a1,1956(a5) # 1f7a4 <_exit+0xa> 101ec: 00070513 mv a0,a4 101f0: 61a000ef jal ra,1080a <__divsf3> 101f4: 00050793 mv a5,a0 101f8: fcf42c23 sw a5,-40(s0) 101fc: fd842783 lw a5,-40(s0) 10200: fcc42583 
lw a1,-52(s0) 10204: 00078513 mv a0,a5 10208: 26c000ef jal ra,10474 <__addsf3> 1020c: 00050793 mv a5,a0 10210: fcf42e23 sw a5,-36(s0) 10214: fec42783 lw a5,-20(s0) 10218: 0007a703 lw a4,0(a5) 1021c: ffff07b7 lui a5,0xffff0 10220: 00f77733 and a4,a4,a5 10224: fec42783 lw a5,-20(s0) 10228: 00e7a023 sw a4,0(a5) # ffff0000 <__BSS_END__+0xfffce5c0> 1022c: fdc42783 lw a5,-36(s0) 10230: 00078513 mv a0,a5 10234: 03c12083 lw ra,60(sp) 10238: 03812403 lw s0,56(sp) 1023c: 04010113 addi sp,sp,64 10240: 00008067 ret 00010244 <binary_search>: 10244: fd010113 addi sp,sp,-48 10248: 02112623 sw ra,44(sp) 1024c: 02812423 sw s0,40(sp) 10250: 03010413 addi s0,sp,48 10254: fca42e23 sw a0,-36(s0) 10258: fcb42c23 sw a1,-40(s0) 1025c: fcc42a23 sw a2,-44(s0) 10260: fdc42503 lw a0,-36(s0) 10264: ed1ff0ef jal ra,10134 <fp32_to_bf16> 10268: fca42e23 sw a0,-36(s0) 1026c: fd842503 lw a0,-40(s0) 10270: ec5ff0ef jal ra,10134 <fp32_to_bf16> 10274: fca42c23 sw a0,-40(s0) 10278: fd442503 lw a0,-44(s0) 1027c: eb9ff0ef jal ra,10134 <fp32_to_bf16> 10280: fca42a23 sw a0,-44(s0) 10284: 0840006f j 10308 <binary_search+0xc4> 10288: fdc42583 lw a1,-36(s0) 1028c: fd842503 lw a0,-40(s0) 10290: 0f5000ef jal ra,10b84 <__subsf3> 10294: 00050793 mv a5,a0 10298: 00078713 mv a4,a5 1029c: 0001f7b7 lui a5,0x1f 102a0: 7a87a583 lw a1,1960(a5) # 1f7a8 <_exit+0xe> 102a4: 00070513 mv a0,a4 102a8: 562000ef jal ra,1080a <__divsf3> 102ac: 00050793 mv a5,a0 102b0: 00078593 mv a1,a5 102b4: fdc42503 lw a0,-36(s0) 102b8: 1bc000ef jal ra,10474 <__addsf3> 102bc: 00050793 mv a5,a0 102c0: fef42623 sw a5,-20(s0) 102c4: fd442583 lw a1,-44(s0) 102c8: fec42503 lw a0,-20(s0) 102cc: 7e2000ef jal ra,10aae <__eqsf2> 102d0: 00050793 mv a5,a0 102d4: 00079663 bnez a5,102e0 <binary_search+0x9c> 102d8: fec42783 lw a5,-20(s0) 102dc: 0480006f j 10324 <binary_search+0xe0> 102e0: fd442583 lw a1,-44(s0) 102e4: fec42503 lw a0,-20(s0) 102e8: 01b000ef jal ra,10b02 <__lesf2> 102ec: 00050793 mv a5,a0 102f0: 0007d863 bgez a5,10300 <binary_search+0xbc> 
102f4: fec42783 lw a5,-20(s0) 102f8: fcf42e23 sw a5,-36(s0) 102fc: 00c0006f j 10308 <binary_search+0xc4> 10300: fec42783 lw a5,-20(s0) 10304: fcf42c23 sw a5,-40(s0) 10308: fd842583 lw a1,-40(s0) 1030c: fdc42503 lw a0,-36(s0) 10310: 7f2000ef jal ra,10b02 <__lesf2> 10314: 00050793 mv a5,a0 10318: f6f058e3 blez a5,10288 <binary_search+0x44> 1031c: 0001f7b7 lui a5,0x1f 10320: 7ac7a783 lw a5,1964(a5) # 1f7ac <_exit+0x12> 10324: 00078513 mv a0,a5 10328: 02c12083 lw ra,44(sp) 1032c: 02812403 lw s0,40(sp) 10330: 03010113 addi sp,sp,48 10334: 00008067 ret 00010338 <main>: 10338: fc010113 addi sp,sp,-64 1033c: 02112e23 sw ra,60(sp) 10340: 02812c23 sw s0,56(sp) 10344: 04010413 addi s0,sp,64 10348: 0001f7b7 lui a5,0x1f 1034c: 7b07a783 lw a5,1968(a5) # 1f7b0 <_exit+0x16> 10350: fef42623 sw a5,-20(s0) 10354: 0001f7b7 lui a5,0x1f 10358: 7b47a783 lw a5,1972(a5) # 1f7b4 <_exit+0x1a> 1035c: fef42423 sw a5,-24(s0) 10360: 0001f7b7 lui a5,0x1f 10364: 7b87a783 lw a5,1976(a5) # 1f7b8 <_exit+0x1e> 10368: fef42223 sw a5,-28(s0) 1036c: 0001f7b7 lui a5,0x1f 10370: 7bc7a783 lw a5,1980(a5) # 1f7bc <_exit+0x22> 10374: fef42023 sw a5,-32(s0) 10378: 0001f7b7 lui a5,0x1f 1037c: 7a47a783 lw a5,1956(a5) # 1f7a4 <_exit+0xa> 10380: fcf42e23 sw a5,-36(s0) 10384: 0001f7b7 lui a5,0x1f 10388: 7b47a783 lw a5,1972(a5) # 1f7b4 <_exit+0x1a> 1038c: fcf42c23 sw a5,-40(s0) 10390: 0001f7b7 lui a5,0x1f 10394: 7c07a783 lw a5,1984(a5) # 1f7c0 <_exit+0x26> 10398: fcf42a23 sw a5,-44(s0) 1039c: 0001f7b7 lui a5,0x1f 103a0: 7c47a783 lw a5,1988(a5) # 1f7c4 <_exit+0x2a> 103a4: fcf42823 sw a5,-48(s0) 103a8: 0001f7b7 lui a5,0x1f 103ac: 7c87a783 lw a5,1992(a5) # 1f7c8 <_exit+0x2e> 103b0: fcf42623 sw a5,-52(s0) 103b4: fec42603 lw a2,-20(s0) 103b8: fe842583 lw a1,-24(s0) 103bc: fe442503 lw a0,-28(s0) 103c0: e85ff0ef jal ra,10244 <binary_search> 103c4: 00050793 mv a5,a0 103c8: 00078513 mv a0,a5 103cc: 341000ef jal ra,10f0c <__extendsfdf2> 103d0: 00050713 mv a4,a0 103d4: 00058793 mv a5,a1 103d8: 00070613 mv a2,a4 103dc: 00078693 
mv a3,a5 103e0: 0001f7b7 lui a5,0x1f 103e4: 7a078513 addi a0,a5,1952 # 1f7a0 <_exit+0x6> 103e8: 577000ef jal ra,1115e <printf> 103ec: fe042603 lw a2,-32(s0) 103f0: fdc42583 lw a1,-36(s0) 103f4: fd842503 lw a0,-40(s0) 103f8: e4dff0ef jal ra,10244 <binary_search> 103fc: 00050793 mv a5,a0 10400: 00078513 mv a0,a5 10404: 309000ef jal ra,10f0c <__extendsfdf2> 10408: 00050713 mv a4,a0 1040c: 00058793 mv a5,a1 10410: 00070613 mv a2,a4 10414: 00078693 mv a3,a5 10418: 0001f7b7 lui a5,0x1f 1041c: 7a078513 addi a0,a5,1952 # 1f7a0 <_exit+0x6> 10420: 53f000ef jal ra,1115e <printf> 10424: fd442603 lw a2,-44(s0) 10428: fd042583 lw a1,-48(s0) 1042c: fcc42503 lw a0,-52(s0) 10430: e15ff0ef jal ra,10244 <binary_search> 10434: 00050793 mv a5,a0 10438: 00078513 mv a0,a5 1043c: 2d1000ef jal ra,10f0c <__extendsfdf2> 10440: 00050713 mv a4,a0 10444: 00058793 mv a5,a1 10448: 00070613 mv a2,a4 1044c: 00078693 mv a3,a5 10450: 0001f7b7 lui a5,0x1f 10454: 7a078513 addi a0,a5,1952 # 1f7a0 <_exit+0x6> 10458: 507000ef jal ra,1115e <printf> 1045c: 00000793 li a5,0 10460: 00078513 mv a0,a5 10464: 03c12083 lw ra,60(sp) 10468: 03812403 lw s0,56(sp) 1046c: 04010113 addi sp,sp,64 10470: 00008067 ret ``` ::: :::spoiler {state="open"}**readelf** ``` ELF Header: Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 Class: ELF32 Data: 2's complement, little endian Version: 1 (current) OS/ABI: UNIX - System V ABI Version: 0 Type: EXEC (Executable file) Machine: RISC-V Version: 0x1 Entry point address: 0x100a6 Start of program headers: 52 (bytes into file) Start of section headers: 89784 (bytes into file) Flags: 0x1, RVC, soft-float ABI Size of this header: 52 (bytes) Size of program headers: 32 (bytes) Number of program headers: 2 Size of section headers: 40 (bytes) Number of section headers: 17 Section header string table index: 16 ``` ::: :::spoiler {state="open"}**size** ``` text data bss dec hex filename 67196 2512 92 69800 110a8 hw2_o0.elf ``` ::: #### O1 optimization :::spoiler **objdump** ``` 
00010134 <fp32_to_bf16>: 10134: ff010113 addi sp,sp,-16 10138: 00112623 sw ra,12(sp) 1013c: 00812423 sw s0,8(sp) 10140: 00050413 mv s0,a0 10144: 00151793 slli a5,a0,0x1 10148: 02078863 beqz a5,10178 <fp32_to_bf16+0x44> 1014c: 7f8007b7 lui a5,0x7f800 10150: 00a7f733 and a4,a5,a0 10154: 02f70263 beq a4,a5,10178 <fp32_to_bf16+0x44> 10158: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18> 1015c: ff800537 lui a0,0xff800 10160: 00857533 and a0,a0,s0 10164: 67e000ef jal ra,107e2 <__mulsf3> 10168: 00040593 mv a1,s0 1016c: 188000ef jal ra,102f4 <__addsf3> 10170: ffff07b7 lui a5,0xffff0 10174: 00a7f533 and a0,a5,a0 10178: 00c12083 lw ra,12(sp) 1017c: 00812403 lw s0,8(sp) 10180: 01010113 addi sp,sp,16 10184: 00008067 ret 00010188 <binary_search>: 10188: fe010113 addi sp,sp,-32 1018c: 00112e23 sw ra,28(sp) 10190: 00812c23 sw s0,24(sp) 10194: 00912a23 sw s1,20(sp) 10198: 01212823 sw s2,16(sp) 1019c: 01312623 sw s3,12(sp) 101a0: 01412423 sw s4,8(sp) 101a4: 00058913 mv s2,a1 101a8: 00060413 mv s0,a2 101ac: f89ff0ef jal ra,10134 <fp32_to_bf16> 101b0: 00050493 mv s1,a0 101b4: 00090513 mv a0,s2 101b8: f7dff0ef jal ra,10134 <fp32_to_bf16> 101bc: 00050913 mv s2,a0 101c0: 00040513 mv a0,s0 101c4: f71ff0ef jal ra,10134 <fp32_to_bf16> 101c8: 00050993 mv s3,a0 101cc: 1c01aa03 lw s4,448(gp) # 21f50 <__SDATA_BEGIN__+0x20> 101d0: 00090593 mv a1,s2 101d4: 00048513 mv a0,s1 101d8: 588000ef jal ra,10760 <__lesf2> 101dc: 04a04a63 bgtz a0,10230 <binary_search+0xa8> 101e0: 00048593 mv a1,s1 101e4: 00090513 mv a0,s2 101e8: 083000ef jal ra,10a6a <__subsf3> 101ec: 000a0593 mv a1,s4 101f0: 5f2000ef jal ra,107e2 <__mulsf3> 101f4: 00048593 mv a1,s1 101f8: 0fc000ef jal ra,102f4 <__addsf3> 101fc: 00050413 mv s0,a0 10200: 00050593 mv a1,a0 10204: 00098513 mv a0,s3 10208: 482000ef jal ra,1068a <__eqsf2> 1020c: 02050463 beqz a0,10234 <binary_search+0xac> 10210: 00040593 mv a1,s0 10214: 00098513 mv a0,s3 10218: 4c6000ef jal ra,106de <__gesf2> 1021c: 00a05663 blez a0,10228 <binary_search+0xa0> 10220: 
00040493 mv s1,s0 10224: fadff06f j 101d0 <binary_search+0x48> 10228: 00040913 mv s2,s0 1022c: fa5ff06f j 101d0 <binary_search+0x48> 10230: 1bc1a403 lw s0,444(gp) # 21f4c <__SDATA_BEGIN__+0x1c> 10234: 00040513 mv a0,s0 10238: 01c12083 lw ra,28(sp) 1023c: 01812403 lw s0,24(sp) 10240: 01412483 lw s1,20(sp) 10244: 01012903 lw s2,16(sp) 10248: 00c12983 lw s3,12(sp) 1024c: 00812a03 lw s4,8(sp) 10250: 02010113 addi sp,sp,32 10254: 00008067 ret 00010258 <main>: 10258: ff010113 addi sp,sp,-16 1025c: 00112623 sw ra,12(sp) 10260: 00812423 sw s0,8(sp) 10264: 00912223 sw s1,4(sp) 10268: 1c81a483 lw s1,456(gp) # 21f58 <__SDATA_BEGIN__+0x28> 1026c: 1c41a603 lw a2,452(gp) # 21f54 <__SDATA_BEGIN__+0x24> 10270: 00048593 mv a1,s1 10274: 1cc1a503 lw a0,460(gp) # 21f5c <__SDATA_BEGIN__+0x2c> 10278: f11ff0ef jal ra,10188 <binary_search> 1027c: 377000ef jal ra,10df2 <__extendsfdf2> 10280: 00050613 mv a2,a0 10284: 00058693 mv a3,a1 10288: 0001f437 lui s0,0x1f 1028c: 69040513 addi a0,s0,1680 # 1f690 <_exit+0xe> 10290: 5b5000ef jal ra,11044 <printf> 10294: 1d01a603 lw a2,464(gp) # 21f60 <__SDATA_BEGIN__+0x30> 10298: 1d41a583 lw a1,468(gp) # 21f64 <__SDATA_BEGIN__+0x34> 1029c: 00048513 mv a0,s1 102a0: ee9ff0ef jal ra,10188 <binary_search> 102a4: 34f000ef jal ra,10df2 <__extendsfdf2> 102a8: 00050613 mv a2,a0 102ac: 00058693 mv a3,a1 102b0: 69040513 addi a0,s0,1680 102b4: 591000ef jal ra,11044 <printf> 102b8: 1d81a603 lw a2,472(gp) # 21f68 <__SDATA_BEGIN__+0x38> 102bc: 1dc1a583 lw a1,476(gp) # 21f6c <__SDATA_BEGIN__+0x3c> 102c0: 1e01a503 lw a0,480(gp) # 21f70 <__SDATA_BEGIN__+0x40> 102c4: ec5ff0ef jal ra,10188 <binary_search> 102c8: 32b000ef jal ra,10df2 <__extendsfdf2> 102cc: 00050613 mv a2,a0 102d0: 00058693 mv a3,a1 102d4: 69040513 addi a0,s0,1680 102d8: 56d000ef jal ra,11044 <printf> 102dc: 00000513 li a0,0 102e0: 00c12083 lw ra,12(sp) 102e4: 00812403 lw s0,8(sp) 102e8: 00412483 lw s1,4(sp) 102ec: 01010113 addi sp,sp,16 102f0: 00008067 ret ``` ::: :::spoiler **readelf** ``` ELF Header: 
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 Class: ELF32 Data: 2's complement, little endian Version: 1 (current) OS/ABI: UNIX - System V ABI Version: 0 Type: EXEC (Executable file) Machine: RISC-V Version: 0x1 Entry point address: 0x100a6 Start of program headers: 52 (bytes into file) Start of section headers: 87244 (bytes into file) Flags: 0x1, RVC, soft-float ABI Size of this header: 52 (bytes) Size of program headers: 32 (bytes) Number of program headers: 2 Size of section headers: 40 (bytes) Number of section headers: 17 Section header string table index: 16 ``` ::: :::spoiler {state="open"}**size** ``` text data bss dec hex filename 66820 2556 92 69468 10f5c hw2_o1.elf ``` ::: #### O2 optimization :::spoiler **objdump** ``` 000101d0 <fp32_to_bf16>: 101d0: ff010113 addi sp,sp,-16 101d4: 00812423 sw s0,8(sp) 101d8: 00112623 sw ra,12(sp) 101dc: 00151793 slli a5,a0,0x1 101e0: 00050413 mv s0,a0 101e4: 02078863 beqz a5,10214 <fp32_to_bf16+0x44> 101e8: 7f8007b7 lui a5,0x7f800 101ec: 00a7f733 and a4,a5,a0 101f0: 02f70263 beq a4,a5,10214 <fp32_to_bf16+0x44> 101f4: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18> 101f8: ff800537 lui a0,0xff800 101fc: 00857533 and a0,a0,s0 10200: 5ec000ef jal ra,107ec <__mulsf3> 10204: 00040593 mv a1,s0 10208: 178000ef jal ra,10380 <__addsf3> 1020c: ffff0437 lui s0,0xffff0 10210: 00a47433 and s0,s0,a0 10214: 00c12083 lw ra,12(sp) 10218: 00040513 mv a0,s0 1021c: 00812403 lw s0,8(sp) 10220: 01010113 addi sp,sp,16 10224: 00008067 ret 00010228 <binary_search>: 10228: fe010113 addi sp,sp,-32 1022c: 00912a23 sw s1,20(sp) 10230: 01212823 sw s2,16(sp) 10234: 01312623 sw s3,12(sp) 10238: 00112e23 sw ra,28(sp) 1023c: 00812c23 sw s0,24(sp) 10240: 01412423 sw s4,8(sp) 10244: 00151793 slli a5,a0,0x1 10248: 00050493 mv s1,a0 1024c: 00058913 mv s2,a1 10250: 00060993 mv s3,a2 10254: 02078863 beqz a5,10284 <binary_search+0x5c> 10258: 7f8007b7 lui a5,0x7f800 1025c: 00a7f6b3 and a3,a5,a0 10260: 02f68263 beq a3,a5,10284 <binary_search+0x5c> 
10264: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18> 10268: ff800537 lui a0,0xff800 1026c: 00a4f533 and a0,s1,a0 10270: 57c000ef jal ra,107ec <__mulsf3> 10274: 00048593 mv a1,s1 10278: 108000ef jal ra,10380 <__addsf3> 1027c: ffff04b7 lui s1,0xffff0 10280: 00a4f4b3 and s1,s1,a0 10284: 00191793 slli a5,s2,0x1 10288: 02078863 beqz a5,102b8 <binary_search+0x90> 1028c: 7f8007b7 lui a5,0x7f800 10290: 0127f6b3 and a3,a5,s2 10294: 02f68263 beq a3,a5,102b8 <binary_search+0x90> 10298: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18> 1029c: ff800537 lui a0,0xff800 102a0: 00a97533 and a0,s2,a0 102a4: 548000ef jal ra,107ec <__mulsf3> 102a8: 00090593 mv a1,s2 102ac: 0d4000ef jal ra,10380 <__addsf3> 102b0: ffff0937 lui s2,0xffff0 102b4: 00a97933 and s2,s2,a0 102b8: 00199793 slli a5,s3,0x1 102bc: 02078863 beqz a5,102ec <binary_search+0xc4> 102c0: 7f8007b7 lui a5,0x7f800 102c4: 0137f6b3 and a3,a5,s3 102c8: 02f68263 beq a3,a5,102ec <binary_search+0xc4> 102cc: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18> 102d0: ff800537 lui a0,0xff800 102d4: 00a9f533 and a0,s3,a0 102d8: 514000ef jal ra,107ec <__mulsf3> 102dc: 00098593 mv a1,s3 102e0: 0a0000ef jal ra,10380 <__addsf3> 102e4: ffff09b7 lui s3,0xffff0 102e8: 00a9f9b3 and s3,s3,a0 102ec: 1c01aa03 lw s4,448(gp) # 21f50 <__SDATA_BEGIN__+0x20> 102f0: 0440006f j 10334 <binary_search+0x10c> 102f4: 780000ef jal ra,10a74 <__subsf3> 102f8: 000a0593 mv a1,s4 102fc: 4f0000ef jal ra,107ec <__mulsf3> 10300: 00048593 mv a1,s1 10304: 07c000ef jal ra,10380 <__addsf3> 10308: 00098593 mv a1,s3 1030c: 00050413 mv s0,a0 10310: 406000ef jal ra,10716 <__eqsf2> 10314: 00050793 mv a5,a0 10318: 00098593 mv a1,s3 1031c: 00040513 mv a0,s0 10320: 00040713 mv a4,s0 10324: 02078863 beqz a5,10354 <binary_search+0x12c> 10328: 442000ef jal ra,1076a <__lesf2> 1032c: 04055663 bgez a0,10378 <binary_search+0x150> 10330: 00040493 mv s1,s0 10334: 00090593 mv a1,s2 10338: 00048513 mv a0,s1 1033c: 42e000ef jal ra,1076a <__lesf2> 10340: 00050793 mv a5,a0 
10344: 00048593 mv a1,s1 10348: 00090513 mv a0,s2 1034c: faf054e3 blez a5,102f4 <binary_search+0xcc> 10350: 1bc1a703 lw a4,444(gp) # 21f4c <__SDATA_BEGIN__+0x1c> 10354: 01c12083 lw ra,28(sp) 10358: 01812403 lw s0,24(sp) 1035c: 01412483 lw s1,20(sp) 10360: 01012903 lw s2,16(sp) 10364: 00c12983 lw s3,12(sp) 10368: 00812a03 lw s4,8(sp) 1036c: 00070513 mv a0,a4 10370: 02010113 addi sp,sp,32 10374: 00008067 ret 10378: 00040913 mv s2,s0 1037c: fb9ff06f j 10334 <binary_search+0x10c> 00010094 <main>: 10094: ff010113 addi sp,sp,-16 10098: 00912223 sw s1,4(sp) 1009c: 1c81a483 lw s1,456(gp) # 21f58 <__SDATA_BEGIN__+0x28> 100a0: 1c41a603 lw a2,452(gp) # 21f54 <__SDATA_BEGIN__+0x24> 100a4: 1cc1a503 lw a0,460(gp) # 21f5c <__SDATA_BEGIN__+0x2c> 100a8: 00048593 mv a1,s1 100ac: 00112623 sw ra,12(sp) 100b0: 00812423 sw s0,8(sp) 100b4: 174000ef jal ra,10228 <binary_search> 100b8: 545000ef jal ra,10dfc <__extendsfdf2> 100bc: 0001f437 lui s0,0x1f 100c0: 00058693 mv a3,a1 100c4: 00050613 mv a2,a0 100c8: 69040513 addi a0,s0,1680 # 1f690 <_exit+0x4> 100cc: 783000ef jal ra,1104e <printf> 100d0: 1d01a603 lw a2,464(gp) # 21f60 <__SDATA_BEGIN__+0x30> 100d4: 1d41a583 lw a1,468(gp) # 21f64 <__SDATA_BEGIN__+0x34> 100d8: 00048513 mv a0,s1 100dc: 14c000ef jal ra,10228 <binary_search> 100e0: 51d000ef jal ra,10dfc <__extendsfdf2> 100e4: 00058693 mv a3,a1 100e8: 00050613 mv a2,a0 100ec: 69040513 addi a0,s0,1680 100f0: 75f000ef jal ra,1104e <printf> 100f4: 1d81a603 lw a2,472(gp) # 21f68 <__SDATA_BEGIN__+0x38> 100f8: 1dc1a583 lw a1,476(gp) # 21f6c <__SDATA_BEGIN__+0x3c> 100fc: 1e01a503 lw a0,480(gp) # 21f70 <__SDATA_BEGIN__+0x40> 10100: 128000ef jal ra,10228 <binary_search> 10104: 4f9000ef jal ra,10dfc <__extendsfdf2> 10108: 00050613 mv a2,a0 1010c: 00058693 mv a3,a1 10110: 69040513 addi a0,s0,1680 10114: 73b000ef jal ra,1104e <printf> 10118: 00c12083 lw ra,12(sp) 1011c: 00812403 lw s0,8(sp) 10120: 00412483 lw s1,4(sp) 10124: 00000513 li a0,0 10128: 01010113 addi sp,sp,16 1012c: 00008067 ret ``` ::: 
:::spoiler **readelf** ``` ELF Header: Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 Class: ELF32 Data: 2's complement, little endian Version: 1 (current) OS/ABI: UNIX - System V ABI Version: 0 Type: EXEC (Executable file) Machine: RISC-V Version: 0x1 Entry point address: 0x10144 Start of program headers: 52 (bytes into file) Start of section headers: 87140 (bytes into file) Flags: 0x1, RVC, soft-float ABI Size of this header: 52 (bytes) Size of program headers: 32 (bytes) Number of program headers: 2 Size of section headers: 40 (bytes) Number of section headers: 17 Section header string table index: 16 ``` ::: :::spoiler {state="open"}**size** ``` text data bss dec hex filename 66830 2556 92 69478 10f66 hw2_o2.elf ``` ::: #### O3 optimization :::spoiler **objdump** ``` 000101d0 <fp32_to_bf16>: 101d0: ff010113 addi sp,sp,-16 101d4: 00812423 sw s0,8(sp) 101d8: 00112623 sw ra,12(sp) 101dc: 00151793 slli a5,a0,0x1 101e0: 00050413 mv s0,a0 101e4: 02078863 beqz a5,10214 <fp32_to_bf16+0x44> 101e8: 7f8007b7 lui a5,0x7f800 101ec: 00a7f733 and a4,a5,a0 101f0: 02f70263 beq a4,a5,10214 <fp32_to_bf16+0x44> 101f4: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18> 101f8: ff800537 lui a0,0xff800 101fc: 00857533 and a0,a0,s0 10200: 5ec000ef jal ra,107ec <__mulsf3> 10204: 00040593 mv a1,s0 10208: 178000ef jal ra,10380 <__addsf3> 1020c: ffff0437 lui s0,0xffff0 10210: 00a47433 and s0,s0,a0 10214: 00c12083 lw ra,12(sp) 10218: 00040513 mv a0,s0 1021c: 00812403 lw s0,8(sp) 10220: 01010113 addi sp,sp,16 10224: 00008067 ret 00010228 <binary_search>: 10228: fe010113 addi sp,sp,-32 1022c: 00912a23 sw s1,20(sp) 10230: 01212823 sw s2,16(sp) 10234: 01312623 sw s3,12(sp) 10238: 00112e23 sw ra,28(sp) 1023c: 00812c23 sw s0,24(sp) 10240: 01412423 sw s4,8(sp) 10244: 00151793 slli a5,a0,0x1 10248: 00050493 mv s1,a0 1024c: 00058913 mv s2,a1 10250: 00060993 mv s3,a2 10254: 02078863 beqz a5,10284 <binary_search+0x5c> 10258: 7f8007b7 lui a5,0x7f800 1025c: 00a7f6b3 and a3,a5,a0 10260: 
02f68263 beq a3,a5,10284 <binary_search+0x5c> 10264: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18> 10268: ff800537 lui a0,0xff800 1026c: 00a4f533 and a0,s1,a0 10270: 57c000ef jal ra,107ec <__mulsf3> 10274: 00048593 mv a1,s1 10278: 108000ef jal ra,10380 <__addsf3> 1027c: ffff04b7 lui s1,0xffff0 10280: 00a4f4b3 and s1,s1,a0 10284: 00191793 slli a5,s2,0x1 10288: 02078863 beqz a5,102b8 <binary_search+0x90> 1028c: 7f8007b7 lui a5,0x7f800 10290: 0127f6b3 and a3,a5,s2 10294: 02f68263 beq a3,a5,102b8 <binary_search+0x90> 10298: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18> 1029c: ff800537 lui a0,0xff800 102a0: 00a97533 and a0,s2,a0 102a4: 548000ef jal ra,107ec <__mulsf3> 102a8: 00090593 mv a1,s2 102ac: 0d4000ef jal ra,10380 <__addsf3> 102b0: ffff0937 lui s2,0xffff0 102b4: 00a97933 and s2,s2,a0 102b8: 00199793 slli a5,s3,0x1 102bc: 02078863 beqz a5,102ec <binary_search+0xc4> 102c0: 7f8007b7 lui a5,0x7f800 102c4: 0137f6b3 and a3,a5,s3 102c8: 02f68263 beq a3,a5,102ec <binary_search+0xc4> 102cc: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18> 102d0: ff800537 lui a0,0xff800 102d4: 00a9f533 and a0,s3,a0 102d8: 514000ef jal ra,107ec <__mulsf3> 102dc: 00098593 mv a1,s3 102e0: 0a0000ef jal ra,10380 <__addsf3> 102e4: ffff09b7 lui s3,0xffff0 102e8: 00a9f9b3 and s3,s3,a0 102ec: 1c01aa03 lw s4,448(gp) # 21f50 <__SDATA_BEGIN__+0x20> 102f0: 0440006f j 10334 <binary_search+0x10c> 102f4: 780000ef jal ra,10a74 <__subsf3> 102f8: 000a0593 mv a1,s4 102fc: 4f0000ef jal ra,107ec <__mulsf3> 10300: 00048593 mv a1,s1 10304: 07c000ef jal ra,10380 <__addsf3> 10308: 00098593 mv a1,s3 1030c: 00050413 mv s0,a0 10310: 406000ef jal ra,10716 <__eqsf2> 10314: 00050793 mv a5,a0 10318: 00098593 mv a1,s3 1031c: 00040513 mv a0,s0 10320: 00040713 mv a4,s0 10324: 02078863 beqz a5,10354 <binary_search+0x12c> 10328: 442000ef jal ra,1076a <__lesf2> 1032c: 04055663 bgez a0,10378 <binary_search+0x150> 10330: 00040493 mv s1,s0 10334: 00090593 mv a1,s2 10338: 00048513 mv a0,s1 1033c: 42e000ef jal 
ra,1076a <__lesf2> 10340: 00050793 mv a5,a0 10344: 00048593 mv a1,s1 10348: 00090513 mv a0,s2 1034c: faf054e3 blez a5,102f4 <binary_search+0xcc> 10350: 1bc1a703 lw a4,444(gp) # 21f4c <__SDATA_BEGIN__+0x1c> 10354: 01c12083 lw ra,28(sp) 10358: 01812403 lw s0,24(sp) 1035c: 01412483 lw s1,20(sp) 10360: 01012903 lw s2,16(sp) 10364: 00c12983 lw s3,12(sp) 10368: 00812a03 lw s4,8(sp) 1036c: 00070513 mv a0,a4 10370: 02010113 addi sp,sp,32 10374: 00008067 ret 10378: 00040913 mv s2,s0 1037c: fb9ff06f j 10334 <binary_search+0x10c> 00010094 <main>: 10094: ff010113 addi sp,sp,-16 10098: 00912223 sw s1,4(sp) 1009c: 1c81a483 lw s1,456(gp) # 21f58 <__SDATA_BEGIN__+0x28> 100a0: 1c41a603 lw a2,452(gp) # 21f54 <__SDATA_BEGIN__+0x24> 100a4: 1cc1a503 lw a0,460(gp) # 21f5c <__SDATA_BEGIN__+0x2c> 100a8: 00048593 mv a1,s1 100ac: 00112623 sw ra,12(sp) 100b0: 00812423 sw s0,8(sp) 100b4: 174000ef jal ra,10228 <binary_search> 100b8: 545000ef jal ra,10dfc <__extendsfdf2> 100bc: 0001f437 lui s0,0x1f 100c0: 00058693 mv a3,a1 100c4: 00050613 mv a2,a0 100c8: 69040513 addi a0,s0,1680 # 1f690 <_exit+0x4> 100cc: 783000ef jal ra,1104e <printf> 100d0: 1d01a603 lw a2,464(gp) # 21f60 <__SDATA_BEGIN__+0x30> 100d4: 1d41a583 lw a1,468(gp) # 21f64 <__SDATA_BEGIN__+0x34> 100d8: 00048513 mv a0,s1 100dc: 14c000ef jal ra,10228 <binary_search> 100e0: 51d000ef jal ra,10dfc <__extendsfdf2> 100e4: 00058693 mv a3,a1 100e8: 00050613 mv a2,a0 100ec: 69040513 addi a0,s0,1680 100f0: 75f000ef jal ra,1104e <printf> 100f4: 1d81a603 lw a2,472(gp) # 21f68 <__SDATA_BEGIN__+0x38> 100f8: 1dc1a583 lw a1,476(gp) # 21f6c <__SDATA_BEGIN__+0x3c> 100fc: 1e01a503 lw a0,480(gp) # 21f70 <__SDATA_BEGIN__+0x40> 10100: 128000ef jal ra,10228 <binary_search> 10104: 4f9000ef jal ra,10dfc <__extendsfdf2> 10108: 00050613 mv a2,a0 1010c: 00058693 mv a3,a1 10110: 69040513 addi a0,s0,1680 10114: 73b000ef jal ra,1104e <printf> 10118: 00c12083 lw ra,12(sp) 1011c: 00812403 lw s0,8(sp) 10120: 00412483 lw s1,4(sp) 10124: 00000513 li a0,0 10128: 01010113 
addi sp,sp,16 1012c: 00008067 ret ``` ::: :::spoiler **readelf** ``` ELF Header: Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 Class: ELF32 Data: 2's complement, little endian Version: 1 (current) OS/ABI: UNIX - System V ABI Version: 0 Type: EXEC (Executable file) Machine: RISC-V Version: 0x1 Entry point address: 0x10144 Start of program headers: 52 (bytes into file) Start of section headers: 87140 (bytes into file) Flags: 0x1, RVC, soft-float ABI Size of this header: 52 (bytes) Size of program headers: 32 (bytes) Number of program headers: 2 Size of section headers: 40 (bytes) Number of section headers: 17 Section header string table index: 16 ``` ::: :::spoiler {state="open"}**size** ``` text data bss dec hex filename 66830 2556 92 69478 10f66 hw2_o3.elf ``` ::: #### Os optimization :::spoiler **objdump** ``` 000101d0 <fp32_to_bf16>: 101d0: ff010113 addi sp,sp,-16 101d4: 7f8007b7 lui a5,0x7f800 101d8: 00812423 sw s0,8(sp) 101dc: 00112623 sw ra,12(sp) 101e0: 00a7f733 and a4,a5,a0 101e4: 00050413 mv s0,a0 101e8: 02071063 bnez a4,10208 <fp32_to_bf16+0x38> 101ec: 00951793 slli a5,a0,0x9 101f0: 00079e63 bnez a5,1020c <fp32_to_bf16+0x3c> 101f4: 00c12083 lw ra,12(sp) 101f8: 00040513 mv a0,s0 101fc: 00812403 lw s0,8(sp) 10200: 01010113 addi sp,sp,16 10204: 00008067 ret 10208: fef706e3 beq a4,a5,101f4 <fp32_to_bf16+0x24> 1020c: 1b81a583 lw a1,440(gp) # 21f58 <__SDATA_BEGIN__+0x18> 10210: ff8007b7 lui a5,0xff800 10214: 00f57533 and a0,a0,a5 10218: 5e2000ef jal ra,107fa <__mulsf3> 1021c: 00040593 mv a1,s0 10220: 0ec000ef jal ra,1030c <__addsf3> 10224: ffff0437 lui s0,0xffff0 10228: 00a47433 and s0,s0,a0 1022c: fc9ff06f j 101f4 <fp32_to_bf16+0x24> 00010230 <binary_search>: 10230: fe010113 addi sp,sp,-32 10234: 00112e23 sw ra,28(sp) 10238: 00812c23 sw s0,24(sp) 1023c: 00912a23 sw s1,20(sp) 10240: 00060413 mv s0,a2 10244: 01212823 sw s2,16(sp) 10248: 01312623 sw s3,12(sp) 1024c: 00058913 mv s2,a1 10250: 01512223 sw s5,4(sp) 10254: 01412423 sw s4,8(sp) 10258: f79ff0ef 
jal ra,101d0 <fp32_to_bf16> 1025c: 00050493 mv s1,a0 10260: 00090513 mv a0,s2 10264: f6dff0ef jal ra,101d0 <fp32_to_bf16> 10268: 00050913 mv s2,a0 1026c: 00040513 mv a0,s0 10270: f61ff0ef jal ra,101d0 <fp32_to_bf16> 10274: 1c01aa83 lw s5,448(gp) # 21f60 <__SDATA_BEGIN__+0x20> 10278: 00050993 mv s3,a0 1027c: 00090593 mv a1,s2 10280: 00048513 mv a0,s1 10284: 4f4000ef jal ra,10778 <__lesf2> 10288: 02a05863 blez a0,102b8 <binary_search+0x88> 1028c: 1bc1aa03 lw s4,444(gp) # 21f5c <__SDATA_BEGIN__+0x1c> 10290: 01c12083 lw ra,28(sp) 10294: 01812403 lw s0,24(sp) 10298: 01412483 lw s1,20(sp) 1029c: 01012903 lw s2,16(sp) 102a0: 00c12983 lw s3,12(sp) 102a4: 00412a83 lw s5,4(sp) 102a8: 000a0513 mv a0,s4 102ac: 00812a03 lw s4,8(sp) 102b0: 02010113 addi sp,sp,32 102b4: 00008067 ret 102b8: 00048593 mv a1,s1 102bc: 00090513 mv a0,s2 102c0: 7c2000ef jal ra,10a82 <__subsf3> 102c4: 000a8593 mv a1,s5 102c8: 532000ef jal ra,107fa <__mulsf3> 102cc: 00048593 mv a1,s1 102d0: 03c000ef jal ra,1030c <__addsf3> 102d4: 00050413 mv s0,a0 102d8: 00050a13 mv s4,a0 102dc: 00050593 mv a1,a0 102e0: 00098513 mv a0,s3 102e4: 3be000ef jal ra,106a2 <__eqsf2> 102e8: fa0504e3 beqz a0,10290 <binary_search+0x60> 102ec: 00040593 mv a1,s0 102f0: 00098513 mv a0,s3 102f4: 402000ef jal ra,106f6 <__gesf2> 102f8: 00a04663 bgtz a0,10304 <binary_search+0xd4> 102fc: 00040913 mv s2,s0 10300: f7dff06f j 1027c <binary_search+0x4c> 10304: 00040493 mv s1,s0 10308: f75ff06f j 1027c <binary_search+0x4c> 00010094 <main>: 10094: ff010113 addi sp,sp,-16 10098: 00912223 sw s1,4(sp) 1009c: 1c81a483 lw s1,456(gp) # 21f68 <__SDATA_BEGIN__+0x28> 100a0: 1c41a603 lw a2,452(gp) # 21f64 <__SDATA_BEGIN__+0x24> 100a4: 1cc1a503 lw a0,460(gp) # 21f6c <__SDATA_BEGIN__+0x2c> 100a8: 00048593 mv a1,s1 100ac: 00112623 sw ra,12(sp) 100b0: 00812423 sw s0,8(sp) 100b4: 17c000ef jal ra,10230 <binary_search> 100b8: 553000ef jal ra,10e0a <__extendsfdf2> 100bc: 0001f437 lui s0,0x1f 100c0: 00058693 mv a3,a1 100c4: 00050613 mv a2,a0 100c8: 6a040513 addi 
a0,s0,1696 # 1f6a0 <_exit+0x6> 100cc: 791000ef jal ra,1105c <printf> 100d0: 1d01a603 lw a2,464(gp) # 21f70 <__SDATA_BEGIN__+0x30> 100d4: 1d41a583 lw a1,468(gp) # 21f74 <__SDATA_BEGIN__+0x34> 100d8: 00048513 mv a0,s1 100dc: 154000ef jal ra,10230 <binary_search> 100e0: 52b000ef jal ra,10e0a <__extendsfdf2> 100e4: 00058693 mv a3,a1 100e8: 00050613 mv a2,a0 100ec: 6a040513 addi a0,s0,1696 100f0: 76d000ef jal ra,1105c <printf> 100f4: 1d81a603 lw a2,472(gp) # 21f78 <__SDATA_BEGIN__+0x38> 100f8: 1dc1a583 lw a1,476(gp) # 21f7c <__SDATA_BEGIN__+0x3c> 100fc: 1e01a503 lw a0,480(gp) # 21f80 <__SDATA_BEGIN__+0x40> 10100: 130000ef jal ra,10230 <binary_search> 10104: 507000ef jal ra,10e0a <__extendsfdf2> 10108: 00050613 mv a2,a0 1010c: 00058693 mv a3,a1 10110: 6a040513 addi a0,s0,1696 10114: 749000ef jal ra,1105c <printf> 10118: 00c12083 lw ra,12(sp) 1011c: 00812403 lw s0,8(sp) 10120: 00412483 lw s1,4(sp) 10124: 00000513 li a0,0 10128: 01010113 addi sp,sp,16 1012c: 00008067 ret ``` ::: :::spoiler **readelf** ``` ELF Header: Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 Class: ELF32 Data: 2's complement, little endian Version: 1 (current) OS/ABI: UNIX - System V ABI Version: 0 Type: EXEC (Executable file) Machine: RISC-V Version: 0x1 Entry point address: 0x10144 Start of program headers: 52 (bytes into file) Start of section headers: 87260 (bytes into file) Flags: 0x1, RVC, soft-float ABI Size of this header: 52 (bytes) Size of program headers: 32 (bytes) Number of program headers: 2 Size of section headers: 40 (bytes) Number of section headers: 17 Section header string table index: 16 ``` ::: :::spoiler {state="open"}**size** ``` text data bss dec hex filename 66844 2556 92 69492 10f74 hw2_os.elf ``` ::: #### Ofast optimization :::spoiler **objdump** ``` 000101d0 <fp32_to_bf16>: 101d0: ff010113 addi sp,sp,-16 101d4: 00812423 sw s0,8(sp) 101d8: 00112623 sw ra,12(sp) 101dc: 00151793 slli a5,a0,0x1 101e0: 00050413 mv s0,a0 101e4: 02078863 beqz a5,10214 <fp32_to_bf16+0x44> 
101e8: 7f8007b7 lui a5,0x7f800 101ec: 00a7f733 and a4,a5,a0 101f0: 02f70263 beq a4,a5,10214 <fp32_to_bf16+0x44> 101f4: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18> 101f8: ff800537 lui a0,0xff800 101fc: 00857533 and a0,a0,s0 10200: 5e8000ef jal ra,107e8 <__mulsf3> 10204: 00040593 mv a1,s0 10208: 174000ef jal ra,1037c <__addsf3> 1020c: ffff0437 lui s0,0xffff0 10210: 00a47433 and s0,s0,a0 10214: 00c12083 lw ra,12(sp) 10218: 00040513 mv a0,s0 1021c: 00812403 lw s0,8(sp) 10220: 01010113 addi sp,sp,16 10224: 00008067 ret 00010228 <binary_search>: 10228: fe010113 addi sp,sp,-32 1022c: 00912a23 sw s1,20(sp) 10230: 01212823 sw s2,16(sp) 10234: 01412423 sw s4,8(sp) 10238: 00112e23 sw ra,28(sp) 1023c: 00812c23 sw s0,24(sp) 10240: 01312623 sw s3,12(sp) 10244: 00151793 slli a5,a0,0x1 10248: 00050493 mv s1,a0 1024c: 00058a13 mv s4,a1 10250: 00060913 mv s2,a2 10254: 02078863 beqz a5,10284 <binary_search+0x5c> 10258: 7f8007b7 lui a5,0x7f800 1025c: 00a7f6b3 and a3,a5,a0 10260: 02f68263 beq a3,a5,10284 <binary_search+0x5c> 10264: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18> 10268: ff800537 lui a0,0xff800 1026c: 00a4f533 and a0,s1,a0 10270: 578000ef jal ra,107e8 <__mulsf3> 10274: 00048593 mv a1,s1 10278: 104000ef jal ra,1037c <__addsf3> 1027c: ffff04b7 lui s1,0xffff0 10280: 00a4f4b3 and s1,s1,a0 10284: 001a1793 slli a5,s4,0x1 10288: 02078863 beqz a5,102b8 <binary_search+0x90> 1028c: 7f8007b7 lui a5,0x7f800 10290: 0147f6b3 and a3,a5,s4 10294: 02f68263 beq a3,a5,102b8 <binary_search+0x90> 10298: 1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18> 1029c: ff800537 lui a0,0xff800 102a0: 00aa7533 and a0,s4,a0 102a4: 544000ef jal ra,107e8 <__mulsf3> 102a8: 000a0593 mv a1,s4 102ac: 0d0000ef jal ra,1037c <__addsf3> 102b0: ffff0a37 lui s4,0xffff0 102b4: 00aa7a33 and s4,s4,a0 102b8: 00191793 slli a5,s2,0x1 102bc: 02078863 beqz a5,102ec <binary_search+0xc4> 102c0: 7f8007b7 lui a5,0x7f800 102c4: 0127f733 and a4,a5,s2 102c8: 02f70263 beq a4,a5,102ec <binary_search+0xc4> 102cc: 
1b81a583 lw a1,440(gp) # 21f48 <__SDATA_BEGIN__+0x18> 102d0: ff800537 lui a0,0xff800 102d4: 01257533 and a0,a0,s2 102d8: 510000ef jal ra,107e8 <__mulsf3> 102dc: 00090593 mv a1,s2 102e0: 09c000ef jal ra,1037c <__addsf3> 102e4: ffff0937 lui s2,0xffff0 102e8: 00a97933 and s2,s2,a0 102ec: 1c01a983 lw s3,448(gp) # 21f50 <__SDATA_BEGIN__+0x20> 102f0: 0400006f j 10330 <binary_search+0x108> 102f4: 77c000ef jal ra,10a70 <__subsf3> 102f8: 00098593 mv a1,s3 102fc: 4ec000ef jal ra,107e8 <__mulsf3> 10300: 00048593 mv a1,s1 10304: 078000ef jal ra,1037c <__addsf3> 10308: 00090593 mv a1,s2 1030c: 00050413 mv s0,a0 10310: 402000ef jal ra,10712 <__eqsf2> 10314: 00050793 mv a5,a0 10318: 00090593 mv a1,s2 1031c: 00040513 mv a0,s0 10320: 02078863 beqz a5,10350 <binary_search+0x128> 10324: 442000ef jal ra,10766 <__lesf2> 10328: 04055663 bgez a0,10374 <binary_search+0x14c> 1032c: 00040493 mv s1,s0 10330: 000a0593 mv a1,s4 10334: 00048513 mv a0,s1 10338: 42e000ef jal ra,10766 <__lesf2> 1033c: 00050793 mv a5,a0 10340: 00048593 mv a1,s1 10344: 000a0513 mv a0,s4 10348: faf056e3 blez a5,102f4 <binary_search+0xcc> 1034c: 1bc1a903 lw s2,444(gp) # 21f4c <__SDATA_BEGIN__+0x1c> 10350: 01c12083 lw ra,28(sp) 10354: 01812403 lw s0,24(sp) 10358: 01412483 lw s1,20(sp) 1035c: 00c12983 lw s3,12(sp) 10360: 00812a03 lw s4,8(sp) 10364: 00090513 mv a0,s2 10368: 01012903 lw s2,16(sp) 1036c: 02010113 addi sp,sp,32 10370: 00008067 ret 10374: 00040a13 mv s4,s0 10378: fb9ff06f j 10330 <binary_search+0x108> 00010094 <main>: 10094: ff010113 addi sp,sp,-16 10098: 00912223 sw s1,4(sp) 1009c: 1c81a483 lw s1,456(gp) # 21f58 <__SDATA_BEGIN__+0x28> 100a0: 1c41a603 lw a2,452(gp) # 21f54 <__SDATA_BEGIN__+0x24> 100a4: 1cc1a503 lw a0,460(gp) # 21f5c <__SDATA_BEGIN__+0x2c> 100a8: 00048593 mv a1,s1 100ac: 00112623 sw ra,12(sp) 100b0: 00812423 sw s0,8(sp) 100b4: 174000ef jal ra,10228 <binary_search> 100b8: 541000ef jal ra,10df8 <__extendsfdf2> 100bc: 0001f437 lui s0,0x1f 100c0: 00058693 mv a3,a1 100c4: 00050613 mv a2,a0 100c8: 
69040513 addi a0,s0,1680 # 1f690 <_exit+0x8> 100cc: 77f000ef jal ra,1104a <printf> 100d0: 1d01a603 lw a2,464(gp) # 21f60 <__SDATA_BEGIN__+0x30> 100d4: 1d41a583 lw a1,468(gp) # 21f64 <__SDATA_BEGIN__+0x34> 100d8: 00048513 mv a0,s1 100dc: 14c000ef jal ra,10228 <binary_search> 100e0: 519000ef jal ra,10df8 <__extendsfdf2> 100e4: 00058693 mv a3,a1 100e8: 00050613 mv a2,a0 100ec: 69040513 addi a0,s0,1680 100f0: 75b000ef jal ra,1104a <printf> 100f4: 1d81a603 lw a2,472(gp) # 21f68 <__SDATA_BEGIN__+0x38> 100f8: 1dc1a583 lw a1,476(gp) # 21f6c <__SDATA_BEGIN__+0x3c> 100fc: 1e01a503 lw a0,480(gp) # 21f70 <__SDATA_BEGIN__+0x40> 10100: 128000ef jal ra,10228 <binary_search> 10104: 4f5000ef jal ra,10df8 <__extendsfdf2> 10108: 00050613 mv a2,a0 1010c: 00058693 mv a3,a1 10110: 69040513 addi a0,s0,1680 10114: 737000ef jal ra,1104a <printf> 10118: 00c12083 lw ra,12(sp) 1011c: 00812403 lw s0,8(sp) 10120: 00412483 lw s1,4(sp) 10124: 00000513 li a0,0 10128: 01010113 addi sp,sp,16 1012c: 00008067 ret ``` ::: :::spoiler **readelf** ``` ELF Header: Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 Class: ELF32 Data: 2's complement, little endian Version: 1 (current) OS/ABI: UNIX - System V ABI Version: 0 Type: EXEC (Executable file) Machine: RISC-V Version: 0x1 Entry point address: 0x10144 Start of program headers: 52 (bytes into file) Start of section headers: 87140 (bytes into file) Flags: 0x1, RVC, soft-float ABI Size of this header: 52 (bytes) Size of program headers: 32 (bytes) Number of program headers: 2 Size of section headers: 40 (bytes) Number of section headers: 17 Section header string table index: 16 ``` ::: :::spoiler {state="open"}**size** ``` text data bss dec hex filename 66826 2556 92 69474 10f62 hw2_ofast.elf ``` ::: ## Environment Reference: [Lab2: RISC-V RV32I[MA] emulator with ELF support](https://hackmd.io/3tO0gfoqT7upOzVpFZlFBw?view) ### GNU Toolchain Download GNU Toolchain ```shell $ cd /tmp $ wget 
https://github.com/xpack-dev-tools/riscv-none-elf-gcc-xpack/releases/download/v13.2.0-2/xpack-riscv-none-elf-gcc-13.2.0-2-linux-x64.tar.gz $ tar zxvf xpack-riscv-none-elf-gcc-13.2.0-2-linux-x64.tar.gz $ cp -af xpack-riscv-none-elf-gcc-13.2.0-2 $HOME/riscv-none-elf-gcc ``` **Configure $PATH** ```shell $ gedit ~/.bashrc # add riscv-none-elf-gcc into PATH export PATH="$PATH:/home/chen/riscv-none-elf-gcc/bin" # add rv32emu into PATH export PATH="$PATH:/home/chen/rv32emu/bin" $ source ~/.bashrc ``` Check ```shell $ riscv-none-elf-gcc -v ``` ### rv32emu ```shell # install the required packages $ sudo apt update && sudo apt install -y libsdl2-dev libsdl2-mixer-dev make # for "make *** [Makefile:142: build/map.o] Error 127" error $ sudo apt-get install build-essential $ git clone https://github.com/sysprog21/rv32emu $ cd rv32emu $ make $ make check ``` :::warning You shall use RDCYCLE/RDCYCLEH instruction for the statistics of your program’s execution. :notes: jserv ::: ## RDCYCLE Take `getcycles.S` from the [rv32emu repo](https://github.com/sysprog21/rv32emu/blob/master/tests/perfcounter/getcycles.S) and add it to the current folder. * getcycles.S ```c .text .globl get_cycles .align 2 get_cycles: csrr a1, cycleh csrr a0, cycle csrr a2, cycleh bne a1, a2, get_cycles ret .size get_cycles,.-get_cycles ``` Call `get_cycles()` at the start and end of `main()` in the C program to measure the elapsed cycle count. * hw2_cycle.c ```diff! 
#include <stdio.h> +#include <stdint.h> +extern uint64_t get_cycles(); float fp32_to_bf16(float x) { float y = x; int *p = (int *) &y; unsigned int exp = *p & 0x7F800000; unsigned int man = *p & 0x007FFFFF; if (exp == 0 && man == 0) return x; if (exp == 0x7F800000) return x; float r = x; int *pr = (int *) &r; *pr &= 0xFF800000; r /= 0x100; y = x + r; *p &= 0xFFFF0000; return y; } float binary_search(float low, float high, float target) { low = fp32_to_bf16(low); high = fp32_to_bf16(high); target = fp32_to_bf16(target); while (low <= high) { float mid = low + (high - low) / 2; if (mid == target) { return mid; } if (mid < target) { low = mid; } else { high = mid; } } return -1; } int main(){ + uint64_t oldcount = get_cycles(); float test_case1_x= 0.1; float test_case1_ub = 10.0; float test_case1_lb = 0.001; float test_case2_x= 100.0; float test_case2_ub = 256.0; float test_case2_lb = 10.0; float test_case3_x= 0.563; float test_case3_ub = 1.0; float test_case3_lb = 0.01; printf("%f\n",binary_search(test_case1_lb, test_case1_ub, test_case1_x)); printf("%f\n",binary_search(test_case2_lb, test_case2_ub, test_case2_x)); printf("%f\n",binary_search(test_case3_lb, test_case3_ub, test_case3_x)); + uint64_t cyclecount = get_cycles() - oldcount; + printf("cyle count: %u\n", (unsigned int)cyclecount); return 0; } ``` * Assemble `getcycles.S` into an object file: convert `getcycles.S` to `getcycles.o` ```shell $ riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -Wall -c -o getcycles.o getcycles.S ``` * Compile `hw2_cycle.c` into an object file with O1 optimization: convert `hw2_cycle.c` to `hw2_cycle_o1.o` ```shell $ riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -O1 -Wall -c -o hw2_cycle_o1.o hw2_cycle.c ``` * Link `hw2_cycle_o1.o` and `getcycles.o` into `hw2_cycle_o1.elf` ```shell $ riscv-none-elf-gcc -o hw2_cycle_o1.elf getcycles.o hw2_cycle_o1.o ``` * Use `rv32emu` to run the ELF file ```shell $ cd /rv32emu/tests/ca_hw2 $ ../../build/rv32emu 
hw2_cycle_o1.elf ``` ### Result * Output with O1 optimization ``` 0.100098 100.000000 0.562500 cyle count: 43593 inferior exit code 0 ``` * Cycle counts at different optimization levels | | O0 | O1 | O2 | O3 | Os | Ofast | | :---------: | :---: | :---: | :---: | :---: | :---: | :---: | | cycle count | 44371 | 43593 | 43620 | 43620 | 43684 | 43538 | :::warning Show me the handwritten RISC-V assembly code. :notes: jserv :::