HW2 - HackMD

# Assignment2: RISC-V Toolchain Contributed by <[jeremy90307](https://github.com/jeremy90307)> ## Prepare GNU Toolchain for RISC-V [Lab2: RISC-V RV32I[MA] emulator with ELF support](https://hackmd.io/@sysprog/rJAufgHYS) ## Question Selection ### Question I chose the question by student [KuanYuan053](https://github.com/KuanYuan0530/Computer_Architecture_2023/): ["Implement quantization from bfloat16 to int8"](https://hackmd.io/@K1NCVjKnTCmNaikFb4gt-A/B1Fj-TGWp) :::spoiler C code Since student KuanYuan053 has already optimized the C code significantly, I won't waste time modifying it. I'll proceed directly to the RISC-V part. ```c #include <stdio.h> #include <stdlib.h> #include<math.h> # define array_size 7 # define range 127 /*2^(n-1)-1, n: quant bit*/ float fp32_to_bf16(float x); int* quant_bf16_to_int8(float x[]); float bf16_findmax(float x[]); int main() { float array[array_size] = {1.200000, 1.203125, 2.310000, 2.312500, 3.460000, 3.4531255, 5.630000}; float array2[array_size] = { 0.1, 0.2, 1.2, 3, 2.1, -4.2, 3.5}; float array3[array_size] = { 3.14159265, 0.12345678 , 1.23456789 , 0.00000123, 0.00000001, 0.99999999 , 0.00000007 }; float array_bf16[array_size] = {}; int *after_quant; /*data 1*/ for (int i = 0; i < 7; i++) { array_bf16[i] = fp32_to_bf16(array[i]); } printf("data 1\nbfloat16 number is \n"); for (int i = 0; i < array_size; i++) { printf("%.12f\n", array_bf16[i]); } after_quant = quant_bf16_to_int8(array_bf16); printf("after quantization \n"); for (int i = 0; i < array_size; i++) { printf("%d\n", after_quant[i]); } /*data 2*/ for (int i = 0; i < 7; i++) { array_bf16[i] = fp32_to_bf16(array2[i]); } printf("data 2\nbfloat16 number is \n"); for (int i = 0; i < array_size; i++) { printf("%.12f\n", array_bf16[i]); } after_quant = quant_bf16_to_int8(array_bf16); printf("after quantization \n"); for (int i = 0; i < array_size; i++) { printf("%d\n", after_quant[i]); } /*data 3*/ for (int i = 0; i < 7; i++) { array_bf16[i] = fp32_to_bf16(array3[i]); } 
printf("data 3\nbfloat16 number is \n"); for (int i = 0; i < array_size; i++) { printf("%.12f\n", array_bf16[i]); } after_quant = quant_bf16_to_int8(array_bf16); printf("after quantization \n"); for (int i = 0; i < array_size; i++) { printf("%d\n", after_quant[i]); } system("pause"); return 0; } float fp32_to_bf16(float x) { float y = x; int *p = (int *)&y; unsigned int exp = *p & 0x7F800000; unsigned int man = *p & 0x007FFFFF; if (exp == 0 && man == 0) /* zero */ return x; if (exp == 0x7F800000 /* Fill this! */) /* infinity or NaN */ return x; /* Normalized number */ /* round to nearest */ float r = x; int *pr = (int *)&r; *pr &= 0xFF800000; /* r has the same exp as x */ r /= 0x100 /* Fill this! */; y = x + r; *p &= 0xFFFF0000; return y; } int* quant_bf16_to_int8(float x[array_size]) { static int after_quant[array_size] = {}; float max = fabs(x[0]); for (int i = 1; i < array_size; i++) { if (fabs(x[i]) > max) { max = fabs(x[i]); } } printf("maximum number is %.12f\n", max); float scale = range / max; for (int i = 0; i < array_size; i++) { after_quant[i] = (x[i] * scale); } return after_quant; } ``` ::: :::spoiler add ticks.c ```c #include <stdio.h> #include <stdlib.h> #include<math.h> #include <inttypes.h> # define array_size 7 # define range 127 /*2^(n-1)-1, n: quant bit*/ float fp32_to_bf16(float x); int* quant_bf16_to_int8(float x[]); float bf16_findmax(float x[]); typedef uint64_t ticks; static inline ticks getticks(void) { uint64_t result; uint32_t l, h, h2; asm volatile( "rdcycleh %0\n" "rdcycle %1\n" "rdcycleh %2\n" "sub %0, %0, %2\n" "seqz %0, %0\n" "sub %0, zero, %0\n" "and %1, %1, %0\n" : "=r"(h), "=r"(l), "=r"(h2)); result = (((uint64_t) h) << 32) | ((uint64_t) l); return result; } int main() { ticks t0 = getticks(); float array[array_size] = {1.200000, 1.203125, 2.310000, 2.312500, 3.460000, 3.4531255, 5.630000}; float array2[array_size] = { 0.1, 0.2, 1.2, 3, 2.1, -4.2, 3.5}; float array3[array_size] = { 3.14159265, 0.12345678 , 1.23456789 , 
0.00000123, 0.00000001, 0.99999999 , 0.00000007 }; float array_bf16[array_size] = {}; int *after_quant; /*data 1*/ for (int i = 0; i < 7; i++) { array_bf16[i] = fp32_to_bf16(array[i]); } printf("data 1\nbfloat16 number is \n"); for (int i = 0; i < array_size; i++) { printf("%.12f\n", array_bf16[i]); } after_quant = quant_bf16_to_int8(array_bf16); printf("after quantization \n"); for (int i = 0; i < array_size; i++) { printf("%d\n", after_quant[i]); } /*data 2*/ for (int i = 0; i < 7; i++) { array_bf16[i] = fp32_to_bf16(array2[i]); } printf("data 2\nbfloat16 number is \n"); for (int i = 0; i < array_size; i++) { printf("%.12f\n", array_bf16[i]); } after_quant = quant_bf16_to_int8(array_bf16); printf("after quantization \n"); for (int i = 0; i < array_size; i++) { printf("%d\n", after_quant[i]); } /*data 3*/ for (int i = 0; i < 7; i++) { array_bf16[i] = fp32_to_bf16(array3[i]); } printf("data 3\nbfloat16 number is \n"); for (int i = 0; i < array_size; i++) { printf("%.12f\n", array_bf16[i]); } after_quant = quant_bf16_to_int8(array_bf16); printf("after quantization \n"); for (int i = 0; i < array_size; i++) { printf("%d\n", after_quant[i]); } ticks t1 = getticks(); printf("elapsed cycle: %" PRIu64 "\n", t1 - t0); system("pause"); return 0; } float fp32_to_bf16(float x) { float y = x; int *p = (int *)&y; unsigned int exp = *p & 0x7F800000; unsigned int man = *p & 0x007FFFFF; if (exp == 0 && man == 0) /* zero */ return x; if (exp == 0x7F800000 /* Fill this! */) /* infinity or NaN */ return x; /* Normalized number */ /* round to nearest */ float r = x; int *pr = (int *)&r; *pr &= 0xFF800000; /* r has the same exp as x */ r /= 0x100 /* Fill this! 
*/; y = x + r; *p &= 0xFFFF0000; return y; } int* quant_bf16_to_int8(float x[array_size]) { static int after_quant[array_size] = {}; float max = fabs(x[0]); for (int i = 1; i < array_size; i++) { if (fabs(x[i]) > max) { max = fabs(x[i]); } } printf("maximum number is %.12f\n", max); float scale = range / max; for (int i = 0; i < array_size; i++) { after_quant[i] = (x[i] * scale); } return after_quant; } ``` ::: :::spoiler original assembly ```c .data array: .word 0x3f99999a, 0x3f9a0000, 0x4013d70a, 0x40140000, 0x405d70a4, 0x405d0000, 0x40b428f6 # test data1: 1.200000, 1.203125, 2.310000, 2.312500, 3.460000, 3.4531255, 5.630000 array2: .word 0x3dcccccd, 0x3e4ccccd, 0x3f99999a, 0x40400000, 0x40066666, 0xc0866666, 0x40600000 # test data2: 0.1, 0.2, 1.2, 3, 2.1, -4.2, 3.5 array3: .word 0x40490fdb, 0x3dfcd6e9, 0x3f9e0652, 0x35a5167a, 0x322bcc77, 0x3f800000, 0x339652e8 # test data3: 3.14159265, 0.12345678 , 1.23456789 , 0.00000123, 0.00000001, 0.99999999 , 0.00000007 array_bf16: .word 0, 0, 0, 0, 0, 0, 0 exp_mask: .word 0x7F800000 man_mask: .word 0x007FFFFF sign_exp_mask: .word 0xFF800000 bf16_mask: .word 0xFFFF0000 next_line: .string "\n" max_string: .string "maximum number is " bf16_string: .string "\nbfloat16 number is \n" .text main: # push data addi sp, sp, -12 la t0, array sw t0, 0(sp) la t0, array2 sw t0, 4(sp) la t0, array3 sw t0, 8(sp) la s10, array_bf16 # global array_bf16 address(s10) addi s11, x0, 3 # data number(s11) la s9, exp_mask # global exp(s9) la s8, man_mask # global man(s8) la s6, bf16_mask # global bf16(s6) lw s9, 0(s9) lw s8, 0(s8) lw s6, 0(s6) add s7, x0, sp main_for: la a0, bf16_string addi a7, x0, 4 ecall addi a3, x0, 7 # array size(a3) lw a1, 0(s7) # array_data pointer(a1) mv a2, s10 # array_bf16 pointer(a2) jal ra, fp32_to_bf16_findmax addi s11, s11, -1 addi s7, s7, 4 bne s11, x0, main_for # Exit program li a7, 10 ecall fp32_to_bf16_findmax: # array_data pointer(a1), array_bf16 pointer(a2), array size(a3) # prologue addi sp, sp, -8 sw s0, 
0(sp) sw s1, 4(sp) # array loop for1: lw a5, 0(a1) # x(a5) # fp32_to_bf16 and t0, a5, s9 # x exp(t0) and t1, a5, s8 # x man(t1) # if zero bne t0, x0, else # exp is zero bne t1, x0, else j finish_bf16 else: # if infinity or NaN beq t0, s9, finish_bf16 # round # r = x.man shift right 8 bit # x+r = x.man + x.man>>8 li t3, 0x00800000 # make up 1 to No.24bit or t1, t1, t3 srli t2, t1, 8 # r(t2) add t1, t1, t2 # x+r # check carry and t4, t1, t3 # check No.24bit (t4), 0:carry, 1: nocarry bne t4, x0, no_carry add t0, t0, t3 # exp+1 srli t1 ,t1, 1 # man alignment no_carry: and t0, t0, s9 # mask exp(t0) and t1, t1, s8 # mask man(t1) or t2, t0, t1 # combine exp & man li t3, 0x80000000 # sign mask and t3, a5, t3 # x sign or a5, t3, t2 # bfloat16(a5) and a5, a5, s6 finish_bf16: sw a5, 0(a2) mv a0, a5 addi a7, x0, 34 ecall la a0, next_line addi a7, x0, 4 ecall slti t3, a3, 7 # (a3==7) t3=0, (a3<7) t3=1 bne t3, x0, compare # saved first max j max_change compare: # compare exp blt s0, t0, max_change blt t0, s0, max_not_change # compare man blt s1, t1, max_change blt t1, s1, max_not_change max_change: mv s0, t0 # max exp(s0) mv s1, t1 # max man(s1) mv a4, a5 # max bf16(a4) max_not_change: addi a3, a3, -1 addi a1, a1, 4 addi a2, a2, 4 bne a3, x0, for1 # Absolute li t2, 0x7fffffff and a4, a4, t2 #print la a0, max_string addi a7, x0, 4 ecall mv a0, a4 addi a7, x0, 34 ecall # epilogue lw s0, 0(sp) lw s1, 4(sp) addi sp, sp, 8 jr ra ``` ::: ### Motivation I think student KuanYuan has optimized the code to a very concise level. Therefore, I intend to rewrite the code and successfully implement the conversion from bf16 to int8 in the RISC-V architecture, a part that KuanYuan wasn't able to accomplish. Also, because I chose a relatively simpler topic for the lab1 assignment, I am keen to utilize this opportunity in lab2 to thoroughly learn RISC-V. 
## Improve Student [KuanYuan0530](https://hackmd.io/@K1NCVjKnTCmNaikFb4gt-A/B1Fj-TGWp) has optimized the original assembly code to be highly concise and successfully executed it on the rv32emu emulator, showcasing remarkable proficiency. As [KuanYuan0530](https://hackmd.io/@K1NCVjKnTCmNaikFb4gt-A/B1Fj-TGWp) has only completed the part of finding the maximum value within the bf16 array for quantization, I will attempt to solve the remaining portion. My goal is to successfully convert bf16 data into int8 format. Through this process, I aim not only to complete the task but also to learn from his design logic. I believe this effort will contribute significantly to my progress in this field. :point_right:**Implemented specifically** Initially, student KuanYuan completed the quantization process only up to converting from fp32 to bf16 and finding the maximum absolute value of the bf16 array. The unresolved issue pertains to dividing bf16 values by the scale. I used a somewhat rough method to find the scale, as I was unable to implement bf16 division within the assignment deadline. The approach I used to calculate the scale significantly reduced the overall quantization accuracy, leading to differences between the final result and the actual answer. Finally, for the multiplication of bf16 values, I referenced [Brian's](https://github.com/BrianCheng-TheLegend) bf16 multiplier, resulting in a quantization method with slightly reduced precision. :::warning You have to describe how you have improved upon the first implementation. :notes: jserv ::: :::info I have added a description in this regard. ::: ### Try manual optimization (fp32_to_bf16 & find maximum absolute value) 1. Use `li` to replace `la` and `lw`. 2. Remove unnecessary stack usage. 3. Avoid control hazards. 4. Compare the entire bf16 value at once, instead of comparing `exp` and `man` separately, to find the maximum. 
```diff .data array: .word 0x3f99999a, 0x3f9a0000, 0x4013d70a, 0x40140000, 0x405d70a4, 0x405d0000, 0x40b428f6 # test data1: 1.200000, 1.203125, 2.310000, 2.312500, 3.460000, 3.4531255, 5.630000 array2: .word 0x3dcccccd, 0x3e4ccccd, 0x3f99999a, 0x40400000, 0x40066666, 0xc0866666, 0x40600000 # test data2: 0.1, 0.2, 1.2, 3, 2.1, -4.2, 3.5 array3: .word 0x40490fdb, 0x3dfcd6e9, 0x3f9e0652, 0x35a5167a, 0x322bcc77, 0x3f800000, 0x339652e8 # test data3: 3.14159265, 0.12345678 , 1.23456789 , 0.00000123, 0.00000001, 0.99999999 , 0.00000007 array_bf16: .word 0, 0, 0, 0, 0, 0, 0 exp_mask: .word 0x7F800000 man_mask: .word 0x007FFFFF sign_exp_mask: .word 0xFF800000 bf16_mask: .word 0xFFFF0000 next_line: .string "\n" max_string: .string "maximum number is " bf16_string: .string "\nbfloat16 number is \n" .text main: # push data addi sp, sp, -12 la t0, array sw t0, 0(sp) la t0, array2 sw t0, 4(sp) la t0, array3 sw t0, 8(sp) la s10, array_bf16 # global array_bf16 address(s10) addi s11, x0, 3 # data number(s11) -> three groups data - la s9, exp_mask # global exp(s9) - la s8, man_mask # global man(s8) - la s6, bf16_mask # global bf16(s6) - lw s9, 0(s9) - lw s8, 0(s8) - lw s6, 0(s6) - add s7, x0, sp + li t5, 0x7F800000 #exp_mask + li t6, 0x007FFFFF #man_mask + li s6, 0xFFFF0000 #bf16_mask + li s7, 0x7FFFFFFF #abs_mask main_for: la a0, bf16_string addi a7, x0, 4 ecall addi a3, x0, 7 # array size(a3) - lw a1, 0(s7) # array_data pointer(a1) + lw a1, 0(sp) # array_data pointer(a1) mv a2, s10 # array_bf16 pointer(a2) - jal ra, fp32_to_bf16_findmax - addi s11, s11, -1 - addi s7, s7, 4 - bne s11, x0, main_for - # Exit program - li a7, 10 - ecall fp32_to_bf16_findmax: # array_data pointer(a1), array_bf16 pointer(a2), array size(a3) - # prologue - addi sp, sp, -8 - sw s0, 0(sp) - sw s1, 4(sp) # array loop for1: lw a5, 0(a1) # x(a5) # fp32_to_bf16 - and t0, a5, s9 # x exp(t0) - and t1, a5, s8 # x man(t1) + and t0, a5, t5 # x exp(t0) + and t1, a5, t6 # x man(t1) # if zero bne t0, x0, else # exp is 
zero bne t1, x0, else j finish_bf16 else: # if infinity or NaN - beq t0, s9, finish_bf16 + beq t0, t5, finish_bf16 # round # r = x.man shift right 8 bit # x+r = x.man + x.man>>8 li t3, 0x00800000 # make up 1 to No.24bit or t1, t1, t3 srli t2, t1, 8 # r(t2) add t1, t1, t2 # x+r # check carry and t4, t1, t3 # check No.24bit (t4), 0:carry, 1: nocarry bne t4, x0, no_carry add t0, t0, t3 # exp+1 srli t1 ,t1, 1 # man alignment no_carry: - and t0, t0, s9 # mask exp(t0) - and t1, t1, s8 # mask man(t1) + and t0, t0, t5 # mask exp(t0) + and t1, t1, t6 # mask man(t1) or t2, t0, t1 # combine exp & man li t3, 0x80000000 # sign mask and t3, a5, t3 # x sign or a5, t3, t2 # bfloat16(a5) - and a5, a5, s6 + and a5, a5, s6 #s6 -> bf16_mask finish_bf16: sw a5, 0(a2) mv a0, a5 addi a7, x0, 34 ecall la a0, next_line addi a7, x0, 4 ecall slti t3, a3, 7 # (a3==7) t3=0, (a3<7) t3=1 + and s8, a5, s7 # abs bf16 -> s8 bne t3, x0, compare # saved first max j max_change compare: - # compare exp - blt s0, t0, max_change - blt t0, s0, max_not_change - # compare man - blt s1, t1, max_change - blt t1, s1, max_not_change + blt s8, s0, max_not_change max_change: - mv s0, t0 # max exp(s0) - mv s1, t1 # max man(s1) + mv s0, s8 # max bf16(s0) mv a4, a5 # max bf16(a4) max_not_change: addi a3, a3, -1 addi a1, a1, 4 addi a2, a2, 4 bne a3, x0, for1 # Absolute - li t2, 0x7fffffff - and a4, a4, t2 + and a4, a4, s7 #print la a0, max_string addi a7, x0, 4 ecall mv a0, a4 addi a7, x0, 34 ecall # epilogue - lw s0, 0(sp) - lw s1, 4(sp) - addi sp, sp, 8 - jr ra + and s0, x0, s0 + and s1, x0, s1 + addi s11, s11, -1 + addi sp, sp, 4 + and s8, x0, s8 + bne s11, x0, main_for +Exit: + li a7, 10 + ecall ``` :::warning You shall use RDCYCLE/RDCYCLEH instruction for the statistics of your program’s execution. :notes: jserv ::: :::success I have added this part to the [conclusion](https://hackmd.io/3oeAp56nT3uVBbEyTxyQnQ?view#Conclusion) section. 
::: ### Add bf16 to int8 conversion :::spoiler **scale code** ```c .data maxbf16: .word 0x40b40000 .text main: lw a4, maxbf16 li t6, 0x007FFFFF quant_bf16_to_int8: li s2, 0x7F #127 to hex and t0, a4, t6 #max_man->t0 maxbf16->a4 srli t0, t0, 15 srli t1, a4, 23 #max_exp addi t1, t1, -127 #Denominator-> power of exp <- t1 li t4, 7 sub t3, t4, t1 srl t0, t0, t3 li t5, 1 sll t5, t5, t1 #1<<t1 or t0, t5, t5 #10^(t1) + fraction li s3, 0 scale: add s4, s4, t0 addi s3, s3, 1 #count scale bge s2, s4, scale exit: mv a0, s3 li a7, 1 ecall ``` ::: In quantization, the scale divides the floating-point range into 127 equal parts, making the maximum value 127, as we previously determined. However, due to my inability to successfully implement floating-point division, I used a somewhat crude method that only yields integers. Consequently, the INT8 values obtained using this method have significant deviations from the theoretical values. Until I can successfully implement floating-point division, this approach serves as a substitute. :::spoiler **Converting INT to bf16.** ```c int_to_floatpoint: addi sp, sp, -16 sw s2, 0(sp) sw s3, 4(sp) sw s4, 8(sp) sw s5, 12(sp) li t0, 0 mv s2, a6 loop2: srli a6, a6, 1 addi t0, t0, 1 blt x0, a6, loop2 ###end loop2 addi t0, t0, -1 # count shift right num addi s3, t0, 127 # exp_num # Why not +127? Because the shift count is one extra. slli s3, s3, 23 # exp in bf16 -> s3 li t1, 0xFFFFFFFF li t2, 32 sub t3, t2, t0 srl t1, t1, t3 and s4, s2, t1 # frac_num in bf16 li t1, 23 sub t1, t1, t0 # t1=23-(count shift right num) sll s4, s4, t1 # frac in bf16 or s5, s4, s3 # int->bf16 ok mv a6, s5 la a0,next_line li a7,4 ecall mv a0, a6 li a7, 34 ecall lw s2, 0(sp) lw s3, 4(sp) lw s4, 8(sp) lw s5, 12(sp) addi sp, sp, 16 ``` ::: Since the result obtained from the scale is an integer, it is converted to bf16 for ease of multiplication in the subsequent steps. 
:::spoiler **float point mult code** ```c .data test1: .word 0x42000000 test2: .word 0x40860000 .text Multi_bfloat: # decoder function input is a0 # jal ra,decoder # load a0(two bloat number in one register) to t0 # decoder function output is s5,s6 lw s5,test1 lw s6,test2 add t0,s5,x0 # store s5(bfloat 2) to t0 add t1,s6,x0 # store s6(bfloat 1) to t1 li t6,0x7F800000 # mask 0x7F800000 # get exponent to t2,t3 and t3,t0,t6 # use mask 0x7F800000 to get t0 exponent and t2,t1,t6 # use mask 0x7F800000 to get t1 exponent add t3,t3,t2 # add two exponent to t3 li t6,0x3F800000 # mask 0x3F800000 sub t3,t3,t6 # sub 127 to exponent # get sign xor t2,t0,t1 # get sign and store on t2 srli t2,t2,31 # get rid of useless data slli t2,t2,31 # let sign back to right position # get sign and exponent together or t3,t3,t2 # set the sign and exponent to t0 slli t0,t0,9 srli t0,t0,9 or t0,t3,t0 # get fraction to t2 and t3 li t6,0x7F # mask 0x7F slli t6,t6,16 # shift mask to 0x7F0000 and t2,t0,t6 # use mask 0x7F0000 get fraction and t3,t1,t6 # use mask 0x7F0000 get fraction slli t2,t2,9 # shift left let no leading 0 srli t2,t2,1 # shift right let leading has one 0 li t6,0x80000000 # mask 80000000 or t2,t2,t6 # use mask 0x80000000 to add integer srli t2,t2,1 # shift right to add space for overflow slli t3,t3,8 # shift left let no leading 0 or t3,t3,t6 # use mask 0x80000000 to add integer srli t3,t3,1 # shift right to add space for overflow add s11,x0,x0 # set a counter and 0 addi s10,x0,8 # set a end condition add t1,x0,x0 # reset t1 to 0 and let this register be result li t6,0x80000000 # mask 0x80000000 loop: addi s11,s11,1 # add 1 at counter every loop srli t6,t6,1 # shift right at 1 every loop and t4,t2,t6 # use mask to specified number at that place beq t4,x0,not_add # jump if t4 equal to 0 add t1,t1,t3 # add t3 to t1 not_add: srli t3,t3,1 # shift left 1 bit to t3 bne s11,s10,loop # if the condition not satisfy return to loop # end of loop # check if overflow li t6,0x80000000 and 
t4,t1,t6 # get t1 max bit # if t4 max bit equal to 0 will not overflow beq t4,x0,not_overflow # if overflow slli t1,t1,1 # shift left 1 bits to remove integer li t6,0x800000 # mask 0x800000 add t0,t0,t6 # exponent add 1 if overflow j Mult_end # jump to Mult_end # if not overflow not_overflow: slli t1,t1,2 # shift left 2 bits to remove integer Mult_end: srli t1,t1,24 # shift right to remove useless bits addi t1,t1,1 # add 1 little bit to check if carry srli t1,t1,1 # shift right to remove useless bits slli t1,t1,16 # shift left to let fraction be right position srli t0,t0,23 # shift right to remove useless bits slli t0,t0,23 # shift left to let sign and exponent be right position or t0,t0,t1 # combine t0 and t1 together to get bfloat add s3,t0,x0 # store bfloat after multiplication to s3 #ret # return to main ### end of function exit: mv a0,s3 li a7,2 ecall ``` ::: I referenced my classmate [Brian Cheng's](https://github.com/BrianCheng-TheLegend) method for floating-point multiplication, which I applied to the final stage where floating-point values are multiplied by the scale. :::spoiler **Remove the decimal part of the BF16 to make it an integer code** ```c rm_decimal_of_bf16: mv t0, a4 li t3, 0x80000000 and t3, t0, t3 srli t3, t3, 31 # Detecting positive or negative and t0,t0,s7 # absolution srli t0, t0, 23 # exp->s2 addi t0, t0,-127 # power of 2 and t1, t1, t6 srli t1, t1, 16 li t2, 0x80 # 1000 0000 or t1, t1, t2 li t2, 7 sub t2, t2, t0 # how many bits do you right shift srl t1, t1, t2 # ANS li t2, 1 bne t3, t2, printINT8 Add_negative_sign: add t2, t1, t1 sub t1, t1, t2 printINT8: la a0,next_line li a7, 4 ecall mv a0, t1 li a7, 1 ecall # next data addi s5, s5, 4 addi a3, a3, -1 bne a3, x0, for2 ### end of function ``` ::: Since the result obtained from the bf16 multiplier is also in bf16 format, I remove the decimal part. In the final stage, I check if the sign bit is 1. If it is 1, I add the negative sign. 
### combine ```c .data array: .word 0x3f99999a, 0x3f9a0000, 0x4013d70a, 0x40140000, 0x405d70a4, 0x405d0000, 0x40b428f6 # test data1: 1.200000, 1.203125, 2.310000, 2.312500, 3.460000, 3.4531255, 5.630000 array2: .word 0x3dcccccd, 0x3e4ccccd, 0x3f99999a, 0x40400000, 0x40066666, 0xc0866666, 0x40600000 # test data2: 0.1, 0.2, 1.2, 3, 2.1, -4.2, 3.5 array3: .word 0x40490fdb, 0x3dfcd6e9, 0x3f9e0652, 0x35a5167a, 0x322bcc77, 0x3f800000, 0x339652e8 # test data3: 3.14159265, 0.12345678 , 1.23456789 , 0.00000123, 0.00000001, 0.99999999 , 0.00000007 array_bf16: .word 0, 0, 0, 0, 0, 0, 0 next_line: .string "\n" max_string: .string "maximum number is " bf16_string: .string "\nbfloat16 number is \n" scale_num: .string "\nscale is " transform_to_bf16_is: .string "\ntransform to bf16 is:" .text main: # push data addi sp, sp, -12 la t0, array sw t0, 0(sp) la t0, array2 sw t0, 4(sp) la t0, array3 sw t0, 8(sp) la s10, array_bf16 # global array_bf16 address(s10) addi s11, x0, 3 # data number(s11) -> three groups data li t5, 0x7F800000 # exp_mask li t6, 0x007FFFFF # man_mask li s6, 0xFFFF0000 # bf16_mask li s7, 0x7FFFFFFF # abs_mask main_for: la a0, bf16_string #call bfloat16 number is addi a7, x0, 4 ecall addi a3, x0, 7 # array size(a3) lw a1, 0(sp) # array_data pointer(a1) mv a2, s10 # array_bf16 pointer(a2) fp32_to_bf16_findmax: # array_data pointer(a1), array_bf16 pointer(a2), array size(a3) # array loop for1: lw a5, 0(a1) # x(a5) # fp32_to_bf16 and t0, a5, t5 # x exp(t0) and t1, a5, t6 # x man(t1) # if zero bne t0, x0, else # exp is zero bne t1, x0, else j finish_bf16 else: # if infinity or NaN beq t0, t5, finish_bf16 # round # r = x.man shift right 8 bit # x+r = x.man + x.man>>8 li t3, 0x00800000 # make up 1 to No.24bit or t1, t1, t3 srli t2, t1, 8 # r(t2) add t1, t1, t2 # x+r # check carry and t4, t1, t3 # check No.24bit (t4), 0:carry, 1: nocarry bne t4, x0, no_carry add t0, t0, t3 # exp+1 srli t1 ,t1, 1 # man alignment no_carry: and t0, t0, t5 # mask exp(t0) and t1, t1, t6 # 
mask man(t1) or t2, t0, t1 # combine exp & man li t3, 0x80000000 # sign mask and t3, a5, t3 # x sign or a5, t3, t2 # bfloat16(a5) and a5, a5, s6 #s6 -> bf16_mask finish_bf16: sw a5, 0(a2) mv a0, a5 addi a7, x0, 34 ecall la a0, next_line addi a7, x0, 4 ecall slti t3, a3, 7 # (a3==7) t3=0, (a3<7) t3=1 and s8, a5, s7 # abs bf16 -> s8 bne t3, x0, compare # saved first max j max_change compare: blt s8, s0, max_not_change max_change: mv s0, s8 # max bf16(s0) mv a4, a5 # max bf16(a4) max_not_change: addi a3, a3, -1 addi a1, a1, 4 addi a2, a2, 4 bne a3, x0, for1 # Absolute and a4, a4, s7 # s7=>0x7FFFFFFF abs_mask #print la a0, max_string # call maximum number is addi a7, x0, 4 ecall mv a0, a4 addi a7, x0, 34 ecall and s0, x0, s0 and s1, x0, s1 #scale_function scale: addi sp, sp, -16 sw s2, 0(sp) sw s3, 4(sp) sw s4, 8(sp) sw s5, 12(sp) li s2, 0x7F # 127 to hex li s3, 1 # add to fraction head (1.fraction) and t0, a4, t6 # max_man->t0 maxbf16->a4 man_mask=0x007FFFFF->t6 srli t0, t0, 16 # bf16_man t0=t0>>15 srli t1, a4, 23 # max_exp addi t1, t1, -127 # Denominator-> power of 2 <- t1 li t4, 7 # man has 7bits sub t3, t4, t1 srl t0, t0, t3 # mean t0 >> (7-(power of 2)) sll s3, s3, t1 # s3=(1<<t1) or t0, s3, t0 # 10^(t1) + fraction li a6, 0 scale_loop: add s4, s4, t0 addi a6, a6, 1 # count scale bge s2, s4, scale_loop lw s2, 0(sp) lw s3, 4(sp) lw s4, 8(sp) lw s5, 12(sp) addi sp, sp, 16 la a0,scale_num li a7,4 ecall mv a0, a6 li a7, 1 ecall int_to_fp: addi sp, sp, -16 sw s2, 0(sp) sw s3, 4(sp) sw s4, 8(sp) sw s5, 12(sp) li t0, 0 mv s2, a6 loop2: srli a6, a6, 1 addi t0, t0, 1 blt x0, a6, loop2 ###end loop2 addi t0, t0, -1 # count shift right num addi s3, t0, 127 # exp_num # Why not +127? Because the shift count is one extra. 
slli s3, s3, 23 # exp in bf16 -> s3 li t1, 0xFFFFFFFF li t2, 32 sub t3, t2, t0 srl t1, t1, t3 and s4, s2, t1 # frac_num in bf16 li t1, 23 sub t1, t1, t0 # t1=23-(count shift right num) sll s4, s4, t1 # frac in bf16 or s5, s4, s3 # int->bf16 ok mv a6, s5 la a0,transform_to_bf16_is li a7,4 ecall mv a0, a6 li a7, 34 ecall lw s2, 0(sp) lw s3, 4(sp) lw s4, 8(sp) lw s5, 12(sp) addi sp, sp, 16 Multi_bfloat: # decoder function input is a0 # jal ra,decoder # load a0(two bloat number in one register) to t0 # decoder function output is s5,s6 addi sp, sp, -16 sw s2, 0(sp) sw s3, 4(sp) sw s4, 8(sp) sw s5, 12(sp) mv s5, s10 addi a3, x0, 7 # array size -> 7 for2: lw a4, 0(s5) add t0,a6,x0 # store s5(bfloat 2) to t0 add t1,a4,x0 # store s6(bfloat 1) to t1 li s2,0x7F800000 # mask 0x7F800000 # get exponent to t2,t3 and t3,t0,s2 # use mask 0x7F800000 to get t0 exponent and t2,t1,s2 # use mask 0x7F800000 to get t1 exponent add t3,t3,t2 # add two exponent to t3 li s2,0x3F800000 # mask 0x3F800000 sub t3,t3,s2 # sub 127 to exponent # get sign xor t2,t0,t1 # get sign and store on t2 srli t2,t2,31 # get rid of useless data slli t2,t2,31 # let sign back to right position # get sign and exponent together or t3,t3,t2 # set the sign and exponent to t0 slli t0,t0,9 srli t0,t0,9 or t0,t3,t0 # get fraction to t2 and t3 li s2,0x7F # mask 0x7F slli s2,s2,16 # shift mask to 0x7F0000 and t2,t0,s2 # use mask 0x7F0000 get fraction and t3,t1,s2 # use mask 0x7F0000 get fraction slli t2,t2,9 # shift left let no leading 0 srli t2,t2,1 # shift right let leading has one 0 li s2,0x80000000 # mask 0x80000000 or t2,t2,s2 # use mask 0x80000000 to add integer srli t2,t2,1 # shift right to add space for overflow slli t3,t3,8 # shift left let no leading 0 or t3,t3,s2 # use mask 0x80000000 to add integer srli t3,t3,1 # shift right to add space for overflow add s3,x0,x0 # set a counter and 0 addi s4,x0,8 # set a end condition add t1,x0,x0 # reset t1 to 0 and let this register be result li s2,0x80000000 # mask 
0x80000000 loop: addi s3,s3,1 # add 1 at counter every loop srli s2,s2,1 # shift right at 1 every loop and t4,t2,s2 # use mask to specified number at that place beq t4,x0,not_add # jump if t4 equal to 0 add t1,t1,t3 # add t3 to t1 not_add: srli t3,t3,1 # shift left 1 bit to t3 bne s3,s4,loop # if the condition not satisfy return to loop # end of loop # check if overflow li s2,0x80000000 and t4,t1,s2 # get t1 max bit # if t4 max bit equal to 0 will not overflow beq t4,x0,not_overflow # if overflow slli t1,t1,1 # shift left 1 bits to remove integer li s2,0x800000 # mask 0x800000 add t0,t0,s2 # exponent add 1 if overflow j Mult_end # jump to Mult_end # if not overflow not_overflow: slli t1,t1,2 # shift left 2 bits to remove integer Mult_end: srli t1,t1,24 # shift right to remove useless bits addi t1,t1,1 # add 1 little bit to check if carry srli t1,t1,1 # shift right to remove useless bits slli t1,t1,16 # shift left to let fraction be right position srli t0,t0,23 # shift right to remove useless bits slli t0,t0,23 # shift left to let sign and exponent be right position or t0,t0,t1 # combine t0 and t1 together to get bfloat add a4,t0,x0 # store bfloat after multiplication to s3 ### end of function #Remove the decimal part of the BF16 to make it an integer. 
rm_decimal_of_bf16: mv t0, a4 li t3, 0x80000000 and t3, t0, t3 srli t3, t3, 31 # Detecting positive or negative and t0,t0,s7 # absolution srli t0, t0, 23 # exp->s2 addi t0, t0,-127 # power of 2 and t1, t1, t6 srli t1, t1, 16 li t2, 0x80 # 1000 0000 or t1, t1, t2 li t2, 7 sub t2, t2, t0 # how many bits do you right shift srl t1, t1, t2 # ANS li t2, 1 bne t3, t2, printINT8 Add_negative_sign: add t2, t1, t1 sub t1, t1, t2 printINT8: la a0,next_line li a7, 4 ecall mv a0, t1 li a7, 1 ecall # next data addi s5, s5, 4 addi a3, a3, -1 bne a3, x0, for2 ### end of function lw s2, 0(sp) lw s3, 4(sp) lw s4, 8(sp) lw s5, 12(sp) addi sp, sp, 16 next_array: addi s11, s11, -1 addi sp, sp, 4 and s8, x0, s8 bne s11, x0, main_for Exit: li a7, 10 ecall ``` #### Result data1 (ripes) ![](https://hackmd.io/_uploads/rkecRs9fa.png) data2 (ripes) ![](https://hackmd.io/_uploads/r1yxJ3qfa.png) data3 (ripes) ![](https://hackmd.io/_uploads/Sybzy39M6.png) ### Room for improvement 1. Attempting the implementation of a bf16 division unit and successfully running it to calculate the scale. 2. Optimizing my own assembly code to reduce memory usage. ## Optimized by riscv-none-elf-gcc Using six different optimization levels: `-O0` `-O1` `-O2` `-O3` `-Os` `-Ofast`, and conducting further analysis. 
**Reference** > [3.10 Options That Control Optimization](https://gcc.gnu.org/onlinedocs/gcc-3.4.6/gcc/Optimize-Options.html#Optimize-Options) > [description in Chinese](https://blog.csdn.net/wuxing26jiayou/article/details/96132721)* * compile ```c riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O0 lab2.c -o lab2.elf riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O1 lab2.c -o lab2.elf riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O2 lab2.c -o lab2.elf riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O3 lab2.c -o lab2.elf riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -Os lab2.c -o lab2.elf ``` * size ```c riscv-none-elf-size lab2.elf ``` * Display the lab2.elf header ```c $ riscv-none-elf-readelf -h lab2.elf ELF Header: Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 Class: ELF32 Data: 2's complement, little endian Version: 1 (current) OS/ABI: UNIX - System V ABI Version: 0 Type: EXEC (Executable file) Machine: RISC-V Version: 0x1 Entry point address: 0x100d8 Start of program headers: 52 (bytes into file) Start of section headers: 99312 (bytes into file) Flags: 0x0 Size of this header: 52 (bytes) Size of program headers: 32 (bytes) Number of program headers: 3 Size of section headers: 40 (bytes) Number of section headers: 15 Section header string table index: 14 ``` * write disassembly code ```c riscv-none-elf-objdump -d lab2.elf > disassembly/lab2.txt ``` * run ```c ./rv32emu lab2.elf ``` :::spoiler result ``` data 1 bfloat16 number is 1.203125000000 1.203125000000 2.312500000000 2.312500000000 3.453125000000 3.453125000000 5.625000000000 maximum number is 5.625000000000 after quantization 27 27 52 52 77 77 127 data 2 bfloat16 number is 0.100097656250 0.200195312500 1.203125000000 3.000000000000 2.093750000000 -4.187500000000 3.500000000000 maximum number is 4.187500000000 after quantization 3 6 36 90 63 -127 106 data 3 bfloat16 number is 3.140625000000 0.123535156250 1.234375000000 0.000001229346 0.000000010012 1.000000000000 0.000000069849 maximum number is 
3.140625000000 after quantization 127 4 49 0 0 40 0 inferior exit code 0 ``` ::: ### RISC-V Instructions/Registers Usage Statistics ```c $ make tool ~/rv32emu/tests/hw2$ ~/rv32emu/build/rv_histogram ./lab2.elf ~/rv32emu/tests/hw2$ ~/rv32emu/build/rv_histogram -r ./lab2.elf ``` Instructions Histogram ![](https://hackmd.io/_uploads/rkIqV0nf6.png) Registers Histogram ![](https://hackmd.io/_uploads/H181BR2z6.png) ### -O0 **Due to the excessively verbose optimized assembly code, I have chosen to analyze the `quant_bf16_to_int8` function, which is a crucial part of the code.** * Compile ```c $ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O0 lab2.c -o lab2.elf ``` * Size ```c $ riscv-none-elf-size lab2.elf ``` ```c ~/rv32emu/tests/hw2$ riscv-none-elf-size lab2.elf text data bss dec hex filename 81736 2320 1556 85612 14e6c lab2.elf ``` * ELF header ```c ~/rv32emu/tests/hw2$ riscv-none-elf-readelf -h lab2.elf ELF Header: Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 Class: ELF32 Data: 2's complement, little endian Version: 1 (current) OS/ABI: UNIX - System V ABI Version: 0 Type: EXEC (Executable file) Machine: RISC-V Version: 0x1 Entry point address: 0x100d8 Start of program headers: 52 (bytes into file) Start of section headers: 99312 (bytes into file) Flags: 0x0 Size of this header: 52 (bytes) Size of program headers: 32 (bytes) Number of program headers: 3 Size of section headers: 40 (bytes) Number of section headers: 15 Section header string table index: 14 ``` :::spoiler quant_bf16_to_int8 disassembly code ```c 000106e8 <quant_bf16_to_int8>: 106e8: fd010113 add sp,sp,-48 106ec: 02112623 sw ra,44(sp) 106f0: 02812423 sw s0,40(sp) 106f4: 03010413 add s0,sp,48 106f8: fca42e23 sw a0,-36(s0) 106fc: fdc42783 lw a5,-36(s0) 10700: 0007a703 lw a4,0(a5) 10704: 800007b7 lui a5,0x80000 10708: fff78793 add a5,a5,-1 # 7fffffff <__BSS_END__+0x7ffdb0d3> 1070c: 00f777b3 and a5,a4,a5 10710: fef42623 sw a5,-20(s0) 10714: 00100793 li a5,1 10718: fef42423 sw a5,-24(s0) 1071c: 
0680006f j 10784 <quant_bf16_to_int8+0x9c> 10720: fe842783 lw a5,-24(s0) 10724: 00279793 sll a5,a5,0x2 10728: fdc42703 lw a4,-36(s0) 1072c: 00f707b3 add a5,a4,a5 10730: 0007a703 lw a4,0(a5) 10734: 800007b7 lui a5,0x80000 10738: fff78793 add a5,a5,-1 # 7fffffff <__BSS_END__+0x7ffdb0d3> 1073c: 00f777b3 and a5,a4,a5 10740: 00078593 mv a1,a5 10744: fec42503 lw a0,-20(s0) 10748: 249000ef jal 11190 <__lesf2> 1074c: 00050793 mv a5,a0 10750: 0207d463 bgez a5,10778 <quant_bf16_to_int8+0x90> 10754: fe842783 lw a5,-24(s0) 10758: 00279793 sll a5,a5,0x2 1075c: fdc42703 lw a4,-36(s0) 10760: 00f707b3 add a5,a4,a5 10764: 0007a703 lw a4,0(a5) 10768: 800007b7 lui a5,0x80000 1076c: fff78793 add a5,a5,-1 # 7fffffff <__BSS_END__+0x7ffdb0d3> 10770: 00f777b3 and a5,a4,a5 10774: fef42623 sw a5,-20(s0) 10778: fe842783 lw a5,-24(s0) 1077c: 00178793 add a5,a5,1 10780: fef42423 sw a5,-24(s0) 10784: fe842703 lw a4,-24(s0) 10788: 00600793 li a5,6 1078c: f8e7dae3 bge a5,a4,10720 <quant_bf16_to_int8+0x38> 10790: fec42503 lw a0,-20(s0) 10794: 74d000ef jal 116e0 <__extendsfdf2> 10798: 00050713 mv a4,a0 1079c: 00058793 mv a5,a1 107a0: 00070613 mv a2,a4 107a4: 00078693 mv a3,a5 107a8: 000237b7 lui a5,0x23 107ac: 15078513 add a0,a5,336 # 23150 <__trunctfdf2+0x4ac> 107b0: 770010ef jal 11f20 <printf> 107b4: 000237b7 lui a5,0x23 107b8: fec42583 lw a1,-20(s0) 107bc: 1707a503 lw a0,368(a5) # 23170 <__trunctfdf2+0x4cc> 107c0: 560000ef jal 10d20 <__divsf3> 107c4: 00050793 mv a5,a0 107c8: fef42023 sw a5,-32(s0) 107cc: fe042223 sw zero,-28(s0) 107d0: 0540006f j 10824 <quant_bf16_to_int8+0x13c> 107d4: fe442783 lw a5,-28(s0) 107d8: 00279793 sll a5,a5,0x2 107dc: fdc42703 lw a4,-36(s0) 107e0: 00f707b3 add a5,a4,a5 107e4: 0007a783 lw a5,0(a5) 107e8: fe042583 lw a1,-32(s0) 107ec: 00078513 mv a0,a5 107f0: 271000ef jal 11260 <__mulsf3> 107f4: 00050793 mv a5,a0 107f8: 00078513 mv a0,a5 107fc: 671000ef jal 1166c <__fixsfsi> 10800: 00050693 mv a3,a0 10804: fa818713 add a4,gp,-88 # 24978 <after_quant.0> 10808: fe442783 lw 
a5,-28(s0) 1080c: 00279793 sll a5,a5,0x2 10810: 00f707b3 add a5,a4,a5 10814: 00d7a023 sw a3,0(a5) 10818: fe442783 lw a5,-28(s0) 1081c: 00178793 add a5,a5,1 10820: fef42223 sw a5,-28(s0) 10824: fe442703 lw a4,-28(s0) 10828: 00600793 li a5,6 1082c: fae7d4e3 bge a5,a4,107d4 <quant_bf16_to_int8+0xec> 10830: fa818793 add a5,gp,-88 # 24978 <after_quant.0> 10834: 00078513 mv a0,a5 10838: 02c12083 lw ra,44(sp) 1083c: 02812403 lw s0,40(sp) 10840: 03010113 add sp,sp,48 10844: 00008067 ret ``` ::: **Statistics** * line of code : 89 * register * ax : `a0` `a1` `a2` `a3` `a4` `a5` * sx : `s0` * tx : none * branch and jump * jump : 6 * branch : 3 * stack * sp : 48 * lw and sw * lw : 23 * sw : 11 ### -O1 * Compile ```c $ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O1 lab2.c -o lab2_O1.elf ``` * Size ```c $ riscv-none-elf-size lab2_O1.elf ``` ```c ~/rv32emu/tests/hw2$ riscv-none-elf-size lab2_O1.elf text data bss dec hex filename 81016 2328 1556 84900 14ba4 lab2_O1.elf ``` * ELF header ```c ~/rv32emu/tests/hw2$ riscv-none-elf-readelf -h lab2_O1.elf ELF Header: Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 Class: ELF32 Data: 2's complement, little endian Version: 1 (current) OS/ABI: UNIX - System V ABI Version: 0 Type: EXEC (Executable file) Machine: RISC-V Version: 0x1 Entry point address: 0x100d8 Start of program headers: 52 (bytes into file) Start of section headers: 99320 (bytes into file) Flags: 0x0 Size of this header: 52 (bytes) Size of program headers: 32 (bytes) Number of program headers: 3 Size of section headers: 40 (bytes) Number of section headers: 15 Section header string table index: 14 ``` :::spoiler quant_bf16_to_int8 disassembly code ```c 000101d4 <quant_bf16_to_int8>: 101d4: fe010113 add sp,sp,-32 101d8: 00112e23 sw ra,28(sp) 101dc: 00812c23 sw s0,24(sp) 101e0: 00912a23 sw s1,20(sp) 101e4: 01212823 sw s2,16(sp) 101e8: 01312623 sw s3,12(sp) 101ec: 01412423 sw s4,8(sp) 101f0: 01512223 sw s5,4(sp) 101f4: 00052983 lw s3,0(a0) # ff800000
<__BSS_END__+0xff7db0cc> 101f8: 00199993 sll s3,s3,0x1 101fc: 0019d993 srl s3,s3,0x1 10200: 00450493 add s1,a0,4 10204: 00050913 mv s2,a0 10208: 01c50a13 add s4,a0,28 1020c: 80000ab7 lui s5,0x80000 10210: fffa8a93 add s5,s5,-1 # 7fffffff <__BSS_END__+0x7ffdb0cb> 10214: 00c0006f j 10220 <quant_bf16_to_int8+0x4c> 10218: 00448493 add s1,s1,4 1021c: 03448263 beq s1,s4,10240 <quant_bf16_to_int8+0x6c> 10220: 0004a403 lw s0,0(s1) 10224: 01547433 and s0,s0,s5 10228: 00098593 mv a1,s3 1022c: 00040513 mv a0,s0 10230: 4a1000ef jal 10ed0 <__gesf2> 10234: fea052e3 blez a0,10218 <quant_bf16_to_int8+0x44> 10238: 00040993 mv s3,s0 1023c: fddff06f j 10218 <quant_bf16_to_int8+0x44> 10240: 00098513 mv a0,s3 10244: 1d4010ef jal 11418 <__extendsfdf2> 10248: 00050613 mv a2,a0 1024c: 00058693 mv a3,a1 10250: 00023537 lui a0,0x23 10254: db850513 add a0,a0,-584 # 22db8 <__trunctfdf2+0x3dc> 10258: 201010ef jal 11c58 <printf> 1025c: 00098593 mv a1,s3 10260: f341a503 lw a0,-204(gp) # 24904 <__SDATA_BEGIN__+0x6c> 10264: 7fc000ef jal 10a60 <__divsf3> 10268: 00050493 mv s1,a0 1026c: fb018413 add s0,gp,-80 # 24980 <after_quant.0> 10270: 01c40993 add s3,s0,28 10274: 00092583 lw a1,0(s2) 10278: 00048513 mv a0,s1 1027c: 51d000ef jal 10f98 <__mulsf3> 10280: 124010ef jal 113a4 <__fixsfsi> 10284: 00a42023 sw a0,0(s0) 10288: 00490913 add s2,s2,4 1028c: 00440413 add s0,s0,4 10290: ff3412e3 bne s0,s3,10274 <quant_bf16_to_int8+0xa0> 10294: fb018513 add a0,gp,-80 # 24980 <after_quant.0> 10298: 01c12083 lw ra,28(sp) 1029c: 01812403 lw s0,24(sp) 102a0: 01412483 lw s1,20(sp) 102a4: 01012903 lw s2,16(sp) 102a8: 00c12983 lw s3,12(sp) 102ac: 00812a03 lw s4,8(sp) 102b0: 00412a83 lw s5,4(sp) 102b4: 02010113 add sp,sp,32 102b8: 00008067 ret ``` ::: **Statistics** * line of code : 60 * register * ax : `a0` `a1` `a2` `a3` `a4` `a5` `a6` `a8``a9``a10` * sx : `s0` `s1` `s2` `s3` `s4` `s5` * tx : none * branch and jump * jump : 8 * branch : 3 * stack * sp : 32 * lw and sw * lw : 11 * sw : 8 ### -O2 * Compile ```c $ 
riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O2 lab2.c -o lab2_O2.elf ``` * Size ```c $ riscv-none-elf-size lab2_O2.elf ``` ```c ~/rv32emu/tests/hw2$ riscv-none-elf-size lab2_02.elf text data bss dec hex filename 81224 2328 1556 85108 14c74 lab2_O2.elf ``` * ELF header ```c ~/rv32emu/tests/hw2$ riscv-none-elf-readelf -h lab2_O2.elf ELF Header: Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 Class: ELF32 Data: 2's complement, little endian Version: 1 (current) OS/ABI: UNIX - System V ABI Version: 0 Type: EXEC (Executable file) Machine: RISC-V Version: 0x1 Entry point address: 0x10474 Start of program headers: 52 (bytes into file) Start of section headers: 99336 (bytes into file) Flags: 0x0 Size of this header: 52 (bytes) Size of program headers: 32 (bytes) Number of program headers: 3 Size of section headers: 40 (bytes) Number of section headers: 15 Section header string table index: 14 ``` :::spoiler quant_bf16_to_int8 disassembly code ```c 00010574 <quant_bf16_to_int8>: 10574: fe010113 add sp,sp,-32 10578: 01312623 sw s3,12(sp) 1057c: 00052983 lw s3,0(a0) # ff800000 <__BSS_END__+0xff7db0cc> 10580: 01412423 sw s4,8(sp) 10584: 80000a37 lui s4,0x80000 10588: fffa0a13 add s4,s4,-1 # 7fffffff <__BSS_END__+0x7ffdb0cb> 1058c: 00812c23 sw s0,24(sp) 10590: 01212823 sw s2,16(sp) 10594: 01512223 sw s5,4(sp) 10598: 00112e23 sw ra,28(sp) 1059c: 00912a23 sw s1,20(sp) 105a0: 00050913 mv s2,a0 105a4: 0149f9b3 and s3,s3,s4 105a8: 00450413 add s0,a0,4 105ac: 01c50a93 add s5,a0,28 105b0: 00042483 lw s1,0(s0) # ffff0000 <__BSS_END__+0xfffcb0cc> 105b4: 00098513 mv a0,s3 105b8: 00440413 add s0,s0,4 105bc: 0144f4b3 and s1,s1,s4 105c0: 00048593 mv a1,s1 105c4: 1d5000ef jal 10f98 <__lesf2> 105c8: 00055463 bgez a0,105d0 <quant_bf16_to_int8+0x5c> 105cc: 00048993 mv s3,s1 105d0: ff5410e3 bne s0,s5,105b0 <quant_bf16_to_int8+0x3c> 105d4: 00098513 mv a0,s3 105d8: 711000ef jal 114e8 <__extendsfdf2> 105dc: 00050613 mv a2,a0 105e0: 00023537 lui a0,0x23 105e4: 00058693 mv a3,a1 105e8: 
e8850513 add a0,a0,-376 # 22e88 <__trunctfdf2+0x3dc> 105ec: 73c010ef jal 11d28 <printf> 105f0: f341a503 lw a0,-204(gp) # 24904 <__SDATA_BEGIN__+0x6c> 105f4: 00098593 mv a1,s3 105f8: 530000ef jal 10b28 <__divsf3> 105fc: fb018413 add s0,gp,-80 # 24980 <after_quant.0> 10600: 00050493 mv s1,a0 10604: 01c40993 add s3,s0,28 10608: 00092583 lw a1,0(s2) 1060c: 00048513 mv a0,s1 10610: 00440413 add s0,s0,4 10614: 255000ef jal 11068 <__mulsf3> 10618: 65d000ef jal 11474 <__fixsfsi> 1061c: fea42e23 sw a0,-4(s0) 10620: 00490913 add s2,s2,4 10624: ff3412e3 bne s0,s3,10608 <quant_bf16_to_int8+0x94> 10628: 01c12083 lw ra,28(sp) 1062c: 01812403 lw s0,24(sp) 10630: 01412483 lw s1,20(sp) 10634: 01012903 lw s2,16(sp) 10638: 00c12983 lw s3,12(sp) 1063c: 00412a83 lw s5,4(sp) 10640: fb018513 add a0,gp,-80 # 24980 <after_quant.0> 10644: 00812a03 lw s4,8(sp) 10648: 02010113 add sp,sp,32 1064c: 00008067 ret ``` ::: **Statistics** * line of code : 56 * register * ax : `a0` `a1` `a2` `a3` `a4` `a5` `a8` `a9` * sx : `s0` `s1` `s2` `s3` `s4` `s5` * tx : none * branch and jump * jump : 8 * branch : 3 * stack * sp : 32 * lw and sw * lw : 11 * sw : 8 ### -O3 * Compile ```c $ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O3 lab2.c -o lab2_O3.elf ``` * Size ```c $ riscv-none-elf-size lab2_O3.elf ``` ```c ~/rv32emu/tests/hw2$ riscv-none-elf-size lab2_03.elf text data bss dec hex filename 81212 2396 1556 85164 14cac lab2_O3.elf ``` * ELF header ```c ~/rv32emu/tests/hw2$ riscv-none-elf-readelf -h lab2_O3.elf ELF Header: Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 Class: ELF32 Data: 2's complement, little endian Version: 1 (current) OS/ABI: UNIX - System V ABI Version: 0 Type: EXEC (Executable file) Machine: RISC-V Version: 0x1 Entry point address: 0x102fc Start of program headers: 52 (bytes into file) Start of section headers: 99492 (bytes into file) Flags: 0x0 Size of this header: 52 (bytes) Size of program headers: 32 (bytes) Number of program headers: 3 Size of section headers: 40 (bytes) 
Number of section headers: 15 Section header string table index: 14 ``` :::spoiler quant_bf16_to_int8 disassembly code ```c 000103fc <quant_bf16_to_int8>: 103fc: fe010113 add sp,sp,-32 10400: 00912a23 sw s1,20(sp) 10404: 01212823 sw s2,16(sp) 10408: 00052483 lw s1,0(a0) # ff800000 <__BSS_END__+0xff7db088> 1040c: 00452903 lw s2,4(a0) 10410: 800007b7 lui a5,0x80000 10414: fff78793 add a5,a5,-1 # 7fffffff <__BSS_END__+0x7ffdb087> 10418: 00f4f4b3 and s1,s1,a5 1041c: 00f97933 and s2,s2,a5 10420: 00812c23 sw s0,24(sp) 10424: 00090593 mv a1,s2 10428: 00050413 mv s0,a0 1042c: 00048513 mv a0,s1 10430: 00112e23 sw ra,28(sp) 10434: 01312623 sw s3,12(sp) 10438: 3ad000ef jal 10fe4 <__lesf2> 1043c: 00055463 bgez a0,10444 <quant_bf16_to_int8+0x48> 10440: 00090493 mv s1,s2 10444: 00842903 lw s2,8(s0) # ffff0008 <__BSS_END__+0xfffcb090> 10448: 00048513 mv a0,s1 1044c: 00191913 sll s2,s2,0x1 10450: 00195913 srl s2,s2,0x1 10454: 00090593 mv a1,s2 10458: 38d000ef jal 10fe4 <__lesf2> 1045c: 00055463 bgez a0,10464 <quant_bf16_to_int8+0x68> 10460: 00090493 mv s1,s2 10464: 00c42903 lw s2,12(s0) 10468: 00048513 mv a0,s1 1046c: 00191913 sll s2,s2,0x1 10470: 00195913 srl s2,s2,0x1 10474: 00090593 mv a1,s2 10478: 36d000ef jal 10fe4 <__lesf2> 1047c: 00055463 bgez a0,10484 <quant_bf16_to_int8+0x88> 10480: 00090493 mv s1,s2 10484: 01042903 lw s2,16(s0) 10488: 00048513 mv a0,s1 1048c: 00191913 sll s2,s2,0x1 10490: 00195913 srl s2,s2,0x1 10494: 00090593 mv a1,s2 10498: 34d000ef jal 10fe4 <__lesf2> 1049c: 00055463 bgez a0,104a4 <quant_bf16_to_int8+0xa8> 104a0: 00090493 mv s1,s2 104a4: 01442903 lw s2,20(s0) 104a8: 00048513 mv a0,s1 104ac: 00191913 sll s2,s2,0x1 104b0: 00195913 srl s2,s2,0x1 104b4: 00090593 mv a1,s2 104b8: 32d000ef jal 10fe4 <__lesf2> 104bc: 00055463 bgez a0,104c4 <quant_bf16_to_int8+0xc8> 104c0: 00090493 mv s1,s2 104c4: 01842903 lw s2,24(s0) 104c8: 00048593 mv a1,s1 104cc: 00191913 sll s2,s2,0x1 104d0: 00195913 srl s2,s2,0x1 104d4: 00090513 mv a0,s2 104d8: 245000ef jal 10f1c 
<__gesf2> 104dc: 00a05463 blez a0,104e4 <quant_bf16_to_int8+0xe8> 104e0: 00090493 mv s1,s2 104e4: 00048513 mv a0,s1 104e8: 04c010ef jal 11534 <__extendsfdf2> 104ec: 00050613 mv a2,a0 104f0: 00023537 lui a0,0x23 104f4: 00058693 mv a3,a1 104f8: ed850513 add a0,a0,-296 # 22ed8 <__trunctfdf2+0x3e0> 104fc: 079010ef jal 11d74 <printf> 10500: f341a503 lw a0,-204(gp) # 24904 <__SDATA_BEGIN__+0x6c> 10504: 00048593 mv a1,s1 10508: 5a4000ef jal 10aac <__divsf3> 1050c: 00042583 lw a1,0(s0) 10510: 00050493 mv s1,a0 10514: ff418913 add s2,gp,-12 # 249c4 <after_quant.0> 10518: 39d000ef jal 110b4 <__mulsf3> 1051c: 7a5000ef jal 114c0 <__fixsfsi> 10520: 00442583 lw a1,4(s0) 10524: 00050793 mv a5,a0 10528: 00f92023 sw a5,0(s2) 1052c: 00048513 mv a0,s1 10530: 385000ef jal 110b4 <__mulsf3> 10534: 78d000ef jal 114c0 <__fixsfsi> 10538: 00842583 lw a1,8(s0) 1053c: 00050793 mv a5,a0 10540: 00f92223 sw a5,4(s2) 10544: 00048513 mv a0,s1 10548: 36d000ef jal 110b4 <__mulsf3> 1054c: 775000ef jal 114c0 <__fixsfsi> 10550: 00c42583 lw a1,12(s0) 10554: 00050793 mv a5,a0 10558: 00f92423 sw a5,8(s2) 1055c: 00048513 mv a0,s1 10560: 355000ef jal 110b4 <__mulsf3> 10564: 75d000ef jal 114c0 <__fixsfsi> 10568: 01042583 lw a1,16(s0) 1056c: 00050793 mv a5,a0 10570: 00f92623 sw a5,12(s2) 10574: 00048513 mv a0,s1 10578: 33d000ef jal 110b4 <__mulsf3> 1057c: 745000ef jal 114c0 <__fixsfsi> 10580: 01442583 lw a1,20(s0) 10584: 00050793 mv a5,a0 10588: 00f92823 sw a5,16(s2) 1058c: 00048513 mv a0,s1 10590: 325000ef jal 110b4 <__mulsf3> 10594: 72d000ef jal 114c0 <__fixsfsi> 10598: 01842583 lw a1,24(s0) 1059c: 00050793 mv a5,a0 105a0: 00f92a23 sw a5,20(s2) 105a4: 00048513 mv a0,s1 105a8: 30d000ef jal 110b4 <__mulsf3> 105ac: 715000ef jal 114c0 <__fixsfsi> 105b0: 01c12083 lw ra,28(sp) 105b4: 01812403 lw s0,24(sp) 105b8: 00a92c23 sw a0,24(s2) 105bc: 01412483 lw s1,20(sp) 105c0: 01012903 lw s2,16(sp) 105c4: ff418513 add a0,gp,-12 # 249c4 <after_quant.0> 105c8: 00c12983 lw s3,12(sp) 105cc: 02010113 add sp,sp,32 105d0: 
00008067 ret ``` ::: **Statistics** * line of code : 119 * register * ax : `a0` `a1` `a2` `a3` `a4` `a5` `a8` `a9` * sx : `s0` `s1` `s2` `s3` * tx : none * branch and jump * jump : 24 * branch : 6 * stack * sp : 32 * lw and sw * lw : 20 * sw : 12 ### -Os * Compile ```c $ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -Os lab2.c -o lab2_Os.elf ``` * Size ```c $ riscv-none-elf-size lab2_Os.elf ``` ```c ~/rv32emu/tests/hw2$ riscv-none-elf-size lab2_0s.elf text data bss dec hex filename 80900 2328 1556 84784 14b30 lab2_Os.elf ``` * ELF header ```c ~/rv32emu/tests/hw2$ riscv-none-elf-readelf -h lab2_Os.elf ELF Header: Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 Class: ELF32 Data: 2's complement, little endian Version: 1 (current) OS/ABI: UNIX - System V ABI Version: 0 Type: EXEC (Executable file) Machine: RISC-V Version: 0x1 Entry point address: 0x10320 Start of program headers: 52 (bytes into file) Start of section headers: 99336 (bytes into file) Flags: 0x0 Size of this header: 52 (bytes) Size of program headers: 32 (bytes) Number of program headers: 3 Size of section headers: 40 (bytes) Number of section headers: 15 Section header string table index: 14 ``` :::spoiler quant_bf16_to_int8 disassembly code ```c 00010428 <quant_bf16_to_int8>: 10428: fe010113 add sp,sp,-32 1042c: 00912a23 sw s1,20(sp) 10430: 00052483 lw s1,0(a0) 10434: 01312623 sw s3,12(sp) 10438: 800009b7 lui s3,0x80000 1043c: fff98993 add s3,s3,-1 # 7fffffff <__BSS_END__+0x7ffdb0cb> 10440: 00812c23 sw s0,24(sp) 10444: 01212823 sw s2,16(sp) 10448: 01512223 sw s5,4(sp) 1044c: 00112e23 sw ra,28(sp) 10450: 01412423 sw s4,8(sp) 10454: 01612023 sw s6,0(sp) 10458: 00050413 mv s0,a0 1045c: 0134f4b3 and s1,s1,s3 10460: 00450913 add s2,a0,4 10464: 01c50a93 add s5,a0,28 10468: 00092a03 lw s4,0(s2) 1046c: 00048593 mv a1,s1 10470: 013a7a33 and s4,s4,s3 10474: 000a0513 mv a0,s4 10478: 1e5000ef jal 10e5c <__gesf2> 1047c: 00a05463 blez a0,10484 <quant_bf16_to_int8+0x5c> 10480: 000a0493 mv s1,s4 10484: 00490913 
add s2,s2,4 10488: ff5910e3 bne s2,s5,10468 <quant_bf16_to_int8+0x40> 1048c: 00048513 mv a0,s1 10490: 715000ef jal 113a4 <__extendsfdf2> 10494: 00050613 mv a2,a0 10498: 00023537 lui a0,0x23 1049c: 00058693 mv a3,a1 104a0: d4850513 add a0,a0,-696 # 22d48 <__trunctfdf2+0x3e0> 104a4: 740010ef jal 11be4 <printf> 104a8: f341a503 lw a0,-204(gp) # 24904 <__SDATA_BEGIN__+0x6c> 104ac: 00048593 mv a1,s1 104b0: 53c000ef jal 109ec <__divsf3> 104b4: 00050913 mv s2,a0 104b8: 00000493 li s1,0 104bc: fb018b13 add s6,gp,-80 # 24980 <after_quant.0> 104c0: 01c00a13 li s4,28 104c4: 009407b3 add a5,s0,s1 104c8: 0007a583 lw a1,0(a5) # ff800000 <__BSS_END__+0xff7db0cc> 104cc: 00090513 mv a0,s2 104d0: 009b0ab3 add s5,s6,s1 104d4: 251000ef jal 10f24 <__mulsf3> 104d8: 659000ef jal 11330 <__fixsfsi> 104dc: 00aaa023 sw a0,0(s5) 104e0: 00448493 add s1,s1,4 104e4: ff4490e3 bne s1,s4,104c4 <quant_bf16_to_int8+0x9c> 104e8: 01c12083 lw ra,28(sp) 104ec: 01812403 lw s0,24(sp) 104f0: 01412483 lw s1,20(sp) 104f4: 01012903 lw s2,16(sp) 104f8: 00812a03 lw s4,8(sp) 104fc: 00412a83 lw s5,4(sp) 10500: 00012b03 lw s6,0(sp) 10504: fb018513 add a0,gp,-80 # 24980 <after_quant.0> 10508: 00c12983 lw s3,12(sp) 1050c: 02010113 add sp,sp,32 10510: 00008067 ret ``` ::: **Statistics** * line of code : 60 * register * ax : `a0` `a1` `a2` `a3` `a4` `a5` `a7` `a8` `a9` * sx : `s0` `s1` `s2` `s3` `s4` `s5` `s6` * tx : none * branch and jump * jump : 6 * branch : 3 * stack * sp : 32 * lw and sw * lw : 12 * sw : 9 ### -Ofast * Compile ```c $ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -Ofast lab2.c -o lab2_Ofast.elf ``` * Size ```c $ riscv-none-elf-size lab2_Ofast.elf ``` ```c ~/rv32emu/tests/hw2$ riscv-none-elf-size lab2_0fast.elf text data bss dec hex filename 81004 2396 1556 84956 14bdc lab2_Ofast.elf ``` * ELF header ```c ~/rv32emu/tests/hw2$ riscv-none-elf-readelf -h lab2_Ofast.elf ELF Header: Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 Class: ELF32 Data: 2's complement, little endian Version: 1 (current) 
OS/ABI: UNIX - System V ABI Version: 0 Type: EXEC (Executable file) Machine: RISC-V Version: 0x1 Entry point address: 0x102fc Start of program headers: 52 (bytes into file) Start of section headers: 99404 (bytes into file) Flags: 0x0 Size of this header: 52 (bytes) Size of program headers: 32 (bytes) Number of program headers: 3 Size of section headers: 40 (bytes) Number of section headers: 15 Section header string table index: 14 ``` :::spoiler quant_bf16_to_int8 disassembly code ```c 000103fc <quant_bf16_to_int8>: 103fc: fe010113 add sp,sp,-32 10400: 00812c23 sw s0,24(sp) 10404: 01212823 sw s2,16(sp) 10408: 00452403 lw s0,4(a0) # ff800004 <__BSS_END__+0xff7db08c> 1040c: 00852903 lw s2,8(a0) 10410: 800007b7 lui a5,0x80000 10414: fff78793 add a5,a5,-1 # 7fffffff <__BSS_END__+0x7ffdb087> 10418: 00f97933 and s2,s2,a5 1041c: 00f47433 and s0,s0,a5 10420: 00912a23 sw s1,20(sp) 10424: 00090593 mv a1,s2 10428: 00050493 mv s1,a0 1042c: 00040513 mv a0,s0 10430: 00112e23 sw ra,28(sp) 10434: 01312623 sw s3,12(sp) 10438: 2e5000ef jal 10f1c <__gesf2> 1043c: 00055463 bgez a0,10444 <quant_bf16_to_int8+0x48> 10440: 00090413 mv s0,s2 10444: 0004a903 lw s2,0(s1) 10448: 00040513 mv a0,s0 1044c: 00191913 sll s2,s2,0x1 10450: 00195913 srl s2,s2,0x1 10454: 00090593 mv a1,s2 10458: 2c5000ef jal 10f1c <__gesf2> 1045c: 00055463 bgez a0,10464 <quant_bf16_to_int8+0x68> 10460: 00090413 mv s0,s2 10464: 00c4a903 lw s2,12(s1) 10468: 00040513 mv a0,s0 1046c: 00191913 sll s2,s2,0x1 10470: 00195913 srl s2,s2,0x1 10474: 00090593 mv a1,s2 10478: 2a5000ef jal 10f1c <__gesf2> 1047c: 00055463 bgez a0,10484 <quant_bf16_to_int8+0x88> 10480: 00090413 mv s0,s2 10484: 0104a903 lw s2,16(s1) 10488: 00040513 mv a0,s0 1048c: 00191913 sll s2,s2,0x1 10490: 00195913 srl s2,s2,0x1 10494: 00090593 mv a1,s2 10498: 285000ef jal 10f1c <__gesf2> 1049c: 00055463 bgez a0,104a4 <quant_bf16_to_int8+0xa8> 104a0: 00090413 mv s0,s2 104a4: 0144a903 lw s2,20(s1) 104a8: 00040513 mv a0,s0 104ac: 00191913 sll s2,s2,0x1 104b0: 
00195913 srl s2,s2,0x1 104b4: 00090593 mv a1,s2 104b8: 265000ef jal 10f1c <__gesf2> 104bc: 00055463 bgez a0,104c4 <quant_bf16_to_int8+0xc8> 104c0: 00090413 mv s0,s2 104c4: 0184a903 lw s2,24(s1) 104c8: 00040513 mv a0,s0 104cc: 00191913 sll s2,s2,0x1 104d0: 00195913 srl s2,s2,0x1 104d4: 00090593 mv a1,s2 104d8: 245000ef jal 10f1c <__gesf2> 104dc: 00055463 bgez a0,104e4 <quant_bf16_to_int8+0xe8> 104e0: 00090413 mv s0,s2 104e4: 00040513 mv a0,s0 104e8: 77d000ef jal 11464 <__extendsfdf2> 104ec: 00050613 mv a2,a0 104f0: 00023537 lui a0,0x23 104f4: 00058693 mv a3,a1 104f8: e0850513 add a0,a0,-504 # 22e08 <__trunctfdf2+0x3e0> 104fc: 7a8010ef jal 11ca4 <printf> 10500: f341a503 lw a0,-204(gp) # 24904 <__SDATA_BEGIN__+0x6c> 10504: 00040593 mv a1,s0 10508: 5a4000ef jal 10aac <__divsf3> 1050c: 0004a583 lw a1,0(s1) 10510: 00050413 mv s0,a0 10514: ff418913 add s2,gp,-12 # 249c4 <after_quant.0> 10518: 2cd000ef jal 10fe4 <__mulsf3> 1051c: 6d5000ef jal 113f0 <__fixsfsi> 10520: 0044a583 lw a1,4(s1) 10524: 00050793 mv a5,a0 10528: 00f92023 sw a5,0(s2) 1052c: 00040513 mv a0,s0 10530: 2b5000ef jal 10fe4 <__mulsf3> 10534: 6bd000ef jal 113f0 <__fixsfsi> 10538: 0084a583 lw a1,8(s1) 1053c: 00050793 mv a5,a0 10540: 00f92223 sw a5,4(s2) 10544: 00040513 mv a0,s0 10548: 29d000ef jal 10fe4 <__mulsf3> 1054c: 6a5000ef jal 113f0 <__fixsfsi> 10550: 00c4a583 lw a1,12(s1) 10554: 00050793 mv a5,a0 10558: 00f92423 sw a5,8(s2) 1055c: 00040513 mv a0,s0 10560: 285000ef jal 10fe4 <__mulsf3> 10564: 68d000ef jal 113f0 <__fixsfsi> 10568: 0104a583 lw a1,16(s1) 1056c: 00050793 mv a5,a0 10570: 00f92623 sw a5,12(s2) 10574: 00040513 mv a0,s0 10578: 26d000ef jal 10fe4 <__mulsf3> 1057c: 675000ef jal 113f0 <__fixsfsi> 10580: 0144a583 lw a1,20(s1) 10584: 00050793 mv a5,a0 10588: 00f92823 sw a5,16(s2) 1058c: 00040513 mv a0,s0 10590: 255000ef jal 10fe4 <__mulsf3> 10594: 65d000ef jal 113f0 <__fixsfsi> 10598: 0184a583 lw a1,24(s1) 1059c: 00050793 mv a5,a0 105a0: 00f92a23 sw a5,20(s2) 105a4: 00040513 mv a0,s0 105a8: 
23d000ef jal 10fe4 <__mulsf3> 105ac: 645000ef jal 113f0 <__fixsfsi> 105b0: 01c12083 lw ra,28(sp) 105b4: 01812403 lw s0,24(sp) 105b8: 00a92c23 sw a0,24(s2) 105bc: 01412483 lw s1,20(sp) 105c0: 01012903 lw s2,16(sp) 105c4: ff418513 add a0,gp,-12 # 249c4 <after_quant.0> 105c8: 00c12983 lw s3,12(sp) 105cc: 02010113 add sp,sp,32 105d0: 00008067 ret ``` ::: **Statistics** * line of code : 119 * register * ax : `a0` `a1` `a2` `a3` `a5` * sx : `s0` `s1` `s2` `s3` * tx : none * branch and jump * jump : 23 * branch : 6 * stack * sp : 32 * lw and sw * lw : 20 * sw : 12 ### Conclusion |Level| text | data | bss | dec | hex | filename | | -|------- | -------- | -------- | ---- | ----- |-| |O0| 81736 | 2320 | 1556 | 85612 | 14e6c |lab2.elf| |O1| 81016 | 2328 | 1556 | 84900 | 14ba4 |lab2_O1.elf| |O2| 81224 | 2328 | 1556 | 85108 | 14c74 |lab2_O2.elf| |O3| 81212 | 2396 | 1556 | 85164 | 14cac |lab2_O3.elf| |Os| 80900 | 2328 | 1556 | 84784 | 14b30 |lab2_Os.elf| |Ofast|81004|2396|1556|84956|14bdc|lab2_Ofast.elf| **From O0 to O1** The most noticeable difference is in the number of `lw` and `sw` instructions: O0 uses almost twice as many as O1 (34 vs. 19 in this function), because O0 stores every local variable on the stack and reloads it before each use, whereas O1 keeps values in registers. This results in faster execution of the function at the O1 optimization level. Additionally, the O0 code is almost a direct translation of the C source, which leads to longer instruction sequences in certain cases. **From O1 to O2** The difference between O1 and O2 is not very significant for this function. **From O1 to O3** O3 uses fewer registers than O1 but fully unrolls both loops, which increases the static count of jump and branch instructions as well as `lw` and `sw` instructions, and roughly doubles the length of the function (119 vs. 60 lines). Despite the larger code, the measured cycle count of the whole program is lower at O3 than at O1 (188939 vs. 193080), presumably because the unrolled code no longer pays the per-iteration loop overhead. **From O3 to Ofast** O3 and Ofast produce nearly identical code for this function, suggesting that the compiler has no further optimization to apply here.
### :point_right: Use [ticks.c](https://github.com/sysprog21/rv32emu/blob/master/tests/ticks.c) for the statistics of your program’s execution. :::spoiler Rewrite makefile ``` .PHONY: clean CFLAGS = -march=rv32i -mabi=ilp32 all: source_O0.elf source_O1.elf source_O2.elf source_O3.elf source_Os.elf source_Ofast.elf source_O0.elf: riscv-none-elf-gcc $(CFLAGS) -O0 source.c -o $@ source_O1.elf: riscv-none-elf-gcc $(CFLAGS) -O1 source.c -o $@ source_O2.elf: riscv-none-elf-gcc $(CFLAGS) -O2 source.c -o $@ source_O3.elf: riscv-none-elf-gcc $(CFLAGS) -O3 source.c -o $@ source_Os.elf: riscv-none-elf-gcc $(CFLAGS) -Os source.c -o $@ source_Ofast.elf: riscv-none-elf-gcc $(CFLAGS) -Ofast source.c -o $@ clean: rm *.elf ``` ::: | Level | O0 | O1 | O2 |O3 | Os | Ofast | Assembly| | -------- | -------- | -------- |- |- |- |- |-| | elapsed cycle | 195232 | 193080 | 192836 | 188939 | 193367 | 188938 | 5227 :+1: | ## Reference * [5 Reasons Why Machine Learning Quantization is Important for AI Projects](https://www.rinf.tech/5-reasons-why-machine-learning-quantization-is-important-for-ai-projects/) * [ELF](https://tobygao.github.io/Learning-Lounge/2018/12/09/elf.html) * [AI模型壓縮技術-量化(Quantization)](https://chih-sheng-huang821.medium.com/ai%E6%A8%A1%E5%9E%8B%E5%A3%93%E7%B8%AE%E6%8A%80%E8%A1%93-%E9%87%8F%E5%8C%96-quantization-966505128365) * [ Install GNU Toolchain for RISC-V and rv32emu ](https://hackmd.io/@sysprog/SJAR5XMmi) * [Implement function to find maximum absolute value in bfloat16 array for quantization by KuanYuan0530](https://hackmd.io/@K1NCVjKnTCmNaikFb4gt-A/B1Fj-TGWp) * [Reducing memory usage with bfloat and bfloat multiplication by Brian Cheng](https://hackmd.io/@PWCheng/CAHW01) * [gcc编译优化-O0 -O1 -O2 -O3 -OS解析](https://blog.csdn.net/wuxing26jiayou/article/details/96132721)

Syntax	Example	Reference
# Header	Header	基本排版
- Unordered List	Unordered List
1. Ordered List	Ordered List
- [ ] Todo List	Todo List
> Blockquote	Blockquote
Bold font	Bold font
Italics font	Italics font
~~Strikethrough~~	~~Strikethrough~~
19^th^	19^th
H~2~O	H₂O
++Inserted text++	Inserted text
==Marked text==	Marked text
[link text](https:// "title")	Link
![image alt](https:// "title")	Image
`Code`	`Code`	在筆記中貼入程式碼
```javascript var i = 0; ```	`var i = 0;`
:smile:		Emoji list
{%youtube youtube_id %}	Externals
$L^aT_eX$	L^aT_eX
:::info This is a alert area. :::	This is a alert area.