Contributed by <jeremy90307>
Lab2: RISC-V RV32I[MA] emulator with ELF support
I chose the question by student KuanYuan053: "Implement quantization from bfloat16 to int8"
Since student KuanYuan053 has already optimized the C code significantly, I won't waste time modifying it. I'll proceed directly to the RISC-V part.
#include <stdio.h>
#include <stdlib.h>
#include<math.h>
# define array_size 7
# define range 127 /*2^(n-1)-1, n: quant bit*/
float fp32_to_bf16(float x);
int* quant_bf16_to_int8(float x[]);
float bf16_findmax(float x[]);
int main()
{
float array[array_size] = {1.200000, 1.203125, 2.310000, 2.312500, 3.460000, 3.4531255, 5.630000};
float array2[array_size] = { 0.1, 0.2, 1.2, 3, 2.1, -4.2, 3.5};
float array3[array_size] = { 3.14159265, 0.12345678 , 1.23456789 , 0.00000123, 0.00000001, 0.99999999 , 0.00000007 };
float array_bf16[array_size] = {};
int *after_quant;
/*data 1*/
for (int i = 0; i < 7; i++) {
array_bf16[i] = fp32_to_bf16(array[i]);
}
printf("data 1\nbfloat16 number is \n");
for (int i = 0; i < array_size; i++) {
printf("%.12f\n", array_bf16[i]);
}
after_quant = quant_bf16_to_int8(array_bf16);
printf("after quantization \n");
for (int i = 0; i < array_size; i++) {
printf("%d\n", after_quant[i]);
}
/*data 2*/
for (int i = 0; i < 7; i++) {
array_bf16[i] = fp32_to_bf16(array2[i]);
}
printf("data 2\nbfloat16 number is \n");
for (int i = 0; i < array_size; i++) {
printf("%.12f\n", array_bf16[i]);
}
after_quant = quant_bf16_to_int8(array_bf16);
printf("after quantization \n");
for (int i = 0; i < array_size; i++) {
printf("%d\n", after_quant[i]);
}
/*data 3*/
for (int i = 0; i < 7; i++) {
array_bf16[i] = fp32_to_bf16(array3[i]);
}
printf("data 3\nbfloat16 number is \n");
for (int i = 0; i < array_size; i++) {
printf("%.12f\n", array_bf16[i]);
}
after_quant = quant_bf16_to_int8(array_bf16);
printf("after quantization \n");
for (int i = 0; i < array_size; i++) {
printf("%d\n", after_quant[i]);
}
system("pause");
return 0;
}
/*
 * Round an fp32 value to bfloat16 precision (round half up in magnitude).
 *
 * The result is still stored in a float, but its low 16 bits are zero, so the
 * upper 16 bits are the bfloat16 encoding.
 *
 * x: value to convert.
 * Returns x unchanged for +/-0, infinity and NaN; otherwise the rounded value.
 */
float fp32_to_bf16(float x)
{
    /*
     * Bit-level access goes through a union: reading a float object through
     * an int pointer violates the strict-aliasing rule.
     */
    union {
        float f;
        unsigned int u;
    } y = { .f = x }, r = { .f = x };

    unsigned int exp = y.u & 0x7F800000u;
    unsigned int man = y.u & 0x007FFFFFu;

    if (exp == 0 && man == 0) { /* zero */
        return x;
    }
    if (exp == 0x7F800000u) { /* infinity or NaN */
        return x;
    }

    /*
     * Round to nearest: keep only sign and exponent of x (r = +/-2^e), scale
     * to +/-2^(e-8), i.e. half of one bf16 ULP, add it, then truncate.
     * For subnormal x the exponent field is 0, so r is +/-0 and the value is
     * simply truncated.
     */
    r.u &= 0xFF800000u;
    r.f /= 0x100;
    y.f = x + r.f;
    y.u &= 0xFFFF0000u;
    return y.f;
}
/*
 * Quantize array_size bf16 values (held in floats) to the symmetric integer
 * range [-range, range].
 *
 * The scale is range / max(|x[i]|); each element is multiplied by it and
 * truncated toward zero. The maximum is printed to stdout.
 *
 * x: array_size input values.
 * Returns a pointer to a static buffer that is overwritten by the next call.
 */
int* quant_bf16_to_int8(float x[array_size])
{
    static int after_quant[array_size] = {0};

    float max = fabsf(x[0]);
    for (int i = 1; i < array_size; i++) {
        float mag = fabsf(x[i]);
        if (mag > max) {
            max = mag;
        }
    }
    printf("maximum number is %.12f\n", max);

    /*
     * All-zero input: range / 0 is infinite and 0 * inf is NaN, whose
     * conversion to int is undefined. Every quantized value is 0 here.
     */
    if (max == 0.0f) {
        for (int i = 0; i < array_size; i++) {
            after_quant[i] = 0;
        }
        return after_quant;
    }

    float scale = range / max;
    for (int i = 0; i < array_size; i++) {
        after_quant[i] = (int) (x[i] * scale); /* truncates toward zero */
    }
    return after_quant;
}
#include <stdio.h>
#include <stdlib.h>
#include<math.h>
#include <inttypes.h>
# define array_size 7
# define range 127 /*2^(n-1)-1, n: quant bit*/
float fp32_to_bf16(float x);
int* quant_bf16_to_int8(float x[]);
float bf16_findmax(float x[]);
typedef uint64_t ticks;
/*
 * Read the 64-bit cycle counter on RV32 as one consistent value.
 *
 * The counter is read as high / low / high. If the two high reads differ,
 * the low word wrapped between them and the read is retried. The previous
 * branchless form overwrote the high word with its comparison mask, so the
 * returned upper 32 bits were never the real cycleh value.
 *
 * Returns the cycle count as (cycleh << 32) | cycle.
 */
static inline ticks getticks(void)
{
    uint32_t lo, hi, hi2;

    do {
        __asm__ volatile(
            "rdcycleh %0\n"
            "rdcycle %1\n"
            "rdcycleh %2\n"
            : "=r"(hi), "=r"(lo), "=r"(hi2));
    } while (hi != hi2);

    return (((uint64_t) hi) << 32) | ((uint64_t) lo);
}
int main()
{
ticks t0 = getticks();
float array[array_size] = {1.200000, 1.203125, 2.310000, 2.312500, 3.460000, 3.4531255, 5.630000};
float array2[array_size] = { 0.1, 0.2, 1.2, 3, 2.1, -4.2, 3.5};
float array3[array_size] = { 3.14159265, 0.12345678 , 1.23456789 , 0.00000123, 0.00000001, 0.99999999 , 0.00000007 };
float array_bf16[array_size] = {};
int *after_quant;
/*data 1*/
for (int i = 0; i < 7; i++) {
array_bf16[i] = fp32_to_bf16(array[i]);
}
printf("data 1\nbfloat16 number is \n");
for (int i = 0; i < array_size; i++) {
printf("%.12f\n", array_bf16[i]);
}
after_quant = quant_bf16_to_int8(array_bf16);
printf("after quantization \n");
for (int i = 0; i < array_size; i++) {
printf("%d\n", after_quant[i]);
}
/*data 2*/
for (int i = 0; i < 7; i++) {
array_bf16[i] = fp32_to_bf16(array2[i]);
}
printf("data 2\nbfloat16 number is \n");
for (int i = 0; i < array_size; i++) {
printf("%.12f\n", array_bf16[i]);
}
after_quant = quant_bf16_to_int8(array_bf16);
printf("after quantization \n");
for (int i = 0; i < array_size; i++) {
printf("%d\n", after_quant[i]);
}
/*data 3*/
for (int i = 0; i < 7; i++) {
array_bf16[i] = fp32_to_bf16(array3[i]);
}
printf("data 3\nbfloat16 number is \n");
for (int i = 0; i < array_size; i++) {
printf("%.12f\n", array_bf16[i]);
}
after_quant = quant_bf16_to_int8(array_bf16);
printf("after quantization \n");
for (int i = 0; i < array_size; i++) {
printf("%d\n", after_quant[i]);
}
ticks t1 = getticks();
printf("elapsed cycle: %" PRIu64 "\n", t1 - t0);
system("pause");
return 0;
}
/*
 * Round an fp32 value to bfloat16 precision (round half up in magnitude).
 *
 * The result is still stored in a float, but its low 16 bits are zero, so the
 * upper 16 bits are the bfloat16 encoding.
 *
 * x: value to convert.
 * Returns x unchanged for +/-0, infinity and NaN; otherwise the rounded value.
 */
float fp32_to_bf16(float x)
{
    /*
     * Bit-level access goes through a union: reading a float object through
     * an int pointer violates the strict-aliasing rule.
     */
    union {
        float f;
        unsigned int u;
    } y = { .f = x }, r = { .f = x };

    unsigned int exp = y.u & 0x7F800000u;
    unsigned int man = y.u & 0x007FFFFFu;

    if (exp == 0 && man == 0) { /* zero */
        return x;
    }
    if (exp == 0x7F800000u) { /* infinity or NaN */
        return x;
    }

    /*
     * Round to nearest: keep only sign and exponent of x (r = +/-2^e), scale
     * to +/-2^(e-8), i.e. half of one bf16 ULP, add it, then truncate.
     * For subnormal x the exponent field is 0, so r is +/-0 and the value is
     * simply truncated.
     */
    r.u &= 0xFF800000u;
    r.f /= 0x100;
    y.f = x + r.f;
    y.u &= 0xFFFF0000u;
    return y.f;
}
/*
 * Quantize array_size bf16 values (held in floats) to the symmetric integer
 * range [-range, range].
 *
 * The scale is range / max(|x[i]|); each element is multiplied by it and
 * truncated toward zero. The maximum is printed to stdout.
 *
 * x: array_size input values.
 * Returns a pointer to a static buffer that is overwritten by the next call.
 */
int* quant_bf16_to_int8(float x[array_size])
{
    static int after_quant[array_size] = {0};

    float max = fabsf(x[0]);
    for (int i = 1; i < array_size; i++) {
        float mag = fabsf(x[i]);
        if (mag > max) {
            max = mag;
        }
    }
    printf("maximum number is %.12f\n", max);

    /*
     * All-zero input: range / 0 is infinite and 0 * inf is NaN, whose
     * conversion to int is undefined. Every quantized value is 0 here.
     */
    if (max == 0.0f) {
        for (int i = 0; i < array_size; i++) {
            after_quant[i] = 0;
        }
        return after_quant;
    }

    float scale = range / max;
    for (int i = 0; i < array_size; i++) {
        after_quant[i] = (int) (x[i] * scale); /* truncates toward zero */
    }
    return after_quant;
}
.data
array: .word 0x3f99999a, 0x3f9a0000, 0x4013d70a, 0x40140000, 0x405d70a4, 0x405d0000, 0x40b428f6
# test data1: 1.200000, 1.203125, 2.310000, 2.312500, 3.460000, 3.4531255, 5.630000
array2: .word 0x3dcccccd, 0x3e4ccccd, 0x3f99999a, 0x40400000, 0x40066666, 0xc0866666, 0x40600000
# test data2: 0.1, 0.2, 1.2, 3, 2.1, -4.2, 3.5
array3: .word 0x40490fdb, 0x3dfcd6e9, 0x3f9e0652, 0x35a5167a, 0x322bcc77, 0x3f800000, 0x339652e8
# test data3: 3.14159265, 0.12345678 , 1.23456789 , 0.00000123, 0.00000001, 0.99999999 , 0.00000007
array_bf16: .word 0, 0, 0, 0, 0, 0, 0
exp_mask: .word 0x7F800000
man_mask: .word 0x007FFFFF
sign_exp_mask: .word 0xFF800000
bf16_mask: .word 0xFFFF0000
next_line: .string "\n"
max_string: .string "maximum number is "
bf16_string: .string "\nbfloat16 number is \n"
.text
# Entry point. Stores the addresses of the three test arrays on the stack,
# loads the bit masks into saved registers, then calls fp32_to_bf16_findmax
# once per test array and exits.
main:
# push the three input-array addresses
addi sp, sp, -12
la t0, array
sw t0, 0(sp)
la t0, array2
sw t0, 4(sp)
la t0, array3
sw t0, 8(sp)
la s10, array_bf16 # s10 = address of the bf16 output array
addi s11, x0, 3 # s11 = number of data sets still to process
la s9, exp_mask # s9 = address of exp_mask (replaced by its value below)
la s8, man_mask # s8 = address of man_mask (replaced by its value below)
la s6, bf16_mask # s6 = address of bf16_mask (replaced by its value below)
lw s9, 0(s9) # s9 = 0x7F800000
lw s8, 0(s8) # s8 = 0x007FFFFF
lw s6, 0(s6) # s6 = 0xFFFF0000
add s7, x0, sp # s7 = cursor over the pushed array addresses
main_for:
# print the heading (a7 = 4 with a string address in a0;
# presumably the Ripes print-string ecall, confirm for the target)
la a0, bf16_string
addi a7, x0, 4
ecall
addi a3, x0, 7 # a3 = element count of one array
lw a1, 0(s7) # a1 = address of the current input array
mv a2, s10 # a2 = address of the bf16 output array
jal ra, fp32_to_bf16_findmax
addi s11, s11, -1
addi s7, s7, 4 # advance to the next pushed array address
bne s11, x0, main_for
# Exit program (a7 = 10)
li a7, 10
ecall
# fp32_to_bf16_findmax
# Converts an array of fp32 words to bfloat16 (low 16 bits cleared), stores and
# prints each converted word, tracks the element with the largest magnitude,
# and prints that maximum with its sign bit cleared.
#
# Inputs : a1 = input array address, a2 = output array address,
#          a3 = element count (the first-element test below assumes 7)
#          s9 = 0x7F800000 (exponent mask), s8 = 0x007FFFFF (mantissa mask),
#          s6 = 0xFFFF0000 (bf16 mask)
# Outputs: a4 = |maximum| as a bf16 word
# Clobbers: a0, a1, a2, a3, a5, a7, t0-t4
fp32_to_bf16_findmax:
    # prologue: s0/s1 hold the running maximum's exponent and mantissa
    addi sp, sp, -8
    sw s0, 0(sp)
    sw s1, 4(sp)
for1:
    lw a5, 0(a1)            # a5 = x
    and t0, a5, s9          # t0 = exponent field of x
    and t1, a5, s8          # t1 = mantissa field of x
    # +/-0 is stored unchanged
    bne t0, x0, else
    bne t1, x0, else
    j finish_bf16
else:
    # infinity or NaN is stored unchanged
    beq t0, s9, finish_bf16
    # Round to nearest: add half of one bf16 ULP to the 24-bit significand.
    # The C reference adds r = 2^e / 0x100, which is 0x8000 in significand
    # units. Adding (significand >> 8) instead rounds some values up that
    # lie below the halfway point (e.g. 0x405d70a4 became 0x405e, not 0x405d).
    li t3, 0x00800000       # implicit leading 1 (bit 23)
    or t1, t1, t3           # t1 = 24-bit significand 1.m
    srli t2, t3, 8          # t2 = 0x8000, half a bf16 ULP
    add t1, t1, t2          # significand + half ULP
    # bit 23 cleared means the sum carried into bit 24
    and t4, t1, t3
    bne t4, x0, no_carry
    add t0, t0, t3          # exponent + 1
    srli t1, t1, 1          # renormalise the significand
no_carry:
    and t0, t0, s9          # keep exponent bits only
    and t1, t1, s8          # drop the implicit 1
    or t2, t0, t1           # exponent | mantissa
    li t3, 0x80000000       # sign mask
    and t3, a5, t3          # sign of x
    or a5, t3, t2
    and a5, a5, s6          # clear the low 16 bits -> bf16 word
finish_bf16:
    sw a5, 0(a2)
    # print the bf16 word (a7 = 34) followed by a newline (a7 = 4)
    mv a0, a5
    addi a7, x0, 34
    ecall
    la a0, next_line
    addi a7, x0, 4
    ecall
    slti t3, a3, 7          # t3 = 0 only for the first element (a3 == 7)
    bne t3, x0, compare
    # the first element always becomes the initial maximum
    j max_change
compare:
    # compare exponents first, then mantissas; equal values fall through
    blt s0, t0, max_change
    blt t0, s0, max_not_change
    blt s1, t1, max_change
    blt t1, s1, max_not_change
max_change:
    mv s0, t0               # s0 = exponent of the maximum
    mv s1, t1               # s1 = mantissa of the maximum
    mv a4, a5               # a4 = maximum as a bf16 word
max_not_change:
    addi a3, a3, -1
    addi a1, a1, 4
    addi a2, a2, 4
    bne a3, x0, for1
    # absolute value of the maximum
    li t2, 0x7fffffff
    and a4, a4, t2
    # print "maximum number is " and the value
    la a0, max_string
    addi a7, x0, 4
    ecall
    mv a0, a4
    addi a7, x0, 34
    ecall
    # epilogue
    lw s0, 0(sp)
    lw s1, 4(sp)
    addi sp, sp, 8
    jr ra
I think student KuanYuan has optimized the code to a very concise level. Therefore, I intend to rewrite the code and successfully implement the conversion from bf16 to int8 in the RISC-V architecture, a part that KuanYuan wasn't able to accomplish. Also, because I chose a relatively simpler topic for lab1 assignment, I am keen to utilize this opportunity in lab2 to thoroughly learn RISC-V .
Student KuanYuan0530 has optimized the original assembly code to be highly concise and successfully executed it on the rv32emu emulator, showcasing remarkable proficiency.
As KuanYuan0530 has only completed the part of finding the maximum value within the bf16 array for quantization, I will attempt to solve the remaining portion. My goal is to successfully convert bf16 data into int8 format. Through this process, I aim not only to complete the task but also to learn from his design logic. I believe this effort will contribute significantly to my progress in this field.
:point_right:Implemented specifically
Initially, student KuanYuan completed the quantization process only up to converting from fp32 to bf16 and finding the maximum absolute value of bf16; the unresolved issue pertains to dividing bf16 values by the scale. I used a somewhat rough method to find the scale, as I was unable to implement bf16 division within the assignment deadline. The approach I used to calculate the scale significantly reduced the overall quantization accuracy, leading to differences between the final result and the actual answer. Finally, for the multiplication of bf16 values, I referenced Brian's bf16 multiplier, resulting in a quantization method with slightly reduced precision.
You have to describe how you have improved upon the first implementation.
:notes: jserv
I have added a description in this regard.
- Use `li` to replace `la` and `lw`.
- Compare `exp` and `man` together, as one sign-masked bf16 word, for size comparison.
.data
array: .word 0x3f99999a, 0x3f9a0000, 0x4013d70a, 0x40140000, 0x405d70a4, 0x405d0000, 0x40b428f6
# test data1: 1.200000, 1.203125, 2.310000, 2.312500, 3.460000, 3.4531255, 5.630000
array2: .word 0x3dcccccd, 0x3e4ccccd, 0x3f99999a, 0x40400000, 0x40066666, 0xc0866666, 0x40600000
# test data2: 0.1, 0.2, 1.2, 3, 2.1, -4.2, 3.5
array3: .word 0x40490fdb, 0x3dfcd6e9, 0x3f9e0652, 0x35a5167a, 0x322bcc77, 0x3f800000, 0x339652e8
# test data3: 3.14159265, 0.12345678 , 1.23456789 , 0.00000123, 0.00000001, 0.99999999 , 0.00000007
array_bf16: .word 0, 0, 0, 0, 0, 0, 0
exp_mask: .word 0x7F800000
man_mask: .word 0x007FFFFF
sign_exp_mask: .word 0xFF800000
bf16_mask: .word 0xFFFF0000
next_line: .string "\n"
max_string: .string "maximum number is "
bf16_string: .string "\nbfloat16 number is \n"
.text
main:
# push data
addi sp, sp, -12
la t0, array
sw t0, 0(sp)
la t0, array2
sw t0, 4(sp)
la t0, array3
sw t0, 8(sp)
la s10, array_bf16 # global array_bf16 address(s10)
addi s11, x0, 3 # data number(s11) -> three groups data
- la s9, exp_mask # global exp(s9)
- la s8, man_mask # global man(s8)
- la s6, bf16_mask # global bf16(s6)
- lw s9, 0(s9)
- lw s8, 0(s8)
- lw s6, 0(s6)
- add s7, x0, sp
+ li t5, 0x7F800000 #exp_mask
+ li t6, 0x007FFFFF #man_mask
+ li s6, 0xFFFF0000 #bf16_mask
+ li s7, 0x7FFFFFFF #abs_mask
main_for:
la a0, bf16_string
addi a7, x0, 4
ecall
addi a3, x0, 7 # array size(a3)
- lw a1, 0(s7) # array_data pointer(a1)
+ lw a1, 0(sp) # array_data pointer(a1)
mv a2, s10 # array_bf16 pointer(a2)
- jal ra, fp32_to_bf16_findmax
- addi s11, s11, -1
- addi s7, s7, 4
- bne s11, x0, main_for
- # Exit program
- li a7, 10
- ecall
fp32_to_bf16_findmax:
# array_data pointer(a1), array_bf16 pointer(a2), array size(a3)
- # prologue
- addi sp, sp, -8
- sw s0, 0(sp)
- sw s1, 4(sp)
# array loop
for1:
lw a5, 0(a1) # x(a5)
# fp32_to_bf16
- and t0, a5, s9 # x exp(t0)
- and t1, a5, s8 # x man(t1)
+ and t0, a5, t5 # x exp(t0)
+ and t1, a5, t6 # x man(t1)
# if zero
bne t0, x0, else
# exp is zero
bne t1, x0, else
j finish_bf16
else:
# if infinity or NaN
- beq t0, s9, finish_bf16
+ beq t0, t5, finish_bf16
# round
# r = x.man shift right 8 bit
# x+r = x.man + x.man>>8
li t3, 0x00800000 # make up 1 to No.24bit
or t1, t1, t3
srli t2, t1, 8 # r(t2)
add t1, t1, t2 # x+r
# check carry
and t4, t1, t3 # check No.24bit (t4), 0:carry, 1: nocarry
bne t4, x0, no_carry
add t0, t0, t3 # exp+1
srli t1 ,t1, 1 # man alignment
no_carry:
- and t0, t0, s9 # mask exp(t0)
- and t1, t1, s8 # mask man(t1)
+ and t0, t0, t5 # mask exp(t0)
+ and t1, t1, t6 # mask man(t1)
or t2, t0, t1 # combine exp & man
li t3, 0x80000000 # sign mask
and t3, a5, t3 # x sign
or a5, t3, t2 # bfloat16(a5)
- and a5, a5, s6
+ and a5, a5, s6 #s6 -> bf16_mask
finish_bf16:
sw a5, 0(a2)
mv a0, a5
addi a7, x0, 34
ecall
la a0, next_line
addi a7, x0, 4
ecall
slti t3, a3, 7 # (a3==7) t3=0, (a3<7) t3=1
+ and s8, a5, s7 # abs bf16 -> s8
bne t3, x0, compare
# saved first max
j max_change
compare:
- # compare exp
- blt s0, t0, max_change
- blt t0, s0, max_not_change
- # compare man
- blt s1, t1, max_change
- blt t1, s1, max_not_change
+ blt s8, s0, max_not_change
max_change:
- mv s0, t0 # max exp(s0)
- mv s1, t1 # max man(s1)
+ mv s0, s8 # max bf16(s0)
mv a4, a5 # max bf16(a4)
max_not_change:
addi a3, a3, -1
addi a1, a1, 4
addi a2, a2, 4
bne a3, x0, for1
# Absolute
- li t2, 0x7fffffff
- and a4, a4, t2
+ and a4, a4, s7
#print
la a0, max_string
addi a7, x0, 4
ecall
mv a0, a4
addi a7, x0, 34
ecall
# epilogue
- lw s0, 0(sp)
- lw s1, 4(sp)
- addi sp, sp, 8
- jr ra
+ and s0, x0, s0
+ and s1, x0, s1
+ addi s11, s11, -1
+ addi sp, sp, 4
+ and s8, x0, s8
+ bne s11, x0, main_for
+Exit:
+ li a7, 10
+ ecall
You shall use RDCYCLE/RDCYCLEH instruction for the statistics of your program’s execution.
:notes: jserv
I have added this part to the conclusion section.
# Standalone demo of the integer scale approximation.
# Takes the integer part of the maximum bf16 value and counts how many times
# it must be added before the running sum exceeds 127; that count is printed
# as the scale.
.data
maxbf16: .word 0x40b40000   # 5.625 as a bf16 word
.text
main:
    lw a4, maxbf16          # a4 = maximum (sign bit clear)
    li t6, 0x007FFFFF       # mantissa mask
quant_bf16_to_int8:
    li s2, 0x7F             # 127, the largest int8 magnitude
    and t0, a4, t6          # t0 = mantissa field of the maximum
    srli t0, t0, 16         # t0 = 7-bit bf16 fraction (a shift of 15 kept 8 bits)
    srli t1, a4, 23         # t1 = biased exponent
    addi t1, t1, -127       # t1 = e, the power of two
    li t4, 7                # the bf16 fraction has 7 bits
    sub t3, t4, t1
    srl t0, t0, t3          # fraction bits that belong to the integer part
    li t5, 1
    sll t5, t5, t1          # 2^e, the implicit leading 1
    or t0, t5, t0           # t0 = integer part of the maximum
                            # ("or t0, t5, t5" dropped the fraction bits)
    li s3, 0                # s3 = scale counter
    li s4, 0                # s4 = running sum; was never initialised
scale:
    add s4, s4, t0
    addi s3, s3, 1          # count scale
    bge s2, s4, scale       # repeat while 127 >= sum
exit:
    # print the scale as an integer (a7 = 1)
    mv a0, s3
    li a7, 1
    ecall
In quantization, the scale divides the floating-point range into 127 equal parts, making the maximum value 127, as we previously determined. However, due to my inability to successfully implement floating-point division, I used a somewhat crude method that only yields integers. Consequently, the INT8 values obtained using this method have significant deviations from the theoretical values. Until I can successfully implement floating-point division, this approach serves as a substitute.
# int_to_floatpoint
# Converts the positive integer in a6 to a float bit pattern with the same
# value, leaves it in a6, and prints a newline followed by that word
# (a7 = 34). s2-s5 are saved on the stack and restored at the end.
int_to_floatpoint:
addi sp, sp, -16
sw s2, 0(sp)
sw s3, 4(sp)
sw s4, 8(sp)
sw s5, 12(sp)
li t0, 0
mv s2, a6 # s2 = copy of the input; a6 is consumed by the loop
# count the right shifts needed to bring a6 down to zero
loop2:
srli a6, a6, 1
addi t0, t0, 1
blt x0, a6, loop2
###end loop2
addi t0, t0, -1 # t0 = count - 1; the loop counts one shift before testing
addi s3, t0, 127 # biased exponent = t0 + 127
# The bias is added to the corrected count (after the -1 above).
slli s3, s3, 23 # s3 = exponent field in position
li t1, 0xFFFFFFFF
li t2, 32
sub t3, t2, t0
srl t1, t1, t3 # t1 = mask covering the low t0 bits
# NOTE(review): when t0 is 0 the shift count is 32; RV32 srl uses only the low
# 5 bits, so the mask stays all ones. Confirm the input-1 case is intended.
and s4, s2, t1 # s4 = low t0 bits of the input (the fraction)
li t1, 23
sub t1, t1, t0 # t1 = 23 - t0
sll s4, s4, t1 # left-align the fraction in the 23-bit mantissa field
or s5, s4, s3 # s5 = exponent | fraction
mv a6, s5
la a0,next_line
li a7,4
ecall
mv a0, a6
li a7, 34
ecall
lw s2, 0(sp)
lw s3, 4(sp)
lw s4, 8(sp)
lw s5, 12(sp)
addi sp, sp, 16
Since the result obtained from the scale is an integer, it is converted to bf16 for ease of multiplication in the subsequent steps.
# Standalone demo of the bf16 multiplier: multiplies test1 by test2 and prints
# the product (a7 = 2).
.data
test1: .word 0x42000000
test2: .word 0x40860000
.text
# Multi_bfloat
# Multiplies the two bf16 words in s5 and s6. The result exponent is the sum
# of both exponents minus the bias, the sign is the XOR of both signs, and the
# 8-bit significands are multiplied by shift-and-add. The product is left in s3.
# Clobbers t0-t4, t6, s10, s11.
Multi_bfloat:
# The operands are loaded from memory here; the decoder call from the
# referenced implementation is not used.
lw s5,test1
lw s6,test2
add t0,s5,x0 # t0 = first operand
add t1,s6,x0 # t1 = second operand
li t6,0x7F800000 # exponent mask
# extract both exponent fields
and t3,t0,t6 # t3 = exponent field of t0
and t2,t1,t6 # t2 = exponent field of t1
add t3,t3,t2 # t3 = sum of both exponent fields
li t6,0x3F800000 # 127 << 23, the bias in field position
sub t3,t3,t6 # remove one bias from the sum
# result sign
xor t2,t0,t1 # bit 31 = XOR of both signs
srli t2,t2,31 # drop everything below the sign bit
slli t2,t2,31 # move the sign back to bit 31
# combine sign and exponent
or t3,t3,t2
# keep the low 23 bits of t0 and place the result sign/exponent above them
slli t0,t0,9
srli t0,t0,9
or t0,t3,t0
# extract the 7-bit fractions
li t6,0x7F # mask 0x7F
slli t6,t6,16 # t6 = 0x7F0000, the bf16 fraction bits
and t2,t0,t6 # t2 = fraction of the first operand
and t3,t1,t6 # t3 = fraction of the second operand
slli t2,t2,9 # fraction moves to bits 25-31
srli t2,t2,1 # fraction moves to bits 24-30, bit 31 is free
li t6,0x80000000 # bit 31
or t2,t2,t6 # insert the implicit leading 1
srli t2,t2,1 # significand in bits 23-30, bit 31 left for overflow
slli t3,t3,8 # fraction moves to bits 24-30
or t3,t3,t6 # insert the implicit leading 1
srli t3,t3,1 # significand in bits 23-30, bit 31 left for overflow
add s11,x0,x0 # s11 = loop counter
addi s10,x0,8 # s10 = loop limit (8 significand bits)
add t1,x0,x0 # t1 = product accumulator
li t6,0x80000000 # bit mask, shifted right before each test
# shift-and-add over the 8 significand bits of t2, most significant first
loop:
addi s11,s11,1 # count this iteration
srli t6,t6,1 # select the next lower bit of t2
and t4,t2,t6 # test that bit
beq t4,x0,not_add # bit clear: skip the addition
add t1,t1,t3 # bit set: add the aligned partial product
not_add:
srli t3,t3,1 # shift the multiplicand right by one bit
bne s11,s10,loop # repeat until all 8 bits are done
# end of loop
# bit 31 set means the significand product reached 2.0
li t6,0x80000000
and t4,t1,t6 # t4 = bit 31 of the product
# bit 31 clear: no overflow
beq t4,x0,not_overflow
# overflow
slli t1,t1,1 # shift out the integer bit
li t6,0x800000 # one unit in the exponent field
add t0,t0,t6 # exponent + 1
j Mult_end # jump to Mult_end
# no overflow
not_overflow:
slli t1,t1,2 # shift out the zero bit and the integer bit
Mult_end:
srli t1,t1,24 # keep the top 8 bits: 7 fraction bits plus one rounding bit
addi t1,t1,1 # add 1 at the rounding bit
srli t1,t1,1 # drop the rounding bit
slli t1,t1,16 # move the fraction to bits 16-22
srli t0,t0,23 # clear the low 23 bits of t0 ...
slli t0,t0,23 # ... keeping sign and exponent in place
or t0,t0,t1 # combine sign/exponent with the fraction
add s3,t0,x0 # s3 = product
# no return instruction: execution falls through to exit
### end of function
exit:
mv a0,s3
li a7,2
ecall
I referenced my classmate Brian Cheng's method for floating-point multiplication, which I applied to the final stage where floating-point values are multiplied by the scale.
# rm_decimal_of_bf16
# Truncates the bf16 word in a4 to an integer and prints it on a new line,
# then advances the for2 loop.
#
# Inputs : a4 = bf16 word, s7 = 0x7FFFFFFF, t6 = 0x007FFFFF,
#          s5 / a3 = loop pointer and remaining count of the for2 loop
# Clobbers: a0, a7, t0-t4
# Magnitudes of 256 or more (negative shift count) are not handled.
rm_decimal_of_bf16:
    mv t0, a4
    li t3, 0x80000000
    and t3, t0, t3
    srli t3, t3, 31         # t3 = 1 when the value is negative
    and t0, t0, s7          # clear the sign bit
    srli t0, t0, 23         # biased exponent
    addi t0, t0, -127       # t0 = e, the power of two
    # Take the fraction from the value being converted; it used to be read
    # from whatever the preceding code had left in t1.
    and t1, a4, t6
    srli t1, t1, 16         # 7-bit bf16 fraction
    li t2, 0x80             # implicit leading 1
    or t1, t1, t2           # t1 = 8-bit significand, scaled by 2^7
    li t2, 7
    sub t2, t2, t0          # right-shift count = 7 - e
    # srl only uses the low 5 bits of the count, so 32 or more would wrap and
    # give a non-zero result for zero or very small inputs.
    li t4, 32
    blt t2, t4, rm_shift
    li t1, 0
    j rm_sign
rm_shift:
    srl t1, t1, t2          # integer part of the magnitude
rm_sign:
    beq t3, x0, printINT8
Add_negative_sign:
    sub t1, x0, t1          # negate
printINT8:
    la a0, next_line
    li a7, 4
    ecall
    mv a0, t1
    li a7, 1
    ecall
    # next data
    addi s5, s5, 4
    addi a3, a3, -1
    bne a3, x0, for2
### end of function
Since the result obtained from the bf16 multiplier is also in bf16 format, I remove the decimal part. In the final stage, I check if the sign bit is 1. If it is 1, I add the negative sign.
.data
array: .word 0x3f99999a, 0x3f9a0000, 0x4013d70a, 0x40140000, 0x405d70a4, 0x405d0000, 0x40b428f6
# test data1: 1.200000, 1.203125, 2.310000, 2.312500, 3.460000, 3.4531255, 5.630000
array2: .word 0x3dcccccd, 0x3e4ccccd, 0x3f99999a, 0x40400000, 0x40066666, 0xc0866666, 0x40600000
# test data2: 0.1, 0.2, 1.2, 3, 2.1, -4.2, 3.5
array3: .word 0x40490fdb, 0x3dfcd6e9, 0x3f9e0652, 0x35a5167a, 0x322bcc77, 0x3f800000, 0x339652e8
# test data3: 3.14159265, 0.12345678 , 1.23456789 , 0.00000123, 0.00000001, 0.99999999 , 0.00000007
array_bf16: .word 0, 0, 0, 0, 0, 0, 0
next_line: .string "\n"
max_string: .string "maximum number is "
bf16_string: .string "\nbfloat16 number is \n"
scale_num: .string "\nscale is "
transform_to_bf16_is: .string "\ntransform to bf16 is:"
.text
# Entry point. Stores the addresses of the three test arrays on the stack and
# loads the bit masks as immediates. Execution then falls through main_for
# into the conversion code; main_for is re-entered once per data set.
main:
# push the three input-array addresses
addi sp, sp, -12
la t0, array
sw t0, 0(sp)
la t0, array2
sw t0, 4(sp)
la t0, array3
sw t0, 8(sp)
la s10, array_bf16 # s10 = address of the bf16 output array
addi s11, x0, 3 # s11 = number of data sets still to process
li t5, 0x7F800000 # exponent mask
li t6, 0x007FFFFF # mantissa mask
li s6, 0xFFFF0000 # bf16 mask (upper 16 bits)
li s7, 0x7FFFFFFF # absolute-value mask (clears the sign bit)
main_for:
# print the "bfloat16 number is" heading (a7 = 4)
la a0, bf16_string
addi a7, x0, 4
ecall
addi a3, x0, 7 # a3 = element count of one array
lw a1, 0(sp) # a1 = address of the current input array (top of stack)
mv a2, s10 # a2 = address of the bf16 output array
# fp32_to_bf16_findmax (inline section, entered by fall-through from main_for)
# Converts the current array of fp32 words to bfloat16, stores and prints each
# converted word, tracks the largest magnitude, and prints it.
#
# Inputs : a1 = input array address, a2 = output array address,
#          a3 = element count (the first-element test below assumes 7)
#          t5 = 0x7F800000, t6 = 0x007FFFFF, s6 = 0xFFFF0000, s7 = 0x7FFFFFFF
# Outputs: a4 = |maximum| as a bf16 word
fp32_to_bf16_findmax:
for1:
    lw a5, 0(a1)            # a5 = x
    and t0, a5, t5          # t0 = exponent field of x
    and t1, a5, t6          # t1 = mantissa field of x
    # +/-0 is stored unchanged
    bne t0, x0, else
    bne t1, x0, else
    j finish_bf16
else:
    # infinity or NaN is stored unchanged
    beq t0, t5, finish_bf16
    # Round to nearest: add half of one bf16 ULP to the 24-bit significand.
    # The C reference adds r = 2^e / 0x100, which is 0x8000 in significand
    # units. Adding (significand >> 8) instead rounds some values up that
    # lie below the halfway point (e.g. 0x405d70a4 became 0x405e, not 0x405d).
    li t3, 0x00800000       # implicit leading 1 (bit 23)
    or t1, t1, t3           # t1 = 24-bit significand 1.m
    srli t2, t3, 8          # t2 = 0x8000, half a bf16 ULP
    add t1, t1, t2          # significand + half ULP
    # bit 23 cleared means the sum carried into bit 24
    and t4, t1, t3
    bne t4, x0, no_carry
    add t0, t0, t3          # exponent + 1
    srli t1, t1, 1          # renormalise the significand
no_carry:
    and t0, t0, t5          # keep exponent bits only
    and t1, t1, t6          # drop the implicit 1
    or t2, t0, t1           # exponent | mantissa
    li t3, 0x80000000       # sign mask
    and t3, a5, t3          # sign of x
    or a5, t3, t2
    and a5, a5, s6          # clear the low 16 bits -> bf16 word
finish_bf16:
    sw a5, 0(a2)
    # print the bf16 word (a7 = 34) followed by a newline (a7 = 4)
    mv a0, a5
    addi a7, x0, 34
    ecall
    la a0, next_line
    addi a7, x0, 4
    ecall
    slti t3, a3, 7          # t3 = 0 only for the first element (a3 == 7)
    and s8, a5, s7          # s8 = |bf16 word|
    bne t3, x0, compare
    # the first element always becomes the initial maximum
    j max_change
compare:
    # with the sign cleared, the word order matches the magnitude order
    blt s8, s0, max_not_change
max_change:
    mv s0, s8               # s0 = largest magnitude so far
    mv a4, a5               # a4 = maximum as a bf16 word
max_not_change:
    addi a3, a3, -1
    addi a1, a1, 4
    addi a2, a2, 4
    bne a3, x0, for1
    # absolute value of the maximum
    and a4, a4, s7
    # print "maximum number is " and the value
    la a0, max_string
    addi a7, x0, 4
    ecall
    mv a0, a4
    addi a7, x0, 34
    ecall
    # clear the running-maximum registers for the next data set
    and s0, x0, s0
    and s1, x0, s1
# scale
# Integer approximation of the quantization scale. Takes the integer part of
# the maximum (a4) and counts how many times it must be added before the
# running sum exceeds 127. The count is left in a6 and printed.
#
# Inputs : a4 = |maximum| as a bf16 word, t6 = 0x007FFFFF
# Outputs: a6 = integer scale
# Clobbers: a0, a7, t0, t1, t3, t4 (s2-s5 are saved and restored)
scale:
    addi sp, sp, -16
    sw s2, 0(sp)
    sw s3, 4(sp)
    sw s4, 8(sp)
    sw s5, 12(sp)
    li s2, 0x7F             # 127, the largest int8 magnitude
    li s3, 1                # becomes 2^e, the implicit leading 1
    and t0, a4, t6          # t0 = mantissa field of the maximum
    srli t0, t0, 16         # t0 = 7-bit bf16 fraction
    srli t1, a4, 23         # t1 = biased exponent
    addi t1, t1, -127       # t1 = e, the power of two
    # A maximum below 1.0 has no integer part: the shifts below would use a
    # negative count and produce a step of 0 or 0x80000000, so the counting
    # loop would never end. Use a step of 1 instead (scale saturates at 128).
    bge t1, x0, scale_int_part
    li t0, 1
    j scale_count
scale_int_part:
    li t4, 7                # the bf16 fraction has 7 bits
    sub t3, t4, t1
    srl t0, t0, t3          # fraction bits that belong to the integer part
    sll s3, s3, t1          # s3 = 2^e
    or t0, s3, t0           # t0 = integer part of the maximum
scale_count:
    li a6, 0                # a6 = scale counter
    li s4, 0                # s4 = running sum; was never initialised
scale_loop:
    add s4, s4, t0
    addi a6, a6, 1          # count scale
    bge s2, s4, scale_loop  # repeat while 127 >= sum
    lw s2, 0(sp)
    lw s3, 4(sp)
    lw s4, 8(sp)
    lw s5, 12(sp)
    addi sp, sp, 16
    # print "scale is " and the integer scale (a7 = 1)
    la a0, scale_num
    li a7, 4
    ecall
    mv a0, a6
    li a7, 1
    ecall
# int_to_fp
# Converts the positive integer scale in a6 to a float bit pattern with the
# same value, leaves it in a6, and prints it after the
# "transform to bf16 is:" label (a7 = 34).
# s2-s5 are saved on the stack and restored at the end.
int_to_fp:
addi sp, sp, -16
sw s2, 0(sp)
sw s3, 4(sp)
sw s4, 8(sp)
sw s5, 12(sp)
li t0, 0
mv s2, a6 # s2 = copy of the input; a6 is consumed by the loop
# count the right shifts needed to bring a6 down to zero
loop2:
srli a6, a6, 1
addi t0, t0, 1
blt x0, a6, loop2
###end loop2
addi t0, t0, -1 # t0 = count - 1; the loop counts one shift before testing
addi s3, t0, 127 # biased exponent = t0 + 127
# The bias is added to the corrected count (after the -1 above).
slli s3, s3, 23 # s3 = exponent field in position
li t1, 0xFFFFFFFF
li t2, 32
sub t3, t2, t0
srl t1, t1, t3 # t1 = mask covering the low t0 bits
# NOTE(review): when t0 is 0 the shift count is 32; RV32 srl uses only the low
# 5 bits, so the mask stays all ones. Confirm the input-1 case is intended.
and s4, s2, t1 # s4 = low t0 bits of the input (the fraction)
li t1, 23
sub t1, t1, t0 # t1 = 23 - t0
sll s4, s4, t1 # left-align the fraction in the 23-bit mantissa field
or s5, s4, s3 # s5 = exponent | fraction
mv a6, s5
la a0,transform_to_bf16_is
li a7,4
ecall
mv a0, a6
li a7, 34
ecall
lw s2, 0(sp)
lw s3, 4(sp)
lw s4, 8(sp)
lw s5, 12(sp)
addi sp, sp, 16
# Multi_bfloat
# Start of the for2 loop: multiplies each word of the bf16 array at s10 by the
# scale in a6. The result exponent is the sum of both exponents minus the
# bias, the sign is the XOR of both signs, and the 8-bit significands are
# multiplied by shift-and-add. The product is left in a4 and execution falls
# through to the code that follows. s2-s5 are saved here; this block does not
# restore them.
Multi_bfloat:
# The operands come from a6 and the array; the decoder call from the
# referenced implementation is not used.
addi sp, sp, -16
sw s2, 0(sp)
sw s3, 4(sp)
sw s4, 8(sp)
sw s5, 12(sp)
mv s5, s10 # s5 = cursor over the bf16 array
addi a3, x0, 7 # a3 = number of elements left
for2:
lw a4, 0(s5) # a4 = current bf16 element
add t0,a6,x0 # t0 = scale
add t1,a4,x0 # t1 = current element
li s2,0x7F800000 # exponent mask
# extract both exponent fields
and t3,t0,s2 # t3 = exponent field of t0
and t2,t1,s2 # t2 = exponent field of t1
add t3,t3,t2 # t3 = sum of both exponent fields
li s2,0x3F800000 # 127 << 23, the bias in field position
sub t3,t3,s2 # remove one bias from the sum
# result sign
xor t2,t0,t1 # bit 31 = XOR of both signs
srli t2,t2,31 # drop everything below the sign bit
slli t2,t2,31 # move the sign back to bit 31
# combine sign and exponent
or t3,t3,t2
# keep the low 23 bits of t0 and place the result sign/exponent above them
slli t0,t0,9
srli t0,t0,9
or t0,t3,t0
# extract the 7-bit fractions
li s2,0x7F # mask 0x7F
slli s2,s2,16 # s2 = 0x7F0000, the bf16 fraction bits
and t2,t0,s2 # t2 = fraction of the scale
and t3,t1,s2 # t3 = fraction of the element
slli t2,t2,9 # fraction moves to bits 25-31
srli t2,t2,1 # fraction moves to bits 24-30, bit 31 is free
li s2,0x80000000 # bit 31
or t2,t2,s2 # insert the implicit leading 1
srli t2,t2,1 # significand in bits 23-30, bit 31 left for overflow
slli t3,t3,8 # fraction moves to bits 24-30
or t3,t3,s2 # insert the implicit leading 1
srli t3,t3,1 # significand in bits 23-30, bit 31 left for overflow
add s3,x0,x0 # s3 = loop counter
addi s4,x0,8 # s4 = loop limit (8 significand bits)
add t1,x0,x0 # t1 = product accumulator
li s2,0x80000000 # bit mask, shifted right before each test
# shift-and-add over the 8 significand bits of t2, most significant first
loop:
addi s3,s3,1 # count this iteration
srli s2,s2,1 # select the next lower bit of t2
and t4,t2,s2 # test that bit
beq t4,x0,not_add # bit clear: skip the addition
add t1,t1,t3 # bit set: add the aligned partial product
not_add:
srli t3,t3,1 # shift the multiplicand right by one bit
bne s3,s4,loop # repeat until all 8 bits are done
# end of loop
# bit 31 set means the significand product reached 2.0
li s2,0x80000000
and t4,t1,s2 # t4 = bit 31 of the product
# bit 31 clear: no overflow
beq t4,x0,not_overflow
# overflow
slli t1,t1,1 # shift out the integer bit
li s2,0x800000 # one unit in the exponent field
add t0,t0,s2 # exponent + 1
j Mult_end # jump to Mult_end
# no overflow
not_overflow:
slli t1,t1,2 # shift out the zero bit and the integer bit
Mult_end:
srli t1,t1,24 # keep the top 8 bits: 7 fraction bits plus one rounding bit
addi t1,t1,1 # add 1 at the rounding bit
srli t1,t1,1 # drop the rounding bit
slli t1,t1,16 # move the fraction to bits 16-22
srli t0,t0,23 # clear the low 23 bits of t0 ...
slli t0,t0,23 # ... keeping sign and exponent in place
or t0,t0,t1 # combine sign/exponent with the fraction
add a4,t0,x0 # a4 = product (scaled element)
### end of function
# rm_decimal_of_bf16: drop the fractional part of the bfloat16 product so that
# only its integer part remains (truncation), leaving a signed integer in t1.
# Inputs : a4 = product in float bit layout (sign | exponent | fraction),
#          t1 = fraction field left in bits 22..16 by the multiply code above,
#          s7, t6 = masks loaded outside this section.
# Output : t1 = signed integer result; t0, t2, t3 are overwritten.
rm_decimal_of_bf16:
mv t0, a4
li t3, 0x80000000
and t3, t0, t3
srli t3, t3, 31 # t3 = sign bit (1 if negative, 0 otherwise)
and t0,t0,s7 # clear the sign bit (assumes s7 = 0x7FFFFFFF -- TODO confirm)
srli t0, t0, 23 # t0 = biased exponent field
addi t0, t0,-127 # remove the bias: t0 = power of 2 of the value
# presumably t6 is the 0x7F0000 fraction mask -- confirm where it is loaded
and t1, t1, t6
srli t1, t1, 16 # move the fraction field down to the low bits
li t2, 0x80 # 1000 0000: the implicit leading 1 of the mantissa
or t1, t1, t2 # t1 = 1.fffffff scaled by 2^7
li t2, 7
sub t2, t2, t0 # shift amount = 7 - exponent
# NOTE(review): on RV32, srl uses only the low 5 bits of t2, so an exponent
# below -24 (e.g. a zero input, exponent field 0) or above 7 wraps the shift
# amount -- confirm such values cannot reach this point.
srl t1, t1, t2 # t1 = integer part of the magnitude
li t2, 1
bne t3, t2, printINT8 # non-negative: print the magnitude as is
# Negate t1 without a dedicated instruction: t1 - 2*t1 = -t1.
Add_negative_sign:
add t2, t1, t1
sub t1, t1, t2
# printINT8: print the string at next_line followed by the integer in t1, then
# advance to the next element and repeat the for2 loop while a3 is non-zero.
# NOTE(review): the ecall numbers below follow the Ripes convention
# (4 = print string, 1 = print integer) -- confirm against the simulator in use.
printINT8:
la a0,next_line
li a7, 4
ecall
mv a0, t1
li a7, 1
ecall
# next data
addi s5, s5, 4 # step s5 by one 32-bit word (presumably the input pointer)
addi a3, a3, -1 # one fewer element left to process
bne a3, x0, for2
### end of function
# Restore s2-s5 from the stack and release the 16-byte frame
# (the matching saves are outside this section).
lw s2, 0(sp)
lw s3, 4(sp)
lw s4, 8(sp)
lw s5, 12(sp)
addi sp, sp, 16
# next_array: count down the outer loop in s11 and branch back to main_for
# until it reaches zero.
next_array:
addi s11, s11, -1
addi sp, sp, 4 # pop one more word (presumably pushed per array by main_for -- confirm)
and s8, x0, s8 # AND with x0 clears s8 to 0
bne s11, x0, main_for
# Exit: terminate the program (ecall 10 = exit in the Ripes convention).
Exit:
li a7, 10
ecall
data1
(ripes)
data2
(ripes)
data3
(ripes)
Compile the program with six different optimization levels (-O0, -O1, -O2, -O3, -Os, -Ofast) and analyze the results.
Reference
3.10 Options That Control Optimization
description in Chinese*
riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O0 lab2.c -o lab2.elf
riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O1 lab2.c -o lab2.elf
riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O2 lab2.c -o lab2.elf
riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O3 lab2.c -o lab2.elf
riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -Os lab2.c -o lab2.elf
riscv-none-elf-size lab2.elf
$ riscv-none-elf-readelf -h lab2.elf
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x100d8
Start of program headers: 52 (bytes into file)
Start of section headers: 99312 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
riscv-none-elf-objdump -d lab2.elf > disassembly/lab2.txt
./rv32emu lab2.elf
data 1
bfloat16 number is
1.203125000000
1.203125000000
2.312500000000
2.312500000000
3.453125000000
3.453125000000
5.625000000000
maximum number is 5.625000000000
after quantization
27
27
52
52
77
77
127
data 2
bfloat16 number is
0.100097656250
0.200195312500
1.203125000000
3.000000000000
2.093750000000
-4.187500000000
3.500000000000
maximum number is 4.187500000000
after quantization
3
6
36
90
63
-127
106
data 3
bfloat16 number is
3.140625000000
0.123535156250
1.234375000000
0.000001229346
0.000000010012
1.000000000000
0.000000069849
maximum number is 3.140625000000
after quantization
127
4
49
0
0
40
0
inferior exit code 0
$ make tool
~/rv32emu/tests/hw2$ ~/rv32emu/build/rv_histogram ./lab2.elf
~/rv32emu/tests/hw2$ ~/rv32emu/build/rv_histogram -r ./lab2.elf
Instructions Histogram
Registers Histogram
Due to the excessively verbose optimized assembly code, I have chosen to analyze the quant_bf16_to_int8
function, which is a crucial part of the code.
$ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O0 lab2.c -o lab2.elf
$ riscv-none-elf-size lab2.elf
~/rv32emu/tests/hw2$ riscv-none-elf-size lab2.elf
text data bss dec hex filename
81736 2320 1556 85612 14e6c lab2.elf
~/rv32emu/tests/hw2$ riscv-none-elf-readelf -h lab2.elf
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x100d8
Start of program headers: 52 (bytes into file)
Start of section headers: 99312 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
000106e8 <quant_bf16_to_int8>:
106e8: fd010113 add sp,sp,-48
106ec: 02112623 sw ra,44(sp)
106f0: 02812423 sw s0,40(sp)
106f4: 03010413 add s0,sp,48
106f8: fca42e23 sw a0,-36(s0)
106fc: fdc42783 lw a5,-36(s0)
10700: 0007a703 lw a4,0(a5)
10704: 800007b7 lui a5,0x80000
10708: fff78793 add a5,a5,-1 # 7fffffff <__BSS_END__+0x7ffdb0d3>
1070c: 00f777b3 and a5,a4,a5
10710: fef42623 sw a5,-20(s0)
10714: 00100793 li a5,1
10718: fef42423 sw a5,-24(s0)
1071c: 0680006f j 10784 <quant_bf16_to_int8+0x9c>
10720: fe842783 lw a5,-24(s0)
10724: 00279793 sll a5,a5,0x2
10728: fdc42703 lw a4,-36(s0)
1072c: 00f707b3 add a5,a4,a5
10730: 0007a703 lw a4,0(a5)
10734: 800007b7 lui a5,0x80000
10738: fff78793 add a5,a5,-1 # 7fffffff <__BSS_END__+0x7ffdb0d3>
1073c: 00f777b3 and a5,a4,a5
10740: 00078593 mv a1,a5
10744: fec42503 lw a0,-20(s0)
10748: 249000ef jal 11190 <__lesf2>
1074c: 00050793 mv a5,a0
10750: 0207d463 bgez a5,10778 <quant_bf16_to_int8+0x90>
10754: fe842783 lw a5,-24(s0)
10758: 00279793 sll a5,a5,0x2
1075c: fdc42703 lw a4,-36(s0)
10760: 00f707b3 add a5,a4,a5
10764: 0007a703 lw a4,0(a5)
10768: 800007b7 lui a5,0x80000
1076c: fff78793 add a5,a5,-1 # 7fffffff <__BSS_END__+0x7ffdb0d3>
10770: 00f777b3 and a5,a4,a5
10774: fef42623 sw a5,-20(s0)
10778: fe842783 lw a5,-24(s0)
1077c: 00178793 add a5,a5,1
10780: fef42423 sw a5,-24(s0)
10784: fe842703 lw a4,-24(s0)
10788: 00600793 li a5,6
1078c: f8e7dae3 bge a5,a4,10720 <quant_bf16_to_int8+0x38>
10790: fec42503 lw a0,-20(s0)
10794: 74d000ef jal 116e0 <__extendsfdf2>
10798: 00050713 mv a4,a0
1079c: 00058793 mv a5,a1
107a0: 00070613 mv a2,a4
107a4: 00078693 mv a3,a5
107a8: 000237b7 lui a5,0x23
107ac: 15078513 add a0,a5,336 # 23150 <__trunctfdf2+0x4ac>
107b0: 770010ef jal 11f20 <printf>
107b4: 000237b7 lui a5,0x23
107b8: fec42583 lw a1,-20(s0)
107bc: 1707a503 lw a0,368(a5) # 23170 <__trunctfdf2+0x4cc>
107c0: 560000ef jal 10d20 <__divsf3>
107c4: 00050793 mv a5,a0
107c8: fef42023 sw a5,-32(s0)
107cc: fe042223 sw zero,-28(s0)
107d0: 0540006f j 10824 <quant_bf16_to_int8+0x13c>
107d4: fe442783 lw a5,-28(s0)
107d8: 00279793 sll a5,a5,0x2
107dc: fdc42703 lw a4,-36(s0)
107e0: 00f707b3 add a5,a4,a5
107e4: 0007a783 lw a5,0(a5)
107e8: fe042583 lw a1,-32(s0)
107ec: 00078513 mv a0,a5
107f0: 271000ef jal 11260 <__mulsf3>
107f4: 00050793 mv a5,a0
107f8: 00078513 mv a0,a5
107fc: 671000ef jal 1166c <__fixsfsi>
10800: 00050693 mv a3,a0
10804: fa818713 add a4,gp,-88 # 24978 <after_quant.0>
10808: fe442783 lw a5,-28(s0)
1080c: 00279793 sll a5,a5,0x2
10810: 00f707b3 add a5,a4,a5
10814: 00d7a023 sw a3,0(a5)
10818: fe442783 lw a5,-28(s0)
1081c: 00178793 add a5,a5,1
10820: fef42223 sw a5,-28(s0)
10824: fe442703 lw a4,-28(s0)
10828: 00600793 li a5,6
1082c: fae7d4e3 bge a5,a4,107d4 <quant_bf16_to_int8+0xec>
10830: fa818793 add a5,gp,-88 # 24978 <after_quant.0>
10834: 00078513 mv a0,a5
10838: 02c12083 lw ra,44(sp)
1083c: 02812403 lw s0,40(sp)
10840: 03010113 add sp,sp,48
10844: 00008067 ret
Statistics
a0
a1
a2
a3
a4
a5
a7
a8
s0
$ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O1 lab2.c -o lab2_O1.elf
$ riscv-none-elf-size lab2_O1.elf
~/rv32emu/tests/hw2$ riscv-none-elf-size lab2_O1.elf
text data bss dec hex filename
81016 2328 1556 84900 14ba4 lab2_O1.elf
~/rv32emu/tests/hw2$ riscv-none-elf-readelf -h lab2_O1.elf
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x100d8
Start of program headers: 52 (bytes into file)
Start of section headers: 99320 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
000101d4 <quant_bf16_to_int8>:
101d4: fe010113 add sp,sp,-32
101d8: 00112e23 sw ra,28(sp)
101dc: 00812c23 sw s0,24(sp)
101e0: 00912a23 sw s1,20(sp)
101e4: 01212823 sw s2,16(sp)
101e8: 01312623 sw s3,12(sp)
101ec: 01412423 sw s4,8(sp)
101f0: 01512223 sw s5,4(sp)
101f4: 00052983 lw s3,0(a0) # ff800000 <__BSS_END__+0xff7db0cc>
101f8: 00199993 sll s3,s3,0x1
101fc: 0019d993 srl s3,s3,0x1
10200: 00450493 add s1,a0,4
10204: 00050913 mv s2,a0
10208: 01c50a13 add s4,a0,28
1020c: 80000ab7 lui s5,0x80000
10210: fffa8a93 add s5,s5,-1 # 7fffffff <__BSS_END__+0x7ffdb0cb>
10214: 00c0006f j 10220 <quant_bf16_to_int8+0x4c>
10218: 00448493 add s1,s1,4
1021c: 03448263 beq s1,s4,10240 <quant_bf16_to_int8+0x6c>
10220: 0004a403 lw s0,0(s1)
10224: 01547433 and s0,s0,s5
10228: 00098593 mv a1,s3
1022c: 00040513 mv a0,s0
10230: 4a1000ef jal 10ed0 <__gesf2>
10234: fea052e3 blez a0,10218 <quant_bf16_to_int8+0x44>
10238: 00040993 mv s3,s0
1023c: fddff06f j 10218 <quant_bf16_to_int8+0x44>
10240: 00098513 mv a0,s3
10244: 1d4010ef jal 11418 <__extendsfdf2>
10248: 00050613 mv a2,a0
1024c: 00058693 mv a3,a1
10250: 00023537 lui a0,0x23
10254: db850513 add a0,a0,-584 # 22db8 <__trunctfdf2+0x3dc>
10258: 201010ef jal 11c58 <printf>
1025c: 00098593 mv a1,s3
10260: f341a503 lw a0,-204(gp) # 24904 <__SDATA_BEGIN__+0x6c>
10264: 7fc000ef jal 10a60 <__divsf3>
10268: 00050493 mv s1,a0
1026c: fb018413 add s0,gp,-80 # 24980 <after_quant.0>
10270: 01c40993 add s3,s0,28
10274: 00092583 lw a1,0(s2)
10278: 00048513 mv a0,s1
1027c: 51d000ef jal 10f98 <__mulsf3>
10280: 124010ef jal 113a4 <__fixsfsi>
10284: 00a42023 sw a0,0(s0)
10288: 00490913 add s2,s2,4
1028c: 00440413 add s0,s0,4
10290: ff3412e3 bne s0,s3,10274 <quant_bf16_to_int8+0xa0>
10294: fb018513 add a0,gp,-80 # 24980 <after_quant.0>
10298: 01c12083 lw ra,28(sp)
1029c: 01812403 lw s0,24(sp)
102a0: 01412483 lw s1,20(sp)
102a4: 01012903 lw s2,16(sp)
102a8: 00c12983 lw s3,12(sp)
102ac: 00812a03 lw s4,8(sp)
102b0: 00412a83 lw s5,4(sp)
102b4: 02010113 add sp,sp,32
102b8: 00008067 ret
Statistics
a0
a1
a2
a3
a4
a5
a6
a8``a9``a10
s0
s1
s2
s3
s4
s5
$ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O2 lab2.c -o lab2_O2.elf
$ riscv-none-elf-size lab2_O2.elf
~/rv32emu/tests/hw2$ riscv-none-elf-size lab2_O2.elf
text data bss dec hex filename
81224 2328 1556 85108 14c74 lab2_O2.elf
~/rv32emu/tests/hw2$ riscv-none-elf-readelf -h lab2_O2.elf
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x10474
Start of program headers: 52 (bytes into file)
Start of section headers: 99336 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
00010574 <quant_bf16_to_int8>:
10574: fe010113 add sp,sp,-32
10578: 01312623 sw s3,12(sp)
1057c: 00052983 lw s3,0(a0) # ff800000 <__BSS_END__+0xff7db0cc>
10580: 01412423 sw s4,8(sp)
10584: 80000a37 lui s4,0x80000
10588: fffa0a13 add s4,s4,-1 # 7fffffff <__BSS_END__+0x7ffdb0cb>
1058c: 00812c23 sw s0,24(sp)
10590: 01212823 sw s2,16(sp)
10594: 01512223 sw s5,4(sp)
10598: 00112e23 sw ra,28(sp)
1059c: 00912a23 sw s1,20(sp)
105a0: 00050913 mv s2,a0
105a4: 0149f9b3 and s3,s3,s4
105a8: 00450413 add s0,a0,4
105ac: 01c50a93 add s5,a0,28
105b0: 00042483 lw s1,0(s0) # ffff0000 <__BSS_END__+0xfffcb0cc>
105b4: 00098513 mv a0,s3
105b8: 00440413 add s0,s0,4
105bc: 0144f4b3 and s1,s1,s4
105c0: 00048593 mv a1,s1
105c4: 1d5000ef jal 10f98 <__lesf2>
105c8: 00055463 bgez a0,105d0 <quant_bf16_to_int8+0x5c>
105cc: 00048993 mv s3,s1
105d0: ff5410e3 bne s0,s5,105b0 <quant_bf16_to_int8+0x3c>
105d4: 00098513 mv a0,s3
105d8: 711000ef jal 114e8 <__extendsfdf2>
105dc: 00050613 mv a2,a0
105e0: 00023537 lui a0,0x23
105e4: 00058693 mv a3,a1
105e8: e8850513 add a0,a0,-376 # 22e88 <__trunctfdf2+0x3dc>
105ec: 73c010ef jal 11d28 <printf>
105f0: f341a503 lw a0,-204(gp) # 24904 <__SDATA_BEGIN__+0x6c>
105f4: 00098593 mv a1,s3
105f8: 530000ef jal 10b28 <__divsf3>
105fc: fb018413 add s0,gp,-80 # 24980 <after_quant.0>
10600: 00050493 mv s1,a0
10604: 01c40993 add s3,s0,28
10608: 00092583 lw a1,0(s2)
1060c: 00048513 mv a0,s1
10610: 00440413 add s0,s0,4
10614: 255000ef jal 11068 <__mulsf3>
10618: 65d000ef jal 11474 <__fixsfsi>
1061c: fea42e23 sw a0,-4(s0)
10620: 00490913 add s2,s2,4
10624: ff3412e3 bne s0,s3,10608 <quant_bf16_to_int8+0x94>
10628: 01c12083 lw ra,28(sp)
1062c: 01812403 lw s0,24(sp)
10630: 01412483 lw s1,20(sp)
10634: 01012903 lw s2,16(sp)
10638: 00c12983 lw s3,12(sp)
1063c: 00412a83 lw s5,4(sp)
10640: fb018513 add a0,gp,-80 # 24980 <after_quant.0>
10644: 00812a03 lw s4,8(sp)
10648: 02010113 add sp,sp,32
1064c: 00008067 ret
Statistics
a0
a1
a2
a3
a4
a5
a8
a9
s0
s1
s2
s3
s4
s5
$ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O3 lab2.c -o lab2_O3.elf
$ riscv-none-elf-size lab2_O3.elf
~/rv32emu/tests/hw2$ riscv-none-elf-size lab2_O3.elf
text data bss dec hex filename
81212 2396 1556 85164 14cac lab2_O3.elf
~/rv32emu/tests/hw2$ riscv-none-elf-readelf -h lab2_O3.elf
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x102fc
Start of program headers: 52 (bytes into file)
Start of section headers: 99492 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
000103fc <quant_bf16_to_int8>:
103fc: fe010113 add sp,sp,-32
10400: 00912a23 sw s1,20(sp)
10404: 01212823 sw s2,16(sp)
10408: 00052483 lw s1,0(a0) # ff800000 <__BSS_END__+0xff7db088>
1040c: 00452903 lw s2,4(a0)
10410: 800007b7 lui a5,0x80000
10414: fff78793 add a5,a5,-1 # 7fffffff <__BSS_END__+0x7ffdb087>
10418: 00f4f4b3 and s1,s1,a5
1041c: 00f97933 and s2,s2,a5
10420: 00812c23 sw s0,24(sp)
10424: 00090593 mv a1,s2
10428: 00050413 mv s0,a0
1042c: 00048513 mv a0,s1
10430: 00112e23 sw ra,28(sp)
10434: 01312623 sw s3,12(sp)
10438: 3ad000ef jal 10fe4 <__lesf2>
1043c: 00055463 bgez a0,10444 <quant_bf16_to_int8+0x48>
10440: 00090493 mv s1,s2
10444: 00842903 lw s2,8(s0) # ffff0008 <__BSS_END__+0xfffcb090>
10448: 00048513 mv a0,s1
1044c: 00191913 sll s2,s2,0x1
10450: 00195913 srl s2,s2,0x1
10454: 00090593 mv a1,s2
10458: 38d000ef jal 10fe4 <__lesf2>
1045c: 00055463 bgez a0,10464 <quant_bf16_to_int8+0x68>
10460: 00090493 mv s1,s2
10464: 00c42903 lw s2,12(s0)
10468: 00048513 mv a0,s1
1046c: 00191913 sll s2,s2,0x1
10470: 00195913 srl s2,s2,0x1
10474: 00090593 mv a1,s2
10478: 36d000ef jal 10fe4 <__lesf2>
1047c: 00055463 bgez a0,10484 <quant_bf16_to_int8+0x88>
10480: 00090493 mv s1,s2
10484: 01042903 lw s2,16(s0)
10488: 00048513 mv a0,s1
1048c: 00191913 sll s2,s2,0x1
10490: 00195913 srl s2,s2,0x1
10494: 00090593 mv a1,s2
10498: 34d000ef jal 10fe4 <__lesf2>
1049c: 00055463 bgez a0,104a4 <quant_bf16_to_int8+0xa8>
104a0: 00090493 mv s1,s2
104a4: 01442903 lw s2,20(s0)
104a8: 00048513 mv a0,s1
104ac: 00191913 sll s2,s2,0x1
104b0: 00195913 srl s2,s2,0x1
104b4: 00090593 mv a1,s2
104b8: 32d000ef jal 10fe4 <__lesf2>
104bc: 00055463 bgez a0,104c4 <quant_bf16_to_int8+0xc8>
104c0: 00090493 mv s1,s2
104c4: 01842903 lw s2,24(s0)
104c8: 00048593 mv a1,s1
104cc: 00191913 sll s2,s2,0x1
104d0: 00195913 srl s2,s2,0x1
104d4: 00090513 mv a0,s2
104d8: 245000ef jal 10f1c <__gesf2>
104dc: 00a05463 blez a0,104e4 <quant_bf16_to_int8+0xe8>
104e0: 00090493 mv s1,s2
104e4: 00048513 mv a0,s1
104e8: 04c010ef jal 11534 <__extendsfdf2>
104ec: 00050613 mv a2,a0
104f0: 00023537 lui a0,0x23
104f4: 00058693 mv a3,a1
104f8: ed850513 add a0,a0,-296 # 22ed8 <__trunctfdf2+0x3e0>
104fc: 079010ef jal 11d74 <printf>
10500: f341a503 lw a0,-204(gp) # 24904 <__SDATA_BEGIN__+0x6c>
10504: 00048593 mv a1,s1
10508: 5a4000ef jal 10aac <__divsf3>
1050c: 00042583 lw a1,0(s0)
10510: 00050493 mv s1,a0
10514: ff418913 add s2,gp,-12 # 249c4 <after_quant.0>
10518: 39d000ef jal 110b4 <__mulsf3>
1051c: 7a5000ef jal 114c0 <__fixsfsi>
10520: 00442583 lw a1,4(s0)
10524: 00050793 mv a5,a0
10528: 00f92023 sw a5,0(s2)
1052c: 00048513 mv a0,s1
10530: 385000ef jal 110b4 <__mulsf3>
10534: 78d000ef jal 114c0 <__fixsfsi>
10538: 00842583 lw a1,8(s0)
1053c: 00050793 mv a5,a0
10540: 00f92223 sw a5,4(s2)
10544: 00048513 mv a0,s1
10548: 36d000ef jal 110b4 <__mulsf3>
1054c: 775000ef jal 114c0 <__fixsfsi>
10550: 00c42583 lw a1,12(s0)
10554: 00050793 mv a5,a0
10558: 00f92423 sw a5,8(s2)
1055c: 00048513 mv a0,s1
10560: 355000ef jal 110b4 <__mulsf3>
10564: 75d000ef jal 114c0 <__fixsfsi>
10568: 01042583 lw a1,16(s0)
1056c: 00050793 mv a5,a0
10570: 00f92623 sw a5,12(s2)
10574: 00048513 mv a0,s1
10578: 33d000ef jal 110b4 <__mulsf3>
1057c: 745000ef jal 114c0 <__fixsfsi>
10580: 01442583 lw a1,20(s0)
10584: 00050793 mv a5,a0
10588: 00f92823 sw a5,16(s2)
1058c: 00048513 mv a0,s1
10590: 325000ef jal 110b4 <__mulsf3>
10594: 72d000ef jal 114c0 <__fixsfsi>
10598: 01842583 lw a1,24(s0)
1059c: 00050793 mv a5,a0
105a0: 00f92a23 sw a5,20(s2)
105a4: 00048513 mv a0,s1
105a8: 30d000ef jal 110b4 <__mulsf3>
105ac: 715000ef jal 114c0 <__fixsfsi>
105b0: 01c12083 lw ra,28(sp)
105b4: 01812403 lw s0,24(sp)
105b8: 00a92c23 sw a0,24(s2)
105bc: 01412483 lw s1,20(sp)
105c0: 01012903 lw s2,16(sp)
105c4: ff418513 add a0,gp,-12 # 249c4 <after_quant.0>
105c8: 00c12983 lw s3,12(sp)
105cc: 02010113 add sp,sp,32
105d0: 00008067 ret
Statistics
a0
a1
a2
a3
a4
a5
a8
a9
s0
s1
s2
s3
$ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -Os lab2.c -o lab2_Os.elf
$ riscv-none-elf-size lab2_Os.elf
~/rv32emu/tests/hw2$ riscv-none-elf-size lab2_Os.elf
text data bss dec hex filename
80900 2328 1556 84784 14b30 lab2_Os.elf
~/rv32emu/tests/hw2$ riscv-none-elf-readelf -h lab2_Os.elf
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x10320
Start of program headers: 52 (bytes into file)
Start of section headers: 99336 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
00010428 <quant_bf16_to_int8>:
10428: fe010113 add sp,sp,-32
1042c: 00912a23 sw s1,20(sp)
10430: 00052483 lw s1,0(a0)
10434: 01312623 sw s3,12(sp)
10438: 800009b7 lui s3,0x80000
1043c: fff98993 add s3,s3,-1 # 7fffffff <__BSS_END__+0x7ffdb0cb>
10440: 00812c23 sw s0,24(sp)
10444: 01212823 sw s2,16(sp)
10448: 01512223 sw s5,4(sp)
1044c: 00112e23 sw ra,28(sp)
10450: 01412423 sw s4,8(sp)
10454: 01612023 sw s6,0(sp)
10458: 00050413 mv s0,a0
1045c: 0134f4b3 and s1,s1,s3
10460: 00450913 add s2,a0,4
10464: 01c50a93 add s5,a0,28
10468: 00092a03 lw s4,0(s2)
1046c: 00048593 mv a1,s1
10470: 013a7a33 and s4,s4,s3
10474: 000a0513 mv a0,s4
10478: 1e5000ef jal 10e5c <__gesf2>
1047c: 00a05463 blez a0,10484 <quant_bf16_to_int8+0x5c>
10480: 000a0493 mv s1,s4
10484: 00490913 add s2,s2,4
10488: ff5910e3 bne s2,s5,10468 <quant_bf16_to_int8+0x40>
1048c: 00048513 mv a0,s1
10490: 715000ef jal 113a4 <__extendsfdf2>
10494: 00050613 mv a2,a0
10498: 00023537 lui a0,0x23
1049c: 00058693 mv a3,a1
104a0: d4850513 add a0,a0,-696 # 22d48 <__trunctfdf2+0x3e0>
104a4: 740010ef jal 11be4 <printf>
104a8: f341a503 lw a0,-204(gp) # 24904 <__SDATA_BEGIN__+0x6c>
104ac: 00048593 mv a1,s1
104b0: 53c000ef jal 109ec <__divsf3>
104b4: 00050913 mv s2,a0
104b8: 00000493 li s1,0
104bc: fb018b13 add s6,gp,-80 # 24980 <after_quant.0>
104c0: 01c00a13 li s4,28
104c4: 009407b3 add a5,s0,s1
104c8: 0007a583 lw a1,0(a5) # ff800000 <__BSS_END__+0xff7db0cc>
104cc: 00090513 mv a0,s2
104d0: 009b0ab3 add s5,s6,s1
104d4: 251000ef jal 10f24 <__mulsf3>
104d8: 659000ef jal 11330 <__fixsfsi>
104dc: 00aaa023 sw a0,0(s5)
104e0: 00448493 add s1,s1,4
104e4: ff4490e3 bne s1,s4,104c4 <quant_bf16_to_int8+0x9c>
104e8: 01c12083 lw ra,28(sp)
104ec: 01812403 lw s0,24(sp)
104f0: 01412483 lw s1,20(sp)
104f4: 01012903 lw s2,16(sp)
104f8: 00812a03 lw s4,8(sp)
104fc: 00412a83 lw s5,4(sp)
10500: 00012b03 lw s6,0(sp)
10504: fb018513 add a0,gp,-80 # 24980 <after_quant.0>
10508: 00c12983 lw s3,12(sp)
1050c: 02010113 add sp,sp,32
10510: 00008067 ret
Statistics
a0
a1
a2
a3
a4
a5
a7
a8
a9
s0
s1
s2
s3
s4
s5
s6
$ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -Ofast lab2.c -o lab2_Ofast.elf
$ riscv-none-elf-size lab2_Ofast.elf
~/rv32emu/tests/hw2$ riscv-none-elf-size lab2_Ofast.elf
text data bss dec hex filename
81004 2396 1556 84956 14bdc lab2_Ofast.elf
~/rv32emu/tests/hw2$ riscv-none-elf-readelf -h lab2_Ofast.elf
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x102fc
Start of program headers: 52 (bytes into file)
Start of section headers: 99404 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
000103fc <quant_bf16_to_int8>:
103fc: fe010113 add sp,sp,-32
10400: 00812c23 sw s0,24(sp)
10404: 01212823 sw s2,16(sp)
10408: 00452403 lw s0,4(a0) # ff800004 <__BSS_END__+0xff7db08c>
1040c: 00852903 lw s2,8(a0)
10410: 800007b7 lui a5,0x80000
10414: fff78793 add a5,a5,-1 # 7fffffff <__BSS_END__+0x7ffdb087>
10418: 00f97933 and s2,s2,a5
1041c: 00f47433 and s0,s0,a5
10420: 00912a23 sw s1,20(sp)
10424: 00090593 mv a1,s2
10428: 00050493 mv s1,a0
1042c: 00040513 mv a0,s0
10430: 00112e23 sw ra,28(sp)
10434: 01312623 sw s3,12(sp)
10438: 2e5000ef jal 10f1c <__gesf2>
1043c: 00055463 bgez a0,10444 <quant_bf16_to_int8+0x48>
10440: 00090413 mv s0,s2
10444: 0004a903 lw s2,0(s1)
10448: 00040513 mv a0,s0
1044c: 00191913 sll s2,s2,0x1
10450: 00195913 srl s2,s2,0x1
10454: 00090593 mv a1,s2
10458: 2c5000ef jal 10f1c <__gesf2>
1045c: 00055463 bgez a0,10464 <quant_bf16_to_int8+0x68>
10460: 00090413 mv s0,s2
10464: 00c4a903 lw s2,12(s1)
10468: 00040513 mv a0,s0
1046c: 00191913 sll s2,s2,0x1
10470: 00195913 srl s2,s2,0x1
10474: 00090593 mv a1,s2
10478: 2a5000ef jal 10f1c <__gesf2>
1047c: 00055463 bgez a0,10484 <quant_bf16_to_int8+0x88>
10480: 00090413 mv s0,s2
10484: 0104a903 lw s2,16(s1)
10488: 00040513 mv a0,s0
1048c: 00191913 sll s2,s2,0x1
10490: 00195913 srl s2,s2,0x1
10494: 00090593 mv a1,s2
10498: 285000ef jal 10f1c <__gesf2>
1049c: 00055463 bgez a0,104a4 <quant_bf16_to_int8+0xa8>
104a0: 00090413 mv s0,s2
104a4: 0144a903 lw s2,20(s1)
104a8: 00040513 mv a0,s0
104ac: 00191913 sll s2,s2,0x1
104b0: 00195913 srl s2,s2,0x1
104b4: 00090593 mv a1,s2
104b8: 265000ef jal 10f1c <__gesf2>
104bc: 00055463 bgez a0,104c4 <quant_bf16_to_int8+0xc8>
104c0: 00090413 mv s0,s2
104c4: 0184a903 lw s2,24(s1)
104c8: 00040513 mv a0,s0
104cc: 00191913 sll s2,s2,0x1
104d0: 00195913 srl s2,s2,0x1
104d4: 00090593 mv a1,s2
104d8: 245000ef jal 10f1c <__gesf2>
104dc: 00055463 bgez a0,104e4 <quant_bf16_to_int8+0xe8>
104e0: 00090413 mv s0,s2
104e4: 00040513 mv a0,s0
104e8: 77d000ef jal 11464 <__extendsfdf2>
104ec: 00050613 mv a2,a0
104f0: 00023537 lui a0,0x23
104f4: 00058693 mv a3,a1
104f8: e0850513 add a0,a0,-504 # 22e08 <__trunctfdf2+0x3e0>
104fc: 7a8010ef jal 11ca4 <printf>
10500: f341a503 lw a0,-204(gp) # 24904 <__SDATA_BEGIN__+0x6c>
10504: 00040593 mv a1,s0
10508: 5a4000ef jal 10aac <__divsf3>
1050c: 0004a583 lw a1,0(s1)
10510: 00050413 mv s0,a0
10514: ff418913 add s2,gp,-12 # 249c4 <after_quant.0>
10518: 2cd000ef jal 10fe4 <__mulsf3>
1051c: 6d5000ef jal 113f0 <__fixsfsi>
10520: 0044a583 lw a1,4(s1)
10524: 00050793 mv a5,a0
10528: 00f92023 sw a5,0(s2)
1052c: 00040513 mv a0,s0
10530: 2b5000ef jal 10fe4 <__mulsf3>
10534: 6bd000ef jal 113f0 <__fixsfsi>
10538: 0084a583 lw a1,8(s1)
1053c: 00050793 mv a5,a0
10540: 00f92223 sw a5,4(s2)
10544: 00040513 mv a0,s0
10548: 29d000ef jal 10fe4 <__mulsf3>
1054c: 6a5000ef jal 113f0 <__fixsfsi>
10550: 00c4a583 lw a1,12(s1)
10554: 00050793 mv a5,a0
10558: 00f92423 sw a5,8(s2)
1055c: 00040513 mv a0,s0
10560: 285000ef jal 10fe4 <__mulsf3>
10564: 68d000ef jal 113f0 <__fixsfsi>
10568: 0104a583 lw a1,16(s1)
1056c: 00050793 mv a5,a0
10570: 00f92623 sw a5,12(s2)
10574: 00040513 mv a0,s0
10578: 26d000ef jal 10fe4 <__mulsf3>
1057c: 675000ef jal 113f0 <__fixsfsi>
10580: 0144a583 lw a1,20(s1)
10584: 00050793 mv a5,a0
10588: 00f92823 sw a5,16(s2)
1058c: 00040513 mv a0,s0
10590: 255000ef jal 10fe4 <__mulsf3>
10594: 65d000ef jal 113f0 <__fixsfsi>
10598: 0184a583 lw a1,24(s1)
1059c: 00050793 mv a5,a0
105a0: 00f92a23 sw a5,20(s2)
105a4: 00040513 mv a0,s0
105a8: 23d000ef jal 10fe4 <__mulsf3>
105ac: 645000ef jal 113f0 <__fixsfsi>
105b0: 01c12083 lw ra,28(sp)
105b4: 01812403 lw s0,24(sp)
105b8: 00a92c23 sw a0,24(s2)
105bc: 01412483 lw s1,20(sp)
105c0: 01012903 lw s2,16(sp)
105c4: ff418513 add a0,gp,-12 # 249c4 <after_quant.0>
105c8: 00c12983 lw s3,12(sp)
105cc: 02010113 add sp,sp,32
105d0: 00008067 ret
Statistics
a0
a1
a2
a3
a4
a5
a7
a8
s0
s1
s2
s3
Level | text | data | bss | dec | hex | filename |
---|---|---|---|---|---|---|
O0 | 81736 | 2320 | 1556 | 85612 | 14e6c | lab2.elf |
O1 | 81016 | 2328 | 1556 | 84900 | 14ba4 | lab2_O1.elf |
O2 | 81224 | 2328 | 1556 | 85108 | 14c74 | lab2_O2.elf |
O3 | 81212 | 2396 | 1556 | 85164 | 14cac | lab2_O3.elf |
Os | 80900 | 2328 | 1556 | 84784 | 14b30 | lab2_Os.elf |
Ofast | 81004 | 2396 | 1556 | 84956 | 14bdc | lab2_Ofast.elf |
From O0 to O1
The most noticeable difference is in the use of the lw and sw instructions. O0 uses them almost twice as often as O1, because O0 keeps every local variable on the stack and reloads it before each use, whereas O1 keeps these values in registers; as a result, the function executes faster at the O1 optimization level.
Additionally, we observe that the O0 code is almost a direct translation of the C source, which leads to longer instruction sequences in certain cases.
From O1 to O2
It can be noticed that the optimization from O1 to O2 is not very significant.
From O1 to O3
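The optimization from O1 to O3 reduces the number of registers used, but because both loops are fully unrolled it increases the static count of jump and branch instructions, as well as of lw and sw instructions. This makes the function's code significantly longer, although the elapsed-cycle measurements below show that the O3 build still executes in slightly fewer cycles than the O1 build.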
From O3 to Ofast
O3 and Ofast optimizations are nearly identical, indicating that the compiler optimization has reached its limit.
# Build source.c once per optimization level with the bare-metal RISC-V GCC.
#   make            -> source_O0.elf ... source_Ofast.elf
#   make clean      -> remove every generated ELF
.PHONY: all clean

CC = riscv-none-elf-gcc

# Target ISA/ABI flags. These used to be stored in ASFLAGS while the recipes
# expanded the (undefined) LDFLAGS, so they were never passed to the compiler.
CFLAGS = -march=rv32i -mabi=ilp32

OPT_LEVELS = O0 O1 O2 O3 Os Ofast

all: $(addprefix source_,$(addsuffix .elf,$(OPT_LEVELS)))

# $* is the pattern stem (O0, O1, ...), which becomes the -O<level> option.
# Listing source.c as a prerequisite makes the ELFs rebuild when it changes.
# LDFLAGS is still honoured so that existing command-line overrides keep working.
source_%.elf: source.c
	$(CC) $(CFLAGS) $(LDFLAGS) -$* source.c -o $@

# -f keeps `make clean` from failing when nothing has been built yet.
clean:
	rm -f *.elf
Level | O0 | O1 | O2 | O3 | Os | Ofast | Assembly |
---|---|---|---|---|---|---|---|
elapsed cycle | 195232 | 193080 | 192836 | 188939 | 193367 | 188938 | 5227 :+1: |
or
By clicking below, you agree to our terms of service.
New to HackMD? Sign up