Assignment2: RISC-V Toolchain

Contributed by <jeremy90307>

Prepare GNU Toolchain for RISC-V

Lab2: RISC-V RV32I[MA] emulator with ELF support

Question Selection

Question

I chose the question by student KuanYuan053: "Implement quantization from bfloat16 to int8"

C code

Since student KuanYuan053 has already optimized the C code significantly, I won't waste time modifying it. I'll proceed directly to the RISC-V part.

#include <stdio.h>
#include <stdlib.h>
#include<math.h>

# define array_size 7
# define range 127 /*2^(n-1)-1, n: quant bit*/ 

float fp32_to_bf16(float x);
int* quant_bf16_to_int8(float x[]);
float bf16_findmax(float x[]);

int main()
{
	float array[array_size] = {1.200000, 1.203125, 2.310000, 2.312500, 3.460000, 3.4531255, 5.630000};
	float array2[array_size] = { 0.1, 0.2, 1.2, 3, 2.1, -4.2, 3.5};
	float array3[array_size] = { 3.14159265, 0.12345678 , 1.23456789 , 0.00000123, 0.00000001, 0.99999999 , 0.00000007 };
	float array_bf16[array_size] = {};
	int *after_quant;
	/*data 1*/
	for (int i = 0; i < 7; i++) {
		array_bf16[i] = fp32_to_bf16(array[i]);
	}
	printf("data 1\nbfloat16 number is \n");
	for (int i = 0; i < array_size; i++) {
		printf("%.12f\n", array_bf16[i]);
	}
	after_quant = quant_bf16_to_int8(array_bf16);
	printf("after quantization \n");
	for (int i = 0; i < array_size; i++) {
		printf("%d\n", after_quant[i]);
	}
	/*data 2*/
	for (int i = 0; i < 7; i++) {
		array_bf16[i] = fp32_to_bf16(array2[i]);
	}
	printf("data 2\nbfloat16 number is \n");
	for (int i = 0; i < array_size; i++) {
		printf("%.12f\n", array_bf16[i]);
	}
	after_quant = quant_bf16_to_int8(array_bf16);
	printf("after quantization \n");
	for (int i = 0; i < array_size; i++) {
		printf("%d\n", after_quant[i]);
	}
	/*data 3*/
	for (int i = 0; i < 7; i++) {
		array_bf16[i] = fp32_to_bf16(array3[i]);
	}
	printf("data 3\nbfloat16 number is \n");
	for (int i = 0; i < array_size; i++) {
		printf("%.12f\n", array_bf16[i]);
	}
	after_quant = quant_bf16_to_int8(array_bf16);
	printf("after quantization \n");
	for (int i = 0; i < array_size; i++) {
		printf("%d\n", after_quant[i]);
	}
	system("pause");
	return 0;
}

/*
 * Round an fp32 value to bfloat16 precision.
 *
 * The result is still returned as a float whose low 16 bits are zero.
 * Zero, infinity and NaN are returned unchanged. Every other value is
 * rounded by adding 2^e / 256 (e = exponent of x) and truncating the low
 * 16 bits.
 *
 * Assumes float is IEEE-754 binary32 and unsigned int is 32 bits wide,
 * as the original code did.
 */
float fp32_to_bf16(float x)
{
	/*
	 * Bits are inspected through a union: the original (int *)&y cast
	 * violated the strict-aliasing rule, which is undefined behaviour.
	 */
	union {
		float f;
		unsigned int u;
	} y = { .f = x }, r = { .f = x };

	unsigned int exp = y.u & 0x7F800000u;
	unsigned int man = y.u & 0x007FFFFFu;

	if (exp == 0 && man == 0) /* zero */
		return x;
	if (exp == 0x7F800000u) /* infinity or NaN */
		return x;

	/* round to nearest */
	r.u &= 0xFF800000u; /* keep sign and exponent: r = +/-2^e */
	r.f /= 0x100;       /* half of the bf16 unit in the last place */
	y.f = x + r.f;

	y.u &= 0xFFFF0000u; /* drop the 16 bits bfloat16 cannot hold */

	return y.f;
}

/*
 * Quantize array_size bfloat16 values (stored as float) to the int8 range.
 *
 * Each element is multiplied by scale = range / max(|x[i]|) and truncated
 * toward zero by the float -> int conversion. The maximum is also printed.
 *
 * Returns a pointer to a static buffer of array_size ints; the contents are
 * overwritten by the next call. Inputs are assumed to be finite.
 */
int* quant_bf16_to_int8(float x[array_size])
{
	static int after_quant[array_size] = {0};
	float max = fabsf(x[0]);
	for (int i = 1; i < array_size; i++) {
		float magnitude = fabsf(x[i]);
		if (magnitude > max) {
			max = magnitude;
		}
	}
	printf("maximum number is %.12f\n", max);
	if (max == 0.0f) {
		/*
		 * All inputs are zero: range / max would be infinite and
		 * 0 * inf is NaN, whose conversion to int is undefined.
		 */
		for (int i = 0; i < array_size; i++) {
			after_quant[i] = 0;
		}
		return after_quant;
	}
	float scale = range / max;
	for (int i = 0; i < array_size; i++) {
		after_quant[i] = (int)(x[i] * scale);
	}
	return after_quant;
}

add ticks.c

#include <stdio.h>
#include <stdlib.h>
#include<math.h>
#include <inttypes.h>

# define array_size 7
# define range 127 /*2^(n-1)-1, n: quant bit*/ 

float fp32_to_bf16(float x);
int* quant_bf16_to_int8(float x[]);
float bf16_findmax(float x[]);
typedef uint64_t ticks;

/*
 * Read the 64-bit cycle counter on RV32.
 *
 * The high word is read before and after the low word. The asm turns the
 * first high word (h) into a mask: all ones when both high reads agree,
 * zero otherwise. The low word is ANDed with that mask, so it is zeroed
 * when the high word changed between the two reads.
 *
 * Returns (h2 << 32) | l. The original used h here, but by that point h
 * holds the mask rather than the counter's high word.
 */
static inline ticks getticks(void)
{
    uint64_t result;
    uint32_t l, h, h2;
    asm volatile(
        "rdcycleh %0\n"         /* h  = high word, first read */
        "rdcycle %1\n"          /* l  = low word */
        "rdcycleh %2\n"         /* h2 = high word, second read */
        "sub %0, %0, %2\n"      /* h  = h - h2 (0 when unchanged) */
        "seqz %0, %0\n"         /* h  = (h == 0) */
        "sub %0, zero, %0\n"    /* h  = all ones or zero */
        "and %1, %1, %0\n"      /* l  = l & mask */
        : "=r"(h), "=r"(l), "=r"(h2));
    result = (((uint64_t) h2) << 32) | ((uint64_t) l);
    return result;
}
int main()
{
	ticks t0 = getticks();
	float array[array_size] = {1.200000, 1.203125, 2.310000, 2.312500, 3.460000, 3.4531255, 5.630000};
	float array2[array_size] = { 0.1, 0.2, 1.2, 3, 2.1, -4.2, 3.5};
	float array3[array_size] = { 3.14159265, 0.12345678 , 1.23456789 , 0.00000123, 0.00000001, 0.99999999 , 0.00000007 };
	float array_bf16[array_size] = {};
	int *after_quant;
	/*data 1*/
	for (int i = 0; i < 7; i++) {
		array_bf16[i] = fp32_to_bf16(array[i]);
	}
	printf("data 1\nbfloat16 number is \n");
	for (int i = 0; i < array_size; i++) {
		printf("%.12f\n", array_bf16[i]);
	}
	after_quant = quant_bf16_to_int8(array_bf16);
	printf("after quantization \n");
	for (int i = 0; i < array_size; i++) {
		printf("%d\n", after_quant[i]);
	}
	/*data 2*/
	for (int i = 0; i < 7; i++) {
		array_bf16[i] = fp32_to_bf16(array2[i]);
	}
	printf("data 2\nbfloat16 number is \n");
	for (int i = 0; i < array_size; i++) {
		printf("%.12f\n", array_bf16[i]);
	}
	after_quant = quant_bf16_to_int8(array_bf16);
	printf("after quantization \n");
	for (int i = 0; i < array_size; i++) {
		printf("%d\n", after_quant[i]);
	}
	/*data 3*/
	for (int i = 0; i < 7; i++) {
		array_bf16[i] = fp32_to_bf16(array3[i]);
	}
	printf("data 3\nbfloat16 number is \n");
	for (int i = 0; i < array_size; i++) {
		printf("%.12f\n", array_bf16[i]);
	}
	after_quant = quant_bf16_to_int8(array_bf16);
	printf("after quantization \n");
	for (int i = 0; i < array_size; i++) {
		printf("%d\n", after_quant[i]);
	}
	ticks t1 = getticks();
    printf("elapsed cycle: %" PRIu64 "\n", t1 - t0);
	system("pause");
	return 0;
}

/*
 * Round an fp32 value to bfloat16 precision.
 *
 * The result is still returned as a float whose low 16 bits are zero.
 * Zero, infinity and NaN are returned unchanged. Every other value is
 * rounded by adding 2^e / 256 (e = exponent of x) and truncating the low
 * 16 bits.
 *
 * Assumes float is IEEE-754 binary32 and unsigned int is 32 bits wide,
 * as the original code did.
 */
float fp32_to_bf16(float x)
{
	/*
	 * Bits are inspected through a union: the original (int *)&y cast
	 * violated the strict-aliasing rule, which is undefined behaviour.
	 */
	union {
		float f;
		unsigned int u;
	} y = { .f = x }, r = { .f = x };

	unsigned int exp = y.u & 0x7F800000u;
	unsigned int man = y.u & 0x007FFFFFu;

	if (exp == 0 && man == 0) /* zero */
		return x;
	if (exp == 0x7F800000u) /* infinity or NaN */
		return x;

	/* round to nearest */
	r.u &= 0xFF800000u; /* keep sign and exponent: r = +/-2^e */
	r.f /= 0x100;       /* half of the bf16 unit in the last place */
	y.f = x + r.f;

	y.u &= 0xFFFF0000u; /* drop the 16 bits bfloat16 cannot hold */

	return y.f;
}

/*
 * Quantize array_size bfloat16 values (stored as float) to the int8 range.
 *
 * Each element is multiplied by scale = range / max(|x[i]|) and truncated
 * toward zero by the float -> int conversion. The maximum is also printed.
 *
 * Returns a pointer to a static buffer of array_size ints; the contents are
 * overwritten by the next call. Inputs are assumed to be finite.
 */
int* quant_bf16_to_int8(float x[array_size])
{
	static int after_quant[array_size] = {0};
	float max = fabsf(x[0]);
	for (int i = 1; i < array_size; i++) {
		float magnitude = fabsf(x[i]);
		if (magnitude > max) {
			max = magnitude;
		}
	}
	printf("maximum number is %.12f\n", max);
	if (max == 0.0f) {
		/*
		 * All inputs are zero: range / max would be infinite and
		 * 0 * inf is NaN, whose conversion to int is undefined.
		 */
		for (int i = 0; i < array_size; i++) {
			after_quant[i] = 0;
		}
		return after_quant;
	}
	float scale = range / max;
	for (int i = 0; i < array_size; i++) {
		after_quant[i] = (int)(x[i] * scale);
	}
	return after_quant;
}

original assembly

.data
array: .word 0x3f99999a, 0x3f9a0000, 0x4013d70a, 0x40140000, 0x405d70a4, 0x405d0000, 0x40b428f6
# test data1: 1.200000, 1.203125, 2.310000, 2.312500, 3.460000, 3.4531255, 5.630000

array2: .word 0x3dcccccd, 0x3e4ccccd, 0x3f99999a, 0x40400000, 0x40066666, 0xc0866666, 0x40600000
# test data2: 0.1, 0.2, 1.2, 3, 2.1, -4.2, 3.5

array3: .word 0x40490fdb, 0x3dfcd6e9, 0x3f9e0652, 0x35a5167a, 0x322bcc77, 0x3f800000, 0x339652e8
# test data3: 3.14159265, 0.12345678 , 1.23456789 , 0.00000123, 0.00000001, 0.99999999 , 0.00000007

array_bf16: .word 0, 0, 0, 0, 0, 0, 0

exp_mask: .word 0x7F800000
man_mask: .word 0x007FFFFF
sign_exp_mask: .word 0xFF800000
bf16_mask: .word 0xFFFF0000

next_line: .string "\n"
max_string: .string "maximum number is "
bf16_string: .string "\nbfloat16 number is \n"

.text
main:
        # Entry point: store the addresses of the three test arrays on the
        # stack, load the bit masks into saved registers, then call
        # fp32_to_bf16_findmax once per array.
        # push data    
        addi sp, sp, -12
        la t0, array
        sw t0, 0(sp)
        la t0, array2
        sw t0, 4(sp)
        la t0, array3
        sw t0, 8(sp)
        la s10, array_bf16  # global array_bf16 address(s10)        
        addi s11, x0, 3  # data number(s11): remaining data sets
        la s9, exp_mask  # global exp(s9)
        la s8, man_mask  # global man(s8)
        la s6, bf16_mask  # global bf16(s6)
        lw s9, 0(s9)  # s9 = 0x7F800000
        lw s8, 0(s8)  # s8 = 0x007FFFFF
        lw s6, 0(s6)  # s6 = 0xFFFF0000
        add s7, x0, sp  # s7 walks the table of array addresses on the stack
main_for:
        # print the "bfloat16 number is" banner (ecall 4 with a string address)
        la a0, bf16_string
        addi a7, x0, 4
        ecall         
        addi a3, x0, 7  # array size(a3)
        lw a1, 0(s7)  # array_data pointer(a1)
        mv a2, s10  # array_bf16 pointer(a2)
        jal ra, fp32_to_bf16_findmax
               
        addi s11, s11, -1
        addi s7, s7, 4  # advance to the next array address
        bne s11, x0, main_for
        # Exit program
        # NOTE(review): the 12 bytes pushed above are not popped before exiting.
        li a7, 10
        ecall 
        
fp32_to_bf16_findmax:
# Convert an fp32 array to bfloat16 and find the element with the largest
# magnitude.
# In : array_data pointer(a1), array_bf16 pointer(a2), array size(a3)
#      s9 = exponent mask, s8 = mantissa mask, s6 = bf16 mask (loaded by main)
# Out: a4 = absolute value of the maximum element; a1/a2/a3 are consumed
# Side effects: every converted value and the maximum are printed via ecall 34.
        # prologue
        addi sp, sp, -8
        sw s0, 0(sp)
        sw s1, 4(sp)
        
# array loop
for1:
        lw a5, 0(a1)  # x(a5)
        # fp32_to_bf16
        and t0, a5, s9  # x exp(t0)
        and t1, a5, s8  # x man(t1)
        # if zero        
        bne t0, x0, else
        # exp is zero
        bne t1, x0, else
        j finish_bf16        # exactly zero: stored unchanged
else: 
        # if infinity or NaN
        beq t0, s9, finish_bf16                              # stored unchanged
        # round        
        # r = x.man shift right 8 bit
        # x+r = x.man + x.man>>8
        li t3, 0x00800000  # make up 1 to No.24bit (implicit leading one)
        or t1, t1, t3
        srli t2, t1, 8  # r(t2)
        add t1, t1, t2  # x+r
        
        # check carry
        and t4, t1, t3  # check No.24bit (t4), 0:carry, 1: nocarry
        bne t4, x0, no_carry
        add t0, t0, t3  # exp+1
        srli t1 ,t1, 1  # man alignment
no_carry:
        and t0, t0, s9  # mask exp(t0)
        and t1, t1, s8  # mask man(t1)
        or t2, t0, t1  # combine exp & man
        li t3, 0x80000000  # sign mask
        and t3, a5, t3  # x sign
        or a5, t3, t2  # bfloat16(a5) 
        and a5, a5, s6  # clear the low 16 bits
finish_bf16:
        sw a5, 0(a2)
        
        # print the converted value followed by a newline
        mv a0, a5
        addi a7, x0, 34
        ecall
        la a0, next_line
        addi a7, x0, 4
        ecall
        
        slti t3, a3, 7  # (a3==7) t3=0, (a3<7) t3=1
        bne t3, x0, compare
        # saved first max: the first element (a3 == 7) always becomes the maximum
        j max_change
        
compare:
        # compare exp
        blt s0, t0, max_change 
        blt t0, s0, max_not_change
        
        # compare man       
        blt s1, t1, max_change
        blt t1, s1, max_not_change
        # equal exponent and mantissa falls through and replaces the maximum

max_change:
        mv s0, t0  # max exp(s0)
        mv s1, t1  # max man(s1)         
        mv a4, a5  # max bf16(a4)
max_not_change:               
        addi a3, a3, -1
        addi a1, a1, 4
        addi a2, a2, 4
        bne a3, x0, for1
        
        # Absolute: clear the sign bit of the maximum
        li t2, 0x7fffffff
        and a4, a4, t2
        
        #print
        la a0, max_string
        addi a7, x0, 4
        ecall
        mv a0, a4
        addi a7, x0, 34
        ecall
     
        # epilogue
        lw s0, 0(sp)
        lw s1, 4(sp)
        addi sp, sp, 8
        jr ra

Motivation

I think student KuanYuan has optimized the code to a very concise level. Therefore, I intend to rewrite the code and implement the conversion from bf16 to int8 in RISC-V assembly, a part that KuanYuan wasn't able to accomplish. Also, because I chose a relatively simple topic for the Lab1 assignment, I am keen to use Lab2 as an opportunity to learn RISC-V thoroughly.

Improve

Student KuanYuan0530 has optimized the original assembly code to be highly concise and successfully executed it on the rv32emu emulator, showcasing remarkable proficiency.

As KuanYuan0530 has only completed the part of finding the maximum value within the bf16 array for quantization, I will attempt to solve the remaining portion. My goal is to successfully convert bf16 data into int8 format. Through this process, I aim not only to complete the task but also to learn from his design logic. I believe this effort will contribute significantly to my progress in this field.

:point_right:Implemented specifically
Initially, student KuanYuan completed the quantization process only up to converting from fp32 to bf16 and finding the maximum absolute value of the bf16 array. The unresolved part is computing the scale (127 divided by that maximum) and applying it to each bf16 value. I used a somewhat rough method to find the scale, as I was unable to implement bf16 division within the assignment deadline. The approach I used to calculate the scale significantly reduced the overall quantization accuracy, leading to differences between the final result and the actual answer. Finally, for the multiplication of bf16 values by the scale, I referenced Brian's bf16 multiplier, resulting in a quantization method with slightly reduced precision.

You have to describe how you have improved upon the first implementation.
:notes: jserv

I have added a description in this regard.

Try manual optimization (fp32_to_bf16 & find maximum absolute value)

Use li to replace la and lw.
Delete unnecessary stack.
Avoid control hazards
Compare the entire bf16 value instead of separately comparing exp and man for size comparison.

.data
array: .word 0x3f99999a, 0x3f9a0000, 0x4013d70a, 0x40140000, 0x405d70a4, 0x405d0000, 0x40b428f6
# test data1: 1.200000, 1.203125, 2.310000, 2.312500, 3.460000, 3.4531255, 5.630000

array2: .word 0x3dcccccd, 0x3e4ccccd, 0x3f99999a, 0x40400000, 0x40066666, 0xc0866666, 0x40600000
# test data2: 0.1, 0.2, 1.2, 3, 2.1, -4.2, 3.5

array3: .word 0x40490fdb, 0x3dfcd6e9, 0x3f9e0652, 0x35a5167a, 0x322bcc77, 0x3f800000, 0x339652e8
# test data3: 3.14159265, 0.12345678 , 1.23456789 , 0.00000123, 0.00000001, 0.99999999 , 0.00000007

array_bf16: .word 0, 0, 0, 0, 0, 0, 0

exp_mask: .word 0x7F800000
man_mask: .word 0x007FFFFF
sign_exp_mask: .word 0xFF800000
bf16_mask: .word 0xFFFF0000

next_line: .string "\n"
max_string: .string "maximum number is "
bf16_string: .string "\nbfloat16 number is \n"

.text
main:
        # push data    
        addi sp, sp, -12
        la t0, array
        sw t0, 0(sp)
        la t0, array2
        sw t0, 4(sp)
        la t0, array3
        sw t0, 8(sp)
        la s10, array_bf16      # global array_bf16 address(s10)        
        addi s11, x0, 3         # data number(s11) -> three groups data
-       la s9, exp_mask         # global exp(s9)
-       la s8, man_mask         # global man(s8)
-       la s6, bf16_mask        # global bf16(s6)
-       lw s9, 0(s9)
-       lw s8, 0(s8)
-       lw s6, 0(s6)
-       add s7, x0, sp
+       li t5, 0x7F800000       #exp_mask
+       li t6, 0x007FFFFF       #man_mask
+       li s6, 0xFFFF0000       #bf16_mask
+       li s7, 0x7FFFFFFF #abs_mask
main_for:
        la a0, bf16_string
        addi a7, x0, 4
        ecall         
        addi a3, x0, 7          # array size(a3)
-       lw a1, 0(s7)            # array_data pointer(a1)
+       lw a1, 0(sp)            # array_data pointer(a1)
        mv a2, s10              # array_bf16 pointer(a2)
-       jal ra, fp32_to_bf16_findmax

-       addi s11, s11, -1
-       addi s7, s7, 4
-       bne s11, x0, main_for
-       # Exit program
-       li a7, 10
-       ecall 


fp32_to_bf16_findmax:
# array_data pointer(a1), array_bf16 pointer(a2), array size(a3)
-       # prologue
-       addi sp, sp, -8
-       sw s0, 0(sp)
-       sw s1, 4(sp)


# array loop
for1:
        lw a5, 0(a1)  # x(a5)
        # fp32_to_bf16
-       and t0, a5, s9  # x exp(t0)
-       and t1, a5, s8  # x man(t1)
+       and t0, a5, t5  # x exp(t0)
+       and t1, a5, t6  # x man(t1)
        # if zero        
        bne t0, x0, else
        # exp is zero
        bne t1, x0, else
        j finish_bf16        
else: 
        # if infinity or NaN
-       beq t0, s9, finish_bf16                              
+       beq t0, t5, finish_bf16                              
        # round        
        # r = x.man shift right 8 bit
        # x+r = x.man + x.man>>8
        li t3, 0x00800000  # make up 1 to No.24bit
        or t1, t1, t3
        srli t2, t1, 8  # r(t2)
        add t1, t1, t2  # x+r
        
        # check carry
        and t4, t1, t3  # check No.24bit (t4), 0:carry, 1: nocarry
        bne t4, x0, no_carry
        add t0, t0, t3  # exp+1
        srli t1 ,t1, 1  # man alignment
no_carry:
-       and t0, t0, s9  # mask exp(t0)
-       and t1, t1, s8  # mask man(t1)
+       and t0, t0, t5  # mask exp(t0)
+       and t1, t1, t6  # mask man(t1)
        or t2, t0, t1  # combine exp & man
        li t3, 0x80000000  # sign mask
        and t3, a5, t3  # x sign
        or a5, t3, t2  # bfloat16(a5) 
-       and a5, a5, s6
+       and a5, a5, s6 #s6 -> bf16_mask
finish_bf16:
        sw a5, 0(a2)

        mv a0, a5
        addi a7, x0, 34
        ecall
        la a0, next_line
        addi a7, x0, 4
        ecall

        slti t3, a3, 7  # (a3==7) t3=0, (a3<7) t3=1
+       and s8, a5, s7  # abs bf16 -> s8
        bne t3, x0, compare
        # saved first max
        j max_change

compare:
-       # compare exp
-       blt s0, t0, max_change 
-       blt t0, s0, max_not_change

-       # compare man       
-       blt s1, t1, max_change
-       blt t1, s1, max_not_change
+       blt s8, s0, max_not_change

max_change:
-       mv s0, t0  # max exp(s0)
-       mv s1, t1  # max man(s1)         
+       mv s0, s8  # max bf16(s0) 
        mv a4, a5  # max bf16(a4)
max_not_change:               
        addi a3, a3, -1
        addi a1, a1, 4
        addi a2, a2, 4
        bne a3, x0, for1

        # Absolute
-       li t2, 0x7fffffff
-       and a4, a4, t2
+       and a4, a4, s7

        #print
        la a0, max_string
        addi a7, x0, 4
        ecall
        mv a0, a4
        addi a7, x0, 34
        ecall

        # epilogue
-       lw s0, 0(sp)
-       lw s1, 4(sp)
-       addi sp, sp, 8
-       jr ra

+       and s0, x0, s0
+       and s1, x0, s1

+       addi s11, s11, -1
+       addi sp, sp, 4
+       and s8, x0, s8
+       bne s11, x0, main_for
+Exit:
+       li a7, 10
+       ecall

You shall use RDCYCLE/RDCYCLEH instruction for the statistics of your program’s execution.
:notes: jserv

I have added this part to the conclusion section.

Add bf16 to int8 conversion

scale code

.data
        maxbf16: .word 0x40b40000
.text
# Stand-alone test of the integer scale approximation.
# The scale is the number of times floor(max) must be added before the sum
# exceeds 127. For maxbf16 = 0x40b40000 (5.625) floor(max) is 5 and 26 is printed.
main:
        lw a4, maxbf16
        li t6, 0x007FFFFF       # mantissa mask
        
quant_bf16_to_int8:
        li s2, 0x7F             # 127, the int8 quantization range
        
        and t0, a4, t6          # mantissa field of max (a4)
        srli t0, t0, 16         # keep the 7 bf16 mantissa bits (same shift as the combined program)
        srli t1, a4, 23         # biased exponent; assumes a4 is non-negative
        addi t1, t1, -127       # unbiased exponent e
        li t4, 7                # bf16 mantissa width
        sub t3, t4, t1
        srl t0, t0, t3          # fraction bits above the binary point: mantissa >> (7 - e)
        li t5, 1
        sll t5, t5, t1          # 1 << e
        or t0, t5, t0           # floor(max) = 2^e + integer part of the fraction (was: or t0, t5, t5)
        # NOTE(review): only meaningful for 0 <= e <= 7; other exponents give
        # shift amounts outside 0..31.
        li s3, 0                # scale counter
        li s4, 0                # running sum; previously relied on s4 being 0 at start-up
scale:
        add s4, s4, t0
        addi s3, s3, 1          # count scale
        bge s2, s4, scale       # repeat while sum <= 127
exit:
        # print the scale as a decimal integer
        mv a0, s3
        li a7, 1
        ecall

In quantization, the scale divides the floating-point range into 127 equal parts, making the maximum value 127, as we previously determined. However, due to my inability to successfully implement floating-point division, I used a somewhat crude method that only yields integers. Consequently, the INT8 values obtained using this method have significant deviations from the theoretical values. Until I can successfully implement floating-point division, this approach serves as a substitute.

Converting INT to bf16.

int_to_floatpoint:
        # Convert the positive integer in a6 to a floating-point bit pattern
        # (exponent in bits 30..23, fraction below it), leave it in a6 and
        # print it with ecall 34.
        # NOTE(review): the snippet has no return instruction after the epilogue.
        addi sp, sp, -16
        sw s2, 0(sp)
        sw s3, 4(sp)
        sw s4, 8(sp)
        sw s5, 12(sp)
        li t0, 0
        mv s2, a6               # keep the original integer
loop2:
        # count how many right shifts it takes for a6 to become 0
        
        srli a6, a6, 1
        
        addi t0, t0, 1 
        blt x0, a6, loop2
###end loop2
        addi t0, t0, -1 # the loop counts one shift too many; t0 = index of the highest set bit
        
        addi s3, t0, 127 # biased exponent = highest-bit index + 127
        # The bias is added to the corrected count (t0 was decremented above).
        slli s3, s3, 23 # exp in bf16 -> s3
        
        li t1, 0xFFFFFFFF
        li t2, 32
        sub t3, t2, t0
        srl t1, t1, t3          # mask selecting the bits below the highest set bit
        and s4, s2, t1 # frac_num in bf16
        li t1, 23
        sub t1, t1, t0 # t1=23-(count shift right num)
        sll s4, s4, t1 # frac in bf16: align the fraction just below the exponent
        or s5, s4, s3 # int->bf16 ok
        mv a6, s5 
        
        
        # print a newline, then the converted value
        la a0,next_line
        li a7,4
        ecall
        
        mv a0, a6
        li a7, 34
        ecall
        
        # restore the saved registers
        lw s2, 0(sp)
        lw s3, 4(sp)
        lw s4, 8(sp)
        lw s5, 12(sp)
        addi sp, sp, 16

Since the result obtained from the scale is an integer, it is converted to bf16 for ease of multiplication in the subsequent steps.

float point mult code

.data
test1: .word 0x42000000
test2: .word 0x40860000
.text
# Stand-alone test of the bfloat16 multiplier: multiplies test1 by test2 and
# prints the product with ecall 2.
# Operands and result are bfloat16 values held in the upper 16 bits of a word.
# Zero, infinity, NaN and exponent overflow/underflow are not handled.
Multi_bfloat:
        lw s5,test1
        lw s6,test2
        add t0,s5,x0          # operand A -> t0
        add t1,s6,x0          # operand B -> t1
        li t6,0x7F800000      # exponent mask
        # get exponent to t2,t3
        and t3,t0,t6          # exponent field of A
        and t2,t1,t6          # exponent field of B
        add t3,t3,t2          # add the two biased exponents
        li t6,0x3F800000      # 127 << 23
        sub t3,t3,t6          # remove one bias

        # get sign
        xor t2,t0,t1          # sign of the product in bit 31
        srli t2,t2,31         # get rid of the other bits
        slli t2,t2,31         # move the sign back to bit 31
    
        # get sign and exponent together
        or t3,t3,t2
        # t0 = product sign/exponent | mantissa of A
        slli t0,t0,9
        srli t0,t0,9
        or t0,t3,t0

        # get fraction to t2 and t3
        li t6,0x7F            # mask 0x7F
        slli t6,t6,16         # shift mask to 0x7F0000
        and t2,t0,t6          # 7-bit fraction of A
        and t3,t1,t6          # 7-bit fraction of B
        slli t2,t2,9          # fraction to bits 31..25
        srli t2,t2,1          # fraction to bits 30..24
        li t6,0x80000000      # mask 0x80000000
        or t2,t2,t6           # add the implicit leading 1 in bit 31
        srli t2,t2,1          # 1.f with the integer bit in bit 30

        slli t3,t3,8          # fraction to bits 30..24
        or t3,t3,t6           # add the implicit leading 1 in bit 31
        srli t3,t3,1          # 1.f with the integer bit in bit 30

        add s11,x0,x0         # loop counter
        addi s10,x0,8         # 8 significand bits to process
        add t1,x0,x0          # t1 accumulates the product
        li t6,0x80000000      # bit mask, shifted before each test

loop:
        # shift-and-add: test bits 30..23 of t2 from the top down
        addi s11,s11,1        # add 1 at counter every loop
        srli t6,t6,1          # next lower bit of t2
    
        and t4,t2,t6          # is this bit of t2 set?
        beq t4,x0,not_add     # jump if t4 equal to 0
        add t1,t1,t3          # add the shifted multiplicand
not_add:
        srli t3,t3,1          # shift the multiplicand right by 1 bit
        bne s11,s10,loop      # if the condition not satisfy return to loop
# end of loop 
  
        # the product is in [1, 4): bit 31 set means it is >= 2
        li t6,0x80000000
        and t4,t1,t6          # get t1 max bit
    
        # if t4 max bit equal to 0 will not overflow
        beq t4,x0,not_overflow
    
        # if overflow
        slli t1,t1,1          # drop the integer bit (bit 31)
        li t6,0x800000        # one exponent step
        add t0,t0,t6          # exponent add 1 if overflow
        j Mult_end            # jump to Mult_end
     
        # if not overflow
not_overflow:
        slli t1,t1,2          # drop the integer bit (bit 30)
Mult_end:
        srli t1,t1,24         # keep the top 8 fraction bits
        addi t1,t1,1          # round: add half of the last kept bit
        srli t1,t1,1          # 7-bit fraction, or 0x80 when rounding carried out
        slli t1,t1,16         # move the fraction to bits 22..16
    
        srli t0,t0,23         # clear the old mantissa bits
        slli t0,t0,23         # sign and exponent back in place
        # add, not or: a rounding carry (t1 = 0x800000) must increment the
        # exponent even when its lowest bit is already 1
        add t0,t0,t1          # combine t0 and t1 together to get bfloat

        add s3,t0,x0          # store the product in s3
        #ret                   # return to main
### end of function
exit:
        mv a0,s3
        li a7,2
        ecall

I referenced my classmate Brian Cheng's method for floating-point multiplication, which I applied to the final stage where floating-point values are multiplied by the scale.

Remove the decimal part of the BF16 to make it an integer code

rm_decimal_of_bf16:
        # Truncate the bfloat16 value in a4 to an integer and print it.
        # Expects s7 = 0x7FFFFFFF (abs mask) and t6 = 0x007FFFFF (mantissa mask).
        # NOTE(review): values with an exponent above 7 (magnitude >= 256) are
        # outside the int8 range and are not handled.
        mv t0, a4
        li t3, 0x80000000
        and t3, t0, t3
        srli t3, t3, 31         # t3 = 1 when the value is negative
        and t0, t0, s7          # absolute value
        srli t0, t0, 23         # biased exponent
        addi t0, t0, -127       # unbiased exponent e
        and t1, a4, t6          # mantissa field of a4 (was read from a stale t1)
        srli t1, t1, 16         # 7-bit bf16 mantissa
        li t2, 0x80             # implicit leading 1: 1.mmmmmmm as an 8-bit integer
        or t1, t1, t2
        li t2, 7
        sub t2, t2, t0          # right shift that drops the fraction bits: 7 - e
        li t0, 32
        blt t2, t0, rm_truncate
        li t1, 0                # srl only uses the low 5 bits of the amount, so a
                                # shift of 32 or more must give 0 explicitly
rm_truncate:
        srl t1, t1, t2          # integer part of |value|
        li t2, 1
        bne t3, t2, printINT8
Add_negative_sign:
        sub t1, x0, t1          # negate
printINT8:
        la a0,next_line
        li a7, 4
        ecall
        mv a0, t1
        li a7, 1
        ecall
# next data
        addi s5, s5, 4
        addi a3, a3, -1
        bne a3, x0, for2
### end of function

Since the result obtained from the bf16 multiplier is also in bf16 format, I remove the decimal part. In the final stage, I check if the sign bit is 1. If it is 1, I add the negative sign.

combine

.data
array: .word 0x3f99999a, 0x3f9a0000, 0x4013d70a, 0x40140000, 0x405d70a4, 0x405d0000, 0x40b428f6
# test data1: 1.200000, 1.203125, 2.310000, 2.312500, 3.460000, 3.4531255, 5.630000

array2: .word 0x3dcccccd, 0x3e4ccccd, 0x3f99999a, 0x40400000, 0x40066666, 0xc0866666, 0x40600000
# test data2: 0.1, 0.2, 1.2, 3, 2.1, -4.2, 3.5

array3: .word 0x40490fdb, 0x3dfcd6e9, 0x3f9e0652, 0x35a5167a, 0x322bcc77, 0x3f800000, 0x339652e8
# test data3: 3.14159265, 0.12345678 , 1.23456789 , 0.00000123, 0.00000001, 0.99999999 , 0.00000007

array_bf16: .word 0, 0, 0, 0, 0, 0, 0

next_line: .string "\n"
max_string: .string "maximum number is "
bf16_string: .string "\nbfloat16 number is \n"
scale_num: .string "\nscale is "
transform_to_bf16_is: .string "\ntransform to bf16 is:"
.text
main:
        # Entry point: store the addresses of the three test arrays on the
        # stack and load the bit masks. Each pass of main_for then falls
        # through the stages below and comes back from next_array.
        # push data    
        addi sp, sp, -12
        la t0, array
        sw t0, 0(sp)
        la t0, array2
        sw t0, 4(sp)
        la t0, array3
        sw t0, 8(sp)
        la s10, array_bf16      # global array_bf16 address(s10)        
        addi s11, x0, 3         # data number(s11) -> three groups data
        li t5, 0x7F800000       # exp_mask
        li t6, 0x007FFFFF       # man_mask
        li s6, 0xFFFF0000       # bf16_mask
        li s7, 0x7FFFFFFF       # abs_mask
main_for:
        la a0, bf16_string      # print "bfloat16 number is"
        addi a7, x0, 4
        ecall         
        addi a3, x0, 7          # array size(a3)
        lw a1, 0(sp)            # array_data pointer(a1): sp itself walks the address table
        mv a2, s10              # array_bf16 pointer(a2)

        
fp32_to_bf16_findmax:
# Convert an fp32 array to bfloat16 and find the element with the largest
# magnitude. Entered by fallthrough from main_for, not by a call.
# In : array_data pointer(a1), array_bf16 pointer(a2), array size(a3)
#      t5 = exponent mask, t6 = mantissa mask, s6 = bf16 mask, s7 = abs mask
# Out: a4 = absolute value of the maximum element
# Side effects: every converted value and the maximum are printed via ecall 34.
    
# array loop
for1:
        lw a5, 0(a1)            # x(a5)
        # fp32_to_bf16
        and t0, a5, t5          # x exp(t0)
        and t1, a5, t6          # x man(t1)
        # if zero        
        bne t0, x0, else
        # exp is zero
        bne t1, x0, else
        j finish_bf16        # exactly zero: stored unchanged
else: 
        # if infinity or NaN
        beq t0, t5, finish_bf16                              # stored unchanged
        # round        
        # r = x.man shift right 8 bit
        # x+r = x.man + x.man>>8
        li t3, 0x00800000      # make up 1 to No.24bit (implicit leading one)
        or t1, t1, t3
        srli t2, t1, 8         # r(t2)
        add t1, t1, t2         # x+r
        
        # check carry
        and t4, t1, t3         # check No.24bit (t4), 0:carry, 1: nocarry
        bne t4, x0, no_carry
        add t0, t0, t3         # exp+1
        srli t1 ,t1, 1         # man alignment
no_carry:
        and t0, t0, t5         # mask exp(t0)
        and t1, t1, t6         # mask man(t1)
        or t2, t0, t1          # combine exp & man
        li t3, 0x80000000      # sign mask
        and t3, a5, t3         # x sign
        or a5, t3, t2          # bfloat16(a5) 
        and a5, a5, s6         # s6 -> bf16_mask: clear the low 16 bits
finish_bf16:
        sw a5, 0(a2)
        # print the converted value followed by a newline
        mv a0, a5
        addi a7, x0, 34
        ecall
        la a0, next_line
        addi a7, x0, 4
        ecall
        
        slti t3, a3, 7         # (a3==7) t3=0, (a3<7) t3=1
        and s8, a5, s7         # abs bf16 -> s8
        bne t3, x0, compare
        # saved first max: the first element (a3 == 7) always becomes the maximum
        j max_change
        
compare:
        blt s8, s0, max_not_change   # keep the old maximum when |x| is smaller

max_change:
        mv s0, s8              # max |bf16| (s0) 
        mv a4, a5              # max bf16(a4)
max_not_change:               
        addi a3, a3, -1
        addi a1, a1, 4
        addi a2, a2, 4
        bne a3, x0, for1
        
        # Absolute
        and a4, a4, s7         # s7=>0x7FFFFFFF abs_mask
        
        #print
        la a0, max_string      # print "maximum number is"
        addi a7, x0, 4
        ecall
        mv a0, a4
        addi a7, x0, 34
        ecall

        and s0, x0, s0         # s0 = 0: reset the running maximum for the next data set
        and s1, x0, s1         # s1 = 0; s1 is not otherwise used in this listing
        
#scale_function
# Approximate the quantization scale with integer arithmetic only: count how
# many times floor(max) must be added before the sum exceeds 127.
# In : a4 = |max| as a bf16 value, t6 = 0x007FFFFF (mantissa mask)
# Out: a6 = integer scale, also printed after "scale is "
# NOTE(review): only meaningful when the unbiased exponent of max is in 0..7.
# NOTE(review): the loop stops at the first count whose sum exceeds 127, so
# floor(max) * a6 is greater than 127.
scale:
        addi sp, sp, -16
        sw s2, 0(sp)
        sw s3, 4(sp)
        sw s4, 8(sp)
        sw s5, 12(sp)
        li s2, 0x7F            # 127, the int8 quantization range
        li s3, 1               # implicit leading 1 of the significand
        li s4, 0               # running sum; previously relied on s4 being 0 on entry
        
        and t0, a4, t6         # mantissa field of max
        srli t0, t0, 16        # keep the 7 bf16 mantissa bits (t0 >> 16)
        srli t1, a4, 23        # biased exponent (sign bit of a4 is already cleared)
        addi t1, t1, -127      # unbiased exponent e
        li t4, 7               # man has 7bits
        sub t3, t4, t1 
        srl t0, t0, t3         # fraction bits above the binary point: t0 >> (7 - e)
        
        sll s3, s3, t1         # s3=(1<<e)
        or t0, s3, t0          # floor(max) = 2^e + integer part of the fraction
        li a6, 0 
scale_loop:
        add s4, s4, t0
        addi a6, a6, 1         # count scale
        bge s2, s4, scale_loop # repeat while sum <= 127
        lw s2, 0(sp)
        lw s3, 4(sp)
        lw s4, 8(sp)
        lw s5, 12(sp)
        addi sp, sp, 16
        
        # print "scale is " followed by the integer scale
        la a0,scale_num
        li a7,4
        ecall
        mv a0, a6
        li a7, 1
        ecall
        
int_to_fp:
        # Convert the positive integer scale in a6 to a floating-point bit
        # pattern (exponent in bits 30..23, fraction below it), leave it in
        # a6 and print it with ecall 34.
        addi sp, sp, -16
        sw s2, 0(sp)
        sw s3, 4(sp)
        sw s4, 8(sp)
        sw s5, 12(sp)
        li t0, 0
        mv s2, a6              # keep the original integer
loop2:
        # count how many right shifts it takes for a6 to become 0
        
        srli a6, a6, 1
        
        addi t0, t0, 1 
        blt x0, a6, loop2
###end loop2
        addi t0, t0, -1        # the loop counts one shift too many; t0 = index of the highest set bit
        
        addi s3, t0, 127       # biased exponent = highest-bit index + 127
        # The bias is added to the corrected count (t0 was decremented above).
        slli s3, s3, 23        # exp in bf16 -> s3
        
        li t1, 0xFFFFFFFF
        li t2, 32
        sub t3, t2, t0
        srl t1, t1, t3         # mask selecting the bits below the highest set bit
        and s4, s2, t1         # frac_num in bf16
        li t1, 23
        sub t1, t1, t0         # t1=23-(count shift right num)
        sll s4, s4, t1         # frac in bf16: align the fraction just below the exponent
        or s5, s4, s3          # int->bf16 ok
        mv a6, s5 
        
        # print the label, then the converted value
        la a0,transform_to_bf16_is
        li a7,4
        ecall
        mv a0, a6
        li a7, 34
        ecall
        
        # restore the saved registers
        lw s2, 0(sp)
        lw s3, 4(sp)
        lw s4, 8(sp)
        lw s5, 12(sp)
        addi sp, sp, 16

Multi_bfloat:
# Multiply every element of array_bf16 by the scale.
# In : a6 = scale as a bfloat16 bit pattern, s10 = address of array_bf16
# Per iteration of for2: a4 = element * scale (bfloat16 bit pattern).
# The loop started at for2 is closed by the branch at the end of the
# rm_decimal_of_bf16 stage, which also restores s2-s5.
# Zero, infinity, NaN and exponent overflow/underflow are not handled.
        addi sp, sp, -16
        sw s2, 0(sp)
        sw s3, 4(sp)
        sw s4, 8(sp)
        sw s5, 12(sp)
        
        mv s5, s10            # s5 walks array_bf16
        
        addi a3, x0, 7        # array size -> 7
for2: 
        lw a4, 0(s5)
        add t0,a6,x0          # operand A (scale) -> t0
        add t1,a4,x0          # operand B (array element) -> t1
        li s2,0x7F800000      # exponent mask
        # get exponent to t2,t3
        and t3,t0,s2          # exponent field of A
        and t2,t1,s2          # exponent field of B
        add t3,t3,t2          # add the two biased exponents
        li s2,0x3F800000      # 127 << 23
        sub t3,t3,s2          # remove one bias

        # get sign
        xor t2,t0,t1          # sign of the product in bit 31
        srli t2,t2,31         # get rid of the other bits
        slli t2,t2,31         # move the sign back to bit 31
    
        # get sign and exponent together
        or t3,t3,t2
        # t0 = product sign/exponent | mantissa of A
        slli t0,t0,9
        srli t0,t0,9
        or t0,t3,t0

        # get fraction to t2 and t3
        li s2,0x7F            # mask 0x7F
        slli s2,s2,16         # shift mask to 0x7F0000
        and t2,t0,s2          # 7-bit fraction of A
        and t3,t1,s2          # 7-bit fraction of B
        slli t2,t2,9          # fraction to bits 31..25
        srli t2,t2,1          # fraction to bits 30..24
        li s2,0x80000000      # mask 0x80000000
        or t2,t2,s2           # add the implicit leading 1 in bit 31
        srli t2,t2,1          # 1.f with the integer bit in bit 30

        slli t3,t3,8          # fraction to bits 30..24
        or t3,t3,s2           # add the implicit leading 1 in bit 31
        srli t3,t3,1          # 1.f with the integer bit in bit 30

        add s3,x0,x0          # loop counter
        addi s4,x0,8          # 8 significand bits to process
        add t1,x0,x0          # t1 accumulates the product
        li s2,0x80000000      # bit mask, shifted before each test

loop:
        # shift-and-add: test bits 30..23 of t2 from the top down
        addi s3,s3,1          # add 1 at counter every loop
        srli s2,s2,1          # next lower bit of t2
    
        and t4,t2,s2          # is this bit of t2 set?
        beq t4,x0,not_add     # jump if t4 equal to 0
        add t1,t1,t3          # add the shifted multiplicand
not_add:
        srli t3,t3,1          # shift the multiplicand right by 1 bit
        bne s3,s4,loop        # if the condition not satisfy return to loop
# end of loop 

        # the product is in [1, 4): bit 31 set means it is >= 2
        li s2,0x80000000
        and t4,t1,s2          # get t1 max bit
    
        # if t4 max bit equal to 0 will not overflow
        beq t4,x0,not_overflow
    
        # if overflow
        slli t1,t1,1          # drop the integer bit (bit 31)
        li s2,0x800000        # one exponent step
        add t0,t0,s2          # exponent add 1 if overflow
        j Mult_end            # jump to Mult_end
     
        # if not overflow
not_overflow:
        slli t1,t1,2          # drop the integer bit (bit 30)
Mult_end:
        srli t1,t1,24         # keep the top 8 fraction bits
        addi t1,t1,1          # round: add half of the last kept bit
        srli t1,t1,1          # 7-bit fraction, or 0x80 when rounding carried out
        slli t1,t1,16         # move the fraction to bits 22..16
    
        srli t0,t0,23         # clear the old mantissa bits
        slli t0,t0,23         # sign and exponent back in place
        # add, not or: a rounding carry (t1 = 0x800000) must increment the
        # exponent even when its lowest bit is already 1
        add t0,t0,t1          # combine t0 and t1 together to get bfloat

        add a4,t0,x0          # store the product in a4
### end of function
  
# Remove the fractional part of the bfloat16 value in a4 to obtain an integer,
# print a line separator followed by that integer, then advance to the next element.
# Reads: a4 (value), s7 and t6 (masks set outside this view), t1 (left over from
# the multiplication code above), s5 and a3 (loop state). Writes: t0-t3, a0, a7, s5, a3.
rm_decimal_of_bf16:
        mv t0, a4             # t0 = value produced by the multiplication above
        li t3, 0x80000000     # sign-bit mask
        and t3, t0, t3
        srli t3, t3, 31       # t3 = sign bit (1 = negative, 0 = non-negative)
        and t0,t0,s7          # absolute value; assumes s7 is a mask that clears the sign bit — TODO confirm
        srli t0, t0, 23       # t0 = biased exponent field
        addi t0, t0,-127      # remove the bias: t0 = power of 2
        and t1, t1, t6        # NOTE(review): uses t1 as left by the multiplication code, not a4; presumably t6 is the fraction mask — confirm
        srli t1, t1, 16       # move the fraction down to the low bits
        li t2, 0x80           # 1000 0000
        or t1, t1, t2         # set bit 7 as the implicit leading 1
        li t2, 7
        sub t2, t2, t0        # t2 = 7 - exponent: number of bits to shift right
        srl t1, t1, t2        # t1 = integer magnitude; NOTE(review): srl uses only the low 5 bits of t2, so exponents above 7 or below -24 would shift by an unintended amount
        li t2, 1
        bne t3, t2, printINT8 # skip the negation unless the sign bit is 1
Add_negative_sign:
        add t2, t1, t1        # t2 = 2 * t1
        sub t1, t1, t2        # t1 = t1 - 2 * t1 = -t1
printINT8:
        la a0,next_line       # a0 = address of next_line (defined elsewhere in the file)
        li a7, 4              # service 4: print string (Ripes environment call)
        ecall
        mv a0, t1             # a0 = integer result to print
        li a7, 1              # service 1: print integer (Ripes environment call)
        ecall
# next data
        addi s5, s5, 4        # advance s5 by one word (presumably the input pointer — confirm)
        addi a3, a3, -1       # one fewer element remaining
        bne a3, x0, for2      # continue at for2 while elements remain
### end of function
        # Reload s2-s5 from the stack and release the 16 bytes they occupied.
        lw s2, 0(sp)
        lw s3, 4(sp)
        lw s4, 8(sp)
        lw s5, 12(sp)
        addi sp, sp, 16

# Move on to the next input array, or fall through to Exit when none remain.
next_array:
        addi s11, s11, -1     # decrement s11 (presumably the number of arrays left — confirm)
        addi sp, sp, 4        # release one more word; NOTE(review): the matching push is outside this view
        and s8, x0, s8        # s8 = 0 (AND with x0 always yields 0)
        bne s11, x0, main_for # loop back to main_for while s11 is non-zero
        
# Terminate the program.
Exit:
        li a7, 10             # service 10: exit (Ripes environment call)
        ecall

Result

data1
(ripes)

data2
(ripes)

data3
(ripes)

Room for improvement

Implement a bf16 division routine and use it to calculate the scale.
Optimize my own assembly code to reduce memory usage.

Optimized by riscv-none-elf-gcc

Using six different optimization levels: -O0 -O1 -O2 -O3 -Os -Ofast, and conducting further analysis.
Reference

3.10 Options That Control Optimization
description in Chinese*

compile

riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O0 lab2.c -o lab2.elf 
riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O1 lab2.c -o lab2.elf 
riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O2 lab2.c -o lab2.elf 
riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O3 lab2.c -o lab2.elf 
riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -Os lab2.c -o lab2.elf

size

riscv-none-elf-size lab2.elf

Display the lab2.elf header

$ riscv-none-elf-readelf -h lab2.elf
ELF Header:
  Magic:   7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 
  Class:                             ELF32
  Data:                              2's complement, little endian
  Version:                           1 (current)
  OS/ABI:                            UNIX - System V
  ABI Version:                       0
  Type:                              EXEC (Executable file)
  Machine:                           RISC-V
  Version:                           0x1
  Entry point address:               0x100d8
  Start of program headers:          52 (bytes into file)
  Start of section headers:          99312 (bytes into file)
  Flags:                             0x0
  Size of this header:               52 (bytes)
  Size of program headers:           32 (bytes)
  Number of program headers:         3
  Size of section headers:           40 (bytes)
  Number of section headers:         15
  Section header string table index: 14

write disassembly code

riscv-none-elf-objdump -d lab2.elf > disassembly/lab2.txt

./rv32emu lab2.elf

result

data 1
bfloat16 number is 
1.203125000000
1.203125000000
2.312500000000
2.312500000000
3.453125000000
3.453125000000
5.625000000000
maximum number is 5.625000000000
after quantization 
27
27
52
52
77
77
127
data 2
bfloat16 number is 
0.100097656250
0.200195312500
1.203125000000
3.000000000000
2.093750000000
-4.187500000000
3.500000000000
maximum number is 4.187500000000
after quantization 
3
6
36
90
63
-127
106
data 3
bfloat16 number is 
3.140625000000
0.123535156250
1.234375000000
0.000001229346
0.000000010012
1.000000000000
0.000000069849
maximum number is 3.140625000000
after quantization 
127
4
49
0
0
40
0
inferior exit code 0

RISC-V Instructions/Registers Usage Statistics

$ make tool
~/rv32emu/tests/hw2$ ~/rv32emu/build/rv_histogram ./lab2.elf
~/rv32emu/tests/hw2$ ~/rv32emu/build/rv_histogram -r ./lab2.elf

Instructions Histogram

Registers Histogram

-O0

Because the complete compiler-generated assembly is very long, I analyze only the quant_bf16_to_int8 function, which is the key part of the program.

Compile

$ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O0 lab2.c -o lab2.elf

Size

$ riscv-none-elf-size lab2.elf

~/rv32emu/tests/hw2$ riscv-none-elf-size lab2.elf
   text	   data	    bss	    dec	    hex	filename
  81736	   2320	   1556	  85612	  14e6c	lab2.elf

ELF header

~/rv32emu/tests/hw2$ riscv-none-elf-readelf -h lab2.elf
ELF Header:
  Magic:   7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 
  Class:                             ELF32
  Data:                              2's complement, little endian
  Version:                           1 (current)
  OS/ABI:                            UNIX - System V
  ABI Version:                       0
  Type:                              EXEC (Executable file)
  Machine:                           RISC-V
  Version:                           0x1
  Entry point address:               0x100d8
  Start of program headers:          52 (bytes into file)
  Start of section headers:          99312 (bytes into file)
  Flags:                             0x0
  Size of this header:               52 (bytes)
  Size of program headers:           32 (bytes)
  Number of program headers:         3
  Size of section headers:           40 (bytes)
  Number of section headers:         15
  Section header string table index: 14

quant_bf16_to_int8 disassembly code

000106e8 <quant_bf16_to_int8>:
   106e8:	fd010113          	add	sp,sp,-48
   106ec:	02112623          	sw	ra,44(sp)
   106f0:	02812423          	sw	s0,40(sp)
   106f4:	03010413          	add	s0,sp,48
   106f8:	fca42e23          	sw	a0,-36(s0)
   106fc:	fdc42783          	lw	a5,-36(s0)
   10700:	0007a703          	lw	a4,0(a5)
   10704:	800007b7          	lui	a5,0x80000
   10708:	fff78793          	add	a5,a5,-1 # 7fffffff <__BSS_END__+0x7ffdb0d3>
   1070c:	00f777b3          	and	a5,a4,a5
   10710:	fef42623          	sw	a5,-20(s0)
   10714:	00100793          	li	a5,1
   10718:	fef42423          	sw	a5,-24(s0)
   1071c:	0680006f          	j	10784 <quant_bf16_to_int8+0x9c>
   10720:	fe842783          	lw	a5,-24(s0)
   10724:	00279793          	sll	a5,a5,0x2
   10728:	fdc42703          	lw	a4,-36(s0)
   1072c:	00f707b3          	add	a5,a4,a5
   10730:	0007a703          	lw	a4,0(a5)
   10734:	800007b7          	lui	a5,0x80000
   10738:	fff78793          	add	a5,a5,-1 # 7fffffff <__BSS_END__+0x7ffdb0d3>
   1073c:	00f777b3          	and	a5,a4,a5
   10740:	00078593          	mv	a1,a5
   10744:	fec42503          	lw	a0,-20(s0)
   10748:	249000ef          	jal	11190 <__lesf2>
   1074c:	00050793          	mv	a5,a0
   10750:	0207d463          	bgez	a5,10778 <quant_bf16_to_int8+0x90>
   10754:	fe842783          	lw	a5,-24(s0)
   10758:	00279793          	sll	a5,a5,0x2
   1075c:	fdc42703          	lw	a4,-36(s0)
   10760:	00f707b3          	add	a5,a4,a5
   10764:	0007a703          	lw	a4,0(a5)
   10768:	800007b7          	lui	a5,0x80000
   1076c:	fff78793          	add	a5,a5,-1 # 7fffffff <__BSS_END__+0x7ffdb0d3>
   10770:	00f777b3          	and	a5,a4,a5
   10774:	fef42623          	sw	a5,-20(s0)
   10778:	fe842783          	lw	a5,-24(s0)
   1077c:	00178793          	add	a5,a5,1
   10780:	fef42423          	sw	a5,-24(s0)
   10784:	fe842703          	lw	a4,-24(s0)
   10788:	00600793          	li	a5,6
   1078c:	f8e7dae3          	bge	a5,a4,10720 <quant_bf16_to_int8+0x38>
   10790:	fec42503          	lw	a0,-20(s0)
   10794:	74d000ef          	jal	116e0 <__extendsfdf2>
   10798:	00050713          	mv	a4,a0
   1079c:	00058793          	mv	a5,a1
   107a0:	00070613          	mv	a2,a4
   107a4:	00078693          	mv	a3,a5
   107a8:	000237b7          	lui	a5,0x23
   107ac:	15078513          	add	a0,a5,336 # 23150 <__trunctfdf2+0x4ac>
   107b0:	770010ef          	jal	11f20 <printf>
   107b4:	000237b7          	lui	a5,0x23
   107b8:	fec42583          	lw	a1,-20(s0)
   107bc:	1707a503          	lw	a0,368(a5) # 23170 <__trunctfdf2+0x4cc>
   107c0:	560000ef          	jal	10d20 <__divsf3>
   107c4:	00050793          	mv	a5,a0
   107c8:	fef42023          	sw	a5,-32(s0)
   107cc:	fe042223          	sw	zero,-28(s0)
   107d0:	0540006f          	j	10824 <quant_bf16_to_int8+0x13c>
   107d4:	fe442783          	lw	a5,-28(s0)
   107d8:	00279793          	sll	a5,a5,0x2
   107dc:	fdc42703          	lw	a4,-36(s0)
   107e0:	00f707b3          	add	a5,a4,a5
   107e4:	0007a783          	lw	a5,0(a5)
   107e8:	fe042583          	lw	a1,-32(s0)
   107ec:	00078513          	mv	a0,a5
   107f0:	271000ef          	jal	11260 <__mulsf3>
   107f4:	00050793          	mv	a5,a0
   107f8:	00078513          	mv	a0,a5
   107fc:	671000ef          	jal	1166c <__fixsfsi>
   10800:	00050693          	mv	a3,a0
   10804:	fa818713          	add	a4,gp,-88 # 24978 <after_quant.0>
   10808:	fe442783          	lw	a5,-28(s0)
   1080c:	00279793          	sll	a5,a5,0x2
   10810:	00f707b3          	add	a5,a4,a5
   10814:	00d7a023          	sw	a3,0(a5)
   10818:	fe442783          	lw	a5,-28(s0)
   1081c:	00178793          	add	a5,a5,1
   10820:	fef42223          	sw	a5,-28(s0)
   10824:	fe442703          	lw	a4,-28(s0)
   10828:	00600793          	li	a5,6
   1082c:	fae7d4e3          	bge	a5,a4,107d4 <quant_bf16_to_int8+0xec>
   10830:	fa818793          	add	a5,gp,-88 # 24978 <after_quant.0>
   10834:	00078513          	mv	a0,a5
   10838:	02c12083          	lw	ra,44(sp)
   1083c:	02812403          	lw	s0,40(sp)
   10840:	03010113          	add	sp,sp,48
   10844:	00008067          	ret

Statistics

line of code : 89
register
- ax : a0 a1 a2 a3 a4 a5
- sx : s0
- tx : none
branch and jump
- jump : 6
- branch : 3
stack
- sp : 48
lw and sw
- lw : 23
- sw : 11

-O1

Compile

$ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O1 lab2.c -o lab2_O1.elf

Size

$ riscv-none-elf-size lab2_O1.elf

~/rv32emu/tests/hw2$ riscv-none-elf-size lab2_O1.elf
   text	   data	    bss	    dec	    hex	filename
  81016	   2328	   1556	  84900	  14ba4	lab2_O1.elf

ELF header

~/rv32emu/tests/hw2$ riscv-none-elf-readelf -h lab2_O1.elf
ELF Header:
  Magic:   7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 
  Class:                             ELF32
  Data:                              2's complement, little endian
  Version:                           1 (current)
  OS/ABI:                            UNIX - System V
  ABI Version:                       0
  Type:                              EXEC (Executable file)
  Machine:                           RISC-V
  Version:                           0x1
  Entry point address:               0x100d8
  Start of program headers:          52 (bytes into file)
  Start of section headers:          99320 (bytes into file)
  Flags:                             0x0
  Size of this header:               52 (bytes)
  Size of program headers:           32 (bytes)
  Number of program headers:         3
  Size of section headers:           40 (bytes)
  Number of section headers:         15
  Section header string table index: 14

quant_bf16_to_int8 disassembly code

000101d4 <quant_bf16_to_int8>:
   101d4:	fe010113          	add	sp,sp,-32
   101d8:	00112e23          	sw	ra,28(sp)
   101dc:	00812c23          	sw	s0,24(sp)
   101e0:	00912a23          	sw	s1,20(sp)
   101e4:	01212823          	sw	s2,16(sp)
   101e8:	01312623          	sw	s3,12(sp)
   101ec:	01412423          	sw	s4,8(sp)
   101f0:	01512223          	sw	s5,4(sp)
   101f4:	00052983          	lw	s3,0(a0) # ff800000 <__BSS_END__+0xff7db0cc>
   101f8:	00199993          	sll	s3,s3,0x1
   101fc:	0019d993          	srl	s3,s3,0x1
   10200:	00450493          	add	s1,a0,4
   10204:	00050913          	mv	s2,a0
   10208:	01c50a13          	add	s4,a0,28
   1020c:	80000ab7          	lui	s5,0x80000
   10210:	fffa8a93          	add	s5,s5,-1 # 7fffffff <__BSS_END__+0x7ffdb0cb>
   10214:	00c0006f          	j	10220 <quant_bf16_to_int8+0x4c>
   10218:	00448493          	add	s1,s1,4
   1021c:	03448263          	beq	s1,s4,10240 <quant_bf16_to_int8+0x6c>
   10220:	0004a403          	lw	s0,0(s1)
   10224:	01547433          	and	s0,s0,s5
   10228:	00098593          	mv	a1,s3
   1022c:	00040513          	mv	a0,s0
   10230:	4a1000ef          	jal	10ed0 <__gesf2>
   10234:	fea052e3          	blez	a0,10218 <quant_bf16_to_int8+0x44>
   10238:	00040993          	mv	s3,s0
   1023c:	fddff06f          	j	10218 <quant_bf16_to_int8+0x44>
   10240:	00098513          	mv	a0,s3
   10244:	1d4010ef          	jal	11418 <__extendsfdf2>
   10248:	00050613          	mv	a2,a0
   1024c:	00058693          	mv	a3,a1
   10250:	00023537          	lui	a0,0x23
   10254:	db850513          	add	a0,a0,-584 # 22db8 <__trunctfdf2+0x3dc>
   10258:	201010ef          	jal	11c58 <printf>
   1025c:	00098593          	mv	a1,s3
   10260:	f341a503          	lw	a0,-204(gp) # 24904 <__SDATA_BEGIN__+0x6c>
   10264:	7fc000ef          	jal	10a60 <__divsf3>
   10268:	00050493          	mv	s1,a0
   1026c:	fb018413          	add	s0,gp,-80 # 24980 <after_quant.0>
   10270:	01c40993          	add	s3,s0,28
   10274:	00092583          	lw	a1,0(s2)
   10278:	00048513          	mv	a0,s1
   1027c:	51d000ef          	jal	10f98 <__mulsf3>
   10280:	124010ef          	jal	113a4 <__fixsfsi>
   10284:	00a42023          	sw	a0,0(s0)
   10288:	00490913          	add	s2,s2,4
   1028c:	00440413          	add	s0,s0,4
   10290:	ff3412e3          	bne	s0,s3,10274 <quant_bf16_to_int8+0xa0>
   10294:	fb018513          	add	a0,gp,-80 # 24980 <after_quant.0>
   10298:	01c12083          	lw	ra,28(sp)
   1029c:	01812403          	lw	s0,24(sp)
   102a0:	01412483          	lw	s1,20(sp)
   102a4:	01012903          	lw	s2,16(sp)
   102a8:	00c12983          	lw	s3,12(sp)
   102ac:	00812a03          	lw	s4,8(sp)
   102b0:	00412a83          	lw	s5,4(sp)
   102b4:	02010113          	add	sp,sp,32
   102b8:	00008067          	ret

Statistics

line of code : 60
register
- ax : a0 a1 a2 a3
- sx : s0 s1 s2 s3 s4 s5
- tx : none
branch and jump
- jump : 8
- branch : 3
stack
- sp : 32
lw and sw
- lw : 11
- sw : 8

-O2

Compile

$ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O2 lab2.c -o lab2_O2.elf

Size

$ riscv-none-elf-size lab2_O2.elf

~/rv32emu/tests/hw2$ riscv-none-elf-size lab2_O2.elf
   text	   data	    bss	    dec	    hex	filename
  81224	   2328	   1556	  85108	  14c74	lab2_O2.elf

ELF header

~/rv32emu/tests/hw2$ riscv-none-elf-readelf -h lab2_O2.elf
ELF Header:
  Magic:   7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 
  Class:                             ELF32
  Data:                              2's complement, little endian
  Version:                           1 (current)
  OS/ABI:                            UNIX - System V
  ABI Version:                       0
  Type:                              EXEC (Executable file)
  Machine:                           RISC-V
  Version:                           0x1
  Entry point address:               0x10474
  Start of program headers:          52 (bytes into file)
  Start of section headers:          99336 (bytes into file)
  Flags:                             0x0
  Size of this header:               52 (bytes)
  Size of program headers:           32 (bytes)
  Number of program headers:         3
  Size of section headers:           40 (bytes)
  Number of section headers:         15
  Section header string table index: 14

quant_bf16_to_int8 disassembly code

00010574 <quant_bf16_to_int8>:
   10574:	fe010113          	add	sp,sp,-32
   10578:	01312623          	sw	s3,12(sp)
   1057c:	00052983          	lw	s3,0(a0) # ff800000 <__BSS_END__+0xff7db0cc>
   10580:	01412423          	sw	s4,8(sp)
   10584:	80000a37          	lui	s4,0x80000
   10588:	fffa0a13          	add	s4,s4,-1 # 7fffffff <__BSS_END__+0x7ffdb0cb>
   1058c:	00812c23          	sw	s0,24(sp)
   10590:	01212823          	sw	s2,16(sp)
   10594:	01512223          	sw	s5,4(sp)
   10598:	00112e23          	sw	ra,28(sp)
   1059c:	00912a23          	sw	s1,20(sp)
   105a0:	00050913          	mv	s2,a0
   105a4:	0149f9b3          	and	s3,s3,s4
   105a8:	00450413          	add	s0,a0,4
   105ac:	01c50a93          	add	s5,a0,28
   105b0:	00042483          	lw	s1,0(s0) # ffff0000 <__BSS_END__+0xfffcb0cc>
   105b4:	00098513          	mv	a0,s3
   105b8:	00440413          	add	s0,s0,4
   105bc:	0144f4b3          	and	s1,s1,s4
   105c0:	00048593          	mv	a1,s1
   105c4:	1d5000ef          	jal	10f98 <__lesf2>
   105c8:	00055463          	bgez	a0,105d0 <quant_bf16_to_int8+0x5c>
   105cc:	00048993          	mv	s3,s1
   105d0:	ff5410e3          	bne	s0,s5,105b0 <quant_bf16_to_int8+0x3c>
   105d4:	00098513          	mv	a0,s3
   105d8:	711000ef          	jal	114e8 <__extendsfdf2>
   105dc:	00050613          	mv	a2,a0
   105e0:	00023537          	lui	a0,0x23
   105e4:	00058693          	mv	a3,a1
   105e8:	e8850513          	add	a0,a0,-376 # 22e88 <__trunctfdf2+0x3dc>
   105ec:	73c010ef          	jal	11d28 <printf>
   105f0:	f341a503          	lw	a0,-204(gp) # 24904 <__SDATA_BEGIN__+0x6c>
   105f4:	00098593          	mv	a1,s3
   105f8:	530000ef          	jal	10b28 <__divsf3>
   105fc:	fb018413          	add	s0,gp,-80 # 24980 <after_quant.0>
   10600:	00050493          	mv	s1,a0
   10604:	01c40993          	add	s3,s0,28
   10608:	00092583          	lw	a1,0(s2)
   1060c:	00048513          	mv	a0,s1
   10610:	00440413          	add	s0,s0,4
   10614:	255000ef          	jal	11068 <__mulsf3>
   10618:	65d000ef          	jal	11474 <__fixsfsi>
   1061c:	fea42e23          	sw	a0,-4(s0)
   10620:	00490913          	add	s2,s2,4
   10624:	ff3412e3          	bne	s0,s3,10608 <quant_bf16_to_int8+0x94>
   10628:	01c12083          	lw	ra,28(sp)
   1062c:	01812403          	lw	s0,24(sp)
   10630:	01412483          	lw	s1,20(sp)
   10634:	01012903          	lw	s2,16(sp)
   10638:	00c12983          	lw	s3,12(sp)
   1063c:	00412a83          	lw	s5,4(sp)
   10640:	fb018513          	add	a0,gp,-80 # 24980 <after_quant.0>
   10644:	00812a03          	lw	s4,8(sp)
   10648:	02010113          	add	sp,sp,32
   1064c:	00008067          	ret

Statistics

line of code : 56
register
- ax : a0 a1 a2 a3
- sx : s0 s1 s2 s3 s4 s5
- tx : none
branch and jump
- jump : 8
- branch : 3
stack
- sp : 32
lw and sw
- lw : 11
- sw : 8

-O3

Compile

$ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O3 lab2.c -o lab2_O3.elf

Size

$ riscv-none-elf-size lab2_O3.elf

~/rv32emu/tests/hw2$ riscv-none-elf-size lab2_O3.elf
   text	   data	    bss	    dec	    hex	filename
  81212	   2396	   1556	  85164	  14cac	lab2_O3.elf

ELF header

~/rv32emu/tests/hw2$ riscv-none-elf-readelf -h lab2_O3.elf
ELF Header:
  Magic:   7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 
  Class:                             ELF32
  Data:                              2's complement, little endian
  Version:                           1 (current)
  OS/ABI:                            UNIX - System V
  ABI Version:                       0
  Type:                              EXEC (Executable file)
  Machine:                           RISC-V
  Version:                           0x1
  Entry point address:               0x102fc
  Start of program headers:          52 (bytes into file)
  Start of section headers:          99492 (bytes into file)
  Flags:                             0x0
  Size of this header:               52 (bytes)
  Size of program headers:           32 (bytes)
  Number of program headers:         3
  Size of section headers:           40 (bytes)
  Number of section headers:         15
  Section header string table index: 14

quant_bf16_to_int8 disassembly code

000103fc <quant_bf16_to_int8>:
   103fc:	fe010113          	add	sp,sp,-32
   10400:	00912a23          	sw	s1,20(sp)
   10404:	01212823          	sw	s2,16(sp)
   10408:	00052483          	lw	s1,0(a0) # ff800000 <__BSS_END__+0xff7db088>
   1040c:	00452903          	lw	s2,4(a0)
   10410:	800007b7          	lui	a5,0x80000
   10414:	fff78793          	add	a5,a5,-1 # 7fffffff <__BSS_END__+0x7ffdb087>
   10418:	00f4f4b3          	and	s1,s1,a5
   1041c:	00f97933          	and	s2,s2,a5
   10420:	00812c23          	sw	s0,24(sp)
   10424:	00090593          	mv	a1,s2
   10428:	00050413          	mv	s0,a0
   1042c:	00048513          	mv	a0,s1
   10430:	00112e23          	sw	ra,28(sp)
   10434:	01312623          	sw	s3,12(sp)
   10438:	3ad000ef          	jal	10fe4 <__lesf2>
   1043c:	00055463          	bgez	a0,10444 <quant_bf16_to_int8+0x48>
   10440:	00090493          	mv	s1,s2
   10444:	00842903          	lw	s2,8(s0) # ffff0008 <__BSS_END__+0xfffcb090>
   10448:	00048513          	mv	a0,s1
   1044c:	00191913          	sll	s2,s2,0x1
   10450:	00195913          	srl	s2,s2,0x1
   10454:	00090593          	mv	a1,s2
   10458:	38d000ef          	jal	10fe4 <__lesf2>
   1045c:	00055463          	bgez	a0,10464 <quant_bf16_to_int8+0x68>
   10460:	00090493          	mv	s1,s2
   10464:	00c42903          	lw	s2,12(s0)
   10468:	00048513          	mv	a0,s1
   1046c:	00191913          	sll	s2,s2,0x1
   10470:	00195913          	srl	s2,s2,0x1
   10474:	00090593          	mv	a1,s2
   10478:	36d000ef          	jal	10fe4 <__lesf2>
   1047c:	00055463          	bgez	a0,10484 <quant_bf16_to_int8+0x88>
   10480:	00090493          	mv	s1,s2
   10484:	01042903          	lw	s2,16(s0)
   10488:	00048513          	mv	a0,s1
   1048c:	00191913          	sll	s2,s2,0x1
   10490:	00195913          	srl	s2,s2,0x1
   10494:	00090593          	mv	a1,s2
   10498:	34d000ef          	jal	10fe4 <__lesf2>
   1049c:	00055463          	bgez	a0,104a4 <quant_bf16_to_int8+0xa8>
   104a0:	00090493          	mv	s1,s2
   104a4:	01442903          	lw	s2,20(s0)
   104a8:	00048513          	mv	a0,s1
   104ac:	00191913          	sll	s2,s2,0x1
   104b0:	00195913          	srl	s2,s2,0x1
   104b4:	00090593          	mv	a1,s2
   104b8:	32d000ef          	jal	10fe4 <__lesf2>
   104bc:	00055463          	bgez	a0,104c4 <quant_bf16_to_int8+0xc8>
   104c0:	00090493          	mv	s1,s2
   104c4:	01842903          	lw	s2,24(s0)
   104c8:	00048593          	mv	a1,s1
   104cc:	00191913          	sll	s2,s2,0x1
   104d0:	00195913          	srl	s2,s2,0x1
   104d4:	00090513          	mv	a0,s2
   104d8:	245000ef          	jal	10f1c <__gesf2>
   104dc:	00a05463          	blez	a0,104e4 <quant_bf16_to_int8+0xe8>
   104e0:	00090493          	mv	s1,s2
   104e4:	00048513          	mv	a0,s1
   104e8:	04c010ef          	jal	11534 <__extendsfdf2>
   104ec:	00050613          	mv	a2,a0
   104f0:	00023537          	lui	a0,0x23
   104f4:	00058693          	mv	a3,a1
   104f8:	ed850513          	add	a0,a0,-296 # 22ed8 <__trunctfdf2+0x3e0>
   104fc:	079010ef          	jal	11d74 <printf>
   10500:	f341a503          	lw	a0,-204(gp) # 24904 <__SDATA_BEGIN__+0x6c>
   10504:	00048593          	mv	a1,s1
   10508:	5a4000ef          	jal	10aac <__divsf3>
   1050c:	00042583          	lw	a1,0(s0)
   10510:	00050493          	mv	s1,a0
   10514:	ff418913          	add	s2,gp,-12 # 249c4 <after_quant.0>
   10518:	39d000ef          	jal	110b4 <__mulsf3>
   1051c:	7a5000ef          	jal	114c0 <__fixsfsi>
   10520:	00442583          	lw	a1,4(s0)
   10524:	00050793          	mv	a5,a0
   10528:	00f92023          	sw	a5,0(s2)
   1052c:	00048513          	mv	a0,s1
   10530:	385000ef          	jal	110b4 <__mulsf3>
   10534:	78d000ef          	jal	114c0 <__fixsfsi>
   10538:	00842583          	lw	a1,8(s0)
   1053c:	00050793          	mv	a5,a0
   10540:	00f92223          	sw	a5,4(s2)
   10544:	00048513          	mv	a0,s1
   10548:	36d000ef          	jal	110b4 <__mulsf3>
   1054c:	775000ef          	jal	114c0 <__fixsfsi>
   10550:	00c42583          	lw	a1,12(s0)
   10554:	00050793          	mv	a5,a0
   10558:	00f92423          	sw	a5,8(s2)
   1055c:	00048513          	mv	a0,s1
   10560:	355000ef          	jal	110b4 <__mulsf3>
   10564:	75d000ef          	jal	114c0 <__fixsfsi>
   10568:	01042583          	lw	a1,16(s0)
   1056c:	00050793          	mv	a5,a0
   10570:	00f92623          	sw	a5,12(s2)
   10574:	00048513          	mv	a0,s1
   10578:	33d000ef          	jal	110b4 <__mulsf3>
   1057c:	745000ef          	jal	114c0 <__fixsfsi>
   10580:	01442583          	lw	a1,20(s0)
   10584:	00050793          	mv	a5,a0
   10588:	00f92823          	sw	a5,16(s2)
   1058c:	00048513          	mv	a0,s1
   10590:	325000ef          	jal	110b4 <__mulsf3>
   10594:	72d000ef          	jal	114c0 <__fixsfsi>
   10598:	01842583          	lw	a1,24(s0)
   1059c:	00050793          	mv	a5,a0
   105a0:	00f92a23          	sw	a5,20(s2)
   105a4:	00048513          	mv	a0,s1
   105a8:	30d000ef          	jal	110b4 <__mulsf3>
   105ac:	715000ef          	jal	114c0 <__fixsfsi>
   105b0:	01c12083          	lw	ra,28(sp)
   105b4:	01812403          	lw	s0,24(sp)
   105b8:	00a92c23          	sw	a0,24(s2)
   105bc:	01412483          	lw	s1,20(sp)
   105c0:	01012903          	lw	s2,16(sp)
   105c4:	ff418513          	add	a0,gp,-12 # 249c4 <after_quant.0>
   105c8:	00c12983          	lw	s3,12(sp)
   105cc:	02010113          	add	sp,sp,32
   105d0:	00008067          	ret

Statistics

line of code : 119
register
- ax : a0 a1 a2 a3 a5
- sx : s0 s1 s2 s3
- tx : none
branch and jump
- jump : 24
- branch : 6
stack
- sp : 32
lw and sw
- lw : 20
- sw : 12

-Os

Compile

$ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -Os lab2.c -o lab2_Os.elf

Size

$ riscv-none-elf-size lab2_Os.elf

~/rv32emu/tests/hw2$ riscv-none-elf-size lab2_Os.elf
   text	   data	    bss	    dec	    hex	filename
  80900	   2328	   1556	  84784	  14b30	lab2_Os.elf

ELF header

~/rv32emu/tests/hw2$ riscv-none-elf-readelf -h lab2_Os.elf
ELF Header:
  Magic:   7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 
  Class:                             ELF32
  Data:                              2's complement, little endian
  Version:                           1 (current)
  OS/ABI:                            UNIX - System V
  ABI Version:                       0
  Type:                              EXEC (Executable file)
  Machine:                           RISC-V
  Version:                           0x1
  Entry point address:               0x10320
  Start of program headers:          52 (bytes into file)
  Start of section headers:          99336 (bytes into file)
  Flags:                             0x0
  Size of this header:               52 (bytes)
  Size of program headers:           32 (bytes)
  Number of program headers:         3
  Size of section headers:           40 (bytes)
  Number of section headers:         15
  Section header string table index: 14

quant_bf16_to_int8 disassembly code

00010428 <quant_bf16_to_int8>:
   10428:	fe010113          	add	sp,sp,-32
   1042c:	00912a23          	sw	s1,20(sp)
   10430:	00052483          	lw	s1,0(a0)
   10434:	01312623          	sw	s3,12(sp)
   10438:	800009b7          	lui	s3,0x80000
   1043c:	fff98993          	add	s3,s3,-1 # 7fffffff <__BSS_END__+0x7ffdb0cb>
   10440:	00812c23          	sw	s0,24(sp)
   10444:	01212823          	sw	s2,16(sp)
   10448:	01512223          	sw	s5,4(sp)
   1044c:	00112e23          	sw	ra,28(sp)
   10450:	01412423          	sw	s4,8(sp)
   10454:	01612023          	sw	s6,0(sp)
   10458:	00050413          	mv	s0,a0
   1045c:	0134f4b3          	and	s1,s1,s3
   10460:	00450913          	add	s2,a0,4
   10464:	01c50a93          	add	s5,a0,28
   10468:	00092a03          	lw	s4,0(s2)
   1046c:	00048593          	mv	a1,s1
   10470:	013a7a33          	and	s4,s4,s3
   10474:	000a0513          	mv	a0,s4
   10478:	1e5000ef          	jal	10e5c <__gesf2>
   1047c:	00a05463          	blez	a0,10484 <quant_bf16_to_int8+0x5c>
   10480:	000a0493          	mv	s1,s4
   10484:	00490913          	add	s2,s2,4
   10488:	ff5910e3          	bne	s2,s5,10468 <quant_bf16_to_int8+0x40>
   1048c:	00048513          	mv	a0,s1
   10490:	715000ef          	jal	113a4 <__extendsfdf2>
   10494:	00050613          	mv	a2,a0
   10498:	00023537          	lui	a0,0x23
   1049c:	00058693          	mv	a3,a1
   104a0:	d4850513          	add	a0,a0,-696 # 22d48 <__trunctfdf2+0x3e0>
   104a4:	740010ef          	jal	11be4 <printf>
   104a8:	f341a503          	lw	a0,-204(gp) # 24904 <__SDATA_BEGIN__+0x6c>
   104ac:	00048593          	mv	a1,s1
   104b0:	53c000ef          	jal	109ec <__divsf3>
   104b4:	00050913          	mv	s2,a0
   104b8:	00000493          	li	s1,0
   104bc:	fb018b13          	add	s6,gp,-80 # 24980 <after_quant.0>
   104c0:	01c00a13          	li	s4,28
   104c4:	009407b3          	add	a5,s0,s1
   104c8:	0007a583          	lw	a1,0(a5) # ff800000 <__BSS_END__+0xff7db0cc>
   104cc:	00090513          	mv	a0,s2
   104d0:	009b0ab3          	add	s5,s6,s1
   104d4:	251000ef          	jal	10f24 <__mulsf3>
   104d8:	659000ef          	jal	11330 <__fixsfsi>
   104dc:	00aaa023          	sw	a0,0(s5)
   104e0:	00448493          	add	s1,s1,4
   104e4:	ff4490e3          	bne	s1,s4,104c4 <quant_bf16_to_int8+0x9c>
   104e8:	01c12083          	lw	ra,28(sp)
   104ec:	01812403          	lw	s0,24(sp)
   104f0:	01412483          	lw	s1,20(sp)
   104f4:	01012903          	lw	s2,16(sp)
   104f8:	00812a03          	lw	s4,8(sp)
   104fc:	00412a83          	lw	s5,4(sp)
   10500:	00012b03          	lw	s6,0(sp)
   10504:	fb018513          	add	a0,gp,-80 # 24980 <after_quant.0>
   10508:	00c12983          	lw	s3,12(sp)
   1050c:	02010113          	add	sp,sp,32
   10510:	00008067          	ret

Statistics

line of code : 60
register
- ax : a0 a1 a2 a3 a5
- sx : s0 s1 s2 s3 s4 s5 s6
- tx : none
branch and jump
- jump : 6
- branch : 3
stack
- sp : 32
lw and sw
- lw : 12
- sw : 9

-Ofast

Compile

$ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -Ofast lab2.c -o lab2_Ofast.elf

Size

$ riscv-none-elf-size lab2_Ofast.elf

~/rv32emu/tests/hw2$ riscv-none-elf-size lab2_Ofast.elf
   text	   data	    bss	    dec	    hex	filename
  81004	   2396	   1556	  84956	  14bdc	lab2_Ofast.elf

ELF header

~/rv32emu/tests/hw2$ riscv-none-elf-readelf -h lab2_Ofast.elf
ELF Header:
  Magic:   7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00 
  Class:                             ELF32
  Data:                              2's complement, little endian
  Version:                           1 (current)
  OS/ABI:                            UNIX - System V
  ABI Version:                       0
  Type:                              EXEC (Executable file)
  Machine:                           RISC-V
  Version:                           0x1
  Entry point address:               0x102fc
  Start of program headers:          52 (bytes into file)
  Start of section headers:          99404 (bytes into file)
  Flags:                             0x0
  Size of this header:               52 (bytes)
  Size of program headers:           32 (bytes)
  Number of program headers:         3
  Size of section headers:           40 (bytes)
  Number of section headers:         15
  Section header string table index: 14

quant_bf16_to_int8 disassembly code

000103fc <quant_bf16_to_int8>:
   103fc:	fe010113          	add	sp,sp,-32
   10400:	00812c23          	sw	s0,24(sp)
   10404:	01212823          	sw	s2,16(sp)
   10408:	00452403          	lw	s0,4(a0) # ff800004 <__BSS_END__+0xff7db08c>
   1040c:	00852903          	lw	s2,8(a0)
   10410:	800007b7          	lui	a5,0x80000
   10414:	fff78793          	add	a5,a5,-1 # 7fffffff <__BSS_END__+0x7ffdb087>
   10418:	00f97933          	and	s2,s2,a5
   1041c:	00f47433          	and	s0,s0,a5
   10420:	00912a23          	sw	s1,20(sp)
   10424:	00090593          	mv	a1,s2
   10428:	00050493          	mv	s1,a0
   1042c:	00040513          	mv	a0,s0
   10430:	00112e23          	sw	ra,28(sp)
   10434:	01312623          	sw	s3,12(sp)
   10438:	2e5000ef          	jal	10f1c <__gesf2>
   1043c:	00055463          	bgez	a0,10444 <quant_bf16_to_int8+0x48>
   10440:	00090413          	mv	s0,s2
   10444:	0004a903          	lw	s2,0(s1)
   10448:	00040513          	mv	a0,s0
   1044c:	00191913          	sll	s2,s2,0x1
   10450:	00195913          	srl	s2,s2,0x1
   10454:	00090593          	mv	a1,s2
   10458:	2c5000ef          	jal	10f1c <__gesf2>
   1045c:	00055463          	bgez	a0,10464 <quant_bf16_to_int8+0x68>
   10460:	00090413          	mv	s0,s2
   10464:	00c4a903          	lw	s2,12(s1)
   10468:	00040513          	mv	a0,s0
   1046c:	00191913          	sll	s2,s2,0x1
   10470:	00195913          	srl	s2,s2,0x1
   10474:	00090593          	mv	a1,s2
   10478:	2a5000ef          	jal	10f1c <__gesf2>
   1047c:	00055463          	bgez	a0,10484 <quant_bf16_to_int8+0x88>
   10480:	00090413          	mv	s0,s2
   10484:	0104a903          	lw	s2,16(s1)
   10488:	00040513          	mv	a0,s0
   1048c:	00191913          	sll	s2,s2,0x1
   10490:	00195913          	srl	s2,s2,0x1
   10494:	00090593          	mv	a1,s2
   10498:	285000ef          	jal	10f1c <__gesf2>
   1049c:	00055463          	bgez	a0,104a4 <quant_bf16_to_int8+0xa8>
   104a0:	00090413          	mv	s0,s2
   104a4:	0144a903          	lw	s2,20(s1)
   104a8:	00040513          	mv	a0,s0
   104ac:	00191913          	sll	s2,s2,0x1
   104b0:	00195913          	srl	s2,s2,0x1
   104b4:	00090593          	mv	a1,s2
   104b8:	265000ef          	jal	10f1c <__gesf2>
   104bc:	00055463          	bgez	a0,104c4 <quant_bf16_to_int8+0xc8>
   104c0:	00090413          	mv	s0,s2
   104c4:	0184a903          	lw	s2,24(s1)
   104c8:	00040513          	mv	a0,s0
   104cc:	00191913          	sll	s2,s2,0x1
   104d0:	00195913          	srl	s2,s2,0x1
   104d4:	00090593          	mv	a1,s2
   104d8:	245000ef          	jal	10f1c <__gesf2>
   104dc:	00055463          	bgez	a0,104e4 <quant_bf16_to_int8+0xe8>
   104e0:	00090413          	mv	s0,s2
   104e4:	00040513          	mv	a0,s0
   104e8:	77d000ef          	jal	11464 <__extendsfdf2>
   104ec:	00050613          	mv	a2,a0
   104f0:	00023537          	lui	a0,0x23
   104f4:	00058693          	mv	a3,a1
   104f8:	e0850513          	add	a0,a0,-504 # 22e08 <__trunctfdf2+0x3e0>
   104fc:	7a8010ef          	jal	11ca4 <printf>
   10500:	f341a503          	lw	a0,-204(gp) # 24904 <__SDATA_BEGIN__+0x6c>
   10504:	00040593          	mv	a1,s0
   10508:	5a4000ef          	jal	10aac <__divsf3>
   1050c:	0004a583          	lw	a1,0(s1)
   10510:	00050413          	mv	s0,a0
   10514:	ff418913          	add	s2,gp,-12 # 249c4 <after_quant.0>
   10518:	2cd000ef          	jal	10fe4 <__mulsf3>
   1051c:	6d5000ef          	jal	113f0 <__fixsfsi>
   10520:	0044a583          	lw	a1,4(s1)
   10524:	00050793          	mv	a5,a0
   10528:	00f92023          	sw	a5,0(s2)
   1052c:	00040513          	mv	a0,s0
   10530:	2b5000ef          	jal	10fe4 <__mulsf3>
   10534:	6bd000ef          	jal	113f0 <__fixsfsi>
   10538:	0084a583          	lw	a1,8(s1)
   1053c:	00050793          	mv	a5,a0
   10540:	00f92223          	sw	a5,4(s2)
   10544:	00040513          	mv	a0,s0
   10548:	29d000ef          	jal	10fe4 <__mulsf3>
   1054c:	6a5000ef          	jal	113f0 <__fixsfsi>
   10550:	00c4a583          	lw	a1,12(s1)
   10554:	00050793          	mv	a5,a0
   10558:	00f92423          	sw	a5,8(s2)
   1055c:	00040513          	mv	a0,s0
   10560:	285000ef          	jal	10fe4 <__mulsf3>
   10564:	68d000ef          	jal	113f0 <__fixsfsi>
   10568:	0104a583          	lw	a1,16(s1)
   1056c:	00050793          	mv	a5,a0
   10570:	00f92623          	sw	a5,12(s2)
   10574:	00040513          	mv	a0,s0
   10578:	26d000ef          	jal	10fe4 <__mulsf3>
   1057c:	675000ef          	jal	113f0 <__fixsfsi>
   10580:	0144a583          	lw	a1,20(s1)
   10584:	00050793          	mv	a5,a0
   10588:	00f92823          	sw	a5,16(s2)
   1058c:	00040513          	mv	a0,s0
   10590:	255000ef          	jal	10fe4 <__mulsf3>
   10594:	65d000ef          	jal	113f0 <__fixsfsi>
   10598:	0184a583          	lw	a1,24(s1)
   1059c:	00050793          	mv	a5,a0
   105a0:	00f92a23          	sw	a5,20(s2)
   105a4:	00040513          	mv	a0,s0
   105a8:	23d000ef          	jal	10fe4 <__mulsf3>
   105ac:	645000ef          	jal	113f0 <__fixsfsi>
   105b0:	01c12083          	lw	ra,28(sp)
   105b4:	01812403          	lw	s0,24(sp)
   105b8:	00a92c23          	sw	a0,24(s2)
   105bc:	01412483          	lw	s1,20(sp)
   105c0:	01012903          	lw	s2,16(sp)
   105c4:	ff418513          	add	a0,gp,-12 # 249c4 <after_quant.0>
   105c8:	00c12983          	lw	s3,12(sp)
   105cc:	02010113          	add	sp,sp,32
   105d0:	00008067          	ret

Statistics

line of code : 119
register
- ax : a0 a1 a2 a3 a4 a5 a7 a8
- sx : s0 s1 s2 s3
- tx : none
branch and jump
- jump : 23
- branch : 6
stack
- sp : 32
lw and sw
- lw : 20
- sw : 12

Conclusion

Level	text	data	bss	dec	hex	filename
O0	81736	2320	1556	85612	14e6c	lab2.elf
O1	81016	2328	1556	84900	14ba4	lab2_O1.elf
O2	81224	2328	1556	85108	14c74	lab2_O2.elf
O3	81212	2396	1556	85164	14cac	lab2_O3.elf
Os	80900	2328	1556	84784	14b30	lab2_Os.elf
Ofast	81004	2396	1556	84956	14bdc	lab2_Ofast.elf

From O0 to O1
The most noticeable difference is in the number of lw and sw instructions: O0 uses almost twice as many as O1, so functions execute faster at the O1 optimization level.
Additionally, we observe that the O0 code is almost a direct translation of the C source, which leads to longer instruction sequences in certain cases.
From O1 to O2
It can be noticed that the optimization from O1 to O2 is not very significant.
From O1 to O3
The optimization from O1 to O3 reduces the number of registers used but increases the count of jump and branch instructions, as well as the usage of lw and sw instructions. This makes the function's code longer; note, however, that the measured cycle count in the table below is still lower for O3 (188939) than for O1 (193080).
From O3 to Ofast
O3 and Ofast optimizations are nearly identical, indicating that the compiler optimization has reached its limit.

:point_right: Use ticks.c for the statistics of your program’s execution.

Rewrite makefile

# Build source.c once per GCC optimization level so the resulting ELF files
# can be compared (size, disassembly, elapsed cycles).
#
#   make          build every source_<level>.elf
#   make clean    remove all generated ELF files

.PHONY: all clean

CC = riscv-none-elf-gcc

# Target architecture / ABI. Kept under its original name so existing
# overrides (e.g. `make ASFLAGS=...`) continue to work.
ASFLAGS = -march=rv32i -mabi=ilp32

# The compiler driver must receive the architecture flags as well; previously
# they were defined but never passed to any command.
CFLAGS = $(ASFLAGS)

# Optimization levels to build; each one yields source_<level>.elf.
LEVELS = O0 O1 O2 O3 Os Ofast
ELFS = $(addprefix source_,$(addsuffix .elf,$(LEVELS)))

all: $(ELFS)

# $* is the pattern stem (O0, O1, ...), which doubles as the -O<level> flag.
# Listing source.c as a prerequisite makes the ELF files rebuild when the
# source changes. LDFLAGS is still honoured when supplied by the user.
source_%.elf: source.c
	$(CC) $(CFLAGS) $(LDFLAGS) -$* source.c -o $@

# -f: do not fail when there is nothing to remove.
clean:
	rm -f *.elf

Level	O0	O1	O2	O3	Os	Ofast	Assembly
elapsed cycle	195232	193080	192836	188939	193367	188938	5227 :+1:

Assignment2: RISC-V Toolchain

Prepare GNU Toolchain for RISC-V

Question Selection

Question

Motivation

Improve

Try manual optimization (fp32_to_bf16 & find maximum absolute value)

Add bf16 to int8 conversion

combine

Result

Room for improvement

Optimized by riscv-none-elf-gcc

RISC-V Instructions/Registers Usage Statistics

-O0

-O1

-O2

-O3

-Os

-Ofast

Conclusion

:point_right: Use ticks.c for the statistics of your program’s execution.

Reference

Read more

伺服語錄

撰寫 LKM

印表機驅動安裝

2024q1 Homework2 (quiz1+2)