owned this note
owned this note
Published
Linked with GitHub
# Assignment2: RISC-V Toolchain
contributed by < [kkkkk1109](https://github.com/kkkkk1109) >
## Rewrite [Reducing memory usage with bfloat and bfloat multiplication](https://hackmd.io/@PWCheng/CAHW01)
I choose the problem [Reducing memory usage with bfloat and bfloat multiplication](https://hackmd.io/@PWCheng/CAHW01) from <[`Brian Cheng`](https://github.com/BrianCheng-TheLegend?tab=repositories)>. In assignment 1 ,I ultilized CLZ to implement the square root. Since there were division and addition of floatint point, I want to learn some more floating point operation. Also, I think the topic is quite interesting by merging two bf16 to one register, and I'm wondering the accuracy between bf16 and IEEE754. Therefore, I choose this subject in assignment 2.
**Original C code:**
```c
#include<stdio.h>
float fp32_to_bf16(float x)
{
float y = x;
int *p = (int *) &y;
unsigned int exp = *p & 0x7F800000;
unsigned int man = *p & 0x007FFFFF;
if (exp == 0 && man == 0) /* zero */
return x;
if (exp == 0x7F800000) /* infinity or NaN */
return x;
/* Normalized number */
/* round to nearest */
float r = x;
int *pr = (int *) &r;
*pr &= 0xFF800000; /* r has the same exp as x */
r /= 0X100;
y = x + r;
*p &= 0xFFFF0000;
return y;
}
// encoder : encode two bfloat number in one memory
int encoder(int *a,int *b){
int c=0;
c=*a+(*b>>16);
*a=0;
*b=0;
return c;
}
// decoder : decode one memory number in two bfloat
void decoder(int c ,int *n1,int *n2){
*n1=c&0xffff0000;
*n2=(c&0x0000ffff)<<16;
}
int main(){
// definition of num1 and transfer it to bfloat
float num1=-12.123;
int *np1=(int *) &num1;
num1=fp32_to_bf16(num1);
// definition of num2 and transfer it to bfloat
float num2=45.568;
int *np2=(int *) &num2;
num2=fp32_to_bf16(num2);
float add;
int *p=(int *) &add;
*p=0;
// show num1 binary form and it's value
printf("0x%x\n",*np1);
printf("%f\n",num1);
// show num2 binary form and it's value
printf("0x%x\n",*np2);
printf("%f\n",num2);
// add two number together and print the binary form
*p=encoder(np1,np2);
decoder(*p,&num1,&num2);
float mul_num;
mul_num=num1*num2;
printf("%f\n",mul_num);
return 0;
}// // The bfloat multiplication still not implement
// void bfloat_multiplication(int *num){
// printf("0x%x\n",*num);
// int mask[]={0x1,0xff,0x7f};
// int fra=0,exp=0,sig=0;
// int bn1=0,bn2=0;
// // fraction
// bn2 = *num & mask[2];
// bn2 |= 0x100;
// mask[2] <<=16;
// bn1 = *num & mask[2];
// bn1 >>= 16;
// bn1 |= 0x100;
// for(int i=0;i<8;i++){
// if((bn1 && 1{ bn1&1
// fra+=bn2;
// }
// bn1>>1;
// bn2<<1;
// }
// // if((fra & 0x8000)>>16) fra>>16
// // fra >>= 9;
// // else
// // fra >>=8;
// printf("%x\n",fra);
// // exponent
// mask[1] <<= 7;
// bn2 = *num & mask[1];
// mask[1] <<= 16;
// bn1 = *num & mask[1];
// bn1 >>= 23;
// bn2 >>= 7;
// exp=bn1+bn2-127;
// printf("%x\n",exp);
// // sign
// bn1 = 0;
// bn2 = 0;
// mask[0] <<= 15;
// bn2= *num & mask[0];
// bn2 <<= 16;
// mask[0] <<= 16;
// bn1 = *num & mask[0];
// bn1 ^= bn2;
// sig=bn1;
// *num = sig | (fra << 15) | (exp << 22);
// }
```
In C code, the `bfloat_multiplication` is not used, also, I found the there are some mistakes in function `bfloat_multiplication` and `main`, and I rewrite the c code.
```c
float bfloat_multiplication(int *num){
uint32_t r = *num;
uint16_t fra=0,exp=0,sig=0,f1=0,f2=0,c=0;
uint16_t bf1,bf2;
// fraction
bf1 = (r >> 16) & 0xffff;
bf2 = r & 0xffff;
printf("bf1 = %x\n",bf1);
printf("bf2 = %x\n",bf2);
f1= ( bf1 & 0xff ) | 0x80;
f2= ( bf2 & 0xff ) | 0x80;
printf("f1 = %x\n",f1);
printf("f2 = %x\n",f2);
for(int i = 0; i < 8; i ++){
//printf("i=%d\n",i);
if(f1 & 1){
fra += f2;
}
f1 >>= 1;
f2 <<= 1;
}
if(fra>>15){
fra >>= 1;
c=1;
}
fra >>= 5;
//rounded
int sticky = fra & 1;
int round= (fra>>1) &1;
int lsb= (fra>>2) & 1;
fra= (fra>>2) + (round & (lsb | sticky));
printf("frac after rounded %x\n",fra);
exp = (bf1 & (0x7f80)) >> 7;
exp += (bf2 & (0x7f80)) >> 7 ;
exp=exp-127+c;
printf("exp: %d\n",exp);
//sign
sig = (bf1 >> 15) ^(bf2 >> 15);
printf("sig: %d\n",sig);
r=(sig << 15) | (exp << 7) | (fra &0x7f);
r=r << 16;
return *(float*)&r;
}
```
```c
int main(){
// definition of num1 and transfer it to bfloat
float num1=-12.123;
int *np1=(int *) &num1;
num1=fp32_to_bf16(num1);
// definition of num2 and transfer it to bfloat
float num2=45.568;
int *np2=(int *) &num2;
num2=fp32_to_bf16(num2);
float add;
int *p=(int *) &add;
*p=0;
// show num1 binary form and it's value
printf("bf float of %f is 0x%x\n",num1,*np1);
// show num2 binary form and it's value
printf("bf float of %f is 0x%x\n",num2,*np2);
// add two number together and print the binary form
*p=encoder(np1,np2);
printf("encoder: %x \n",*p);
decoder(*p,np1,np2);//decoder(*p,&num1,&num2); wrong input
float ans=bfloat_multiplication(p);
printf("%f\n",ans);
return 0;
}
```
Except for using * in c , I implement the `bfloat_multiplication` to do the multiplication.
Compile the c code with different optimization level
| Optimized level | -O0 | -O1 | -O2 | -O3&-Ofast | -Os |
|:---------------:|:-----:|:-----:| ----- |:----------:|:---------:|
| CYCLE | 10342 | 9688 | 9687 | ==9636== | 9703 |
| instret | 10342 | 8662 | 8661 | ==8610== | 8677 |
| text | 55718 | 54610 | 54598 | 54726 | 54574 |
| dta | 1924 | 1968 | 1928 | 1928 | 1928 |
| bss | 1528 | 1528 | 1528 | 1528 | 1528 |
| dec | 59170 | 58066 | 58054 | 58182 | ==58030== |
Loop unrolling
| Optimized level | -O0 | -O1 | -O2&-O3 | -Ofast | -Os |
|:---------------:|:-----:|:-----:| ------- |:------:|:-----:|
| CYCLE | 10289 | 9632 | 9631 | ==9514== | 9655
| instret | 9256 | 8606 | 8605 | ==8498== | 8629 |
| text | 56078 | 54706 | 54706 | 54514 | 54694 |
| dta | 1924 | 1928 | 1928 | 1936 | 1928 |
| bss | 1528 | 1528 | 1528 | 1528 | 1528 |
| dec | 59530 | 58162 | 58162 | ==57978== | 58150 |
#### Different Optimization levels in GNU
**-O0**
- This level involves ==no optimization== and produces code that closely mirrors the source code structure. It is primarily used for debugging but offers lower performance.
**-O1**
- At this level, basic optimizations like removing unused local variables and reducing stack storage for local variables are enabled. It improves execution speed while maintaining shorter compile times.
**-O2**
- Also known as Level 2 optimization, this level goes a step further, applying more code transformations and performance optimizations. It significantly enhances code execution speed with a slightly longer compilation time.
**-O3**
- This represents a high level of optimization, involving a multitude of code transformations and performance enhancements. It leads to a substantial improvement in code execution speed, but may introduce more complex generated code and, at times, instability.
**-Ofast**
- Similar to -O3, this level adds extra optimization options, potentially sacrificing some mathematical precision to boost performance. It's valuable for scientific computing but may not be suitable for all software.
**-Os**
- The goal here is to optimize code for a ==reduced size== of the generated executable rather than maximum performance. It's useful for embedded systems and resource-constrained environments.
#### Analysis
In the both tables, we can see that the smallest cycle count and instret when compiling with `-Ofast` levels.
However, the smallest size happens in `-Ofast` in loop unrolling c code while it should be happened in `-Os` as the result in first table.
Using `rv_histogram`
```
build/rv_histogram -r tests/perfcounter/perfcounts.elf
build/rv_histogram tests/perfcounter/perfcounts.elf
```
Optimized level | -O0 | -O1 | -O2 | -O3&-Ofast | -Os |
|:---------------:|:-----:|:-----:| ----- |:----------:|:---------:|
| a5 | 16.74% | 15.65% | 15.64% | 15.6% |15.77%
The biggest change in the histogram is the `a5` register usage, from *16.74%* (none optimization) to about *15.6%*, and the rest of registers or instruction are nearly the same.
#### Problem(unsolved)
When compiling with -O2, -O3, -Ofast and -Os,there are warning. However, building the generated elf file still performs the program.
```
kkkkk1109@ubuntu:~/rv32emu/tests/perfcounter$ make
riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -O2 -Wall -c -o main.o main.c
main.c: In function 'bfloat_multiplication':
main.c:119:13: warning: dereferencing type-punned pointer will break strict-aliasing rules [-Wstrict-aliasing]
119 | return *(float*)&rm;
| ^~~~~~~~~~~
main.c: In function 'fp32_to_bf16':
main.c:14:24: warning: 'y' is used uninitialized [-Wuninitialized]
14 | unsigned int exp = *p & 0x7F800000;
| ^~
main.c:11:11: note: 'y' declared here
11 | float y = 0;
| ^
main.c: In function 'bfloat_multiplication':
main.c:119:12: warning: 'rm' is used uninitialized [-Wuninitialized]
119 | return *(float*)&rm;
| ^~~~~~~~~~~~
main.c:34:14: note: 'rm' declared here
34 | uint32_t rm = *num;
| ^~
riscv-none-elf-gcc -o perfcount.elf getcycles.o getinstret.o sparkle.o main.o
```
**Hand-written assembly code**
This is the original [assembly code](https://github.com/BrianCheng-TheLegend/2023_ComputerArchitecture/blob/main/Hw01/Hw01.s)
To print the integer in assembly code, I add a void in `syscall.c`
```clike
static void syscall_ooo(riscv_t *rv){
uint32_t fd = rv_get_reg(rv, rv_reg_a1);
fprintf(stdout,"%d",fd);
}
```
and define 99 as this system call number
```clike
#define SUPPORTED_SYSCALLS \
_(ooo, 99) \
```
To use the `cssr` instruction, we need to modify the `makefile`
```
ASFLAGS = -march=rv32i_zicsr -mabi=ilp32
```
#### Rev1
I made some improvements to the assembly code.
1. In original code, the original programmer used `la`and`lw` to produce the mask, and I changed them to `li`.
2. The register used to reserve the bitmask is `t6`, when producing a new mask, the old mask would be lost, but sometmes we required the mask we replace before, so I used the `t6` and `t5` to reserve the mask which decrease the times to produce bitmask.
3. In `Multi_bfloat`, when the significand happens overflow, we have to add carry to the exponent. In orginal code, when overflow happens, the bitmask would be load again to do the `add` operation.I found way to streamline the code by changing the sequence of operation in this function.
```diff
.org 0
.global _start
.set SYSEXIT, 93
.set SYSWRITE, 64
.set PRINT_INT, 99
.data
# test data
test0: .word 0x4141f9a7,0x423645a2
test1: .word 0x3fa66666,0x42c63333
test2: .word 0x43e43a5e,0x42b1999a
# mask
# mask0 for exponent ,fraction
# ( 0 ,4 ,8 ,12 ,16 ,20 ,24 )
-mask0: .word 0x7F800000,0x007FFFFF,0x800000,0x8000,0x7f,0x3F800000,0x80000000
# mask1 for round
-mask1: .word 0x8000
# mask2 for decoder
-mask2: .word 0xFFFF0000,0x0000FFFF
#string
str: .string "\n"
str1: .string "Count CYCLE is :"
.set str1_size, .-str1
.text
start:
jal ra,get_cycles #count cycle
mv s8,a0 # old cycle
li a7,1
la a2,test0 # load test data address to a2
lw a6,0(a2) # load test data to a6
jal ra,f32_b16_p1 # call fp32 to bf16 function
add a5,a6,x0 # store first bfloat in a5
lw a6,4(a2) # load test data to a6
jal ra,f32_b16_p1 # jump to float32 transform to bfloat function
add a4,a6,x0 # store the result to a4
jal ra,encoder # jump to encoder funtion
add s9,s3,x0 # save s3(data after encode) to s9
jal ra,decoder # jump to decoder function
jal ra,Multi_bfloat # jump to bfloat Multiplication funcition
# print cycle
li a0,1
la a1, str1
la a2,str1_size
li a7,SYSWRITE
ecall # ecall
j exit # jump to exit this program
### function converts IEEE754 fp32 to bfloat16
f32_b16_p1:
sw a6,0(sp)
- add t0,a6,x0 # a6 will be only for this funtion to access
+mv t0, a6
- la a3,mask0 # load mask0 address to a3
# exponent
- lw t6,0(a3) # load mask 0x7F800000 to t6
+ li t6,0x7F800000
and t1,t0,t6 # let exponent save to t1
# fraction
- lw t6,4(a3) # load 0x007FFFFF to t6
+ li t5, 0x007FFFFF #
and t2,t0,t5 # let fraction save to t2
# check this number if 0 or inf (exponent + fraction)
- lw t6,0(a3) # load mask 0x7F800000 to t6 t6沒有變 不用這行
beq t1,t6,inf_or_zero # exp == 0x7F800000
or t3,t1,t2
beq t3,x0,inf_or_zero # exp == 0 && man == 0
# add integer to fraction
- lw t6,8(a3) # load integer
+ li t6,0x800000 # li mask
or t2,t2,t6 # add integer 0.111+1
# round to nearest for fraction
- lw t6,12(a3) # load the round number
+ srli t6,t6,8 #load mask
add t2,t2,t6 # add round number
srli t4,t2,24 # shift left 24 to t5
+ li,t6,0x7f
beq t4,x0,no_overflow # if t5 equal to 0 move to
no_overflow
# if overflow
- lw t6,8(a3) # load mask 0x007FFFFF
add t1,t1,t5 # add 1 to exponent change t6 to t5
srli t2,t2,17 # shift t2 to left 1 integer and 7 fraction
- lw t6,16(a3) # load mask 0x7f
+ li t6,0x7f
and t2,t2,t6 # let t2 only have integer
slli t2,t2,16 # shift right 16
j f32_b16_p2
# if not overflow
no_overflow:
srli t2,t2,16 # shift t2 to left 1 integer and 7 fraction
-lw t6,16(a3) # load mask 0x7f
and t2,t2,t6 # let t2 only have integer
slli t2,t2,16 # shift right 16
#f32_b16 end function
f32_b16_p2:
# save to a6
srli t0,t0,31 # shift left to let t0 remain sign
slli t0,t0,31 # shift right to let t0 sign to the right position
or t0,t0,t1 # combine sign and exponent together
or t0,t0,t2 # combine sign,exponent and fraction together
add a6,t0,x0 # save t0 to a6
ret # move back to main function
inf_or_zero:
srli a6,a6,16
slli a6,a6,16
ret # return to main
### end of funtion
### encode two bfloat to one register
encoder:
add t0,a5,x0 # load a5(first bfloat) to t0
add t1,a4,x0 # load a4(second bfloat) to t1
srli t1,t1,16 # shift to let second bfloat fit in one register
or t0,t0,t1 # combine two bfloat in one register
add s3,t0,x0 # load t0 to s3
ret # return to main
### decode two bfloat on one register to two registers
decoder:
add t0,s9,x0 # load s9(data encode) to t0
- la a1,mask2 # load mask2 address
- lw s2,0(a1) # load mask 0xFFFF0000
+ srli t1,t0,16
+ slli t1,t1,16
- and t1,t0,s2 # use mask to specification bfloat 1
- lw s2,4(a1) # load mask 0x0000FFFF
+ li s2,0x0000ffff
and t2,t0,s2 # use mask to specification bfloat 2
slli t2,t2,16 # shift to left to let bfloat peform like original float
add s6,t1,x0 # store t1(bfloat 1) to s6
add s5,t2,x0 # store t2(bfloat 2) to s5
ret # return to main
### change line
cl:
li a7,4 # set a7 as string mode
la a0,str # load str to a0
ecall # ecall
ret # return to main
### Multiplication with bfloat in one register
Multi_bfloat:
# decoder function input is a0
# jal ra,decoder # load a0(two bloat number in one register) to t0
# decoder function output is s5,s6
add t0,s5,x0 # store s5(bfloat 2) to t0
add t1,s6,x0 # store s6(bfloat 1) to t1
- lw t6,0(a3) # load mask0 mask 0x7F800000
+ li t6,0x7F800000
# get exponent to t2,t3
and t3,t0,t6 # use mask 0x7F800000 to get t0 exponent
and t2,t1,t6 # use mask 0x7F800000 to get t1 exponent
add t3,t3,t2 # add two exponent to t3
srli t3,t3,23 # 先右移
- lw t6,20(a3) # load mask0 mask 0x3F800000
- sub t3,t3,t6 # sub 127 to exponent
+ addi t3,t3,-127
mv s3, t3 #exp 存到s3
# get sign
xor t2,t0,t1 # get sign and store on t2
srli t2,t2,31 # get rid of useless data
slli t2,t2,22 # let sign back to right position sig +exp放在 0~8
or s3,s3,t2 # s3=0x0000最後9=sig+exp
# get sign and exponent together
- or t3,t3,t2
# set the sign and exponent to t0
- slli t0,t0,9
- srli t0,t0,9
- or t0,t3,t0
# get fraction to t2 and t3
- lw t6,16(a3) # load mask0 mask 0x7F
- slli t6,t6,16 # shift mask to 0x7F0000
+ li t6,0x7F0000
and t2,t0,t6 # use mask 0x7F0000 get fraction
and t3,t1,t6 # use mask 0x7F0000 get fraction
slli t2,t2,9 # shift left let no leading 0
srli t2,t2,1 # shift right let leading has one 0
- lw t6,24(a3) # load mask0 mask 0x80000000
+ li t6,0x80000000
or t2,t2,t6 # use mask 0x80000000 to add integer
srli t2,t2,1 # shift right to add space for overflow
slli t3,t3,8 # shift left let no leading 0
or t3,t3,t6 # use mask 0x80000000 to add integer
srli t3,t3,1 # shift right to add space for overflow
add s11,x0,x0 # set a counter and 0
addi s10,x0,8 # set a end condition
add t1,x0,x0 # reset t1 to 0 and let this register be result
- lw t6,24(a3) # load mask0 mask 0x80000000
+ mv t5,t6
loop:
addi s11,s11,1 # add 1 at counter every loop
srli t5,t5,1 # shift right at 1 every loop
and t4,t2,t5 # use mask to specified number at that place
beq t4,x0,not_add # jump if t4 equal to 0
add t1,t1,t3 # add t3 to t1
not_add:
srli t3,t3,1 # shift left 1 bit to t3
bne s11,s10,loop # if the condition not satisfy return to loop
# end of loop
# check if overflow
- lw t6,24(a3) # load mask0 mask 0x80000000 to t6
and t4,t1,t6 # get t1 max bit
# if t4 max bit equal to 0 will not overflow
beq t4,x0,not_overflow
# if overflow
slli t1,t1,1 # shift left 1 bits to remove integer
- lw t6,8(a3) # load mask0 mask 0x800000
- add t0,t0,t6 # exponent add 1 if overflow
addi s3,s3 1
j Mult_end # jump to Mult_end
# if not overflow
not_overflow:
slli t1,t1,2 # shift left 2 bits to remove integer
Mult_end:
srli t1,t1,24 # shift right to remove useless bits
addi t1,t1,1 # add 1 little bit to check if carry
srli t1,t1,1 # shift right to remove useless bits
slli t1,t1,16 # shift left to let fraction be right position
- srli t0,t0,23 # shift right to remove useless bits
- slli t0,t0,23 # shift left to let sign and exponent be right position
slli s3,s3,23
or s3,s3 t1
- or t0,t0,t1 # combine t0 and t1 together to get bfloat
- add s3,t0,x0 # store bfloat after multiplication to s3
ret # return to main
### end of function
exit:
jal ra, get_cycles
sub a1,a0,s8
li a7,PRINT_INT
li a0,1
ecall
mv ra ,x0
li a7,SYSEXIT # set a7 as exit
ecall # ecall
get_cycles:
csrr a1, cycleh
csrr a0, cycle
csrr a2, cycleh
bne a1, a2, get_cycles
ret
```
#### Rev2
loop unrolling in the `Multi_bfloat`
```
loop:
addi s11,s11,1 # add 1 at counter every loop
srli t5,t5,1 # shift right at 1 every loop
and t4,t2,t5 # use mask to specified number at that place
beq t4,x0,not_add # jump if t4 equal to 0
add t1,t1,t3 # add t3 to t1
not_add:
srli t3,t3,1 # shift left 1 bit to t3
srli t5,t5,1 # shift right at 1 every loop
and t4,t2,t5 # use mask to specified number at that place
beq t4,x0,not_add2 # jump if t4 equal to 0
add t1,t1,t3 # add t3 to t1
not_add2:
srli t3,t3,1 # shift left 1 bit to t3
srli t5,t5,1 # shift right at 1 every loop
and t4,t2,t5 # use mask to specified number at that place
beq t4,x0,not_add3 # jump if t4 equal to 0
add t1,t1,t3 # add t3 to t1
not_add3:
srli t3,t3,1 # shift left 1 bit to t3
srli t5,t5,1 # shift right at 1 every loop
and t4,t2,t5 # use mask to specified number at that place
beq t4,x0,not_add4 # jump if t4 equal to 0
add t1,t1,t3 # add t3 to t1
not_add4:
srli t3,t3,1 # shift left 1 bit to t3
srli t5,t5,1 # shift right at 1 every loop
and t4,t2,t5 # use mask to specified number at that place
beq t4,x0,not_add5 # jump if t4 equal to 0
add t1,t1,t3 # add t3 to t1
not_add5:
srli t3,t3,1 # shift left 1 bit to t3
srli t5,t5,1 # shift right at 1 every loop
and t4,t2,t5 # use mask to specified number at that place
beq t4,x0,not_add6 # jump if t4 equal to 0
add t1,t1,t3 # add t3 to t1
not_add6:
srli t3,t3,1 # shift left 1 bit to t3
srli t5,t5,1 # shift right at 1 every loop
and t4,t2,t5 # use mask to specified number at that place
beq t4,x0,not_add7 # jump if t4 equal to 0
add t1,t1,t3 # add t3 to t1
not_add7:
srli t3,t3,1 # shift left 1 bit to t3
srli t5,t5,1 # shift right at 1 every loop
and t4,t2,t5 # use mask to specified number at that place
beq t4,x0,out_loop # jump if t4 equal to 0
add t1,t1,t3 # add t3 to t1
# end of loop
```
As the shown table, the cycle and instret reduced from *200* to *171*.
| | original | rev 1 | rev 2 |
|:-------:|:--------:|:-------:|:-------:|
| INSTRET | 200 | 187 | ==171== |
| CYCLE | 200 | 187 | 171 |
| text | 692 | 596 | 728 |
| data | 0 | 0 | 0 |
| bss | 0 | 0 | 0 |
| dec | 692 | ==596== | 728 |
In rev 2( loop unrolling), while the number of cycles are the smallest, the size becomes the largest.
| | original(%) | rev 1(%) | rev 2(%) |
|:----:|:-----------:|:--------:|:---------:|
| add | 13.46 | 13.24 | 14.49 |
| srli | 10.26 | 12.5 | ==17.75== |
| addi | 8.97 | 11.03 | 8.88 |
| lw | ==12.18== | 1.47 | 1.18 |
As I mentioned, I use `li`, `srli` to do the bitmask, the table shows that the `lw` operation decreases 10%. Also, due to the loop unrolling in the rev 2, the percentage of `add` and `srli` in the `Multi_bfloat` increases.
### Conclusion
Though using the -Ofast or higher level compiler can perform better, we should consider the size of data, also the hardware implmentation. We need to strike a balance between performance and data volume.
### Reference
* [Reducing memory usage with bfloat and bfloat multiplication](https://hackmd.io/@PWCheng/CAHW01)
* [RV32emu](https://github.com/sysprog21/rv32emu)