# Assignment2: RISC-V Toolchain
contributed by < `kkkkk1109` >
## Rewrite [Reducing memory usage with bfloat and bfloat multiplication](https://hackmd.io/@PWCheng/CAHW01)
go to .s
```
cd target.s address
```
to run the elf file
```
build/rv32emu address/target.elf
```
to compile c code
```
riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O0 hw2.c -o hw2.elf
```
I choose the problem [Reducing memory usage with bfloat and bfloat multiplication](https://leetcode.com/problems/monotonic-array/) from [鄭博文]. In assignment 1 ,I ultilized CLZ to implement the square root. Since there were division and addition of floatint point I want to learn some more floating point operation. Also, I think the topic is quite interesting by merging two bf16 to one register, and I'm wondering the accuracy between bf16 and IEEE754. Therefore, I choose this subject in assignment 2.
**Original C code:**
```c
#include<stdio.h>
float fp32_to_bf16(float x)
{
float y = x;
int *p = (int *) &y;
unsigned int exp = *p & 0x7F800000;
unsigned int man = *p & 0x007FFFFF;
if (exp == 0 && man == 0) /* zero */
return x;
if (exp == 0x7F800000) /* infinity or NaN */
return x;
/* Normalized number */
/* round to nearest */
float r = x;
int *pr = (int *) &r;
*pr &= 0xFF800000; /* r has the same exp as x */
r /= 0X100;
y = x + r;
*p &= 0xFFFF0000;
return y;
}
// encoder : encode two bfloat number in one memory
int encoder(int *a,int *b){
int c=0;
c=*a+(*b>>16);
*a=0;
*b=0;
return c;
}
// decoder : decode one memory number in two bfloat
void decoder(int c ,int *n1,int *n2){
*n1=c&0xffff0000;
*n2=(c&0x0000ffff)<<16;
}
int main(){
// definition of num1 and transfer it to bfloat
float num1=-12.123;
int *np1=(int *) &num1;
num1=fp32_to_bf16(num1);
// definition of num2 and transfer it to bfloat
float num2=45.568;
int *np2=(int *) &num2;
num2=fp32_to_bf16(num2);
float add;
int *p=(int *) &add;
*p=0;
// show num1 binary form and it's value
printf("0x%x\n",*np1);
printf("%f\n",num1);
// show num2 binary form and it's value
printf("0x%x\n",*np2);
printf("%f\n",num2);
// add two number together and print the binary form
*p=encoder(np1,np2);
decoder(*p,&num1,&num2);
float mul_num;
mul_num=num1*num2;
printf("%f\n",mul_num);
return 0;
}// // The bfloat multiplication still not implement
// void bfloat_multiplication(int *num){
// printf("0x%x\n",*num);
// int mask[]={0x1,0xff,0x7f};
// int fra=0,exp=0,sig=0;
// int bn1=0,bn2=0;
// // fraction
// bn2 = *num & mask[2];
// bn2 |= 0x100;
// mask[2] <<=16;
// bn1 = *num & mask[2];
// bn1 >>= 16;
// bn1 |= 0x100;
// for(int i=0;i<8;i++){
// if((bn1 && 1{ bn1&1
// fra+=bn2;
// }
// bn1>>1;
// bn2<<1;
// }
// // if((fra & 0x8000)>>16) fra>>16
// // fra >>= 9;
// // else
// // fra >>=8;
// printf("%x\n",fra);
// // exponent
// mask[1] <<= 7;
// bn2 = *num & mask[1];
// mask[1] <<= 16;
// bn1 = *num & mask[1];
// bn1 >>= 23;
// bn2 >>= 7;
// exp=bn1+bn2-127;
// printf("%x\n",exp);
// // sign
// bn1 = 0;
// bn2 = 0;
// mask[0] <<= 15;
// bn2= *num & mask[0];
// bn2 <<= 16;
// mask[0] <<= 16;
// bn1 = *num & mask[0];
// bn1 ^= bn2;
// sig=bn1;
// *num = sig | (fra << 15) | (exp << 22);
// }
```
In C code, the `bfloat_multiplication` is not used, also, I found the there are some mistakes in function `bfloat_multiplication` and `main`, and I rewrite the c code.
```c
float bfloat_multiplication(int *num){
uint32_t r = *num;
uint16_t fra=0,exp=0,sig=0,f1=0,f2=0,c=0;
uint16_t bf1,bf2;
// fraction
bf1 = (r >> 16) & 0xffff;
bf2 = r & 0xffff;
printf("bf1 = %x\n",bf1);
printf("bf2 = %x\n",bf2);
f1= ( bf1 & 0xff ) | 0x80;
f2= ( bf2 & 0xff ) | 0x80;
printf("f1 = %x\n",f1);
printf("f2 = %x\n",f2);
for(int i = 0; i < 8; i ++){
//printf("i=%d\n",i);
if(f1 & 1){
fra += f2;
}
f1 >>= 1;
f2 <<= 1;
}
if(fra>>15){
fra >>= 1;
c=1;
}
fra >>= 5;
//rounded
int sticky = fra & 1;
int round= (fra>>1) &1;
int lsb= (fra>>2) & 1;
fra= (fra>>2) + (round & (lsb | sticky));
printf("frac after rounded %x\n",fra);
exp = (bf1 & (0x7f80)) >> 7;
exp += (bf2 & (0x7f80)) >> 7 ;
exp=exp-127+c;
printf("exp: %d\n",exp);
//sign
sig = (bf1 >> 15) ^(bf2 >> 15);
printf("sig: %d\n",sig);
r=(sig << 15) | (exp << 7) | (fra &0x7f);
r=r << 16;
return *(float*)&r;
}
```
```c
int main(){
// definition of num1 and transfer it to bfloat
float num1=-12.123;
int *np1=(int *) &num1;
num1=fp32_to_bf16(num1);
// definition of num2 and transfer it to bfloat
float num2=45.568;
int *np2=(int *) &num2;
num2=fp32_to_bf16(num2);
float add;
int *p=(int *) &add;
*p=0;
// show num1 binary form and it's value
printf("bf float of %f is 0x%x\n",num1,*np1);
// show num2 binary form and it's value
printf("bf float of %f is 0x%x\n",num2,*np2);
// add two number together and print the binary form
*p=encoder(np1,np2);
printf("encoder: %x \n",*p);
decoder(*p,np1,np2);//decoder(*p,&num1,&num2); wrong input
float ans=bfloat_multiplication(p);
printf("%f\n",ans);
return 0;
}
```
Except for using * in c , I implement the `bfloat_multiplication` to do the multiplication.
Compile with different optimization level, here are the results(orginal c code)


**Hand-written assembly code**
I made some improvements to the assembly code.
1. In original code, the original programmer used `la`and`lw` to produce the mask, and I changed them to `li`.
2. The register used to reserve the bitmask is `t6`, when producing a new mask, the old mask would be lost, but sometmes we required the mask we replace before, so I used the `t6` and `t5` to reserve the mask which decrease the times to produce bitmask.
3. In `Multi_bfloat`, when the significand happens overflow, we have to add carry to the exponent. In orginal code, when overflow happens, the bitmask would be load again to do the `add` operation.I found way to streamline the code by changing the sequence of operation in this function.
```
.data
# test data
test0: .word 0x4141f9a7,0x423645a2
test1: .word 0x3fa66666,0x42c63333
test2: .word 0x43e43a5e,0x42b1999a
# mask
# mask0 for exponent ,fraction
# ( 0 ,4 ,8 ,12 ,16 ,20 ,24 )
mask0: .word 0x7F800000,0x007FFFFF,0x800000,0x8000,0x7f,0x3F800000,0x80000000
# mask1 for round
mask1: .word 0x8000
# mask2 for decoder
mask2: .word 0xFFFF0000,0x0000FFFF
#string
str: .string "\n"
.text
main:
li a7,1
la a2,test0 # load test data address to a2
lw a6,0(a2) # load test data to a6
jal ra,f32_b16_p1 # call fp32 to bf16 function
add a5,a6,x0 # store first bfloat in a5
lw a6,4(a2) # load test data to a6
jal ra,f32_b16_p1 # jump to float32 transform to bfloat function
add a4,a6,x0 # store the result to a4
jal ra,encoder # jump to encoder funtion
add s9,s3,x0 # save s3(data after encode) to s9
jal ra,decoder # jump to decoder function
jal ra,Multi_bfloat # jump to bfloat Multiplication funcition
# Output second bfloat after decoder
li a7,2 # set a7 as float mode
add a0,x0,s5 # set a0 as s5
ecall # ecall
jal ra,cl # change line
# Output first bfloat after decoder
li a7,2 # set a7 as float mode
add a0,x0,s6 # set a0 as s6
ecall # ecall
jal ra,cl # change line
# Output Multiplication result
li a7,2 # set a7 as float mode
add a0,x0,s3 # set a0 as s3(Multiplication results)
ecall # ecall
j exit # jump to exit this program
### function converts IEEE754 fp32 to bfloat16
f32_b16_p1:
sw a6,0(sp)
-add t0,a6,x0 # a6 will be only for this funtion to access
+mv t0, a6
-la a3,mask0 # load mask0 address to a3
# exponent
-lw t6,0(a3) # load mask 0x7F800000 to t6
+li t6,0x7F800000
and t1,t0,t6 # let exponent save to t1
# fraction
-lw t6,4(a3) # load 0x007FFFFF to t6
+li t5, 0x007FFFFF ##############################不要另外li 減少li次數
and t2,t0,t5 # let fraction save to t2
# check this number if 0 or inf (exponent + fraction)
-lw t6,0(a3) # load mask 0x7F800000 to t6 t6沒有變 不用這行
beq t1,t6,inf_or_zero # exp == 0x7F800000
or t3,t1,t2
beq t3,x0,inf_or_zero # exp == 0 && man == 0
# add integer to fraction
-lw t6,8(a3) # load integer
+li t6,0x800000 # li mask
or t2,t2,t6 # add integer 0.111+1
# round to nearest for fraction
-lw t6,12(a3) # load the round number
+srli t6,t6,8 #load mask
add t2,t2,t6 # add round number
srli t4,t2,24 # shift left 24 to t5
+li,t6,0x7f
beq t4,x0,no_overflow # if t5 equal to 0 move to
no_overflow
# if overflow
-lw t6,8(a3) # load mask 0x007FFFFF
add t1,t1,t5 # add 1 to exponent change t6 to t5
srli t2,t2,17 # shift t2 to left 1 integer and 7 fraction
-lw t6,16(a3) # load mask 0x7f
+li t6,0x7f
and t2,t2,t6 # let t2 only have integer
slli t2,t2,16 # shift right 16
j f32_b16_p2
# if not overflow
no_overflow:
srli t2,t2,16 # shift t2 to left 1 integer and 7 fraction
-lw t6,16(a3) # load mask 0x7f
and t2,t2,t6 # let t2 only have integer
slli t2,t2,16 # shift right 16
#f32_b16 end function
f32_b16_p2:
# save to a6
srli t0,t0,31 # shift left to let t0 remain sign
slli t0,t0,31 # shift right to let t0 sign to the right position
or t0,t0,t1 # combine sign and exponent together
or t0,t0,t2 # combine sign,exponent and fraction together
add a6,t0,x0 # save t0 to a6
ret # move back to main function
inf_or_zero:
srli a6,a6,16
slli a6,a6,16
ret # return to main
### end of funtion
### encode two bfloat to one register
encoder:
add t0,a5,x0 # load a5(first bfloat) to t0
add t1,a4,x0 # load a4(second bfloat) to t1
srli t1,t1,16 # shift to let second bfloat fit in one register
or t0,t0,t1 # combine two bfloat in one register
add s3,t0,x0 # load t0 to s3
ret # return to main
### decode two bfloat on one register to two registers
decoder:
add t0,s9,x0 # load s9(data encode) to t0
-la a1,mask2 # load mask2 address
-lw s2,0(a1) # load mask 0xFFFF0000
+srli t1,t0,16
+slli t1,t1,16
-and t1,t0,s2 # use mask to specification bfloat 1
-lw s2,4(a1) # load mask 0x0000FFFF
+li s2,0x0000ffff
and t2,t0,s2 # use mask to specification bfloat 2
slli t2,t2,16 # shift to left to let bfloat peform like original float
add s6,t1,x0 # store t1(bfloat 1) to s6
add s5,t2,x0 # store t2(bfloat 2) to s5
ret # return to main
### change line
cl:
li a7,4 # set a7 as string mode
la a0,str # load str to a0
ecall # ecall
ret # return to main
### Multiplication with bfloat in one register
Multi_bfloat:
# decoder function input is a0
# jal ra,decoder # load a0(two bloat number in one register) to t0
# decoder function output is s5,s6
add t0,s5,x0 # store s5(bfloat 2) to t0
add t1,s6,x0 # store s6(bfloat 1) to t1
-lw t6,0(a3) # load mask0 mask 0x7F800000
+li t6,0x7F800000
# get exponent to t2,t3
and t3,t0,t6 # use mask 0x7F800000 to get t0 exponent
and t2,t1,t6 # use mask 0x7F800000 to get t1 exponent
add t3,t3,t2 # add two exponent to t3
srli t3,t3,23 # 先右移
-lw t6,20(a3) # load mask0 mask 0x3F800000
-sub t3,t3,t6 # sub 127 to exponent
+addi t3,t3,-127
mv s3, t3 #exp 存到s3
# get sign
xor t2,t0,t1 # get sign and store on t2
srli t2,t2,31 # get rid of useless data
slli t2,t2,22 # let sign back to right position sig +exp放在 0~8
or s3,s3,t2 # s3=0x0000最後9=sig+exp
# get sign and exponent together
-or t3,t3,t2
# set the sign and exponent to t0
-slli t0,t0,9
-srli t0,t0,9
-or t0,t3,t0
# get fraction to t2 and t3
-lw t6,16(a3) # load mask0 mask 0x7F
-slli t6,t6,16 # shift mask to 0x7F0000
+li t6,0x7F0000
and t2,t0,t6 # use mask 0x7F0000 get fraction
and t3,t1,t6 # use mask 0x7F0000 get fraction
slli t2,t2,9 # shift left let no leading 0
srli t2,t2,1 # shift right let leading has one 0
-lw t6,24(a3) # load mask0 mask 0x80000000
+li t6,0x80000000
or t2,t2,t6 # use mask 0x80000000 to add integer
srli t2,t2,1 # shift right to add space for overflow
slli t3,t3,8 # shift left let no leading 0
or t3,t3,t6 # use mask 0x80000000 to add integer
srli t3,t3,1 # shift right to add space for overflow
add s11,x0,x0 # set a counter and 0
addi s10,x0,8 # set a end condition
add t1,x0,x0 # reset t1 to 0 and let this register be result
-lw t6,24(a3) # load mask0 mask 0x80000000
+mv t5,t6
loop:
addi s11,s11,1 # add 1 at counter every loop
srli t5,t5,1 # shift right at 1 every loop
and t4,t2,t5 # use mask to specified number at that place
beq t4,x0,not_add # jump if t4 equal to 0
add t1,t1,t3 # add t3 to t1
not_add:
srli t3,t3,1 # shift left 1 bit to t3
bne s11,s10,loop # if the condition not satisfy return to loop
# end of loop
# check if overflow
-lw t6,24(a3) # load mask0 mask 0x80000000 to t6
and t4,t1,t6 # get t1 max bit
# if t4 max bit equal to 0 will not overflow
beq t4,x0,not_overflow
# if overflow
slli t1,t1,1 # shift left 1 bits to remove integer
-lw t6,8(a3) # load mask0 mask 0x800000
-add t0,t0,t6 # exponent add 1 if overflow
addi s3,s3 1
j Mult_end # jump to Mult_end
# if not overflow
not_overflow:
slli t1,t1,2 # shift left 2 bits to remove integer
Mult_end:
srli t1,t1,24 # shift right to remove useless bits
addi t1,t1,1 # add 1 little bit to check if carry
srli t1,t1,1 # shift right to remove useless bits
slli t1,t1,16 # shift left to let fraction be right position
-srli t0,t0,23 # shift right to remove useless bits
-slli t0,t0,23 # shift left to let sign and exponent be right position
slli s3,s3,23
or s3,s3 t1
-or t0,t0,t1 # combine t0 and t1 together to get bfloat
-add s3,t0,x0 # store bfloat after multiplication to s3
ret # return to main
### end of function
exit:
li a7,93 # set a7 as exit
ecall # ecall
############ Check every bits
# li a7,35 # set a7 as binary mode
# add a0,t0,x0 # store print data to a0
# ecall # ecall
############
```
Compile the c code with different optimization level
| Column 1 | -o | -1 | -2 | -3 | -0fas
| -------- | ---- | ---- | --- | --- | --- |
| Text | Text | Text | | | |