Assignment2: RISC-V Toolchain

# Assignment2: RISC-V Toolchain contributed by < `kkkkk1109` > ## Rewrite [Reducing memory usage with bfloat and bfloat multiplication](https://hackmd.io/@PWCheng/CAHW01) go to .s ``` cd target.s address ``` to run the elf file ``` build/rv32emu address/target.elf ``` to compile c code ``` riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O0 hw2.c -o hw2.elf ``` I choose the problem [Reducing memory usage with bfloat and bfloat multiplication](https://leetcode.com/problems/monotonic-array/) from [鄭博文]. In assignment 1 ,I ultilized CLZ to implement the square root. Since there were division and addition of floatint point I want to learn some more floating point operation. Also, I think the topic is quite interesting by merging two bf16 to one register, and I'm wondering the accuracy between bf16 and IEEE754. Therefore, I choose this subject in assignment 2. **Original C code:** ```c #include<stdio.h> float fp32_to_bf16(float x) { float y = x; int *p = (int *) &y; unsigned int exp = *p & 0x7F800000; unsigned int man = *p & 0x007FFFFF; if (exp == 0 && man == 0) /* zero */ return x; if (exp == 0x7F800000) /* infinity or NaN */ return x; /* Normalized number */ /* round to nearest */ float r = x; int *pr = (int *) &r; *pr &= 0xFF800000; /* r has the same exp as x */ r /= 0X100; y = x + r; *p &= 0xFFFF0000; return y; } // encoder : encode two bfloat number in one memory int encoder(int *a,int *b){ int c=0; c=*a+(*b>>16); *a=0; *b=0; return c; } // decoder : decode one memory number in two bfloat void decoder(int c ,int *n1,int *n2){ *n1=c&0xffff0000; *n2=(c&0x0000ffff)<<16; } int main(){ // definition of num1 and transfer it to bfloat float num1=-12.123; int *np1=(int *) &num1; num1=fp32_to_bf16(num1); // definition of num2 and transfer it to bfloat float num2=45.568; int *np2=(int *) &num2; num2=fp32_to_bf16(num2); float add; int *p=(int *) &add; *p=0; // show num1 binary form and it's value printf("0x%x\n",*np1); printf("%f\n",num1); // show num2 binary form and it's value printf("0x%x\n",*np2); printf("%f\n",num2); // add two number together and print the binary form *p=encoder(np1,np2); decoder(*p,&num1,&num2); float mul_num; mul_num=num1*num2; printf("%f\n",mul_num); return 0; }// // The bfloat multiplication still not implement // void bfloat_multiplication(int *num){ // printf("0x%x\n",*num); // int mask[]={0x1,0xff,0x7f}; // int fra=0,exp=0,sig=0; // int bn1=0,bn2=0; // // fraction // bn2 = *num & mask[2]; // bn2 |= 0x100; // mask[2] <<=16; // bn1 = *num & mask[2]; // bn1 >>= 16; // bn1 |= 0x100; // for(int i=0;i<8;i++){ // if((bn1 && 1{ bn1&1 // fra+=bn2; // } // bn1>>1; // bn2<<1; // } // // if((fra & 0x8000)>>16) fra>>16 // // fra >>= 9; // // else // // fra >>=8; // printf("%x\n",fra); // // exponent // mask[1] <<= 7; // bn2 = *num & mask[1]; // mask[1] <<= 16; // bn1 = *num & mask[1]; // bn1 >>= 23; // bn2 >>= 7; // exp=bn1+bn2-127; // printf("%x\n",exp); // // sign // bn1 = 0; // bn2 = 0; // mask[0] <<= 15; // bn2= *num & mask[0]; // bn2 <<= 16; // mask[0] <<= 16; // bn1 = *num & mask[0]; // bn1 ^= bn2; // sig=bn1; // *num = sig | (fra << 15) | (exp << 22); // } ``` In C code, the `bfloat_multiplication` is not used, also, I found the there are some mistakes in function `bfloat_multiplication` and `main`, and I rewrite the c code. ```c float bfloat_multiplication(int *num){ uint32_t r = *num; uint16_t fra=0,exp=0,sig=0,f1=0,f2=0,c=0; uint16_t bf1,bf2; // fraction bf1 = (r >> 16) & 0xffff; bf2 = r & 0xffff; printf("bf1 = %x\n",bf1); printf("bf2 = %x\n",bf2); f1= ( bf1 & 0xff ) | 0x80; f2= ( bf2 & 0xff ) | 0x80; printf("f1 = %x\n",f1); printf("f2 = %x\n",f2); for(int i = 0; i < 8; i ++){ //printf("i=%d\n",i); if(f1 & 1){ fra += f2; } f1 >>= 1; f2 <<= 1; } if(fra>>15){ fra >>= 1; c=1; } fra >>= 5; //rounded int sticky = fra & 1; int round= (fra>>1) &1; int lsb= (fra>>2) & 1; fra= (fra>>2) + (round & (lsb | sticky)); printf("frac after rounded %x\n",fra); exp = (bf1 & (0x7f80)) >> 7; exp += (bf2 & (0x7f80)) >> 7 ; exp=exp-127+c; printf("exp: %d\n",exp); //sign sig = (bf1 >> 15) ^(bf2 >> 15); printf("sig: %d\n",sig); r=(sig << 15) | (exp << 7) | (fra &0x7f); r=r << 16; return *(float*)&r; } ``` ```c int main(){ // definition of num1 and transfer it to bfloat float num1=-12.123; int *np1=(int *) &num1; num1=fp32_to_bf16(num1); // definition of num2 and transfer it to bfloat float num2=45.568; int *np2=(int *) &num2; num2=fp32_to_bf16(num2); float add; int *p=(int *) &add; *p=0; // show num1 binary form and it's value printf("bf float of %f is 0x%x\n",num1,*np1); // show num2 binary form and it's value printf("bf float of %f is 0x%x\n",num2,*np2); // add two number together and print the binary form *p=encoder(np1,np2); printf("encoder: %x \n",*p); decoder(*p,np1,np2);//decoder(*p,&num1,&num2); wrong input float ans=bfloat_multiplication(p); printf("%f\n",ans); return 0; } ``` Except for using * in c , I implement the `bfloat_multiplication` to do the multiplication. Compile with different optimization level, here are the results(orginal c code) ![](https://hackmd.io/_uploads/ryqmNWYf6.png) ![](https://hackmd.io/_uploads/rkMk0I9fp.png) **Hand-written assembly code** I made some improvements to the assembly code. 1. In original code, the original programmer used `la`and`lw` to produce the mask, and I changed them to `li`. 2. The register used to reserve the bitmask is `t6`, when producing a new mask, the old mask would be lost, but sometmes we required the mask we replace before, so I used the `t6` and `t5` to reserve the mask which decrease the times to produce bitmask. 3. In `Multi_bfloat`, when the significand happens overflow, we have to add carry to the exponent. In orginal code, when overflow happens, the bitmask would be load again to do the `add` operation.I found way to streamline the code by changing the sequence of operation in this function. ``` .data # test data test0: .word 0x4141f9a7,0x423645a2 test1: .word 0x3fa66666,0x42c63333 test2: .word 0x43e43a5e,0x42b1999a # mask # mask0 for exponent ,fraction # ( 0 ,4 ,8 ,12 ,16 ,20 ,24 ) mask0: .word 0x7F800000,0x007FFFFF,0x800000,0x8000,0x7f,0x3F800000,0x80000000 # mask1 for round mask1: .word 0x8000 # mask2 for decoder mask2: .word 0xFFFF0000,0x0000FFFF #string str: .string "\n" .text main: li a7,1 la a2,test0 # load test data address to a2 lw a6,0(a2) # load test data to a6 jal ra,f32_b16_p1 # call fp32 to bf16 function add a5,a6,x0 # store first bfloat in a5 lw a6,4(a2) # load test data to a6 jal ra,f32_b16_p1 # jump to float32 transform to bfloat function add a4,a6,x0 # store the result to a4 jal ra,encoder # jump to encoder funtion add s9,s3,x0 # save s3(data after encode) to s9 jal ra,decoder # jump to decoder function jal ra,Multi_bfloat # jump to bfloat Multiplication funcition # Output second bfloat after decoder li a7,2 # set a7 as float mode add a0,x0,s5 # set a0 as s5 ecall # ecall jal ra,cl # change line # Output first bfloat after decoder li a7,2 # set a7 as float mode add a0,x0,s6 # set a0 as s6 ecall # ecall jal ra,cl # change line # Output Multiplication result li a7,2 # set a7 as float mode add a0,x0,s3 # set a0 as s3(Multiplication results) ecall # ecall j exit # jump to exit this program ### function converts IEEE754 fp32 to bfloat16 f32_b16_p1: sw a6,0(sp) -add t0,a6,x0 # a6 will be only for this funtion to access +mv t0, a6 -la a3,mask0 # load mask0 address to a3 # exponent -lw t6,0(a3) # load mask 0x7F800000 to t6 +li t6,0x7F800000 and t1,t0,t6 # let exponent save to t1 # fraction -lw t6,4(a3) # load 0x007FFFFF to t6 +li t5, 0x007FFFFF ##############################不要另外li 減少li次數 and t2,t0,t5 # let fraction save to t2 # check this number if 0 or inf (exponent + fraction) -lw t6,0(a3) # load mask 0x7F800000 to t6 t6沒有變不用這行 beq t1,t6,inf_or_zero # exp == 0x7F800000 or t3,t1,t2 beq t3,x0,inf_or_zero # exp == 0 && man == 0 # add integer to fraction -lw t6,8(a3) # load integer +li t6,0x800000 # li mask or t2,t2,t6 # add integer 0.111+1 # round to nearest for fraction -lw t6,12(a3) # load the round number +srli t6,t6,8 #load mask add t2,t2,t6 # add round number srli t4,t2,24 # shift left 24 to t5 +li,t6,0x7f beq t4,x0,no_overflow # if t5 equal to 0 move to no_overflow # if overflow -lw t6,8(a3) # load mask 0x007FFFFF add t1,t1,t5 # add 1 to exponent change t6 to t5 srli t2,t2,17 # shift t2 to left 1 integer and 7 fraction -lw t6,16(a3) # load mask 0x7f +li t6,0x7f and t2,t2,t6 # let t2 only have integer slli t2,t2,16 # shift right 16 j f32_b16_p2 # if not overflow no_overflow: srli t2,t2,16 # shift t2 to left 1 integer and 7 fraction -lw t6,16(a3) # load mask 0x7f and t2,t2,t6 # let t2 only have integer slli t2,t2,16 # shift right 16 #f32_b16 end function f32_b16_p2: # save to a6 srli t0,t0,31 # shift left to let t0 remain sign slli t0,t0,31 # shift right to let t0 sign to the right position or t0,t0,t1 # combine sign and exponent together or t0,t0,t2 # combine sign,exponent and fraction together add a6,t0,x0 # save t0 to a6 ret # move back to main function inf_or_zero: srli a6,a6,16 slli a6,a6,16 ret # return to main ### end of funtion ### encode two bfloat to one register encoder: add t0,a5,x0 # load a5(first bfloat) to t0 add t1,a4,x0 # load a4(second bfloat) to t1 srli t1,t1,16 # shift to let second bfloat fit in one register or t0,t0,t1 # combine two bfloat in one register add s3,t0,x0 # load t0 to s3 ret # return to main ### decode two bfloat on one register to two registers decoder: add t0,s9,x0 # load s9(data encode) to t0 -la a1,mask2 # load mask2 address -lw s2,0(a1) # load mask 0xFFFF0000 +srli t1,t0,16 +slli t1,t1,16 -and t1,t0,s2 # use mask to specification bfloat 1 -lw s2,4(a1) # load mask 0x0000FFFF +li s2,0x0000ffff and t2,t0,s2 # use mask to specification bfloat 2 slli t2,t2,16 # shift to left to let bfloat peform like original float add s6,t1,x0 # store t1(bfloat 1) to s6 add s5,t2,x0 # store t2(bfloat 2) to s5 ret # return to main ### change line cl: li a7,4 # set a7 as string mode la a0,str # load str to a0 ecall # ecall ret # return to main ### Multiplication with bfloat in one register Multi_bfloat: # decoder function input is a0 # jal ra,decoder # load a0(two bloat number in one register) to t0 # decoder function output is s5,s6 add t0,s5,x0 # store s5(bfloat 2) to t0 add t1,s6,x0 # store s6(bfloat 1) to t1 -lw t6,0(a3) # load mask0 mask 0x7F800000 +li t6,0x7F800000 # get exponent to t2,t3 and t3,t0,t6 # use mask 0x7F800000 to get t0 exponent and t2,t1,t6 # use mask 0x7F800000 to get t1 exponent add t3,t3,t2 # add two exponent to t3 srli t3,t3,23 # 先右移 -lw t6,20(a3) # load mask0 mask 0x3F800000 -sub t3,t3,t6 # sub 127 to exponent +addi t3,t3,-127 mv s3, t3 #exp 存到s3 # get sign xor t2,t0,t1 # get sign and store on t2 srli t2,t2,31 # get rid of useless data slli t2,t2,22 # let sign back to right position sig +exp放在 0~8 or s3,s3,t2 # s3=0x0000最後9=sig+exp # get sign and exponent together -or t3,t3,t2 # set the sign and exponent to t0 -slli t0,t0,9 -srli t0,t0,9 -or t0,t3,t0 # get fraction to t2 and t3 -lw t6,16(a3) # load mask0 mask 0x7F -slli t6,t6,16 # shift mask to 0x7F0000 +li t6,0x7F0000 and t2,t0,t6 # use mask 0x7F0000 get fraction and t3,t1,t6 # use mask 0x7F0000 get fraction slli t2,t2,9 # shift left let no leading 0 srli t2,t2,1 # shift right let leading has one 0 -lw t6,24(a3) # load mask0 mask 0x80000000 +li t6,0x80000000 or t2,t2,t6 # use mask 0x80000000 to add integer srli t2,t2,1 # shift right to add space for overflow slli t3,t3,8 # shift left let no leading 0 or t3,t3,t6 # use mask 0x80000000 to add integer srli t3,t3,1 # shift right to add space for overflow add s11,x0,x0 # set a counter and 0 addi s10,x0,8 # set a end condition add t1,x0,x0 # reset t1 to 0 and let this register be result -lw t6,24(a3) # load mask0 mask 0x80000000 +mv t5,t6 loop: addi s11,s11,1 # add 1 at counter every loop srli t5,t5,1 # shift right at 1 every loop and t4,t2,t5 # use mask to specified number at that place beq t4,x0,not_add # jump if t4 equal to 0 add t1,t1,t3 # add t3 to t1 not_add: srli t3,t3,1 # shift left 1 bit to t3 bne s11,s10,loop # if the condition not satisfy return to loop # end of loop # check if overflow -lw t6,24(a3) # load mask0 mask 0x80000000 to t6 and t4,t1,t6 # get t1 max bit # if t4 max bit equal to 0 will not overflow beq t4,x0,not_overflow # if overflow slli t1,t1,1 # shift left 1 bits to remove integer -lw t6,8(a3) # load mask0 mask 0x800000 -add t0,t0,t6 # exponent add 1 if overflow addi s3,s3 1 j Mult_end # jump to Mult_end # if not overflow not_overflow: slli t1,t1,2 # shift left 2 bits to remove integer Mult_end: srli t1,t1,24 # shift right to remove useless bits addi t1,t1,1 # add 1 little bit to check if carry srli t1,t1,1 # shift right to remove useless bits slli t1,t1,16 # shift left to let fraction be right position -srli t0,t0,23 # shift right to remove useless bits -slli t0,t0,23 # shift left to let sign and exponent be right position slli s3,s3,23 or s3,s3 t1 -or t0,t0,t1 # combine t0 and t1 together to get bfloat -add s3,t0,x0 # store bfloat after multiplication to s3 ret # return to main ### end of function exit: li a7,93 # set a7 as exit ecall # ecall ############ Check every bits # li a7,35 # set a7 as binary mode # add a0,t0,x0 # store print data to a0 # ecall # ecall ############ ``` Compile the c code with different optimization level | Column 1 | -o | -1 | -2 | -3 | -0fas | -------- | ---- | ---- | --- | --- | --- | | Text | Text | Text | | | |