Assignment2: RISC-V Toolchain

# Assignment2: RISC-V Toolchain contributed by < [kkkkk1109](https://github.com/kkkkk1109) > ## Rewrite [Reducing memory usage with bfloat and bfloat multiplication](https://hackmd.io/@PWCheng/CAHW01) I chose the problem [Reducing memory usage with bfloat and bfloat multiplication](https://hackmd.io/@PWCheng/CAHW01) from <[`Brian Cheng`](https://github.com/BrianCheng-TheLegend?tab=repositories)>. In assignment 1, I utilized CLZ to implement the square root. Since that work involved floating-point division and addition, I wanted to learn more floating-point operations. I also find the idea of packing two bf16 values into one register interesting, and I am curious about the accuracy of bf16 compared with IEEE 754 single precision. Therefore, I chose this subject for assignment 2. **Original C code:** ```c #include<stdio.h> float fp32_to_bf16(float x) { float y = x; int *p = (int *) &y; unsigned int exp = *p & 0x7F800000; unsigned int man = *p & 0x007FFFFF; if (exp == 0 && man == 0) /* zero */ return x; if (exp == 0x7F800000) /* infinity or NaN */ return x; /* Normalized number */ /* round to nearest */ float r = x; int *pr = (int *) &r; *pr &= 0xFF800000; /* r has the same exp as x */ r /= 0X100; y = x + r; *p &= 0xFFFF0000; return y; } // encoder : encode two bfloat number in one memory int encoder(int *a,int *b){ int c=0; c=*a+(*b>>16); *a=0; *b=0; return c; } // decoder : decode one memory number in two bfloat void decoder(int c ,int *n1,int *n2){ *n1=c&0xffff0000; *n2=(c&0x0000ffff)<<16; } int main(){ // definition of num1 and transfer it to bfloat float num1=-12.123; int *np1=(int *) &num1; num1=fp32_to_bf16(num1); // definition of num2 and transfer it to bfloat float num2=45.568; int *np2=(int *) &num2; num2=fp32_to_bf16(num2); float add; int *p=(int *) &add; *p=0; // show num1 binary form and it's value printf("0x%x\n",*np1); printf("%f\n",num1); // show num2 binary form and it's value printf("0x%x\n",*np2); printf("%f\n",num2); // add two number together and print the binary form 
*p=encoder(np1,np2); decoder(*p,&num1,&num2); float mul_num; mul_num=num1*num2; printf("%f\n",mul_num); return 0; }// // The bfloat multiplication still not implement // void bfloat_multiplication(int *num){ // printf("0x%x\n",*num); // int mask[]={0x1,0xff,0x7f}; // int fra=0,exp=0,sig=0; // int bn1=0,bn2=0; // // fraction // bn2 = *num & mask[2]; // bn2 |= 0x100; // mask[2] <<=16; // bn1 = *num & mask[2]; // bn1 >>= 16; // bn1 |= 0x100; // for(int i=0;i<8;i++){ // if((bn1 && 1{ bn1&1 // fra+=bn2; // } // bn1>>1; // bn2<<1; // } // // if((fra & 0x8000)>>16) fra>>16 // // fra >>= 9; // // else // // fra >>=8; // printf("%x\n",fra); // // exponent // mask[1] <<= 7; // bn2 = *num & mask[1]; // mask[1] <<= 16; // bn1 = *num & mask[1]; // bn1 >>= 23; // bn2 >>= 7; // exp=bn1+bn2-127; // printf("%x\n",exp); // // sign // bn1 = 0; // bn2 = 0; // mask[0] <<= 15; // bn2= *num & mask[0]; // bn2 <<= 16; // mask[0] <<= 16; // bn1 = *num & mask[0]; // bn1 ^= bn2; // sig=bn1; // *num = sig | (fra << 15) | (exp << 22); // } ``` In the original C code, `bfloat_multiplication` is never called. I also found some mistakes in the functions `bfloat_multiplication` and `main`, so I rewrote the C code. 
```c
/* Requires <stdint.h> and <stdio.h>. */

/*
 * Multiply the two bfloat16 values packed into *num and return the product.
 *
 * num: points to one 32-bit word; bits 31..16 hold the first bfloat16 and
 *      bits 15..0 hold the second (the layout produced by encoder()).
 * Returns the bfloat16 product widened to a float (low 16 bits are zero).
 *
 * The significands are multiplied by shift-and-add, so no `*` operator is
 * used. The result is rounded to nearest, ties to even.
 * A zero operand (exponent field 0; subnormals are treated as zero) gives a
 * signed zero, exponent overflow gives a signed infinity and exponent
 * underflow is flushed to a signed zero. Inf/NaN operands are not handled.
 * Intermediate values are printed for debugging.
 */
float bfloat_multiplication(int *num){
    const uint32_t packed = (uint32_t)*num;
    const unsigned int bf1 = (packed >> 16) & 0xffff;
    const unsigned int bf2 = packed & 0xffff;
    printf("bf1 = %x\n",bf1);
    printf("bf2 = %x\n",bf2);

    const unsigned int sign = (bf1 ^ bf2) >> 15;
    const int e1 = (int)((bf1 >> 7) & 0xff);
    const int e2 = (int)((bf2 >> 7) & 0xff);

    /* A union is used to reinterpret the bits: casting &bits to float*
       violates strict aliasing and is miscompiled at -O2 and above. */
    union { uint32_t u; float f; } out;

    if (e1 == 0 || e2 == 0) {
        out.u = (uint32_t)sign << 31;
        return out.f;
    }

    /* 8-bit significands: implicit leading 1 plus the 7 stored bits */
    unsigned int f1 = (bf1 & 0x7f) | 0x80;
    unsigned int f2 = (bf2 & 0x7f) | 0x80;
    printf("f1 = %x\n",f1);
    printf("f2 = %x\n",f2);

    /* shift-and-add multiply; the product has its leading 1 at bit 15 or 14 */
    unsigned int prod = 0;
    for (int i = 0; i < 8; i++) {
        if (f1 & 1) {
            prod += f2;
        }
        f1 >>= 1;
        f2 <<= 1;
    }

    /* carry is 1 when the product is >= 2.0 and needs one extra right shift */
    unsigned int carry = prod >> 15;
    const unsigned int drop = 7 + carry;   /* number of bits to discard */
    unsigned int fra = prod >> drop;       /* 8-bit significand, 0x80..0xff */

    /* round to nearest even; sticky covers every bit below the round bit */
    const unsigned int round_bit = (prod >> (drop - 1)) & 1;
    const unsigned int sticky = (prod & ((1u << (drop - 1)) - 1)) != 0;
    fra += round_bit & ((fra & 1) | sticky);
    if (fra >> 8) {
        /* rounding carried out of the significand: 0x100 -> 0x80, exponent + 1 */
        fra >>= 1;
        carry++;
    }
    printf("frac after rounded %x\n",fra);

    int expo = e1 + e2 - 127 + (int)carry;
    if (expo >= 255) {          /* overflow -> infinity */
        expo = 255;
        fra = 0x80;
    } else if (expo <= 0) {     /* underflow -> zero */
        expo = 0;
        fra = 0x80;
    }
    printf("exp: %d\n",expo);
    printf("sig: %d\n",(int)sign);

    out.u = ((uint32_t)sign << 31) | ((uint32_t)expo << 23)
          | ((uint32_t)(fra & 0x7f) << 16);
    return out.f;
}
```

```c
int main(){
    /* Each value lives in a union so that its bit pattern can be read and
       rewritten as an int without casting a float* to an int*. */
    union { float f; int i; } num1 = { .f = -12.123 }, num2 = { .f = 45.568 };

    // convert both numbers to bfloat16
    num1.f = fp32_to_bf16(num1.f);
    num2.f = fp32_to_bf16(num2.f);

    // show each bfloat16 value and its bit pattern
    printf("bf float of %f is 0x%x\n",num1.f,(unsigned int)num1.i);
    printf("bf float of %f is 0x%x\n",num2.f,(unsigned int)num2.i);

    // pack the two bfloat16 values into one word and print it
    int packed = encoder(&num1.i,&num2.i);
    printf("encoder: %x \n",(unsigned int)packed);

    // unpack again; decoder() takes int pointers, not float pointers
    decoder(packed,&num1.i,&num2.i);

    float ans = bfloat_multiplication(&packed);
    printf("%f\n",ans);
    return 0;
}
```

Instead of using the `*` operator in C, I implement `bfloat_multiplication` to do the multiplication.
Compile the C code with different optimization levels | Optimization level | -O0 | -O1 | -O2 | -O3&-Ofast | -Os | |:---------------:|:-----:|:-----:| ----- |:----------:|:---------:| | CYCLE | 10342 | 9688 | 9687 | ==9636== | 9703 | | instret | 10342 | 8662 | 8661 | ==8610== | 8677 | | text | 55718 | 54610 | 54598 | 54726 | 54574 | | data | 1924 | 1928 | 1928 | 1928 | 1928 | | bss | 1528 | 1528 | 1528 | 1528 | 1528 | | dec | 59170 | 58066 | 58054 | 58182 | ==58030== | Loop unrolling | Optimization level | -O0 | -O1 | -O2&-O3 | -Ofast | -Os | |:---------------:|:-----:|:-----:| ------- |:------:|:-----:| | CYCLE | 10289 | 9632 | 9631 | ==9514== | 9655 | | instret | 9256 | 8606 | 8605 | ==8498== | 8629 | | text | 56078 | 54706 | 54706 | 54514 | 54694 | | data | 1924 | 1928 | 1928 | 1936 | 1928 | | bss | 1528 | 1528 | 1528 | 1528 | 1528 | | dec | 59530 | 58162 | 58162 | ==57978== | 58150 | #### Different optimization levels in GCC **-O0** - This level involves ==no optimization== and produces code that closely mirrors the source code structure. It is primarily used for debugging but offers lower performance. **-O1** - At this level, basic optimizations like removing unused local variables and reducing stack storage for local variables are enabled. It improves execution speed while maintaining shorter compile times. **-O2** - Also known as Level 2 optimization, this level goes a step further, applying more code transformations and performance optimizations. It significantly enhances code execution speed with a slightly longer compilation time. **-O3** - This represents a high level of optimization, involving a multitude of code transformations and performance enhancements. It leads to a substantial improvement in code execution speed, but may introduce more complex generated code and, at times, instability. **-Ofast** - Similar to -O3, this level adds extra optimization options, potentially sacrificing some mathematical precision to boost performance. 
It's valuable for scientific computing but may not be suitable for all software. **-Os** - The goal here is to optimize code for a ==reduced size== of the generated executable rather than maximum performance. It's useful for embedded systems and resource-constrained environments. #### Analysis In both tables, the smallest cycle count and instret are obtained when compiling with `-Ofast`. However, for the loop-unrolled C code the smallest size also occurs with `-Ofast`, whereas it would be expected with `-Os`, as in the first table. Using `rv_histogram` ``` build/rv_histogram -r tests/perfcounter/perfcounts.elf build/rv_histogram tests/perfcounter/perfcounts.elf ``` | Optimization level | -O0 | -O1 | -O2 | -O3&-Ofast | -Os | |:---------------:|:-----:|:-----:| ----- |:----------:|:---------:| | a5 | 16.74% | 15.65% | 15.64% | 15.6% | 15.77% | The biggest change in the histogram is the `a5` register usage, which drops from *16.74%* (no optimization) to about *15.6%*; the usage of the other registers and instructions is nearly the same. #### Problem(unsolved) When compiling with -O2, -O3, -Ofast and -Os, there are warnings. However, the generated ELF file still runs the program. 
``` kkkkk1109@ubuntu:~/rv32emu/tests/perfcounter$ make riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -O2 -Wall -c -o main.o main.c main.c: In function 'bfloat_multiplication': main.c:119:13: warning: dereferencing type-punned pointer will break strict-aliasing rules [-Wstrict-aliasing] 119 | return *(float*)&rm; | ^~~~~~~~~~~ main.c: In function 'fp32_to_bf16': main.c:14:24: warning: 'y' is used uninitialized [-Wuninitialized] 14 | unsigned int exp = *p & 0x7F800000; | ^~ main.c:11:11: note: 'y' declared here 11 | float y = 0; | ^ main.c: In function 'bfloat_multiplication': main.c:119:12: warning: 'rm' is used uninitialized [-Wuninitialized] 119 | return *(float*)&rm; | ^~~~~~~~~~~~ main.c:34:14: note: 'rm' declared here 34 | uint32_t rm = *num; | ^~ riscv-none-elf-gcc -o perfcount.elf getcycles.o getinstret.o sparkle.o main.o ``` These warnings most likely come from the pointer casts used for type punning (e.g. `*(float*)&rm` and `(int *) &y`): they violate C's strict-aliasing rule, so at `-O2` and above GCC may assume the punned object was never written. Copying the bits with `memcpy` or through a `union` should remove them. **Hand-written assembly code** This is the original [assembly code](https://github.com/BrianCheng-TheLegend/2023_ComputerArchitecture/blob/main/Hw01/Hw01.s) To print an integer from the assembly code, I added a function to `syscall.c` ```clike static void syscall_ooo(riscv_t *rv){ uint32_t fd = rv_get_reg(rv, rv_reg_a1); fprintf(stdout,"%d",fd); } ``` and defined 99 as its system call number ```clike #define SUPPORTED_SYSCALLS \ _(ooo, 99) \ ``` To use the `csrr` instruction, we need to modify the `makefile` ``` ASFLAGS = -march=rv32i_zicsr -mabi=ilp32 ``` #### Rev1 I made some improvements to the assembly code. 1. In the original code, the original programmer used `la` and `lw` to load each mask from memory; I changed them to `li`. 2. The original code keeps every bitmask in `t6`, so producing a new mask destroys the previous one even though it is sometimes needed again later. I keep masks in both `t6` and `t5`, which reduces the number of times a bitmask has to be produced. 3. In `Multi_bfloat`, when the significand multiplication overflows, a carry has to be added to the exponent. 
In orginal code, when overflow happens, the bitmask would be load again to do the `add` operation.I found way to streamline the code by changing the sequence of operation in this function. ```diff .org 0 .global _start .set SYSEXIT, 93 .set SYSWRITE, 64 .set PRINT_INT, 99 .data # test data test0: .word 0x4141f9a7,0x423645a2 test1: .word 0x3fa66666,0x42c63333 test2: .word 0x43e43a5e,0x42b1999a # mask # mask0 for exponent ,fraction # ( 0 ,4 ,8 ,12 ,16 ,20 ,24 ) -mask0: .word 0x7F800000,0x007FFFFF,0x800000,0x8000,0x7f,0x3F800000,0x80000000 # mask1 for round -mask1: .word 0x8000 # mask2 for decoder -mask2: .word 0xFFFF0000,0x0000FFFF #string str: .string "\n" str1: .string "Count CYCLE is :" .set str1_size, .-str1 .text start: jal ra,get_cycles #count cycle mv s8,a0 # old cycle li a7,1 la a2,test0 # load test data address to a2 lw a6,0(a2) # load test data to a6 jal ra,f32_b16_p1 # call fp32 to bf16 function add a5,a6,x0 # store first bfloat in a5 lw a6,4(a2) # load test data to a6 jal ra,f32_b16_p1 # jump to float32 transform to bfloat function add a4,a6,x0 # store the result to a4 jal ra,encoder # jump to encoder funtion add s9,s3,x0 # save s3(data after encode) to s9 jal ra,decoder # jump to decoder function jal ra,Multi_bfloat # jump to bfloat Multiplication funcition # print cycle li a0,1 la a1, str1 la a2,str1_size li a7,SYSWRITE ecall # ecall j exit # jump to exit this program ### function converts IEEE754 fp32 to bfloat16 f32_b16_p1: sw a6,0(sp) - add t0,a6,x0 # a6 will be only for this funtion to access +mv t0, a6 - la a3,mask0 # load mask0 address to a3 # exponent - lw t6,0(a3) # load mask 0x7F800000 to t6 + li t6,0x7F800000 and t1,t0,t6 # let exponent save to t1 # fraction - lw t6,4(a3) # load 0x007FFFFF to t6 + li t5, 0x007FFFFF # and t2,t0,t5 # let fraction save to t2 # check this number if 0 or inf (exponent + fraction) - lw t6,0(a3) # load mask 0x7F800000 to t6 t6沒有變不用這行 beq t1,t6,inf_or_zero # exp == 0x7F800000 or t3,t1,t2 beq t3,x0,inf_or_zero # exp == 
0 && man == 0 # add integer to fraction - lw t6,8(a3) # load integer + li t6,0x800000 # li mask or t2,t2,t6 # add integer 0.111+1 # round to nearest for fraction - lw t6,12(a3) # load the round number + srli t6,t6,8 #load mask add t2,t2,t6 # add round number srli t4,t2,24 # shift left 24 to t5 + li,t6,0x7f beq t4,x0,no_overflow # if t5 equal to 0 move to no_overflow # if overflow - lw t6,8(a3) # load mask 0x007FFFFF add t1,t1,t5 # add 1 to exponent change t6 to t5 srli t2,t2,17 # shift t2 to left 1 integer and 7 fraction - lw t6,16(a3) # load mask 0x7f + li t6,0x7f and t2,t2,t6 # let t2 only have integer slli t2,t2,16 # shift right 16 j f32_b16_p2 # if not overflow no_overflow: srli t2,t2,16 # shift t2 to left 1 integer and 7 fraction -lw t6,16(a3) # load mask 0x7f and t2,t2,t6 # let t2 only have integer slli t2,t2,16 # shift right 16 #f32_b16 end function f32_b16_p2: # save to a6 srli t0,t0,31 # shift left to let t0 remain sign slli t0,t0,31 # shift right to let t0 sign to the right position or t0,t0,t1 # combine sign and exponent together or t0,t0,t2 # combine sign,exponent and fraction together add a6,t0,x0 # save t0 to a6 ret # move back to main function inf_or_zero: srli a6,a6,16 slli a6,a6,16 ret # return to main ### end of funtion ### encode two bfloat to one register encoder: add t0,a5,x0 # load a5(first bfloat) to t0 add t1,a4,x0 # load a4(second bfloat) to t1 srli t1,t1,16 # shift to let second bfloat fit in one register or t0,t0,t1 # combine two bfloat in one register add s3,t0,x0 # load t0 to s3 ret # return to main ### decode two bfloat on one register to two registers decoder: add t0,s9,x0 # load s9(data encode) to t0 - la a1,mask2 # load mask2 address - lw s2,0(a1) # load mask 0xFFFF0000 + srli t1,t0,16 + slli t1,t1,16 - and t1,t0,s2 # use mask to specification bfloat 1 - lw s2,4(a1) # load mask 0x0000FFFF + li s2,0x0000ffff and t2,t0,s2 # use mask to specification bfloat 2 slli t2,t2,16 # shift to left to let bfloat peform like original float add 
s6,t1,x0 # store t1(bfloat 1) to s6 add s5,t2,x0 # store t2(bfloat 2) to s5 ret # return to main ### change line cl: li a7,4 # set a7 as string mode la a0,str # load str to a0 ecall # ecall ret # return to main ### Multiplication with bfloat in one register Multi_bfloat: # decoder function input is a0 # jal ra,decoder # load a0(two bloat number in one register) to t0 # decoder function output is s5,s6 add t0,s5,x0 # store s5(bfloat 2) to t0 add t1,s6,x0 # store s6(bfloat 1) to t1 - lw t6,0(a3) # load mask0 mask 0x7F800000 + li t6,0x7F800000 # get exponent to t2,t3 and t3,t0,t6 # use mask 0x7F800000 to get t0 exponent and t2,t1,t6 # use mask 0x7F800000 to get t1 exponent add t3,t3,t2 # add two exponent to t3 srli t3,t3,23 # 先右移 - lw t6,20(a3) # load mask0 mask 0x3F800000 - sub t3,t3,t6 # sub 127 to exponent + addi t3,t3,-127 mv s3, t3 #exp 存到s3 # get sign xor t2,t0,t1 # get sign and store on t2 srli t2,t2,31 # get rid of useless data slli t2,t2,22 # let sign back to right position sig +exp放在 0~8 or s3,s3,t2 # s3=0x0000最後9=sig+exp # get sign and exponent together - or t3,t3,t2 # set the sign and exponent to t0 - slli t0,t0,9 - srli t0,t0,9 - or t0,t3,t0 # get fraction to t2 and t3 - lw t6,16(a3) # load mask0 mask 0x7F - slli t6,t6,16 # shift mask to 0x7F0000 + li t6,0x7F0000 and t2,t0,t6 # use mask 0x7F0000 get fraction and t3,t1,t6 # use mask 0x7F0000 get fraction slli t2,t2,9 # shift left let no leading 0 srli t2,t2,1 # shift right let leading has one 0 - lw t6,24(a3) # load mask0 mask 0x80000000 + li t6,0x80000000 or t2,t2,t6 # use mask 0x80000000 to add integer srli t2,t2,1 # shift right to add space for overflow slli t3,t3,8 # shift left let no leading 0 or t3,t3,t6 # use mask 0x80000000 to add integer srli t3,t3,1 # shift right to add space for overflow add s11,x0,x0 # set a counter and 0 addi s10,x0,8 # set a end condition add t1,x0,x0 # reset t1 to 0 and let this register be result - lw t6,24(a3) # load mask0 mask 0x80000000 + mv t5,t6 loop: addi s11,s11,1 # 
add 1 at counter every loop srli t5,t5,1 # shift right at 1 every loop and t4,t2,t5 # use mask to specified number at that place beq t4,x0,not_add # jump if t4 equal to 0 add t1,t1,t3 # add t3 to t1 not_add: srli t3,t3,1 # shift left 1 bit to t3 bne s11,s10,loop # if the condition not satisfy return to loop # end of loop # check if overflow - lw t6,24(a3) # load mask0 mask 0x80000000 to t6 and t4,t1,t6 # get t1 max bit # if t4 max bit equal to 0 will not overflow beq t4,x0,not_overflow # if overflow slli t1,t1,1 # shift left 1 bits to remove integer - lw t6,8(a3) # load mask0 mask 0x800000 - add t0,t0,t6 # exponent add 1 if overflow addi s3,s3 1 j Mult_end # jump to Mult_end # if not overflow not_overflow: slli t1,t1,2 # shift left 2 bits to remove integer Mult_end: srli t1,t1,24 # shift right to remove useless bits addi t1,t1,1 # add 1 little bit to check if carry srli t1,t1,1 # shift right to remove useless bits slli t1,t1,16 # shift left to let fraction be right position - srli t0,t0,23 # shift right to remove useless bits - slli t0,t0,23 # shift left to let sign and exponent be right position slli s3,s3,23 or s3,s3 t1 - or t0,t0,t1 # combine t0 and t1 together to get bfloat - add s3,t0,x0 # store bfloat after multiplication to s3 ret # return to main ### end of function exit: jal ra, get_cycles sub a1,a0,s8 li a7,PRINT_INT li a0,1 ecall mv ra ,x0 li a7,SYSEXIT # set a7 as exit ecall # ecall get_cycles: csrr a1, cycleh csrr a0, cycle csrr a2, cycleh bne a1, a2, get_cycles ret ``` #### Rev2 loop unrolling in the `Multi_bfloat` ``` loop: addi s11,s11,1 # add 1 at counter every loop srli t5,t5,1 # shift right at 1 every loop and t4,t2,t5 # use mask to specified number at that place beq t4,x0,not_add # jump if t4 equal to 0 add t1,t1,t3 # add t3 to t1 not_add: srli t3,t3,1 # shift left 1 bit to t3 srli t5,t5,1 # shift right at 1 every loop and t4,t2,t5 # use mask to specified number at that place beq t4,x0,not_add2 # jump if t4 equal to 0 add t1,t1,t3 # add t3 to t1 
not_add2: srli t3,t3,1 # shift left 1 bit to t3 srli t5,t5,1 # shift right at 1 every loop and t4,t2,t5 # use mask to specified number at that place beq t4,x0,not_add3 # jump if t4 equal to 0 add t1,t1,t3 # add t3 to t1 not_add3: srli t3,t3,1 # shift left 1 bit to t3 srli t5,t5,1 # shift right at 1 every loop and t4,t2,t5 # use mask to specified number at that place beq t4,x0,not_add4 # jump if t4 equal to 0 add t1,t1,t3 # add t3 to t1 not_add4: srli t3,t3,1 # shift left 1 bit to t3 srli t5,t5,1 # shift right at 1 every loop and t4,t2,t5 # use mask to specified number at that place beq t4,x0,not_add5 # jump if t4 equal to 0 add t1,t1,t3 # add t3 to t1 not_add5: srli t3,t3,1 # shift left 1 bit to t3 srli t5,t5,1 # shift right at 1 every loop and t4,t2,t5 # use mask to specified number at that place beq t4,x0,not_add6 # jump if t4 equal to 0 add t1,t1,t3 # add t3 to t1 not_add6: srli t3,t3,1 # shift left 1 bit to t3 srli t5,t5,1 # shift right at 1 every loop and t4,t2,t5 # use mask to specified number at that place beq t4,x0,not_add7 # jump if t4 equal to 0 add t1,t1,t3 # add t3 to t1 not_add7: srli t3,t3,1 # shift left 1 bit to t3 srli t5,t5,1 # shift right at 1 every loop and t4,t2,t5 # use mask to specified number at that place beq t4,x0,out_loop # jump if t4 equal to 0 add t1,t1,t3 # add t3 to t1 # end of loop ``` As the shown table, the cycle and instret reduced from *200* to *171*. | | original | rev 1 | rev 2 | |:-------:|:--------:|:-------:|:-------:| | INSTRET | 200 | 187 | ==171== | | CYCLE | 200 | 187 | 171 | | text | 692 | 596 | 728 | | data | 0 | 0 | 0 | | bss | 0 | 0 | 0 | | dec | 692 | ==596== | 728 | In rev 2( loop unrolling), while the number of cycles are the smallest, the size becomes the largest. 
| | original(%) | rev 1(%) | rev 2(%) | |:----:|:-----------:|:--------:|:---------:| | add | 13.46 | 13.24 | 14.49 | | srli | 10.26 | 12.5 | ==17.75== | | addi | 8.97 | 11.03 | 8.88 | | lw | ==12.18== | 1.47 | 1.18 | As mentioned above, I use `li` and `srli` to build the bitmasks, and the table shows that the share of `lw` instructions drops by more than 10 percentage points (from 12.18% to 1.47%). Also, due to the loop unrolling in rev 2, the percentages of `add` and `srli`, which are used in `Multi_bfloat`, increase. ### Conclusion Although compiling with `-Ofast` or another high optimization level can give better performance, we should also consider the size of the generated program and the hardware implementation. We need to strike a balance between performance and size. ### Reference * [Reducing memory usage with bfloat and bfloat multiplication](https://hackmd.io/@PWCheng/CAHW01) * [RV32emu](https://github.com/sysprog21/rv32emu)