# Assignment2: RISC-V Toolchain > contributed by < [brian049](https://github.com/brian049/2023_Computer_Architecture/tree/main/hw2) > ## Question Selection The following question is picked from Assignment 1 > 魏泳禎 Implement and analyze BF16 multiplication [Source code](https://github.com/aa860630/2023-computer-architecture/tree/main/HW1) ### Motivation I chose this proposal because my Quiz 1 topic is the same as the author's, but the author used fewer instructions than I did and added new features. ## Rewrite code Because the author didn't provide the complete C code with a test case, I rewrote the C code with a main function and a test case. ### Modified C code ```c= # include<stdio.h> float fp32_to_bf16(float x) { float y = x; int *p = (int *) &y; unsigned int exp = *p & 0x7F800000; unsigned int man = *p & 0x007FFFFF; if (exp == 0 && man == 0) /* zero */ return x; if (exp == 0x7F800000 /* Fill this! */) /* infinity or NaN */ return x; /* Normalized number */ /* round to nearest */ float r = x; int *pr = (int *) &r; *pr &= 0xFF800000; /* r has the same exp as x */ r /= 0x100 /* Fill this! 
*/; y = x + r; *p &= 0xFFFF0000; return y; } int main(){ float x = 50.266666412353515625; //0x42491111 x = fp32_to_bf16(x); printf("%f\n", x); float xx = -73.53333282470703125; //0xc2931111 xx = fp32_to_bf16(xx); printf("%f\n", xx); printf("%f\n", x*xx); } ``` ### Assembly code ::: spoiler the author's assembly code ```cpp .data test: .word 0x3f99999a number1: .word 0x42491111 number2: .word 0xc2931111 sign_mask: .word 0x80000000 exp_mask: .word 0x7F800000 man_mask: .word 0x007FFFFF add_int: .word 0x00000080 norm_mask: .word 0x00008000 infinity_mask: .word 0x7F800000 right_shfit8_mask: .word 0x00008000 result_mask: .word 0xFFFF0000 valueis: .string "value is:" nextline: .string "\n" NaN: .string "NaN" .text fp32_to_bf16: # s0 : test # s1 : exp # s2 : man la s0 number1 lw s0 0(s0) la s1 number2 lw s1 0(s1) la s2 exp_mask lw s2 0(s2) la s3 man_mask lw s3 0(s3) and s2 s0 s2 #只取number1的exp部分 and s3 s0 s3 #只取number1的man部分 bnez s2 exp1_isnt_0 #若exp為0的話盡速下一行做第二次判別 beqz s3 return_x #若man也為0,return x exp1_isnt_0: #S1 : infinity la s4 infinity_mask lw s4 0(s4) beq s2 s4 return_x #若exp為11111111則跳infinity la t0 right_shfit8_mask #右移8位 lw t0 0(t0) add s0, s0, t0 la t0, result_mask #32->16捨棄小數右邊16個bits lw t0, 0(t0) and s0, s0, t0 and s2 s1 s2 #只取number2的exp部分 and s3 s1 s3 #只取number2的man部分 bnez s2 exp2_isnt_0 #若exp為0的話盡速下一行做第二次判別 beqz s3 return_x #若man也為0,return x exp2_isnt_0: #S1 : infinity la s4 infinity_mask lw s4 0(s4) beq s2 s4 return_x #若exp為11111111則跳infinity la t0 right_shfit8_mask #右移8位 lw t0 0(t0) add s1, s1, t0 la t0, result_mask #32->16捨棄小數右邊16個bits lw t0, 0(t0) and s1, s1, t0 j main return_x: j end main: #s0 = number1 #s1 = number2 la s0 number1 lw s0 0(s0) la s1 number2 lw s1 0(s1) la s2 sign_mask lw s2 0(s2) xor t2 s0 s1 and s10 t2 s2 # get sign la s2 exp_mask lw s2 0(s2) and t3 s0 s2 and t4 s1 s2 srli t3 t3 23 srli t4 t4 23 addi t3 t3 -127 addi t4 t4 -127 add t3 t3 t4 addi s11 t3 127 # get exponent la s3 man_mask lw s3 0(s3) la s4 add_int lw s4 0(s4) #t3 = 
multiplicand #t4 = multiplier #t5 = product and t3 s0 s3 and t4 s1 s3 srli t3 t3 16 srli t4 t4 16 or t3 t3 s4 or t4 t4 s4 mv s9 t3 mv t5 x0 mv s7 x0 mv s8 x0 addi s8 s8 8 andi t6 t4 1 # t6 = last_bit srli t4 t4 1 #right shift multiplier 1 bit beqz t6 loop add t5 s9 t5 loop: slli t3 t3 1 #left shift multiplicand 1 bit bge s7 s8 normalize addi s7 s7 1 andi t6 t4 1 # t6 = last_bit srli t4 t4 1 #right shift multiplier 1 bit #slli t3 t3 1 #left shift multiplicand 1 bit beqz t6, loop add t5 t3 t5 j loop normalize: la s0 norm_mask lw s0 0(s0) and s0 s0 t5 #if mantissa need to carry beqz s0 bits_15 addi s11 s11 1 slli s11 s11 24 #have to cut the integer meanwhile srli s11 s11 1 srli t5 t5 7 # discard unnecessary digits slli t5 t5 24 # after carring one bit,only have to shift left 24 bits srli t5 t5 9 #corresponding to the position of the mantissa j combine bits_15: slli s11 s11 24 srli s11 s11 1 srli t5 t5 7 # discard unnecessary digits slli t5 t5 25 srli t5 t5 9 combine: or s10 s11 s10 #combine sign and exponent or s10 s10 t5 #/........................../ #beq t3 0x7f800000 nan #if exponent >=11111111 print NaN #nan: #la a0 NaN #li a7,4 #ecall #j end #/........................../ print_number: # t0 : valueis # t1 : \n la t0 valueis mv a0 t0 li a7,4 ecall mv a0, s10 li a7,34 ecall la t1, nextline mv a0, t1 li a7,4 ecall end: add x0 x0 x0 ``` ::: ![](https://hackmd.io/_uploads/Hk5SUUF-a.png) The screenshot above shows the cycles after running on Ripes. I modified the code slightly so that it can be run on rv32emu. 
:::spoiler riscv code after modify so that it can be run on rv32emu ```cpp .data test: .word 0x3f99999a number1: .word 0x42491111 number2: .word 0xc2931111 sign_mask: .word 0x80000000 exp_mask: .word 0x7F800000 man_mask: .word 0x007FFFFF add_int: .word 0x00000080 norm_mask: .word 0x00008000 infinity_mask: .word 0x7F800000 right_shfit8_mask: .word 0x00008000 result_mask: .word 0xFFFF0000 valueis: .string "value is:" nextline: .string "\n" NaN: .string "NaN" .text .global fp32_to_bf16 fp32_to_bf16: # s0 : test # s1 : exp # s2 : man addi sp, sp, -44 sw ra, 0(sp) sw s0, 4(sp) sw s1, 8(sp) sw s2, 12(sp) sw s3, 16(sp) sw s4, 20(sp) sw s7, 24(sp) sw s8, 28(sp) sw s9, 32(sp) sw s10, 36(sp) sw s11, 40(sp) la a0 number1 lw a0 0(a0) la a1 number2 lw a1 0(a1) mv s0, a0 mv s1, a1 la s2 exp_mask lw s2 0(s2) la s3 man_mask lw s3 0(s3) and s2 s0 s2 #只取number1的exp部分 and s3 s0 s3 #只取number1的man部分 bnez s2 exp1_isnt_0 #若exp為0的話盡速下一行做第二次判別 beqz s3 return_x #若man也為0,return x exp1_isnt_0: #S1 : infinity la s4 infinity_mask lw s4 0(s4) beq s2 s4 return_x #若exp為11111111則跳infinity la t0 right_shfit8_mask #右移8位 lw t0 0(t0) add s0, s0, t0 la t0, result_mask #32->16捨棄小數右邊16個bits lw t0, 0(t0) and s0, s0, t0 and s2 s1 s2 #只取number2的exp部分 and s3 s1 s3 #只取number2的man部分 bnez s2 exp2_isnt_0 #若exp為0的話盡速下一行做第二次判別 beqz s3 return_x #若man也為0,return x exp2_isnt_0: #S1 : infinity la s4 infinity_mask lw s4 0(s4) beq s2 s4 return_x #若exp為11111111則跳infinity la t0 right_shfit8_mask #右移8位 lw t0 0(t0) add s1, s1, t0 la t0, result_mask #32->16捨棄小數右邊16個bits lw t0, 0(t0) and s1, s1, t0 j main return_x: j end main: #s0 = number1 #s1 = number2 la s2 sign_mask lw s2 0(s2) xor t2 s0 s1 and s10 t2 s2 # get sign la s2 exp_mask lw s2 0(s2) and t3 s0 s2 and t4 s1 s2 srli t3 t3 23 srli t4 t4 23 addi t3 t3 -127 addi t4 t4 -127 add t3 t3 t4 addi s11 t3 127 # get exponent la s3 man_mask lw s3 0(s3) la s4 add_int lw s4 0(s4) #t3 = multiplicand #t4 = multiplier #t5 = product and t3 s0 s3 and t4 s1 s3 srli t3 t3 16 srli t4 t4 16 
or t3 t3 s4 or t4 t4 s4 mv s9 t3 mv t5 x0 mv s7 x0 mv s8 x0 addi s8 s8 8 andi t6 t4 1 # t6 = last_bit srli t4 t4 1 #right shift multiplier 1 bit beqz t6 loop add t5 s9 t5 loop: slli t3 t3 1 #left shift multiplicand 1 bit bge s7 s8 normalize addi s7 s7 1 andi t6 t4 1 # t6 = last_bit srli t4 t4 1 #right shift multiplier 1 bit #slli t3 t3 1 #left shift multiplicand 1 bit beqz t6, loop add t5 t3 t5 j loop normalize: la s0 norm_mask lw s0 0(s0) and s0 s0 t5 #if mantissa need to carry beqz s0 bits_15 addi s11 s11 1 slli s11 s11 24 #have to cut the integer meanwhile srli s11 s11 1 srli t5 t5 7 # discard unnecessary digits slli t5 t5 24 # after carring one bit,only have to shift left 24 bits srli t5 t5 9 #corresponding to the position of the mantissa j combine bits_15: slli s11 s11 24 srli s11 s11 1 srli t5 t5 7 # discard unnecessary digits slli t5 t5 25 srli t5 t5 9 combine: or s10 s11 s10 #combine sign and exponent or s10 s10 t5 #/........................../ #beq t3 0x7f800000 nan #if exponent >=11111111 print NaN #nan: #la a0 NaN #li a7,4 #ecall #j end #/........................../ print_number: # t0 : valueis # t1 : \n la t0 valueis mv a0 t0 li a7,4 ecall mv a0, s10 li a7,34 ecall la t1, nextline mv a0, t1 li a7,4 ecall end: mv a0, s10 lw ra, 0(sp) lw s0, 4(sp) lw s1, 8(sp) lw s2, 12(sp) lw s3, 16(sp) lw s4, 20(sp) lw s7, 24(sp) lw s8, 28(sp) lw s9, 32(sp) lw s10, 36(sp) lw s11, 40(sp) addi sp, sp, 44 ret ``` ::: ![](https://hackmd.io/_uploads/ByKfKWKGa.png) ## Comparison Then I put the code on rv32emu to estimate its cycle count with different optimization flags. ### no optimization flag ```bash $ ../../build/rv32emu bfmul.elf multiplication answer: 0xc5660000 cycle count: 169 instret: 2c5 inferior exit code 0 ``` <!-- ![](https://hackmd.io/_uploads/rkbG3WtMp.png) --> :::warning :warning: Don't put the screenshots which contain plain text only. Instead, utilize HackMD syntax to annotate the text. 
:notes: jserv ::: ### -O0 ```bash $ ../../build/rv32emu bfmul.elf multiplication answer: 0xc5660000 cycle count: 169 instret: 2c5 inferior exit code 0 ``` <!-- ![](https://hackmd.io/_uploads/HyUG2bFzp.png) --> ### -O1 ```bash $ ../../build/rv32emu bfmul.elf multiplication answer: 0xc5660000 cycle count: 170 instret: 2c0 inferior exit code 0 ``` <!-- ![](https://hackmd.io/_uploads/S1oz3bKfT.png) --> ### -O2 ```bash $ ../../build/rv32emu bfmul.elf multiplication answer: 0xc5660000 cycle count: 170 instret: 2c0 inferior exit code ``` <!-- ![](https://hackmd.io/_uploads/rykX3WYfp.png) --> ### -O3 ```bash $ ../../build/rv32emu bfmul.elf multiplication answer: 0xc5660000 cycle count: 170 instret: 2c0 inferior exit code 0 ``` <!-- ![](https://hackmd.io/_uploads/B1Q72WFz6.png) --> ### -Ofast ```bash $ ../../build/rv32emu bfmul.elf multiplication answer: 0xc5660000 cycle count: 170 instret: 2c0 inferior exit code 0 ``` <!-- ![](https://hackmd.io/_uploads/HkUX3btMp.png) --> ### -Os ```bash $ ../../build/rv32emu bfmul.elf multiplication answer: 0xc5660000 cycle count: 170 instret: 2c0 inferior exit code 0 ``` <!-- ![](https://hackmd.io/_uploads/HJF73WtfT.png) --> ### result comparison | Level | cycle | | ------------- | ----- | | no optimization flag | 169 | | -O0 | 169 | | -O1 | 170 | | -O2 | 170 | | -O3 | 170 | | -Ofast | 170 | | -Os | 170 | ## Reflection While I was modifying the code, I got stuck on some faults that prevented it from running correctly. I found out that I should save the registers before running, especially the register `s0`, and restore them after execution. 
Then I found some parts that can be optimized. For example, in [the author's assembly code](https://github.com/aa860630/2023-computer-architecture/blob/main/HW1/final%20version.s), I think the instructions shown below are unnecessary: ```cpp #right_shfit8_mask = 0x00008000 la t0, right_shfit8_mask # Shift right 8 bits lw t0, 0(t0) add s0, s0, t0 ``` because the code that immediately follows them is: ```cpp # result_mask = 0xFFFF0000 la t0, result_mask # 32->16 by and instruction lw t0, 0(t0) and s0, s0, t0 ``` After deleting these instructions, we can see that fewer cycles are needed by checking the execution info. ![](https://hackmd.io/_uploads/Bk3L2ZYMT.png) Note that the `add` of `0x00008000` rounds the value to nearest before the lower 16 bits are masked off, so removing it leaves the result unchanged only when bit 15 of the input is 0, as is the case for both test inputs (`0x42491111` and `0xc2931111`).