Assignment1: RISC-V Assembly and Instruction Pipeline

# Assignment1: RISC-V Assembly and Instruction Pipeline

## Problem `B`

### C code

```c
/* Widen a bf16 value to float: the 16 stored bits become the upper
 * half of the 32-bit pattern and the lower half is zero. */
static inline float bf16_to_fp32(bf16_t h)
{
    union {
        float f;
        uint32_t i;
    } u = {.i = (uint32_t)h.bits << 16};
    return u.f;
}

/* Narrow a float to bf16 by keeping the upper 16 bits of its pattern.
 * NaN inputs are returned as quiet NaNs; every other input is rounded
 * to nearest, ties to even. */
static inline bf16_t fp32_to_bf16(float s)
{
    bf16_t h;
    union {
        float f;
        uint32_t i;
    } u = {.f = s};
    /* magnitude bits above 0x7f800000: exponent all ones, mantissa non-zero */
    if ((u.i & 0x7fffffff) > 0x7f800000) { /* NaN */
        h.bits = (u.i >> 16) | 64;         /* force to quiet */
        return h;
    }
    /* add 0x7fff plus the lowest kept bit, then drop the low 16 bits */
    h.bits = (u.i + (0x7fff + ((u.i >> 0x10) & 1))) >> 0x10;
    return h;
}
```

### Assembly code

#### bf16_to_fp32

```c
# in: t0 = bf16 bits, out: t1 = fp32 bits
bf16_to_fp32:
    slli t1, t0, 16
    ret
```

##### test with Ripe

[full code in GitHub](https://github.com/Eric-liau/Computer-Archicture-hw1/blob/main/bf16_to_fp32.s)
![1](https://hackmd.io/_uploads/HJnCyKH1ye.png)

#### fp32_to_bf16

**Ver1**

```c
# in: t0 = fp32 bits, out: t1 = bf16 bits (clobbers t2, t3, t4)
fp32_to_bf16:
    # NaN detect
    li t1, 0x7fffffff
    li t2, 0x7f800000
    and t3, t0, t1
    # t4 = 1 when the magnitude bits exceed 0x7f800000 (NaN)
    slt t4, t2, t3
    # the common (not NaN) case takes this branch
    beqz t4, notNaN
    srli t1, t0, 16
    # set bit 6 (force to quiet)
    ori t1, t1, 64
    ret
notNaN:
    # t1 = lowest bit that is kept after the shift
    srli t1, t0, 16
    andi t1, t1, 1
    li t2, 0x7fff
    add t1, t1, t2
    # add the rounding bias, then drop the low 16 bits
    add t1, t0, t1
    srli t1, t1, 16
    ret
```

#### Execution information

![10](https://hackmd.io/_uploads/By2BMm9yyx.png)

**Ver2**

Since most floating-point numbers are not NaN, I changed the branching condition to reduce the number of cycles.

```c
# in: t0 = fp32 bits, out: t1 = bf16 bits (clobbers t2, t3)
fp32_to_bf16:
    # NaN detect
    li t1, 0x7fffffff
    li t2, 0x7f800000
    and t3, t0, t1
    # branch only for NaN; the not-NaN case falls through
    blt t2, t3, NaN
notNaN:
    # t1 = lowest bit that is kept after the shift
    srli t1, t0, 16
    andi t1, t1, 1
    li t2, 0x7fff
    add t1, t1, t2
    # add the rounding bias, then drop the low 16 bits
    add t1, t0, t1
    srli t1, t1, 16
    ret
NaN:
    srli t1, t0, 16
    # set bit 6 (force to quiet)
    ori t1, t1, 64
    ret
```

#### Execution information

![11](https://hackmd.io/_uploads/SJQgNmq1kg.png)

##### test with Ripe

[full code in GitHub](https://github.com/Eric-liau/Computer-Archicture-hw1/blob/main/fp32_to_bf16.s)
![2](https://hackmd.io/_uploads/r1o5qtB1Jx.png)

## [Leetcode 2469. Convert the Temperature](https://leetcode.com/problems/convert-the-temperature/description/)

Using bf16 instead of float for computations, we therefore need addition and multiplication operations compatible with the bf16 datatype.
### C code

#### my_clz

modified from [2023 Quiz1](https://hackmd.io/@sysprog/arch2023-quiz1-sol) Problem A

```c
/* Count the leading zero bits of a 16-bit value; returns 16 for x == 0. */
int my_clz(short x)
{
    /* Work on an unsigned copy: right-shifting a negative short is
     * implementation-defined and would smear the sign bit. */
    unsigned short v = (unsigned short) x;

    /* propagate the highest set bit into every lower position */
    v |= (v >> 1);
    v |= (v >> 2);
    v |= (v >> 4);
    v |= (v >> 8);
    /* population count of the smeared value */
    v -= ((v >> 1) & 0x5555);
    v = ((v >> 2) & 0x3333) + (v & 0x3333);
    v = ((v >> 4) + v) & 0x0f0f;
    v += (v >> 8);
    return (16 - (v & 0x7f));
}
```

#### bf16_add

```c
/* Add two bf16 bit patterns and return the bf16 bit pattern of the sum.
 * The result is truncated (not rounded). Zero/subnormal operands,
 * infinities and NaN are not special-cased. */
short bf16_add(short a, short b)
{
    unsigned short ua = (unsigned short) a;
    unsigned short ub = (unsigned short) b;

    /* Keep the operand with the larger magnitude in ua. Comparing only the
     * exponents is not enough: with equal exponents the smaller fraction
     * must be the subtrahend, otherwise the difference goes negative. */
    if ((ua & 0x7fff) < (ub & 0x7fff)) {
        unsigned short tmp = ua;
        ua = ub;
        ub = tmp;
    }

    unsigned short a_sign = ua >> 15;
    unsigned short a_exp = (ua >> 7) & 0xff;
    unsigned short a_fra = (ua & 127) | 128; /* restore the implicit 1 */
    unsigned short b_sign = ub >> 15;
    unsigned short b_exp = (ub >> 7) & 0xff;
    unsigned short b_fra = (ub & 127) | 128;

    /* Align the smaller operand. The fraction has 8 bits, so any shift of
     * 8 or more yields 0; clamping also avoids an out-of-range shift. */
    unsigned short sft = a_exp - b_exp;
    b_fra = (sft < 16) ? (unsigned short) (b_fra >> sft) : 0;

    unsigned short rst_fra;
    if (a_sign ^ b_sign)
        rst_fra = a_fra - b_fra;
    else
        rst_fra = a_fra + b_fra;
    unsigned short rst_exp = a_exp;
    unsigned short rst_sign = a_sign;

    /* x + (-x): there is no leading 1 to normalize, return +0 */
    if (rst_fra == 0)
        return 0;

    // normalize
    int lz = my_clz(rst_fra);
    if (lz <= 8) {
        lz = 8 - lz;
        rst_fra >>= lz;
        rst_exp += lz;
    } else {
        lz -= 8;
        rst_fra <<= lz;
        rst_exp -= lz;
    }
    rst_fra -= 128; /* drop the implicit 1 again */
    rst_sign <<= 15;
    rst_exp <<= 7;
    unsigned short rst = rst_sign | rst_exp | rst_fra;
    return rst;
}
```

#### bf16_mul

```c
/* Multiply two bf16 bit patterns and return the bf16 bit pattern of the
 * product. The result is truncated (not rounded). Zero/subnormal operands,
 * infinities and NaN are not special-cased. */
short bf16_mul(short a, short b)
{
    unsigned short ua = (unsigned short) a;
    unsigned short ub = (unsigned short) b;

    unsigned short a_exp = (ua >> 7) & 0xff;
    unsigned short a_fra = (ua & 127) | 128;
    unsigned short b_exp = (ub >> 7) & 0xff;
    unsigned short b_fra = (ub & 127) | 128;

    unsigned short sign = (ua ^ ub) >> 15;
    unsigned short exp = a_exp + b_exp - 127;
    /* both fractions are at most 255, so the product fits in 16 bits */
    unsigned short fra = a_fra * b_fra;
    fra >>= 7;

    // normalize
    int lz = my_clz(fra);
    lz = 8 - lz; /* 0 or 1: the product of two values in [1, 2) is in [1, 4) */
    fra >>= lz;
    exp += lz;
    fra -= 128;
    sign <<= 15;
    exp <<= 7;
    unsigned short rst = sign | exp | fra;
    return rst;
}
```

#### convert_temperature

```c
/* Celsius (float) -> Kelvin, returned as a bf16 bit pattern. */
unsigned short convert_temperature_kel(float celsius)
{
    float num = 273.15f;
    /* fp32_to_bf16() returns a bf16_t; the bf16 helpers take the raw bits */
    unsigned short num_bf16 = fp32_to_bf16(num).bits;
    unsigned short celsius_bf16 = fp32_to_bf16(celsius).bits;
    unsigned short kelvin_bf16 = bf16_add(celsius_bf16, num_bf16);
    return kelvin_bf16;
}

/* Celsius (float) -> Fahrenheit, returned as a bf16 bit pattern. */
unsigned short convert_temperature_fah(float celsius)
{
    float num1 = 1.8f, num2 = 32.0f;
    unsigned short num1_bf16 = fp32_to_bf16(num1).bits;
    unsigned short num2_bf16 = fp32_to_bf16(num2).bits;
    unsigned short celsius_bf16 = fp32_to_bf16(celsius).bits;
    unsigned short fahrenheit_bf16 = bf16_mul(celsius_bf16, num1_bf16);
    fahrenheit_bf16 = bf16_add(fahrenheit_bf16, num2_bf16);
    return fahrenheit_bf16;
}
```

### Assembly code

#### my_clz

```c
# clz: t1 = number of leading zeros of the 16-bit value in t0
# t0 is preserved (bf16_add and bf16_mul still need it after the call);
# t4 and t5 are used as scratch registers.
clz:
    # x |= (x >> 1)   (x lives in t1, a copy of t0)
    srli t4, t0, 1
    or   t1, t0, t4
    # x |= (x >> 2)
    srli t4, t1, 2
    or   t1, t1, t4
    # x |= (x >> 4)
    srli t4, t1, 4
    or   t1, t1, t4
    # x |= (x >> 8)
    srli t4, t1, 8
    or   t1, t1, t4
    # x -= ((x >> 1) & 0x5555)
    srli t4, t1, 1
    li   t5, 0x5555
    and  t4, t4, t5
    sub  t1, t1, t4
    # x = ((x >> 2) & 0x3333) + (x & 0x3333)
    srli t4, t1, 2
    li   t5, 0x3333
    and  t4, t4, t5
    and  t1, t1, t5
    add  t1, t1, t4
    # x = ((x >> 4) + x) & 0x0f0f
    srli t4, t1, 4
    add  t1, t1, t4
    li   t5, 0x0f0f
    and  t1, t1, t5
    # x += (x >> 8)
    srli t4, t1, 8
    add  t1, t1, t4
    # return (16 - (x & 0x7f)), computed as ~x + 17
    andi t1, t1, 0x7f
    xori t1, t1, -1
    addi t1, t1, 17
    ret
```

##### test with Ripes

[full code in GitHub](https://github.com/Eric-liau/Computer-Archicture-hw1/blob/main/my_clz.s)
![3](https://hackmd.io/_uploads/ryuHuCPk1x.png)

#### bf16_add

```c
# bf16_add: t1 = t0 + t1 (bf16 bit patterns, result truncated)
# Uses t2-t6. Zero/subnormal operands, infinities and NaN are not special-cased.
bf16_add:
    addi sp, sp, -4
    sw   ra, 0(sp)
    # t2 = |t0| << 17, t3 = |t1| << 17 (sign and upper bits shifted out)
    slli t2, t0, 17
    slli t3, t1, 17
    # Order by magnitude, not only by exponent: with equal exponents the
    # smaller fraction must be the subtrahend or the difference goes negative.
    bgeu t2, t3, cal_start
    # swap(t0, t1) and swap(t2, t3)
    mv   t4, t0
    mv   t0, t1
    mv   t1, t4
    mv   t4, t2
    mv   t2, t3
    mv   t3, t4
cal_start:
    # t2 = exp, t4 = sft
    srli t2, t2, 24
    srli t3, t3, 24
    sub  t4, t2, t3
    # t3 = sign
    srli t3, t0, 15
    andi t3, t3, 1
    # t5 = 0 ? add : sub
    xor  t5, t0, t1
    srli t5, t5, 15
    andi t5, t5, 1
    # t0 = t0_fra
    andi t0, t0, 127
    ori  t0, t0, 128
    # t1 = t1_fra
    andi t1, t1, 127
    ori  t1, t1, 128
    srl  t1, t1, t4
    # srl only uses the low 5 bits of t4: flush t1 to 0 when sft >= 32
    sltiu t6, t4, 32
    sub  t6, x0, t6
    and  t1, t1, t6
    # t0 = fra
    beqz t5, add_operation
    sub  t0, t0, t1
    j    normalize
add_operation:
    add  t0, t0, t1
normalize:
    # x + (-x): there is no leading 1 to normalize, return +0
    beqz t0, add_zero
    # t1 = lz - 8
    call clz
    addi t1, t1, -8
    # normalize exp
    sub  t2, t2, t1
    # t1 = |t1|, t4 = 1 if t1 was negative
    srai t4, t1, 31
    xor  t1, t1, t4
    srli t4, t4, 31
    add  t1, t1, t4
    # branch if lz >= 8
    beqz t4, shift_left
    srl  t0, t0, t1
    j    finish
shift_left:
    sll  t0, t0, t1
finish:
    addi t0, t0, -128
    slli t3, t3, 15
    slli t2, t2, 7
    or   t1, t0, t2
    or   t1, t1, t3
    j    add_return
add_zero:
    li   t1, 0
add_return:
    lw   ra, 0(sp)
    addi sp, sp, 4
    ret
```

##### test with Ripes

[full code in GitHub](https://github.com/Eric-liau/Computer-Archicture-hw1/blob/main/bf16_add.s)
![4](https://hackmd.io/_uploads/HJJTJmuk1l.png)

##### verify correctness

| input1-hex | input1-bf16 | input2-hex | input2-bf16 | output-hex | output-bf16 | output-float |
| ---------- | ----------- | ---------- | ----------- |---------- | ----------- | ------------ |
| 0x411d | 9.8125 | 0x4126 | 10.375 | 0x41a1 | 20.125 | 20.1875 |
| 0xc2c8 | -100 | 0x4285 | 66.5 | 0xc206 | -33.5 | -33.5 |
| 0x426b | 58.75 | 0x429e | 79 | 0x4309 | 137 | 137.75 |
| 0xc117 | -9.4375 | 0xc178 | -15.5 | 0xc1c7 | -24.875 | -24.9375 |
| 0x3f1a | 0.601562 | 0x3d76 | 0.060059 | 0x3f29 | 0.660156 | 0.661621 |

#### bf16_mul

```c
# bf16_mul: t1 = t0 * t1 (bf16 bit patterns, result truncated)
# Uses t2-t5. Zero/subnormal operands, infinities and NaN are not special-cased.
bf16_mul:
    addi sp, sp, -4
    sw   ra, 0(sp)
    # t2 = sign (masked so that upper register bits cannot leak in)
    xor  t2, t0, t1
    srli t2, t2, 15
    andi t2, t2, 1
    # t3 = exp
    slli t3, t0, 17
    srli t3, t3, 24
    slli t4, t1, 17
    srli t4, t4, 24
    add  t3, t3, t4
    addi t3, t3, -127
    # t0 = fra
    andi t0, t0, 127
    ori  t0, t0, 128
    andi t1, t1, 127
    ori  t1, t1, 128
    mul  t0, t0, t1
    srli t0, t0, 7
    # normalize: t1 = 8 - lz
    call clz
    li   t4, 8
    sub  t1, t4, t1
    srl  t0, t0, t1
    add  t3, t3, t1
    addi t0, t0, -128
    slli t2, t2, 15
    slli t3, t3, 7
    or   t1, t0, t2
    or   t1, t1, t3
    lw   ra, 0(sp)
    addi sp, sp, 4
    ret
```

##### test with Ripes

[full code in GitHub](https://github.com/Eric-liau/Computer-Archicture-hw1/blob/main/bf16_mul.s)
![5](https://hackmd.io/_uploads/HJa2R4d1ye.png)

##### verify correctness

| input1-hex | input1-bf16 | input2-hex | input2-bf16 | output-hex | output-bf16 | output-float |
| ---------- | ----------- | ---------- | ----------- |---------- | ----------- | ------------ |
| 0x411d | 9.8125 | 0x4126 | 10.375 | 0x42cb | 101.5 | 101.804688 |
| 0xc2c8 | -100 | 0x4285 | 66.5 | 0xc5cf | -6624 | -6650 |
| 0x426b | 58.75 | 0x429e | 79 | 0x4591 | 4640 | 4641.25 |
| 0xc117 | -9.4375 | 0xffffc178 | -15.5 | 0x4312 | 146 | 146.281250 |
| 0x3f1a | 0.601562 | 0x3d76 | 0.060059 | 0x3d13 | 0.035889 | 0.036129 |

#### convert_temperature

```c
# in: t0 = celsius (fp32 bits), out: t1 = kelvin (bf16 bits)
convert_temperature_kel:
    addi sp, sp, -4
    sw   ra, 0(sp)
    call fp32_to_bf16
    li   t0, 0x4389 # (bf16)273.15
    call bf16_add
    lw   ra, 0(sp)
    addi sp, sp, 4
    ret

# in: t0 = celsius (fp32 bits), out: t1 = fahrenheit (bf16 bits)
convert_temperature_fah:
    addi sp, sp, -4
    sw   ra, 0(sp)
    call fp32_to_bf16
    li   t0, 0x3fe6 # (bf16)1.8
    call bf16_mul
    li   t0, 0x4200 # (bf16)32
    call bf16_add
    lw   ra, 0(sp)
    addi sp, sp, 4
    ret
```

##### test with Ripes

[full code in GitHub](https://github.com/Eric-liau/Computer-Archicture-hw1/blob/main/convert_temperature.s)
![6](https://hackmd.io/_uploads/H12ljQFkkg.png)

##### verify correctness

| celsius-float | kelvin-hex | kelvin-bf16 | kelvin-float | fahrenheit-hex | fahrenheit-bf16 | fahrenheit-float |
| ---------- | ----------- | ---------- | ----------- |---------- | ----------- | ------------ |
| 10.4 | 0x438e | 284 | 283.549988 | 0x424a | 50 | 50.720001 |
| -198.245697 | 0x4298 | 76 | 74.904305 | 0xc3a1 | -322 | -324.842255 |
| 78.779999 | 0x43b0 | 352 | 351.929993 | 0x432d | 173 | 173.804001 |
| -15.5 | 0x4382 | 260 | 257.649994 | 0x4088 | 4.25 | 4.1 |
| 0.06 | 0x4389 | 274 | 273.209991 | 0x4200 | 32 | 32.108002 |
| -54.872002 | 0x435c | 220 | 218.278 | 0xc284 | -66 | -66.7696 |

In the case of using the above test data, there is an average error rate of 0.605% in Kelvin temperature conversions, and the average error rate in Fahrenheit temperature conversions is 1.153%. By sacrificing accuracy, memory space is saved, with each piece of data using only 2 bytes of memory.
This is a 50% reduction compared to the original 4 bytes.

##### Execution information

![7](https://hackmd.io/_uploads/H1OBAXFy1l.png)

After updating fp32_to_bf16 to Ver2

![12](https://hackmd.io/_uploads/Hy0PIQ91kl.png)

## Analysis

### 5 stage RISC-V pipeline CPU

![8](https://hackmd.io/_uploads/Sy98B8Yykl.png)

**IF :** Fetch from the instruction memory the instruction that will be executed in the later stages.

**ID :** Decode the instruction fetched in IF and read the needed register data.

**EXE :** Perform the corresponding operation based on the instruction.

**MEM :** Read data from memory or write data to memory if needed.

**WB :** Write either the operation result from the EXE stage or the data read from memory into the register.

Use the instruction `addi x17 x0 34` to show how it works on the 5 stage RISC-V pipeline CPU.

### IF stage

![IF](https://hackmd.io/_uploads/HyeZGwF1yx.png)

- We can observe that the output of the Program Counter (PC) is 0x2C, and after passing through a plus-4 adder, the input to the PC in the next cycle will be 0x30.
- 0x2C is also the input to the Instruction Memory (IM), which is the address where the instruction is stored. Therefore, the instruction is read from address 0x2C in the IM.

### ID stage

![ID](https://hackmd.io/_uploads/ry0rLPtJJg.png)

- After decoding, we recognize that this is an ADDI instruction, which is an I-type instruction. This means we do not need the value from Reg 2 when executing this instruction, even though the CPU still reads it from the register.
- The values we need are the data from register x0 (since the R1 index is 0), which is always zero, and the immediate value generated by the immediate generator.

### EXE stage

![EXE](https://hackmd.io/_uploads/Sy0VguF11e.png)

- Since this instruction isn't a branch instruction, the Branch Unit can be ignored.
- The ALU adds 0x0 from `x0` to 0x22 generated by the immediate generator to obtain the result.
### MEM stage

![MEM](https://hackmd.io/_uploads/Bk1TfdYJyg.png)

- The instruction neither loads data from memory nor stores data to it, meaning it does nothing in the MEM stage.

### WB stage

![WB](https://hackmd.io/_uploads/rkLbm_KJ1l.png)![WB1](https://hackmd.io/_uploads/rkNgN_K11e.png)

- In the WB stage the CPU writes the result back into the register file, which sits in the ID stage of the datapath; the figure indicates that the register is allowed to be written to.