Assignment1: RISC-V Assembly and Instruction Pipeline

# Assignment1: RISC-V Assembly and Instruction Pipeline contributed by< `p76131505` > ## fabsf ```c static inline float fabsf(float x) { uint32_t i = *(uint32_t *)&x; // Read the bits of the float into an integer i &= 0x7FFFFFFF; // Clear the sign bit to get the absolute value x = *(float *)&i; // Write the modified bits back into the float return x; } ``` :::danger Don't paste code snip without comprehensive discussions. ::: ```c .data num0: .word 0x0f000000 num1: .word 0xff000000 num2: .word 0xf0f0f0f0 endl: .string "\n" .text la t0, num0 # x li t1, 3 fabsf: lw t2, 0(t0) addi a0, t2, 0 li a7, 2 ecall la a0, endl li a7, 4 ecall addi a0, t2, 0 li t3, 0x7fffffff and a0, a0, t3 li a7, 2 ecall la a0, endl li a7, 4 ecall addi t0, t0, 4 # move to the next test case addi t1, t1, -1 # test case counter-- bne t1, zero, fabsf li a7, 10 ecall ``` ## my_clz ```c static inline int my_clz(uint32_t x) { int count = 0; for (int i = 31; i >= 0; --i) { if (x & (1U << i)) break; count++; } return count; } ``` ```c .data num0: .word 0x0f000000 num1: .word 0xff000000 num2: .word 1 endl: .string "\n" .text la t0, num0 # x li t1, 3 my_clz: lw t2, 0(t0) li a0, 0 # count=0 li s1, 31 # i=31 loop: li s2, 1 # s2=1U sll s3, s2, s1 # s3=1U<<i and s4, t2, s3 # x&(1U<<i) bne s4, zero, end addi a0, a0, 1 # count++ addi s1, s1, -1 # --i bge s1, zero, loop end: li a7, 1 ecall la a0, endl li a7, 4 ecall addi t0, t0, 4 # move to the next test case addi t1, t1, -1 # test case counter-- bne t1, zero, my_clz li a7, 10 ecall ``` ## fp16_to_fp32 ```c static inline uint32_t fp16_to_fp32(uint16_t h) { const uint32_t w = (uint32_t) h << 16; const uint32_t sign = w & UINT32_C(0x80000000); const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); uint32_t renorm_shift = my_clz(nonsign); renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0; const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000); const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31; return sign | ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask); } ``` ```c .data num0: .word 100 .text la t0, num0 lw t0, 0(t0) # h slli t0, t0, 16 # w = h << 16 li t1, 0x80000000 and t1, t0, t1 # sign = w & UINT32_C(0x80000000) li t2, 0x7fffffff and t2, t0, t2 # nonsign = w & UINT32_C(0x7FFFFFFF) my_clz: li a0, 0 # count=0 li s1, 31 # i=31 loop: li s2, 1 # s2=1U sll s3, s2, s1 # s3=1U<<i and s4, t2, s3 # x&(1U<<i) bne s4, zero, clz_ret addi a0, a0, 1 # count++ addi s1, s1, -1 # --i bge s1, zero, loop clz_ret: li t3, 5 bgt a0, t3, g li a0, 0 j mask g: addi a0, a0, -5 mask: li t4, 0x04000000 add t4, t2, t4 # t4 = nonsign + 0x04000000 srli t4, t4, 8 # t4 >> 8 li t5, 0x7f800000 and t4, t4, t5 # inf_nan_mask addi t6, t2, -1 srli t6, t6, 31 # zero_mask sll t2, t2, a0 srli t2, t2, 3 # t2 = non << renorm_shift >> 3 sub a0, zero, a0 addi a0, a0, 0x70 slli a0, a0, 23 # a0 = 0x70 - renorm_shift << 23 add a0, t2, a0 or a0, a0, t4 xori t6, t6, -1 and a0, a0, t6 or a0, t1, a0 li a7,34 ecall ``` ## Selected Problems :::danger What inspired you from Quiz 1, as the assignment asks you to use these inspirations to improve the existing problems? ::: *using ```my_clz``` to calculate how many leading zeros are in ```num```, then subtracting 32 to determine how many bits need to alternate between 1 and 0. [Leetcode 693. Binary Number with Alternating Bits](https://leetcode.com/problems/binary-number-with-alternating-bits/description/) Given a positive integer, check whether it has alternating bits: namely, if two adjacent bits will always have different values. Example 1: Input: n = 5 Output: true Explanation: The binary representation of 5 is: 101 Example 2: Input: n = 7 Output: false Explanation: The binary representation of 7 is: 111. Example 3: Input: n = 11 Output: false Explanation: The binary representation of 11 is: 1011. ## Solution ```c #include <stdio.h> #include <stdint.h> #include <stdbool.h> static inline int my_clz(uint32_t x) { int count = 0; for (int i = 31; i >= 0; --i) { if (x & (1U << i)) break; count++; } return count; } bool hasAlternatingBits(uint32_t n) { int totalBits = 32 - my_clz(n); for (int i = 0; i < totalBits - 1; i++) { if (((n >> i) & 1) == ((n >> (i + 1)) & 1)) { return false; } } return true; } int main() { uint32_t n = 5; for(int i=0;i<5;i++) printf("%s\n", hasAlternatingBits(i)?"true":"false"); return 0; } ``` ## Use my_clz to solve this problem(version1) ``` .data n0: .word 0b10101010101011 n1: .word 0b11111000000000 n2: .word 0b10101010101000 endl: .string "\n" f: .string "false\n" t: .string "true\n" .text la t0, n0 li t1, 3 my_clz: lw t2, 0(t0) li a0, 0 # count=0 li s1, 31 # i=31 li s0, 0 # bit i loop: li s2, 1 # s2=1U sll s3, s2, s1 # s3=1U<<i and s4, t2, s3 # x&(1U<<i) bne s4, zero, hasAlternatingBits addi a0, a0, 1 # count++ addi s1, s1, -1 # --i bge s1, zero, loop hasAlternatingBits: li s5, 31 sub s5, s5, a0 # total=31-a0 ble s5, zero, tend bitloop: srl s6, t2, s0 # s6=n>>i addi s11, s0, 1 srl s7, t2, s11 # s7=n>>(i+1) andi s8, s6, 1 andi s9, s7, 1 beq s8, s9, end addi s0, s0, 1 # i++ blt s0, s5, bitloop tend: mv a0, t2 li a7, 1 ecall la a0, t li a7, 4 ecall addi t0, t0, 4 addi t1, t1, -1 bne t1, zero, my_clz li a7, 10 ecall end: mv a0, t2 li a7, 1 ecall la a0, f li a7, 4 ecall addi t0, t0, 4 addi t1, t1, -1 bne t1, zero, my_clz li a7, 10 ecall ``` ## modify(using unrolling in my_clz)(version2) ``` .data n0: .word 5 n1: .word 7 n2: .word 11 endl: .string "\n" f: .string "false\n" t: .string "true\n" .text la t0, n0 li t1, 3 my_clz: lw t2, 0(t0) li a0, 0 # count=0 li s1, 31 # i=31 li s0, 0 # bit i loop: li s2, 1 # s2=1U sll s3, s2, s1 # s3=1U<<i and s4, t2, s3 # x&(1U<<i) bne s4, zero, hasAlternatingBits addi a0, a0, 1 # count++ addi s1, s1, -1 # --i sll s3, s2, s1 # s3=1U<<i and s4, t2, s3 # x&(1U<<i) bne s4, zero, hasAlternatingBits addi a0, a0, 1 # count++ addi s1, s1, -1 # --i sll s3, s2, s1 # s3=1U<<i and s4, t2, s3 # x&(1U<<i) bne s4, zero, hasAlternatingBits addi a0, a0, 1 # count++ addi s1, s1, -1 # --isll s3, s2, s1 # s3=1U<<i and s4, t2, s3 # x&(1U<<i) bne s4, zero, hasAlternatingBits addi a0, a0, 1 # count++ addi s1, s1, -1 # --i bge s1, zero, loop hasAlternatingBits: li s5, 31 sub s5, s5, a0 # total=31-a0 ble s5, zero, tend bitloop: srl s6, t2, s0 # s6=n>>i addi s11, s0, 1 srl s7, t2, s11 # s7=n>>(i+1) andi s8, s6, 1 andi s9, s7, 1 beq s8, s9, end addi s0, s0, 1 # i++ blt s0, s5, bitloop tend: mv a0, t2 li a7, 1 ecall la a0, t li a7, 4 ecall addi t0, t0, 4 addi t1, t1, -1 bne t1, zero, my_clz li a7, 10 ecall end: mv a0, t2 li a7, 1 ecall la a0, f li a7, 4 ecall addi t0, t0, 4 addi t1, t1, -1 bne t1, zero, my_clz li a7, 10 ecall ``` (version1) ![image](https://hackmd.io/_uploads/B1HAODU1kl.png) (version2) ![image](https://hackmd.io/_uploads/Bys7uw8JJx.png) ### analysis *By using loop unrolling, the need to check the condition for jumping back to the loop is reduced, which decreases the number of cycles ## Ripes Simulator ![image](https://hackmd.io/_uploads/B1Em1Q8k1x.png) using 5-stage pipelined process in pipes ``` 0: 10000297 auipc x5 0x10000 4: 00028293 addi x5 x5 0 8: 00300313 addi x6 x0 3 0000000c <my_clz>: c: 0002a383 lw x7 0 x5 10: 00000513 addi x10 x0 0 14: 01f00493 addi x9 x0 31 18: 00000413 addi x8 x0 0 0000001c <loop>: 1c: 00100913 addi x18 x0 1 20: 009919b3 sll x19 x18 x9 24: 0133fa33 and x20 x7 x19 28: 040a1463 bne x20 x0 72 <hasAlternatingBits> 2c: 00150513 addi x10 x10 1 30: fff48493 addi x9 x9 -1 34: 009919b3 sll x19 x18 x9 38: 0133fa33 and x20 x7 x19 3c: 020a1a63 bne x20 x0 52 <hasAlternatingBits> 40: 00150513 addi x10 x10 1 44: fff48493 addi x9 x9 -1 48: 009919b3 sll x19 x18 x9 4c: 0133fa33 and x20 x7 x19 50: 020a1063 bne x20 x0 32 <hasAlternatingBits> 54: 00150513 addi x10 x10 1 58: fff48493 addi x9 x9 -1 5c: 0133fa33 and x20 x7 x19 60: 000a1863 bne x20 x0 16 <hasAlternatingBits> 64: 00150513 addi x10 x10 1 68: fff48493 addi x9 x9 -1 6c: fa04d8e3 bge x9 x0 -80 <loop> 00000070 <hasAlternatingBits>: 70: 01f00a93 addi x21 x0 31 74: 40aa8ab3 sub x21 x21 x10 78: 03505263 bge x0 x21 36 <tend> 0000007c <bitloop>: 7c: 0083db33 srl x22 x7 x8 80: 00140d93 addi x27 x8 1 84: 01b3dbb3 srl x23 x7 x27 88: 001b7c13 andi x24 x22 1 8c: 001bfc93 andi x25 x23 1 90: 039c0e63 beq x24 x25 60 <end> 94: 00140413 addi x8 x8 1 98: ff5442e3 blt x8 x21 -28 <bitloop> 0000009c <tend>: 9c: 00038513 addi x10 x7 0 a0: 00100893 addi x17 x0 1 a4: 00000073 ecall a8: 10000517 auipc x10 0x10000 ac: f6d50513 addi x10 x10 -147 b0: 00400893 addi x17 x0 4 b4: 00000073 ecall b8: 00428293 addi x5 x5 4 bc: fff30313 addi x6 x6 -1 c0: f40316e3 bne x6 x0 -180 <my_clz> c4: 00a00893 addi x17 x0 10 c8: 00000073 ecall 000000cc <end>: cc: 00038513 addi x10 x7 0 d0: 00100893 addi x17 x0 1 d4: 00000073 ecall d8: 10000517 auipc x10 0x10000 dc: f3650513 addi x10 x10 -202 e0: 00400893 addi x17 x0 4 e4: 00000073 ecall e8: 00428293 addi x5 x5 4 ec: fff30313 addi x6 x6 -1 f0: f0031ee3 bne x6 x0 -228 <my_clz> f4: 00a00893 addi x17 x0 10 f8: 00000073 ecall ``` take ```70: 01f00a93 addi x21,x0,31```for example in this 5-stage pipelined section. *** #### IF ![image](https://hackmd.io/_uploads/ByjvxPIykg.png) *The program counter updates the value of the instruction each time to fetch a new instruction. *The current instruction memory address is ```0x00000048```, and the instruction read from that address is ```0x009919b3```. *Since the current instruction memory address is 0x0000004c, and most instructions in many architectures (such as RISC-V) are 4 bytes (32 bits) long, the next instruction address will typically be 0x0000004c, which is 4 bytes ahead of the current one. *** #### ID ![image](https://hackmd.io/_uploads/SJGjgvLykg.png) *decode the instruction ```0x009919b3``` into binary ```0000000 01001 10010 001 10011 0110011``` funct7 = ```0000000``` rs2 = ```01001``` rs1 = ```10010``` funct3 = ```001``` rd = ```10011``` opcode = ```0110011``` *The instruction ```0x009919b3``` has been decoded as ```sll x19,x18,x9``` *** #### EXE ![image](https://hackmd.io/_uploads/H1hfXv8J1l.png) *In the EXE (execute) stage, the value in register ```x18``` is shifted left by the number of positions specified in the ```x9``` register, and the result is stored back in register ```x19```. *** #### MEM ![image](https://hackmd.io/_uploads/Sy11Vw8yke.png) *In the MEM (memory) stage, it checks whether to write to or read from memory. For the ```sll``` instruction, there is no need for reading from or writing to memory. *** #### WB ![image](https://hackmd.io/_uploads/H1dT4v8kyl.png) *In the WB (write-back) stage, the data is written back to register ```x19```