# Assignment1: RISC-V Assembly and Instruction Pipeline

contributed by < [`Hotmercury`](https://github.com/Hotmercury) >

We can find the instruction set [reference](https://riscv.org//wp-content/uploads/2017/05/riscv-spec-v2.2.pdf) here; the [code is on GitHub](https://github.com/HotMercury/riscv-multiplicator-no-mul-instruction).

Find the 0 bit closest to the LSB, then clear that bit and every bit to its left, keeping only the trailing 1s. We can use this mask to compute the carry bit.

:::success
example
x = `1010 0111 1010 0111` -> mask = `0000 0000 0000 0111`
and `((mask << 1) | 1) ^ mask` gives the carry bit -> `0000 1000`
:::

```c
#include <stdint.h>

uint64_t mask_lowest_zero(uint64_t x)
{
    uint64_t mask = x;
    mask &= (mask << 1) | 0x1;
    mask &= (mask << 2) | 0x3;
    mask &= (mask << 4) | 0xF;
    mask &= (mask << 8) | 0xFF;
    mask &= (mask << 16) | 0xFFFF;
    mask &= (mask << 32) | 0xFFFFFFFF;
    return mask;
}
```

Note that the `ori` immediate only has 12 bits, so a mask of $2^{11}$ or larger must be loaded into a register first (e.g. with `li`, which expands to `lui` + `addi`).

![](https://hackmd.io/_uploads/S1xmJ-mxp.png)

```asm
.data
# dword -> 8 bytes
test_1: .dword 0x10FFFFFFFFFF3333

.text
main:
    la t0, test_1           # expands to auipc + addi
    lw a0, 0(t0)            # low 32 bits
    lw a1, 4(t0)            # high 32 bits
    jal mask_lowest_zero
    li a7, 10               # exit
    ecall

mask_lowest_zero:
    # a1,a0 = 64-bit parameter (a0 low, a1 high)
    # mask &= (mask << 1) | 1;
    slli t1, a1, 1
    srli t0, a0, 31
    or t1, t1, t0           # t1 = high word of (mask << 1)
    slli t0, a0, 1          # t0 = a0 << 1
    ori t0, t0, 1           # x = x | 1
    and a0, a0, t0
    and a1, a1, t1
    # mask &= (mask << 2) | 0x3;
    slli t1, a1, 2
    srli t0, a0, 30
    or t1, t1, t0           # high 32 bits
    slli t0, a0, 2          # t0 = a0 << 2
    ori t0, t0, 3           # x = x | 3
    and a0, a0, t0
    and a1, a1, t1
    # mask &= (mask << 4) | 0xF;
    slli t1, a1, 4
    srli t0, a0, 28
    or t1, t1, t0           # high 32 bits
    slli t0, a0, 4          # t0 = a0 << 4
    ori t0, t0, 0xF         # x = x | 0xF
    and a0, a0, t0
    and a1, a1, t1
    # mask &= (mask << 8) | 0xFF;
    slli t1, a1, 8
    srli t0, a0, 24
    or t1, t1, t0           # high 32 bits
    slli t0, a0, 8          # t0 = a0 << 8
    ori t0, t0, 0xFF        # x = x | 0xFF
    and a0, a0, t0
    and a1, a1, t1
    # mask &= (mask << 16) | 0xFFFF;
    li t3, 0xFFFF           # lui + addi
    slli t1, a1, 16
    srli t0, a0, 16
    or t1, t1, t0           # high 32 bits
    slli t0, a0, 16         # t0 = a0 << 16
    or t0, t0, t3           # x = x | 0xFFFF
    and a0, a0, t0
    and a1, a1, t1
    # mask &= (mask << 32) | 0xFFFFFFFF;
    # the shifted-in high word is the old low word, and the new low word is all ones
    and a1, a1, a0
    ret
```

`x = x + 1`

**flow**
1. test whether the increment overflows (x is all ones)
2. find the carry mask
3. return the carry bit combined with the bits above the carry

```clike
int64_t inc(int64_t x)
{
    // x = all one bits will overflow to 0
    if (~x == 0)
        return 0;
    // set carry flag
    int64_t mask = mask_lowest_zero(x);
    // 0011 -> 0100
    int64_t z1 = mask ^ ((mask << 1) | 1);
    return (x & ~mask) | z1;
}
```

```asm
inc:
    # when x is all ones, mask is all ones too, so z1 = 0 and the result is 0;
    # the explicit overflow check of the C version falls out naturally
    addi sp, sp, -12
    sw ra, 0(sp)
    sw a0, 4(sp)
    sw a1, 8(sp)            # save parameters
    jal mask_lowest_zero
    mv t0, a0
    mv t1, a1               # t1,t0 = mask
    lw a0, 4(sp)
    lw a1, 8(sp)            # restore parameters
    slli t3, t1, 1
    srli t2, t0, 31
    or t3, t3, t2
    slli t2, t0, 1          # a1,a0 << 1
    ori t2, t2, 1           # a1,a0 | 1
    xor t2, t2, t0
    xor t3, t3, t1          # t3,t2 = z1
    not t1, t1
    not t0, t0              # ~mask
    and a1, a1, t1
    and a0, a0, t0          # a1,a0 & ~mask
    or a1, a1, t3
    or a0, a0, t2           # a1,a0 | z1
    lw ra, 0(sp)
    addi sp, sp, 12
    ret
```

Find the value of the nth bit.

```c
static inline int64_t getbit(int64_t value, int n)
{
    return (value >> n) & 1;
}
```

```asm
getbit:
    # a1,a0 = 64-bit value, a2 = n
    addi sp, sp, -8
    sw ra, 0(sp)
    sw s0, 4(sp)
    li s0, 32
    bge a2, s0, getbit_l    # if (n >= 32) take the bit from the high word
    srl a0, a0, a2
    andi a0, a0, 1
    j getbit_end
getbit_l:
    sub s0, a2, s0          # n - 32
    srl a1, a1, s0
    andi a1, a1, 1
    mv a0, a1
getbit_end:
    lw ra, 0(sp)
    lw s0, 4(sp)
    addi sp, sp, 8
    ret
```
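Before porting these helpers to assembly, it is worth sanity-checking them on the host. Below is a minimal test sketch, assuming the C snippets above are compiled into the same file; the test values are arbitrary and reuse the example from the success block:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* 1010 0111 -> 0000 0111, the example from above */
    assert(mask_lowest_zero(0xA7) == 0x07);
    /* inc() must agree with ordinary addition */
    for (int64_t x = -4; x < 4; x++)
        assert(inc(x) == x + 1);
    assert(inc(-1) == 0);   /* all ones overflows to 0 */
    /* 1010 0111: bit 3 is 0, bit 7 is 1 */
    assert(getbit(0xA7, 3) == 0 && getbit(0xA7, 7) == 1);
    printf("all tests passed\n");
    return 0;
}
```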
**int32 multiply**

multiplicand * multiplier = result

**flow**
1. get the i-th bit of b
2. if it is 1, add `a << i` to the result

```clike
int64_t imul32(int32_t a, int32_t b)
{
    int64_t r = 0, a64 = (int64_t) a, b64 = (int64_t) b;
    for (int i = 0; i < 32; i++) {
        if (getbit(b64, i))
            r += a64 << i;
    }
    return r;
}
```

```asm
imul32:
    addi sp, sp, -28
    sw ra, 0(sp)
    sw s0, 4(sp)
    sw s1, 8(sp)
    sw s2, 12(sp)
    sw s3, 16(sp)
    sw s4, 20(sp)
    sw s5, 24(sp)
    mv s0, a0               # a
    mv s1, a1               # b
    li s2, 0
    li s3, 0                # result s3,s2
    li s4, 0                # i counter
    li s5, 32               # loop bound
imul32_loop:
    beq s4, s5, imul32_end
    mv a0, s1               # a0 = b (a1 is a don't-care: i < 32 only reads the low word)
    mv a2, s4               # a2 = i
    jal getbit
    beq a0, zero, imul_skip # if (getbit(b, i))
    mv t0, s0               # t0 = a
    sub t2, s5, s4          # t2 = 32 - i
    addi t2, t2, -1         # shift in two steps: a shift by 32 would wrap to 0 on RV32
    sra t1, t0, t2
    srai t1, t1, 1          # t1 = high word of (a64 << i), sign bits included
    sll t0, t0, s4          # t0 = low word of (a64 << i)
    add s2, s2, t0          # add the low words
    sltu t5, s2, t0         # carry out of the low-word addition
    add s3, s3, t1
    add s3, s3, t5          # r += a64 << i (high words plus carry)
imul_skip:
    addi s4, s4, 1          # i++
    j imul32_loop
imul32_end:
    mv a0, s2
    mv a1, s3
    lw ra, 0(sp)
    lw s0, 4(sp)
    lw s1, 8(sp)
    lw s2, 12(sp)
    lw s3, 16(sp)
    lw s4, 20(sp)
    lw s5, 24(sp)
    addi sp, sp, 28
    ret
```

Merge the result and `b` into one 64-bit pair, result(32),b(32). The pair is shifted right once per iteration, so we never shift the addend left, and most of the overflow bookkeeping needed to emulate 64-bit values on RV32 disappears.

```clike
// improved int32 multiply
int64_t imul32_improve(int32_t a, int32_t b)
{
    int64_t r = (int64_t) b;
    int64_t a64 = (int64_t) a;
    for (int i = 0; i < 32; i++) {
        if (r & 1)
            r = r + (a64 << 32);
        r = r >> 1;
    }
    return r;
}
```

**float32 multiply**

**flow**
1. reinterpret the floats as their IEEE 754 bit patterns
2. determine the sign bit
3. extract the mantissas and restore the implicit leading 1 (23 + 1 bits)
4. extract the exponents
5. use `imul32` to multiply the two 24-bit mantissas
6. a 24 x 24 product is at most 48 bits; 48 - 23 = 25, but we only keep 24 bits, so we must check whether one extra shift is needed: `int mshift = getbit(mrtmp, 24);` tests the 25th bit from the LSB
7. `int32_t er = mshift ? inc(ertmp) : ertmp;` if the mantissa was shifted, add 1 to the exponent
8. combine the new sign, exponent, and mantissa into the IEEE 754 result

```c
float fmul32(float a, float b)
{
    /* TODO: Special values like NaN and INF */
    int32_t ia = *(int32_t *) &a, ib = *(int32_t *) &b;

    /* sign */
    int sa = ia >> 31;
    int sb = ib >> 31;

    /* mantissa */
    int32_t ma = (ia & 0x7FFFFF) | 0x800000;
    int32_t mb = (ib & 0x7FFFFF) | 0x800000;

    /* exponent */
    int32_t ea = ((ia >> 23) & 0xFF);
    int32_t eb = ((ib >> 23) & 0xFF);

    /* 'r' = result */
    int64_t mrtmp = imul32(ma, mb) >> 23;
    int mshift = getbit(mrtmp, 24);

    int64_t mr = mrtmp >> mshift;
    int32_t ertmp = ea + eb - 127;
    int32_t er = mshift ? inc(ertmp) : ertmp;
    /* TODO: Overflow ^ */

    int sr = sa ^ sb;
    int32_t r = (sr << 31) | ((er & 0xFF) << 23) | (mr & 0x7FFFFF);
    return *(float *) &r;
}
```
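A quick host-side check is to compare `fmul32` with the hardware multiplier over a few values. This is a sketch assuming the C snippets above are compiled together; since the mantissa product is truncated rather than rounded, the last bit of the result may differ from the hardware result:

```c
#include <stdio.h>

int main(void)
{
    const float tests[][2] = {{1.5f, 2.0f}, {3.14f, -2.5f}, {0.1f, 0.2f}};
    for (int i = 0; i < 3; i++) {
        float a = tests[i][0], b = tests[i][1];
        printf("%g * %g: fmul32 = %g, hardware = %g\n",
               a, b, fmul32(a, b), a * b);
    }
    return 0;
}
```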
```asm
fmul32:
    addi sp, sp, -24
    sw ra, 0(sp)
    sw s0, 4(sp)
    sw s1, 8(sp)
    sw s2, 12(sp)
    sw s3, 16(sp)
    sw s4, 20(sp)
    srli s0, a0, 31
    srli s1, a1, 31
    xor s0, s0, s1          # s0 = sign_a ^ sign_b
    li t0, 0x7FFFFF
    li t1, 0x800000
    and s1, a0, t0
    or s1, s1, t1           # s1 = mantissa_a
    and s2, a1, t0
    or s2, s2, t1           # s2 = mantissa_b
    srli s3, a0, 23
    andi s3, s3, 0xFF       # s3 = exp_a
    srli s4, a1, 23
    andi s4, s4, 0xFF       # s4 = exp_b
    mv a0, s1
    mv a1, s2
    jal imul32
    mv s1, a0
    mv s2, a1               # s2,s1 = mantissa_a * mantissa_b
    srli s1, s1, 23
    slli s2, s2, 9
    or s1, s1, s2           # s1 = mantissa_a * mantissa_b >> 23
                            # s2 is no longer needed
    mv a0, s1               # a0 = mantissa_a * mantissa_b >> 23
    li a2, 24               # a2 = 24 (bit index)
    jal getbit
    srl s1, s1, a0          # s1 = mrtmp >> mshift
    add s3, s3, s4          # s3 = exp_a + exp_b
    addi s3, s3, -127       # s4 is no longer needed
    # int32_t er = mshift ? inc(ertmp) : ertmp;
    # no call to inc is needed: simply add mshift (0 or 1)
    add s3, s3, a0          # s3 = er
    slli s0, s0, 31         # s0 = sr << 31
    andi s3, s3, 0xFF       # s3 = er & 0xFF
    slli s3, s3, 23         # s3 = er << 23
    li t0, 0x7FFFFF
    and s1, s1, t0          # s1 = mr & 0x7FFFFF
    or s0, s0, s3           # s0 = (sr << 31) | (er << 23)
    or s0, s0, s1           # s0 = (sr << 31) | (er << 23) | mr
    mv a0, s0
    lw ra, 0(sp)
    lw s0, 4(sp)
    lw s1, 8(sp)
    lw s2, 12(sp)
    lw s3, 16(sp)
    lw s4, 20(sp)
    addi sp, sp, 24
    ret
```

### Analyze

- What is CPI? Cycles Per Instruction, computed as below, where $IC_i$ is the instruction count of class $i$, $CC_i$ is the cycle count of that class, and $IC$ is the total instruction count. A lower CPI is better.

$$
CPI = \frac{\sum_i (IC_i)(CC_i)}{IC}
$$

- What is IPC? Instructions Per Cycle, the reciprocal of CPI; a higher IPC is better.

When calculating floating-point multiplication, we need to compute the sign bit, exponent, and mantissa. This involves multiplying the mantissas of the two floating-point numbers, i.e. the two 23-bit fields of the IEEE representation, denoted |1|8|23|. Observing the actual execution in the simulator shows that the performance bottleneck lies mainly in the mantissa multiplication. The two diagrams below show the original and improved runs, demonstrating a significant reduction of nearly 50% in cycles.

![](https://hackmd.io/_uploads/BybC-QWZa.png)
![](https://hackmd.io/_uploads/r1HzbQb-p.png)

**difference**

**original code**
```asm
imul32_loop:
    beq s4, s5, imul32_end
    mv a0, s1               # a0 = b
    mv a2, s4               # a2 = i
    jal getbit
    beq a0, zero, imul_skip # if (getbit(b, i))
    mv t0, s0               # t0 = a
    sub t2, s5, s4          # t2 = 32 - i
    addi t2, t2, -1
    sra t1, t0, t2
    srai t1, t1, 1          # t1 = high word of (a64 << i)
    sll t0, t0, s4          # t0 = low word of (a64 << i)
    add s2, s2, t0          # add the low words
    sltu t5, s2, t0         # carry out of the low-word addition
    add s3, s3, t1
    add s3, s3, t5          # r += a64 << i
imul_skip:
    addi s4, s4, 1          # i++
    j imul32_loop
```

1. Calling the `getbit` function with `jal getbit` costs a significant number of cycles. Observing the pipeline, we can also see that each `jal` introduces two additional NOP instructions.

![](https://hackmd.io/_uploads/Sk4Fd4W-6.png)
![](https://hackmd.io/_uploads/Hk5c_VbW6.png)

and a pipeline clear happens:

![](https://hackmd.io/_uploads/rJ3Ot4--a.png)

2. When `getbit` is true, we perform `r += a64 << i;`: shift `a64` left by `i`, then add it to `r`. This introduces a potential issue: register overflow. Our simulation uses the RV32 architecture, which operates on 32-bit values, but a 32-bit multiplication can produce a 64-bit result, so two registers are needed to hold it. Adding the lower 32 bits may produce a carry that must be propagated into the register holding the upper bits. This is why the loop above has to detect the carry out of the low-word addition (the `sltu`) and add it into the high word. example:

:::success
Suppose a 4-bit processor holds the pair `1001(a1) 1001(a0)` and adds `0100(t1) 1000(t0)`.
`a0 + t0 = 1001 + 1000` produces a carry, so we must record it and compute the high word as `a1 + t1 + 1`.
:::

![](https://hackmd.io/_uploads/ryN2iEbWp.png)
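In C, the same multi-word addition idiom looks like this. A minimal sketch: the helper name `add64` is hypothetical, and the unsigned wrap-around test is exactly what the `sltu` computes in the assembly above:

```c
#include <stdint.h>

/* Add {ah,al} + {bh,bl} as 64-bit values split across 32-bit words;
 * RV32 has no add-with-carry, so the carry must be recovered manually. */
static void add64(uint32_t ah, uint32_t al, uint32_t bh, uint32_t bl,
                  uint32_t *rh, uint32_t *rl)
{
    uint32_t lo = al + bl;
    uint32_t carry = lo < al;   /* unsigned wrap-around means a carry occurred */
    *rl = lo;
    *rh = ah + bh + carry;
}
```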
3. reordering the instructions

For example, in the following code the instructions have been deliberately reordered. Reordering can be an effective way to avoid read-after-write (RAW) hazards. Through observation, however, we can see that it makes no difference here, because data forwarding already resolves the hazard: the gray selectors differ, indicating that the value of t3 is forwarded from the computation in the previous pipeline stage.

*order*
```asm
srli s1, s1, 1      # low word >>= 1
slli t3, s2, 31     # bit 0 of the high word, moved to bit 31
or s1, s1, t3       # shift it into the low word
srli s2, s2, 1      # high word >>= 1
```
![](https://hackmd.io/_uploads/HJKUESbbp.png)
![](https://hackmd.io/_uploads/HkcF4SZZT.png)

*reorder*
```asm
srli s1, s1, 1      # low word >>= 1
slli t3, s2, 31     # bit 0 of the high word, moved to bit 31
srli s2, s2, 1      # high word >>= 1
or s1, s1, t3       # shift it into the low word
```
![](https://hackmd.io/_uploads/BJRBgHW-a.png)
![](https://hackmd.io/_uploads/HJn2NBWZT.png)

**improve**
```asm
imul32_loop:
    beq t0, t1, imul32_end
    andi t2, s1, 1      # if (result & 1)
    beq t2, zero, imul32_skip
    add s2, s2, s0      # result += a (high word)
imul32_skip:
    srli s1, s1, 1      # low word >>= 1
    slli t3, s2, 31     # bit 0 of the high word, moved to bit 31
    srli s2, s2, 1      # high word >>= 1
    or s1, s1, t3       # shift it into the low word
    addi t0, t0, 1      # i++
    j imul32_loop
```

### Hazard

Examples of the various hazards:
- structural hazard: two instructions try to write to the register file at the same time
- data hazard: an instruction needs a result that an earlier instruction has not written back yet
- control hazard: after a branch, the pipeline may have fetched instructions from the wrong path

### problem

Branches happen frequently, and each taken branch comes with two NOPs. Can we avoid this?

![](https://hackmd.io/_uploads/HyVoDSbZa.png)
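One software-side answer is to make the loop body branchless, leaving only the loop-closing branch. Below is a sketch of this idea applied to `imul32_improve`; the name `imul32_branchless` is ours, and the semantics are unchanged:

```c
#include <stdint.h>

int64_t imul32_branchless(int32_t a, int32_t b)
{
    int64_t r = (int64_t) b;
    int64_t a64 = (int64_t) a;
    for (int i = 0; i < 32; i++) {
        /* -(r & 1) is all ones when the LSB is set and zero otherwise,
         * so the partial product is added without a data-dependent branch */
        r += (a64 << 32) & -(r & 1);
        r >>= 1;
    }
    return r;
}
```

On RV32 this maps to an `and` with a computed mask instead of a `beq`, trading one or two extra ALU instructions per iteration for the flushes a taken branch would cost.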