Assignment1: RISC-V Assembly and Instruction Pipeline

# Assignment1: RISC-V Assembly and Instruction Pipeline contributed by < [`CSIE523`](https://github.com/CSIE523/2023_Computer_Architecture_FALL/tree/main/hw1) > ## Find first set using CLZ > Description: Given a 64-bit integer, find the index or position of the bit first set to one counting from the least significant bit position. > >Example: >Input: 325478 (0x000000000004F766) >Output: 2 ## Solution ### Idea for problem solving The main idea for solving this problem is the algorithm from Wikipedia. ``` ffs(x) = w - clz(x & -x). # w is total bits in x. # -x is the 2's complement of x. ``` ### C program ```c #include<stdio.h> #include<stdint.h> uint64_t twoscomple(uint64_t x){ x ^= 0xffffffffffffffff; return x + 1; } uint16_t count_leading_zeros(uint64_t x){ x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); x |= (x >> 32); /* count ones (population count) */ x -= ((x >> 1) & 0x5555555555555555 ); x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333); x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f; x += (x >> 8); x += (x >> 16); x += (x >> 32); return (64 - (x & 0x7f)); } int main(){ uint64_t input; scanf("%llu", &input); printf("Find first set is %d.\n", 64 - count_leading_zeros(input & twoscomple(input))); } ``` ### Assembly code ```c .data test_data1: .word 0x00101110 0x01001000 test_data2: .word 0x11000008 0x00000000 test_data3: .word 0x00000000 0x00000000 const_1: .word 0x55555555 const_2: .word 0x33333333 const_3: .word 0x0f0f0f0f str: .string "\nThe first set of test_data is " # s1 : unsigned int 64 high 32 bits # s2 : unsigned int 64 low 32 bits # s3 : 64 or 32 # s4 : test_data select # t1 : unsigned int 64 high 32 bits (tmp) # t2 : unsigned int 64 low 32 bits (tmp) # t3 : init=1, *2 until x = 32 .text data_select: jal ra, load_data1 jal ra, load_data2 jal ra, load_data3 j exit load_data1: la t1, test_data1 j main load_data2: la t1, test_data2 j main load_data3: la t1, test_data3 main: lw s1, 0(t1) lw s2, 4(t1) lw t3, 0(t1) lw t4, 4(t1) addi sp, sp, -4 sw ra, 0(sp) jal ra, twos_comple and s1, s1, t3 and s2, s2, t4 addi t1, s1, 0 # replace tmp addi t2, s2, 0 # replace tmp addi s3, x0, 0x20 addi t4, x0, 0x20 addi t3, x0, 1 shift_with_or_from1to16: sub s3, s3, t3 sll t0, t1, s3 srl t1, t1, t3 srl t2, t2, t3 or t2, t2, t0 or s1, s1, t1 or s2, s2, t2 addi t1, s1, 0 # replace tmp addi t2, s2, 0 # replace tmp slli t3, t3, 1 addi s3, x0, 0x20 bne t3, t4, shift_with_or_from1to16 # x |= (x >> 32); or s2, s1, t2 addi t1, s1, 0 # replace tmp addi t2, s2, 0 # replace tmp count_ones: # x -= ((x >> 1) & 0x5555555555555555); slli t0, t1, 31 srli t1, t1, 1 srli t2, t2, 1 or t2, t2, t0 lw t3, const_1 and t1, t1, t3 and t2, t2, t3 sub s2, s2, t2 sltu t0, s2, t2 sub s1, s1, t1 sub s1, s1, t0 addi t1, s1, 0 # replace tmp addi t2, s2, 0 # replace tmp # x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333); addi t4, t1, 0 addi t5, t2, 0 slli t0, t1, 30 srli t1, t1, 2 srli t2, t2, 2 or t2, t2, t0 lw t3, const_2 and t1, t1, t3 and t2, t2, t3 and t4, t4, t3 and t5, t5, t3 add t2, t2, t5 sltu t0, t2, t5 add t1, t1, t4 add t1, t1, t0 addi s1, t1, 0 addi s2, t2, 0 # x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f; slli t0, t1, 28 srli t1, t1, 4 srli t2, t2, 4 or t2, t2, t0 add t2, t2, s2 sltu t0, t2, s2 add t1, t1, s1 add t1, t1, t0 lw t3, const_3 and t1, t1, t3 and t2, t2, t3 addi s1, t1, 0 addi s2, t2, 0 # x += (x >> 8); slli t0, t1, 24 srli t1, t1, 8 srli t2, t2, 8 or t2, t2, t0 add t2, t2, s2 sltu t0, t2, s2 add t1, t1, s1 add t1, t1, t0 addi s1, t1, 0 addi s2, t2, 0 # x += (x >> 16); slli t0, t1, 16 srli t1, t1, 16 srli t2, t2, 16 or t2, t2, t0 add t2, t2, s2 sltu t0, t2, s2 add t1, t1, s1 add t1, t1, t0 addi s1, t1, 0 addi s2, t2, 0 # x += (x >> 32); add t2, t2, s1 sltu t0, t2, s2 add t1, t1, t0 addi s1, t1, 0 addi s2, t2, 0 # (64 - (x & 0x7f)); addi s3, x0, 0x40 andi s2, s2, 0x7f sub s3, s3, s2 # 64 - return value addi t3, x0, 0x40 sub s3, t3, s3 jal ra, printResult lw ra, 0(sp) addi sp, sp, 4 jr ra twos_comple: li t5, -1 xor t3, t3, t5 # high xor t4, t4, t5 # low addi t5, x0, 1 add t4, t4, t5 sltu t0, t4, t5 add t3, t3, t0 jr ra printResult: # print string la a0, str addi a7, zero, 4 ecall # print ffs li a7, 1 mv a0, s3 ecall jr ra exit: li a7, 10 ecall ``` There are 3 test datas in this program: * `test_data1` : `0x0010111001001000` **First set = 13** * `test_data2` : `0x1100000800000000` **First set = 36** * `test_data3` : `0x0000000000000000` **First set = 0** ## Experiment Result ![](https://hackmd.io/_uploads/Syz0Df-Wa.png) ## Analysis Using 5-stage pipelined processor in [Ripes](https://github.com/mortbopet/Ripes). ``` 00000000 <data_select>: 0: 010000ef jal x1 16 <load_data1> 4: 018000ef jal x1 24 <load_data2> 8: 020000ef jal x1 32 <load_data3> c: 2180006f jal x0 536 <exit> 00000010 <load_data1>: 10: 10000317 auipc x6 0x10000 14: ff030313 addi x6 x6 -16 18: 0180006f jal x0 24 <main> 0000001c <load_data2>: 1c: 10000317 auipc x6 0x10000 20: fec30313 addi x6 x6 -20 24: 00c0006f jal x0 12 <main> 00000028 <load_data3>: 28: 10000317 auipc x6 0x10000 2c: fe830313 addi x6 x6 -24 00000030 <main>: 30: 00032483 lw x9 0 x6 34: 00432903 lw x18 4 x6 38: 00032e03 lw x28 0 x6 3c: 00432e83 lw x29 4 x6 40: ffc10113 addi x2 x2 -4 44: 00112023 sw x1 0 x2 48: 19c000ef jal x1 412 <twos_comple> 4c: 01c4f4b3 and x9 x9 x28 50: 01d97933 and x18 x18 x29 54: 00048313 addi x6 x9 0 58: 00090393 addi x7 x18 0 5c: 02000993 addi x19 x0 32 60: 02000e93 addi x29 x0 32 64: 00100e13 addi x28 x0 1 00000068 <shift_with_or_from1to16>: 68: 41c989b3 sub x19 x19 x28 6c: 013312b3 sll x5 x6 x19 70: 01c35333 srl x6 x6 x28 74: 01c3d3b3 srl x7 x7 x28 78: 0053e3b3 or x7 x7 x5 7c: 0064e4b3 or x9 x9 x6 80: 00796933 or x18 x18 x7 84: 00048313 addi x6 x9 0 88: 00090393 addi x7 x18 0 8c: 001e1e13 slli x28 x28 1 90: 02000993 addi x19 x0 32 94: fdde1ae3 bne x28 x29 -44 <shift_with_or_from1to16> 98: 0074e933 or x18 x9 x7 9c: 00048313 addi x6 x9 0 a0: 00090393 addi x7 x18 0 000000a4 <count_ones>: a4: 01f31293 slli x5 x6 31 a8: 00135313 srli x6 x6 1 ac: 0013d393 srli x7 x7 1 b0: 0053e3b3 or x7 x7 x5 b4: 10000e17 auipc x28 0x10000 b8: f64e2e03 lw x28 -156 x28 bc: 01c37333 and x6 x6 x28 c0: 01c3f3b3 and x7 x7 x28 c4: 40790933 sub x18 x18 x7 c8: 007932b3 sltu x5 x18 x7 cc: 406484b3 sub x9 x9 x6 d0: 405484b3 sub x9 x9 x5 d4: 00048313 addi x6 x9 0 d8: 00090393 addi x7 x18 0 dc: 00030e93 addi x29 x6 0 e0: 00038f13 addi x30 x7 0 e4: 01e31293 slli x5 x6 30 e8: 00235313 srli x6 x6 2 ec: 0023d393 srli x7 x7 2 f0: 0053e3b3 or x7 x7 x5 f4: 10000e17 auipc x28 0x10000 f8: f28e2e03 lw x28 -216 x28 fc: 01c37333 and x6 x6 x28 100: 01c3f3b3 and x7 x7 x28 104: 01cefeb3 and x29 x29 x28 108: 01cf7f33 and x30 x30 x28 10c: 01e383b3 add x7 x7 x30 110: 01e3b2b3 sltu x5 x7 x30 114: 01d30333 add x6 x6 x29 118: 00530333 add x6 x6 x5 11c: 00030493 addi x9 x6 0 120: 00038913 addi x18 x7 0 124: 01c31293 slli x5 x6 28 128: 00435313 srli x6 x6 4 12c: 0043d393 srli x7 x7 4 130: 0053e3b3 or x7 x7 x5 134: 012383b3 add x7 x7 x18 138: 0123b2b3 sltu x5 x7 x18 13c: 00930333 add x6 x6 x9 140: 00530333 add x6 x6 x5 144: 10000e17 auipc x28 0x10000 148: edce2e03 lw x28 -292 x28 14c: 01c37333 and x6 x6 x28 150: 01c3f3b3 and x7 x7 x28 154: 00030493 addi x9 x6 0 158: 00038913 addi x18 x7 0 15c: 01831293 slli x5 x6 24 160: 00835313 srli x6 x6 8 164: 0083d393 srli x7 x7 8 168: 0053e3b3 or x7 x7 x5 16c: 012383b3 add x7 x7 x18 170: 0123b2b3 sltu x5 x7 x18 174: 00930333 add x6 x6 x9 178: 00530333 add x6 x6 x5 17c: 00030493 addi x9 x6 0 180: 00038913 addi x18 x7 0 184: 01031293 slli x5 x6 16 188: 01035313 srli x6 x6 16 18c: 0103d393 srli x7 x7 16 190: 0053e3b3 or x7 x7 x5 194: 012383b3 add x7 x7 x18 198: 0123b2b3 sltu x5 x7 x18 19c: 00930333 add x6 x6 x9 1a0: 00530333 add x6 x6 x5 1a4: 00030493 addi x9 x6 0 1a8: 00038913 addi x18 x7 0 1ac: 009383b3 add x7 x7 x9 1b0: 0123b2b3 sltu x5 x7 x18 1b4: 00530333 add x6 x6 x5 1b8: 00030493 addi x9 x6 0 1bc: 00038913 addi x18 x7 0 1c0: 04000993 addi x19 x0 64 1c4: 07f97913 andi x18 x18 127 1c8: 412989b3 sub x19 x19 x18 1cc: 04000e13 addi x28 x0 64 1d0: 413e09b3 sub x19 x28 x19 1d4: 030000ef jal x1 48 <printResult> 1d8: 00012083 lw x1 0 x2 1dc: 00410113 addi x2 x2 4 1e0: 00008067 jalr x0 x1 0 000001e4 <twos_comple>: 1e4: fff00f13 addi x30 x0 -1 1e8: 01ee4e33 xor x28 x28 x30 1ec: 01eeceb3 xor x29 x29 x30 1f0: 00100f13 addi x30 x0 1 1f4: 01ee8eb3 add x29 x29 x30 1f8: 01eeb2b3 sltu x5 x29 x30 1fc: 005e0e33 add x28 x28 x5 200: 00008067 jalr x0 x1 0 00000204 <printResult>: 204: 10000517 auipc x10 0x10000 208: e2050513 addi x10 x10 -480 20c: 00400893 addi x17 x0 4 210: 00000073 ecall 214: 00100893 addi x17 x0 1 218: 00098513 addi x10 x19 0 21c: 00000073 ecall 220: 00008067 jalr x0 x1 0 00000224 <exit>: 224: 00a00893 addi x17 x0 10 228: 00000073 ecall ``` Let's take `6c: 013312b3 sll x5 x6 x19` for example in this 5-stage pipelined section. ### IF ![](https://hackmd.io/_uploads/S15sxO-bT.png) * Program Counter gives next instruction address to instruction memory. * Program Counter updates its value in order to fetch new instruction. * Here the next instruction is `0x013312b3` in instruction memory address `0x6c`. ### ID ![](https://hackmd.io/_uploads/Hy0ZfOWbT.png) * In ID stage, it decodes `0x013312b3` fetched from previous stage. * Translate `0x013312b3` into binary `0000000 10011 00110 001 00101 0110011`. funct7 = `0000000`. rs2 = `10011`. rs1 = `00110` funct3 = `001` rd = `00101` opcode = `0110011` * In [RISC-V instruction manual](https://riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf), shift left logic is in the format below. ![](https://hackmd.io/_uploads/B1FuOPZba.png) * It's decoded as `sll x5, x6, x19` in instruction format. ### EX ![](https://hackmd.io/_uploads/rJ8CXOW-6.png) * In EX stage, it shifts the value in `x6` register left by the value in `x19` register with no branch. ### MEM ![](https://hackmd.io/_uploads/ByGrD_ZWT.png) * In MEM stage, it doesn't perform any write or read operations. ### WB ![](https://hackmd.io/_uploads/ByXXDu-Zp.png) * In WB stage, write shift left result back to `x5` register. ## Assembly code Optimization ### First version The result of my first version of assembly code is presented in the experiment result above without any optimization. ### Second version (Loop unrolling) In the first version of assembly code, I simplified the following C code using a loop and then translated it into assembly code. ```c x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); for loop: for(i=1;i<=16;i*=2) x |= (x >> i); ``` ```c shift_with_or_from1to16: sub s3, s3, t3 sll t0, t1, s3 srl t1, t1, t3 srl t2, t2, t3 or t2, t2, t0 or s1, s1, t1 or s2, s2, t2 addi t1, s1, 0 # replace tmp addi t2, s2, 0 # replace tmp slli t3, t3, 1 addi s3, x0, 0x20 bne t3, t4, shift_with_or_from1to16 # x |= (x >> 32); or s2, s1, t2 addi t1, s1, 0 # replace tmp addi t2, s2, 0 # replace tmp ``` With loop unrolling ```c shift_with_or_loop_unrolling: # x |= (x >> 1); slli t0, t1, 31 srli t1, t1, 1 srli t2, t2, 1 or t2, t2, t0 or s1, s1, t1 or s2, s2, t2 addi t1, s1, 0 # replace tmp addi t2, s2, 0 # replace tmp # x |= (x >> 2); slli t0, t1, 30 srli t1, t1, 2 srli t2, t2, 2 or t2, t2, t0 or s1, s1, t1 or s2, s2, t2 addi t1, s1, 0 # replace tmp addi t2, s2, 0 # replace tmp # x |= (x >> 4); slli t0, t1, 28 srli t1, t1, 4 srli t2, t2, 4 or t2, t2, t0 or s1, s1, t1 or s2, s2, t2 addi t1, s1, 0 # replace tmp addi t2, s2, 0 # replace tmp # x |= (x >> 8); slli t0, t1, 24 srli t1, t1, 8 srli t2, t2, 8 or t2, t2, t0 or s1, s1, t1 or s2, s2, t2 addi t1, s1, 0 # replace tmp addi t2, s2, 0 # replace tmp # x |= (x >> 16); slli t0, t1, 16 srli t1, t1, 16 srli t2, t2, 16 or t2, t2, t0 or s1, s1, t1 or s2, s2, t2 addi t1, s1, 0 # replace tmp addi t2, s2, 0 # replace tmp # x |= (x >> 32); or s2, s1, t2 addi t1, s1, 0 # replace tmp addi t2, s2, 0 # replace tmp ``` ### Third version (Redundant instructions eliminated) After loop unrolling, we can observe that there are many instructions repeated. ```c # x |= (x >> 1); slli t0, t1, 31 srli t1, t1, 1 srli t2, t2, 1 or t2, t2, t0 or s1, s1, t1 or s2, s2, t2 addi t1, s1, 0 # replace tmp addi t2, s2, 0 # replace tmp ``` I used the `t1` and `t2` registers to temporarily store the right-shifted value of x, and later combined it with the original value of x using an OR operation, storing the result in `s1` and `s2`. Since `t1` and `t2` are registers specifically used for logical operations, before further computations, the values in `t1` and `t2` will need to be replaced by the values in `s1` and `s2`. Therefore, I eliminate some redundant assembly code. For example, I simplified assembly code above. I directly use the values in `s1` and `s2` for right shift logical operations, instead of using `t1` and `t2`. In this way, it doesn't need to replace the values of `t1` and `t2` anymore. ```c # x |= (x >> 1); slli t0, s1, 31 srli t1, s1, 1 srli t2, s2, 1 or t2, t2, t0 or s1, s1, t1 or s2, s2, t2 ``` ### Result Comparion Lastly, the comparison of the three different execution infos is as follows. * First verison ![](https://hackmd.io/_uploads/Syz0Df-Wa.png) * Second version ![](https://hackmd.io/_uploads/Sy8yO4-ZT.png) * Third version ![](https://hackmd.io/_uploads/S1OMuN-bp.png) ## Reference [Find first set - Wikipedia](https://en.wikipedia.org/wiki/Find_first_set)