Assignment1: Implement log base power of 2 with CLZ

# Assignment1: Implement log base power of 2 with CLZ contributed by < [`eeeXun`](https://github.com/eeeXun/computer_architecture/tree/master/hw1) > ## Log base power of 2 with clz To get the log base power of 2, we need know the the position of first leading one. We can use `64 - CLZ` to get the position first leading one. Then `64 - CLZ - 1` divided by the power would get the result. For example 1. $log_{2^1} 64$, the `64 - CLZ - 1` is 6, and the power is `1`. So the result is `6 / 1 = 6` 2. $log_{2^2} 64$, the `64 - CLZ - 1` is 6, and the power is `2`. So the result is `6 / 2 = 3` 3. $log_{2^3} 64$, the `64 - CLZ - 1` is 6, and the power is `3`. So the result is `6 / 3 = 2` ## Implementation ### C code ```c #include <stdint.h> #include <stdio.h> uint16_t count_leading_zeros(uint64_t x) { x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); x |= (x >> 32); /* count ones (population count) */ x -= ((x >> 1) & 0x5555555555555555); x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333); x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f; x += (x >> 8); x += (x >> 16); x += (x >> 32); return (64 - (x & 0x7f)); } // log base power of 2 uint16_t logp2(int power, uint16_t clz) { uint16_t result = 0; int tmp = 64 - clz; while (1) { tmp -= power; if (tmp <= 0) break; result++; } return result; } int main(int argc, char* argv[]) { uint64_t a = 64; uint16_t clz = count_leading_zeros(a); printf("%d\n", logp2(2, clz)); return 0; } ``` ### Assembly #### version 1 with only single test data ($log_2 16$) ```asm! .data num_1: .dword 0x0000000000000010 .text main: li, a0, 1 la, t0, num_1 lw, a1, 0(t0) lw, a2, 4(t0) jal, ra, shift_or_loop mv, a1, a0 li, a0, 1 jal, ra, logp2 li, a7, 1 # print ecall li a7, 10 # exit ecall # x |= (x >> 1) # x |= (x >> 2) # x |= (x >> 4) # x |= (x >> 8) # x |= (x >> 16) # x |= (x >> 32) # arg # a0: shift_times # a1: num_l # a2: num_u shift_or_loop: addi, sp, sp, -16 # push sw, ra, 0(sp) sw, a0, 4(sp) sw, a1, 8(sp) sw, a2, 12(sp) jal, ra, shift_right mv, t0, a0 mv, t1, a1 lw, ra, 0(sp) lw, s0, 4(sp) lw, s1, 8(sp) lw, s2, 12(sp) addi, sp, sp, 16 # pop slli, a0, s0, 1 or, a1, s1, t0 or, a2, s2, t1 li, t0, 32 bge t0, a0, shift_or_loop mv s0, a1 mv, s1, a2 # continued from shift_or_loop # arg # s0: num_l # s1: num_u count_leading_zeros: addi, sp, sp, -12 # push sw, ra, 0(sp) sw, s0, 4(sp) sw, s1, 8(sp) li, a0, 1 mv, a1, s0 mv, a2, s1 jal, ra, shift_right li, t0, 0x55555555 and, t1, a0, t0 and, t2, a1, t0 lw, a0, 4(sp) lw, a1, 8(sp) mv a2, t1 mv a3, t2 jal ra, uint64_sub mv, s0, a0 mv, s1, a1 lw, ra, 0(sp) addi, sp, sp, 12 # pop addi, sp, sp, -12 # push sw, ra, 0(sp) sw, s0, 4(sp) sw, s1, 8(sp) li, a0, 2 mv, a1, s0 mv, a2, s1 jal, ra, shift_right li, t0, 0x33333333 and, s0, a0, t0 and, s1, a1, t0 lw, s2, 4(sp) lw, s3, 8(sp) and, s2, s2, t0 and, s3, s3, t0 mv, a0, s0 mv, a1, s1 mv, a2, s2 mv, a3, s3 jal, ra, uint64_add lw, ra, 0(sp) addi, sp, sp, 12 # pop addi, sp, sp, -12 # push sw, ra, 0(sp) sw, a0, 4(sp) sw, a1, 8(sp) li, a0, 4 lw, a1, 4(sp) lw, a2, 8(sp) jal, ra, shift_right lw, a2, 4(sp) lw, a3, 8(sp) jal,ra, uint64_add li, t0, 0x0f0f0f0f and, a0, a0, t0 and, a1, a1, t0 lw, ra, 0(sp) addi, sp, sp, 12 # pop addi, sp, sp, -12 # push sw, ra, 0(sp) sw, a0, 4(sp) sw, a1, 8(sp) li, a0, 8 lw, a1, 4(sp) lw, a2, 8(sp) jal, ra, shift_right lw, a2, 4(sp) lw, a3, 8(sp) jal, ra, uint64_add lw, ra, 0(sp) addi, sp, sp, 12 # pop addi, sp, sp, -12 # push sw, ra, 0(sp) sw, a0, 4(sp) sw, a1, 8(sp) li, a0, 16 lw, a1, 4(sp) lw, a2, 8(sp) jal, ra, shift_right lw, a2, 4(sp) lw, a3, 8(sp) jal, ra, uint64_add lw, ra, 0(sp) addi, sp, sp, 12 # pop addi, sp, sp, -12 # push sw, ra, 0(sp) sw, a0, 4(sp) sw, a1, 8(sp) li, a0, 32 lw, a1, 4(sp) lw, a2, 8(sp) jal, ra, shift_right lw, a2, 4(sp) lw, a3, 8(sp) jal, ra, uint64_add lw, ra, 0(sp) addi, sp, sp, 12 # pop li, s0, 64 andi, t0, a0, 0x7f sub, a0, s0, t0 jr, ra # arg # a0 for shift_times # a1 for num_l # a2 for num_u # return # a0 for num_l # a1 for num_u shift_right: srl, s0, a1, a0 # s0 = (num_l >> shift_times) li, t0, 32 sub, t0, t0, a0 # t0 = 32 - shift_times sll, t1, a2, t0 # t1 = (num_u << t0) or, s0, s0, t1 # s0 = (s0 | t0) srl, s1, a2, a0 # s1 = (num_u >> shift_times) mv, a0, s0 mv, a1, s1 jr, ra # arg # a0 for num_l of summand # a1 for num_u of summand # a2 for num_l of addend # a3 for num_u of addend # return # a0 for num_l # a1 for num_u uint64_add: add, s0, a0, a2 sltu, t0, s0, a0 # carry, from compiler add, s1, a1, a3 add, s1, s1, t0 mv, a0, s0 mv, a1, s1 jr, ra # arg # a0 for num_l of minuend # a1 for num_u of minuend # a2 for num_l of subtrahend # a3 for num_u of subtrahend # return # a0 for num_l # a1 for num_u uint64_sub: sub, s0, a0, a2 sltu, t0, a0, s0 # borrow, from compiler sub, s1, a1, a3 sub, s1, s1, t0 mv, a0, s0 mv, a1, s1 jr, ra # arg # a0: power (for power 2) # a1: clz # return # a0: result logp2: mv, s0, a0 mv, s1, a1 mv, a0, zero li, t0, 64 sub, t0, t0, s1 logp2_loop: sub, t0, t0, s0 bge, zero, t0, logp2_ret addi, a0, a0, 1 j, logp2_loop logp2_ret: jr, ra ``` ![](https://hackmd.io/_uploads/S101Wajl6.png) #### version 2 with only single test data ($log_2 16$) The overhead of prologue and epilogue for jumping to `shift_right`, `uint64_add`, `uint64_sub` seems quit large. They are only few instructions. So I decide to expand those instructions to reduce the overhead. ```asm! .data num_1: .dword 0x0000000000000010 .text main: la, t0, num_1 lw, a0, 0(t0) lw, a1, 4(t0) jal, ra, count_leading_zeros mv, a1, a0 li, a0, 1 jal, ra, logp2 li, a7, 1 # print ecall li a7, 10 # exit ecall # arg # a0: num_l # a1: num_u count_leading_zeros: li, s0, 1 li, s1, 32 # x |= (x >> 1) # x |= (x >> 2) # x |= (x >> 4) # x |= (x >> 8) # x |= (x >> 16) # x |= (x >> 32) _clz_loop: srl, s2, a0, s0 # s2 = a0 >> s0 sub, t0, s1, s0 # t0 = 32 - s0 sll, t0, a1, t0 # t0 = a1 << (32 - s0) or, s2, s2, t0 # s2 |= t0 srl, s3, a1, s0 # s3 = a1 >> s0 slli, s0, s0, 1 # s0 *= 2 or, a0, a0, s2 or, a1, a1, s3 bge s1, s0, _clz_loop mv s0, a0 mv, s1, a1 # continued from _clz_loop # s0: num_l # s1: num_u _clz: srli, s2, s0, 1 # s2 = s0 >> 1 slli, t0, s1, 31 # t0 = s1 << (32 - 1) or, s2, s2, t0 # s2 |= t0 srli, s3, s1, 1 # s3 = s1 >> 1 li, t0, 0x55555555 and, s2, s2, t0 # s2 = s2 & 0x55555555 and, s3, s3, t0 # s3 = s3 & 0x55555555 sub, t0, s0, s2 # t0 = s0 - s2 sltu, t1, s0, t0 # borrow sub, s1, s1, s3 # s1 = s1 - s3 sub, s1, s1, t1 mv s0, t0 srli, s2, s0, 2 # s2 = s0 >> 2 slli, t0, s1, 30 # t0 = s1 << (32 - 2) or, s2, s2, t0 # s2 |= t0 srli, s3, s1, 2 # s3 = s1 >> 2 li, t0, 0x33333333 and, s2, s2, t0 # s2 = s2 & 0x33333333 and, s3, s3, t0 # s3 = s3 & 0x33333333 and, s4, s0, t0 # s4 = s0 & 0x33333333 and, s5, s1, t0 # s5 = s1 & 0x33333333 add, s0, s2, s4 sltu, t0, s0, s2 # carry add, s1, s3, s5 add, s1, s1, t0 srli, s2, s0, 4 # s2 = s0 >> 4 slli, t0, s1, 28 # t0 = s1 << (32 - 4) or, s2, s2, t0 # s2 |= t0 srli, s3, s1, 4 # s3 = s1 >> 4 add, s4, s0, s2 # s4 = s0 + s2 sltu, t0, s4, s0 # carry add, s5, s1, s3 # s5 = s1 + s3 add, s5, s5, t0 li, t0, 0x0f0f0f0f and, s0, s4, t0 and, s1, s5, t0 srli, s2, s0, 8 # s2 = s0 >> 8 slli, t0, s1, 24 # t0 = s1 << (32 - 8) or, s2, s2, t0 # s2 |= t0 srli, s3, s1, 8 # s3 = s1 >> 8 add, s0, s0, s2 sltu, t0, s0, s2 # carry add, s1, s1, s3 add, s1, s1, t0 srli, s2, s0, 16 # s2 = s0 >> 16 slli, t0, s1, 16 # t0 = s1 << (32 - 16) or, s2, s2, t0 # s2 |= t0 srli, s3, s1, 16 # s3 = s1 >> 16 add, s0, s0, s2 sltu, t0, s0, s2 # carry add, s1, s1, s3 add, s1, s1, t0 mv, s2, s1 # >> 32 => s2 = s1 li, s3, 0 # s3 = 0 add, s0, s0, s2 sltu, t0, s0, s2 # carry add, s1, s1, s3 add, s1, s1, t0 li, t0, 64 andi, t1, s0, 0x7f sub, a0, t0, t1 jr, ra # arg # a0: power (for power 2) # a1: clz # return # a0: result logp2: mv, s0, a0 mv, s1, a1 mv, a0, zero li, t0, 64 sub, t0, t0, s1 logp2_loop: sub, t0, t0, s0 bge, zero, t0, logp2_ret addi, a0, a0, 1 j, logp2_loop logp2_ret: jr, ra ``` ![](https://hackmd.io/_uploads/rygzZToga.png) ### version 3 with multiple test cases Test case with : $log_{2^1} {num\_1}$ to $log_{2^4} {num\_1}$, $log_{2^1} {num\_2}$ to $log_{2^4} {num\_2}$, $log_{2^1} {num\_3}$ to $log_{2^4} {num\_3}$ ```asm! .data case_num: .word 3 num_1: .dword 0x0000000000000010 num_2: .dword 0x0000000000000040 num_3: .dword 0x0a00492800010298 _EOL: .string "\n" .text main: la, s0, case_num la, s1, num_1 la, s2, _EOL lw, s0, 0(s0) addi, sp, sp, -20 # push sw, s0, 0(sp) sw, s1, 4(sp) sw, s2, 8(sp) case_loop: lw, a0, 0(s1) lw, a1, 4(s1) jal, ra, count_leading_zeros mv, a1, a0 li, a0, 1 sw, a0, 12(sp) sw, a1, 16(sp) # base from 2 to 16 base_loop: jal, ra, logp2 li, a7, 1 # print result ecall lw, a0, 8(sp) li, a7, 4 # print EOL ecall lw, a0, 12(sp) lw, a1, 16(sp) addi, a0, a0, 1 li, t0, 4 sw, a0, 12(sp) bge, t0, a0, base_loop lw, s0, 0(sp) lw, s1, 4(sp) addi, s0, s0, -1 beq, zero, s0, exit addi, s1, s1, 8 sw, s0, 0(sp) sw, s1, 4(sp) j, case_loop exit: addi, sp, sp, 20 # pop li, a7, 10 # exit ecall # arg # a0: num_l # a1: num_u count_leading_zeros: li, s0, 1 li, s1, 32 # x |= (x >> 1) # x |= (x >> 2) # x |= (x >> 4) # x |= (x >> 8) # x |= (x >> 16) # x |= (x >> 32) _clz_loop: srl, s2, a0, s0 # s2 = a0 >> s0 sub, t0, s1, s0 # t0 = 32 - s0 sll, t0, a1, t0 # t0 = a1 << (32 - s0) or, s2, s2, t0 # s2 |= t0 srl, s3, a1, s0 # s3 = a1 >> s0 slli, s0, s0, 1 # s0 *= 2 or, a0, a0, s2 or, a1, a1, s3 bge s1, s0, _clz_loop mv s0, a0 mv, s1, a1 # continued from _clz_loop # s0: num_l # s1: num_u _clz: srli, s2, s0, 1 # s2 = s0 >> 1 slli, t0, s1, 31 # t0 = s1 << (32 - 1) or, s2, s2, t0 # s2 |= t0 srli, s3, s1, 1 # s3 = s1 >> 1 li, t0, 0x55555555 and, s2, s2, t0 # s2 = s2 & 0x55555555 and, s3, s3, t0 # s3 = s3 & 0x55555555 sub, t0, s0, s2 # t0 = s0 - s2 sltu, t1, s0, t0 # borrow sub, s1, s1, s3 # s1 = s1 - s3 sub, s1, s1, t1 mv s0, t0 srli, s2, s0, 2 # s2 = s0 >> 2 slli, t0, s1, 30 # t0 = s1 << (32 - 2) or, s2, s2, t0 # s2 |= t0 srli, s3, s1, 2 # s3 = s1 >> 2 li, t0, 0x33333333 and, s2, s2, t0 # s2 = s2 & 0x33333333 and, s3, s3, t0 # s3 = s3 & 0x33333333 and, s4, s0, t0 # s4 = s0 & 0x33333333 and, s5, s1, t0 # s5 = s1 & 0x33333333 add, s0, s2, s4 sltu, t0, s0, s2 # carry add, s1, s3, s5 add, s1, s1, t0 srli, s2, s0, 4 # s2 = s0 >> 4 slli, t0, s1, 28 # t0 = s1 << (32 - 4) or, s2, s2, t0 # s2 |= t0 srli, s3, s1, 4 # s3 = s1 >> 4 add, s4, s0, s2 # s4 = s0 + s2 sltu, t0, s4, s0 # carry add, s5, s1, s3 # s5 = s1 + s3 add, s5, s5, t0 li, t0, 0x0f0f0f0f and, s0, s4, t0 and, s1, s5, t0 srli, s2, s0, 8 # s2 = s0 >> 8 slli, t0, s1, 24 # t0 = s1 << (32 - 8) or, s2, s2, t0 # s2 |= t0 srli, s3, s1, 8 # s3 = s1 >> 8 add, s0, s0, s2 sltu, t0, s0, s2 # carry add, s1, s1, s3 add, s1, s1, t0 srli, s2, s0, 16 # s2 = s0 >> 16 slli, t0, s1, 16 # t0 = s1 << (32 - 16) or, s2, s2, t0 # s2 |= t0 srli, s3, s1, 16 # s3 = s1 >> 16 add, s0, s0, s2 sltu, t0, s0, s2 # carry add, s1, s1, s3 add, s1, s1, t0 mv, s2, s1 # >> 32 => s2 = s1 li, s3, 0 # s3 = 0 add, s0, s0, s2 sltu, t0, s0, s2 # carry add, s1, s1, s3 add, s1, s1, t0 li, t0, 64 andi, t1, s0, 0x7f sub, a0, t0, t1 jr, ra # arg # a0: power (for power 2) # a1: clz # return # a0: result logp2: mv, s0, a0 mv, s1, a1 mv, a0, zero li, t0, 64 sub, t0, t0, s1 logp2_loop: sub, t0, t0, s0 bge, zero, t0, logp2_ret addi, a0, a0, 1 j, logp2_loop logp2_ret: jr, ra ``` ![](https://hackmd.io/_uploads/S1Am-pilp.png) ## Analysis & Pipeline Generated from [Ripes](https://github.com/mortbopet/Ripes) ``` 00000000 <main>: 0: 10000297 auipc x5 0x10000 4: 00028293 addi x5 x5 0 8: 0002a503 lw x10 0 x5 c: 0042a583 lw x11 4 x5 10: 020000ef jal x1 32 <count_leading_zeros> 14: 00050593 addi x11 x10 0 18: 00100513 addi x10 x0 1 1c: 14c000ef jal x1 332 <logp2> 20: 00100893 addi x17 x0 1 24: 00000073 ecall 28: 00a00893 addi x17 x0 10 2c: 00000073 ecall 00000030 <count_leading_zeros>: 30: 00100413 addi x8 x0 1 34: 02000493 addi x9 x0 32 00000038 <_clz_loop>: 38: 00855933 srl x18 x10 x8 3c: 408482b3 sub x5 x9 x8 40: 005592b3 sll x5 x11 x5 44: 00596933 or x18 x18 x5 48: 0085d9b3 srl x19 x11 x8 4c: 00141413 slli x8 x8 1 50: 01256533 or x10 x10 x18 54: 0135e5b3 or x11 x11 x19 58: fe84d0e3 bge x9 x8 -32 <_clz_loop> 5c: 00050413 addi x8 x10 0 60: 00058493 addi x9 x11 0 00000064 <_clz>: 64: 00145913 srli x18 x8 1 68: 01f49293 slli x5 x9 31 6c: 00596933 or x18 x18 x5 70: 0014d993 srli x19 x9 1 74: 555552b7 lui x5 0x55555 78: 55528293 addi x5 x5 1365 7c: 00597933 and x18 x18 x5 80: 0059f9b3 and x19 x19 x5 84: 412402b3 sub x5 x8 x18 88: 00543333 sltu x6 x8 x5 8c: 413484b3 sub x9 x9 x19 90: 406484b3 sub x9 x9 x6 94: 00028413 addi x8 x5 0 98: 00245913 srli x18 x8 2 9c: 01e49293 slli x5 x9 30 a0: 00596933 or x18 x18 x5 a4: 0024d993 srli x19 x9 2 a8: 333332b7 lui x5 0x33333 ac: 33328293 addi x5 x5 819 b0: 00597933 and x18 x18 x5 b4: 0059f9b3 and x19 x19 x5 b8: 00547a33 and x20 x8 x5 bc: 0054fab3 and x21 x9 x5 c0: 01490433 add x8 x18 x20 c4: 012432b3 sltu x5 x8 x18 c8: 015984b3 add x9 x19 x21 cc: 005484b3 add x9 x9 x5 d0: 00445913 srli x18 x8 4 d4: 01c49293 slli x5 x9 28 d8: 00596933 or x18 x18 x5 dc: 0044d993 srli x19 x9 4 e0: 01240a33 add x20 x8 x18 e4: 008a32b3 sltu x5 x20 x8 e8: 01348ab3 add x21 x9 x19 ec: 005a8ab3 add x21 x21 x5 f0: 0f0f12b7 lui x5 0xf0f1 f4: f0f28293 addi x5 x5 -241 f8: 005a7433 and x8 x20 x5 fc: 005af4b3 and x9 x21 x5 100: 00845913 srli x18 x8 8 104: 01849293 slli x5 x9 24 108: 00596933 or x18 x18 x5 10c: 0084d993 srli x19 x9 8 110: 01240433 add x8 x8 x18 114: 012432b3 sltu x5 x8 x18 118: 013484b3 add x9 x9 x19 11c: 005484b3 add x9 x9 x5 120: 01045913 srli x18 x8 16 124: 01049293 slli x5 x9 16 128: 00596933 or x18 x18 x5 12c: 0104d993 srli x19 x9 16 130: 01240433 add x8 x8 x18 134: 012432b3 sltu x5 x8 x18 138: 013484b3 add x9 x9 x19 13c: 005484b3 add x9 x9 x5 140: 00048913 addi x18 x9 0 144: 00000993 addi x19 x0 0 148: 01240433 add x8 x8 x18 14c: 012432b3 sltu x5 x8 x18 150: 013484b3 add x9 x9 x19 154: 005484b3 add x9 x9 x5 158: 04000293 addi x5 x0 64 15c: 07f47313 andi x6 x8 127 160: 40628533 sub x10 x5 x6 164: 00008067 jalr x0 x1 0 00000168 <logp2>: 168: 00050413 addi x8 x10 0 16c: 00058493 addi x9 x11 0 170: 00000513 addi x10 x0 0 174: 04000293 addi x5 x0 64 178: 409282b3 sub x5 x5 x9 0000017c <logp2_loop>: 17c: 408282b3 sub x5 x5 x8 180: 00505663 bge x0 x5 12 <logp2_ret> 184: 00150513 addi x10 x10 1 188: ff5ff06f jal x0 -12 <logp2_loop> 0000018c <logp2_ret>: 18c: 00008067 jalr x0 x1 0 ``` ### IF stage - At the beginning, [`IR`(Instruction Register)](https://en.wikipedia.org/wiki/Instruction_register) is `0x0`, meaning that we are going to fetch instruction at `0x0`, `auipc x5 0x10000`. - And `PC` is `0x4`. Then next instruction we are going to fetch - By `IR`, we can get the machine code from `Instr. memory` ![](https://hackmd.io/_uploads/SJxvbajea.png) - `PC` should be `IR` + 4 if no bracnching occured. ![](https://hackmd.io/_uploads/r1od-Tie6.png) ### ID stage - The `auipc x5 0x10000` is an U-type instruction. The `OP` code for `auipc` is `0010111`. So the machine is |IMM|rd|op| |:-:|:-:|:-:| |00010000000000000000|00101|0010111| And `0x10000297` in hex. After decoding, the processor get the `OP` code `0010111` and know that this is `auipc` instruction. And destination register is `x5`. And the upper immediate value is `0x10000`. So the immediate value is `0x10000000` ![](https://hackmd.io/_uploads/HkT5Zpolp.png) ### EX stage 1. The `auipc x5 0x10000` would add the `PC`, which is `0x0` when fetching it, ![](https://hackmd.io/_uploads/SyxUn-6ie6.png) and upper immediate value `0x10000000`. Then the result is `0x10000000` ![](https://hackmd.io/_uploads/Skhab6ilp.png) 2. The `10: 020000ef jal x1 32 <count_leading_zeros>` set the `PC` to new position ![](https://hackmd.io/_uploads/rylyGTje6.png) ### MEM stage 1. The `8: 0002a503 lw x10 0 x5` instruction load data from the address of `x5`, which is `0x10000000` ![](https://hackmd.io/_uploads/ByOefTie6.png) and put the data to `Read out` ![](https://hackmd.io/_uploads/BJ07fTsga.png) 2. The `0: 10000297 auipc x5 0x10000` instruction. After EX stage and in MEM stage, it update the register `x5` being used in EX stage ![](https://hackmd.io/_uploads/BJeDGpoxa.png) 3. After EX stage of `10: 020000ef jal x1 32 <count_leading_zeros>` and in MEM stage, it flush the instructions in ID and EXE stage. ![](https://hackmd.io/_uploads/BJMKGTslT.png) ### WB stage 1. The `0: 10000297 auipc x5 0x10000` instruction write the EX stage result to the register `x5`. ![](https://hackmd.io/_uploads/BySczTixT.png) 2. The `4: 00028293 addi x5 x5 0` not only write the EX stage result to register `x5`, but also update the register `x5` beinging used in EX stage ![](https://hackmd.io/_uploads/HyvsfpsxT.png) ## Reference 1. [Implement log2 with branchless clz](https://graphics.stanford.edu/~seander/bithacks.html) 2. [Instruction register wiki](https://en.wikipedia.org/wiki/Instruction_register) 3. [Introduction to the computer: how a computer executes instructions](http://www.cs.emory.edu/~cheung/Courses/170/Syllabus/01/intro-computer2.html)