# Assignment1: RISC-V Assembly and Instruction Pipeline contributed by <[`hugo0406`](https://github.com/hugo0406/Computer-Architecture/tree/main/hw1)> ## Find Leftmost 0-byte using CLZ - In C,the end of the string is denoted by an all-0 byte. To find the length of a string, a C program uses the “strlen” function.This function searches the string, from left to right, for the 0-byte, and returns the number of bytes scanned,not counting the 0-byte. - A fast implementation of “strlen” might load and test single bytes until a word boundary is reached,and then load a word at a time into a register, and test the register for the presence of a 0-byte. - ![](https://hackmd.io/_uploads/rJ47QNlbp.png) - Values from 0 to 3 denoting bytes 0 to 3, and a value of 4 denoting that there's no 0-byte in the word. - “00” denotes a 0-byte, “nn” denotes a nonzero byte, and “xx” denotes a byte that may be 0 or nonzero. - Follows considering a 64-bit(double word) case instead of 32-bit,then range of the function's return value is 0 to 8. ## Implementation ### C Code ```code= #include <stdio.h> #include <stdint.h> uint16_t count_leading_zeros(uint64_t x) { x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); x |= (x >> 32); /* count ones (population count) */ x -= ((x >> 1) & 0x5555555555555555); x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333); x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f; x += (x >> 8); x += (x >> 16); x += (x >> 32); return (64 - (x & 0x7f)); } uint16_t zbytel(uint64_t x) { uint64_t y; uint16_t n; y = (x & 0x7F7F7F7F7F7F7F7F)+ 0x7F7F7F7F7F7F7F7F; // convert each 0-byte to 0x80 y = ~(y | x |0x7F7F7F7F7F7F7F7F); //and each nonzero byte to 0x00 n = count_leading_zeros(y) >> 3 ; // use number of leading zeros return n; // n = 0 ... 8 , 8 if x has no 0-byte. } int main( int argc, char *argv[ ] ){ uint64_t a = 0x1122334455007788; //In this example, uint16_t zbla = zbytel(a); // the value is 5. uint64_t b = 0x1122334455667788; //Another example, uint16_t zblb = zbytel(b); //the value is 8. printf("%llx\n",a); printf("%d\n",zbla); printf("%llx\n",b); printf("%d\n",zblb); return 0; } ``` ### Assembly Code **VersionⅠ** use only one test case : ```code= .data test1: .dword 0x1122334455007700 str1: .string "The Leftmost 0-byte is " .text main: la t0,test1 #a1a0 =test1 lw a0,0(t0) lw a1,4(t0) jal ra,zbytel mv t0,a0 la a0,str1 li a7,4 ecall mv a0,t0 li a7,1 ecall li a7, 10 ecall zbytel: addi sp,sp,-4 #push sw ra,0(sp) mv s0,a0 #s0:test1_half_right mv s1,a1 #s1:test1_half_left #y = (x & 0x7F7F7F7F7F7F7F7F)+ 0x7F7F7F7F7F7F7F7F li t0,0x7f7f7f7f and s2,s0,t0 add s2,s2,t0 #y = ~(y | x |0x7F7F7F7F7F7F7F7F) or s2,s2,s0 or s2,s2,t0 xori s2,s2,-1 #s2:y_half_right and s3,s1,t0 add s3,s3,t0 or s3,s3,s0 or s3,s3,t0 xori s3,s3,-1 #s3:y_half_left mv a0,s2 mv a1,s3 jal clz lw ra,0(sp) addi sp,sp,4 #pop srli a0,a0,3 #clz(y)>>3 jr ra clz: #x |= (x >> 1) andi t1,a1,0x1 srli s4,a1,1 srli s5,a0,1 slli t1,t1,31 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 2) andi t1,a1,0x3 srli s4,a1,2 srli s5,a0,2 slli t1,t1,30 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 4) andi t1,a1,0xf srli s4,a1,4 srli s5,a0,4 slli t1,t1,28 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 8) andi t1,a1,0xff srli s4,a1,8 srli s5,a0,8 slli t1,t1,24 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 16) li t1,0xffff and t1,a1,t1 srli s4,a1,16 srli s5,a0,16 slli t1,t1,16 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 32) mv s5,a1 and s4,a1,x0 or a1,s4,a1 or a0,s5,a0 # x -= ((x >> 1) & 0x5555555555555555) andi t1,a1,0x1 srli s4,a1,1 srli s5,a0,1 slli t1,t1,31 or s5,s5,t1 li t1,0x55555555 and s4,s4,t1 and s5,s5,t1 sub a1,a1,s4 sub a0,a0,s5 #x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333) andi t1,a1,0x3 srli s4,a1,2 srli s5,a0,2 slli t1,t1,30 or s5,s5,t1 li t1,0x33333333 and s4,s4,t1 #s4=>a1 and s5,s5,t1 #s5=>a0 and a1,a1,t1 and a0,a0,t1 add a1,a1,s4 add a0,a0,s5 #x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f; andi t1,a1,0xf srli s4,a1,4 srli s5,a0,4 slli t1,t1,28 or s5,s5,t1 add s4,s4,a1 add s5,s5,a0 li t1,0x0f0f0f0f and a1,s4,t1 and a0,s5,t1 #x += (x >> 8) andi t1,a1,0xff srli s4,a1,8 srli s5,a0,8 slli t1,t1,24 or s5,s5,t1 add a1,a1,s4 add a0,a0,s5 #x += (x >> 16) li t1,0xffff and t1,t1,a1 srli s4,a1,16 srli s5,a0,16 slli t1,t1,16 or s5,s5,t1 add a1,a1,s4 add a0,a0,s5 #x += (x >> 32) mv s5,a1 and s4,a1,x0 add a1,a1,s4 add a0,a0,s5 #64 - (x & 0x7f) andi a0,a0,0x7f li t1,64 sub a0,t1,a0 jr ra ``` **VersionⅡ** use multiple test cases by loops: ```code= .data test1: .dword 0x1122334455007700 test2: .dword 0x0123456789abcdef test3: .dword 0x1100220033445566 str1: .string "The Leftmost 0-byte is " endl: .string "\n" .text main: la t2, test1 # t2 points to the first test case li t3, 3 # number of test cases loop: lw a0, 0(t2) # a0:test_half_right lw a1, 4(t2) # a1:test_half_left jal zbytel mv t4, a0 # save the result to t4 #print la a0, str1 li a7, 4 ecall mv a0, t4 li a7, 1 ecall la a0, endl li a7, 4 ecall addi t2, t2, 8 # move to the next test case addi t3, t3, -1 # test case counter-- bne t3, x0, loop # counter=0,break li a7, 10 # exit ecall zbytel: addi sp,sp,-4 #push sw ra,0(sp) mv s0,a0 #s0:test_half_right mv s1,a1 #s1:test_half_left #y = (x & 0x7F7F7F7F7F7F7F7F)+ 0x7F7F7F7F7F7F7F7F li t0,0x7f7f7f7f and s2,s0,t0 add s2,s2,t0 #y = ~(y | x |0x7F7F7F7F7F7F7F7F) or s2,s2,s0 or s2,s2,t0 xori s2,s2,-1 #s2:y_half_right and s3,s1,t0 add s3,s3,t0 or s3,s3,s0 or s3,s3,t0 xori s3,s3,-1 #s3:y_half_left mv a0,s2 mv a1,s3 jal clz lw ra,0(sp) addi sp,sp,4 #pop srli a0,a0,3 #clz(y)>>3 jr ra clz: #x |= (x >> 1) andi t1,a1,0x1 srli s4,a1,1 srli s5,a0,1 slli t1,t1,31 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 2) andi t1,a1,0x3 srli s4,a1,2 srli s5,a0,2 slli t1,t1,30 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 4) andi t1,a1,0xf srli s4,a1,4 srli s5,a0,4 slli t1,t1,28 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 8) andi t1,a1,0xff srli s4,a1,8 srli s5,a0,8 slli t1,t1,24 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 16) li t1,0xffff and t1,a1,t1 srli s4,a1,16 srli s5,a0,16 slli t1,t1,16 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 32) mv s5,a1 and s4,a1,x0 or a1,s4,a1 or a0,s5,a0 # x -= ((x >> 1) & 0x5555555555555555) andi t1,a1,0x1 srli s4,a1,1 srli s5,a0,1 slli t1,t1,31 or s5,s5,t1 li t1,0x55555555 and s4,s4,t1 and s5,s5,t1 sub a1,a1,s4 sub a0,a0,s5 #x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333) andi t1,a1,0x3 srli s4,a1,2 srli s5,a0,2 slli t1,t1,30 or s5,s5,t1 li t1,0x33333333 and s4,s4,t1 and s5,s5,t1 and a1,a1,t1 and a0,a0,t1 add a1,a1,s4 add a0,a0,s5 #x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f; andi t1,a1,0xf srli s4,a1,4 srli s5,a0,4 slli t1,t1,28 or s5,s5,t1 add s4,s4,a1 add s5,s5,a0 li t1,0x0f0f0f0f and a1,s4,t1 and a0,s5,t1 #x += (x >> 8) andi t1,a1,0xff srli s4,a1,8 srli s5,a0,8 slli t1,t1,24 or s5,s5,t1 add a1,a1,s4 add a0,a0,s5 #x += (x >> 16) li t1,0xffff and t1,t1,a1 srli s4,a1,16 srli s5,a0,16 slli t1,t1,16 or s5,s5,t1 add a1,a1,s4 add a0,a0,s5 #x += (x >> 32) mv s5,a1 and s4,a1,x0 add a1,a1,s4 add a0,a0,s5 #64 - (x & 0x7f) andi a0,a0,0x7f li t1,64 sub a0,t1,a0 jr ra ``` **Output** <center class="left"> <img src="https://hackmd.io/_uploads/BkzpkwZZ6.png" width="200"/><img src="https://hackmd.io/_uploads/H1d0yP--a.png" width="200"/> </center> ## Analysis & Pipeline Generated from [Ripes](https://github.com/mortbopet/Ripes). Below are a few lines of disassembled code: ``` 00000000 <main>: 0: 10000397 auipc x7 0x10000 4: 00038393 addi x7 x7 0 8: 00300e13 addi x28 x0 3 0000000c <loop>: c: 0003a503 lw x10 0 x7 10: 0043a583 lw x11 4 x7 14: 048000ef jal x1 72 <zbytel> 18: 00050e93 addi x29 x10 0 1c: 10000517 auipc x10 0x10000 20: ffc50513 addi x10 x10 -4 24: 00400893 addi x17 x0 4 28: 00000073 ecall 2c: 000e8513 addi x10 x29 0 30: 00100893 addi x17 x0 1 34: 00000073 ecall 38: 10000517 auipc x10 0x10000 3c: ff850513 addi x10 x10 -8 40: 00400893 addi x17 x0 4 44: 00000073 ecall 48: 00838393 addi x7 x7 8 4c: fffe0e13 addi x28 x28 -1 50: fa0e1ee3 bne x28 x0 -68 <loop> 54: 00a00893 addi x17 x0 10 58: 00000073 ecall 0000005c <zbytel>: 5c: ffc10113 addi x2 x2 -4 60: 00112023 sw x1 0 x2 64: 00050413 addi x8 x10 0 68: 00058493 addi x9 x11 0 6c: 7f7f82b7 lui x5 0x7f7f8 70: f7f28293 addi x5 x5 -129 74: 00547933 and x18 x8 x5 78: 00590933 add x18 x18 x5 7c: 00896933 or x18 x18 x8 80: 00596933 or x18 x18 x5 84: fff94913 xori x18 x18 -1 88: 0054f9b3 and x19 x9 x5 8c: 005989b3 add x19 x19 x5 90: 0089e9b3 or x19 x19 x8 94: 0059e9b3 or x19 x19 x5 98: fff9c993 xori x19 x19 -1 9c: 00090513 addi x10 x18 0 a0: 00098593 addi x11 x19 0 a4: 014000ef jal x1 20 <clz> a8: 00012083 lw x1 0 x2 ac: 00410113 addi x2 x2 4 b0: 00355513 srli x10 x10 3 b4: 00008067 jalr x0 x1 0 ``` Take the instruction ` jal x1 72 <zbytel> `,which address is at 0x14, and see how it works in pipeline: ### IF Stage ![](https://hackmd.io/_uploads/BkbjHYb-a.png) - First of all ,the PC is `0x14`,meaning that we are going to fetch instruction `jal x1 72 <zbytel>`. - From Instr. memory ,we can get instruction machine code `0x048000ef` ,also is `jal x1 72 <zbytel>` will get into the next stage. - PC= PC+4, if no bracnching occured ,where next instuction we're going to fetch at next cycle. ### ID Stage ![](https://hackmd.io/_uploads/BkyUB5W-T.png) ![](https://hackmd.io/_uploads/BylBd5Wbp.png) ![](https://hackmd.io/_uploads/r1Bow9WbT.png) - In this stage, the decoder decodes the machine code `0x048000ef`. - `opcode`field is `1101111`,represnt `jal` instruction - `rd` field is `00001` ,represent x1(ra) - `imm` field is `00000100100000000000` - imm[10:1]=`0000100100`,so immediate value is `0x00000048` - instr[19:15]=`00000`,so R1 idx is `0x00` - insrt[24:20]=`01000`,so R2 idx is `0x08` ### EXE Stage ![](https://hackmd.io/_uploads/SkszVj-Za.png) - In this stage, the ALU sums the inputs `0x00000014`(the address) and `0x00000048` (the immediate),then get `0x0000005c` where **zbytel** at. ### MEM Stage ![](https://hackmd.io/_uploads/Synbis-W6.png) - Flushing 2 instructions(use nop) next 2 cycles. - **IF** stage fetch the instruction at `0x0000005c`. - The `jal` instruction doesn't involve memory access, so there are no memory-related operations in this stage. ### WB Stage ![](https://hackmd.io/_uploads/ryVTTjZba.png) - Lastly, the processor writes `0x00000018` into x1(ra). ## Reference 1. [Hacker's Delight](https://doc.lagout.org/security/Hackers%20Delight.pdf) 2. [RISC-V Instruction Set Manual](https://riscv.org//wp-content/uploads/2017/05/riscv-spec-v2.2.pdf) 3. [RISC-V Instruction Format](https://blog.csdn.net/bxdzyhx/article/details/118057390)