Try   HackMD

Assignment1: RISC-V Assembly and Instruction Pipeline

contributed by <hugo0406>

Find Leftmost 0-byte using CLZ

  • In C,the end of the string is denoted by an all-0 byte. To find the length of a string, a C program uses the “strlen” function.This function searches the string, from left to right, for the 0-byte, and returns the number of bytes scanned,not counting the 0-byte.

  • A fast implementation of “strlen” might load and test single bytes until a word boundary is reached,and then load a word at a time into a register, and test the register for the presence of a 0-byte.

  • Image Not Showing Possible Reasons
    • The image was uploaded to a note which you don't have access to
    • The note which the image was originally uploaded to has been deleted
    Learn More →

    • Values from 0 to 3 denoting bytes 0 to 3, and a value of 4 denoting that there's no 0-byte in the word.
    • “00” denotes a 0-byte, “nn” denotes a nonzero byte, and “xx” denotes a byte that may be 0 or nonzero.
    • Follows considering a 64-bit(double word) case instead of 32-bit,then range of the function's return value is 0 to 8.

Implementation

C Code

#include <stdio.h> #include <stdint.h> uint16_t count_leading_zeros(uint64_t x) { x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); x |= (x >> 32); /* count ones (population count) */ x -= ((x >> 1) & 0x5555555555555555); x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333); x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f; x += (x >> 8); x += (x >> 16); x += (x >> 32); return (64 - (x & 0x7f)); } uint16_t zbytel(uint64_t x) { uint64_t y; uint16_t n; y = (x & 0x7F7F7F7F7F7F7F7F)+ 0x7F7F7F7F7F7F7F7F; // convert each 0-byte to 0x80 y = ~(y | x |0x7F7F7F7F7F7F7F7F); //and each nonzero byte to 0x00 n = count_leading_zeros(y) >> 3 ; // use number of leading zeros return n; // n = 0 ... 8 , 8 if x has no 0-byte. } int main( int argc, char *argv[ ] ){ uint64_t a = 0x1122334455007788; //In this example, uint16_t zbla = zbytel(a); // the value is 5. uint64_t b = 0x1122334455667788; //Another example, uint16_t zblb = zbytel(b); //the value is 8. printf("%llx\n",a); printf("%d\n",zbla); printf("%llx\n",b); printf("%d\n",zblb); return 0; }

Assembly Code

VersionⅠ use only one test case :

.data test1: .dword 0x1122334455007700 str1: .string "The Leftmost 0-byte is " .text main: la t0,test1 #a1a0 =test1 lw a0,0(t0) lw a1,4(t0) jal ra,zbytel mv t0,a0 la a0,str1 li a7,4 ecall mv a0,t0 li a7,1 ecall li a7, 10 ecall zbytel: addi sp,sp,-4 #push sw ra,0(sp) mv s0,a0 #s0:test1_half_right mv s1,a1 #s1:test1_half_left #y = (x & 0x7F7F7F7F7F7F7F7F)+ 0x7F7F7F7F7F7F7F7F li t0,0x7f7f7f7f and s2,s0,t0 add s2,s2,t0 #y = ~(y | x |0x7F7F7F7F7F7F7F7F) or s2,s2,s0 or s2,s2,t0 xori s2,s2,-1 #s2:y_half_right and s3,s1,t0 add s3,s3,t0 or s3,s3,s0 or s3,s3,t0 xori s3,s3,-1 #s3:y_half_left mv a0,s2 mv a1,s3 jal clz lw ra,0(sp) addi sp,sp,4 #pop srli a0,a0,3 #clz(y)>>3 jr ra clz: #x |= (x >> 1) andi t1,a1,0x1 srli s4,a1,1 srli s5,a0,1 slli t1,t1,31 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 2) andi t1,a1,0x3 srli s4,a1,2 srli s5,a0,2 slli t1,t1,30 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 4) andi t1,a1,0xf srli s4,a1,4 srli s5,a0,4 slli t1,t1,28 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 8) andi t1,a1,0xff srli s4,a1,8 srli s5,a0,8 slli t1,t1,24 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 16) li t1,0xffff and t1,a1,t1 srli s4,a1,16 srli s5,a0,16 slli t1,t1,16 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 32) mv s5,a1 and s4,a1,x0 or a1,s4,a1 or a0,s5,a0 # x -= ((x >> 1) & 0x5555555555555555) andi t1,a1,0x1 srli s4,a1,1 srli s5,a0,1 slli t1,t1,31 or s5,s5,t1 li t1,0x55555555 and s4,s4,t1 and s5,s5,t1 sub a1,a1,s4 sub a0,a0,s5 #x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333) andi t1,a1,0x3 srli s4,a1,2 srli s5,a0,2 slli t1,t1,30 or s5,s5,t1 li t1,0x33333333 and s4,s4,t1 #s4=>a1 and s5,s5,t1 #s5=>a0 and a1,a1,t1 and a0,a0,t1 add a1,a1,s4 add a0,a0,s5 #x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f; andi t1,a1,0xf srli s4,a1,4 srli s5,a0,4 slli t1,t1,28 or s5,s5,t1 add s4,s4,a1 add s5,s5,a0 li t1,0x0f0f0f0f and a1,s4,t1 and a0,s5,t1 #x += (x >> 8) andi t1,a1,0xff srli s4,a1,8 srli s5,a0,8 slli t1,t1,24 or s5,s5,t1 add a1,a1,s4 add a0,a0,s5 #x += (x >> 16) li t1,0xffff and t1,t1,a1 srli s4,a1,16 srli s5,a0,16 slli t1,t1,16 or s5,s5,t1 add a1,a1,s4 add a0,a0,s5 #x += (x >> 32) mv s5,a1 and s4,a1,x0 add a1,a1,s4 add a0,a0,s5 #64 - (x & 0x7f) andi a0,a0,0x7f li t1,64 sub a0,t1,a0 jr ra

VersionⅡ use multiple test cases by loops:

.data test1: .dword 0x1122334455007700 test2: .dword 0x0123456789abcdef test3: .dword 0x1100220033445566 str1: .string "The Leftmost 0-byte is " endl: .string "\n" .text main: la t2, test1 # t2 points to the first test case li t3, 3 # number of test cases loop: lw a0, 0(t2) # a0:test_half_right lw a1, 4(t2) # a1:test_half_left jal zbytel mv t4, a0 # save the result to t4 #print la a0, str1 li a7, 4 ecall mv a0, t4 li a7, 1 ecall la a0, endl li a7, 4 ecall addi t2, t2, 8 # move to the next test case addi t3, t3, -1 # test case counter-- bne t3, x0, loop # counter=0,break li a7, 10 # exit ecall zbytel: addi sp,sp,-4 #push sw ra,0(sp) mv s0,a0 #s0:test_half_right mv s1,a1 #s1:test_half_left #y = (x & 0x7F7F7F7F7F7F7F7F)+ 0x7F7F7F7F7F7F7F7F li t0,0x7f7f7f7f and s2,s0,t0 add s2,s2,t0 #y = ~(y | x |0x7F7F7F7F7F7F7F7F) or s2,s2,s0 or s2,s2,t0 xori s2,s2,-1 #s2:y_half_right and s3,s1,t0 add s3,s3,t0 or s3,s3,s0 or s3,s3,t0 xori s3,s3,-1 #s3:y_half_left mv a0,s2 mv a1,s3 jal clz lw ra,0(sp) addi sp,sp,4 #pop srli a0,a0,3 #clz(y)>>3 jr ra clz: #x |= (x >> 1) andi t1,a1,0x1 srli s4,a1,1 srli s5,a0,1 slli t1,t1,31 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 2) andi t1,a1,0x3 srli s4,a1,2 srli s5,a0,2 slli t1,t1,30 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 4) andi t1,a1,0xf srli s4,a1,4 srli s5,a0,4 slli t1,t1,28 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 8) andi t1,a1,0xff srli s4,a1,8 srli s5,a0,8 slli t1,t1,24 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 16) li t1,0xffff and t1,a1,t1 srli s4,a1,16 srli s5,a0,16 slli t1,t1,16 or s5,s5,t1 or a1,s4,a1 or a0,s5,a0 #x |= (x >> 32) mv s5,a1 and s4,a1,x0 or a1,s4,a1 or a0,s5,a0 # x -= ((x >> 1) & 0x5555555555555555) andi t1,a1,0x1 srli s4,a1,1 srli s5,a0,1 slli t1,t1,31 or s5,s5,t1 li t1,0x55555555 and s4,s4,t1 and s5,s5,t1 sub a1,a1,s4 sub a0,a0,s5 #x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333) andi t1,a1,0x3 srli s4,a1,2 srli s5,a0,2 slli t1,t1,30 or s5,s5,t1 li t1,0x33333333 and s4,s4,t1 and s5,s5,t1 and a1,a1,t1 and a0,a0,t1 add a1,a1,s4 add a0,a0,s5 #x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f; andi t1,a1,0xf srli s4,a1,4 srli s5,a0,4 slli t1,t1,28 or s5,s5,t1 add s4,s4,a1 add s5,s5,a0 li t1,0x0f0f0f0f and a1,s4,t1 and a0,s5,t1 #x += (x >> 8) andi t1,a1,0xff srli s4,a1,8 srli s5,a0,8 slli t1,t1,24 or s5,s5,t1 add a1,a1,s4 add a0,a0,s5 #x += (x >> 16) li t1,0xffff and t1,t1,a1 srli s4,a1,16 srli s5,a0,16 slli t1,t1,16 or s5,s5,t1 add a1,a1,s4 add a0,a0,s5 #x += (x >> 32) mv s5,a1 and s4,a1,x0 add a1,a1,s4 add a0,a0,s5 #64 - (x & 0x7f) andi a0,a0,0x7f li t1,64 sub a0,t1,a0 jr ra

Output

Image Not Showing Possible Reasons
  • The image was uploaded to a note which you don't have access to
  • The note which the image was originally uploaded to has been deleted
Learn More →
Image Not Showing Possible Reasons
  • The image was uploaded to a note which you don't have access to
  • The note which the image was originally uploaded to has been deleted
Learn More →

Analysis & Pipeline

Generated from Ripes.
Below are a few lines of disassembled code:

00000000 <main>:
    0:        10000397        auipc x7 0x10000
    4:        00038393        addi x7 x7 0
    8:        00300e13        addi x28 x0 3

0000000c <loop>:
    c:        0003a503        lw x10 0 x7
    10:        0043a583        lw x11 4 x7
    14:        048000ef        jal x1 72 <zbytel>
    18:        00050e93        addi x29 x10 0
    1c:        10000517        auipc x10 0x10000
    20:        ffc50513        addi x10 x10 -4
    24:        00400893        addi x17 x0 4
    28:        00000073        ecall
    2c:        000e8513        addi x10 x29 0
    30:        00100893        addi x17 x0 1
    34:        00000073        ecall
    38:        10000517        auipc x10 0x10000
    3c:        ff850513        addi x10 x10 -8
    40:        00400893        addi x17 x0 4
    44:        00000073        ecall
    48:        00838393        addi x7 x7 8
    4c:        fffe0e13        addi x28 x28 -1
    50:        fa0e1ee3        bne x28 x0 -68 <loop>
    54:        00a00893        addi x17 x0 10
    58:        00000073        ecall

0000005c <zbytel>:
    5c:        ffc10113        addi x2 x2 -4
    60:        00112023        sw x1 0 x2
    64:        00050413        addi x8 x10 0
    68:        00058493        addi x9 x11 0
    6c:        7f7f82b7        lui x5 0x7f7f8
    70:        f7f28293        addi x5 x5 -129
    74:        00547933        and x18 x8 x5
    78:        00590933        add x18 x18 x5
    7c:        00896933        or x18 x18 x8
    80:        00596933        or x18 x18 x5
    84:        fff94913        xori x18 x18 -1
    88:        0054f9b3        and x19 x9 x5
    8c:        005989b3        add x19 x19 x5
    90:        0089e9b3        or x19 x19 x8
    94:        0059e9b3        or x19 x19 x5
    98:        fff9c993        xori x19 x19 -1
    9c:        00090513        addi x10 x18 0
    a0:        00098593        addi x11 x19 0
    a4:        014000ef        jal x1 20 <clz>
    a8:        00012083        lw x1 0 x2
    ac:        00410113        addi x2 x2 4
    b0:        00355513        srli x10 x10 3
    b4:        00008067        jalr x0 x1 0

Take the instruction jal x1 72 <zbytel>,which address is at 0x14, and see how it works in pipeline:

IF Stage

  • First of all ,the PC is 0x14,meaning that we are going to fetch instruction jal x1 72 <zbytel>.

  • From Instr. memory ,we can get instruction machine code 0x048000ef ,also is jal x1 72 <zbytel> will get into the next stage.

  • PC= PC+4, if no bracnching occured ,where next instuction we're going to fetch at next cycle.

ID Stage


  • In this stage, the decoder decodes the machine code 0x048000ef.

  • opcodefield is 1101111,represnt jal instruction

  • rd field is 00001 ,represent x1(ra)

  • imm field is 00000100100000000000

    • imm[10:1]=0000100100,so immediate value is 0x00000048
    • instr[19:15]=00000,so R1 idx is 0x00
    • insrt[24:20]=01000,so R2 idx is 0x08

EXE Stage

  • In this stage, the ALU sums the inputs 0x00000014(the address) and 0x00000048 (the immediate),then get 0x0000005c where zbytel at.

MEM Stage

  • Flushing 2 instructions(use nop) next 2 cycles.
  • IF stage fetch the instruction at 0x0000005c.
  • The jal instruction doesn't involve memory access, so there are no memory-related operations in this stage.

WB Stage

  • Lastly, the processor writes 0x00000018 into x1(ra).

Reference

  1. Hacker's Delight
  2. RISC-V Instruction Set Manual
  3. RISC-V Instruction Format