# Assignment1: RISC-V Assembly and Instruction Pipeline
contributed by <[`hugo0406`](https://github.com/hugo0406/Computer-Architecture/tree/main/hw1)>
## Find Leftmost 0-byte using CLZ
- In C,the end of the string is denoted by an all-0 byte. To find the length of a string, a C program uses the “strlen” function.This function searches the string, from left to right, for the 0-byte, and returns the number of bytes scanned,not counting the 0-byte.
- A fast implementation of “strlen” might load and test single bytes until a word boundary is reached,and then load a word at a time into a register, and test the register for the presence of a 0-byte.
- 
- Values from 0 to 3 denoting bytes 0 to 3, and a value of 4 denoting that there's no 0-byte in the word.
- “00” denotes a 0-byte, “nn” denotes a nonzero byte, and “xx” denotes a byte that may be 0 or nonzero.
- Follows considering a 64-bit(double word) case instead of 32-bit,then range of the function's return value is 0 to 8.
## Implementation
### C Code
```code=
#include <stdio.h>
#include <stdint.h>
uint16_t count_leading_zeros(uint64_t x)
{
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
x |= (x >> 32);
/* count ones (population count) */
x -= ((x >> 1) & 0x5555555555555555);
x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333);
x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f;
x += (x >> 8);
x += (x >> 16);
x += (x >> 32);
return (64 - (x & 0x7f));
}
uint16_t zbytel(uint64_t x) {
uint64_t y;
uint16_t n;
y = (x & 0x7F7F7F7F7F7F7F7F)+ 0x7F7F7F7F7F7F7F7F; // convert each 0-byte to 0x80
y = ~(y | x |0x7F7F7F7F7F7F7F7F); //and each nonzero byte to 0x00
n = count_leading_zeros(y) >> 3 ; // use number of leading zeros
return n; // n = 0 ... 8 , 8 if x has no 0-byte.
}
int main( int argc, char *argv[ ] ){
uint64_t a = 0x1122334455007788; //In this example,
uint16_t zbla = zbytel(a); // the value is 5.
uint64_t b = 0x1122334455667788; //Another example,
uint16_t zblb = zbytel(b); //the value is 8.
printf("%llx\n",a);
printf("%d\n",zbla);
printf("%llx\n",b);
printf("%d\n",zblb);
return 0;
}
```
### Assembly Code
**VersionⅠ** use only one test case :
```code=
.data
test1: .dword 0x1122334455007700
str1: .string "The Leftmost 0-byte is "
.text
main:
la t0,test1 #a1a0 =test1
lw a0,0(t0)
lw a1,4(t0)
jal ra,zbytel
mv t0,a0
la a0,str1
li a7,4
ecall
mv a0,t0
li a7,1
ecall
li a7, 10
ecall
zbytel:
addi sp,sp,-4 #push
sw ra,0(sp)
mv s0,a0 #s0:test1_half_right
mv s1,a1 #s1:test1_half_left
#y = (x & 0x7F7F7F7F7F7F7F7F)+ 0x7F7F7F7F7F7F7F7F
li t0,0x7f7f7f7f
and s2,s0,t0
add s2,s2,t0
#y = ~(y | x |0x7F7F7F7F7F7F7F7F)
or s2,s2,s0
or s2,s2,t0
xori s2,s2,-1 #s2:y_half_right
and s3,s1,t0
add s3,s3,t0
or s3,s3,s0
or s3,s3,t0
xori s3,s3,-1 #s3:y_half_left
mv a0,s2
mv a1,s3
jal clz
lw ra,0(sp)
addi sp,sp,4 #pop
srli a0,a0,3 #clz(y)>>3
jr ra
clz:
#x |= (x >> 1)
andi t1,a1,0x1
srli s4,a1,1
srli s5,a0,1
slli t1,t1,31
or s5,s5,t1
or a1,s4,a1
or a0,s5,a0
#x |= (x >> 2)
andi t1,a1,0x3
srli s4,a1,2
srli s5,a0,2
slli t1,t1,30
or s5,s5,t1
or a1,s4,a1
or a0,s5,a0
#x |= (x >> 4)
andi t1,a1,0xf
srli s4,a1,4
srli s5,a0,4
slli t1,t1,28
or s5,s5,t1
or a1,s4,a1
or a0,s5,a0
#x |= (x >> 8)
andi t1,a1,0xff
srli s4,a1,8
srli s5,a0,8
slli t1,t1,24
or s5,s5,t1
or a1,s4,a1
or a0,s5,a0
#x |= (x >> 16)
li t1,0xffff
and t1,a1,t1
srli s4,a1,16
srli s5,a0,16
slli t1,t1,16
or s5,s5,t1
or a1,s4,a1
or a0,s5,a0
#x |= (x >> 32)
mv s5,a1
and s4,a1,x0
or a1,s4,a1
or a0,s5,a0
# x -= ((x >> 1) & 0x5555555555555555)
andi t1,a1,0x1
srli s4,a1,1
srli s5,a0,1
slli t1,t1,31
or s5,s5,t1
li t1,0x55555555
and s4,s4,t1
and s5,s5,t1
sub a1,a1,s4
sub a0,a0,s5
#x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333)
andi t1,a1,0x3
srli s4,a1,2
srli s5,a0,2
slli t1,t1,30
or s5,s5,t1
li t1,0x33333333
and s4,s4,t1 #s4=>a1
and s5,s5,t1 #s5=>a0
and a1,a1,t1
and a0,a0,t1
add a1,a1,s4
add a0,a0,s5
#x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f;
andi t1,a1,0xf
srli s4,a1,4
srli s5,a0,4
slli t1,t1,28
or s5,s5,t1
add s4,s4,a1
add s5,s5,a0
li t1,0x0f0f0f0f
and a1,s4,t1
and a0,s5,t1
#x += (x >> 8)
andi t1,a1,0xff
srli s4,a1,8
srli s5,a0,8
slli t1,t1,24
or s5,s5,t1
add a1,a1,s4
add a0,a0,s5
#x += (x >> 16)
li t1,0xffff
and t1,t1,a1
srli s4,a1,16
srli s5,a0,16
slli t1,t1,16
or s5,s5,t1
add a1,a1,s4
add a0,a0,s5
#x += (x >> 32)
mv s5,a1
and s4,a1,x0
add a1,a1,s4
add a0,a0,s5
#64 - (x & 0x7f)
andi a0,a0,0x7f
li t1,64
sub a0,t1,a0
jr ra
```
**VersionⅡ** use multiple test cases by loops:
```code=
.data
test1: .dword 0x1122334455007700
test2: .dword 0x0123456789abcdef
test3: .dword 0x1100220033445566
str1: .string "The Leftmost 0-byte is "
endl: .string "\n"
.text
main:
la t2, test1 # t2 points to the first test case
li t3, 3 # number of test cases
loop:
lw a0, 0(t2) # a0:test_half_right
lw a1, 4(t2) # a1:test_half_left
jal zbytel
mv t4, a0 # save the result to t4
#print
la a0, str1
li a7, 4
ecall
mv a0, t4
li a7, 1
ecall
la a0, endl
li a7, 4
ecall
addi t2, t2, 8 # move to the next test case
addi t3, t3, -1 # test case counter--
bne t3, x0, loop # counter=0,break
li a7, 10 # exit
ecall
zbytel:
addi sp,sp,-4 #push
sw ra,0(sp)
mv s0,a0 #s0:test_half_right
mv s1,a1 #s1:test_half_left
#y = (x & 0x7F7F7F7F7F7F7F7F)+ 0x7F7F7F7F7F7F7F7F
li t0,0x7f7f7f7f
and s2,s0,t0
add s2,s2,t0
#y = ~(y | x |0x7F7F7F7F7F7F7F7F)
or s2,s2,s0
or s2,s2,t0
xori s2,s2,-1 #s2:y_half_right
and s3,s1,t0
add s3,s3,t0
or s3,s3,s0
or s3,s3,t0
xori s3,s3,-1 #s3:y_half_left
mv a0,s2
mv a1,s3
jal clz
lw ra,0(sp)
addi sp,sp,4 #pop
srli a0,a0,3 #clz(y)>>3
jr ra
clz:
#x |= (x >> 1)
andi t1,a1,0x1
srli s4,a1,1
srli s5,a0,1
slli t1,t1,31
or s5,s5,t1
or a1,s4,a1
or a0,s5,a0
#x |= (x >> 2)
andi t1,a1,0x3
srli s4,a1,2
srli s5,a0,2
slli t1,t1,30
or s5,s5,t1
or a1,s4,a1
or a0,s5,a0
#x |= (x >> 4)
andi t1,a1,0xf
srli s4,a1,4
srli s5,a0,4
slli t1,t1,28
or s5,s5,t1
or a1,s4,a1
or a0,s5,a0
#x |= (x >> 8)
andi t1,a1,0xff
srli s4,a1,8
srli s5,a0,8
slli t1,t1,24
or s5,s5,t1
or a1,s4,a1
or a0,s5,a0
#x |= (x >> 16)
li t1,0xffff
and t1,a1,t1
srli s4,a1,16
srli s5,a0,16
slli t1,t1,16
or s5,s5,t1
or a1,s4,a1
or a0,s5,a0
#x |= (x >> 32)
mv s5,a1
and s4,a1,x0
or a1,s4,a1
or a0,s5,a0
# x -= ((x >> 1) & 0x5555555555555555)
andi t1,a1,0x1
srli s4,a1,1
srli s5,a0,1
slli t1,t1,31
or s5,s5,t1
li t1,0x55555555
and s4,s4,t1
and s5,s5,t1
sub a1,a1,s4
sub a0,a0,s5
#x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333)
andi t1,a1,0x3
srli s4,a1,2
srli s5,a0,2
slli t1,t1,30
or s5,s5,t1
li t1,0x33333333
and s4,s4,t1
and s5,s5,t1
and a1,a1,t1
and a0,a0,t1
add a1,a1,s4
add a0,a0,s5
#x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f;
andi t1,a1,0xf
srli s4,a1,4
srli s5,a0,4
slli t1,t1,28
or s5,s5,t1
add s4,s4,a1
add s5,s5,a0
li t1,0x0f0f0f0f
and a1,s4,t1
and a0,s5,t1
#x += (x >> 8)
andi t1,a1,0xff
srli s4,a1,8
srli s5,a0,8
slli t1,t1,24
or s5,s5,t1
add a1,a1,s4
add a0,a0,s5
#x += (x >> 16)
li t1,0xffff
and t1,t1,a1
srli s4,a1,16
srli s5,a0,16
slli t1,t1,16
or s5,s5,t1
add a1,a1,s4
add a0,a0,s5
#x += (x >> 32)
mv s5,a1
and s4,a1,x0
add a1,a1,s4
add a0,a0,s5
#64 - (x & 0x7f)
andi a0,a0,0x7f
li t1,64
sub a0,t1,a0
jr ra
```
**Output**
<center class="left">
<img src="https://hackmd.io/_uploads/BkzpkwZZ6.png" width="200"/><img src="https://hackmd.io/_uploads/H1d0yP--a.png" width="200"/>
</center>
## Analysis & Pipeline
Generated from [Ripes](https://github.com/mortbopet/Ripes).
Below are a few lines of disassembled code:
```
00000000 <main>:
0: 10000397 auipc x7 0x10000
4: 00038393 addi x7 x7 0
8: 00300e13 addi x28 x0 3
0000000c <loop>:
c: 0003a503 lw x10 0 x7
10: 0043a583 lw x11 4 x7
14: 048000ef jal x1 72 <zbytel>
18: 00050e93 addi x29 x10 0
1c: 10000517 auipc x10 0x10000
20: ffc50513 addi x10 x10 -4
24: 00400893 addi x17 x0 4
28: 00000073 ecall
2c: 000e8513 addi x10 x29 0
30: 00100893 addi x17 x0 1
34: 00000073 ecall
38: 10000517 auipc x10 0x10000
3c: ff850513 addi x10 x10 -8
40: 00400893 addi x17 x0 4
44: 00000073 ecall
48: 00838393 addi x7 x7 8
4c: fffe0e13 addi x28 x28 -1
50: fa0e1ee3 bne x28 x0 -68 <loop>
54: 00a00893 addi x17 x0 10
58: 00000073 ecall
0000005c <zbytel>:
5c: ffc10113 addi x2 x2 -4
60: 00112023 sw x1 0 x2
64: 00050413 addi x8 x10 0
68: 00058493 addi x9 x11 0
6c: 7f7f82b7 lui x5 0x7f7f8
70: f7f28293 addi x5 x5 -129
74: 00547933 and x18 x8 x5
78: 00590933 add x18 x18 x5
7c: 00896933 or x18 x18 x8
80: 00596933 or x18 x18 x5
84: fff94913 xori x18 x18 -1
88: 0054f9b3 and x19 x9 x5
8c: 005989b3 add x19 x19 x5
90: 0089e9b3 or x19 x19 x8
94: 0059e9b3 or x19 x19 x5
98: fff9c993 xori x19 x19 -1
9c: 00090513 addi x10 x18 0
a0: 00098593 addi x11 x19 0
a4: 014000ef jal x1 20 <clz>
a8: 00012083 lw x1 0 x2
ac: 00410113 addi x2 x2 4
b0: 00355513 srli x10 x10 3
b4: 00008067 jalr x0 x1 0
```
Take the instruction ` jal x1 72 <zbytel> `,which address is at 0x14, and see how it works in pipeline:
### IF Stage

- First of all ,the PC is `0x14`,meaning that we are going to fetch instruction `jal x1 72 <zbytel>`.
- From Instr. memory ,we can get instruction machine code `0x048000ef` ,also is `jal x1 72 <zbytel>` will get into the next stage.
- PC= PC+4, if no bracnching occured ,where next instuction we're going to fetch at next cycle.
### ID Stage



- In this stage, the decoder decodes the machine code `0x048000ef`.
- `opcode`field is `1101111`,represnt `jal` instruction
- `rd` field is `00001` ,represent x1(ra)
- `imm` field is `00000100100000000000`
- imm[10:1]=`0000100100`,so immediate value is `0x00000048`
- instr[19:15]=`00000`,so R1 idx is `0x00`
- insrt[24:20]=`01000`,so R2 idx is `0x08`
### EXE Stage

- In this stage, the ALU sums the inputs `0x00000014`(the address) and `0x00000048` (the immediate),then get `0x0000005c` where **zbytel** at.
### MEM Stage

- Flushing 2 instructions(use nop) next 2 cycles.
- **IF** stage fetch the instruction at `0x0000005c`.
- The `jal` instruction doesn't involve memory access, so there are no memory-related operations in this stage.
### WB Stage

- Lastly, the processor writes `0x00000018` into x1(ra).
## Reference
1. [Hacker's Delight](https://doc.lagout.org/security/Hackers%20Delight.pdf)
2. [RISC-V Instruction Set Manual](https://riscv.org//wp-content/uploads/2017/05/riscv-spec-v2.2.pdf)
3. [RISC-V Instruction Format](https://blog.csdn.net/bxdzyhx/article/details/118057390)