owned this note
owned this note
Published
Linked with GitHub
# Assignment1: RISC-V Assembly and Instruction Pipeline
contributed by [`PoChuan994`](https://github.com/chuan0306)
## Project statement (use case)
### Finding first string of 1-bits of a given length by CLZ
- The topic of this project is to find the first consecutive string with a length of *n* or longer
- This topic is suitable for disk allocation, especially for disk compaction
## RISC-V implementation
### RISC-V assembly code - Version 2
```assembly=
.data
t1_u: .word 0x0f000000 # upper bits of test data 1, test_data1[0~31]
t1_l: .word 0x00000000 # lower bits of test data 2, test_data2[32~63]
t2_u: .word 0x00000000
t2_l: .word 0x00000000
t3_u: .word 0x01234567
t3_l: .word 0x89abcdef
.text
main:
# initial setting
la t0, t1_u # load address of upper bits of test data 1 into s0
la t1, t2_u
la t2, t3_u
addi sp, sp, -12
sw t0, 0(sp)
sw t1, 4(sp)
sw t2, 8(sp)
add s0, zero, t0 # s0 = & test_data_upper
add s1, zero, zero # int i (used for test data loop control)
addi s2, zero, 3 # upper bound of i (used for loop control)
addi s3, zero, 4
addi s4, zero, -1 # be used to do not operation
main_for_loop:
# call finding_string procedure
#mv a0, s0 # a0 = & test_data_1_upper
jal ra, fs
li a7, 1
ecall
li a0, 0
li a7, 11
ecall
addi s1, s1, 1
addi s0, s0, 8
blt s1, s2, -12
li a7, 11
ecall
j Exit
fs:
addi sp, sp, -16
sw ra, 0(sp)
sw s1, 4(sp)
sw s2, 8(sp)
sw s3, 12(sp)
#sw s4, 16(sp)
addi s1, s0, 4 # s1 = & test_date_lower
lw a1, 0(s0) # a1 = value of test_data upper
lw a2, 0(s1) # a2 = value of test_date lower, test_data = [a1, a2]
#li s2, 0 # s2 = clz = 0
li s2, 0 # s2 = pos = 0
x_equal_0_check:
bne a1, zero, x_not_equal_0
bne a2, zero, x_not_equal_0
x_eqaul_0:
addi a0, zero, -1
j fs_end
x_not_equal_0:
jal ra, CLZ
# x = x << clz
li t0, 32
sub t0, t0, a0
srl a4, a2, t0
sll a3, a1, a0
or a3, a3, a4 # a1 = a1 << clz
sll a4, a2, a0 # a2 = a2 << clz, x([a3, a4]) = x([a1, a2]) << clz
# pos = pos + clz
add s2, s2, a0
# x = -x, [a3, a4] = - [a1, a2]
xor a1, a3, s4
xor a2, a4, s4
jal ra, CLZ
# check: clz > n
bge a0, s3, 32
## < case
# x = x << clz
sub t0, t0, a0
srl a2, a4, t0
sll a1, a3, a0
or a1, a1, a2 # a1 = a3 << clz
sll a2, a4, a0 # a2 = a4 << clz, x([a1, a2]) = x([a3, a4]) << clz
# pos = pos + clz
add s2, s2, a0
j x_equal_0_check
## >= base
mv a0, s2
j fs_end
fs_end:
lw ra, 0(sp)
lw s1, 4(sp)
lw s2, 8(sp)
lw s3, 12(sp)
addi sp, sp, 16
jalr ra
CLZ:
addi sp, sp, -4
sw ra, 0(sp)
mv t0, a1
mv t1, a2
li t4, 0x55555555
li t5, 0x33333333
li t6, 0x0f0f0f0f
# x |= (x>>1);
srli t3, t1, 1 # shift lower bits of test data right with 1 bit
slli t2, t0, 31 # shift upper bits of test data left with 31 bits
or t3, t2, t3 # combine to get new lower bits of test data
srli t2, t0, 1 # shift upper bound of test data right with 1 bit
or t0, t0, t2 # [0~31]x | [0~31](x >> 1)
or t1, t1, t3 # [32~63]x | [32~63](x >> 1)
# x |= (x>>2);
srli t3, t1, 2
slli t2, t0, 30
or t3, t2, t3
srli t2, t0, 2
or t0, t0, t2
or t1, t1, t3
# x |= (x>>4);
srli t3, t1, 4
slli t2, t0, 28
or t3, t2, t3
srli t2, t0, 4
or t0, t0, t2
or t1, t1, t3
# x |= (x>>8);
srli t3, t1, 8
slli t2, t0, 24
or t3, t2, t3
srli t2, t0, 8
or t0, t0, t2
or t1, t1, t3
# x |= (x>>16);
srli t3, t1, 16
slli t2, t0, 16
or t3, t2, t3
srli t2, t0, 16
or t0, t0, t2
or t1, t1, t3
# x |= (x>>32)
li t2, 0
add t3, t0, zero
or t0, t0, t2
or t1, t1, t3
# x -= ((x>>1) & 0x5555555555555555)
## [t2, t3] = x>>1 ([t0, t1]>>1)
srli t3, t1, 1
slli t2, t0, 31
or t3, t2, t3
srli t2, t0, 1
## (x>>1) & 0x5~
and t2, t2, t4
and t3, t3, t4 # [t2, t3] = (x>>1)&0x5~
sub t3, t1, t3
blt t1, t3, 16 # if underflow then jump
add t1, t3, zero # t1=t3
sub t0, t0, t2 # no underflow at lower bits, [t0, t1]=> x -= ((x>>1) & 0x5555555555555555)
beq zero, zero, 12
addi t0, t0, -1 # underflow at lower bits
sub t0, t0, t2 #[t0, t1] => x -= ((x>>1) & 0x5555555555555555)
# x = ((x>>2)&0x333333333333333) + (x & 0x3333333333333333)
## [t2, t3] = x>>2 ([t0, t1]>>2)
srli t3, t1, 2
slli t2, t0, 30
or t3, t3, t2
srli t2, t0, 2 # [t2, t3] = x>>2
## (x>>1) & 0x3~
and t2, t2, t5
and t3, t3, t5 # [t2, t3] = ((x>>2)&0x3~)
## x & 0x3~
and t0, t0, t5
and t1, t1, t5 # [t0, t1] = (x & 0x3~)
add t1, t1, t3
add t0, t0, t2
## overflow detection (lower bits)
or t4, t1, zero
xor t4, s4, t4 # nor t5, t1, zero (t4 = ~(s4 | zero))
bgeu t4, t3, 8 # if no overflow then jump
addi t0, t0, 1 # if overflow upper bits plus 1
# x += ((x>>4)+x) & 0x0f~0f
## [t2, t3] = x>>4 ([t0, t1]>>4)
srli t3, t1, 4
slli t2, t0, 28
or t3, t3, t2
srli t2, t0, 4
## (x>>4) + x
add t1, t1, t3
add t0, t0, t2
## overflow detection (lower bits)
or t4, t1, zero
xor t4, s4, t4 # nor t5, t1, zero (t4 = ~(s4 | zero))
bgeu t4, t3, 8 # if no overflow then jump
addi t0, t0, 1 # if overflow upper bits plus 1
## ((x>>4) + x) & 0x0f~0f
and t0, t0, t6
and t1, t1, t6
# x += x(x>>8)
srli t3, t1, 8
slli t2, t0, 24
or t3, t3, t2
srli t2, t0, 8 # [t2, t3] = x>>8
add t0, t0, t2
add t1, t1, t3
## overflow detection
or t4, t1, zero
xor t4, s4, t4
bgeu t4, t3, 8
addi t0, t0, 1
# x += x(x>>16)
srli t3, t1, 16
slli t2, t0, 16
or t3, t3, t2
srli t2, t0, 16 # [t2, t3] = x>>8
add t0, t0, t2
add t1, t1, t3
## overflow detection
or t4, t1, zero
xor t4, s4, t4
bgeu t4, t3, 8
addi t0, t0, 1
# x += (x>>32)
add t3, t0, zero
add t2, zero, zero
add t0, t0, t2
add t1, t1, t3
## overflow detection
or t4, t1, zero
xor t4, s4, t4
bgeu t4, t3, 8
addi t0, t0, 1
# 64 - (x & (0x7f))
li t4, 0x7f
li a0, 64
and t1, t1, t4
sub a0, a0, t1
lw ra, 0(sp)
addi sp, sp, 4
jalr ra
Exit:
nop
```
### RISC-V assembly code - Version 1
```assembly=
.data
test_data_1: .dword 0x0123456789abc123
test_data_2: .dword 0x0111000000000000
test_data_3: .dword 0x0000000000f00000
.text
main:
# initial setting
addi a1, zero, 4 # int n
lui a5, 0xfffff
srai a5, a5, 12 # a5 = 0xffffffff
# test data setting
lui a2, 0x01234 # upper bits of test data 1
addi a2, a2, 0x567 # upper bits of test data 1
lui a3, 0x89abc # lower bits if test data 1
addi a3, a3, 0x123 # upper bits of test data 1
# for test
#lui a2, 0x0f000
#add a3, zero, zero
main_for_loop:
jal ra, finding_string # jump to find_string procedure
# tmp for test
add zero, zero, zero
# finding_string procedure
finding_string:
# preserved return address, test data
addi sp, sp, -12
sw ra, 0(sp)
sw a2, 4(sp)
sw a3, 8(sp)
# initial setting
add s0, zero, zero # int clz
add s1, zero, zero # int pos = 0
finding_string_while_loop:
# check whether x equals to zero
bne a2, zero, 24
bne a3, zero, 20
addi a0, zero, -1 # not found, return -1
addi sp, sp, -4
sw a0, 0(sp)
jalr ra # not found, return control to main
jal ra, CLZ
lw a0, 0(sp)
addi sp, sp, 4
add s0, a0, zero # s0 = clz
# x = x << clz
addi t3, zero, 32
sub t3, t3, s0
srl t3, a3, t3
sll t2, a2, s0
or s2, t2, t3 # s2 = [0~31] x << clz
sll s3, a3, s0 # s3 = [32~63] x << clz
# pos = pos + clz
add s1, s1, s0
# x = ~ x
xor a2, s2, a5
xor a3, s3, a5 # [a2, a3] = ~x
jal ra, CLZ
lw a0, 0(sp)
addi sp, sp, 4
add s0, a0, zero # s0 = clz
bge s0, a0, 36
# < case
addi t3, zero, 32
sub t3, t3, s0
srl t5, s3, t3
sll t4, s2, s0
or a2, t4, t5
sll a3, s3, s0
add s1, s1, s0
beq zero, zero, finding_string_while_loop
# >= case
lw ra, 0(sp)
lw a2, 0(sp)
lw a3, 0(sp)
addi sp, sp, 8
# addi sp, sp, -4
sw s1, 0(sp)
jalr ra
# counting_leading_zero procedure
CLZ:
addi sp, sp, -4
sw ra, 0(sp)
# x |= (x>>1)
srli t1, a3, 1 # shift lower bits of test data right with 1 bit
slli t0, a2, 31 # shift upper bits of test data left with 31 bits
or t1, t1, t0 # combine to get new lower bit of test data (after srl)
srli t0, a2, 1 # shift upper bits of test data right with 1 bit
or t0, t0, a2 # [0~31]x | [0~31](x>>1)
or t1, t1, a3 # [32~63]x | [32~63](x>>1)
# value of x is stored in t0, t1 ([0~31], [32~63])
# x |= (x>>2)
srli t3, t1, 2
slli t2, t0, 30
or t3, t3, t2
srli t2, t0, 2
or t0, t2, t0
or t1, t3, t1
# x |= (x>>4)
srli t3, t1, 4
slli t2, t0, 28
or t3, t3, t2
srli t2, t0, 4
or t0, t2, t0
or t1, t3, t1
# x |= (x>>8)
srli t3, t1, 8
slli t2, t0, 24
or t3, t3, t2
srli t2, t0, 8
or t0, t2, t0
or t1, t3, t1
# x |= (x>>16)
srli t3, t1, 16
slli t2, t0, 16
or t3, t3, t2
srli t2, t0, 16
or t0, t2, t0
or t1, t3, t1
# x |= (x>>32)
add t3, t0, zero
add t2, zero, zero
or t0, t0, t2
or t1, t1, t3
# x -= ((x>>1) & 0x5555555555555555)
srli t3, t1, 1
slli t2, t0, 31
or t3, t3, t2
srli t2, t0, 1 # [t2, t3] = (x>>1)
lui t4, 0x55555
addi t4, t4, 0x555 # t4=0x55555555
and t2, t2, t4
and t3, t3, t4 # [t2, t3] = (x>>1)&0x5555555555555555
sub t3, t1, t3
blt t1, t3, 16 # if underflow then jump
add t1, t3, zero # t1=t3
sub t0, t0, t2 # no underflow at lower bits, [t0, t1]=> x -= ((x>>1) & 0x5555555555555555)
beq zero, zero, 12
addi t0, t0, -1 # underflow at lower bits
sub t0, t0, t2 #[t0, t1] => x -= ((x>>1) & 0x5555555555555555)
# x = ((x>>2)&0x333333333333333) + (x & 0x3333333333333333)
srli t3, t1, 2
slli t2, t0, 30
or t3, t3, t2
srli t2, t0, 2 # [t2, t3] = x>>2
lui t4, 0x33333
addi t4, t4, 0x333 # t4=0x33333333
and t2, t2, t4
and t3, t3, t4 # [t2, t3] = ((x>>2)&0x333333333333333)
and t0, t0, t4
and t1, t1, t4 # [t0, t1] = (x&0x333333333333333)
add t1, t1, t3
add t0, t0, t2
## overflow detection (lower bits)
lui t4, 0xfffff
srai t4, t4, 12 # t4=0xf~f used for not operation (by xor)
### nor operation
or t5, t1, zero
xor t5, t4, t5 # nor t5, t1, zero (t5 = ~(t1 | zero))
bgeu t5, t3, 8 # if no overflow then jump
addi t0, t0, 1 # if overflow upper bits plus 1
# x += ((x>>4)+x) & 0x0f~0f
lui t4, 0x0f0f0
srli t5, t4, 16
or t4, t4, t5 # t4=0x0f~0f
srli t3, t1, 4
slli t2, t0, 28
or t3, t3, t2
srli t2, t0, 4 # [t2, t3] = x>>4
## (x>>4) + x
add t1, t1, t3
add t0, t0, t2
### overflow detection
lui t5, 0xfffff
srai t5, t5, 12 # t5=0xf~f used for not operation (by xor)
#### nor operation
or t6, t1, zero
xor t6, t5, t6 # nor a4, t1, zero (a4 = ~(t1 | zero))
bgeu t6, t3, 8
addi t0, t0, 1
## ((x>>4) + x) & 0x0f~0f
and t0, t0, t4
and t1, t1, t4
# x += x(x>>8)
srli t3, t1, 8
slli t2, t0, 24
or t3, t3, t2
srli t2, t0, 8 # [t2, t3] = x>>8
add t0, t0, t2
add t1, t1, t3
## overflow detection
lui t4, 0xfffff
srai t5, t5, 12
### nor operation
or t5, t1, zero
xor t5, t4, t5
bgeu t5, t3, 8
addi t0, t0, 1
# x += x(x>>16)
srli t3, t1, 16
slli t2, t0, 16
or t3, t3, t2
srli t2, t0, 16 # [t2, t3] = x>>8
add t0, t0, t2
add t1, t1, t3
## overflow detection
lui t4, 0xfffff
srai t5, t5, 12
### nor operation
or t5, t1, zero
xor t5, t4, t5
bgeu t5, t3, 8
addi t0, t0, 1
# x += (x>>32)
add t3, t0, zero
add t2, zero, zero
add t0, t0, t2
add t1, t1, t3
## overflow detection
lui t4, 0xfffff
srai t5, t5, 12
### nor operation
or t5, t1, zero
xor t5, t4, t5
bgeu t5, t3, 8
addi t0, t0, 1
# 64 - (x & (0x7f))
addi t4, zero, 0x7f
addi a0, zero, 64
and t1, t1, t4
sub a0, a0, t1
lw ra, 0(sp)
sw a0, 0(sp)
jalr ra
End:
add zero, zero, zero
```
### workflow of RISC-V implementation Version 2

## Analysis
<!-- Generated from [Ripes](https://github.com/mortbopet/Ripes) -->
- test data
- test data 1: 0x0f00000000000000
- test data 2: 0x0000000000000000
- test data 3: 0x0123456789abcdef
### Execution Info
- Version 1
- | Execution Info | [test data 1](https://hackmd.io/_uploads/Hki_jsX-p.png) | [test data 2](https://hackmd.io/_uploads/rk5knjmZa.png) |[test data 3](https://hackmd.io/_uploads/HJ8bhombp.png) |
| -------- | -------- | -------- |-------- |
| Clock cycles | 900 | 369 | 902 |
| Instr. count | 790 | 332 | 792 |
| CPI | 1.14 | 1.15 | 1.14 |
| IPC | 0.878 | 0.873 | 0.878 |
| Clock rate | 9.71 Hz | 10.64 Hz | 8.62 Hz |
- Version 2
- | Execution Info | [Version 2](https://hackmd.io/_uploads/S1U2f37W6.png) |
| -------- | -------- |
| Clock cycles | 2598 |
| Instr. count | 2246 |
| CPI | 2246 |
| IPC | 1.16 |
| Clock rate | 0.865 |
- total execution time
- `execution time` = `Instructions count` $\times$ `CPI` $\times$ `cycle time`
$\qquad\qquad\qquad\;\;$ = `Clock cycles` $\times$ `cycle time`
- execution time of V1 $= (900\;+\;+369\;+\;902) \times 100 =217100 ms$
- execution time of V2 $= 2589 \times 100 =258900 ms$
### Data cache hit ratio
- || Hit rate | Hits | Miss |
| -------- | -------- | -------- | -------- |
| [test data 1(V1)](https://hackmd.io/_uploads/S1U2f37W6.png) | 0.9444 | 34 | 2 |
| [test data 2(V1)](https://hackmd.io/_uploads/rkr172Q-6.png) | 0.7727 | 17 | 5 |
| [test data 3(V1)](https://hackmd.io/_uploads/H11mmnmZ6.png) | 0.9444 | 34 | 2 |
| [version 2](https://hackmd.io/_uploads/By0QQ2QWp.png) | 0.942 | 65 | 4 |
### Instruction cache hit ratio
- || Hit rate | Hits | Miss |
| -------- | -------- | -------- | -------- |
| [test data 1(V1)](https://hackmd.io/_uploads/BJDTV2QW6.png) | 0.9032 | 813 | 88 |
| [test data 2(V1)](https://hackmd.io/_uploads/SJBgB2QWT.png) | 0.8351 | 309 | 61 |
| [test data 3(V1)](https://hackmd.io/_uploads/Byt4BhQ-T.png) | 0.8915 | 805 | 908 |
| [version 2](https://hackmd.io/_uploads/H1trS2mZ6.png) | 0.928 | 2412 | 187 |
### **Performance analysis**
- Even though the execution time of Version 2 (V2) is longer than Version 1 (V1), V1 only calculates one set of test data at a time, while **V2 can iteratively calculate three sets of test data without requiring manual data changes**
- **V2 exhibits better overall performance than Version 1 in terms of data cache hit ratio**
- **V2 has a significantly higher instruction cache hit ratio than V1**
- In this work, the overflow detection is indeed implemented, so **there will be no overflow**.
## Pipeline stage analysis
### IF stage
- The first instruction exists at the location of memory `0x00000000`.
- The ALU adds 4 to the PC(current instruction address) to calculate PC+4 (the memory address of the next instruction), which is `0x00000004`.
- and the PC+4 will be used to fetch next instruction, if no `branch` or `jump` instructions occured.
- The instruction stored in memory address `0x00000000` is `auipc x5 0x10000`, and it's corresponding opcode is `0x10000297` iii(hexadecimal notation in `0x10000297`).


### ID stage
- By decoding the instruction (`0x00400593`),
- Instruction type is `U-type` (`Upper immediate` instruction).
- The `Immediate` is 0x10000000.

### EXE stage
- The `operand 1` comes from register `x0`, its value is `0x0000000`
- The `operand 2` comes from `immediate` value, which is `0x10000000`
- The output value is `0x10000000`
- There is no `branch` condition

### MEM stage
- Since the instruction type is `U-type`, there is no data transfer operation (memory access), so the `Wr en` (Write Enable) signal is set to zero

### WB stage
- The value to be written to the register is `0x04`, and it comes from the result of the ALU calculation (`EXE stage`)
- The target register is `0x05`
- 
## Memory analysis
- 
- | address | data | used for |
| -------- | -------- | -------- |
| 0x7fffffec ~ <br/> 0x7fffffe4 | 0x10000010 <br/> 0x10000008 <br/> 0x10000000 | Store the pointer to test data 1~3 |
| 0x7fffffe0 ~ <br/> 0x7fffff4 | 0x00000004 <br/> 0x00000003 <br/> 0x00000000 <br/> 0x00000040 |s3 (`n=4`)<br/>s2 (`3`)<br/> s1 (`i=0`)<br/> `ra` (back to main_loop block) |
| 0x7fffffd0 | 0x00000088 | `ra` (back to x_not_equal_0) |
- 
- When the CLZ block (`CLZ procedure`) is completed, `ra` (return address) is restored from stack.
- The `CLZ procedure` can return to the `finding_string procedure`.
## Reference
- [COMPUTER ORGANIZATION AND DESIGN - The Hardware/Software Interface:RISC-V Edition](chrome-extension://bocbaocobfecmglnmeaeppambideimao/pdf/viewer.html?file=https%3A%2F%2Fwww.cs.sfu.ca%2F~ashriram%2FCourses%2FCS295%2Fassets%2Fbooks%2FHandP_RISCV.pdf)
- [Hacker’s Delight](chrome-extension://bocbaocobfecmglnmeaeppambideimao/pdf/viewer.html?file=https%3A%2F%2Fdoc.lagout.org%2Fsecurity%2FHackers%2520Delight.pdf)
- [RISC-V指令集架構介紹 - Integer Calling convention](https://tclin914.github.io/77838749/)
- [RISC-V Syscall 系列 2:Syscall 過程分析](https://tinylab.org/riscv-syscall-part2-procedure/)
- [Risc-V Assembly Language Hello World](https://smist08.wordpress.com/2019/09/07/risc-v-assembly-language-hello-world/)
<!--
Excution Info of
test data 1: 
test data 2: 
test data 3: 
version 2:
Data cache of
test data 1: 
test data 2:
test data 3:
V2:
Instruction cache of
test data 1: 
test data 2:
test data 3:
v2 
-->