# Assignment1: RISC-V Assembly and Instruction Pipeline
contributed by< `p76131505` >
## fabsf
```c
static inline float fabsf(float x) {
uint32_t i = *(uint32_t *)&x; // Read the bits of the float into an integer
i &= 0x7FFFFFFF; // Clear the sign bit to get the absolute value
x = *(float *)&i; // Write the modified bits back into the float
return x;
}
```
:::danger
Don't paste code snip without comprehensive discussions.
:::
```c
.data
num0: .word 0x0f000000
num1: .word 0xff000000
num2: .word 0xf0f0f0f0
endl: .string "\n"
.text
la t0, num0 # x
li t1, 3
fabsf:
lw t2, 0(t0)
addi a0, t2, 0
li a7, 2
ecall
la a0, endl
li a7, 4
ecall
addi a0, t2, 0
li t3, 0x7fffffff
and a0, a0, t3
li a7, 2
ecall
la a0, endl
li a7, 4
ecall
addi t0, t0, 4 # move to the next test case
addi t1, t1, -1 # test case counter--
bne t1, zero, fabsf
li a7, 10
ecall
```
## my_clz
```c
static inline int my_clz(uint32_t x) {
int count = 0;
for (int i = 31; i >= 0; --i) {
if (x & (1U << i))
break;
count++;
}
return count;
}
```
```c
.data
num0: .word 0x0f000000
num1: .word 0xff000000
num2: .word 1
endl: .string "\n"
.text
la t0, num0 # x
li t1, 3
my_clz:
lw t2, 0(t0)
li a0, 0 # count=0
li s1, 31 # i=31
loop:
li s2, 1 # s2=1U
sll s3, s2, s1 # s3=1U<<i
and s4, t2, s3 # x&(1U<<i)
bne s4, zero, end
addi a0, a0, 1 # count++
addi s1, s1, -1 # --i
bge s1, zero, loop
end:
li a7, 1
ecall
la a0, endl
li a7, 4
ecall
addi t0, t0, 4 # move to the next test case
addi t1, t1, -1 # test case counter--
bne t1, zero, my_clz
li a7, 10
ecall
```
## fp16_to_fp32
```c
static inline uint32_t fp16_to_fp32(uint16_t h) {
const uint32_t w = (uint32_t) h << 16;
const uint32_t sign = w & UINT32_C(0x80000000);
const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
uint32_t renorm_shift = my_clz(nonsign);
renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0;
const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) &
INT32_C(0x7F800000);
const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31;
return sign | ((((nonsign << renorm_shift >> 3) +
((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask);
}
```
```c
.data
num0: .word 100
.text
la t0, num0
lw t0, 0(t0) # h
slli t0, t0, 16 # w = h << 16
li t1, 0x80000000
and t1, t0, t1 # sign = w & UINT32_C(0x80000000)
li t2, 0x7fffffff
and t2, t0, t2 # nonsign = w & UINT32_C(0x7FFFFFFF)
my_clz:
li a0, 0 # count=0
li s1, 31 # i=31
loop:
li s2, 1 # s2=1U
sll s3, s2, s1 # s3=1U<<i
and s4, t2, s3 # x&(1U<<i)
bne s4, zero, clz_ret
addi a0, a0, 1 # count++
addi s1, s1, -1 # --i
bge s1, zero, loop
clz_ret:
li t3, 5
bgt a0, t3, g
li a0, 0
j mask
g:
addi a0, a0, -5
mask:
li t4, 0x04000000
add t4, t2, t4 # t4 = nonsign + 0x04000000
srli t4, t4, 8 # t4 >> 8
li t5, 0x7f800000
and t4, t4, t5 # inf_nan_mask
addi t6, t2, -1
srli t6, t6, 31 # zero_mask
sll t2, t2, a0
srli t2, t2, 3 # t2 = non << renorm_shift >> 3
sub a0, zero, a0
addi a0, a0, 0x70
slli a0, a0, 23 # a0 = 0x70 - renorm_shift << 23
add a0, t2, a0
or a0, a0, t4
xori t6, t6, -1
and a0, a0, t6
or a0, t1, a0
li a7,34
ecall
```
## Selected Problems
:::danger
What inspired you from Quiz 1, as the assignment asks you to use these inspirations to improve the existing problems?
:::
*using ```my_clz``` to calculate how many leading zeros are in ```num```, then subtracting 32 to determine how many bits need to alternate between 1 and 0.
[Leetcode 693. Binary Number with Alternating Bits](https://leetcode.com/problems/binary-number-with-alternating-bits/description/)
Given a positive integer, check whether it has alternating bits: namely, if two adjacent bits will always have different values.
Example 1:
Input: n = 5
Output: true
Explanation: The binary representation of 5 is: 101
Example 2:
Input: n = 7
Output: false
Explanation: The binary representation of 7 is: 111.
Example 3:
Input: n = 11
Output: false
Explanation: The binary representation of 11 is: 1011.
## Solution
```c
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
static inline int my_clz(uint32_t x) {
int count = 0;
for (int i = 31; i >= 0; --i) {
if (x & (1U << i))
break;
count++;
}
return count;
}
bool hasAlternatingBits(uint32_t n) {
int totalBits = 32 - my_clz(n);
for (int i = 0; i < totalBits - 1; i++) {
if (((n >> i) & 1) == ((n >> (i + 1)) & 1)) {
return false;
}
}
return true;
}
int main() {
uint32_t n = 5;
for(int i=0;i<5;i++)
printf("%s\n", hasAlternatingBits(i)?"true":"false");
return 0;
}
```
## Use my_clz to solve this problem(version1)
```
.data
n0: .word 0b10101010101011
n1: .word 0b11111000000000
n2: .word 0b10101010101000
endl: .string "\n"
f: .string "false\n"
t: .string "true\n"
.text
la t0, n0
li t1, 3
my_clz:
lw t2, 0(t0)
li a0, 0 # count=0
li s1, 31 # i=31
li s0, 0 # bit i
loop:
li s2, 1 # s2=1U
sll s3, s2, s1 # s3=1U<<i
and s4, t2, s3 # x&(1U<<i)
bne s4, zero, hasAlternatingBits
addi a0, a0, 1 # count++
addi s1, s1, -1 # --i
bge s1, zero, loop
hasAlternatingBits:
li s5, 31
sub s5, s5, a0 # total=31-a0
ble s5, zero, tend
bitloop:
srl s6, t2, s0 # s6=n>>i
addi s11, s0, 1
srl s7, t2, s11 # s7=n>>(i+1)
andi s8, s6, 1
andi s9, s7, 1
beq s8, s9, end
addi s0, s0, 1 # i++
blt s0, s5, bitloop
tend:
mv a0, t2
li a7, 1
ecall
la a0, t
li a7, 4
ecall
addi t0, t0, 4
addi t1, t1, -1
bne t1, zero, my_clz
li a7, 10
ecall
end:
mv a0, t2
li a7, 1
ecall
la a0, f
li a7, 4
ecall
addi t0, t0, 4
addi t1, t1, -1
bne t1, zero, my_clz
li a7, 10
ecall
```
## modify(using unrolling in my_clz)(version2)
```
.data
n0: .word 5
n1: .word 7
n2: .word 11
endl: .string "\n"
f: .string "false\n"
t: .string "true\n"
.text
la t0, n0
li t1, 3
my_clz:
lw t2, 0(t0)
li a0, 0 # count=0
li s1, 31 # i=31
li s0, 0 # bit i
loop:
li s2, 1 # s2=1U
sll s3, s2, s1 # s3=1U<<i
and s4, t2, s3 # x&(1U<<i)
bne s4, zero, hasAlternatingBits
addi a0, a0, 1 # count++
addi s1, s1, -1 # --i
sll s3, s2, s1 # s3=1U<<i
and s4, t2, s3 # x&(1U<<i)
bne s4, zero, hasAlternatingBits
addi a0, a0, 1 # count++
addi s1, s1, -1 # --i
sll s3, s2, s1 # s3=1U<<i
and s4, t2, s3 # x&(1U<<i)
bne s4, zero, hasAlternatingBits
addi a0, a0, 1 # count++
addi s1, s1, -1 # --isll s3, s2, s1 # s3=1U<<i
and s4, t2, s3 # x&(1U<<i)
bne s4, zero, hasAlternatingBits
addi a0, a0, 1 # count++
addi s1, s1, -1 # --i
bge s1, zero, loop
hasAlternatingBits:
li s5, 31
sub s5, s5, a0 # total=31-a0
ble s5, zero, tend
bitloop:
srl s6, t2, s0 # s6=n>>i
addi s11, s0, 1
srl s7, t2, s11 # s7=n>>(i+1)
andi s8, s6, 1
andi s9, s7, 1
beq s8, s9, end
addi s0, s0, 1 # i++
blt s0, s5, bitloop
tend:
mv a0, t2
li a7, 1
ecall
la a0, t
li a7, 4
ecall
addi t0, t0, 4
addi t1, t1, -1
bne t1, zero, my_clz
li a7, 10
ecall
end:
mv a0, t2
li a7, 1
ecall
la a0, f
li a7, 4
ecall
addi t0, t0, 4
addi t1, t1, -1
bne t1, zero, my_clz
li a7, 10
ecall
```
(version1)

(version2)

### analysis
*By using loop unrolling, the need to check the condition for jumping back to the loop is reduced, which decreases the number of cycles
## Ripes Simulator

using 5-stage pipelined process in pipes
```
0: 10000297 auipc x5 0x10000
4: 00028293 addi x5 x5 0
8: 00300313 addi x6 x0 3
0000000c <my_clz>:
c: 0002a383 lw x7 0 x5
10: 00000513 addi x10 x0 0
14: 01f00493 addi x9 x0 31
18: 00000413 addi x8 x0 0
0000001c <loop>:
1c: 00100913 addi x18 x0 1
20: 009919b3 sll x19 x18 x9
24: 0133fa33 and x20 x7 x19
28: 040a1463 bne x20 x0 72 <hasAlternatingBits>
2c: 00150513 addi x10 x10 1
30: fff48493 addi x9 x9 -1
34: 009919b3 sll x19 x18 x9
38: 0133fa33 and x20 x7 x19
3c: 020a1a63 bne x20 x0 52 <hasAlternatingBits>
40: 00150513 addi x10 x10 1
44: fff48493 addi x9 x9 -1
48: 009919b3 sll x19 x18 x9
4c: 0133fa33 and x20 x7 x19
50: 020a1063 bne x20 x0 32 <hasAlternatingBits>
54: 00150513 addi x10 x10 1
58: fff48493 addi x9 x9 -1
5c: 0133fa33 and x20 x7 x19
60: 000a1863 bne x20 x0 16 <hasAlternatingBits>
64: 00150513 addi x10 x10 1
68: fff48493 addi x9 x9 -1
6c: fa04d8e3 bge x9 x0 -80 <loop>
00000070 <hasAlternatingBits>:
70: 01f00a93 addi x21 x0 31
74: 40aa8ab3 sub x21 x21 x10
78: 03505263 bge x0 x21 36 <tend>
0000007c <bitloop>:
7c: 0083db33 srl x22 x7 x8
80: 00140d93 addi x27 x8 1
84: 01b3dbb3 srl x23 x7 x27
88: 001b7c13 andi x24 x22 1
8c: 001bfc93 andi x25 x23 1
90: 039c0e63 beq x24 x25 60 <end>
94: 00140413 addi x8 x8 1
98: ff5442e3 blt x8 x21 -28 <bitloop>
0000009c <tend>:
9c: 00038513 addi x10 x7 0
a0: 00100893 addi x17 x0 1
a4: 00000073 ecall
a8: 10000517 auipc x10 0x10000
ac: f6d50513 addi x10 x10 -147
b0: 00400893 addi x17 x0 4
b4: 00000073 ecall
b8: 00428293 addi x5 x5 4
bc: fff30313 addi x6 x6 -1
c0: f40316e3 bne x6 x0 -180 <my_clz>
c4: 00a00893 addi x17 x0 10
c8: 00000073 ecall
000000cc <end>:
cc: 00038513 addi x10 x7 0
d0: 00100893 addi x17 x0 1
d4: 00000073 ecall
d8: 10000517 auipc x10 0x10000
dc: f3650513 addi x10 x10 -202
e0: 00400893 addi x17 x0 4
e4: 00000073 ecall
e8: 00428293 addi x5 x5 4
ec: fff30313 addi x6 x6 -1
f0: f0031ee3 bne x6 x0 -228 <my_clz>
f4: 00a00893 addi x17 x0 10
f8: 00000073 ecall
```
take ```70: 01f00a93 addi x21,x0,31```for example in this 5-stage pipelined section.
***
#### IF

*The program counter updates the value of the instruction each time to fetch a new instruction.
*The current instruction memory address is ```0x00000048```, and the instruction read from that address is ```0x009919b3```.
*Since the current instruction memory address is 0x0000004c, and most instructions in many architectures (such as RISC-V) are 4 bytes (32 bits) long, the next instruction address will typically be 0x0000004c, which is 4 bytes ahead of the current one.
***
#### ID

*decode the instruction ```0x009919b3``` into binary ```0000000 01001 10010 001 10011 0110011```
funct7 = ```0000000```
rs2 = ```01001```
rs1 = ```10010```
funct3 = ```001```
rd = ```10011```
opcode = ```0110011```
*The instruction ```0x009919b3``` has been decoded as ```sll x19,x18,x9```
***
#### EXE

*In the EXE (execute) stage, the value in register ```x18``` is shifted left by the number of positions specified in the ```x9``` register, and the result is stored back in register ```x19```.
***
#### MEM

*In the MEM (memory) stage, it checks whether to write to or read from memory. For the ```sll``` instruction, there is no need for reading from or writing to memory.
***
#### WB

*In the WB (write-back) stage, the data is written back to register ```x19```