# Assignment1: RISC-V Assembly and Instruction Pipeline
## Problem `B`
### C code
```c
/*
 * Widen a bf16 value to float: the 16 stored bits become the upper half of
 * a 32-bit word, the lower half is zero, and that word is reinterpreted as
 * a float through a union.
 */
static inline float bf16_to_fp32(bf16_t h)
{
    union {
        uint32_t word;
        float value;
    } pun;

    pun.word = (uint32_t)h.bits << 16;
    return pun.value;
}
/*
 * Narrow a float to bf16 by keeping the upper 16 bits of its bit pattern.
 * NaN inputs keep their upper bits and get bit 6 set (forced quiet); every
 * other input is rounded to nearest, ties to even, by adding 0x7fff plus
 * the lowest surviving bit before the truncation.
 */
static inline bf16_t fp32_to_bf16(float s)
{
    union {
        uint32_t word;
        float value;
    } pun;
    bf16_t result;

    pun.value = s;
    const uint32_t word = pun.word;

    if ((word & 0x7fffffff) > 0x7f800000) {
        /* NaN: exponent all ones with a non-zero fraction */
        result.bits = (word >> 16) | 64;
    } else {
        const uint32_t keep_lsb = (word >> 16) & 1;
        result.bits = (word + 0x7fff + keep_lsb) >> 16;
    }
    return result;
}
```
### Assembly code
#### bf16_to_fp32
```c
# bf16_to_fp32: turn the bf16 bit pattern in t0 into a float bit pattern in t1.
# Mirrors the C expression (uint32_t)h.bits << 16; t0 is left unchanged.
bf16_to_fp32:
slli t1, t0, 16 # bf16 bits move to the upper half, the lower 16 bits become 0
ret
```
##### test with Ripes
[full code in GitHub](https://github.com/Eric-liau/Computer-Archicture-hw1/blob/main/bf16_to_fp32.s)

#### fp32_to_bf16
**Ver1**
```c
# fp32_to_bf16 (Ver1): convert the float bit pattern in t0 to a bf16 bit
# pattern in t1. Overwrites t1-t4; t0 is left unchanged.
fp32_to_bf16:
# NaN detect: with the sign bit cleared, a NaN is any value above 0x7f800000
li t1, 0x7fffffff
li t2, 0x7f800000
and t3, t0, t1 # t3 = input without its sign bit
slt t4, t2, t3 # t4 = 1 when t3 > 0x7f800000
beqz t4, notNaN # taken for every input that is not a NaN
srli t1, t0, 16 # NaN: keep the upper 16 bits
ori t1, t1, 64 # set bit 6 (force to quiet, as in the C code)
ret
notNaN:
srli t1, t0, 16
andi t1, t1, 1 # t1 = bit 16 of the input, the lowest bit that is kept
li t2, 0x7fff
add t1, t1, t2 # t1 = 0x7fff + that bit (rounding bias)
add t1, t0, t1 # add the bias to the full 32-bit pattern
srli t1, t1, 16 # keep the upper 16 bits as the bf16 result
ret
```
#### Execution information

**Ver2**
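Since most floating-point numbers are not NaN, I made the NaN case the one that branches: a single `blt t2, t3, NaN` replaces the `slt`/`beqz` pair, so the common non-NaN path falls straight through without a taken branch and executes one instruction fewer, which reduces the number of cycles.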
```c
# fp32_to_bf16 (Ver2): convert the float bit pattern in t0 to a bf16 bit
# pattern in t1. Overwrites t1-t3; t0 is left unchanged.
fp32_to_bf16:
# NaN detect: with the sign bit cleared, a NaN is any value above 0x7f800000
li t1, 0x7fffffff
li t2, 0x7f800000
and t3, t0, t1 # t3 = input without its sign bit
blt t2, t3, NaN # only NaN inputs take this branch
notNaN:
srli t1, t0, 16
andi t1, t1, 1 # t1 = bit 16 of the input, the lowest bit that is kept
li t2, 0x7fff
add t1, t1, t2 # t1 = 0x7fff + that bit (rounding bias)
add t1, t0, t1 # add the bias to the full 32-bit pattern
srli t1, t1, 16 # keep the upper 16 bits as the bf16 result
ret
NaN:
srli t1, t0, 16 # NaN: keep the upper 16 bits
ori t1, t1, 64 # set bit 6 (force to quiet, as in the C code)
ret
```
#### Execution information

##### test with Ripes
[full code in GitHub](https://github.com/Eric-liau/Computer-Archicture-hw1/blob/main/fp32_to_bf16.s)

## [Leetcode 2469. Convert the Temperature](https://leetcode.com/problems/convert-the-temperature/description/)
The computations use bf16 instead of float, so we need addition and multiplication routines that operate directly on the bf16 datatype.
### C code
#### my_clz
modified from [2023 Quiz1](https://hackmd.io/@sysprog/arch2023-quiz1-sol) Problem A
```c
/*
 * Count the leading zero bits of the 16-bit pattern in x (16 for x == 0).
 * The highest set bit is first smeared into every lower position; the
 * number of set bits is then counted and subtracted from 16.
 */
int my_clz(short x)
{
    /* operate on an unsigned copy of the 16-bit pattern */
    unsigned int v = (unsigned short)x;

    /* smear: v |= v >> 1, 2, 4, 8 */
    for (unsigned int step = 1; step <= 8; step <<= 1) {
        v |= v >> step;
    }

    /* population count of the low 16 bits */
    v -= (v >> 1) & 0x5555u;
    v = ((v >> 2) & 0x3333u) + (v & 0x3333u);
    v = ((v >> 4) + v) & 0x0f0fu;
    v += v >> 8;

    return 16 - (int)(v & 0x7fu);
}
```
#### bf16_add
```c
/*
 * Add two bf16 values given as raw 16-bit patterns and return the raw
 * 16-bit pattern of the sum. Bits shifted out during alignment or
 * normalization are truncated (no rounding).
 *
 * Operands are ordered by magnitude (exponent and fraction together), so
 * the fraction subtraction can never go negative and the result always
 * takes the sign of the larger operand.
 *
 * Limitations: an operand whose exponent field is 0 but whose fraction is
 * not (subnormal) is treated as if it had an implicit leading 1, and
 * Inf/NaN inputs are not propagated. A result exponent of 255 or more
 * saturates to infinity; a result exponent of 0 or less becomes a signed
 * zero.
 */
short bf16_add(short a, short b){
    /* work on unsigned copies: shifting negative signed values is not portable */
    unsigned short hi = (unsigned short)a;
    unsigned short lo = (unsigned short)b;

    /* make hi the operand with the larger magnitude */
    if ((hi & 0x7fff) < (lo & 0x7fff)) {
        unsigned short tmp = hi;
        hi = lo;
        lo = tmp;
    }

    /* x + 0 == x; without this, zero would be given an implicit leading 1 */
    if ((lo & 0x7fff) == 0) {
        return (short)hi;
    }

    unsigned short rst_sign = hi >> 15;
    int rst_exp = (hi >> 7) & 0xff;
    int sft = rst_exp - ((lo >> 7) & 0xff);
    unsigned short hi_fra = (hi & 127) | 128;
    unsigned short lo_fra = (lo & 127) | 128;

    /* the fraction has 8 bits: a gap of 8 or more shifts it out entirely */
    lo_fra = (sft < 8) ? (unsigned short)(lo_fra >> sft) : 0;

    unsigned short rst_fra;
    if ((hi ^ lo) >> 15) {
        rst_fra = hi_fra - lo_fra;
    } else {
        rst_fra = hi_fra + lo_fra;
    }

    /* equal magnitudes with opposite signs cancel exactly */
    if (rst_fra == 0) {
        return 0;
    }

    //normalize: bring the leading 1 back to bit 7
    int lz = my_clz((short)rst_fra);
    if (lz <= 8) {
        lz = 8 - lz;
        rst_fra >>= lz;
        rst_exp += lz;
    } else {
        lz -= 8;
        rst_fra <<= lz;
        rst_exp -= lz;
    }

    if (rst_exp <= 0) {
        return (short)(rst_sign << 15);            /* underflow: signed zero */
    }
    if (rst_exp >= 255) {
        return (short)((rst_sign << 15) | 0x7f80); /* overflow: infinity */
    }

    /* drop the implicit leading 1 and pack sign | exponent | fraction */
    return (short)((rst_sign << 15) | (rst_exp << 7) | (rst_fra - 128));
}
```
#### bf16_mul
```c
/*
 * Multiply two bf16 values given as raw 16-bit patterns and return the raw
 * 16-bit pattern of the product. Low product bits are truncated (no
 * rounding).
 *
 * A zero operand yields a zero carrying the product's sign; previously zero
 * was given an implicit leading 1 and the exponent could wrap into the
 * sign bit. A result exponent of 255 or more saturates to infinity; a
 * result exponent of 0 or less becomes a signed zero.
 *
 * Limitations: subnormal inputs are treated as if they had an implicit
 * leading 1, and Inf/NaN inputs are not propagated.
 */
short bf16_mul(short a, short b){
    /* work on unsigned copies: shifting negative signed values is not portable */
    unsigned short ua = (unsigned short)a;
    unsigned short ub = (unsigned short)b;
    unsigned short sign = (unsigned short)((ua ^ ub) & 0x8000);

    if ((ua & 0x7fff) == 0 || (ub & 0x7fff) == 0) {
        return (short)sign;
    }

    int exp = ((ua >> 7) & 0xff) + ((ub >> 7) & 0xff) - 127;

    /* 8-bit x 8-bit fractions; the product is at most 255 * 255 */
    unsigned int a_fra = (ua & 127u) | 128u;
    unsigned int b_fra = (ub & 127u) | 128u;
    unsigned short fra = (unsigned short)((a_fra * b_fra) >> 7);

    //normalize: fra is in [128, 508], so at most one right shift is needed
    int lz = 8 - my_clz((short)fra);
    fra >>= lz;
    exp += lz;

    if (exp <= 0) {
        return (short)sign;              /* underflow: signed zero */
    }
    if (exp >= 255) {
        return (short)(sign | 0x7f80);   /* overflow: infinity */
    }

    /* drop the implicit leading 1 and pack sign | exponent | fraction */
    return (short)(sign | (exp << 7) | (fra - 128));
}
```
#### convert_temperature
```c
unsigned short convert_temperature_kel(float celsius){
float num = 273.15;
unsigned short num_bf16 = fp32_to_bf16(num);
unsigned short celsius_bf16 = fp32_to_bf16(celsius);
unsigned short kelvin_bf16 = bf16_add(celsius_bf16, num_bf16);
return kelvin_bf16;
}
unsigned short convert_temperature_fah(float celsius){
float num1 = 1.8, num2 = 32;
unsigned short num1_bf16 = fp32_to_bf16(num1);
unsigned short num2_bf16 = fp32_to_bf16(num2);
unsigned short celsius_bf16 = fp32_to_bf16(celsius);
unsigned short fahrenheit_bf16 = bf16_mul(celsius_bf16, num1_bf16);
fahrenheit_bf16 = bf16_add(fahrenheit_bf16, num2_bf16);
return fahrenheit_bf16;
}
```
### Assembly code
#### my_clz
```c
# clz: count the leading zeros of the 16-bit value in t0; result in t1.
# t0 and t2 are overwritten as scratch, so a caller that still needs the
# values it keeps in those registers must save them before the call.
# NOTE(review): the srli steps shift any bits above bit 15 down into the low
# half, so the input is presumably expected to be zero-extended - confirm.
clz:
# x |= (x >> 1)
srli t1, t0, 1
or t0, t0, t1
# x |= (x >> 2)
srli t1, t0, 2
or t0, t0, t1
# x |= (x >> 4)
srli t1, t0, 4
or t0, t0, t1
# x |= (x >> 8)
srli t1, t0, 8
or t0, t0, t1
# x -= ((x >> 1) & 0x5555)
srli t1, t0, 1
li t2, 0x5555
and t1, t1, t2
sub t0, t0, t1
# x = ((x >> 2) & 0x3333) + (x & 0x3333)
srli t1, t0, 2
li t2, 0x3333
and t1, t1, t2
and t0, t0, t2
add t0, t0, t1
# x = ((x >> 4) + x) & 0x0f0f
srli t1, t0, 4
add t0, t0, t1
li t2, 0x0f0f
and t0, t0, t2
# x += (x >> 8)
srli t1, t0, 8
add t0, t0, t1
# return (16 - (x & 0x7f)), computed as ~x + 17 because ~x == -x - 1
andi t0, t0, 0x7f
xori t0, t0, -1
addi t1, t0, 17
ret
```
##### test with Ripes
[full code in GitHub](https://github.com/Eric-liau/Computer-Archicture-hw1/blob/main/my_clz.s)

#### bf16_add
```c
# bf16_add: add the bf16 bit patterns in t0 and t1; result in t1.
# Overwrites t0-t6. Bits shifted out are truncated (no rounding).
#
# Fixes over the previous version:
#  - clz overwrites t0 and t2, so fra/exp/sign are kept on the stack
#    across the call instead of being left in registers;
#  - operands are ordered by magnitude (exponent and fraction), so equal
#    exponents with opposite signs no longer produce a negative fraction;
#  - srl only uses the low 5 bits of its shift amount, so an exponent gap
#    of 8 or more now clears the smaller fraction explicitly;
#  - x + 0 and exact cancellation return x and 0 respectively;
#  - a result exponent <= 0 gives a signed zero, >= 255 gives infinity.
# Subnormal, Inf and NaN inputs are still not handled specially.
bf16_add:
    addi sp, sp, -16
    sw ra, 0(sp)
    # keep only the 16 payload bits so sign-extended inputs are accepted
    slli t0, t0, 16
    srli t0, t0, 16
    slli t1, t1, 16
    srli t1, t1, 16
    # t2 = |t0|, t3 = |t1| (sign bit stripped)
    slli t2, t0, 17
    srli t2, t2, 17
    slli t3, t1, 17
    srli t3, t3, 17
    bgeu t2, t3, cal_start
swap:
    # swap(t0, t1) and swap(t2, t3): t0 becomes the larger magnitude
    mv t4, t0
    mv t0, t1
    mv t1, t4
    mv t4, t2
    mv t2, t3
    mv t3, t4
cal_start:
    # x + 0 = x
    beqz t3, add_return_t0
    # t2 = exp (of the larger operand), t4 = sft
    srli t2, t2, 7
    srli t3, t3, 7
    sub t4, t2, t3
    # t3 = sign (of the larger operand)
    srli t3, t0, 15
    # t5 = 0 ? add : sub
    xor t5, t0, t1
    srli t5, t5, 15
    # t0 = t0_fra
    andi t0, t0, 127
    ori t0, t0, 128
    # t1 = t1_fra, aligned to the larger exponent
    andi t1, t1, 127
    ori t1, t1, 128
    sltiu t6, t4, 8      # t6 = 1 while the shift keeps some bits
    sub t6, x0, t6       # all-ones mask when sft < 8, else 0
    srl t1, t1, t4
    and t1, t1, t6       # sft >= 8: the fraction is shifted out entirely
    # t0 = fra
    beqz t5, add_operation
    sub t0, t0, t1
    beqz t0, add_return_zero   # exact cancellation
    j normalize
add_operation:
    add t0, t0, t1
normalize:
    # clz overwrites t0 and t2: keep fra, exp and sign on the stack
    sw t0, 4(sp)
    sw t2, 8(sp)
    sw t3, 12(sp)
    # t1 = lz
    call clz
    lw t0, 4(sp)
    lw t2, 8(sp)
    lw t3, 12(sp)
    addi t1, t1, -8
    # normalize exp: exp -= (lz - 8)
    sub t2, t2, t1
    # t1 = |t1|, t4 = 1 if t1 was negative
    srai t4, t1, 31
    xor t1, t1, t4
    srli t4, t4, 31
    add t1, t1, t4
    # branch if lz >= 8
    beqz t4, shift_left
    srl t0, t0, t1
    j finish
shift_left:
    sll t0, t0, t1
finish:
    slli t3, t3, 15
    bge x0, t2, add_return_sign   # exp <= 0: underflow
    li t4, 255
    bge t2, t4, add_return_inf    # exp >= 255: overflow
    addi t0, t0, -128             # drop the implicit leading 1
    slli t2, t2, 7
    or t1, t0, t2
    or t1, t1, t3
    j add_done
add_return_inf:
    li t1, 0x7f80
    or t1, t1, t3
    j add_done
add_return_sign:
    mv t1, t3
    j add_done
add_return_zero:
    li t1, 0
    j add_done
add_return_t0:
    mv t1, t0
add_done:
    lw ra, 0(sp)
    addi sp, sp, 16
    ret
```
##### test with Ripes
[full code in GitHub](https://github.com/Eric-liau/Computer-Archicture-hw1/blob/main/bf16_add.s)

##### verify correctness
| input1-hex | input1-bf16 | input2-hex | input2-bf16 | output-hex | output-bf16 | output-float |
| ---------- | ----------- | ---------- | ----------- |---------- | ----------- | ------------ |
| 0x411d | 9.8125 | 0x4126 | 10.375 | 0x41a1 | 20.125 | 20.1875 |
| 0xc2c8 | -100 | 0x4285 | 66.5 | 0xc206 | -33.5 | -33.5 |
| 0x426b | 58.75 | 0x429e | 79 | 0x4309 | 137 | 137.75 |
| 0xc117 | -9.4375 | 0xc178 | -15.5 | 0xc1c7 | -24.875 | -24.9375 |
| 0x3f1a | 0.601562 | 0x3d76 | 0.060059 | 0x3f29 | 0.660156 | 0.661621 |
#### bf16_mul
```c
# bf16_mul: multiply the bf16 bit patterns in t0 and t1; result in t1.
# Overwrites t0-t4. Low product bits are truncated (no rounding).
# Uses mul, so the M extension is required.
#
# Fixes over the previous version:
#  - operand separators corrected ("addi sp, sp -4", "srli, t2, ...");
#  - clz overwrites t0 and t2, so fra/sign/exp are kept on the stack
#    across the call instead of being left in registers;
#  - the sign is masked to one bit, so sign-extended inputs are accepted;
#  - a zero operand returns a signed zero;
#  - a result exponent <= 0 gives a signed zero, >= 255 gives infinity.
# Subnormal, Inf and NaN inputs are still not handled specially.
bf16_mul:
    addi sp, sp, -16
    sw ra, 0(sp)
    # t2 = sign, already placed at bit 15
    xor t2, t0, t1
    srli t2, t2, 15
    andi t2, t2, 1
    slli t2, t2, 15
    # a zero magnitude on either side gives a signed zero
    slli t3, t0, 17
    beqz t3, mul_return_sign
    slli t4, t1, 17
    beqz t4, mul_return_sign
    # t3 = exp
    srli t3, t3, 24
    srli t4, t4, 24
    add t3, t3, t4
    addi t3, t3, -127
    # t0 = fra
    andi t0, t0, 127
    ori t0, t0, 128
    andi t1, t1, 127
    ori t1, t1, 128
    mul t0, t0, t1
    srli t0, t0, 7
    # clz overwrites t0 and t2: keep fra, sign and exp on the stack
    sw t0, 4(sp)
    sw t2, 8(sp)
    sw t3, 12(sp)
    call clz
    lw t0, 4(sp)
    lw t2, 8(sp)
    lw t3, 12(sp)
    # normalize: shift right by 8 - lz and bump the exponent to match
    li t4, 8
    sub t1, t4, t1
    srl t0, t0, t1
    add t3, t3, t1
    bge x0, t3, mul_return_sign   # exp <= 0: underflow
    li t4, 255
    bge t3, t4, mul_return_inf    # exp >= 255: overflow
    addi t0, t0, -128             # drop the implicit leading 1
    slli t3, t3, 7
    or t1, t0, t2
    or t1, t1, t3
    j mul_done
mul_return_inf:
    li t1, 0x7f80
    or t1, t1, t2
    j mul_done
mul_return_sign:
    mv t1, t2
mul_done:
    lw ra, 0(sp)
    addi sp, sp, 16
    ret
```
##### test with Ripes
[full code in GitHub](https://github.com/Eric-liau/Computer-Archicture-hw1/blob/main/bf16_mul.s)

##### verify correctness
| input1-hex | input1-bf16 | input2-hex | input2-bf16 | output-hex | output-bf16 | output-float |
| ---------- | ----------- | ---------- | ----------- |---------- | ----------- | ------------ |
| 0x411d | 9.8125 | 0x4126 | 10.375 | 0x42cb | 101.5 | 101.804688 |
| 0xc2c8 | -100 | 0x4285 | 66.5 | 0xc5cf | -6624 | -6650 |
| 0x426b | 58.75 | 0x429e | 79 | 0x4591 | 4640 | 4641.25 |
| 0xc117 | -9.4375 | 0xc178 | -15.5 | 0x4312 | 146 | 146.281250 |
| 0x3f1a | 0.601562 | 0x3d76 | 0.060059 | 0x3d13 | 0.035889 | 0.036129 |
#### convert_temperature
```c
# convert_temperature_kel: Celsius (float bit pattern in t0) to Kelvin
# (bf16 bit pattern in t1), computed as (bf16)celsius + (bf16)273.15.
convert_temperature_kel:
    addi sp, sp, -4
    sw ra, 0(sp)          # the nested calls overwrite ra
    call fp32_to_bf16     # t1 = (bf16)celsius
    li t0, 0x4389         # (bf16)273.15
    call bf16_add         # t1 = t0 + t1
    lw ra, 0(sp)
    addi sp, sp, 4
    ret
# convert_temperature_fah: Celsius (float bit pattern in t0) to Fahrenheit
# (bf16 bit pattern in t1), computed as (bf16)celsius * (bf16)1.8 + (bf16)32.
convert_temperature_fah:
    addi sp, sp, -4
    sw ra, 0(sp)          # the nested calls overwrite ra
    call fp32_to_bf16     # t1 = (bf16)celsius
    li t0, 0x3fe6         # (bf16)1.8
    call bf16_mul         # t1 = t0 * t1
    li t0, 0x4200         # (bf16)32
    call bf16_add         # t1 = t0 + t1
    lw ra, 0(sp)
    addi sp, sp, 4
    ret
```
##### test with Ripes
[full code in GitHub](https://github.com/Eric-liau/Computer-Archicture-hw1/blob/main/convert_temperature.s)

##### verify correctness
| celsius-float | kelvin-hex | kelvin-bf16 | kelvin-float | fahrenheit-hex | fahrenheit-bf16 | fahrenheit-float |
| ---------- | ----------- | ---------- | ----------- |---------- | ----------- | ------------ |
| 10.4 | 0x438e | 284 | 283.549988 | 0x424a | 50 | 50.720001 |
| -198.245697 | 0x4298 | 76 | 74.904305 | 0xc3a1 | -322 | -324.842255 |
| 78.779999 | 0x43b0 | 352 | 351.929993 | 0x432d | 173 | 173.804001 |
| -15.5 | 0x4382 | 260 | 257.649994 | 0x4088 | 4.25 | 4.1 |
| 0.06 | 0x4389 | 274 | 273.209991 | 0x4200 | 32 | 32.108002 |
| -54.872002 | 0x435c | 220 | 218.278 | 0xc284 | -66 | -66.7696 |
In the case of using the above test data, there is an average error rate of 0.605% in Kelvin temperature conversions, and the average error rate in Fahrenheit temperature conversions is 1.153%.
By sacrificing accuracy, memory space is saved, with each piece of data using only 2 bytes of memory. This is a 50% reduction compared to the original 4 bytes.
##### Execution information

After updating fp32_to_bf16 to Ver2

## Analysis
### 5 stage RISC-V pipeline CPU

**IF :** Fetch from instruction memory the instruction that will be executed in the later stages.
**ID :** Decode the instruction fetched in IF and read the needed register data.
**EXE :** Perform the corresponding operation based on the instruction.
**MEM :** Read data from memory or write data to memory if needed.
**WB :** Write either the operation result from EXE stage or the data read from memory into the register.
Use the instruction `addi x17 x0 34` to show how it works on 5 stage RISC-V pipeline CPU.
### IF stage

- We can observe that the output of the Program Counter (PC) is 0x2C, and after passing through a plus-4 adder, the input to the PC in the next cycle will be 0x30.
- 0x2C is also the input to the Instruction Memory (IM), which is the address where the instruction is stored. Therefore, the PC reads the instruction data from address 0x2C in the IM.
### ID stage

- After decoding, we recognize that this is an ADDI instruction, which is an I-type instruction. This means we do not need the value from Reg 2 when executing this instruction, even though the CPU still reads it from the register.
- The values we need are the data from register x0(since the R1 index is 0), which is always zero, and the immediate value generated by the immediate generator.
### EXE stage

- Since this instruction isn't a branch instruction, the Branch Unit can be ignored.
- The ALU adds 0x0 from `x0` to 0x22 generated by the immediate generator to obtain the result.
### MEM stage

- The instruction neither loads nor saves data during the MEM stage, meaning it does nothing in this stage.
### WB stage

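- The CPU writes the result (0x22) back into the destination register `x17`. The register file that receives the value belongs to the ID stage, and its write-enable signal is asserted, indicating that the register is allowed to be written to.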