owned this note
owned this note
Published
Linked with GitHub
# Assignment1: RISC-V Assembly and Instruction Pipeline
contributed by < [`Po-Ying,Chiu`](https://github.com/charliechiou?tab=repositories) >
###### tags: `RISC-V` `computer architure 2024` `convlution`
## 1D discrete convolution
> Description:
> `1D Discrete Convolution` is an operation used in signal processing and neural networks to extract features from sequential data like time series or audio.
>
> A filter slides over the input, multiplying and summing values to extract the features, making it useful in tasks like speech recognition and text analysis.
For every input x[n] we can represent it as
$$
x[n] = \sum_{k=-\infty}^{\infty} x[k] \delta[n - k]
$$
where $\delta[n]$ is the unit impulse and $x[n]$ is the input signal.
$\delta[n-k]$ shifts the unit impulse to the position of the sample we want to pick out, multiplies the signal by it, and sets every other value to zero. Therefore, accumulating these terms over all $k$ gives us back $x[n]$.

Now, by the same concept, we can replace $\delta[n-k]$ with $h[n-k]$. This time, however, the signal before and after the sample point is not set to zero.
$h[n]$ represents the impulse response of the system (the filter) applied to the input signal. For each sample point of the input, it assigns a weight that indicates "how interested we are" in that part of the signal, and this weight is then multiplied by the signal.
Therefore, our equation becomes:
$$
y[n] = \sum_{k=-\infty}^{\infty} x[k] h[n - k]
$$
>Take the `average` as an example: we have the same level of interest in every point, so
$h[n]$ would be $[1,1,1]$. This means that each point in the input signal is given equal weight, reflecting that **we consider all points equally important when calculating the average.**
To make the calculation process clearer, below is a step-by-step illustration of each shift of $h$, where x = [1,2,3] and h = [3,2,1].
For step one ($n=0$), only $h[0]$ overlaps $x$; we multiply each pair of overlapping samples (samples of $h$ outside its range count as zero) and sum the products. We have
\begin{align*}
y[0] &= x[0] \cdot h[0] + x[1] \cdot h[-1] + x[2] \cdot h[-2] \\
&= (1 \cdot 3) + (2 \cdot 0) + (3 \cdot 0) \\
&= 3
\end{align*}

For the next step,$h[n-1]$ makes the $h[n]$ move forward.We have
\begin{align*}
y[1] &= x[0] \cdot h[1] + x[1] \cdot h[0] + x[2] \cdot h[-1] \\
&= (1 \cdot 2) + (2 \cdot 3) + (3 \cdot 0) \\
&= 2 + 6 + 0 \\
&= 8
\end{align*}

\begin{align*}
y[2] &= x[0] \cdot h[2] + x[1] \cdot h[1] + x[2] \cdot h[0] \\
&= (1 \cdot 1) + (2 \cdot 2) + (3 \cdot 3) \\
&= 1 + 4 + 9 \\
&= 14
\end{align*}

Continue the step, we will notice that
\begin{align*}
y[5] &= x[0] \cdot h[5] + x[1] \cdot h[4] + x[2] \cdot h[3] \\
&= (1 \cdot 0) + (2 \cdot 0) + (3 \cdot 0) \\
&= 0 + 0 + 0 \\
&= 0
\end{align*}

In conclusion, if $x$ has $m$ signals and $h$ has $n$ signals, we only need to calculate $m+n-1$ times. After sliding the $h$ sequence $m+n-1$ times, $h$ will completely leave the range of $x$, resulting in all outputs being zero.
## C program
### Origin
This code implements a 1D convolution to compute the convolution result of two sequences, x and h.
For the initialize setting:
x and h are the input sequences with lengths of 3, and y is an array to store the convolution result, set to a size of 20 to ensure sufficient space for the output. The variables m and n represent the lengths of x and h, respectively, both set to 3.
#### C Code
In the C code, we use two 'for loops' to simulate convolution. The outer loop iterates over each current sample point 𝑦[𝑖], while the inner loop processes each element of 𝑥 and ℎ to compute the result for 𝑦[𝑖].
```c
#include <stdio.h>
#include <stdint.h>
/*
 * Direct-form 1D convolution of two length-3 float sequences.
 * Prints the m + n - 1 output samples y[0] .. y[m+n-2].
 */
int main()
{
    // Input sequences: only the first 3 entries are given, the rest are zero-initialized
    float x[10] = {1.0, 2.0, 0.0}; // x: 1.0, 2.0, 0.0
    float h[10] = {3.0, 2.0, 1.0}; // h: 3.0, 2.0, 1.0
    float y[20];                   // large enough for the m + n - 1 outputs

    const int m = 3, n = 3;        // lengths of x and h
    const int out_len = m + n - 1; // number of non-trivial output samples
    int i, j;

    for (i = 0; i < out_len; i++)
    {
        y[i] = 0; // start each output sample from zero
        for (j = 0; j <= i; j++)
        {
            // skip terms whose index falls outside x or outside h
            if (j >= m || (i - j) >= n)
            {
                continue;
            }
            y[i] += x[j] * h[i - j]; // accumulate x[j] * h[i - j]
        }
    }

    // Display the output
    printf("Convoluted sequence is :\n");
    for (i = 0; i < out_len; i++)
    {
        printf("y[%d] = %f\n", i, y[i]);
    }
    return 0;
}
```
#### Output
```
Convoluted sequence is :
y[0] = 3.000000
y[1] = 8.000000
y[2] = 5.000000
y[3] = 2.000000
y[4] = 0.000000
```
Below is the result of simulating convolution using MATLAB:

### FP16 version
Since this assignment does not allow the use of M or F/D extensions, I had to implement the FP16 multiplication and addition myself.
+ For `addition`, I align the mantissas and then compare the signs and magnitudes to determine whether to perform an addition or a subtraction.
+ For `multiplication`, I referred to the fmul32 content from [Quiz 1 of the previous year](https://hackmd.io/@sysprog/arch2023-quiz1-sol#Problem-C) and modified it for FP16 multiplication.
#### Addition for FP16
```c
#include <stdint.h>
#include <stdio.h>
// Aligning the mantissa based on the shift value
/*
 * Shift a mantissa right by `shift` bit positions so that it lines up with
 * an operand that has a larger exponent. A shift of zero or less leaves the
 * mantissa unchanged.
 */
uint16_t align_mantissa(uint16_t mant, int shift)
{
    return (shift > 0) ? (uint16_t)(mant >> shift) : mant;
}
// bitwise addition
/*
 * Add two IEEE 754 binary16 values given as raw bit patterns, using integer
 * operations only.
 *
 * Parameters:
 *   a, b - FP16 bit patterns (1 sign bit, 5 exponent bits, 10 mantissa bits).
 * Returns:
 *   The FP16 bit pattern of a + b.
 *
 * Behavior notes:
 *   - Operands with exponent field 0 (zero and subnormals) are treated as
 *     zero, matching the flush-to-zero applied to results whose exponent
 *     drops to 0 or below.
 *   - Results whose exponent reaches 31 saturate to infinity. Inf/NaN inputs
 *     get no further special treatment.
 *   - Bits shifted out during alignment are truncated (no rounding).
 */
uint16_t fp16_bitwise_add(uint16_t a, uint16_t b)
{
    // extract the sign and the exponent
    uint16_t sign_a = a & 0x8000;
    uint16_t sign_b = b & 0x8000;
    int exp_a = (a & 0x7C00) >> 10;
    int exp_b = (b & 0x7C00) >> 10;

    // A zero operand has no hidden 1: adding it must return the other operand.
    // (Without this check 0 + 0 produced 0x0400 instead of 0x0000.)
    if (exp_a == 0)
    {
        // both zero: the result is -0 only when both operands are negative
        return exp_b == 0 ? (uint16_t)(sign_a & sign_b) : b;
    }
    if (exp_b == 0)
    {
        return a;
    }

    // mantissas with the hidden 1 restored
    uint16_t mant_a = (uint16_t)((a & 0x03FF) | 0x0400);
    uint16_t mant_b = (uint16_t)((b & 0x03FF) | 0x0400);

    // shift the mantissa of the smaller operand right so both share one exponent
    if (exp_a > exp_b)
    {
        mant_b = (uint16_t)(mant_b >> (exp_a - exp_b));
        exp_b = exp_a;
    }
    else if (exp_b > exp_a)
    {
        mant_a = (uint16_t)(mant_a >> (exp_b - exp_a));
        exp_a = exp_b;
    }

    uint16_t mant_result;
    int exp_result = exp_a;
    if (sign_a == sign_b)
    {
        mant_result = (uint16_t)(mant_a + mant_b); // same sign: magnitudes add
        // a carry into bit 11 means the sum is >= 2.0: renormalize
        if (mant_result & 0x0800)
        {
            mant_result >>= 1;
            exp_result++;
        }
    }
    else
    {
        // different signs: subtract the smaller magnitude from the larger one
        if (mant_a >= mant_b)
        {
            mant_result = (uint16_t)(mant_a - mant_b);
        }
        else
        {
            mant_result = (uint16_t)(mant_b - mant_a);
            sign_a = sign_b; // the result takes the sign of the larger magnitude
        }
        if (exp_result < 31)
        {
            if (mant_result == 0)
            {
                return 0; // exact cancellation gives +0
            }
            // cancellation can clear the leading bits: shift the leading 1
            // back to bit 10 (without this 3.0 + (-2.0) returned 3.0)
            while (!(mant_result & 0x0400))
            {
                mant_result <<= 1;
                exp_result--;
            }
        }
    }

    // remove the hidden 1
    mant_result &= 0x03FF;
    if (exp_result >= 31)
    {
        exp_result = 31; // overflow: saturate to infinity
        mant_result = 0;
    }
    else if (exp_result <= 0)
    {
        exp_result = 0; // underflow: flush to zero
        mant_result = 0;
    }
    // put sign, exponent and mantissa back together
    return (uint16_t)(sign_a | (exp_result << 10) | mant_result);
}
```
#### Multiplication for FP16
```c
#include <stdio.h>
#include <stdint.h>
// Get the nth bit of a value
/* Return bit `n` of `value` (0 or 1); bit 0 is the least significant bit. */
static inline int64_t getbit(int64_t value, int n)
{
    const int64_t shifted = value >> n;
    return shifted & 1;
}
// FP16 integer multiplication
/*
 * Shift-and-add multiplication that avoids the `*` operator.
 * Only bits 0..15 of `b` are examined, so the result is a * (b & 0xFFFF).
 */
int64_t imul16(int64_t a, int64_t b)
{
    int64_t product = 0;
    int bit = 0;
    while (bit < 16)
    {
        // every set bit of b contributes a copy of a shifted to that position
        if (getbit(b, bit))
        {
            product += a << bit;
        }
        ++bit;
    }
    return product;
}
// FP16 multiplication
/*
 * Multiply two IEEE 754 binary16 values given as raw bit patterns, using
 * integer operations only (the mantissa product comes from imul16).
 *
 * Parameters:
 *   a, b - FP16 bit patterns (1 sign bit, 5 exponent bits, 10 mantissa bits).
 * Returns:
 *   The FP16 bit pattern of a * b.
 *
 * Behavior notes:
 *   - Operands with exponent field 0 (zero and subnormals) are treated as
 *     zero, so the product is a signed zero.
 *   - Exponent underflow flushes to signed zero; overflow saturates to
 *     signed infinity (previously the exponent wrapped around via & 0x1F).
 *   - Inf/NaN inputs get no further special treatment.
 *   - The low product bits are truncated (no rounding).
 */
uint16_t fmul16(uint16_t a, uint16_t b)
{
    /* sign of the product, already in bit position 15 */
    uint16_t sign_result = (uint16_t)((a ^ b) & 0x8000);

    /* exponent */
    int32_t exp_a = ((a >> 10) & 0x1F); // FP16: 5-bit exponent
    int32_t exp_b = ((b >> 10) & 0x1F);

    /* a zero operand has no hidden bit: the product is a signed zero */
    if (exp_a == 0 || exp_b == 0)
    {
        return sign_result;
    }

    /* mantissa */
    int32_t mantissa_a = (a & 0x3FF) | 0x400; // FP16: 10-bit mantissa + hidden bit
    int32_t mantissa_b = (b & 0x3FF) | 0x400;

    /* 11-bit x 11-bit product, scaled back so the hidden bit sits at bit 10 or 11 */
    int64_t mantissa_result_tmp = imul16(mantissa_a, mantissa_b) >> 10;

    /* the product of two values in [1, 2) lies in [1, 4): bit 11 set means >= 2 */
    int mshift = (int)(mantissa_result_tmp >> 11);

    /* Normalize the mantissa and adjust the exponent */
    int64_t mantissa_result = mantissa_result_tmp >> mshift;
    int32_t exp_result = exp_a + exp_b - 15 + mshift; // FP16 bias is 15

    if (exp_result <= 0)
    {
        return sign_result; // underflow: flush to signed zero
    }
    if (exp_result >= 31)
    {
        return (uint16_t)(sign_result | 0x7C00); // overflow: signed infinity
    }

    /* Reconstruct the final 16-bit floating-point number */
    return (uint16_t)(sign_result | (exp_result << 10) | (mantissa_result & 0x3FF));
}
```
Therefore the convolution program in FP16 will be
```c
/*
 * 1D convolution of two length-3 FP16 sequences stored as raw bit patterns.
 * Products come from fmul16 and are accumulated with fp16_bitwise_add; the
 * m + n - 1 results are printed as 4-digit hexadecimal bit patterns.
 */
int main()
{
    uint16_t x[] = {0x3C00, 0x4000, 0x0000}; // FP16: 1.0, 2.0, 0.0
    uint16_t h[] = {0x4200, 0x4000, 0x3c00}; // FP16: 3.0, 2.0, 1.0
    uint16_t y[20] = {0};
    const int m = 3, n = 3;        // lengths of x and h
    const int out_len = m + n - 1; // number of output samples
    int i, j;

    for (i = 0; i < out_len; i++)
    {
        y[i] = 0;
        for (j = 0; j <= i; j++)
        {
            // skip terms whose index falls outside x or outside h
            if (j >= m || (i - j) >= n)
            {
                continue;
            }
            // y[i] += x[j] * h[i - j], in FP16
            y[i] = fp16_bitwise_add(y[i], fmul16(x[j], h[i - j]));
        }
    }

    printf("Convoluted sequence is:\n");
    for (i = 0; i < out_len; i++)
    {
        printf("y[%d] = 0x%04X\n", i, y[i]);
    }
    /*
    Expected:
    0x4200 -> 3.0
    0x4800 -> 8.0
    0x4500 -> 5.0
    0x4000 -> 2.0
    0x0000 -> 0.0
    */
    return 0;
}
```
## Assembly program
In this section, I separately completed the translation of FP16 addition and multiplication from C code into RISC-V assembly, and then implemented them as functions. Afterward, I directly integrated them (i.e., copy-pasted) into the main program.
>Directly copying and pasting not only demonstrates that my function is functional, but it also helped me better understand the importance of the `Six Fundamental Steps in Calling a Function` discussed in the course.
:::info
In the [GitHub](https://github.com/charliechiou/Computer-Architecture-assignment1), I have included multiple separately written code files, such as `fp16_adds.s` and `fp16_mul.s`, each of which is a separately written function.
:::
### test data
```c
.data
# Test vectors. Every FP16 bit pattern occupies its own 4-byte word
# (.4byte), so element k of a sequence lives at byte offset 4*k.
testcase1_x:
.4byte 0x3C00 0x4000 0x0000 #input x
testcase1_h:
.4byte 0x4200 0x4000 0x3c00 #input h
# expected output -> 0x4200 0x4800 0x4500 0x4000 0x0000
testcase2_x:
.4byte 0x3c00 0x4000 0x3c00 #input x
testcase2_h:
.4byte 0x4200 0x4000 0x3c00 #input h
# expected output -> 0x4200 0x4800 0x4800 0x4400 0x3c00
testcase3_x:
.4byte 0x4500 0x4400 0x3c00 #input x
testcase3_h:
.4byte 0x4000 0x3c00 0x3c00 #input h
# expected output -> 0x4900 0x4a80 0x4980 0x4500 0x3c00
testcase4_x:
.4byte 0x3c00 0x4000 0x3c00 0x4400 #input x
testcase4_h:
.4byte 0x4200 0x4000 0x3c00 #input h
# expected output -> 0x4200 0x4800 0x4800 0x4c00 0x4880 0x4400
# Output buffer: 20 zero bytes = five 4-byte slots (24 bytes = six slots for testcase 4)
y:
.zero 20 #for testcase 1~3
#.zero 24 #for testcase 4
# Text fragments used by the print routines
str1:.string "Input x is : "
str2:.string "Input h is : "
str3:.string " "
str4:.string "\n"
str5:.string "Output y is : "
```
There are 4 test cases, and the expected outputs are provided afterward.
### Main
In the main program, I used two loops to complete the convolution, which helped me better understand how loops operate and how branching works in RISC-V.
```c
.text
# ---------------------------------------------------------------------
# Program entry and convolution driver.
#
# Register roles (all live for the whole program):
#   s0 = address of x          s1 = address of h
#   s2 = m (length of x)       s3 = n (length of h)
#   s4 = i (output index)      s5 = m + n - 1 (number of outputs)
#   s6 = address of y          s7 = j (inner index)
#
# y must be zero-initialized (.zero): y[i] is only ever accumulated into.
# ---------------------------------------------------------------------
printInputInit:
    la   s0,testcase1_x
    la   s1,testcase1_h
    mv   a0,s0
    mv   a1,s1
    jal  printInput
main:
    li   s2,3                   # m(s2) = 3 for testcase 1~3
    #li s2,4                    # m(s2) = 4 for testcase 4
    li   s3,3                   # n(s3) = 3
    li   s4,0                   # i(s4) = 0
    add  s5,s2,s3
    addi s5,s5,-1               # s5 = m + n - 1
    la   s6,y                   # y array address
outer_loop:
    bge  s4,s5,end_outer_loop   # for (i = 0; i < m + n - 1; i++)
    li   s7,0                   # j = 0
inner_loop:
    blt  s4,s7,end_inner_loop   # for (j = 0; j <= i; j++)
check_j_less_m:
    blt  s7,s2,check_ij_less_n  # if (j < m ...
    j    skip_inner_loop
check_ij_less_n:
    sub  t1,s4,s7               # t1 = i - j
    blt  t1,s3,calculate        # ... && (i - j) < n)
    j    skip_inner_loop
calculate:
    slli t0,s7,2
    add  t0,t0,s0
    lw   a0,0(t0)               # a0 = x[j]
    slli t1,t1,2
    add  t1,t1,s1
    lw   a1,0(t1)               # a1 = h[i - j]
    jal  fp16_mul               # a0 = fmul16(x[j], h[i - j])
    slli t0,s4,2
    add  t0,t0,s6
    lw   a1,0(t0)               # a1 = y[i]
    jal  fp16_add               # a0 = fp16_bitwise_add(product, y[i])
    # t0 is a caller-saved temporary, so it must not be assumed to survive
    # the call: recompute &y[i] before storing the result.
    slli t0,s4,2
    add  t0,t0,s6
    sw   a0,0(t0)               # y[i] = a0
skip_inner_loop:
    addi s7,s7,1                # j++
    j    inner_loop
end_inner_loop:
    addi s4,s4,1                # i++
    j    outer_loop
end_outer_loop:
    mv   a0,s6
    jal  printOutput
    # Stop here (environment call 10) so that execution can never fall
    # through into whatever routine is assembled after this one.
    li   a7,10
    ecall
```
You can test different data by simply changing the number following the testcase. If you want to use testcase4, you will need to uncomment the sections marked with "for testcase 4".(For convenience, I placed the code for executing testcase 4 in the Appendix A.)
### Multiplication for FP16
```c
# ---------------------------------------------------------------------
# fp16_mul: multiply two FP16 bit patterns using integer instructions.
# In:    a0 = a, a1 = b (FP16 bit patterns in the low 16 bits)
# Out:   a0 = FP16 bit pattern of a * b
# Calls: imul16
# Clobbers: t1, t2, a1 and everything imul16 clobbers; s0-s7 are preserved.
#
# Behavior notes:
#   - An operand with exponent field 0 (zero/subnormal) gives a signed zero.
#   - Exponent underflow gives signed zero, overflow gives signed infinity.
#   - Inf/NaN operands get no further special treatment.
# ---------------------------------------------------------------------
fp16_mul:
    # prologue: nine words are saved (offsets 0..32), so the frame must be
    # larger than 32 bytes; 48 keeps sp 16-byte aligned.
    addi sp,sp,-48
    sw   s0,0(sp)
    sw   s1,4(sp)
    sw   s2,8(sp)
    sw   s3,12(sp)
    sw   s4,16(sp)
    sw   s5,20(sp)
    sw   s6,24(sp)
    sw   s7,28(sp)
    sw   ra,32(sp)
    mv   s0,a0
    mv   s1,a1
    xor  t1,s0,s1
    li   t2,0x8000
    and  s2,t1,t2               # s2 = (sign_a ^ sign_b) << 15
    srli t1,s0,10
    andi s6,t1,0x1f             # exp_a(s6) = (a >> 10) & 0x1F
    srli t1,s1,10
    andi s7,t1,0x1f             # exp_b(s7) = (b >> 10) & 0x1F
    # a zero operand has no hidden bit: the product is a signed zero
    beqz s6,fp16_mul_zero
    beqz s7,fp16_mul_zero
    andi t1,s0,0x3ff
    ori  s4,t1,0x400            # mantissa_a(s4) = (a & 0x3FF) | 0x400
    andi t1,s1,0x3ff
    ori  s5,t1,0x400            # mantissa_b(s5) = (b & 0x3FF) | 0x400
    mv   a0,s4
    mv   a1,s5
    jal  ra,imul16              # ra is already saved in the frame
    srli t1,a0,10               # mantissa_result_tmp(t1) = product >> 10
    # the product of two values in [1, 2) lies in [1, 4):
    # bit 11 of t1 is set exactly when it is >= 2
    srli t2,t1,11               # mshift(t2) = 0 or 1
    srl  s4,t1,t2               # mantissa_result(s4) = tmp >> mshift
    add  s6,s6,s7
    addi s6,s6,-15              # exp_a + exp_b - 15 (FP16 bias)
    add  s6,s6,t2               # + mshift
    blez s6,fp16_mul_zero       # underflow: flush to signed zero
    li   t1,31
    bge  s6,t1,fp16_mul_inf     # overflow: saturate to signed infinity
    slli s6,s6,10
    andi s4,s4,0x3ff            # drop the hidden bit
    or   t1,s2,s6
    or   a0,t1,s4               # sign | exponent | mantissa
    j    fp16_mul_done
fp16_mul_inf:
    li   a0,0x7c00
    or   a0,a0,s2
    j    fp16_mul_done
fp16_mul_zero:
    mv   a0,s2
fp16_mul_done:
    # epilogue
    lw   s0,0(sp)
    lw   s1,4(sp)
    lw   s2,8(sp)
    lw   s3,12(sp)
    lw   s4,16(sp)
    lw   s5,20(sp)
    lw   s6,24(sp)
    lw   s7,28(sp)
    lw   ra,32(sp)
    addi sp,sp,48
    ret
# imul16: shift-and-add multiply (stands in for the `mul` instruction).
# In:  a0 = a, a1 = b (only bits 0..15 of b are examined)
# Out: a0 = sum of (a << i) over every set bit i of b, for i in 0..15
# Uses a3 as the accumulator and t1-t4 as scratch; makes no calls and does
# not touch the stack.
imul16:
#a0 -> a, a1 -> b
li a3,0
li t1,0 #set i
li t2,16 #set max i
loop:
beq t1,t2,end_loop
srl t3,a1,t1
andi t3,t3,1 #getbit(b64, i)
beqz t3,skip_loop
sll t4,a0,t1
add a3,a3,t4 #r += a64 << i;
skip_loop:
addi t1,t1,1
j loop
end_loop:
mv a0,a3
ret
```
### Addition for FP16
```c
# ---------------------------------------------------------------------
# fp16_add: add two FP16 bit patterns using integer instructions.
# In:    a0 = a, a1 = b (FP16 bit patterns in the low 16 bits)
# Out:   a0 = FP16 bit pattern of a + b
# Calls: align_mantissa
# Clobbers: t1, a1; s0-s7 are preserved. No other temporary is touched.
#
# Behavior notes:
#   - Operands with exponent field 0 (zero/subnormal) are treated as zero,
#     so x + 0 = x and 0 + 0 = 0.
#   - After a subtraction the result is shifted left until the hidden bit
#     is back at bit 10; exact cancellation gives +0.
#   - Exponent >= 31 saturates to infinity, exponent <= 0 flushes to zero.
#   - Bits shifted out during alignment are truncated (no rounding).
# ---------------------------------------------------------------------
fp16_add:
    # prologue: nine words are saved (offsets 0..32), so the frame must be
    # larger than 32 bytes; 48 keeps sp 16-byte aligned.
    addi sp,sp,-48
    sw   s0,0(sp)
    sw   s1,4(sp)
    sw   s2,8(sp)
    sw   s3,12(sp)
    sw   s4,16(sp)
    sw   s5,20(sp)
    sw   s6,24(sp)
    sw   s7,28(sp)
    sw   ra,32(sp)
    mv   s0,a0
    mv   s1,a1
    li   t1,0x8000
    and  s2,s0,t1               # sign_a(s2) = a & 0x8000
    and  s3,s1,t1               # sign_b(s3) = b & 0x8000
    srli s4,s0,10
    andi s4,s4,0x1f             # exp_a(s4) = (a >> 10) & 0x1F
    srli s5,s1,10
    andi s5,s5,0x1f             # exp_b(s5) = (b >> 10) & 0x1F
    andi s6,s0,0x03FF           # mant_a(s6) = a & 0x03FF
    andi s7,s1,0x03FF           # mant_b(s7) = b & 0x03FF
    # A zero operand has no hidden 1: the sum is simply the other operand.
    bnez s4,fp16_add_a_nonzero
    bnez s5,fp16_add_return_b
    and  a0,s2,s3               # 0 + 0: result is -0 only if both are negative
    j    fp16_add_done
fp16_add_return_b:
    mv   a0,s1
    j    fp16_add_done
fp16_add_a_nonzero:
    bnez s5,fp16_add_both_nonzero
    mv   a0,s0
    j    fp16_add_done
fp16_add_both_nonzero:
    ori  s6,s6,0x0400           # restore the hidden 1 of mant_a
    ori  s7,s7,0x0400           # restore the hidden 1 of mant_b
    beq  s4,s5,fp16_add_aligned
    blt  s4,s5,fp16_add_align_a
    mv   a0,s7                  # exp_a > exp_b: shift mant_b right
    sub  a1,s4,s5
    jal  ra,align_mantissa      # ra is already saved in the frame
    mv   s7,a0
    mv   s5,s4                  # exp_b = exp_a
    j    fp16_add_aligned
fp16_add_align_a:
    mv   a0,s6                  # exp_b > exp_a: shift mant_a right
    sub  a1,s5,s4
    jal  ra,align_mantissa
    mv   s6,a0
    mv   s4,s5                  # exp_a = exp_b
fp16_add_aligned:
    # from here on s4 = exp_result, s6 = mant_result, s2 = result sign
    beq  s2,s3,fp16_add_same_sign
    bge  s6,s7,fp16_add_a_ge_b
    sub  s6,s7,s6               # |b| > |a|: mant_result = mant_b - mant_a
    mv   s2,s3                  # the result takes the sign of b
    j    fp16_add_renorm
fp16_add_a_ge_b:
    sub  s6,s6,s7               # mant_result = mant_a - mant_b
fp16_add_renorm:
    li   t1,31
    bge  s4,t1,fp16_add_overflow # exponent already saturated: infinity
    bnez s6,fp16_add_renorm_loop
    li   a0,0                   # exact cancellation gives +0
    j    fp16_add_done
fp16_add_renorm_loop:
    # cancellation may clear leading bits: bring the leading 1 back to bit 10
    andi t1,s6,0x400
    bnez t1,fp16_add_pack
    slli s6,s6,1
    addi s4,s4,-1
    j    fp16_add_renorm_loop
fp16_add_same_sign:
    add  s6,s6,s7               # mant_result = mant_a + mant_b
    li   t1,0x0800
    and  t1,s6,t1
    beqz t1,fp16_add_pack       # no carry into bit 11
    srli s6,s6,1                # carry: mant_result >>= 1
    addi s4,s4,1                #        exp_result++
fp16_add_pack:
    andi s6,s6,0x03ff           # drop the hidden 1
    li   t1,31
    bge  s4,t1,fp16_add_overflow
    blez s4,fp16_add_underflow
    j    fp16_add_combine
fp16_add_overflow:
    li   s4,31                  # saturate to infinity
    li   s6,0
    j    fp16_add_combine
fp16_add_underflow:
    li   s4,0                   # flush to zero
    li   s6,0
fp16_add_combine:
    slli t1,s4,10
    or   t1,s2,t1
    or   a0,t1,s6               # sign | (exp_result << 10) | mant_result
fp16_add_done:
    # epilogue
    lw   s0,0(sp)
    lw   s1,4(sp)
    lw   s2,8(sp)
    lw   s3,12(sp)
    lw   s4,16(sp)
    lw   s5,20(sp)
    lw   s6,24(sp)
    lw   s7,28(sp)
    lw   ra,32(sp)
    addi sp,sp,48
    ret
# align_mantissa: logical right shift used to align a mantissa.
# In:  a0 = mantissa, a1 = shift amount
# Out: a0 = a0 >> a1 when a1 > 0, otherwise a0 unchanged
# Leaf routine: touches only a0.
align_mantissa:
blez a1,end_align_mantissa
srl a0,a0,a1
end_align_mantissa:
ret
```
### Print the Input & Result
```c
# printInput: print the two input sequences, one per line.
# In:  a0 = address of x, a1 = address of h (one 4-byte word per element)
# Prints str1 followed by x[0..2], then str2 followed by h[0..2].
# Strings go through ecall 4 and values through ecall 34 (hexadecimal).
# Uses t1/t2 to hold the two base addresses; changes a0 and a7.
printInput:
mv t1,a0
mv t2,a1
la a0,str1
li a7,4
ecall #print "Input x is : "
lw a0,0(t1)
li a7,34
ecall #print x[0]
la a0,str3
li a7,4
ecall #print space
lw a0,4(t1)
li a7,34
ecall #print x[1]
la a0,str3
li a7,4
ecall #print space
lw a0,8(t1)
li a7,34
ecall #print x[2]
la a0,str3
li a7,4
ecall #print space
##### below is for testcase 4 #####
#lw a0,12(t1)
#li a7,34
#ecall #print x[3]
###################################
la a0,str4
li a7,4
ecall #next line
la a0,str2
li a7,4
ecall #print "Input h is : "
lw a0,0(t2)
li a7,34
ecall #print h[0]
la a0,str3
li a7,4
ecall #print space
lw a0,4(t2)
li a7,34
ecall #print h[1]
la a0,str3
li a7,4
ecall #print space
lw a0,8(t2)
li a7,34
ecall #print h[2]
la a0,str4
li a7,4
ecall #next line
ret
# ---------------------------------------------------------------------
# printOutput: print the result sequence in hexadecimal, then stop.
# In:  a0 = address of y (one 4-byte word per element)
# Prints str5 followed by y[0..4], each followed by a space.
# Uses t1 to hold the base address; changes a0 and a7.
# Does not return: it ends the program with environment call 10, the same
# way the FP32 variant of this routine does, instead of running off the
# end of the routine.
# ---------------------------------------------------------------------
printOutput:
    mv   t1,a0
    la   a0,str5
    li   a7,4
    ecall                       # print "Output y is : "
    lw   a0,0(t1)
    li   a7,34
    ecall                       # print y[0]
    la   a0,str3
    li   a7,4
    ecall                       # print space
    lw   a0,4(t1)
    li   a7,34
    ecall                       # print y[1]
    la   a0,str3
    li   a7,4
    ecall                       # print space
    lw   a0,8(t1)
    li   a7,34
    ecall                       # print y[2]
    la   a0,str3
    li   a7,4
    ecall                       # print space
    lw   a0,12(t1)
    li   a7,34
    ecall                       # print y[3]
    la   a0,str3
    li   a7,4
    ecall                       # print space
    lw   a0,16(t1)
    li   a7,34
    ecall                       # print y[4]
    la   a0,str3
    li   a7,4
    ecall                       # print space
    ##### below is for testcase 4 #####
    #lw a0,20(t1)
    #li a7,34
    #ecall #print y[5]
    #la a0,str3
    #li a7,4
    #ecall #print space
    ###################################
    li   a7,10
    ecall                       # end the program
```
### Output
- testcase 1
x=[1,2,0] ; h=[3,2,1]
```c
Input x is : 0x3c00 0x4000 0x0000
Input h is : 0x4200 0x4000 0x3c00
Output y is : 0x4200 0x4800 0x4500 0x4000 0x0400
```
- testcase 2
x=[1,2,1] ; h=[3,2,1]
```c
Input x is : 0x3c00 0x4000 0x3c00
Input h is : 0x4200 0x4000 0x3c00
Output y is : 0x4200 0x4800 0x4800 0x4400 0x3c00
```
- testcase 3
x=[5,4,1] ; h=[2,1,1]
```c
Input x is : 0x4500 0x4400 0x3c00
Input h is : 0x4000 0x3c00 0x3c00
Output y is : 0x4900 0x4a80 0x4980 0x4500 0x3c00
```
- testcase 4
x=[1,2,1,4] ; h=[3,2,1]
```c
Input x is : 0x3c00 0x4000 0x3c00 0x4400
Input h is : 0x4200 0x4000 0x3c00
Output y is : 0x4200 0x4800 0x4800 0x4c00 0x4880 0x4400
```
## Assembly program (Output FP32)
To meet the assignment requirements (using the code from Quiz 1's Problems A, B, and C), I utilized the code from [Problem A](https://hackmd.io/@sysprog/arch2024-quiz1-sol#Problem-A) to convert the final output from FP16 to FP32. Below is the modified `PrintOutput` function and the function for the conversion.
```c
# printOutput (FP32 variant): convert each FP16 result to FP32 and print it
# as a floating-point number, then end the program.
# In:  a0 = address of y (one 4-byte word per element)
# a3 holds the base address of y across the ecalls and the calls to
# fp16_to_fp32. Does not return: it finishes with environment call 10.
printOutput:
mv a3,a0
la a0,str5
li a7,4
ecall #print "Output y is : "
lw a0,0(a3)
jal fp16_to_fp32
li a7,2
ecall #print y[0]
la a0,str3
li a7,4
ecall #print space
lw a0,4(a3)
jal fp16_to_fp32
li a7,2
ecall #print y[1]
la a0,str3
li a7,4
ecall #print space
lw a0,8(a3)
jal fp16_to_fp32
li a7,2
ecall #print y[2]
la a0,str3
li a7,4
ecall #print space
lw a0,12(a3)
jal fp16_to_fp32
li a7,2
ecall #print y[3]
la a0,str3
li a7,4
ecall #print space
lw a0,16(a3)
jal fp16_to_fp32
li a7,2
ecall #print y[4]
la a0,str3
li a7,4
ecall #print space
##### below is for testcase 4 #####
# NOTE(review): as in the lines above, the base address is in a3 and the
# value has to go through fp16_to_fp32 before being printed with ecall 2.
#lw a0,20(a3)
#jal fp16_to_fp32
#li a7,2
#ecall #print y[5]
#la a0,str3
#li a7,4
#ecall #print space
###################################
li a7,10
ecall
#################################################################
# ---------------------------------------------------------------------
# fp16_to_fp32: convert an FP16 bit pattern to an FP32 bit pattern.
# In:    a0 = h (FP16 bit pattern in the low 16 bits)
# Out:   a0 = FP32 bit pattern
# Calls: my_clz, renorm_if
# Clobbers: t0-t3 (and whatever the callees clobber); s0-s4 are preserved.
#
# Implements:
#   w            = h << 16
#   sign         = w & 0x80000000
#   nonsign      = w & 0x7FFFFFFF
#   renorm_shift = clz(nonsign) > 5 ? clz(nonsign) - 5 : 0
#   inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) & 0x7F800000
#   zero_mask    = (int32_t)(nonsign - 1) >> 31
#   result = sign | ((((nonsign << renorm_shift >> 3)
#            + ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask)
# Both masks rely on ARITHMETIC right shifts of a signed value.
# ---------------------------------------------------------------------
fp16_to_fp32:
    # prologue: six words saved at offsets 0..20; a 32-byte frame keeps
    # every slot inside the frame and sp 16-byte aligned.
    addi sp,sp,-32
    sw   s0,0(sp)
    sw   s1,4(sp)
    sw   s2,8(sp)
    sw   s3,12(sp)
    sw   s4,16(sp)
    sw   ra,20(sp)
    # start
    mv   s0,a0
    slli s1,s0,16               # w(s1) = (uint32_t)h << 16
    li   s2,0x80000000
    and  s2,s1,s2               # sign(s2) = w & 0x80000000
    li   s3,0x7FFFFFFF
    and  s3,s1,s3               # nonsign(s3) = w & 0x7FFFFFFF
    mv   a0,s3
    jal  my_clz                 # a0 = my_clz(nonsign)
    jal  renorm_if              # a0 = a0 > 5 ? a0 - 5 : 0
    mv   s4,a0                  # renorm_shift(s4)
    li   t0,0x7F800000
    li   t1,0x04000000
    add  t1,s3,t1               # nonsign + 0x04000000 (negative iff exponent is all ones)
    srai t1,t1,8                # arithmetic shift spreads the sign bit over the exponent field
    and  t0,t0,t1               # inf_nan_mask(t0)
    addi t1,s3,-1
    srai t1,t1,31               # zero_mask(t1) = all ones iff nonsign == 0
    sll  t2,s3,s4
    srli t2,t2,3                # nonsign << renorm_shift >> 3
    li   t3,0x70
    sub  t3,t3,s4
    slli t3,t3,23               # (0x70 - renorm_shift) << 23
    add  t2,t2,t3
    or   t2,t2,t0               # ... | inf_nan_mask
    xori t1,t1,-1               # ~zero_mask
    and  t2,t2,t1               # ... & ~zero_mask
    or   a0,s2,t2               # sign | ...
    # epilogue
    lw   ra,20(sp)
    lw   s4,16(sp)
    lw   s3,12(sp)
    lw   s2,8(sp)
    lw   s1,4(sp)
    lw   s0,0(sp)
    addi sp,sp,32
    ret
# ---------------------------------------------------------------------
# my_clz: count the leading zero bits of a 32-bit value.
# In:  a0 = value
# Out: a0 = number of zero bits above the highest set bit (32 when a0 == 0)
# Leaf routine; s0-s4 are used as scratch and preserved.
# ---------------------------------------------------------------------
my_clz:
    # five words saved at offsets 0..16; a 32-byte frame keeps every slot
    # inside the frame (so the caller's stack is not overwritten) and sp
    # 16-byte aligned.
    addi sp,sp,-32
    sw   s0,0(sp)
    sw   s1,4(sp)
    sw   s2,8(sp)
    sw   s3,12(sp)
    sw   s4,16(sp)
    li   s0,0                   # count = 0
    li   s1,31                  # i = 31
clz_loop:
    bltz s1,clz_done            # if i < 0, leave the loop
    li   s2,1
    sll  s3,s2,s1               # mask = 1 << i
    and  s4,s3,a0
    bnez s4,clz_done            # stop at the first set bit
    addi s0,s0,1                # count++
    addi s1,s1,-1               # i--
    j    clz_loop
clz_done:
    mv   a0,s0
    lw   s4,16(sp)
    lw   s3,12(sp)
    lw   s2,8(sp)
    lw   s1,4(sp)
    lw   s0,0(sp)
    addi sp,sp,32
    ret
# renorm_if: a0 = (a0 > 5) ? a0 - 5 : 0   (signed comparison)
# Leaf routine; uses t0 as scratch.
renorm_if:
li t0, 5
blt t0,a0,renorm_if_true
li a0,0
ret
renorm_if_true:
addi a0,a0,-5
ret
```
It is important to note that in the ecall, `li a7, 2` indicates "Print floating point number," while in the original code, `li a7, 34` was used to indicate "Print an Integer (with hexadecimal format, left-padded with zeros)."
By changing the original a7 and converting FP16 to FP32, the output can be displayed as a more readable floating-point number.

### Result
- testcase 1
x=[1,2,0] ; h=[3,2,1]
```c
Input x is : 0x3c00 0x4000 0x0000
Input h is : 0x4200 0x4000 0x3c00
Output y is : 3 8 5 2 6.10352e-05
```
- testcase 2
x=[1,2,1] ; h=[3,2,1]
```c
Input x is : 0x3c00 0x4000 0x3c00
Input h is : 0x4200 0x4000 0x3c00
Output y is : 3 8 8 4 1
```
- testcase 3
x=[5,4,1] ; h=[2,1,1]
```c
Input x is : 0x4500 0x4400 0x3c00
Input h is : 0x4000 0x3c00 0x3c00
Output y is : 10 13 11 5 1
```
- testcase 4
x=[1,2,1,4] ; h=[3,2,1]
```c
Input x is : 0x3c00 0x4000 0x3c00 0x4400
Input h is : 0x4200 0x4000 0x3c00
Output y is : 3 8 8 16 9 1.61149e-43
```
The complete code is included in Appendix B.
## Ripes simulation
We use [Ripes](https://github.com/mortbopet/Ripes) to simulate the **Five-stage RISC-V Processor** pipeline.

It divides the process into five stages: `Instruction Fetch (IF)`, `Instruction Decode (ID)`, `Execute (EX)`, `Memory Access (MEM)`, and `Write Back (WB)`. Every instruction goes through these five stages to produce its result.
+ Assembly

+ Disassembly

1. **IF (Instruction Fetch)** : In this stage, the processor fetches the instruction from memory. This is the first step of the processor pipeline.
>+ **PC (Program Counter)** : Holds the address of the current instruction to be executed. The enable signal controls when the PC is updated.
>+ **Multiplexer**: The first multiplexer selects the source for updating the PC , particularly when a branch operation is needed., while the second multiplexer chooses the offset values for the PC.
>+ **Adder** : Adds the offset to the PC.
>+ **Instruction Memory** : The memory unit where instructions are stored and fetched from.This memory is accessed using the program counter (PC) address to retrieve the next instruction to be executed.
>+ **IF/ID Pipeline Register** : A register that transfers data between the Instruction Fetch (IF) and Instruction Decode (ID) stages.

2. **ID (Instruction Decode)** : The fetched instruction is decoded in this stage, where the processor determines the required operation and identifies the data sources.
>+ **Decode** : Decode the instructions and breaking down the instruction into its components to tell the processor which registers to use and which operation type should be done.
>
>+ **Register** : Managing data needed for computations during instruction execution. It handles operations like reading and writing register values based on the instruction being processed.
>+ **Immediate** : Extract the immediate value from the instruction.
>+ **ID/EX Pipeline Register** : A register that transfers data between the Instruction Decode (ID) and Execute (EX) stages.

3. **EX (Execute)**: During this stage, the operation specified by the instruction is executed, which could include arithmetic operations, logical operations, shifts, or branch decisions.
>+ **MUX** : To control the data flow.
>+ **ALU** : Input two Operation values and execute the particular operation such as ADD,AND...
>+ **Branch** : To decide whether the branch is taken.
>+ **EX/MEM Pipeline Register** : A register that transfers data between the Execute (EX) and Memory Access (MEM) stages.

4. **MEM (Memory Access)**: If the instruction involves data retrieval or storage, this stage handles accessing memory.
>+ **DATA memory** : Uses the ALU result as the address to determine the specific memory to access, enabling data to be stored or read.
5. **WB (Write Back)**: The final stage involves writing the result of the operation back to the register or memory for use by subsequent instructions.

:::warning
My understanding of each component is still not thorough, so the explanations in this section are not very detailed. More comprehensive descriptions of each component's operation will be added in the future.
:::
## Analysis
Take `la s0,testcase1_x` as an example: the pseudo-instruction is broken into two instructions, `auipc x8 0x10000` and `addi x8,x8,0`, which are encoded as `0x10000417` and `0x00040413`, respectively.


### Memory
First, we examine the memory, where the text is stored from bottom to top in `Little-Endian format`. The program counter points to the Address and retrieves the instruction. As examples, the first and second instructions are located at addresses 0x00000000 and 0x00000004, respectively.

### IF
For the first step,the program counter points to the `0x00000000`and the instruction memory saved the first instruction code `0x10000417` and the adder are ready to add for the next instruction.Passing the PC address `0x00000000` and instruction code `0x10000417` to the `IF/ID register`.

### ID
In the ID stage, the Decode block decodes the instruction as `AUIPC`, sets the immediate value to `0x10000`, and sets `rd` to `0x08`. The PC address `0x00000000` and rd `0x08` are then passed to the `ID/EX register`.

>Meanwhile, the `IF block` is processing the instruction `addi x8, x8, 0`. The program counter (PC) points to the address `0x00000004` where the instruction is stored, and the Instruction Memory outputs `0x00040413`.
>
### EX
Next, `ALU` add `PC` and `Imm(i.e.,0x10000000)` together and store in `EX/MEM register`

>Meanwhile, the `ID block` is processing the instruction `addi x8, x8, 0`.
>The `decode block` decode the instructions to `ADDI`, set the Immediate value to `0x00000000` and set the `rd` to `0x08`.
### MEM
The MEM stage sending the result of ALU to `MEM/WB register`.

>Meanwhile, the `EX block` is processing the instruction `addi x8, x8, 0`.
>Adding `0x10000000` and `0x00000000` together.
### WB
In the WB stage, the destination register index (rd) `0x08` and the value `PC + (0x10000 << 12) = 0x10000000` are sent back to the register file and stored in the `x8 register`.

>Meanwhile, the `MEM block` is processing the instruction `addi x8, x8, 0`.
>Sending the result of ALU to `MEM/WB register`.
### Memory and Register
After the instruction `la s0,testcase1_x` completes, the register `x8(s0)` holds the address `0x10000000`, which points to our test data `testcase1_x`.

### Execution information

## Reference
open-source project : [Convolution-of-Discrete-Sequences](https://github.com/KurienEapen/Convolution-of-Discrete-Sequences)
[Convolution](https://medium.com/@acamvproducingstudio/%E8%BC%95%E9%AC%86%E7%90%86%E8%A7%A3%E6%8D%B2%E7%A9%8D-convolution-%E8%A8%8A%E8%99%9F%E8%88%87%E7%B3%BB%E7%B5%B1%E5%A4%A7%E8%A3%9C%E5%B8%96-%E4%B8%80-2123bcf85e67)
[Convolution in 1D](https://www.algorithm-archive.org/contents/convolutions/1d/1d.html)
[Single-precision floating-point format](https://en.wikipedia.org/wiki/Single-precision_floating-point_format#IEEE_754_single-precision_binary_floating-point_format:_binary32)
[Simulating Floating Point Multiplication in C using Bitwise Operators](https://stackoverflow.com/questions/54610832/simulating-floating-point-multiplication-in-c-using-bitwise-operators)
[Five stage pipeline](https://ithelp.ithome.com.tw/m/articles/10261505)
[Float toy](https://evanw.github.io/float-toy/)
And of course,
[Computer Architecture](https://ithelp.ithome.com.tw/m/articles/10261505)
## Appendix A
Code for testcase 4
```c
.data
# Test vectors. Every FP16 bit pattern occupies its own 4-byte word
# (.4byte), so element k of a sequence lives at byte offset 4*k.
testcase1_x:
.4byte 0x3C00 0x4000 0x0000 #input x
testcase1_h:
.4byte 0x4200 0x4000 0x3c00 #input h
# expected output -> 0x4200 0x4800 0x4500 0x4000 0x0000
testcase2_x:
.4byte 0x3c00 0x4000 0x3c00 #input x
testcase2_h:
.4byte 0x4200 0x4000 0x3c00 #input h
# expected output -> 0x4200 0x4800 0x4800 0x4400 0x3c00
testcase3_x:
.4byte 0x4500 0x4400 0x3c00 #input x
testcase3_h:
.4byte 0x4000 0x3c00 0x3c00 #input h
# expected output -> 0x4900 0x4a80 0x4980 0x4500 0x3c00
testcase4_x:
.4byte 0x3c00 0x4000 0x3c00 0x4400 #input x
testcase4_h:
.4byte 0x4200 0x4000 0x3c00 #input h
# expected output -> 0x4200 0x4800 0x4800 0x4c00 0x4880 0x4400
# Output buffer: 24 zero bytes = six 4-byte slots (m + n - 1 = 6 for testcase 4)
y:
#.zero 20 #for testcase 1~3
.zero 24 #for testcase 4
# Text fragments used by the print routines
str1:.string "Input x is : "
str2:.string "Input h is : "
str3:.string " "
str4:.string "\n"
str5:.string "Output y is : "
.text
# ---------------------------------------------------------------------
# Program entry and convolution driver (testcase 4 configuration).
#
# Register roles (all live for the whole program):
#   s0 = address of x          s1 = address of h
#   s2 = m (length of x)       s3 = n (length of h)
#   s4 = i (output index)      s5 = m + n - 1 (number of outputs)
#   s6 = address of y          s7 = j (inner index)
#
# y must be zero-initialized (.zero): y[i] is only ever accumulated into.
# ---------------------------------------------------------------------
printInputInit:
    la   s0,testcase4_x
    la   s1,testcase4_h
    mv   a0,s0
    mv   a1,s1
    jal  printInput
main:
    #li s2,3                    # m(s2) = 3 for testcase 1~3
    li   s2,4                   # m(s2) = 4 for testcase 4
    li   s3,3                   # n(s3) = 3
    li   s4,0                   # i(s4) = 0
    add  s5,s2,s3
    addi s5,s5,-1               # s5 = m + n - 1
    la   s6,y                   # y array address
outer_loop:
    bge  s4,s5,end_outer_loop   # for (i = 0; i < m + n - 1; i++)
    li   s7,0                   # j = 0
inner_loop:
    blt  s4,s7,end_inner_loop   # for (j = 0; j <= i; j++)
check_j_less_m:
    blt  s7,s2,check_ij_less_n  # if (j < m ...
    j    skip_inner_loop
check_ij_less_n:
    sub  t1,s4,s7               # t1 = i - j
    blt  t1,s3,calculate        # ... && (i - j) < n)
    j    skip_inner_loop
calculate:
    slli t0,s7,2
    add  t0,t0,s0
    lw   a0,0(t0)               # a0 = x[j]
    slli t1,t1,2
    add  t1,t1,s1
    lw   a1,0(t1)               # a1 = h[i - j]
    jal  fp16_mul               # a0 = fmul16(x[j], h[i - j])
    slli t0,s4,2
    add  t0,t0,s6
    lw   a1,0(t0)               # a1 = y[i]
    jal  fp16_add               # a0 = fp16_bitwise_add(product, y[i])
    # t0 is a caller-saved temporary, so it must not be assumed to survive
    # the call: recompute &y[i] before storing the result.
    slli t0,s4,2
    add  t0,t0,s6
    sw   a0,0(t0)               # y[i] = a0
skip_inner_loop:
    addi s7,s7,1                # j++
    j    inner_loop
end_inner_loop:
    addi s4,s4,1                # i++
    j    outer_loop
end_outer_loop:
    mv   a0,s6
    jal  printOutput
    # Stop here (environment call 10) so that execution can never fall
    # through into fp16_mul, which is assembled directly after this point.
    li   a7,10
    ecall
# ---------------------------------------------------------------------
# fp16_mul: multiply two FP16 bit patterns using integer instructions.
# In:    a0 = a, a1 = b (FP16 bit patterns in the low 16 bits)
# Out:   a0 = FP16 bit pattern of a * b
# Calls: imul16
# Clobbers: t1, t2, a1 and everything imul16 clobbers; s0-s7 are preserved.
#
# Behavior notes:
#   - An operand with exponent field 0 (zero/subnormal) gives a signed zero.
#   - Exponent underflow gives signed zero, overflow gives signed infinity.
#   - Inf/NaN operands get no further special treatment.
# ---------------------------------------------------------------------
fp16_mul:
    # prologue: nine words are saved (offsets 0..32), so the frame must be
    # larger than 32 bytes; 48 keeps sp 16-byte aligned.
    addi sp,sp,-48
    sw   s0,0(sp)
    sw   s1,4(sp)
    sw   s2,8(sp)
    sw   s3,12(sp)
    sw   s4,16(sp)
    sw   s5,20(sp)
    sw   s6,24(sp)
    sw   s7,28(sp)
    sw   ra,32(sp)
    mv   s0,a0
    mv   s1,a1
    xor  t1,s0,s1
    li   t2,0x8000
    and  s2,t1,t2               # s2 = (sign_a ^ sign_b) << 15
    srli t1,s0,10
    andi s6,t1,0x1f             # exp_a(s6) = (a >> 10) & 0x1F
    srli t1,s1,10
    andi s7,t1,0x1f             # exp_b(s7) = (b >> 10) & 0x1F
    # a zero operand has no hidden bit: the product is a signed zero
    beqz s6,fp16_mul_zero
    beqz s7,fp16_mul_zero
    andi t1,s0,0x3ff
    ori  s4,t1,0x400            # mantissa_a(s4) = (a & 0x3FF) | 0x400
    andi t1,s1,0x3ff
    ori  s5,t1,0x400            # mantissa_b(s5) = (b & 0x3FF) | 0x400
    mv   a0,s4
    mv   a1,s5
    jal  ra,imul16              # ra is already saved in the frame
    srli t1,a0,10               # mantissa_result_tmp(t1) = product >> 10
    # the product of two values in [1, 2) lies in [1, 4):
    # bit 11 of t1 is set exactly when it is >= 2
    srli t2,t1,11               # mshift(t2) = 0 or 1
    srl  s4,t1,t2               # mantissa_result(s4) = tmp >> mshift
    add  s6,s6,s7
    addi s6,s6,-15              # exp_a + exp_b - 15 (FP16 bias)
    add  s6,s6,t2               # + mshift
    blez s6,fp16_mul_zero       # underflow: flush to signed zero
    li   t1,31
    bge  s6,t1,fp16_mul_inf     # overflow: saturate to signed infinity
    slli s6,s6,10
    andi s4,s4,0x3ff            # drop the hidden bit
    or   t1,s2,s6
    or   a0,t1,s4               # sign | exponent | mantissa
    j    fp16_mul_done
fp16_mul_inf:
    li   a0,0x7c00
    or   a0,a0,s2
    j    fp16_mul_done
fp16_mul_zero:
    mv   a0,s2
fp16_mul_done:
    # epilogue
    lw   s0,0(sp)
    lw   s1,4(sp)
    lw   s2,8(sp)
    lw   s3,12(sp)
    lw   s4,16(sp)
    lw   s5,20(sp)
    lw   s6,24(sp)
    lw   s7,28(sp)
    lw   ra,32(sp)
    addi sp,sp,48
    ret
# imul16: shift-and-add multiply (stands in for the `mul` instruction).
# In:  a0 = a, a1 = b (only bits 0..15 of b are examined)
# Out: a0 = sum of (a << i) over every set bit i of b, for i in 0..15
# Uses a3 as the accumulator and t1-t4 as scratch; makes no calls and does
# not touch the stack.
imul16:
#a0 -> a, a1 -> b
li a3,0
li t1,0 #set i
li t2,16 #set max i
loop:
beq t1,t2,end_loop
srl t3,a1,t1
andi t3,t3,1 #getbit(b64, i)
beqz t3,skip_loop
sll t4,a0,t1
add a3,a3,t4 #r += a64 << i;
skip_loop:
addi t1,t1,1
j loop
end_loop:
mv a0,a3
ret
fp16_add:
    # ------------------------------------------------------------------
    # uint16_t fp16_add(uint16_t a, uint16_t b)
    # In:    a0 = a, a1 = b  (IEEE half-precision bit patterns)
    # Out:   a0 = a + b      (mantissa truncated, no rounding)
    # Register map:
    #   s0/s1 = a/b, s2/s3 = sign_a/sign_b (bit 15),
    #   s4/s5 = exp_a/exp_b, s6/s7 = mant_a/mant_b with the implicit 1.
    # Clobbers only a0, a1 and t1.
    # Limits: subnormals, Inf and NaN inputs are not special-cased.
    # ------------------------------------------------------------------
    # prologue: 9 words = 36 bytes, rounded up to 48 to keep sp 16-byte aligned
    addi    sp,sp,-48
    sw      s0,0(sp)
    sw      s1,4(sp)
    sw      s2,8(sp)
    sw      s3,12(sp)
    sw      s4,16(sp)
    sw      s5,20(sp)
    sw      s6,24(sp)
    sw      s7,28(sp)
    sw      ra,32(sp)
    mv      s0,a0
    mv      s1,a1

    # x + 0 = x. Without this the implicit leading 1 added below would
    # make 0 + 0 come out as 0x0400.
    slli    t1,s0,17                # drops the sign bit: zero iff a is +/-0
    beqz    t1,fp16_add_return_b
    slli    t1,s1,17
    beqz    t1,fp16_add_return_a

    li      t1,0x8000
    and     s2,s0,t1                # sign_a = a & 0x8000
    and     s3,s1,t1                # sign_b = b & 0x8000
    li      t1,0x7C00
    and     s4,s0,t1
    srli    s4,s4,10                # exp_a = (a & 0x7C00) >> 10
    and     s5,s1,t1
    srli    s5,s5,10                # exp_b = (b & 0x7C00) >> 10
    andi    s6,s0,0x03FF
    ori     s6,s6,0x0400            # mant_a = (a & 0x03FF) | 0x0400
    andi    s7,s1,0x03FF
    ori     s7,s7,0x0400            # mant_b = (b & 0x03FF) | 0x0400

    # bring the operand with the smaller exponent up to the larger one
    beq     s4,s5,finish_align
    blt     s4,s5,align_bit_aless
    mv      a0,s7
    sub     a1,s4,s5
    jal     ra,align_mantissa
    mv      s7,a0                   # mant_b >>= exp_a - exp_b
    mv      s5,s4                   # exp_b = exp_a
    j       finish_align
align_bit_aless:
    mv      a0,s6
    sub     a1,s5,s4
    jal     ra,align_mantissa
    mv      s6,a0                   # mant_a >>= exp_b - exp_a
    mv      s4,s5                   # exp_a = exp_b
finish_align:
    # from here on: exp_result = s4, sign_result = s2, mant_result = s6
    beq     s2,s3,sign_equal
    bge     s6,s7,a_greater_b
    sub     s6,s7,s6                # |b| is larger: mant_b - mant_a
    mv      s2,s3                   # and the result takes b's sign
    j       finish_mantissa
a_greater_b:
    sub     s6,s6,s7                # mant_a - mant_b
    j       finish_mantissa
sign_equal:
    add     s6,s6,s7                # mant_a + mant_b
finish_mantissa:
    beqz    s6,fp16_add_return_zero # exact cancellation
    li      t1,0x0800
    and     t1,s6,t1
    beqz    t1,fp16_add_normalize_left
    srli    s6,s6,1                 # carry out of bit 10: mant >>= 1
    addi    s4,s4,1                 # exp_result++
    j       finish_mantissa_normalization
fp16_add_normalize_left:
    # After a subtraction the leading 1 may sit below bit 10. Shift it
    # back up; this terminates because s6 != 0 and s6 < 0x800 here.
    andi    t1,s6,0x0400
    bnez    t1,finish_mantissa_normalization
    slli    s6,s6,1
    addi    s4,s4,-1
    j       fp16_add_normalize_left
finish_mantissa_normalization:
    andi    s6,s6,0x03ff            # drop the implicit leading 1
    li      t1,31
    bge     s4,t1,exp_greater       # exp_result >= 31 -> infinity
    bge     zero,s4,exp_less        # exp_result <= 0  -> flush to zero
    j       combine_result
exp_greater:
    li      s4,31
    li      s6,0
    j       combine_result
exp_less:
    li      s4,0
    li      s6,0
combine_result:
    slli    t1,s4,10
    or      t1,s2,t1
    or      a0,t1,s6                # sign | (exp_result << 10) | mant_result
    j       fp16_add_done
fp16_add_return_a:
    mv      a0,s0
    j       fp16_add_done
fp16_add_return_b:
    mv      a0,s1
    j       fp16_add_done
fp16_add_return_zero:
    li      a0,0
fp16_add_done:
    # epilogue
    lw      s0,0(sp)
    lw      s1,4(sp)
    lw      s2,8(sp)
    lw      s3,12(sp)
    lw      s4,16(sp)
    lw      s5,20(sp)
    lw      s6,24(sp)
    lw      s7,28(sp)
    lw      ra,32(sp)
    addi    sp,sp,48
    ret
align_mantissa:
    # In:  a0 = mantissa, a1 = shift amount
    # Out: a0 = mantissa >> shift when shift > 0, otherwise a0 unchanged
    # Modifies a0 only.
    bgtz    a1,align_mantissa_shift
    ret                             # shift <= 0: nothing to do
align_mantissa_shift:
    srl     a0,a0,a1
    ret
printInput:
    # ------------------------------------------------------------------
    # Prints x[0..3] and h[0..2] through environment calls.
    # In:  a0 = address of x, a1 = address of h (one word per element)
    # Each ecall takes its service number in a7 and its argument in a0;
    # service 4 is used for the strings and 34 for the element values.
    # Clobbers a0, a7, t1, t2.
    # ------------------------------------------------------------------
    mv      t1,a0                   # t1 = x
    mv      t2,a1                   # t2 = h

    li      a7,4
    la      a0,str1
    ecall                           # "Input x is : "
    li      a7,34
    lw      a0,0(t1)
    ecall                           # x[0]
    li      a7,4
    la      a0,str3
    ecall                           # space
    li      a7,34
    lw      a0,4(t1)
    ecall                           # x[1]
    li      a7,4
    la      a0,str3
    ecall                           # space
    li      a7,34
    lw      a0,8(t1)
    ecall                           # x[2]
    li      a7,4
    la      a0,str3
    ecall                           # space
    ##### below is for testcase 4 #####
    li      a7,34
    lw      a0,12(t1)
    ecall                           # x[3]
    ###################################
    li      a7,4
    la      a0,str4
    ecall                           # newline

    li      a7,4
    la      a0,str2
    ecall                           # "Input h is : "
    li      a7,34
    lw      a0,0(t2)
    ecall                           # h[0]
    li      a7,4
    la      a0,str3
    ecall                           # space
    li      a7,34
    lw      a0,4(t2)
    ecall                           # h[1]
    li      a7,4
    la      a0,str3
    ecall                           # space
    li      a7,34
    lw      a0,8(t2)
    ecall                           # h[2]
    li      a7,4
    la      a0,str4
    ecall                           # newline
    ret
printOutput:
    # ------------------------------------------------------------------
    # Prints the header string followed by y[0..5], each value followed
    # by a space, then terminates the program.
    # In:  a0 = address of y (one word per element)
    # Does not return: the routine used to run off the end of the program
    # after the last print, so the exit environment call (a7 = 10) is now
    # issued explicitly.
    # ------------------------------------------------------------------
    mv      t1,a0                   # t1 = &y[0], walks through the array
    addi    t2,t1,24                # t2 = &y[6]: six words are printed
    la      a0,str5
    li      a7,4
    ecall                           # "Output y is : "
printOutput_next:
    lw      a0,0(t1)
    li      a7,34
    ecall                           # current y element
    la      a0,str3
    li      a7,4
    ecall                           # space
    addi    t1,t1,4
    bne     t1,t2,printOutput_next
    li      a7,10
    ecall                           # exit
```
## Appendix B
Code for Assembly program (Output FP32)
```c
.data
# Test vectors are IEEE half-precision bit patterns, each stored in its own
# 32-bit word (the code indexes the arrays with a stride of 4 bytes).
testcase1_x:
.4byte 0x3C00 0x4000 0x0000 #input x = [1, 2, 0]
testcase1_h:
.4byte 0x4200 0x4000 0x3c00 #input h = [3, 2, 1]
# expected output -> 3(0x4200) 8(0x4800) 5(0x4500) 2(0x4000) 0(0x0000)
testcase2_x:
.4byte 0x3c00 0x4000 0x3c00 #input x = [1, 2, 1]
testcase2_h:
.4byte 0x4200 0x4000 0x3c00 #input h = [3, 2, 1]
# expected output -> 3(0x4200) 8(0x4800) 8(0x4800) 4(0x4400) 1(0x3c00)
testcase3_x:
.4byte 0x4500 0x4400 0x3c00 #input x = [5, 4, 1]
testcase3_h:
.4byte 0x4000 0x3c00 0x3c00 #input h = [2, 1, 1]
# expected output -> 10(0x4900) 13(0x4a80) 11(0x4980) 5(0x4500) 1(0x3c00)
testcase4_x:
.4byte 0x3c00 0x4000 0x3c00 0x4400 #input x = [1, 2, 1, 4]
testcase4_h:
.4byte 0x4200 0x4000 0x3c00 #input h = [3, 2, 1]
# expected output -> 3(0x4200) 8(0x4800) 8(0x4800) 16(0x4c00) 9(0x4880) 4(0x4400)
# Output buffer: m + n - 1 words, zero-initialised because the convolution
# accumulates into y[i].
y:
.zero 20 #for testcase 1~3 (5 words)
#.zero 24 #for testcase 4 (6 words)
str1:.string "Input x is : "
str2:.string "Input h is : "
str3:.string " "
str4:.string "\n"
str5:.string "Output y is : "
.text
printInputInit:
    # Program entry: select the test vectors, echo them, then fall through
    # into main.
    la      s0,testcase1_x          # s0 = x
    la      s1,testcase1_h          # s1 = h
    mv      a0,s0
    mv      a1,s1
    jal     printInput
main:
    # ------------------------------------------------------------------
    # 1D discrete convolution in half precision:
    #   for i in [0, m+n-1): for j in [0, i]:
    #       if (j < m && i-j < n) y[i] = fp16_add(fp16_mul(x[j], h[i-j]), y[i])
    # s0 = x, s1 = h, s2 = m, s3 = n, s4 = i, s5 = m+n-1, s6 = y, s7 = j
    # Only callee-saved registers are kept live across the two calls; the
    # address of y[i] is recomputed after fp16_add instead of being held in
    # t0, which a callee is free to overwrite.
    # ------------------------------------------------------------------
    li      s2,3                    # m(s2) = 3 for testcase 1~3
    #li s2,4 # m(s2) = 4 for testcase 4
    li      s3,3                    # n(s3) = 3
    li      s4,0                    # i(s4) = 0
    add     s5,s2,s3
    addi    s5,s5,-1                # s5 = m + n - 1
    la      s6,y                    # y array address
outer_loop:
    bge     s4,s5,end_outer_loop    # while (i < m + n - 1)
    li      s7,0                    # j = 0
inner_loop:
    blt     s4,s7,end_inner_loop    # while (j <= i)
    bge     s7,s2,skip_inner_loop   # skip when j >= m
    sub     t1,s4,s7                # t1 = i - j
    bge     t1,s3,skip_inner_loop   # skip when i - j >= n
    slli    t0,s7,2
    add     t0,t0,s0
    lw      a0,0(t0)                # a0 = x[j]
    slli    t1,t1,2
    add     t1,t1,s1
    lw      a1,0(t1)                # a1 = h[i - j]
    jal     fp16_mul                # a0 = x[j] * h[i - j]
    slli    t0,s4,2
    add     t0,t0,s6
    lw      a1,0(t0)                # a1 = y[i]
    jal     fp16_add                # a0 = product + y[i]
    slli    t0,s4,2
    add     t0,t0,s6                # &y[i], recomputed after the call
    sw      a0,0(t0)                # y[i] = a0
skip_inner_loop:
    addi    s7,s7,1                 # j++
    j       inner_loop
end_inner_loop:
    addi    s4,s4,1                 # i++
    j       outer_loop
end_outer_loop:
    mv      a0,s6
    j       printOutput             # prints y and exits; never returns
#################################################################
fp16_mul:
    # ------------------------------------------------------------------
    # uint16_t fp16_mul(uint16_t a, uint16_t b)
    # In:    a0 = a, a1 = b  (IEEE half-precision bit patterns)
    # Out:   a0 = a * b      (mantissa truncated, no rounding)
    # Uses:  s0 = sign of the result, already shifted to bit 15
    #        s1 = biased exponent of the result
    # Clobbers a1, a3, t1-t4 (a3/t1-t4 through imul16).
    # Limits: subnormals, Inf and NaN are not special-cased, and the
    #         exponent is only masked to 5 bits (no overflow/underflow
    #         detection).
    # ------------------------------------------------------------------
    # prologue: 3 words, frame rounded up to 16 bytes
    addi    sp,sp,-16
    sw      ra,12(sp)
    sw      s0,8(sp)
    sw      s1,4(sp)

    xor     s0,a0,a1
    srli    s0,s0,15
    andi    s0,s0,1
    slli    s0,s0,15                # sign_result = (sign_a ^ sign_b) << 15

    # +/-0 times anything is a signed zero; without this the implicit
    # leading 1 added below would turn a zero operand into a non-zero value.
    slli    t1,a0,17                # drops the sign bit: zero iff a is +/-0
    beqz    t1,fp16_mul_zero
    slli    t1,a1,17
    beqz    t1,fp16_mul_zero

    srli    t1,a0,10
    andi    t1,t1,0x1f              # exp_a = (a >> 10) & 0x1F
    srli    t2,a1,10
    andi    t2,t2,0x1f              # exp_b = (b >> 10) & 0x1F
    add     s1,t1,t2
    addi    s1,s1,-15               # exp_result = exp_a + exp_b - 15

    andi    a0,a0,0x3ff
    ori     a0,a0,0x400             # mantissa_a = (a & 0x3FF) | 0x400
    andi    a1,a1,0x3ff
    ori     a1,a1,0x400             # mantissa_b = (b & 0x3FF) | 0x400
    jal     ra,imul16               # a0 = mantissa_a * mantissa_b (< 2^22)

    srli    t1,a0,10                # mantissa_result_tmp = product >> 10 (< 2^12)
    srli    t2,t1,11                # mshift = 1 when the product reached 2.0
    srl     t1,t1,t2                # renormalise back to 1.xxxxxxxxxx
    add     s1,s1,t2                # exp_result += mshift
    andi    s1,s1,0x1f
    slli    s1,s1,10
    andi    t1,t1,0x3ff             # drop the implicit leading 1
    or      a0,s0,s1
    or      a0,a0,t1                # sign | exponent | mantissa
    j       fp16_mul_done
fp16_mul_zero:
    mv      a0,s0                   # signed zero
fp16_mul_done:
    # epilogue
    lw      s1,4(sp)
    lw      s0,8(sp)
    lw      ra,12(sp)
    addi    sp,sp,16
    ret
imul16:
    # ------------------------------------------------------------------
    # Shift-and-add multiply for cores without the M extension.
    # In:  a0 = multiplicand, a1 = multiplier (only bits 0..15 are used)
    # Out: a0 = low 32 bits of a0 * (a1 & 0xFFFF)
    # Clobbers a3, t1-t4; a1 is left unchanged.
    # ------------------------------------------------------------------
    li      a3,0                    # running sum
    mv      t4,a0                   # multiplicand << i
    mv      t2,a1                   # multiplier >> i
    li      t1,16                   # multiplier bits still to examine
imul16_step:
    andi    t3,t2,1                 # current multiplier bit
    beqz    t3,imul16_next
    add     a3,a3,t4                # r += a << i
imul16_next:
    slli    t4,t4,1
    srli    t2,t2,1
    addi    t1,t1,-1
    bnez    t1,imul16_step
    mv      a0,a3
    ret
#################################################################
fp16_add:
    # ------------------------------------------------------------------
    # uint16_t fp16_add(uint16_t a, uint16_t b)
    # In:    a0 = a, a1 = b  (IEEE half-precision bit patterns)
    # Out:   a0 = a + b      (mantissa truncated, no rounding)
    # Register map:
    #   s0/s1 = a/b, s2/s3 = sign_a/sign_b (bit 15),
    #   s4/s5 = exp_a/exp_b, s6/s7 = mant_a/mant_b with the implicit 1.
    # Clobbers only a0, a1 and t1.
    # Limits: subnormals, Inf and NaN inputs are not special-cased.
    # ------------------------------------------------------------------
    # prologue: 9 words = 36 bytes, rounded up to 48 to keep sp 16-byte aligned
    addi    sp,sp,-48
    sw      s0,0(sp)
    sw      s1,4(sp)
    sw      s2,8(sp)
    sw      s3,12(sp)
    sw      s4,16(sp)
    sw      s5,20(sp)
    sw      s6,24(sp)
    sw      s7,28(sp)
    sw      ra,32(sp)
    mv      s0,a0
    mv      s1,a1

    # x + 0 = x. Without this the implicit leading 1 added below would
    # make 0 + 0 come out as 0x0400.
    slli    t1,s0,17                # drops the sign bit: zero iff a is +/-0
    beqz    t1,fp16_add_return_b
    slli    t1,s1,17
    beqz    t1,fp16_add_return_a

    li      t1,0x8000
    and     s2,s0,t1                # sign_a = a & 0x8000
    and     s3,s1,t1                # sign_b = b & 0x8000
    li      t1,0x7C00
    and     s4,s0,t1
    srli    s4,s4,10                # exp_a = (a & 0x7C00) >> 10
    and     s5,s1,t1
    srli    s5,s5,10                # exp_b = (b & 0x7C00) >> 10
    andi    s6,s0,0x03FF
    ori     s6,s6,0x0400            # mant_a = (a & 0x03FF) | 0x0400
    andi    s7,s1,0x03FF
    ori     s7,s7,0x0400            # mant_b = (b & 0x03FF) | 0x0400

    # bring the operand with the smaller exponent up to the larger one
    beq     s4,s5,finish_align
    blt     s4,s5,align_bit_aless
    mv      a0,s7
    sub     a1,s4,s5
    jal     ra,align_mantissa
    mv      s7,a0                   # mant_b >>= exp_a - exp_b
    mv      s5,s4                   # exp_b = exp_a
    j       finish_align
align_bit_aless:
    mv      a0,s6
    sub     a1,s5,s4
    jal     ra,align_mantissa
    mv      s6,a0                   # mant_a >>= exp_b - exp_a
    mv      s4,s5                   # exp_a = exp_b
finish_align:
    # from here on: exp_result = s4, sign_result = s2, mant_result = s6
    beq     s2,s3,sign_equal
    bge     s6,s7,a_greater_b
    sub     s6,s7,s6                # |b| is larger: mant_b - mant_a
    mv      s2,s3                   # and the result takes b's sign
    j       finish_mantissa
a_greater_b:
    sub     s6,s6,s7                # mant_a - mant_b
    j       finish_mantissa
sign_equal:
    add     s6,s6,s7                # mant_a + mant_b
finish_mantissa:
    beqz    s6,fp16_add_return_zero # exact cancellation
    li      t1,0x0800
    and     t1,s6,t1
    beqz    t1,fp16_add_normalize_left
    srli    s6,s6,1                 # carry out of bit 10: mant >>= 1
    addi    s4,s4,1                 # exp_result++
    j       finish_mantissa_normalization
fp16_add_normalize_left:
    # After a subtraction the leading 1 may sit below bit 10. Shift it
    # back up; this terminates because s6 != 0 and s6 < 0x800 here.
    andi    t1,s6,0x0400
    bnez    t1,finish_mantissa_normalization
    slli    s6,s6,1
    addi    s4,s4,-1
    j       fp16_add_normalize_left
finish_mantissa_normalization:
    andi    s6,s6,0x03ff            # drop the implicit leading 1
    li      t1,31
    bge     s4,t1,exp_greater       # exp_result >= 31 -> infinity
    bge     zero,s4,exp_less        # exp_result <= 0  -> flush to zero
    j       combine_result
exp_greater:
    li      s4,31
    li      s6,0
    j       combine_result
exp_less:
    li      s4,0
    li      s6,0
combine_result:
    slli    t1,s4,10
    or      t1,s2,t1
    or      a0,t1,s6                # sign | (exp_result << 10) | mant_result
    j       fp16_add_done
fp16_add_return_a:
    mv      a0,s0
    j       fp16_add_done
fp16_add_return_b:
    mv      a0,s1
    j       fp16_add_done
fp16_add_return_zero:
    li      a0,0
fp16_add_done:
    # epilogue
    lw      s0,0(sp)
    lw      s1,4(sp)
    lw      s2,8(sp)
    lw      s3,12(sp)
    lw      s4,16(sp)
    lw      s5,20(sp)
    lw      s6,24(sp)
    lw      s7,28(sp)
    lw      ra,32(sp)
    addi    sp,sp,48
    ret
align_mantissa:
    # In:  a0 = mantissa, a1 = shift amount
    # Out: a0 = mantissa >> shift when shift > 0, otherwise a0 unchanged
    # Modifies a0 only.
    bgtz    a1,align_mantissa_shift
    ret                             # shift <= 0: nothing to do
align_mantissa_shift:
    srl     a0,a0,a1
    ret
#################################################################
printInput:
    # ------------------------------------------------------------------
    # Prints x[0..2] and h[0..2] through environment calls.
    # In:  a0 = address of x, a1 = address of h (one word per element)
    # Each ecall takes its service number in a7 and its argument in a0;
    # service 4 is used for the strings and 34 for the element values.
    # Clobbers a0, a7, t1, t2.
    # ------------------------------------------------------------------
    mv      t1,a0                   # t1 = x
    mv      t2,a1                   # t2 = h

    li      a7,4
    la      a0,str1
    ecall                           # "Input x is : "
    li      a7,34
    lw      a0,0(t1)
    ecall                           # x[0]
    li      a7,4
    la      a0,str3
    ecall                           # space
    li      a7,34
    lw      a0,4(t1)
    ecall                           # x[1]
    li      a7,4
    la      a0,str3
    ecall                           # space
    li      a7,34
    lw      a0,8(t1)
    ecall                           # x[2]
    li      a7,4
    la      a0,str3
    ecall                           # space
    ##### enable the next three lines for testcase 4 #####
    #li a7,34
    #lw a0,12(t1)
    #ecall #print x[3]
    ######################################################
    li      a7,4
    la      a0,str4
    ecall                           # newline

    li      a7,4
    la      a0,str2
    ecall                           # "Input h is : "
    li      a7,34
    lw      a0,0(t2)
    ecall                           # h[0]
    li      a7,4
    la      a0,str3
    ecall                           # space
    li      a7,34
    lw      a0,4(t2)
    ecall                           # h[1]
    li      a7,4
    la      a0,str3
    ecall                           # space
    li      a7,34
    lw      a0,8(t2)
    ecall                           # h[2]
    li      a7,4
    la      a0,str4
    ecall                           # newline
    ret
#################################################################
printOutput:
    # ------------------------------------------------------------------
    # Prints the header string followed by y[0..4]; each half-precision
    # value is widened with fp16_to_fp32 and printed with the float
    # service (a7 = 2), followed by a space. Then the program exits.
    # In:  a0 = address of y (one word per element)
    # Does not return, so s8/s9 are used without being saved. They are
    # callee-saved, which keeps the cursor valid across the call to
    # fp16_to_fp32.
    # ------------------------------------------------------------------
    mv      s8,a0                   # s8 = &y[0], walks through the array
    addi    s9,s8,20                # s9 = end: 5 words for testcase 1~3
    #addi s9,s8,24                  # 6 words for testcase 4
    la      a0,str5
    li      a7,4
    ecall                           # "Output y is : "
printOutput_next:
    lw      a0,0(s8)
    jal     ra,fp16_to_fp32         # a0 = single-precision bit pattern
    li      a7,2
    ecall                           # current y element as a float
    la      a0,str3
    li      a7,4
    ecall                           # space
    addi    s8,s8,4
    bne     s8,s9,printOutput_next
    li      a7,10
    ecall                           # exit
#################################################################
fp16_to_fp32:
    # ------------------------------------------------------------------
    # uint32_t fp16_to_fp32(uint16_t h)
    # In:  a0 = half-precision bit pattern (bits 15..0)
    # Out: a0 = single-precision bit pattern of the same value
    # Branch-free bit manipulation; renorm_shift lets subnormal halves be
    # normalised, inf_nan_mask forces exponent 0xFF for Inf/NaN and
    # zero_mask forces +/-0 for a zero input. Both masks rely on an
    # ARITHMETIC right shift, hence srai.
    # Clobbers t0-t4. No s-registers are used.
    # ------------------------------------------------------------------
    # prologue: 16-byte frame; 12(sp) = ra, 8(sp) = w, 0..7(sp) = padding
    addi    sp,sp,-16
    sw      ra,12(sp)
    slli    t0,a0,16                # w = (uint32_t)h << 16
    sw      t0,8(sp)                # w must survive the two calls below
    slli    a0,t0,1
    srli    a0,a0,1                 # nonsign = w & 0x7FFFFFFF
    jal     ra,my_clz               # a0 = clz(nonsign)
    jal     ra,renorm_if            # a0 = renorm_shift (clz > 5 ? clz - 5 : 0)

    lw      t0,8(sp)                # t0 = w
    li      t1,0x80000000
    and     t3,t0,t1                # t3 = sign    = w & 0x80000000
    xor     t0,t0,t3                # t0 = nonsign = w & 0x7FFFFFFF

    sll     t2,t0,a0
    srli    t2,t2,3                 # nonsign << renorm_shift >> 3
    li      t1,0x70
    sub     t1,t1,a0
    slli    t1,t1,23                # (0x70 - renorm_shift) << 23
    add     t2,t2,t1

    li      t1,0x04000000
    add     t1,t0,t1                # reaches bit 31 only when the exponent is all ones
    srai    t1,t1,8                 # (int32_t)(nonsign + 0x04000000) >> 8
    li      t4,0x7F800000
    and     t1,t1,t4                # inf_nan_mask
    or      t2,t2,t1

    addi    t1,t0,-1
    srai    t1,t1,31                # zero_mask = (int32_t)(nonsign - 1) >> 31
    xori    t1,t1,-1                # ~zero_mask
    and     t2,t2,t1

    or      a0,t3,t2                # sign | magnitude
    # epilogue
    lw      ra,12(sp)
    addi    sp,sp,16
    ret
my_clz:
    # ------------------------------------------------------------------
    # In:  a0 = 32-bit value
    # Out: a0 = number of leading zero bits (32 when the value is 0)
    # Modifies a0 only: s0 is the single working register and is saved
    # inside this routine's own 16-byte frame.
    # ------------------------------------------------------------------
    addi    sp,sp,-16
    sw      s0,0(sp)
    li      s0,0                    # count
    beqz    a0,clz_all_zero
clz_loop:
    bltz    a0,clz_done             # bit 31 is set: count is final
    slli    a0,a0,1                 # bring the next bit up to bit 31
    addi    s0,s0,1
    j       clz_loop                # terminates because a0 != 0 here
clz_all_zero:
    li      s0,32
clz_done:
    mv      a0,s0
    lw      s0,0(sp)
    addi    sp,sp,16
    ret
renorm_if:
    # In:  a0 = leading-zero count
    # Out: a0 = a0 - 5 when a0 > 5 (signed compare), otherwise 0
    # Clobbers t0.
    li      t0,5
    bge     t0,a0,renorm_if_zero    # a0 <= 5
    sub     a0,a0,t0
    ret
renorm_if_zero:
    li      a0,0
    ret
```