# Assignment1: RISC-V Assembly and Instruction Pipeline
contributed by <`jychen0611`> (Jia-Yang, Chen)
## quiz1 - problem C
:::danger
Choose one problem (A, B, or C) from [Quiz1](https://hackmd.io/@sysprog/arch2024-quiz1-sol), translate it from C code to a complete RISC-V assembly program, and include the relevant test data.
:::
### C code
```c
static inline float fabsf(float x) {
uint32_t i = *(uint32_t *)&x; // Read the bits of the float into an integer
i &= 0x7FFFFFFF; // Clear the sign bit to get the absolute value
x = *(float *)&i; // Write the modified bits back into the float
return x;
}
```
```c
static inline int my_clz(uint32_t x) {
int count = 0;
for (int i = 31; i >= 0; --i) {
if (x & (1U << i))
break;
count++;
}
return count;
}
```
```c
static inline uint32_t fp16_to_fp32(uint16_t h) {
/*
* Extends the 16-bit half-precision floating-point number to 32 bits
* by shifting it to the upper half of a 32-bit word:
* +---+-----+------------+-------------------+
* | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
* +---+-----+------------+-------------------+
* Bits 31 26-30 16-25 0-15
*
* S - sign bit, E - exponent bits, M - mantissa bits, 0 - zero bits.
*/
const uint32_t w = (uint32_t) h << 16;
/*
* Isolates the sign bit from the input number, placing it in the most
* significant bit of a 32-bit word:
*
* +---+----------------------------------+
* | S |0000000 00000000 00000000 00000000|
* +---+----------------------------------+
* Bits 31 0-31
*/
const uint32_t sign = w & UINT32_C(0x80000000);
/*
* Extracts the mantissa and exponent from the input number, placing
* them in bits 0-30 of the 32-bit word:
*
* +---+-----+------------+-------------------+
* | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
* +---+-----+------------+-------------------+
* Bits 30 27-31 17-26 0-16
*/
const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
/*
* The renorm_shift variable indicates how many bits the mantissa
* needs to be shifted to normalize the half-precision number.
* For normalized numbers, renorm_shift will be 0. For denormalized
* numbers, renorm_shift will be greater than 0. Shifting a
* denormalized number will move the mantissa into the exponent,
* normalizing it.
*/
uint32_t renorm_shift = my_clz(nonsign);
renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0;
/*
* If the half-precision number has an exponent of 15, adding a
* specific value will cause overflow into bit 31, which converts
* the upper 9 bits into ones. Thus:
* inf_nan_mask ==
* 0x7F800000 if the half-precision number is
* NaN or infinity (exponent of 15)
* 0x00000000 otherwise
*/
const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) &
INT32_C(0x7F800000);
/*
* If nonsign equals 0, subtracting 1 will cause overflow, setting
* bit 31 to 1. Otherwise, bit 31 will be 0. Shifting this result
* propagates bit 31 across all bits in zero_mask. Thus:
* zero_mask ==
* 0xFFFFFFFF if the half-precision number is
* zero (+0.0h or -0.0h)
* 0x00000000 otherwise
*/
const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31;
/*
* 1. Shifts nonsign left by renorm_shift to normalize it (for denormal
* inputs).
* 2. Shifts nonsign right by 3, adjusting the exponent to fit in the
* 8-bit exponent field and moving the mantissa into the correct
* position within the 23-bit mantissa field of the single-precision
* format.
* 3. Adds 0x70 to the exponent to account for the difference in bias
* between half-precision and single-precision.
* 4. Subtracts renorm_shift from the exponent to account for any
* renormalization that occurred.
* 5. ORs with inf_nan_mask to set the exponent to 0xFF if the input
* was NaN or infinity.
* 6. ANDs with the inverted zero_mask to set the mantissa and exponent
* to zero if the input was zero.
* 7. Combines everything with the sign bit of the input number.
*/
return sign | ((((nonsign << renorm_shift >> 3) +
((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask);
}
```
:::danger
Show the full C source code corresponding to the original problem set.
:::
### Assembly code
```s
fabsf:
# prologue
addi sp, sp, -4
sw ra, 0(sp)
# i &= 0x7FFFFFFF;
li t0, 0x7FFFFFFF
and a0, a0, t0
# epilogue
lw ra, 0(sp)
addi sp, sp, 4
ret
```
```s
my_clz:
# prologue
addi sp, sp, -4
sw ra, 0(sp)
li t0, 0 # count = 0
li t1, 31 # i = 31
li t2, 0x80000000
my_clz_loop:
# if(x&(1U<<i)) break;
and t3, a0, t2
bnez t3, my_clz_exit
srli t2, t2, 1 # t2 >>= 1
addi t0, t0, 1 # count++
addi t1, t1, -1 # --i
bgez t1, my_clz_loop # while i>=0 --> loop
my_clz_exit:
mv a0, t0 # return count
# epilogue
lw ra, 0(sp)
addi sp, sp, 4
ret
```
```s
fp16_to_fp32:
# prologue
addi sp, sp, -4
sw ra, 0(sp)
# const uint32_t w = (uint32_t) h << 16;
mv t0, a0 # w
slli t0, t0, 16
# const uint32_t sign = w & UINT32_C(0x80000000);
mv t1, t0 # sign
li t3, 0x80000000
and t1, t1, t3
# const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
mv t2, t0 # nonsign
li t3, 0x7FFFFFFF
and t2, t2, t3
# uint32_t renorm_shift = my_clz(nonsign);
mv a0, t2
jal my_clz
mv t4, a0 # renorm_shift
# renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0;
addi t4, t4, -5
bgtz t4, fp16_to_fp32_bigger_than_five
li t4, 0
fp16_to_fp32_bigger_than_five:
# const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000);
mv t5, t2 # inf_nan_mask
li t3, 0x04000000
add t5, t5, t3
srli t5, t5, 8
li t3, 0x7F800000
and t5, t5, t3
# const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31;
mv t6, t2 # zero_mask
addi t6, t6, -1
srli t6, t6, 31
# return sign | ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask);
sll a0, t2, t4
srli a0, a0, 3
li t3, 0x70
sub t3, t3, t4
slli t3, t3, 23
add a0, a0, t3
or a0, a0, t5
not t6, t6
and a0, a0, t6
or a0, t1, a0
# epilogue
lw ra, 0(sp)
addi sp, sp, 4
ret
```
## Use case ([power of four](https://leetcode.com/problems/power-of-four/description/))
>Given an integer n, return true if it is a power of four. Otherwise, return false.
>An integer n is a power of four, if there exists an integer x such that n == $4^x$.
>$-2^{31} <= n <= 2^{31} - 1$
* If a number n is a power of 4, then it must have only one bit set, and that bit must be in an odd position.
* Thus, `n&(n-1)` must be zero, and `32 - my_clz(n) - 1` must be even.
```c
static inline int my_clz(uint32_t x) {
int count = 0;
for (int i = 31; i >= 0; --i) {
if (x & (1U << i))
break;
count++;
}
return count;
}
bool isPowerOfFour(uint32_t n) {
if(n<=0)
return false;
/* in odd position && only one bit set(is power of two ) */
return !((32 - my_clz(n) - 1) % 2) && !(n&(n-1));
}
```
```s
.data
.text
j main
my_clz:
# prologue
addi sp, sp, -4
sw ra, 0(sp)
li t0, 0 # count = 0
li t1, 31 # i = 31
li t2, 0x80000000
my_clz_loop:
# if(x&(1U<<i)) break;
and t3, a0, t2
bnez t3, my_clz_exit
srli t2, t2, 1 # t2 >>= 1
addi t0, t0, 1 # count++
addi t1, t1, -1 # --i
bgez t1, my_clz_loop # while i>=0 --> loop
my_clz_exit:
mv a0, t0 # return count
# epilogue
lw ra, 0(sp)
addi sp, sp, 4
ret
is_pow_of_4:
# prologue
addi sp, sp, -8
sw ra, 0(sp)
sw s0, 4(sp)
blez a0, is_pow_of_4_lez
mv s0, a0 # s0 = n
jal my_clz
mv t0, a0 # t0 = lz
addi t1, s0, -1
and t1, s0, t1
beqz t1, is_pow_of_2
li t1, 1
is_pow_of_2:
xori t1, t1, 1 # t1 = is_pow_of_two
li t2, 31
sub t2, t2, t0
andi t2, t2, 1
xori t2, t2, 1 # t2 = is_in_odd_position
and a0, t1, t2
j is_pow_of_4_exit
is_pow_of_4_lez:
li a0, 0
is_pow_of_4_exit:
# epilogue
lw s0, 4(sp)
lw ra, 0(sp)
addi sp, sp, 8
ret
main:
li a0, 16
jal is_pow_of_4
```

## Use case (p-Norm)
Given an `3-dimension` floating point vector `v` and an integer `p`, return its p-Norm.
* $1<= p <= 2$
### p-Norm
$||v||_{p} = (\sum_{i=1}^{n}{|x_i|}^p)^\frac{1}{p}$
### C code
$e^x = 1 + \frac{x}{1!} + \frac{x^2}{2!} + \frac{x^3}{3!}+ ...$
```c
static inline float my_exp(float x) {
float result = 1.0f;
float term = 1.0f;
for(int n = 1; n <= 20; n++){
term *= x / n;
result += term;
}
return result;
}
```
$ln(x)=2\sum_{n=0}^{\infty}\frac{1}{2n+1}(\frac{x-1}{x+1})^{2n+1}$
```c
static inline float my_ln(float x) {
if(x <= 0.0f)
return -1.0f;
float result = 0.0f;
float term = (x - 1) / (x + 1);
float term_squared = term * term;
float current_term = term;
for(int n = 1; n <= 100; n += 2){
result += current_term / n;
current_term *= term_squared;
}
return 2 * result;
}
```
$base^{fraction} = e^{fraction\ *\ ln(base)}$
```c
static inline float my_pow(float base, float exponent){
if(base == 0.0f && exponent < 0.0f)
return 0.0f;
if(exponent == 0.0f)
return 1.0f;
float result = 1.0f;
int exp_int = (int)exponent;
float fraction = exponent - exp_int;
if(exp_int < 0){
base = 1.0f / base;
exp_int = -exp_int;
}
for(int i = 0; i < exp_int; ++i)
result *= base;
if(fraction > 0)
result *= my_exp(fraction * my_ln(base));
return result;
}
```
```c
static inline float my_root(float x, int n){
return my_pow(x, 1.0 / n);
}
```
```c
static inline float find_norm(float* v, int p){
float res = 0.0f;
for(int i=0;i<2;++i){
res += my_pow(fabsf(v[i]), p);
}
res = my_root(res, p);
return res;
}
```
### e.g.
:::danger
You should automate the testing procedures as much as possible, meaning there's no need to manually check each program output. Instead, implement internal validation to handle the result checking.
:::
TEST_CASE1 {1.0, 2.0, 3.0}

TEST_CASE2 {1.0, -2.0, 3.0}

TEST_CASE3 {0.0, -1.0, -7.0}

```shell
$ ./pnorm
TEST_CASE1 {1.0, 2.0, 3.0}
1-norm : 6.000000
2-norm : 3.741657
TEST_CASE2 {1.0, -2.0, 3.0}
1-norm : 6.000000
2-norm : 3.741657
TEST_CASE3 {0.0, -1.0, -7.0}
1-norm : 8.000000
2-norm : 7.057732
```
### Rewrite using `fmul32`, `fdiv32`, and `fadd32`
#### fmul32
```c
/* fmul32 */
float fmul32(float a, float b)
{
int32_t ia = *(int32_t *)&a, ib = *(int32_t *)&b;
if (ia == 0 || ib == 0) return 0;
int32_t exp_a = (ia & 0x7F800000) >> 23;
int32_t exp_b = (ib & 0x7F800000) >> 23;
int32_t fra_a = ia & 0x7FFFFF;
int32_t fra_b = ib & 0x7FFFFF;
/* TODO: Special values like NaN and INF */
if((exp_a == 255 && fra_a == 0 ) || (exp_b == 255 && fra_b == 0) ){
printf("Infinity value!\n");
}
if((exp_a == 255 && fra_a != 0 ) || (exp_b == 255 && fra_b != 0) ){
printf("NaN value!\n");
}
/* mantissa */
int32_t ma = (ia & 0x7FFFFF) | 0x800000;
int32_t mb = (ib & 0x7FFFFF) | 0x800000;
int32_t sea = ia & 0xFF800000;
int32_t seb = ib & 0xFF800000;
/* result of mantissa */
int32_t m = imul24(ma, mb);
int32_t mshift = getbit(m, 24);
m >>= mshift;
int32_t r = ((sea - 0x3f800000 + seb) & 0xFF800000) + m - (0x800000 & -!mshift);
int32_t ovfl = (r ^ ia ^ ib) >> 31;
r = r ^ ( (r ^ 0x7f800000) & ovfl);
return *(float *)&r;
}
```
```c
/* my_exp */
static inline float my_exp(float x) {
float result = 1.0f;
float term = 1.0f;
for(int n = 1; n <= 5; n++){
term = fmul32(term, fdiv32(x, (float)n));
result = fadd32(result, term);
}
return result;
}
/* my_ln */
static inline float my_ln(float x) {
if(x <= 0.0f)
return -1.0f;
if(x == 1.0f)
return 0.0f;
if(x <= 2.8f)
return 1.0f;
float result = 0.0f;
float term = fdiv32(fadd32(x, -1.0f), fadd32(x, 1.0f));
float term_squared = fmul32(term, term);
float current_term = term;
for(int n = 1; n <= 100; n += 2){
result = fadd32(result, fdiv32(current_term, n));
current_term = fmul32(current_term, term_squared);
}
return fmul32(2.0f, result);
}
/* my_pow */
static inline float my_pow(float base, float exponent){
if(base == 0.0f && exponent < 0.0f)
return 0.0f;
if(exponent == 0.0f)
return 1.0f;
float result = 1.0f;
int exp_int = (int)exponent;
float fraction = fadd32(exponent, (float)-exp_int);
if(exp_int < 0){
base = fdiv32(1.0f, base);
exp_int = -exp_int;
}
for(int i = 0; i < exp_int; ++i)
result = fmul32(result, base);
if(fraction > 0)
result = fmul32(result, my_exp(fmul32(fraction, my_ln(base))));
return result;
}
/* my_root */
static inline float my_root(float x, int n){
return my_pow(x, fdiv32(1.0f, (float)n));
}
/* find_norm */
static inline float find_norm(float* v, int n, int p){
float res = 0.0f;
for(int i=0;i<n;++i){
float pow_val = my_pow(fabsf(v[i]), p);
res = fadd32(res, pow_val);
}
res = my_root(res, p);
return res;
}
```
```shell
$ ./pnorm
TEST_CASE1 {1.0, 2.0, 3.0}
1-norm : 6.000000
2-norm : 3.732667
TEST_CASE2 {1.0, -2.0, 3.0}
1-norm : 6.000000
2-norm : 3.732667
TEST_CASE3 {0.0, -1.0, -7.0}
1-norm : 8.000001
2-norm : 6.952097
```
### Assembly code
```s
```
## Analysis
## Reference
* [Lp-space](https://en.wikipedia.org/wiki/Lp_space)
* [Taylor series](https://en.wikipedia.org/wiki/Taylor_series)
* [Newton's method](https://en.wikipedia.org/wiki/Newton%27s_method)
* [Vector Norm Calculator](https://www.inchcalculator.com/vector-norm-calculator/)
* [Get sine value without floating point multiplication support](https://hackmd.io/@ranvd/computer-arch-hw1)
* [Quiz1 of Computer Architecture (2024 Fall)](https://hackmd.io/@sysprog/arch2024-quiz1-sol)