contributed by <jychen0611
> (Jia-Yang, Chen)
Choose one problem (A, B, or C) from Quiz1, translate it from C code to a complete RISC-V assembly program, and include the relevant test data.
static inline float fabsf(float x) {
uint32_t i = *(uint32_t *)&x; // Read the bits of the float into an integer
i &= 0x7FFFFFFF; // Clear the sign bit to get the absolute value
x = *(float *)&i; // Write the modified bits back into the float
return x;
}
static inline int my_clz(uint32_t x) {
int count = 0;
for (int i = 31; i >= 0; --i) {
if (x & (1U << i))
break;
count++;
}
return count;
}
static inline uint32_t fp16_to_fp32(uint16_t h) {
/*
* Extends the 16-bit half-precision floating-point number to 32 bits
* by shifting it to the upper half of a 32-bit word:
* +---+-----+------------+-------------------+
* | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
* +---+-----+------------+-------------------+
* Bits 31 26-30 16-25 0-15
*
* S - sign bit, E - exponent bits, M - mantissa bits, 0 - zero bits.
*/
const uint32_t w = (uint32_t) h << 16;
/*
* Isolates the sign bit from the input number, placing it in the most
* significant bit of a 32-bit word:
*
* +---+----------------------------------+
* | S |0000000 00000000 00000000 00000000|
* +---+----------------------------------+
* Bits 31 0-31
*/
const uint32_t sign = w & UINT32_C(0x80000000);
/*
* Extracts the mantissa and exponent from the input number, placing
* them in bits 0-30 of the 32-bit word:
*
* +---+-----+------------+-------------------+
* | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
* +---+-----+------------+-------------------+
* Bits 30 27-31 17-26 0-16
*/
const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
/*
* The renorm_shift variable indicates how many bits the mantissa
* needs to be shifted to normalize the half-precision number.
* For normalized numbers, renorm_shift will be 0. For denormalized
* numbers, renorm_shift will be greater than 0. Shifting a
* denormalized number will move the mantissa into the exponent,
* normalizing it.
*/
uint32_t renorm_shift = my_clz(nonsign);
renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0;
/*
* If the half-precision number has an exponent of 15, adding a
* specific value will cause overflow into bit 31, which converts
* the upper 9 bits into ones. Thus:
* inf_nan_mask ==
* 0x7F800000 if the half-precision number is
* NaN or infinity (exponent of 15)
* 0x00000000 otherwise
*/
const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) &
INT32_C(0x7F800000);
/*
* If nonsign equals 0, subtracting 1 will cause overflow, setting
* bit 31 to 1. Otherwise, bit 31 will be 0. Shifting this result
* propagates bit 31 across all bits in zero_mask. Thus:
* zero_mask ==
* 0xFFFFFFFF if the half-precision number is
* zero (+0.0h or -0.0h)
* 0x00000000 otherwise
*/
const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31;
/*
* 1. Shifts nonsign left by renorm_shift to normalize it (for denormal
* inputs).
* 2. Shifts nonsign right by 3, adjusting the exponent to fit in the
* 8-bit exponent field and moving the mantissa into the correct
* position within the 23-bit mantissa field of the single-precision
* format.
* 3. Adds 0x70 to the exponent to account for the difference in bias
* between half-precision and single-precision.
* 4. Subtracts renorm_shift from the exponent to account for any
* renormalization that occurred.
* 5. ORs with inf_nan_mask to set the exponent to 0xFF if the input
* was NaN or infinity.
* 6. ANDs with the inverted zero_mask to set the mantissa and exponent
* to zero if the input was zero.
* 7. Combines everything with the sign bit of the input number.
*/
return sign | ((((nonsign << renorm_shift >> 3) +
((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask);
}
Show the full C source code corresponding to the original problem set.
fabsf:
# prologue
addi sp, sp, -4
sw ra, 0(sp)
# i &= 0x7FFFFFFF;
li t0, 0x7FFFFFFF
and a0, a0, t0
# epilogue
lw ra, 0(sp)
addi sp, sp, 4
ret
my_clz:
# prologue
addi sp, sp, -4
sw ra, 0(sp)
li t0, 0 # count = 0
li t1, 31 # i = 31
li t2, 0x80000000
my_clz_loop:
# if(x&(1U<<i)) break;
and t3, a0, t2
bnez t3, my_clz_exit
srli t2, t2, 1 # t2 >>= 1
addi t0, t0, 1 # count++
addi t1, t1, -1 # --i
bgez t1, my_clz_loop # while i>=0 --> loop
my_clz_exit:
mv a0, t0 # return count
# epilogue
lw ra, 0(sp)
addi sp, sp, 4
ret
fp16_to_fp32:
# prologue
addi sp, sp, -4
sw ra, 0(sp)
# const uint32_t w = (uint32_t) h << 16;
mv t0, a0 # w
slli t0, t0, 16
# const uint32_t sign = w & UINT32_C(0x80000000);
mv t1, t0 # sign
li t3, 0x80000000
and t1, t1, t3
# const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
mv t2, t0 # nonsign
li t3, 0x7FFFFFFF
and t2, t2, t3
# uint32_t renorm_shift = my_clz(nonsign);
mv a0, t2
jal my_clz
mv t4, a0 # renorm_shift
# renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0;
addi t4, t4, -5
bgtz t4, fp16_to_fp32_bigger_than_five
li t4, 0
fp16_to_fp32_bigger_than_five:
# const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000);
mv t5, t2 # inf_nan_mask
li t3, 0x04000000
add t5, t5, t3
srli t5, t5, 8
li t3, 0x7F800000
and t5, t5, t3
# const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31;
mv t6, t2 # zero_mask
addi t6, t6, -1
srli t6, t6, 31
# return sign | ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask);
sll a0, t2, t4
srli a0, a0, 3
li t3, 0x70
sub t3, t3, t4
slli t3, t3, 23
add a0, a0, t3
or a0, a0, t5
not t6, t6
and a0, a0, t6
or a0, t1, a0
# epilogue
lw ra, 0(sp)
addi sp, sp, 4
ret
Given an integer n, return true if it is a power of four. Otherwise, return false.
An integer n is a power of four, if there exists an integer x such that n ==
.
n&(n-1)
must be zero, and 32 - my_clz(n) - 1
must be even.static inline int my_clz(uint32_t x) {
int count = 0;
for (int i = 31; i >= 0; --i) {
if (x & (1U << i))
break;
count++;
}
return count;
}
bool isPowerOfFour(uint32_t n) {
if(n<=0)
return false;
/* in odd position && only one bit set(is power of two ) */
return !((32 - my_clz(n) - 1) % 2) && !(n&(n-1));
}
.data
.text
j main
my_clz:
# prologue
addi sp, sp, -4
sw ra, 0(sp)
li t0, 0 # count = 0
li t1, 31 # i = 31
li t2, 0x80000000
my_clz_loop:
# if(x&(1U<<i)) break;
and t3, a0, t2
bnez t3, my_clz_exit
srli t2, t2, 1 # t2 >>= 1
addi t0, t0, 1 # count++
addi t1, t1, -1 # --i
bgez t1, my_clz_loop # while i>=0 --> loop
my_clz_exit:
mv a0, t0 # return count
# epilogue
lw ra, 0(sp)
addi sp, sp, 4
ret
is_pow_of_4:
# prologue
addi sp, sp, -8
sw ra, 0(sp)
sw s0, 4(sp)
blez a0, is_pow_of_4_lez
mv s0, a0 # s0 = n
jal my_clz
mv t0, a0 # t0 = lz
addi t1, s0, -1
and t1, s0, t1
beqz t1, is_pow_of_2
li t1, 1
is_pow_of_2:
xori t1, t1, 1 # t1 = is_pow_of_two
li t2, 31
sub t2, t2, t0
andi t2, t2, 1
xori t2, t2, 1 # t2 = is_in_odd_position
and a0, t1, t2
j is_pow_of_4_exit
is_pow_of_4_lez:
li a0, 0
is_pow_of_4_exit:
# epilogue
lw s0, 4(sp)
lw ra, 0(sp)
addi sp, sp, 8
ret
main:
li a0, 16
jal is_pow_of_4
Given an 3-dimension
floating point vector v
and an integer p
, return its p-Norm.
static inline float my_exp(float x) {
float result = 1.0f;
float term = 1.0f;
for(int n = 1; n <= 20; n++){
term *= x / n;
result += term;
}
return result;
}
static inline float my_ln(float x) {
if(x <= 0.0f)
return -1.0f;
float result = 0.0f;
float term = (x - 1) / (x + 1);
float term_squared = term * term;
float current_term = term;
for(int n = 1; n <= 100; n += 2){
result += current_term / n;
current_term *= term_squared;
}
return 2 * result;
}
static inline float my_pow(float base, float exponent){
if(base == 0.0f && exponent < 0.0f)
return 0.0f;
if(exponent == 0.0f)
return 1.0f;
float result = 1.0f;
int exp_int = (int)exponent;
float fraction = exponent - exp_int;
if(exp_int < 0){
base = 1.0f / base;
exp_int = -exp_int;
}
for(int i = 0; i < exp_int; ++i)
result *= base;
if(fraction > 0)
result *= my_exp(fraction * my_ln(base));
return result;
}
static inline float my_root(float x, int n){
return my_pow(x, 1.0 / n);
}
static inline float find_norm(float* v, int p){
float res = 0.0f;
for(int i=0;i<2;++i){
res += my_pow(fabsf(v[i]), p);
}
res = my_root(res, p);
return res;
}
You should automate the testing procedures as much as possible, meaning there's no need to manually check each program output. Instead, implement internal validation to handle the result checking.
TEST_CASE1 {1.0, 2.0, 3.0}
TEST_CASE2 {1.0, -2.0, 3.0}
TEST_CASE3 {0.0, -1.0, -7.0}
$ ./pnorm
TEST_CASE1 {1.0, 2.0, 3.0}
1-norm : 6.000000
2-norm : 3.741657
TEST_CASE2 {1.0, -2.0, 3.0}
1-norm : 6.000000
2-norm : 3.741657
TEST_CASE3 {0.0, -1.0, -7.0}
1-norm : 8.000000
2-norm : 7.057732
fmul32
, fdiv32
, and fadd32
/* fmul32 */
float fmul32(float a, float b)
{
int32_t ia = *(int32_t *)&a, ib = *(int32_t *)&b;
if (ia == 0 || ib == 0) return 0;
int32_t exp_a = (ia & 0x7F800000) >> 23;
int32_t exp_b = (ib & 0x7F800000) >> 23;
int32_t fra_a = ia & 0x7FFFFF;
int32_t fra_b = ib & 0x7FFFFF;
/* TODO: Special values like NaN and INF */
if((exp_a == 255 && fra_a == 0 ) || (exp_b == 255 && fra_b == 0) ){
printf("Infinity value!\n");
}
if((exp_a == 255 && fra_a != 0 ) || (exp_b == 255 && fra_b != 0) ){
printf("NaN value!\n");
}
/* mantissa */
int32_t ma = (ia & 0x7FFFFF) | 0x800000;
int32_t mb = (ib & 0x7FFFFF) | 0x800000;
int32_t sea = ia & 0xFF800000;
int32_t seb = ib & 0xFF800000;
/* result of mantissa */
int32_t m = imul24(ma, mb);
int32_t mshift = getbit(m, 24);
m >>= mshift;
int32_t r = ((sea - 0x3f800000 + seb) & 0xFF800000) + m - (0x800000 & -!mshift);
int32_t ovfl = (r ^ ia ^ ib) >> 31;
r = r ^ ( (r ^ 0x7f800000) & ovfl);
return *(float *)&r;
}
/* my_exp */
static inline float my_exp(float x) {
float result = 1.0f;
float term = 1.0f;
for(int n = 1; n <= 5; n++){
term = fmul32(term, fdiv32(x, (float)n));
result = fadd32(result, term);
}
return result;
}
/* my_ln */
static inline float my_ln(float x) {
if(x <= 0.0f)
return -1.0f;
if(x == 1.0f)
return 0.0f;
if(x <= 2.8f)
return 1.0f;
float result = 0.0f;
float term = fdiv32(fadd32(x, -1.0f), fadd32(x, 1.0f));
float term_squared = fmul32(term, term);
float current_term = term;
for(int n = 1; n <= 100; n += 2){
result = fadd32(result, fdiv32(current_term, n));
current_term = fmul32(current_term, term_squared);
}
return fmul32(2.0f, result);
}
/* my_pow */
static inline float my_pow(float base, float exponent){
if(base == 0.0f && exponent < 0.0f)
return 0.0f;
if(exponent == 0.0f)
return 1.0f;
float result = 1.0f;
int exp_int = (int)exponent;
float fraction = fadd32(exponent, (float)-exp_int);
if(exp_int < 0){
base = fdiv32(1.0f, base);
exp_int = -exp_int;
}
for(int i = 0; i < exp_int; ++i)
result = fmul32(result, base);
if(fraction > 0)
result = fmul32(result, my_exp(fmul32(fraction, my_ln(base))));
return result;
}
/* my_root */
static inline float my_root(float x, int n){
return my_pow(x, fdiv32(1.0f, (float)n));
}
/* find_norm */
static inline float find_norm(float* v, int n, int p){
float res = 0.0f;
for(int i=0;i<n;++i){
float pow_val = my_pow(fabsf(v[i]), p);
res = fadd32(res, pow_val);
}
res = my_root(res, p);
return res;
}
$ ./pnorm
TEST_CASE1 {1.0, 2.0, 3.0}
1-norm : 6.000000
2-norm : 3.732667
TEST_CASE2 {1.0, -2.0, 3.0}
1-norm : 6.000000
2-norm : 3.732667
TEST_CASE3 {0.0, -1.0, -7.0}
1-norm : 8.000001
2-norm : 6.952097
contributed by < jychen0611 >
Feb 20, 2025contributed by < jychen0611 >
Jun 4, 2024basic service set (BSS) : a group of stations that communicate with each other
May 29, 20242024q1 Homework6 (integration)
May 22, 2024or
By clicking below, you agree to our terms of service.
New to HackMD? Sign up