# Assignment1: RISC-V Assembly and Instruction Pipeline contributed by <`jychen0611`> (Jia-Yang, Chen) ## quiz1 - problem C :::danger Choose one problem (A, B, or C) from [Quiz1](https://hackmd.io/@sysprog/arch2024-quiz1-sol), translate it from C code to a complete RISC-V assembly program, and include the relevant test data. ::: ### C code ```c static inline float fabsf(float x) { uint32_t i = *(uint32_t *)&x; // Read the bits of the float into an integer i &= 0x7FFFFFFF; // Clear the sign bit to get the absolute value x = *(float *)&i; // Write the modified bits back into the float return x; } ``` ```c static inline int my_clz(uint32_t x) { int count = 0; for (int i = 31; i >= 0; --i) { if (x & (1U << i)) break; count++; } return count; } ``` ```c static inline uint32_t fp16_to_fp32(uint16_t h) { /* * Extends the 16-bit half-precision floating-point number to 32 bits * by shifting it to the upper half of a 32-bit word: * +---+-----+------------+-------------------+ * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| * +---+-----+------------+-------------------+ * Bits 31 26-30 16-25 0-15 * * S - sign bit, E - exponent bits, M - mantissa bits, 0 - zero bits. */ const uint32_t w = (uint32_t) h << 16; /* * Isolates the sign bit from the input number, placing it in the most * significant bit of a 32-bit word: * * +---+----------------------------------+ * | S |0000000 00000000 00000000 00000000| * +---+----------------------------------+ * Bits 31 0-31 */ const uint32_t sign = w & UINT32_C(0x80000000); /* * Extracts the mantissa and exponent from the input number, placing * them in bits 0-30 of the 32-bit word: * * +---+-----+------------+-------------------+ * | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| * +---+-----+------------+-------------------+ * Bits 30 27-31 17-26 0-16 */ const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); /* * The renorm_shift variable indicates how many bits the mantissa * needs to be shifted to normalize the half-precision number. * For normalized numbers, renorm_shift will be 0. For denormalized * numbers, renorm_shift will be greater than 0. Shifting a * denormalized number will move the mantissa into the exponent, * normalizing it. */ uint32_t renorm_shift = my_clz(nonsign); renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0; /* * If the half-precision number has an exponent of 15, adding a * specific value will cause overflow into bit 31, which converts * the upper 9 bits into ones. Thus: * inf_nan_mask == * 0x7F800000 if the half-precision number is * NaN or infinity (exponent of 15) * 0x00000000 otherwise */ const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000); /* * If nonsign equals 0, subtracting 1 will cause overflow, setting * bit 31 to 1. Otherwise, bit 31 will be 0. Shifting this result * propagates bit 31 across all bits in zero_mask. Thus: * zero_mask == * 0xFFFFFFFF if the half-precision number is * zero (+0.0h or -0.0h) * 0x00000000 otherwise */ const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31; /* * 1. Shifts nonsign left by renorm_shift to normalize it (for denormal * inputs). * 2. Shifts nonsign right by 3, adjusting the exponent to fit in the * 8-bit exponent field and moving the mantissa into the correct * position within the 23-bit mantissa field of the single-precision * format. * 3. Adds 0x70 to the exponent to account for the difference in bias * between half-precision and single-precision. * 4. Subtracts renorm_shift from the exponent to account for any * renormalization that occurred. * 5. ORs with inf_nan_mask to set the exponent to 0xFF if the input * was NaN or infinity. * 6. ANDs with the inverted zero_mask to set the mantissa and exponent * to zero if the input was zero. * 7. Combines everything with the sign bit of the input number. */ return sign | ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask); } ``` :::danger Show the full C source code corresponding to the original problem set. ::: ### Assembly code ```s fabsf: # prologue addi sp, sp, -4 sw ra, 0(sp) # i &= 0x7FFFFFFF; li t0, 0x7FFFFFFF and a0, a0, t0 # epilogue lw ra, 0(sp) addi sp, sp, 4 ret ``` ```s my_clz: # prologue addi sp, sp, -4 sw ra, 0(sp) li t0, 0 # count = 0 li t1, 31 # i = 31 li t2, 0x80000000 my_clz_loop: # if(x&(1U<<i)) break; and t3, a0, t2 bnez t3, my_clz_exit srli t2, t2, 1 # t2 >>= 1 addi t0, t0, 1 # count++ addi t1, t1, -1 # --i bgez t1, my_clz_loop # while i>=0 --> loop my_clz_exit: mv a0, t0 # return count # epilogue lw ra, 0(sp) addi sp, sp, 4 ret ``` ```s fp16_to_fp32: # prologue addi sp, sp, -4 sw ra, 0(sp) # const uint32_t w = (uint32_t) h << 16; mv t0, a0 # w slli t0, t0, 16 # const uint32_t sign = w & UINT32_C(0x80000000); mv t1, t0 # sign li t3, 0x80000000 and t1, t1, t3 # const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); mv t2, t0 # nonsign li t3, 0x7FFFFFFF and t2, t2, t3 # uint32_t renorm_shift = my_clz(nonsign); mv a0, t2 jal my_clz mv t4, a0 # renorm_shift # renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0; addi t4, t4, -5 bgtz t4, fp16_to_fp32_bigger_than_five li t4, 0 fp16_to_fp32_bigger_than_five: # const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000); mv t5, t2 # inf_nan_mask li t3, 0x04000000 add t5, t5, t3 srli t5, t5, 8 li t3, 0x7F800000 and t5, t5, t3 # const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31; mv t6, t2 # zero_mask addi t6, t6, -1 srli t6, t6, 31 # return sign | ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask); sll a0, t2, t4 srli a0, a0, 3 li t3, 0x70 sub t3, t3, t4 slli t3, t3, 23 add a0, a0, t3 or a0, a0, t5 not t6, t6 and a0, a0, t6 or a0, t1, a0 # epilogue lw ra, 0(sp) addi sp, sp, 4 ret ``` ## Use case ([power of four](https://leetcode.com/problems/power-of-four/description/)) >Given an integer n, return true if it is a power of four. Otherwise, return false. >An integer n is a power of four, if there exists an integer x such that n == $4^x$. >$-2^{31} <= n <= 2^{31} - 1$ * If a number n is a power of 4, then it must have only one bit set, and that bit must be in an odd position. * Thus, `n&(n-1)` must be zero, and `32 - my_clz(n) - 1` must be even. ```c static inline int my_clz(uint32_t x) { int count = 0; for (int i = 31; i >= 0; --i) { if (x & (1U << i)) break; count++; } return count; } bool isPowerOfFour(uint32_t n) { if(n<=0) return false; /* in odd position && only one bit set(is power of two ) */ return !((32 - my_clz(n) - 1) % 2) && !(n&(n-1)); } ``` ```s .data .text j main my_clz: # prologue addi sp, sp, -4 sw ra, 0(sp) li t0, 0 # count = 0 li t1, 31 # i = 31 li t2, 0x80000000 my_clz_loop: # if(x&(1U<<i)) break; and t3, a0, t2 bnez t3, my_clz_exit srli t2, t2, 1 # t2 >>= 1 addi t0, t0, 1 # count++ addi t1, t1, -1 # --i bgez t1, my_clz_loop # while i>=0 --> loop my_clz_exit: mv a0, t0 # return count # epilogue lw ra, 0(sp) addi sp, sp, 4 ret is_pow_of_4: # prologue addi sp, sp, -8 sw ra, 0(sp) sw s0, 4(sp) blez a0, is_pow_of_4_lez mv s0, a0 # s0 = n jal my_clz mv t0, a0 # t0 = lz addi t1, s0, -1 and t1, s0, t1 beqz t1, is_pow_of_2 li t1, 1 is_pow_of_2: xori t1, t1, 1 # t1 = is_pow_of_two li t2, 31 sub t2, t2, t0 andi t2, t2, 1 xori t2, t2, 1 # t2 = is_in_odd_position and a0, t1, t2 j is_pow_of_4_exit is_pow_of_4_lez: li a0, 0 is_pow_of_4_exit: # epilogue lw s0, 4(sp) lw ra, 0(sp) addi sp, sp, 8 ret main: li a0, 16 jal is_pow_of_4 ``` ![image](https://hackmd.io/_uploads/H1WS1ti1kx.png) ## Use case (p-Norm) Given an `3-dimension` floating point vector `v` and an integer `p`, return its p-Norm. * $1<= p <= 2$ ### p-Norm $||v||_{p} = (\sum_{i=1}^{n}{|x_i|}^p)^\frac{1}{p}$ ### C code $e^x = 1 + \frac{x}{1!} + \frac{x^2}{2!} + \frac{x^3}{3!}+ ...$ ```c static inline float my_exp(float x) { float result = 1.0f; float term = 1.0f; for(int n = 1; n <= 20; n++){ term *= x / n; result += term; } return result; } ``` $ln(x)=2\sum_{n=0}^{\infty}\frac{1}{2n+1}(\frac{x-1}{x+1})^{2n+1}$ ```c static inline float my_ln(float x) { if(x <= 0.0f) return -1.0f; float result = 0.0f; float term = (x - 1) / (x + 1); float term_squared = term * term; float current_term = term; for(int n = 1; n <= 100; n += 2){ result += current_term / n; current_term *= term_squared; } return 2 * result; } ``` $base^{fraction} = e^{fraction\ *\ ln(base)}$ ```c static inline float my_pow(float base, float exponent){ if(base == 0.0f && exponent < 0.0f) return 0.0f; if(exponent == 0.0f) return 1.0f; float result = 1.0f; int exp_int = (int)exponent; float fraction = exponent - exp_int; if(exp_int < 0){ base = 1.0f / base; exp_int = -exp_int; } for(int i = 0; i < exp_int; ++i) result *= base; if(fraction > 0) result *= my_exp(fraction * my_ln(base)); return result; } ``` ```c static inline float my_root(float x, int n){ return my_pow(x, 1.0 / n); } ``` ```c static inline float find_norm(float* v, int p){ float res = 0.0f; for(int i=0;i<2;++i){ res += my_pow(fabsf(v[i]), p); } res = my_root(res, p); return res; } ``` ### e.g. :::danger You should automate the testing procedures as much as possible, meaning there's no need to manually check each program output. Instead, implement internal validation to handle the result checking. ::: TEST_CASE1 {1.0, 2.0, 3.0} ![image](https://hackmd.io/_uploads/HyoBbM-k1x.png) TEST_CASE2 {1.0, -2.0, 3.0} ![image](https://hackmd.io/_uploads/SJtrZzWkyl.png) TEST_CASE3 {0.0, -1.0, -7.0} ![image](https://hackmd.io/_uploads/S1vQZzZ11l.png) ```shell $ ./pnorm TEST_CASE1 {1.0, 2.0, 3.0} 1-norm : 6.000000 2-norm : 3.741657 TEST_CASE2 {1.0, -2.0, 3.0} 1-norm : 6.000000 2-norm : 3.741657 TEST_CASE3 {0.0, -1.0, -7.0} 1-norm : 8.000000 2-norm : 7.057732 ``` ### Rewrite using `fmul32`, `fdiv32`, and `fadd32` #### fmul32 ```c /* fmul32 */ float fmul32(float a, float b) { int32_t ia = *(int32_t *)&a, ib = *(int32_t *)&b; if (ia == 0 || ib == 0) return 0; int32_t exp_a = (ia & 0x7F800000) >> 23; int32_t exp_b = (ib & 0x7F800000) >> 23; int32_t fra_a = ia & 0x7FFFFF; int32_t fra_b = ib & 0x7FFFFF; /* TODO: Special values like NaN and INF */ if((exp_a == 255 && fra_a == 0 ) || (exp_b == 255 && fra_b == 0) ){ printf("Infinity value!\n"); } if((exp_a == 255 && fra_a != 0 ) || (exp_b == 255 && fra_b != 0) ){ printf("NaN value!\n"); } /* mantissa */ int32_t ma = (ia & 0x7FFFFF) | 0x800000; int32_t mb = (ib & 0x7FFFFF) | 0x800000; int32_t sea = ia & 0xFF800000; int32_t seb = ib & 0xFF800000; /* result of mantissa */ int32_t m = imul24(ma, mb); int32_t mshift = getbit(m, 24); m >>= mshift; int32_t r = ((sea - 0x3f800000 + seb) & 0xFF800000) + m - (0x800000 & -!mshift); int32_t ovfl = (r ^ ia ^ ib) >> 31; r = r ^ ( (r ^ 0x7f800000) & ovfl); return *(float *)&r; } ``` ```c /* my_exp */ static inline float my_exp(float x) { float result = 1.0f; float term = 1.0f; for(int n = 1; n <= 5; n++){ term = fmul32(term, fdiv32(x, (float)n)); result = fadd32(result, term); } return result; } /* my_ln */ static inline float my_ln(float x) { if(x <= 0.0f) return -1.0f; if(x == 1.0f) return 0.0f; if(x <= 2.8f) return 1.0f; float result = 0.0f; float term = fdiv32(fadd32(x, -1.0f), fadd32(x, 1.0f)); float term_squared = fmul32(term, term); float current_term = term; for(int n = 1; n <= 100; n += 2){ result = fadd32(result, fdiv32(current_term, n)); current_term = fmul32(current_term, term_squared); } return fmul32(2.0f, result); } /* my_pow */ static inline float my_pow(float base, float exponent){ if(base == 0.0f && exponent < 0.0f) return 0.0f; if(exponent == 0.0f) return 1.0f; float result = 1.0f; int exp_int = (int)exponent; float fraction = fadd32(exponent, (float)-exp_int); if(exp_int < 0){ base = fdiv32(1.0f, base); exp_int = -exp_int; } for(int i = 0; i < exp_int; ++i) result = fmul32(result, base); if(fraction > 0) result = fmul32(result, my_exp(fmul32(fraction, my_ln(base)))); return result; } /* my_root */ static inline float my_root(float x, int n){ return my_pow(x, fdiv32(1.0f, (float)n)); } /* find_norm */ static inline float find_norm(float* v, int n, int p){ float res = 0.0f; for(int i=0;i<n;++i){ float pow_val = my_pow(fabsf(v[i]), p); res = fadd32(res, pow_val); } res = my_root(res, p); return res; } ``` ```shell $ ./pnorm TEST_CASE1 {1.0, 2.0, 3.0} 1-norm : 6.000000 2-norm : 3.732667 TEST_CASE2 {1.0, -2.0, 3.0} 1-norm : 6.000000 2-norm : 3.732667 TEST_CASE3 {0.0, -1.0, -7.0} 1-norm : 8.000001 2-norm : 6.952097 ``` ### Assembly code ```s ``` ## Analysis ## Reference * [Lp-space](https://en.wikipedia.org/wiki/Lp_space) * [Taylor series](https://en.wikipedia.org/wiki/Taylor_series) * [Newton's method](https://en.wikipedia.org/wiki/Newton%27s_method) * [Vector Norm Calculator](https://www.inchcalculator.com/vector-norm-calculator/) * [Get sine value without floating point multiplication support](https://hackmd.io/@ranvd/computer-arch-hw1) * [Quiz1 of Computer Architecture (2024 Fall)](https://hackmd.io/@sysprog/arch2024-quiz1-sol)