Assignment1: RISC-V Assembly and Instruction Pipeline

contributed by <jychen0611> (Jia-Yang, Chen)

quiz1 - problem C

Choose one problem (A, B, or C) from Quiz1, translate it from C code to a complete RISC-V assembly program, and include the relevant test data.

C code

static inline float fabsf(float x) {
    uint32_t i = *(uint32_t *)&x;  // Read the bits of the float into an integer
    i &= 0x7FFFFFFF;               // Clear the sign bit to get the absolute value
    x = *(float *)&i;              // Write the modified bits back into the float
    return x;
}

static inline int my_clz(uint32_t x) {
    int count = 0;
    for (int i = 31; i >= 0; --i) {
        if (x & (1U << i))
            break;
        count++;
    }
    return count;
}

static inline uint32_t fp16_to_fp32(uint16_t h) {
    /*
     * Extends the 16-bit half-precision floating-point number to 32 bits
     * by shifting it to the upper half of a 32-bit word:
     *      +---+-----+------------+-------------------+
     *      | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
     *      +---+-----+------------+-------------------+
     * Bits  31  26-30    16-25            0-15
     *
     * S - sign bit, E - exponent bits, M - mantissa bits, 0 - zero bits.
     */
    const uint32_t w = (uint32_t) h << 16;
    
    /*
     * Isolates the sign bit from the input number, placing it in the most
     * significant bit of a 32-bit word:
     *
     *      +---+----------------------------------+
     *      | S |0000000 00000000 00000000 00000000|
     *      +---+----------------------------------+
     * Bits  31                 0-31
     */
    const uint32_t sign = w & UINT32_C(0x80000000);
    
    /*
     * Extracts the mantissa and exponent from the input number, placing
     * them in bits 0-30 of the 32-bit word:
     *
     *      +---+-----+------------+-------------------+
     *      | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
     *      +---+-----+------------+-------------------+
     * Bits  30  27-31     17-26            0-16
     */
    const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
    
    /*
     * The renorm_shift variable indicates how many bits the mantissa
     * needs to be shifted to normalize the half-precision number. 
     * For normalized numbers, renorm_shift will be 0. For denormalized
     * numbers, renorm_shift will be greater than 0. Shifting a 
     * denormalized number will move the mantissa into the exponent,
     * normalizing it.
     */
    uint32_t renorm_shift = my_clz(nonsign);
    renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0;
    
    /*
     * If the half-precision number has an exponent of 15, adding a 
     * specific value will cause overflow into bit 31, which converts 
     * the upper 9 bits into ones. Thus:
     *   inf_nan_mask ==
     *                   0x7F800000 if the half-precision number is 
     *                   NaN or infinity (exponent of 15)
     *                   0x00000000 otherwise
     */
    const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) &
                                 INT32_C(0x7F800000);
    
    /*
     * If nonsign equals 0, subtracting 1 will cause overflow, setting
     * bit 31 to 1. Otherwise, bit 31 will be 0. Shifting this result
     * propagates bit 31 across all bits in zero_mask. Thus:
     *   zero_mask ==
     *                0xFFFFFFFF if the half-precision number is 
     *                zero (+0.0h or -0.0h)
     *                0x00000000 otherwise
     */
    const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31;
    
    /*
     * 1. Shifts nonsign left by renorm_shift to normalize it (for denormal
     *    inputs).
     * 2. Shifts nonsign right by 3, adjusting the exponent to fit in the
     *    8-bit exponent field and moving the mantissa into the correct
     *    position within the 23-bit mantissa field of the single-precision
     *    format.
     * 3. Adds 0x70 to the exponent to account for the difference in bias
     *    between half-precision and single-precision.
     * 4. Subtracts renorm_shift from the exponent to account for any
     *    renormalization that occurred.
     * 5. ORs with inf_nan_mask to set the exponent to 0xFF if the input
     *    was NaN or infinity.
     * 6. ANDs with the inverted zero_mask to set the mantissa and exponent
     *    to zero if the input was zero.
     * 7. Combines everything with the sign bit of the input number.
     */
    return sign | ((((nonsign << renorm_shift >> 3) +
            ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask);
}

Show the full C source code corresponding to the original problem set.

Assembly code

fabsf:
    # prologue 
    addi sp, sp, -4
    sw ra, 0(sp)
    
    # i &= 0x7FFFFFFF; 
    li t0, 0x7FFFFFFF
    and a0, a0, t0

    # epilogue
    lw ra, 0(sp)
    addi sp, sp, 4
    ret

my_clz:
    # prologue 
    addi sp, sp, -4
    sw ra, 0(sp)

    li t0, 0 # count = 0
    li t1, 31 # i = 31
    li t2, 0x80000000
my_clz_loop:
    # if(x&(1U<<i)) break;
    and t3, a0, t2 
    bnez t3, my_clz_exit 
    
    srli t2, t2, 1 # t2 >>= 1
    addi t0, t0, 1 # count++
    addi t1, t1, -1 # --i
    bgez t1, my_clz_loop # while i>=0 --> loop
my_clz_exit:   
    mv a0, t0 # return count
    
    # epilogue
    lw ra, 0(sp)
    addi sp, sp, 4
    ret

fp16_to_fp32:
    # prologue 
    addi sp, sp, -4
    sw ra, 0(sp)
    
    # const uint32_t w = (uint32_t) h << 16;
    mv t0, a0 # w
    slli t0, t0, 16 
    
    # const uint32_t sign = w & UINT32_C(0x80000000);
    mv t1, t0 # sign
    li t3, 0x80000000
    and t1, t1, t3
    
    #  const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
    mv t2, t0 # nonsign
    li t3, 0x7FFFFFFF
    and t2, t2, t3
    
    # uint32_t renorm_shift = my_clz(nonsign);
    mv a0, t2
    jal my_clz
    mv t4, a0 # renorm_shift
    # renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0;
    addi t4, t4, -5
    bgtz t4, fp16_to_fp32_bigger_than_five
    li t4, 0
fp16_to_fp32_bigger_than_five:
    
    # const int32_t inf_nan_mask = ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000);
    mv t5, t2 # inf_nan_mask
    li t3, 0x04000000
    add t5, t5, t3
    srli t5, t5, 8
    li t3, 0x7F800000
    and t5, t5, t3  
    
    # const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31;
    mv t6, t2 # zero_mask 
    addi t6, t6, -1
    srli t6, t6, 31
    
    # return sign | ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask);
    sll a0, t2, t4
    srli a0, a0, 3
    li t3, 0x70
    sub t3, t3, t4
    slli t3, t3, 23
    add a0, a0, t3
    or a0, a0, t5
    not t6, t6
    and a0, a0, t6
    or a0, t1, a0    
    
    # epilogue
    lw ra, 0(sp)
    addi sp, sp, 4
    ret

Use case (power of four)

Given an integer n, return true if it is a power of four. Otherwise, return false.

An integer n is a power of four, if there exists an integer x such that n ==
$4^{x}$ .

$- 2^{31} <= n <= 2^{31} - 1$

If a number n is a power of 4, then it must have only one bit set, and that bit must be in an odd position.
Thus, n&(n-1) must be zero, and 32 - my_clz(n) - 1 must be even.

static inline int my_clz(uint32_t x) {
    int count = 0;
    for (int i = 31; i >= 0; --i) {
        if (x & (1U << i))
            break;
        count++;
    }
    return count;
}
bool isPowerOfFour(uint32_t n) {
    if(n<=0)
        return false;
    /* in odd position  && only one bit set(is power of two ) */
    return !((32 - my_clz(n) - 1) % 2) && !(n&(n-1)); 
}

.data

.text
    j main
my_clz:
    # prologue 
    addi sp, sp, -4
    sw ra, 0(sp)

    li t0, 0 # count = 0
    li t1, 31 # i = 31
    li t2, 0x80000000
my_clz_loop:
    # if(x&(1U<<i)) break;
    and t3, a0, t2 
    bnez t3, my_clz_exit 
    
    srli t2, t2, 1 # t2 >>= 1
    addi t0, t0, 1 # count++
    addi t1, t1, -1 # --i
    bgez t1, my_clz_loop # while i>=0 --> loop
my_clz_exit:   
    mv a0, t0 # return count
    
    # epilogue
    lw ra, 0(sp)
    addi sp, sp, 4
    ret
    
is_pow_of_4:
    # prologue 
    addi sp, sp, -8
    sw ra, 0(sp)
    sw s0, 4(sp)
    
    blez a0, is_pow_of_4_lez
    mv s0, a0 # s0 = n
    
    jal my_clz
    mv t0, a0 # t0 = lz

    addi t1, s0, -1 
    and t1, s0, t1 
    beqz t1, is_pow_of_2
    li t1, 1
is_pow_of_2:
    xori t1, t1, 1 # t1 = is_pow_of_two
    
    li t2, 31
    sub t2, t2, t0
    andi t2, t2, 1
    xori t2, t2, 1 # t2 = is_in_odd_position
    
    and a0, t1, t2
    j is_pow_of_4_exit

is_pow_of_4_lez:
    li a0, 0    

is_pow_of_4_exit:
    # epilogue
    lw s0, 4(sp)
    lw ra, 0(sp)
    addi sp, sp, 8
    ret



main:
    li a0, 16
    jal is_pow_of_4

Use case (p-Norm)

Given an 3-dimension floating point vector v and an integer p, return its p-Norm.

$1 <= p <= 2$

p-Norm

| | v | |_{p} = (\sum_{i = 1}^{n} {| x_{i} |}^{p})^{\frac{1}{p}}

C code

e^{x} = 1 + \frac{x}{1!} + \frac{x^{2}}{2!} + \frac{x^{3}}{3!} + . . .

static inline float my_exp(float x) {
    float result = 1.0f; 
    float term = 1.0f;   
    for(int n = 1; n <= 20; n++){ 
        term *= x / n; 
        result += term; 
    }
    return result;
}

l n (x) = 2 \sum_{n = 0}^{\infty} \frac{1}{2 n + 1} (\frac{x - 1}{x + 1})^{2 n + 1}

static inline float my_ln(float x) {
    if(x <= 0.0f) 
        return -1.0f; 
    
    float result = 0.0f; 
    float term = (x - 1) / (x + 1); 
    float term_squared = term * term; 
    float current_term = term; 
    for(int n = 1; n <= 100; n += 2){ 
        result += current_term / n; 
        current_term *= term_squared; 
    }
    
    return 2 * result;
}

b a s e^{f r a c t i o n} = e^{f r a c t i o n * l n (b a s e)}

static inline float my_pow(float base, float exponent){
    if(base == 0.0f && exponent < 0.0f)
        return 0.0f;
    
    if(exponent == 0.0f)
        return 1.0f; 

    float result = 1.0f;
    int exp_int = (int)exponent; 
    float fraction = exponent - exp_int; 
    
    if(exp_int < 0){
        base = 1.0f / base; 
        exp_int = -exp_int; 
    }

    for(int i = 0; i < exp_int; ++i)
        result *= base; 

    if(fraction > 0) 
        result *= my_exp(fraction * my_ln(base));

    return result;
}

static inline float my_root(float x, int n){
    return my_pow(x, 1.0 / n);
}

static inline float find_norm(float* v, int p){
    float res = 0.0f;
    for(int i=0;i<2;++i){
        res +=  my_pow(fabsf(v[i]), p);
    }
    res = my_root(res, p);
    return res;
}

e.g.

You should automate the testing procedures as much as possible, meaning there's no need to manually check each program output. Instead, implement internal validation to handle the result checking.

TEST_CASE1 {1.0, 2.0, 3.0}

TEST_CASE2 {1.0, -2.0, 3.0}

TEST_CASE3 {0.0, -1.0, -7.0}

$ ./pnorm
TEST_CASE1 {1.0, 2.0, 3.0}
1-norm : 6.000000
2-norm : 3.741657
TEST_CASE2 {1.0, -2.0, 3.0}
1-norm : 6.000000
2-norm : 3.741657
TEST_CASE3 {0.0, -1.0, -7.0}
1-norm : 8.000000
2-norm : 7.057732

Rewrite using `fmul32`, `fdiv32`, and `fadd32`

fmul32

/* fmul32 */
float fmul32(float a, float b)
{
    int32_t ia = *(int32_t *)&a, ib = *(int32_t *)&b;
    if (ia == 0 || ib == 0) return 0;
    
    
    int32_t exp_a = (ia & 0x7F800000) >> 23;
    int32_t exp_b = (ib & 0x7F800000) >> 23;
    int32_t fra_a = ia & 0x7FFFFF;
    int32_t fra_b = ib & 0x7FFFFF;
    /* TODO: Special values like NaN and INF */
    if((exp_a == 255 && fra_a == 0 ) || (exp_b == 255 && fra_b == 0) ){
        printf("Infinity value!\n");
    }
    if((exp_a == 255 && fra_a != 0 ) || (exp_b == 255 && fra_b != 0) ){
        printf("NaN value!\n");
    }

    /* mantissa */
    int32_t ma = (ia & 0x7FFFFF) | 0x800000;
    int32_t mb = (ib & 0x7FFFFF) | 0x800000;

    int32_t sea = ia & 0xFF800000;
    int32_t seb = ib & 0xFF800000;

    /* result of mantissa */
    int32_t m = imul24(ma, mb);
    int32_t mshift = getbit(m, 24);
    m >>= mshift;

    int32_t r = ((sea - 0x3f800000 + seb) & 0xFF800000) + m - (0x800000 & -!mshift);
    int32_t ovfl = (r ^ ia ^ ib) >> 31;
    r = r ^ ( (r ^ 0x7f800000) & ovfl);
    return *(float *)&r;
}

/* my_exp */
static inline float my_exp(float x) {
    float result = 1.0f; 
    float term = 1.0f;   
    for(int n = 1; n <= 5; n++){ 
        term = fmul32(term, fdiv32(x, (float)n));
        result = fadd32(result, term); 
    }
    return result;
}

/* my_ln */
static inline float my_ln(float x) {
    if(x <= 0.0f)
        return -1.0f;
    if(x == 1.0f)
        return 0.0f;
    if(x <= 2.8f)
        return 1.0f;

    float result = 0.0f;
    float term = fdiv32(fadd32(x, -1.0f), fadd32(x, 1.0f));
    float term_squared = fmul32(term, term);
    float current_term = term;
    for(int n = 1; n <= 100; n += 2){
        result = fadd32(result, fdiv32(current_term, n)); 
        current_term = fmul32(current_term, term_squared);
    }

    return fmul32(2.0f, result);
}

/* my_pow */
static inline float my_pow(float base, float exponent){
    if(base == 0.0f && exponent < 0.0f)
        return 0.0f;

    if(exponent == 0.0f)
        return 1.0f;

    float result = 1.0f;
    int exp_int = (int)exponent;
    float fraction = fadd32(exponent, (float)-exp_int);

    if(exp_int < 0){
        base = fdiv32(1.0f, base);
        exp_int = -exp_int;
    }

    for(int i = 0; i < exp_int; ++i)
        result = fmul32(result, base);

    if(fraction > 0)
        result = fmul32(result, my_exp(fmul32(fraction, my_ln(base))));

    return result;
}

/* my_root */
static inline float my_root(float x, int n){
    return my_pow(x, fdiv32(1.0f, (float)n));
}

/* find_norm */
static inline float find_norm(float* v, int n, int p){
    float res = 0.0f;
    for(int i=0;i<n;++i){
        float pow_val =  my_pow(fabsf(v[i]), p);
        res = fadd32(res, pow_val);
    }
    res = my_root(res, p);
    return res;
}

$ ./pnorm
TEST_CASE1 {1.0, 2.0, 3.0}
1-norm : 6.000000
2-norm : 3.732667
TEST_CASE2 {1.0, -2.0, 3.0}
1-norm : 6.000000
2-norm : 3.732667
TEST_CASE3 {0.0, -1.0, -7.0}
1-norm : 8.000001
2-norm : 6.952097

Assignment1: RISC-V Assembly and Instruction Pipeline

quiz1 - problem C

C code

Assembly code

Use case (power of four)

Use case (p-Norm)

p-Norm

C code

e.g.

Rewrite using `fmul32`, `fdiv32`, and `fadd32`

fmul32

Assembly code

Analysis

Reference

Assignment1: RISC-V Assembly and Instruction Pipeline

quiz1 - problem C

C code

Assembly code

Use case (power of four)

Use case (p-Norm)

p-Norm

C code

e.g.

Rewrite using fmul32, fdiv32, and fadd32

fmul32

Assembly code

Analysis

Reference

Read more

2024q1 Homework2 (quiz1+2)

2024q1 Homework5 (assessment)

IEEE 802.11

2024q1 Homework6 (integration)

Rewrite using `fmul32`, `fdiv32`, and `fadd32`