Assignment1: RISC-V Assembly and Instruction Pipeline

The bfloat16 format is a 16-bit floating-point representation, designed to provide a wide dynamic range by using a floating radix point. It is a shortened version of the 32-bit IEEE 754 single-precision format (binary32), aimed at accelerating machine learning.

The structure of the bfloat16 floating-point format is as follows.

Quiz 1 Problem B:

        ┌ sign 
        │
        │   ┌ exponent
        │   │
        │   │      ┌ mantissa 
        │   │      │
        │┌──┴───┐┌─┴───┐
      0b0000000000000000 bfloat16

Implementation

C code

Why was union used rather than arbitrary pointer access?

/* A bfloat16 value, held as its raw 16-bit pattern. */
typedef struct {
    uint16_t bits;
} bf16_t;

/*
 * Convert a binary32 value to bfloat16.
 *
 * NaN inputs keep their upper 16 bits and additionally get bit 6 set so the
 * result is a quiet NaN.  Every other input is rounded to nearest, ties to
 * even, by adding 0x7fff plus the lowest kept bit before discarding the low
 * half of the word.
 */
static inline bf16_t fp32_to_bf16(float s)
{
    union {
        float f;
        uint32_t i;
    } word = {.f = s};
    bf16_t result;

    const uint32_t magnitude = word.i & 0x7fffffff;
    if (magnitude > 0x7f800000) {
        /* Exponent all ones and a non-zero mantissa: NaN. */
        result.bits = (word.i >> 16) | 64;
    } else {
        const uint32_t lowest_kept_bit = (word.i >> 16) & 1;
        const uint32_t rounding_bias = 0x7fff + lowest_kept_bit;
        result.bits = (word.i + rounding_bias) >> 16;
    }
    return result;
}

/*
 * Widen a bfloat16 value to binary32 by placing its 16 bits in the upper half
 * of the word and zero-filling the lower half.  The conversion is exact.
 */
static inline float bf16_to_fp32(bf16_t h)
{
    union {
        float f;
        uint32_t i;
    } word;

    word.i = (uint32_t)h.bits << 16;
    return word.f;
}

Compile result:

~~![image](https://hackmd.io/_uploads/SJ_s9NZykg.png)~~

Do not use screenshots for plain text content, as this is inaccessible to visually impaired users.

Assembly code

.data
arr: .word   0xc1cc0000   # int arr = 0xc1cc0000;

.text

main:
    # Fetch the binary32 test pattern stored at `arr` into s0, the register
    # fp32_to_fp16 reads its argument from.
    lw   s0, arr
    jal  ra, fp32_to_fp16    # converted value comes back in a0

    # Terminate the program through environment call 10.
    addi a7, x0, 10
    ecall

# fp32_to_fp16: convert the binary32 pattern in s0 to bfloat16.
#   in : s0 = binary32 bit pattern
#   out: a0 = bfloat16 bit pattern in the low 16 bits
#   clobbers: t0, t1, t2
# A NaN keeps its upper 16 bits with bit 6 forced on (quiet NaN).  Every other
# value, including +/-infinity, is rounded to nearest, ties to even.
fp32_to_fp16:
    addi sp, sp, -8      # sp -= 8;
    sw ra, 4(sp)         # *(sp + 4) = ra;
    sw s0, 0(sp)         # *sp = s0;

    srli a0, s0, 16      # a0 = s0 >> 16;  upper half of the input

    # NaN test: with the sign bit shifted out, a NaN is strictly greater than
    # infinity (0x7f800000 << 1 == 0xff000000).  Comparing the exponent alone
    # would also catch infinity and wrongly turn it into a NaN.
    slli t0, s0, 1       # t0 = s0 << 1;
    lui t1, 0xff000      # t1 = 0xff000000;
    bgeu t1, t0, Else    # if (t0 <= t1) goto Else;  not a NaN
        ori a0, a0, 64   # a0 |= 64;  force quiet
        j Exit           # goto Exit;
    Else:
        andi a0, a0, 1   # a0 &= 1;  lowest kept bit
        li t2, 0x7fff    # t2 = 0x7fff;
        add a0, a0, t2   # a0 += t2;  rounding bias
        add a0, s0, a0   # a0 = s0 + a0;
        srli a0, a0, 0x10 # a0 >>= 16;

    Exit:
        lw s0, 0(sp)     # s0 = *sp;
        lw ra, 4(sp)     # ra = *(sp + 4);
        addi sp, sp, 8   # sp += 8;
        jr ra            # return;

# bf16_to_fp32: widen the bfloat16 pattern in s0 to binary32.
#   in : s0 = bfloat16 bit pattern in the low 16 bits
#   out: a0 = binary32 bit pattern (input moved to the upper half-word)
bf16_to_fp32:
    addi sp, sp, -8      # sp -= 8;
    sw ra, 4(sp)         # *(sp + 4) = ra;
    sw s0, 0(sp)         # *sp = s0;  same register that is restored below

    slli a0, s0, 16      # a0 = s0 << 16;

    lw s0, 0(sp)         # s0 = *sp;
    lw ra, 4(sp)         # ra = *(sp + 4);
    addi sp, sp, 8       # sp += 8;
    jr ra                # return;

Implement square root using CLZ method

Using Newton's method

The Newton-Raphson method, also known as Newton's method, is widely used for finding the roots of polynomials. The algorithm for Newton's method is as follows:

Given the function

f (x)

, we can find its root using the iterative process:

x_{k + 1} = x_{k} - \frac{f (x_{k})}{f^{'} (x_{k})}, \quad k = 1, 2, 3, \dots

Applying this method to solve

x^{2} = a

, the iterative process becomes:

x_{k + 1} = x_{k} - \frac{x_{k}^{2} - a}{2 x_{k}} = \frac{1}{2} (x_{k} + \frac{a}{x_{k}})

Equivalently, each new estimate is the average of the current estimate and a divided by it:

x_{k + 1} = 0.5 (x_{k} + \frac{a}{x_{k}})

This formula helps refine the estimate of the square root of

a

through each iteration.

Initial guess

With the formula above, we can easily calculate the precise square root. However, selecting an appropriate initial guess is crucial. A good initial guess can significantly reduce the number of iterations needed to reach the correct root. Using CLZ (count leading zeros) is an effective method for determining a suitable initial guess.

Code implementation

C code

      ________________________________________________________________
     |_0_|_______8______|____________________23_______________________|
      sign  exponent                      mantissa

Calculate leading zeros














/*
 * Count the leading zero bits of a 32-bit word.
 *
 * The highest set bit is first smeared into every lower position, so the
 * number of one bits equals 32 minus the number of leading zeros.  That
 * population count is then computed with the usual parallel bit sums.
 *
 * Returns a value in [0, 32]; CLZ(0) is 32.
 */
uint8_t CLZ(uint32_t x)
{
    /* Propagate the most significant set bit downwards. */
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;

    /* Population count: 2-bit, 4-bit, then 8-bit partial sums. */
    x -= (x >> 1) & 0x55555555;
    x = ((x >> 2) & 0x33333333) + (x & 0x33333333);
    x = ((x >> 4) + x) & 0x0f0f0f0f;
    x += x >> 8;
    x += x >> 16; /* fold the upper half-word in as well */

    /* The total can be 32, which needs six bits. */
    return (uint8_t)(32 - (x & 0x3f));
}

Float addition
















































































/*
 * Add two binary32 values using integer operations only.
 *
 * Behaviour:
 *  - If either operand is zero the other operand is returned unchanged.
 *  - NaN operands, and infinity plus the opposite infinity, give a quiet NaN;
 *    any other sum involving infinity gives that infinity.
 *  - The smaller operand is aligned by shifting its mantissa right, and the
 *    shifted-out bits are discarded, so the result is truncated rather than
 *    rounded.
 *  - Exact cancellation and results below the normal range return +0.0f
 *    (flush to zero); results above the finite range return infinity.
 */
static inline float add_float(float a, float b)
{
    if (a == 0.0f) return b;
    if (b == 0.0f) return a;

    union {
        uint32_t bits;
        float value;
    } fpa = {.value = a}, fpb = {.value = b}, out;

    uint32_t sign_a = fpa.bits & 0x80000000;
    uint32_t sign_b = fpb.bits & 0x80000000;
    uint32_t abs_a = fpa.bits & 0x7fffffff;
    uint32_t abs_b = fpb.bits & 0x7fffffff;

    /* Make `a` the operand with the larger magnitude; swap signs with it. */
    if (abs_a < abs_b) {
        uint32_t temp = abs_a;
        abs_a = abs_b;
        abs_b = temp;
        temp = sign_a;
        sign_a = sign_b;
        sign_b = temp;
    }

    /* Exponent field all ones: infinity or NaN.  A NaN always ends up in
     * `a` because its magnitude bits compare greater than infinity's. */
    if (abs_a >= 0x7f800000) {
        if (abs_a > 0x7f800000 ||
            (abs_b == 0x7f800000 && sign_a != sign_b)) {
            out.bits = 0x7fc00000;
        } else {
            out.bits = sign_a | 0x7f800000;
        }
        return out.value;
    }

    /* Split into exponent and 24-bit mantissa.  Subnormals carry no implicit
     * one and share the scale of exponent 1. */
    int exp_a = (int)(abs_a >> 23);
    int exp_b = (int)(abs_b >> 23);
    uint32_t mantissa_a = abs_a & 0x7fffff;
    uint32_t mantissa_b = abs_b & 0x7fffff;
    if (exp_a > 0) {
        mantissa_a |= 0x800000;
    } else {
        exp_a = 1;
    }
    if (exp_b > 0) {
        mantissa_b |= 0x800000;
    } else {
        exp_b = 1;
    }

    /* Align b to a.  The difference is never negative because |a| >= |b|;
     * a shift of 32 or more would be undefined, and leaves nothing anyway. */
    int diff_exp = exp_a - exp_b;
    mantissa_b = (diff_exp < 32) ? (mantissa_b >> diff_exp) : 0;

    uint32_t mantissa;
    if (sign_a != sign_b) {
        mantissa = mantissa_a - mantissa_b;
    } else {
        mantissa = mantissa_a + mantissa_b;
    }

    /* Exact cancellation. */
    if (mantissa == 0) return 0.0f;

    /* Bring the leading one back to bit 23. */
    int exp = exp_a;
    if (mantissa & 0x1000000) {
        mantissa >>= 1;
        exp++;
    } else {
        while (!(mantissa & 0x800000)) {
            mantissa <<= 1;
            exp--;
        }
    }

    if (exp <= 0) return 0.0f; /* underflow: flush to zero */
    if (exp >= 0xff) {         /* overflow: signed infinity */
        out.bits = sign_a | 0x7f800000;
        return out.value;
    }

    out.bits = sign_a | ((uint32_t)exp << 23) | (mantissa & 0x7fffff);
    return out.value;
}

float division


































































































/*
 * Divide two binary32 values (p / q) using integer operations only.
 *
 * The quotient is produced by bitwise long division of the two 24-bit
 * mantissas and rounded to nearest, ties to even.
 *
 * Special cases:
 *  - NaN operand, 0 / 0, or infinity / infinity  -> quiet NaN
 *  - infinity / finite, or non-zero / 0          -> signed infinity
 *  - 0 / non-zero, or finite / infinity          -> signed zero
 *  - results too large for binary32              -> signed infinity
 *  - results too small even for a subnormal      -> signed zero
 *
 * Subnormal operands and subnormal results are supported.
 */
static inline float div_float(float p, float q)
{
    union {
        uint32_t bits;
        float value;
    } fpp = {.value = p}, fpq = {.value = q}, output;

    const uint32_t inf_bits = 0x7f800000u;
    const uint32_t sign = (fpp.bits ^ fpq.bits) & 0x80000000u;
    const uint32_t abs_p = fpp.bits & 0x7fffffffu;
    const uint32_t abs_q = fpq.bits & 0x7fffffffu;

    if (abs_p > inf_bits || abs_q > inf_bits ||
        (abs_p == inf_bits && abs_q == inf_bits) ||
        (abs_p == 0 && abs_q == 0)) {
        output.bits = 0x7fc00000u;
        return output.value;
    }
    if (abs_p == inf_bits || abs_q == 0) {
        output.bits = sign | inf_bits;
        return output.value;
    }
    if (abs_p == 0 || abs_q == inf_bits) {
        output.bits = sign;
        return output.value;
    }

    /* Split each operand into exponent and a 24-bit mantissa whose leading
     * one sits at bit 23.  A subnormal starts at the scale of exponent 1 and
     * is shifted up until its leading one reaches bit 23. */
    int exp_p = (int)(abs_p >> 23);
    uint32_t mantissa_p = abs_p & 0x7fffff;
    if (exp_p > 0) {
        mantissa_p |= 0x800000;
    } else {
        exp_p = 1;
        while (!(mantissa_p & 0x800000)) {
            mantissa_p <<= 1;
            exp_p--;
        }
    }

    int exp_q = (int)(abs_q >> 23);
    uint32_t mantissa_q = abs_q & 0x7fffff;
    if (exp_q > 0) {
        mantissa_q |= 0x800000;
    } else {
        exp_q = 1;
        while (!(mantissa_q & 0x800000)) {
            mantissa_q <<= 1;
            exp_q--;
        }
    }

    int exp = exp_p - exp_q + 127;

    /* Scale the dividend so the mantissa ratio lies in [1, 2). */
    if (mantissa_p < mantissa_q) {
        mantissa_p <<= 1;
        exp--;
    }

    /* 24 result bits plus one rounding bit.  A subnormal result (exp <= 0)
     * keeps (1 - exp) fewer bits. */
    int nbits = 25;
    if (exp <= 0) {
        nbits -= 1 - exp;
        exp = 0;
        if (nbits < 0) {
            output.bits = sign;
            return output.value;
        }
    }

    uint32_t mantissa = 0;
    for (int i = 0; i < nbits; i++) {
        mantissa <<= 1;
        if (mantissa_p >= mantissa_q) {
            mantissa_p -= mantissa_q;
            mantissa |= 1;
        }
        mantissa_p <<= 1;
    }

    /* Round to nearest, ties to even: round up when the dropped bit is set
     * and either the remainder is non-zero or the kept value is odd. */
    const uint32_t sticky = (mantissa_p != 0);
    const uint32_t rnd = mantissa & 1;
    const uint32_t odd = (mantissa >> 1) & 1;
    mantissa = (mantissa >> 1) + (rnd & (sticky | odd));

    /* A normal mantissa still contains its leading one at bit 23, so adding
     * it to (exp - 1) << 23 yields the exponent field and also lets a
     * rounding carry move into the exponent. */
    const uint32_t biased = (exp > 0) ? (uint32_t)(exp - 1) : 0;
    uint32_t magnitude = (biased << 23) + mantissa;
    if (magnitude > inf_bits) {
        magnitude = inf_bits;
    }

    output.bits = sign | magnitude;
    return output.value;
}























/*
 * Approximate the square root of a non-negative integer with Newton's method,
 * x_{k+1} = (x_k + alpha / x_k) / 2, built on the software helpers CLZ,
 * int2float, div_float and add_float.
 *
 * alpha: value whose square root is wanted.
 * Returns the estimate after `iteration` refinement steps, or 0.0f when
 * alpha <= 0 (zero has root zero; a negative input has no real root).
 *
 * `iteration` and `int2float` are defined outside this block.
 */
static inline float Newtons_method(int alpha){

    if (alpha <= 0) {
        return 0.0f;
    }

    // Half of alpha's bit length.  An n-bit value has a root of about n/2
    // bits, so 2^(n/2) is a first estimate of the right magnitude.
    int half_bits = (32 - CLZ((uint32_t) alpha)) / 2;

    // Initial estimate as a float.
    float output = int2float(1 << half_bits);

    // Defensive fallback from the original code: never divide by a zero
    // estimate.
    if (output == 0){
        output = 2;
    }

    float input = int2float(alpha);

    // Each pass replaces the estimate with the mean of itself and
    // alpha / estimate.
    for (int i = 0; i < iteration; i++){
        float temp = div_float(input, output);
        output = add_float(output, temp);
        output = div_float(output, 2.0f);
    }
    return output;
}

Assembly code




























































































































































































































































































































































































































































.data
# Integer inputs whose square roots are computed, one word each.
inputdata: .word 1160030, 25, 500
# Expected results, one word per input, stored as binary32 bit patterns
# (for example 0x40a00000 is 5.0f, the square root of 25).
answer:    .word 0x4486a180,0x40a00000, 0x41B2E389
# Number of Newton refinement steps; loaded by Newtons_method as its loop bound.
iteration: .word 5

# Messages printed by printResult: labels for the input, the expected value
# and the computed value, followed by the two possible verdict lines.
str1:      .string "\n The testing number is: "
str2:      .string "\n The correct root number is:"
str3:      .string "\n The calculated root number is:"
str4:      .string "\n The answer is CORRECT"
str5:      .string "\n The answer is WRONG"
# Printed by conclude before and after the mismatch counter held in a5.
str6:      .string "\n we got total"
str7:      .string "\n error"

.text
# main: run Newtons_method on every entry of inputdata and compare each result
# with the matching entry of answer.
#   s1 = number of test cases      s2 = index of the current case
#   s3 = pointer into inputdata    s4 = pointer into answer
#   s0 = current integer input     a4 = expected result
#   a5 = number of mismatches, updated by printResult
main:
    la s3, inputdata
    li s1, 3                 # inputdata and answer each hold three words
    li s2, 0
    la s4, answer
    mv a5, x0
    mloop:
    lw a0, 0(s3)
    mv s0, a0                # keep the raw input for printResult
    jal ra, Newtons_method   # a0 = computed root (float bit pattern)

    lw a4, 0(s4)             # expected bit pattern

    jal ra, printResult
    addi s4, s4, 4
    addi s2, s2, 1
    addi s3, s3, 4
    blt s2, s1, mloop
    jal ra, conclude
    #Exit the program
    li a7, 10
    ecall
    
# Newtons_method: approximate sqrt(a0) with x = (x + n / x) / 2.
#   in : a0 = integer input n
#   out: a0 = result as a binary32 bit pattern
#   a1 = n converted to float, a2 = loop counter, a3 = iteration count
#   clobbers: a1-a3 and t0-t6 (through CLZ, int2float, div_float, add_float)
Newtons_method:
    addi sp, sp, -8
    sw ra, 4(sp)
    sw a0, 0(sp)             # keep the integer input for later

    # initial guess: 2^(bit length / 2)
    add t0, a0, x0
    jal ra, CLZ
    add a0, t0, x0

    addi a0, a0, -32
    sub a0, x0, a0           # a0 = 32 - clz = bit length of the input
    srli a0, a0, 1           # half the bit length

    li t3, 1
    sll t3, t3, a0           # t3 = 1 << (bit length / 2)
    jal ra, int2float
    mv a0, t3

    bne a0, x0, nExit        # fall back to 2.0f if the guess came out as 0
        li a0, 0x40000000
    nExit:

    # convert the integer input to float (a1)
    lw a1, 0(sp)
    mv t3, a1
    jal ra, int2float
    mv a1, t3

    # a2 counts iterations up to the total iteration count in a3
    mv a2, x0
    lw a3, iteration

    loop:
        bge a2, a3, outloop  # if a2 >= a3 leave the loop
        add t0, a0, x0
        add t1, a1, x0
        jal ra, div_float    # t4 = t1 / t0  (n / x)
        add t0, a0, x0
        jal ra, add_float    # t1 = t4 + t0  (n / x + x)
        li t0, 0x40000000    # 2.0f
        jal ra, div_float    # t4 = t1 / t0  (halve the sum)
        add a0, t4, x0       # a0 (output) = t4
        addi a2, a2, 1
        j loop
    outloop:
    lw ra, 4(sp)
    addi sp, sp, 8
    jr ra

# add_float: add two binary32 bit patterns with integer instructions.
#   in : t0, t4 = operands
#   out: t1 = sum (truncated, not rounded)
#   clobbers: t0, t2, t3, t4, t5, t6
# Exact cancellation and results whose exponent drops to zero or below return
# 0.  Infinity, NaN and exponent overflow are not handled.
add_float:
    addi sp, sp, -4
    sw ra, 0(sp)

    # magnitudes
    li t2, 0x7fffffff
    and t1, t0, t2    # t1 = abs (t0)
    and t2, t4, t2    # t2 = abs (t4)

    # make sure abs(t4) >= abs(t0), swapping the operands if necessary
    bge t2, t1, aExit1
    # switch t0 and t4
    mv t3, t0
    mv t0, t4
    mv t4, t3
    # switch t1 and t2
    mv t3, t1
    mv t1, t2
    mv t2, t3
    aExit1:
    srli t6, t2, 23   # t6 = exponent of the larger operand
    srli t5, t1, 23   # t5 = exponent of the smaller operand
    # isolate both mantissas and add the implicit one for normal numbers
    li t3, 0x7fffff
    and t2, t2, t3
    and t1, t1, t3
    bge x0, t6, aExit2
        li t3, 0x800000
        or t2, t2, t3
    aExit2:

    bge x0, t5, aExit3
        li t3, 0x800000
        or t1, t3, t1
    aExit3:
    # now we have t4 -> t2 (mantissa), t6 (exp)
    #             t0 -> t1 (mantissa), t5 (exp)

        sub t5, t6, t5      # t5 = exponent difference, never negative
        sltiu t3, t5, 32    # t3 = 1 when the shift amount fits in a register shift
        srl t1, t1, t5      # align the smaller mantissa
        bne t3, x0, aAligned
            add t1, x0, x0  # srl only uses the low 5 bits: a shift of 32+ leaves nothing
        aAligned:
        srli t0, t0, 31
        srli t4, t4, 31
        # now t0, t4 are the sign bits
        xor t0, t0, t4
        beq t0, x0, aElse4
            sub t3, t2, t1  # signs differ: subtract magnitudes
            j aExit4
        aElse4:
            add t3, t2, t1  # signs match: add magnitudes
    aExit4:

        # t2, t1, t0, t5 are free from here on
        # t3 mantissa
        # t4 sign
        # t6 exp
    add t1, x0, x0          # default result: 0
    li t0, 0x1000000
    and t0, t0, t3
    beq t0, x0, aElse5
        srli t3, t3, 1      # carry out of bit 23: shift right, bump exponent
        addi t6, t6, 1
        j aExit5
    aElse5:
        beq t3, x0, add_out # exact cancellation: return 0
        li t0, 0x800000
        and t0, t3, t0
        bne t0, x0, aExit5
            slli t3, t3, 1  # shift left until the leading one reaches bit 23
            addi t6, t6, -1
            j aElse5
    aExit5:
        bge x0, t6, add_out # exponent <= 0: flush to 0

        li t2, 0x7fffff
        and t1, t3, t2
        slli t6, t6, 23
        or t1, t1, t6
        slli t4, t4, 31
        or t1, t1, t4


    add_out:
        lw ra, 0(sp)
        addi sp, sp, 4
        jr ra



# CLZ: count the leading zero bits of t0.
#   in : t0 = 32-bit word
#   out: t0 = number of leading zeros, 0..32
#   clobbers: t1, t2 (temporaries)
# The highest set bit is smeared into all lower positions, the one bits are
# counted, and the count is subtracted from 32.
CLZ:
    srli t1, t0, 1    # t1 = t0 >> 1
    or t0, t0, t1     # t0 = t1 | t0
    srli t1, t0, 2    # t1 = t0 >> 2
    or t0, t0, t1     # t0 = t1 | t0
    srli t1, t0, 4    # t1 = t0 >> 4
    or t0, t0, t1     # t0 = t1 | t0
    srli t1, t0, 8    # t1 = t0 >> 8
    or t0, t0, t1     # t0 = t1 | t0
    srli t1, t0, 16   # t1 = t0 >> 16
    or t0, t0, t1     # t0 = t1 | t0

    # population count: 2-bit sums
    srli t1, t0, 1    # t1 = t0 >> 1
    li t2, 0x55555555
    and t1, t1, t2
    sub t0, t0, t1

    # 4-bit sums
    srli t1, t0, 2
    li t2, 0x33333333
    and t1, t1, t2
    and t0, t0, t2
    add t0, t0, t1

    # 8-bit sums
    srli t1, t0, 4
    li t2, 0x0f0f0f0f
    add t0, t1, t0
    and t0, t0, t2

    # fold the four byte sums into the low byte
    srli t1, t0, 8
    add t0, t0, t1
    srli t1, t0, 16
    add t0, t0, t1

    andi t0, t0, 0x3f # the count can be 32, which needs six bits
    addi t0, t0, -32
    sub t0, x0, t0    # t0 = 32 - count
    jr ra


# int2float: convert the integer in t3 to a binary32 bit pattern.
#   in : t3 = integer, treated as unsigned
#   out: t3 = binary32 bit pattern; 0 converts to +0.0f
#   t0 = exponent part, t1 = shift amount / mask
#   clobbers: t0, t1, t2 (t2 through CLZ)
# Values with more than 24 significant bits are truncated.
int2float:
    addi sp, sp, -4
    sw ra, 0(sp)
    beq t3, x0, intout      # zero has no leading one: return +0.0f
    add t0, t3, x0

    jal ra, CLZ

    addi t0, t0, -31
    sub t0, x0, t0          # t0 = 31 - clz = position of the leading one

    addi t1, t0, -23        # distance of the leading one from bit 23
    blt x0, t1, intRight
        sub t1, x0, t1
        sll t3, t3, t1      # leading one below bit 23: shift left
        j intPack
    intRight:
        srl t3, t3, t1      # leading one above bit 23: shift right
    intPack:
    li t1, 0x800000
    xor t3, t3, t1          # clear the implicit leading one
    addi t0, t0, 127        # apply the exponent bias

    slli t0, t0, 23
    or t3, t3, t0
intout:
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra



# div_float: divide two binary32 bit patterns, t4 = t1 / t0.
#   in : t1 = dividend, t0 = divisor
#   out: t4 = quotient, rounded to nearest with ties to even
#   t2 = result sign, t3 = result exponent, t6 = 0x800000 (implicit-one bit)
#   clobbers: t0, t1, t2, t3, t5, t6
# A zero dividend returns 0 and a zero divisor returns 0x7fffffff.  Subnormal
# operands and results are supported; results too large for binary32 are
# clamped to infinity.  Infinity and NaN operands are not handled.
div_float:
    addi sp, sp, -4
    sw ra, 0(sp)

    add t4, x0, x0
    slli t5, t1, 1          # dividend with the sign shifted out
    beq t5, x0, div_out     # 0 / x = 0
    li t4, 0x7fffffff
    slli t5, t0, 1          # divisor with the sign shifted out
    beq t5, x0, div_out     # x / 0 = 0x7fffffff

    # result sign
    xor t2, t0, t1
    srli t2, t2, 31

    li t6, 0x800000

    # dividend: t3 = exponent, t1 = 24-bit mantissa with its leading one at bit 23
    srli t3, t1, 23
    andi t3, t3, 0xff
    addi t5, t6, -1         # t5 = 0x7fffff
    and t1, t1, t5
    beq t3, x0, dSubP
        or t1, t1, t6       # normal number: add the implicit one
        j dExit1
    dSubP:
        addi t3, x0, 1      # a subnormal is scaled like exponent 1
    dNormP:
        and t5, t1, t6
        bne t5, x0, dExit1  # stop once the leading one reaches bit 23
        slli t1, t1, 1
        addi t3, t3, -1
        j dNormP
    dExit1:

    # divisor: t4 = exponent, t0 = 24-bit mantissa with its leading one at bit 23
    srli t4, t0, 23
    andi t4, t4, 0xff
    addi t5, t6, -1         # t5 = 0x7fffff
    and t0, t0, t5
    beq t4, x0, dSubQ
        or t0, t0, t6
        j dExit2
    dSubQ:
        addi t4, x0, 1
    dNormQ:
        and t5, t0, t6
        bne t5, x0, dExit2
        slli t0, t0, 1
        addi t4, t4, -1
        j dNormQ
    dExit2:

    sub t3, t3, t4
    addi t3, t3, 127        # t3 = exp(dividend) - exp(divisor) + 127

    # keep the mantissa ratio in [1, 2)
    bgeu t1, t0, dExit3
        slli t1, t1, 1
        addi t3, t3, -1
    dExit3:

    # t5 = number of quotient bits: 24 result bits plus one rounding bit
    li t5, 25
    addi t3, t3, -1         # t3 = exp - 1; the mantissa's bit 23 adds the 1 back
    bge t3, x0, dExit4      # exp >= 1: normal result
        addi t5, t3, 25     # subnormal result keeps (1 - exp) fewer bits
        add t3, x0, x0
        add t4, x0, x0
        blt t5, x0, div_out # too small even for a subnormal: return 0
    dExit4:

    # long division, quotient in t4, t5 counts down
    add t4, x0, x0
    dloop:
        bge x0, t5, doutloop
        slli t4, t4, 1      # t4 = t4 << 1
        bltu t1, t0, dExit5
            sub t1, t1, t0
            ori t4, t4, 1
        dExit5:
        slli t1, t1, 1
        addi t5, t5, -1
        j dloop
    doutloop:

    # round result
    # sticky t5, rnd t1, odd t0
    sltu t5, x0, t1         # sticky = (remainder != 0)
    andi t1, t4, 1          # rnd = the extra quotient bit
    srli t4, t4, 1          # drop the rounding bit
    andi t0, t4, 1          # odd = lowest kept bit
    or t0, t0, t5
    and t1, t1, t0          # round up when rnd && (sticky || odd)
    add t4, t4, t1

    # pack: adding lets the leading one and any rounding carry reach the exponent
    slli t3, t3, 23
    add t4, t4, t3
    li t5, 0x7f800000
    bltu t4, t5, dExit6
        mv t4, t5           # overflow: clamp to infinity
    dExit6:
    slli t2, t2, 31         # sign << 31
    or t4, t4, t2

    div_out:
        lw ra, 0(sp)
        addi sp, sp, 4
        jr ra
    
 # printResult: print one test case and its verdict.
 #   in : s0 = integer input, a0 = computed root, a4 = expected root
 #   a5 is incremented when the computed and expected values differ
 #   clobbers: a0, a7, t0, t1
 # All three values go through the integer print service, so the two roots are
 # shown as raw bit patterns in decimal.
 printResult:
     mv t0, s0 # original data
     mv t1, a0 # root data

     la a0, str1
     li a7, 4
     ecall

     mv a0, t0
     li a7, 1
     ecall

     la a0, str2
     li a7, 4
     ecall

     mv a0, a4
     li a7, 1
     ecall

     la a0, str3
     li a7, 4
     ecall

     mv a0, t1
     li a7, 1
     ecall

     beq a4, t1, CORRECT
     addi a5, a5, 1 # count the mismatch
     la a0, str5
     li a7, 4
     ecall
     j printout
    CORRECT:
     la a0, str4
     li a7, 4
     ecall
     printout:
     ret
    
# conclude: print the summary line, i.e. str6, the mismatch count held in a5,
# then str7.
#   clobbers: a0, a7
conclude:
     # leading text
     addi a7, x0, 4
     la a0, str6
     ecall
     # mismatch counter
     addi a7, x0, 1
     addi a0, a5, 0
     ecall
     # trailing text
     addi a7, x0, 4
     la a0, str7
     ecall
     jalr x0, ra, 0

Analysis

pseudo instruction


00000000 <main>:
    0:        10000997        auipc x19 0x10000
    4:        00098993        addi x19 x19 0
    8:        00200493        addi x9 x0 2
    c:        00000913        addi x18 x0 0
    10:        10000a17        auipc x20 0x10000
    14:        ffca0a13        addi x20 x20 -4
    18:        00000793        addi x15 x0 0

0000001c <mloop>:
    1c:        0009a503        lw x10 0 x19
    20:        00050413        addi x8 x10 0
    24:        028000ef        jal x1 40 <Newtons_method>
    28:        000a2703        lw x14 0 x20
    2c:        3d4000ef        jal x1 980 <printResult>
    30:        004a0a13        addi x20 x20 4
    34:        00190913        addi x18 x18 1
    38:        00498993        addi x19 x19 4
    3c:        fe9940e3        blt x18 x9 -32 <mloop>
    40:        450000ef        jal x1 1104 <conclude>
    44:        00a00893        addi x17 x0 10
    48:        00000073        ecall

0000004c <Newtons_method>:
    4c:        ff810113        addi x2 x2 -8
    50:        00112223        sw x1 4 x2
    54:        00a12023        sw x10 0 x2
    58:        000502b3        add x5 x10 x0
    5c:        168000ef        jal x1 360 <CLZ>
    60:        00028533        add x10 x5 x0
    64:        fe050513        addi x10 x10 -32
    68:        40a00533        sub x10 x0 x10
    6c:        00155513        srli x10 x10 1
    70:        00050e13        addi x28 x10 0
    74:        1d8000ef        jal x1 472 <int2float>
    78:        000e0513        addi x10 x28 0
    7c:        00051463        bne x10 x0 8 <nExit>
    80:        40000537        lui x10 0x40000

00000084 <nExit>:
    84:        00012583        lw x11 0 x2
    88:        00058e13        addi x28 x11 0
    8c:        1c0000ef        jal x1 448 <int2float>
    90:        000e0593        addi x11 x28 0
    94:        00000613        addi x12 x0 0
    98:        10000697        auipc x13 0x10000
    9c:        f806a683        lw x13 -128 x13

000000a0 <loop>:
    a0:        02d65663        bge x12 x13 44 <outloop>
    a4:        000502b3        add x5 x10 x0
    a8:        00058333        add x6 x11 x0
    ac:        1e8000ef        jal x1 488 <div_float>
    b0:        000502b3        add x5 x10 x0
    b4:        024000ef        jal x1 36 <add_float>
    b8:        400002b7        lui x5 0x40000
    bc:        1d8000ef        jal x1 472 <div_float>
    c0:        000e8533        add x10 x29 x0
    c4:        00160613        addi x12 x12 1
    c8:        fd9ff06f        jal x0 -40 <loop>

000000cc <outloop>:
    cc:        00412083        lw x1 4 x2
    d0:        00810113        addi x2 x2 8
    d4:        00008067        jalr x0 x1 0

000000d8 <add_float>:
    d8:        ffc10113        addi x2 x2 -4
    dc:        00112023        sw x1 0 x2
    e0:        800003b7        lui x7 0x80000
    e4:        fff38393        addi x7 x7 -1
    e8:        0072f333        and x6 x5 x7
    ec:        007ef3b3        and x7 x29 x7
    f0:        0063de63        bge x7 x6 28 <aExit1>
    f4:        00028e13        addi x28 x5 0
    f8:        000e8293        addi x5 x29 0
    fc:        000e0e93        addi x29 x28 0
    100:        00030e13        addi x28 x6 0
    104:        00038313        addi x6 x7 0
    108:        000e0393        addi x7 x28 0

0000010c <aExit1>:
    10c:        0173df93        srli x31 x7 23
    110:        01735f13        srli x30 x6 23
    114:        00800e37        lui x28 0x800
    118:        fffe0e13        addi x28 x28 -1
    11c:        01c3f3b3        and x7 x7 x28
    120:        01c37333        and x6 x6 x28
    124:        01f05663        bge x0 x31 12 <aExit2>
    128:        00800e37        lui x28 0x800
    12c:        01c3e3b3        or x7 x7 x28

00000130 <aExit2>:
    130:        01e05663        bge x0 x30 12 <aExit3>
    134:        00800e37        lui x28 0x800
    138:        006e6333        or x6 x28 x6

0000013c <aExit3>:
    13c:        41ef8f33        sub x30 x31 x30
    140:        01e35333        srl x6 x6 x30
    144:        01f2d293        srli x5 x5 31
    148:        01fede93        srli x29 x29 31
    14c:        01d2c2b3        xor x5 x5 x29
    150:        00028663        beq x5 x0 12 <aElse4>
    154:        40638e33        sub x28 x7 x6
    158:        0080006f        jal x0 8 <aExit4>

0000015c <aElse4>:
    15c:        00638e33        add x28 x7 x6

00000160 <aExit4>:
    160:        010002b7        lui x5 0x1000
    164:        01c2f2b3        and x5 x5 x28
    168:        00028863        beq x5 x0 16 <aElse5>
    16c:        001e5e13        srli x28 x28 1
    170:        001f8f93        addi x31 x31 1
    174:        0200006f        jal x0 32 <aExit5>

00000178 <aElse5>:
    178:        000e0e63        beq x28 x0 28 <aExit5>
    17c:        008002b7        lui x5 0x800
    180:        005e72b3        and x5 x28 x5
    184:        00029863        bne x5 x0 16 <aExit5>
    188:        001e1e13        slli x28 x28 1
    18c:        ffff8f93        addi x31 x31 -1
    190:        fe9ff06f        jal x0 -24 <aElse5>

00000194 <aExit5>:
    194:        00000333        add x6 x0 x0
    198:        03f05063        bge x0 x31 32 <add_out>
    19c:        008003b7        lui x7 0x800
    1a0:        fff38393        addi x7 x7 -1
    1a4:        007e7333        and x6 x28 x7
    1a8:        017f9f93        slli x31 x31 23
    1ac:        01f36333        or x6 x6 x31
    1b0:        01fe9e93        slli x29 x29 31
    1b4:        01d36333        or x6 x6 x29

000001b8 <add_out>:
    1b8:        00012083        lw x1 0 x2
    1bc:        00410113        addi x2 x2 4
    1c0:        00008067        jalr x0 x1 0

000001c4 <CLZ>:
    1c4:        0012d313        srli x6 x5 1
    1c8:        0062e2b3        or x5 x5 x6
    1cc:        0022d313        srli x6 x5 2
    1d0:        0062e2b3        or x5 x5 x6
    1d4:        0042d313        srli x6 x5 4
    1d8:        0062e2b3        or x5 x5 x6
    1dc:        0082d313        srli x6 x5 8
    1e0:        0062e2b3        or x5 x5 x6
    1e4:        0102d313        srli x6 x5 16
    1e8:        0062e2b3        or x5 x5 x6
    1ec:        0012d313        srli x6 x5 1
    1f0:        555553b7        lui x7 0x55555
    1f4:        55538393        addi x7 x7 1365
    1f8:        00737333        and x6 x6 x7
    1fc:        406282b3        sub x5 x5 x6
    200:        0022d313        srli x6 x5 2
    204:        333333b7        lui x7 0x33333
    208:        33338393        addi x7 x7 819
    20c:        00737333        and x6 x6 x7
    210:        0072f2b3        and x5 x5 x7
    214:        006282b3        add x5 x5 x6
    218:        0042d313        srli x6 x5 4
    21c:        0f0f13b7        lui x7 0xf0f1
    220:        f0f38393        addi x7 x7 -241
    224:        005302b3        add x5 x6 x5
    228:        0072f2b3        and x5 x5 x7
    22c:        0082d313        srli x6 x5 8
    230:        006282b3        add x5 x5 x6
    234:        0102d313        srli x6 x5 16
    238:        006282b3        add x5 x5 x6
    23c:        01f2f293        andi x5 x5 31
    240:        fe028293        addi x5 x5 -32
    244:        405002b3        sub x5 x0 x5
    248:        00008067        jalr x0 x1 0

0000024c <int2float>:
    24c:        ffc10113        addi x2 x2 -4
    250:        00112023        sw x1 0 x2
    254:        000e02b3        add x5 x28 x0
    258:        f6dff0ef        jal x1 -148 <CLZ>
    25c:        00128293        addi x5 x5 1
    260:        fe028293        addi x5 x5 -32
    264:        405002b3        sub x5 x0 x5
    268:        fe928313        addi x6 x5 -23
    26c:        40600333        sub x6 x0 x6
    270:        006e1e33        sll x28 x28 x6
    274:        00800337        lui x6 0x800
    278:        006e4e33        xor x28 x28 x6
    27c:        07f28293        addi x5 x5 127
    280:        01729293        slli x5 x5 23
    284:        005e6e33        or x28 x28 x5

00000288 <intout>:
    288:        00012083        lw x1 0 x2
    28c:        00410113        addi x2 x2 4
    290:        00008067        jalr x0 x1 0

00000294 <div_float>:
    294:        ffc10113        addi x2 x2 -4
    298:        00112023        sw x1 0 x2
    29c:        00000eb3        add x29 x0 x0
    2a0:        14030a63        beq x6 x0 340 <div_out>
    2a4:        80000eb7        lui x29 0x80000
    2a8:        fffe8e93        addi x29 x29 -1
    2ac:        14028463        beq x5 x0 328 <div_out>
    2b0:        01f35393        srli x7 x6 31
    2b4:        01735e13        srli x28 x6 23
    2b8:        0ffe7e13        andi x28 x28 255
    2bc:        00800eb7        lui x29 0x800
    2c0:        fffe8e93        addi x29 x29 -1
    2c4:        01d37333        and x6 x6 x29
    2c8:        00800eb7        lui x29 0x800
    2cc:        01c05663        bge x0 x28 12 <dElse1>
    2d0:        01d36333        or x6 x6 x29
    2d4:        0140006f        jal x0 20 <dExit1>

000002d8 <dElse1>:
    2d8:        00131313        slli x6 x6 1
    2dc:        fffe0e13        addi x28 x28 -1
    2e0:        01d37f33        and x30 x6 x29
    2e4:        fe0f1ae3        bne x30 x0 -12 <dElse1>

000002e8 <dExit1>:
    2e8:        ff810113        addi x2 x2 -8
    2ec:        00712023        sw x7 0 x2
    2f0:        01c12223        sw x28 4 x2
    2f4:        01f2d393        srli x7 x5 31
    2f8:        0013f393        andi x7 x7 1
    2fc:        0172de13        srli x28 x5 23
    300:        0ffe7e13        andi x28 x28 255
    304:        00800eb7        lui x29 0x800
    308:        fffe8e93        addi x29 x29 -1
    30c:        01d2f2b3        and x5 x5 x29
    310:        00800eb7        lui x29 0x800
    314:        01c05663        bge x0 x28 12 <dElse2>
    318:        01d2e2b3        or x5 x5 x29
    31c:        0140006f        jal x0 20 <dExit2>

00000320 <dElse2>:
    320:        00129293        slli x5 x5 1
    324:        fffe0e13        addi x28 x28 -1
    328:        01d2ff33        and x30 x5 x29
    32c:        fe0f1ae3        bne x30 x0 -12 <dElse2>

00000330 <dExit2>:
    330:        00012e83        lw x29 0 x2
    334:        01d3c3b3        xor x7 x7 x29
    338:        00412e83        lw x29 4 x2
    33c:        07fe8e93        addi x29 x29 127
    340:        41ce8e33        sub x28 x29 x28
    344:        00810113        addi x2 x2 8
    348:        00535663        bge x6 x5 12 <dExit3>
    34c:        00131313        slli x6 x6 1
    350:        fffe0e13        addi x28 x28 -1

00000354 <dExit3>:
    354:        01900f13        addi x30 x0 25
    358:        00000eb3        add x29 x0 x0
    35c:        000e5863        bge x28 x0 16 <dExit4>
    360:        01cf0f33        add x30 x30 x28
    364:        00000e33        add x28 x0 x0
    368:        080f4663        blt x30 x0 140 <div_out>

0000036c <dExit4>:
    36c:        00000f93        addi x31 x0 0

00000370 <dloop>:
    370:        03efd063        bge x31 x30 32 <doutloop>
    374:        001e9e93        slli x29 x29 1
    378:        00534663        blt x6 x5 12 <dExit5>
    37c:        40530333        sub x6 x6 x5
    380:        001eee93        ori x29 x29 1

00000384 <dExit5>:
    384:        00131313        slli x6 x6 1
    388:        001f8f93        addi x31 x31 1
    38c:        fe5ff06f        jal x0 -28 <dloop>

00000390 <doutloop>:
    390:        00134f13        xori x30 x6 1
    394:        001ef313        andi x6 x29 1
    398:        002ef293        andi x5 x29 2
    39c:        001ede93        srli x29 x29 1
    3a0:        01e2e2b3        or x5 x5 x30
    3a4:        00537333        and x6 x6 x5
    3a8:        006e8eb3        add x29 x29 x6
    3ac:        020e1663        bne x28 x0 44 <dExit6>
    3b0:        00900313        addi x6 x0 9
    3b4:        0262d263        bge x5 x6 36 <dExit6>
    3b8:        ffc10113        addi x2 x2 -4
    3bc:        00712023        sw x7 0 x2
    3c0:        e05ff0ef        jal x1 -508 <CLZ>
    3c4:        00012383        lw x7 0 x2
    3c8:        00410113        addi x2 x2 4
    3cc:        40530333        sub x6 x6 x5
    3d0:        006edeb3        srl x29 x29 x6
    3d4:        006e0e33        add x28 x28 x6

000003d8 <dExit6>:
    3d8:        00800f37        lui x30 0x800
    3dc:        ffff0f13        addi x30 x30 -1
    3e0:        01eefeb3        and x29 x29 x30
    3e4:        017e1e13        slli x28 x28 23
    3e8:        01f39393        slli x7 x7 31
    3ec:        01ceeeb3        or x29 x29 x28
    3f0:        007eeeb3        or x29 x29 x7

000003f4 <div_out>:
    3f4:        00012083        lw x1 0 x2
    3f8:        00410113        addi x2 x2 4
    3fc:        00008067        jalr x0 x1 0

00000400 <printResult>:
    400:        00040293        addi x5 x8 0
    404:        00050313        addi x6 x10 0
    408:        00070393        addi x7 x14 0
    40c:        10000517        auipc x10 0x10000
    410:        c1050513        addi x10 x10 -1008
    414:        00400893        addi x17 x0 4
    418:        00000073        ecall
    41c:        00028513        addi x10 x5 0
    420:        00100893        addi x17 x0 1
    424:        00000073        ecall
    428:        10000517        auipc x10 0x10000
    42c:        c0e50513        addi x10 x10 -1010
    430:        00400893        addi x17 x0 4
    434:        00000073        ecall
    438:        00070513        addi x10 x14 0
    43c:        00100893        addi x17 x0 1
    440:        00000073        ecall
    444:        10000517        auipc x10 0x10000
    448:        c1050513        addi x10 x10 -1008
    44c:        00400893        addi x17 x0 4
    450:        00000073        ecall
    454:        00030513        addi x10 x6 0
    458:        00100893        addi x17 x0 1
    45c:        00000073        ecall
    460:        00a70e63        beq x14 x10 28 <CORRECT>
    464:        00178793        addi x15 x15 1
    468:        10000517        auipc x10 0x10000
    46c:        c0d50513        addi x10 x10 -1011
    470:        00400893        addi x17 x0 4
    474:        00000073        ecall
    478:        0140006f        jal x0 20 <printout>

0000047c <CORRECT>:
    47c:        10000517        auipc x10 0x10000
    480:        bf950513        addi x10 x10 -1031
    484:        00400893        addi x17 x0 4
    488:        00000073        ecall

0000048c <printout>:
    48c:        00008067        jalr x0 x1 0

00000490 <conclude>:
    490:        10000517        auipc x10 0x10000
    494:        c1350513        addi x10 x10 -1005
    498:        00400893        addi x17 x0 4
    49c:        00000073        ecall
    4a0:        00078513        addi x10 x15 0
    4a4:        00100893        addi x17 x0 1
    4a8:        00000073        ecall
    4ac:        10000517        auipc x10 0x10000
    4b0:        c0650513        addi x10 x10 -1018
    4b4:        00400893        addi x17 x0 4
    4b8:        00000073        ecall
    4bc:        00008067        jalr x0 x1 0

Result

The code above only accepts integer inputs, so the input data must lie in the range

[1, 2^{31}]

As a result, I selected three integers as my test data: 1160030, 25, and 500.

Each output is exactly the same as the value I calculated with the C code, with both implementations running 5 Newton iterations.

Reference:

Always refer to primary sources, such as official RISC-V documentation.