Try   HackMD

Assignment1: RISC-V Assembly and Instruction Pipeline

The bfloat16 format is a 16-bit floating-point representation, designed to provide a wide dynamic range by using a floating radix point. It is a shortened version of the 32-bit IEEE 754 single-precision format (binary32), aimed at accelerating machine learning.

The structure of the bfloat16 floating-point format is as follows.

Quiz 1 Problem B:

        ┌ sign 
        │
        │   ┌ exponent
        │   │
        │   │      ┌ mantissa 
        │   │      │
        │┌──┴───┐┌─┴───┐
      0b0000000000000000 bfloat16

Implementation

C code

Why was union used rather than arbitrary pointer access?

/* A bfloat16 value, carried around as its raw 16-bit pattern. */
typedef struct {
    uint16_t bits;
} bf16_t;

/*
 * Narrow a binary32 float to bfloat16.
 *
 * The float's bit pattern is read through a union. NaN inputs keep their
 * upper 16 bits and get bit 6 (the top mantissa bit of bfloat16) set so the
 * result is a quiet NaN. Every other input is rounded to nearest, ties to
 * even, on the 16 bits that are dropped.
 */
static inline bf16_t fp32_to_bf16(float s)
{
    union {
        float f;
        uint32_t i;
    } pun;
    pun.f = s;

    const uint32_t word = pun.i;
    const uint32_t upper = word >> 16;
    bf16_t out;

    if ((word & 0x7fffffff) <= 0x7f800000) {
        /* Not a NaN: add 0x7fff plus the current LSB of the kept half so
         * that exact ties round towards an even result. */
        const uint32_t bias = 0x7fff + (upper & 1);
        out.bits = (uint16_t)((word + bias) >> 16);
    } else {
        /* NaN: truncate and force the quiet bit. */
        out.bits = (uint16_t)(upper | 64);
    }
    return out;
}
/*
 * Widen a bfloat16 value to binary32.
 *
 * The 16 stored bits become the upper half of the 32-bit pattern and the
 * lower half is zero; the pattern is then reinterpreted as a float through
 * a union.
 */
static inline float bf16_to_fp32(bf16_t h)
{
    union {
        float f;
        uint32_t i;
    } pun;

    pun.i = h.bits;
    pun.i <<= 16;
    return pun.f;
}

Compile result:

![image](https://hackmd.io/_uploads/SJ_s9NZykg.png)

Do not use screenshots for plain text content, as this is inaccessible to visually impaired users.

Assembly code

# Test driver: load one binary32 bit pattern and convert it with fp32_to_fp16.
.data
arr: .word   0xc1cc0000   # test input: binary32 bit pattern of -25.5

.text

# Entry point. The argument is handed to fp32_to_fp16 in s0 (the callee reads
# s0, not a0) and the converted value comes back in a0; it is not used here.
main:
    lw s0, arr           # s0 = arr (pseudo-instruction: load from label)
    jal ra, fp32_to_fp16 # a0 = fp32_to_fp16(s0)
    li a7,10             # ecall number 10: exit (NOTE(review): simulator ecall convention, confirm)
    ecall                # terminate the program

# fp32_to_fp16: convert a binary32 bit pattern to bfloat16.
#   in : s0 = binary32 bit pattern (read-only, preserved)
#   out: a0 = bfloat16 bit pattern in the low 16 bits
#   clobbers: t0, t1, t2
# Mirrors the C reference fp32_to_bf16: NaN inputs are truncated and forced
# quiet; everything else (including +/-Inf) is rounded to nearest, ties to even.
# Fix: the NaN test now compares the magnitude with 0x7f800000 instead of only
# checking "exponent == 0xff", which wrongly turned +/-Inf into a NaN.
fp32_to_fp16:
    addi sp, sp, -8      # sp -= 8;
    sw ra, 4(sp)         # *(sp + 4) = ra;
    sw s0, 0(sp)         # *sp = s0;

    slli t0, s0, 1       # t0 = s0 << 1;          (drop the sign bit)
    srli t0, t0, 1       # t0 = s0 & 0x7fffffff;  (magnitude)
    li t1, 0x7f800000    # t1 = bit pattern of +Inf;

    srli a0, s0, 16      # a0 = s0 >> 16;

    bgeu t1, t0, Else    # if (t0 <= 0x7f800000) goto Else;  (not a NaN)
        ori a0, a0, 64   # a0 |= 64;  (force quiet NaN)
        j Exit           # goto Exit;
    Else:
        andi a0, a0, 1   # a0 &= 1;           (LSB of the kept half)
        li t2, 0x7fff    # t2 = 0x7fff;
        add a0, a0, t2   # a0 += t2;          (rounding bias)
        add a0, s0, a0   # a0 = s0 + a0;
        srli a0, a0, 0x10 # a0 >>= 16;

    Exit:
        lw s0, 0(sp)     # s0 = *sp;
        lw ra, 4(sp)     # ra = *(sp + 4);
        addi sp, sp, 8   # sp += 8;
        jr ra            # return;

# bf16_to_fp32: widen a bfloat16 bit pattern to binary32.
#   in : s0 = bfloat16 bit pattern in the low 16 bits (preserved)
#   out: a0 = binary32 bit pattern (s0 << 16)
# Fix: the prologue used to store s1 while the epilogue reloaded that slot
# into s0, so the caller's s0 was overwritten with s1. The prologue now saves
# s0, matching the epilogue.
bf16_to_fp32:
    addi sp, sp, -8      # sp -= 8;
    sw ra, 4(sp)         # *(sp + 4) = ra;
    sw s0, 0(sp)         # *sp = s0;

    slli a0, s0, 16      # a0 = s0 << 16;

    lw s0, 0(sp)         # s0 = *sp;
    lw ra, 4(sp)         # ra = *(sp + 4);
    addi sp, sp, 8       # sp += 8;
    jr ra                # return;

Implement square root using CLZ method

Using Newton's method

The Newton-Raphson method, also known as Newton's method, is widely used for finding the roots of polynomials. The algorithm for Newton's method is as follows:

Given the function $f(x)$, we can find its root using the iterative process:

$$x_{k+1} = x_k - \frac{f(x_k)}{f'(x_k)}, \quad \text{where } k \in (1, \infty)$$

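Applying this method to solve $x^2 = a$ (that is, $f(x) = x^2 - a$ and $f'(x) = 2x$), the iterative process becomes:

$$x_{k+1} = x_k - \frac{x_k^2 - a}{2x_k} = \frac{1}{2}\left(x_k + \frac{a}{x_k}\right)$$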

Without loss of generality, we can write the formula above as follows:

$$x_{k+1} = 0.5\left(x_k + \frac{a}{x_k}\right)$$

This formula refines the estimate of the square root of $a$ with each iteration.

Initial guess

With the formula above, we can easily calculate the precise square root. However, selecting an appropriate initial guess is crucial. A good initial guess can significantly reduce the number of iterations needed to reach the correct root. Using CLZ (count leading zeros) is an effective method for determining a suitable initial guess.

Code implementation

C code

      ________________________________________________________________
     |_0_|_______8______|____________________23_______________________|
      sign  exponential                   mantissa

Calculate leading zeros

/*
 * Count the leading zero bits of a 32-bit word.
 *
 * Returns 32 for x == 0 and 0 when bit 31 is set.
 *
 * Fixes relative to the previous version:
 *  - the last population-count fold (x += x >> 16) was missing, so only the
 *    two low bytes were summed and any input with bits above bit 15 gave a
 *    wrong count;
 *  - the final mask was 0x1f, which turns a population count of 32 into 0
 *    (reporting 32 leading zeros for inputs with bit 31 set). The count
 *    needs six bits, hence 0x3f.
 */
uint8_t CLZ(uint32_t x)
{
    /* Smear the highest set bit into every lower position. */
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;

    /* Population count of the smeared word = position of the MSB + 1. */
    x -= (x >> 1) & 0x55555555;
    x = ((x >> 2) & 0x33333333) + (x & 0x33333333);
    x = ((x >> 4) + x) & 0x0f0f0f0f;
    x += x >> 8;
    x += x >> 16;

    return (uint8_t)(32 - (x & 0x3f));
}

Float addition

/*
 * Software addition of two binary32 values using integer operations only.
 *
 * The operand with the larger magnitude supplies the sign and the starting
 * exponent; the smaller operand's mantissa is shifted right to align with it.
 * Bits shifted out are discarded (the result is truncated, not rounded).
 *
 * Limitations: NaN and infinity inputs are not handled, and a result whose
 * exponent field would be <= 0 is flushed to +0.0f.
 *
 * Fixes relative to the previous version:
 *  - the sign swap assigned sign_b from the already-overwritten sign_a, so
 *    mixed-sign operations with |a| < |b| were computed as same-sign additions;
 *  - exact cancellation (a + -a) left the mantissa at 0 with the old exponent
 *    and returned +/-2^e instead of zero;
 *  - the exponent was unsigned, so the "exp <= 0" underflow test could not
 *    see a negative exponent;
 *  - an alignment shift of 32 or more is undefined behaviour in C and is now
 *    treated as "the smaller operand vanishes";
 *  - an exponent that reaches 255 now yields a signed infinity instead of a
 *    pattern with stray mantissa bits.
 */
static inline float add_float(float a, float b)
{
    /* Adding zero returns the other operand unchanged. */
    if (a == 0.0f)
        return b;
    if (b == 0.0f)
        return a;

    union {
        uint32_t bits;
        float value;
    } fpa = {.value = a}, fpb = {.value = b}, out;

    uint32_t sign_a = fpa.bits & 0x80000000;
    uint32_t sign_b = fpb.bits & 0x80000000;
    uint32_t abs_a = fpa.bits & 0x7fffffff;
    uint32_t abs_b = fpb.bits & 0x7fffffff;

    /* Make (sign_a, abs_a) the operand with the larger magnitude. */
    if (abs_a < abs_b) {
        uint32_t temp = abs_a;
        abs_a = abs_b;
        abs_b = temp;

        temp = sign_a;
        sign_a = sign_b;
        sign_b = temp;
    }

    int32_t exp_a = (int32_t)((abs_a >> 23) & 0xff);
    int32_t exp_b = (int32_t)((abs_b >> 23) & 0xff);

    /* Normal numbers carry an implicit leading 1; subnormals do not. */
    uint32_t mantissa_a = abs_a & 0x7fffff;
    if (exp_a > 0)
        mantissa_a |= 0x800000;
    uint32_t mantissa_b = abs_b & 0x7fffff;
    if (exp_b > 0)
        mantissa_b |= 0x800000;

    /* Align the smaller operand; exp_a >= exp_b because abs_a >= abs_b. */
    uint32_t diff_exp = (uint32_t)(exp_a - exp_b);
    mantissa_b = (diff_exp < 32) ? (mantissa_b >> diff_exp) : 0;

    uint32_t mantissa;
    if (sign_a != sign_b)
        mantissa = mantissa_a - mantissa_b;
    else
        mantissa = mantissa_a + mantissa_b;

    /* Exact cancellation. */
    if (mantissa == 0)
        return 0.0f;

    /* Renormalise so that bit 23 is the leading 1. */
    int32_t exp = exp_a;
    if (mantissa & 0x1000000) {
        mantissa >>= 1;
        exp++;
    } else {
        while (!(mantissa & 0x800000)) {
            mantissa <<= 1;
            exp--;
        }
    }

    /* Underflow flushes to zero, overflow saturates to infinity. */
    if (exp <= 0)
        return 0.0f;
    if (exp >= 0xff) {
        out.bits = sign_a | 0x7f800000;
        return out.value;
    }

    out.bits = sign_a | ((uint32_t)exp << 23) | (mantissa & 0x7fffff);
    return out.value;
}

float division

/*
 * Software division p / q of two binary32 values using integer operations.
 *
 * The mantissas are divided with a bit-serial long division that produces up
 * to 25 quotient bits (24 result bits plus one rounding bit); the remainder
 * supplies the sticky bit for round-to-nearest, ties-to-even.
 *
 * Relies on CLZ() to normalise subnormal operands. NaN and infinity inputs
 * are not handled.
 * NOTE(review): the subnormal-result path (exp <= 0) is kept exactly as it
 * was and has not been verified against IEEE 754 reference results.
 *
 * Fixes relative to the previous version:
 *  - zero operands were fed through the normalisation code, so 0 / x could
 *    return a non-zero power of two and x / 0 returned garbage. Now
 *    0 / x = signed zero, x / 0 = signed infinity, 0 / 0 = quiet NaN;
 *  - "odd" was kept as 0 or 2, so it never contributed to
 *    "rnd & (sticky | odd)" and exact ties with an odd result were not
 *    rounded up; it is now reduced to 0 or 1;
 *  - an exponent of 255 or more was shifted into the sign bit (signed
 *    overflow of exp << 23); it now saturates to a signed infinity.
 */
static inline float div_float(float p, float q)
{
    union {
        uint32_t bits;
        float value;
    } fpp = {.value = p}, fpq = {.value = q}, output;

    uint32_t sign = (fpp.bits >> 31) ^ (fpq.bits >> 31);

    /* Zero operands are answered directly. */
    if ((fpq.bits & 0x7FFFFFFF) == 0) {
        if ((fpp.bits & 0x7FFFFFFF) == 0)
            output.bits = 0x7FC00000;                 /* 0 / 0 */
        else
            output.bits = (sign << 31) | 0x7F800000;  /* x / 0 */
        return output.value;
    }
    if ((fpp.bits & 0x7FFFFFFF) == 0) {
        output.bits = sign << 31;                     /* 0 / x */
        return output.value;
    }

    /* Exponent and mantissa of p; subnormals are shifted up to bit 23. */
    int32_t exp_p = (int32_t)((fpp.bits >> 23) & 0xff);
    uint32_t mantissa_p = fpp.bits & 0x7FFFFF;
    if (exp_p > 0) {
        mantissa_p |= 0x800000; /* implicit leading 1 */
    } else {
        int dif = CLZ(mantissa_p) - 8;
        mantissa_p <<= dif;
        exp_p = 1 - dif;
    }

    /* Exponent and mantissa of q, same treatment. */
    int32_t exp_q = (int32_t)((fpq.bits >> 23) & 0xff);
    uint32_t mantissa_q = fpq.bits & 0x7FFFFF;
    if (exp_q > 0) {
        mantissa_q |= 0x800000; /* implicit leading 1 */
    } else {
        int dif = CLZ(mantissa_q) - 8;
        mantissa_q <<= dif;
        exp_q = 1 - dif;
    }

    int exp = exp_p - exp_q + 127;
    uint32_t mantissa = 0;

    /* Make the dividend mantissa at least as large as the divisor's so the
     * first quotient bit is 1. */
    if (mantissa_p < mantissa_q) {
        mantissa_p <<= 1;
        exp--;
    }

    /* A negative exponent produces fewer quotient bits. */
    int nbits = 25;
    if (exp < 0) {
        nbits += exp;
        exp = 0;
        if (nbits < 0) {
            return 0;
        }
    }

    /* Bit-serial long division of the mantissas. */
    for (int i = 0; i < nbits; i++) {
        mantissa <<= 1;
        if (mantissa_p >= mantissa_q) {
            mantissa_p -= mantissa_q;
            mantissa |= 1;
        }
        mantissa_p <<= 1;
    }

    /* Round to nearest, ties to even. */
    uint8_t sticky = (mantissa_p != 0);
    uint8_t rnd = (uint8_t)(mantissa & 1);
    uint8_t odd = (uint8_t)((mantissa >> 1) & 1);
    mantissa = (mantissa >> 1) + (rnd & (sticky | odd));

    /* Normalise the result if needed (unchanged from the original). */
    int lz = CLZ(mantissa);
    if (exp == 0 && (lz < 9)) {
        mantissa >>= (9 - lz);
        exp += (9 - lz);
    }

    /* Overflow saturates to infinity. */
    if (exp >= 0xff) {
        output.bits = (sign << 31) | 0x7F800000;
        return output.value;
    }

    output.bits = (sign << 31) | ((uint32_t)exp << 23) | (mantissa & 0x7FFFFF);
    return output.value;
}
static inline float Newtons_method(int alpha){ // do the leading zero counting int lzc = (32 - CLZ(round_a))/2; //init float float output = int2float(lzc); if (output == 0){ output = 2; } float input = int2float(alpha); //iteration loop for (int i = 0; i < iteration; i++){ float temp = div_float(input, output); output = add_float(output , temp); output = div_float(output, (float) 2); } return output; }

Assembly code

# Square root by Newton's method on RV32I, using software float add/divide.
#
# Changes relative to the previous version of this listing:
#  - main now runs all three test vectors (the loop bound was 2);
#  - printResult prints str5 ("WRONG") on a mismatch (it printed str4);
#  - CLZ masks the bit count with 0x3f so inputs with bit 31 set return 0;
#  - int2float shifts right when the input has more than 24 significant bits
#    (the shift amount used to go negative);
#  - add_float returns 0 on exact cancellation and drops the smaller operand
#    when the exponent gap is 32 or more (srl only uses the low 5 bits);
#  - div_float: zero tests ignore the sign bit; subnormal operands are
#    normalised until bit 23 is set with the exponent starting at 1 (the loop
#    condition was inverted); the sticky bit is "remainder != 0" and the odd
#    bit is moved to bit 0 before use; the result normalisation calls CLZ on
#    the mantissa before comparing with 9 (it compared an unrelated register
#    and called CLZ on it afterwards).
# NOTE(review): NaN/infinity inputs are not handled, and the subnormal-result
# path of div_float mirrors the C reference without independent verification.

.data
inputdata: .word 1160030, 25, 500
answer:    .word 0x4486a180,0x40a00000, 0x41B2E389
iteration: .word 5
str1: .string "\n The testing number is: "
str2: .string "\n The correct root number is:"
str3: .string "\n The calculated root number is:"
str4: .string "\n The answer is CORRECT"
str5: .string "\n The answer is WRONG"
str6: .string "\n we got total"
str7: .string "\n error"

.text
# main: run Newtons_method on every entry of inputdata and compare the
# result with the matching entry of answer.
#   s1 = number of tests, s2 = test index, s3 = input pointer,
#   s4 = answer pointer, a5 = number of mismatches
main:
    la   s3, inputdata
    li   s1, 3                  # three test vectors in inputdata/answer
    li   s2, 0
    la   s4, answer
    mv   a5, x0
mloop:
    lw   a0, 0(s3)
    mv   s0, a0                 # keep the integer input for printResult
    jal  ra, Newtons_method
    lw   a4, 0(s4)              # expected bit pattern
    jal  ra, printResult
    addi s4, s4, 4
    addi s2, s2, 1
    addi s3, s3, 4
    blt  s2, s1, mloop
    jal  ra, conclude
    # Exit the program
    li   a7, 10
    ecall

# Newtons_method
#   in : a0 = integer input
#   out: a0 = binary32 bit pattern of the approximated square root
#   uses a1 (input as float), a2 (loop counter), a3 (iteration count)
Newtons_method:
    addi sp, sp, -8
    sw   ra, 4(sp)
    sw   a0, 0(sp)              # keep the integer input
    # initial guess: half the bit length of the input, as a float
    add  t0, a0, x0
    jal  ra, CLZ
    add  a0, t0, x0
    addi a0, a0, -32
    sub  a0, x0, a0             # a0 = 32 - clz
    srli a0, a0, 1
    mv   t3, a0
    jal  ra, int2float
    mv   a0, t3
    bne  a0, x0, nExit
    li   a0, 0x40000000         # fall back to 2.0f
nExit:
    # convert the input to float and load the total iteration count
    lw   a1, 0(sp)
    mv   t3, a1
    jal  ra, int2float
    mv   a1, t3
    mv   a2, x0
    lw   a3, iteration
loop:
    bge  a2, a3, outloop        # if a2 >= a3 leave the loop
    add  t0, a0, x0
    add  t1, a1, x0
    jal  ra, div_float          # t4 = input / output
    add  t0, a0, x0
    jal  ra, add_float          # t1 = t4 + output
    li   t0, 0x40000000
    jal  ra, div_float          # t4 = t1 / 2.0
    add  a0, t4, x0             # output = t4
    addi a2, a2, 1
    j    loop
outloop:
    lw   ra, 4(sp)
    addi sp, sp, 8
    jr   ra

# add_float
#   in : t0, t4 = binary32 operands
#   out: t1 = t0 + t4 (truncated, not rounded)
#   clobbers t0, t2, t3, t4, t5, t6
add_float:
    addi sp, sp, -4
    sw   ra, 0(sp)
    # magnitudes
    li   t2, 0x7fffffff
    and  t1, t0, t2             # t1 = abs(t0)
    and  t2, t4, t2             # t2 = abs(t4)
    # make t4/t2 the operand with the larger magnitude
    bge  t2, t1, aExit1
    mv   t3, t0
    mv   t0, t4
    mv   t4, t3
    mv   t3, t1
    mv   t1, t2
    mv   t2, t3
aExit1:
    srli t6, t2, 23             # t6 = exponent of the larger operand
    srli t5, t1, 23             # t5 = exponent of the smaller operand
    li   t3, 0x7fffff
    and  t2, t2, t3
    and  t1, t1, t3
    bge  x0, t6, aExit2
    li   t3, 0x800000           # implicit leading 1
    or   t2, t2, t3
aExit2:
    bge  x0, t5, aExit3
    li   t3, 0x800000           # implicit leading 1
    or   t1, t3, t1
aExit3:
    # t4 -> t2 (mantissa), t6 (exp);  t0 -> t1 (mantissa), t5 (exp)
    sub  t5, t6, t5             # t5 = exponent difference (>= 0)
    sltiu t3, t5, 32            # t3 = 1 when the shift amount is valid
    srl  t1, t1, t5
    sub  t3, x0, t3             # all ones when valid, zero otherwise
    and  t1, t1, t3             # a gap of 32 or more drops the operand
    srli t0, t0, 31
    srli t4, t4, 31             # t0, t4 are now the sign bits
    xor  t0, t0, t4
    beq  t0, x0, aElse4
    sub  t3, t2, t1             # signs differ: subtract
    j    aExit4
aElse4:
    add  t3, t2, t1             # signs equal: add
aExit4:
    # t3 mantissa, t4 sign, t6 exponent
    li   t0, 0x1000000
    and  t0, t0, t3
    beq  t0, x0, aElse5
    srli t3, t3, 1              # carry out of bit 23
    addi t6, t6, 1
    j    aExit5
aElse5:
    beq  t3, x0, aExit5
    li   t0, 0x800000
    and  t0, t3, t0
    bne  t0, x0, aExit5
    slli t3, t3, 1              # shift left until bit 23 is set
    addi t6, t6, -1
    j    aElse5
aExit5:
    add  t1, x0, x0
    beq  t3, x0, add_out        # exact cancellation: result is 0
    bge  x0, t6, add_out        # exponent <= 0: flush to 0
    li   t2, 0x7fffff
    and  t1, t3, t2
    slli t6, t6, 23
    or   t1, t1, t6
    slli t4, t4, 31
    or   t1, t1, t4
add_out:
    lw   ra, 0(sp)
    addi sp, sp, 4
    jr   ra

# CLZ
#   in/out: t0 (input word -> number of leading zeros, 32 for 0)
#   clobbers t1, t2
CLZ:
    srli t1, t0, 1              # smear the highest set bit downwards
    or   t0, t0, t1
    srli t1, t0, 2
    or   t0, t0, t1
    srli t1, t0, 4
    or   t0, t0, t1
    srli t1, t0, 8
    or   t0, t0, t1
    srli t1, t0, 16
    or   t0, t0, t1
    srli t1, t0, 1              # population count of the smeared word
    li   t2, 0x55555555
    and  t1, t1, t2
    sub  t0, t0, t1
    srli t1, t0, 2
    li   t2, 0x33333333
    and  t1, t1, t2
    and  t0, t0, t2
    add  t0, t0, t1
    srli t1, t0, 4
    li   t2, 0x0f0f0f0f
    add  t0, t1, t0
    and  t0, t0, t2
    srli t1, t0, 8
    add  t0, t0, t1
    srli t1, t0, 16
    add  t0, t0, t1
    andi t0, t0, 0x3f           # the count can be 32, so keep six bits
    addi t0, t0, -32
    sub  t0, x0, t0             # t0 = 32 - popcount
    jr   ra

# int2float
#   in/out: t3 (unsigned integer -> binary32 bit pattern, truncated)
#   clobbers t0, t1, t2
# NOTE(review): t3 == 0 is not special-cased and yields 0x3f800000.
int2float:
    addi sp, sp, -4
    sw   ra, 0(sp)
    add  t0, t3, x0
    jal  ra, CLZ
    addi t0, t0, 1
    addi t0, t0, -32
    sub  t0, x0, t0             # t0 = 31 - clz = index of the highest set bit
    addi t1, t0, -23
    sub  t1, x0, t1             # t1 = 23 - t0
    bge  t1, x0, iShiftL
    sub  t1, x0, t1
    srl  t3, t3, t1             # more than 24 significant bits: shift right
    j    iPack
iShiftL:
    sll  t3, t3, t1             # move the highest set bit to bit 23
iPack:
    li   t1, 0x800000
    xor  t3, t3, t1             # clear the implicit leading 1
    addi t0, t0, 127            # biased exponent
    slli t0, t0, 23
    or   t3, t3, t0
intout:
    lw   ra, 0(sp)
    addi sp, sp, 4
    jr   ra

# div_float
#   in : t1 = dividend, t0 = divisor (binary32 bit patterns)
#   out: t4 = t1 / t0; 0 when the dividend is zero, 0x7fffffff when the
#        divisor is zero
#   clobbers t0, t1, t2, t3, t5, t6
div_float:
    addi sp, sp, -4
    sw   ra, 0(sp)
    add  t4, x0, x0
    slli t5, t1, 1              # ignore the sign bit
    beq  t5, x0, div_out        # 0 / x = 0
    li   t4, 0x7fffffff
    slli t5, t0, 1
    beq  t5, x0, div_out        # x / 0
    # split t1 into sign (t2), exponent (t3), mantissa (t1)
    srli t2, t1, 31
    srli t3, t1, 23
    andi t3, t3, 0xff
    li   t4, 0x7FFFFF
    and  t1, t1, t4
    li   t4, 0x800000
    bge  x0, t3, dElse1         # exponent field 0: subnormal
    or   t1, t1, t4             # implicit leading 1
    j    dExit1
dElse1:
    li   t3, 1                  # subnormals use exponent 1
dNorm1:
    and  t5, t1, t4
    bne  t5, x0, dExit1         # stop once bit 23 is set
    slli t1, t1, 1
    addi t3, t3, -1
    j    dNorm1
dExit1:
    # save the dividend's sign and exponent
    addi sp, sp, -8
    sw   t2, 0(sp)
    sw   t3, 4(sp)
    # split t0 into sign (t2), exponent (t3), mantissa (t0)
    srli t2, t0, 31
    andi t2, t2, 1
    srli t3, t0, 23
    andi t3, t3, 0xff
    li   t4, 0x7FFFFF
    and  t0, t0, t4
    li   t4, 0x800000
    bge  x0, t3, dElse2         # exponent field 0: subnormal
    or   t0, t0, t4             # implicit leading 1
    j    dExit2
dElse2:
    li   t3, 1                  # subnormals use exponent 1
dNorm2:
    and  t5, t0, t4
    bne  t5, x0, dExit2         # stop once bit 23 is set
    slli t0, t0, 1
    addi t3, t3, -1
    j    dNorm2
dExit2:
    lw   t4, 0(sp)              # dividend sign
    xor  t2, t2, t4             # result sign
    lw   t4, 4(sp)              # dividend exponent
    addi t4, t4, 127
    sub  t3, t4, t3             # result exponent = exp_p - exp_q + 127
    addi sp, sp, 8
    bge  t1, t0, dExit3         # make the dividend mantissa >= divisor
    slli t1, t1, 1
    addi t3, t3, -1
dExit3:
    li   t5, 25                 # number of quotient bits
    add  t4, x0, x0             # quotient
    bge  t3, x0, dExit4
    add  t5, t5, t3             # negative exponent: fewer quotient bits
    add  t3, x0, x0
    blt  t5, x0, div_out        # underflow: return 0
dExit4:
    li   t6, 0
dloop:
    bge  t6, t5, doutloop
    slli t4, t4, 1
    blt  t1, t0, dExit5
    sub  t1, t1, t0
    ori  t4, t4, 1
dExit5:
    slli t1, t1, 1
    addi t6, t6, 1
    j    dloop
doutloop:
    # round to nearest, ties to even
    sltu t5, x0, t1             # sticky = (remainder != 0)
    andi t1, t4, 1              # rnd
    andi t0, t4, 2
    srli t0, t0, 1              # odd
    srli t4, t4, 1
    or   t0, t0, t5
    and  t1, t1, t0
    add  t4, t4, t1
    # normalise the result if needed (only when the exponent is 0)
    bne  t3, x0, dExit6
    addi sp, sp, -4
    sw   t2, 0(sp)              # CLZ clobbers t1 and t2
    mv   t0, t4
    jal  ra, CLZ                # t0 = leading zeros of the mantissa
    lw   t2, 0(sp)
    addi sp, sp, 4
    li   t1, 9
    bge  t0, t1, dExit6
    sub  t1, t1, t0
    srl  t4, t4, t1
    add  t3, t3, t1
dExit6:
    li   t5, 0x7fffff
    and  t4, t4, t5             # mantissa & 0x7fffff
    slli t3, t3, 23             # exp << 23
    slli t2, t2, 31             # sign << 31
    or   t4, t4, t3
    or   t4, t4, t2
div_out:
    lw   ra, 0(sp)
    addi sp, sp, 4
    jr   ra

# printResult: print the input, the expected bit pattern and the computed bit
# pattern as integers, then a verdict; a5 counts mismatches.
#   in : s0 = integer input, a0 = computed result, a4 = expected result
printResult:
    mv   t0, s0                 # original data
    mv   t1, a0                 # root data
    mv   t2, a4                 # answer
    la   a0, str1
    li   a7, 4
    ecall
    mv   a0, t0
    li   a7, 1
    ecall
    la   a0, str2
    li   a7, 4
    ecall
    mv   a0, a4
    li   a7, 1
    ecall
    la   a0, str3
    li   a7, 4
    ecall
    mv   a0, t1
    li   a7, 1
    ecall
    beq  a4, a0, CORRECT
    addi a5, a5, 1
    la   a0, str5               # mismatch: report WRONG
    li   a7, 4
    ecall
    j    printout
CORRECT:
    la   a0, str4
    li   a7, 4
    ecall
printout:
    ret

# conclude: print the total number of mismatches counted in a5.
conclude:
    la   a0, str6
    li   a7, 4
    ecall
    mv   a0, a5
    li   a7, 1
    ecall
    la   a0, str7
    li   a7, 4
    ecall
    ret

Analysis

pseudo instruction


00000000 <main>:
    0:        10000997        auipc x19 0x10000
    4:        00098993        addi x19 x19 0
    8:        00200493        addi x9 x0 2
    c:        00000913        addi x18 x0 0
    10:        10000a17        auipc x20 0x10000
    14:        ffca0a13        addi x20 x20 -4
    18:        00000793        addi x15 x0 0

0000001c <mloop>:
    1c:        0009a503        lw x10 0 x19
    20:        00050413        addi x8 x10 0
    24:        028000ef        jal x1 40 <Newtons_method>
    28:        000a2703        lw x14 0 x20
    2c:        3d4000ef        jal x1 980 <printResult>
    30:        004a0a13        addi x20 x20 4
    34:        00190913        addi x18 x18 1
    38:        00498993        addi x19 x19 4
    3c:        fe9940e3        blt x18 x9 -32 <mloop>
    40:        450000ef        jal x1 1104 <conclude>
    44:        00a00893        addi x17 x0 10
    48:        00000073        ecall

0000004c <Newtons_method>:
    4c:        ff810113        addi x2 x2 -8
    50:        00112223        sw x1 4 x2
    54:        00a12023        sw x10 0 x2
    58:        000502b3        add x5 x10 x0
    5c:        168000ef        jal x1 360 <CLZ>
    60:        00028533        add x10 x5 x0
    64:        fe050513        addi x10 x10 -32
    68:        40a00533        sub x10 x0 x10
    6c:        00155513        srli x10 x10 1
    70:        00050e13        addi x28 x10 0
    74:        1d8000ef        jal x1 472 <int2float>
    78:        000e0513        addi x10 x28 0
    7c:        00051463        bne x10 x0 8 <nExit>
    80:        40000537        lui x10 0x40000

00000084 <nExit>:
    84:        00012583        lw x11 0 x2
    88:        00058e13        addi x28 x11 0
    8c:        1c0000ef        jal x1 448 <int2float>
    90:        000e0593        addi x11 x28 0
    94:        00000613        addi x12 x0 0
    98:        10000697        auipc x13 0x10000
    9c:        f806a683        lw x13 -128 x13

000000a0 <loop>:
    a0:        02d65663        bge x12 x13 44 <outloop>
    a4:        000502b3        add x5 x10 x0
    a8:        00058333        add x6 x11 x0
    ac:        1e8000ef        jal x1 488 <div_float>
    b0:        000502b3        add x5 x10 x0
    b4:        024000ef        jal x1 36 <add_float>
    b8:        400002b7        lui x5 0x40000
    bc:        1d8000ef        jal x1 472 <div_float>
    c0:        000e8533        add x10 x29 x0
    c4:        00160613        addi x12 x12 1
    c8:        fd9ff06f        jal x0 -40 <loop>

000000cc <outloop>:
    cc:        00412083        lw x1 4 x2
    d0:        00810113        addi x2 x2 8
    d4:        00008067        jalr x0 x1 0

000000d8 <add_float>:
    d8:        ffc10113        addi x2 x2 -4
    dc:        00112023        sw x1 0 x2
    e0:        800003b7        lui x7 0x80000
    e4:        fff38393        addi x7 x7 -1
    e8:        0072f333        and x6 x5 x7
    ec:        007ef3b3        and x7 x29 x7
    f0:        0063de63        bge x7 x6 28 <aExit1>
    f4:        00028e13        addi x28 x5 0
    f8:        000e8293        addi x5 x29 0
    fc:        000e0e93        addi x29 x28 0
    100:        00030e13        addi x28 x6 0
    104:        00038313        addi x6 x7 0
    108:        000e0393        addi x7 x28 0

0000010c <aExit1>:
    10c:        0173df93        srli x31 x7 23
    110:        01735f13        srli x30 x6 23
    114:        00800e37        lui x28 0x800
    118:        fffe0e13        addi x28 x28 -1
    11c:        01c3f3b3        and x7 x7 x28
    120:        01c37333        and x6 x6 x28
    124:        01f05663        bge x0 x31 12 <aExit2>
    128:        00800e37        lui x28 0x800
    12c:        01c3e3b3        or x7 x7 x28

00000130 <aExit2>:
    130:        01e05663        bge x0 x30 12 <aExit3>
    134:        00800e37        lui x28 0x800
    138:        006e6333        or x6 x28 x6

0000013c <aExit3>:
    13c:        41ef8f33        sub x30 x31 x30
    140:        01e35333        srl x6 x6 x30
    144:        01f2d293        srli x5 x5 31
    148:        01fede93        srli x29 x29 31
    14c:        01d2c2b3        xor x5 x5 x29
    150:        00028663        beq x5 x0 12 <aElse4>
    154:        40638e33        sub x28 x7 x6
    158:        0080006f        jal x0 8 <aExit4>

0000015c <aElse4>:
    15c:        00638e33        add x28 x7 x6

00000160 <aExit4>:
    160:        010002b7        lui x5 0x1000
    164:        01c2f2b3        and x5 x5 x28
    168:        00028863        beq x5 x0 16 <aElse5>
    16c:        001e5e13        srli x28 x28 1
    170:        001f8f93        addi x31 x31 1
    174:        0200006f        jal x0 32 <aExit5>

00000178 <aElse5>:
    178:        000e0e63        beq x28 x0 28 <aExit5>
    17c:        008002b7        lui x5 0x800
    180:        005e72b3        and x5 x28 x5
    184:        00029863        bne x5 x0 16 <aExit5>
    188:        001e1e13        slli x28 x28 1
    18c:        ffff8f93        addi x31 x31 -1
    190:        fe9ff06f        jal x0 -24 <aElse5>

00000194 <aExit5>:
    194:        00000333        add x6 x0 x0
    198:        03f05063        bge x0 x31 32 <add_out>
    19c:        008003b7        lui x7 0x800
    1a0:        fff38393        addi x7 x7 -1
    1a4:        007e7333        and x6 x28 x7
    1a8:        017f9f93        slli x31 x31 23
    1ac:        01f36333        or x6 x6 x31
    1b0:        01fe9e93        slli x29 x29 31
    1b4:        01d36333        or x6 x6 x29

000001b8 <add_out>:
    1b8:        00012083        lw x1 0 x2
    1bc:        00410113        addi x2 x2 4
    1c0:        00008067        jalr x0 x1 0

000001c4 <CLZ>:
    1c4:        0012d313        srli x6 x5 1
    1c8:        0062e2b3        or x5 x5 x6
    1cc:        0022d313        srli x6 x5 2
    1d0:        0062e2b3        or x5 x5 x6
    1d4:        0042d313        srli x6 x5 4
    1d8:        0062e2b3        or x5 x5 x6
    1dc:        0082d313        srli x6 x5 8
    1e0:        0062e2b3        or x5 x5 x6
    1e4:        0102d313        srli x6 x5 16
    1e8:        0062e2b3        or x5 x5 x6
    1ec:        0012d313        srli x6 x5 1
    1f0:        555553b7        lui x7 0x55555
    1f4:        55538393        addi x7 x7 1365
    1f8:        00737333        and x6 x6 x7
    1fc:        406282b3        sub x5 x5 x6
    200:        0022d313        srli x6 x5 2
    204:        333333b7        lui x7 0x33333
    208:        33338393        addi x7 x7 819
    20c:        00737333        and x6 x6 x7
    210:        0072f2b3        and x5 x5 x7
    214:        006282b3        add x5 x5 x6
    218:        0042d313        srli x6 x5 4
    21c:        0f0f13b7        lui x7 0xf0f1
    220:        f0f38393        addi x7 x7 -241
    224:        005302b3        add x5 x6 x5
    228:        0072f2b3        and x5 x5 x7
    22c:        0082d313        srli x6 x5 8
    230:        006282b3        add x5 x5 x6
    234:        0102d313        srli x6 x5 16
    238:        006282b3        add x5 x5 x6
    23c:        01f2f293        andi x5 x5 31
    240:        fe028293        addi x5 x5 -32
    244:        405002b3        sub x5 x0 x5
    248:        00008067        jalr x0 x1 0

0000024c <int2float>:
    24c:        ffc10113        addi x2 x2 -4
    250:        00112023        sw x1 0 x2
    254:        000e02b3        add x5 x28 x0
    258:        f6dff0ef        jal x1 -148 <CLZ>
    25c:        00128293        addi x5 x5 1
    260:        fe028293        addi x5 x5 -32
    264:        405002b3        sub x5 x0 x5
    268:        fe928313        addi x6 x5 -23
    26c:        40600333        sub x6 x0 x6
    270:        006e1e33        sll x28 x28 x6
    274:        00800337        lui x6 0x800
    278:        006e4e33        xor x28 x28 x6
    27c:        07f28293        addi x5 x5 127
    280:        01729293        slli x5 x5 23
    284:        005e6e33        or x28 x28 x5

00000288 <intout>:
    288:        00012083        lw x1 0 x2
    28c:        00410113        addi x2 x2 4
    290:        00008067        jalr x0 x1 0

00000294 <div_float>:
    294:        ffc10113        addi x2 x2 -4
    298:        00112023        sw x1 0 x2
    29c:        00000eb3        add x29 x0 x0
    2a0:        14030a63        beq x6 x0 340 <div_out>
    2a4:        80000eb7        lui x29 0x80000
    2a8:        fffe8e93        addi x29 x29 -1
    2ac:        14028463        beq x5 x0 328 <div_out>
    2b0:        01f35393        srli x7 x6 31
    2b4:        01735e13        srli x28 x6 23
    2b8:        0ffe7e13        andi x28 x28 255
    2bc:        00800eb7        lui x29 0x800
    2c0:        fffe8e93        addi x29 x29 -1
    2c4:        01d37333        and x6 x6 x29
    2c8:        00800eb7        lui x29 0x800
    2cc:        01c05663        bge x0 x28 12 <dElse1>
    2d0:        01d36333        or x6 x6 x29
    2d4:        0140006f        jal x0 20 <dExit1>

000002d8 <dElse1>:
    2d8:        00131313        slli x6 x6 1
    2dc:        fffe0e13        addi x28 x28 -1
    2e0:        01d37f33        and x30 x6 x29
    2e4:        fe0f1ae3        bne x30 x0 -12 <dElse1>

000002e8 <dExit1>:
    2e8:        ff810113        addi x2 x2 -8
    2ec:        00712023        sw x7 0 x2
    2f0:        01c12223        sw x28 4 x2
    2f4:        01f2d393        srli x7 x5 31
    2f8:        0013f393        andi x7 x7 1
    2fc:        0172de13        srli x28 x5 23
    300:        0ffe7e13        andi x28 x28 255
    304:        00800eb7        lui x29 0x800
    308:        fffe8e93        addi x29 x29 -1
    30c:        01d2f2b3        and x5 x5 x29
    310:        00800eb7        lui x29 0x800
    314:        01c05663        bge x0 x28 12 <dElse2>
    318:        01d2e2b3        or x5 x5 x29
    31c:        0140006f        jal x0 20 <dExit2>

00000320 <dElse2>:
    320:        00129293        slli x5 x5 1
    324:        fffe0e13        addi x28 x28 -1
    328:        01d2ff33        and x30 x5 x29
    32c:        fe0f1ae3        bne x30 x0 -12 <dElse2>

00000330 <dExit2>:
    330:        00012e83        lw x29 0 x2
    334:        01d3c3b3        xor x7 x7 x29
    338:        00412e83        lw x29 4 x2
    33c:        07fe8e93        addi x29 x29 127
    340:        41ce8e33        sub x28 x29 x28
    344:        00810113        addi x2 x2 8
    348:        00535663        bge x6 x5 12 <dExit3>
    34c:        00131313        slli x6 x6 1
    350:        fffe0e13        addi x28 x28 -1

00000354 <dExit3>:
    354:        01900f13        addi x30 x0 25
    358:        00000eb3        add x29 x0 x0
    35c:        000e5863        bge x28 x0 16 <dExit4>
    360:        01cf0f33        add x30 x30 x28
    364:        00000e33        add x28 x0 x0
    368:        080f4663        blt x30 x0 140 <div_out>

0000036c <dExit4>:
    36c:        00000f93        addi x31 x0 0

00000370 <dloop>:
    370:        03efd063        bge x31 x30 32 <doutloop>
    374:        001e9e93        slli x29 x29 1
    378:        00534663        blt x6 x5 12 <dExit5>
    37c:        40530333        sub x6 x6 x5
    380:        001eee93        ori x29 x29 1

00000384 <dExit5>:
    384:        00131313        slli x6 x6 1
    388:        001f8f93        addi x31 x31 1
    38c:        fe5ff06f        jal x0 -28 <dloop>

00000390 <doutloop>:
    390:        00134f13        xori x30 x6 1
    394:        001ef313        andi x6 x29 1
    398:        002ef293        andi x5 x29 2
    39c:        001ede93        srli x29 x29 1
    3a0:        01e2e2b3        or x5 x5 x30
    3a4:        00537333        and x6 x6 x5
    3a8:        006e8eb3        add x29 x29 x6
    3ac:        020e1663        bne x28 x0 44 <dExit6>
    3b0:        00900313        addi x6 x0 9
    3b4:        0262d263        bge x5 x6 36 <dExit6>
    3b8:        ffc10113        addi x2 x2 -4
    3bc:        00712023        sw x7 0 x2
    3c0:        e05ff0ef        jal x1 -508 <CLZ>
    3c4:        00012383        lw x7 0 x2
    3c8:        00410113        addi x2 x2 4
    3cc:        40530333        sub x6 x6 x5
    3d0:        006edeb3        srl x29 x29 x6
    3d4:        006e0e33        add x28 x28 x6

000003d8 <dExit6>:
    3d8:        00800f37        lui x30 0x800
    3dc:        ffff0f13        addi x30 x30 -1
    3e0:        01eefeb3        and x29 x29 x30
    3e4:        017e1e13        slli x28 x28 23
    3e8:        01f39393        slli x7 x7 31
    3ec:        01ceeeb3        or x29 x29 x28
    3f0:        007eeeb3        or x29 x29 x7

000003f4 <div_out>:
    3f4:        00012083        lw x1 0 x2
    3f8:        00410113        addi x2 x2 4
    3fc:        00008067        jalr x0 x1 0

00000400 <printResult>:
    400:        00040293        addi x5 x8 0
    404:        00050313        addi x6 x10 0
    408:        00070393        addi x7 x14 0
    40c:        10000517        auipc x10 0x10000
    410:        c1050513        addi x10 x10 -1008
    414:        00400893        addi x17 x0 4
    418:        00000073        ecall
    41c:        00028513        addi x10 x5 0
    420:        00100893        addi x17 x0 1
    424:        00000073        ecall
    428:        10000517        auipc x10 0x10000
    42c:        c0e50513        addi x10 x10 -1010
    430:        00400893        addi x17 x0 4
    434:        00000073        ecall
    438:        00070513        addi x10 x14 0
    43c:        00100893        addi x17 x0 1
    440:        00000073        ecall
    444:        10000517        auipc x10 0x10000
    448:        c1050513        addi x10 x10 -1008
    44c:        00400893        addi x17 x0 4
    450:        00000073        ecall
    454:        00030513        addi x10 x6 0
    458:        00100893        addi x17 x0 1
    45c:        00000073        ecall
    460:        00a70e63        beq x14 x10 28 <CORRECT>
    464:        00178793        addi x15 x15 1
    468:        10000517        auipc x10 0x10000
    46c:        c0d50513        addi x10 x10 -1011
    470:        00400893        addi x17 x0 4
    474:        00000073        ecall
    478:        0140006f        jal x0 20 <printout>

0000047c <CORRECT>:
    47c:        10000517        auipc x10 0x10000
    480:        bf950513        addi x10 x10 -1031
    484:        00400893        addi x17 x0 4
    488:        00000073        ecall

0000048c <printout>:
    48c:        00008067        jalr x0 x1 0

00000490 <conclude>:
    490:        10000517        auipc x10 0x10000
    494:        c1350513        addi x10 x10 -1005
    498:        00400893        addi x17 x0 4
    49c:        00000073        ecall
    4a0:        00078513        addi x10 x15 0
    4a4:        00100893        addi x17 x0 1
    4a8:        00000073        ecall
    4ac:        10000517        auipc x10 0x10000
    4b0:        c0650513        addi x10 x10 -1018
    4b4:        00400893        addi x17 x0 4
    4b8:        00000073        ecall
    4bc:        00008067        jalr x0 x1 0

Result

Because the code above only handles integer inputs, the input data should be in the range

$[1, 2^{31}]$

As a result, I selected three integers as my test data: 1160030, 25, and 500.

Each output is exactly the same as the value I calculated with the C code, where both versions run 5 Newton iterations.

image

Reference:

Always refer to primary sources, such as official RISC-V documentation.