# Assignment1: RISC-V Assembly and Instruction Pipeline The bfloat16 format is a 16-bit floating-point representation, designed to provide a wide dynamic range by using a floating radix point. It is a shortened version of the 32-bit IEEE 754 single-precision format (binary32), aimed at accelerating machine learning. The structure of the bfloat16 floating-point format is as follows. ## Quiz 1 Problem B: ``` ┌ sign │ │ ┌ exponent │ │ │ │ ┌ mantissa │ │ │ │┌──┴───┐┌─┴───┐ 0b0000000000000000 bfloat16 ``` ### Implementation #### C code :::danger Why was `union` used rather than arbitrary pointer access? ::: ```c typedef struct { uint16_t bits; } bf16_t; static inline bf16_t fp32_to_bf16(float s) { bf16_t h; union { float f; uint32_t i; } u = {.f = s}; if ((u.i & 0x7fffffff) > 0x7f800000) { /* NaN */ h.bits = (u.i >> 16) | 64; /* force to quiet */ return h; } h.bits = (u.i + (0x7fff + ((u.i >> 0x10) & 1))) >> 0x10; return h; } static inline float bf16_to_fp32(bf16_t h) { union { float f; uint32_t i; } u = {.i = (uint32_t)h.bits << 16}; return u.f; } ``` Compile result: <s> ![image](https://hackmd.io/_uploads/SJ_s9NZykg.png) </s> :::danger Do not use screenshots for plain text content, as this is inaccessible to visually impaired users. 
::: #### Assembly code ```c .data arr: .word 0xc1cc0000 # int arr = 0xc1cc0000; .text main: lw s0, arr # s0 = arr; jal ra, fp32_to_fp16 # fp32_to_fp16(s0); li a7,10 # exit(0); ecall # exit(0); fp32_to_fp16: addi sp, sp, -8 # sp -= 8; sw ra, 4(sp) # *(sp + 4) = ra; sw s0, 0(sp) # *sp = s0; mv t0, s0 # t0 = s0; slli t0, t0, 1 # t0 <<= 1; srli t0, t0, 24 # t0 >>= 24; addi t1, x0, 0xff # t1 = 0xff; srli a0, s0, 16 # a0 = s0 >> 16; bne t0, t1, Else # if (t0 != t1) goto Else; ori a0, a0, 64 # a0 |= 64; j Exit # goto Exit; Else: andi a0, a0, 1 # a0 &= 1; li t2, 0x7fff # t2 = 0x7fff; add a0, a0, t2 # a0 += t2; add a0, s0, a0 # a0 = s0 + a0; srli a0, a0, 0x10 # a0 >>= 16; Exit: lw s0, 0(sp) # s0 = *sp; lw ra, 4(sp) # ra = *(sp + 8); addi sp, sp, 8 # sp += 8; jr ra # return; bf16_to_fp32: addi sp, sp, -8 # sp -= 16; sw ra, 4(sp) # *(sp + 4) = ra; sw s1, 0(sp) # *sp = s1; slli a0, s0, 16 # a0 = s0 << 16; lw s0, 0(sp) # s0 = *sp; lw ra, 4(sp) # ra = *(sp + 4); addi sp, sp, 8 # sp += 8; jr ra # return; ``` ## Implement square root using CLZ method ### Using Newton's method The Newton-Raphson method, also known as Newton's method, is widely used for finding the roots of differentiable functions such as polynomials. The algorithm for Newton's method is as follows: Given the function $f(x)$ and an initial guess $x_0$, we can approximate its root using the iterative process: $$ x_{k+1}=x_k-\frac{f(x_k)}{f'(x_k)}\ \ \ \ \ \text{where } k = 0, 1, 2, \dots $$ Applying this method to solve $x^2=a$, i.e. taking $f(x)=x^2-a$ with $f'(x)=2x$, the iterative process becomes: $$ x_{k+1}=x_k-\frac{x_k^2-a}{2x_k}\\ =\frac{1}{2}\left(x_k+\frac{a}{x_k}\right) $$ Distributing the factor $\frac{1}{2}$, the same update can be written as follows: $$ x_{k+1} =0.5x_k+\frac{a}{2x_k} $$ In other words, each iteration replaces the estimate with the average of $x_k$ and $\frac{a}{x_k}$, which refines the estimate of the square root of $a$. ### Initial guess With the formula above, we can compute the square root to the desired precision. However, selecting an appropriate initial guess is crucial. A good initial guess can significantly reduce the number of iterations needed to reach the correct root. 
Using CLZ (count leading zeros) is an effective method for determining a suitable initial guess. ### Code implementation #### C code ``` ________________________________________________________________ |_0_|_______8______|____________________23_______________________| sign exponential mantissa ``` Calculate leading zeros ```c= uint8_t CLZ (uint32_t x){ x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16; x -= ((x >> 1) & 0x55555555); x = ((x >> 2) & 0x33333333) + (x & 0x33333333); x = ((x >> 4) + x) & 0x0f0f0f0f; x += (x >> 8); return (32 - (x & 0x1f)); } ``` Float addition ```c= static inline float add_float(float a, float b){ // 1. Handle special cases like NaN, infinity, zero. if (a == 0.0f) return b; if (b == 0.0f) return a; union { uint32_t bits; float value; } fpa = {.value = a}, fpb = {.value = b}; uint32_t sign_a = fpa.bits & 0x80000000; uint32_t sign_b = fpb.bits & 0x80000000; // Extract absolute values uint32_t abs_a = fpa.bits & 0x7fffffff; uint32_t abs_b = fpb.bits & 0x7fffffff; // Ensure abs_a > abs_b if (abs_a < abs_b) { uint32_t temp = abs_a; abs_a = abs_b; abs_b = temp; temp = sign_a; sign_a = sign_b; sign_b = sign_a; } // Extract exponents and mantissas uint32_t exp_a = (abs_a >> 23) & 0xff; uint32_t exp_b = (abs_b >> 23) & 0xff; uint32_t mantissa_a; uint32_t mantissa_b; if (exp_a > 0){ mantissa_a = (abs_a & 0x7fffff) | 0x800000; // Add implicit 1 } else mantissa_a = (abs_a & 0x7fffff); if (exp_b > 0){ mantissa_b = (abs_b & 0x7fffff) | 0x800000; // Add implicit 1 } else{ mantissa_b = (abs_b & 0x7fffff); } // Align mantissa_b with mantissa_a uint32_t diff_exp = exp_a - exp_b; mantissa_b >>= diff_exp; // Calculate resulting mantissa based on sign difference uint32_t mantissa; if ((sign_a >> 31) ^ (sign_b >> 31)) { mantissa = mantissa_a - mantissa_b; } else { mantissa = mantissa_a + mantissa_b; } // Normalize mantissa if necessary uint32_t exp = exp_a; if (mantissa & 0x1000000) { mantissa >>= 1; exp++; } else { while (mantissa && 
!(mantissa & 0x800000)) { mantissa <<= 1; exp--; } } // Handle underflow and overflow if (exp <= 0) return 0.0f; union { uint32_t bits; float value; } out = {.bits = sign_a | (exp << 23) | (mantissa & 0x7fffff)}; return out.value; } ``` float division ```c= static inline float div_float(float p, float q){ union { uint32_t bits; float value; } fpp = {.value = p}; // Extract sign, exponent, and mantissa of p uint32_t sign_p = (fpp.bits >> 31); uint32_t exp_p = (fpp.bits >> 23) & 0xff; uint32_t mantissa_p; // Normalize mantissa of p if (exp_p > 0) { mantissa_p = (fpp.bits & 0x7FFFFF) | 0x800000; // Add implicit 1 } else { mantissa_p = (fpp.bits & 0x7FFFFF); int dif = CLZ(mantissa_p) - 8; mantissa_p <<= dif; exp_p = 1 -dif; } union { uint32_t bits; float value; } fpq = {.value = q}; // Extract sign, exponent, and mantissa of q uint32_t sign_q = (fpq.bits >> 31) & 0x1; uint32_t exp_q = (fpq.bits >> 23) & 0xff; uint32_t mantissa_q; // Normalize mantissa of if (exp_q > 0) { mantissa_q = (fpq.bits & 0x7FFFFF) | 0x800000; // Add implicit 1 } else { mantissa_q = (fpq.bits & 0x7FFFFF); int dif = CLZ(mantissa_q) - 8; mantissa_q <<= dif; exp_q = 1 -dif; } // Compute sign, exponent, and mantissa of the result uint32_t sign = sign_p ^ sign_q; int exp = exp_p - exp_q + 127; uint32_t mantissa = 0; // Align mantissa_p to be larger than mantissa_q if (mantissa_p < mantissa_q) { mantissa_p <<= 1; exp--; } // Perform division of mantissas using bitwise long division int nbits = 25; if (exp < 0) { nbits += exp; exp = 0; if (nbits < 0) { return 0; } } for (int i = 0; i < nbits; i++) { mantissa <<= 1; if (mantissa_p >= mantissa_q) { mantissa_p -= mantissa_q; mantissa |= 1; } mantissa_p <<= 1; } // Round the result uint8_t odd, rnd, sticky; sticky = (mantissa_p != 0); rnd = (mantissa & 1); odd = (mantissa & 2); mantissa = (mantissa >> 1) + (rnd & (sticky | odd)); // Normalize the result if needed int lz = CLZ(mantissa); if (exp == 0 && (lz < 9)) { mantissa >>= (9 - lz); exp += (9 - lz); } 
// Combine the sign, exponent, and mantissa to form the final result union { uint32_t bits; float value; } output = {.bits = (sign << 31) | (exp << 23) | (mantissa & 0x7FFFFF)}; return output.value; } ``` ```c= static inline float Newtons_method(int alpha){ // do the leading zero counting int lzc = (32 - CLZ(round_a))/2; //init float float output = int2float(lzc); if (output == 0){ output = 2; } float input = int2float(alpha); //iteration loop for (int i = 0; i < iteration; i++){ float temp = div_float(input, output); output = add_float(output , temp); output = div_float(output, (float) 2); } return output; } ``` #### Assembly code ```asm= .data inputdata: .word 1160030, 25, 500 answer: .word 0x4486a180,0x40a00000, 0x41B2E389 iteration: .word 5 str1: .string "\n The testing number is: " str2: .string "\n The correct root number is:" str3: .string "\n The calculated root number is:" str4: .string "\n The answer is CORRECT" str5: .string "\n The answer is WRONG" str6: .string "\n we got total" str7: .string "\n error" .text main: la s3, inputdata li s1, 2 li s2, 0 la s4, answer mv a5, x0 mloop: lw a0, 0(s3) mv s0, a0 jal ra, Newtons_method lw a4, 0(s4) jal ra, printResult addi s4, s4, 4 addi s2, s2, 1 addi s3, s3, 4 blt s2, s1, mloop jal ra, conclude #Exit the program li a7, 10 ecall # a0: the input/output data # a2: iteration Newtons_method: addi sp, sp, -8 sw ra, 4(sp) # round input sw a0, 0(sp) # get initial guess add t0, a0, x0 jal ra, CLZ add a0, t0, x0 addi a0, a0, -32 sub a0, x0, a0 srli a0, a0, 1 mv t3, a0 jal ra, int2float mv a0, t3 bne a0, x0, nExit li a0, 0x40000000 nExit: the total iteration time lw a1, 0(sp) mv t3, a1 jal ra, int2float mv a1, t3 mv a2, x0 lw a3, iteration loop: bge a2, a3, outloop #if a2 >= a3 jump to output loop add t0, a0, x0 add t1, a1, x0 jal ra, div_float # t4 = t1/t0 add t0, a0, x0 jal ra, add_float # t1 = t4 + t0 li t0, 0x40000000 jal ra div_float # t4 = t1/t0 add a0, t4, x0 # a0(output) = t4 addi a2, a2, 1 j loop outloop: lw ra 
4(sp) addi sp, sp, 8 jr ra add_float: addi sp, sp, -4 sw ra, 0(sp) # calculate abs li t2, 0x7fffffff and t1, t0, t2 # t1 = abs (t0) and t2, t4, t2 # t2 = abs (t4) # we always make sure (abs(t4) > abs(t0)) bge t2, t1, aExit1 # switch t0 and t4 mv t3, t0 mv t0, t4 mv t4, t3 # switch t1 and t2 mv t3, t1 mv t1, t2 mv t2, t3 aExit1: srli t6, t2, 23 srli t5, t1, 23 # compute the t4's mantissa li t3 0x7fffff and t2, t2, t3 and t1, t1, t3 bge x0, t6, aExit2 li t3, 0x800000 or t2, t2, t3 aExit2: bge x0, t5, aExit3 li t3, 0x800000 or t1, t3, t1 aExit3: # now we have t4 -> t2(mantissa), t6 (exp) # t0 -> t1(mantissa), t5 (exp) sub t5, t6, t5 # t5 is the diff of exp srl t1, t1, t5 srli t0, t0, 31 srli t4, t4, 31 # now t0, t4 are the sign bit xor t0, t0, t4 beq t0, x0, aElse4 sub t3, t2, t1 j aExit4 aElse4: add t3, t2, t1 aExit4: # we can release t2, t1, t0, t5 # t3 mantissa # t4 sign # t6 exp li t0, 0x1000000 and t0, t0, t3 beq t0, x0 aElse5 srli t3, t3, 1 addi t6, t6, 1 j aExit5 aElse5: beq t3, x0, aExit5 li t0, 0x800000 and t0, t3, t0 bne t0, x0, aExit5 slli t3, t3, 1 addi t6, t6, -1 j aElse5 aExit5: add t1, x0, x0 bge x0, t6, add_out li t2, 0x7fffff and t1, t3, t2 slli t6, t6, 23 or t1, t1, t6 slli t4, t4, 31 or t1, t1, t4 add_out: lw ra, 0(sp) addi sp, sp, 4 jr ra # t0 is the input/output data # t1 abd t2 are the temperatory data CLZ: srli t1, t0, 1 # t1 = t0 >> 1 or t0, t0, t1 # t0 = t1 | t0 srli t1, t0, 2 # t1 = t0 >> 2 or t0, t0, t1 # t0 = t1 | t0 srli t1, t0, 4 # t1 = t0 >> 4 or t0, t0, t1 # t0 = t1 | t0 srli t1, t0, 8 # t1 = t0 >> 8 or t0, t0, t1 # t0 = t1 | t0 srli t1, t0, 16 # t1 = t0 >> 16 or t0, t0, t1 # t0 = t1 | t0 srli t1, t0, 1 # t1 = t0 >> 1 li t2, 0x55555555 and t1, t1, t2 sub t0, t0, t1 srli t1, t0, 2 li t2, 0x33333333 and t1, t1, t2 and t0, t0, t2 add t0, t0, t1 srli t1, t0, 4 li t2, 0x0f0f0f0f add t0, t1, t0 and t0, t0, t2 srli t1, t0, 8 add t0, t0, t1 srli t1, t0, 16 add t0, t0, t1 andi t0, t0, 0x1f addi t0, t0, -32 sub t0, x0, t0 jr ra # t0 exponental 
part # t1 mantissa part # t3 input/output number int2float: addi sp, sp, -4 sw ra, 0(sp) add t0, t3, x0 jal ra, CLZ addi t0, t0, 1 addi t0, t0, -32 sub t0, x0, t0 addi t1, t0, -23 sub t1, x0, t1 sll t3, t3, t1 li t1, 0x800000 xor t3, t3, t1 addi t0, t0, 127 slli t0, t0, 23 or t3, t3, t0 intout: lw ra, 0(sp) addi sp, sp, 4 jr ra # t0, t1, input number t1/t0 # t2 sign # t3 exp # t1 mantissa # t4 output div_float: addi sp, sp, -4 sw ra, 0(sp) add t4, x0, x0 beq t1, x0, div_out li t4, 0x7fffffff beq t0, x0, div_out # separate the t1 data into sign|exp|mantissa # t1 sign t2 srli t2, t1, 31 # t2 = t1 >> 31 # t1 exp t3 srli t3, t1, 23 # t3 = t1 >> 23 andi t3, t3, 0xff # t3 = t3 & 0xff li t4, 0x7FFFFF # t4 = 0x7FFFFF and t1, t1, t4 # t1 = t1 & 0x7FFFFF li t4, 0x800000 # t4 = 0x800000 bge x0, t3, dElse1 # if t3 <= 0 jump to dElse1 or t1, t1, t4 # t1 = t1 | 0x800000 j dExit1 # jump to dExit1 dElse1: slli t1, t1, 1 addi t3, t3, -1 and t5, t1, t4 bne t5, x0, dElse1 dExit1: # save the data t2, t3 addi sp, sp, -8 sw t2, 0(sp) # 0(sp) the t1's sign value sw t3, 4(sp) # 4(sp) t1's exp # separate the t0 data into sign|exp|mantissa # t0 sign t2 srli t2, t0, 31 # t2 = t0 >> 1 andi t2, t2, 1 # t2 = t2 & 1 # t1 exp t3 srli t3, t0, 23 # t3 = t0 >> 23 andi t3, t3, 0xff # t3 = t3 & 0xff li t4, 0x7FFFFF # t4 = 0x7FFFFF and t0, t0, t4 # t0 = t0 & 0x7FFFFF li t4, 0x800000 # t4 = 0x800000 bge x0, t3, dElse2 # if 0 < t3 jump to dElse2 or t0, t0, t4 # t0 = t0 | 0x800000 j dExit2 # jump to dExit1 dElse2: slli t0, t0, 1 addi t3, t3, -1 and t5, t0, t4 bne t5, x0, dElse2 dExit2: lw t4, 0(sp) # t4 = t1's sign value xor t2, t2, t4 # t2 = t2 ^ t4 lw t4, 4(sp) # t4 = t1's exp value addi t4, t4, 127 # t4 = t4 +127 sub t3, t4, t3 # t3 = t4 - t3 addi sp, sp, 8 # recover the sp position bge t1, t0, dExit3 # if t1 < t0 align mantissa slli t1, t1, 1 addi t3, t3, -1 dExit3: # t5 iteration number li t5, 25 add t4, x0, x0 # t4 = 0; bge t3, x0, dExit4 add t5, t5, t3 add t3, x0, x0 blt t5, x0, div_out dExit4: # 
division loop output t4 li t6, 0 dloop: bge t6, t5, doutloop slli t4, t4, 1 # t4 = t4 << 1 blt t1, t0, dExit5 sub t1, t1, t0 ori t4, t4, 1 dExit5: slli t1, t1, 1 addi t6, t6, 1 j dloop doutloop: # round result # odd t0 # rnd t1 # sticky t5 xori t5, t1, 1 andi t1, t4, 1 andi t0, t4, 2 srli t4, t4, 1 or t0, t0, t5 and t1, t1, t0 add t4, t4, t1 # normalize the result if needed bne t3, x0, dExit6 li t1, 9 bge t0, t1, dExit6 addi sp, sp, -4 sw t2, 0(sp) jal ra CLZ lw t2, 0(sp) addi sp, sp, 4 sub t1, t1, t0 srl t4, t4, t1 add t3, t3, t1 dExit6: li t5 0x7fffff and t4, t4, t5 # mantissa = mantissa & 0x7fffff slli t3, t3, 23 # exp << 23 slli t2, t2, 31 # exp << 31 or t4, t4, t3 or t4, t4, t2 div_out: lw ra, 0(sp) addi sp, sp, 4 jr ra printResult: mv t0, s0 # original data mv t1, a0 # root data mv t2, a4 # answer la a0, str1 li a7, 4 ecall mv a0, t0 li a7, 1 ecall la a0, str2 li a7, 4 ecall mv a0, a4 li a7, 1 ecall la a0, str3 li a7, 4 ecall mv a0, t1 li a7, 1 ecall beq a4, a0, CORRECT addi a5, a5, 1 la a0, str4 li a7, 4 ecall j printout CORRECT: la a0, str4 li a7, 4 ecall printout: ret conclude: la a0, str6 li a7, 4 ecall mv a0, a5 li a7, 1 ecall la a0, str7 li a7, 4 ecall ret ``` ## Analysis ### pseudo instruction ``` 00000000 <main>: 0: 10000997 auipc x19 0x10000 4: 00098993 addi x19 x19 0 8: 00200493 addi x9 x0 2 c: 00000913 addi x18 x0 0 10: 10000a17 auipc x20 0x10000 14: ffca0a13 addi x20 x20 -4 18: 00000793 addi x15 x0 0 0000001c <mloop>: 1c: 0009a503 lw x10 0 x19 20: 00050413 addi x8 x10 0 24: 028000ef jal x1 40 <Newtons_method> 28: 000a2703 lw x14 0 x20 2c: 3d4000ef jal x1 980 <printResult> 30: 004a0a13 addi x20 x20 4 34: 00190913 addi x18 x18 1 38: 00498993 addi x19 x19 4 3c: fe9940e3 blt x18 x9 -32 <mloop> 40: 450000ef jal x1 1104 <conclude> 44: 00a00893 addi x17 x0 10 48: 00000073 ecall 0000004c <Newtons_method>: 4c: ff810113 addi x2 x2 -8 50: 00112223 sw x1 4 x2 54: 00a12023 sw x10 0 x2 58: 000502b3 add x5 x10 x0 5c: 168000ef jal x1 360 <CLZ> 60: 00028533 add 
x10 x5 x0 64: fe050513 addi x10 x10 -32 68: 40a00533 sub x10 x0 x10 6c: 00155513 srli x10 x10 1 70: 00050e13 addi x28 x10 0 74: 1d8000ef jal x1 472 <int2float> 78: 000e0513 addi x10 x28 0 7c: 00051463 bne x10 x0 8 <nExit> 80: 40000537 lui x10 0x40000 00000084 <nExit>: 84: 00012583 lw x11 0 x2 88: 00058e13 addi x28 x11 0 8c: 1c0000ef jal x1 448 <int2float> 90: 000e0593 addi x11 x28 0 94: 00000613 addi x12 x0 0 98: 10000697 auipc x13 0x10000 9c: f806a683 lw x13 -128 x13 000000a0 <loop>: a0: 02d65663 bge x12 x13 44 <outloop> a4: 000502b3 add x5 x10 x0 a8: 00058333 add x6 x11 x0 ac: 1e8000ef jal x1 488 <div_float> b0: 000502b3 add x5 x10 x0 b4: 024000ef jal x1 36 <add_float> b8: 400002b7 lui x5 0x40000 bc: 1d8000ef jal x1 472 <div_float> c0: 000e8533 add x10 x29 x0 c4: 00160613 addi x12 x12 1 c8: fd9ff06f jal x0 -40 <loop> 000000cc <outloop>: cc: 00412083 lw x1 4 x2 d0: 00810113 addi x2 x2 8 d4: 00008067 jalr x0 x1 0 000000d8 <add_float>: d8: ffc10113 addi x2 x2 -4 dc: 00112023 sw x1 0 x2 e0: 800003b7 lui x7 0x80000 e4: fff38393 addi x7 x7 -1 e8: 0072f333 and x6 x5 x7 ec: 007ef3b3 and x7 x29 x7 f0: 0063de63 bge x7 x6 28 <aExit1> f4: 00028e13 addi x28 x5 0 f8: 000e8293 addi x5 x29 0 fc: 000e0e93 addi x29 x28 0 100: 00030e13 addi x28 x6 0 104: 00038313 addi x6 x7 0 108: 000e0393 addi x7 x28 0 0000010c <aExit1>: 10c: 0173df93 srli x31 x7 23 110: 01735f13 srli x30 x6 23 114: 00800e37 lui x28 0x800 118: fffe0e13 addi x28 x28 -1 11c: 01c3f3b3 and x7 x7 x28 120: 01c37333 and x6 x6 x28 124: 01f05663 bge x0 x31 12 <aExit2> 128: 00800e37 lui x28 0x800 12c: 01c3e3b3 or x7 x7 x28 00000130 <aExit2>: 130: 01e05663 bge x0 x30 12 <aExit3> 134: 00800e37 lui x28 0x800 138: 006e6333 or x6 x28 x6 0000013c <aExit3>: 13c: 41ef8f33 sub x30 x31 x30 140: 01e35333 srl x6 x6 x30 144: 01f2d293 srli x5 x5 31 148: 01fede93 srli x29 x29 31 14c: 01d2c2b3 xor x5 x5 x29 150: 00028663 beq x5 x0 12 <aElse4> 154: 40638e33 sub x28 x7 x6 158: 0080006f jal x0 8 <aExit4> 0000015c <aElse4>: 15c: 00638e33 add 
x28 x7 x6 00000160 <aExit4>: 160: 010002b7 lui x5 0x1000 164: 01c2f2b3 and x5 x5 x28 168: 00028863 beq x5 x0 16 <aElse5> 16c: 001e5e13 srli x28 x28 1 170: 001f8f93 addi x31 x31 1 174: 0200006f jal x0 32 <aExit5> 00000178 <aElse5>: 178: 000e0e63 beq x28 x0 28 <aExit5> 17c: 008002b7 lui x5 0x800 180: 005e72b3 and x5 x28 x5 184: 00029863 bne x5 x0 16 <aExit5> 188: 001e1e13 slli x28 x28 1 18c: ffff8f93 addi x31 x31 -1 190: fe9ff06f jal x0 -24 <aElse5> 00000194 <aExit5>: 194: 00000333 add x6 x0 x0 198: 03f05063 bge x0 x31 32 <add_out> 19c: 008003b7 lui x7 0x800 1a0: fff38393 addi x7 x7 -1 1a4: 007e7333 and x6 x28 x7 1a8: 017f9f93 slli x31 x31 23 1ac: 01f36333 or x6 x6 x31 1b0: 01fe9e93 slli x29 x29 31 1b4: 01d36333 or x6 x6 x29 000001b8 <add_out>: 1b8: 00012083 lw x1 0 x2 1bc: 00410113 addi x2 x2 4 1c0: 00008067 jalr x0 x1 0 000001c4 <CLZ>: 1c4: 0012d313 srli x6 x5 1 1c8: 0062e2b3 or x5 x5 x6 1cc: 0022d313 srli x6 x5 2 1d0: 0062e2b3 or x5 x5 x6 1d4: 0042d313 srli x6 x5 4 1d8: 0062e2b3 or x5 x5 x6 1dc: 0082d313 srli x6 x5 8 1e0: 0062e2b3 or x5 x5 x6 1e4: 0102d313 srli x6 x5 16 1e8: 0062e2b3 or x5 x5 x6 1ec: 0012d313 srli x6 x5 1 1f0: 555553b7 lui x7 0x55555 1f4: 55538393 addi x7 x7 1365 1f8: 00737333 and x6 x6 x7 1fc: 406282b3 sub x5 x5 x6 200: 0022d313 srli x6 x5 2 204: 333333b7 lui x7 0x33333 208: 33338393 addi x7 x7 819 20c: 00737333 and x6 x6 x7 210: 0072f2b3 and x5 x5 x7 214: 006282b3 add x5 x5 x6 218: 0042d313 srli x6 x5 4 21c: 0f0f13b7 lui x7 0xf0f1 220: f0f38393 addi x7 x7 -241 224: 005302b3 add x5 x6 x5 228: 0072f2b3 and x5 x5 x7 22c: 0082d313 srli x6 x5 8 230: 006282b3 add x5 x5 x6 234: 0102d313 srli x6 x5 16 238: 006282b3 add x5 x5 x6 23c: 01f2f293 andi x5 x5 31 240: fe028293 addi x5 x5 -32 244: 405002b3 sub x5 x0 x5 248: 00008067 jalr x0 x1 0 0000024c <int2float>: 24c: ffc10113 addi x2 x2 -4 250: 00112023 sw x1 0 x2 254: 000e02b3 add x5 x28 x0 258: f6dff0ef jal x1 -148 <CLZ> 25c: 00128293 addi x5 x5 1 260: fe028293 addi x5 x5 -32 264: 405002b3 sub x5 x0 x5 
268: fe928313 addi x6 x5 -23 26c: 40600333 sub x6 x0 x6 270: 006e1e33 sll x28 x28 x6 274: 00800337 lui x6 0x800 278: 006e4e33 xor x28 x28 x6 27c: 07f28293 addi x5 x5 127 280: 01729293 slli x5 x5 23 284: 005e6e33 or x28 x28 x5 00000288 <intout>: 288: 00012083 lw x1 0 x2 28c: 00410113 addi x2 x2 4 290: 00008067 jalr x0 x1 0 00000294 <div_float>: 294: ffc10113 addi x2 x2 -4 298: 00112023 sw x1 0 x2 29c: 00000eb3 add x29 x0 x0 2a0: 14030a63 beq x6 x0 340 <div_out> 2a4: 80000eb7 lui x29 0x80000 2a8: fffe8e93 addi x29 x29 -1 2ac: 14028463 beq x5 x0 328 <div_out> 2b0: 01f35393 srli x7 x6 31 2b4: 01735e13 srli x28 x6 23 2b8: 0ffe7e13 andi x28 x28 255 2bc: 00800eb7 lui x29 0x800 2c0: fffe8e93 addi x29 x29 -1 2c4: 01d37333 and x6 x6 x29 2c8: 00800eb7 lui x29 0x800 2cc: 01c05663 bge x0 x28 12 <dElse1> 2d0: 01d36333 or x6 x6 x29 2d4: 0140006f jal x0 20 <dExit1> 000002d8 <dElse1>: 2d8: 00131313 slli x6 x6 1 2dc: fffe0e13 addi x28 x28 -1 2e0: 01d37f33 and x30 x6 x29 2e4: fe0f1ae3 bne x30 x0 -12 <dElse1> 000002e8 <dExit1>: 2e8: ff810113 addi x2 x2 -8 2ec: 00712023 sw x7 0 x2 2f0: 01c12223 sw x28 4 x2 2f4: 01f2d393 srli x7 x5 31 2f8: 0013f393 andi x7 x7 1 2fc: 0172de13 srli x28 x5 23 300: 0ffe7e13 andi x28 x28 255 304: 00800eb7 lui x29 0x800 308: fffe8e93 addi x29 x29 -1 30c: 01d2f2b3 and x5 x5 x29 310: 00800eb7 lui x29 0x800 314: 01c05663 bge x0 x28 12 <dElse2> 318: 01d2e2b3 or x5 x5 x29 31c: 0140006f jal x0 20 <dExit2> 00000320 <dElse2>: 320: 00129293 slli x5 x5 1 324: fffe0e13 addi x28 x28 -1 328: 01d2ff33 and x30 x5 x29 32c: fe0f1ae3 bne x30 x0 -12 <dElse2> 00000330 <dExit2>: 330: 00012e83 lw x29 0 x2 334: 01d3c3b3 xor x7 x7 x29 338: 00412e83 lw x29 4 x2 33c: 07fe8e93 addi x29 x29 127 340: 41ce8e33 sub x28 x29 x28 344: 00810113 addi x2 x2 8 348: 00535663 bge x6 x5 12 <dExit3> 34c: 00131313 slli x6 x6 1 350: fffe0e13 addi x28 x28 -1 00000354 <dExit3>: 354: 01900f13 addi x30 x0 25 358: 00000eb3 add x29 x0 x0 35c: 000e5863 bge x28 x0 16 <dExit4> 360: 01cf0f33 add x30 x30 x28 364: 
00000e33 add x28 x0 x0 368: 080f4663 blt x30 x0 140 <div_out> 0000036c <dExit4>: 36c: 00000f93 addi x31 x0 0 00000370 <dloop>: 370: 03efd063 bge x31 x30 32 <doutloop> 374: 001e9e93 slli x29 x29 1 378: 00534663 blt x6 x5 12 <dExit5> 37c: 40530333 sub x6 x6 x5 380: 001eee93 ori x29 x29 1 00000384 <dExit5>: 384: 00131313 slli x6 x6 1 388: 001f8f93 addi x31 x31 1 38c: fe5ff06f jal x0 -28 <dloop> 00000390 <doutloop>: 390: 00134f13 xori x30 x6 1 394: 001ef313 andi x6 x29 1 398: 002ef293 andi x5 x29 2 39c: 001ede93 srli x29 x29 1 3a0: 01e2e2b3 or x5 x5 x30 3a4: 00537333 and x6 x6 x5 3a8: 006e8eb3 add x29 x29 x6 3ac: 020e1663 bne x28 x0 44 <dExit6> 3b0: 00900313 addi x6 x0 9 3b4: 0262d263 bge x5 x6 36 <dExit6> 3b8: ffc10113 addi x2 x2 -4 3bc: 00712023 sw x7 0 x2 3c0: e05ff0ef jal x1 -508 <CLZ> 3c4: 00012383 lw x7 0 x2 3c8: 00410113 addi x2 x2 4 3cc: 40530333 sub x6 x6 x5 3d0: 006edeb3 srl x29 x29 x6 3d4: 006e0e33 add x28 x28 x6 000003d8 <dExit6>: 3d8: 00800f37 lui x30 0x800 3dc: ffff0f13 addi x30 x30 -1 3e0: 01eefeb3 and x29 x29 x30 3e4: 017e1e13 slli x28 x28 23 3e8: 01f39393 slli x7 x7 31 3ec: 01ceeeb3 or x29 x29 x28 3f0: 007eeeb3 or x29 x29 x7 000003f4 <div_out>: 3f4: 00012083 lw x1 0 x2 3f8: 00410113 addi x2 x2 4 3fc: 00008067 jalr x0 x1 0 00000400 <printResult>: 400: 00040293 addi x5 x8 0 404: 00050313 addi x6 x10 0 408: 00070393 addi x7 x14 0 40c: 10000517 auipc x10 0x10000 410: c1050513 addi x10 x10 -1008 414: 00400893 addi x17 x0 4 418: 00000073 ecall 41c: 00028513 addi x10 x5 0 420: 00100893 addi x17 x0 1 424: 00000073 ecall 428: 10000517 auipc x10 0x10000 42c: c0e50513 addi x10 x10 -1010 430: 00400893 addi x17 x0 4 434: 00000073 ecall 438: 00070513 addi x10 x14 0 43c: 00100893 addi x17 x0 1 440: 00000073 ecall 444: 10000517 auipc x10 0x10000 448: c1050513 addi x10 x10 -1008 44c: 00400893 addi x17 x0 4 450: 00000073 ecall 454: 00030513 addi x10 x6 0 458: 00100893 addi x17 x0 1 45c: 00000073 ecall 460: 00a70e63 beq x14 x10 28 <CORRECT> 464: 00178793 addi x15 x15 1 
468: 10000517 auipc x10 0x10000 46c: c0d50513 addi x10 x10 -1011 470: 00400893 addi x17 x0 4 474: 00000073 ecall 478: 0140006f jal x0 20 <printout> 0000047c <CORRECT>: 47c: 10000517 auipc x10 0x10000 480: bf950513 addi x10 x10 -1031 484: 00400893 addi x17 x0 4 488: 00000073 ecall 0000048c <printout>: 48c: 00008067 jalr x0 x1 0 00000490 <conclude>: 490: 10000517 auipc x10 0x10000 494: c1350513 addi x10 x10 -1005 498: 00400893 addi x17 x0 4 49c: 00000073 ecall 4a0: 00078513 addi x10 x15 0 4a4: 00100893 addi x17 x0 1 4a8: 00000073 ecall 4ac: 10000517 auipc x10 0x10000 4b0: c0650513 addi x10 x10 -1018 4b4: 00400893 addi x17 x0 4 4b8: 00000073 ecall 4bc: 00008067 jalr x0 x1 0 ``` ### Result The code above can only compute the square root of an integer, which means that the input data should be in the range of $[1, 2^{31}]$. As a result, I selected three integers as my testing data: 1160030, 25, and 500. Each output is exactly the same as the value I calculated with the C code, where both implementations run 5 Newton iterations. ![image](https://hackmd.io/_uploads/rJLav421ke.png) ## Reference: :::danger Always refer to primary sources, such as official RISC-V documentation. :::