# Assignment1: RISC-V Assembly and Instruction Pipeline
The bfloat16 format is a 16-bit floating-point representation, designed to provide a wide dynamic range by using a floating radix point. It is a shortened version of the 32-bit IEEE 754 single-precision format (binary32), aimed at accelerating machine learning.
The structure of the bfloat16 floating-point format is as follows.
## Quiz 1 Problem B:
```
        ┌ sign
        │
        │   ┌ exponent
        │   │
        │   │      ┌ mantissa
        │   │      │
        │┌──┴───┐┌─┴───┐
      0b0000000000000000 bfloat16
```
### Implementation
#### C code
:::danger
Why was `union` used rather than arbitrary pointer access?
:::
```c
/* A bfloat16 value, kept as its raw 16-bit pattern (1 sign, 8 exponent,
 * 7 mantissa bits — the upper half of a binary32 word). */
typedef struct {
    uint16_t bits;
} bf16_t;

/*
 * Narrow a binary32 value to bfloat16.
 *
 * NaN inputs keep their upper 16 bits and get bit 6 set so the result is a
 * quiet NaN. Every other input is rounded to nearest, ties to even, by adding
 * 0x7fff plus the lowest kept bit before dropping the low 16 bits.
 *
 * The float is reinterpreted through a union rather than a pointer cast so
 * that no object is accessed through an incompatible pointer type.
 */
static inline bf16_t fp32_to_bf16(float s)
{
    union {
        float f;
        uint32_t i;
    } word = {.f = s};
    bf16_t result;
    const uint32_t magnitude = word.i & 0x7fffffff;

    if (magnitude <= 0x7f800000) {
        /* Finite value or infinity: round to nearest even. */
        const uint32_t lowest_kept_bit = (word.i >> 16) & 1;
        result.bits = (word.i + 0x7fff + lowest_kept_bit) >> 16;
    } else {
        /* NaN: truncate and force the quiet bit. */
        result.bits = (word.i >> 16) | 0x40;
    }
    return result;
}

/*
 * Widen a bfloat16 value to binary32 by placing its 16 bits in the upper
 * half of the word; the lower 16 bits are zero, so the conversion is exact.
 */
static inline float bf16_to_fp32(bf16_t h)
{
    union {
        float f;
        uint32_t i;
    } word;

    word.i = (uint32_t)h.bits << 16;
    return word.f;
}
```
Compile result:
<s>

</s>
:::danger
Do not use screenshots for plain text content, as this is inaccessible to visually impaired users.
:::
#### Assembly code
```asm
# Test driver: loads one binary32 bit pattern and converts it to bfloat16.
.data
arr: .word 0xc1cc0000 # int arr = 0xc1cc0000; (bit pattern of -25.5f)
.text
main:
# The argument is passed in s0, which is where fp32_to_fp16 reads it from.
lw s0, arr # s0 = arr;
jal ra, fp32_to_fp16 # a0 = fp32_to_fp16(s0);
# Terminate through the environment call selected by a7 = 10.
li a7,10 # exit(0);
ecall # exit(0);
# fp32_to_fp16: convert a binary32 bit pattern to bfloat16 bits.
#   in : s0 = binary32 bit pattern
#   out: a0 = bfloat16 bit pattern in the low 16 bits
#   clobbers t0, t1, t2; s0 and ra are saved and restored
# A value is a NaN only when its magnitude is strictly above the bit
# pattern of +infinity (0x7f800000). Testing the exponent field alone would
# also catch +/-infinity and wrongly turn it into a quiet NaN.
fp32_to_fp16:
addi sp, sp, -8 # sp -= 8;
sw ra, 4(sp) # *(sp + 4) = ra;
sw s0, 0(sp) # *sp = s0;
slli t0, s0, 1 # t0 = s0 << 1; (drop the sign bit)
srli t0, t0, 1 # t0 = s0 & 0x7fffffff;
li t1, 0x7f800000 # t1 = bits of +infinity;
srli a0, s0, 16 # a0 = s0 >> 16;
bgeu t1, t0, Else # if (t0 <= t1) goto Else; (not a NaN)
ori a0, a0, 64 # a0 |= 64; (force a quiet NaN)
j Exit # goto Exit;
Else:
# Round to nearest even: add 0x7fff plus the lowest kept bit, then truncate.
andi a0, a0, 1 # a0 &= 1;
li t2, 0x7fff # t2 = 0x7fff;
add a0, a0, t2 # a0 += t2;
add a0, s0, a0 # a0 = s0 + a0;
srli a0, a0, 0x10 # a0 >>= 16;
Exit:
lw s0, 0(sp) # s0 = *sp;
lw ra, 4(sp) # ra = *(sp + 4);
addi sp, sp, 8 # sp += 8;
jr ra # return;
# bf16_to_fp32: widen bfloat16 bits to a binary32 bit pattern.
#   in : s0 = bfloat16 bit pattern in the low 16 bits
#   out: a0 = binary32 bit pattern (s0 << 16)
# The frame saves and restores the same register (s0), so the caller's s0
# is unchanged on return.
bf16_to_fp32:
addi sp, sp, -8 # sp -= 8;
sw ra, 4(sp) # *(sp + 4) = ra;
sw s0, 0(sp) # *sp = s0;
slli a0, s0, 16 # a0 = s0 << 16;
lw s0, 0(sp) # s0 = *sp;
lw ra, 4(sp) # ra = *(sp + 4);
addi sp, sp, 8 # sp += 8;
jr ra # return;
```
## Implement square root using CLZ method
### Using Newton's method
The Newton-Raphson method, also known as Newton's method, is widely used for finding the roots of polynomials. The algorithm for Newton's method is as follows:
Given the function $f(x)$, we can find its root using the iterative process:
$$
x_{k+1}=x_k-\frac{f(x_k)}{f'(x_k)}\ \ \ \ \
\text{where } k\in \{0,1,2,\dots\}
$$
Applying this method to solve $x^2=a$, the iterative process becomes:
$$
x_{k+1}=x_k-\frac{x_k^2-a}{2x_k}\\ =\frac{1}{2}\left(x_k+\frac{a}{x_k}\right)
$$
Distributing the factor $\frac{1}{2}$, the update can equivalently be written as:
$$
x_{k+1} =0.5\,x_k+0.5\,\frac{a}{x_k}
$$
This formula helps refine the estimate of the square root of $a$ through each iteration.
### Initial guess
With the formula above, we can easily calculate the precise square root. However, selecting an appropriate initial guess is crucial. A good initial guess can significantly reduce the number of iterations needed to reach the correct root. Using CLZ (count leading zeros) is an effective method for determining a suitable initial guess.
### Code implementation
#### C code
```
 ________________________________________________________________
|_1_|_______8______|____________________23_______________________|
 sign    exponent                     mantissa
```
Calculate leading zeros
```c=
/*
 * Count the leading zero bits of a 32-bit word.
 *
 * The highest set bit is first smeared into every lower position, which
 * leaves (32 - clz) one-bits. Those are then counted with a parallel
 * population count. Returns 32 for x == 0 and 0 when bit 31 is set.
 */
uint8_t CLZ (uint32_t x){
    /* Propagate the most significant set bit to all lower bits. */
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;

    /* Population count: 2-bit, 4-bit, then 8-bit partial sums. */
    x -= ((x >> 1) & 0x55555555);
    x = ((x >> 2) & 0x33333333) + (x & 0x33333333);
    x = ((x >> 4) + x) & 0x0f0f0f0f;

    /* Fold all four byte counts into the low byte. */
    x += (x >> 8);
    x += (x >> 16);

    /* The count ranges over 0..32, so six bits are needed to hold it. */
    return (uint8_t)(32 - (x & 0x3f));
}
```
Float addition
```c=
/*
 * Software addition of two binary32 values using integer operations only.
 *
 * The operand with the larger magnitude supplies the result sign and the
 * starting exponent; the smaller operand's mantissa is shifted right to
 * align with it. Bits shifted out during alignment are discarded, so the
 * result is truncated rather than rounded to nearest.
 *
 * Special cases:
 *   - a zero operand returns the other operand unchanged;
 *   - a NaN operand returns that NaN, inf + (-inf) returns a quiet NaN,
 *     any other sum involving an infinity returns that infinity;
 *   - exact cancellation returns +0.0f;
 *   - results too small for a normal number are returned as subnormals;
 *   - exponent overflow returns a signed infinity.
 */
static inline float add_float(float a, float b){
    // A zero operand contributes nothing.
    if (a == 0.0f) {
        return b;
    }
    if (b == 0.0f) {
        return a;
    }

    union {
        uint32_t bits;
        float value;
    } fpa = {.value = a}, fpb = {.value = b}, out;

    uint32_t sign_a = fpa.bits & 0x80000000;
    uint32_t sign_b = fpb.bits & 0x80000000;
    uint32_t abs_a = fpa.bits & 0x7fffffff;
    uint32_t abs_b = fpb.bits & 0x7fffffff;

    // Ensure abs_a >= abs_b; the signs must travel with their magnitudes.
    if (abs_a < abs_b) {
        uint32_t temp = abs_a;
        abs_a = abs_b;
        abs_b = temp;
        temp = sign_a;
        sign_a = sign_b;
        sign_b = temp;
    }

    // NaN or infinity: abs_a is the larger magnitude, so it decides.
    if (abs_a >= 0x7f800000) {
        out.bits = sign_a | abs_a;
        if (abs_a == 0x7f800000 && abs_b == 0x7f800000 && sign_a != sign_b) {
            out.bits = 0x7fc00000; // inf + (-inf) is not a number
        }
        return out.value;
    }

    // Extract exponents and mantissas. Subnormals have no implicit 1 and
    // share the scale of exponent 1.
    int exp_a = (int)(abs_a >> 23);
    int exp_b = (int)(abs_b >> 23);
    uint32_t mantissa_a = abs_a & 0x7fffff;
    uint32_t mantissa_b = abs_b & 0x7fffff;
    if (exp_a > 0) {
        mantissa_a |= 0x800000;
    } else {
        exp_a = 1;
    }
    if (exp_b > 0) {
        mantissa_b |= 0x800000;
    } else {
        exp_b = 1;
    }

    // Align mantissa_b with mantissa_a. A shift count of 32 or more is
    // undefined in C, and the operand vanishes entirely in that case anyway.
    int diff_exp = exp_a - exp_b;
    mantissa_b = (diff_exp < 32) ? (mantissa_b >> diff_exp) : 0;

    // Opposite signs subtract magnitudes; abs_a >= abs_b keeps this >= 0.
    uint32_t mantissa;
    if (sign_a != sign_b) {
        mantissa = mantissa_a - mantissa_b;
    } else {
        mantissa = mantissa_a + mantissa_b;
    }
    if (mantissa == 0) {
        return 0.0f; // exact cancellation
    }

    // Normalize so that bit 23 is the leading 1, without letting the
    // exponent drop below the smallest normal exponent.
    int exp = exp_a;
    if (mantissa & 0x1000000) {
        mantissa >>= 1;
        exp++;
    } else {
        while (!(mantissa & 0x800000) && exp > 1) {
            mantissa <<= 1;
            exp--;
        }
    }

    if (!(mantissa & 0x800000)) {
        exp = 0; // still no leading 1: encode as a subnormal
    }
    if (exp >= 0xff) {
        exp = 0xff; // overflow: signed infinity
        mantissa = 0;
    }

    out.bits = sign_a | ((uint32_t)exp << 23) | (mantissa & 0x7fffff);
    return out.value;
}
```
Float division
```c=
/*
 * Software division p / q of two binary32 values using integer operations.
 *
 * Both mantissas are normalized to 24 bits (subnormal inputs are shifted up
 * with the help of CLZ), the quotient is produced one bit per iteration by
 * restoring long division, and the result is rounded to nearest, ties to
 * even.
 *
 * Special cases:
 *   - 0 / 0 returns a quiet NaN, x / 0 returns a signed infinity;
 *   - 0 / q returns a signed zero;
 *   - exponent overflow returns a signed infinity;
 *   - results below the normal range are returned as subnormals, or as a
 *     signed zero once no quotient bit is left.
 * NaN and infinity operands are not treated specially.
 */
static inline float div_float(float p, float q){
    union {
        uint32_t bits;
        float value;
    } fpp = {.value = p}, fpq = {.value = q}, output;

    // Extract sign, exponent, and mantissa of both operands
    uint32_t sign = (fpp.bits >> 31) ^ (fpq.bits >> 31);
    int exp_p = (int)((fpp.bits >> 23) & 0xff);
    int exp_q = (int)((fpq.bits >> 23) & 0xff);
    uint32_t mantissa_p = fpp.bits & 0x7FFFFF;
    uint32_t mantissa_q = fpq.bits & 0x7FFFFF;

    // Zero operands must be settled first: a zero mantissa can never be
    // normalized and a zero divisor would make every quotient bit 1.
    if (exp_q == 0 && mantissa_q == 0) {
        if (exp_p == 0 && mantissa_p == 0) {
            output.bits = 0x7FC00000;
        } else {
            output.bits = (sign << 31) | 0x7F800000;
        }
        return output.value;
    }
    if (exp_p == 0 && mantissa_p == 0) {
        output.bits = sign << 31;
        return output.value;
    }

    // Normalize mantissa of p
    if (exp_p > 0) {
        mantissa_p |= 0x800000; // Add implicit 1
    } else {
        int dif = CLZ(mantissa_p) - 8; // shift that moves the top bit to bit 23
        mantissa_p <<= dif;
        exp_p = 1 - dif;
    }

    // Normalize mantissa of q
    if (exp_q > 0) {
        mantissa_q |= 0x800000; // Add implicit 1
    } else {
        int dif = CLZ(mantissa_q) - 8;
        mantissa_q <<= dif;
        exp_q = 1 - dif;
    }

    // Biased exponent of the quotient
    int exp = exp_p - exp_q + 127;

    // Make mantissa_p >= mantissa_q so the first quotient bit is 1
    if (mantissa_p < mantissa_q) {
        mantissa_p <<= 1;
        exp--;
    }

    if (exp >= 0xff) {
        output.bits = (sign << 31) | 0x7F800000; // overflow
        return output.value;
    }

    // A normal result needs 24 mantissa bits plus one rounding bit. A
    // subnormal result (exp <= 0) has no implicit 1 and loses one further
    // bit for every step below the smallest normal exponent.
    int nbits = 25;
    if (exp <= 0) {
        nbits = 24 + exp;
        exp = 0;
        if (nbits < 0) {
            output.bits = sign << 31; // too small even to round up
            return output.value;
        }
    }

    // Perform division of mantissas using bitwise long division
    uint32_t mantissa = 0;
    for (int i = 0; i < nbits; i++) {
        mantissa <<= 1;
        if (mantissa_p >= mantissa_q) {
            mantissa_p -= mantissa_q;
            mantissa |= 1;
        }
        mantissa_p <<= 1;
    }

    // Round to nearest, ties to even. All three flags are 0 or 1 so they
    // can be combined bitwise.
    uint32_t sticky = (mantissa_p != 0);   // non-zero remainder
    uint32_t rnd = mantissa & 1;           // first discarded bit
    uint32_t odd = (mantissa >> 1) & 1;    // lowest kept bit
    mantissa = (mantissa >> 1) + (rnd & (sticky | odd));

    // Combine the sign, exponent, and mantissa to form the final result.
    // For a normal result bit 23 of the mantissa is the implicit 1, so it
    // is added on top of (exp - 1). A rounding carry then moves into the
    // exponent field by itself, which also promotes the largest subnormal
    // to the smallest normal number.
    uint32_t base = (exp > 0) ? ((uint32_t)(exp - 1) << 23) : 0;
    output.bits = (sign << 31) | (base + mantissa);
    return output.value;
}
```
```c=
/*
 * Approximate the square root of an integer with Newton's iteration
 * x <- (x + alpha / x) / 2, using the software float helpers.
 *
 * alpha: integer whose square root is wanted.
 * Returns the estimate after `iteration` refinement steps.
 *
 * Relies on `int2float` and `iteration`, which are not defined in this
 * snippet — presumably declared elsewhere in the program.
 */
static inline float Newtons_method(int alpha){
    // Half of the bit length of alpha, derived from its leading-zero count.
    int lzc = (32 - CLZ(alpha))/2;

    // Initial guess.
    // NOTE(review): sqrt(alpha) is on the order of 2^lzc, so
    // int2float(1 << lzc) may be the intended seed — confirm. The value is
    // kept as-is here.
    float output = int2float(lzc);
    if (output == 0){
        output = 2;
    }
    float input = int2float(alpha);

    // Iteration loop: output = (output + input / output) / 2
    for (int i = 0; i < iteration; i++){
        float temp = div_float(input, output);
        output = add_float(output , temp);
        output = div_float(output, (float) 2);
    }
    return output;
}
```
#### Assembly code
```asm=
.data
# Test inputs and the expected square roots as binary32 bit patterns.
inputdata: .word 1160030, 25, 500
answer: .word 0x4486a180,0x40a00000, 0x41B2E389
# Number of Newton iterations performed for every input.
iteration: .word 5
str1: .string "\n The testing number is: "
str2: .string "\n The correct root number is:"
str3: .string "\n The calculated root number is:"
str4: .string "\n The answer is CORRECT"
str5: .string "\n The answer is WRONG"
str6: .string "\n we got total"
str7: .string "\n error"
.text
# main: run Newtons_method on every entry of inputdata and report each
# result against the matching entry of answer.
#   s3 = cursor into inputdata   s4 = cursor into answer
#   s2 = loop index              s1 = number of test cases
#   a5 = mismatch counter, incremented by printResult
main:
la s3, inputdata
li s1, 3                  # both tables hold three entries
li s2, 0
la s4, answer
mv a5, x0
mloop:
lw a0, 0(s3)
mv s0, a0                 # printResult reads the original input from s0
jal ra, Newtons_method
lw a4, 0(s4)              # expected bit pattern for printResult
jal ra, printResult
addi s4, s4, 4
addi s2, s2, 1
addi s3, s3, 4
blt s2, s1, mloop
jal ra, conclude
#Exit the program
li a7, 10
ecall
# Newtons_method: approximate the square root of an integer with the
# iteration x <- (x + input / x) / 2.
#   in : a0 = integer input
#   out: a0 = binary32 bit pattern of the estimate
#   a1 = input converted to binary32
#   a2 = loop counter, a3 = iteration limit loaded from `iteration`
#   t0-t6 are clobbered by the helper routines
Newtons_method:
addi sp, sp, -8
sw ra, 4(sp)
# keep the raw integer input for the later conversion
sw a0, 0(sp)
# get initial guess: (32 - clz(input)) / 2, converted to float
add t0, a0, x0
jal ra, CLZ
add a0, t0, x0
addi a0, a0, -32
sub a0, x0, a0
srli a0, a0, 1
mv t3, a0
jal ra, int2float
mv a0, t3
bne a0, x0, nExit
li a0, 0x40000000         # fall back to 2.0f when the guess is zero
nExit:
# convert the input to float and load the total iteration count
lw a1, 0(sp)
mv t3, a1
jal ra, int2float
mv a1, t3
mv a2, x0
lw a3, iteration
loop:
bge a2, a3, outloop #if a2 >= a3 jump to output loop
add t0, a0, x0
add t1, a1, x0
jal ra, div_float # t4 = t1/t0 (input / estimate)
add t0, a0, x0
jal ra, add_float # t1 = t4 + t0
li t0, 0x40000000
jal ra, div_float # t4 = t1/t0 (halve the sum)
add a0, t4, x0 # a0(output) = t4
addi a2, a2, 1
j loop
outloop:
lw ra, 4(sp)
addi sp, sp, 8
jr ra
# add_float: t1 = t0 + t4 on binary32 bit patterns.
#   in : t0, t4 = operands
#   out: t1 = sum (mantissa bits lost during alignment are truncated)
#   clobbers t0, t2, t3, t4, t5, t6
# The operand with the larger magnitude is kept in t4; it supplies the
# result sign and the starting exponent.
add_float:
addi sp, sp, -4
sw ra, 0(sp)
# calculate abs
li t2, 0x7fffffff
and t1, t0, t2 # t1 = abs (t0)
and t2, t4, t2 # t2 = abs (t4)
# we always make sure (abs(t4) >= abs(t0))
bge t2, t1, aExit1
# switch t0 and t4
mv t3, t0
mv t0, t4
mv t4, t3
# switch t1 and t2
mv t3, t1
mv t1, t2
mv t2, t3
aExit1:
srli t6, t2, 23 # t6 = exponent of the larger operand
srli t5, t1, 23 # t5 = exponent of the smaller operand
# compute both mantissas; the implicit 1 is added only for normal numbers
li t3, 0x7fffff
and t2, t2, t3
and t1, t1, t3
bge x0, t6, aExit2
li t3, 0x800000
or t2, t2, t3
aExit2:
bge x0, t5, aExit3
li t3, 0x800000
or t1, t3, t1
aExit3:
# now we have t4 -> t2(mantissa), t6 (exp)
# t0 -> t1(mantissa), t5 (exp)
sub t5, t6, t5 # t5 is the diff of exp (never negative)
# srl only looks at the low 5 bits of the shift amount, so a difference of
# 32 or more has to clear the smaller mantissa explicitly
sltiu t3, t5, 32 # t3 = 1 when the shift is in range
sub t3, x0, t3 # t3 = all ones when in range, else 0
srl t1, t1, t5
and t1, t1, t3
srli t0, t0, 31
srli t4, t4, 31
# now t0, t4 are the sign bit
xor t0, t0, t4
beq t0, x0, aElse4
sub t3, t2, t1 # opposite signs: subtract magnitudes
j aExit4
aElse4:
add t3, t2, t1 # same sign: add magnitudes
aExit4:
# we can release t2, t1, t0, t5
# t3 mantissa
# t4 sign
# t6 exp
add t1, x0, x0 # default result is +0.0
beq t3, x0, add_out # exact cancellation
li t0, 0x1000000
and t0, t0, t3
beq t0, x0, aElse5
srli t3, t3, 1 # carry into bit 24: renormalize
addi t6, t6, 1
j aExit5
aElse5:
# shift left until bit 23 is set; t3 is non-zero here so this terminates
li t0, 0x800000
and t0, t3, t0
bne t0, x0, aExit5
slli t3, t3, 1
addi t6, t6, -1
j aElse5
aExit5:
bge x0, t6, add_out # exponent underflow: return the 0 already in t1
li t2, 0x7fffff
and t1, t3, t2
slli t6, t6, 23
or t1, t1, t6
slli t4, t4, 31
or t1, t1, t4
add_out:
lw ra, 0(sp)
addi sp, sp, 4
jr ra
# CLZ: count the leading zero bits of a 32-bit word.
#   in : t0 = value
#   out: t0 = number of leading zeros (32 for an input of 0)
#   t1 and t2 are temporaries
CLZ:
# smear the highest set bit into every lower position
srli t1, t0, 1 # t1 = t0 >> 1
or t0, t0, t1 # t0 = t1 | t0
srli t1, t0, 2 # t1 = t0 >> 2
or t0, t0, t1 # t0 = t1 | t0
srli t1, t0, 4 # t1 = t0 >> 4
or t0, t0, t1 # t0 = t1 | t0
srli t1, t0, 8 # t1 = t0 >> 8
or t0, t0, t1 # t0 = t1 | t0
srli t1, t0, 16 # t1 = t0 >> 16
or t0, t0, t1 # t0 = t1 | t0
# population count of the smeared value
srli t1, t0, 1 # t1 = t0 >> 1
li t2, 0x55555555
and t1, t1, t2
sub t0, t0, t1 # 2-bit partial sums
srli t1, t0, 2
li t2, 0x33333333
and t1, t1, t2
and t0, t0, t2
add t0, t0, t1 # 4-bit partial sums
srli t1, t0, 4
li t2, 0x0f0f0f0f
add t0, t1, t0
and t0, t0, t2 # 8-bit partial sums
srli t1, t0, 8
add t0, t0, t1
srli t1, t0, 16
add t0, t0, t1 # low byte now holds the total
andi t0, t0, 0x3f # the count ranges over 0..32, so keep six bits
# leading zeros = 32 - popcount
addi t0, t0, -32
sub t0, x0, t0
jr ra
# int2float: convert a non-negative integer to a binary32 bit pattern.
#   in : t3 = integer value
#   out: t3 = binary32 bit pattern (0 stays 0)
#   t0 = exponent part, t1 = shift amount / mask; t2 is clobbered by CLZ
# Values that need more than 24 significant bits are truncated.
int2float:
addi sp, sp, -4
sw ra, 0(sp)
beq t3, x0, intout # 0 converts to +0.0f, whose bit pattern is 0
add t0, t3, x0
jal ra, CLZ
addi t0, t0, -31
sub t0, x0, t0 # t0 = 31 - clz = index of the highest set bit
addi t1, t0, -23 # t1 = distance of that bit from bit 23
blt x0, t1, intRight
sub t1, x0, t1
sll t3, t3, t1 # highest bit below bit 23: shift up
j intPack
intRight:
srl t3, t3, t1 # highest bit above bit 23: shift down
intPack:
li t1, 0x800000
xor t3, t3, t1 # drop the leading 1, which becomes the implicit bit
addi t0, t0, 127 # biased exponent
slli t0, t0, 23
or t3, t3, t0
intout:
lw ra, 0(sp)
addi sp, sp, 4
jr ra
# div_float: t4 = t1 / t0 on binary32 bit patterns.
#   in : t1 = dividend, t0 = divisor
#   out: t4 = quotient, rounded to nearest with ties to even
#        0 for a zero dividend, 0x7fffffff for a zero divisor
#   t2 sign, t3 exponent, t5/t6 temporaries; t0 and t1 are clobbered
# NaN and infinity operands are not treated specially.
div_float:
addi sp, sp, -4
sw ra, 0(sp)
add t4, x0, x0 # result for a zero dividend
slli t5, t1, 1 # drop the sign so that -0.0 also counts as zero
beq t5, x0, div_out
li t4, 0x7fffffff # value returned for a zero divisor
slli t5, t0, 1
beq t5, x0, div_out
# separate the t1 data into sign|exp|mantissa
# t1 sign t2
srli t2, t1, 31 # t2 = t1 >> 31
# t1 exp t3
srli t3, t1, 23 # t3 = t1 >> 23
andi t3, t3, 0xff # t3 = t3 & 0xff
li t4, 0x7FFFFF # t4 = 0x7FFFFF
and t1, t1, t4 # t1 = t1 & 0x7FFFFF
li t4, 0x800000 # t4 = 0x800000
bge x0, t3, dSub1 # if t3 <= 0 the dividend is subnormal
or t1, t1, t4 # t1 = t1 | 0x800000 (implicit 1)
j dExit1 # jump to dExit1
dSub1:
li t3, 1 # subnormals share the scale of exponent 1
dElse1:
slli t1, t1, 1 # shift up until bit 23 is set
addi t3, t3, -1
and t5, t1, t4
beq t5, x0, dElse1
dExit1:
# save the data t2, t3
addi sp, sp, -8
sw t2, 0(sp) # 0(sp) the t1's sign value
sw t3, 4(sp) # 4(sp) t1's exp
# separate the t0 data into sign|exp|mantissa
# t0 sign t2
srli t2, t0, 31 # t2 = t0 >> 31
# t0 exp t3
srli t3, t0, 23 # t3 = t0 >> 23
andi t3, t3, 0xff # t3 = t3 & 0xff
li t4, 0x7FFFFF # t4 = 0x7FFFFF
and t0, t0, t4 # t0 = t0 & 0x7FFFFF
li t4, 0x800000 # t4 = 0x800000
bge x0, t3, dSub2 # if t3 <= 0 the divisor is subnormal
or t0, t0, t4 # t0 = t0 | 0x800000 (implicit 1)
j dExit2 # jump to dExit2
dSub2:
li t3, 1
dElse2:
slli t0, t0, 1
addi t3, t3, -1
and t5, t0, t4
beq t5, x0, dElse2
dExit2:
lw t4, 0(sp) # t4 = t1's sign value
xor t2, t2, t4 # t2 = result sign
lw t4, 4(sp) # t4 = t1's exp value
addi t4, t4, 127 # t4 = t4 + 127
sub t3, t4, t3 # t3 = biased exponent of the quotient
addi sp, sp, 8 # recover the sp position
bge t1, t0, dExit3 # make t1 >= t0 so the first quotient bit is 1
slli t1, t1, 1
addi t3, t3, -1
dExit3:
li t5, 255
blt t3, t5, dRange
li t4, 0x7f800000 # exponent overflow: signed infinity
slli t2, t2, 31
or t4, t4, t2
j div_out
dRange:
# t5 iteration number: 24 mantissa bits + 1 rounding bit for a normal
# result; a subnormal result has no implicit 1 and loses one more bit for
# every step below the smallest normal exponent
li t5, 25
blt x0, t3, dExit4
addi t5, t3, 24
add t3, x0, x0
slli t4, t2, 31 # signed zero when no quotient bit is left
blt t5, x0, div_out
dExit4:
# division loop output t4
add t4, x0, x0 # t4 = 0;
li t6, 0
dloop:
bge t6, t5, doutloop
slli t4, t4, 1 # t4 = t4 << 1
blt t1, t0, dExit5
sub t1, t1, t0
ori t4, t4, 1
dExit5:
slli t1, t1, 1
addi t6, t6, 1
j dloop
doutloop:
# round result (all three flags are 0 or 1)
# odd t0
# rnd t1
# sticky t5
sltu t5, x0, t1 # sticky = (remainder != 0)
andi t1, t4, 1 # rnd = first discarded bit
srli t4, t4, 1
andi t0, t4, 1 # odd = lowest kept bit
or t0, t0, t5
and t1, t1, t0
add t4, t4, t1
# pack: for a normal result bit 23 of the mantissa is the implicit 1, so it
# is added on top of (exp - 1); a rounding carry moves into the exponent
beq t3, x0, dPack
addi t3, t3, -1
dPack:
slli t3, t3, 23 # exp << 23
add t4, t4, t3
slli t2, t2, 31 # sign << 31
or t4, t4, t2
div_out:
lw ra, 0(sp)
addi sp, sp, 4
jr ra
# printResult: print one test case and count mismatches.
#   in : s0 = original input, a0 = calculated root, a4 = expected root
#   a5 is incremented when the calculated root differs from the expected one
# Uses the environment calls seen throughout this file: a7 = 4 prints the
# string at a0, a7 = 1 prints a0 as an integer.
printResult:
mv t0, s0 # original data
mv t1, a0 # root data
mv t2, a4 # answer
la a0, str1
li a7, 4
ecall
mv a0, t0
li a7, 1
ecall
la a0, str2
li a7, 4
ecall
mv a0, a4
li a7, 1
ecall
la a0, str3
li a7, 4
ecall
mv a0, t1
li a7, 1
ecall
beq a4, a0, CORRECT
# mismatch: count it and report WRONG
addi a5, a5, 1
la a0, str5
li a7, 4
ecall
j printout
CORRECT:
la a0, str4
li a7, 4
ecall
printout:
ret
# conclude: print the summary line built from str6, the counter in a5 and
# str7. a0 and a7 are overwritten.
conclude:
la a0, str6
li a7, 4 # print the string at a0
ecall
mv a0, a5 # a5 holds the count accumulated by printResult
li a7, 1 # print a0 as an integer
ecall
la a0, str7
li a7, 4
ecall
ret
```
## Analysis
### pseudo instruction
```
00000000 <main>:
0: 10000997 auipc x19 0x10000
4: 00098993 addi x19 x19 0
8: 00200493 addi x9 x0 2
c: 00000913 addi x18 x0 0
10: 10000a17 auipc x20 0x10000
14: ffca0a13 addi x20 x20 -4
18: 00000793 addi x15 x0 0
0000001c <mloop>:
1c: 0009a503 lw x10 0 x19
20: 00050413 addi x8 x10 0
24: 028000ef jal x1 40 <Newtons_method>
28: 000a2703 lw x14 0 x20
2c: 3d4000ef jal x1 980 <printResult>
30: 004a0a13 addi x20 x20 4
34: 00190913 addi x18 x18 1
38: 00498993 addi x19 x19 4
3c: fe9940e3 blt x18 x9 -32 <mloop>
40: 450000ef jal x1 1104 <conclude>
44: 00a00893 addi x17 x0 10
48: 00000073 ecall
0000004c <Newtons_method>:
4c: ff810113 addi x2 x2 -8
50: 00112223 sw x1 4 x2
54: 00a12023 sw x10 0 x2
58: 000502b3 add x5 x10 x0
5c: 168000ef jal x1 360 <CLZ>
60: 00028533 add x10 x5 x0
64: fe050513 addi x10 x10 -32
68: 40a00533 sub x10 x0 x10
6c: 00155513 srli x10 x10 1
70: 00050e13 addi x28 x10 0
74: 1d8000ef jal x1 472 <int2float>
78: 000e0513 addi x10 x28 0
7c: 00051463 bne x10 x0 8 <nExit>
80: 40000537 lui x10 0x40000
00000084 <nExit>:
84: 00012583 lw x11 0 x2
88: 00058e13 addi x28 x11 0
8c: 1c0000ef jal x1 448 <int2float>
90: 000e0593 addi x11 x28 0
94: 00000613 addi x12 x0 0
98: 10000697 auipc x13 0x10000
9c: f806a683 lw x13 -128 x13
000000a0 <loop>:
a0: 02d65663 bge x12 x13 44 <outloop>
a4: 000502b3 add x5 x10 x0
a8: 00058333 add x6 x11 x0
ac: 1e8000ef jal x1 488 <div_float>
b0: 000502b3 add x5 x10 x0
b4: 024000ef jal x1 36 <add_float>
b8: 400002b7 lui x5 0x40000
bc: 1d8000ef jal x1 472 <div_float>
c0: 000e8533 add x10 x29 x0
c4: 00160613 addi x12 x12 1
c8: fd9ff06f jal x0 -40 <loop>
000000cc <outloop>:
cc: 00412083 lw x1 4 x2
d0: 00810113 addi x2 x2 8
d4: 00008067 jalr x0 x1 0
000000d8 <add_float>:
d8: ffc10113 addi x2 x2 -4
dc: 00112023 sw x1 0 x2
e0: 800003b7 lui x7 0x80000
e4: fff38393 addi x7 x7 -1
e8: 0072f333 and x6 x5 x7
ec: 007ef3b3 and x7 x29 x7
f0: 0063de63 bge x7 x6 28 <aExit1>
f4: 00028e13 addi x28 x5 0
f8: 000e8293 addi x5 x29 0
fc: 000e0e93 addi x29 x28 0
100: 00030e13 addi x28 x6 0
104: 00038313 addi x6 x7 0
108: 000e0393 addi x7 x28 0
0000010c <aExit1>:
10c: 0173df93 srli x31 x7 23
110: 01735f13 srli x30 x6 23
114: 00800e37 lui x28 0x800
118: fffe0e13 addi x28 x28 -1
11c: 01c3f3b3 and x7 x7 x28
120: 01c37333 and x6 x6 x28
124: 01f05663 bge x0 x31 12 <aExit2>
128: 00800e37 lui x28 0x800
12c: 01c3e3b3 or x7 x7 x28
00000130 <aExit2>:
130: 01e05663 bge x0 x30 12 <aExit3>
134: 00800e37 lui x28 0x800
138: 006e6333 or x6 x28 x6
0000013c <aExit3>:
13c: 41ef8f33 sub x30 x31 x30
140: 01e35333 srl x6 x6 x30
144: 01f2d293 srli x5 x5 31
148: 01fede93 srli x29 x29 31
14c: 01d2c2b3 xor x5 x5 x29
150: 00028663 beq x5 x0 12 <aElse4>
154: 40638e33 sub x28 x7 x6
158: 0080006f jal x0 8 <aExit4>
0000015c <aElse4>:
15c: 00638e33 add x28 x7 x6
00000160 <aExit4>:
160: 010002b7 lui x5 0x1000
164: 01c2f2b3 and x5 x5 x28
168: 00028863 beq x5 x0 16 <aElse5>
16c: 001e5e13 srli x28 x28 1
170: 001f8f93 addi x31 x31 1
174: 0200006f jal x0 32 <aExit5>
00000178 <aElse5>:
178: 000e0e63 beq x28 x0 28 <aExit5>
17c: 008002b7 lui x5 0x800
180: 005e72b3 and x5 x28 x5
184: 00029863 bne x5 x0 16 <aExit5>
188: 001e1e13 slli x28 x28 1
18c: ffff8f93 addi x31 x31 -1
190: fe9ff06f jal x0 -24 <aElse5>
00000194 <aExit5>:
194: 00000333 add x6 x0 x0
198: 03f05063 bge x0 x31 32 <add_out>
19c: 008003b7 lui x7 0x800
1a0: fff38393 addi x7 x7 -1
1a4: 007e7333 and x6 x28 x7
1a8: 017f9f93 slli x31 x31 23
1ac: 01f36333 or x6 x6 x31
1b0: 01fe9e93 slli x29 x29 31
1b4: 01d36333 or x6 x6 x29
000001b8 <add_out>:
1b8: 00012083 lw x1 0 x2
1bc: 00410113 addi x2 x2 4
1c0: 00008067 jalr x0 x1 0
000001c4 <CLZ>:
1c4: 0012d313 srli x6 x5 1
1c8: 0062e2b3 or x5 x5 x6
1cc: 0022d313 srli x6 x5 2
1d0: 0062e2b3 or x5 x5 x6
1d4: 0042d313 srli x6 x5 4
1d8: 0062e2b3 or x5 x5 x6
1dc: 0082d313 srli x6 x5 8
1e0: 0062e2b3 or x5 x5 x6
1e4: 0102d313 srli x6 x5 16
1e8: 0062e2b3 or x5 x5 x6
1ec: 0012d313 srli x6 x5 1
1f0: 555553b7 lui x7 0x55555
1f4: 55538393 addi x7 x7 1365
1f8: 00737333 and x6 x6 x7
1fc: 406282b3 sub x5 x5 x6
200: 0022d313 srli x6 x5 2
204: 333333b7 lui x7 0x33333
208: 33338393 addi x7 x7 819
20c: 00737333 and x6 x6 x7
210: 0072f2b3 and x5 x5 x7
214: 006282b3 add x5 x5 x6
218: 0042d313 srli x6 x5 4
21c: 0f0f13b7 lui x7 0xf0f1
220: f0f38393 addi x7 x7 -241
224: 005302b3 add x5 x6 x5
228: 0072f2b3 and x5 x5 x7
22c: 0082d313 srli x6 x5 8
230: 006282b3 add x5 x5 x6
234: 0102d313 srli x6 x5 16
238: 006282b3 add x5 x5 x6
23c: 01f2f293 andi x5 x5 31
240: fe028293 addi x5 x5 -32
244: 405002b3 sub x5 x0 x5
248: 00008067 jalr x0 x1 0
0000024c <int2float>:
24c: ffc10113 addi x2 x2 -4
250: 00112023 sw x1 0 x2
254: 000e02b3 add x5 x28 x0
258: f6dff0ef jal x1 -148 <CLZ>
25c: 00128293 addi x5 x5 1
260: fe028293 addi x5 x5 -32
264: 405002b3 sub x5 x0 x5
268: fe928313 addi x6 x5 -23
26c: 40600333 sub x6 x0 x6
270: 006e1e33 sll x28 x28 x6
274: 00800337 lui x6 0x800
278: 006e4e33 xor x28 x28 x6
27c: 07f28293 addi x5 x5 127
280: 01729293 slli x5 x5 23
284: 005e6e33 or x28 x28 x5
00000288 <intout>:
288: 00012083 lw x1 0 x2
28c: 00410113 addi x2 x2 4
290: 00008067 jalr x0 x1 0
00000294 <div_float>:
294: ffc10113 addi x2 x2 -4
298: 00112023 sw x1 0 x2
29c: 00000eb3 add x29 x0 x0
2a0: 14030a63 beq x6 x0 340 <div_out>
2a4: 80000eb7 lui x29 0x80000
2a8: fffe8e93 addi x29 x29 -1
2ac: 14028463 beq x5 x0 328 <div_out>
2b0: 01f35393 srli x7 x6 31
2b4: 01735e13 srli x28 x6 23
2b8: 0ffe7e13 andi x28 x28 255
2bc: 00800eb7 lui x29 0x800
2c0: fffe8e93 addi x29 x29 -1
2c4: 01d37333 and x6 x6 x29
2c8: 00800eb7 lui x29 0x800
2cc: 01c05663 bge x0 x28 12 <dElse1>
2d0: 01d36333 or x6 x6 x29
2d4: 0140006f jal x0 20 <dExit1>
000002d8 <dElse1>:
2d8: 00131313 slli x6 x6 1
2dc: fffe0e13 addi x28 x28 -1
2e0: 01d37f33 and x30 x6 x29
2e4: fe0f1ae3 bne x30 x0 -12 <dElse1>
000002e8 <dExit1>:
2e8: ff810113 addi x2 x2 -8
2ec: 00712023 sw x7 0 x2
2f0: 01c12223 sw x28 4 x2
2f4: 01f2d393 srli x7 x5 31
2f8: 0013f393 andi x7 x7 1
2fc: 0172de13 srli x28 x5 23
300: 0ffe7e13 andi x28 x28 255
304: 00800eb7 lui x29 0x800
308: fffe8e93 addi x29 x29 -1
30c: 01d2f2b3 and x5 x5 x29
310: 00800eb7 lui x29 0x800
314: 01c05663 bge x0 x28 12 <dElse2>
318: 01d2e2b3 or x5 x5 x29
31c: 0140006f jal x0 20 <dExit2>
00000320 <dElse2>:
320: 00129293 slli x5 x5 1
324: fffe0e13 addi x28 x28 -1
328: 01d2ff33 and x30 x5 x29
32c: fe0f1ae3 bne x30 x0 -12 <dElse2>
00000330 <dExit2>:
330: 00012e83 lw x29 0 x2
334: 01d3c3b3 xor x7 x7 x29
338: 00412e83 lw x29 4 x2
33c: 07fe8e93 addi x29 x29 127
340: 41ce8e33 sub x28 x29 x28
344: 00810113 addi x2 x2 8
348: 00535663 bge x6 x5 12 <dExit3>
34c: 00131313 slli x6 x6 1
350: fffe0e13 addi x28 x28 -1
00000354 <dExit3>:
354: 01900f13 addi x30 x0 25
358: 00000eb3 add x29 x0 x0
35c: 000e5863 bge x28 x0 16 <dExit4>
360: 01cf0f33 add x30 x30 x28
364: 00000e33 add x28 x0 x0
368: 080f4663 blt x30 x0 140 <div_out>
0000036c <dExit4>:
36c: 00000f93 addi x31 x0 0
00000370 <dloop>:
370: 03efd063 bge x31 x30 32 <doutloop>
374: 001e9e93 slli x29 x29 1
378: 00534663 blt x6 x5 12 <dExit5>
37c: 40530333 sub x6 x6 x5
380: 001eee93 ori x29 x29 1
00000384 <dExit5>:
384: 00131313 slli x6 x6 1
388: 001f8f93 addi x31 x31 1
38c: fe5ff06f jal x0 -28 <dloop>
00000390 <doutloop>:
390: 00134f13 xori x30 x6 1
394: 001ef313 andi x6 x29 1
398: 002ef293 andi x5 x29 2
39c: 001ede93 srli x29 x29 1
3a0: 01e2e2b3 or x5 x5 x30
3a4: 00537333 and x6 x6 x5
3a8: 006e8eb3 add x29 x29 x6
3ac: 020e1663 bne x28 x0 44 <dExit6>
3b0: 00900313 addi x6 x0 9
3b4: 0262d263 bge x5 x6 36 <dExit6>
3b8: ffc10113 addi x2 x2 -4
3bc: 00712023 sw x7 0 x2
3c0: e05ff0ef jal x1 -508 <CLZ>
3c4: 00012383 lw x7 0 x2
3c8: 00410113 addi x2 x2 4
3cc: 40530333 sub x6 x6 x5
3d0: 006edeb3 srl x29 x29 x6
3d4: 006e0e33 add x28 x28 x6
000003d8 <dExit6>:
3d8: 00800f37 lui x30 0x800
3dc: ffff0f13 addi x30 x30 -1
3e0: 01eefeb3 and x29 x29 x30
3e4: 017e1e13 slli x28 x28 23
3e8: 01f39393 slli x7 x7 31
3ec: 01ceeeb3 or x29 x29 x28
3f0: 007eeeb3 or x29 x29 x7
000003f4 <div_out>:
3f4: 00012083 lw x1 0 x2
3f8: 00410113 addi x2 x2 4
3fc: 00008067 jalr x0 x1 0
00000400 <printResult>:
400: 00040293 addi x5 x8 0
404: 00050313 addi x6 x10 0
408: 00070393 addi x7 x14 0
40c: 10000517 auipc x10 0x10000
410: c1050513 addi x10 x10 -1008
414: 00400893 addi x17 x0 4
418: 00000073 ecall
41c: 00028513 addi x10 x5 0
420: 00100893 addi x17 x0 1
424: 00000073 ecall
428: 10000517 auipc x10 0x10000
42c: c0e50513 addi x10 x10 -1010
430: 00400893 addi x17 x0 4
434: 00000073 ecall
438: 00070513 addi x10 x14 0
43c: 00100893 addi x17 x0 1
440: 00000073 ecall
444: 10000517 auipc x10 0x10000
448: c1050513 addi x10 x10 -1008
44c: 00400893 addi x17 x0 4
450: 00000073 ecall
454: 00030513 addi x10 x6 0
458: 00100893 addi x17 x0 1
45c: 00000073 ecall
460: 00a70e63 beq x14 x10 28 <CORRECT>
464: 00178793 addi x15 x15 1
468: 10000517 auipc x10 0x10000
46c: c0d50513 addi x10 x10 -1011
470: 00400893 addi x17 x0 4
474: 00000073 ecall
478: 0140006f jal x0 20 <printout>
0000047c <CORRECT>:
47c: 10000517 auipc x10 0x10000
480: bf950513 addi x10 x10 -1031
484: 00400893 addi x17 x0 4
488: 00000073 ecall
0000048c <printout>:
48c: 00008067 jalr x0 x1 0
00000490 <conclude>:
490: 10000517 auipc x10 0x10000
494: c1350513 addi x10 x10 -1005
498: 00400893 addi x17 x0 4
49c: 00000073 ecall
4a0: 00078513 addi x10 x15 0
4a4: 00100893 addi x17 x0 1
4a8: 00000073 ecall
4ac: 10000517 auipc x10 0x10000
4b0: c0650513 addi x10 x10 -1018
4b4: 00400893 addi x17 x0 4
4b8: 00000073 ecall
4bc: 00008067 jalr x0 x1 0
```
### Result
The code above only handles integer inputs, which means that the input data should be in the range $[1, 2^{31}]$.
As a result, I selected three integers as my test data: 1160030, 25, and 500.
Each output is exactly the same as the one I calculated with the C code, where both use 5 Newton iterations.

## Reference:
:::danger
Always refer to primary sources, such as official RISC-V documentation.
:::