The bfloat16 format is a 16-bit floating-point representation, designed to provide a wide dynamic range by using a floating radix point. It is a shortened version of the 32-bit IEEE 754 single-precision format (binary32), aimed at accelerating machine learning.
The structure of the bfloat16 floating-point format is as follows.
  ┌ sign (1 bit)
  │
  │   ┌ exponent (8 bits)
  │   │
  │   │      ┌ mantissa (7 bits)
  │   │      │
  │┌──┴───┐┌─┴───┐
0b0000000000000000 bfloat16
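Why was a union used rather than arbitrary pointer access?
Reading a `float` through a casted `uint32_t *` violates C's strict-aliasing rule and is undefined behavior, whereas reading a different member of a union than the one last written is a permitted way to reinterpret the same bytes.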
/* Storage type for one bfloat16 value, kept as its raw 16-bit pattern
 * (1 sign bit, 8 exponent bits, 7 mantissa bits). */
typedef struct {
uint16_t bits; /* raw bfloat16 bit pattern */
} bf16_t;
/*
 * Convert a binary32 value to bfloat16.
 *
 * A NaN input keeps its upper 16 bits and gets bit 6 set so the result is a
 * quiet NaN.  Every other input is rounded to nearest, ties to even: 0x7fff
 * plus the lowest kept bit is added before the lower half is dropped.
 */
static inline bf16_t fp32_to_bf16(float s)
{
    union {
        float f;
        uint32_t i;
    } pun;
    bf16_t result;

    pun.f = s;
    const uint32_t word = pun.i;

    if ((word & 0x7fffffff) <= 0x7f800000) {
        /* Finite or infinite: round to nearest even, then truncate. */
        const uint32_t kept_lsb = (word >> 16) & 1;
        result.bits = (word + (0x7fff + kept_lsb)) >> 16;
    } else {
        /* NaN: force the quiet bit so the payload cannot vanish. */
        result.bits = (word >> 16) | 64;
    }
    return result;
}
/*
 * Widen a bfloat16 value to binary32.
 *
 * The 16 stored bits become the upper half of the 32-bit pattern and the
 * lower half is zero, so the conversion is exact.
 */
static inline float bf16_to_fp32(bf16_t h)
{
    union {
        float f;
        uint32_t i;
    } pun;

    pun.i = h.bits;
    pun.i <<= 16;
    return pun.f;
}
Compile result:
Do not use screenshots for plain text content, as this is inaccessible to visually impaired users.
# Test driver for the conversion routine below: loads one binary32 bit
# pattern, calls fp32_to_fp16 on it and exits.  The value returned in a0 is
# not inspected here.
.data
arr: .word 0xc1cc0000 # binary32 bit pattern of -25.5f
.text
main:
lw s0, arr # s0 = arr (the routine takes its argument in s0)
jal ra, fp32_to_fp16 # a0 = fp32_to_fp16(s0)
li a7,10 # ecall number 10 (exit, per the original comment)
ecall # exit
# fp32_to_fp16: convert a binary32 bit pattern to bfloat16 (the label says
# "fp16", but the result uses the bfloat16 layout).
#   in : s0 = binary32 bits
#   out: a0 = bfloat16 bits in the low half
#   clobbers t0, t1, t2; s0 and ra are saved and restored
# A value is treated as NaN only when its magnitude is above +infinity
# (0x7f800000), so +/-infinity goes through the normal rounding path and
# stays infinity.
fp32_to_fp16:
    addi sp, sp, -8           # sp -= 8
    sw   ra, 4(sp)            # *(sp + 4) = ra
    sw   s0, 0(sp)            # *sp = s0
    slli t0, s0, 1            # shift the sign bit out ...
    srli t0, t0, 1            # ... t0 = s0 & 0x7fffffff
    li   t1, 0x7f800000       # magnitude of infinity
    srli a0, s0, 16           # a0 = s0 >> 16
    bgeu t1, t0, Else         # magnitude <= infinity: not a NaN
    ori  a0, a0, 64           # NaN: force the quiet bit
    j    Exit
Else:
    andi a0, a0, 1            # lowest kept bit (tie-break for round-to-even)
    li   t2, 0x7fff
    add  a0, a0, t2           # rounding increment = 0x7fff + lowest kept bit
    add  a0, s0, a0           # a0 = s0 + increment
    srli a0, a0, 0x10         # drop the low half
Exit:
    lw   s0, 0(sp)            # s0 = *sp
    lw   ra, 4(sp)            # ra = *(sp + 4)
    addi sp, sp, 8            # sp += 8
    jr   ra                   # return
# bf16_to_fp32: widen a bfloat16 bit pattern to binary32.
#   in : s0 = bfloat16 bits in the low half
#   out: a0 = binary32 bits (s0 << 16)
# This is a leaf routine that changes neither ra nor any saved register, so
# it needs no stack frame.  (The earlier frame stored s1 but reloaded the
# slot into s0, which overwrote the caller's s0.)
bf16_to_fp32:
    slli a0, s0, 16           # a0 = s0 << 16
    jr   ra                   # return
The Newton-Raphson method, also known as Newton's method, is widely used for finding the roots of polynomials. The algorithm for Newton's method is as follows:
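Given the function $f(x)$ and an initial guess $x_0$, each iteration computes $x_{n+1} = x_n - \dfrac{f(x_n)}{f'(x_n)}$.
Applying this method to solve $f(x) = x^2 - a = 0$ gives $x_{n+1} = x_n - \dfrac{x_n^2 - a}{2x_n}$.
Without loss of generality, we can simplify the formula above as follows: $x_{n+1} = \dfrac{1}{2}\left(x_n + \dfrac{a}{x_n}\right)$.
This formula helps refine the estimate of the square root of $a$.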
With the formula above, we can easily calculate the precise square root. However, selecting an appropriate initial guess is crucial. A good initial guess can significantly reduce the number of iterations needed to reach the correct root. Using CLZ (count leading zeros) is an effective method for determining a suitable initial guess.
 ________________________________________________________________
|_1_|_______8______|____________________23_______________________|
 sign   exponent                     mantissa
Calculate leading zeros
/*
 * Count the leading zero bits of x (returns 32 for x == 0).
 *
 * The highest set bit is first smeared into every lower position, which
 * turns the value into a run of ones as long as the bit length of x.  A
 * population count of that run is the bit length, and 32 minus it is the
 * number of leading zeros.
 */
uint8_t CLZ(uint32_t x)
{
    /* Smear the highest set bit downwards. */
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;

    /* Population count: 2-bit, 4-bit and 8-bit partial sums ... */
    x -= (x >> 1) & 0x55555555;
    x = ((x >> 2) & 0x33333333) + (x & 0x33333333);
    x = ((x >> 4) + x) & 0x0f0f0f0f;
    /* ... then fold all four byte sums into the low byte. */
    x += x >> 8;
    x += x >> 16;

    /* The count ranges over 0..32, so six bits are needed to hold it. */
    return (uint8_t)(32 - (x & 0x3f));
}
Float addition
/*
 * Software binary32 addition: returns a + b using integer operations only.
 *
 * The operands are ordered by magnitude, the smaller significand is shifted
 * right to line up with the larger one, and the significands are added or
 * subtracted depending on the signs.  Bits shifted out during alignment are
 * discarded, so the result is truncated rather than rounded.
 *
 * Special cases: a zero operand returns the other operand; NaN operands and
 * inf + (-inf) return a quiet NaN; overflow returns a signed infinity;
 * exact cancellation returns +0.0f; tiny results become subnormal.
 */
static inline float add_float(float a, float b)
{
    if (a == 0.0f)
        return b;
    if (b == 0.0f)
        return a;

    union {
        uint32_t bits;
        float value;
    } fpa = {.value = a}, fpb = {.value = b}, out;

    uint32_t sign_a = fpa.bits & 0x80000000u;
    uint32_t sign_b = fpb.bits & 0x80000000u;
    uint32_t abs_a = fpa.bits & 0x7fffffffu;
    uint32_t abs_b = fpb.bits & 0x7fffffffu;

    /* Make a the operand with the larger magnitude; the signs travel
     * together with the magnitudes. */
    if (abs_a < abs_b) {
        uint32_t temp = abs_a;
        abs_a = abs_b;
        abs_b = temp;
        temp = sign_a;
        sign_a = sign_b;
        sign_b = temp;
    }

    int32_t exp_a = (int32_t)(abs_a >> 23);
    int32_t exp_b = (int32_t)(abs_b >> 23);

    /* The larger operand is infinity or NaN. */
    if (exp_a == 0xff) {
        if (abs_a > 0x7f800000u ||
            (abs_b == 0x7f800000u && sign_a != sign_b))
            out.bits = 0x7fc00000u; /* quiet NaN */
        else
            out.bits = sign_a | 0x7f800000u;
        return out.value;
    }

    /* Significands: normal numbers gain the implicit leading 1, subnormals
     * have none and share the scale of exponent 1. */
    uint32_t mantissa_a = abs_a & 0x7fffffu;
    uint32_t mantissa_b = abs_b & 0x7fffffu;
    if (exp_a > 0)
        mantissa_a |= 0x800000u;
    else
        exp_a = 1;
    if (exp_b > 0)
        mantissa_b |= 0x800000u;
    else
        exp_b = 1;

    /* Align b with a.  A shift by 32 or more is undefined in C, and the
     * operand would be shifted out completely anyway. */
    uint32_t diff_exp = (uint32_t)(exp_a - exp_b);
    mantissa_b = (diff_exp < 32) ? (mantissa_b >> diff_exp) : 0;

    uint32_t mantissa = (sign_a != sign_b) ? mantissa_a - mantissa_b
                                           : mantissa_a + mantissa_b;
    if (mantissa == 0)
        return 0.0f; /* exact cancellation */

    /* Normalize: a carry into bit 24 moves right, leading zeros move left,
     * but never below exponent 1 (the subnormal scale). */
    int32_t exp = exp_a;
    if (mantissa & 0x1000000u) {
        mantissa >>= 1;
        exp++;
    } else {
        while (exp > 1 && !(mantissa & 0x800000u)) {
            mantissa <<= 1;
            exp--;
        }
    }

    if (exp >= 0xff) { /* overflow */
        out.bits = sign_a | 0x7f800000u;
        return out.value;
    }
    if (!(mantissa & 0x800000u))
        exp = 0; /* no leading 1: encode as subnormal */

    out.bits = sign_a | ((uint32_t)exp << 23) | (mantissa & 0x7fffffu);
    return out.value;
}
float division
/*
 * Software binary32 division: returns p / q using integer operations only.
 *
 * Both significands are normalized to 24 bits (subnormal operands are
 * shifted left until their leading 1 reaches bit 23), the quotient is
 * produced one bit at a time by restoring long division, and the result is
 * rounded to nearest, ties to even.
 *
 * Special cases: NaN operands, 0/0 and inf/inf return a quiet NaN; x/0 and
 * inf/x return a signed infinity; 0/x and x/inf return a signed zero;
 * overflow returns a signed infinity; tiny results become subnormal or a
 * signed zero.
 */
static inline float div_float(float p, float q)
{
    union {
        uint32_t bits;
        float value;
    } fpp = {.value = p}, fpq = {.value = q}, output;

    const uint32_t sign = (fpp.bits ^ fpq.bits) & 0x80000000u;
    const uint32_t abs_p = fpp.bits & 0x7fffffffu;
    const uint32_t abs_q = fpq.bits & 0x7fffffffu;

    /* Operands with no finite, non-zero quotient. */
    if (abs_p > 0x7f800000u || abs_q > 0x7f800000u ||
        (abs_p == 0x7f800000u && abs_q == 0x7f800000u) ||
        (abs_p == 0 && abs_q == 0)) {
        output.bits = 0x7fc00000u; /* quiet NaN */
        return output.value;
    }
    if (abs_p == 0x7f800000u || abs_q == 0) {
        output.bits = sign | 0x7f800000u;
        return output.value;
    }
    if (abs_p == 0 || abs_q == 0x7f800000u) {
        output.bits = sign;
        return output.value;
    }

    /* Unpack p; a subnormal starts at the scale of exponent 1 and loses one
     * exponent step per normalizing shift. */
    int exp_p = (int)(abs_p >> 23);
    uint32_t mantissa_p = abs_p & 0x7FFFFFu;
    if (exp_p > 0) {
        mantissa_p |= 0x800000u; /* add implicit 1 */
    } else {
        exp_p = 1;
        while (!(mantissa_p & 0x800000u)) {
            mantissa_p <<= 1;
            exp_p--;
        }
    }

    /* Unpack q the same way. */
    int exp_q = (int)(abs_q >> 23);
    uint32_t mantissa_q = abs_q & 0x7FFFFFu;
    if (exp_q > 0) {
        mantissa_q |= 0x800000u; /* add implicit 1 */
    } else {
        exp_q = 1;
        while (!(mantissa_q & 0x800000u)) {
            mantissa_q <<= 1;
            exp_q--;
        }
    }

    /* Scale p so that 1 <= mantissa_p / mantissa_q < 2. */
    int exp = exp_p - exp_q + 127;
    if (mantissa_p < mantissa_q) {
        mantissa_p <<= 1;
        exp--;
    }

    if (exp >= 0xff) { /* overflow */
        output.bits = sign | 0x7f800000u;
        return output.value;
    }

    /* A normal result needs 24 significand bits plus one rounding bit.  A
     * subnormal result keeps fewer bits: 24 + exp including the rounding
     * bit.  exp_field is what is stored before the significand is added;
     * for normal results the leading 1 of the significand supplies the
     * final +1. */
    int nbits = 25;
    uint32_t exp_field;
    if (exp <= 0) {
        nbits = 24 + exp;
        exp_field = 0;
        if (nbits <= 0) { /* below half of the smallest subnormal */
            output.bits = sign;
            return output.value;
        }
    } else {
        exp_field = (uint32_t)(exp - 1);
    }

    /* Restoring long division, one quotient bit per step. */
    uint32_t mantissa = 0;
    for (int i = 0; i < nbits; i++) {
        mantissa <<= 1;
        if (mantissa_p >= mantissa_q) {
            mantissa_p -= mantissa_q;
            mantissa |= 1;
        }
        mantissa_p <<= 1;
    }

    /* Round to nearest even: round up when the dropped bit is set and
     * either the remainder is non-zero or the kept value is odd. */
    uint32_t sticky = (mantissa_p != 0);
    uint32_t rnd = mantissa & 1;
    uint32_t odd = (mantissa >> 1) & 1;
    mantissa = (mantissa >> 1) + (rnd & (sticky | odd));

    /* Adding instead of OR-ing lets a rounding carry flow into the
     * exponent field. */
    output.bits = sign | ((exp_field << 23) + mantissa);
    return output.value;
}
/*
 * Approximate the square root of alpha with Newton's iteration
 *     x <- (x + alpha / x) / 2
 * using the software float helpers add_float and div_float.
 *
 * The starting value is half the bit length of alpha, converted to float; a
 * starting value of zero is replaced by 2.  The loop runs `iteration`
 * times.  `iteration` and int2float are not defined in this block.
 * NOTE(review): confirm their declarations match this use.
 */
static inline float Newtons_method(int alpha)
{
    /* Bit length of alpha is 32 - CLZ(alpha); halve it for the guess. */
    int lzc = (32 - CLZ(alpha)) / 2;

    float output = int2float(lzc);
    if (output == 0) {
        output = 2;
    }

    float input = int2float(alpha);

    for (int i = 0; i < iteration; i++) {
        float temp = div_float(input, output);   /* alpha / x     */
        output = add_float(output, temp);        /* x + alpha / x */
        output = div_float(output, (float) 2);   /* halve the sum */
    }
    return output;
}
# Test fixtures and message strings for the square-root program.
.data
# Integer inputs; main loads them one word at a time.
inputdata: .word 1160030, 25, 500
# Expected results as binary32 bit patterns, in the same order as inputdata.
answer: .word 0x4486a180,0x40a00000, 0x41B2E389
# Number of Newton iterations performed per input.
iteration: .word 5
# Messages printed by printResult (str1-str5) and conclude (str6-str7).
str1: .string "\n The testing number is: "
str2: .string "\n The correct root number is:"
str3: .string "\n The calculated root number is:"
str4: .string "\n The answer is CORRECT"
str5: .string "\n The answer is WRONG"
str6: .string "\n we got total"
str7: .string "\n error"
.text
# main: run Newtons_method on every word of inputdata, let printResult
# compare each result with the matching word of answer, then print the error
# total and exit.
#   s0 = current integer input (read by printResult)
#   s1 = number of test cases, s2 = loop index
#   s3 = pointer into inputdata, s4 = pointer into answer
#   a5 = error counter (incremented by printResult)
main:
    la   s3, inputdata
    li   s1, 3                # inputdata and answer each hold three words
    li   s2, 0
    la   s4, answer
    mv   a5, x0
mloop:
    lw   a0, 0(s3)            # a0 = next input
    mv   s0, a0               # keep a copy for printing
    jal  ra, Newtons_method   # a0 = computed root (binary32 bits)
    lw   a4, 0(s4)            # a4 = expected root (binary32 bits)
    jal  ra, printResult
    addi s4, s4, 4
    addi s2, s2, 1
    addi s3, s3, 4
    blt  s2, s1, mloop
    jal  ra, conclude
    # Exit the program
    li   a7, 10
    ecall
# Newtons_method: approximate the square root of an integer with
# x <- (x + input / x) / 2.
#   in : a0 = integer input
#   out: a0 = result as binary32 bits
#   a1 = input converted to float, a2 = loop counter, a3 = iteration count
#   t0-t6 are used to pass values to and from the helper routines
Newtons_method:
    addi sp, sp, -8
    sw   ra, 4(sp)
    sw   a0, 0(sp)            # keep the integer input for later
    # initial guess = (32 - clz(input)) / 2, converted to float
    add  t0, a0, x0
    jal  ra, CLZ              # t0 = clz(input)
    add  a0, t0, x0
    addi a0, a0, -32
    sub  a0, x0, a0           # a0 = 32 - clz(input)
    srli a0, a0, 1
    mv   t3, a0
    jal  ra, int2float        # t3 = float(guess)
    mv   a0, t3
    bne  a0, x0, nExit
    li   a0, 0x40000000       # guess was 0.0f: start from 2.0f instead
nExit:
    # load the input as float and the total iteration count
    lw   a1, 0(sp)
    mv   t3, a1
    jal  ra, int2float
    mv   a1, t3
    mv   a2, x0
    lw   a3, iteration
loop:
    bge  a2, a3, outloop      # if a2 >= a3 leave the loop
    add  t0, a0, x0
    add  t1, a1, x0
    jal  ra, div_float        # t4 = t1 / t0   (input / x)
    add  t0, a0, x0
    jal  ra, add_float        # t1 = t4 + t0   (input / x + x)
    li   t0, 0x40000000       # 2.0f
    jal  ra, div_float        # t4 = t1 / t0   (halve the sum)
    add  a0, t4, x0           # a0 (output) = t4
    addi a2, a2, 1
    j    loop
outloop:
    lw   ra, 4(sp)
    addi sp, sp, 8
    jr   ra
# add_float: software binary32 addition.
#   in : t0, t4 = operands (binary32 bits)
#   out: t1 = t0 + t4 (binary32 bits)
#   clobbers t0, t2, t3, t4, t5, t6
# Bits shifted out while aligning the smaller operand are discarded, so the
# result is truncated.  A result whose exponent drops to 0 or below is
# returned as 0.  Infinity, NaN and subnormal operands get no special
# treatment.
add_float:
    addi sp, sp, -4
    sw   ra, 0(sp)
    # magnitudes
    li   t2, 0x7fffffff
    and  t1, t0, t2           # t1 = abs(t0)
    and  t2, t4, t2           # t2 = abs(t4)
    # make sure abs(t4) >= abs(t0)
    bge  t2, t1, aExit1
    # swap t0 and t4
    mv   t3, t0
    mv   t0, t4
    mv   t4, t3
    # swap t1 and t2
    mv   t3, t1
    mv   t1, t2
    mv   t2, t3
aExit1:
    srli t6, t2, 23           # t6 = exponent of the larger operand
    srli t5, t1, 23           # t5 = exponent of the smaller operand
    li   t3, 0x7fffff
    and  t2, t2, t3
    and  t1, t1, t3
    bge  x0, t6, aExit2
    li   t3, 0x800000
    or   t2, t2, t3           # add the implicit 1
aExit2:
    bge  x0, t5, aExit3
    li   t3, 0x800000
    or   t1, t3, t1           # add the implicit 1
aExit3:
    # t4 -> t2 (mantissa), t6 (exp);  t0 -> t1 (mantissa), t5 (exp)
    sub  t5, t6, t5           # t5 = exponent difference
    # srl only uses the low 5 bits of the shift amount, so a difference of
    # 32 or more must clear the operand explicitly.
    sltiu t3, t5, 32          # t3 = 1 when the difference is below 32
    sub  t3, x0, t3           # t3 = all ones or zero
    srl  t1, t1, t5
    and  t1, t1, t3
    srli t0, t0, 31
    srli t4, t4, 31
    # t0 and t4 now hold the sign bits
    xor  t0, t0, t4
    beq  t0, x0, aElse4
    sub  t3, t2, t1           # different signs: subtract
    j    aExit4
aElse4:
    add  t3, t2, t1           # same sign: add
aExit4:
    # t3 = mantissa, t4 = sign, t6 = exp; t0, t1, t2, t5 are free
    add  t1, x0, x0           # the result defaults to 0
    beq  t3, x0, add_out      # exact cancellation
    li   t0, 0x1000000
    and  t0, t0, t3
    beq  t0, x0, aElse5
    srli t3, t3, 1            # carry into bit 24: shift right once
    addi t6, t6, 1
    j    aExit5
aElse5:
    li   t0, 0x800000
    and  t0, t3, t0
    bne  t0, x0, aExit5       # leading 1 is in place
    slli t3, t3, 1
    addi t6, t6, -1
    j    aElse5
aExit5:
    bge  x0, t6, add_out      # exponent <= 0: return 0
    li   t2, 0x7fffff
    and  t1, t3, t2
    slli t6, t6, 23
    or   t1, t1, t6
    slli t4, t4, 31
    or   t1, t1, t4
add_out:
    lw   ra, 0(sp)
    addi sp, sp, 4
    jr   ra
# CLZ: count leading zeros.
#   in : t0 = value
#   out: t0 = number of leading zero bits (32 for an input of 0)
#   clobbers t1 and t2 (temporaries)
# The highest set bit is smeared into all lower positions, the ones are
# counted, and the count is subtracted from 32.
CLZ:
    srli t1, t0, 1            # t1 = t0 >> 1
    or   t0, t0, t1           # t0 = t1 | t0
    srli t1, t0, 2            # t1 = t0 >> 2
    or   t0, t0, t1           # t0 = t1 | t0
    srli t1, t0, 4            # t1 = t0 >> 4
    or   t0, t0, t1           # t0 = t1 | t0
    srli t1, t0, 8            # t1 = t0 >> 8
    or   t0, t0, t1           # t0 = t1 | t0
    srli t1, t0, 16           # t1 = t0 >> 16
    or   t0, t0, t1           # t0 = t1 | t0
    # population count: 2-bit sums
    srli t1, t0, 1
    li   t2, 0x55555555
    and  t1, t1, t2
    sub  t0, t0, t1
    # 4-bit sums
    srli t1, t0, 2
    li   t2, 0x33333333
    and  t1, t1, t2
    and  t0, t0, t2
    add  t0, t0, t1
    # 8-bit sums
    srli t1, t0, 4
    li   t2, 0x0f0f0f0f
    add  t0, t1, t0
    and  t0, t0, t2
    # fold the four byte sums into the low byte
    srli t1, t0, 8
    add  t0, t0, t1
    srli t1, t0, 16
    add  t0, t0, t1
    andi t0, t0, 0x3f         # the count ranges over 0..32: keep six bits
    addi t0, t0, -32
    sub  t0, x0, t0           # t0 = 32 - count
    jr   ra
# int2float: convert an unsigned integer to binary32 bits.
#   in : t3 = integer
#   out: t3 = binary32 bits
#   t0 = exponent part, t1 = shift amount / mask; CLZ also clobbers t2
# The value is shifted left so that its highest set bit lands on bit 23.
# Inputs of 2^24 or more would need a right shift and are not supported.
# An input of 0 returns 0 (the bits of 0.0f).
int2float:
    addi sp, sp, -4
    sw   ra, 0(sp)
    beq  t3, x0, intout       # 0 has no leading 1 to normalize
    add  t0, t3, x0
    jal  ra, CLZ
    addi t0, t0, 1
    addi t0, t0, -32
    sub  t0, x0, t0           # t0 = 31 - clz = index of the highest set bit
    addi t1, t0, -23
    sub  t1, x0, t1           # t1 = 23 - index
    sll  t3, t3, t1           # move the leading 1 to bit 23
    li   t1, 0x800000
    xor  t3, t3, t1           # clear the implicit leading 1
    addi t0, t0, 127          # add the exponent bias
    slli t0, t0, 23
    or   t3, t3, t0
intout:
    lw   ra, 0(sp)
    addi sp, sp, 4
    jr   ra
# div_float: software binary32 division.
#   in : t1 = dividend, t0 = divisor (binary32 bits)
#   out: t4 = t1 / t0 (binary32 bits)
#   t2 = sign, t3 = exponent, t5/t6 = scratch; t0 and t1 are consumed
# The quotient is built bit by bit with restoring long division and rounded
# to nearest, ties to even.  Subnormal operands are normalized first and
# tiny results become subnormal or a signed zero.  A zero dividend returns
# 0, a zero divisor returns 0x7fffffff (a NaN pattern), overflow returns a
# signed infinity.  Infinity and NaN operands get no special treatment.
div_float:
    addi sp, sp, -4
    sw   ra, 0(sp)
    add  t4, x0, x0           # result for a zero dividend
    slli t5, t1, 1            # ignore the sign so that -0.0 counts as zero
    beq  t5, x0, div_out
    li   t4, 0x7fffffff       # result for a zero divisor
    slli t5, t0, 1
    beq  t5, x0, div_out
    # split the dividend into sign (t2), exponent (t3), mantissa (t1)
    srli t2, t1, 31           # t2 = t1 >> 31
    srli t3, t1, 23           # t3 = t1 >> 23
    andi t3, t3, 0xff         # t3 = t3 & 0xff
    li   t4, 0x7FFFFF
    and  t1, t1, t4           # t1 = t1 & 0x7FFFFF
    li   t4, 0x800000
    bge  x0, t3, dElse1       # exponent field 0: subnormal
    or   t1, t1, t4           # add the implicit 1
    j    dExit1
dElse1:
    li   t3, 1                # subnormals share the scale of exponent 1
dNorm1:
    slli t1, t1, 1
    addi t3, t3, -1
    and  t5, t1, t4
    beq  t5, x0, dNorm1       # repeat until the leading 1 reaches bit 23
dExit1:
    # park the dividend's sign and exponent while the divisor is unpacked
    addi sp, sp, -8
    sw   t2, 0(sp)            # 0(sp) = dividend sign
    sw   t3, 4(sp)            # 4(sp) = dividend exponent
    # split the divisor into sign (t2), exponent (t3), mantissa (t0)
    srli t2, t0, 31           # t2 = t0 >> 31
    srli t3, t0, 23           # t3 = t0 >> 23
    andi t3, t3, 0xff         # t3 = t3 & 0xff
    li   t4, 0x7FFFFF
    and  t0, t0, t4           # t0 = t0 & 0x7FFFFF
    li   t4, 0x800000
    bge  x0, t3, dElse2       # exponent field 0: subnormal
    or   t0, t0, t4           # add the implicit 1
    j    dExit2
dElse2:
    li   t3, 1
dNorm2:
    slli t0, t0, 1
    addi t3, t3, -1
    and  t5, t0, t4
    beq  t5, x0, dNorm2
dExit2:
    lw   t4, 0(sp)            # t4 = dividend sign
    xor  t2, t2, t4           # t2 = result sign
    lw   t4, 4(sp)            # t4 = dividend exponent
    addi t4, t4, 127
    sub  t3, t4, t3           # t3 = exp_dividend - exp_divisor + 127
    addi sp, sp, 8            # release the two parked words
    bge  t1, t0, dExit3       # make sure 1 <= t1 / t0 < 2
    slli t1, t1, 1
    addi t3, t3, -1
dExit3:
    li   t5, 255
    blt  t3, t5, dRange
    li   t4, 0x7f800000       # overflow: infinity
    j    dSign
dRange:
    add  t4, x0, x0           # t4 = quotient accumulator
    li   t5, 25               # normal result: 24 bits plus a rounding bit
    addi t3, t3, -1           # stored exponent; the leading 1 adds 1 back
    bge  t3, x0, dExit4
    addi t5, t3, 25           # subnormal result: 24 + exp quotient bits
    add  t3, x0, x0
    bge  x0, t5, dSign        # too small even to round up: signed zero
dExit4:
    # division loop: t5 = number of quotient bits, t6 = loop counter
    li   t6, 0
dloop:
    bge  t6, t5, doutloop
    slli t4, t4, 1            # t4 = t4 << 1
    blt  t1, t0, dExit5
    sub  t1, t1, t0
    ori  t4, t4, 1
dExit5:
    slli t1, t1, 1
    addi t6, t6, 1
    j    dloop
doutloop:
    # round to nearest even: t5 = sticky, t1 = rnd, t0 = odd
    sltu t5, x0, t1           # sticky = (remainder != 0)
    andi t1, t4, 1            # rnd = bit about to be dropped
    srli t4, t4, 1
    andi t0, t4, 1            # odd = lowest kept bit
    or   t0, t0, t5
    and  t1, t1, t0           # round up when rnd && (sticky || odd)
    add  t4, t4, t1
    # adding (not OR-ing) lets the leading 1 and any rounding carry flow
    # into the exponent field
    slli t3, t3, 23
    add  t4, t4, t3
dSign:
    slli t2, t2, 31           # sign << 31
    or   t4, t4, t2
div_out:
    lw   ra, 0(sp)
    addi sp, sp, 4
    jr   ra
# printResult: print one test case and its verdict.
#   in : s0 = original integer input
#        a0 = computed root (binary32 bits)
#        a4 = expected root (binary32 bits)
#        a5 = running error counter (incremented on a mismatch)
#   clobbers a0, a7, t0, t1, t2
# Both roots are printed with the integer print ecall, i.e. as the integer
# value of their bit patterns.
printResult:
    mv   t0, s0               # original data
    mv   t1, a0               # root data
    mv   t2, a4               # answer
    la   a0, str1
    li   a7, 4
    ecall
    mv   a0, t0
    li   a7, 1
    ecall
    la   a0, str2
    li   a7, 4
    ecall
    mv   a0, a4
    li   a7, 1
    ecall
    la   a0, str3
    li   a7, 4
    ecall
    mv   a0, t1
    li   a7, 1
    ecall
    beq  a4, a0, CORRECT      # a0 still holds the computed root
    addi a5, a5, 1            # count the mismatch
    la   a0, str5             # "The answer is WRONG"
    li   a7, 4
    ecall
    j    printout
CORRECT:
    la   a0, str4             # "The answer is CORRECT"
    li   a7, 4
    ecall
printout:
    ret
# conclude: print the summary line: str6, the error counter in a5, then str7.
#   in : a5 = number of mismatches counted by printResult
#   clobbers a0 and a7
conclude:
la a0, str6
li a7, 4 # ecall 4: print the string at a0
ecall
mv a0, a5
li a7, 1 # ecall 1: print a0 as an integer
ecall
la a0, str7
li a7, 4
ecall
ret
00000000 <main>:
0: 10000997 auipc x19 0x10000
4: 00098993 addi x19 x19 0
8: 00200493 addi x9 x0 2
c: 00000913 addi x18 x0 0
10: 10000a17 auipc x20 0x10000
14: ffca0a13 addi x20 x20 -4
18: 00000793 addi x15 x0 0
0000001c <mloop>:
1c: 0009a503 lw x10 0 x19
20: 00050413 addi x8 x10 0
24: 028000ef jal x1 40 <Newtons_method>
28: 000a2703 lw x14 0 x20
2c: 3d4000ef jal x1 980 <printResult>
30: 004a0a13 addi x20 x20 4
34: 00190913 addi x18 x18 1
38: 00498993 addi x19 x19 4
3c: fe9940e3 blt x18 x9 -32 <mloop>
40: 450000ef jal x1 1104 <conclude>
44: 00a00893 addi x17 x0 10
48: 00000073 ecall
0000004c <Newtons_method>:
4c: ff810113 addi x2 x2 -8
50: 00112223 sw x1 4 x2
54: 00a12023 sw x10 0 x2
58: 000502b3 add x5 x10 x0
5c: 168000ef jal x1 360 <CLZ>
60: 00028533 add x10 x5 x0
64: fe050513 addi x10 x10 -32
68: 40a00533 sub x10 x0 x10
6c: 00155513 srli x10 x10 1
70: 00050e13 addi x28 x10 0
74: 1d8000ef jal x1 472 <int2float>
78: 000e0513 addi x10 x28 0
7c: 00051463 bne x10 x0 8 <nExit>
80: 40000537 lui x10 0x40000
00000084 <nExit>:
84: 00012583 lw x11 0 x2
88: 00058e13 addi x28 x11 0
8c: 1c0000ef jal x1 448 <int2float>
90: 000e0593 addi x11 x28 0
94: 00000613 addi x12 x0 0
98: 10000697 auipc x13 0x10000
9c: f806a683 lw x13 -128 x13
000000a0 <loop>:
a0: 02d65663 bge x12 x13 44 <outloop>
a4: 000502b3 add x5 x10 x0
a8: 00058333 add x6 x11 x0
ac: 1e8000ef jal x1 488 <div_float>
b0: 000502b3 add x5 x10 x0
b4: 024000ef jal x1 36 <add_float>
b8: 400002b7 lui x5 0x40000
bc: 1d8000ef jal x1 472 <div_float>
c0: 000e8533 add x10 x29 x0
c4: 00160613 addi x12 x12 1
c8: fd9ff06f jal x0 -40 <loop>
000000cc <outloop>:
cc: 00412083 lw x1 4 x2
d0: 00810113 addi x2 x2 8
d4: 00008067 jalr x0 x1 0
000000d8 <add_float>:
d8: ffc10113 addi x2 x2 -4
dc: 00112023 sw x1 0 x2
e0: 800003b7 lui x7 0x80000
e4: fff38393 addi x7 x7 -1
e8: 0072f333 and x6 x5 x7
ec: 007ef3b3 and x7 x29 x7
f0: 0063de63 bge x7 x6 28 <aExit1>
f4: 00028e13 addi x28 x5 0
f8: 000e8293 addi x5 x29 0
fc: 000e0e93 addi x29 x28 0
100: 00030e13 addi x28 x6 0
104: 00038313 addi x6 x7 0
108: 000e0393 addi x7 x28 0
0000010c <aExit1>:
10c: 0173df93 srli x31 x7 23
110: 01735f13 srli x30 x6 23
114: 00800e37 lui x28 0x800
118: fffe0e13 addi x28 x28 -1
11c: 01c3f3b3 and x7 x7 x28
120: 01c37333 and x6 x6 x28
124: 01f05663 bge x0 x31 12 <aExit2>
128: 00800e37 lui x28 0x800
12c: 01c3e3b3 or x7 x7 x28
00000130 <aExit2>:
130: 01e05663 bge x0 x30 12 <aExit3>
134: 00800e37 lui x28 0x800
138: 006e6333 or x6 x28 x6
0000013c <aExit3>:
13c: 41ef8f33 sub x30 x31 x30
140: 01e35333 srl x6 x6 x30
144: 01f2d293 srli x5 x5 31
148: 01fede93 srli x29 x29 31
14c: 01d2c2b3 xor x5 x5 x29
150: 00028663 beq x5 x0 12 <aElse4>
154: 40638e33 sub x28 x7 x6
158: 0080006f jal x0 8 <aExit4>
0000015c <aElse4>:
15c: 00638e33 add x28 x7 x6
00000160 <aExit4>:
160: 010002b7 lui x5 0x1000
164: 01c2f2b3 and x5 x5 x28
168: 00028863 beq x5 x0 16 <aElse5>
16c: 001e5e13 srli x28 x28 1
170: 001f8f93 addi x31 x31 1
174: 0200006f jal x0 32 <aExit5>
00000178 <aElse5>:
178: 000e0e63 beq x28 x0 28 <aExit5>
17c: 008002b7 lui x5 0x800
180: 005e72b3 and x5 x28 x5
184: 00029863 bne x5 x0 16 <aExit5>
188: 001e1e13 slli x28 x28 1
18c: ffff8f93 addi x31 x31 -1
190: fe9ff06f jal x0 -24 <aElse5>
00000194 <aExit5>:
194: 00000333 add x6 x0 x0
198: 03f05063 bge x0 x31 32 <add_out>
19c: 008003b7 lui x7 0x800
1a0: fff38393 addi x7 x7 -1
1a4: 007e7333 and x6 x28 x7
1a8: 017f9f93 slli x31 x31 23
1ac: 01f36333 or x6 x6 x31
1b0: 01fe9e93 slli x29 x29 31
1b4: 01d36333 or x6 x6 x29
000001b8 <add_out>:
1b8: 00012083 lw x1 0 x2
1bc: 00410113 addi x2 x2 4
1c0: 00008067 jalr x0 x1 0
000001c4 <CLZ>:
1c4: 0012d313 srli x6 x5 1
1c8: 0062e2b3 or x5 x5 x6
1cc: 0022d313 srli x6 x5 2
1d0: 0062e2b3 or x5 x5 x6
1d4: 0042d313 srli x6 x5 4
1d8: 0062e2b3 or x5 x5 x6
1dc: 0082d313 srli x6 x5 8
1e0: 0062e2b3 or x5 x5 x6
1e4: 0102d313 srli x6 x5 16
1e8: 0062e2b3 or x5 x5 x6
1ec: 0012d313 srli x6 x5 1
1f0: 555553b7 lui x7 0x55555
1f4: 55538393 addi x7 x7 1365
1f8: 00737333 and x6 x6 x7
1fc: 406282b3 sub x5 x5 x6
200: 0022d313 srli x6 x5 2
204: 333333b7 lui x7 0x33333
208: 33338393 addi x7 x7 819
20c: 00737333 and x6 x6 x7
210: 0072f2b3 and x5 x5 x7
214: 006282b3 add x5 x5 x6
218: 0042d313 srli x6 x5 4
21c: 0f0f13b7 lui x7 0xf0f1
220: f0f38393 addi x7 x7 -241
224: 005302b3 add x5 x6 x5
228: 0072f2b3 and x5 x5 x7
22c: 0082d313 srli x6 x5 8
230: 006282b3 add x5 x5 x6
234: 0102d313 srli x6 x5 16
238: 006282b3 add x5 x5 x6
23c: 01f2f293 andi x5 x5 31
240: fe028293 addi x5 x5 -32
244: 405002b3 sub x5 x0 x5
248: 00008067 jalr x0 x1 0
0000024c <int2float>:
24c: ffc10113 addi x2 x2 -4
250: 00112023 sw x1 0 x2
254: 000e02b3 add x5 x28 x0
258: f6dff0ef jal x1 -148 <CLZ>
25c: 00128293 addi x5 x5 1
260: fe028293 addi x5 x5 -32
264: 405002b3 sub x5 x0 x5
268: fe928313 addi x6 x5 -23
26c: 40600333 sub x6 x0 x6
270: 006e1e33 sll x28 x28 x6
274: 00800337 lui x6 0x800
278: 006e4e33 xor x28 x28 x6
27c: 07f28293 addi x5 x5 127
280: 01729293 slli x5 x5 23
284: 005e6e33 or x28 x28 x5
00000288 <intout>:
288: 00012083 lw x1 0 x2
28c: 00410113 addi x2 x2 4
290: 00008067 jalr x0 x1 0
00000294 <div_float>:
294: ffc10113 addi x2 x2 -4
298: 00112023 sw x1 0 x2
29c: 00000eb3 add x29 x0 x0
2a0: 14030a63 beq x6 x0 340 <div_out>
2a4: 80000eb7 lui x29 0x80000
2a8: fffe8e93 addi x29 x29 -1
2ac: 14028463 beq x5 x0 328 <div_out>
2b0: 01f35393 srli x7 x6 31
2b4: 01735e13 srli x28 x6 23
2b8: 0ffe7e13 andi x28 x28 255
2bc: 00800eb7 lui x29 0x800
2c0: fffe8e93 addi x29 x29 -1
2c4: 01d37333 and x6 x6 x29
2c8: 00800eb7 lui x29 0x800
2cc: 01c05663 bge x0 x28 12 <dElse1>
2d0: 01d36333 or x6 x6 x29
2d4: 0140006f jal x0 20 <dExit1>
000002d8 <dElse1>:
2d8: 00131313 slli x6 x6 1
2dc: fffe0e13 addi x28 x28 -1
2e0: 01d37f33 and x30 x6 x29
2e4: fe0f1ae3 bne x30 x0 -12 <dElse1>
000002e8 <dExit1>:
2e8: ff810113 addi x2 x2 -8
2ec: 00712023 sw x7 0 x2
2f0: 01c12223 sw x28 4 x2
2f4: 01f2d393 srli x7 x5 31
2f8: 0013f393 andi x7 x7 1
2fc: 0172de13 srli x28 x5 23
300: 0ffe7e13 andi x28 x28 255
304: 00800eb7 lui x29 0x800
308: fffe8e93 addi x29 x29 -1
30c: 01d2f2b3 and x5 x5 x29
310: 00800eb7 lui x29 0x800
314: 01c05663 bge x0 x28 12 <dElse2>
318: 01d2e2b3 or x5 x5 x29
31c: 0140006f jal x0 20 <dExit2>
00000320 <dElse2>:
320: 00129293 slli x5 x5 1
324: fffe0e13 addi x28 x28 -1
328: 01d2ff33 and x30 x5 x29
32c: fe0f1ae3 bne x30 x0 -12 <dElse2>
00000330 <dExit2>:
330: 00012e83 lw x29 0 x2
334: 01d3c3b3 xor x7 x7 x29
338: 00412e83 lw x29 4 x2
33c: 07fe8e93 addi x29 x29 127
340: 41ce8e33 sub x28 x29 x28
344: 00810113 addi x2 x2 8
348: 00535663 bge x6 x5 12 <dExit3>
34c: 00131313 slli x6 x6 1
350: fffe0e13 addi x28 x28 -1
00000354 <dExit3>:
354: 01900f13 addi x30 x0 25
358: 00000eb3 add x29 x0 x0
35c: 000e5863 bge x28 x0 16 <dExit4>
360: 01cf0f33 add x30 x30 x28
364: 00000e33 add x28 x0 x0
368: 080f4663 blt x30 x0 140 <div_out>
0000036c <dExit4>:
36c: 00000f93 addi x31 x0 0
00000370 <dloop>:
370: 03efd063 bge x31 x30 32 <doutloop>
374: 001e9e93 slli x29 x29 1
378: 00534663 blt x6 x5 12 <dExit5>
37c: 40530333 sub x6 x6 x5
380: 001eee93 ori x29 x29 1
00000384 <dExit5>:
384: 00131313 slli x6 x6 1
388: 001f8f93 addi x31 x31 1
38c: fe5ff06f jal x0 -28 <dloop>
00000390 <doutloop>:
390: 00134f13 xori x30 x6 1
394: 001ef313 andi x6 x29 1
398: 002ef293 andi x5 x29 2
39c: 001ede93 srli x29 x29 1
3a0: 01e2e2b3 or x5 x5 x30
3a4: 00537333 and x6 x6 x5
3a8: 006e8eb3 add x29 x29 x6
3ac: 020e1663 bne x28 x0 44 <dExit6>
3b0: 00900313 addi x6 x0 9
3b4: 0262d263 bge x5 x6 36 <dExit6>
3b8: ffc10113 addi x2 x2 -4
3bc: 00712023 sw x7 0 x2
3c0: e05ff0ef jal x1 -508 <CLZ>
3c4: 00012383 lw x7 0 x2
3c8: 00410113 addi x2 x2 4
3cc: 40530333 sub x6 x6 x5
3d0: 006edeb3 srl x29 x29 x6
3d4: 006e0e33 add x28 x28 x6
000003d8 <dExit6>:
3d8: 00800f37 lui x30 0x800
3dc: ffff0f13 addi x30 x30 -1
3e0: 01eefeb3 and x29 x29 x30
3e4: 017e1e13 slli x28 x28 23
3e8: 01f39393 slli x7 x7 31
3ec: 01ceeeb3 or x29 x29 x28
3f0: 007eeeb3 or x29 x29 x7
000003f4 <div_out>:
3f4: 00012083 lw x1 0 x2
3f8: 00410113 addi x2 x2 4
3fc: 00008067 jalr x0 x1 0
00000400 <printResult>:
400: 00040293 addi x5 x8 0
404: 00050313 addi x6 x10 0
408: 00070393 addi x7 x14 0
40c: 10000517 auipc x10 0x10000
410: c1050513 addi x10 x10 -1008
414: 00400893 addi x17 x0 4
418: 00000073 ecall
41c: 00028513 addi x10 x5 0
420: 00100893 addi x17 x0 1
424: 00000073 ecall
428: 10000517 auipc x10 0x10000
42c: c0e50513 addi x10 x10 -1010
430: 00400893 addi x17 x0 4
434: 00000073 ecall
438: 00070513 addi x10 x14 0
43c: 00100893 addi x17 x0 1
440: 00000073 ecall
444: 10000517 auipc x10 0x10000
448: c1050513 addi x10 x10 -1008
44c: 00400893 addi x17 x0 4
450: 00000073 ecall
454: 00030513 addi x10 x6 0
458: 00100893 addi x17 x0 1
45c: 00000073 ecall
460: 00a70e63 beq x14 x10 28 <CORRECT>
464: 00178793 addi x15 x15 1
468: 10000517 auipc x10 0x10000
46c: c0d50513 addi x10 x10 -1011
470: 00400893 addi x17 x0 4
474: 00000073 ecall
478: 0140006f jal x0 20 <printout>
0000047c <CORRECT>:
47c: 10000517 auipc x10 0x10000
480: bf950513 addi x10 x10 -1031
484: 00400893 addi x17 x0 4
488: 00000073 ecall
0000048c <printout>:
48c: 00008067 jalr x0 x1 0
00000490 <conclude>:
490: 10000517 auipc x10 0x10000
494: c1350513 addi x10 x10 -1005
498: 00400893 addi x17 x0 4
49c: 00000073 ecall
4a0: 00078513 addi x10 x15 0
4a4: 00100893 addi x17 x0 1
4a8: 00000073 ecall
4ac: 10000517 auipc x10 0x10000
4b0: c0650513 addi x10 x10 -1018
4b4: 00400893 addi x17 x0 4
4b8: 00000073 ecall
4bc: 00008067 jalr x0 x1 0
The code above can only take the square root of integers: `int2float` normalizes its input with a left shift only, so the input data should be in the range of $0 < x < 2^{24}$.
As a result, I selected three integers as my test data: 1160030, 25, and 500.
Each output is exactly the same as the value calculated by the C code, with both versions running 5 Newton iterations.
Always refer to primary sources, such as official RISC-V documentation.
or
By clicking below, you agree to our terms of service.
New to HackMD? Sign up