contributed by < chihenliu >
For this work I selected "Implement transformation from integer to float by clz" by 洪碩星.
I want to understand the conversion between floating points and integers, which is a fundamental concept in both C and assembly language. Additionally, I want to become familiar with the application of the GNU Toolchain to aid in this understanding
IEEE 754 floating-point numbers consist of three components: the sign bit, the exponent, and the mantissa. The standard also defines special values such as positive and negative infinity and NaN. It is the most commonly used floating-point representation in computing, and it has two common formats: single precision and double precision.
single precision
double precision
Lab2: RISC-V RV32I[MACF] emulator with ELF support
We can set up the GNU Toolchain for RISC-V according to the instructions provided in the article above.
I'm using Ubuntu 20.04 LTS
on WSL 2 for my computer architecture assignments and daily computational fluid dynamics (CFD) simulations.
Follow the instructions below to install the xPack GNU RISC-V Embedded GCC
$ cd /tmp
$ wget https://github.com/xpack-dev-tools/riscv-none-elf-gcc-xpack/releases/download/v13.2.0-2/xpack-riscv-none-elf-gcc-13.2.0-2-linux-x64.tar.gz
$ tar zxvf xpack-riscv-none-elf-gcc-13.2.0-2-linux-x64.tar.gz
$ cp -af xpack-riscv-none-elf-gcc-13.2.0-2 $HOME/riscv-none-elf-gcc
$ cd $HOME/riscv-none-elf-gcc
$ pwd
$ nano ~/.bashrc
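Add the toolchain's bin directory to PATH at the bottom of the .bashrc file, for example: export PATH=$HOME/riscv-none-elf-gcc/bin:$PATH
After adding the PATH, press Ctrl+X, then Y and Enter, to save the file and exit the nano editor.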
$ source ~/.bashrc
$ riscv-none-elf-gcc -v
$ sudo apt install libsdl2-dev libsdl2-mixer-dev
$ git clone https://github.com/sysprog21/rv32emu
$ cd rv32emu
$ make
$ make check
Out Messages
$ riscv-none-elf-readelf -h build/hello.elf
riscv-none-elf-size build/hello.elf
Learn More →
Don't put the screenshots which contain plain text only.
Here is C and RISC-V code
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
/* Cycle-counter value as returned by getticks(): a 64-bit unsigned count. */
typedef uint64_t ticks;
/* Overlay used to reinterpret a 64-bit integer bit pattern as a double. */
union DoubleConverter {
    unsigned long long intValue;
    double doubleValue;
};

/*
 * Build the double whose value equals `significand` by assembling the
 * IEEE-754 binary64 fields (exponent and fraction) by hand.
 *
 * Only 0 < significand < 2^53 is supported; any other input returns -1.0.
 *
 * A single count-leading-zeros step replaces the naive approach of
 * shifting left one bit at a time until bit 52 (the hidden-1 position)
 * is set while counting the shifts.
 */
double ll_to_double(unsigned long long significand) {
    const unsigned long long upper_bound = 1ULL << 53;
    const unsigned long long fraction_mask = (1ULL << 52) - 1;

    if (significand == 0 || significand >= upper_bound) {
        return -1.0;
    }

    /*
     * A 64-bit word has 11 bits above bit 52, so (clz - 11) is exactly the
     * distance the leading 1 must travel to land on the hidden-1 position.
     */
    const int distance = __builtin_clzll(significand) - 11;
    const unsigned long long aligned = significand << distance;

    /*
     * The value 1.0 has exponent 0 and would need 52 shifts; every shift
     * performed here lowers the exponent by one. 1023 is the format's bias.
     */
    const unsigned long long biased_exponent =
        (unsigned long long) (1023 + 52 - distance);

    /* Drop the hidden 1, place the exponent above the 52 fraction bits. */
    union DoubleConverter pun;
    pun.intValue = (biased_exponent << 52) | (aligned & fraction_mask);
    return pun.doubleValue;
}
/*
 * Read the 64-bit cycle counter on a 32-bit RISC-V core.
 *
 * The counter is exposed as two 32-bit halves, so the high half is sampled
 * before and after the low half. The assembly then turns the first sample
 * into a mask: all ones when both high samples agree, zero when the low half
 * wrapped in between, and applies that mask to the low half.
 *
 * The returned value is built from the SECOND high sample (h2) and the
 * masked low half. Register h must not be used as the high word: after the
 * sequence it holds only the mask (0xFFFFFFFF or 0), not a counter value.
 * When a wrap was detected the result is h2:0, a value that lies between the
 * first and the last sample.
 *
 * Returns: the cycle count as a 64-bit unsigned value.
 */
static inline ticks getticks(void)
{
    uint32_t l, h, h2;
    asm volatile(
        "rdcycleh %0\n"      /* h  = high half, first sample           */
        "rdcycle %1\n"       /* l  = low half                          */
        "rdcycleh %2\n"      /* h2 = high half, second sample          */
        "sub %0, %0, %2\n"   /* h  = h - h2 (zero when they agree)     */
        "seqz %0, %0\n"      /* h  = 1 when they agree, else 0         */
        "sub %0, zero, %0\n" /* h  = all ones when they agree, else 0  */
        "and %1, %1, %0\n"   /* l  = l when they agree, else 0         */
        : "=r"(h), "=r"(l), "=r"(h2));
    return (((uint64_t) h2) << 32) | ((uint64_t) l);
}
/*
 * Time a single ll_to_double() call and print the elapsed cycle count.
 *
 * Both the input and the result are volatile on purpose. With a plain
 * constant input and an unused result, an optimizing build is free to
 * evaluate the conversion at compile time or drop it entirely, leaving
 * nothing between the two getticks() samples. The volatile load forces the
 * argument to be read at run time and the volatile store forces the result
 * to be produced, so the call stays inside the timed region.
 */
int main(void) {
    volatile unsigned long long input = 1235655;
    volatile double result;

    ticks t0 = getticks();
    result = ll_to_double(input);
    ticks t1 = getticks();

    (void) result; /* only the elapsed time is reported */
    printf("elapsed cycle: %" PRIu64 "\n", t1 - t0);
    return 0;
}
# Constants (and sample values) used by the conversion routines below.
.data
# 64-bit sample values; not referenced by the routines visible in this file
# (presumably test inputs for a driver - confirm against the caller).
num: .dword 0xBBFFFFFFFF, 0x84f2, 0x811111111
# Low 20 bits set: itof_clz uses it to keep only the mantissa bits that live
# in the high word of the result.
mask: .word 0xFFFFF
# Bit-pair, nibble and byte masks loaded by clz at offsets 0, 4 and 8.
maskclz: .word 0x55555555, 0x33333333, 0x0f0f0f0f
# Everything after this point is code, so switch to the text section;
# without this directive the routines would be assembled into .data.
.text
.global itof_clz
# ====== subroutines ======
# itof_clz: cast an unsigned 64-bit integer to the bit pattern of a double.
# input  uint64[a0, a1]   a0 = high word, a1 = low word
# output double[a0, a1]   a0 = high word (sign, exponent, mantissa[51:32]),
#                         a1 = low word  (mantissa[31:0])
# Supported range: 0 < x < 2^53. For any other input the argument is returned
# unchanged except that bit 31 of a0 is set (the result reads as negative).
# Clobbers: a2, a3, t0, and whatever clz clobbers.
itof_clz:
    # Reject zero: both words are zero.
    or   t0, a0, a1
    beqz t0, overrange
    # Reject x >= 2^53, i.e. high word >= 2^21. The comparison must be
    # unsigned so that a high word with bit 31 set is also rejected.
    li   t0, 1
    slli t0, t0, 21
    bltu a0, t0, inrange
overrange:
    # overrange, set msb 1 (nothing was pushed yet, so a plain ret is safe)
    li   t0, 1
    slli t0, t0, 31
    or   a0, a0, t0
    ret
inrange:
    # Keep ra and the argument across the call; 16 bytes keeps sp aligned.
    addi sp, sp, -16
    sw   ra, 8(sp)
    sw   a0, 0(sp)
    sw   a1, 4(sp)
    call clz
    # shifts = clz(x) - 11: distance from the leading 1 to bit 52
    addi a2, a0, -11
    lw   a0, 0(sp)
    lw   a1, 4(sp)
    lw   ra, 8(sp)
    addi sp, sp, 16
    # shifts == 0: the value is already aligned. This case must skip the
    # code below, because it would compute "srl by 32", which RV32 executes
    # as a shift by 0 and would OR the whole low word into the high word.
    beqz a2, merged
    li   a3, 32
    bge  a2, a3, ge32
    # lt32: 0 < shifts < 32, bits move from the low word into the high word
    sub  a3, a3, a2          # a3 = 32 - shifts
    sll  a0, a0, a2
    srl  t0, a1, a3          # bits of the low word that cross into high
    sll  a1, a1, a2
    or   a0, a0, t0
    j    merged
ge32:
    # shifts >= 32: the high word was zero, so the result's high word is
    # the low word shifted by (shifts - 32) and the low word becomes zero
    sub  a3, a2, a3
    sll  a0, a1, a3
    li   a1, 0
# exponent = 1023 + 52 - shifts
merged:
    li   a3, 1075
    sub  a3, a3, a2
    slli a3, a3, 20          # exponent field starts at bit 20 of the high word
    la   t0, mask
    lw   t0, 0(t0)
    and  a0, a0, t0          # strip the hidden 1, keep mantissa[51:32]
    or   a0, a0, a3
    ret
clz:
# Count the leading zero bits of a 64-bit value.
# input  int64[a0, a1]   a0 = high word, a1 = low word
# output int32[a0]       0..64 (64 when the input is zero)
# Method: smear the leading 1 into every lower bit, count the set bits,
# and subtract that count from 64.
# Clobbers: a1, a2, a3, t0, t1, t2, t5, t6, and whatever Add64 clobbers.
    addi sp, sp, -4
    sw   ra, 0(sp)
    # x |= (x >> {1, 2, 4, 8, 16})
    addi t1, x0, 0x1
Loop1:
    srl  t0, a0, t1
    or   a0, a0, t0
    srl  t0, a1, t1
    or   a1, a1, t0
    addi t2, x0, 32
    sub  t2, t2, t1          # t2 = 32 - shift, always in 16..31 here
    sll  t0, a0, t2          # bits leaving the high word enter the low word
    or   a1, a1, t0
    slli t1, t1, 1
    addi t2, x0, 32
    bgt  t2, t1, Loop1
    # x |= (x >> 32)
    or   a1, a1, a0
    # x -= ((x >> 1) & 0x5555555555555555);
    # Each word can be handled on its own: bit 31 of the mask is 0, so the
    # bit shifted from the high word into bit 31 of the low word is masked
    # out, and the subtrahend never exceeds the word it is taken from -
    # the subtraction cannot borrow.
    la   t6, maskclz
    lw   t5, 0(t6)
    srli t0, a0, 1
    and  t0, t0, t5          # t0 = (a0 >> 1) & 0x55555555
    srli t1, a1, 1
    and  t1, t1, t5          # t1 = (a1 >> 1) & 0x55555555
    sub  a0, a0, t0
    sub  a1, a1, t1
    # x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333)
    lw   t5, 4(t6)
    srli t0, a0, 2
    srli t1, a1, 2
    slli t2, a0, 30
    or   t1, t1, t2
    # [t0, t1] = x >> 2
    and  t0, t0, t5
    and  t1, t1, t5
    and  a0, a0, t5
    and  a1, a1, t5
    mv   a2, t0
    mv   a3, t1
    jal  ra, Add64
    # ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f
    srli t0, a0, 4
    srli t1, a1, 4
    slli t2, a0, 28
    or   t1, t1, t2
    mv   a2, t0
    mv   a3, t1
    jal  ra, Add64
    lw   t5, 8(t6)
    and  a0, a0, t5
    and  a1, a1, t5
    # x += (x >> 8)
    srli t0, a0, 8
    srli t1, a1, 8
    slli t2, a0, 24
    or   t1, t1, t2
    mv   a2, t0
    mv   a3, t1
    jal  ra, Add64
    # x += (x >> 16)
    srli t0, a0, 16
    srli t1, a1, 16
    slli t2, a0, 16
    or   t1, t1, t2
    mv   a2, t0
    mv   a3, t1
    jal  ra, Add64
    # x += (x >> 32)
    mv   a2, x0
    mv   a3, a0
    jal  ra, Add64
    # return (64 - (x & 0x7f))
    andi a0, a1, 0x7f
    addi a1, x0, 64
    sub  a0, a1, a0
    lw   ra, 0(sp)
    addi sp, sp, 4
    ret
# Add64: 64-bit addition, [a0, a1] += [a2, a3]
# a0/a2 are the high words, a1/a3 the low words.
# Clobbers only the temporary t4 (no callee-saved register is touched).
Add64:
    add  a1, a1, a3
    sltu t4, a1, a3          # t4 = 1 when the low-word sum wrapped around
    add  a0, a0, a2
    add  a0, a0, t4          # propagate the carry into the high word
    ret
exit:
    # Terminate the program: place service number 10 in a7 and trap to the
    # execution environment. (10 is the exit service in the simulator this
    # listing targets - confirm for your environment.)
    addi a7, zero, 10
    ecall
The following command produces the disassembled code:
$ riscv-none-elf-objdump -d itofFromC.elf > ${outfilename}
00010180 <ll_to_double>:
10180: fb010113 add sp,sp,-80
10184: 04112623 sw ra,76(sp)
10188: 04812423 sw s0,72(sp)
1018c: 05212223 sw s2,68(sp)
10190: 05312023 sw s3,64(sp)
10194: 03412e23 sw s4,60(sp)
10198: 03512c23 sw s5,56(sp)
1019c: 03612a23 sw s6,52(sp)
101a0: 03712823 sw s7,48(sp)
101a4: 05010413 add s0,sp,80
101a8: faa42c23 sw a0,-72(s0)
101ac: fab42e23 sw a1,-68(s0)
101b0: fb842783 lw a5,-72(s0)
101b4: fbc42703 lw a4,-68(s0)
101b8: 00e7e7b3 or a5,a5,a4
101bc: 02078c63 beqz a5,101f4 <ll_to_double+0x74>
101c0: fbc42703 lw a4,-68(s0)
101c4: 002007b7 lui a5,0x200
101c8: 02f77663 bgeu a4,a5,101f4 <ll_to_double+0x74>
101cc: fb842503 lw a0,-72(s0)
101d0: fbc42583 lw a1,-68(s0)
101d4: 23c000ef jal 10410 <__clzdi2>
...
Output
cycle count
headerfile
size
The following command produces the disassembled code:
$ riscv-none-elf-objdump -d itofFromC.elf > ${outfilename}
10268: fe010113 add sp,sp,-32
1026c: 00112e23 sw ra,28(sp)
10270: 00812c23 sw s0,24(sp)
10274: 00912a23 sw s1,20(sp)
10278: 01212823 sw s2,16(sp)
1027c: 01312623 sw s3,12(sp)
10280: f01ff0ef jal 10180 <getticks>
10284: 00050493 mv s1,a0
10288: 00058993 mv s3,a1
1028c: ef5ff0ef jal 10180 <getticks>
10290: 00050413 mv s0,a0
10294: 00058913 mv s2,a1
10298: 0012e637 lui a2,0x12e
1029c: ac760613 add a2,a2,-1337 # 12dac7 <__BSS_END__+0x109ba7>
102a0: 00000693 li a3,0
102a4: 00022537 lui a0,0x22
102a8: ac050513 add a0,a0,-1344 # 21ac0 <__clzsi2+0x8c>
102ac: 688000ef jal 10934 <printf>
102b0: ed01a603 lw a2,-304(gp) # 238a0 <__SDATA_BEGIN__+0x8>
102b4: ed41a683 lw a3,-300(gp) # 238a4 <__SDATA_BEGIN__+0xc>
102b8: 00022537 lui a0,0x22
102bc: ad050513 add a0,a0,-1328 # 21ad0 <__clzsi2+0x9c>
102c0: 674000ef jal 10934 <printf>
102c4: 40940633 sub a2,s0,s1
102c8: 00c43433 sltu s0,s0,a2
102cc: 413906b3 sub a3,s2,s3
102d0: 408686b3 sub a3,a3,s0
102d4: 00022537 lui a0,0x22
102d8: ae050513 add a0,a0,-1312 # 21ae0 <__clzsi2+0xac>
102dc: 658000ef jal 10934 <printf>
102e0: 00000513 li a0,0
102e4: 01c12083 lw ra,28(sp)
102e8: 01812403 lw s0,24(sp)
102ec: 01412483 lw s1,20(sp)
102f0: 01012903 lw s2,16(sp)
102f4: 00c12983 lw s3,12(sp)
102f8: 02010113 add sp,sp,32
102fc: 00008067 ret
Output
cycle count
header file
size
The following command produces the disassembled code:
$ riscv-none-elf-objdump -d itofFromC.elf > ${outfilename}
000100c0 <main>:
100c0: fe010113 add sp,sp,-32
100c4: 00112e23 sw ra,28(sp)
100c8: 00812c23 sw s0,24(sp)
100cc: 00912a23 sw s1,20(sp)
100d0: 01212823 sw s2,16(sp)
100d4: 01312623 sw s3,12(sp)
100d8: 140000ef jal 10218 <getticks>
100dc: 00050493 mv s1,a0
100e0: 00058993 mv s3,a1
100e4: 134000ef jal 10218 <getticks>
100e8: 00050413 mv s0,a0
100ec: 0012e637 lui a2,0x12e
100f0: 00022537 lui a0,0x22
100f4: ac760613 add a2,a2,-1337 # 12dac7 <__BSS_END__+0x109ba7>
100f8: 00000693 li a3,0
100fc: ad050513 add a0,a0,-1328 # 21ad0 <__clzsi2+0x8c>
10100: 00058913 mv s2,a1
10104: 041000ef jal 10944 <printf>
10108: ed01a603 lw a2,-304(gp) # 238a0 <__SDATA_BEGIN__+0x8>
1010c: ed41a683 lw a3,-300(gp) # 238a4 <__SDATA_BEGIN__+0xc>
10110: 00022537 lui a0,0x22
10114: ae050513 add a0,a0,-1312 # 21ae0 <__clzsi2+0x9c>
10118: 02d000ef jal 10944 <printf>
1011c: 40940633 sub a2,s0,s1
10120: 00c43433 sltu s0,s0,a2
10124: 413906b3 sub a3,s2,s3
10128: 00022537 lui a0,0x22
1012c: 408686b3 sub a3,a3,s0
10130: af050513 add a0,a0,-1296 # 21af0 <__clzsi2+0xac>
10134: 011000ef jal 10944 <printf>
10138: 01c12083 lw ra,28(sp)
1013c: 01812403 lw s0,24(sp)
10140: 01412483 lw s1,20(sp)
10144: 01012903 lw s2,16(sp)
10148: 00c12983 lw s3,12(sp)
1014c: 00000513 li a0,0
10150: 02010113 add sp,sp,32
10154: 00008067 ret
Output
cycle count
headerfile
size
The following command produces the disassembled code:
$ riscv-none-elf-objdump -d itofFromC.elf > ${outfilename}
000100c0 <main>:
100c0: fe010113 add sp,sp,-32
100c4: 00112e23 sw ra,28(sp)
100c8: 00812c23 sw s0,24(sp)
100cc: 00912a23 sw s1,20(sp)
100d0: 01212823 sw s2,16(sp)
100d4: 01312623 sw s3,12(sp)
100d8: c80029f3 rdcycleh s3
100dc: c00024f3 rdcycle s1
100e0: c80027f3 rdcycleh a5
100e4: 40f989b3 sub s3,s3,a5
100e8: 0019b993 seqz s3,s3
100ec: 413009b3 neg s3,s3
100f0: 0134f4b3 and s1,s1,s3
100f4: c8002973 rdcycleh s2
100f8: c0002473 rdcycle s0
100fc: c80027f3 rdcycleh a5
10100: 40f90933 sub s2,s2,a5
10104: 00193913 seqz s2,s2
10108: 41200933 neg s2,s2
1010c: 01247433 and s0,s0,s2
10110: 0012e637 lui a2,0x12e
10114: 00022537 lui a0,0x22
10118: ac760613 add a2,a2,-1337 # 12dac7 <__BSS_END__+0x109ba7>
1011c: 00000693 li a3,0
10120: ad050513 add a0,a0,-1328 # 21ad0 <__clzsi2+0x8c>
10124: 021000ef jal 10944 <printf>
10128: ed01a603 lw a2,-304(gp) # 238a0 <__SDATA_BEGIN__+0x8>
1012c: ed41a683 lw a3,-300(gp) # 238a4 <__SDATA_BEGIN__+0xc>
10130: 00022537 lui a0,0x22
10134: ae050513 add a0,a0,-1312 # 21ae0 <__clzsi2+0x9c>
10138: 00d000ef jal 10944 <printf>
1013c: 40940633 sub a2,s0,s1
10140: 00c43433 sltu s0,s0,a2
10144: 413906b3 sub a3,s2,s3
10148: 00022537 lui a0,0x22
1014c: 408686b3 sub a3,a3,s0
10150: af050513 add a0,a0,-1296 # 21af0 <__clzsi2+0xac>
10154: 7f0000ef jal 10944 <printf>
10158: 01c12083 lw ra,28(sp)
1015c: 01812403 lw s0,24(sp)
10160: 01412483 lw s1,20(sp)
10164: 01012903 lw s2,16(sp)
10168: 00c12983 lw s3,12(sp)
1016c: 00000513 li a0,0
10170: 02010113 add sp,sp,32
10174: 00008067 ret
Output
cycle count
headerfile
size
The following command produces the disassembled code:
$ riscv-none-elf-objdump -d itofFromC.elf > ${outfilename}
000100c0 <main>:
100c0: fe010113 add sp,sp,-32
100c4: 00112e23 sw ra,28(sp)
100c8: 00812c23 sw s0,24(sp)
100cc: 00912a23 sw s1,20(sp)
100d0: 01212823 sw s2,16(sp)
100d4: 01312623 sw s3,12(sp)
100d8: c80029f3 rdcycleh s3
100dc: c00024f3 rdcycle s1
100e0: c80027f3 rdcycleh a5
100e4: 40f989b3 sub s3,s3,a5
100e8: 0019b993 seqz s3,s3
100ec: 413009b3 neg s3,s3
100f0: 0134f4b3 and s1,s1,s3
100f4: c8002973 rdcycleh s2
100f8: c0002473 rdcycle s0
100fc: c80027f3 rdcycleh a5
10100: 40f90933 sub s2,s2,a5
10104: 00193913 seqz s2,s2
10108: 41200933 neg s2,s2
1010c: 01247433 and s0,s0,s2
10110: 0012e637 lui a2,0x12e
10114: 00022537 lui a0,0x22
10118: ac760613 add a2,a2,-1337 # 12dac7 <__BSS_END__+0x109ba7>
1011c: 00000693 li a3,0
10120: ad050513 add a0,a0,-1328 # 21ad0 <__clzsi2+0x8c>
10124: 021000ef jal 10944 <printf>
10128: ed01a603 lw a2,-304(gp) # 238a0 <__SDATA_BEGIN__+0x8>
1012c: ed41a683 lw a3,-300(gp) # 238a4 <__SDATA_BEGIN__+0xc>
10130: 00022537 lui a0,0x22
10134: ae050513 add a0,a0,-1312 # 21ae0 <__clzsi2+0x9c>
10138: 00d000ef jal 10944 <printf>
1013c: 40940633 sub a2,s0,s1
10140: 00c43433 sltu s0,s0,a2
10144: 413906b3 sub a3,s2,s3
10148: 00022537 lui a0,0x22
1014c: 408686b3 sub a3,a3,s0
10150: af050513 add a0,a0,-1296 # 21af0 <__clzsi2+0xac>
10154: 7f0000ef jal 10944 <printf>
10158: 01c12083 lw ra,28(sp)
1015c: 01812403 lw s0,24(sp)
10160: 01412483 lw s1,20(sp)
10164: 01012903 lw s2,16(sp)
10168: 00c12983 lw s3,12(sp)
1016c: 00000513 li a0,0
10170: 02010113 add sp,sp,32
10174: 00008067 ret
Output
cycle count
headerfile
size
| | O0 | O1 | O2 | O3 | Ofast |
|---|---|---|---|---|---|
| main.elf cycle count | 212 | 186 | 186 | 186 | 186 |
| itofFromC.elf cycle count | 129 | 11 | 11 | 0 | 0 |
O0 to O1
212 -> 186
129 -> 11
53 -> 38
89->50
10&10 -> 5&7
30&19 -> 7&3
10 -> 7
12->8
It is evident that both the line count of the code and the number of registers used have decreased significantly, along with a reduction in the usage of pop and push operations. These factors contribute to the rapid decrease in cycle count for the <ll_to_double> label.
O1
to O2
186 -> 186
11 -> 11
38 -> 38
50 ->54
5&7 -> 7&5
7&3 -> 10&3
7 -> 9
8 ->7
It was noticed that the number of registers started to increase, and the line count of the code began to rise. However, the cycle count remained unchanged
O2
to O3
186 -> 186
11 -> 0
38 -> 46
54 -> 54
7&5 -> 7&5
10&3 -> 10&3
9 -> 7
7 -> 8
Starting from this point, it was observed that the get_ticks label was merged into the main code. This change might have been a factor contributing to the calculation of a cycle count of 0
O3
to Ofast
186 -> 186
0 -> 0
46-> 46
54 -> 54
7&5 -> 7&5
10&3 -> 10&3
7 -> 8
8 -> 7
It was noticed that throughout this process, the number of registers and the line count of the code remained unchanged, and the cycle count showed no variation.
From the above observations, GCC optimization clearly changes the result, but the measurement is only meaningful up to O2: starting from O3, the <ll_to_double> function is no longer called by the main function.
When the GCC optimization level is raised to O3 and Ofast, the cycle count for itofFromC.elf comes out as 0. This indicates that the function was essentially bypassed without being called, yet the input and output were still correct. Examining the disassembled RISC-V code makes the difference between the O0 and O3 optimization levels apparent.
O0
10364: fd010113 add sp,sp,-48
10368: 02112623 sw ra,44(sp)
1036c: 02812423 sw s0,40(sp)
10370: 03010413 add s0,sp,48
10374: 0012e737 lui a4,0x12e
10378: ac770713 add a4,a4,-1337 # 12dac7 <__BSS_END__+0x109bb7>
1037c: 00000793 li a5,0
10380: fee42423 sw a4,-24(s0)
10384: fef42623 sw a5,-20(s0)
10388: f5dff0ef jal 102e4 <getticks>
1038c: fea42023 sw a0,-32(s0)
10390: feb42223 sw a1,-28(s0)
10394: fe842503 lw a0,-24(s0)
10398: fec42583 lw a1,-20(s0)
1039c: de5ff0ef jal 10180 <ll_to_double>
103a0: fca42c23 sw a0,-40(s0)
103a4: fcb42e23 sw a1,-36(s0)
103a8: f3dff0ef jal 102e4 <getticks>
103ac: fca42823 sw a0,-48(s0)
103b0: fcb42a23 sw a1,-44(s0)
103b4: fe842603 lw a2,-24(s0)
103b8: fec42683 lw a3,-20(s0)
103bc: 000227b7 lui a5,0x22
103c0: bf878513 add a0,a5,-1032 # 21bf8 <__clzsi2+0x8c>
103c4: 6a8000ef jal 10a6c <printf>
103c8: fd842603 lw a2,-40(s0)
103cc: fdc42683 lw a3,-36(s0)
103d0: 000227b7 lui a5,0x22
103d4: c0878513 add a0,a5,-1016 # 21c08 <__clzsi2+0x9c>
103d8: 694000ef jal 10a6c <printf>
103dc: fd042703 lw a4,-48(s0)
103e0: fd442783 lw a5,-44(s0)
103e4: fe042503 lw a0,-32(s0)
103e8: fe442583 lw a1,-28(s0)
103ec: 40a70633 sub a2,a4,a0
103f0: 00060813 mv a6,a2
103f4: 01073833 sltu a6,a4,a6
103f8: 40b786b3 sub a3,a5,a1
103fc: 410687b3 sub a5,a3,a6
10400: 00078693 mv a3,a5
10404: 00060713 mv a4,a2
10408: 00068793 mv a5,a3
1040c: 00070613 mv a2,a4
10410: 00078693 mv a3,a5
10414: 000227b7 lui a5,0x22
10418: c1878513 add a0,a5,-1000 # 21c18 <__clzsi2+0xac>
1041c: 650000ef jal 10a6c <printf>
10420: 00000793 li a5,0
10424: 00078513 mv a0,a5
10428: 02c12083 lw ra,44(sp)
1042c: 02812403 lw s0,40(sp)
10430: 03010113 add sp,sp,48
10434: 00008067 ret
O3
100c0: fe010113 add sp,sp,-32
100c4: 00112e23 sw ra,28(sp)
100c8: 00812c23 sw s0,24(sp)
100cc: 00912a23 sw s1,20(sp)
100d0: 01212823 sw s2,16(sp)
100d4: 01312623 sw s3,12(sp)
100d8: c80029f3 rdcycleh s3
100dc: c00024f3 rdcycle s1
100e0: c80027f3 rdcycleh a5
100e4: 40f989b3 sub s3,s3,a5
100e8: 0019b993 seqz s3,s3
100ec: 413009b3 neg s3,s3
100f0: 0134f4b3 and s1,s1,s3
100f4: c8002973 rdcycleh s2
100f8: c0002473 rdcycle s0
100fc: c80027f3 rdcycleh a5
10100: 40f90933 sub s2,s2,a5
10104: 00193913 seqz s2,s2
10108: 41200933 neg s2,s2
1010c: 01247433 and s0,s0,s2
10110: 0012e637 lui a2,0x12e
10114: 00022537 lui a0,0x22
10118: ac760613 add a2,a2,-1337 # 12dac7 <__BSS_END__+0x109ba7>
1011c: 00000693 li a3,0
10120: ad050513 add a0,a0,-1328 # 21ad0 <__clzsi2+0x8c>
10124: 021000ef jal 10944 <printf>
10128: ed01a603 lw a2,-304(gp) # 238a0 <__SDATA_BEGIN__+0x8>
1012c: ed41a683 lw a3,-300(gp) # 238a4 <__SDATA_BEGIN__+0xc>
10130: 00022537 lui a0,0x22
10134: ae050513 add a0,a0,-1312 # 21ae0 <__clzsi2+0x9c>
10138: 00d000ef jal 10944 <printf>
1013c: 40940633 sub a2,s0,s1
10140: 00c43433 sltu s0,s0,a2
10144: 413906b3 sub a3,s2,s3
10148: 00022537 lui a0,0x22
1014c: 408686b3 sub a3,a3,s0
10150: af050513 add a0,a0,-1296 # 21af0 <__clzsi2+0xac>
10154: 7f0000ef jal 10944 <printf>
10158: 01c12083 lw ra,28(sp)
1015c: 01812403 lw s0,24(sp)
10160: 01412483 lw s1,20(sp)
10164: 01012903 lw s2,16(sp)
10168: 00c12983 lw s3,12(sp)
1016c: 00000513 li a0,0
10170: 02010113 add sp,sp,32
10174: 00008067 ret
Comparing the two main labels shows that, at O3, the itof function (the <ll_to_double> label) is no longer called from main, which explains the cycle count of 0. The original C code did not print the input or the output, which initially suggested that the compiler had removed the call because its result was never used. In the O3 build the <ll_to_double> call is gone and the getticks code is inlined directly into main, so nothing remains between the two counter reads. Print statements for the input and the output were added after discussing with classmates and 洪碩星, but the problem persisted. The O3 listing suggests why: the argument 1235655 (0x12dac7) is a compile-time constant, so the compiler can evaluate ll_to_double at compile time, and main merely loads the precomputed double from memory (the two lw instructions relative to gp) before calling printf. It is also possible that 洪碩星 had already optimized the original code quite effectively.
Through this assignment, I gained an understanding of and proficiency in the application of the GNU Toolchain in RISC-V. I was able to incorporate assembly language into C code to enhance program speed, and I optimized its performance using rv32emu. I observed the assembly language generated through disassembly to gain insights into the process of writing RISC-V assembly language and optimizing it. I also learned that optimizing results may not always be correct through the Toolchain.
Assignment1: RISC-V Assembly and Instruction Pipeline
Assignment2: GNU Toolchain
Lab2: RISC-V RV32I[MACF] emulator with ELF support
藉由 JIT 編譯加速 rv32emu
rv32emu 開發紀錄
Assignment1: RISC-V Assembly and Instruction Pipeline