contributed by < jimmylu890303
>
I chose the implementation of multiplication overflow prediction for unsigned integers using CLZ
as my homework assignment 2 from 洪佑杭.
Motivation: The main reason why I chose this is that I am curious about how counting leading zeros can be applied to overflow prediction. So I chose this.
Because I need to analyze the performance of his code, I define a extern function(getcycles.s) to get current cycle.
.text
.globl get_cycles
.align 2
get_cycles:
csrr a1, cycleh
csrr a0, cycle
csrr a2, cycleh
bne a1, a2, get_cycles
ret
.size get_cycles,.-get_cycles
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
// define extern function get_cycles() in getcycles.S
extern uint64_t get_cycles();
... omission
int main()
{
// Get init cycle before execution
uint64_t oldcount = get_cycles();
printf("%d\n", predict_if_mul_overflow(&a_x0, &a_x1));
printf("%d\n", predict_if_mul_overflow(&b_x0, &b_x1));
printf("%d\n", predict_if_mul_overflow(&c_x0, &c_x1));
printf("%d\n", predict_if_mul_overflow(&d_x0, &d_x1));
// Obtain the total number of execution cycles
uint64_t cyclecount = get_cycles() - oldcount;
printf("cycle count: %u\n", (unsigned int) cyclecount);
return 0;
}
I have defined an external function get_cycles() in my C program. Therefore, I need to link main.c and getcycles.s when compiling with GCC.
// Convert getcycles.s to getcycles.o
$ riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -Wall -c -o getcycles.o getcycles.s
// Convert main.c to main.s
$ riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -Wall -S -o main.s main.c
// Convert main.s to main.o
$ riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -Wall -c -o main.o main.s
// Compile & link with main.o and getcycles.o to main.elf
$ riscv-none-elf-gcc -o main.elf getcycles.o main.o
$ ~/rv32emu/build/rv32emu main.elf
0 0 1 1
cycle count: 6190
inferior exit code 0
$ riscv-none-elf-objdump -d main.elf
// output
Disassembly of section .text:
00010094 <exit>:
10094: 1141 add sp,sp,-16
10096: 4581 li a1,0
10098: c422 sw s0,8(sp)
1009a: c606 sw ra,12(sp)
1009c: 842a mv s0,a0
1009e: 6fd000ef jal 10f9a <__call_exitprocs>
100a2: f981a783 lw a5,-104(gp) # 1d7a8 <__stdio_exit_handler>
100a6: c391 beqz a5,100aa <exit+0x16>
100a8: 9782 jalr a5
100aa: 8522 mv a0,s0
100ac: 5b1090ef jal 19e5c <_exit>
... omission
000100c2 <_start>:
100c2: 0000d197 auipc gp,0xd
100c6: 74e18193 add gp,gp,1870 # 1d810 <__global_pointer$>
100ca: f7c18513 add a0,gp,-132 # 1d78c <__bss_start>
100ce: 58c18613 add a2,gp,1420 # 1dd9c <__BSS_END__>
100d2: 8e09 sub a2,a2,a0
100d4: 4581 li a1,0
100d6: 58b000ef jal 10e60 <memset>
100da: 00001517 auipc a0,0x1
100de: d3650513 add a0,a0,-714 # 10e10 <__libc_fini_array>
100e2: 255d jal 10788 <atexit>
100e4: 4c3000ef jal 10da6 <__libc_init_array>
100e8: 4502 lw a0,0(sp)
100ea: 004c add a1,sp,4
100ec: 4601 li a2,0
100ee: 2b79 jal 1068c <main>
100f0: b755 j 10094 <exit>
... omission
00010144 <get_cycles>:
10144: c80025f3 rdcycleh a1
10148: c0002573 rdcycle a0
1014c: c8002673 rdcycleh a2
10150: fec59ae3 bne a1,a2,10144 <get_cycles>
10154: 00008067 ret
$ riscv-none-elf-readelf -h main.elf
// output
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x100c2
Start of program headers: 52 (bytes into file)
Start of section headers: 69612 (bytes into file)
Flags: 0x1, RVC, soft-float ABI
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
$ riscv-none-elf-size main.elf
// output
text data bss dec hex filename
52564 1928 1548 56040 dae8 main.elf
I added a get_cycles
section to the original code in order to analyze its performance, allowing me to retrieve the current cycle.
get_cycles:
csrr a1, cycleh
csrr a0, cycle
csrr a2, cycleh
bne a1, a2, get_cycles
ret
However, rv32emu doesn't support system calls for printing integers, so I added a convert_ascii section to convert integers to ASCII
.
# Predefine 4-byte buffer
.data
buffer: .byte 0, 0, 0, 0
# convert number to ascii code
# Input:
# a0: number
# a1: number of bytes in buffer
convert_ascii:
addi sp, sp, -8
sw a0, 0(sp) # save dividend
sw a1, 4(sp) # save counter
convert_loop:
lw t0, 0(sp) # load dividend
li t1, 0 # t1 = quotient
li t2, 0 # t2 = reminder
lw t3, 4(sp) # load counter
li t4, 10 # divisor
divide_loop:
# if x > 10 => go x-=10
bge t0, t4, divide_subtract
# else reminder be found
mv t2, t0
j divide_loop_done
divide_subtract:
# x = x - 10
sub t0, t0, t4
# quotient += 1
addi t1, t1, 1
j divide_loop
divide_loop_done:
# save quotient to stack
sw t1, 0(sp)
# convert ascii code this round and save to buffer
addi t2, t2, 48
la a0, buffer
add a0, a0, t3
addi a0, a0, -1
sb t2, 0(a0)
# counter = 0 => exit
addi t3, t3, -1
sw t3, 4(sp)
beqz t3, convert_loop_done
j convert_loop
convert_loop_done:
addi sp, sp, 8
ret
$ ~/rv32emu/build/rv32emu main.elf
0 0 1 1
cycle count: 876
inferior exit code 0
$ riscv-none-elf-objdump -d main.elf
// output
main.elf: file format elf32-littleriscv
Disassembly of section .text:
00000000 <main>:
0: 100000ef jal 100 <get_cycles>
4: ffc10113 add sp,sp,-4
8: 00a12023 sw a0,0(sp)
c: fec10113 add sp,sp,-20
10: 0f0000ef jal 100 <get_cycles>
14: 00a12823 sw a0,16(sp)
18: 00000297 auipc t0,0x0
1c: 3c828293 add t0,t0,968 # 3e0 <cmp_data_1>
20: 00512023 sw t0,0(sp)
24: 00000297 auipc t0,0x0
28: 3cc28293 add t0,t0,972 # 3f0 <cmp_data_2>
2c: 00512223 sw t0,4(sp)
30: 00000297 auipc t0,0x0
34: 3d028293 add t0,t0,976 # 400 <cmp_data_3>
38: 00512423 sw t0,8(sp)
3c: 00000297 auipc t0,0x0
40: 3d428293 add t0,t0,980 # 410 <cmp_data_4>
44: 00512623 sw t0,12(sp)
48: 00400413 li s0,4
4c: 00000493 li s1,0
50: 00010913 mv s2,sp
$ riscv-none-elf-readelf -h main.elf
// output
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x0
Start of program headers: 52 (bytes into file)
Start of section headers: 6312 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 2
Size of section headers: 40 (bytes)
Number of section headers: 7
Section header string table index: 6
$ riscv-none-elf-size main.elf
// output
text data bss dec hex filename
1077 0 0 1077 435 main.elf
many libraries included
in the C program.printf
cause many cycles.Don't do any optimization, this is the default compile option
text data bss dec hex filename
52564 1928 1548 56040 dae8 main_O0.elf
$ ~/rv32emu/build/rv32emu main_O0.elf
0 0 1 1
cycle count: 6190
inferior exit code 0
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x100c2
Start of program headers: 52 (bytes into file)
Start of section headers: 69612 (bytes into file)
Flags: 0x1, RVC, soft-float ABI
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
text data bss dec hex filename
51608 1928 1548 55084 d72c main_O1.elf
$ ~/rv32emu/build/rv32emu main_O1.elf
0 0 1 1
cycle count: 4524
inferior exit code 0
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x100c2
Start of program headers: 52 (bytes into file)
Start of section headers: 69612 (bytes into file)
Flags: 0x1, RVC, soft-float ABI
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
text data bss dec hex filename
51608 1928 1548 55084 d72c main_O2.elf
$ ~/rv32emu/build/rv32emu main_O2.elf
0 0 1 1
cycle count: 4524
inferior exit code 0
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x1016a
Start of program headers: 52 (bytes into file)
Start of section headers: 69628 (bytes into file)
Flags: 0x1, RVC, soft-float ABI
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
text data bss dec hex filename
52258 1928 1548 55734 d9b6 main_O3.elf
$ ~/rv32emu/build/rv32emu main_O3.elf
0 0 1 1
cycle count: 4440
inferior exit code 0
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x1016a
Start of program headers: 52 (bytes into file)
Start of section headers: 69628 (bytes into file)
Flags: 0x1, RVC, soft-float ABI
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
text data bss dec hex filename
52258 1928 1548 55734 d9b6 main_Ofast.elf
$ ~/rv32emu/build/rv32emu main_Ofast.elf
0 0 1 1
cycle count: 4440
inferior exit code 0
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x1016a
Start of program headers: 52 (bytes into file)
Start of section headers: 69628 (bytes into file)
Flags: 0x1, RVC, soft-float ABI
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
int main()
{
// Get init cycle before execution
uint64_t oldcount = get_cycles();
printf("%d\n", predict_if_mul_overflow(&a_x0, &a_x1));
printf("%d\n", predict_if_mul_overflow(&b_x0, &b_x1));
printf("%d\n", predict_if_mul_overflow(&c_x0, &c_x1));
printf("%d\n", predict_if_mul_overflow(&d_x0, &d_x1));
// Obtain the total number of execution cycles
uint64_t cyclecount = get_cycles() - oldcount;
printf("cycle count: %u\n", (unsigned int) cyclecount);
return 0;
}
We have noticed that the printf function is called five times.
First, these calls consume many cycles to print values.
Second, including the stdio.h library increases the size of the .elf file.
Therefore, I decided to optimize the code using inline assembly.
I converted the output of predict_if_mul_overflow() to ASCII and replaced all instances of printf with assembly code.
int main()
{
uint64_t oldcount = get_cycles();
int predict = (int)predict_if_mul_overflow(&a_x0, &a_x1);
char buffer[2]={0,'\n'};
buffer[0] = predict+48;
asm("li a0, 1 \n"
"mv a1, %0 \n"
"li a2, 2 \n"
"li a7, 64 \n"
"ecall \n"
:
:"r"(&buffer));
predict = (int)predict_if_mul_overflow(&b_x0, &b_x1);
buffer[0] = predict+48;
asm("li a0, 1 \n"
"mv a1, %0 \n"
"li a2, 2 \n"
"li a7, 64 \n"
"ecall \n"
:
:"r"(&buffer));
predict = (int)predict_if_mul_overflow(&c_x0, &c_x1);
buffer[0] = predict+48;
asm("li a0, 1 \n"
"mv a1, %0 \n"
"li a2, 2 \n"
"li a7, 64 \n"
"ecall \n"
:
:"r"(&buffer));
predict = (int)predict_if_mul_overflow(&d_x0, &d_x1);
buffer[0] = predict+48;
asm("li a0, 1 \n"
"mv a1, %0 \n"
"li a2, 2 \n"
"li a7, 64 \n"
"ecall \n"
:
:"r"(&buffer));
uint64_t cyclecount = get_cycles() - oldcount;
char cycle_str[] = "cycle count: ";
asm("li a0, 1 \n"
"mv a1, %0 \n"
"li a2, 13 \n"
"li a7, 64 \n"
"ecall \n"
:
:"r"(&cycle_str));
char *ptr = convert_ascii((unsigned int) cyclecount);
asm("li a0, 1 \n"
"mv a1, %0 \n"
"li a2, 5 \n"
"li a7, 64 \n"
"ecall \n"
:
:"r"(ptr));
free(ptr);
return 0;
}
Full implementation at objdump_O0.txt.
$ ~/rv32emu/build/rv32emu main_O0.elf
0 0 1 1
cycle count: 6190
inferior exit code 0
$ ~/rv32emu/build/rv32emu main_opt.elf
0 0 1 1
cycle count: 2561
inferior exit code 0
text data bss dec hex filename
52564 1928 1548 56040 dae8 main_O0.elf
text data bss dec hex filename
8026 1424 860 10310 2846 main_opt.elf
or
By clicking below, you agree to our terms of service.
New to HackMD? Sign up