contributed by Terry7Wei7
Rewrite Implement priority encoder using CLZ by 倪英智 https://hackmd.io/@NIYINGCHIH/BkvgNE3e6
Motivation: I choose the problem Implement priority encoder using CLZ from 倪英智. One common application of using the CLZ (Count Leading Zeros) instruction is in interrupt handling within computer architectures. During the interrupt handling process, the system needs to determine the priorities of different interrupt requests to decide which one should be processed. The CLZ instruction is helpful in determining the priority of interrupt requests because it efficiently calculates the number of leading zero bits in the input.
In this case, I will attempt to enhance program performance.
#include <stdio.h>
#include <stdint.h>
uint16_t count_leading_zeros(uint64_t x, int input_size)
{
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
x |= (x >> 32);
/* count ones (population count) */
x -= ((x >> 1) & 0x5555555555555555 );
x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333 );
x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f;
x += (x >> 8);
x += (x >> 16);
x += (x >> 32);
return (input_size - (x & 0x7f));
}
int priority_encoder_4bit(int leading_zero, int input_size) {
if(leading_zero<0){
printf("wrong input");
return 0;
}
else{
return (input_size -1 -(leading_zero));
}
}
int main() {
int input;
int input_size;
int leading_zero = 0;
int priority = 0;
printf("Enter a number : ");
scanf("%4d", &input);
printf("Enter a input size : ");
scanf("%4d", &input_size);
leading_zero = count_leading_zeros(input,input_size);
printf("%d\n",leading_zero);
priority = priority_encoder_4bit(leading_zero, input_size);
printf("%d\n",priority);
return 0;
}
command line
$ riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -Wall -S -o opt2.S opt2.c
$ riscv-none-elf-gcc -march=rv32i_zicsr_zifencei -mabi=ilp32 -Wall -c -o opt2.o opt2.S
$ riscv-none-elf-gcc -o opt2.elf getcycles.o opt2.o
$ ../../build/rv32emu opt2.elf
$ riscv-none-elf-size opt2.elf
$ riscv-none-elf-readelf -h opt2.elf
O0
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-size orihw_O0.elf
text data bss dec hex filename
151152 2416 1528 155096 25dd8 orihw_O0.elf
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-readelf -h orihw_O0.elf
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x100d8
Start of program headers: 52 (bytes into file)
Start of section headers: 172460 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
O1
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-size orihw_O1.elf
text data bss dec hex filename
150204 2416 1528 154148 25a24 orihw_O1.elf
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-readelf -h orihw_O1.elf
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x100d8
Start of program headers: 52 (bytes into file)
Start of section headers: 172460 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
O2
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-size orihw_O2.elf
text data bss dec hex filename
150200 2416 1528 154144 25a20 orihw_O2.elf
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-readelf -h orihw_O2.elf
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x1016c
Start of program headers: 52 (bytes into file)
Start of section headers: 172476 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
O3
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-size orihw_O3.elf
text data bss dec hex filename
150200 2416 1528 154144 25a20 orihw_O3.elf
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-readelf -h orihw_O3.elf
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x1016c
Start of program headers: 52 (bytes into file)
Start of section headers: 172476 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
Os
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-size orihw_Os.elf
text data bss dec hex filename
150200 2416 1528 154144 25a20 orihw_Os.elf
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-readelf -h orihw_Os.elf
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x1016c
Start of program headers: 52 (bytes into file)
Start of section headers: 172476 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
Ofast
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-size orihw_Ofast.elf
text data bss dec hex filename
150200 2416 1528 154144 25a20 orihw_Ofast.elf
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-readelf -h orihw_Ofast.elf
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x1016c
Start of program headers: 52 (bytes into file)
Start of section headers: 172476 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
Don't put the screenshots which contain plain text only.
Use Bitwise Operations Instead of Multiplication and Division: In the count_leading_zeros function, you use bitwise operations and a series of additions, but when calculating the number of 1s in x, you can use more bitwise operations instead of additions. For instance, you can use x = (x * 0x200040008001ULL & 0x1111111111111111) % 0xf; to replace the final additions.
Use unsigned int Instead of uint16_t: On most platforms, unsigned int might be more efficient than uint16_t because it matches the native word size of the machine.
Reduce Function Call Overhead: In the count_leading_zeros function, you can directly calculate the leading zeros and priority encoding in the main function instead of splitting it into two function calls. This can reduce the overhead of function calls.
Here is a slightly optimized version of your code:
#include <stdio.h>
#include <stdint.h>
int main() {
int input;
int input_size;
int leading_zero = 0;
int priority = 0;
printf("Enter a number: ");
scanf("%d", &input);
printf("Enter input size: ");
scanf("%d", &input_size);
// Calculate leading zeros
uint64_t x = input;
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
x |= (x >> 32);
x -= ((x >> 1) & 0x5555555555555555);
x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333);
x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f;
x = (x * 0x200040008001ULL & 0x1111111111111111) % 0xf;
leading_zero = input_size - (int)x;
// Calculate priority encoding output
if (leading_zero < 0 || leading_zero >= input_size) {
// Handle invalid input
printf("Invalid input\n");
} else {
priority = input_size - 1 - leading_zero;
printf("Leading zeros: %d\n", leading_zero);
printf("Priority output: %d\n", priority);
}
return 0;
}
O0
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-size 2opthw_O0.elf
text data bss dec hex filename
151420 2416 1528 155364 25ee4 2opthw_O0.elf
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-readelf -h 2opthw_O0.elf
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x100d8
Start of program headers: 52 (bytes into file)
Start of section headers: 172512 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
O1
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-size 2opthw_O1.elf
text data bss dec hex filename
150520 2416 1528 154464 25b60 2opthw_O1.elf
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-readelf -h 2opthw_O1.elf
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x100d8
Start of program headers: 52 (bytes into file)
Start of section headers: 172496 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
O2
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-size 2opthw_O2.elf
text data bss dec hex filename
152140 2568 1528 156236 2624c 2opthw_O2.elf
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-readelf -h 2opthw_O2.elf
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x102ec
Start of program headers: 52 (bytes into file)
Start of section headers: 173448 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
O3
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-size 2opthw_O3.elf
text data bss dec hex filename
152140 2568 1528 156236 2624c 2opthw_O3.elf
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-readelf -h 2opthw_O3.elf
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x102ec
Start of program headers: 52 (bytes into file)
Start of section headers: 173448 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
OS
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-size 2opthw_Os.elf
text data bss dec hex filename
152092 2568 1528 156188 2621c 2opthw_Os.elf
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-readelf -h 2opthw_Os.elf
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x102bc
Start of program headers: 52 (bytes into file)
Start of section headers: 173400 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
Ofast
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-size 2opthw_Ofast.elf
text data bss dec hex filename
152140 2568 1528 156236 2624c 2opthw_Ofast.elf
terry@terry-virtual-machine:~/rv32emu/tests/asm-toolchain$ riscv-none-elf-readelf -h 2opthw_Ofast.elf
ELF Header:
Magic: 7f 45 4c 46 01 01 01 00 00 00 00 00 00 00 00 00
Class: ELF32
Data: 2's complement, little endian
Version: 1 (current)
OS/ABI: UNIX - System V
ABI Version: 0
Type: EXEC (Executable file)
Machine: RISC-V
Version: 0x1
Entry point address: 0x102ec
Start of program headers: 52 (bytes into file)
Start of section headers: 173448 (bytes into file)
Flags: 0x0
Size of this header: 52 (bytes)
Size of program headers: 32 (bytes)
Number of program headers: 3
Size of section headers: 40 (bytes)
Number of section headers: 15
Section header string table index: 14
Don't put the screenshots which contain plain text only.
RISC-V
.global main
.data
input: .word 10
input_size: .word 4
.text
.align 2
.global count_leading_zeros
.type count_leading_zeros, @function
count_leading_zeros:
lw a1, 0(a0) # Load input
lw a2, 4(a0) # Load input_size
li a3, 1 # Initialize shift mask
slli a4, a3, 1 # Shift mask for OR operation
or a1, a1, a1 # OR with shifted input
slli a4, a4, 2
or a1, a1, a4
slli a4, a4, 4
or a1, a1, a4
slli a4, a4, 8
or a1, a1, a4
slli a4, a4, 16
or a1, a1, a4
slli a4, a4, 31
or a1, a1, a4
li a4, 0x5555555555555555 # Mask for population count
and a5, a1, a4
srli a5, a5, 1
and a1, a1, a5
li a4, 0x3333333333333333 # Mask for population count
and a5, a1, a4
srli a5, a5, 2
and a1, a1, a5
li a4, 0x0F0F0F0F0F0F0F0F # Mask for population count
and a5, a1, a4
srli a5, a5, 4
and a1, a1, a5
li a4, 0x00FF00FF00FF00FF # Mask for population count
and a5, a1, a4
srli a5, a5, 8
and a1, a1, a5
li a4, 0x0000FFFF0000FFFF # Mask for population count
and a5, a1, a4
srli a5, a5, 16
and a1, a1, a5
li a4, 0x00000000FFFFFFFF # Mask for population count
and a5, a1, a4
srli a5, a5, 31
and a1, a1, a5
li a6, 0x7F # Mask for 7 bits
and a1, a1, a6 # Get lowest 7 bits
sub a2, a2, a1 # input_size - count
ret
.align 2
.global priority_encoder_4bit
.type priority_encoder_4bit, @function
priority_encoder_4bit:
lw a1, 0(a0) # Load leading_zero
lw a2, 4(a0) # Load input_size
li a3, 1 # Initialize shift mask
slli a4, a3, 1 # Shift mask for OR operation
or a1, a1, a1 # OR with shifted leading_zero
slli a4, a4, 2
or a1, a1, a4
slli a4, a4, 4
or a1, a1, a4
slli a4, a4, 8
or a1, a1, a4
sub a2, a2, a1 # input_size - 1 - leading_zero
ret
.align 2
.global main
.type main, @function
main:
# Allocate space on stack
addi sp, sp, -16
# Load inputs
la a0, input # Load address of input
la a1, input_size # Load address of input_size
# Call count_leading_zeros function
call count_leading_zeros
mv a2, a1 # Move result to a2 (leading_zero)
# Load inputs for priority_encoder_4bit function
la a0, input # Load address of input
la a1, input_size # Load address of input_size
# Call priority_encoder_4bit function
call priority_encoder_4bit
mv a3, a1 # Move result to a3 (priority)
# Print results
mv a1, a2 # Move leading_zero to a1 for printing
mv a2, a3 # Move priority to a2 for printing
la a4, format # Load address of format string
li a5, 0 # No varargs
ecall # System call to print
ret
CSR | ORI | OPT |
---|---|---|
O0 | 3619 | 1967 |
O1 | 2285 | 1946 |
O2 | 1849 | 1911 |
O3 | 1849 | 1911 |
Os | 1849 | 1911 |
Ofast | 1849 | 1911 |
The original and optimized C code has both been optimized using riscv-none-elf-gcc. After applying -O2 optimization, it can be observed that the cycle count cannot be further reduced. It has reached its limit.
https://github.com/riscv-non-isa/riscv-arch-test https://github.com/sysprog21/rv32emu https://github.com/sysprog21/rv32emu/blob/master/tests/ticks.c https://github.com/sysprog21/rv32emu/blob/master/tests/perfcounter/getcycles.S https://hackmd.io/@sysprog/SJAR5XMmi