contributed by david965154
Rewrite:Implement priority encoder using CLZ by 倪英智
.data:
nextline: .string "\n"
input_size: .word 4
data_1: .word 0b00000000000000000000000000000001
data_2: .word 0b00000000000000000000000000000011
data_3: .word 0b00000000000000000000000000000100
data_4: .word 0b00000000000000000000000000001001
main:
addi sp, sp, -20
# push four pointers of test data onto the stack
lw t0, data_1
sw t0, 0(sp)
lw t0, data_2
sw t0, 4(sp)
lw t0, data_3
sw t0, 8(sp)
lw t0, data_4
sw t0, 12(sp)
addi s0, x0, 4 # s0 is the iteration times(4 test case)
addi s1, x0, 0 # s1 is counter
addi s2, sp, 0 # s2 initial at data_1
# Load the constants A01, A02, and mask 0x0f0f0f0f
li a5, 0x55555555
li a6, 0x33333333
li t1, 0x0f0f0f0f
loop:
lw a0, 0(s2) #load data into a0
addi s2, s2, 4 # s2 move to next data
addi s1, s1, 1 # counter++
clz:
# OR the input value with itself shifted to the right by 1, 2, 4, 8, 16, and 16 bits (total 32).
srli a1, a0, 1
or a0, a0, a1
srli a1, a0, 2
or a0, a0, a1
srli a1, a0, 4
or a0, a0, a1
srli a1, a0, 8
or a0, a0, a1
srli a1, a0, 16
or a0, a0, a1
srli a1, a0, 16
srli a2, a1, 16
or a0, a0, a2
# Count ones (population count).
srli a1, a0, 1
and a4, a1, a5
sub a0, a0, a4
and a1, a0, a6
srli a2, a0, 2
and a3, a6, a2
add a0, a3, a1
srli a1, a0, 4
add a0, a1, a0
and a0, a0, t1
srli a1, a0, 8
add a0,a1,a0
srli a1, a0, 16
add a0, a1, a0
srli a1, a0, 16
srli a2, a1, 16
add a0, a0, a2
# Return the number of zeros and count the priority
andi a0, a0, 0x7f # andi a0, a0, 0x7f
addi a1, x0, 1
sub a0, a0, a1
print:
li a7, 35
ecall
lw a0, nextline
li, a7, 11
ecall
bne s1, s0, loop
beq s1, s0, end
end:
li a7, 10 # Exit system call
ecall
#include <stdio.h>
#include <stdint.h>
uint16_t count_leading_zeros(uint64_t x, int input_size)
{
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
x |= (x >> 32);
/* count ones (population count) */
x -= ((x >> 1) & 0x5555555555555555 );
x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333 );
x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f;
x += (x >> 8);
x += (x >> 16);
x += (x >> 32);
return (input_size - (x & 0x7f));
}
int priority_encoder_4bit(int leading_zero, int input_size) {
if(leading_zero<0)
{
printf("wrong input");
return 0;
}
else
{
return (input_size -1 -(la));
}
}
int main() {
int input;
int input_size;
int leading_zero = 0;
int priority = 0;
printf("Enter a number : ");
scanf("%4d", &input);
printf("Enter a input size : ");
scanf("%4d", &input_size);
leading_zero = count_leading_zeros(input,input_size);
printf("%d\n",leading_zero);
priority = priority_encoder_4bit(leading_zero, input_size);
printf("%d\n",priority);
return 0;
}
Version 1
Adjustment:
time complexity : n/2 => worse
Assembly code:
.org 0
# Provide program starting address to linker
.data
data_0: .word 0b00000000000000000000000000000001
data_1: .word 0b00000000000000000000000010000001
data_2: .word 0b00000000000000001000000000000000
data_3: .word 0b00000000100000000001010000000100
data_4: .word 0b11111111111111111111111111111111
nextline: .ascii "\n"
.set str_size, .-nextline
buffer: .byte 0
.text
.global main
main:
jal ra, get_cycles
mv t1, a0
addi sp, sp, -20
# push four pointers of test data onto the stack
lw t0, data_0
sw t0, 0(sp)
lw t0, data_1
sw t0, 4(sp)
lw t0, data_2
sw t0, 8(sp)
lw t0, data_3
sw t0, 12(sp)
lw t0, data_4
sw t0, 16(sp)
addi s0, x0, 4 # s0 is the iteration times(4 test case)
addi s1, x0, 0 # s1 is counter
addi s2, sp, 0 # s2 initial at (0)sp
li a5, 0
loop:
lw a1, 0(s2) #load data into a0
addi a2, x0, 2
addi a0, x0, 0
addi s2, s2, 4 # s2 move to next data
addi s1, s1, 1 # counter++
blt a1, a5, ulimitcase
blt a1, a2, print
clz:
srli a1, a1, 1
addi a0, a0, 1
bge a1, a2, clz
jal det
ulimitcase:
li a0, 31
det:
addi a3, x0, 10
addi a4, x0 , 0
blt a0, a3, print
jal ra, stp
# a0: number(0<=x<10)
print:
addi a0, a0, 48 # convert to ascii
la t0, buffer
sb a0, 0(t0)
li a0, 1
la a1, buffer
li a2, 1
li a7, 64
ecall
la a2, str_size
lw a2, 0(a2)
addi a0, x0, 1
la a1, nextline
li a7, 64
ecall
addi a3, x0, 0
bne s1, s0, loop
beq s1, s0, end
stp:
addi a0, a0, -10
addi a4, a4, 1
bge a0, a3, stp
printn:
# input & output
# a0: divided
# a4: quotient
addi a4, a4, 48
addi sp, sp, -4
la t0, buffer #
sb a4, 0(t0) # save quotientto buffer
sw a0, 0(sp) # divided save in stack
addi a0, x0, 1
la a1, buffer
addi a2, x0, 1
li a7, 64
ecall
lw a0, 0(sp) # lw divided in stack
addi sp, sp, 4
addi a4, x0, 0 # reset quotient
ret
thstp:
addi a0, a0, -1000
addi a4, a4, 1
bge a0, a3, thstp
jal ra, printn # a0 < 1000
jal ra, h
hstp:
addi a0, a0, -100
addi a4, a4, 1
bge a0, a3, hstp
jal ra, printn
jal ra, t
tstp:
addi a0, a0, -10
addi a4, a4, 1
bge a0, a3, tstp
jal ra, printn
jal ra, o
end:
jal ra, get_cycles
sub a0, a0, t1
th:
# a0: cycle num divided
addi a3, x0, 1000 # a3 = divisor
addi a4, x0, 0 # a4 = quotient
bge a0, a3, thstp # a0 >= 1000
jal ra, printn
h:
addi a3, x0, 100
bge a0, a3, hstp
jal ra, printn
t:
addi a3, x0, 10
bge a0, a3, tstp
jal ra, printn
o:
addi a0, a0, 48
la t0, buffer
sb a0, 0(t0)
li a0, 1
la a1, buffer
addi a2, x0, 1
li a7, 64
ecall
li a7, 93 # Exit system call
ecall
get_cycles:
csrr a1, cycleh
csrr a0, cycle
csrr a2, cycleh
bne a1, a2, get_cycles
ret
Version 2
Adjustment:
16/2=8
. Smaller than 1, the right shift bits need to subtract 16/2=8
. Just separate the range of 32 bits into two parts to find the position.time complexity : log2 => better
.org 0
# Provide program starting address to linker
.data
data_0: .word 0b00000000000000000000000000000001
data_1: .word 0b00000000000000000000000010000001
data_2: .word 0b00000000000000001000000000000000
data_3: .word 0b00000000100000000001010000000100
data_4: .word 0b11111111111111111111111111111111
nextline: .ascii "\n"
.set str_size, .-nextline
buffer: .byte 0
.text
.global main
main:
jal ra, get_cycles
mv t1, a0
addi sp, sp, -20
# push four pointers of test data onto the stack
lw t0, data_0
sw t0, 0(sp)
lw t0, data_1
sw t0, 4(sp)
lw t0, data_2
sw t0, 8(sp)
lw t0, data_3
sw t0, 12(sp)
lw t0, data_4
sw t0, 16(sp)
addi s0, x0, 4 # s0 is the iteration times(4 test case)
addi s1, x0, 0 # s1 is counter
addi s2, sp, 0 # s2 initial at (0)sp
li a5, 2
li a6, 0
loop:
lw a1, 0(s2) #load data into a0
li a0, 16
li t2, 16
addi s2, s2, 4 # s2 move to next data
addi s1, s1, 1 # counter++
blt a1, a6, ulimitcase
blt a1, a5, llimitcase
jal clz
right:
sub a0, a0, t2
jal clz
left:
add a0, a0, t2
clz:
srl a2, a1, a0
srl t2, t2, 1
beq a2, x0, right
bge a2, a5, left
jal det
llimitcase:
li a0, 0
jal print
ulimitcase:
li a0, 31
det:
addi a3, x0, 10
addi a4, x0 , 0
blt a0, a3, print
jal ra, stp
# a0: number(0<=x<10)
print:
addi a0, a0, 48 # convert to ascii
la t0, buffer
sb a0, 0(t0)
li a0, 1
la a1, buffer
li a2, 1
li a7, 64
ecall
la a2, str_size
lw a2, 0(a2)
addi a0, x0, 1
la a1, nextline
li a7, 64
ecall
addi a3, x0, 0
bne s1, s0, loop
beq s1, s0, end
stp:
addi a0, a0, -10
addi a4, a4, 1
bge a0, a3, stp
printn:
# input & output
# a0: divided
# a4: quotient
addi a4, a4, 48
addi sp, sp, -4
la t0, buffer #
sb a4, 0(t0) # save quotientto buffer
sw a0, 0(sp) # divided save in stack
addi a0, x0, 1
la a1, buffer
addi a2, x0, 1
li a7, 64
ecall
lw a0, 0(sp) # lw divided in stack
addi sp, sp, 4
addi a4, x0, 0 # reset quotient
ret
thstp:
addi a0, a0, -1000
addi a4, a4, 1
bge a0, a3, thstp
jal ra, printn # a0 < 1000
jal ra, h
hstp:
addi a0, a0, -100
addi a4, a4, 1
bge a0, a3, hstp
jal ra, printn
jal ra, t
tstp:
addi a0, a0, -10
addi a4, a4, 1
bge a0, a3, tstp
jal ra, printn
jal ra, o
end:
jal ra, get_cycles
sub a0, a0, t1
th:
# a0: cycle num divided
addi a3, x0, 1000 # a3 = divisor
addi a4, x0, 0 # a4 = quotient
bge a0, a3, thstp # a0 >= 1000
jal ra, printn
h:
addi a3, x0, 100
bge a0, a3, hstp
jal ra, printn
t:
addi a3, x0, 10
bge a0, a3, tstp
jal ra, printn
o:
addi a0, a0, 48
la t0, buffer
sb a0, 0(t0)
li a0, 1
la a1, buffer
addi a2, x0, 1
li a7, 64
ecall
li a7, 93 # Exit system call
ecall
get_cycles:
csrr a1, cycleh
csrr a0, cycle
csrr a2, cycleh
bne a1, a2, get_cycles
ret
Compare:
Testcase:
To get the average cycles in different case, I use 5 cases as below.
0. data_0: 0b00000000000000000000000000000001
1. data_1: 0b00000000000000000000000010000001
2. data_2: 0b00000000000000001000000000000000
3. data_3: 0b00000000100000000001010000000100
4. data_4: 0b11111111111111111111111111111111
code | ori | opt_1 | opt_2 |
---|---|---|---|
cycles | 393 | 377 | 320 |
The testcase 0 and testcase 4 are very unbalance to original code, because of the MSB in testcase is 1. Alse represent the value is negative so the code need to handle this exception, I decide to detect the numberwill output 31 once the value is negtive. In the other word, the number will skip the algorithm. Next is number 0 or 1, these two number is friendly to opt_1, the algorithm only need to shift right a few bits to get answer. Same testcase in opt_2, the algorithm cannot handle the right shift 0:
s:right shift bits
a:adjust value
s:16 a:8
s:8 a:4
s:4 a:2
s:2 a:1
s:1 a:0
s:1 a:0
...
But I need to show the average cycle it need, so I still use 5 testcase as above.
To know the different between compiler optimization O0, O1, O2, O3, Os, and Ofast, we diaassemble the .elf file with the instruction below:
riscv-none-elf-objdump -D codename.elf
Through the campare between opt_1 and opt_2, it's clearly to choose opt_2 as target to optimize.
Here is its correspond C code:
#include <stdio.h>
#include <stdint.h>
uint16_t count_leading_zeros(uint64_t x)
{
int pace = 16;
int adj = 16;
int y;
while(x>1){
y = x >> pace;
adj>>=1;
if(y>1){
pace+=adj;
}
else if(y<1){
pace-=adj;
}
else if(y==1){
return (64-pace);
}
}
return 64;
}
int main() {
int input;
scanf("%d", &input);
printf("%d\n",64 - count_leading_zeros(input));
return 0;
}
Because of the size of file is too large, I put them in github
In observation of the output, there have lots of trivial instrustion that it's no need in this code, because it's need to include all function in stdlib.h
and stdio.h
.
If we can replace the printf in stdio.h
package with the assembly printf
we can reduce no matter code size or execute time.
Now, to know the instruction size output from disassemble, we use the following instruction:
riscv-none-elf-size codename.elf
-O0 | -O1 | -O2 | -O3 | -Os | -Ofast | |
---|---|---|---|---|---|---|
text | 51818 | 51478 | 51426 | 51426 | 51458 | 51426 |
data | 1872 | 1872 | 1872 | 1872 | 1872 | 1872 |
bss | 1528 | 1528 | 1528 | 1528 | 1528 | 1528 |
You shall use RDCYCLE/RDCYCLEH instruction for the statistics of your program’s execution.