Assignment2: RISC-V Toolchain

Rewrite:Implement priority encoder using CLZ by 倪英智

Original Code

Assembly code

.data:
    nextline: .string "\n"
    input_size: .word 4
    data_1: .word 0b00000000000000000000000000000001
    data_2: .word 0b00000000000000000000000000000011
    data_3: .word 0b00000000000000000000000000000100
    data_4: .word 0b00000000000000000000000000001001
    
main:
    addi sp, sp, -20
    
    # push four pointers of test data onto the stack
    lw t0, data_1
    sw t0, 0(sp)
    lw t0, data_2
    sw t0, 4(sp)
    lw t0, data_3
    sw t0, 8(sp)
    lw t0, data_4
    sw t0, 12(sp)
    
    addi s0, x0, 4    # s0 is the iteration times(4 test case)
    addi s1, x0, 0    # s1 is counter
    addi s2, sp, 0    # s2 initial at data_1
    # Load the constants A01, A02, and mask 0x0f0f0f0f
    li a5, 0x55555555
    li a6, 0x33333333
    li t1, 0x0f0f0f0f

loop:
    lw a0, 0(s2)        #load data into a0
    addi s2, s2, 4      # s2 move to next data
    addi s1, s1, 1      # counter++
clz:
    # OR the input value with itself shifted to the right by 1, 2, 4, 8, 16, and 16 bits (total 32).
    srli a1, a0, 1   
    or a0, a0, a1 
    srli a1, a0, 2   
    or a0, a0, a1 
    srli a1, a0, 4   
    or a0, a0, a1 
    srli a1, a0, 8   
    or a0, a0, a1 
    srli a1, a0, 16  
    or a0, a0, a1 
    srli a1, a0, 16  
    srli a2, a1, 16  
    or a0, a0, a2 
  
    # Count ones (population count).
    srli a1, a0, 1
    and a4, a1, a5  
    sub a0, a0, a4
    and a1, a0, a6
    srli a2, a0, 2
    and a3, a6, a2
    add a0, a3, a1
    srli a1, a0, 4
    add a0, a1, a0
    and a0, a0, t1
    srli a1, a0, 8
    add a0,a1,a0
    srli a1, a0, 16
    add a0, a1, a0
    srli a1, a0, 16
    srli a2, a1, 16
    add a0, a0, a2
  
    # Return the number of zeros and count the priority
    andi a0, a0, 0x7f # andi a0, a0, 0x7f
    addi a1, x0, 1
    sub a0, a0, a1

  
print:
    li a7, 35
    ecall
    lw a0, nextline
    li, a7, 11
    ecall
    bne s1, s0, loop
    beq s1, s0, end
    
end:  
    li a7, 10    # Exit system call
    ecall

C code

#include <stdio.h>
#include <stdint.h>

uint16_t count_leading_zeros(uint64_t x, int input_size)
{
    x |= (x >> 1);
    x |= (x >> 2);
    x |= (x >> 4);
    x |= (x >> 8);
    x |= (x >> 16);
    x |= (x >> 32);

    /* count ones (population count) */
    x -= ((x >> 1) & 0x5555555555555555 );
    x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333 );
    x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f;
    x += (x >> 8);
    x += (x >> 16);
    x += (x >> 32);

    return (input_size - (x & 0x7f));
}

int priority_encoder_4bit(int leading_zero, int input_size) {
    if(leading_zero<0)
    {
        printf("wrong input");
        return 0;
    }
    else
    {
        return (input_size -1 -(la));
    }
}

int main() {
    int input;
    int input_size;
    int leading_zero = 0;
    int priority = 0;
    printf("Enter a  number : ");
    scanf("%4d", &input);
    printf("Enter a  input size : ");
    scanf("%4d", &input_size);
    leading_zero = count_leading_zeros(input,input_size);
    printf("%d\n",leading_zero);
    priority = priority_encoder_4bit(leading_zero, input_size);
    printf("%d\n",priority);
    return 0;
}

Manual optimization

Version 1
Adjustment:

Algorithm :Keep shift right logical until the value remain 1 and find the shifted bits.
time complexity : n/2 => worse
Print : rv32emu cannot print as like rv32i, all the output should translate to ascii code, here is my solution:
Suppose here have a 4 bits number, it will divide to 10^3 and add 48 to print, next it's turn to divide to 10^2 and do same thing in the following. Until all the bit was printed
Perfcounter : To compare the performance between the algorithm optimization
Remove the trivial code : Through the observation, there have some codes it's no need, so I remove them from original code.

Assembly code:

.org 0
# Provide program starting address to linker

.data
    data_0: .word 0b00000000000000000000000000000001
    data_1: .word 0b00000000000000000000000010000001
    data_2: .word 0b00000000000000001000000000000000
    data_3: .word 0b00000000100000000001010000000100
    data_4: .word 0b11111111111111111111111111111111
    nextline: .ascii "\n"
    .set str_size, .-nextline
    buffer: .byte 0
.text
.global main

main:
    jal ra, get_cycles
    mv t1, a0

    addi sp, sp, -20

    # push four pointers of test data onto the stack
    lw t0, data_0
    sw t0, 0(sp)
    lw t0, data_1
    sw t0, 4(sp)
    lw t0, data_2
    sw t0, 8(sp)
    lw t0, data_3
    sw t0, 12(sp)
    lw t0, data_4
    sw t0, 16(sp)
    
    addi s0, x0, 4    # s0 is the iteration times(4 test case)
    addi s1, x0, 0    # s1 is counter
    addi s2, sp, 0    # s2 initial at (0)sp
    li a5, 0
loop:
    lw a1, 0(s2)        #load data into a0

    addi a2, x0, 2
    addi a0, x0, 0

    addi s2, s2, 4      # s2 move to next data
    addi s1, s1, 1      # counter++
    blt a1, a5, ulimitcase
    blt a1, a2, print

clz:
    srli a1, a1, 1
    addi a0, a0, 1
    bge a1, a2, clz
    jal det
ulimitcase:
    li a0, 31
det:
    addi a3, x0, 10
    addi a4, x0 , 0
    blt a0, a3, print
    jal ra, stp
# a0: number(0<=x<10)
print:
    addi a0, a0, 48  # convert to ascii
    la t0, buffer    
    sb a0, 0(t0)      

    li a0, 1
    la a1, buffer
    li a2, 1
    li a7, 64
    ecall

    la a2, str_size
    lw a2, 0(a2) 
    addi a0, x0, 1
    la a1, nextline
    li a7, 64
    ecall

    addi a3, x0, 0
    bne s1, s0, loop
    beq s1, s0, end
stp:
    addi a0, a0, -10
    addi a4, a4, 1
    bge a0, a3, stp

printn:
    # input & output
    # a0: divided
    # a4: quotient
    addi a4, a4, 48
    addi sp, sp, -4
    la t0, buffer    # 
    sb a4, 0(t0)     # save quotientto buffer  
    sw a0, 0(sp)     # divided save in stack
    addi a0, x0, 1
    la a1, buffer
    addi a2, x0, 1
    li a7, 64
    ecall
    lw a0, 0(sp)     # lw divided in stack
    addi sp, sp, 4
    addi a4, x0, 0   # reset quotient  
    ret

thstp:
    addi a0, a0, -1000
    addi a4, a4, 1
    bge a0, a3, thstp
    jal ra, printn    # a0 < 1000
    jal ra, h
hstp:
    addi a0, a0, -100
    addi a4, a4, 1
    bge a0, a3, hstp
    jal ra, printn
    jal ra, t
tstp:
    addi a0, a0, -10
    addi a4, a4, 1
    bge a0, a3, tstp
    jal ra, printn
    jal ra, o
end:
    jal ra, get_cycles
    sub a0, a0, t1

th:
    # a0: cycle num divided
    addi a3, x0, 1000  # a3 = divisor
    addi a4, x0, 0     # a4 = quotient
    bge a0, a3, thstp  # a0 >= 1000
    jal ra, printn
h:
    addi a3, x0, 100
    bge a0, a3, hstp
    jal ra, printn 
t:
    addi a3, x0, 10
    bge a0, a3, tstp
    jal ra, printn
o:  
    addi a0, a0, 48
    la t0, buffer
    sb a0, 0(t0)
    li a0, 1
    la a1, buffer
    addi a2, x0, 1
    li a7, 64
    ecall  
        

    li a7, 93    # Exit system call
    ecall
get_cycles:
    csrr a1, cycleh
    csrr a0, cycle
    csrr a2, cycleh
    bne a1, a2, get_cycles
    ret

Version 2
Adjustment:

Algorithm : Still shift to right logical, but in more efficient way. First, shift right 16 bits, the three caseit will need to handle. Equal to 1, the answer is 16. Bigger than 1, the right shift bits need to add 16/2=8. Smaller than 1, the right shift bits need to subtract 16/2=8. Just separate the range of 32 bits into two parts to find the position.
time complexity : log2 => better

.org 0
# Provide program starting address to linker

.data
    data_0: .word 0b00000000000000000000000000000001
    data_1: .word 0b00000000000000000000000010000001
    data_2: .word 0b00000000000000001000000000000000
    data_3: .word 0b00000000100000000001010000000100
    data_4: .word 0b11111111111111111111111111111111
    nextline: .ascii "\n"
    .set str_size, .-nextline
    buffer: .byte 0
.text
.global main

main:
    jal ra, get_cycles
    mv t1, a0

    addi sp, sp, -20

    # push four pointers of test data onto the stack
    lw t0, data_0
    sw t0, 0(sp)
    lw t0, data_1
    sw t0, 4(sp)
    lw t0, data_2
    sw t0, 8(sp)
    lw t0, data_3
    sw t0, 12(sp)
    lw t0, data_4
    sw t0, 16(sp)
    
    addi s0, x0, 4    # s0 is the iteration times(4 test case)
    addi s1, x0, 0    # s1 is counter
    addi s2, sp, 0    # s2 initial at (0)sp
    li a5, 2
    li a6, 0
loop:
    lw a1, 0(s2)        #load data into a0

    li a0, 16
    li t2, 16
    addi s2, s2, 4      # s2 move to next data
    addi s1, s1, 1      # counter++
    blt a1, a6, ulimitcase
    blt a1, a5, llimitcase
    jal clz

right:
    sub a0, a0, t2 
    jal clz
left:
    add a0, a0, t2
clz:
    srl a2, a1, a0
    srl t2, t2, 1
    beq a2, x0, right
    bge a2, a5, left
    jal det

llimitcase:
    li a0, 0
    jal print
ulimitcase:
    li a0, 31
    
det:
    addi a3, x0, 10
    addi a4, x0 , 0
    blt a0, a3, print
    jal ra, stp
# a0: number(0<=x<10)
print:
    addi a0, a0, 48  # convert to ascii
    la t0, buffer    
    sb a0, 0(t0)      

    li a0, 1
    la a1, buffer
    li a2, 1
    li a7, 64
    ecall

    la a2, str_size
    lw a2, 0(a2) 
    addi a0, x0, 1
    la a1, nextline
    li a7, 64
    ecall

    addi a3, x0, 0
    bne s1, s0, loop
    beq s1, s0, end
stp:
    addi a0, a0, -10
    addi a4, a4, 1
    bge a0, a3, stp

printn:
    # input & output
    # a0: divided
    # a4: quotient
    addi a4, a4, 48
    addi sp, sp, -4
    la t0, buffer    # 
    sb a4, 0(t0)     # save quotientto buffer  
    sw a0, 0(sp)     # divided save in stack
    addi a0, x0, 1
    la a1, buffer
    addi a2, x0, 1
    li a7, 64
    ecall
    lw a0, 0(sp)     # lw divided in stack
    addi sp, sp, 4
    addi a4, x0, 0   # reset quotient  
    ret

thstp:
    addi a0, a0, -1000
    addi a4, a4, 1
    bge a0, a3, thstp
    jal ra, printn    # a0 < 1000
    jal ra, h
hstp:
    addi a0, a0, -100
    addi a4, a4, 1
    bge a0, a3, hstp
    jal ra, printn
    jal ra, t
tstp:
    addi a0, a0, -10
    addi a4, a4, 1
    bge a0, a3, tstp
    jal ra, printn
    jal ra, o
end:
    jal ra, get_cycles
    sub a0, a0, t1

th:
    # a0: cycle num divided
    addi a3, x0, 1000  # a3 = divisor
    addi a4, x0, 0     # a4 = quotient
    bge a0, a3, thstp  # a0 >= 1000
    jal ra, printn
h:
    addi a3, x0, 100
    bge a0, a3, hstp
    jal ra, printn 
t:
    addi a3, x0, 10
    bge a0, a3, tstp
    jal ra, printn
o:  
    addi a0, a0, 48
    la t0, buffer
    sb a0, 0(t0)
    li a0, 1
    la a1, buffer
    addi a2, x0, 1
    li a7, 64
    ecall  
        

    li a7, 93    # Exit system call
    ecall
get_cycles:
    csrr a1, cycleh
    csrr a0, cycle
    csrr a2, cycleh
    bne a1, a2, get_cycles
    ret

Compare:
Testcase:
To get the average cycles in different case, I use 5 cases as below.

0. data_0: 0b00000000000000000000000000000001
1. data_1: 0b00000000000000000000000010000001
2. data_2: 0b00000000000000001000000000000000
3. data_3: 0b00000000100000000001010000000100
4. data_4: 0b11111111111111111111111111111111

code	ori	opt_1	opt_2
cycles	393	377	320

The testcase 0 and testcase 4 are very unbalance to original code, because of the MSB in testcase is 1. Alse represent the value is negative so the code need to handle this exception, I decide to detect the numberwill output 31 once the value is negtive. In the other word, the number will skip the algorithm. Next is number 0 or 1, these two number is friendly to opt_1, the algorithm only need to shift right a few bits to get answer. Same testcase in opt_2, the algorithm cannot handle the right shift 0:

s:right shift bits
a:adjust value

s:16 a:8
s:8  a:4
s:4  a:2
s:2  a:1
s:1  a:0
s:1  a:0
...

But I need to show the average cycle it need, so I still use 5 testcase as above.

Analyse the ELF

To know the different between compiler optimization O0, O1, O2, O3, Os, and Ofast, we diaassemble the .elf file with the instruction below:

riscv-none-elf-objdump -D codename.elf

Through the campare between opt_1 and opt_2, it's clearly to choose opt_2 as target to optimize.
Here is its correspond C code:

#include <stdio.h>
#include <stdint.h>

uint16_t count_leading_zeros(uint64_t x)
{
    int pace = 16;
    int adj = 16;
    int y;
    while(x>1){
        y = x >> pace;
        adj>>=1;
        if(y>1){
            pace+=adj;
        }
        else if(y<1){
            pace-=adj;
        }
        else if(y==1){
            return (64-pace);
        }
    }
    return 64;
}

int main() {
    int input;
    scanf("%d", &input);
    printf("%d\n",64 - count_leading_zeros(input));
    return 0;
}

Because of the size of file is too large, I put them in github
In observation of the output, there have lots of trivial instrustion that it's no need in this code, because it's need to include all function in stdlib.h and stdio.h.
If we can replace the printf in stdio.h package with the assembly printf we can reduce no matter code size or execute time.

Now, to know the instruction size output from disassemble, we use the following instruction:

riscv-none-elf-size codename.elf

	-O0	-O1	-O2	-O3	-Os	-Ofast
text	51818	51478	51426	51426	51458	51426
data	1872	1872	1872	1872	1872	1872
bss	1528	1528	1528	1528	1528	1528

You shall use RDCYCLE/RDCYCLEH instruction for the statistics of your program’s execution.

Image Not Showing Possible Reasons

The image file may be corrupted
The server hosting the image is unavailable
The image path is incorrect
The image format is not supported

Learn More →

jserv

Assignment2: RISC-V Toolchain

Original Code

Manual optimization

Analyse the ELF

Read more

Linux 核心專題: 回顧 bitops 並改進

2024q1 Homework2 (quiz1+2)

Performance Analysis of the IEEE 802.11 Distributed Coordination Function

FNS 改進