Try   HackMD

2022 Assignment2: RISC-V Toolchain

tags: Computer Architecture

contribute by <chiangkd>

Requirement follow Assignment2: RISC-V Toolchain

Pick up one Problem

Pick up one assembly program and adapt it into both RISC-V assembly and C implementations which can be flawlessly executed with rv32emu.

I choose the problem from 曾晧峖-Valid Anagram(Leetcode 242). The reason why I choose this problem is that when I go through the classmates homework from Assignment1: RISC-V Assembly and Instruction Pipeline. I notice that there is seldom problem is about string or character. I think only less than 5 problems is related. However, I want to know how string operation can efficiently run under RISC-V architecture.

Original Solution by 曾晧峖

C code

bool isAnagram(char * s, char * t)
{
    int letter_freq[26] = {0}; //store frequency of every letter
	
    // calculate frequency of every letter of s
    for (int i = 0; s[i]; i++) {
        letter_freq[s[i] - 'a']++;
    }
    // calculate frequency of every letter of t
    for (int i = 0; t[i]; i++) {
        letter_freq[t[i] - 'a']--;
    }
    // if letter_freq[i] != 0, it mean in letter char(i + 'a') frequency of letter of s and t are not same. 
    for (int i = 0; i < 26; i++) {
        if (letter_freq[i]) 
            return false;
    }
    return true; // all of letter of frequency are same
}

Handwrite Assembly code (Modified)

# RISC-V assembly program to print "Hello World!" to stdout.

.org 0
# Provide program starting address to linker
.global _start
/* newlib system calls */
.set SYSEXIT,  93
.set SYSWRITE, 64

.data

test_1_s: .string "anagram"
test_1_t: .string "nagaram"
test_2_s: .string "rat"
test_2_t: .string "anagram"
test_3_s: .string "tseng"
test_3_t: .string "gnest"

correct_1:       .string "test_1: Pass\n"
                 .set t1cor_size, .-correct_1
not_correct_1:   .string "test_1: Fail\n"
                 .set t1incor_size, .-not_correct_1
correct_2:       .string "test_2: Pass\n"
                 .set t2cor_size, .-correct_2
not_correct_2:   .string "test_2: Fail\n"
                 .set t2incor_size, .-not_correct_2
correct_3:       .string "test_3: Pass\n"
                 .set t3cor_size, .-correct_3
not_correct_3:   .string "test_3: Fail\n"
                 .set t3incor_size, .-not_correct_3

.text

_start: 
    li a7, SYSWRITE        # "write" system call
    li a0, 1               #  1 = standard output (stdout)
    jal ra, TEST_1
    jal ra, TEST_2
    jal ra, TEST_3
    j end

TEST_1:
    addi sp, sp, -4
    sw ra, 0(sp)
    la a0, test_1_s        # s(a0) = test_1_s  
    la a1, test_1_t        # t(a1) = test_1_t
    jal ra, isAnagram      # call isAnagram(s(a0), t(a1))
    bne a0, x0, TRUE_1     # if isAnagram(s(a0), t(a1)) == 1 correct
    li a0, 1               # reload a0 to handle stdout
    la a1, not_correct_1   # test1 not correct address
    la a2, t1incor_size    # test1 not correct length
    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra

TEST_2:
    addi sp, sp, -4
    sw ra, 0(sp)
    la a0, test_2_s        # s(a0) = test_2_s
    la a1, test_2_t        # t(a1) = test_2_t
    jal ra, isAnagram	   # call isAnagram(s(a0), t(a1))
    beq a0, x0, TRUE_2	   # if isAnagram(s(a0), t(a1)) == 0 correct

    la a1, not_correct_2   # test2 not correct address
    la a2, t2incor_size    # test not correct length
    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra

TEST_3:
    addi sp, sp, -4
    sw ra, 0(sp)
    la a0, test_3_s        # s(a0) = test_3_s
    la a1, test_3_t        # t(a1) = test_3_t
    jal ra, isAnagram	   # call isAnagram(s(a0), t(a1))
    bne a0, x0, TRUE_3     # if isAnagram(s(a0), t(a1)) == 1 correct
    li a0, 1               # reload a0 to handle stdout
    la a1, not_correct_3   # test3 not correct address
    la a2, t3incor_size    # test3 not correct length
    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra

TRUE_1:
    la a1, correct_1     # test1 correct address
    la a2, t1cor_size    # test1 correct length
    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra               # go to example2

TRUE_2:
    li a0, 1             # reload a0 to handle stdout 
    la a1, correct_2     # test2 correct address
    la a2, t2cor_size    # test2 correct length
    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra              # go to example3

TRUE_3:
    la a1, correct_3     # test2 correct address
    la a2, t3cor_size    # test2 correct length
    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra
    
isAnagram:                  # a0 = s, a1 = t
    addi sp, sp, -104       # get sapce for store int letter_freq[26]
    addi t0, sp, 0          # t0 = letter_freq[0]
    addi t1, x0, 0          # t1 = i = 0
    li t2, 26
LOOP1:                      # int letter_freq[26] = {0};
    beq t1, t2, GET_FREQ_s  # if i < 26 
    sw x0, 0(t0)            # letter_freq[i] = 0;
    addi t1, t1, 1          # i++;
    addi t0, t0, 4          # 
    j LOOP1
	
GET_FREQ_s:
    addi t0, sp, 0         # t0 = letter_freq[0]
    addi t1, a0, 0         # t1 = s
    addi t2, x0, 0         # t2 = i = 0
LOOP2:                            # for(  ;s[i] ;i++ )
    add  t3, t1, t2         # get address of s[index] from s[0] + 
    lb t5, 0(t3)            # t5 = s[i]
    beq t5, x0, GET_FREQ_F  # if s[i] ==0 break the loop
    addi t5, t5, -97        # t5 = s[i] - 'a'
    slli t5, t5, 2          # get offset form letter_freq[0] to letter_freq[s[i] - 'a']
    add t5, t5, t0          # get address of letter_freq[s[i] - 'a']
    lw t3, 0(t5)            # t3 = [freq[s[i] - 'a']]
    addi t3, t3, 1          # t3 = [freq[s[i] - 'a']] + 1
    sw t3, 0(t5)            # [freq[s[i] - 'a']] = ([freq[s[i] - 'a']] + 1) 
    addi t2, t2, 1          # i++
    j LOOP2
GET_FREQ_F:
    addi t0, sp, 0         # t0 = freq[]
    addi t1, a1, 0         # t1 = t
    addi t2, x0, 0         # t2 = i = 0
LOOP3:                      # for(  ;t[i] ;i++ )
    add  t3, t1, t2         # get address of t[index]
    lb t5, 0(t3)            # t5 = t[i]
    beq t5, x0, CHECK       # if t[i] == 0 break the loop
    addi t5, t5, -97        # t5 = t[i] - 'a'
    slli t5, t5, 2          # get offset form letter_freq[0] to letter_freq[t[i] - 'a']
    add t5, t5, t0        
    lw t3, 0(t5)            # t5 = [freq[t[i] - 'a']]
    addi t3, t3, -1         # t3 = [freq[t[i] - 'a']] - 1
    sw t3, 0(t5)            # [freq[t[i] - 'a']] = ([freq[t[i] - 'a']] - 1) 
    addi t2, t2, 1          # i++
    j LOOP3
CHECK:
    addi t0, sp, 0 # t0 = address freq[0]
    addi t1, x0, 0 # t1 = i = 0
    li t2, 26
LOOP4:                     # for (int i = 0; i < 26; i++)
    beq t1, t2, TRUE       # i < 26
    lw t3, 0(t0)           # t3 = freq[i];
    bne t3, x0, FALSE      # if freq[i] != 0 break
    addi t1, t1, 1         # i++;
    addi t0, t0, 4         # freq + 1
    j LOOP4

FALSE:
    addi a0, x0, 0          # if false return false
    j END_F
TRUE:
    addi a0, x0, 1          # if pass check return true
END_F:
    addi sp, sp, 104
    jr ra                   # return,  a0 = return value = true or false

end:
    li a7, SYSEXIT      # "exit" syscall
    add a0, x0, 0       # Use 0 return code
    ecall               # invoke syscall to terminate the program

There is some problem when I adapt origin assembly code to execute with rv32emu, such as

  • bne a0, x0 TRUE_1 need to write as bne a0, x0, TRUE_1
  • Some system call need more register than Ripes, so some register may be overwriten.
  • More detail is written in Note, github

Original code run each testcase right after previous testcase, but I modify to run it after jump back to main

Origin







g



main

_start



testcase1

testcase1



main->testcase1





testcase2

testcase2



testcase1->testcase2





testcase3

testcase3



testcase2->testcase3





end

end



testcase3->end





Modified







g



main

_start

testcase1

testcase2

testcase3



testcase1

testcase1



main:ref1->testcase1





testcase2

testcase2



main:ref2->testcase2





testcase3

testcase3



main:ref3->testcase3





testcase1->main:ref2





testcase2->main:ref3





end

end



testcase3->end





Disassemble the ELF files and Analysis

Compile by RISC-V GNU toolchain

riscv-none-elf-gcc -Wall -march=rv32i -mabi=ilp32 hw2_origin.c -o hw2_origin.elf

It can successfully run with expected answer.

-O0(default)

Optimize for compilation time (which is the default optimization mode)

riscv-none-elf-gcc -Wall -march=rv32i -mabi=ilp32 -O0 hw2_origin.c -o hw2_origin_00.elf

Size

riscv-none-elf-size hw2_origin_O0.elf

text	   data	    bss	    dec	    hex	filename
1914	   1096	     61	   3071	    bff	hw2_origin_O0.elf

Disassembly code

riscv-none-elf-objdump -d hw2_origin_O0.elf >dis_O0.txt

00010184 <isAnagram>: 10184: f6010113 addi sp,sp,-160 10188: 08112e23 sw ra,156(sp) 1018c: 08812c23 sw s0,152(sp) 10190: 0a010413 addi s0,sp,160 10194: f6a42623 sw a0,-148(s0) 10198: f6b42423 sw a1,-152(s0) 1019c: f7c40793 addi a5,s0,-132 101a0: 06800713 li a4,104 101a4: 00070613 mv a2,a4 101a8: 00000593 li a1,0 101ac: 00078513 mv a0,a5 101b0: 328000ef jal ra,104d8 <memset> 101b4: fe042623 sw zero,-20(s0) 101b8: 0480006f j 10200 <isAnagram+0x7c> 101bc: fec42783 lw a5,-20(s0) 101c0: f6c42703 lw a4,-148(s0) 101c4: 00f707b3 add a5,a4,a5 101c8: 0007c783 lbu a5,0(a5) 101cc: f9f78713 addi a4,a5,-97 101d0: 00271793 slli a5,a4,0x2 101d4: ff078793 addi a5,a5,-16 101d8: 008787b3 add a5,a5,s0 101dc: f8c7a783 lw a5,-116(a5) 101e0: 00178693 addi a3,a5,1 101e4: 00271793 slli a5,a4,0x2 101e8: ff078793 addi a5,a5,-16 101ec: 008787b3 add a5,a5,s0 101f0: f8d7a623 sw a3,-116(a5) 101f4: fec42783 lw a5,-20(s0) 101f8: 00178793 addi a5,a5,1 101fc: fef42623 sw a5,-20(s0) 10200: fec42783 lw a5,-20(s0) 10204: f6c42703 lw a4,-148(s0) 10208: 00f707b3 add a5,a4,a5 1020c: 0007c783 lbu a5,0(a5) 10210: fa0796e3 bnez a5,101bc <isAnagram+0x38> 10214: fe042423 sw zero,-24(s0) 10218: 0480006f j 10260 <isAnagram+0xdc> 1021c: fe842783 lw a5,-24(s0) 10220: f6842703 lw a4,-152(s0) 10224: 00f707b3 add a5,a4,a5 10228: 0007c783 lbu a5,0(a5) 1022c: f9f78713 addi a4,a5,-97 10230: 00271793 slli a5,a4,0x2 10234: ff078793 addi a5,a5,-16 10238: 008787b3 add a5,a5,s0 1023c: f8c7a783 lw a5,-116(a5) 10240: fff78693 addi a3,a5,-1 10244: 00271793 slli a5,a4,0x2 10248: ff078793 addi a5,a5,-16 1024c: 008787b3 add a5,a5,s0 10250: f8d7a623 sw a3,-116(a5) 10254: fe842783 lw a5,-24(s0) 10258: 00178793 addi a5,a5,1 1025c: fef42423 sw a5,-24(s0) 10260: fe842783 lw a5,-24(s0) 10264: f6842703 lw a4,-152(s0) 10268: 00f707b3 add a5,a4,a5 1026c: 0007c783 lbu a5,0(a5) 10270: fa0796e3 bnez a5,1021c <isAnagram+0x98> 10274: fe042223 sw zero,-28(s0) 10278: 0300006f j 102a8 <isAnagram+0x124> 1027c: fe442783 lw a5,-28(s0) 10280: 00279793 slli a5,a5,0x2 10284: ff078793 addi a5,a5,-16 10288: 008787b3 add a5,a5,s0 1028c: f8c7a783 lw a5,-116(a5) 10290: 00078663 beqz a5,1029c <isAnagram+0x118> 10294: 00000793 li a5,0 10298: 0200006f j 102b8 <isAnagram+0x134> 1029c: fe442783 lw a5,-28(s0) 102a0: 00178793 addi a5,a5,1 102a4: fef42223 sw a5,-28(s0) 102a8: fe442703 lw a4,-28(s0) 102ac: 01900793 li a5,25 102b0: fce7d6e3 bge a5,a4,1027c <isAnagram+0xf8> 102b4: 00100793 li a5,1 102b8: 00078513 mv a0,a5 102bc: 09c12083 lw ra,156(sp) 102c0: 09812403 lw s0,152(sp) 102c4: 0a010113 addi sp,sp,160 102c8: 00008067 ret 000102cc <main>: 102cc: fd010113 addi sp,sp,-48 102d0: 02112623 sw ra,44(sp) 102d4: 02812423 sw s0,40(sp) 102d8: 03010413 addi s0,sp,48 102dc: 000147b7 lui a5,0x14 102e0: b3c78793 addi a5,a5,-1220 # 13b3c <__modsi3+0x30> 102e4: fef42623 sw a5,-20(s0) 102e8: 000147b7 lui a5,0x14 102ec: b4478793 addi a5,a5,-1212 # 13b44 <__modsi3+0x38> 102f0: fef42423 sw a5,-24(s0) 102f4: 000147b7 lui a5,0x14 102f8: b4c78793 addi a5,a5,-1204 # 13b4c <__modsi3+0x40> 102fc: fef42223 sw a5,-28(s0) 10300: 000147b7 lui a5,0x14 10304: b3c78793 addi a5,a5,-1220 # 13b3c <__modsi3+0x30> 10308: fef42023 sw a5,-32(s0) 1030c: 000147b7 lui a5,0x14 10310: b5078793 addi a5,a5,-1200 # 13b50 <__modsi3+0x44> 10314: fcf42e23 sw a5,-36(s0) 10318: 000147b7 lui a5,0x14 1031c: b5878793 addi a5,a5,-1192 # 13b58 <__modsi3+0x4c> 10320: fcf42c23 sw a5,-40(s0) 10324: fe842583 lw a1,-24(s0) 10328: fec42503 lw a0,-20(s0) 1032c: e59ff0ef jal ra,10184 <isAnagram> 10330: 00050713 mv a4,a0 10334: 00100793 li a5,1 10338: 00f71a63 bne a4,a5,1034c <main+0x80> 1033c: 000147b7 lui a5,0x14 10340: b6078513 addi a0,a5,-1184 # 13b60 <__modsi3+0x54> 10344: 39c000ef jal ra,106e0 <puts> 10348: 0100006f j 10358 <main+0x8c> 1034c: 000147b7 lui a5,0x14 10350: b7078513 addi a0,a5,-1168 # 13b70 <__modsi3+0x64> 10354: 38c000ef jal ra,106e0 <puts> 10358: fe042583 lw a1,-32(s0) 1035c: fe442503 lw a0,-28(s0) 10360: e25ff0ef jal ra,10184 <isAnagram> 10364: 00050793 mv a5,a0 10368: 00079a63 bnez a5,1037c <main+0xb0> 1036c: 000147b7 lui a5,0x14 10370: b8478513 addi a0,a5,-1148 # 13b84 <__modsi3+0x78> 10374: 36c000ef jal ra,106e0 <puts> 10378: 0100006f j 10388 <main+0xbc> 1037c: 000147b7 lui a5,0x14 10380: b9478513 addi a0,a5,-1132 # 13b94 <__modsi3+0x88> 10384: 35c000ef jal ra,106e0 <puts> 10388: fd842583 lw a1,-40(s0) 1038c: fdc42503 lw a0,-36(s0) 10390: df5ff0ef jal ra,10184 <isAnagram> 10394: 00050713 mv a4,a0 10398: 00100793 li a5,1 1039c: 00f71a63 bne a4,a5,103b0 <main+0xe4> 103a0: 000147b7 lui a5,0x14 103a4: ba878513 addi a0,a5,-1112 # 13ba8 <__modsi3+0x9c> 103a8: 338000ef jal ra,106e0 <puts> 103ac: 0100006f j 103bc <main+0xf0> 103b0: 000147b7 lui a5,0x14 103b4: bb878513 addi a0,a5,-1096 # 13bb8 <__modsi3+0xac> 103b8: 328000ef jal ra,106e0 <puts> 103bc: 00000793 li a5,0 103c0: 00078513 mv a0,a5 103c4: 02c12083 lw ra,44(sp) 103c8: 02812403 lw s0,40(sp) 103cc: 03010113 addi sp,sp,48 103d0: 00008067 ret

../../build/rv32emu hw2_origin_0O.elf
It can run succesfully.

Observation

  • LOC: 82
  • Rigister used
    • S register: s0
    • A register: a0 to a5
    • T register: none
    • Other: ra
    • Total: 8
  • Stack used: 160 bytes
  • Number of load/save: 34
  • Branch Number: 9

Explanation

<main>
  • From line 90 to line 107 stored three testcases' address into stack, from -16(s0) to -40(s0).
  • line 108 and line 109 load testcase address to a1, a0 as function argument
    • a1: -24(s0) stands for test_1_t in C
    • a0: -20(s0) stands for test_1_s in C
  • line 110 to line 113 call function and a0 as the return value (1 or 0), and then move(mv) a0 to a4
    • a4 stands for isAnagram True or False
    • a5 stands for the answer(is True) for this testcase.
  • line 113 bne a4,a5,1034c <main+0x80>, make a decision if a4 equals to a5,
    • if equals (means testcase PASS), print test_1: correct\n
    • if not equals (means testcase FAIL), print test_1: not_correct\n

Remaining is three duplicated work for testing three testcases, and the end is load word and return.

<isAnagram>

Argument input

  • a0 for test_1_s
  • a1 for test_1_t

Just simply go through without any optimization.

-O1

Try to reduce code size and execution time without increasing compilation time too much.

riscv-none-elf-gcc -Wall -march=rv32i -mabi=ilp32 -O1 hw2_origin.c -o hw2_origin_O1.elf

riscv-none-elf-size hw2_origin_O1.elf

Size

text	   data	    bss	    dec	    hex	filename
1710	   1096	     61	   2867	    b33	hw2_origin_O1.elf

Disassembly code

riscv-none-elf-objdump -d hw2_origin_O1.elf > dis_O1.txt

00010184 <isAnagram>: 10184: f8010113 addi sp,sp,-128 10188: 06112e23 sw ra,124(sp) 1018c: 06812c23 sw s0,120(sp) 10190: 06912a23 sw s1,116(sp) 10194: 00050493 mv s1,a0 10198: 00058413 mv s0,a1 1019c: 06800613 li a2,104 101a0: 00000593 li a1,0 101a4: 00810513 addi a0,sp,8 101a8: 214000ef jal ra,103bc <memset> 101ac: 0004c783 lbu a5,0(s1) 101b0: 02078863 beqz a5,101e0 <isAnagram+0x5c> 101b4: 00148513 addi a0,s1,1 101b8: f9f78793 addi a5,a5,-97 101bc: 00279793 slli a5,a5,0x2 101c0: 07078793 addi a5,a5,112 101c4: 002787b3 add a5,a5,sp 101c8: f987a703 lw a4,-104(a5) 101cc: 00170713 addi a4,a4,1 101d0: f8e7ac23 sw a4,-104(a5) 101d4: 00150513 addi a0,a0,1 101d8: fff54783 lbu a5,-1(a0) 101dc: fc079ee3 bnez a5,101b8 <isAnagram+0x34> 101e0: 00044783 lbu a5,0(s0) 101e4: 02078863 beqz a5,10214 <isAnagram+0x90> 101e8: 00140593 addi a1,s0,1 101ec: f9f78793 addi a5,a5,-97 101f0: 00279793 slli a5,a5,0x2 101f4: 07078793 addi a5,a5,112 101f8: 002787b3 add a5,a5,sp 101fc: f987a703 lw a4,-104(a5) 10200: fff70713 addi a4,a4,-1 10204: f8e7ac23 sw a4,-104(a5) 10208: 00158593 addi a1,a1,1 1020c: fff5c783 lbu a5,-1(a1) 10210: fc079ee3 bnez a5,101ec <isAnagram+0x68> 10214: 00810793 addi a5,sp,8 10218: 07010693 addi a3,sp,112 1021c: 0007a703 lw a4,0(a5) 10220: 00071a63 bnez a4,10234 <isAnagram+0xb0> 10224: 00478793 addi a5,a5,4 10228: fed79ae3 bne a5,a3,1021c <isAnagram+0x98> 1022c: 00100513 li a0,1 10230: 0080006f j 10238 <isAnagram+0xb4> 10234: 00000513 li a0,0 10238: 07c12083 lw ra,124(sp) 1023c: 07812403 lw s0,120(sp) 10240: 07412483 lw s1,116(sp) 10244: 08010113 addi sp,sp,128 10248: 00008067 ret 0001024c <main>: 1024c: fe010113 addi sp,sp,-32 10250: 00112e23 sw ra,28(sp) 10254: 000105b7 lui a1,0x10 10258: 71c58593 addi a1,a1,1820 # 1071c <__errno+0x8> 1025c: 00010537 lui a0,0x10 10260: 72450513 addi a0,a0,1828 # 10724 <__errno+0x10> 10264: f21ff0ef jal ra,10184 <isAnagram> 10268: 00a12623 sw a0,12(sp) 1026c: 000105b7 lui a1,0x10 10270: 72c58593 addi a1,a1,1836 # 1072c <__errno+0x18> 10274: 00010537 lui a0,0x10 10278: 73050513 addi a0,a0,1840 # 10730 <__errno+0x1c> 1027c: f09ff0ef jal ra,10184 <isAnagram> 10280: 00a12423 sw a0,8(sp) 10284: 000105b7 lui a1,0x10 10288: 73458593 addi a1,a1,1844 # 10734 <__errno+0x20> 1028c: 00010537 lui a0,0x10 10290: 73c50513 addi a0,a0,1852 # 1073c <__errno+0x28> 10294: ef1ff0ef jal ra,10184 <isAnagram> 10298: 00a12223 sw a0,4(sp) 1029c: 00c12783 lw a5,12(sp) 102a0: 00812783 lw a5,8(sp) 102a4: 00412783 lw a5,4(sp) 102a8: 00000513 li a0,0 102ac: 01c12083 lw ra,28(sp) 102b0: 02010113 addi sp,sp,32 102b4: 00008067 ret

Observation

  • LOC: 50
  • Rigister used
    • S register: s0, s1
    • A register: a0 to a5
    • T register: none
    • Other: ra
    • Total: 9
  • Stack used: 128 bytes
  • Number of load and save: 15
  • Branch Number: 8

../../build/rv32emu hw2_origin_O1.elf
It can run successfully

Explanation

-O1 optimization generate much more shorter code

<main>
  • line 56 to line 59 load a1, a0 for testcase
    • a1: stands for test_1_t in C
    • a0: stands for test_1_s in C
  • line 60 call isAnagram with associate input, and return a0 (1 if TRUE, 0 if FALSE)
  • Remaining are three duplicated code for each three testcases.
<isAnagram>

Argument input

  • a0 for test_1_s
  • a1 for test_1_t
  • Detail is writen in Note

-O2

Optimize more

riscv-none-elf-gcc -Wall -march=rv32i -mabi=ilp32 -O2 hw2_origin.c -o hw2_origin_O2.elf

Size

riscv-none-elf-size hw2_origin_O2.elf

text	   data	    bss	    dec	    hex	filename
1738	   1096	     61	   2895	    b4f	hw2_origin_O2.elf

Disassembly code

riscv-none-elf-objdump -d hw2_origin_O2.elf > dis_O2.txt

000100c4 <main>: 100c4: 000105b7 lui a1,0x10 100c8: 00010537 lui a0,0x10 100cc: fe010113 addi sp,sp,-32 100d0: 73858593 addi a1,a1,1848 # 10738 <__errno+0x8> 100d4: 74050513 addi a0,a0,1856 # 10740 <__errno+0x10> 100d8: 00112e23 sw ra,28(sp) 100dc: 11c000ef jal ra,101f8 <isAnagram> 100e0: 00050793 mv a5,a0 100e4: 000105b7 lui a1,0x10 100e8: 00010537 lui a0,0x10 100ec: 74858593 addi a1,a1,1864 # 10748 <__errno+0x18> 100f0: 74c50513 addi a0,a0,1868 # 1074c <__errno+0x1c> 100f4: 00f12223 sw a5,4(sp) 100f8: 100000ef jal ra,101f8 <isAnagram> 100fc: 00050793 mv a5,a0 10100: 000105b7 lui a1,0x10 10104: 00010537 lui a0,0x10 10108: 75058593 addi a1,a1,1872 # 10750 <__errno+0x20> 1010c: 75850513 addi a0,a0,1880 # 10758 <__errno+0x28> 10110: 00f12423 sw a5,8(sp) 10114: 0e4000ef jal ra,101f8 <isAnagram> 10118: 00a12623 sw a0,12(sp) 1011c: 01c12083 lw ra,28(sp) 10120: 00412783 lw a5,4(sp) 10124: 00812783 lw a5,8(sp) 10128: 00c12783 lw a5,12(sp) 1012c: 00000513 li a0,0 10130: 02010113 addi sp,sp,32 10134: 00008067 ret 000101f8 <isAnagram>: 101f8: f8010113 addi sp,sp,-128 101fc: 06812c23 sw s0,120(sp) 10200: 06912a23 sw s1,116(sp) 10204: 00058413 mv s0,a1 10208: 00050493 mv s1,a0 1020c: 06800613 li a2,104 10210: 00000593 li a1,0 10214: 00810513 addi a0,sp,8 10218: 06112e23 sw ra,124(sp) 1021c: 1bc000ef jal ra,103d8 <memset> 10220: 0004c783 lbu a5,0(s1) 10224: 02078863 beqz a5,10254 <isAnagram+0x5c> 10228: 00148513 addi a0,s1,1 1022c: f9f78793 addi a5,a5,-97 10230: 00279793 slli a5,a5,0x2 10234: 07078793 addi a5,a5,112 10238: 002786b3 add a3,a5,sp 1023c: f986a703 lw a4,-104(a3) 10240: 00054783 lbu a5,0(a0) 10244: 00150513 addi a0,a0,1 10248: 00170713 addi a4,a4,1 1024c: f8e6ac23 sw a4,-104(a3) 10250: fc079ee3 bnez a5,1022c <isAnagram+0x34> 10254: 00044783 lbu a5,0(s0) 10258: 02078863 beqz a5,10288 <isAnagram+0x90> 1025c: 00140593 addi a1,s0,1 10260: f9f78793 addi a5,a5,-97 10264: 00279793 slli a5,a5,0x2 10268: 07078793 addi a5,a5,112 1026c: 002786b3 add a3,a5,sp 10270: f986a703 lw a4,-104(a3) 10274: 0005c783 lbu a5,0(a1) 10278: 00158593 addi a1,a1,1 1027c: fff70713 addi a4,a4,-1 10280: f8e6ac23 sw a4,-104(a3) 10284: fc079ee3 bnez a5,10260 <isAnagram+0x68> 10288: 00810793 addi a5,sp,8 1028c: 07010693 addi a3,sp,112 10290: 0080006f j 10298 <isAnagram+0xa0> 10294: 02f68463 beq a3,a5,102bc <isAnagram+0xc4> 10298: 0007a703 lw a4,0(a5) 1029c: 00478793 addi a5,a5,4 102a0: fe070ae3 beqz a4,10294 <isAnagram+0x9c> 102a4: 07c12083 lw ra,124(sp) 102a8: 07812403 lw s0,120(sp) 102ac: 07412483 lw s1,116(sp) 102b0: 00000513 li a0,0 102b4: 08010113 addi sp,sp,128 102b8: 00008067 ret 102bc: 07c12083 lw ra,124(sp) 102c0: 07812403 lw s0,120(sp) 102c4: 07412483 lw s1,116(sp) 102c8: 00100513 li a0,1 102cc: 08010113 addi sp,sp,128 102d0: 00008067 ret

It can run successfully

Observation

  • LOC: 55
  • Rigister used
    • S register: s0, s1
    • A register: a0 to a5
    • T register: none
    • Other: ra
    • Total: 9
  • Stack used: 128 bytes
  • Number of load and save: 18
  • Branch Number: 8

Explanation

<main>

For testcase1
In -O2 optimization, main function used s0 to handle the stack address for a0(line 5), where in -O1 optimization, used a0. So in -O2 optimization, it need additional instruction to stored s0(line3).

For testcase2
Simply used s0 and a1 and its offset to handle the string address(line 15 to line 17), so it doesn't need another lui instruction to reload stack point (like line 4)

For testcase3
Simply use a0 and a1 and its offset to handle the string address(line 23 to line 26)

  • I think it can also use s0 and associate offset rather than used a0 or a1 with reload the stack pointer (lui a1, 0x14 and lui a0, 0x14)
<isAnagram>

Argument input

  • a0 for test_1_s
  • a1 for test_1_t
  • Detail is in Note

-O3

Optimize even more

riscv-none-elf-gcc -Wall -march=rv32i -mabi=ilp32 -O3 hw2_origin.c -o hw2_origin_O3.elf

Size

riscv-none-elf-size hw2_origin_O3.elf

text	   data	    bss	    dec	    hex	filename
1738	   1096	     61	   2895	    b4f	hw2_origin_O2.elf

Disassembly code

riscv-none-elf-objdump -d hw2_origin_O3.elf > dis_O3.txt

000100c4 <main>: 100c4: 000105b7 lui a1,0x10 100c8: 00010537 lui a0,0x10 100cc: fe010113 addi sp,sp,-32 100d0: 73858593 addi a1,a1,1848 # 10738 <__errno+0x8> 100d4: 74050513 addi a0,a0,1856 # 10740 <__errno+0x10> 100d8: 00112e23 sw ra,28(sp) 100dc: 11c000ef jal ra,101f8 <isAnagram> 100e0: 00050793 mv a5,a0 100e4: 000105b7 lui a1,0x10 100e8: 00010537 lui a0,0x10 100ec: 74858593 addi a1,a1,1864 # 10748 <__errno+0x18> 100f0: 74c50513 addi a0,a0,1868 # 1074c <__errno+0x1c> 100f4: 00f12223 sw a5,4(sp) 100f8: 100000ef jal ra,101f8 <isAnagram> 100fc: 00050793 mv a5,a0 10100: 000105b7 lui a1,0x10 10104: 00010537 lui a0,0x10 10108: 75058593 addi a1,a1,1872 # 10750 <__errno+0x20> 1010c: 75850513 addi a0,a0,1880 # 10758 <__errno+0x28> 10110: 00f12423 sw a5,8(sp) 10114: 0e4000ef jal ra,101f8 <isAnagram> 10118: 00a12623 sw a0,12(sp) 1011c: 01c12083 lw ra,28(sp) 10120: 00412783 lw a5,4(sp) 10124: 00812783 lw a5,8(sp) 10128: 00c12783 lw a5,12(sp) 1012c: 00000513 li a0,0 10130: 02010113 addi sp,sp,32 10134: 00008067 ret 000101f8 <isAnagram>: 101f8: f8010113 addi sp,sp,-128 101fc: 06812c23 sw s0,120(sp) 10200: 06912a23 sw s1,116(sp) 10204: 00058413 mv s0,a1 10208: 00050493 mv s1,a0 1020c: 06800613 li a2,104 10210: 00000593 li a1,0 10214: 00810513 addi a0,sp,8 10218: 06112e23 sw ra,124(sp) 1021c: 1bc000ef jal ra,103d8 <memset> 10220: 0004c783 lbu a5,0(s1) 10224: 02078863 beqz a5,10254 <isAnagram+0x5c> 10228: 00148513 addi a0,s1,1 1022c: f9f78793 addi a5,a5,-97 10230: 00279793 slli a5,a5,0x2 10234: 07078793 addi a5,a5,112 10238: 002786b3 add a3,a5,sp 1023c: f986a703 lw a4,-104(a3) 10240: 00054783 lbu a5,0(a0) 10244: 00150513 addi a0,a0,1 10248: 00170713 addi a4,a4,1 1024c: f8e6ac23 sw a4,-104(a3) 10250: fc079ee3 bnez a5,1022c <isAnagram+0x34> 10254: 00044783 lbu a5,0(s0) 10258: 02078863 beqz a5,10288 <isAnagram+0x90> 1025c: 00140593 addi a1,s0,1 10260: f9f78793 addi a5,a5,-97 10264: 00279793 slli a5,a5,0x2 10268: 07078793 addi a5,a5,112 1026c: 002786b3 add a3,a5,sp 10270: f986a703 lw a4,-104(a3) 10274: 0005c783 lbu a5,0(a1) 10278: 00158593 addi a1,a1,1 1027c: fff70713 addi a4,a4,-1 10280: f8e6ac23 sw a4,-104(a3) 10284: fc079ee3 bnez a5,10260 <isAnagram+0x68> 10288: 00810793 addi a5,sp,8 1028c: 07010693 addi a3,sp,112 10290: 0080006f j 10298 <isAnagram+0xa0> 10294: 02f68463 beq a3,a5,102bc <isAnagram+0xc4> 10298: 0007a703 lw a4,0(a5) 1029c: 00478793 addi a5,a5,4 102a0: fe070ae3 beqz a4,10294 <isAnagram+0x9c> 102a4: 07c12083 lw ra,124(sp) 102a8: 07812403 lw s0,120(sp) 102ac: 07412483 lw s1,116(sp) 102b0: 00000513 li a0,0 102b4: 08010113 addi sp,sp,128 102b8: 00008067 ret 102bc: 07c12083 lw ra,124(sp) 102c0: 07812403 lw s0,120(sp) 102c4: 07412483 lw s1,116(sp) 102c8: 00100513 li a0,1 102cc: 08010113 addi sp,sp,128 102d0: 00008067 ret

It can run successfully
../../build rv32emu hw2_origin_O3.elf

Observation

  • LOC: 55
  • Rigister used
    • S register: s0, s1
    • A register: a0 to a5
    • T register: none
    • Other: ra
    • Total: 9
  • Stack used: 128
  • Number of load and save: 18
  • Branch Number: 8

Explanation

In my case, the -O3 optimization is totally same as -O2.

-Ofast

riscv-none-elf-gcc -Wall -march=rv32i -mabi=ilp32 -Ofast hw2_origin.c -o hw2_origin_Ofast.elf

Size

riscv-none-elf-size hw2_origin_Ofast.elf

text	   data	    bss	    dec	    hex	filename
1738	   1096	     61	   2895	    b4f	hw2_origin_O2.elf

Disassembly code

riscv-none-elf-objdump -d hw2_origin_Ofast.elf > dis_Ofast.txt

000100c4 <main>: 100c4: 000105b7 lui a1,0x10 100c8: 00010537 lui a0,0x10 100cc: fe010113 addi sp,sp,-32 100d0: 73858593 addi a1,a1,1848 # 10738 <__errno+0x8> 100d4: 74050513 addi a0,a0,1856 # 10740 <__errno+0x10> 100d8: 00112e23 sw ra,28(sp) 100dc: 11c000ef jal ra,101f8 <isAnagram> 100e0: 00050793 mv a5,a0 100e4: 000105b7 lui a1,0x10 100e8: 00010537 lui a0,0x10 100ec: 74858593 addi a1,a1,1864 # 10748 <__errno+0x18> 100f0: 74c50513 addi a0,a0,1868 # 1074c <__errno+0x1c> 100f4: 00f12223 sw a5,4(sp) 100f8: 100000ef jal ra,101f8 <isAnagram> 100fc: 00050793 mv a5,a0 10100: 000105b7 lui a1,0x10 10104: 00010537 lui a0,0x10 10108: 75058593 addi a1,a1,1872 # 10750 <__errno+0x20> 1010c: 75850513 addi a0,a0,1880 # 10758 <__errno+0x28> 10110: 00f12423 sw a5,8(sp) 10114: 0e4000ef jal ra,101f8 <isAnagram> 10118: 00a12623 sw a0,12(sp) 1011c: 01c12083 lw ra,28(sp) 10120: 00412783 lw a5,4(sp) 10124: 00812783 lw a5,8(sp) 10128: 00c12783 lw a5,12(sp) 1012c: 00000513 li a0,0 10130: 02010113 addi sp,sp,32 10134: 00008067 ret 000101f8 <isAnagram>: 101f8: f8010113 addi sp,sp,-128 101fc: 06812c23 sw s0,120(sp) 10200: 06912a23 sw s1,116(sp) 10204: 00058413 mv s0,a1 10208: 00050493 mv s1,a0 1020c: 06800613 li a2,104 10210: 00000593 li a1,0 10214: 00810513 addi a0,sp,8 10218: 06112e23 sw ra,124(sp) 1021c: 1bc000ef jal ra,103d8 <memset> 10220: 0004c783 lbu a5,0(s1) 10224: 02078863 beqz a5,10254 <isAnagram+0x5c> 10228: 00148513 addi a0,s1,1 1022c: f9f78793 addi a5,a5,-97 10230: 00279793 slli a5,a5,0x2 10234: 07078793 addi a5,a5,112 10238: 002786b3 add a3,a5,sp 1023c: f986a703 lw a4,-104(a3) 10240: 00054783 lbu a5,0(a0) 10244: 00150513 addi a0,a0,1 10248: 00170713 addi a4,a4,1 1024c: f8e6ac23 sw a4,-104(a3) 10250: fc079ee3 bnez a5,1022c <isAnagram+0x34> 10254: 00044783 lbu a5,0(s0) 10258: 02078863 beqz a5,10288 <isAnagram+0x90> 1025c: 00140593 addi a1,s0,1 10260: f9f78793 addi a5,a5,-97 10264: 00279793 slli a5,a5,0x2 10268: 07078793 addi a5,a5,112 1026c: 002786b3 add a3,a5,sp 10270: f986a703 lw a4,-104(a3) 10274: 0005c783 lbu a5,0(a1) 10278: 00158593 addi a1,a1,1 1027c: fff70713 addi a4,a4,-1 10280: f8e6ac23 sw a4,-104(a3) 10284: fc079ee3 bnez a5,10260 <isAnagram+0x68> 10288: 00810793 addi a5,sp,8 1028c: 07010693 addi a3,sp,112 10290: 0080006f j 10298 <isAnagram+0xa0> 10294: 02f68463 beq a3,a5,102bc <isAnagram+0xc4> 10298: 0007a703 lw a4,0(a5) 1029c: 00478793 addi a5,a5,4 102a0: fe070ae3 beqz a4,10294 <isAnagram+0x9c> 102a4: 07c12083 lw ra,124(sp) 102a8: 07812403 lw s0,120(sp) 102ac: 07412483 lw s1,116(sp) 102b0: 00000513 li a0,0 102b4: 08010113 addi sp,sp,128 102b8: 00008067 ret 102bc: 07c12083 lw ra,124(sp) 102c0: 07812403 lw s0,120(sp) 102c4: 07412483 lw s1,116(sp) 102c8: 00100513 li a0,1 102cc: 08010113 addi sp,sp,128 102d0: 00008067 ret

../../build/rv32emu hw2_origin_Ofast.elf
It can run successfully

Observation

  • LOC: 55
  • Rigister used
    • S register: s0, s1
    • A register: a0 to a5
    • T register: none
    • Other: ra
    • Total: 9
  • Stack used: 128
  • Number of load/save: 18
  • Branch Number: 8

Explanation

In my case, -Ofast is totally same with -O2 and -O3.

-Os

Optimize for size

riscv-none-elf-gcc -Wall -march=rv32i -mabi=ilp32 -Os hw2_origin.c -o hw2_origin_Os.elf

Size

riscv-none-elf-size hw2_origin_Os.elf

text	   data	    bss	    dec	    hex	filename
1702	   1096	     61	   2859	    b2b	hw2_origin_Os.elf

Disassembly code

riscv-none-elf-objdump -d hw2_origin_Os.elf > dis_Os.txt

000100c4 <main>: 100c4: 000105b7 lui a1,0x10 100c8: 00010537 lui a0,0x10 100cc: fe010113 addi sp,sp,-32 100d0: 71458593 addi a1,a1,1812 # 10714 <__errno+0x8> 100d4: 71c50513 addi a0,a0,1820 # 1071c <__errno+0x10> 100d8: 00112e23 sw ra,28(sp) 100dc: 114000ef jal ra,101f0 <isAnagram> 100e0: 00a12223 sw a0,4(sp) 100e4: 000105b7 lui a1,0x10 100e8: 00010537 lui a0,0x10 100ec: 72458593 addi a1,a1,1828 # 10724 <__errno+0x18> 100f0: 72850513 addi a0,a0,1832 # 10728 <__errno+0x1c> 100f4: 0fc000ef jal ra,101f0 <isAnagram> 100f8: 00a12423 sw a0,8(sp) 100fc: 000105b7 lui a1,0x10 10100: 00010537 lui a0,0x10 10104: 72c58593 addi a1,a1,1836 # 1072c <__errno+0x20> 10108: 73450513 addi a0,a0,1844 # 10734 <__errno+0x28> 1010c: 0e4000ef jal ra,101f0 <isAnagram> 10110: 00a12623 sw a0,12(sp) 10114: 01c12083 lw ra,28(sp) 10118: 00412783 lw a5,4(sp) 1011c: 00812783 lw a5,8(sp) 10120: 00c12783 lw a5,12(sp) 10124: 00000513 li a0,0 10128: 02010113 addi sp,sp,32 1012c: 00008067 ret 000101f0 <isAnagram>: 101f0: f8010113 addi sp,sp,-128 101f4: 06812c23 sw s0,120(sp) 101f8: 06912a23 sw s1,116(sp) 101fc: 00058413 mv s0,a1 10200: 00050493 mv s1,a0 10204: 06800613 li a2,104 10208: 00000593 li a1,0 1020c: 00810513 addi a0,sp,8 10210: 06112e23 sw ra,124(sp) 10214: 1a0000ef jal ra,103b4 <memset> 10218: 00048513 mv a0,s1 1021c: 00054783 lbu a5,0(a0) 10220: 00150513 addi a0,a0,1 10224: 04079263 bnez a5,10268 <isAnagram+0x78> 10228: 00040593 mv a1,s0 1022c: 0005c783 lbu a5,0(a1) 10230: 00158593 addi a1,a1,1 10234: 04079a63 bnez a5,10288 <isAnagram+0x98> 10238: 00810793 addi a5,sp,8 1023c: 0007a703 lw a4,0(a5) 10240: 06071463 bnez a4,102a8 <isAnagram+0xb8> 10244: 00478793 addi a5,a5,4 10248: 07010713 addi a4,sp,112 1024c: fee798e3 bne a5,a4,1023c <isAnagram+0x4c> 10250: 00100513 li a0,1 10254: 07c12083 lw ra,124(sp) 10258: 07812403 lw s0,120(sp) 1025c: 07412483 lw s1,116(sp) 10260: 08010113 addi sp,sp,128 10264: 00008067 ret 10268: f9f78793 addi a5,a5,-97 1026c: 00279793 slli a5,a5,0x2 10270: 07078793 addi a5,a5,112 10274: 002787b3 add a5,a5,sp 10278: f987a703 lw a4,-104(a5) 1027c: 00170713 addi a4,a4,1 10280: f8e7ac23 sw a4,-104(a5) 10284: f99ff06f j 1021c <isAnagram+0x2c> 10288: f9f78793 addi a5,a5,-97 1028c: 00279793 slli a5,a5,0x2 10290: 07078793 addi a5,a5,112 10294: 002787b3 add a5,a5,sp 10298: f987a703 lw a4,-104(a5) 1029c: fff70713 addi a4,a4,-1 102a0: f8e7ac23 sw a4,-104(a5) 102a4: f89ff06f j 1022c <isAnagram+0x3c> 102a8: 00000513 li a0,0 102ac: fa9ff06f j 10254 <isAnagram+0x64>

It can run successfully

Observation

  • LOC: 48
  • Rigister used
    • S register: s0, s1
    • A register: a0 to a5
    • T register: none
    • Other: ra
    • Total: 9
  • Stack used: 128 bytes
  • Number of load/save: 13
  • Branch Number: 8

Explanation

<main>

In -Os optimization, tend to minimize the code size, In main, used existed jal ra, 10604<put> in line 40 and line 43

<isAnagram>

Argument input

  • a0 for test_1_s
  • a1 for test_1_t
  • -Os optimization generate much more bne rather than beq, more detail is writen in Note

Summary

It only discuss isAnagram, without main.

O0 O1 O2 O3 Ofast Os
LOC 82 50 55 55 55 48
Rigister 8 9 9 9 9 9
Stack (bytes) 160 128 128 128 128 128
# of Load/Save 34 15 18 18 18 13
# Branch 9 8 8 8 8 8

Comment all printf instruction in sorce code, compile and run rv32emu --stats

Also, add a line

volatile int a = isAnagram(test_1_s, test_1_t);

to make sure isAnagram will not be remove by any optimization

  • If didn't do this, this function will never run under -O1 level or higher.
O0 O1 O2 O3 Ofast Os
CSR Cycle 1688 811 837 837 837 837

Noticed that in my case, -O2, -O3, -Ofast seems completely same. And for -Os although there is a little different in disassembly, but has the same number of CSR Cycle, which is 837.

Below is my opnion of how optimization level influence my case after studying this article.

  1. -O0 doesn't do any optimization, so it runs slow and has the most code size is intuitive.
  2. -O1 try to reduce code size and execution time.
  3. -O2 optimize nearly all supported optimizations but no involve a space-speed tradeoff(means does not perform loop unrolling)
  4. -O3 turns on all optimizations specified by -O2, including loop unrolling and function inlining
    • However, there is nothing change in my case compared with -O2 level. In my opinion, in my C code, the for loop condition is whether s[i] or t[i] exist or not, it is actually not a constant, the value will know in execution. So in the compile stage, there is nothing change.
  5. -Ofast enables all -O3 optimizations and optimizations that are not valid for all standard-compliant programs. It turns on -ffast-math
    • -ffast-math trigger non-standards-compilant floating point optimization, it allow compiler assume that floating point has infinite precision.
    • In my code there is not any caculation about floating point. So the outcome is same with -O2
    • I still not totally understand the explanation above, just quote from the other website.
  6. -Os optimize for size, enables all -O2 optimizations except that often increase code size (such as -freorder-block-algorithm=stc).
    • -freorder-block-algorithm=simple is used in -Os, which does not increase code size.
    • -freorder-block-algorithm=stc is used in -O2, minimizing the number of branched executed by making extra copies of code.

However, rv32emu is an single-cycle simulator, which means that is not pipeline, and no hazard generated.

Rewrite the Problem

run the program

build/rv32emu tests/Hw2/hw2.elf

C code

Assembly code

Check the results and Optimization

In order to compare fairly, print PASS and FAIL to check the algorithm work correctly or not. When caculating CSR Cycle, comment the correspond instruction, such as.

.org 0
# Provide program starting address to linker
.global _start

/* newlib system calls */
.set SYSEXIT,  93
- .set SYSWRITE, 64

.data

test_1_s: .string "anagram"
test_1_t: .string "nagaram"
test_2_s: .string "rat"
test_2_t: .string "car"
test_3_s: .string "tseng"
test_3_t: .string "gnest"
- correct_1:       .string "test_1: Pass\n"
-                  .set t1cor_size, .-correct_1
- not_correct_1:   .string "test_1: Fail\n"
-                  .set t1incor_size, .-not_correct_1
- correct_2:       .string "test_2: Pass\n"
-                  .set t2cor_size, .-correct_2
- not_correct_2:   .string "test_2: Fail\n"
-                  .set t2incor_size, .-not_correct_2
- correct_3:       .string "test_3: Pass\n"
-                  .set t3cor_size, .-correct_3
- not_correct_3:   .string "test_3: Fail\n"
-                  .set t3incor_size, .-not_correct_3

.text

_start: 
-    li a7, SYSWRITE        # "write" system call
-    li a0, 1               #  1 = standard output (stdout)
    jal ra, TEST_1
    jal ra, TEST_2
    jal ra, TEST_3
    j end

TEST_1:
    addi sp, sp, -4
    sw ra, 0(sp)
    la a0, test_1_s        # s(a0) = test_1_s  
    la a1, test_1_t        # t(a1) = test_1_t
    jal ra, isAnagram      # call isAnagram(s(a0), t(a1))
    bne a0, x0, TRUE_1     # if isAnagram(s(a0), t(a1)) == 1 correct
-    li a0, 1               # reload a0 to handle stdout
-    la a1, not_correct_1   # test1 not correct address
-    la a2, t1incor_size    # test1 not correct length
-    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra

TEST_2:
    addi sp, sp, -4
    sw ra, 0(sp)
    la a0, test_2_s        # s(a0) = test_2_s
    la a1, test_2_t        # t(a1) = test_2_t
    jal ra, isAnagram	   # call isAnagram(s(a0), t(a1))
    beq a0, x0, TRUE_2	   # if isAnagram(s(a0), t(a1)) == 0 correct

-    la a1, not_correct_2   # test2 not correct address
-    la a2, t2incor_size    # test not correct length
-    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra

TEST_3:
    addi sp, sp, -4
    sw ra, 0(sp)
    la a0, test_3_s        # s(a0) = test_3_s
    la a1, test_3_t        # t(a1) = test_3_t
    jal ra, isAnagram	   # call isAnagram(s(a0), t(a1))
    bne a0, x0, TRUE_3     # if isAnagram(s(a0), t(a1)) == 1 correct
-    li a0, 1               # reload a0 to handle stdout
-    la a1, not_correct_3   # test3 not correct address
-    la a2, t3incor_size    # test3 not correct length
    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra

TRUE_1:
-    la a1, correct_1     # test1 correct address
-    la a2, t1cor_size    # test1 correct length
-    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra               # go to example2

TRUE_2:
-    li a0, 1             # reload a0 to handle stdout 
-    la a1, correct_2     # test2 correct address
-    la a2, t2cor_size    # test2 correct length
-    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra              # go to example3

TRUE_3:
-    la a1, correct_3     # test2 correct address
-    la a2, t3cor_size    # test2 correct length
-    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra
    
isAnagram:                  # a0 = s, a1 = t
    addi sp, sp, -104       # get sapce for store int letter_freq[26]
    addi t0, sp, 0          # t0 = letter_freq[0]
    addi t1, x0, 0          # t1 = i = 0
    li t2, 26
LOOP1:                      # int letter_freq[26] = {0};
    beq t1, t2, GET_FREQ_s  # if i < 26 
    sw x0, 0(t0)            # letter_freq[i] = 0;
    addi t1, t1, 1          # i++;
    addi t0, t0, 4          # 
    j LOOP1
	
GET_FREQ_s:
    addi t0, sp, 0         # t0 = letter_freq[0]
    addi t1, a0, 0         # t1 = s
    addi t2, x0, 0         # t2 = i = 0
LOOP2:                            # for(  ;s[i] ;i++ )
    add  t3, t1, t2         # get address of s[index] from s[0] + 
    lb t5, 0(t3)            # t5 = s[i]
    beq t5, x0, GET_FREQ_F # if s[i] ==0 break the loop
    addi t5, t5, -97         # t5 = s[i] - 'a'
    slli t5, t5, 2          # get offset form letter_freq[0] to letter_freq[s[i] - 'a']
    add t5, t5, t0          # get address of letter_freq[s[i] - 'a']
    lw t3, 0(t5)            # t3 = [freq[s[i] - 'a']]
    addi t3, t3, 1          # t3 = [freq[s[i] - 'a']] + 1
    sw t3, 0(t5)            # [freq[s[i] - 'a']] = ([freq[s[i] - 'a']] + 1) 
    addi t2, t2, 1          # i++
    j LOOP2
GET_FREQ_F:
    addi t0, sp, 0         # t0 = freq[]
    addi t1, a1, 0         # t1 = t
    addi t2, x0, 0         # t2 = i = 0
LOOP3:                      # for(  ;t[i] ;i++ )
    add  t3, t1, t2         # get address of t[index]
    lb t5, 0(t3)            # t5 = t[i]
    beq t5, x0, CHECK       # if t[i] == 0 break the loop
    addi t5, t5, -97        # t5 = t[i] - 'a'
    slli t5, t5, 2          # get offset form letter_freq[0] to letter_freq[t[i] - 'a']
    add t5, t5, t0        
    lw t3, 0(t5)            # t5 = [freq[t[i] - 'a']]
    addi t3, t3, -1         # t3 = [freq[t[i] - 'a']] - 1
    sw t3, 0(t5)            # [freq[t[i] - 'a']] = ([freq[t[i] - 'a']] - 1) 
    addi t2, t2, 1          # i++
    j LOOP3
CHECK:
    addi t0, sp, 0 # t0 = address freq[0]
    addi t1, x0, 0 # t1 = i = 0
    li t2, 26
LOOP4:                      # for (int i = 0; i < 26; i++)
    beq t1, t2, TRUE         # i < 26
    lw t3, 0(t0)            # t3 = freq[i];
    bne t3, x0, FALSE       # if freq[i] != 0 break
    addi t1, t1, 1         # i++;
    addi t0, t0, 4         # freq + 1
    j LOOP4

FALSE:
    addi a0, x0, 0          # if false return false
    j END_F
TRUE:
    addi a0, x0, 1          # if pass check return true
END_F:
    addi sp, sp, 104
    jr ra                   # return,  a0 = return value = true or false

end:
    li a7, SYSEXIT      # "exit" syscall
    add a0, x0, 0       # Use 0 return code
    ecall               # invoke syscall to terminate the program

Caculate the CSR Cycle

make
../../build rv32emu --stats hw2.elf
inferior exit code 0
CSR cycle count: 751

The handwritten assembly code is better than compiler in all optimization level.

Try to optimize

.org 0 # Provide program starting address to linker .global _start /* newlib system calls */ .set SYSEXIT, 93 .set SYSWRITE, 64 .data test_1_s: .string "anagram" test_1_t: .string "nagaram" test_2_s: .string "rat" test_2_t: .string "car" test_3_s: .string "tseng" test_3_t: .string "gnest" correct_1: .string "test_1: Pass\n" .set t1cor_size, .-correct_1 not_correct_1: .string "test_1: Fail\n" .set t1incor_size, .-not_correct_1 correct_2: .string "test_2: Pass\n" .set t2cor_size, .-correct_2 not_correct_2: .string "test_2: Fail\n" .set t2incor_size, .-not_correct_2 correct_3: .string "test_3: Pass\n" .set t3cor_size, .-correct_3 not_correct_3: .string "test_3: Fail\n" .set t3incor_size, .-not_correct_3 .text isAnagram: # a0 = s, a1 = t addi sp, sp, -104 # get sapce for store int letter_freq[26] memset: # int letter_freq[26] = {0}; sw x0, 0(sp) # letter_freq[i] = 0; sw x0, 4(sp) sw x0, 8(sp) sw x0, 12(sp) sw x0, 16(sp) sw x0, 20(sp) sw x0, 24(sp) sw x0, 28(sp) sw x0, 32(sp) sw x0, 36(sp) sw x0, 40(sp) sw x0, 44(sp) sw x0, 48(sp) sw x0, 52(sp) sw x0, 56(sp) sw x0, 60(sp) sw x0, 64(sp) sw x0, 68(sp) sw x0, 72(sp) sw x0, 76(sp) sw x0, 80(sp) sw x0, 84(sp) sw x0, 88(sp) sw x0, 92(sp) sw x0, 96(sp) sw x0, 100(sp) PRECOND1: lbu t5, 0(a0) # t5 = s[0] beqz t5, PRECOND2 # if s[0] = 0, break the loop addi t2, a0, 1 # t2 = s++ LOOP2: # for( ;s[i] ;i++ ) addi t5, t5, -97 # s[i] - 'a' slli t5, t5, 0x2 # get offset from letter_freq[0] to letter_freq[s[i] - 'a'] addi t5, t5, 112 add t5, t5, sp # t5 = address of letter_freq[s[i] - 'a'] lw t3, -104(t5) # t3 = letter_freq[s[i] - 'a'] addi t3, t3, 1 # letter_freq[s[i] - 'a']++ sw t3, -104(t5) # store back addi t2, t2, 1 # s++ lbu t5, -1(t2) # t5 = s[i] bnez t5, LOOP2 PRECOND2: lbu t5, 0(a1) # t5 = t[0] beqz t5, CHECK # if t[0] = 0, break the loop addi t2, a1, 1 # t2 = t++ LOOP3: # for( ;t[i] ;i++ ) addi t5, t5, -97 # t[i] - 'a' slli t5, t5, 0x2 # get offset from letter_freq[0] to letter_freq[t[i] - 'a'] addi t5, t5, 112 add t5, t5, sp # t5 = address of letter_freq[t[i] - 'a'] lw t3, -104(t5) # t3 = letter_freq[t[i] - 'a'] addi t3, t3, -1 # letter_freq[t[i] - 'a']-- sw t3, -104(t5) # store back addi t2, t2, 1 # t++ lbu t5, -1(t2) # t5 = t[i] bnez t5, LOOP3 CHECK: addi t0, sp, 8 # t0 = address freq[0] addi t1, sp, 112 # 104 = 4 * 26 = the end of the letter_freq[] LOOP4: # for (int i = 0; i < 26; i++) lw t3, 0(t0) # t3 = freq[i]; bne t3, x0, FALSE # if freq[i] != 0 break addi t0, t0, 4 # freq++ bne t0, t1, LOOP4 TRUE: addi a0, x0, 1 # if pass check return true j END_F FALSE: addi a0, x0, 0 # if false return false END_F: addi sp, sp, 104 jr ra # return, a0 = return value = true or false _start: la a0, test_1_s # s(a0) = test_1_s la a1, test_1_t # t(a1) = test_1_t jal ra, isAnagram # call isAnagram(s(a0), t(a1)) la a0, test_2_s # s(a0) = test_2_s la a1, test_2_t # t(a1) = test_2_t jal ra, isAnagram # call isAnagram(s(a0), t(a1)) la a0, test_3_s # s(a0) = test_3_s la a1, test_3_t # t(a1) = test_3_t jal ra, isAnagram # call isAnagram(s(a0), t(a1)) end: li a7, SYSEXIT # "exit" syscall add a0, x0, 0 # Use 0 return code ecall # invoke syscall to terminate the program

Inspirate by -O1 optimization level and kdvnt's homework, write down the code above.

  • Unroll the memset function, which is originally done by loop.
  • Reschedule the code to reduce branch number
101b8: f9f78793 addi a5,a5,-97 101bc: 00279793 slli a5,a5,0x2 101c0: 07078793 addi a5,a5,112 101c4: 002787b3 add a5,a5,sp 101c8: f987a703 lw a4,-104(a5) 101cc: 00170713 addi a4,a4,1 101d0: f8e7ac23 sw a4,-104(a5) 101d4: 00150513 addi a0,a0,1 101d8: fff54783 lbu a5,-1(a0)

Notice that in O1 optimization level, there is some redundant assembly code, and I modified it.

addi t5, t5, -97       # s[i] - 'a'
slli t5, t5, 0x2       # get offset from letter_freq[0] to letter_freq[s[i] - 'a']
add t5, t5, sp         # t5 = address of letter_freq[s[i] - 'a']
lw t3, 0(t5)           # t3 = letter_freq[s[i] - 'a']
addi t3, t3, 1         # letter_freq[s[i] - 'a']++
sw t3, 0(t5)           # store back
addi t2, t2, 1         # s++
lbu t5, -1(t2)         # t5 = s[i]

Result

CSR cycle count: 499
O0 O1 O2 O3 Ofast Os My Version
CSR Cycle 1688 811 837 837 837 837 499

However, if I try some loopunrolling (motivated by kdvnt's homework), CSR cycle count doesn't change, such as

LOOP2:                            # for(  ;s[i] ;i++ )
    beqz t5, PRECOND2
    addi t5, t5, -97       # s[i] - 'a'
    slli t5, t5, 0x2       # get offset from letter_freq[0] to letter_freq[s[i] - 'a']
    add t5, t5, sp         # t5 = address of letter_freq[s[i] - 'a']
    lw t3, 0(t5)           # t3 = letter_freq[s[i] - 'a']
    addi t3, t3, 1         # letter_freq[s[i] - 'a']++
    sw t3, 0(t5)           # store back
    addi t2, t2, 1         # s++
    lbu t5, -1(t2)         # t5 = s[i]
    
    beqz t5, PRECOND2
    addi t5, t5, -97       # s[i] - 'a'
    slli t5, t5, 0x2       # get offset from letter_freq[0] to letter_freq[s[i] - 'a']
    add t5, t5, sp         # t5 = address of letter_freq[s[i] - 'a']
    lw t3, 0(t5)           # t3 = letter_freq[s[i] - 'a']
    addi t3, t3, 1         # letter_freq[s[i] - 'a']++
    sw t3, 0(t5)           # store back
    addi t2, t2, 1         # s++
    lbu t5, -1(t2)         # t5 = s[i]
    
    j LOOP2

But, if I do the same work in Ripes, the cycle will reduce as expected(similar result in kdvnt's work).

Notice that loop unrolling reduce branch penalty, but rv32emu doesn't implement pipeline. In rv32emu, instruction is execute line by line(with CSR counter plus 1), so there is no branch misprediction. So, I think only consider CSR cycle to determine the code efficiency is unfair.

I try to use clock() which is defined in time.h or gettimeofday() which is defined in /sys/time.h, to caculate actual time.

Rather put the clock() in rv32emu's main function, I just want to caculate the time consume about my algorithm(isAnagram in my case). But encounter some problem when calling system call.

Implement syscall clock_gettime

Rectification

The goal of rv32emu is to create a cycle-accurate[1] RISC-V instruction set emulator, shouldn't rely on (semi-)system call such as clock/clock_gettime. Directly use performance counter[2] to get the actual executed instruction, which are part of RISC-V instruction set.

syscall.c

#define SUPPORTED_SYSCALLS       \
    _(close,            57)      \
    _(lseek,            62)      \
    _(read,             63)      \
    _(write,            64)      \
    _(fstat,            80)      \
    _(exit,             93)      \
    _(gettimeofday,     169)     \
    _(brk,              214)     \
+    _(clock_gettime,    403)     \
    _(open,             1024)    \
    IIF(RV32_HAS(SDL))(          \
        _(draw_frame,   0xBEEF)  \
        _(setup_queue,  0xC0DE)  \
        _(submit_queue, 0xFEED), \
    )
static void syscall_clock_gettime(struct riscv_t *rv)
{
    state_t *s = rv_userdata(rv); /* access userdata */

    /* get the parameters */
    riscv_word_t tp = rv_get_reg(rv, rv_reg_a1);

    /* return the clock time */
    if (tp) {
#if defined(HAVE_POSIX_TIMER)
        struct timespec t;
        clock_gettime(CLOCK_MONOTONIC, &t);

        int32_t tv_sec = t.tv_sec;
        int32_t tv_msec = t.tv_nsec / 1000000;  // resolution (ms)

#elif defined(HAVE_MACH_TIMER)
        static mach_timebase_info_data_t info;
        /* If this is the first time we have run, get the timebase.
         * We can use denom == 0 to indicate that sTimebaseInfo is
         * uninitialized.
         */
        if (info.denom == 0)
            (void) mach_timebase_info(&info);
        /* Hope that the multiplication doesn't overflow. */
        uint64_t nsecs = mach_absolute_time() * info.numer / info.denom;
        int32_t tv_sec = nsecs / 1e9;
        int32_t tv_usec = (nsecs / 1e3) - (tv_sec * 1e6);
#else /* low resolution timer */
        clock_t t = clock();
        int32_t tv_sec = t / CLOCKS_PER_SEC;
        int32_t tv_usec = (t % CLOCKS_PER_SEC) * (1000000 / CLOCKS_PER_SEC);
#endif
        memory_write(s->mem, tp + 0, (const uint8_t *) &tv_sec, 4);
        memory_write(s->mem, tp + 8, (const uint8_t *) &tv_msec, 4);
    }

    /* success */
    rv_set_reg(rv, rv_reg_a0, 0);
}
  • More detail in Note

Motivated by 2022q1 Homework6 (kecho) which is done by Risheng1128, try to generate random string with string length 10000 to 100000, and use clock function which is implement above to test time.

The code above test two same string input, so the algorithm will go through whole string.