2022 Assignment2: RISC-V Toolchain

tags: `Computer Architecture`

contribute by <chiangkd>

Requirement follow Assignment2: RISC-V Toolchain

Pick up one Problem

Pick up one assembly program and adapt it into both RISC-V assembly and C implementations which can be flawlessly executed with rv32emu.

I choose the problem from 曾晧峖-Valid Anagram(Leetcode 242). The reason why I choose this problem is that when I go through the classmates homework from Assignment1: RISC-V Assembly and Instruction Pipeline. I notice that there is seldom problem is about string or character. I think only less than 5 problems is related. However, I want to know how string operation can efficiently run under RISC-V architecture.

Original Solution by 曾晧峖

C code

bool isAnagram(char * s, char * t)
{
    int letter_freq[26] = {0}; //store frequency of every letter
	
    // calculate frequency of every letter of s
    for (int i = 0; s[i]; i++) {
        letter_freq[s[i] - 'a']++;
    }
    // calculate frequency of every letter of t
    for (int i = 0; t[i]; i++) {
        letter_freq[t[i] - 'a']--;
    }
    // if letter_freq[i] != 0, it mean in letter char(i + 'a') frequency of letter of s and t are not same. 
    for (int i = 0; i < 26; i++) {
        if (letter_freq[i]) 
            return false;
    }
    return true; // all of letter of frequency are same
}

Handwrite Assembly code (Modified)

# RISC-V assembly program to print "Hello World!" to stdout.

.org 0
# Provide program starting address to linker
.global _start
/* newlib system calls */
.set SYSEXIT,  93
.set SYSWRITE, 64

.data

test_1_s: .string "anagram"
test_1_t: .string "nagaram"
test_2_s: .string "rat"
test_2_t: .string "anagram"
test_3_s: .string "tseng"
test_3_t: .string "gnest"

correct_1:       .string "test_1: Pass\n"
                 .set t1cor_size, .-correct_1
not_correct_1:   .string "test_1: Fail\n"
                 .set t1incor_size, .-not_correct_1
correct_2:       .string "test_2: Pass\n"
                 .set t2cor_size, .-correct_2
not_correct_2:   .string "test_2: Fail\n"
                 .set t2incor_size, .-not_correct_2
correct_3:       .string "test_3: Pass\n"
                 .set t3cor_size, .-correct_3
not_correct_3:   .string "test_3: Fail\n"
                 .set t3incor_size, .-not_correct_3

.text

_start: 
    li a7, SYSWRITE        # "write" system call
    li a0, 1               #  1 = standard output (stdout)
    jal ra, TEST_1
    jal ra, TEST_2
    jal ra, TEST_3
    j end

TEST_1:
    addi sp, sp, -4
    sw ra, 0(sp)
    la a0, test_1_s        # s(a0) = test_1_s  
    la a1, test_1_t        # t(a1) = test_1_t
    jal ra, isAnagram      # call isAnagram(s(a0), t(a1))
    bne a0, x0, TRUE_1     # if isAnagram(s(a0), t(a1)) == 1 correct
    li a0, 1               # reload a0 to handle stdout
    la a1, not_correct_1   # test1 not correct address
    la a2, t1incor_size    # test1 not correct length
    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra

TEST_2:
    addi sp, sp, -4
    sw ra, 0(sp)
    la a0, test_2_s        # s(a0) = test_2_s
    la a1, test_2_t        # t(a1) = test_2_t
    jal ra, isAnagram	   # call isAnagram(s(a0), t(a1))
    beq a0, x0, TRUE_2	   # if isAnagram(s(a0), t(a1)) == 0 correct

    la a1, not_correct_2   # test2 not correct address
    la a2, t2incor_size    # test not correct length
    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra

TEST_3:
    addi sp, sp, -4
    sw ra, 0(sp)
    la a0, test_3_s        # s(a0) = test_3_s
    la a1, test_3_t        # t(a1) = test_3_t
    jal ra, isAnagram	   # call isAnagram(s(a0), t(a1))
    bne a0, x0, TRUE_3     # if isAnagram(s(a0), t(a1)) == 1 correct
    li a0, 1               # reload a0 to handle stdout
    la a1, not_correct_3   # test3 not correct address
    la a2, t3incor_size    # test3 not correct length
    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra

TRUE_1:
    la a1, correct_1     # test1 correct address
    la a2, t1cor_size    # test1 correct length
    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra               # go to example2

TRUE_2:
    li a0, 1             # reload a0 to handle stdout 
    la a1, correct_2     # test2 correct address
    la a2, t2cor_size    # test2 correct length
    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra              # go to example3

TRUE_3:
    la a1, correct_3     # test2 correct address
    la a2, t3cor_size    # test2 correct length
    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra
    
isAnagram:                  # a0 = s, a1 = t
    addi sp, sp, -104       # get sapce for store int letter_freq[26]
    addi t0, sp, 0          # t0 = letter_freq[0]
    addi t1, x0, 0          # t1 = i = 0
    li t2, 26
LOOP1:                      # int letter_freq[26] = {0};
    beq t1, t2, GET_FREQ_s  # if i < 26 
    sw x0, 0(t0)            # letter_freq[i] = 0;
    addi t1, t1, 1          # i++;
    addi t0, t0, 4          # 
    j LOOP1
	
GET_FREQ_s:
    addi t0, sp, 0         # t0 = letter_freq[0]
    addi t1, a0, 0         # t1 = s
    addi t2, x0, 0         # t2 = i = 0
LOOP2:                            # for(  ;s[i] ;i++ )
    add  t3, t1, t2         # get address of s[index] from s[0] + 
    lb t5, 0(t3)            # t5 = s[i]
    beq t5, x0, GET_FREQ_F  # if s[i] ==0 break the loop
    addi t5, t5, -97        # t5 = s[i] - 'a'
    slli t5, t5, 2          # get offset form letter_freq[0] to letter_freq[s[i] - 'a']
    add t5, t5, t0          # get address of letter_freq[s[i] - 'a']
    lw t3, 0(t5)            # t3 = [freq[s[i] - 'a']]
    addi t3, t3, 1          # t3 = [freq[s[i] - 'a']] + 1
    sw t3, 0(t5)            # [freq[s[i] - 'a']] = ([freq[s[i] - 'a']] + 1) 
    addi t2, t2, 1          # i++
    j LOOP2
GET_FREQ_F:
    addi t0, sp, 0         # t0 = freq[]
    addi t1, a1, 0         # t1 = t
    addi t2, x0, 0         # t2 = i = 0
LOOP3:                      # for(  ;t[i] ;i++ )
    add  t3, t1, t2         # get address of t[index]
    lb t5, 0(t3)            # t5 = t[i]
    beq t5, x0, CHECK       # if t[i] == 0 break the loop
    addi t5, t5, -97        # t5 = t[i] - 'a'
    slli t5, t5, 2          # get offset form letter_freq[0] to letter_freq[t[i] - 'a']
    add t5, t5, t0        
    lw t3, 0(t5)            # t5 = [freq[t[i] - 'a']]
    addi t3, t3, -1         # t3 = [freq[t[i] - 'a']] - 1
    sw t3, 0(t5)            # [freq[t[i] - 'a']] = ([freq[t[i] - 'a']] - 1) 
    addi t2, t2, 1          # i++
    j LOOP3
CHECK:
    addi t0, sp, 0 # t0 = address freq[0]
    addi t1, x0, 0 # t1 = i = 0
    li t2, 26
LOOP4:                     # for (int i = 0; i < 26; i++)
    beq t1, t2, TRUE       # i < 26
    lw t3, 0(t0)           # t3 = freq[i];
    bne t3, x0, FALSE      # if freq[i] != 0 break
    addi t1, t1, 1         # i++;
    addi t0, t0, 4         # freq + 1
    j LOOP4

FALSE:
    addi a0, x0, 0          # if false return false
    j END_F
TRUE:
    addi a0, x0, 1          # if pass check return true
END_F:
    addi sp, sp, 104
    jr ra                   # return,  a0 = return value = true or false

end:
    li a7, SYSEXIT      # "exit" syscall
    add a0, x0, 0       # Use 0 return code
    ecall               # invoke syscall to terminate the program

There is some problem when I adapt origin assembly code to execute with rv32emu, such as

bne a0, x0 TRUE_1 need to write as bne a0, x0, TRUE_1
Some system call need more register than Ripes, so some register may be overwriten.
More detail is written in Note, github

Original code run each testcase right after previous testcase, but I modify to run it after jump back to main

Origin

Modified

Disassemble the ELF files and Analysis

Compile by RISC-V GNU toolchain

riscv-none-elf-gcc -Wall -march=rv32i -mabi=ilp32 hw2_origin.c -o hw2_origin.elf

It can successfully run with expected answer.

-O0(default)

Optimize for compilation time (which is the default optimization mode)

riscv-none-elf-gcc -Wall -march=rv32i -mabi=ilp32 -O0 hw2_origin.c -o hw2_origin_00.elf

Size

riscv-none-elf-size hw2_origin_O0.elf

text	   data	    bss	    dec	    hex	filename
1914	   1096	     61	   3071	    bff	hw2_origin_O0.elf

Disassembly code

riscv-none-elf-objdump -d hw2_origin_O0.elf >dis_O0.txt























































































































































00010184 <isAnagram>:
   10184:	f6010113          	addi	sp,sp,-160
   10188:	08112e23          	sw	ra,156(sp)
   1018c:	08812c23          	sw	s0,152(sp)
   10190:	0a010413          	addi	s0,sp,160
   10194:	f6a42623          	sw	a0,-148(s0)
   10198:	f6b42423          	sw	a1,-152(s0)
   1019c:	f7c40793          	addi	a5,s0,-132
   101a0:	06800713          	li	a4,104
   101a4:	00070613          	mv	a2,a4
   101a8:	00000593          	li	a1,0
   101ac:	00078513          	mv	a0,a5
   101b0:	328000ef          	jal	ra,104d8 <memset>
   101b4:	fe042623          	sw	zero,-20(s0)
   101b8:	0480006f          	j	10200 <isAnagram+0x7c>
   101bc:	fec42783          	lw	a5,-20(s0)
   101c0:	f6c42703          	lw	a4,-148(s0)
   101c4:	00f707b3          	add	a5,a4,a5
   101c8:	0007c783          	lbu	a5,0(a5)
   101cc:	f9f78713          	addi	a4,a5,-97
   101d0:	00271793          	slli	a5,a4,0x2
   101d4:	ff078793          	addi	a5,a5,-16
   101d8:	008787b3          	add	a5,a5,s0
   101dc:	f8c7a783          	lw	a5,-116(a5)
   101e0:	00178693          	addi	a3,a5,1
   101e4:	00271793          	slli	a5,a4,0x2
   101e8:	ff078793          	addi	a5,a5,-16
   101ec:	008787b3          	add	a5,a5,s0
   101f0:	f8d7a623          	sw	a3,-116(a5)
   101f4:	fec42783          	lw	a5,-20(s0)
   101f8:	00178793          	addi	a5,a5,1
   101fc:	fef42623          	sw	a5,-20(s0)
   10200:	fec42783          	lw	a5,-20(s0)
   10204:	f6c42703          	lw	a4,-148(s0)
   10208:	00f707b3          	add	a5,a4,a5
   1020c:	0007c783          	lbu	a5,0(a5)
   10210:	fa0796e3          	bnez	a5,101bc <isAnagram+0x38>
   10214:	fe042423          	sw	zero,-24(s0)
   10218:	0480006f          	j	10260 <isAnagram+0xdc>
   1021c:	fe842783          	lw	a5,-24(s0)
   10220:	f6842703          	lw	a4,-152(s0)
   10224:	00f707b3          	add	a5,a4,a5
   10228:	0007c783          	lbu	a5,0(a5)
   1022c:	f9f78713          	addi	a4,a5,-97
   10230:	00271793          	slli	a5,a4,0x2
   10234:	ff078793          	addi	a5,a5,-16
   10238:	008787b3          	add	a5,a5,s0
   1023c:	f8c7a783          	lw	a5,-116(a5)
   10240:	fff78693          	addi	a3,a5,-1
   10244:	00271793          	slli	a5,a4,0x2
   10248:	ff078793          	addi	a5,a5,-16
   1024c:	008787b3          	add	a5,a5,s0
   10250:	f8d7a623          	sw	a3,-116(a5)
   10254:	fe842783          	lw	a5,-24(s0)
   10258:	00178793          	addi	a5,a5,1
   1025c:	fef42423          	sw	a5,-24(s0)
   10260:	fe842783          	lw	a5,-24(s0)
   10264:	f6842703          	lw	a4,-152(s0)
   10268:	00f707b3          	add	a5,a4,a5
   1026c:	0007c783          	lbu	a5,0(a5)
   10270:	fa0796e3          	bnez	a5,1021c <isAnagram+0x98>
   10274:	fe042223          	sw	zero,-28(s0)
   10278:	0300006f          	j	102a8 <isAnagram+0x124>
   1027c:	fe442783          	lw	a5,-28(s0)
   10280:	00279793          	slli	a5,a5,0x2
   10284:	ff078793          	addi	a5,a5,-16
   10288:	008787b3          	add	a5,a5,s0
   1028c:	f8c7a783          	lw	a5,-116(a5)
   10290:	00078663          	beqz	a5,1029c <isAnagram+0x118>
   10294:	00000793          	li	a5,0
   10298:	0200006f          	j	102b8 <isAnagram+0x134>
   1029c:	fe442783          	lw	a5,-28(s0)
   102a0:	00178793          	addi	a5,a5,1
   102a4:	fef42223          	sw	a5,-28(s0)
   102a8:	fe442703          	lw	a4,-28(s0)
   102ac:	01900793          	li	a5,25
   102b0:	fce7d6e3          	bge	a5,a4,1027c <isAnagram+0xf8>
   102b4:	00100793          	li	a5,1
   102b8:	00078513          	mv	a0,a5
   102bc:	09c12083          	lw	ra,156(sp)
   102c0:	09812403          	lw	s0,152(sp)
   102c4:	0a010113          	addi	sp,sp,160
   102c8:	00008067          	ret

000102cc <main>:
   102cc:	fd010113          	addi	sp,sp,-48
   102d0:	02112623          	sw	ra,44(sp)
   102d4:	02812423          	sw	s0,40(sp)
   102d8:	03010413          	addi	s0,sp,48
   102dc:	000147b7          	lui	a5,0x14
   102e0:	b3c78793          	addi	a5,a5,-1220 # 13b3c <__modsi3+0x30>
   102e4:	fef42623          	sw	a5,-20(s0)
   102e8:	000147b7          	lui	a5,0x14
   102ec:	b4478793          	addi	a5,a5,-1212 # 13b44 <__modsi3+0x38>
   102f0:	fef42423          	sw	a5,-24(s0)
   102f4:	000147b7          	lui	a5,0x14
   102f8:	b4c78793          	addi	a5,a5,-1204 # 13b4c <__modsi3+0x40>
   102fc:	fef42223          	sw	a5,-28(s0)
   10300:	000147b7          	lui	a5,0x14
   10304:	b3c78793          	addi	a5,a5,-1220 # 13b3c <__modsi3+0x30>
   10308:	fef42023          	sw	a5,-32(s0)
   1030c:	000147b7          	lui	a5,0x14
   10310:	b5078793          	addi	a5,a5,-1200 # 13b50 <__modsi3+0x44>
   10314:	fcf42e23          	sw	a5,-36(s0)
   10318:	000147b7          	lui	a5,0x14
   1031c:	b5878793          	addi	a5,a5,-1192 # 13b58 <__modsi3+0x4c>
   10320:	fcf42c23          	sw	a5,-40(s0)
   10324:	fe842583          	lw	a1,-24(s0)
   10328:	fec42503          	lw	a0,-20(s0)
   1032c:	e59ff0ef          	jal	ra,10184 <isAnagram>
   10330:	00050713          	mv	a4,a0
   10334:	00100793          	li	a5,1
   10338:	00f71a63          	bne	a4,a5,1034c <main+0x80>
   1033c:	000147b7          	lui	a5,0x14
   10340:	b6078513          	addi	a0,a5,-1184 # 13b60 <__modsi3+0x54>
   10344:	39c000ef          	jal	ra,106e0 <puts>
   10348:	0100006f          	j	10358 <main+0x8c>
   1034c:	000147b7          	lui	a5,0x14
   10350:	b7078513          	addi	a0,a5,-1168 # 13b70 <__modsi3+0x64>
   10354:	38c000ef          	jal	ra,106e0 <puts>
   10358:	fe042583          	lw	a1,-32(s0)
   1035c:	fe442503          	lw	a0,-28(s0)
   10360:	e25ff0ef          	jal	ra,10184 <isAnagram>
   10364:	00050793          	mv	a5,a0
   10368:	00079a63          	bnez	a5,1037c <main+0xb0>
   1036c:	000147b7          	lui	a5,0x14
   10370:	b8478513          	addi	a0,a5,-1148 # 13b84 <__modsi3+0x78>
   10374:	36c000ef          	jal	ra,106e0 <puts>
   10378:	0100006f          	j	10388 <main+0xbc>
   1037c:	000147b7          	lui	a5,0x14
   10380:	b9478513          	addi	a0,a5,-1132 # 13b94 <__modsi3+0x88>
   10384:	35c000ef          	jal	ra,106e0 <puts>
   10388:	fd842583          	lw	a1,-40(s0)
   1038c:	fdc42503          	lw	a0,-36(s0)
   10390:	df5ff0ef          	jal	ra,10184 <isAnagram>
   10394:	00050713          	mv	a4,a0
   10398:	00100793          	li	a5,1
   1039c:	00f71a63          	bne	a4,a5,103b0 <main+0xe4>
   103a0:	000147b7          	lui	a5,0x14
   103a4:	ba878513          	addi	a0,a5,-1112 # 13ba8 <__modsi3+0x9c>
   103a8:	338000ef          	jal	ra,106e0 <puts>
   103ac:	0100006f          	j	103bc <main+0xf0>
   103b0:	000147b7          	lui	a5,0x14
   103b4:	bb878513          	addi	a0,a5,-1096 # 13bb8 <__modsi3+0xac>
   103b8:	328000ef          	jal	ra,106e0 <puts>
   103bc:	00000793          	li	a5,0
   103c0:	00078513          	mv	a0,a5
   103c4:	02c12083          	lw	ra,44(sp)
   103c8:	02812403          	lw	s0,40(sp)
   103cc:	03010113          	addi	sp,sp,48
   103d0:	00008067          	ret

../../build/rv32emu hw2_origin_0O.elf
It can run succesfully.

Observation

LOC: 82
Rigister used
- S register: s0
- A register: a0 to a5
- T register: none
- Other: ra
- Total: 8
Stack used: 160 bytes
Number of load/save: 34
Branch Number: 9

Explanation

`<main>`

From line 90 to line 107 stored three testcases' address into stack, from -16(s0) to -40(s0).
line 108 and line 109 load testcase address to a1, a0 as function argument
- a1: -24(s0) stands for test_1_t in C
- a0: -20(s0) stands for test_1_s in C
line 110 to line 113 call function and a0 as the return value (1 or 0), and then move(mv) a0 to a4
- a4 stands for isAnagram True or False
- a5 stands for the answer(is True) for this testcase.
line 113 bne a4,a5,1034c <main+0x80>, make a decision if a4 equals to a5,
- if equals (means testcase PASS), print test_1: correct\n
- if not equals (means testcase FAIL), print test_1: not_correct\n

Remaining is three duplicated work for testing three testcases, and the end is load word and return.

`<isAnagram>`

Argument input

a0 for test_1_s
a1 for test_1_t

Just simply go through without any optimization.

Detail is in Note

-O1

Try to reduce code size and execution time without increasing compilation time too much.

riscv-none-elf-gcc -Wall -march=rv32i -mabi=ilp32 -O1 hw2_origin.c -o hw2_origin_O1.elf

riscv-none-elf-size hw2_origin_O1.elf

Size

text	   data	    bss	    dec	    hex	filename
1710	   1096	     61	   2867	    b33	hw2_origin_O1.elf

Disassembly code

riscv-none-elf-objdump -d hw2_origin_O1.elf > dis_O1.txt
















































































00010184 <isAnagram>:
   10184:	f8010113          	addi	sp,sp,-128
   10188:	06112e23          	sw	ra,124(sp)
   1018c:	06812c23          	sw	s0,120(sp)
   10190:	06912a23          	sw	s1,116(sp)
   10194:	00050493          	mv	s1,a0
   10198:	00058413          	mv	s0,a1
   1019c:	06800613          	li	a2,104
   101a0:	00000593          	li	a1,0
   101a4:	00810513          	addi	a0,sp,8
   101a8:	214000ef          	jal	ra,103bc <memset>
   101ac:	0004c783          	lbu	a5,0(s1)
   101b0:	02078863          	beqz	a5,101e0 <isAnagram+0x5c>
   101b4:	00148513          	addi	a0,s1,1
   101b8:	f9f78793          	addi	a5,a5,-97
   101bc:	00279793          	slli	a5,a5,0x2
   101c0:	07078793          	addi	a5,a5,112
   101c4:	002787b3          	add	a5,a5,sp
   101c8:	f987a703          	lw	a4,-104(a5)
   101cc:	00170713          	addi	a4,a4,1
   101d0:	f8e7ac23          	sw	a4,-104(a5)
   101d4:	00150513          	addi	a0,a0,1
   101d8:	fff54783          	lbu	a5,-1(a0)
   101dc:	fc079ee3          	bnez	a5,101b8 <isAnagram+0x34>
   101e0:	00044783          	lbu	a5,0(s0)
   101e4:	02078863          	beqz	a5,10214 <isAnagram+0x90>
   101e8:	00140593          	addi	a1,s0,1
   101ec:	f9f78793          	addi	a5,a5,-97
   101f0:	00279793          	slli	a5,a5,0x2
   101f4:	07078793          	addi	a5,a5,112
   101f8:	002787b3          	add	a5,a5,sp
   101fc:	f987a703          	lw	a4,-104(a5)
   10200:	fff70713          	addi	a4,a4,-1
   10204:	f8e7ac23          	sw	a4,-104(a5)
   10208:	00158593          	addi	a1,a1,1
   1020c:	fff5c783          	lbu	a5,-1(a1)
   10210:	fc079ee3          	bnez	a5,101ec <isAnagram+0x68>
   10214:	00810793          	addi	a5,sp,8
   10218:	07010693          	addi	a3,sp,112
   1021c:	0007a703          	lw	a4,0(a5)
   10220:	00071a63          	bnez	a4,10234 <isAnagram+0xb0>
   10224:	00478793          	addi	a5,a5,4
   10228:	fed79ae3          	bne	a5,a3,1021c <isAnagram+0x98>
   1022c:	00100513          	li	a0,1
   10230:	0080006f          	j	10238 <isAnagram+0xb4>
   10234:	00000513          	li	a0,0
   10238:	07c12083          	lw	ra,124(sp)
   1023c:	07812403          	lw	s0,120(sp)
   10240:	07412483          	lw	s1,116(sp)
   10244:	08010113          	addi	sp,sp,128
   10248:	00008067          	ret

0001024c <main>:
   1024c:	fe010113          	addi	sp,sp,-32
   10250:	00112e23          	sw	ra,28(sp)
   10254:	000105b7          	lui	a1,0x10
   10258:	71c58593          	addi	a1,a1,1820 # 1071c <__errno+0x8>
   1025c:	00010537          	lui	a0,0x10
   10260:	72450513          	addi	a0,a0,1828 # 10724 <__errno+0x10>
   10264:	f21ff0ef          	jal	ra,10184 <isAnagram>
   10268:	00a12623          	sw	a0,12(sp)
   1026c:	000105b7          	lui	a1,0x10
   10270:	72c58593          	addi	a1,a1,1836 # 1072c <__errno+0x18>
   10274:	00010537          	lui	a0,0x10
   10278:	73050513          	addi	a0,a0,1840 # 10730 <__errno+0x1c>
   1027c:	f09ff0ef          	jal	ra,10184 <isAnagram>
   10280:	00a12423          	sw	a0,8(sp)
   10284:	000105b7          	lui	a1,0x10
   10288:	73458593          	addi	a1,a1,1844 # 10734 <__errno+0x20>
   1028c:	00010537          	lui	a0,0x10
   10290:	73c50513          	addi	a0,a0,1852 # 1073c <__errno+0x28>
   10294:	ef1ff0ef          	jal	ra,10184 <isAnagram>
   10298:	00a12223          	sw	a0,4(sp)
   1029c:	00c12783          	lw	a5,12(sp)
   102a0:	00812783          	lw	a5,8(sp)
   102a4:	00412783          	lw	a5,4(sp)
   102a8:	00000513          	li	a0,0
   102ac:	01c12083          	lw	ra,28(sp)
   102b0:	02010113          	addi	sp,sp,32
   102b4:	00008067          	ret

Observation

LOC: 50
Rigister used
- S register: s0, s1
- A register: a0 to a5
- T register: none
- Other: ra
- Total: 9
Stack used: 128 bytes
Number of load and save: 15
Branch Number: 8

../../build/rv32emu hw2_origin_O1.elf
It can run successfully

Explanation

-O1 optimization generate much more shorter code

`<main>`

line 56 to line 59 load a1, a0 for testcase
- a1: stands for test_1_t in C
- a0: stands for test_1_s in C
line 60 call isAnagram with associate input, and return a0 (1 if TRUE, 0 if FALSE)
Remaining are three duplicated code for each three testcases.

`<isAnagram>`

Argument input

a0 for test_1_s
a1 for test_1_t
Detail is writen in Note

-O2

Optimize more

riscv-none-elf-gcc -Wall -march=rv32i -mabi=ilp32 -O2 hw2_origin.c -o hw2_origin_O2.elf

Size

riscv-none-elf-size hw2_origin_O2.elf

text	   data	    bss	    dec	    hex	filename
1738	   1096	     61	   2895	    b4f	hw2_origin_O2.elf

Disassembly code

riscv-none-elf-objdump -d hw2_origin_O2.elf > dis_O2.txt























































































000100c4 <main>:
   100c4:	000105b7          	lui	a1,0x10
   100c8:	00010537          	lui	a0,0x10
   100cc:	fe010113          	addi	sp,sp,-32
   100d0:	73858593          	addi	a1,a1,1848 # 10738 <__errno+0x8>
   100d4:	74050513          	addi	a0,a0,1856 # 10740 <__errno+0x10>
   100d8:	00112e23          	sw	ra,28(sp)
   100dc:	11c000ef          	jal	ra,101f8 <isAnagram>
   100e0:	00050793          	mv	a5,a0
   100e4:	000105b7          	lui	a1,0x10
   100e8:	00010537          	lui	a0,0x10
   100ec:	74858593          	addi	a1,a1,1864 # 10748 <__errno+0x18>
   100f0:	74c50513          	addi	a0,a0,1868 # 1074c <__errno+0x1c>
   100f4:	00f12223          	sw	a5,4(sp)
   100f8:	100000ef          	jal	ra,101f8 <isAnagram>
   100fc:	00050793          	mv	a5,a0
   10100:	000105b7          	lui	a1,0x10
   10104:	00010537          	lui	a0,0x10
   10108:	75058593          	addi	a1,a1,1872 # 10750 <__errno+0x20>
   1010c:	75850513          	addi	a0,a0,1880 # 10758 <__errno+0x28>
   10110:	00f12423          	sw	a5,8(sp)
   10114:	0e4000ef          	jal	ra,101f8 <isAnagram>
   10118:	00a12623          	sw	a0,12(sp)
   1011c:	01c12083          	lw	ra,28(sp)
   10120:	00412783          	lw	a5,4(sp)
   10124:	00812783          	lw	a5,8(sp)
   10128:	00c12783          	lw	a5,12(sp)
   1012c:	00000513          	li	a0,0
   10130:	02010113          	addi	sp,sp,32
   10134:	00008067          	ret
   
000101f8 <isAnagram>:
   101f8:	f8010113          	addi	sp,sp,-128
   101fc:	06812c23          	sw	s0,120(sp)
   10200:	06912a23          	sw	s1,116(sp)
   10204:	00058413          	mv	s0,a1
   10208:	00050493          	mv	s1,a0
   1020c:	06800613          	li	a2,104
   10210:	00000593          	li	a1,0
   10214:	00810513          	addi	a0,sp,8
   10218:	06112e23          	sw	ra,124(sp)
   1021c:	1bc000ef          	jal	ra,103d8 <memset>
   10220:	0004c783          	lbu	a5,0(s1)
   10224:	02078863          	beqz	a5,10254 <isAnagram+0x5c>
   10228:	00148513          	addi	a0,s1,1
   1022c:	f9f78793          	addi	a5,a5,-97
   10230:	00279793          	slli	a5,a5,0x2
   10234:	07078793          	addi	a5,a5,112
   10238:	002786b3          	add	a3,a5,sp
   1023c:	f986a703          	lw	a4,-104(a3)
   10240:	00054783          	lbu	a5,0(a0)
   10244:	00150513          	addi	a0,a0,1
   10248:	00170713          	addi	a4,a4,1
   1024c:	f8e6ac23          	sw	a4,-104(a3)
   10250:	fc079ee3          	bnez	a5,1022c <isAnagram+0x34>
   10254:	00044783          	lbu	a5,0(s0)
   10258:	02078863          	beqz	a5,10288 <isAnagram+0x90>
   1025c:	00140593          	addi	a1,s0,1
   10260:	f9f78793          	addi	a5,a5,-97
   10264:	00279793          	slli	a5,a5,0x2
   10268:	07078793          	addi	a5,a5,112
   1026c:	002786b3          	add	a3,a5,sp
   10270:	f986a703          	lw	a4,-104(a3)
   10274:	0005c783          	lbu	a5,0(a1)
   10278:	00158593          	addi	a1,a1,1
   1027c:	fff70713          	addi	a4,a4,-1
   10280:	f8e6ac23          	sw	a4,-104(a3)
   10284:	fc079ee3          	bnez	a5,10260 <isAnagram+0x68>
   10288:	00810793          	addi	a5,sp,8
   1028c:	07010693          	addi	a3,sp,112
   10290:	0080006f          	j	10298 <isAnagram+0xa0>
   10294:	02f68463          	beq	a3,a5,102bc <isAnagram+0xc4>
   10298:	0007a703          	lw	a4,0(a5)
   1029c:	00478793          	addi	a5,a5,4
   102a0:	fe070ae3          	beqz	a4,10294 <isAnagram+0x9c>
   102a4:	07c12083          	lw	ra,124(sp)
   102a8:	07812403          	lw	s0,120(sp)
   102ac:	07412483          	lw	s1,116(sp)
   102b0:	00000513          	li	a0,0
   102b4:	08010113          	addi	sp,sp,128
   102b8:	00008067          	ret
   102bc:	07c12083          	lw	ra,124(sp)
   102c0:	07812403          	lw	s0,120(sp)
   102c4:	07412483          	lw	s1,116(sp)
   102c8:	00100513          	li	a0,1
   102cc:	08010113          	addi	sp,sp,128
   102d0:	00008067          	ret

It can run successfully

Observation

LOC: 55
Rigister used
- S register: s0, s1
- A register: a0 to a5
- T register: none
- Other: ra
- Total: 9
Stack used: 128 bytes
Number of load and save: 18
Branch Number: 8

Explanation

`<main>`

For testcase1
In -O2 optimization, main function used s0 to handle the stack address for a0(line 5), where in -O1 optimization, used a0. So in -O2 optimization, it need additional instruction to stored s0(line3).

For testcase2
Simply used s0 and a1 and its offset to handle the string address(line 15 to line 17), so it doesn't need another lui instruction to reload stack point (like line 4)

For testcase3
Simply use a0 and a1 and its offset to handle the string address(line 23 to line 26)

I think it can also use s0 and associate offset rather than used a0 or a1 with reload the stack pointer (lui a1, 0x14 and lui a0, 0x14)

`<isAnagram>`

Argument input

a0 for test_1_s
a1 for test_1_t
Detail is in Note

-O3

Optimize even more

riscv-none-elf-gcc -Wall -march=rv32i -mabi=ilp32 -O3 hw2_origin.c -o hw2_origin_O3.elf

Size

riscv-none-elf-size hw2_origin_O3.elf

text	   data	    bss	    dec	    hex	filename
1738	   1096	     61	   2895	    b4f	hw2_origin_O2.elf

Disassembly code

riscv-none-elf-objdump -d hw2_origin_O3.elf > dis_O3.txt























































































000100c4 <main>:
   100c4:	000105b7          	lui	a1,0x10
   100c8:	00010537          	lui	a0,0x10
   100cc:	fe010113          	addi	sp,sp,-32
   100d0:	73858593          	addi	a1,a1,1848 # 10738 <__errno+0x8>
   100d4:	74050513          	addi	a0,a0,1856 # 10740 <__errno+0x10>
   100d8:	00112e23          	sw	ra,28(sp)
   100dc:	11c000ef          	jal	ra,101f8 <isAnagram>
   100e0:	00050793          	mv	a5,a0
   100e4:	000105b7          	lui	a1,0x10
   100e8:	00010537          	lui	a0,0x10
   100ec:	74858593          	addi	a1,a1,1864 # 10748 <__errno+0x18>
   100f0:	74c50513          	addi	a0,a0,1868 # 1074c <__errno+0x1c>
   100f4:	00f12223          	sw	a5,4(sp)
   100f8:	100000ef          	jal	ra,101f8 <isAnagram>
   100fc:	00050793          	mv	a5,a0
   10100:	000105b7          	lui	a1,0x10
   10104:	00010537          	lui	a0,0x10
   10108:	75058593          	addi	a1,a1,1872 # 10750 <__errno+0x20>
   1010c:	75850513          	addi	a0,a0,1880 # 10758 <__errno+0x28>
   10110:	00f12423          	sw	a5,8(sp)
   10114:	0e4000ef          	jal	ra,101f8 <isAnagram>
   10118:	00a12623          	sw	a0,12(sp)
   1011c:	01c12083          	lw	ra,28(sp)
   10120:	00412783          	lw	a5,4(sp)
   10124:	00812783          	lw	a5,8(sp)
   10128:	00c12783          	lw	a5,12(sp)
   1012c:	00000513          	li	a0,0
   10130:	02010113          	addi	sp,sp,32
   10134:	00008067          	ret

000101f8 <isAnagram>:
   101f8:	f8010113          	addi	sp,sp,-128
   101fc:	06812c23          	sw	s0,120(sp)
   10200:	06912a23          	sw	s1,116(sp)
   10204:	00058413          	mv	s0,a1
   10208:	00050493          	mv	s1,a0
   1020c:	06800613          	li	a2,104
   10210:	00000593          	li	a1,0
   10214:	00810513          	addi	a0,sp,8
   10218:	06112e23          	sw	ra,124(sp)
   1021c:	1bc000ef          	jal	ra,103d8 <memset>
   10220:	0004c783          	lbu	a5,0(s1)
   10224:	02078863          	beqz	a5,10254 <isAnagram+0x5c>
   10228:	00148513          	addi	a0,s1,1
   1022c:	f9f78793          	addi	a5,a5,-97
   10230:	00279793          	slli	a5,a5,0x2
   10234:	07078793          	addi	a5,a5,112
   10238:	002786b3          	add	a3,a5,sp
   1023c:	f986a703          	lw	a4,-104(a3)
   10240:	00054783          	lbu	a5,0(a0)
   10244:	00150513          	addi	a0,a0,1
   10248:	00170713          	addi	a4,a4,1
   1024c:	f8e6ac23          	sw	a4,-104(a3)
   10250:	fc079ee3          	bnez	a5,1022c <isAnagram+0x34>
   10254:	00044783          	lbu	a5,0(s0)
   10258:	02078863          	beqz	a5,10288 <isAnagram+0x90>
   1025c:	00140593          	addi	a1,s0,1
   10260:	f9f78793          	addi	a5,a5,-97
   10264:	00279793          	slli	a5,a5,0x2
   10268:	07078793          	addi	a5,a5,112
   1026c:	002786b3          	add	a3,a5,sp
   10270:	f986a703          	lw	a4,-104(a3)
   10274:	0005c783          	lbu	a5,0(a1)
   10278:	00158593          	addi	a1,a1,1
   1027c:	fff70713          	addi	a4,a4,-1
   10280:	f8e6ac23          	sw	a4,-104(a3)
   10284:	fc079ee3          	bnez	a5,10260 <isAnagram+0x68>
   10288:	00810793          	addi	a5,sp,8
   1028c:	07010693          	addi	a3,sp,112
   10290:	0080006f          	j	10298 <isAnagram+0xa0>
   10294:	02f68463          	beq	a3,a5,102bc <isAnagram+0xc4>
   10298:	0007a703          	lw	a4,0(a5)
   1029c:	00478793          	addi	a5,a5,4
   102a0:	fe070ae3          	beqz	a4,10294 <isAnagram+0x9c>
   102a4:	07c12083          	lw	ra,124(sp)
   102a8:	07812403          	lw	s0,120(sp)
   102ac:	07412483          	lw	s1,116(sp)
   102b0:	00000513          	li	a0,0
   102b4:	08010113          	addi	sp,sp,128
   102b8:	00008067          	ret
   102bc:	07c12083          	lw	ra,124(sp)
   102c0:	07812403          	lw	s0,120(sp)
   102c4:	07412483          	lw	s1,116(sp)
   102c8:	00100513          	li	a0,1
   102cc:	08010113          	addi	sp,sp,128
   102d0:	00008067          	ret

It can run successfully
../../build rv32emu hw2_origin_O3.elf

Observation

LOC: 55
Rigister used
- S register: s0, s1
- A register: a0 to a5
- T register: none
- Other: ra
- Total: 9
Stack used: 128
Number of load and save: 18
Branch Number: 8

Explanation

In my case, the -O3 optimization is totally same as -O2.

-Ofast

riscv-none-elf-gcc -Wall -march=rv32i -mabi=ilp32 -Ofast hw2_origin.c -o hw2_origin_Ofast.elf

Size

riscv-none-elf-size hw2_origin_Ofast.elf

text	   data	    bss	    dec	    hex	filename
1738	   1096	     61	   2895	    b4f	hw2_origin_O2.elf

Disassembly code

riscv-none-elf-objdump -d hw2_origin_Ofast.elf > dis_Ofast.txt























































































000100c4 <main>:
   100c4:	000105b7          	lui	a1,0x10
   100c8:	00010537          	lui	a0,0x10
   100cc:	fe010113          	addi	sp,sp,-32
   100d0:	73858593          	addi	a1,a1,1848 # 10738 <__errno+0x8>
   100d4:	74050513          	addi	a0,a0,1856 # 10740 <__errno+0x10>
   100d8:	00112e23          	sw	ra,28(sp)
   100dc:	11c000ef          	jal	ra,101f8 <isAnagram>
   100e0:	00050793          	mv	a5,a0
   100e4:	000105b7          	lui	a1,0x10
   100e8:	00010537          	lui	a0,0x10
   100ec:	74858593          	addi	a1,a1,1864 # 10748 <__errno+0x18>
   100f0:	74c50513          	addi	a0,a0,1868 # 1074c <__errno+0x1c>
   100f4:	00f12223          	sw	a5,4(sp)
   100f8:	100000ef          	jal	ra,101f8 <isAnagram>
   100fc:	00050793          	mv	a5,a0
   10100:	000105b7          	lui	a1,0x10
   10104:	00010537          	lui	a0,0x10
   10108:	75058593          	addi	a1,a1,1872 # 10750 <__errno+0x20>
   1010c:	75850513          	addi	a0,a0,1880 # 10758 <__errno+0x28>
   10110:	00f12423          	sw	a5,8(sp)
   10114:	0e4000ef          	jal	ra,101f8 <isAnagram>
   10118:	00a12623          	sw	a0,12(sp)
   1011c:	01c12083          	lw	ra,28(sp)
   10120:	00412783          	lw	a5,4(sp)
   10124:	00812783          	lw	a5,8(sp)
   10128:	00c12783          	lw	a5,12(sp)
   1012c:	00000513          	li	a0,0
   10130:	02010113          	addi	sp,sp,32
   10134:	00008067          	ret
   
000101f8 <isAnagram>:
   101f8:	f8010113          	addi	sp,sp,-128
   101fc:	06812c23          	sw	s0,120(sp)
   10200:	06912a23          	sw	s1,116(sp)
   10204:	00058413          	mv	s0,a1
   10208:	00050493          	mv	s1,a0
   1020c:	06800613          	li	a2,104
   10210:	00000593          	li	a1,0
   10214:	00810513          	addi	a0,sp,8
   10218:	06112e23          	sw	ra,124(sp)
   1021c:	1bc000ef          	jal	ra,103d8 <memset>
   10220:	0004c783          	lbu	a5,0(s1)
   10224:	02078863          	beqz	a5,10254 <isAnagram+0x5c>
   10228:	00148513          	addi	a0,s1,1
   1022c:	f9f78793          	addi	a5,a5,-97
   10230:	00279793          	slli	a5,a5,0x2
   10234:	07078793          	addi	a5,a5,112
   10238:	002786b3          	add	a3,a5,sp
   1023c:	f986a703          	lw	a4,-104(a3)
   10240:	00054783          	lbu	a5,0(a0)
   10244:	00150513          	addi	a0,a0,1
   10248:	00170713          	addi	a4,a4,1
   1024c:	f8e6ac23          	sw	a4,-104(a3)
   10250:	fc079ee3          	bnez	a5,1022c <isAnagram+0x34>
   10254:	00044783          	lbu	a5,0(s0)
   10258:	02078863          	beqz	a5,10288 <isAnagram+0x90>
   1025c:	00140593          	addi	a1,s0,1
   10260:	f9f78793          	addi	a5,a5,-97
   10264:	00279793          	slli	a5,a5,0x2
   10268:	07078793          	addi	a5,a5,112
   1026c:	002786b3          	add	a3,a5,sp
   10270:	f986a703          	lw	a4,-104(a3)
   10274:	0005c783          	lbu	a5,0(a1)
   10278:	00158593          	addi	a1,a1,1
   1027c:	fff70713          	addi	a4,a4,-1
   10280:	f8e6ac23          	sw	a4,-104(a3)
   10284:	fc079ee3          	bnez	a5,10260 <isAnagram+0x68>
   10288:	00810793          	addi	a5,sp,8
   1028c:	07010693          	addi	a3,sp,112
   10290:	0080006f          	j	10298 <isAnagram+0xa0>
   10294:	02f68463          	beq	a3,a5,102bc <isAnagram+0xc4>
   10298:	0007a703          	lw	a4,0(a5)
   1029c:	00478793          	addi	a5,a5,4
   102a0:	fe070ae3          	beqz	a4,10294 <isAnagram+0x9c>
   102a4:	07c12083          	lw	ra,124(sp)
   102a8:	07812403          	lw	s0,120(sp)
   102ac:	07412483          	lw	s1,116(sp)
   102b0:	00000513          	li	a0,0
   102b4:	08010113          	addi	sp,sp,128
   102b8:	00008067          	ret
   102bc:	07c12083          	lw	ra,124(sp)
   102c0:	07812403          	lw	s0,120(sp)
   102c4:	07412483          	lw	s1,116(sp)
   102c8:	00100513          	li	a0,1
   102cc:	08010113          	addi	sp,sp,128
   102d0:	00008067          	ret

../../build/rv32emu hw2_origin_Ofast.elf
It can run successfully

Observation

LOC: 55
Rigister used
- S register: s0, s1
- A register: a0 to a5
- T register: none
- Other: ra
- Total: 9
Stack used: 128
Number of load/save: 18
Branch Number: 8

Explanation

In my case, -Ofast is totally same with -O2 and -O3.

-Os

Optimize for size

riscv-none-elf-gcc -Wall -march=rv32i -mabi=ilp32 -Os hw2_origin.c -o hw2_origin_Os.elf

Size

riscv-none-elf-size hw2_origin_Os.elf

text	   data	    bss	    dec	    hex	filename
1702	   1096	     61	   2859	    b2b	hw2_origin_Os.elf

Disassembly code

riscv-none-elf-objdump -d hw2_origin_Os.elf > dis_Os.txt














































































000100c4 <main>:
   100c4:	000105b7          	lui	a1,0x10
   100c8:	00010537          	lui	a0,0x10
   100cc:	fe010113          	addi	sp,sp,-32
   100d0:	71458593          	addi	a1,a1,1812 # 10714 <__errno+0x8>
   100d4:	71c50513          	addi	a0,a0,1820 # 1071c <__errno+0x10>
   100d8:	00112e23          	sw	ra,28(sp)
   100dc:	114000ef          	jal	ra,101f0 <isAnagram>
   100e0:	00a12223          	sw	a0,4(sp)
   100e4:	000105b7          	lui	a1,0x10
   100e8:	00010537          	lui	a0,0x10
   100ec:	72458593          	addi	a1,a1,1828 # 10724 <__errno+0x18>
   100f0:	72850513          	addi	a0,a0,1832 # 10728 <__errno+0x1c>
   100f4:	0fc000ef          	jal	ra,101f0 <isAnagram>
   100f8:	00a12423          	sw	a0,8(sp)
   100fc:	000105b7          	lui	a1,0x10
   10100:	00010537          	lui	a0,0x10
   10104:	72c58593          	addi	a1,a1,1836 # 1072c <__errno+0x20>
   10108:	73450513          	addi	a0,a0,1844 # 10734 <__errno+0x28>
   1010c:	0e4000ef          	jal	ra,101f0 <isAnagram>
   10110:	00a12623          	sw	a0,12(sp)
   10114:	01c12083          	lw	ra,28(sp)
   10118:	00412783          	lw	a5,4(sp)
   1011c:	00812783          	lw	a5,8(sp)
   10120:	00c12783          	lw	a5,12(sp)
   10124:	00000513          	li	a0,0
   10128:	02010113          	addi	sp,sp,32
   1012c:	00008067          	ret

000101f0 <isAnagram>:
   101f0:	f8010113          	addi	sp,sp,-128
   101f4:	06812c23          	sw	s0,120(sp)
   101f8:	06912a23          	sw	s1,116(sp)
   101fc:	00058413          	mv	s0,a1
   10200:	00050493          	mv	s1,a0
   10204:	06800613          	li	a2,104
   10208:	00000593          	li	a1,0
   1020c:	00810513          	addi	a0,sp,8
   10210:	06112e23          	sw	ra,124(sp)
   10214:	1a0000ef          	jal	ra,103b4 <memset>
   10218:	00048513          	mv	a0,s1
   1021c:	00054783          	lbu	a5,0(a0)
   10220:	00150513          	addi	a0,a0,1
   10224:	04079263          	bnez	a5,10268 <isAnagram+0x78>
   10228:	00040593          	mv	a1,s0
   1022c:	0005c783          	lbu	a5,0(a1)
   10230:	00158593          	addi	a1,a1,1
   10234:	04079a63          	bnez	a5,10288 <isAnagram+0x98>
   10238:	00810793          	addi	a5,sp,8
   1023c:	0007a703          	lw	a4,0(a5)
   10240:	06071463          	bnez	a4,102a8 <isAnagram+0xb8>
   10244:	00478793          	addi	a5,a5,4
   10248:	07010713          	addi	a4,sp,112
   1024c:	fee798e3          	bne	a5,a4,1023c <isAnagram+0x4c>
   10250:	00100513          	li	a0,1
   10254:	07c12083          	lw	ra,124(sp)
   10258:	07812403          	lw	s0,120(sp)
   1025c:	07412483          	lw	s1,116(sp)
   10260:	08010113          	addi	sp,sp,128
   10264:	00008067          	ret
   10268:	f9f78793          	addi	a5,a5,-97
   1026c:	00279793          	slli	a5,a5,0x2
   10270:	07078793          	addi	a5,a5,112
   10274:	002787b3          	add	a5,a5,sp
   10278:	f987a703          	lw	a4,-104(a5)
   1027c:	00170713          	addi	a4,a4,1
   10280:	f8e7ac23          	sw	a4,-104(a5)
   10284:	f99ff06f          	j	1021c <isAnagram+0x2c>
   10288:	f9f78793          	addi	a5,a5,-97
   1028c:	00279793          	slli	a5,a5,0x2
   10290:	07078793          	addi	a5,a5,112
   10294:	002787b3          	add	a5,a5,sp
   10298:	f987a703          	lw	a4,-104(a5)
   1029c:	fff70713          	addi	a4,a4,-1
   102a0:	f8e7ac23          	sw	a4,-104(a5)
   102a4:	f89ff06f          	j	1022c <isAnagram+0x3c>
   102a8:	00000513          	li	a0,0
   102ac:	fa9ff06f          	j	10254 <isAnagram+0x64>

It can run successfully

Observation

LOC: 48
Rigister used
- S register: s0, s1
- A register: a0 to a5
- T register: none
- Other: ra
- Total: 9
Stack used: 128 bytes
Number of load/save: 13
Branch Number: 8

Explanation

`<main>`

In -Os optimization, tend to minimize the code size, In main, used existed jal ra, 10604<put> in line 40 and line 43

`<isAnagram>`

Argument input

a0 for test_1_s
a1 for test_1_t
-Os optimization generate much more bne rather than beq, more detail is writen in Note

Summary

It only discuss isAnagram, without main.

	O0	O1	O2	O3	Ofast	Os
LOC	82	50	55	55	55	48
Rigister	8	9	9	9	9	9
Stack (bytes)	160	128	128	128	128	128
# of Load/Save	34	15	18	18	18	13
# Branch	9	8	8	8	8	8

Comment all printf instruction in sorce code, compile and run rv32emu --stats

Also, add a line

volatile int a = isAnagram(test_1_s, test_1_t);

to make sure isAnagram will not be remove by any optimization

If didn't do this, this function will never run under -O1 level or higher.

	O0	O1	O2	O3	Ofast	Os
CSR Cycle	1688	811	837	837	837	837

Noticed that in my case, -O2, -O3, -Ofast seems completely same. And for -Os although there is a little different in disassembly, but has the same number of CSR Cycle, which is 837.

Below is my opnion of how optimization level influence my case after studying this article.

-O0 doesn't do any optimization, so it runs slow and has the most code size is intuitive.
-O1 try to reduce code size and execution time.
-O2 optimize nearly all supported optimizations but no involve a space-speed tradeoff(means does not perform loop unrolling)
-O3 turns on all optimizations specified by -O2, including loop unrolling and function inlining
- However, there is nothing change in my case compared with -O2 level. In my opinion, in my C code, the for loop condition is whether s[i] or t[i] exist or not, it is actually not a constant, the value will know in execution. So in the compile stage, there is nothing change.
-Ofast enables all -O3 optimizations and optimizations that are not valid for all standard-compliant programs. It turns on -ffast-math
- -ffast-math trigger non-standards-compilant floating point optimization, it allow compiler assume that floating point has infinite precision.
- In my code there is not any caculation about floating point. So the outcome is same with -O2
- I still not totally understand the explanation above, just quote from the other website.
-Os optimize for size, enables all -O2 optimizations except that often increase code size (such as -freorder-block-algorithm=stc).
- -freorder-block-algorithm=simple is used in -Os, which does not increase code size.
- -freorder-block-algorithm=stc is used in -O2, minimizing the number of branched executed by making extra copies of code.

However, rv32emu is an single-cycle simulator, which means that is not pipeline, and no hazard generated.

Rewrite the Problem

run the program

build/rv32emu tests/Hw2/hw2.elf

C code

Assembly code

Check the results and Optimization

In order to compare fairly, print PASS and FAIL to check the algorithm work correctly or not. When caculating CSR Cycle, comment the correspond instruction, such as.

.org 0
# Provide program starting address to linker
.global _start

/* newlib system calls */
.set SYSEXIT,  93
- .set SYSWRITE, 64

.data

test_1_s: .string "anagram"
test_1_t: .string "nagaram"
test_2_s: .string "rat"
test_2_t: .string "car"
test_3_s: .string "tseng"
test_3_t: .string "gnest"
- correct_1:       .string "test_1: Pass\n"
-                  .set t1cor_size, .-correct_1
- not_correct_1:   .string "test_1: Fail\n"
-                  .set t1incor_size, .-not_correct_1
- correct_2:       .string "test_2: Pass\n"
-                  .set t2cor_size, .-correct_2
- not_correct_2:   .string "test_2: Fail\n"
-                  .set t2incor_size, .-not_correct_2
- correct_3:       .string "test_3: Pass\n"
-                  .set t3cor_size, .-correct_3
- not_correct_3:   .string "test_3: Fail\n"
-                  .set t3incor_size, .-not_correct_3

.text

_start: 
-    li a7, SYSWRITE        # "write" system call
-    li a0, 1               #  1 = standard output (stdout)
    jal ra, TEST_1
    jal ra, TEST_2
    jal ra, TEST_3
    j end

TEST_1:
    addi sp, sp, -4
    sw ra, 0(sp)
    la a0, test_1_s        # s(a0) = test_1_s  
    la a1, test_1_t        # t(a1) = test_1_t
    jal ra, isAnagram      # call isAnagram(s(a0), t(a1))
    bne a0, x0, TRUE_1     # if isAnagram(s(a0), t(a1)) == 1 correct
-    li a0, 1               # reload a0 to handle stdout
-    la a1, not_correct_1   # test1 not correct address
-    la a2, t1incor_size    # test1 not correct length
-    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra

TEST_2:
    addi sp, sp, -4
    sw ra, 0(sp)
    la a0, test_2_s        # s(a0) = test_2_s
    la a1, test_2_t        # t(a1) = test_2_t
    jal ra, isAnagram	   # call isAnagram(s(a0), t(a1))
    beq a0, x0, TRUE_2	   # if isAnagram(s(a0), t(a1)) == 0 correct

-    la a1, not_correct_2   # test2 not correct address
-    la a2, t2incor_size    # test not correct length
-    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra

TEST_3:
    addi sp, sp, -4
    sw ra, 0(sp)
    la a0, test_3_s        # s(a0) = test_3_s
    la a1, test_3_t        # t(a1) = test_3_t
    jal ra, isAnagram	   # call isAnagram(s(a0), t(a1))
    bne a0, x0, TRUE_3     # if isAnagram(s(a0), t(a1)) == 1 correct
-    li a0, 1               # reload a0 to handle stdout
-    la a1, not_correct_3   # test3 not correct address
-    la a2, t3incor_size    # test3 not correct length
    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra

TRUE_1:
-    la a1, correct_1     # test1 correct address
-    la a2, t1cor_size    # test1 correct length
-    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra               # go to example2

TRUE_2:
-    li a0, 1             # reload a0 to handle stdout 
-    la a1, correct_2     # test2 correct address
-    la a2, t2cor_size    # test2 correct length
-    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra              # go to example3

TRUE_3:
-    la a1, correct_3     # test2 correct address
-    la a2, t3cor_size    # test2 correct length
-    ecall
    lw ra, 0(sp)
    addi sp, sp, 4
    jr ra
    
isAnagram:                  # a0 = s, a1 = t
    addi sp, sp, -104       # get sapce for store int letter_freq[26]
    addi t0, sp, 0          # t0 = letter_freq[0]
    addi t1, x0, 0          # t1 = i = 0
    li t2, 26
LOOP1:                      # int letter_freq[26] = {0};
    beq t1, t2, GET_FREQ_s  # if i < 26 
    sw x0, 0(t0)            # letter_freq[i] = 0;
    addi t1, t1, 1          # i++;
    addi t0, t0, 4          # 
    j LOOP1
	
GET_FREQ_s:
    addi t0, sp, 0         # t0 = letter_freq[0]
    addi t1, a0, 0         # t1 = s
    addi t2, x0, 0         # t2 = i = 0
LOOP2:                            # for(  ;s[i] ;i++ )
    add  t3, t1, t2         # get address of s[index] from s[0] + 
    lb t5, 0(t3)            # t5 = s[i]
    beq t5, x0, GET_FREQ_F # if s[i] ==0 break the loop
    addi t5, t5, -97         # t5 = s[i] - 'a'
    slli t5, t5, 2          # get offset form letter_freq[0] to letter_freq[s[i] - 'a']
    add t5, t5, t0          # get address of letter_freq[s[i] - 'a']
    lw t3, 0(t5)            # t3 = [freq[s[i] - 'a']]
    addi t3, t3, 1          # t3 = [freq[s[i] - 'a']] + 1
    sw t3, 0(t5)            # [freq[s[i] - 'a']] = ([freq[s[i] - 'a']] + 1) 
    addi t2, t2, 1          # i++
    j LOOP2
GET_FREQ_F:
    addi t0, sp, 0         # t0 = freq[]
    addi t1, a1, 0         # t1 = t
    addi t2, x0, 0         # t2 = i = 0
LOOP3:                      # for(  ;t[i] ;i++ )
    add  t3, t1, t2         # get address of t[index]
    lb t5, 0(t3)            # t5 = t[i]
    beq t5, x0, CHECK       # if t[i] == 0 break the loop
    addi t5, t5, -97        # t5 = t[i] - 'a'
    slli t5, t5, 2          # get offset form letter_freq[0] to letter_freq[t[i] - 'a']
    add t5, t5, t0        
    lw t3, 0(t5)            # t5 = [freq[t[i] - 'a']]
    addi t3, t3, -1         # t3 = [freq[t[i] - 'a']] - 1
    sw t3, 0(t5)            # [freq[t[i] - 'a']] = ([freq[t[i] - 'a']] - 1) 
    addi t2, t2, 1          # i++
    j LOOP3
CHECK:
    addi t0, sp, 0 # t0 = address freq[0]
    addi t1, x0, 0 # t1 = i = 0
    li t2, 26
LOOP4:                      # for (int i = 0; i < 26; i++)
    beq t1, t2, TRUE         # i < 26
    lw t3, 0(t0)            # t3 = freq[i];
    bne t3, x0, FALSE       # if freq[i] != 0 break
    addi t1, t1, 1         # i++;
    addi t0, t0, 4         # freq + 1
    j LOOP4

FALSE:
    addi a0, x0, 0          # if false return false
    j END_F
TRUE:
    addi a0, x0, 1          # if pass check return true
END_F:
    addi sp, sp, 104
    jr ra                   # return,  a0 = return value = true or false

end:
    li a7, SYSEXIT      # "exit" syscall
    add a0, x0, 0       # Use 0 return code
    ecall               # invoke syscall to terminate the program

Caculate the CSR Cycle

make
../../build rv32emu --stats hw2.elf

inferior exit code 0
CSR cycle count: 751

The handwritten assembly code is better than compiler in all optimization level.

Try to optimize




































































































































.org 0
# Provide program starting address to linker
.global _start

/* newlib system calls */
.set SYSEXIT,  93
.set SYSWRITE, 64


.data

test_1_s: .string "anagram"
test_1_t: .string "nagaram"
test_2_s: .string "rat"
test_2_t: .string "car"
test_3_s: .string "tseng"
test_3_t: .string "gnest"
correct_1:       .string "test_1: Pass\n"
                 .set t1cor_size, .-correct_1
not_correct_1:   .string "test_1: Fail\n"
                 .set t1incor_size, .-not_correct_1
correct_2:       .string "test_2: Pass\n"
                 .set t2cor_size, .-correct_2
not_correct_2:   .string "test_2: Fail\n"
                 .set t2incor_size, .-not_correct_2
correct_3:       .string "test_3: Pass\n"
                 .set t3cor_size, .-correct_3
not_correct_3:   .string "test_3: Fail\n"
                 .set t3incor_size, .-not_correct_3

.text


isAnagram:                  # a0 = s, a1 = t
    addi sp, sp, -104       # get sapce for store int letter_freq[26]

memset:                     # int letter_freq[26] = {0};
    sw x0, 0(sp)            # letter_freq[i] = 0;
    sw x0, 4(sp)
    sw x0, 8(sp)
    sw x0, 12(sp)
    sw x0, 16(sp)
    sw x0, 20(sp)
    sw x0, 24(sp)
    sw x0, 28(sp)
    sw x0, 32(sp)
    sw x0, 36(sp)
    sw x0, 40(sp)
    sw x0, 44(sp)
    sw x0, 48(sp)
    sw x0, 52(sp)
    sw x0, 56(sp)
    sw x0, 60(sp)
    sw x0, 64(sp)
    sw x0, 68(sp)
    sw x0, 72(sp)
    sw x0, 76(sp)
    sw x0, 80(sp)
    sw x0, 84(sp)
    sw x0, 88(sp)
    sw x0, 92(sp)
    sw x0, 96(sp)
    sw x0, 100(sp)

PRECOND1:
    lbu t5, 0(a0)          # t5 = s[0]
    beqz t5, PRECOND2    # if s[0] = 0, break the loop
    addi t2, a0, 1         # t2 = s++
LOOP2:                            # for(  ;s[i] ;i++ )
    addi t5, t5, -97       # s[i] - 'a'
    slli t5, t5, 0x2       # get offset from letter_freq[0] to letter_freq[s[i] - 'a']
    addi t5, t5, 112
    add t5, t5, sp         # t5 = address of letter_freq[s[i] - 'a']
    lw t3, -104(t5)           # t3 = letter_freq[s[i] - 'a']
    addi t3, t3, 1         # letter_freq[s[i] - 'a']++
    sw t3, -104(t5)           # store back
    addi t2, t2, 1         # s++
    lbu t5, -1(t2)         # t5 = s[i]
    bnez t5, LOOP2

PRECOND2:
    lbu t5, 0(a1)          # t5 = t[0]
    beqz t5, CHECK         # if t[0] = 0, break the loop
    addi t2, a1, 1         # t2 = t++
LOOP3:                      # for(  ;t[i] ;i++ )
    addi t5, t5, -97       # t[i] - 'a'
    slli t5, t5, 0x2       # get offset from letter_freq[0] to letter_freq[t[i] - 'a']
    addi t5, t5, 112
    add t5, t5, sp         # t5 = address of letter_freq[t[i] - 'a']
    lw t3, -104(t5)           # t3 = letter_freq[t[i] - 'a']
    addi t3, t3, -1        # letter_freq[t[i] - 'a']--
    sw t3, -104(t5)           # store back
    addi t2, t2, 1         # t++
    lbu t5, -1(t2)         # t5 = t[i]
    bnez t5, LOOP3

CHECK:
    addi t0, sp, 8 # t0 = address freq[0]
    addi t1, sp, 112        # 104 = 4 * 26 = the end of the letter_freq[]
LOOP4:                      # for (int i = 0; i < 26; i++)
    lw t3, 0(t0)            # t3 = freq[i];
    bne t3, x0, FALSE       # if freq[i] != 0 break
    addi t0, t0, 4          # freq++
    bne t0, t1, LOOP4

TRUE:
    addi a0, x0, 1          # if pass check return true
    j END_F
FALSE:
    addi a0, x0, 0          # if false return false

END_F:
    addi sp, sp, 104
    jr ra                   # return,  a0 = return value = true or false

_start: 
    la a0, test_1_s        # s(a0) = test_1_s  
    la a1, test_1_t        # t(a1) = test_1_t
    jal ra, isAnagram      # call isAnagram(s(a0), t(a1))

    la a0, test_2_s        # s(a0) = test_2_s
    la a1, test_2_t        # t(a1) = test_2_t
    jal ra, isAnagram	   # call isAnagram(s(a0), t(a1))

    la a0, test_3_s        # s(a0) = test_3_s
    la a1, test_3_t        # t(a1) = test_3_t
    jal ra, isAnagram	   # call isAnagram(s(a0), t(a1))

end:
    li a7, SYSEXIT      # "exit" syscall
    add a0, x0, 0       # Use 0 return code
    ecall               # invoke syscall to terminate the program

Inspirate by -O1 optimization level and kdvnt's homework, write down the code above.

Unroll the memset function, which is originally done by loop.
Reschedule the code to reduce branch number









101b8:	f9f78793          	addi	a5,a5,-97
101bc:	00279793          	slli	a5,a5,0x2
101c0:	07078793          	addi	a5,a5,112
101c4:	002787b3          	add	a5,a5,sp
101c8:	f987a703          	lw	a4,-104(a5)
101cc:	00170713          	addi	a4,a4,1
101d0:	f8e7ac23          	sw	a4,-104(a5)
101d4:	00150513          	addi	a0,a0,1
101d8:	fff54783          	lbu	a5,-1(a0)

Notice that in O1 optimization level, there is some redundant assembly code, and I modified it.

addi t5, t5, -97       # s[i] - 'a'
slli t5, t5, 0x2       # get offset from letter_freq[0] to letter_freq[s[i] - 'a']
add t5, t5, sp         # t5 = address of letter_freq[s[i] - 'a']
lw t3, 0(t5)           # t3 = letter_freq[s[i] - 'a']
addi t3, t3, 1         # letter_freq[s[i] - 'a']++
sw t3, 0(t5)           # store back
addi t2, t2, 1         # s++
lbu t5, -1(t2)         # t5 = s[i]

Result

CSR cycle count: 499

	O0	O1	O2	O3	Ofast	Os	My Version
CSR Cycle	1688	811	837	837	837	837	499

However, if I try some loopunrolling (motivated by kdvnt's homework), CSR cycle count doesn't change, such as

LOOP2:                            # for(  ;s[i] ;i++ )
    beqz t5, PRECOND2
    addi t5, t5, -97       # s[i] - 'a'
    slli t5, t5, 0x2       # get offset from letter_freq[0] to letter_freq[s[i] - 'a']
    add t5, t5, sp         # t5 = address of letter_freq[s[i] - 'a']
    lw t3, 0(t5)           # t3 = letter_freq[s[i] - 'a']
    addi t3, t3, 1         # letter_freq[s[i] - 'a']++
    sw t3, 0(t5)           # store back
    addi t2, t2, 1         # s++
    lbu t5, -1(t2)         # t5 = s[i]
    
    beqz t5, PRECOND2
    addi t5, t5, -97       # s[i] - 'a'
    slli t5, t5, 0x2       # get offset from letter_freq[0] to letter_freq[s[i] - 'a']
    add t5, t5, sp         # t5 = address of letter_freq[s[i] - 'a']
    lw t3, 0(t5)           # t3 = letter_freq[s[i] - 'a']
    addi t3, t3, 1         # letter_freq[s[i] - 'a']++
    sw t3, 0(t5)           # store back
    addi t2, t2, 1         # s++
    lbu t5, -1(t2)         # t5 = s[i]
    
    j LOOP2

But, if I do the same work in Ripes, the cycle will reduce as expected(similar result in kdvnt's work).

Notice that loop unrolling reduce branch penalty, but rv32emu doesn't implement pipeline. In rv32emu, instruction is execute line by line(with CSR counter plus 1), so there is no branch misprediction. So, I think only consider CSR cycle to determine the code efficiency is unfair.

I try to use clock() which is defined in time.h or gettimeofday() which is defined in /sys/time.h, to caculate actual time.

Rather put the clock() in rv32emu's main function, I just want to caculate the time consume about my algorithm(isAnagram in my case). But encounter some problem when calling system call.

Implement syscall clock_gettime

Rectification

The goal of rv32emu is to create a cycle-accurate[1] RISC-V instruction set emulator, shouldn't rely on (semi-)system call such as clock/clock_gettime. Directly use performance counter[2] to get the actual executed instruction, which are part of RISC-V instruction set.

[1] cycle-accurate
[2] Link1/Link2

syscall.c

#define SUPPORTED_SYSCALLS       \
    _(close,            57)      \
    _(lseek,            62)      \
    _(read,             63)      \
    _(write,            64)      \
    _(fstat,            80)      \
    _(exit,             93)      \
    _(gettimeofday,     169)     \
    _(brk,              214)     \
+    _(clock_gettime,    403)     \
    _(open,             1024)    \
    IIF(RV32_HAS(SDL))(          \
        _(draw_frame,   0xBEEF)  \
        _(setup_queue,  0xC0DE)  \
        _(submit_queue, 0xFEED), \
    )

static void syscall_clock_gettime(struct riscv_t *rv)
{
    state_t *s = rv_userdata(rv); /* access userdata */

    /* get the parameters */
    riscv_word_t tp = rv_get_reg(rv, rv_reg_a1);

    /* return the clock time */
    if (tp) {
#if defined(HAVE_POSIX_TIMER)
        struct timespec t;
        clock_gettime(CLOCK_MONOTONIC, &t);

        int32_t tv_sec = t.tv_sec;
        int32_t tv_msec = t.tv_nsec / 1000000;  // resolution (ms)

#elif defined(HAVE_MACH_TIMER)
        static mach_timebase_info_data_t info;
        /* If this is the first time we have run, get the timebase.
         * We can use denom == 0 to indicate that sTimebaseInfo is
         * uninitialized.
         */
        if (info.denom == 0)
            (void) mach_timebase_info(&info);
        /* Hope that the multiplication doesn't overflow. */
        uint64_t nsecs = mach_absolute_time() * info.numer / info.denom;
        int32_t tv_sec = nsecs / 1e9;
        int32_t tv_usec = (nsecs / 1e3) - (tv_sec * 1e6);
#else /* low resolution timer */
        clock_t t = clock();
        int32_t tv_sec = t / CLOCKS_PER_SEC;
        int32_t tv_usec = (t % CLOCKS_PER_SEC) * (1000000 / CLOCKS_PER_SEC);
#endif
        memory_write(s->mem, tp + 0, (const uint8_t *) &tv_sec, 4);
        memory_write(s->mem, tp + 8, (const uint8_t *) &tv_msec, 4);
    }

    /* success */
    rv_set_reg(rv, rv_reg_a0, 0);
}

More detail in Note

Motivated by 2022q1 Homework6 (kecho) which is done by Risheng1128, try to generate random string with string length 10000 to 100000, and use clock function which is implement above to test time.

The code above test two same string input, so the algorithm will go through whole string.

2022 Assignment2: RISC-V Toolchain

tags: Computer Architecture

Pick up one Problem

Original Solution by 曾晧峖

Disassemble the ELF files and Analysis

-O0(default)

Size

Disassembly code

Observation

Explanation

<main>

<isAnagram>

-O1

Size

Disassembly code

Observation

Explanation

<main>

<isAnagram>

-O2

Size

Disassembly code

Observation

Explanation

<main>

<isAnagram>

-O3

Size

Disassembly code

Observation

Explanation

-Ofast

Size

Disassembly code

Observation

Explanation

-Os

Size

Disassembly code

Observation

Explanation

<main>

<isAnagram>

Summary

Rewrite the Problem

Check the results and Optimization

Try to optimize

Implement syscall clock_gettime

Reference Link

Read more

面試紀錄 & 練習 (聯發科)

2023q1 Homework1 (lab0)

將現有的 Raspberry PI 系統備份

Run FreeRTOS on VexRiscv and Measure/Tweak context switch overhead

tags: `Computer Architecture`

`<main>`

`<isAnagram>`

`<main>`

`<isAnagram>`

`<main>`

`<isAnagram>`

`<main>`

`<isAnagram>`