contributed by <StevenChou499
>
Question Selection
Question
I choose question from 蘇勇達 Length of Last Word, which is LeetCode 58 .
Motivation
The reason I choose this question is because there are two loop using in assembly code, which is a great way for practicing some optimizing skills.
C code (Modified)
- Because the original c code uses standard string library and have some errors, I switched to use self-made
strlen
function and modifed the code. Also, I added more test cases for different result in answer.
Assembly code (Modified)
- Same as the C code, I added some modification in order to have all three test cases and same output as the C code and also obey the calling convention.
.org 0
.global _start
.set SYSEXIT, 93
.set SYSWRITE, 64
.data
str1: .string "Hello World"
.set str1len, .-str1
str2: .string "I am a student "
.set str2len, .-str2
str3: .string "a"
.set str3len, .-str3
fstr: .string "For string \""
.set flen, .-fstr
answrstr: .string "\", length of last word : "
.set answrlen, .-answrstr
newline: .string "\n"
.set nllen, .-newline
.text
_start:
la a0, str1
jal ra, lengthoflastword
add a2, x0, a0
la a0, str1
li a1, str1len
jal print_result
la a0, str2
jal ra, lengthoflastword
add a2, x0, a0
la a0, str2
li a1, str2len
jal print_result
la a0, str3
jal ra, lengthoflastword
add a2, x0, a0
la a0, str3
li a1, str3len
jal print_result
li a0, 0
li a7, SYSEXIT
ecall
nochar:
li a0, 0
ret
lengthoflastword:
lb t0, 0(a0)
beq t0, x0, nochar
addi a1, x0, 1
findend:
addi a0, a0, 1
addi a1, a1, 1
lb t0, 0(a0)
bne t0, x0, findend
li a2, 32
loop1:
beq a1, x0, return
addi a0, a0, -1
addi a1, a1, -1
lb t0, 0(a0)
beq t0, a2, loop1
add a3, x0, a0
add a0, x0, x0
loop2:
beq a1, x0, return
addi a1, a1, -1
addi a3, a3, -1
addi a0, a0, 1
lb t0, 0(a3)
bne t0, a2, loop2
return:
ret
print_result:
add s0, x0, a0
add s1, x0, a1
add s2, x0, a2
li a7, SYSWRITE
li a0, 1
la a1, fstr
la a2, flen
ecall
li a7, SYSWRITE
li a0, 1
add a1, x0, s0
add a2, x0, s1
ecall
li a7, SYSWRITE
li a0, 1
la a1, answrstr
li a2, answrlen
ecall
li a7, SYSWRITE
addi a1, s2, 48
addi sp, sp, -4
sw a1, 0(sp)
li a0, 1
addi a1, sp, 0
li a2, 4
ecall
addi sp, sp, 4
li a7, SYSWRITE
li a0, 1
la a1, newline
li a2, nllen
ecall
ret
Disassembly the .elf
files
- After compiling the C code to
lengthoflastwordO1.elf
, we can now disassembly the assembly code generated by the compiler. First we will take a look at the -O1
optimization result, also we will add some comment and some empty line for better understanding :
000100c4 <_start>:
...
100ec: 285d jal 101a2 <main>
100ee: b75d j 10094 <exit>
000100dc <_start>:
...
00010140 <lengthOfLastWord>:
10140: 00054783 lbu a5,0(a0)
10144: cfa9 beqz a5,1019e <lengthOfLastWord+0x5e>
10146: 4781 li a5,0
10148: 86be mv a3,a5
1014a: 0785 addi a5,a5,1
1014c: 00f50733 add a4,a0,a5
10150: 00074703 lbu a4,0(a4)
10154: fb75 bnez a4,10148 <lengthOfLastWord+0x8>
10156: 00d50733 add a4,a0,a3
1015a: 02000613 li a2,32
1015e: cb91 beqz a5,10172 <lengthOfLastWord+0x32>
10160: 00074683 lbu a3,0(a4)
10164: 00c69963 bne a3,a2,10176 <lengthOfLastWord+0x36>
10168: 177d addi a4,a4,-1
1016a: 17fd addi a5,a5,-1
1016c: fbf5 bnez a5,10160 <lengthOfLastWord+0x20>
1016e: 853e mv a0,a5
10170: 8082 ret
10172: 853e mv a0,a5
10174: 8082 ret
10176: 86ba mv a3,a4
10178: 4501 li a0,0
1017a: 02000613 li a2,32
1017e: 40e78733 sub a4,a5,a4
10182: cb99 beqz a5,10198 <lengthOfLastWord+0x58>
10184: 0006c783 lbu a5,0(a3)
10188: 00c78a63 beq a5,a2,1019c <lengthOfLastWord+0x5c>
1018c: 16fd addi a3,a3,-1
1018e: 0505 addi a0,a0,1
10190: 00e687b3 add a5,a3,a4
10194: fbe5 bnez a5,10184 <lengthOfLastWord+0x44>
10196: a029 j 101a0 <lengthOfLastWord+0x60>
10198: 853e mv a0,a5
1019a: 8082 ret
1019c: 8082 ret
1019e: 4501 li a0,0
101a0: 8082 ret
...
000101a2 <main>:
101a2: 1101 addi sp,sp,-32
101a4: ce06 sw ra,28(sp)
101a6: cc22 sw s0,24(sp)
101a8: ca26 sw s1,20(sp)
101aa: c84a sw s2,16(sp)
101ac: c64e sw s3,12(sp)
101ae: c452 sw s4,8(sp)
101b0: c256 sw s5,4(sp)
101b2: 6af1 lui s5,0x1c
101b4: 8f8a8513 addi a0,s5,-1800
101b8: 3761 jal 10140 <lengthOfLastWord>
101ba: 842a mv s0,a0
101bc: 6a71 lui s4,0x1c
101be: 904a0513 addi a0,s4,-1788
101c2: 3fbd jal 10140 <lengthOfLastWord>
101c4: 892a mv s2,a0
101c6: 69f1 lui s3,0x1c
101c8: 91898513 addi a0,s3,-1768
101cc: 3f95 jal 10140 <lengthOfLastWord>
101ce: 84aa mv s1,a0
101d0: 8622 mv a2,s0
101d2: 8f8a8593 addi a1,s5,-1800
101d6: 6471 lui s0,0x1c
101d8: 91c40513 addi a0,s0,-1764
101dc: 2255 jal 10380 <printf>
101de: 864a mv a2,s2
101e0: 904a0593 addi a1,s4,-1788
101e4: 91c40513 addi a0,s0,-1764
101e8: 2a61 jal 10380 <printf>
101ea: 8626 mv a2,s1
101ec: 91898593 addi a1,s3,-1768
101f0: 91c40513 addi a0,s0,-1764
101f4: 2271 jal 10380 <printf>
101f6: 4501 li a0,0
101f8: 40f2 lw ra,28(sp)
101fa: 4462 lw s0,24(sp)
101fc: 44d2 lw s1,20(sp)
101fe: 4942 lw s2,16(sp)
10200: 49b2 lw s3,12(sp)
10202: 4a22 lw s4,8(sp)
10204: 4a92 lw s5,4(sp)
10206: 6105 addi sp,sp,32
10208: 8082 ret
...
- Below is the basic control flow graph :
- Below is the disassembly size :
- Next I will disassemble my compiled hand-written assembly code :
...
00000000 <_start>:
0: 00000517 auipc a0,0x0
4: 15c50513 addi a0,a0,348 # 15c <str1>
8: 06c000ef jal ra,74 <lengthoflastword>
c: 00a00633 add a2,zero,a0
10: 00000517 auipc a0,0x0
14: 14c50513 addi a0,a0,332 # 15c <str1>
18: 00c00593 li a1,12
1c: 0b0000ef jal ra,cc <print_result>
20: 00000517 auipc a0,0x0
24: 14850513 addi a0,a0,328 # 168 <str2>
28: 04c000ef jal ra,74 <lengthoflastword>
2c: 00a00633 add a2,zero,a0
30: 00000517 auipc a0,0x0
34: 13850513 addi a0,a0,312 # 168 <str2>
38: 01100593 li a1,17
3c: 090000ef jal ra,cc <print_result>
40: 00000517 auipc a0,0x0
44: 13950513 addi a0,a0,313 # 179 <str3>
48: 02c000ef jal ra,74 <lengthoflastword>
4c: 00a00633 add a2,zero,a0
50: 00000517 auipc a0,0x0
54: 12950513 addi a0,a0,297 # 179 <str3>
58: 00200593 li a1,2
5c: 070000ef jal ra,cc <print_result>
60: 00000513 li a0,0
64: 05d00893 li a7,93
68: 00000073 ecall
0000006c <nochar>:
6c: 00000513 li a0,0
70: 00008067 ret
00000074 <lengthoflastword>:
74: 00050283 lb t0,0(a0)
78: fe028ae3 beqz t0,6c <nochar>
7c: 00100593 li a1,1
00000080 <findend>:
80: 00150513 addi a0,a0,1
84: 00158593 addi a1,a1,1
88: 00050283 lb t0,0(a0)
8c: fe029ae3 bnez t0,80 <findend>
90: 02000613 li a2,32
00000094 <loop1>:
94: 02058a63 beqz a1,c8 <return>
98: fff50513 addi a0,a0,-1
9c: fff58593 addi a1,a1,-1
a0: 00050283 lb t0,0(a0)
a4: fec288e3 beq t0,a2,94 <loop1>
a8: 00a006b3 add a3,zero,a0
ac: 00000533 add a0,zero,zero
000000b0 <loop2>:
b0: 00058c63 beqz a1,c8 <return>
b4: fff58593 addi a1,a1,-1
b8: fff68693 addi a3,a3,-1
bc: 00150513 addi a0,a0,1
c0: 00068283 lb t0,0(a3)
c4: fec296e3 bne t0,a2,b0 <loop2>
000000c8 <return>:
c8: 00008067 ret
000000cc <print_result>:
cc: 00a00433 add s0,zero,a0
d0: 00b004b3 add s1,zero,a1
d4: 00c00933 add s2,zero,a2
d8: 04000893 li a7,64
dc: 00100513 li a0,1
e0: 00000597 auipc a1,0x0
e4: 09b58593 addi a1,a1,155 # 17b <fstr>
e8: 00d00613 li a2,13
ec: 00000073 ecall
f0: 04000893 li a7,64
f4: 00100513 li a0,1
f8: 008005b3 add a1,zero,s0
fc: 00900633 add a2,zero,s1
100: 00000073 ecall
104: 04000893 li a7,64
108: 00100513 li a0,1
10c: 00000597 auipc a1,0x0
110: 07c58593 addi a1,a1,124 # 188 <answrstr>
114: 01a00613 li a2,26
118: 00000073 ecall
11c: 04000893 li a7,64
120: 03090593 addi a1,s2,48
124: ffc10113 addi sp,sp,-4
128: 00b12023 sw a1,0(sp)
12c: 00100513 li a0,1
130: 00010593 mv a1,sp
134: 00400613 li a2,4
138: 00000073 ecall
13c: 00410113 addi sp,sp,4
140: 04000893 li a7,64
144: 00100513 li a0,1
148: 00000597 auipc a1,0x0
14c: 05a58593 addi a1,a1,90 # 1a2 <newline>
150: 00200613 li a2,2
154: 00000073 ecall
158: 00008067 ret
0000015c <str1>:
15c: 6548 .2byte 0x6548
15e: 6c6c .2byte 0x6c6c
160: 6f57206f j 73054 <newline+0x72eb2>
164: 6c72 .2byte 0x6c72
166: 0064 .2byte 0x64
00000168 <str2>:
168: 2049 .2byte 0x2049
16a: 6d61 .2byte 0x6d61
16c: 6120 .2byte 0x6120
16e: 7320 .2byte 0x7320
170: 7574 .2byte 0x7574
172: 6564 .2byte 0x6564
174: 746e .2byte 0x746e
176: 2020 .2byte 0x2020
...
00000179 <str3>:
179: 0061 .2byte 0x61
0000017b <fstr>:
17b: 6f46 .2byte 0x6f46
17d: 2072 .2byte 0x2072
17f: 69727473 .4byte 0x69727473
183: 676e .2byte 0x676e
185: 2220 .2byte 0x2220
...
00000188 <answrstr>:
188: 2c22 .2byte 0x2c22
18a: 6c20 .2byte 0x6c20
18c: 6e65 .2byte 0x6e65
18e: 20687467 .4byte 0x20687467
192: 6c20666f jal a2,6854 <newline+0x66b2>
196: 7361 .2byte 0x7361
198: 2074 .2byte 0x2074
19a: 64726f77 .4byte 0x64726f77
19e: 3a20 .2byte 0x3a20
1a0: 0020 .2byte 0x20
000001a2 <newline>:
1a2: 000a .2byte 0xa
...
- Below is the disassembly size :
After disassembling both the compiled C code and hand-written assembly, I find out that compiler tends to use more branches for detection and avoid using t
registers. For example, when detect whether the end of the string has any space characters, the compiler will first check the last byte of the character. If the last character is space, then we will execute the next line, which is the function of eliminating all the spaces (uses another branch for detecting space character). If not, then we will just jump to another address to handle the rest function.
Also, I find out that the compiler uses different registers for space comparisons. Unlike my hand-written code which just uses register a2
to store the space character and uses it untill function return; in the execution of compiler code, it repeatly uses register a3
and a4
to store the space character and uses the other register to load the bytes for comparison.
Optimization level difference
Because calling printf()
inside the program will significantly increase the code size and also influence the execution speed. So in order to get a better comparisons with each optimization level, we will first comment the printf()
function.
After taking a look at the disassembly code. I found out that all the optimization level except -O0
level (which is no optimization) will just ignore the call of function lengthoflastword()
, so we ultimately have include the printf()
function in order to force the compiler to call the function.
-O0
level
First we will take a look at -O0
level, which is no optimization.
00010140 <lengthOfLastWord>:
10140: 7179 addi sp,sp,-48
10142: d622 sw s0,44(sp)
10144: 1800 addi s0,sp,48
10146: fca42e23 sw a0,-36(s0)
1014a: fdc42783 lw a5,-36(s0)
1014e: 0007c783 lbu a5,0(a5)
10152: e399 bnez a5,10158 <lengthOfLastWord+0x18>
10154: 4781 li a5,0
10156: a07d j 10204 <lengthOfLastWord+0xc4>
10158: fe042623 sw zero,-20(s0)
1015c: fdc42783 lw a5,-36(s0)
10160: fef42223 sw a5,-28(s0)
10164: a819 j 1017a <lengthOfLastWord+0x3a>
10166: fdc42783 lw a5,-36(s0)
1016a: 0785 addi a5,a5,1
1016c: fcf42e23 sw a5,-36(s0)
10170: fec42783 lw a5,-20(s0)
10174: 0785 addi a5,a5,1
10176: fef42623 sw a5,-20(s0)
1017a: fdc42783 lw a5,-36(s0)
1017e: 0007c783 lbu a5,0(a5)
10182: f3f5 bnez a5,10166 <lengthOfLastWord+0x26>
10184: fe442783 lw a5,-28(s0)
10188: fcf42e23 sw a5,-36(s0)
1018c: fe042423 sw zero,-24(s0)
10190: fec42783 lw a5,-20(s0)
10194: 17fd addi a5,a5,-1
10196: fdc42703 lw a4,-36(s0)
1019a: 97ba add a5,a5,a4
1019c: fcf42e23 sw a5,-36(s0)
101a0: a819 j 101b6 <lengthOfLastWord+0x76>
101a2: fdc42783 lw a5,-36(s0)
101a6: 17fd addi a5,a5,-1
101a8: fcf42e23 sw a5,-36(s0)
101ac: fec42783 lw a5,-20(s0)
101b0: 17fd addi a5,a5,-1
101b2: fef42623 sw a5,-20(s0)
101b6: fec42783 lw a5,-20(s0)
101ba: c785 beqz a5,101e2 <lengthOfLastWord+0xa2>
101bc: fdc42783 lw a5,-36(s0)
101c0: 0007c703 lbu a4,0(a5)
101c4: 02000793 li a5,32
101c8: fcf70de3 beq a4,a5,101a2 <lengthOfLastWord+0x62>
101cc: a819 j 101e2 <lengthOfLastWord+0xa2>
101ce: fdc42783 lw a5,-36(s0)
101d2: 17fd addi a5,a5,-1
101d4: fcf42e23 sw a5,-36(s0)
101d8: fe842783 lw a5,-24(s0)
101dc: 0785 addi a5,a5,1
101de: fef42423 sw a5,-24(s0)
101e2: fec42783 lw a5,-20(s0)
101e6: fff78713 addi a4,a5,-1
101ea: fee42623 sw a4,-20(s0)
101ee: cb89 beqz a5,10200 <lengthOfLastWord+0xc0>
101f0: fdc42783 lw a5,-36(s0)
101f4: 0007c703 lbu a4,0(a5)
101f8: 02000793 li a5,32
101fc: fcf719e3 bne a4,a5,101ce <lengthOfLastWord+0x8e>
10200: fe842783 lw a5,-24(s0)
10204: 853e mv a0,a5
10206: 5432 lw s0,44(sp)
10208: 6145 addi sp,sp,48
1020a: 8082 ret
We can see that after we eliminate the printf()
function, we significantly decrease our code size.
-O1
level
Next is -O1
level, which is first level optimization.
00010140 <lengthOfLastWord>:
10140: 00054783 lbu a5,0(a0)
10144: cfa9 beqz a5,1019e <lengthOfLastWord+0x5e>
10146: 4781 li a5,0
10148: 86be mv a3,a5
1014a: 0785 addi a5,a5,1
1014c: 00f50733 add a4,a0,a5
10150: 00074703 lbu a4,0(a4)
10154: fb75 bnez a4,10148 <lengthOfLastWord+0x8>
10156: 00d50733 add a4,a0,a3
1015a: 02000613 li a2,32
1015e: cb91 beqz a5,10172 <lengthOfLastWord+0x32>
10160: 00074683 lbu a3,0(a4)
10164: 00c69963 bne a3,a2,10176 <lengthOfLastWord+0x36>
10168: 177d addi a4,a4,-1
1016a: 17fd addi a5,a5,-1
1016c: fbf5 bnez a5,10160 <lengthOfLastWord+0x20>
1016e: 853e mv a0,a5
10170: 8082 ret
10172: 853e mv a0,a5
10174: 8082 ret
10176: 86ba mv a3,a4
10178: 4501 li a0,0
1017a: 02000613 li a2,32
1017e: 40e78733 sub a4,a5,a4
10182: cb99 beqz a5,10198 <lengthOfLastWord+0x58>
10184: 0006c783 lbu a5,0(a3)
10188: 00c78a63 beq a5,a2,1019c <lengthOfLastWord+0x5c>
1018c: 16fd addi a3,a3,-1
1018e: 0505 addi a0,a0,1
10190: 00e687b3 add a5,a3,a4
10194: fbe5 bnez a5,10184 <lengthOfLastWord+0x44>
10196: a029 j 101a0 <lengthOfLastWord+0x60>
10198: 853e mv a0,a5
1019a: 8082 ret
1019c: 8082 ret
1019e: 4501 li a0,0
101a0: 8082 ret
We can see that the text
size has reduced 100 bytes, but the other portion doesn't change.
The total cycle decrease almost 300 cycles, which is a large portion of execution cycle.
-O2
level
Next is -O2
level, which is second level optimization.
We can see that now the text
size has increases.
-O3
level
Next is -O3
level, which is the most aggresive optimization.
Now the text
size decrease almost 150 bytes.
The total cycle in more than -O2
optimization, means the optimization doesn't lead to faster execution.
-Os
level
-Os
optimization focus on minimizing the code size.
-Ofast
level
-Ofast
optimization focus on get the fastest execution time.
Next we can see our execution speed :
Hand-Written assembly
- Finally is our hand-written assembly :
Next we can see our execution speed :
- Below is the total comparison (exclude hand-written code) :
|
-O0 |
-O1 |
-O2 |
-O3 |
-Os |
-Ofast |
Total size |
54352 |
54222 |
54222 |
54288 |
54204 |
54288 |
Register usage |
4 |
5 |
6 |
6 |
6 |
6 |
Branching Instr. |
6 |
8 |
6 |
6 |
5 |
6 |
Jumping Instr. |
4 |
1 |
2 |
2 |
3 |
2 |
Total cycle |
6898 |
6601 |
6615 |
6615 |
6581 |
6615 |
From the above table, we can know that the best execution speed is -Os
optimization, which uses 300+ cycles less than -O0
optimization, and it also has the fastest execution speed.
To be clear, I found out that the lengthoflastword()
portion of -O2
, -O3
and -Ofast
optimization disassembly is the same. So they will have the simillar or even same testing result.
strlen()
in -Os
optimization
When taking a look at all the diassembly code, we can find out that only the -Os
optimization have called a another function strlen()
inside the lengthoflastword()
function.
- Below is the
strlen()
disassembly code :
000101a4 <lengthOfLastWord>:
...
101b2: 22e5 jal 1039a <strlen>
...
101f8: 8082 ret
...
0001039a <strlen>:
1039a: 00357793 andi a5,a0,3
1039e: 872a mv a4,a0
103a0: ef9d bnez a5,103de <strlen+0x44>
103a2: 7f7f86b7 lui a3,0x7f7f8
103a6: f7f68693 addi a3,a3,-129
103aa: 55fd li a1,-1
103ac: 4310 lw a2,0(a4)
103ae: 0711 addi a4,a4,4
103b0: 00d677b3 and a5,a2,a3
103b4: 97b6 add a5,a5,a3
103b6: 8fd1 or a5,a5,a2
103b8: 8fd5 or a5,a5,a3
103ba: feb789e3 beq a5,a1,103ac <strlen+0x12>
103be: ffc74683 lbu a3,-4(a4)
103c2: 40a707b3 sub a5,a4,a0
103c6: ca8d beqz a3,103f8 <strlen+0x5e>
103c8: ffd74683 lbu a3,-3(a4)
103cc: c29d beqz a3,103f2 <strlen+0x58>
103ce: ffe74503 lbu a0,-2(a4)
103d2: 00a03533 snez a0,a0
103d6: 953e add a0,a0,a5
103d8: 1579 addi a0,a0,-2
103da: 8082 ret
103dc: d2f9 beqz a3,103a2 <strlen+0x8>
103de: 00074783 lbu a5,0(a4)
103e2: 0705 addi a4,a4,1
103e4: 00377693 andi a3,a4,3
103e8: fbf5 bnez a5,103dc <strlen+0x42>
103ea: 8f09 sub a4,a4,a0
103ec: fff70513 addi a0,a4,-1
103f0: 8082 ret
103f2: ffd78513 addi a0,a5,-3
103f6: 8082 ret
103f8: ffc78513 addi a0,a5,-4
103fc: 8082 ret
...
After observing the full function, I find out that the strlen()
uses SWAR technique to speed up to execution when counting the number of characters.
SWAR
- Below is the SWAR technique of the
strlen()
function :
In the above disassembly, there are mainly two consequences. First is we found the loaded 4 bytes doesn't contain any NULL (0) bytes :
- Load aligned 4 bytes into
a4
register ( take I
,
, a
, m
as 4 bytes for example ) :
Now the a2
register value is 0x6d612049
.
- Next we mask the value of
a2
with a3
, which is 0x7F7F7F7F
:
When we mask with value 0x7F7F7F7F
, because all the 7th bit is 0, we can delete all the top bit in every bytes. Now the value of register a5
is still 0x6d612049
.
- Add the register
a3
into register a5
:
No we can add 0x7F7F7F7F
to 0x6d612049
and we can get 0xECE09FC8
.
- Or the register
a5
with a2
and a3
:
(a5)0xECE09FC8 | (a2)0x6d612049 = (a5)0xEDE1BFC9
(a5)0xEDE1BFC9 | (a3)0x7F7F7F7F = (a5)0xFFFFFFFF
This will result in all the bit are set (which is all 1s) , now we can check if the register a5
is -1 using beq a5, a1, 103ac
instruction.
The other consequence is there is at least one NULL byte inside the word :
- Load aligned 4 bytes into
a4
register ( take e
, n
, t
, \0
as 4 bytes for example ) :
Now the a2
register value is 0x00746E65
.
- Next we mask the value of
a2
with a3
, which is 0x7F7F7F7F
:
When we mask with value 0x7F7F7F7F
, because all the 7th bit is 0, we can delete all the top bit in every bytes. Now the value of register a5
is still 0x00746E65
.
- Add the register
a3
into register a5
:
No we can add 0x7F7F7F7F
to 0x00746E65
and we can get 0x7FF3EDE4
. Now here is the different part from first one, because there is at least one NULL byte in the word, when adding with 0x7F
, the result will be 0x7F
, which the 7th bit will still be 0
. And the other none NULL byte will have a carry bit on the 7th bit (which the top nibble will be 8
~ F
).
- Or the register
a5
with a2
and a3
:
(a5)0x7FF3EDE4 | (a2)0x00746E65 = (a5)0x7FF7EFE5
(a5)0x7FF7EFE5 | (a3)0x7F7F7F7F = (a5)0x7FFFFFFF
Now when we or with 0x7F7F7F7F
, because the value of register a5
at least have a byte's top bit isn't 1
, which will make the result unequal to -1
(0xFFFFFFFF).
With the help of SWAR, we can speed up the time of finding the length of string by 4.
Optimizing the Hand-Written Assembly
In order to see the optimized result, I changed all the input into a same strings.
- Below is the version 0 assembly code :
.org 0
.global _start
.set SYSEXIT, 93
.set SYSWRITE, 64
.data
str1: .string " aaaa "
.set str1len, .-str1
str2: .string "aaaaaaaa aaaaaaaa"
.set str2len, .-str2
str3: .string "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa a "
.set str3len, .-str3
fstr: .string "For string \""
.set flen, .-fstr
answrstr: .string "\", length of last word : "
.set answrlen, .-answrstr
newline: .string "\n"
.set nllen, .-newline
.text
_start:
la a0, str1
jal ra, lengthoflastword
add a2, x0, a0
la a0, str1
li a1, str1len
jal print_result
la a0, str2
jal ra, lengthoflastword
add a2, x0, a0
la a0, str2
li a1, str2len
jal print_result
la a0, str3
jal ra, lengthoflastword
add a2, x0, a0
la a0, str3
li a1, str3len
jal print_result
li a0, 0
li a7, SYSEXIT
ecall
nochar:
li a0, 0
ret
lengthoflastword:
lb t0, 0(a0)
beq t0, x0, nochar
addi a1, x0, 1
findend:
addi a0, a0, 1
addi a1, a1, 1
lb t0, 0(a0)
bne t0, x0, findend
li a2, 32
loop1:
beq a1, x0, return
addi a0, a0, -1
addi a1, a1, -1
lb t0, 0(a0)
beq t0, a2, loop1
add a3, x0, a0
add a0, x0, x0
loop2:
beq a1, x0, return
addi a1, a1, -1
addi a3, a3, -1
addi a0, a0, 1
lb t0, 0(a3)
bne t0, a2, loop2
return:
ret
...
- Code size and execution result :
Implementing SWAR in assembly
When dealing with very large strings, we can use SWAR technique to speed up to process of traversing to the end of string.
- Below is the Assembly using SWAR :
.org 0
.global _start
.set SYSEXIT, 93
.set SYSWRITE, 64
.data
str1: .string " aaaa "
.set str1len, .-str1
str2: .string "aaaaaaaa aaaaaaaa"
.set str2len, .-str2
str3: .string "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa a "
.set str3len, .-str3
fstr: .string "For string \""
.set flen, .-fstr
answrstr: .string "\", length of last word : "
.set answrlen, .-answrstr
newline: .string "\n"
.set nllen, .-newline
.text
_start:
la a0, str1
jal ra, lengthoflastword
add a2, x0, a0
la a0, str1
li a1, str1len
jal print_result
la a0, str2
jal ra, lengthoflastword
add a2, x0, a0
la a0, str2
li a1, str2len
jal print_result
la a0, str3
jal ra, lengthoflastword
add a2, x0, a0
la a0, str3
li a1, str3len
jal print_result
li a0, 0
li a7, SYSEXIT
ecall
nochar:
li a0, 0
ret
lengthoflastword:
lb t0, 0(a0)
beq t0, x0, nochar
andi a2, a0, 3
add a1, x0, a0
bne a2, x0, unaligned
starttraverse:
lui a3, 0x7F7F8
addi a3, a3, -129
addi a4, x0, -1
findend:
lw a2, 0(a1)
addi a1, a1, 4
and a5, a2, a3
add a5, a5, a3
or a5, a5, a3
beq a5, a4, findend
lbu a2, -4(a1)
sub a3, a1, a0
beq a2, x0, sub4
lbu a2, -3(a1)
beq a2, x0, sub3
lbu a2, -2(a1)
snez a2, a2
add a2, a2, a3
addi a1, a2, -2
j findword
checkalign:
beq a3, x0, starttraverse
unaligned:
lbu a2, 0(a1)
addi a1, a1, 1
andi a3, a1, 3
bne a2, x0, checkalign
sub a1, a1, a0
addi a1, a1, -1
j findword
sub4:
addi a1, a3, -4
j findword
sub3:
addi a1, a3, -3
j findword
findword:
li a2, 32
add a0, a1, a0
addi a1, a1, 1
loop1:
beq a1, x0, return
addi a0, a0, -1
addi a1, a1, -1
lb t0, 0(a0)
beq t0, a2, loop1
add a3, x0, a0
add a0, x0, x0
loop2:
beq a1, x0, return
addi a1, a1, -1
addi a3, a3, -1
addi a0, a0, 1
lb t0, 0(a3)
bne t0, a2, loop2
return:
ret
...
- Code size and execution result :
By observing the code size and CSR cycle, we can see that although the code size has increases 124
bytes, We decrease the CSR cycle by 137
cycles, which is a significant speed up.
Using more than one register for SWAR
To further speed up the execution process, we can use two registers at the same time to detect NULL byte.
- Below is the modified Assembly :
...
nochar:
li a0, 0
ret
lengthoflastword:
lb t0, 0(a0)
beq t0, x0, nochar
a2, a0, 3
add a1, x0, a0
bne a2, x0, unaligned # jump if the address is unaligned
starttraverse:
lui a3, 0x7F7F8
addi a3, a3, -129
addi a4, x0, -1
findend:
lw t0, 0(a1)
lw t1, 4(a1)
a2, t0, t1
addi a1, a1, 8
a5, a2, a3
add a5, a5, a3
or a5, a5, a3
beq a5, a4, findend
a5, t0, a3
add a5, a5, a3
or a5, a5, a3
bne a5, a4, back4
findbyte:
lbu a2, -4(a1)
sub a3, a1, a0
beq a2, x0, sub4
lbu a2, -3(a1)
beq a2, x0, sub3
lbu a2, -2(a1)
snez a2, a2
add a2, a2, a3
addi a1, a2, -2
j findword
back4:
addi a1, a1, -4
j findbyte
checkalign:
beq a3, x0, starttraverse
unaligned:
lbu a2, 0(a1)
addi a1, a1, 1
a3, a1, 3
bne a2, x0, checkalign
sub a1, a1, a0
addi a1, a1, -1
j findword
sub4:
addi a1, a3, -4
j findword
sub3:
addi a1, a3, -3
j findword
findword:
li a2, 32
add a0, a1, a0
addi a1, a1, 1
loop1:
beq a1, x0, return
addi a0, a0, -1
addi a1, a1, -1
lb t0, 0(a0)
beq t0, a2, loop1
add a3, x0, a0
add a0, x0, x0
loop2:
beq a1, x0, return
addi a1, a1, -1
addi a3, a3, -1
addi a0, a0, 1
lb t0, 0(a3)
bne t0, a2, loop2
return:
ret
By loading two words at the same time, we can use and
to combine the result. If there are no NULL
bytes at all, then the and
result will still be -1
. But when there are a NULL
byte inside, then we need to check which word has the NULL
byte inside. By utilizing this method, we can speed up the execution a little bit.
- Below is the code size and execution cycles :
We can see that we increase the code size by 32 bytes, but decrease the CSR cycle of 24 cycles.
Also we can make use of 4 registers at one time.
- Below is the modified Assembly :
...
nochar:
li a0, 0
ret
lengthoflastword:
lb t0, 0(a0)
beq t0, x0, nochar
a2, a0, 3
add a1, x0, a0
bne a2, x0, unaligned # jump if the address is unaligned
starttraverse:
lui a3, 0x7F7F8
addi a3, a3, -129
addi a4, x0, -1
findend:
lw t0, 0(a1)
lw t1, 4(a1)
lw t2, 8(a1)
lw t3, 12(a1)
a2, t0, t1
t4, t2, t3
a2, a2, t4
addi a1, a1, 16
a5, a2, a3
add a5, a5, a3
or a5, a5, a3
beq a5, a4, findend
a5, t0, a3
add a5, a5, a3
or a5, a5, a3
bne a5, a4, back12
a5, t1, a3
add a5, a5, a3
or a5, a5, a3
bne a5, a4, back8
a5, t2, a3
add a5, a5, a3
or a5, a5, a3
bne a5, a4, back4
findbyte:
lbu a2, -4(a1)
sub a3, a1, a0
beq a2, x0, sub4
lbu a2, -3(a1)
beq a2, x0, sub3
lbu a2, -2(a1)
snez a2, a2
add a2, a2, a3
addi a1, a2, -2
j findword
back4:
addi a1, a1, -4
j findbyte
back8:
addi a1, a1, -8
j findbyte
back12:
addi a1, a1, -12
j findbyte
checkalign:
beq a3, x0, starttraverse
unaligned:
lbu a2, 0(a1)
addi a1, a1, 1
a3, a1, 3
bne a2, x0, checkalign
sub a1, a1, a0
addi a1, a1, -1
j findword
sub4:
addi a1, a3, -4
j findword
sub3:
addi a1, a3, -3
j findword
findword:
li a2, 32
add a0, a1, a0
addi a1, a1, 1
loop1:
beq a1, x0, return
addi a0, a0, -1
addi a1, a1, -1
lb t0, 0(a0)
beq t0, a2, loop1
add a3, x0, a0
add a0, x0, x0
loop2:
beq a1, x0, return
addi a1, a1, -1
addi a3, a3, -1
addi a0, a0, 1
lb t0, 0(a3)
bne t0, a2, loop2
return:
ret
...
- Below is the code size and execution cycles :
We can see that the total performance didn't increase very much and will increase code size.
Finally, to further compare the different execution speed, I change the size of string and deleted the print result portion of code to see the execution cycle with respect to number of characters.
- Below is the comparison chart :

If we take a closer look at the chart, we can see that No optimization works fastest when the length of character is short enough.

Beating gcc compiler
In order to compare the execution speed in real life, I copy the code into Ripes simulator to put hazards into consideration.
.data
str1: .string "Hello World"
str2: .string "I am a student "
str3: .string "a"
space: .string " "
fstr: .string "For string \""
answrstr: .string "\", length of last word : "
newline: .string "\n"
.text
main:
la a0, str1
jal lengthOfLastWord
li a7, 1
ecall
la a0, str2
jal lengthOfLastWord
li a7, 1
ecall
la a0, str3
jal lengthOfLastWord
li a7, 1
ecall
li a7, 10
ecall
lengthOfLastWord:
lbu a5, 0(a0)
beqz a5, L5
addi sp, sp, -16
sw s0, 8(sp)
sw ra, 12(sp)
mv s0, a0
jal strlen
addi a5, a0, -1
add a5, a5, s0
li a4, 32
L0:
beqz a0, L2
lbu a3, 0(a5)
addi a2, a0, -1
beq a3, a4, L3
li a4, 0
li a3, 32
L1:
beq a0, a4, L2
sub a2, a5, a4
lbu a2, 0(a2)
bne a2, a3, L4
mv a0, a4
L2:
lw ra, 12(sp)
lw s0, 8(sp)
addi sp, sp, 16
ret
L3:
addi a5, a5, -1
mv a0, a2
j L0
L4:
addi a4,a4,1
j L1
L5:
li a0, 0
ret
strlen:
a5, a0, 3
mv a4, a0
bnez a5, K3
K0:
lui a3, 0x7f7f8
addi a3, a3, -129 # 7f7f7f7f <__BSS_END__+0x7f7da1ff>
li a1, -1
K1:
lw a2, 0(a4)
addi a4, a4, 4
a5, a2, a3
add a5, a5, a3
or a5, a5, a2
or a5, a5, a3
beq a5, a1, K1
lbu a3, -4(a4)
sub a5, a4, a0
beqz a3, K5
lbu a3, -3(a4)
beqz a3, K4
lbu a0, -2(a4)
snez a0, a0
add a0, a0, a5
addi a0, a0, -2
ret
K2:
beqz a3, K0
K3:
lbu a5, 0(a4)
addi a4, a4, 1
a3, a4, 3
bnez a5, K2
sub a4, a4, a0
addi a0, a4, -1
ret
K4:
addi a0, a5, -3
ret
K5:
addi a0, a5, -4
ret
- Here is the execution result :

the CPI is 1.49 (lower is better).
Hand-Written code
- Next is the hand-written code :
.data
str1: .string "Hello World"
str2: .string "I am a student "
str3: .string "a"
space: .string " "
fstr: .string "For string \""
answrstr: .string "\", length of last word : "
newline: .string "\n"
.text
main:
la a0, str1
jal ra, lengthoflastword
li a7, 1
ecall
la a0, str2
jal ra, lengthoflastword
li a7, 1
ecall
la a0, str3
jal ra, lengthoflastword
li a7, 1
ecall
li a7, 10
ecall
nochar:
li a0, 0
ret
lengthoflastword:
lb t0, 0(a0)
beq t0, x0, nochar
andi a2, a0, 3
add a1, x0, a0
bne a2, x0, unaligned
starttraverse:
lui a3, 0x7F7F8
addi a3, a3, -129
addi a4, x0, -1
findend:
lw a2, 0(a1)
addi a1, a1, 4
and a5, a2, a3
add a5, a5, a3
or a5, a5, a3
beq a5, a4, findend
lbu a2, -4(a1)
sub a3, a1, a0
beq a2, x0, sub4
lbu a2, -3(a1)
beq a2, x0, sub3
lbu a2, -2(a1)
snez a2, a2
add a2, a2, a3
addi a1, a2, -2
j findword
checkalign:
beq a3, x0, starttraverse
unaligned:
lbu a2, 0(a1)
addi a1, a1, 1
andi a3, a1, 3
bne a2, x0, checkalign
sub a1, a1, a0
addi a1, a1, -1
j findword
sub4:
addi a1, a3, -4
j findword
sub3:
addi a1, a3, -3
j findword
findword:
li a2, 32
add a0, a1, a0
addi a1, a1, 1
loop1:
beq a1, x0, return
addi a0, a0, -1
addi a1, a1, -1
lb t0, 0(a0)
beq t0, a2, loop1
add a3, x0, a0
add a0, x0, x0
loop2:
beq a1, x0, return
addi a1, a1, -1
addi a3, a3, -1
addi a0, a0, 1
lb t0, 0(a3)
bne t0, a2, loop2
return:
ret
- Here is the execution result :

The CPI is 1.42, which is better than gcc compiler(1.49).
References :
Several GCC compilation options
Introduction of SWAR