## Assignment2: RISC-V Toolchain
contributed by < [yuchen0620](https://github.com/yuchen0620) >
I select the project [Implement palindrome detection and using CLZ](https://hackmd.io/@DpuBXDwLSDCIE6cNJSjUlg/HyQ2i4nep) by 陳浩文.
## Motivation
When I used to write leetcode, the problem about palindorme take me a lot of time to solve. Hence I want to improve my skill about palindorme! Besides, I think the original code which is written by 陳浩文 can be more efficient.
:::warning
Improve your English writing!
:notes: jserv
:::
## Optimize C code
The original one use for loop to implement reverse tempX and it will run 64 times, I refer to the [Bits reversal using clz](https://hackmd.io/@normal/bit-reversal-clz) by 彭煜博, changing the for loop into while loop, because the tempX has 32bits at most, thus the while loop will run 32 times at most, hence I expect that it will be 2 times faster in reversing tempX.
Besides, I also change the **palindrome_detected** function into one argument function, don't need to pass the clz, we only need to call **count_leading_zero** function in the **palindrome_detected** function.
Last, I make the return statement more concise.
### Original code
```c
bool palindrome_detected(uint64_t x, int clz ){
/* tempX = left half of input x (use to reverse and check) */
uint64_t nob = (64 - clz);
uint64_t checkEven = nob % 2;
uint64_t tempX = (x >> (nob / 2));
tempX = (tempX >> checkEven);
/* tempY = right half of input x */
uint64_t leftShiftNum = (nob / 2) + checkEven + (64 - nob);
uint64_t tempY = (x << leftShiftNum);
tempY = (tempY >> leftShiftNum);
/* reverse tempX */
uint64_t revTempX = 0x0;
uint64_t maskTempX = tempX;
for (int i = 0; i < 64; i++)
{
revTempX = (revTempX << 1);
revTempX |= maskTempX & 0x1;
maskTempX = (maskTempX >> 1);
}
revTempX = (revTempX >> leftShiftNum);
/* check revTempX nd tempY same or not */
if (revTempX == tempY) {
return true;
} else {
return false;
}
}
```
### Optimize code
```c
bool palindrome_detected(uint64_t x){
/* tempX = left half of input x (use to reverse and check) */
uint16_t clz = count_leading_zeros(x);
uint64_t nob = (64 - clz);
uint64_t checkEven = nob & 1;
uint64_t tempX = (x >> (nob >> 1));
tempX = (tempX >> checkEven);
/* tempY = right half of input x */
uint64_t leftShiftNum = (nob >> 1) + checkEven + clz;
uint64_t tempY = (x << leftShiftNum);
tempY = (tempY >> leftShiftNum);
/* reverse tempX */
uint64_t revTempX = 0x0;
while (tempX >>= 1)
{
revTempX = ((revTempX << 1) | (tempX & 1));
}
return revTempX == tempY;
}
```
## Handwritten assembly
```c
.data
test: .word 0x00000000,0x00000000,0x00000000,0x00000001,0x00000C00,0x00000003,0xF0000000,0x000000F0
.text
main:
la s1, test
lw a0, 0(s1)
lw a1, 4(s1)
jal ra, palindrome_detected
li a7, 1
ecall
lw a0, 8(s1)
lw a1, 12(s1)
jal ra, palindrome_detected
li a7, 1
ecall
lw a0, 16(s1)
lw a1, 20(s1)
jal ra, palindrome_detected
li a7, 1
ecall
lw a0, 24(s1)
lw a1, 28(s1)
jal ra, palindrome_detected
li a7, 1
ecall
# Exit program
li a7, 10
ecall
palindrome_detected:
addi sp, sp, -4
sw ra, 0(sp)
jal ra count_leading_zeros
mv t0, a0
mv t1, a1
li t4, 64
sub a2, t4, s0 # a2 = nob = 64 - clz;
andi t5, a2, 1 # t5 = checkEven
srli t4, a2, 1 # t4 = nob >> 1
srl t3, t1, t4 # x >> (nob >> 1) (right-half)
li t6 32
sub t2, t6 t4 # (32 - nob>>1)
sll t2, t0, t2 # x >> (nob >> 1) (left-half)
or t3, t3, t2 # tempX = x >> (nob >> 1)
srl t3, t3 , t5 # t3 = tempX
add t4, t4, t5 # leftShiftNum = (nob>>1) + checkEven
add t4, t4, s0 # leftShiftNum += clz
addi t4, t4, -32
addi t5, t1, 0
beq t4,t6, leftShiftNum_equal_32
sll t2, t1 ,t4 # tempY = (x << leftShiftNum) (left-half);
srl t5, t2, t4 # tempY = (tempY >> leftShiftNum);
leftShiftNum_equal_32:
# t2 = reversed number
li t2, 0
li t0, 31
reverse_loop:
srli t3, t3, 1 # tempX >>= 1
beqz t3, return
andi t6, t3, 1
slli t2, t2, 1
or t2, t2, t6
j reverse_loop
return:
beq t2,t5, same # revTempX == tempY
li a0 0
lw ra, 0(sp)
addi sp sp 4
jr ra
same:
li a0 1
lw ra, 0(sp)
addi sp sp 4
jr ra
count_leading_zeros:
mv t1, a1 # t1 = low 32bits
mv t0, a0 # t0 = high 32bits
# x |= (x >> 1);
slli t2, t0, 31 # high 32bits shift left 31
srli t3, t1, 1 # low 32bits shift right 1
or t3, t2, t3
srli t2, t0, 1 # high 32bits shift right 1
or t0, t0, t2
or t1, t1, t3
# x |= (x >> 2);
slli t2, t0, 30
srli t3, t1, 2
or t3, t2, t3
srli t2, t0, 2
or t0, t0, t2
or t1, t1, t3
# x |= (x >> 4);
slli t2, t0, 28
srli t3, t1, 4
or t3, t2, t3
srli t2, t0, 4
or t0, t0, t2
or t1, t1, t3
# x |= (x >> 8);
slli t2, t0, 24
srli t3, t1, 8
or t3, t2, t3
srli t2, t0, 8
or t0, t0, t2
or t1, t1, t3
# x |= (x >> 16);
slli t2, t0, 16
srli t3, t1, 16
or t3, t2, t3
srli t2, t0, 16
or t0, t0, t2
or t1, t1, t3
# x |= (x >> 32);
or t1, t0, t1
# !!!count ones!!!
# x -= (x>>1) & 0x5555555555555555
slli t2, t0, 31 # x>>1
srli t3, t1, 1
or t3, t2, t3
srli t2, t0, 1
li t4, 0x55555555 # & 0x5555555555555555
and t2, t2, t4
and t3, t3, t4
sltu t5, t1, t3 # t5 = borrow bit
sub t0, t0, t2
sub t0, t0, t5
sub t1, t1, t3
# x = ((x >> 2) & 0x3333333333333333) + (x & 0x3333333333333333)
slli t2, t0, 30 # x>>2
srli t3, t1, 2
or t3, t2, t3
srli t2, t0, 2
li t4, 0x33333333 # & 0x333333333333333
and t2, t2, t4
and t3, t3, t4
and t0, t0, t4 # x & 0x3333333333333333
and t1, t1, t4
add t1, t1, t3 # add into x
sltu t5, t1, t3 # t5 = carry bit
add t0, t0, t2
add t0, t0, t5
# x = ((x >> 4) + x) & 0x0f0f0f0f0f0f0f0f
slli t2, t0, 28 # (x>>4) + x
srli t3, t1, 4
or t3, t2, t3
srli t2, t0, 4
add t3, t3, t1
sltu t5, t3, t1
add t2, t2, t0
add t2, t2, t5
li t4, 0x0f0f0f0f # & 0x0f0f0f0f0f0f0f0f
and t0, t2, t4
and t1, t3, t4
# x += (x >> 8)
slli t2, t0, 24
srli t3, t1, 8
or t3, t2, t3
srli t2, t0, 8
add t1, t1, t3
sltu t5, t1, t3 # t5 = carry bit
add t0, t0, t2
add t0, t0, t5
# x += (x >> 16)
slli t2, t0, 16
srli t3, t1, 16
or t3, t2, t3
srli t2, t0, 16
add t1, t1, t3
sltu t5, t1, t3 # t5 = carry bit
add t0, t0, t2
add t0, t0, t5
# x += (x >> 32)
mv t2, t0
add t1, t1, t2
sltu t5, t1, t2
add t0, t0, t5
# return (64 - (x & 0x7f))
andi t1, t1, 0x7f
li t2, 64
sub s0, t2, t1 # store clz result in s0
ret
```
## uint64_t << 64 problem
In the test case **0x0000000000000000** and **0x0000000000000001**, the leftShiftNum = 64,
```c
uint64_t tempY = (x << leftShiftNum);
tempY = (tempY >> leftShiftNum);
```
Thus, we will do the **x << 64** and **tempY >> 64** instructions.
In the test case **0x0000000000000000**,
tempY = 0 << 64 = 0
tempY = 0 >> 64 = 0
tempY = 0, hence we can compare revTempX and tempY to check palindrome.
```
tempX = 0
leftShiftNum = 64
revTempX = 0, tempY = 0
result = 1
```
In the test case **0x0000000000000001**,
tempY = 1 << 64 = 1
tempY = 1 >> 64 = 1
tempY = 1, the tempY is still 1, hence we can also compare the revTempX and tempY to check palindrome.
```
tempX = 0
leftShiftNum = 64
revTempX = 0, tempY = 1
result = 0
```
It's because when we use **<<** and **>>** in C, it will modulo the bit num you want to shift by the bit num of variable.
i.e. **uint64_t 1 << 64 (mod 64) = 1 << 0 = 1**
But, 1 << 32 (32bit register) will be 0 in assembly code, this will make revTempX and tempY both 0, hence the program will return 0x0000000000000001 is palindrome.
To slove this problem, I add a beq in assembly code to check if the leftShiftNum is 32 or not. If it was 32, skip the instructions sll and srl, just compare 2 register [63:32],[31:0] directly.
**t1 = x, t4 = leftshiftNum, t6 = 32**
```
beq t4,t6, leftShiftNum_equal_32
sll t2, t1 ,t4 # tempY = (x << leftShiftNum) (left-half);
srl t5, t2, t4 # tempY = (tempY >> leftShiftNum);
leftShiftNum_equal_32:
```
## Performance comparison
I compare the original C file by 陳浩文 and my revised C file with [ticks.c](https://github.com/sysprog21/rv32emu/blob/master/tests/ticks.c), In addition to, there are four test cases which are the same as the original code.
```c
uint64_t testA = 0x0000000000000000; //0 is palindrome
uint64_t testB = 0x0000000000000001; //testB not palindrome
uint64_t testC = 0x00000C0000000003; //testC is palindrome
uint64_t testD = 0x0F000000000000F0; //testD not palindrome
```
**Compile**
```
$ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O0 palindrome_ori.c -o palindrome_ori_O0.elf
$ riscv-none-elf-gcc -march=rv32i -mabi=ilp32 -O0 palindrome_opt.c -o palindrome_opt_O0.elf
```
**Run ELF file**
Reduce 2/3 amount of cycles in those four test cases.
```
$ build/rv32emu palindrome_ori_O0.elf
elapsed cycle: 13148
inferior exit code 0
$ build/rv32emu palindrome_opt_O0.elf
elapsed cycle: 4356
inferior exit code 0
```
**Size**
Sizes are almost the same.
```
$ riscv-none-elf-size palindrome_ori_O0.elf
text data bss dec hex filename
77800 2320 1528 81648 13ef0 palindrome_ori_O0.elf
$ riscv-none-elf-size palindrome_opt_O0.elf
text data bss dec hex filename
77712 2320 1528 81560 13e98 palindrome_opt_O0.elf
```
## Optimized by riscv-none-elf-gcc
I will compare my revised version with O0, O1, O2, O3, Ofast, Os.
The main function of my c code.
```c
int main(){
ticks t0 = getticks();
uint64_t testA = 0x0000000000000000;
uint64_t testB = 0x0000000000000001;
uint64_t testC = 0x00000C0000000003;
uint64_t testD = 0x0F000000000000F0;
palindrome_detected(testA);
palindrome_detected(testB);
palindrome_detected(testC);
palindrome_detected(testD);
ticks t1 = getticks();
printf("elapsed cycle: %" PRIu64 "\n", t1 - t0);
return 0;
}
```
| | O0 | O1 | O2 | O3 | Ofast | Os |
| ----- | ---- | --- | --- | --- | ----- | --- |
| cycle | 4356 | 11 | 11 | 0 | 0 | 11 |
| text | 77712 | 76092 | 76116 | 76440 | 76440 | 76064 |
| data | 2320 | 2320 | 2320 | 2320 | 2320 | 2320 |
| bss | 1528 | 1528 | 1528 | 1528 | 1528 | 1528|
| dec | 81560 | 79940 | 79964 | 80288 | 80288 | 79912 |
After the optimization, the cycle count turns out some problem, hence I check the disassmbly code of O0, O1 and O3 to find the difference.
```
$ riscv-none-elf-objdump -d palindrome_opt_O0.elf >objdump_O0.txt
$ riscv-none-elf-objdump -d palindrome_opt_O1.elf >objdump_O1.txt
$ riscv-none-elf-objdump -d palindrome_opt_O3.elf >objdump_O3.txt
```
## O0
By observing the main function of O0, we can see that it call palindrome_detected four times correctly!
```
00010a48 <main>:
10a48: fc010113 add sp,sp,-64
10a4c: 02112e23 sw ra,60(sp)
10a50: 02812c23 sw s0,56(sp)
10a54: 04010413 add s0,sp,64
10a58: f28ff0ef jal 10180 <getticks>
10a5c: fea42423 sw a0,-24(s0)
10a60: feb42623 sw a1,-20(s0)
10a64: 00000793 li a5,0
10a68: 00000813 li a6,0
10a6c: fef42023 sw a5,-32(s0)
10a70: ff042223 sw a6,-28(s0)
10a74: 00100713 li a4,1
10a78: 00000793 li a5,0
10a7c: fce42c23 sw a4,-40(s0)
10a80: fcf42e23 sw a5,-36(s0)
10a84: 00300713 li a4,3
10a88: 000017b7 lui a5,0x1
10a8c: c0078793 add a5,a5,-1024 # c00 <exit-0xf494>
10a90: fce42823 sw a4,-48(s0)
10a94: fcf42a23 sw a5,-44(s0)
10a98: 0f000713 li a4,240
10a9c: 0f0007b7 lui a5,0xf000
10aa0: fce42423 sw a4,-56(s0)
10aa4: fcf42623 sw a5,-52(s0)
10aa8: fe042503 lw a0,-32(s0)
10aac: fe442583 lw a1,-28(s0)
10ab0: bd9ff0ef jal 10688 <palindrome_detected>
10ab4: fd842503 lw a0,-40(s0)
10ab8: fdc42583 lw a1,-36(s0)
10abc: bcdff0ef jal 10688 <palindrome_detected>
10ac0: fd042503 lw a0,-48(s0)
10ac4: fd442583 lw a1,-44(s0)
10ac8: bc1ff0ef jal 10688 <palindrome_detected>
10acc: fc842503 lw a0,-56(s0)
10ad0: fcc42583 lw a1,-52(s0)
10ad4: bb5ff0ef jal 10688 <palindrome_detected>
10ad8: ea8ff0ef jal 10180 <getticks>
10adc: fca42023 sw a0,-64(s0)
10ae0: fcb42223 sw a1,-60(s0)
10ae4: fc042703 lw a4,-64(s0)
10ae8: fc442783 lw a5,-60(s0)
10aec: fe842503 lw a0,-24(s0)
10af0: fec42583 lw a1,-20(s0)
10af4: 40a70633 sub a2,a4,a0
10af8: 00060813 mv a6,a2
10afc: 01073833 sltu a6,a4,a6
10b00: 40b786b3 sub a3,a5,a1
10b04: 410687b3 sub a5,a3,a6
10b08: 00078693 mv a3,a5
10b0c: 00060713 mv a4,a2
10b10: 00068793 mv a5,a3
10b14: 00070613 mv a2,a4
10b18: 00078693 mv a3,a5
10b1c: 000227b7 lui a5,0x22
10b20: 25078513 add a0,a5,592 # 22250 <__clzsi2+0x88>
10b24: 5a4000ef jal 110c8 <printf>
10b28: 00000793 li a5,0
10b2c: 00078513 mv a0,a5
10b30: 03c12083 lw ra,60(sp)
10b34: 03812403 lw s0,56(sp)
10b38: 04010113 add sp,sp,64
10b3c: 00008067 ret
```
## O1
But in the main function of O1, it erases the part of calling palindrome_detected function.
```
00010498 <main>:
10498: ff010113 add sp,sp,-16
1049c: 00112623 sw ra,12(sp)
104a0: 00812423 sw s0,8(sp)
104a4: 00912223 sw s1,4(sp)
104a8: cd9ff0ef jal 10180 <getticks>
104ac: 00050413 mv s0,a0
104b0: 00058493 mv s1,a1
104b4: ccdff0ef jal 10180 <getticks>
104b8: 40850633 sub a2,a0,s0
104bc: 00c53533 sltu a0,a0,a2
104c0: 409585b3 sub a1,a1,s1
104c4: 40a586b3 sub a3,a1,a0
104c8: 00022537 lui a0,0x22
104cc: c0050513 add a0,a0,-1024 # 21c00 <__clzsi2+0x8c>
104d0: 5a4000ef jal 10a74 <printf>
104d4: 00000513 li a0,0
104d8: 00c12083 lw ra,12(sp)
104dc: 00812403 lw s0,8(sp)
104e0: 00412483 lw s1,4(sp)
104e4: 01010113 add sp,sp,16
104e8: 00008067 ret
```
## O3
In the main function of O3, it doesn't have the getticks function, it just write getticks in the main function. Besides, after it finish the first getticks instructions, it executes the next getticks instruction right away, I think that is the reason why the elapsed cycle is 0 in O3.
```
000100c0 <main>:
100c0: ff010113 add sp,sp,-16
100c4: 00112623 sw ra,12(sp)
100c8: c8002773 rdcycleh a4
100cc: c0002673 rdcycle a2
100d0: c80027f3 rdcycleh a5
100d4: 40f70733 sub a4,a4,a5
100d8: 00173713 seqz a4,a4
100dc: 40e00733 neg a4,a4
100e0: 00e67633 and a2,a2,a4
100e4: c80026f3 rdcycleh a3
100e8: c00027f3 rdcycle a5
100ec: c80025f3 rdcycleh a1
100f0: 40b686b3 sub a3,a3,a1
100f4: 0016b693 seqz a3,a3
100f8: 40d006b3 neg a3,a3
100fc: 00d7f7b3 and a5,a5,a3
10100: 40c78633 sub a2,a5,a2
10104: 00c7b7b3 sltu a5,a5,a2
10108: 40e686b3 sub a3,a3,a4
1010c: 00022537 lui a0,0x22
10110: 40f686b3 sub a3,a3,a5
10114: d5850513 add a0,a0,-680 # 21d58 <__clzsi2+0x88>
10118: 2b9000ef jal 10bd0 <printf>
1011c: 00c12083 lw ra,12(sp)
10120: 00000513 li a0,0
10124: 01010113 add sp,sp,16
10128: 00008067 ret
```
## Test
Because the optimization will make the main function erase the execute of palindrome detected function, thus I change the c code into the following to check if the program executed the function correctly.
```c
int main(){
ticks t0 = getticks();
uint64_t testA = 0x0000000000000000;
uint64_t testB = 0x0000000000000001;
uint64_t testC = 0x00000C0000000003;
uint64_t testD = 0x0F000000000000F0;
printf("%d",palindrome_detected(testA));
printf("%d",palindrome_detected(testB));
printf("%d",palindrome_detected(testC));
printf("%d",palindrome_detected(testD));
ticks t1 = getticks();
printf("elapsed cycle: %" PRIu64 "\n", t1 - t0);
return 0;
```
After the change, the elapsed cycle count becomes correct number.
| | O0 | O1 | O2 | O3 | Ofast | Os |
| ----- | ---- | --- | --- | --- | ----- | --- |
| cycle | 6656 | 3631 | 3519 | 3470 | 3470 | 3668 |
| text | 77792 | 76204 | 76228 | 76568 | 76568 | 76176 |
| data | 2320 | 2320 | 2320 | 2320 | 2320 | 2320 |
| bss | 1528 | 1528 | 1528 | 1528 | 1528 | 1528|
| dec | 81640 | 80052 | 80076 | 80416 | 80416 | 80024 |
:::info
Observation:
1. From O0 to O1, the cycle count is significantly reduced **(6656->3631)**.
2. From O1 to O2, the cycle count is slightly reduced **(3631->3519)**.
3. From O2 to O3, the cycle count is slightly reduced **(3519->3470)**.
4. From O3 to Ofast, the cycle count is the same, and they are the fastest **(3470)**.
5. The cycle count of Os is the second worst (only better than O0), but Os has the smallest text section.
6. Except O0, the one which has the smaller cycle count has the bigger text section.
7. Data and bss are all the same.
:::
Using objdump to observe the disassembly code
```
$ riscv-none-elf-objdump -d palindrome_optprint_O0.elf >objdump_print_0.txt
$ riscv-none-elf-objdump -d palindrome_optprint_O1.elf >objdump_print_1.txt
$ riscv-none-elf-objdump -d palindrome_optprint_O2.elf >objdump_print_2.txt
$ riscv-none-elf-objdump -d palindrome_optprint_O3.elf >objdump_print_3.txt
$ riscv-none-elf-objdump -d palindrome_optprint_Ofast.elf >objdump_print_fast.txt
$ riscv-none-elf-objdump -d palindrome_optprint_Os.elf >objdump_print_s.txt
```
## Comparison of main function
| | O0 | O1 | O2 | O3 | Ofast | Os |
| ------------ |:----- | ----- | ----- | ----- | ----- | ----- |
| Line | 82 | 49 | 49 | 59 | 59 | 49 |
| s-type reg | 1 | 3 | 3 | 3 | 3 | 3 |
| a-type reg | 7 | 4 | 4 | 6 | 6 | 4 |
| t-type reg | 0 | 0 | 0 | 0 | 0 | 0 |
| other reg | ra、sp | ra、sp | ra、sp | ra、sp| ra、sp | ra、sp |
| stack usage | 64 | 16 | 16 | 16 | 16 | 16 |
| lw/sw | 28 | 8 | 8 | 8 | 8 | 8 |
| branch | 12 | 12 | 12 | 10 | 10 | 12 |
:::info
Observation:
1. From O0 to O1, the line of main function、 stack usage and lw/sw instruction are all significantly reduced,
2. From O1 to O2, the structure of main functions are the same, only a small difference between them is the address of other functions.
```
(jal 10180 <getticks> )
(jal 10244 <getticks> )
```
3. From O2 to O3, the line of main function is slightly increase **(49->59)**, O3 writes getticks in the main function which is lead to use two more register and two less branch instructions.
4. From O3 to Ofast, these two are totally the same.
5. The structure of Os's main function is the same as O1's and O2's.
:::
## Comparison of Palindrome_detected
| | O0 | O1 | O2 | O3 | Ofast | Os |
| ------------ | ----- | ----- | ----- | ----- | ----- | ----- |
| Line | 240 | 98 | 104 | 187 | 187 | 65 |
| s-type reg | 11 | 2 | 2 | 0 | 0 | 7 |
| a-type reg | 6 | 8 | 8 | 8 | 8 | 6 |
| t-type reg | 0 | 0 | 0 | 2 | 2 | 0 |
| other reg | sp、ra | sp、ra | sp、ra | ra | ra | sp、ra |
| stack usage | 176 | 16 | 16 | 0 | 0 | 48 |
| lw/sw | 154 | 6 | 6 | 0 | 0 | 18 |
| branch | 12 | 12 | 12 | 11 | 11 | 7 |
:::info
Observation:
1. From O0 to O1, the line of palindrome_detected function、the use of register、stack usage and lw/sw instruction are all significantly reduced,
2. From O1 to O2, the line of main function is slightly increase, the major difference between them is the instruction order.
| O1 | O2 |
| --- | --- |
add sp,sp,-16|add sp,sp,-16
sw ra,12(sp)|sw s0,8(sp)
sw s0,8(sp) |sw s1,4(sp)
sw s1,4(sp) |mv s0,a1
mv s0,a0 |sw ra,12(sp)
mv s1,a1 |mv s1,a0 |
3. From O2 to O3, the line of main function is sifnificantly increase
**(104->187)**. It is because O3 writes CLZ in the palindrome_detected function which is lead to use one less branch instructions (jal ra clz).
4. From O3 to Ofast, these two are totally the same. Besides,they don't need to access sp register.It is because that they don't use stack in the palindrome_detected function.
5. Os uses register、stack usage、lw/sw to exchange the line of palindrome_detected.
:::
## Optimize handwritten Assembly code
I remove all redundant **mv** instruction. Furthermore, I revise the return section, the old return section uses **beq t2,t5, same # revTempX == tempY**, I revise it into no branch version by
**xor a0, t2, t5** # a0 = 0/1, if input is palindrome/not palindrome.
This is the opposite to our demand. So, I use seqz to turn 0/1 into 1/0 to meet our goal.
**seqz a0, a0**
```diff
palindrome_detected:
addi sp, sp, -4
sw ra, 0(sp)
jal ra count_leading_zeros
- mv t0, a0
- mv t1, a1
li t4, 64
sub a2, t4, s0 # a2 = nob = 64 - clz;
andi t5, a2, 1 # t5 = checkEven
srli t4, a2, 1 # t4 = nob >> 1
- srl t3, t1, t4 # x >> (nob >> 1) (right-half)
+ srl t3, a1, t4 # x >> (nob >> 1) (right-half)
li t6 32
sub t2, t6 t4 # (32 - nob>>1)
- sll t2, t0, t2 # x >> (nob >> 1) (left-half)
+ sll t2, a0, t2 # x >> (nob >> 1) (left-half)
or t3, t3, t2 # tempX = x >> (nob >> 1)
srl t3, t3 , t5 # t3 = tempX
add t4, t4, t5 # leftShiftNum = (nob>>1) + checkEven
add t4, t4, s0 # leftShiftNum += clz
addi t4, t4, -32
- addi t5, a1, 0
beq t4,t6, leftShiftNum_equal_32
- sll t2, t1, t4 # tempY = (x << leftShiftNum) (left-half);
+ sll t2, a1, t4 # tempY = (x << leftShiftNum) (left-half);
srl t5, t2, t4 # tempY = (tempY >> leftShiftNum);
leftShiftNum_equal_32:
# t2 = reversed number
li t2, 0
li t0, 31
reverse_loop:
srli t3, t3, 1 # tempX >>= 1
beqz t3, return
andi t6, t3, 1
slli t2, t2, 1
or t2, t2, t6
j reverse_loop
return:
- beq t2,t5, same # revTempX == tempY
- li a0 0
+ xor a0, t2, t5
+ seqz a0, a0
lw ra, 0(sp)
addi sp sp 4
jr ra
- same:
- li a0 1
- lw ra, 0(sp)
- addi sp sp 4
- jr ra
count_leading_zeros:
- mv t1, a1 # t1 = low 32bits
- mv t0, a0 # t0 = high 32bits
# x |= (x >> 1);
- slli t2, t0, 31 # high 32bits shift left 31
- srli t3, t1, 1 # low 32bits shift right 1
+ slli t2, a0, 31 # high 32bits shift left 31
+ srli t3, a1, 1 # low 32bits shift right 1
or t3, t2, t3
- srli t2, t0, 1 # high 32bits shift right 1
- or t0, t0, t2
- or t1, t1, t3
+ srli t2, a0, 1 # high 32bits shift right 1
+ or t0, a0, t2
+ or t1, a1, t3
.
.
.
# x += (x >> 32)
- mv t2, t0
- add t1, t1, t2
- sltu t5, t1, t2
+ add t1, t1, t0
+ sltu t5, t1, t0
add t0, t0, t5
# return (64 - (x & 0x7f))
andi t1, t1, 0x7f
li t2, 64
sub s0, t2, t1 # store clz result in s0
ret
```
| | handwrite | handwrite_opt |
| ----- | --------- | -------------
| cycle | 1002 | 974 |
| text | 632 | 592 |
## [FULL CODE](https://github.com/yuchen0620/Computer-Architecture/tree/main/HW2)