Assignment1: RISC-V Assembly and Instruction Pipeline

contributed by < kdnvt >

There are two branches in the GitHub repository. While main is the straightforward version, unrolling use the technique of loop unrolling to optimize the code.

Palindrome Linked List (LeetCode 234)

Description: Given the head of a singly linked list, return true if it is a palindrome or false otherwise.

Constraints: The number of nodes in the list is in the range [1, 105].
0 <= Node.val <= 9

Follow up: Could you do it in \(O(n)\) time and \(O(1)\) space?

Solution

Idea for problem solving

Given the condition of \(O(1)\) space, I think the modification of original linked list is inevitable. My basic idea is to reverse the linked list node one by one until middle node, and compare the first half list, which has already been reversed, with the second half list. If two linked list are the same, the original linked list is palindrome linked list.

C program

First version

bool isPalindrome(struct ListNode* head)
{
    struct ListNode *tmp,*cur = head, *next = head->next;
    int length= 0;
    for(;cur!=NULL;cur = cur->next,length++);
    
    if(length<2)
        return true;
    cur = head;
    for(int i=0;i<length/2 - 1;i++){
        tmp = next->next;
        next->next = cur;
        cur = next;
        next = tmp;
    }
    if (length%2 ){
        next = next->next;
    }
    
    for(int i=0;i<length/2 - 1;i++){
        if(cur->val != next->val)
            break;
        cur = cur->next;
        next = next->next;
    }
    if(cur->val == next->val)
        return true;
    return false;
}

Because getting the length of linked list directly is impossible, my first intuition is to traverse the whole linked list, get the length of it, and calculate how many nodes should I reverse.
After that, I can reverse the corresponding number of nodes, and compare the lists.

Second version

bool isPalindrome(struct ListNode* head)
{
    struct ListNode *tmp,*cur = head, *next = head->next,*tail = cur;
    
    if(!head->next)
        return true;
    
    while(tail!=NULL) {
        tail = tail->next;
        if (!tail) {
            cur = cur->next;
            break;
        }
        tail = tail->next;
        
        if (!tail)
            break;
        
        tmp = next->next;
        next->next = cur;
        cur = next;
        next = tmp;
    }
        
    while(next->next) {
        if(cur->val != next->val)
            break;
        cur = cur->next;
        next = next->next;
    }
    if(cur->val == next->val)
        return true;
    return false;
}

After the first version, I was thinking of whether there is a better way to solve this problem. It occurs to me that one of solutions to remove middle node of linked list uses the fast-slow pointers technique.

Third version

bool isPalindrome(struct ListNode* head)
{
    struct ListNode *tmp, *prev = NULL,*cur = head, *tail = cur;
    
    if (!head->next)
        return true;
    
    while(tail && tail->next) {

        tail = tail->next->next;
        
        tmp = cur->next;
        cur->next = prev;
        prev = cur;
        cur = tmp;
        
    }
    if (tail)  // length is odd number
        cur = cur->next;
    
    while (cur->next) {
        if(cur->val != prev->val)
            break;
        cur = cur->next;
        prev = prev->next;
    }
    if (cur->val == prev->val)
        return true;
    return false;
}

Same thought as second version, but a more concise one.

Fourth version

    while (cur) {
        if (cur->val != prev->val)
            return false;
        cur = cur->next;
        prev = prev->next;
    }
    return true;

By simplifying the comparing while loop, it could be much more consice.

Assembly

.data

    list : .word 1,0,2,0,2,0,1,0
    list2: .word 1,0,2,0,3,0,1,0
    list3: .word 1,0,2,0,3,0,2,0,1,0
    list4: .word 1,0,2,0,3,0,3,0,1,0
    list5: .word 1,0
.text

main:
    la s4, list  # s4 stores the head of the target list 
    
    # the process to link the list.
    # list5 don't need this section.
    
    addi t0, s4, 8
    sw t0, 4(s4)
    addi t0, s4, 16
    sw t0, 12(s4)
    addi t0, s4, 24
    sw t0, 20(s4) 
    # addi t0, s4, 32 # for length 5 list
    # sw t0, 28(s4)   # for length 5 list
    
    # end of linking section
    
    add a0, s4, zero
    jal ra, isPalindrome
    j print


isPalindrome:
    addi sp,sp,-16 # push s0~s4
    sw s0,0(sp)
    sw s1,4(sp)
    sw s2,8(sp)
    sw s3,12(sp)

    add s0, zero, zero  # s0 prev = NULL;
    add s1, a0, zero    # s1 cur = head;
    add s2, a0, zero    # s2 = tail = head;

    lw t0, 4(a0)        # t0 = head->next;
    beq t0,zero, true
loop1:
    beq s2, zero, end1  # while(tail)
    lw t0, 4(s2)        # t0 = tail->next;
    beq t0, zero, end1  # while(tail->next)

    lw t1, 4(t0)        # t1 = tail->next->next;
    add s2, t1, zero    # tail = tail->next->next;

    lw s3, 4(s1)        # s3 = tmp = cur->next;
    sw s0, 4(s1)        # cur->next = prev;
    add s0, s1, zero    # prev = cur;
    add s1, s3, zero    # cur = tmp;
    j loop1

end1:
    beq s2,zero,loop2
    lw s1, 4(s1)        # cur = cur->next

loop2:
    lw t0, 4(s1)        # t0 = cur->next;
    beq t0,zero,end2  # while(cur->next)
    lw t0, 0(s0)        # t0 = prev->val;
    lw t1, 0(s1)        # t1 = cur->val;
    bne t0,t1, end2     # if(cur->val != prev->val)
    lw s0, 4(s0)        # prev = prev->next;
    lw s1, 4(s1)        # cur = cur->next;
    j loop2             #
end2:
    lw t0, 0(s1)
    lw t1, 0(s0)
    beq t0,t1, true     #if(cur->val == prev->val) return true

    add a0,zero,zero    #return false
    j pop

true:
    addi a0,zero,1  #return true
    
pop:
    lw s0,0(sp)
    lw s1,4(sp)
    lw s2,8(sp)
    lw s3,12(sp)
    addi sp,sp,16 # pop s0~s4
    jr ra
    
print:
    li a7,1
    ecall

exit:

There are 5 test data in this program:

list : Palindrome list with even number length.
list2 : Non-palindrome list with even number length.
list3 : Palindrome list with odd number length.
list4 : Non-palindrome list with odd number length.
list5 : Single node list.(palindrome)

experiment results

The 5-stage in-order processor with hazard detection/elimination and forwarding gets correct results on each of test data. However, when using the processor without hazard detection/elimination, the simulator says "Unknown system call in register 'a7':0 ". It obvious that there exists hazard around the "ecall" instruction. Compare the pipeline diagram of two version :

With hazard detection/elimination:

Instruction
addi x17 x0 1	IF	ID	EX	MEM	WB
ecall		IF	ID	EX	-	-	MEM	WB

Without hazard detection/elimination:

Instruction
addi x17 x0 1	IF	ID	EX	MEM	WB
ecall		IF	ID	EX	MEM	WB

Finally, I add three nop operations before ecall, so the processor without hazard detection can use the ecall. (two nop aren't enough)

Instruction
addi x17 x0 1	IF	ID	EX	MEM	WB
addi x0 x0 0		IF	ID	EX	MEM	WB
addi x0 x0 0			IF	ID	EX	MEM	WB
addi x0 x0 0				IF	ID	EX	MEM	WB
ecall					IF	ID	EX	MEM	WB

After I fixed the ecall, nevertheless, the processor without hazard detection still got the wrong result. It seems like there are other hazards.

Potential data hazard

When comparing two pipeline diagrams, I can find many hazards which are eliminated by the standard processor.

Standard:

Instruction
lw x5 4 x10	IF	ID	EX	MEM	WB
beq x5 x0 104	IF	ID	-	EX	MEM	WB
beq x18 x0 40		IF	-	ID	EX	MEM	WB

without hazard detection:

Instruction
lw x5 4 x10	IF	ID	EX	MEM	WB
beq x5 x0 104		IF	ID	EX	MEM	WB
beq x18 x0 40			IF	ID	EX	MEM	WB

As we can see, there is a RAW(Read After Write) data hazard on x5 register.

This diagram is the state before stalling. beq x5 x0 104 <true> is in the ID stage, while lw x5 4 x10 is in the EXE stage.

This diagram presents the state when stalling happens. The lw instruction read the x5 register, the value is 0x10000008 on the diagram, which is different from the previous value 0x10000018 beq just read from x5.

This diagram show the next state.
In this state, lw is in the WB stage, which means it will write back the new value to register x5. As we can see, the Wr_En signal in Registers file is green, which means it is enabled. Aside from writing back, it is also "forwarding" the new value to beq in EXE stage. The diagram below shows the forwarding path.

beq in EXE stage compares the two value from registers x5, x0, and decides whether branches or not. Two diagrams below shows the data path of register value. The first diagram shows the data path of x5 from forwarding lw instruction, while the second diagram shows the data path of x0. beq then compares the two value at the Branch unit which is shown at the bottom part of diagram. In this example, x5 is 0x10000008 and x0 is 0x00000000, so the branch isn't taken. As we can see, the Branch_taken signal is red.

This diagram shows x5 data path.

This diagram shows x0 data path.

This diagram shows another operation beq do in EXE stage, calculating the branch address. The upper value 0x00000050 is the instruction address of beq, while the below value 0x00000068 is the immediate field specified in instruction, which is the same to 104(decimal) in the beq x5 x0 104. beq then adds the two value and generate the target address 0x000000b8. In this example, the branch isn't taken.

Assembly optimization

Use temporary registers

Because isPalindrome doesn't call any other function, it is feasible to replace all callee-saved register with caller-save register, so we don't need to push them on the stack everytime the function being called.

isPalindrome:
    add t4, zero, zero  # t4 prev = NULL;
    add t5, a0, zero    # t5 cur = head;
    add t2, a0, zero    # t2 = tail = head;

    lw t0, 4(a0)        # t0 = head->next;
    beq t0,zero, true
loop1:
    beq t2, zero, end1  # while(tail)
    lw t0, 4(t2)        # t0 = tail->next;
    beq t0, zero, end1  # while(tail->next)

    lw t1, 4(t0)        # t1 = tail->next->next;
    add t2, t1, zero    # tail = tail->next->next;

    lw t3, 4(t5)        # t3 = tmp = cur->next;
    sw t4, 4(t5)        # cur->next = prev;
    add t4, t5, zero    # prev = cur;
    add t5, t3, zero    # cur = tmp;
    j loop1

end1:
    beq t2,zero,loop2
    lw t5, 4(t5)        # cur = cur->next

loop2:
    lw t0, 4(t5)        # t0 = cur->next;
    beq t0,zero,end2    # while(cur->next)
    lw t0, 0(t4)        # t0 = prev->val;
    lw t1, 0(t5)        # t1 = cur->val;
    bne t0,t1, end2     # if(cur->val != prev->val)
    lw t4, 4(t4)        # prev = prev->next;
    lw t5, 4(t5)        # cur = cur->next;
    j loop2             #
end2:
    lw t0, 0(t5)
    lw t1, 0(t4)
    beq t0,t1, true     #if(cur->val == prev->val) return true

    add a0,zero,zero    #return false
    j pop

true:
    addi a0,zero,1  #return true
    
pop:
    jr ra

Avoid redundant branch

In the comparing while loop of isPalindrome, there are several different branch. By referencing the fourth edition of my C program, I obviate the redundant branches and make it more concise.

isPalindrome:
    add t4, zero, zero  # t4 prev = NULL;
    add t5, a0, zero    # t5 cur = head;
    add t2, a0, zero    # t2 = tail = head;

    lw t0, 4(a0)        # t0 = head->next;
    beq t0, zero, end2
loop1:
    beq t2, zero, end1  # while(tail)
    lw t0, 4(t2)        # t0 = tail->next;
    beq t0, zero, end1  # while(tail->next)

    lw t1, 4(t0)        # t1 = tail->next->next;
    add t2, t1, zero    # tail = tail->next->next;

    lw t3, 4(t5)        # t3 = tmp = cur->next;
    sw t4, 4(t5)        # cur->next = prev;
    add t4, t5, zero    # prev = cur;
    add t5, t3, zero    # cur = tmp;
    
    j loop1

end1:
    add a0,zero,zero    # if two list are different
                        # the loop will jump to end
                        # and the a0 will be 0.
    beq t2,zero,loop2
    lw t5, 4(t5)        # cur = cur->next

loop2:  
    beq t5,zero,end2    # while(cur)
    lw t0, 0(t4)        # t0 = prev->val;
    lw t1, 0(t5)        # t1 = cur->val;
    bne t0,t1, fail     # if(cur->val != prev->val)
    lw t4, 4(t4)        # prev = prev->next;
    lw t5, 4(t5)        # cur = cur->next;
    
    j loop2             #
end2:
    addi a0,zero,1      # if loop terminates normally(cur == NULL).
                        # this line will be executed.
    
fail:
    jr ra

Loop unrolling

I think this one is interesting, because it doesn't reduce the number of instructions in the instruction memory. On the contrary, it increases the number. However, except for the disadvantage on the code size and readability, it may actually improve the performance of the program by reducing branch and utilizing pipelining.

The reason I want to use loop unrolling on this program is that Ripes always takes next instruction as predicted one, which means the loop will take the wrong branch most of the time. Therefore, it is a good opportunity to use loop unrolling, because it reduce the number of branch.

In order to demonstrate the impact of loop unrolling, I have to build a long linked list. I introduce some new function shown as below:

################ generate ################
generate:
    # a0 number
    addi sp,sp,-8
    sw s0,0(sp)    
    sw ra,4(sp)
    add s0,zero,a0    
    slli s0,s0,3    # 8bytes per node, 2^3 = 8, s0 is the size of whole momery
    add a0,s0,zero
    jal ra, sbrk
    la t0,heap_base
    lw t0, 0(t0)
    add a0,t0,zero
    addi t0,t0,4        # t0 is the address of "next" of each node
    add t1, t0,s0       # t1 is the boundry
gen_loop:
    
    bgt t0,t1,gen_end
    addi t2,t0,4
    sw t2,0(t0)        # node->next pointing to next node
    sw zero,-4(t0)        # node->val = 0
    addi t0,t0,8    # next node
    
    j gen_loop
gen_end:
    sw zero,-8(t0)    # make final node pointing to null
    
    lw s0,0(sp)
    lw ra,4(sp)
    addi sp,sp,8
    jr ra
    
    
################ sbrk ################
sbrk:
    # a0 incre_size
    la t0,heap_end
    lw t1,0(t0)
    add a0,t1,a0
    sw a0,0(t0)
    add t1,a0,zero
    li a7,214
    ecall
    
    # Todo : consider brk fail  
    add a0,t1,zero
    
    jr ra

Ripes doesn't support sbrk system call, so I use brk to implement a similar function. The function generate uses sbrk to require a block of memory, and use that to generate an arbitrary length linked list. The length is assigned from the first parameter passed in this function in register a0.

I plan to experiment three cases:

without unrolling
unrolling all loops two times
unrolling all loops four times

All loops includes loop1, loop2, gen_loop.
Takes loop2 for example.

Unrolling 2 times:

loop2:   
    beq t5,zero,end2    # while(cur)
    lw t0, 0(t4)        # t0 = prev->val;
    lw t1, 0(t5)        # t1 = cur->val;
    bne t0,t1, fail     # if(cur->val != prev->val)
    lw t4, 4(t4)        # prev = prev->next;
    lw t5, 4(t5)        # cur = cur->next;
    
    beq t5,zero,end2    # while(cur)
    lw t0, 0(t4)        # t0 = prev->val;
    lw t1, 0(t5)        # t1 = cur->val;
    bne t0,t1, fail     # if(cur->val != prev->val)
    lw t4, 4(t4)        # prev = prev->next;
    lw t5, 4(t5)        # cur = cur->next;
    
    j loop2             #

Unrolling 4 times:

loop2:    
    beq t5,zero,end2    # while(cur)
    lw t0, 0(t4)        # t0 = prev->val;
    lw t1, 0(t5)        # t1 = cur->val;
    bne t0,t1, fail     # if(cur->val != prev->val)
    lw t4, 4(t4)        # prev = prev->next;
    lw t5, 4(t5)        # cur = cur->next;
    
    beq t5,zero,end2    # while(cur)
    lw t0, 0(t4)        # t0 = prev->val;
    lw t1, 0(t5)        # t1 = cur->val;
    bne t0,t1, fail     # if(cur->val != prev->val)
    lw t4, 4(t4)        # prev = prev->next;
    lw t5, 4(t5)        # cur = cur->next;
    
    beq t5,zero,end2    # while(cur)
    lw t0, 0(t4)        # t0 = prev->val;
    lw t1, 0(t5)        # t1 = cur->val;
    bne t0,t1, fail     # if(cur->val != prev->val)
    lw t4, 4(t4)        # prev = prev->next;
    lw t5, 4(t5)        # cur = cur->next;
    
    beq t5,zero,end2    # while(cur)
    lw t0, 0(t4)        # t0 = prev->val;
    lw t1, 0(t5)        # t1 = cur->val;
    bne t0,t1, fail     # if(cur->val != prev->val)
    lw t4, 4(t4)        # prev = prev->next;
    lw t5, 4(t5)        # cur = cur->next;
    
    j loop2             #

I try to pass a linked list with 100 nodes to isPalindrome, and compare the difference between cases. Here's the results:

Without unrolling:

Unrolling two times:

Unrolling four times:

As we can see, for a linked list with 100 nodes, the number of cycles drops from 2098 to 1680 with four times unrolling.

Obviate data hazards by reordering instructions

There are several data hazards in this program, which leads to stalling of pipeline and reduces the performance. Here, I tried to reorder instructions and avoid the hazards. I didn't deal with all of hazards but only those in the loops, because the process runs loops most of the time.

loop1:
    lw t0, 4(t2)        # t0 = tail->next;
    beq t2, zero, end1  # while(tail)
    beq t0, zero, end1  # while(tail->next)

    lw t1, 4(t0)        # t1 = tail->next->next;
    
    lw t3, 4(t5)        # t3 = tmp = cur->next;
    
    add t2, t1, zero    # tail = tail->next->next;

 
    sw t4, 4(t5)        # cur->next = prev;
    add t4, t5, zero    # prev = cur;
    add t5, t3, zero    # cur = tmp;
    
    j loop1

end1:

loop2:
    
    beq t5,zero,end2    # while(cur)
    lw t0, 0(t4)        # t0 = prev->val;
    lw t1, 0(t5)        # t1 = cur->val;
 
    lw t4, 4(t4)        # prev = prev->next;
    lw t5, 4(t5)        # cur = cur->next;
    bne t0,t1, fail     # if(cur->val != prev->val)
    
    j loop2             #
end2:

This figure below shows the pipeline diagram of loop2 without reordering:

This figure below shows the pipeline diagram of loop2 with reordering:

As we can see, the one stalling in each loop is eliminated.

I combine instructions reordering with loop unrolling, and measures the performance on linked list with 100 nodes.

Without unrolling:

Unrolling with two times:

Unrolling with four times:

Eliminate redundant branch instructions by strip mining like technique

There are some updates in the program. I remove the gen_loop in generate function, and uses a new function connect instead. The function connects the nodes in a continuous memory block. The new loop con_loop in connect have nearly the same effect as original gen_loop. For the following examples, I will use con_loop instead of gen_loop.

To see the newest version of code, please view the following link: https://github.com/kdnvt/Palindrome-linked-list

It contains two different branch: main and unrolling. The main is the standard version, while unrolling use the loop unrolling technique.

There are some branch instrucions in unrolling con_loop:

con_loop:
    bge t0,a1,con_end
    
    addi t1,t0,4        # t1 = address of next node
    sw t1,0(t0)         # node->next pointing to next node
    addi t0,t0,8        # next node
    
    bge t0,a1,con_end
    
    addi t1,t0,4        # t1 = address of next node
    sw t1,0(t0)         # node->next pointing to next node
    addi t0,t0,8        # next node
    
    bge t0,a1,con_end
    
    addi t1,t0,4        # t1 = address of next node
    sw t1,0(t0)         # node->next pointing to next node
    addi t0,t0,8        # next node
    
    bge t0,a1,con_end
    
    addi t1,t0,4        # t1 = address of next node
    sw t1,0(t0)         # node->next pointing to next node
    addi t0,t0,8        # next node
    
    j con_loop
con_end:

When I unroll the loop four times, the number of bge instruction increase correspondingly. I think these instrucions is redundant and want to eliminate increased instructions and make each iteration body have only one bge instruction.

The solution I want to use is similar to strip mining in compilers for vector processors, which splits original loop into two new loops. One runs iterations without unrolling, while another runs iterations with unrolling.

Let n be number of iterations, and k be the times of loop unrolling. First execute n%k iterations in original loop, and then execute remaining iterations in k unrolling loop. Process can determine whether to take the branch or not on the head of loop body, and obviate the checking for the rest of body, since the the remaining iterations must be multiple of k.

The optimized code is shown as below:

    andi t2,a1,3        # t2 = number of nodes % 4 (loop unrolling times)
    addi t0,a0,4        # t1 = &first_node->next
    slli a1,a1,3        # a1 = size of list        
    add a1,a1,t0        # a1 = boundry
    
con_loop_normal:
    beq t2,zero,con_loop
    addi t1,t0,4        # t1 = address of next node
    sw t1,0(t0)         # node->next pointing to next node
    addi t2,t2,-1
    addi t0,t0,8        # next node
    j con_loop_normal
   
con_loop:
    bge t0,a1,con_end
    
    addi t1,t0,4        # t1 = address of next node
    sw t1,0(t0)         # node->next pointing to next node
    addi t0,t0,8        # next node
    
    addi t1,t0,4        # t1 = address of next node
    sw t1,0(t0)         # node->next pointing to next node
    addi t0,t0,8        # next node
    
    addi t1,t0,4        # t1 = address of next node
    sw t1,0(t0)         # node->next pointing to next node
    addi t0,t0,8        # next node
    
    addi t1,t0,4        # t1 = address of next node
    sw t1,0(t0)         # node->next pointing to next node
    addi t0,t0,8        # next node
    
    j con_loop
con_end:

I measured the number of cycles and instructions retired with linked list with 500 nodes.

Without unrolling:

With unrolling four times:

With unrolling four times and strip mining on con_loop:

It is interesting that unrolling with strip mining got fewer cycles and instrucions retired but a higher CPI than unrolling without strip mining, which is a good example to demonstrate that higher CPI doesn't always means better performance.

After optimize the con_loop, I try to use the same technique to optimize loop1 and loop2. Unfortunately, since we don't know the exact length of linked list at the beginning, it is infeasible to apply strip mining on loop1. However, if we count the number of iterations of loop1, we can optimize the loop2.

Nonetheless, the effect of this optimization is not very well, because we need to add extra instrucions in loop1 and loop2 to count. Theoretically, the effect would be more obvious with much longer list or much larger number of unrolling.

The optimized code is shown as below:

    li t6, 0            # t6 = 0; used to count the number of loop
    # ...
loop1:
    lw t0, 4(t2)        # t0 = tail->next;
    beq t2, zero, end1  # while(tail)
    beq t0, zero, end1  # while(tail->next)
    lw t1, 4(t0)        # t1 = tail->next->next;

    lw t3, 4(t5)        # t3 = tmp = cur->next;
    add t2, t1, zero    # tail = tail->next->next;
    sw t4, 4(t5)        # cur->next = prev;
    add t4, t5, zero    # prev = cur;
    add t5, t3, zero    # cur = tmp;

    # ...
    # ...
    # unrolling loops ... 
    # ...
    # ...

    lw t0, 4(t2)        # t0 = tail->next;
    beq t2, zero, end1  # while(tail)
    beq t0, zero, end1  # while(tail->next)
    lw t1, 4(t0)        # t1 = tail->next->next;

    lw t3, 4(t5)        # t3 = tmp = cur->next;
    add t2, t1, zero    # tail = tail->next->next;
    sw t4, 4(t5)        # cur->next = prev;
    add t4, t5, zero    # prev = cur;
    add t5, t3, zero    # cur = tmp;

    addi t6,t6,1        # t6++;
    j loop1

end1:
    add a0,zero,zero    # if two list are different
                        # the loop will jump to end
                        # and the a0 will be 0.
    beq t2,zero,loop2
    lw t5, 4(t5)        # cur = cur->next

loop2: 
    beq t6,zero,ori_loop2    # while there are unrolling iterations
    lw t0, 0(t4)        # t0 = prev->val;
    lw t1, 0(t5)        # t1 = cur->val;
    lw t4, 4(t4)        # prev = prev->next;
    lw t5, 4(t5)        # cur = cur->next;
    bne t0,t1, fail     # if(cur->val != prev->val)
    
    lw t0, 0(t4)        # t0 = prev->val;
    lw t1, 0(t5)        # t1 = cur->val;
    lw t4, 4(t4)        # prev = prev->next;
    lw t5, 4(t5)        # cur = cur->next;
    bne t0,t1, fail     # if(cur->val != prev->val)
    
    lw t0, 0(t4)        # t0 = prev->val;
    lw t1, 0(t5)        # t1 = cur->val;
    lw t4, 4(t4)        # prev = prev->next;
    lw t5, 4(t5)        # cur = cur->next;
    bne t0,t1, fail     # if(cur->val != prev->val)
    
    lw t0, 0(t4)        # t0 = prev->val;
    lw t1, 0(t5)        # t1 = cur->val;
    lw t4, 4(t4)        # prev = prev->next;
    lw t5, 4(t5)        # cur = cur->next;
    bne t0,t1, fail     # if(cur->val != prev->val)
    
    addi t6,t6,-1
    j loop2

ori_loop2:
    beq t5,zero,end2    # while(cur)
    lw t0, 0(t4)        # t0 = prev->val;
    lw t1, 0(t5)        # t1 = cur->val;
    lw t4, 4(t4)        # prev = prev->next;
    lw t5, 4(t5)        # cur = cur->next;
    bne t0,t1, fail     # if(cur->val != prev->val)
          
    j ori_loop2
                 
end2:

This optimization with 500 nodes list is shown as below:

Syntax	Example	Reference
# Header	Header	基本排版
- Unordered List	Unordered List
1. Ordered List	Ordered List
- [ ] Todo List	Todo List
> Blockquote	Blockquote
Bold font	Bold font
Italics font	Italics font
~~Strikethrough~~	~~Strikethrough~~
19^th^	19^th
H~2~O	H₂O
++Inserted text++	Inserted text
==Marked text==	Marked text
[link text](https:// "title")	Link
![image alt](https:// "title")	Image
`Code`	`Code`	在筆記中貼入程式碼
```javascript var i = 0; ```	`var i = 0;`	在筆記中貼入程式碼
:smile:		Emoji list
{%youtube youtube_id %}	Externals
$L^aT_eX$	L^aT_eX
:::info This is a alert area. :::	This is a alert area.