2022 Assignment3: SoftCPU

tags: `Computer Architecture`

contribute by <chiangkd>

Requirement follow Assignment3: SoftCPU

Source code

I choose the program from 鄒崴丞-Length of Last Word and Leetcode 77. Combinations

C code(Modified from 鄒崴丞)

#include <stdio.h>

int lengthOfLastWord(char *s){
    if (!*s)
        return 0;
    int i = 0;
    char *head = s;
    while (*s) {
        s++;
        i++;
    }
    s = head;
    int length = 0;
    s = s + i - 1;
    
    while(i && *s == ' '){
        s--;
        i--;
    }

    while(i-- && *s != ' '){
        s--;
        length++;
    }

    return length;
}

int main()
{
    char *str1 = "Hello World";
    char *str2 = "I am a student  ";
    char *str3 = "a";
    int len1 = lengthOfLastWord(str1);
    int len2 = lengthOfLastWord(str2);
    int len3 = lengthOfLastWord(str3);
    printf("For string \"%s\", length of last word : %d\n", str1, len1);
    printf("For string \"%s\", length of last word : %d\n", str2, len2);
    printf("For string \"%s\", length of last word : %d\n", str3, len3);

    return 0;
}

Handwrite Assembly from 鄒崴丞-Length of Last Word

.org 0
# Provide program starting address to linker
.global _start

/* newlib system calls */
.set SYSEXIT,  93
.set SYSWRITE, 64

.data


str1:       .string "Hello World"
            .set str1len, .-str1
str2:       .string "I am a student  "
            .set str2len, .-str2
str3:       .string "a"
            .set str3len, .-str3
fstr:       .string "For string \""
            .set flen, .-fstr
answrstr:   .string "\", length of last word : "
            .set answrlen, .-answrstr  
newline:    .string "\n"
            .set nllen, .-newline

.text
_start:
        # function : lengthOfLastWord(char * s)
        # a0 : pointer to the start of the string
        la          a0, str1
        # find the position of the last letter is not space
        jal         ra, lengthoflastword        # call strlen()
        add         a2, x0, a0
        la          a0, str1
        li          a1, str1len
        jal         print_result

        la          a0, str2
        # find the position of the last letter is not space
        jal         ra, lengthoflastword        # call strlen()
        add         a2, x0, a0
        la          a0, str2
        li          a1, str2len
        jal         print_result

        la          a0, str3
        # find the position of the last letter is not space
        jal         ra, lengthoflastword        # call strlen()
        add         a2, x0, a0
        la          a0, str3
        li          a1, str3len
        jal         print_result

        li          a0, 0             # return value is 0
        li          a7, SYSEXIT       # end
        ecall

nochar:
        li          a0, 0
        ret
lengthoflastword:
        lb          t0, 0(a0)
        beq         t0, x0, nochar
        
        andi        a2, a0, 3
        add         a1, x0, a0        # a1 = a0
        bne         a2, x0, unaligned # jump if the address is unaligned
starttraverse:
        lui         a3, 0x7F7F8
        addi        a3, a3, -129      # now a3 is 0x7F7F7F7F
        addi        a4, x0, -1        # a4 = -1
findend:
        lw          t0, 0(a1)
        lw          t1, 4(a1)
        lw          t2, 8(a1)
        lw          t3, 12(a1)
        and         a2, t0, t1
        and         t4, t2, t3
        and         a2, a2, t4
        addi        a1, a1, 16
        and         a5, a2, a3
        add         a5, a5, a3
        or          a5, a5, a3
        beq         a5, a4, findend   # break if there are NULL

        and         a5, t0, a3
        add         a5, a5, a3
        or          a5, a5, a3
        bne         a5, a4, back12

        and         a5, t1, a3
        add         a5, a5, a3
        or          a5, a5, a3
        bne         a5, a4, back8

        and         a5, t2, a3
        add         a5, a5, a3
        or          a5, a5, a3
        bne         a5, a4, back4

findbyte:
        lbu         a2, -4(a1)
        sub         a3, a1, a0
        beq         a2, x0, sub4
        lbu         a2, -3(a1)
        beq         a2, x0, sub3
        lbu         a2, -2(a1)
        snez        a2, a2
        add         a2, a2, a3
        addi        a1, a2, -2
        j           findword
back4:
        addi        a1, a1, -4
        j           findbyte
back8:
        addi        a1, a1, -8
        j           findbyte
back12:
        addi        a1, a1, -12
        j           findbyte

checkalign:
        beq         a3, x0, starttraverse
unaligned:
        lbu         a2, 0(a1)
        addi        a1, a1, 1
        andi        a3, a1, 3
        bne         a2, x0, checkalign
        sub         a1, a1, a0
        addi        a1, a1, -1
        j           findword
sub4:
        addi        a1, a3, -4
        j           findword
sub3:
        addi        a1, a3, -3
        j           findword
findword:
        li          a2, 32            # store space to register a2
        add         a0, a1, a0
        addi        a1, a1, 1
loop1:
        # now we reach the end of the string
        beq         a1, x0, return    # if encounter the last position, return
        addi        a0, a0, -1        # s--
        addi        a1, a1, -1        # the last position is not space
        lb          t0, 0(a0)         # t0 = s[i]
        beq         t0, a2, loop1     # while(char[i] == " ")
        add         a3, x0, a0        # a3 = a0
        add         a0, x0, x0
loop2:
        beq         a1, x0, return    # if encounter the last position, return
        addi        a1, a1, -1
        addi        a3, a3, -1
        addi        a0, a0, 1         # s++
        lb          t0, 0(a3)         # t0 = s[i]
        bne         t0, a2, loop2     
return:
        ret

print_result:
        add         s0, x0, a0
        add         s1, x0, a1
        add         s2, x0, a2
        li          a7, SYSWRITE
        li          a0, 1             # print For string : "
        la          a1, fstr
        la          a2, flen
        ecall

        li          a7, SYSWRITE
        li          a0, 1             # print i am a student  
        add         a1, x0, s0
        add         a2, x0, s1
        ecall

        li          a7, SYSWRITE
        li          a0, 1             # print ", length of last word is   
        la          a1, answrstr
        li          a2, answrlen
        ecall

        li          a7, SYSWRITE
        addi        a1, s2, 48        # get length
        addi        sp, sp, -4
        sw          a1, 0(sp)         # store answer in stack
        li          a0, 1             # print answer
        addi        a1, sp, 0
        li          a2, 4
        ecall
        addi        sp, sp, 4

        li          a7, SYSWRITE
        li          a0, 1             # print new line
        la          a1, newline       
        li          a2, nllen
        ecall
        ret

There are serval version in 鄒崴丞-Length of Last Word, I take the version with SWAR(4 register at one time), It seems has the best performance (CSR cycle) with longer string.

LeetCode 77. Combinations

Given two integers n and k, return all possible combinations of k numbers chosen from the range [1, n].

Constraints:

1 <= n <= 20

1 <= k <= n

Leetcode Solution

#define MAXLEN 184756
void GetLineSize(int **res, int *line, int lineSize,int n, int k, int *resLen, int idx)
{
    if (lineSize == k) {
        res[*resLen] = (int *)malloc(sizeof(int) * k);
        memcpy(res[*resLen], line, sizeof(int) * k);
        (*resLen)++;
        return;
    }

    for (int i = idx; i <= n; i++) {
        line[lineSize] = i;
        GetLineSize(res, line, lineSize + 1, n, k, resLen, i + 1);
    }
}
int** combine(int n, int k, int* returnSize, int** returnColumnSizes){

    int **res = malloc(sizeof(int *)* MAXLEN);
    int line[k];
    int lineSize = 0;
    int resLen = 0;
    GetLineSize(res, line, lineSize, n, k, &resLen, 1);
    *returnSize = resLen;
    *returnColumnSizes = malloc(sizeof(int)*(*returnSize));
    int i = *returnSize-1;
    while(i >= 0){
    	(*returnColumnSizes)[i--] = k;
    }
    return res;
}

From description, we know that the MAXLEN about the answer will be

C_{10}^{20} = 184756

, so that's why I give it a constant value and after caculation, just realloc it.

Improvement
In LeetCode, there are many problems take the argument int *returnSize and int ** returnColumnSizes, the first one is about output array's length, and the second one is about the each column size in the output array.

But, in my problem, the column size will always equal to k, which is an argument about combine function, and the returnSize will equal to

C_{k}^{n}

as mention above.

So, I remove returnSize and returnColumnSizes to simplfy the question. And also, use structure to reduce argument number

struct set {
    int n;
    int k;
    int returnSize;
};

#define MAXLEN 184756

void GetLineSize(int **res, int *line, int lineSize, struct set *cond, int idx)
{
    if (lineSize == cond->k) {
        res[cond->returnSize] = (int *)malloc(sizeof(int) * cond->k);
        memcpy(res[cond->returnSize], line, sizeof(int) * cond->k);
        (cond->returnSize)++;
        return;
    }
    for (int i = idx; i <= cond->n; i++) {
        line[lineSize] = i;
        GetLineSize(res, line, lineSize + 1, cond, i + 1);
    }
}
int** combine(struct set *cond){
    int **res = (int **)malloc(sizeof(int *)* MAXLEN);
    int line[cond->k];
    int lineSize = 0;
    GetLineSize(res, line, lineSize, cond, 1);
    res = realloc(res, sizeof(int*) * cond->returnSize);
    return res;
}

However, when I do make with this program, get an error message

DMEM address 002c5d44 out of range

which is defined in testbench.v, means out of memory range (DRAM + IRAM), so I shrink the MAXLENGTH to make it run properly.

#define MAXLEN 20000

Tweaked programs for srv32

You should validate the results in your program(s).

You should program in RISC-V assembly for the sake of further optimizations. Hint: You may modify a customized Makefile for building C and assembly source files from scratch.

Modify `Makefile`

Simply copy Makefile from existing hello example and just modifiy the target and associated dependency name

include ../common/Makefile.common

EXE      = .elf
SRC      = lolw.S
CFLAGS  += -L../common
LDFLAGS += -T ../common/default.ld
TARGET   = lolws
OUTPUT   = $(TARGET)$(EXE)

.PHONY: all clean

all: $(TARGET)

$(TARGET): $(SRC)
	$(CC) $(CFLAGS) -o $(OUTPUT) $(SRC) $(LDFLAGS)
	$(OBJCOPY) -j .text -O binary $(OUTPUT) imem.bin
	$(OBJCOPY) -j .data -O binary $(OUTPUT) dmem.bin
	$(OBJCOPY) -O binary $(OUTPUT) memory.bin
	$(OBJDUMP) -d $(OUTPUT) > $(TARGET).dis
	$(READELF) -a $(OUTPUT) > $(TARGET).symbol

clean:
	$(RM) *.o $(OUTPUT) $(TARGET).dis $(TARGET).symbol [id]mem.bin memory.bin

Problem 1: 鄒崴丞-Length of Last Word

And do some modification to let the assembly run on srv32

printf system call
stored ra

.data

str1:       .string "Hello World"
str2:       .string "I am a student  "  
str3:       .string "a"
space:      .string " "
fstr:       .string "For string \""
answrstr:   .string "\", length of last word : "
newline:    .string "\n"
iformat:    .string "%d"

.text

.global main
main:
        # function : lengthOfLastWord(char * s)
        # a0 : pointer to the start of the string
        addi sp, sp, -4
        sw ra, 0(sp)

        la          a0, str1
        # find the position of the last letter is not space
        jal         ra, lengthoflastword        # call strlen()
        add         a2, x0, a0
        la          a0, str1
        jal         print_result

        la          a0, str2
        # find the position of the last letter is not space
        jal         ra, lengthoflastword        # call strlen()
        add         a2, x0, a0
        la          a0, str2
        jal         print_result

        la          a0, str3
        # find the position of the last letter is not space
        jal         ra, lengthoflastword        # call strlen()
        add         a2, x0, a0
        la          a0, str3
        jal         print_result

        lw ra, 0(sp)
        addi sp, sp, 4
        li          a0, 0             # return value is 0
        ret

lengthoflastword:
        lb          t0, 0(a0)
        beq         t0, x0, return
        
        andi        a2, a0, 3
        add         a1, x0, a0        # a1 = a0
        bne         a2, x0, unaligned # jump if the address is unaligned
starttraverse:
        lui         a3, 0x7F7F8
        addi        a3, a3, -129      # now a3 is 0x7F7F7F7F
        addi        a4, x0, -1        # a4 = -1
findend:
        lw          t0, 0(a1)
        lw          t1, 4(a1)
        lw          t2, 8(a1)
        lw          t3, 12(a1)
        and         a2, t0, t1
        and         t4, t2, t3
        and         a2, a2, t4
        addi        a1, a1, 16
        and         a5, a2, a3
        add         a5, a5, a3
        or          a5, a5, a3
        beq         a5, a4, findend   # break if there are NULL

        and         a5, t0, a3
        add         a5, a5, a3
        or          a5, a5, a3
        bne         a5, a4, back12

        and         a5, t1, a3
        add         a5, a5, a3
        or          a5, a5, a3
        bne         a5, a4, back8

        and         a5, t2, a3
        add         a5, a5, a3
        or          a5, a5, a3
        bne         a5, a4, back4

findbyte:
        lbu         a2, -4(a1)
        sub         a3, a1, a0
        beq         a2, x0, sub4
        lbu         a2, -3(a1)
        beq         a2, x0, sub3
        lbu         a2, -2(a1)
        snez        a2, a2
        add         a2, a2, a3
        addi        a1, a2, -2
        j           findword
back4:
        addi        a1, a1, -4
        j           findbyte
back8:
        addi        a1, a1, -8
        j           findbyte
back12:
        addi        a1, a1, -12
        j           findbyte

checkalign:
        beq         a3, x0, starttraverse
unaligned:
        lbu         a2, 0(a1)
        addi        a1, a1, 1
        andi        a3, a1, 3
        bne         a2, x0, checkalign
        sub         a1, a1, a0
        addi        a1, a1, -1
        j           findword
sub4:
        addi        a1, a3, -4
        j           findword
sub3:
        addi        a1, a3, -3
        j           findword
findword:
        li          a2, 32            # store space to register a2
        add         a0, a1, a0
        addi        a1, a1, 1
loop1:
        # now we reach the end of the string
        beq         a1, x0, return    # if encounter the last position, return
        addi        a0, a0, -1        # s--
        addi        a1, a1, -1        # the last position is not space
        lb          t0, 0(a0)         # t0 = s[i]
        beq         t0, a2, loop1     # while(char[i] == " ")
        add         a3, x0, a0        # a3 = a0
        add         a0, x0, x0
loop2:
        beq         a1, x0, return    # if encounter the last position, return
        addi        a1, a1, -1
        addi        a3, a3, -1
        addi        a0, a0, 1         # s++
        lb          t0, 0(a3)         # t0 = s[i]
        bne         t0, a2, loop2     
return:

        ret

print_result:
        addi        sp, sp, -4
        sw          ra, 0(sp)

        add         s0, x0, a0
        add         s1, x0, a1
        add         s2, x0, a2

        la          a0, fstr          # print For string :
        call        printf

        add         a0, x0, s0        # print i am a student
        call        printf
          
        la          a0, answrstr      # print ", length of last word is   
        call        printf

        addi        a1, s2, 0         # get length
        addi        sp, sp, -4
        sw          a1, 0(sp)         # store answer in stack
        
        la          a0, iformat
        lw          a1, 0(sp)         # print answer
        call        printf
        addi        sp, sp, 4
    
        la          a0, newline       # print new line
        call        printf

        lw          ra, 0(sp)
        addi        sp, sp, 4
        ret

Result

sim

For string "Hello World", length of last word : 5
For string "I am a student  ", length of last word : 7
For string "a", length of last word : 1

Excuting 12991 instructions, 17619 cycles, 1.356 CPI
Program terminate
- ../rtl/../testbench/testbench.v:434: Verilog $finish

Simulation statistics
=====================
Simulation time  : 0.079 s
Simulation cycles: 17630
Simulation speed : 0.223165 MHz

rvsim

For string "Hello World", length of last word : 5
For string "I am a student  ", length of last word : 7
For string "a", length of last word : 1

Excuting 12991 instructions, 17619 cycles, 1.356 CPI
Program terminate

Simulation statistics
=====================
Simulation time  : 0.003 s
Simulation cycles: 17619
Simulation speed : 5.523 MHz

The answer is correct
- Executing 12991 instructions
- 17619 cycles
- 1.356 CPI

	sim	rvsim
Simulation time	0.075s	0.03s
Simulation cycles	17252	17241
Simulation speed	0.230027 MHz	5.760 MHz

Problem 2: Leetcode 77. Combinations

From the C code mentioned above, write the assembly which can properly run on srv32

Hand-write Assembly

.data
    n: .word 20

    lbracket: .string "["
    rbracket: .string "]"
    comma: .string ","
    backspace: .string "\b"
    newline: .string "\n"
    iformat: .string "%d,"
    xformat: .string "%x\n"

.text

.global main

main:
    li t1, 0
    addi sp, sp, -4
    sw ra, 0(sp)

    # push sp
    lw t0, n
    slli t0, t0, 2
    sub sp, sp, t0

    mv a0, sp           	# a0 = &cond
    li t0, 4            	# cond.n = 4
    sw t0, 0(a0)
    li t0, 3            	# cond.k = 3
    sw t0, 4(a0)
    sw x0, 8(a0)		# cond.returnSize = 0

    call combine
    # a0 = res

    lw a1, 4(a3)    		# a1 = cond.k
    lw a2, 8(a3)    		# a2 = cond.returnSize
    
    call printAns

    # pop sp
    lw t0, n
    slli t0, t0, 2
    add sp, sp, t0

    # restore ra
    lw ra, 0(sp)
    addi sp, sp, 4

    # exit
    li a0, 0
    ret

memcpy:
    # a0 dest
    # a1 src
    # a2 n
    #slli t5, a2, 0x2
    li t3, 0        # i = 0

mcpyloop:
    lw t4, 0(a1)
    sw t4, 0(a0)

    addi a0, a0, 4
    addi a1, a1, 4
    addi t3, t3, 4
    bne t3, a2, mcpyloop
    ret 

GetLineSize:
    # a0 = res
    # a1 = line
    # a2 = lineSize
    # a3 = cond
    # a4 = idx
    addi sp, sp, -12
    sw ra, 0(sp)
    sw a2, 4(sp)    			# lineSize
    sw a4, 8(sp)    			# idx
    lw s1, 4(a3)    			# s1 = cond->k

    bne a2, s1, GLSrecursive    	# if (lineSize != cond->k)
    
PutLineIn:
    # a0 = res
    # a1 = line
    # a2 = lineSize
    # a3 = cond
    # a4 = idx
    addi sp, sp, -24
    sw a0, 0(sp)
    sw a1, 4(sp)
    sw a2, 8(sp)
    sw a3, 12(sp)
    sw a4, 16(sp)

    lw t2, 20(sp)

    add t1, t2, t1
    add a0, a0, t1      # a0 = res[cond->returnSize]
    lw t2, 4(a3)        # t2 = cond->k

    slli a2, t2, 0x2    # a2 = sizoef(int) * cond->k
    sw a2, 20(sp)       # store offset            # Here can do some improvement

    call memcpy
    lw a0, 0(sp)        # res
    lw a1, 4(sp)        # line
    lw a2, 8(sp)        # lineSize
    lw a3, 12(sp)       # cond
    lw a4, 16(sp)       # idx
    
    addi sp, sp, 24
    addi t5, t5, 1      # cond->returnSize++
    sw t5, 8(a3)        # store returnSize back
    lw ra, 0(sp)
    addi sp, sp, 12
    ret
    
GLSrecursive:
    mv s2, a2                   # s1 = lineSize
    mv s3, a4                   # int i = idx
    lw t0, 0(a3)                # t0 = cond->n
    
    blt t0, s3, Normalreturn
    #addi t0, t0, 1
GLSloop:
    
    slli t3, s2, 0x2            # t3 = line[lineSize] offset
    add t4, a1, t3              # t4 = line[lineSize] address
    sw s3, 0(t4)                # line[lineSize] = i
    
    # a2, a3 must put in function with plus 1
    addi a2, s2, 1              # lineSize + 1
    addi a4, s3, 1              # i + 1
    
    call GetLineSize
    addi s3, s3, 1              # i++
    sw s3, 8(sp)
    bge t0, s3, GLSloop
    
Normalreturn:
    lw ra, 0(sp)
    addi sp, sp, 12
    lw a2, 4(sp)
    lw a4, 8(sp)

    mv s2, a2
    mv s3, a4

    ret

combine:
    # a0 = &cond
    # 0(a0) = cond->n
    # 4(a0) = cond->k
    # 8(a0) = cond->returnSize

    addi sp, sp, -24
    sw ra, 0(sp)
    mv a3, a0           	# a3 = cond
    addi a0, a0, -2000      	# a0 = address of **res
    addi a1, a0, -2000    	# a1 = address of line[cond->k]   
    li a2, 0            	# a2 = lineSize
    li a4, 1            	# a4 = 1
    
    sw a0, 4(sp)
    sw a1, 8(sp)
    sw a2, 12(sp)
    sw a3, 16(sp)
    sw a4, 20(sp)
    
    call GetLineSize

    lw ra, 0(sp)
    lw a0, 4(sp)        # res
    lw a1, 8(sp)        # line
    lw a2, 12(sp)       # lineSize
    lw a3, 16(sp)       # cond
    lw a4, 20(sp)       # idx
    addi sp, sp, 24
    ret

printAns:

    # a0 = res
    # a1 = k (cond.k)
    # a2 = returnSize (cond.returnSize)
    mv s2, a1                           # return Size = 3
    mv s3, a2                           # k = 4
    addi sp, sp, -4
    sw ra, 0(sp)

    mv s1, a0                   # s1 = res
    la a0, lbracket
    call printf
    li s4, 0                    # int i = 0
    li s5, 0                    # int j = 0
printloop1:

    la a0, lbracket
    call printf

    li s4, 0                    # int j = 0

printloop2:
    lw a1, 0(s1)                # res[i]

    addi s1, s1, 0x4            # res[i][j] address offset
    la a0, iformat
    call printf

    addi s4, s4, 1              # i++
    bne s4, s2, printloop2      # j < returnColumnSizes

    la a0, backspace
    call printf

    la a0, rbracket
    call printf

    la a0, comma
    call printf

    addi s5, s5, 1              # i++
    bne s5, s3, printloop1      # i < returnSize

    la a0, backspace
    call printf

    la a0, rbracket
    call printf

    la a0, newline
    call printf

    lw ra, 0(sp)
    addi sp, sp, 4
    ret

There is a little difficult for me to wrtie recursive code in assembly, the difficult point is how stack pointer move, once the sp get wrong position, the return address(ra) will totally mess up and the program will not run properly.

Take input n = 4, k = 3 for example, The procedure about how Combinations(4,3) work as below.

The redline is actually the order of function execute, and in every backward arrow and vertical arrow, that is, the red arrow which is no overlapped with black arrow will pop the stack with specific number of bytes coresspond to k value, and the overlapping situation will push the stack.

sp
$↓$											$↑$
$↓$								$↓$	$↑$	$↓$	$↑$
$↓$				$↓$	$↑$	$↓$	$↑$	$↓$
$↓$	$↑$	$↓$	$↑$	$↓$

Result

sim

[[1,2,3],[1,2,4],[1,3,4],[2,3,4]]

Excuting 18350 instructions, 24112 cycles, 1.314 CPI
Program terminate
- ../rtl/../testbench/testbench.v:434: Verilog $finish

Simulation statistics
=====================
Simulation time  : 0.134 s
Simulation cycles: 24123
Simulation speed : 0.180022 MHz

rvsim

[[1,2,3],[1,2,4],[1,3,4],[2,3,4]]

Excuting 18350 instructions, 24112 cycles, 1.314 CPI
Program terminate

Simulation statistics
=====================
Simulation time  : 0.004 s
Simulation cycles: 24112
Simulation speed : 5.725 MHz

The answer is correct
- Executing 18343 instructions
- 24105 cycles
- 1.314 CPI

	sim	rvsim
Simulation time	0.134s	0.004s
Simulation cycles	24123	24112
Simulation speed	0.180022 MHz	5.725 MHz

Check waveform with GTKwave

Analysize srv32 RV32 core

Hint by Lab3: srv32, srv32 is a 3-stage pipeline architecture with IF/ID, EX, WB stages and supports full forwarding, means that RAW data hazard can be resolved without stalling the processor

Compare the disassembly and the waveform

00000000 <_start>:
       0:	00010297          	auipc	t0,0x10
       4:	18028293          	addi	t0,t0,384 # 10180 <trap_handler>
       8:	30529073          	.4byte	0x30529073
       c:	3050e073          	.4byte	0x3050e073
      10:	00022297          	auipc	t0,0x22
      14:	82c28293          	addi	t0,t0,-2004 # 2183c <_PathLocale>
      18:	00022317          	auipc	t1,0x22
      1c:	b3430313          	addi	t1,t1,-1228 # 21b4c <_bss_end>

Analysize waveform

First check the waveform, at initial imem fetch the instruction at 00000000 , which is corresponded to 00010297 stands for auipc t0, 0x10, and inst[31:0] not decode by instr_decoder yet (00000013 stands for addi x0, x0, 0, which is NOP)

Next, IF/ID in the same stage (same clock period), which can obtain rdata and inst with same value.

Check hazard

Since srv32 has fully bypassing, there is not stall generated by data hazards

Check branch

Take first branch occur for example, at 78 ps (the line in the figure), raddr and rdata equals to FE62ECE3 means bltu t0, t1, 20 at IF/ID stage, and one clock cycle after at the EX stage, branch will be taken. Also a clock cycle after wb_flush will trigger and flush 2 cycle, then reload the correct instruction after branching.

Disassembly

00000020 <_bss_clear>:
      20:	0002a023          	sw	zero,0(t0)
      24:	00428293          	addi	t0,t0,4
      28:	fe62ece3          	bltu	t0,t1,20 <_bss_clear>
      2c:	00040117          	auipc	sp,0x40
      30:	fd410113          	addi	sp,sp,-44 # 40000 <_stack>
      34:	008000ef          	jal	ra,3c <main>
      38:	4541006f          	j	1048c <exit>

Instruction	c1	c2	c3	c4	c5	c6	c7
`bltu t0, t1, 20 <_bss_clear>`	IF/ID	EX	WB
`auipc sp, 0x40`		~~IF/ID~~NOP	EXNOP	WBNOP
`addi sp, sp, -44`			~~IF/ID~~NOP	EXNOP	WBNOP
`sw zero, 0(t0)`				IF/ID	EX	WB
`addi t0, t0, 4`					IF/ID	EX	WB

From c1 to c5 is correspond to the figure above from 77 ps to 97 ps (5 clock cycle)

Problem 1: 鄒崴丞-Length of Last Word

Notice that at 4293 ps, 0xFEC296E3 was executed which is correspond to bne t0, a2, loop2. In first testcase, this branch is taken totally four times, the number of the loop execute is relate to the last word's length. Here can do loop unrolling to reduce branch panelty.

Problem 2: Leetcode 77. Combinations

At 4053ps, 120000EF was executed, which is call combine. Get into combine function and run the algorithm and cause branch panelty with 2 cycle panelty.

There is similar behavior (branch and flush) if branch is taken. So, the branch in my program can be do some optimization to reduce branch panelty.

Propose the software optimizations

Problem 1: 鄒崴丞-Length of Last Word

Original loop2 which is relate the length of last word.

loop2:
        beq         a1, x0, return    # if encounter the last position, return
        addi        a1, a1, -1
        addi        a3, a3, -1
        addi        a0, a0, 1         # s++
        lb          t0, 0(a3)         # t0 = s[i]
        bne         t0, a2, loop2

Loop unrolling 2 times.

loop2:
        beq         a1, x0, return    # if encounter the last position, return
        addi        a1, a1, -1
        addi        a3, a3, -1
        addi        a0, a0, 1         # s++
        lb          t0, 0(a3)         # t0 = s[i]
        beq         t0, a2, return  

        beq         a1, x0, return    # if encounter the last position, return
        addi        a1, a1, -1
        addi        a3, a3, -1
        addi        a0, a0, 1         # s++
        lb          t0, 0(a3)         # t0 = s[i]
        
        bne         t0, a2, loop2

Loop unrolling 4 times.

loop2:
        beq         a1, x0, return    # if encounter the last position, return
        addi        a1, a1, -1
        addi        a3, a3, -1
        addi        a0, a0, 1         # s++
        lb          t0, 0(a3)         # t0 = s[i]
        beq         t0, a2, return  

        beq         a1, x0, return    # if encounter the last position, return
        addi        a1, a1, -1
        addi        a3, a3, -1
        addi        a0, a0, 1         # s++
        lb          t0, 0(a3)         # t0 = s[i]
        
        beq         a1, x0, return    # if encounter the last position, return
        addi        a1, a1, -1
        addi        a3, a3, -1
        addi        a0, a0, 1         # s++
        lb          t0, 0(a3)         # t0 = s[i]

        beq         a1, x0, return    # if encounter the last position, return
        addi        a1, a1, -1
        addi        a3, a3, -1
        addi        a0, a0, 1         # s++
        lb          t0, 0(a3)         # t0 = s[i]

        bne         t0, a2, loop2

	Original	Unrolling 2	Unrolling 4
Instructions	12991	12980	12986
Cycles	17619	17596	17598
CPI	1.356	1.356	1.355

Notice that the performance is no grouth with the numbers of unrolling, that's because the in the testcase above, the last word length is not so long so the branch will no take so much.

If change the testcase to

str1:       .string "Hello Worldaaaaaaaaaaaaaaaa"
str2:       .string "I am a studentaaaaaaaaaaaaaaaaaaaaaa  "  
str3:       .string "aaaaaaaaaaaaaaaaaaa"

	Original	Unrolling 2	Unrolling 4
Instructions	15428	15382	15358
Cycles	20926	20830	20782
CPI	1.356	1.354	1.353

As the length of last word be longer, the improvement of loop unrolling is clear.

Problem 2: Leetcode 77. Combinations

To reduce the branch panelty, unrolling the printloop2 loop by 4 times

printloop2:
    beq s4, s2, loop2out
    lw a1, 0(s1)                # res[i]

    addi s1, s1, 0x4            # res[i][j] address offset
    la a0, iformat
    call printf

    addi s4, s4, 1              # i++

    beq s4, s2, loop2out
    lw a1, 0(s1)                # res[i]

    addi s1, s1, 0x4            # res[i][j] address offset
    la a0, iformat
    call printf

    addi s4, s4, 1              # i++

    beq s4, s2, loop2out
    lw a1, 0(s1)                # res[i]

    addi s1, s1, 0x4            # res[i][j] address offset
    la a0, iformat
    call printf

    addi s4, s4, 1              # i++

    beq s4, s2, loop2out
    lw a1, 0(s1)                # res[i]

    addi s1, s1, 0x4            # res[i][j] address offset
    la a0, iformat
    call printf

    addi s4, s4, 1              # i++   

    beq s4, s2, loop2out
    lw a1, 0(s1)                # res[i]

    addi s1, s1, 0x4            # res[i][j] address offset
    la a0, iformat
    call printf

    addi s4, s4, 1              # i++

    beq s4, s2, loop2out
    lw a1, 0(s1)                # res[i]

    addi s1, s1, 0x4            # res[i][j] address offset
    la a0, iformat
    call printf

    addi s4, s4, 1              # i++
    j printloop2

There is the same if I unroll 4, 8, 16 times. because in my testcase, loop2 handle the size of column (cond->k), which is 3, so if want to unroll much more, need to take another testcase.

Furthermore, make function printAns, combine, memcpy, to inline functon, such as:

.macro memcpy
    # a0 dest
    # a1 src
    # a2 n
    #slli t5, a2, 0x2
    li t3, 0        # i = 0

mcpyloop:
    lw t4, 0(a1)
    sw t4, 0(a0)

    addi a0, a0, 4
    addi a1, a1, 4
    addi t3, t3, 4
    bne t3, a2, mcpyloop
#    ret 
.endm

Also can reduce branch panelty

	Original	Unrolling printAns loop	Unrolling and inline function
Instructions	18350	18354	18342
Cycles	24112	24108	24072
CPI	1.314	1.314	1.312

How srv32 work with Verilator

If do make hello, it will go find the correspond foled under /sw and do a series operations.






riscv-none-elf-gcc -O3 -Wall -march=rv32im_zicsr -mabi=ilp32  -misa-spec=2.2 -march=rv32im -nostartfiles -nostdlib -L../common -o hello.elf hello.c -lc -lm -lgcc -lsys -T ../common/default.ld
riscv-none-elf-objcopy -j .text -O binary hello.elf imem.bin
riscv-none-elf-objcopy -j .data -O binary hello.elf dmem.bin
riscv-none-elf-objcopy -O binary hello.elf memory.bin
riscv-none-elf-objdump -d hello.elf > hello.dis
riscv-none-elf-readelf -a hello.elf > hello.symbol

line 1 do the compilation and generate hello.elf
line 2 copy .text section in hello.elf and generate imem.bin
line 3 copy .data section in hello.elf and generate dmem.bin
line 4 create an output file in binary
line 5 disassemble .text section (executable sections)
line 6 disassemble with -h -l -S -s -r -d -V -A -I

and next see /sim/Makefile


FILELIST    = -f filelist.txt $(if $(_top), ../rtl/top_s.v, ../rtl/top.v)

$(TARGET):
	$(TARGET_SIM) $(BFLAGS) -o $(TARGET) $(FILELIST)
	mv sim_cc/sim .

parse argument from filelist.txt

filelist.txt

+incdir+../rtl
+define+TRACE
//+define+RV32C_ENABLED=1
//+define+MEMSIZE=128
//+define+SINGLE_RAM
../rtl/riscv.v
../rtl/clint.v
../testbench/testbench.v
../testbench/memmodel.v

Do make under /sim will generate executable file sim and a bunch of file which will move to sim_cc

Then ./sim can run the executable file which is done before (generate imem and dmem).

If simply run make hello.run under /sim, it will copy imem.bin and dmem.bin under /sw/hello and run the program.

sim_main.cpp

    while (!Verilated::gotFinish()) {
        if (main_time > RESOLUTION) {
            top->resetb = 1;
        }
        if (main_time > 30) {
            top->stall = 0;
        }
        if ((main_time % RESOLUTION) == 1) {
            top->clk = 1;
        }
        if ((main_time % RESOLUTION) == (RESOLUTION / 2 + 1)) {
            top->clk = 0;
        }
        top->eval();
        main_time++;
    }

The while loop will run until verilator end, and can see three signal(resetb, stall, clk) in GTKWave

#ifdef HAVE_CHRONO
    {
          std::chrono::steady_clock::time_point time_end;
          float sec;
          int cycles = main_time / RESOLUTION;
          time_end = std::chrono::steady_clock::now();
          sec = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_begin).count() / 1000.0;
          float speed_mhz = cycles / sec / 1000000.0;
          std::cout << std::endl;
          std::cout << "Simulation statistics" << std::endl;
          std::cout << "=====================" << std::endl;
          std::cout << "Simulation time  : " << sec << " s" << std::endl;
          std::cout << "Simulation cycles: " << cycles << std::endl;
          std::cout << "Simulation speed : " << speed_mhz << " MHz" << std::endl << std::endl;
    }
    #endif

And use chrono to caculate time.

2022 Assignment3: SoftCPU

tags: Computer Architecture

Source code

C code(Modified from 鄒崴丞)

Handwrite Assembly from 鄒崴丞-Length of Last Word

LeetCode 77. Combinations

Tweaked programs for srv32

Modify Makefile

Problem 1: 鄒崴丞-Length of Last Word

Problem 2: Leetcode 77. Combinations

Check waveform with GTKwave

Analysize srv32 RV32 core

Analysize waveform

Check hazard

Check branch

Problem 1: 鄒崴丞-Length of Last Word

Problem 2: Leetcode 77. Combinations

Propose the software optimizations

Problem 1: 鄒崴丞-Length of Last Word

Problem 2: Leetcode 77. Combinations

How srv32 work with Verilator

Reference Link

Read more

面試紀錄 & 練習 (聯發科)

2023q1 Homework1 (lab0)

將現有的 Raspberry PI 系統備份

Run FreeRTOS on VexRiscv and Measure/Tweak context switch overhead

tags: `Computer Architecture`

Modify `Makefile`