Try   HackMD

2022 Assignment3: SoftCPU

tags: Computer Architecture

contribute by <chiangkd>

Requirement follow Assignment3: SoftCPU

Source code

I choose the program from 鄒崴丞-Length of Last Word and Leetcode 77. Combinations

C code(Modified from 鄒崴丞)

#include <stdio.h>

int lengthOfLastWord(char *s){
    if (!*s)
        return 0;
    int i = 0;
    char *head = s;
    while (*s) {
        s++;
        i++;
    }
    s = head;
    int length = 0;
    s = s + i - 1;
    
    while(i && *s == ' '){
        s--;
        i--;
    }

    while(i-- && *s != ' '){
        s--;
        length++;
    }

    return length;
}

int main()
{
    char *str1 = "Hello World";
    char *str2 = "I am a student  ";
    char *str3 = "a";
    int len1 = lengthOfLastWord(str1);
    int len2 = lengthOfLastWord(str2);
    int len3 = lengthOfLastWord(str3);
    printf("For string \"%s\", length of last word : %d\n", str1, len1);
    printf("For string \"%s\", length of last word : %d\n", str2, len2);
    printf("For string \"%s\", length of last word : %d\n", str3, len3);

    return 0;
}

Handwrite Assembly from 鄒崴丞-Length of Last Word

.org 0
# Provide program starting address to linker
.global _start

/* newlib system calls */
.set SYSEXIT,  93
.set SYSWRITE, 64

.data


str1:       .string "Hello World"
            .set str1len, .-str1
str2:       .string "I am a student  "
            .set str2len, .-str2
str3:       .string "a"
            .set str3len, .-str3
fstr:       .string "For string \""
            .set flen, .-fstr
answrstr:   .string "\", length of last word : "
            .set answrlen, .-answrstr  
newline:    .string "\n"
            .set nllen, .-newline

.text
_start:
        # function : lengthOfLastWord(char * s)
        # a0 : pointer to the start of the string
        la          a0, str1
        # find the position of the last letter is not space
        jal         ra, lengthoflastword        # call strlen()
        add         a2, x0, a0
        la          a0, str1
        li          a1, str1len
        jal         print_result

        la          a0, str2
        # find the position of the last letter is not space
        jal         ra, lengthoflastword        # call strlen()
        add         a2, x0, a0
        la          a0, str2
        li          a1, str2len
        jal         print_result

        la          a0, str3
        # find the position of the last letter is not space
        jal         ra, lengthoflastword        # call strlen()
        add         a2, x0, a0
        la          a0, str3
        li          a1, str3len
        jal         print_result

        li          a0, 0             # return value is 0
        li          a7, SYSEXIT       # end
        ecall

nochar:
        li          a0, 0
        ret
lengthoflastword:
        lb          t0, 0(a0)
        beq         t0, x0, nochar
        
        andi        a2, a0, 3
        add         a1, x0, a0        # a1 = a0
        bne         a2, x0, unaligned # jump if the address is unaligned
starttraverse:
        lui         a3, 0x7F7F8
        addi        a3, a3, -129      # now a3 is 0x7F7F7F7F
        addi        a4, x0, -1        # a4 = -1
findend:
        lw          t0, 0(a1)
        lw          t1, 4(a1)
        lw          t2, 8(a1)
        lw          t3, 12(a1)
        and         a2, t0, t1
        and         t4, t2, t3
        and         a2, a2, t4
        addi        a1, a1, 16
        and         a5, a2, a3
        add         a5, a5, a3
        or          a5, a5, a3
        beq         a5, a4, findend   # break if there are NULL

        and         a5, t0, a3
        add         a5, a5, a3
        or          a5, a5, a3
        bne         a5, a4, back12

        and         a5, t1, a3
        add         a5, a5, a3
        or          a5, a5, a3
        bne         a5, a4, back8

        and         a5, t2, a3
        add         a5, a5, a3
        or          a5, a5, a3
        bne         a5, a4, back4

findbyte:
        lbu         a2, -4(a1)
        sub         a3, a1, a0
        beq         a2, x0, sub4
        lbu         a2, -3(a1)
        beq         a2, x0, sub3
        lbu         a2, -2(a1)
        snez        a2, a2
        add         a2, a2, a3
        addi        a1, a2, -2
        j           findword
back4:
        addi        a1, a1, -4
        j           findbyte
back8:
        addi        a1, a1, -8
        j           findbyte
back12:
        addi        a1, a1, -12
        j           findbyte

checkalign:
        beq         a3, x0, starttraverse
unaligned:
        lbu         a2, 0(a1)
        addi        a1, a1, 1
        andi        a3, a1, 3
        bne         a2, x0, checkalign
        sub         a1, a1, a0
        addi        a1, a1, -1
        j           findword
sub4:
        addi        a1, a3, -4
        j           findword
sub3:
        addi        a1, a3, -3
        j           findword
findword:
        li          a2, 32            # store space to register a2
        add         a0, a1, a0
        addi        a1, a1, 1
loop1:
        # now we reach the end of the string
        beq         a1, x0, return    # if encounter the last position, return
        addi        a0, a0, -1        # s--
        addi        a1, a1, -1        # the last position is not space
        lb          t0, 0(a0)         # t0 = s[i]
        beq         t0, a2, loop1     # while(char[i] == " ")
        add         a3, x0, a0        # a3 = a0
        add         a0, x0, x0
loop2:
        beq         a1, x0, return    # if encounter the last position, return
        addi        a1, a1, -1
        addi        a3, a3, -1
        addi        a0, a0, 1         # s++
        lb          t0, 0(a3)         # t0 = s[i]
        bne         t0, a2, loop2     
return:
        ret

print_result:
        add         s0, x0, a0
        add         s1, x0, a1
        add         s2, x0, a2
        li          a7, SYSWRITE
        li          a0, 1             # print For string : "
        la          a1, fstr
        la          a2, flen
        ecall

        li          a7, SYSWRITE
        li          a0, 1             # print i am a student  
        add         a1, x0, s0
        add         a2, x0, s1
        ecall

        li          a7, SYSWRITE
        li          a0, 1             # print ", length of last word is   
        la          a1, answrstr
        li          a2, answrlen
        ecall

        li          a7, SYSWRITE
        addi        a1, s2, 48        # get length
        addi        sp, sp, -4
        sw          a1, 0(sp)         # store answer in stack
        li          a0, 1             # print answer
        addi        a1, sp, 0
        li          a2, 4
        ecall
        addi        sp, sp, 4

        li          a7, SYSWRITE
        li          a0, 1             # print new line
        la          a1, newline       
        li          a2, nllen
        ecall
        ret
  • There are serval version in 鄒崴丞-Length of Last Word, I take the version with SWAR(4 register at one time), It seems has the best performance (CSR cycle) with longer string.

LeetCode 77. Combinations

Given two integers n and k, return all possible combinations of k numbers chosen from the range [1, n].

Constraints:

  • 1 <= n <= 20
  • 1 <= k <= n

Leetcode Solution

#define MAXLEN 184756
void GetLineSize(int **res, int *line, int lineSize,int n, int k, int *resLen, int idx)
{
    if (lineSize == k) {
        res[*resLen] = (int *)malloc(sizeof(int) * k);
        memcpy(res[*resLen], line, sizeof(int) * k);
        (*resLen)++;
        return;
    }

    for (int i = idx; i <= n; i++) {
        line[lineSize] = i;
        GetLineSize(res, line, lineSize + 1, n, k, resLen, i + 1);
    }
}
int** combine(int n, int k, int* returnSize, int** returnColumnSizes){

    int **res = malloc(sizeof(int *)* MAXLEN);
    int line[k];
    int lineSize = 0;
    int resLen = 0;
    GetLineSize(res, line, lineSize, n, k, &resLen, 1);
    *returnSize = resLen;
    *returnColumnSizes = malloc(sizeof(int)*(*returnSize));
    int i = *returnSize-1;
    while(i >= 0){
    	(*returnColumnSizes)[i--] = k;
    }
    return res;
}

From description, we know that the MAXLEN about the answer will be

C1020=184756, so that's why I give it a constant value and after caculation, just realloc it.

Improvement
In LeetCode, there are many problems take the argument int *returnSize and int ** returnColumnSizes, the first one is about output array's length, and the second one is about the each column size in the output array.

But, in my problem, the column size will always equal to k, which is an argument about combine function, and the returnSize will equal to

Ckn as mention above.

So, I remove returnSize and returnColumnSizes to simplfy the question. And also, use structure to reduce argument number

struct set {
    int n;
    int k;
    int returnSize;
};
#define MAXLEN 184756

void GetLineSize(int **res, int *line, int lineSize, struct set *cond, int idx)
{
    if (lineSize == cond->k) {
        res[cond->returnSize] = (int *)malloc(sizeof(int) * cond->k);
        memcpy(res[cond->returnSize], line, sizeof(int) * cond->k);
        (cond->returnSize)++;
        return;
    }
    for (int i = idx; i <= cond->n; i++) {
        line[lineSize] = i;
        GetLineSize(res, line, lineSize + 1, cond, i + 1);
    }
}
int** combine(struct set *cond){
    int **res = (int **)malloc(sizeof(int *)* MAXLEN);
    int line[cond->k];
    int lineSize = 0;
    GetLineSize(res, line, lineSize, cond, 1);
    res = realloc(res, sizeof(int*) * cond->returnSize);
    return res;
}

However, when I do make with this program, get an error message

DMEM address 002c5d44 out of range

which is defined in testbench.v, means out of memory range (DRAM + IRAM), so I shrink the MAXLENGTH to make it run properly.

#define MAXLEN 20000

Tweaked programs for srv32

  • You should validate the results in your program(s).
  • You should program in RISC-V assembly for the sake of further optimizations. Hint: You may modify a customized Makefile for building C and assembly source files from scratch.

Modify Makefile

Simply copy Makefile from existing hello example and just modifiy the target and associated dependency name

include ../common/Makefile.common

EXE      = .elf
SRC      = lolw.S
CFLAGS  += -L../common
LDFLAGS += -T ../common/default.ld
TARGET   = lolws
OUTPUT   = $(TARGET)$(EXE)

.PHONY: all clean

all: $(TARGET)

$(TARGET): $(SRC)
	$(CC) $(CFLAGS) -o $(OUTPUT) $(SRC) $(LDFLAGS)
	$(OBJCOPY) -j .text -O binary $(OUTPUT) imem.bin
	$(OBJCOPY) -j .data -O binary $(OUTPUT) dmem.bin
	$(OBJCOPY) -O binary $(OUTPUT) memory.bin
	$(OBJDUMP) -d $(OUTPUT) > $(TARGET).dis
	$(READELF) -a $(OUTPUT) > $(TARGET).symbol

clean:
	$(RM) *.o $(OUTPUT) $(TARGET).dis $(TARGET).symbol [id]mem.bin memory.bin

Problem 1: 鄒崴丞-Length of Last Word

And do some modification to let the assembly run on srv32

  • printf system call
  • stored ra
.data

str1:       .string "Hello World"
str2:       .string "I am a student  "  
str3:       .string "a"
space:      .string " "
fstr:       .string "For string \""
answrstr:   .string "\", length of last word : "
newline:    .string "\n"
iformat:    .string "%d"

.text

.global main
main:
        # function : lengthOfLastWord(char * s)
        # a0 : pointer to the start of the string
        addi sp, sp, -4
        sw ra, 0(sp)

        la          a0, str1
        # find the position of the last letter is not space
        jal         ra, lengthoflastword        # call strlen()
        add         a2, x0, a0
        la          a0, str1
        jal         print_result

        la          a0, str2
        # find the position of the last letter is not space
        jal         ra, lengthoflastword        # call strlen()
        add         a2, x0, a0
        la          a0, str2
        jal         print_result

        la          a0, str3
        # find the position of the last letter is not space
        jal         ra, lengthoflastword        # call strlen()
        add         a2, x0, a0
        la          a0, str3
        jal         print_result

        lw ra, 0(sp)
        addi sp, sp, 4
        li          a0, 0             # return value is 0
        ret

lengthoflastword:
        lb          t0, 0(a0)
        beq         t0, x0, return
        
        andi        a2, a0, 3
        add         a1, x0, a0        # a1 = a0
        bne         a2, x0, unaligned # jump if the address is unaligned
starttraverse:
        lui         a3, 0x7F7F8
        addi        a3, a3, -129      # now a3 is 0x7F7F7F7F
        addi        a4, x0, -1        # a4 = -1
findend:
        lw          t0, 0(a1)
        lw          t1, 4(a1)
        lw          t2, 8(a1)
        lw          t3, 12(a1)
        and         a2, t0, t1
        and         t4, t2, t3
        and         a2, a2, t4
        addi        a1, a1, 16
        and         a5, a2, a3
        add         a5, a5, a3
        or          a5, a5, a3
        beq         a5, a4, findend   # break if there are NULL

        and         a5, t0, a3
        add         a5, a5, a3
        or          a5, a5, a3
        bne         a5, a4, back12

        and         a5, t1, a3
        add         a5, a5, a3
        or          a5, a5, a3
        bne         a5, a4, back8

        and         a5, t2, a3
        add         a5, a5, a3
        or          a5, a5, a3
        bne         a5, a4, back4

findbyte:
        lbu         a2, -4(a1)
        sub         a3, a1, a0
        beq         a2, x0, sub4
        lbu         a2, -3(a1)
        beq         a2, x0, sub3
        lbu         a2, -2(a1)
        snez        a2, a2
        add         a2, a2, a3
        addi        a1, a2, -2
        j           findword
back4:
        addi        a1, a1, -4
        j           findbyte
back8:
        addi        a1, a1, -8
        j           findbyte
back12:
        addi        a1, a1, -12
        j           findbyte

checkalign:
        beq         a3, x0, starttraverse
unaligned:
        lbu         a2, 0(a1)
        addi        a1, a1, 1
        andi        a3, a1, 3
        bne         a2, x0, checkalign
        sub         a1, a1, a0
        addi        a1, a1, -1
        j           findword
sub4:
        addi        a1, a3, -4
        j           findword
sub3:
        addi        a1, a3, -3
        j           findword
findword:
        li          a2, 32            # store space to register a2
        add         a0, a1, a0
        addi        a1, a1, 1
loop1:
        # now we reach the end of the string
        beq         a1, x0, return    # if encounter the last position, return
        addi        a0, a0, -1        # s--
        addi        a1, a1, -1        # the last position is not space
        lb          t0, 0(a0)         # t0 = s[i]
        beq         t0, a2, loop1     # while(char[i] == " ")
        add         a3, x0, a0        # a3 = a0
        add         a0, x0, x0
loop2:
        beq         a1, x0, return    # if encounter the last position, return
        addi        a1, a1, -1
        addi        a3, a3, -1
        addi        a0, a0, 1         # s++
        lb          t0, 0(a3)         # t0 = s[i]
        bne         t0, a2, loop2     
return:

        ret

print_result:
        addi        sp, sp, -4
        sw          ra, 0(sp)

        add         s0, x0, a0
        add         s1, x0, a1
        add         s2, x0, a2

        la          a0, fstr          # print For string :
        call        printf

        add         a0, x0, s0        # print i am a student
        call        printf
          
        la          a0, answrstr      # print ", length of last word is   
        call        printf

        addi        a1, s2, 0         # get length
        addi        sp, sp, -4
        sw          a1, 0(sp)         # store answer in stack
        
        la          a0, iformat
        lw          a1, 0(sp)         # print answer
        call        printf
        addi        sp, sp, 4
    
        la          a0, newline       # print new line
        call        printf

        lw          ra, 0(sp)
        addi        sp, sp, 4
        ret

Result

sim

For string "Hello World", length of last word : 5
For string "I am a student  ", length of last word : 7
For string "a", length of last word : 1

Excuting 12991 instructions, 17619 cycles, 1.356 CPI
Program terminate
- ../rtl/../testbench/testbench.v:434: Verilog $finish

Simulation statistics
=====================
Simulation time  : 0.079 s
Simulation cycles: 17630
Simulation speed : 0.223165 MHz

rvsim

For string "Hello World", length of last word : 5
For string "I am a student  ", length of last word : 7
For string "a", length of last word : 1

Excuting 12991 instructions, 17619 cycles, 1.356 CPI
Program terminate

Simulation statistics
=====================
Simulation time  : 0.003 s
Simulation cycles: 17619
Simulation speed : 5.523 MHz
  • The answer is correct
    • Executing 12991 instructions
    • 17619 cycles
    • 1.356 CPI
sim rvsim
Simulation time 0.075s 0.03s
Simulation cycles 17252 17241
Simulation speed 0.230027 MHz 5.760 MHz

Problem 2: Leetcode 77. Combinations

From the C code mentioned above, write the assembly which can properly run on srv32

Hand-write Assembly

.data
    n: .word 20

    lbracket: .string "["
    rbracket: .string "]"
    comma: .string ","
    backspace: .string "\b"
    newline: .string "\n"
    iformat: .string "%d,"
    xformat: .string "%x\n"

.text

.global main

main:
    li t1, 0
    addi sp, sp, -4
    sw ra, 0(sp)

    # push sp
    lw t0, n
    slli t0, t0, 2
    sub sp, sp, t0

    mv a0, sp           	# a0 = &cond
    li t0, 4            	# cond.n = 4
    sw t0, 0(a0)
    li t0, 3            	# cond.k = 3
    sw t0, 4(a0)
    sw x0, 8(a0)		# cond.returnSize = 0

    call combine
    # a0 = res

    lw a1, 4(a3)    		# a1 = cond.k
    lw a2, 8(a3)    		# a2 = cond.returnSize
    
    call printAns

    # pop sp
    lw t0, n
    slli t0, t0, 2
    add sp, sp, t0

    # restore ra
    lw ra, 0(sp)
    addi sp, sp, 4

    # exit
    li a0, 0
    ret

memcpy:
    # a0 dest
    # a1 src
    # a2 n
    #slli t5, a2, 0x2
    li t3, 0        # i = 0

mcpyloop:
    lw t4, 0(a1)
    sw t4, 0(a0)

    addi a0, a0, 4
    addi a1, a1, 4
    addi t3, t3, 4
    bne t3, a2, mcpyloop
    ret 

GetLineSize:
    # a0 = res
    # a1 = line
    # a2 = lineSize
    # a3 = cond
    # a4 = idx
    addi sp, sp, -12
    sw ra, 0(sp)
    sw a2, 4(sp)    			# lineSize
    sw a4, 8(sp)    			# idx
    lw s1, 4(a3)    			# s1 = cond->k

    bne a2, s1, GLSrecursive    	# if (lineSize != cond->k)
    
PutLineIn:
    # a0 = res
    # a1 = line
    # a2 = lineSize
    # a3 = cond
    # a4 = idx
    addi sp, sp, -24
    sw a0, 0(sp)
    sw a1, 4(sp)
    sw a2, 8(sp)
    sw a3, 12(sp)
    sw a4, 16(sp)

    lw t2, 20(sp)

    add t1, t2, t1
    add a0, a0, t1      # a0 = res[cond->returnSize]
    lw t2, 4(a3)        # t2 = cond->k

    slli a2, t2, 0x2    # a2 = sizoef(int) * cond->k
    sw a2, 20(sp)       # store offset            # Here can do some improvement

    call memcpy
    lw a0, 0(sp)        # res
    lw a1, 4(sp)        # line
    lw a2, 8(sp)        # lineSize
    lw a3, 12(sp)       # cond
    lw a4, 16(sp)       # idx
    
    addi sp, sp, 24
    addi t5, t5, 1      # cond->returnSize++
    sw t5, 8(a3)        # store returnSize back
    lw ra, 0(sp)
    addi sp, sp, 12
    ret
    
GLSrecursive:
    mv s2, a2                   # s1 = lineSize
    mv s3, a4                   # int i = idx
    lw t0, 0(a3)                # t0 = cond->n
    
    blt t0, s3, Normalreturn
    #addi t0, t0, 1
GLSloop:
    
    slli t3, s2, 0x2            # t3 = line[lineSize] offset
    add t4, a1, t3              # t4 = line[lineSize] address
    sw s3, 0(t4)                # line[lineSize] = i
    
    # a2, a3 must put in function with plus 1
    addi a2, s2, 1              # lineSize + 1
    addi a4, s3, 1              # i + 1
    
    call GetLineSize
    addi s3, s3, 1              # i++
    sw s3, 8(sp)
    bge t0, s3, GLSloop
    
Normalreturn:
    lw ra, 0(sp)
    addi sp, sp, 12
    lw a2, 4(sp)
    lw a4, 8(sp)

    mv s2, a2
    mv s3, a4

    ret

combine:
    # a0 = &cond
    # 0(a0) = cond->n
    # 4(a0) = cond->k
    # 8(a0) = cond->returnSize

    addi sp, sp, -24
    sw ra, 0(sp)
    mv a3, a0           	# a3 = cond
    addi a0, a0, -2000      	# a0 = address of **res
    addi a1, a0, -2000    	# a1 = address of line[cond->k]   
    li a2, 0            	# a2 = lineSize
    li a4, 1            	# a4 = 1
    
    sw a0, 4(sp)
    sw a1, 8(sp)
    sw a2, 12(sp)
    sw a3, 16(sp)
    sw a4, 20(sp)
    
    call GetLineSize

    lw ra, 0(sp)
    lw a0, 4(sp)        # res
    lw a1, 8(sp)        # line
    lw a2, 12(sp)       # lineSize
    lw a3, 16(sp)       # cond
    lw a4, 20(sp)       # idx
    addi sp, sp, 24
    ret

printAns:

    # a0 = res
    # a1 = k (cond.k)
    # a2 = returnSize (cond.returnSize)
    mv s2, a1                           # return Size = 3
    mv s3, a2                           # k = 4
    addi sp, sp, -4
    sw ra, 0(sp)

    mv s1, a0                   # s1 = res
    la a0, lbracket
    call printf
    li s4, 0                    # int i = 0
    li s5, 0                    # int j = 0
printloop1:

    la a0, lbracket
    call printf

    li s4, 0                    # int j = 0

printloop2:
    lw a1, 0(s1)                # res[i]

    addi s1, s1, 0x4            # res[i][j] address offset
    la a0, iformat
    call printf

    addi s4, s4, 1              # i++
    bne s4, s2, printloop2      # j < returnColumnSizes

    la a0, backspace
    call printf

    la a0, rbracket
    call printf

    la a0, comma
    call printf

    addi s5, s5, 1              # i++
    bne s5, s3, printloop1      # i < returnSize

    la a0, backspace
    call printf

    la a0, rbracket
    call printf

    la a0, newline
    call printf

    lw ra, 0(sp)
    addi sp, sp, 4
    ret

  • There is a little difficult for me to wrtie recursive code in assembly, the difficult point is how stack pointer move, once the sp get wrong position, the return address(ra) will totally mess up and the program will not run properly.

Take input n = 4, k = 3 for example, The procedure about how Combinations(4,3) work as below.







demo



1

GLS(0,1)



2

GLS(1,2)



1->2





1->2





9

GLS(1,3)



1->9





13

GLS(1,4)



1->13





15

GLS(1,5)



1->15





3

GLS(2,3)



2->3





2->3





6

GLS(2,4)



2->6





8

GLS(2,5)



2->8





10

GLS(2,4)



9->10





9->10





12

GLS(2,5)



9->12





14

GLS(2,5)



13->14





13->14





END

END



15->END





4

GLS(3,4)



3->4





3->4





5

GLS(3,5)



3->5





7

GLS(3,5)



6->7





6->7





8->9





11

GLS(3,5)



10->11





10->11





12->13





14->15





4->5





5->6





7->8





11->12





The redline is actually the order of function execute, and in every backward arrow and vertical arrow, that is, the red arrow which is no overlapped with black arrow will pop the stack with specific number of bytes coresspond to k value, and the overlapping situation will push the stack.

Function Entry sp Function Out

Result

sim

[[1,2,3],[1,2,4],[1,3,4],[2,3,4]]

Excuting 18350 instructions, 24112 cycles, 1.314 CPI
Program terminate
- ../rtl/../testbench/testbench.v:434: Verilog $finish

Simulation statistics
=====================
Simulation time  : 0.134 s
Simulation cycles: 24123
Simulation speed : 0.180022 MHz

rvsim

[[1,2,3],[1,2,4],[1,3,4],[2,3,4]]

Excuting 18350 instructions, 24112 cycles, 1.314 CPI
Program terminate

Simulation statistics
=====================
Simulation time  : 0.004 s
Simulation cycles: 24112
Simulation speed : 5.725 MHz
  • The answer is correct
    • Executing 18343 instructions
    • 24105 cycles
    • 1.314 CPI
sim rvsim
Simulation time 0.134s 0.004s
Simulation cycles 24123 24112
Simulation speed 0.180022 MHz 5.725 MHz

Check waveform with GTKwave

Analysize srv32 RV32 core

Hint by Lab3: srv32, srv32 is a 3-stage pipeline architecture with IF/ID, EX, WB stages and supports full forwarding, means that RAW data hazard can be resolved without stalling the processor

Compare the disassembly and the waveform

00000000 <_start>:
       0:	00010297          	auipc	t0,0x10
       4:	18028293          	addi	t0,t0,384 # 10180 <trap_handler>
       8:	30529073          	.4byte	0x30529073
       c:	3050e073          	.4byte	0x3050e073
      10:	00022297          	auipc	t0,0x22
      14:	82c28293          	addi	t0,t0,-2004 # 2183c <_PathLocale>
      18:	00022317          	auipc	t1,0x22
      1c:	b3430313          	addi	t1,t1,-1228 # 21b4c <_bss_end>

Analysize waveform


First check the waveform, at initial imem fetch the instruction at 00000000 , which is corresponded to 00010297 stands for auipc t0, 0x10, and inst[31:0] not decode by instr_decoder yet (00000013 stands for addi x0, x0, 0, which is NOP)


Next, IF/ID in the same stage (same clock period), which can obtain rdata and inst with same value.

Check hazard


Since srv32 has fully bypassing, there is not stall generated by data hazards

Check branch


Take first branch occur for example, at 78 ps (the line in the figure), raddr and rdata equals to FE62ECE3 means bltu t0, t1, 20 at IF/ID stage, and one clock cycle after at the EX stage, branch will be taken. Also a clock cycle after wb_flush will trigger and flush 2 cycle, then reload the correct instruction after branching.

Disassembly

00000020 <_bss_clear>:
      20:	0002a023          	sw	zero,0(t0)
      24:	00428293          	addi	t0,t0,4
      28:	fe62ece3          	bltu	t0,t1,20 <_bss_clear>
      2c:	00040117          	auipc	sp,0x40
      30:	fd410113          	addi	sp,sp,-44 # 40000 <_stack>
      34:	008000ef          	jal	ra,3c <main>
      38:	4541006f          	j	1048c <exit>
Instruction c1 c2 c3 c4 c5 c6 c7
bltu t0, t1, 20 <_bss_clear> IF/ID EX WB
auipc sp, 0x40 IF/IDNOP EXNOP WBNOP
addi sp, sp, -44 IF/IDNOP EXNOP WBNOP
sw zero, 0(t0) IF/ID EX WB
addi t0, t0, 4 IF/ID EX WB

  • From c1 to c5 is correspond to the figure above from 77 ps to 97 ps (5 clock cycle)

Problem 1: 鄒崴丞-Length of Last Word

Notice that at 4293 ps, 0xFEC296E3 was executed which is correspond to bne t0, a2, loop2. In first testcase, this branch is taken totally four times, the number of the loop execute is relate to the last word's length. Here can do loop unrolling to reduce branch panelty.

Problem 2: Leetcode 77. Combinations

At 4053ps, 120000EF was executed, which is call combine. Get into combine function and run the algorithm and cause branch panelty with 2 cycle panelty.

There is similar behavior (branch and flush) if branch is taken. So, the branch in my program can be do some optimization to reduce branch panelty.

Propose the software optimizations

Problem 1: 鄒崴丞-Length of Last Word

Original loop2 which is relate the length of last word.

loop2:
        beq         a1, x0, return    # if encounter the last position, return
        addi        a1, a1, -1
        addi        a3, a3, -1
        addi        a0, a0, 1         # s++
        lb          t0, 0(a3)         # t0 = s[i]
        bne         t0, a2, loop2     

Loop unrolling 2 times.

loop2:
        beq         a1, x0, return    # if encounter the last position, return
        addi        a1, a1, -1
        addi        a3, a3, -1
        addi        a0, a0, 1         # s++
        lb          t0, 0(a3)         # t0 = s[i]
        beq         t0, a2, return  

        beq         a1, x0, return    # if encounter the last position, return
        addi        a1, a1, -1
        addi        a3, a3, -1
        addi        a0, a0, 1         # s++
        lb          t0, 0(a3)         # t0 = s[i]
        
        bne         t0, a2, loop2     

Loop unrolling 4 times.

loop2:
        beq         a1, x0, return    # if encounter the last position, return
        addi        a1, a1, -1
        addi        a3, a3, -1
        addi        a0, a0, 1         # s++
        lb          t0, 0(a3)         # t0 = s[i]
        beq         t0, a2, return  

        beq         a1, x0, return    # if encounter the last position, return
        addi        a1, a1, -1
        addi        a3, a3, -1
        addi        a0, a0, 1         # s++
        lb          t0, 0(a3)         # t0 = s[i]
        
        beq         a1, x0, return    # if encounter the last position, return
        addi        a1, a1, -1
        addi        a3, a3, -1
        addi        a0, a0, 1         # s++
        lb          t0, 0(a3)         # t0 = s[i]

        beq         a1, x0, return    # if encounter the last position, return
        addi        a1, a1, -1
        addi        a3, a3, -1
        addi        a0, a0, 1         # s++
        lb          t0, 0(a3)         # t0 = s[i]

        bne         t0, a2, loop2     
Original Unrolling 2 Unrolling 4
Instructions 12991 12980 12986
Cycles 17619 17596 17598
CPI 1.356 1.356 1.355
  • Notice that the performance is no grouth with the numbers of unrolling, that's because the in the testcase above, the last word length is not so long so the branch will no take so much.

If change the testcase to

str1:       .string "Hello Worldaaaaaaaaaaaaaaaa"
str2:       .string "I am a studentaaaaaaaaaaaaaaaaaaaaaa  "  
str3:       .string "aaaaaaaaaaaaaaaaaaa"
Original Unrolling 2 Unrolling 4
Instructions 15428 15382 15358
Cycles 20926 20830 20782
CPI 1.356 1.354 1.353
  • As the length of last word be longer, the improvement of loop unrolling is clear.

Problem 2: Leetcode 77. Combinations

To reduce the branch panelty, unrolling the printloop2 loop by 4 times

printloop2:
    beq s4, s2, loop2out
    lw a1, 0(s1)                # res[i]

    addi s1, s1, 0x4            # res[i][j] address offset
    la a0, iformat
    call printf

    addi s4, s4, 1              # i++

    beq s4, s2, loop2out
    lw a1, 0(s1)                # res[i]

    addi s1, s1, 0x4            # res[i][j] address offset
    la a0, iformat
    call printf

    addi s4, s4, 1              # i++

    beq s4, s2, loop2out
    lw a1, 0(s1)                # res[i]

    addi s1, s1, 0x4            # res[i][j] address offset
    la a0, iformat
    call printf

    addi s4, s4, 1              # i++

    beq s4, s2, loop2out
    lw a1, 0(s1)                # res[i]

    addi s1, s1, 0x4            # res[i][j] address offset
    la a0, iformat
    call printf

    addi s4, s4, 1              # i++   

    beq s4, s2, loop2out
    lw a1, 0(s1)                # res[i]

    addi s1, s1, 0x4            # res[i][j] address offset
    la a0, iformat
    call printf

    addi s4, s4, 1              # i++

    beq s4, s2, loop2out
    lw a1, 0(s1)                # res[i]

    addi s1, s1, 0x4            # res[i][j] address offset
    la a0, iformat
    call printf

    addi s4, s4, 1              # i++
    j printloop2
  • There is the same if I unroll 4, 8, 16 times. because in my testcase, loop2 handle the size of column (cond->k), which is 3, so if want to unroll much more, need to take another testcase.

Furthermore, make function printAns, combine, memcpy, to inline functon, such as:

.macro memcpy
    # a0 dest
    # a1 src
    # a2 n
    #slli t5, a2, 0x2
    li t3, 0        # i = 0

mcpyloop:
    lw t4, 0(a1)
    sw t4, 0(a0)

    addi a0, a0, 4
    addi a1, a1, 4
    addi t3, t3, 4
    bne t3, a2, mcpyloop
#    ret 
.endm
  • Also can reduce branch panelty
Original Unrolling printAns loop Unrolling and inline function
Instructions 18350 18354 18342
Cycles 24112 24108 24072
CPI 1.314 1.314 1.312

How srv32 work with Verilator

If do make hello, it will go find the correspond foled under /sw and do a series operations.

riscv-none-elf-gcc -O3 -Wall -march=rv32im_zicsr -mabi=ilp32 -misa-spec=2.2 -march=rv32im -nostartfiles -nostdlib -L../common -o hello.elf hello.c -lc -lm -lgcc -lsys -T ../common/default.ld riscv-none-elf-objcopy -j .text -O binary hello.elf imem.bin riscv-none-elf-objcopy -j .data -O binary hello.elf dmem.bin riscv-none-elf-objcopy -O binary hello.elf memory.bin riscv-none-elf-objdump -d hello.elf > hello.dis riscv-none-elf-readelf -a hello.elf > hello.symbol
  • line 1 do the compilation and generate hello.elf
  • line 2 copy .text section in hello.elf and generate imem.bin
  • line 3 copy .data section in hello.elf and generate dmem.bin
  • line 4 create an output file in binary
  • line 5 disassemble .text section (executable sections)
  • line 6 disassemble with -h -l -S -s -r -d -V -A -I

and next see /sim/Makefile


FILELIST    = -f filelist.txt $(if $(_top), ../rtl/top_s.v, ../rtl/top.v)

$(TARGET):
	$(TARGET_SIM) $(BFLAGS) -o $(TARGET) $(FILELIST)
	mv sim_cc/sim .
  • parse argument from filelist.txt

filelist.txt

+incdir+../rtl
+define+TRACE
//+define+RV32C_ENABLED=1
//+define+MEMSIZE=128
//+define+SINGLE_RAM
../rtl/riscv.v
../rtl/clint.v
../testbench/testbench.v
../testbench/memmodel.v

Do make under /sim will generate executable file sim and a bunch of file which will move to sim_cc

Then ./sim can run the executable file which is done before (generate imem and dmem).

If simply run make hello.run under /sim, it will copy imem.bin and dmem.bin under /sw/hello and run the program.

sim_main.cpp

    while (!Verilated::gotFinish()) {
        if (main_time > RESOLUTION) {
            top->resetb = 1;
        }
        if (main_time > 30) {
            top->stall = 0;
        }
        if ((main_time % RESOLUTION) == 1) {
            top->clk = 1;
        }
        if ((main_time % RESOLUTION) == (RESOLUTION / 2 + 1)) {
            top->clk = 0;
        }
        top->eval();
        main_time++;
    }

The while loop will run until verilator end, and can see three signal(resetb, stall, clk) in GTKWave

#ifdef HAVE_CHRONO
    {
          std::chrono::steady_clock::time_point time_end;
          float sec;
          int cycles = main_time / RESOLUTION;
          time_end = std::chrono::steady_clock::now();
          sec = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_begin).count() / 1000.0;
          float speed_mhz = cycles / sec / 1000000.0;
          std::cout << std::endl;
          std::cout << "Simulation statistics" << std::endl;
          std::cout << "=====================" << std::endl;
          std::cout << "Simulation time  : " << sec << " s" << std::endl;
          std::cout << "Simulation cycles: " << cycles << std::endl;
          std::cout << "Simulation speed : " << speed_mhz << " MHz" << std::endl << std::endl;
    }
    #endif

And use chrono to caculate time.