Assignment3: SoftCPU

# Assignment3: SoftCPU Derived from [Assignment2](https://hackmd.io/@zDmciYQATNm-8XeyJ5Th0Q/B1GGWL5No/edit) ## C code First, we need to put the C code from assignment2 in the new folder in /srv32/sw/hw3. ``` c= #include<stdio.h> #include<stdlib.h> int *getRow(int ,int*); int main(){ int *ans = NULL; int ans_size ; int i; int rowIndex = 0; ans = getRow(rowIndex, &ans_size); for(i=0;i<ans_size;i++){ printf("%d ",ans[i]); } printf("\n"); rowIndex = 10; ans = getRow(rowIndex, &ans_size); for(i=0;i<ans_size;i++){ printf("%d ",ans[i]); } printf("\n"); rowIndex = 33; ans = getRow(rowIndex, &ans_size); for(i=0;i<ans_size;i++){ printf("%d ",ans[i]); } printf("\n"); return 0; } int* getRow(int rowIndex, int* returnSize){ int *ret_arr = malloc(sizeof(int)* (rowIndex + 1)); *returnSize = (rowIndex + 1); int layer = 0; int middle = layer >> 1; while(rowIndex>=layer){ int i=0; for(i=layer/2;i>=0;i--){ if(i==0) ret_arr[i] = 1; else ret_arr[i] = ret_arr[i] + ret_arr[i-1]; } for(i=0;i<=layer/2;i++){ ret_arr[layer-i] = ret_arr[i]; } layer++; } return ret_arr; } ``` ### Modify Makefile Modify the Makefile by example of sw/hello to let it fit hw3.c ```makefile= include ../common/Makefile.common EXE = .elf SRC = hw3.c SSRC = hw3.s CFLAGS += -L../common LDFLAGS += -T ../common/default.ld TARGET = hw3 OUTPUT = $(TARGET)$(EXE) .PHONY: all clean all: $(TARGET) $(TARGET): $(SRC) $(CC) $(CFLAGS) -o $(OUTPUT) $(SRC) $(LDFLAGS) $(OBJCOPY) -j .text -O binary $(OUTPUT) imem.bin $(OBJCOPY) -j .data -O binary $(OUTPUT) dmem.bin $(OBJCOPY) -O binary $(OUTPUT) memory.bin $(OBJDUMP) -d $(OUTPUT) > $(TARGET).dis $(READELF) -a $(OUTPUT) > $(TARGET).symbol clean: $(RM) *.o $(OUTPUT) $(TARGET).dis $(TARGET).symbol [id]mem.bin memory.bin ``` ### Run simulation We can run the simulation by the following command. ```shell /verilator/srv32$ make hw3 ``` #### RTL simulation ``` 1 1 10 45 120 210 252 210 120 45 10 1 1 33 528 5456 40920 237336 1107568 4272048 13884156 38567100 92561040 193536720 354817320 573166440 818809200 1037158320 1166803110 1166803110 1037158320 818809200 573166440 354817320 193536720 92561040 38567100 13884156 4272048 1107568 237336 40920 5456 528 33 1 Excuting 73174 instructions, 90282 cycles, 1.233 CPI Program terminate - ../rtl/../testbench/testbench.v:434: Verilog $finish Simulation statistics ===================== Simulation time : 0.909 s Simulation cycles: 90293 Simulation speed : 0.0993322 MHz ``` #### ISS simulation ``` 1 1 10 45 120 210 252 210 120 45 10 1 1 33 528 5456 40920 237336 1107568 4272048 13884156 38567100 92561040 193536720 354817320 573166440 818809200 1037158320 1166803110 1166803110 1037158320 818809200 573166440 354817320 193536720 92561040 38567100 13884156 4272048 1107568 237336 40920 5456 528 33 1 Excuting 73174 instructions, 90282 cycles, 1.234 CPI Program terminate Simulation statistics ===================== Simulation time : 0.036 s Simulation cycles: 90282 Simulation speed : 2.485 MHz ``` ### Assembly code generate by srv32 We can also compile the hw3.c into assembly code by the same method in assignment2 ```s= .file "hw3.c" .option nopic .attribute arch, "rv32i2p0_m2p0" .attribute unaligned_access, 0 .attribute stack_align, 16 .text .align 2 .globl getRow .type getRow, @function getRow: addi sp,sp,-16 sw ra,12(sp) sw s0,8(sp) sw s1,4(sp) sw s2,0(sp) mv s0,a0 mv s1,a1 addi s2,a0,1 slli a0,s2,2 call malloc sw s2,0(s1) blt s0,zero,.L1 mv t3,s2 mv t1,a0 li a6,0 li t4,-1 li a7,1 j .L8 .L4: lw a3,0(a5) lw a2,-4(a5) add a3,a3,a2 sw a3,0(a5) addi a4,a4,-1 addi a5,a5,-4 blt a4,zero,.L5 .L6: bne a4,zero,.L4 sw a7,0(a0) .L5: mv a3,t1 mv a4,a0 li a5,0 .L7: lw a2,0(a4) sw a2,0(a3) addi a5,a5,1 addi a4,a4,4 addi a3,a3,-4 bge a1,a5,.L7 .L3: addi a6,a6,1 addi t1,t1,4 beq a6,t3,.L1 .L8: srli a1,a6,31 add a1,a1,a6 srai a1,a1,1 blt a6,t4,.L3 slli a5,a1,2 add a5,a0,a5 mv a4,a1 j .L6 .L1: lw ra,12(sp) lw s0,8(sp) lw s1,4(sp) lw s2,0(sp) addi sp,sp,16 jr ra .size getRow, .-getRow .section .rodata.str1.4,"aMS",@progbits,1 .align 2 .LC0: .string "%d " .text .align 2 .globl main .type main, @function main: addi sp,sp,-48 sw ra,44(sp) sw s0,40(sp) sw s1,36(sp) sw s2,32(sp) sw s3,28(sp) addi a1,sp,12 li a0,0 call getRow lw s2,12(sp) ble s2,zero,.L14 mv s0,a0 li s1,0 lui s3,%hi(.LC0) .L15: lw a1,0(s0) addi a0,s3,%lo(.LC0) call printf addi s1,s1,1 addi s0,s0,4 bne s1,s2,.L15 .L14: li a0,10 call putchar addi a1,sp,12 li a0,10 call getRow lw s2,12(sp) ble s2,zero,.L16 mv s0,a0 li s1,0 lui s3,%hi(.LC0) .L17: lw a1,0(s0) addi a0,s3,%lo(.LC0) call printf addi s1,s1,1 addi s0,s0,4 bne s1,s2,.L17 .L16: li a0,10 call putchar addi a1,sp,12 li a0,33 call getRow lw s2,12(sp) ble s2,zero,.L18 mv s0,a0 li s1,0 lui s3,%hi(.LC0) .L19: lw a1,0(s0) addi a0,s3,%lo(.LC0) call printf addi s1,s1,1 addi s0,s0,4 bne s1,s2,.L19 .L18: li a0,10 call putchar li a0,0 lw ra,44(sp) lw s0,40(sp) lw s1,36(sp) lw s2,32(sp) lw s3,28(sp) addi sp,sp,48 jr ra .size main, .-main .ident "GCC: (xPack GNU RISC-V Embedded GCC x86_64) 12.2.0" ``` ## Optimized for Assignment2 ### Hand written assembly I revised the original hand written assembly code and let it run in srv32. ```s= .file "triangle.c" .option nopic .attribute arch, "rv32i2p0_m2p0" .attribute unaligned_access, 0 .attribute stack_align, 16 .text .align 2 .globl getRow .type getRow, @function getRow: addi sp, sp, -4 sw ra, 0(sp) addi a4,a3,1 #a4 = returnSize addi a5,x0,0 #a5 = layer while_loop: srli a6,a5,1 addi t1,a6,0 #t1 = i #la s0,ret_arr #s0 = base addr of ret_arr jal for_loop1 addi t1,x0,0 #t1 = i jal for_loop2 addi a5,a5,1 bge a3,a5,while_loop while_end: lw ra,0(sp) addi sp,sp,4 jr ra for_loop1: blt t1,x0,for_loop1_end bne t1,x0,else1 # determine i==0? determine if-else statement addi t3,x0,1 #t3 = 1 slli t2,t1,2 #t2 = i*4, word addr to byte addr add t2,t2,s0 #t2 =offset+base addr sw t3,0(t2) #store 1 to array[0] addi t1,t1,-1 # update iteration variable j for_loop1 for_loop1_end: jr ra # return to caller else1: addi t2,t1,-1 #t2 = i-1 slli t2,t2,2 #to byte address add t2,s0,t2 lw t3,0(t2) # load ret_arr[i-1] add t4,t3,x0 # t4 = ret_arr[i-1] addi t2,t1,0 #t2 = i slli t2,t2,2 #to byte address add t2,s0,t2 lw t3,0(t2) # load ret_arr[i] add t4,t3,t4 #t4+=ret_arr[i] sw t4,0(t2) addi t1,t1,-1 j for_loop1 for_loop2: bge a6,t1,for_loop2_itr jr ra for_loop2_itr: # for copy left to right sub t2,a5,t1 # t2=layer-i slli t2,t2,2 slli t3,t1,2 add t3,s0,t3 add t2,s0,t2 lw t4,0(t3) sw t4,0(t2) addi t1,t1,1 j for_loop2 print_loop: # to show the result addi sp, sp, -4 sw ra, 0(sp) lui s1,%hi(.LC0) print: bge t1,a4,print_loop_end addi sp, sp, -8 sw a4,0(sp) sw t1,4(sp) lw a1,0(s0) #add a1, x0, a4 addi a0,s1,%lo(.LC0) #addi s2,s2,1 call printf lw t1,4(sp) lw a4,0(sp) addi sp, sp, 8 addi t1,t1,1 addi s0,s0,4 j print print_loop_end: addi a0,s1,%lo(.LC1) call printf lw ra, 0(sp) addi sp, sp, 4 jr ra .size getRow, .-getRow .section .rodata .align 2 .LC0: .string "%d " .LC1: .string "\n" .data ret_arr: .word 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .section .text .align 2 .globl main .type main, @function main: addi sp,sp,-80 sw ra,76(sp) addi a3,x0,0 #a3 = rowIndex la s0,ret_arr jal getRow addi t1,x0,0 #t1 = i jal print_loop addi a3,x0,10 #a3 = rowIndex la s0,ret_arr jal getRow addi t1,x0,0 #t1 = i jal print_loop addi a3,x0,33 #a3 = rowIndex la s0,ret_arr jal getRow addi t1,x0,0 #t1 = i jal print_loop lw ra,76(sp) addi sp,sp,80 jr ra .size main, .-main .ident "GCC: (xPack GNU RISC-V Embedded GCC x86_64) 12.2.0" ``` ### Modify Makefile Modify the Makefile by example of sw/triangle to let it fit triangle.s ```makefile= include ../common/Makefile.common EXE = .elf SRC = triangle.c SSRC = triangle.s CFLAGS += -L../common LDFLAGS += -T ../common/default.ld TARGET = triangle OUTPUT = $(TARGET)$(EXE) .PHONY: all clean all: $(TARGET) $(TARGET): $(CC) $(CFLAGS) -o $(OUTPUT) $(SSRC) $(LDFLAGS) $(OBJCOPY) -j .text -O binary $(OUTPUT) imem.bin $(OBJCOPY) -j .data -O binary $(OUTPUT) dmem.bin $(OBJCOPY) -O binary $(OUTPUT) memory.bin $(OBJDUMP) -d $(OUTPUT) > $(TARGET).dis $(READELF) -a $(OUTPUT) > $(TARGET).symbol clean: $(RM) *.o $(OUTPUT) $(TARGET).dis $(TARGET).symbol [id]mem.bin memory.bin ``` ### Run simulation We can run the simulation by the following command. ```shell /verilator/srv32$ make triangle ``` #### RTL simulation ``` 1 1 10 45 120 210 252 210 120 45 10 1 1 33 528 5456 40920 237336 1107568 4272048 13884156 38567100 92561040 193536720 354817320 573166440 818809200 1037158320 1166803110 1166803110 1037158320 818809200 573166440 354817320 193536720 92561040 38567100 13884156 4272048 1107568 237336 40920 5456 528 33 1 Excuting 65363 instructions, 83497 cycles, 1.277 CPI Program terminate - ../rtl/../testbench/testbench.v:434: Verilog $finish Simulation statistics ===================== Simulation time : 0.654 s Simulation cycles: 83508 Simulation speed : 0.127688 MHz ``` #### ISS simulation ``` 1 1 10 45 120 210 252 210 120 45 10 1 1 33 528 5456 40920 237336 1107568 4272048 13884156 38567100 92561040 193536720 354817320 573166440 818809200 1037158320 1166803110 1166803110 1037158320 818809200 573166440 354817320 193536720 92561040 38567100 13884156 4272048 1107568 237336 40920 5456 528 33 1 Excuting 65363 instructions, 83497 cycles, 1.277 CPI Program terminate Simulation statistics ===================== Simulation time : 0.030 s Simulation cycles: 83497 Simulation speed : 2.753 MHz ``` ## Comparison between two results | Code type | RTL simulation cycles | ISS simulation cycles | | --------------------- |:---------------------:|:---------------------:| | Compiler generated | 90293 | 90282 | | Hand written assembly | 83508 | 83497 | Obviously, we can find that our hand written assembly can save totally **6785** cycles in both RTL simulation and ISS simulation. ## Analyze with gtkwave ### srv32 `srv32` is a 3-stage pipeline architecture with IF/ID, EX, WB stages. The follwing diagram marks some important signals for later discussion. So I will compare the signal between 3 stages to discuss the hazard. ![](https://i.imgur.com/9lbFKBM.jpg) ### Load use hazard Srv32 is a fully forwarding 3 stages pipeline CPU, so it doesn't need to stall for load use hazrd. But we can still analyze how it solve the load use hazard through forwarding the data. ```s 70: fec42783 lw a5,-20(s0) #s0=0x00040000, so a5=dmem[0x40000-0x14]=dmem[0003FFEC] 74: 00279793 slli a5,a5,0x2 78: fe842703 lw a4,-24(s0) #s0=0x00040000, so a4=dmem[0x40000-0x18]=dmem[0003FFE8] 7c: 00f707b3 add a5,a4,a5 ``` ![](https://i.imgur.com/FZ4ypKQ.png) We can find that if ex_insn is `FEc42783`, ALU will compute the result of `s0-20` and pass to dmem_addr in the same cycle. Then, dmem_rdata will get the correct data of address `0003FFEC` and forward it to alu_op1 to compute `slli a5,a5,0x2` in the next cycle. When `FE842703` go into the ex stage, ALU will start compute `s0-24` and pass the result to dmem_addr in the same time. Thus, dmem_rdata can get the correct data of address `0003FFE8` and forward it to alu_op1 to compute `add a5,a4,a5`. The data of load word instruction will write back to a4 register first then write back the data of add instruction next cycle. Srv32 can solve the load use hazard through these steps. ### Control hazard We can analyze the result of control hazard occur through the following instructions. ```s 30: fd410113 addi sp,sp,-44 # 40000 <_stack> 34: 008000ef jal ra,3c <main> 38: 1111006f j 10948 <exit> 0000003c <main>: 3c: fe010113 addi sp,sp,-32 40: 00112e23 sw ra,28(sp) 44: 00812c23 sw s0,24(sp) ``` ![](https://i.imgur.com/sTdk7Al.png) We can find that when fetch_pc is `00000034` the next cycle it will fetch out `008000EF` inst, this is the instruction that jump to main function. However, srv32 will only determine whether jump or not in ex stage, we can find that `ex_jal` equal to 1 in the next cycle. PC will fetch two wrong instructions due to ex stage is the second stage, so `ex_jal` will maintain two cycles. When `ex_flush` detect that `ex_jal` is high, `ex_flush` will be high in next cycle. Srv32 can solve the control hazard through these steps. ## Leetcode with medium difficulty ### Reverse Integer This problem is based on [LeetCode 7](https://leetcode.com/problems/reverse-integer/). Given a signed 32-bit integer x, return x with its digits reversed. ### C code ```c= #include <stdio.h> int reverse(int x){ int tail = 0, result = 0; if(x >= 0){ while(x > 0){ tail = x % 10; if((result * 10.0 + tail) >= (2147483648)){ return 0; } result = result * 10 + tail; x = x / 10; } } else{ if(x <= -2147483648){ return 0; } x = -x; while(x > 0){ tail = x % 10; if((result * 10.0 + tail) >= (2147483647)){ return 0; } result = result * 10 + tail; x = x / 10; } result = (-1) * result; } return result; } int main(){ int test1, ans1, test2, ans2, test3, ans3; test1 = 654321; ans1 = reverse(test1); printf("ANS1: %d \n",ans1); test2 = -123456; ans2 = reverse(test2); printf("ANS2: %d \n",ans2); test3 = 789; ans3 = reverse(test3); printf("ANS3: %d \n",ans3); } ``` ### Modify Makefile Modify the Makefile by example of sw/reverse to let it fit reverse.c ```makefile= include ../common/Makefile.common EXE = .elf SRC = reverse.c SSRC = reverse.s CFLAGS += -L../common LDFLAGS += -T ../common/default.ld TARGET = reverse OUTPUT = $(TARGET)$(EXE) .PHONY: all clean all: $(TARGET) $(TARGET): $(SRC) $(CC) $(CFLAGS) -o $(OUTPUT) $(SRC) $(LDFLAGS) $(OBJCOPY) -j .text -O binary $(OUTPUT) imem.bin $(OBJCOPY) -j .data -O binary $(OUTPUT) dmem.bin $(OBJCOPY) -O binary $(OUTPUT) memory.bin $(OBJDUMP) -d $(OUTPUT) > $(TARGET).dis $(READELF) -a $(OUTPUT) > $(TARGET).symbol clean: $(RM) *.o $(OUTPUT) $(TARGET).dis $(TARGET).symbol [id]mem.bin memory.bin ``` ### Run simulation We can run the simulation by the following command. ```shell /verilator/srv32$ make reverse ``` #### RTL simulation ``` ANS1: 123456 ANS2: -654321 ANS3: 987 Excuting 13040 instructions, 16344 cycles, 1.253 CPI Program terminate - ../rtl/../testbench/testbench.v:434: Verilog $finish Simulation statistics ===================== Simulation time : 0.314 s Simulation cycles: 16355 Simulation speed : 0.052086 MHz ``` #### ISS simulation ``` ANS1: 123456 ANS2: -654321 ANS3: 987 Excuting 13040 instructions, 16344 cycles, 1.253 CPI Program terminate Simulation statistics ===================== Simulation time : 0.010 s Simulation cycles: 16344 Simulation speed : 1.678 MHz ``` ### Assembly code generate by srv32 We can also compile the reverse.c into assembly code by the same method in assignment2 ```s= .file "reverse.c" .option nopic .attribute arch, "rv32i2p0_m2p0" .attribute unaligned_access, 0 .attribute stack_align, 16 .text .globl __floatsidf .globl __muldf3 .globl __adddf3 .globl __gedf2 .align 2 .globl reverse .type reverse, @function reverse: addi sp,sp,-48 sw ra,44(sp) sw s0,40(sp) sw s2,36(sp) sw s3,32(sp) addi s0,sp,48 sw a0,-36(s0) sw zero,-24(s0) sw zero,-20(s0) lw a5,-36(s0) blt a5,zero,.L2 j .L3 .L7: lw a4,-36(s0) li a5,10 rem a5,a4,a5 sw a5,-24(s0) lw a0,-20(s0) call __floatsidf mv a4,a0 mv a5,a1 lui a3,%hi(.LC0) lw a2,%lo(.LC0)(a3) lw a3,%lo(.LC0+4)(a3) mv a0,a4 mv a1,a5 call __muldf3 mv a4,a0 mv a5,a1 mv s2,a4 mv s3,a5 lw a0,-24(s0) call __floatsidf mv a4,a0 mv a5,a1 mv a2,a4 mv a3,a5 mv a0,s2 mv a1,s3 call __adddf3 mv a4,a0 mv a5,a1 mv a0,a4 mv a1,a5 lui a5,%hi(.LC1) lw a2,%lo(.LC1)(a5) lw a3,%lo(.LC1+4)(a5) call __gedf2 mv a5,a0 blt a5,zero,.L16 li a5,0 j .L6 .L16: lw a4,-20(s0) mv a5,a4 slli a5,a5,2 add a5,a5,a4 slli a5,a5,1 mv a4,a5 lw a5,-24(s0) add a5,a5,a4 sw a5,-20(s0) lw a4,-36(s0) li a5,10 div a5,a4,a5 sw a5,-36(s0) .L3: lw a5,-36(s0) bgt a5,zero,.L7 j .L8 .L2: lw a4,-36(s0) li a5,-2147483648 bne a4,a5,.L9 li a5,0 j .L6 .L9: lw a5,-36(s0) neg a5,a5 sw a5,-36(s0) j .L10 .L13: lw a4,-36(s0) li a5,10 rem a5,a4,a5 sw a5,-24(s0) lw a0,-20(s0) call __floatsidf mv a4,a0 mv a5,a1 lui a3,%hi(.LC0) lw a2,%lo(.LC0)(a3) lw a3,%lo(.LC0+4)(a3) mv a0,a4 mv a1,a5 call __muldf3 mv a4,a0 mv a5,a1 mv s2,a4 mv s3,a5 lw a0,-24(s0) call __floatsidf mv a4,a0 mv a5,a1 mv a2,a4 mv a3,a5 mv a0,s2 mv a1,s3 call __adddf3 mv a4,a0 mv a5,a1 mv a0,a4 mv a1,a5 lui a5,%hi(.LC2) lw a2,%lo(.LC2)(a5) lw a3,%lo(.LC2+4)(a5) call __gedf2 mv a5,a0 blt a5,zero,.L17 li a5,0 j .L6 .L17: lw a4,-20(s0) mv a5,a4 slli a5,a5,2 add a5,a5,a4 slli a5,a5,1 mv a4,a5 lw a5,-24(s0) add a5,a5,a4 sw a5,-20(s0) lw a4,-36(s0) li a5,10 div a5,a4,a5 sw a5,-36(s0) .L10: lw a5,-36(s0) bgt a5,zero,.L13 lw a5,-20(s0) neg a5,a5 sw a5,-20(s0) .L8: lw a5,-20(s0) .L6: mv a0,a5 lw ra,44(sp) lw s0,40(sp) lw s2,36(sp) lw s3,32(sp) addi sp,sp,48 jr ra .size reverse, .-reverse .section .rodata .align 2 .LC3: .string "ANS1: %d \n" .align 2 .LC4: .string "ANS2: %d \n" .align 2 .LC5: .string "ANS3: %d \n" .text .align 2 .globl main .type main, @function main: addi sp,sp,-48 sw ra,44(sp) sw s0,40(sp) addi s0,sp,48 li a5,655360 addi a5,a5,-1039 sw a5,-20(s0) lw a0,-20(s0) call reverse sw a0,-24(s0) lw a1,-24(s0) lui a5,%hi(.LC3) addi a0,a5,%lo(.LC3) call printf li a5,-122880 addi a5,a5,-576 sw a5,-28(s0) lw a0,-28(s0) call reverse sw a0,-32(s0) lw a1,-32(s0) lui a5,%hi(.LC4) addi a0,a5,%lo(.LC4) call printf li a5,789 sw a5,-36(s0) lw a0,-36(s0) call reverse sw a0,-40(s0) lw a1,-40(s0) lui a5,%hi(.LC5) addi a0,a5,%lo(.LC5) call printf li a5,0 mv a0,a5 lw ra,44(sp) lw s0,40(sp) addi sp,sp,48 jr ra .size main, .-main .section .rodata .align 3 .LC0: .word 0 .word 1076101120 .align 3 .LC1: .word 0 .word 1105199104 .align 3 .LC2: .word -4194304 .word 1105199103 .ident "GCC: (xPack GNU RISC-V Embedded GCC x86_64) 12.2.0" ```