--- tags: Computer Architecture (2022 Fall) --- # Homework3: SoftCPU ## c code (from hw2) ```c= #include<stdio.h> int largestAltitude(int* gain, int gainSize){ int i, max, arti; arti = 0; max=arti; for(i=0;i<gainSize;i++){ arti=arti+gain[i]; if(arti>max){max = arti;} } return max; } int main(){ int gain[]={-1,5,4}; int gainSize = 3; int max = largestAltitude(gain,gainSize); printf("max1=%d\n",max); int gain2[]={-5,1,5,0,-7}; gainSize = 5; max = largestAltitude(gain2,gainSize); printf("max2=%d\n",max); int gain3[]={-5,1,3}; gainSize = 3; max = largestAltitude(gain3,gainSize); printf("max3=%d\n",max); return 0 ; } ``` ### makefile ```makefile= include ../common/Makefile.common EXE = .elf SRC = hw3.c CFLAGS += -L../common LDFLAGS += -T ../common/default.ld TARGET = hw3 OUTPUT = $(TARGET)$(EXE) .PHONY: all clean all: $(TARGET) $(TARGET): $(SRC) $(CC) $(CFLAGS) -o $(OUTPUT) $(SRC) $(LDFLAGS) $(OBJCOPY) -j .text -O binary $(OUTPUT) imem.bin $(OBJCOPY) -j .data -O binary $(OUTPUT) dmem.bin $(OBJCOPY) -O binary $(OUTPUT) memory.bin $(OBJDUMP) -d $(OUTPUT) > $(TARGET).dis $(READELF) -a $(OUTPUT) > $(TARGET).symbol clean: $(RM) *.o $(OUTPUT) $(TARGET).dis $(TARGET).symbol [id]mem.bin memory.bin ``` ### compile into assembly code ```s= .file "hw3.c" .option nopic .attribute arch, "rv32i2p0_m2p0" .attribute unaligned_access, 0 .attribute stack_align, 16 .text .align 2 .globl largestAltitude .type largestAltitude, @function largestAltitude: addi sp,sp,-48 sw s0,44(sp) addi s0,sp,48 sw a0,-36(s0) sw a1,-40(s0) sw zero,-28(s0) lw a5,-28(s0) sw a5,-24(s0) sw zero,-20(s0) j .L2 .L4: lw a5,-20(s0) slli a5,a5,2 lw a4,-36(s0) add a5,a4,a5 lw a5,0(a5) lw a4,-28(s0) add a5,a4,a5 sw a5,-28(s0) lw a4,-28(s0) lw a5,-24(s0) ble a4,a5,.L3 lw a5,-28(s0) sw a5,-24(s0) .L3: lw a5,-20(s0) addi a5,a5,1 sw a5,-20(s0) .L2: lw a4,-20(s0) lw a5,-40(s0) blt a4,a5,.L4 lw a5,-24(s0) mv a0,a5 lw s0,44(sp) addi sp,sp,48 jr ra .size largestAltitude, .-largestAltitude .section .rodata .align 2 .LC1: .string 
"max1=%d\n" .align 2 .LC2: .string "max2=%d\n" .align 2 .LC3: .string "max3=%d\n" .align 2 .LC0: .word -5 .word 1 .word 5 .word 0 .word -7 .text .align 2 .globl main .type main, @function main: addi sp,sp,-80 sw ra,76(sp) sw s0,72(sp) addi s0,sp,80 li a5,-1 sw a5,-36(s0) li a5,5 sw a5,-32(s0) li a5,4 sw a5,-28(s0) li a5,3 sw a5,-20(s0) addi a5,s0,-36 lw a1,-20(s0) mv a0,a5 call largestAltitude sw a0,-24(s0) lw a1,-24(s0) lui a5,%hi(.LC1) addi a0,a5,%lo(.LC1) call printf lui a5,%hi(.LC0) addi a5,a5,%lo(.LC0) lw a1,0(a5) lw a2,4(a5) lw a3,8(a5) lw a4,12(a5) lw a5,16(a5) sw a1,-56(s0) sw a2,-52(s0) sw a3,-48(s0) sw a4,-44(s0) sw a5,-40(s0) li a5,5 sw a5,-20(s0) addi a5,s0,-56 lw a1,-20(s0) mv a0,a5 call largestAltitude sw a0,-24(s0) lw a1,-24(s0) lui a5,%hi(.LC2) addi a0,a5,%lo(.LC2) call printf li a5,-5 sw a5,-68(s0) li a5,1 sw a5,-64(s0) li a5,3 sw a5,-60(s0) li a5,3 sw a5,-20(s0) addi a5,s0,-68 lw a1,-20(s0) mv a0,a5 call largestAltitude sw a0,-24(s0) lw a1,-24(s0) lui a5,%hi(.LC3) addi a0,a5,%lo(.LC3) call printf li a5,0 mv a0,a5 lw ra,76(sp) lw s0,72(sp) addi sp,sp,80 jr ra .size main, .-main .ident "GCC: (xPack GNU RISC-V Embedded GCC x86_64) 12.2.0" ``` ### RTL sim resilt ![](https://i.imgur.com/kHMk8G7.png) ### ISS sim result ![](https://i.imgur.com/GtHTMmo.png) # handwrite assembly optimize ```s= .file "FHA.c" .option nopic .attribute arch, "rv32i2p1" .attribute unaligned_access, 0 .attribute stack_align, 16 .data gain: .word -1,5,4 gainsize: .word 3 gain2: .word -5,1,5,0,-7 gainsize2: .word 5 gain3: .word -5,1,3 gainsize3: .word 3 .LC1: .string "max1=%d\n" .align 2 .text .align 2 .globl func .type func, @function func: mv t1, x0 #int i = 0 mv a2, x0 #arti = 0 mv a3, x0 #max=0 loop: lw a1, 0(s0) #load gain[i] addi s0, s0, 4 add a2, a2 ,a1 #arti=arti+gain[i] blt a2, a3 ,conti #if arti[i]<max mv a3, a2 #max=arti conti: addi t1, t1 ,1 # i++ blt t1, s1, loop # i<gainsize jump to loop conti mv a1 ,a3 #li a7 ,1 #ecall jr ra .size func, .-func .text .align 2 .globl 
main .type main, @function main: addi sp,sp,-32 sw ra,28(sp) la s0, gain #load gain address lw s1, gainsize call func lui a5,%hi(.LC1) addi a0,a5,%lo(.LC1) call printf la s0, gain2 #load gain2 address lw s1, gainsize2 call func lui a5,%hi(.LC1) addi a0,a5,%lo(.LC1) call printf la s0, gain3 #load gain3 address lw s1, gainsize3 jal func lui a5,%hi(.LC1) addi a0,a5,%lo(.LC1) call printf #li a7 10 #ecall addi a0,x0,0 lw ra,28(sp) addi sp,sp,32 jr ra .size main, .-main .ident "GCC: (xPack GNU RISC-V Embedded GCC x86_64) 12.2.0" ``` Obviously, the handwritten assembly has fewer lines than the compiler-generated code, because I removed many unnecessary load/store instructions. ### makefile ```makefile= include ../common/Makefile.common EXE = .elf ASS = .s SRC = hw3.c CFLAGS += -L../common LDFLAGS += -T ../common/default.ld TARGET = hw3_opt_hand OUTPUT = $(TARGET)$(EXE) OUTPUT_ASS =$(TARGET)$(ASS) SRC_ASS = hw3_opt_hand.s .PHONY: all clean all: $(TARGET) $(TARGET): $(CC) $(CFLAGS) -o $(OUTPUT) $(SRC_ASS) $(LDFLAGS) $(OBJCOPY) -j .text -O binary $(OUTPUT) imem.bin $(OBJCOPY) -j .data -O binary $(OUTPUT) dmem.bin $(OBJCOPY) -O binary $(OUTPUT) memory.bin $(OBJDUMP) -d $(OUTPUT) > $(TARGET).dis $(READELF) -a $(OUTPUT) > $(TARGET).symbol clean: $(RM) *.o $(OUTPUT) $(TARGET).dis $(TARGET).symbol [id]mem.bin memory.bin ``` ### RTL sim result ![](https://i.imgur.com/x2EEC9e.png) ### ISS sim result ![](https://i.imgur.com/IEwCKwa.png) ## comparison (using code from hw2) |RTL sim cycle|opt| |-|-| |6414|no (only rely on compiler)| |6201|handwritten assembly| ## control hazard ![](https://i.imgur.com/vmLjZHI.png) ``` f8: fec42583 lw a1,-20(s0) fc: 00078513 mv a0,a5 100: f3dff0ef jal ra,3c <largestAltitude> 104: fea42423 sw a0,-24(s0) 108: fe842583 lw a1,-24(s0) ``` The CPU fetches 2 wrong instructions before jumping to the correct destination PC, so 2 cycles are flushed. wb_nop and wb_nop_more are also used to prevent invalid write-back actions.
## data hazard ``` 74: 0007a783 lw a5,0(a5) #old a5=x15=0X3FFC8, new a5=dmem[0x3FFC8]=0XFFFFFFFB 78: fe442703 lw a4,-28(s0) # s0=x8=0x3FFB0,a4=dmem[0x3FFB0-28]=dmem[0x3FF94]=0 7c: 00f707b3 add a5,a4,a5 # a5 = 0+0XFFFFFFFB ``` ![](https://i.imgur.com/5wrdMgA.png) The dmem address is computed in the execute stage, rdata is returned in the next cycle and then forwarded to the ALU, so the ALU result is correct in the next cycle. ## leetcode (medium) 75. Sort Colors https://leetcode.com/problems/sort-colors/ ### c code ```c= void sortColors(int* nums, int numsSize){ int red=0; int white=0; int i; for(i=0;i<numsSize;i++){ if(nums[i]==0) red++; else if(nums[i]==1) white++; } int red_white=red+white; for(i=0;i<numsSize;i++){ if(i<red) nums[i]=0; else if(i<red_white) nums[i]=1; else nums[i]=2; } } ``` ### assembly code (auto-generated by compiler) ```s= .file "leetcode.c" .option nopic .attribute arch, "rv32i2p0_m2p0" .attribute unaligned_access, 0 .attribute stack_align, 16 .text .section .rodata .align 2 .LC2: .string "%d\n" .text .align 2 .globl sortColors .type sortColors, @function sortColors: addi sp,sp,-48 sw ra,44(sp) sw s0,40(sp) addi s0,sp,48 sw a0,-36(s0) sw a1,-40(s0) sw zero,-20(s0) sw zero,-24(s0) sw zero,-28(s0) j .L2 .L5: lw a5,-28(s0) slli a5,a5,2 lw a4,-36(s0) add a5,a4,a5 lw a5,0(a5) bne a5,zero,.L3 lw a5,-20(s0) addi a5,a5,1 sw a5,-20(s0) j .L4 .L3: lw a5,-28(s0) slli a5,a5,2 lw a4,-36(s0) add a5,a4,a5 lw a4,0(a5) li a5,1 bne a4,a5,.L4 lw a5,-24(s0) addi a5,a5,1 sw a5,-24(s0) .L4: lw a5,-28(s0) addi a5,a5,1 sw a5,-28(s0) .L2: lw a4,-28(s0) lw a5,-40(s0) blt a4,a5,.L5 lw a4,-20(s0) lw a5,-24(s0) add a5,a4,a5 sw a5,-32(s0) sw zero,-28(s0) j .L6 .L10: lw a4,-28(s0) lw a5,-20(s0) bge a4,a5,.L7 lw a5,-28(s0) slli a5,a5,2 lw a4,-36(s0) add a5,a4,a5 sw zero,0(a5) j .L8 .L7: lw a4,-28(s0) lw a5,-32(s0) bge a4,a5,.L9 lw a5,-28(s0) slli a5,a5,2 lw a4,-36(s0) add a5,a4,a5 li a4,1 sw a4,0(a5) j .L8 .L9: lw a5,-28(s0) slli a5,a5,2 lw a4,-36(s0) add a5,a4,a5 li a4,2 sw a4,0(a5) .L8: lw a5,-28(s0) slli a5,a5,2 lw
a4,-36(s0) add a5,a4,a5 lw a5,0(a5) mv a1,a5 lui a5,%hi(.LC2) addi a0,a5,%lo(.LC2) call printf lw a5,-28(s0) addi a5,a5,1 sw a5,-28(s0) .L6: lw a4,-28(s0) lw a5,-40(s0) blt a4,a5,.L10 #nop #nop lw ra,44(sp) lw s0,40(sp) addi sp,sp,48 jr ra .size sortColors, .-sortColors .section .rodata .align 2 .LC0: .word 2 .word 0 .word 0 .word 1 .align 2 .LC1: .word 2 .word 0 .word 2 .word 1 .word 0 .word 1 .word 0 .word 1 .text .align 2 .globl main .type main, @function main: addi sp,sp,-96 sw ra,92(sp) sw s0,88(sp) addi s0,sp,96 li a5,2 sw a5,-40(s0) sw zero,-36(s0) li a5,1 sw a5,-32(s0) li a5,3 sw a5,-20(s0) addi a5,s0,-40 lw a1,-20(s0) mv a0,a5 call sortColors lui a5,%hi(.LC0) addi a5,a5,%lo(.LC0) lw a2,0(a5) lw a3,4(a5) lw a4,8(a5) lw a5,12(a5) sw a2,-56(s0) sw a3,-52(s0) sw a4,-48(s0) sw a5,-44(s0) li a5,4 sw a5,-24(s0) addi a5,s0,-56 lw a1,-24(s0) mv a0,a5 call sortColors lui a5,%hi(.LC1) addi a5,a5,%lo(.LC1) lw a7,0(a5) lw a6,4(a5) lw a0,8(a5) lw a1,12(a5) lw a2,16(a5) lw a3,20(a5) lw a4,24(a5) lw a5,28(a5) sw a7,-88(s0) sw a6,-84(s0) sw a0,-80(s0) sw a1,-76(s0) sw a2,-72(s0) sw a3,-68(s0) sw a4,-64(s0) sw a5,-60(s0) li a5,8 sw a5,-28(s0) addi a5,s0,-88 lw a1,-28(s0) mv a0,a5 call sortColors li a5,0 mv a0,a5 lw ra,92(sp) lw s0,88(sp) addi sp,sp,96 jr ra .size main, .-main .ident "GCC: (xPack GNU RISC-V Embedded GCC x86_64) 12.2.0" ``` ### handwrite ```s= .file "leetcode.c" .option nopic .attribute arch, "rv32i2p0_m2p0" .attribute unaligned_access, 0 .attribute stack_align, 16 .text .section .rodata .align 2 .LC2: .string "%d\n" .text .align 2 .globl sortColors .type sortColors, @function sortColors: addi sp,sp,-48 sw ra,44(sp) sw s0,40(sp) addi s0,sp,48 sw a0,-36(s0) sw a1,-40(s0) #sw zero,-20(s0) #sw zero,-24(s0)#white=0 addi a6,x0,0 sw zero,-28(s0) j .L2 .L5: lw a5,-28(s0) slli a5,a5,2 lw a4,-36(s0) add a5,a4,a5 lw a5,0(a5) bne a5,zero,.L3 lw a5,-20(s0) addi a5,a5,1 sw a5,-20(s0) j .L4 .L3: lw a5,-28(s0) slli a5,a5,2 lw a4,-36(s0) add a5,a4,a5 lw a4,0(a5) li a5,1 bne 
a4,a5,.L4 #lw a5,-24(s0) addi a6,a6,1 #sw a5,-24(s0) .L4: lw a5,-28(s0) addi a5,a5,1 sw a5,-28(s0) .L2: lw a4,-28(s0) lw a5,-40(s0) blt a4,a5,.L5 lw a4,-20(s0) #lw a5,-24(s0) add a5,a4,a6 sw a5,-32(s0) sw zero,-28(s0) j .L6 .L10: lw a4,-28(s0) lw a5,-20(s0) bge a4,a5,.L7 lw a5,-28(s0) slli a5,a5,2 lw a4,-36(s0) add a5,a4,a5 sw zero,0(a5) j .L8 .L7: lw a4,-28(s0) lw a5,-32(s0) bge a4,a5,.L9 lw a5,-28(s0) slli a5,a5,2 lw a4,-36(s0) add a5,a4,a5 li a4,1 sw a4,0(a5) j .L8 .L9: lw a5,-28(s0) slli a5,a5,2 lw a4,-36(s0) add a5,a4,a5 li a4,2 sw a4,0(a5) .L8: lw a5,-28(s0) slli a5,a5,2 lw a4,-36(s0) add a5,a4,a5 lw a5,0(a5) mv a1,a5 lui a5,%hi(.LC2) addi a0,a5,%lo(.LC2) call printf lw a5,-28(s0) addi a5,a5,1 sw a5,-28(s0) .L6: lw a4,-28(s0) lw a5,-40(s0) blt a4,a5,.L10 #nop #nop lw ra,44(sp) lw s0,40(sp) addi sp,sp,48 jr ra .size sortColors, .-sortColors .section .rodata .align 2 .LC0: .word 2 .word 0 .word 0 .word 1 .align 2 .LC1: .word 2 .word 0 .word 2 .word 1 .word 0 .word 1 .word 0 .word 1 .text .align 2 .globl main .type main, @function main: addi sp,sp,-96 sw ra,92(sp) sw s0,88(sp) addi s0,sp,96 li a5,2 sw a5,-40(s0) sw zero,-36(s0) li a5,1 sw a5,-32(s0) li a5,3 sw a5,-20(s0) addi a5,s0,-40 lw a1,-20(s0) mv a0,a5 call sortColors lui a5,%hi(.LC0) addi a5,a5,%lo(.LC0) lw a2,0(a5) lw a3,4(a5) lw a4,8(a5) lw a5,12(a5) sw a2,-56(s0) sw a3,-52(s0) sw a4,-48(s0) sw a5,-44(s0) li a5,4 sw a5,-24(s0) addi a5,s0,-56 lw a1,-24(s0) mv a0,a5 call sortColors lui a5,%hi(.LC1) addi a5,a5,%lo(.LC1) lw a7,0(a5) lw a6,4(a5) lw a0,8(a5) lw a1,12(a5) lw a2,16(a5) lw a3,20(a5) lw a4,24(a5) lw a5,28(a5) sw a7,-88(s0) sw a6,-84(s0) sw a0,-80(s0) sw a1,-76(s0) sw a2,-72(s0) sw a3,-68(s0) sw a4,-64(s0) sw a5,-60(s0) li a5,8 sw a5,-28(s0) addi a5,s0,-88 lw a1,-28(s0) mv a0,a5 call sortColors li a5,0 mv a0,a5 lw ra,92(sp) lw s0,88(sp) addi sp,sp,96 jr ra .size main, .-main .ident "GCC: (xPack GNU RISC-V Embedded GCC x86_64) 12.2.0" ``` ### RTL sim ![](https://i.imgur.com/x8mGRBg.png) ### ISS sim 
![](https://i.imgur.com/exaFMvR.png) ### comparison (using code from hw2) |RTL sim cycle|opt| |-|-| |18790|no (only rely on compiler)| |6201|handwritten assembly| ## Appendix ![](https://i.imgur.com/pOHu99z.png)