# Final Project Note:
### SDRAM
### Question 1: Why do we need 9 cycles for the next wbs_ack_o from the SDRAM?
### Note 1: When wbs_ack_o is high, the corresponding wbs_adr_i is always 0x3800xxxx (the address range mapped to mprjram).
#### Simple test for the performance of the prefetch buffer
```verilog=
//////////////////////////////////////////////// //================= Buffer Only for muls ======================== reg [31:0] prefetch_buf0 [6:0]; wire [2:0] prefetch_buf0_idx; assign prefetch_buf0_idx = ((wbs_adr_i[7:0] - 8'd4) >> 2); reg [31:0] prefetch_buf1 [6:0]; reg [31:0] prefetch_buf2 [6:0]; //write buffer integer i0; always @(posedge clk) begin if(rst) begin for(i0 = 0; i0 < 7; i0 = i0 + 1) begin prefetch_buf0[i0] <= 0; prefetch_buf1[i0] <= 0; prefetch_buf2[i0] <= 0; end end else if(valid && wbs_we_i && (wbs_adr_i < 32'h3800_0020) && (wbs_adr_i > 32'h3800_0000)) begin prefetch_buf0[prefetch_buf0_idx] <= wbs_dat_i; end else if(valid && wbs_we_i && (wbs_adr_i < 32'h3800_0040) && (wbs_adr_i > 32'h3800_0020)) begin prefetch_buf1[prefetch_buf0_idx] <= wbs_dat_i; end else if(valid && wbs_we_i && (wbs_adr_i < 32'h3800_0060) && (wbs_adr_i > 32'h3800_0040)) begin prefetch_buf2[prefetch_buf0_idx] <= wbs_dat_i; end end //read buffer reg prefetch_out_valid; reg [31:0] prefetch_D; reg prefetch_out_valid_reg; always @(posedge clk) begin if(rst) begin prefetch_out_valid_reg <= 0; end else begin prefetch_out_valid_reg <= prefetch_out_valid; end end always @(*) begin prefetch_out_valid = 0; prefetch_D = 0; if((~prefetch_out_valid_reg) && valid && ~wbs_we_i && (wbs_adr_i[31:24] == 16'h38)) begin if((wbs_adr_i[11:0] > 12'h000) && (wbs_adr_i[11:0] < 12'h020)) begin //$display("prefetch read: Addr->%x", wbs_adr_i); prefetch_out_valid = 1; prefetch_D = prefetch_buf0[prefetch_buf0_idx]; end else if((wbs_adr_i[11:0] > 12'h020) && (wbs_adr_i[11:0] < 12'h040)) begin prefetch_out_valid = 1; prefetch_D = prefetch_buf1[prefetch_buf0_idx]; end else if((wbs_adr_i[11:0] > 12'h040) && 
(wbs_adr_i[11:0] < 12'h060)) begin prefetch_out_valid = 1; prefetch_D = prefetch_buf2[prefetch_buf0_idx]; end end end //=============================================================== // WB MI A assign valid = wbs_stb_i && wbs_cyc_i; //@@ //assign ctrl_in_valid = wbs_we_i ? valid : // (prefetch_out_valid ? 0 : (~ctrl_in_valid_q && valid)); assign ctrl_in_valid = wbs_we_i ? valid : ( (~prefetch_out_valid) && ~ctrl_in_valid_q && valid); //assign ctrl_in_valid = wbs_we_i ? valid : // ( ~ctrl_in_valid_q && valid); //@@ assign wbs_ack_o = (wbs_we_i) ? ~ctrl_busy && valid : (ctrl_out_valid || prefetch_out_valid); //assign wbs_ack_o = (wbs_we_i) ? ~ctrl_busy && valid : ctrl_out_valid; //@@ wire [31:0] sdram_dat_o; assign wbs_dat_o = prefetch_out_valid ? prefetch_D : sdram_dat_o; //assign wbs_dat_o = sdram_dat_o; assign bram_mask = wbs_sel_i & {4{wbs_we_i}}; assign ctrl_addr = wbs_adr_i[22:0]; // IO assign io_out = d2c_data; assign io_oeb = {(`MPRJ_IO_PADS-1){rst}}; // IRQ assign irq = 3'b000; // Unused // LA assign la_data_out = {{(127-BITS){1'b0}}, d2c_data}; // Assuming LA probes [65:64] are for controlling the count clk & reset assign clk = (~la_oenb[64]) ? la_data_in[64]: wb_clk_i; assign rst = (~la_oenb[65]) ? 
la_data_in[65]: wb_rst_i; assign rst_n = ~rst; always @(posedge clk) begin if (rst) begin ctrl_in_valid_q <= 1'b0; end else begin if (~prefetch_out_valid && ~wbs_we_i && valid && ~ctrl_busy && ctrl_in_valid_q == 1'b0) ctrl_in_valid_q <= 1'b1; else if (ctrl_out_valid) ctrl_in_valid_q <= 1'b0; end end sdram_controller user_sdram_controller ( .clk(clk), .rst(rst), .sdram_cle(sdram_cle), .sdram_cs(sdram_cs), .sdram_cas(sdram_cas), .sdram_ras(sdram_ras), .sdram_we(sdram_we), .sdram_dqm(sdram_dqm), .sdram_ba(sdram_ba), .sdram_a(sdram_a), .sdram_dqi(d2c_data), .sdram_dqo(c2d_data), .user_addr(ctrl_addr), .rw(wbs_we_i), .data_in(wbs_dat_i), .data_out(sdram_dat_o),//@@ .busy(ctrl_busy), .in_valid(ctrl_in_valid), .out_valid(ctrl_out_valid) ); sdr user_bram ( .Rst_n(rst_n), .Clk(clk), .Cke(sdram_cle), .Cs_n(sdram_cs), .Ras_n(sdram_ras), .Cas_n(sdram_cas), .We_n(sdram_we), .Addr(sdram_a), .Ba(sdram_ba), .Dqm(bram_mask), .Dqi(c2d_data), .Dqo(d2c_data) ); ``` ```c= #include "matmul.h" int* __attribute__((section(".mprjram"))) matmul() { int i=0; int j; int k; int sum; int kk; int mytest = 1234; unsigned int count = 0; int a1, a2, a3, a4, b1, b2, b3, b4; for (i=0; i <SIZE; i++){ for (j=0; j<SIZE; j++){ a1 = A[i*SIZE]; a2 = A[i*SIZE + 1]; a3 = A[i*SIZE + 2]; a4 = A[i*SIZE + 3]; b1 = B[j]; b2 = B[j + SIZE]; b3 = B[j + 2*SIZE]; b4 = B[j + 3*SIZE]; /* sum = 0; for(k = 0;k<SIZE;k++) { sum += A[(i*SIZE) + k] * B[(k*SIZE) + j]; } */ result[(i*SIZE) + j] = a1*b1 + a2*b2 + a3*b3 + a4*b4; } } return result; } ``` ```assembly= ## -O1, unroll k loop, Disassembly of section .mprjram: 38000000 <__mulsi3>: 38000000: 00050613 mv a2,a0 38000004: 00000513 li a0,0 38000008: 0015f693 andi a3,a1,1 3800000c: 00068463 beqz a3,38000014 <__mulsi3+0x14> 38000010: 00c50533 add a0,a0,a2 38000014: 0015d593 srli a1,a1,0x1 38000018: 00161613 slli a2,a2,0x1 3800001c: fe0596e3 bnez a1,38000008 <__mulsi3+0x8> 38000020: 00008067 ret 38000024 <matmul>: 38000024: fc010113 addi sp,sp,-64 38000028: 02112e23 sw 
ra,60(sp) 3800002c: 02812c23 sw s0,56(sp) 38000030: 02912a23 sw s1,52(sp) 38000034: 03212823 sw s2,48(sp) 38000038: 03312623 sw s3,44(sp) 3800003c: 03412423 sw s4,40(sp) 38000040: 03512223 sw s5,36(sp) 38000044: 03612023 sw s6,32(sp) 38000048: 01712e23 sw s7,28(sp) 3800004c: 01812c23 sw s8,24(sp) 38000050: 01912a23 sw s9,20(sp) 38000054: 01a12823 sw s10,16(sp) 38000058: 01b12623 sw s11,12(sp) 3800005c: 380009b7 lui s3,0x38000 38000060: 30098993 addi s3,s3,768 # 38000300 <A> 38000064: 38000cb7 lui s9,0x38000 38000068: 384c8c93 addi s9,s9,900 # 38000384 <result> 3800006c: 00000d13 li s10,0 38000070: 38000db7 lui s11,0x38000 38000074: 300d8d93 addi s11,s11,768 # 38000300 <A> 38000078: 050d8c13 addi s8,s11,80 3800007c: 0009ab83 lw s7,0(s3) 38000080: 0049ab03 lw s6,4(s3) 38000084: 0089aa83 lw s5,8(s3) 38000088: 00c9aa03 lw s4,12(s3) 3800008c: 040d8413 addi s0,s11,64 38000090: 000c8913 mv s2,s9 38000094: 00042583 lw a1,0(s0) 38000098: 000b8513 mv a0,s7 3800009c: f65ff0ef jal ra,38000000 <__mulsi3> 380000a0: 00050493 mv s1,a0 380000a4: 01042583 lw a1,16(s0) 380000a8: 000b0513 mv a0,s6 380000ac: f55ff0ef jal ra,38000000 <__mulsi3> 380000b0: 00a484b3 add s1,s1,a0 380000b4: 02042583 lw a1,32(s0) 380000b8: 000a8513 mv a0,s5 380000bc: f45ff0ef jal ra,38000000 <__mulsi3> 380000c0: 00a484b3 add s1,s1,a0 380000c4: 03042583 lw a1,48(s0) 380000c8: 000a0513 mv a0,s4 380000cc: f35ff0ef jal ra,38000000 <__mulsi3> 380000d0: 00a484b3 add s1,s1,a0 380000d4: 00992023 sw s1,0(s2) 380000d8: 00440413 addi s0,s0,4 380000dc: 00490913 addi s2,s2,4 380000e0: fb841ae3 bne s0,s8,38000094 <matmul+0x70> 380000e4: 01098993 addi s3,s3,16 380000e8: 010c8c93 addi s9,s9,16 380000ec: 004d0d13 addi s10,s10,4 380000f0: 01000793 li a5,16 380000f4: f8fd14e3 bne s10,a5,3800007c <matmul+0x58> 380000f8: 38000537 lui a0,0x38000 380000fc: 38450513 addi a0,a0,900 # 38000384 <result> 38000100: 03c12083 lw ra,60(sp) 38000104: 03812403 lw s0,56(sp) 38000108: 03412483 lw s1,52(sp) 3800010c: 03012903 lw s2,48(sp) 
38000110: 02c12983 lw s3,44(sp) 38000114: 02812a03 lw s4,40(sp) 38000118: 02412a83 lw s5,36(sp) 3800011c: 02012b03 lw s6,32(sp) 38000120: 01c12b83 lw s7,28(sp) 38000124: 01812c03 lw s8,24(sp) 38000128: 01412c83 lw s9,20(sp) 3800012c: 01012d03 lw s10,16(sp) 38000130: 00c12d83 lw s11,12(sp) 38000134: 04010113 addi sp,sp,64 38000138: 00008067 ret ``` ```c= #include "matmul.h" int* __attribute__((section(".mprjram"))) matmul() { int i=0; int j; int k; int sum; int kk; int mytest = 1234; unsigned int count = 0; int a1, a2, a3, a4; int b0_1, b0_2, b0_3, b0_4; int b1_1, b1_2, b1_3, b1_4; int b2_1, b2_2, b2_3, b2_4; int b3_1, b3_2, b3_3, b3_4; for (i=0; i <SIZE; i++){ //for (j=0; j<SIZE; j++){ a1 = A[i*SIZE]; a2 = A[i*SIZE + 1]; a3 = A[i*SIZE + 2]; a4 = A[i*SIZE + 3]; b0_1 = B[0]; b0_2 = B[0 + SIZE]; b0_3 = B[0 + 2*SIZE]; b0_4 = B[0 + 3*SIZE]; b1_1 = B[1]; b1_2 = B[1 + SIZE]; b1_3 = B[1 + 2*SIZE]; b1_4 = B[1 + 3*SIZE]; b2_1 = B[2]; b2_2 = B[2 + SIZE]; b2_3 = B[2 + 2*SIZE]; b2_4 = B[2 + 3*SIZE]; b3_1 = B[3]; b3_2 = B[3 + SIZE]; b3_3 = B[3 + 2*SIZE]; b3_4 = B[3 + 3*SIZE]; /* sum = 0; for(k = 0;k<SIZE;k++) { sum += A[(i*SIZE) + k] * B[(k*SIZE) + j]; } */ result[(i*SIZE) + 0] = a1*b0_1 + a2*b0_2 + a3*b0_3 + a4*b0_4; result[(i*SIZE) + 1] = a1*b1_1 + a2*b1_2 + a3*b1_3 + a4*b1_4; result[(i*SIZE) + 2] = a1*b2_1 + a2*b2_2 + a3*b2_3 + a4*b2_4; result[(i*SIZE) + 3] = a1*b3_1 + a2*b3_2 + a3*b3_3 + a4*b3_4; //} } return result; } ``` * Correct Code for firmware (Unloop) ```c= #include "matmul.h" /* int __attribute__((section(".mprjram"))) ass_mul(int a, int b) { int result; asm( "li %[result], 0\n\t" "andi %[tmp], %[b], 1\n\t" "beqz %[tmp], 1f\n\t" "add %[result], %[result], %[a]\n\t" "1:\n\t" "srli %[b], %[b], 1\n\t" "slli %[a], %[a], 1\n\t" "bnez %[b], 1b" : [result] "=r"(result), [tmp] "=&r"(result), [a] "+r"(result), [b] "+r"(result) : : "a0", "a1", "a2", "a3" ); return result; } */ /* mv a2,a0 38000004: 00000513 li a0,0 38000008: 0015f693 andi a3,a1,1 3800000c: 00068463 beqz 
a3,38000014 <__mulsi3+0x14> 38000010: 00c50533 add a0,a0,a2 38000014: 0015d593 srli a1,a1,0x1 38000018: 00161613 slli a2,a2,0x1 3800001c: fe0596e3 bnez a1,38000008 <__mulsi3+0x8> 38000020: 00008067 ret */ #include "matmul.h" int* __attribute__((section(".mprjram"))) matmul() { register int i=0; register int j; register int k; register int sum; register int kk; register int mytest = 1234; register unsigned int count = 0; register int a1, a2, a3, a4; register int b0_1, b0_2, b0_3, b0_4; register int b1_1, b1_2, b1_3, b1_4; register int b2_1, b2_2, b2_3, b2_4; register int b3_1, b3_2, b3_3, b3_4; b0_1 = B[0]; b0_2 = B[0 + SIZE]; b0_3 = B[0 + 2*SIZE]; b0_4 = B[0 + 3*SIZE]; b1_1 = B[1]; b1_2 = B[1 + SIZE]; b1_3 = B[1 + 2*SIZE]; b1_4 = B[1 + 3*SIZE]; b2_1 = B[2]; b2_2 = B[2 + SIZE]; b2_3 = B[2 + 2*SIZE]; b2_4 = B[2 + 3*SIZE]; b3_1 = B[3]; b3_2 = B[3 + SIZE]; b3_3 = B[3 + 2*SIZE]; b3_4 = B[3 + 3*SIZE]; //for (i = 0; i < 1; i++){ for (i=0; i <SIZE; i++){ //for (j=0; j<SIZE; j++){ a1 = A[i*SIZE]; a2 = A[i*SIZE + 1]; a3 = A[i*SIZE + 2]; a4 = A[i*SIZE + 3]; /* sum = 0; for(k = 0;k<SIZE;k++) { sum += A[(i*SIZE) + k] * B[(k*SIZE) + j]; } */ result[(i*SIZE) + 0] = a1*b0_1 + a2*b0_2 + a3*b0_3 + a4*b0_4; result[(i*SIZE) + 1] = a1*b1_1 + a2*b1_2 + a3*b1_3 + a4*b1_4; result[(i*SIZE) + 2] = a1*b2_1 + a2*b2_2 + a3*b2_3 + a4*b2_4; result[(i*SIZE) + 3] = a1*b3_1 + a2*b3_2 + a3*b3_3 + a4*b3_4; //} } return result; } ``` * Wrong code ```c= int __attribute__((section(".mprjram"))) ass_mul(int a, int b) { int result; asm( "li %[result], 0\n\t" "andi %[tmp], %[b], 1\n\t" "beqz %[tmp], 1f\n\t" "add %[result], %[result], %[a]\n\t" "1:\n\t" "srli %[b], %[b], 1\n\t" "slli %[a], %[a], 1\n\t" "bnez %[b], 1b" : [result] "=r"(result), [tmp] "=&r"(result), [a] "+r"(result), [b] "+r"(result) : : "a0", "a1", "a2", "a3" ); return result; } int* __attribute__((section(".mprjram"))) matmul() { register int i=0; register int j; register int k; register int sum; register int kk; register int mytest = 1234; 
register unsigned int count = 0; register int a1, a2, a3, a4; register int b0_1, b0_2, b0_3, b0_4; register int b1_1, b1_2, b1_3, b1_4; register int b2_1, b2_2, b2_3, b2_4; register int b3_1, b3_2, b3_3, b3_4; b0_1 = B[0]; b0_2 = B[0 + SIZE]; b0_3 = B[0 + 2*SIZE]; b0_4 = B[0 + 3*SIZE]; b1_1 = B[1]; b1_2 = B[1 + SIZE]; b1_3 = B[1 + 2*SIZE]; b1_4 = B[1 + 3*SIZE]; b2_1 = B[2]; b2_2 = B[2 + SIZE]; b2_3 = B[2 + 2*SIZE]; b2_4 = B[2 + 3*SIZE]; b3_1 = B[3]; b3_2 = B[3 + SIZE]; b3_3 = B[3 + 2*SIZE]; b3_4 = B[3 + 3*SIZE]; for (i=0; i <SIZE; i++){ a1 = A[i*SIZE]; a2 = A[i*SIZE + 1]; a3 = A[i*SIZE + 2]; a4 = A[i*SIZE + 3]; result[(i*SIZE) + 0] = ass_mul(a1, b0_1) + ass_mul(a2, b0_2) + ass_mul(a3, b0_3) + ass_mul(a4, b0_4); result[(i*SIZE) + 1] = ass_mul(a1, b1_1) + ass_mul(a2, b1_2) + ass_mul(a3, b1_3) + ass_mul(a4, b1_4); result[(i*SIZE) + 2] = ass_mul(a1, b2_1) + ass_mul(a2, b2_2) + ass_mul(a3, b2_3) + ass_mul(a4, b2_4); result[(i*SIZE) + 3] = ass_mul(a1, b3_1) + ass_mul(a2, b3_2) + ass_mul(a3, b3_3) + ass_mul(a4, b3_4); } return result; } ``` * modified assembly code ```assembly= .section .data .globl A A: .word 0, 1, 2, 3 .word 0, 1, 2, 3 .word 0, 1, 2, 3 .word 0, 1, 2, 3 .globl B B: .word 1, 2, 3, 4 .word 5, 6, 7, 8 .word 9, 10, 11, 12 .word 13, 14, 15, 16 .globl result result: .space 64 # Reserve space for result matrix .section .mprjram .globl matmul ass_mul: li a0,0 mul_loop2: andi a3,a1,1 beqz a3,mul_loop1 add a0,a0,a2 mul_loop1: srli a1,a1,0x1 slli a2,a2,0x1 bnez a1,mul_loop2 ret matmul: addi sp,sp,-112 sw ra,108(sp) sw s0,104(sp) sw s1,100(sp) sw s2,96(sp) sw s3,92(sp) sw s4,88(sp) sw s5,84(sp) sw s6,80(sp) sw s7,76(sp) sw s8,72(sp) sw s9,68(sp) sw s10,64(sp) sw s11,60(sp) #lui s2,0x38000 #addi s2,s2,768 # 38000300 <B> la s2, B lw s11,0(s2) lw s10,16(s2) lw s9,32(s2) lw s8,48(s2) lw a5,4(s2) sw a5,0(sp) lw a5,20(s2) sw a5,4(sp) lw a5,36(s2) sw a5,8(sp) lw a5,52(s2) sw a5,12(sp) lw a5,8(s2) sw a5,16(sp) lw a5,24(s2) sw a5,20(sp) lw a5,40(s2) sw a5,24(sp) lw a5,56(s2) 
sw a5,28(sp) lw a5,12(s2) sw a5,32(sp) lw a5,28(s2) sw a5,36(sp) lw a5,44(s2) sw a5,40(sp) lw a5,60(s2) sw a5,44(sp) #addi s0,s2,64 la s0, A #lui s1,0x38000 #addi s1,s1,900 # 38000384 <result> la s1, result main_loop: addi s2,s2,128 lw s7,0(s0) lw s6,4(s0) lw s5,8(s0) lw s4,12(s0) mv a1,s11 mv a2,s7 jal ass_mul mv s3,a0 mv a1,s10 mv a2,s6 jal ass_mul add s3,s3,a0 mv a1,s9 mv a2,s5 jal ass_mul add s3,s3,a0 mv a1,s8 mv a2,s4 jal ass_mul add s3,s3,a0 sw s3,0(s1) lw a1,0(sp) mv a2,s7 jal ass_mul mv s3,a0 lw a1,4(sp) mv a2,s6 jal ass_mul add s3,s3,a0 lw a1,8(sp) mv a2,s5 jal ass_mul add s3,s3,a0 lw a1,12(sp) mv a2,s4 jal ass_mul add s3,s3,a0 sw s3,4(s1) lw a1,16(sp) mv a2,s7 jal ass_mul mv s3,a0 lw a1,20(sp) mv a2,s6 jal ass_mul add s3,s3,a0 lw a1,24(sp) mv a2,s5 jal ass_mul add s3,s3,a0 lw a1,28(sp) mv a2,s4 jal ass_mul add s3,s3,a0 sw s3,8(s1) lw a1,32(sp) mv a2,s7 jal ass_mul mv s3,a0 lw a1,36(sp) mv a2,s6 jal ass_mul add s3,s3,a0 lw a1,40(sp) mv a2,s5 jal ass_mul add s3,s3,a0 lw a1,44(sp) mv a2,s4 jal ass_mul add s3,s3,a0 sw s3,12(s1) addi s0,s0,16 addi s1,s1,16 bne s0,s2, main_loop #lui a0,0x38000 #addi a0,a0,900 # 38000384 <result> la a0, result lw ra,108(sp) lw s0,104(sp) lw s1,100(sp) lw s2,96(sp) lw s3,92(sp) lw s4,88(sp) lw s5,84(sp) lw s6,80(sp) lw s7,76(sp) lw s8,72(sp) lw s9,68(sp) lw s10,64(sp) lw s11,60(sp) addi sp,sp,112 ret
```
* SDRAM refresh
    - When the SDRAM is not serving a read or write operation, it needs to enter the refresh state, which is state 7 in the following waveform.

![sdram_refresh](https://hackmd.io/_uploads/r1lwxoG_a.png)
![Screenshot from 2024-01-03 17-19-16](https://hackmd.io/_uploads/HJo3Uiz_p.png)

In the example above, because *refresh_ctrl_d* has reached 750, the controller first enters the precharge state and then enters the refresh state.
```assembly=
.data 0x0000000038000300 0xc0 load address 0x0000000010000358 0x0000000038000300 . = ALIGN (0x8) 0x0000000038000300 _fdata = . 
*(.data .data.* .gnu.linkonce.d.*) .data 0x0000000038000300 0x0 counter_la.elf-crt0_vex.o .data 0x0000000038000300 0x0 counter_la.elf-isr.o .data 0x0000000038000300 0x0 counter_la.elf-counter_la.o .data 0x0000000038000300 0xc0 counter_la.elf-matmul.o 0x0000000038000300 A 0x0000000038000340 B 0x0000000038000380 result *(.data1) 0x00000000380003c0 _gp = ALIGN (0x10) *(.sdata .sdata.* .gnu.linkonce.s.*) 0x00000000380003c0 . = ALIGN (0x8) 0x00000000380003c0 _edata = . .bss 0x00000000380003c0 0x8 load address 0x0000000010000418 0x00000000380003c0 . = ALIGN (0x8) 0x00000000380003c0 _fbss = . *(.dynsbss) *(.sbss .sbss.* .gnu.linkonce.sb.*) .sbss 0x00000000380003c0 0x2 counter_la.elf-isr.o 0x00000000380003c0 flag *(.scommon) *(.dynbss) *(.bss .bss.* .gnu.linkonce.b.*) .bss 0x00000000380003c2 0x0 counter_la.elf-crt0_vex.o .bss 0x00000000380003c2 0x0 counter_la.elf-isr.o .bss 0x00000000380003c2 0x0 counter_la.elf-counter_la.o .bss 0x00000000380003c2 0x0 counter_la.elf-matmul.o *(COMMON) 0x00000000380003c8 . = ALIGN (0x8) *fill* 0x00000000380003c2 0x6 0x00000000380003c8 _ebss = . 0x00000000380003c8 _end = . .mprjram 0x0000000038000000 0x258 load address 0x0000000010000418 0x0000000038000000 . = ALIGN (0x8) 0x0000000038000000 _fsram = . 
``` ```assembly= Disassembly of section .data: 38000300 <B>: 38000300: 0001 .2byte 0x1 38000302: 0000 .2byte 0x0 38000304: 0002 .2byte 0x2 38000306: 0000 .2byte 0x0 38000308: 00000003 lb zero,0(zero) # 0 <__DYNAMIC> 3800030c: 0004 .2byte 0x4 3800030e: 0000 .2byte 0x0 38000310: 0005 .2byte 0x5 38000312: 0000 .2byte 0x0 38000314: 0006 .2byte 0x6 38000316: 0000 .2byte 0x0 38000318: 00000007 .4byte 0x7 3800031c: 0008 .2byte 0x8 3800031e: 0000 .2byte 0x0 38000320: 0009 .2byte 0x9 38000322: 0000 .2byte 0x0 38000324: 000a .2byte 0xa 38000326: 0000 .2byte 0x0 38000328: 0000000b .4byte 0xb 3800032c: 000c .2byte 0xc 3800032e: 0000 .2byte 0x0 38000330: 000d .2byte 0xd 38000332: 0000 .2byte 0x0 38000334: 000e .2byte 0xe 38000336: 0000 .2byte 0x0 38000338: 0000000f fence unknown,unknown 3800033c: 0010 .2byte 0x10 38000340 <A>: 38000340: 0000 .2byte 0x0 38000342: 0000 .2byte 0x0 38000344: 0001 .2byte 0x1 38000346: 0000 .2byte 0x0 38000348: 0002 .2byte 0x2 3800034a: 0000 .2byte 0x0 3800034c: 00000003 lb zero,0(zero) # 0 <__DYNAMIC> 38000350: 0000 .2byte 0x0 38000352: 0000 .2byte 0x0 38000354: 0001 .2byte 0x1 38000356: 0000 .2byte 0x0 38000358: 0002 .2byte 0x2 3800035a: 0000 .2byte 0x0 3800035c: 00000003 lb zero,0(zero) # 0 <__DYNAMIC> 38000360: 0000 .2byte 0x0 38000362: 0000 .2byte 0x0 38000364: 0001 .2byte 0x1 38000366: 0000 .2byte 0x0 38000368: 0002 .2byte 0x2 3800036a: 0000 .2byte 0x0 3800036c: 00000003 lb zero,0(zero) # 0 <__DYNAMIC> 38000370: 0000 .2byte 0x0 38000372: 0000 .2byte 0x0 38000374: 0001 .2byte 0x1 38000376: 0000 .2byte 0x0 38000378: 0002 .2byte 0x2 3800037a: 0000 .2byte 0x0 3800037c: 00000003 lb zero,0(zero) # 0 <__DYNAMIC> Disassembly of section .bss: ... Disassembly of section .bss: 38000380 <flag>: 38000380: 0000 .2byte 0x0 ... 38000384 <result>: ... 
Disassembly of section .mprjram: 38000000 <__mulsi3>: 38000000: 00050613 mv a2,a0 38000004: 00000513 li a0,0 38000008: 0015f693 andi a3,a1,1 3800000c: 00068463 beqz a3,38000014 <__mulsi3+0x14> 38000010: 00c50533 add a0,a0,a2 38000014: 0015d593 srli a1,a1,0x1 38000018: 00161613 slli a2,a2,0x1 3800001c: fe0596e3 bnez a1,38000008 <__mulsi3+0x8> 38000020: 00008067 ret 38000024 <matmul>: 38000024: f9010113 addi sp,sp,-112 38000028: 06112623 sw ra,108(sp) 3800002c: 06812423 sw s0,104(sp) 38000030: 06912223 sw s1,100(sp) 38000034: 07212023 sw s2,96(sp) 38000038: 05312e23 sw s3,92(sp) 3800003c: 05412c23 sw s4,88(sp) 38000040: 05512a23 sw s5,84(sp) 38000044: 05612823 sw s6,80(sp) 38000048: 05712623 sw s7,76(sp) 3800004c: 05812423 sw s8,72(sp) 38000050: 05912223 sw s9,68(sp) 38000054: 05a12023 sw s10,64(sp) 38000058: 03b12e23 sw s11,60(sp) 3800005c: 38000937 lui s2,0x38000 38000060: 30090913 addi s2,s2,768 # 38000300 <B> 38000064: 00092d83 lw s11,0(s2) 38000068: 01092d03 lw s10,16(s2) 3800006c: 02092c83 lw s9,32(s2) 38000070: 03092c03 lw s8,48(s2) 38000074: 00492783 lw a5,4(s2) 38000078: 00f12023 sw a5,0(sp) 3800007c: 01492783 lw a5,20(s2) 38000080: 00f12223 sw a5,4(sp) 38000084: 02492783 lw a5,36(s2) 38000088: 00f12423 sw a5,8(sp) 3800008c: 03492783 lw a5,52(s2) 38000090: 00f12623 sw a5,12(sp) 38000094: 00892783 lw a5,8(s2) 38000098: 00f12823 sw a5,16(sp) 3800009c: 01892783 lw a5,24(s2) 380000a0: 00f12a23 sw a5,20(sp) 380000a4: 02892783 lw a5,40(s2) 380000a8: 00f12c23 sw a5,24(sp) 380000ac: 03892783 lw a5,56(s2) 380000b0: 00f12e23 sw a5,28(sp) 380000b4: 00c92783 lw a5,12(s2) 380000b8: 02f12023 sw a5,32(sp) 380000bc: 01c92783 lw a5,28(s2) 380000c0: 02f12223 sw a5,36(sp) 380000c4: 02c92783 lw a5,44(s2) 380000c8: 02f12423 sw a5,40(sp) 380000cc: 03c92783 lw a5,60(s2) 380000d0: 02f12623 sw a5,44(sp) 380000d4: 04090413 addi s0,s2,64 380000d8: 380004b7 lui s1,0x38000 380000dc: 38448493 addi s1,s1,900 # 38000384 <result> 380000e0: 08090913 addi s2,s2,128 380000e4: 00042b83 lw 
s7,0(s0) 380000e8: 00442b03 lw s6,4(s0) 380000ec: 00842a83 lw s5,8(s0) 380000f0: 00c42a03 lw s4,12(s0) 380000f4: 000b8593 mv a1,s7 380000f8: 000d8513 mv a0,s11 380000fc: f05ff0ef jal ra,38000000 <__mulsi3> 38000100: 00050993 mv s3,a0 38000104: 000b0593 mv a1,s6 38000108: 000d0513 mv a0,s10 3800010c: ef5ff0ef jal ra,38000000 <__mulsi3> 38000110: 00a989b3 add s3,s3,a0 38000114: 000a8593 mv a1,s5 38000118: 000c8513 mv a0,s9 3800011c: ee5ff0ef jal ra,38000000 <__mulsi3> 38000120: 00a989b3 add s3,s3,a0 38000124: 000a0593 mv a1,s4 38000128: 000c0513 mv a0,s8 3800012c: ed5ff0ef jal ra,38000000 <__mulsi3> 38000130: 00a989b3 add s3,s3,a0 38000134: 0134a023 sw s3,0(s1) 38000138: 000b8593 mv a1,s7 3800013c: 00012503 lw a0,0(sp) 38000140: ec1ff0ef jal ra,38000000 <__mulsi3> 38000144: 00050993 mv s3,a0 38000148: 000b0593 mv a1,s6
```
#### Code trace for wbs_dat_i
![Screenshot from 2024-01-04 11-06-17](https://hackmd.io/_uploads/HkgnEo7dT.png)
![Screenshot from 2024-01-04 11-08-02](https://hackmd.io/_uploads/HkrtNo7dp.png)
![Screenshot from 2024-01-04 11-08-33](https://hackmd.io/_uploads/r1iqEom_T.png)
![Screenshot from 2024-01-04 11-09-35](https://hackmd.io/_uploads/BJGyBjXu6.png)
![Screenshot from 2024-01-04 11-10-01](https://hackmd.io/_uploads/BJkeSom_6.png)
![Screenshot from 2024-01-04 11-11-29](https://hackmd.io/_uploads/S1l-rjm_T.png)
![Screenshot from 2024-01-04 11-14-08](https://hackmd.io/_uploads/HJn-SiXOp.png)
![Screenshot from 2024-01-04 11-14-43](https://hackmd.io/_uploads/BJkXrs7OT.png)
![Screenshot from 2024-01-04 11-15-14](https://hackmd.io/_uploads/r1oQBo7_6.png)
### Successful assembly code for matrix multiplication
```assembly=
.section .data, "ax" .globl A A: .word 0, 1, 2, 3 .word 0, 1, 2, 3 .word 0, 1, 2, 3 .word 0, 1, 2, 3 .globl B B: .word 1, 5, 9, 13 .word 2, 6, 10, 14 .word 3, 7, 11, 15 .word 4, 8, 12, 16 .globl result result: .space 64 # Reserve space for result matrix .section .mprjram, "ax" .globl matmul ass_mul: #li a0, 0 mul_loop2: andi a3,a1,1 beqz 
a3,mul_loop1
    add  a0,a0,a2          # multiplier bit set: accumulate the shifted multiplicand
mul_loop1:
    srli a1,a1,0x1         # next multiplier bit
    slli a2,a2,0x1         # multiplicand * 2
    bnez a1,mul_loop2      # loop until no multiplier bits remain
    ret

# matmul: result = A x B for 4x4 word matrices; returns the address of `result` in a0.
#
# ass_mul contract as used here: a0 += a2 * a1 (shift-and-add). It does not
# clear a0 itself, so every call site below does `li a0, 0` first. It clobbers
# a1, a2 and a3.
#
# B is stored transposed in .data, so the four B operands of one result column
# are consecutive words. The #B[n] comments give the row-major index of the
# element in the mathematical B.
#
# Register use inside the loop:
#   s0  -> current row of A          s1  -> current row of result
#   s2  =  A + 64 (loop end)         s11 -> base of B
#   s10/s9/s8 = B words at byte offsets 4/8/12 (hoisted out of the loop)
#   s7..s4    = A[row][0..3]         s3  =  running sum for one result element
matmul:
    # Prologue: 80-byte frame, save ra and s0-s11.
    addi sp,sp,-80
    sw   ra,12(sp)
    sw   s0,8(sp)
    sw   s1,4(sp)
    sw   s2,0(sp)
    sw   s3,28(sp)
    sw   s4,24(sp)
    sw   s5,20(sp)
    sw   s6,16(sp)
    sw   s7,76(sp)
    sw   s8,72(sp)
    sw   s9,68(sp)
    sw   s10,64(sp)
    sw   s11,60(sp)

    la   s2, B
    lw   s10,4(s2)         #B[4]
    lw   s9, 8(s2)         #B[8]
    lw   s8, 12(s2)        #B[12]
    la   s0, A
    la   s1, result
    la   s11, B
    la   s2, A
    addi s2, s2, 64        # s2 = one past the last row of A (loop bound)

main_loop:
    # ---- result[row][0] ----
    lw   a1,0(s11)         #B[0]
    lw   s6,4(s0)          #A[1]
    lw   s5,8(s0)          #A[2]
    lw   s4,12(s0)         #A[3]
    lw   s7,0(s0)          #A[0]
    mv   a2,s7             #A[0]
    li   a0, 0
    jal  ra, ass_mul
    mv   s3,a0             #s3 = A[0]*B[0]
    mv   a1,s10            #B[4]
    mv   a2,s6             #A[1]
    li   a0, 0
    jal  ra, ass_mul
    add  s3,s3,a0
    mv   a1,s9             #B[8]
    mv   a2,s5             #A[2]
    li   a0, 0
    jal  ra, ass_mul
    add  s3,s3,a0
    mv   a1,s8             #B[12]
    mv   a2,s4             #A[3]
    li   a0, 0
    jal  ra, ass_mul
    add  s3,s3,a0
    sw   s3,0(s1)          #C[0] = s3

    # ---- result[row][1] ----
    lw   a1,16(s11)        #B[1]
    mv   a2,s7             #A[0]
    li   a0, 0
    jal  ra, ass_mul
    mv   s3,a0
    lw   a1,20(s11)        #B[5]
    mv   a2,s6
    li   a0, 0
    jal  ra, ass_mul
    add  s3,s3,a0
    lw   a1,24(s11)        #B[9]
    mv   a2,s5
    li   a0, 0
    jal  ra, ass_mul
    add  s3,s3,a0
    lw   a1,28(s11)        #B[13]
    mv   a2,s4
    li   a0, 0
    jal  ra, ass_mul
    add  s3,s3,a0
    sw   s3,4(s1)          #C[1]

    # ---- result[row][2] ----
    # Base register must be s11 (B), like every other B operand. 32(sp) is a
    # stack slot this frame never writes; the wrong base only went unnoticed
    # because A[row][0] is 0 in the test data, so this product is 0 either way.
    lw   a1,32(s11)        #B[2]
    mv   a2,s7             #A[0]
    li   a0, 0
    jal  ra, ass_mul
    mv   s3,a0             #A[0]*B[2]
    lw   a1,36(s11)        #B[6]
    mv   a2,s6
    li   a0, 0
    jal  ra, ass_mul
    add  s3,s3,a0
    lw   a1,40(s11)        #B[10]
    mv   a2,s5
    li   a0, 0
    jal  ra, ass_mul
    add  s3,s3,a0
    lw   a1,44(s11)        #B[14]
    mv   a2,s4
    li   a0, 0
    jal  ra, ass_mul
    add  s3,s3,a0
    sw   s3,8(s1)          #C[2]

    # ---- result[row][3] ----
    lw   a1,48(s11)        #B[3]
    mv   a2,s7             #A[0]
    li   a0, 0
    jal  ra, ass_mul
    mv   s3,a0             #B[3]*A[0]
    lw   a1,52(s11)        #B[7]
    mv   a2,s6
    li   a0, 0
    jal  ra, ass_mul
    add  s3,s3,a0
    lw   a1,56(s11)        #B[11]
    mv   a2,s5
    li   a0, 0
    jal  ra, ass_mul
    add  s3,s3,a0
    lw   a1,60(s11)        #B[15]
    mv   a2,s4
    li   a0, 0
    jal  ra, ass_mul
    add  s3,s3,a0
    sw   s3,12(s1)         #C[3]

    addi s0,s0,16          #For A
    addi s1,s1,16          #For C
    bne  s0,s2, main_loop

    #lui a0,0x38000
    #addi a0,a0,900 # 38000384 <result>
    la   a0,result         # return value: address of the result matrix

    # Epilogue: restore ra and s0-s11, release the frame.
    lw   ra,12(sp)
    lw   s0,8(sp)
    lw   s1,4(sp)
    lw   s2,0(sp)
    lw   s3,28(sp)
    lw   s4,24(sp)
    lw   s5,20(sp)
    lw   s6,16(sp)
    lw   s7,76(sp)
    lw   s8,72(sp)
    lw   s9,68(sp)
    lw   s10,64(sp)
    lw   s11,60(sp)
    addi sp,sp,80
    ret
```