# Final Project Note:
### SDRAM
### Question 1: Why do we need 9 cycles for the next wbs_ack_o from the SDRAM?
### Note 1: When wbs_ack_o is high, the corresponding wbs_adr_i is always 0x3800xxxx (the address range mapped to mprjram)
#### Simple test for the performance of the prefetch buffer
```verilog=
////////////////////////////////////////////////
//================= Buffer Only for muls ========================
// Three 7-word register banks. Each bank captures Wishbone writes that fall
// strictly inside one 32-byte window of the 0x3800_0000 region:
//   buf0: 0x3800_0000 < addr < 0x3800_0020
//   buf1: 0x3800_0020 < addr < 0x3800_0040
//   buf2: 0x3800_0040 < addr < 0x3800_0060
reg [31:0] prefetch_buf0 [6:0];
reg [31:0] prefetch_buf1 [6:0];
reg [31:0] prefetch_buf2 [6:0];

// Word slot inside a bank: (low address byte - 4) / 4, truncated to 3 bits.
// The same index is shared by all three banks.
wire [2:0] prefetch_buf0_idx;
assign prefetch_buf0_idx = ((wbs_adr_i[7:0] - 8'd4) >> 2);

//write buffer
integer i0;
always @(posedge clk) begin
    if (rst) begin
        // Clear every slot of every bank.
        for (i0 = 0; i0 < 7; i0 = i0 + 1) begin
            prefetch_buf0[i0] <= 0;
            prefetch_buf1[i0] <= 0;
            prefetch_buf2[i0] <= 0;
        end
    end else if (valid && wbs_we_i) begin
        // Active write request: pick the bank from the full 32-bit address.
        if ((wbs_adr_i > 32'h3800_0000) && (wbs_adr_i < 32'h3800_0020))
            prefetch_buf0[prefetch_buf0_idx] <= wbs_dat_i;
        else if ((wbs_adr_i > 32'h3800_0020) && (wbs_adr_i < 32'h3800_0040))
            prefetch_buf1[prefetch_buf0_idx] <= wbs_dat_i;
        else if ((wbs_adr_i > 32'h3800_0040) && (wbs_adr_i < 32'h3800_0060))
            prefetch_buf2[prefetch_buf0_idx] <= wbs_dat_i;
    end
end
//read buffer
// Serves Wishbone reads of the buffered windows straight from the prefetch
// registers instead of going to the SDRAM controller.
reg prefetch_out_valid;      // combinational: this read hits a prefetch bank
reg [31:0] prefetch_D;       // combinational: data for the hit (0 otherwise)
reg prefetch_out_valid_reg;  // prefetch_out_valid delayed by one clock

always @(posedge clk) begin
    if (rst) begin
        prefetch_out_valid_reg <= 0;
    end else begin
        prefetch_out_valid_reg <= prefetch_out_valid;
    end
end

always @(*) begin
    prefetch_out_valid = 0;
    prefetch_D = 0;
    // A hit is suppressed in the cycle right after a hit, so the hit flag is
    // never asserted in two consecutive cycles.
    // The upper address bits [31:12] must be exactly 0x38000: the write path
    // only captures addresses 0x3800_0004..0x3800_005C, so matching on
    // [31:24] and [11:0] alone would let e.g. 0x3800_1004 alias into the
    // buffer and return data that was written to 0x3800_0004.
    if ((~prefetch_out_valid_reg) && valid && ~wbs_we_i && (wbs_adr_i[31:12] == 20'h38000)) begin
        if ((wbs_adr_i[11:0] > 12'h000) && (wbs_adr_i[11:0] < 12'h020)) begin
            //$display("prefetch read: Addr->%x", wbs_adr_i);
            prefetch_out_valid = 1;
            prefetch_D = prefetch_buf0[prefetch_buf0_idx];
        end else if ((wbs_adr_i[11:0] > 12'h020) && (wbs_adr_i[11:0] < 12'h040)) begin
            prefetch_out_valid = 1;
            prefetch_D = prefetch_buf1[prefetch_buf0_idx];
        end else if ((wbs_adr_i[11:0] > 12'h040) && (wbs_adr_i[11:0] < 12'h060)) begin
            prefetch_out_valid = 1;
            prefetch_D = prefetch_buf2[prefetch_buf0_idx];
        end
    end
end
//===============================================================
// WB MI A
// A Wishbone request is active while both strobe and cycle are asserted.
assign valid = wbs_stb_i && wbs_cyc_i;
//@@
//assign ctrl_in_valid = wbs_we_i ? valid :
// (prefetch_out_valid ? 0 : (~ctrl_in_valid_q && valid));
// Request to the SDRAM controller: a write passes `valid` straight through;
// a read is issued only when the prefetch buffer did not hit and
// ctrl_in_valid_q is still low (no read already marked as issued).
assign ctrl_in_valid = wbs_we_i ? valid :
( (~prefetch_out_valid) && ~ctrl_in_valid_q && valid);
//assign ctrl_in_valid = wbs_we_i ? valid :
// ( ~ctrl_in_valid_q && valid);
//@@
// Acknowledge: a write is acked while the controller is not busy; a read is
// acked by either the controller's out_valid or a prefetch-buffer hit.
assign wbs_ack_o = (wbs_we_i) ? ~ctrl_busy && valid : (ctrl_out_valid || prefetch_out_valid);
//assign wbs_ack_o = (wbs_we_i) ? ~ctrl_busy && valid : ctrl_out_valid;
//@@
// Read data: the prefetch bank wins on a hit, otherwise the controller output.
wire [31:0] sdram_dat_o;
assign wbs_dat_o = prefetch_out_valid ? prefetch_D : sdram_dat_o;
//assign wbs_dat_o = sdram_dat_o;
// Byte-select bits are forwarded only for writes (all zero on reads).
assign bram_mask = wbs_sel_i & {4{wbs_we_i}};
// Only the low 23 address bits are passed to the controller.
assign ctrl_addr = wbs_adr_i[22:0];
// IO
// NOTE(review): `MPRJ_IO_PADS and BITS are defined outside this excerpt.
assign io_out = d2c_data;
assign io_oeb = {(`MPRJ_IO_PADS-1){rst}};
// IRQ
assign irq = 3'b000; // Unused
// LA
assign la_data_out = {{(127-BITS){1'b0}}, d2c_data};
// Assuming LA probes [65:64] are for controlling the count clk & reset
// When la_oenb[64] / la_oenb[65] is low, the matching la_data_in bit replaces
// the Wishbone clock / reset.
assign clk = (~la_oenb[64]) ? la_data_in[64]: wb_clk_i;
assign rst = (~la_oenb[65]) ? la_data_in[65]: wb_rst_i;
assign rst_n = ~rst;
// ctrl_in_valid_q marks a read that has been handed to the SDRAM controller:
// set when a read request is active, misses the prefetch buffer and the
// controller is not busy; cleared when the controller reports out_valid.
always @(posedge clk) begin
    if (rst)
        ctrl_in_valid_q <= 1'b0;
    else if (valid && ~wbs_we_i && ~prefetch_out_valid && ~ctrl_busy && (ctrl_in_valid_q == 1'b0))
        ctrl_in_valid_q <= 1'b1;
    else if (ctrl_out_valid)
        ctrl_in_valid_q <= 1'b0;
end
// SDRAM controller instance: Wishbone-side request signals on one side,
// SDRAM command/address/data signals on the other.
sdram_controller user_sdram_controller (
    // clock / reset
    .clk        (clk),
    .rst        (rst),

    // user-side request interface
    .user_addr  (ctrl_addr),
    .rw         (wbs_we_i),
    .data_in    (wbs_dat_i),
    .data_out   (sdram_dat_o),   // muxed with prefetch_D to form wbs_dat_o
    .in_valid   (ctrl_in_valid),
    .out_valid  (ctrl_out_valid),
    .busy       (ctrl_busy),

    // SDRAM-side signals
    .sdram_cle  (sdram_cle),
    .sdram_cs   (sdram_cs),
    .sdram_ras  (sdram_ras),
    .sdram_cas  (sdram_cas),
    .sdram_we   (sdram_we),
    .sdram_dqm  (sdram_dqm),
    .sdram_ba   (sdram_ba),
    .sdram_a    (sdram_a),
    .sdram_dqi  (d2c_data),
    .sdram_dqo  (c2d_data)
);
// SDRAM device model instance, driven by the controller's command signals.
// Its Dqm input comes from bram_mask rather than the controller's sdram_dqm.
sdr user_bram (
    // clock / reset
    .Clk    (clk),
    .Rst_n  (rst_n),

    // command signals
    .Cke    (sdram_cle),
    .Cs_n   (sdram_cs),
    .Ras_n  (sdram_ras),
    .Cas_n  (sdram_cas),
    .We_n   (sdram_we),

    // address
    .Ba     (sdram_ba),
    .Addr   (sdram_a),

    // data and byte mask
    .Dqm    (bram_mask),
    .Dqi    (c2d_data),
    .Dqo    (d2c_data)
);
```
```c=
#include "matmul.h"
/*
 * Matrix multiply with the inner k loop unrolled by hand:
 *     result[i*SIZE + j] = sum over k of A[i*SIZE + k] * B[k*SIZE + j]
 * Exactly four terms are written out, so the full dot product is only
 * covered when SIZE is 4.
 * Placed in the .mprjram section. Returns the result array.
 */
int* __attribute__((section(".mprjram"))) matmul()
{
	int i;
	int j;
	int a1, a2, a3, a4, b1, b2, b3, b4;

	for (i = 0; i < SIZE; i++) {
		/* Row i of A does not depend on j: load it once per row. */
		a1 = A[i*SIZE];
		a2 = A[i*SIZE + 1];
		a3 = A[i*SIZE + 2];
		a4 = A[i*SIZE + 3];

		for (j = 0; j < SIZE; j++) {
			/* Column j of B. */
			b1 = B[j];
			b2 = B[j + SIZE];
			b3 = B[j + 2*SIZE];
			b4 = B[j + 3*SIZE];

			result[(i*SIZE) + j] = a1*b1 + a2*b2 + a3*b3 + a4*b4;
		}
	}
	return result;
}
```
```assembly=
## -O1, unroll k loop,
Disassembly of section .mprjram:
38000000 <__mulsi3>:
38000000: 00050613 mv a2,a0
38000004: 00000513 li a0,0
38000008: 0015f693 andi a3,a1,1
3800000c: 00068463 beqz a3,38000014 <__mulsi3+0x14>
38000010: 00c50533 add a0,a0,a2
38000014: 0015d593 srli a1,a1,0x1
38000018: 00161613 slli a2,a2,0x1
3800001c: fe0596e3 bnez a1,38000008 <__mulsi3+0x8>
38000020: 00008067 ret
38000024 <matmul>:
38000024: fc010113 addi sp,sp,-64
38000028: 02112e23 sw ra,60(sp)
3800002c: 02812c23 sw s0,56(sp)
38000030: 02912a23 sw s1,52(sp)
38000034: 03212823 sw s2,48(sp)
38000038: 03312623 sw s3,44(sp)
3800003c: 03412423 sw s4,40(sp)
38000040: 03512223 sw s5,36(sp)
38000044: 03612023 sw s6,32(sp)
38000048: 01712e23 sw s7,28(sp)
3800004c: 01812c23 sw s8,24(sp)
38000050: 01912a23 sw s9,20(sp)
38000054: 01a12823 sw s10,16(sp)
38000058: 01b12623 sw s11,12(sp)
3800005c: 380009b7 lui s3,0x38000
38000060: 30098993 addi s3,s3,768 # 38000300 <A>
38000064: 38000cb7 lui s9,0x38000
38000068: 384c8c93 addi s9,s9,900 # 38000384 <result>
3800006c: 00000d13 li s10,0
38000070: 38000db7 lui s11,0x38000
38000074: 300d8d93 addi s11,s11,768 # 38000300 <A>
38000078: 050d8c13 addi s8,s11,80
3800007c: 0009ab83 lw s7,0(s3)
38000080: 0049ab03 lw s6,4(s3)
38000084: 0089aa83 lw s5,8(s3)
38000088: 00c9aa03 lw s4,12(s3)
3800008c: 040d8413 addi s0,s11,64
38000090: 000c8913 mv s2,s9
38000094: 00042583 lw a1,0(s0)
38000098: 000b8513 mv a0,s7
3800009c: f65ff0ef jal ra,38000000 <__mulsi3>
380000a0: 00050493 mv s1,a0
380000a4: 01042583 lw a1,16(s0)
380000a8: 000b0513 mv a0,s6
380000ac: f55ff0ef jal ra,38000000 <__mulsi3>
380000b0: 00a484b3 add s1,s1,a0
380000b4: 02042583 lw a1,32(s0)
380000b8: 000a8513 mv a0,s5
380000bc: f45ff0ef jal ra,38000000 <__mulsi3>
380000c0: 00a484b3 add s1,s1,a0
380000c4: 03042583 lw a1,48(s0)
380000c8: 000a0513 mv a0,s4
380000cc: f35ff0ef jal ra,38000000 <__mulsi3>
380000d0: 00a484b3 add s1,s1,a0
380000d4: 00992023 sw s1,0(s2)
380000d8: 00440413 addi s0,s0,4
380000dc: 00490913 addi s2,s2,4
380000e0: fb841ae3 bne s0,s8,38000094 <matmul+0x70>
380000e4: 01098993 addi s3,s3,16
380000e8: 010c8c93 addi s9,s9,16
380000ec: 004d0d13 addi s10,s10,4
380000f0: 01000793 li a5,16
380000f4: f8fd14e3 bne s10,a5,3800007c <matmul+0x58>
380000f8: 38000537 lui a0,0x38000
380000fc: 38450513 addi a0,a0,900 # 38000384 <result>
38000100: 03c12083 lw ra,60(sp)
38000104: 03812403 lw s0,56(sp)
38000108: 03412483 lw s1,52(sp)
3800010c: 03012903 lw s2,48(sp)
38000110: 02c12983 lw s3,44(sp)
38000114: 02812a03 lw s4,40(sp)
38000118: 02412a83 lw s5,36(sp)
3800011c: 02012b03 lw s6,32(sp)
38000120: 01c12b83 lw s7,28(sp)
38000124: 01812c03 lw s8,24(sp)
38000128: 01412c83 lw s9,20(sp)
3800012c: 01012d03 lw s10,16(sp)
38000130: 00c12d83 lw s11,12(sp)
38000134: 04010113 addi sp,sp,64
38000138: 00008067 ret
```
```c=
#include "matmul.h"
/*
 * Matrix multiply with both the j loop and the k loop unrolled by hand.
 * Each iteration of the remaining i loop produces one full row of result:
 *     result[i*SIZE + j] = sum over k of A[i*SIZE + k] * B[k*SIZE + j]
 * Four columns and four terms are written out, so this only covers the
 * whole product when SIZE is 4.
 * In this variant all sixteen B values are re-read on every row iteration.
 * Placed in the .mprjram section. Returns the result array.
 */
int* __attribute__((section(".mprjram"))) matmul()
{
	int i;
	int a1, a2, a3, a4;
	int b0_1, b0_2, b0_3, b0_4;   /* column 0 of B */
	int b1_1, b1_2, b1_3, b1_4;   /* column 1 of B */
	int b2_1, b2_2, b2_3, b2_4;   /* column 2 of B */
	int b3_1, b3_2, b3_3, b3_4;   /* column 3 of B */

	for (i = 0; i < SIZE; i++) {
		/* Row i of A. */
		a1 = A[i*SIZE];
		a2 = A[i*SIZE + 1];
		a3 = A[i*SIZE + 2];
		a4 = A[i*SIZE + 3];

		b0_1 = B[0];
		b0_2 = B[0 + SIZE];
		b0_3 = B[0 + 2*SIZE];
		b0_4 = B[0 + 3*SIZE];

		b1_1 = B[1];
		b1_2 = B[1 + SIZE];
		b1_3 = B[1 + 2*SIZE];
		b1_4 = B[1 + 3*SIZE];

		b2_1 = B[2];
		b2_2 = B[2 + SIZE];
		b2_3 = B[2 + 2*SIZE];
		b2_4 = B[2 + 3*SIZE];

		b3_1 = B[3];
		b3_2 = B[3 + SIZE];
		b3_3 = B[3 + 2*SIZE];
		b3_4 = B[3 + 3*SIZE];

		result[(i*SIZE) + 0] = a1*b0_1 + a2*b0_2 + a3*b0_3 + a4*b0_4;
		result[(i*SIZE) + 1] = a1*b1_1 + a2*b1_2 + a3*b1_3 + a4*b1_4;
		result[(i*SIZE) + 2] = a1*b2_1 + a2*b2_2 + a3*b2_3 + a4*b2_4;
		result[(i*SIZE) + 3] = a1*b3_1 + a2*b3_2 + a3*b3_3 + a4*b3_4;
	}
	return result;
}
```
* Correct code for firmware (loops unrolled)
```c=
#include "matmul.h"
/*
int __attribute__((section(".mprjram"))) ass_mul(int a, int b)
{
int result;
asm(
"li %[result], 0\n\t"
"andi %[tmp], %[b], 1\n\t"
"beqz %[tmp], 1f\n\t"
"add %[result], %[result], %[a]\n\t"
"1:\n\t"
"srli %[b], %[b], 1\n\t"
"slli %[a], %[a], 1\n\t"
"bnez %[b], 1b"
: [result] "=r"(result), [tmp] "=&r"(result), [a] "+r"(result), [b] "+r"(result)
:
: "a0", "a1", "a2", "a3"
);
return result;
}
*/
/*
mv a2,a0
38000004: 00000513 li a0,0
38000008: 0015f693 andi a3,a1,1
3800000c: 00068463 beqz a3,38000014 <__mulsi3+0x14>
38000010: 00c50533 add a0,a0,a2
38000014: 0015d593 srli a1,a1,0x1
38000018: 00161613 slli a2,a2,0x1
3800001c: fe0596e3 bnez a1,38000008 <__mulsi3+0x8>
38000020: 00008067 ret
*/
#include "matmul.h"
/*
 * Matrix multiply with the j and k loops unrolled and all sixteen B values
 * read once, before the row loop:
 *     result[i*SIZE + j] = sum over k of A[i*SIZE + k] * B[k*SIZE + j]
 * Four columns and four terms are written out, so this only covers the
 * whole product when SIZE is 4.
 * Placed in the .mprjram section. Returns the result array.
 */
int* __attribute__((section(".mprjram"))) matmul()
{
	register int i;
	register int a1, a2, a3, a4;
	register int b0_1, b0_2, b0_3, b0_4;   /* column 0 of B */
	register int b1_1, b1_2, b1_3, b1_4;   /* column 1 of B */
	register int b2_1, b2_2, b2_3, b2_4;   /* column 2 of B */
	register int b3_1, b3_2, b3_3, b3_4;   /* column 3 of B */

	/* B does not change inside the loop: read every element exactly once. */
	b0_1 = B[0];
	b0_2 = B[0 + SIZE];
	b0_3 = B[0 + 2*SIZE];
	b0_4 = B[0 + 3*SIZE];

	b1_1 = B[1];
	b1_2 = B[1 + SIZE];
	b1_3 = B[1 + 2*SIZE];
	b1_4 = B[1 + 3*SIZE];

	b2_1 = B[2];
	b2_2 = B[2 + SIZE];
	b2_3 = B[2 + 2*SIZE];
	b2_4 = B[2 + 3*SIZE];

	b3_1 = B[3];
	b3_2 = B[3 + SIZE];
	b3_3 = B[3 + 2*SIZE];
	b3_4 = B[3 + 3*SIZE];

	for (i = 0; i < SIZE; i++) {
		/* Row i of A. */
		a1 = A[i*SIZE];
		a2 = A[i*SIZE + 1];
		a3 = A[i*SIZE + 2];
		a4 = A[i*SIZE + 3];

		result[(i*SIZE) + 0] = a1*b0_1 + a2*b0_2 + a3*b0_3 + a4*b0_4;
		result[(i*SIZE) + 1] = a1*b1_1 + a2*b1_2 + a3*b1_3 + a4*b1_4;
		result[(i*SIZE) + 2] = a1*b2_1 + a2*b2_2 + a3*b2_3 + a4*b2_4;
		result[(i*SIZE) + 3] = a1*b3_1 + a2*b3_2 + a3*b3_3 + a4*b3_4;
	}
	return result;
}
```
* Wrong code
```c=
/*
 * Attempted shift-and-add multiply written as GCC extended inline asm.
 * This listing is kept as the non-working attempt. Problems visible in the
 * code below:
 *   1. Every operand ([result], [tmp], [a], [b]) is bound to the C variable
 *      `result`; the parameters `a` and `b` are never passed to the asm.
 *   2. The backward branch `1b` targets label `1:`, which is placed after
 *      the andi/beqz/add sequence, so the bit test and the conditional add
 *      run only once while the two shifts repeat.
 *   3. The same label `1` is also the forward target of `beqz ... 1f`, so
 *      one label is doing the job of two different loop positions.
 */
int __attribute__((section(".mprjram"))) ass_mul(int a, int b)
{
int result;
asm(
/* result = 0 */
"li %[result], 0\n\t"
/* tmp = lowest bit of b; skip the add when it is 0 */
"andi %[tmp], %[b], 1\n\t"
"beqz %[tmp], 1f\n\t"
"add %[result], %[result], %[a]\n\t"
"1:\n\t"
/* b >>= 1, a <<= 1, repeat while b != 0 (branches back to `1:` above) */
"srli %[b], %[b], 1\n\t"
"slli %[a], %[a], 1\n\t"
"bnez %[b], 1b"
/* outputs: all four named operands refer to `result` */
: [result] "=r"(result), [tmp] "=&r"(result), [a] "+r"(result), [b] "+r"(result)
/* no input-only operands */
:
/* clobbers: a0-a3 are declared clobbered although the template only uses
   the named operand placeholders */
: "a0", "a1", "a2", "a3"
);
return result;
}
/*
 * Matrix multiply with the j and k loops unrolled, using ass_mul() for every
 * product instead of the `*` operator:
 *     result[i*SIZE + j] = sum over k of ass_mul(A[i*SIZE + k], B[k*SIZE + j])
 * All sixteen B values are read once before the row loop. Four columns and
 * four terms are written out, so this only covers the whole product when
 * SIZE is 4.
 * Placed in the .mprjram section. Returns the result array.
 */
int* __attribute__((section(".mprjram"))) matmul()
{
	register int i;
	register int a1, a2, a3, a4;
	register int b0_1, b0_2, b0_3, b0_4;   /* column 0 of B */
	register int b1_1, b1_2, b1_3, b1_4;   /* column 1 of B */
	register int b2_1, b2_2, b2_3, b2_4;   /* column 2 of B */
	register int b3_1, b3_2, b3_3, b3_4;   /* column 3 of B */

	b0_1 = B[0];
	b0_2 = B[0 + SIZE];
	b0_3 = B[0 + 2*SIZE];
	b0_4 = B[0 + 3*SIZE];

	b1_1 = B[1];
	b1_2 = B[1 + SIZE];
	b1_3 = B[1 + 2*SIZE];
	b1_4 = B[1 + 3*SIZE];

	b2_1 = B[2];
	b2_2 = B[2 + SIZE];
	b2_3 = B[2 + 2*SIZE];
	b2_4 = B[2 + 3*SIZE];

	b3_1 = B[3];
	b3_2 = B[3 + SIZE];
	b3_3 = B[3 + 2*SIZE];
	b3_4 = B[3 + 3*SIZE];

	for (i = 0; i < SIZE; i++) {
		/* Row i of A. */
		a1 = A[i*SIZE];
		a2 = A[i*SIZE + 1];
		a3 = A[i*SIZE + 2];
		a4 = A[i*SIZE + 3];

		result[(i*SIZE) + 0] = ass_mul(a1, b0_1) + ass_mul(a2, b0_2) + ass_mul(a3, b0_3) + ass_mul(a4, b0_4);
		result[(i*SIZE) + 1] = ass_mul(a1, b1_1) + ass_mul(a2, b1_2) + ass_mul(a3, b1_3) + ass_mul(a4, b1_4);
		result[(i*SIZE) + 2] = ass_mul(a1, b2_1) + ass_mul(a2, b2_2) + ass_mul(a3, b2_3) + ass_mul(a4, b2_4);
		result[(i*SIZE) + 3] = ass_mul(a1, b3_1) + ass_mul(a2, b3_2) + ass_mul(a3, b3_3) + ass_mul(a4, b3_4);
	}
	return result;
}
```
* modified assembly code
```assembly=
# Matrix operands and output buffer for matmul.
# Each matrix is 16 words (64 bytes); presumably 4x4, row-major.
.section .data
.globl A
# A: four identical lines of 0, 1, 2, 3.
A:
.word 0, 1, 2, 3
.word 0, 1, 2, 3
.word 0, 1, 2, 3
.word 0, 1, 2, 3
.globl B
# B: the values 1..16 in ascending order.
B:
.word 1, 2, 3, 4
.word 5, 6, 7, 8
.word 9, 10, 11, 12
.word 13, 14, 15, 16
.globl result
# result: 64 bytes = 16 words, written by matmul.
result:
.space 64 # Reserve space for result matrix
# "ax" = allocatable + executable. Without explicit flags the assembler gives
# a section with an unrecognized name no attributes at all, so the code
# would not be allocated in memory.
.section .mprjram, "ax"
.globl matmul

#-----------------------------------------------------------------------
# ass_mul: shift-and-add multiply
#   In:    a1 = multiplier (consumed bit by bit, ends at 0)
#          a2 = multiplicand (shifted left each step)
#   Out:   a0 = a1 * a2, truncated to the register width
#   Clobb: a1, a2, a3
#-----------------------------------------------------------------------
ass_mul:
    li      a0, 0                   # product = 0
.Lmul_test_bit:
    andi    a3, a1, 1               # lowest multiplier bit set?
    beqz    a3, .Lmul_next_bit
    add     a0, a0, a2              # yes: add the shifted multiplicand
.Lmul_next_bit:
    srli    a1, a1, 1               # next multiplier bit
    slli    a2, a2, 1               # multiplicand *= 2
    bnez    a1, .Lmul_test_bit      # stop once no multiplier bits remain
    ret
#-----------------------------------------------------------------------
# matmul: result = A * B for 16-word matrices (4 rows of 4 words each).
#   Out:   a0 = address of result
#   Calls: ass_mul (a1 = B element, a2 = A element, product in a0)
#   Frame: 112 bytes
#          0..44(sp)    words 1,5,9,13 / 2,6,10,14 / 3,7,11,15 of B
#          60..108(sp)  saved s11..s0 and ra
#   Regs:  s11,s10,s9,s8 = words 0,4,8,12 of B
#          s0 = current row of A      s1 = current row of result
#          s2 = A + 64, the loop bound (one past the last row of A)
#          s7,s6,s5,s4 = the four words of the current A row
#          s3 = running sum for the current result word
#-----------------------------------------------------------------------
matmul:
    addi    sp,sp,-112
    sw      ra,108(sp)
    sw      s0,104(sp)
    sw      s1,100(sp)
    sw      s2,96(sp)
    sw      s3,92(sp)
    sw      s4,88(sp)
    sw      s5,84(sp)
    sw      s6,80(sp)
    sw      s7,76(sp)
    sw      s8,72(sp)
    sw      s9,68(sp)
    sw      s10,64(sp)
    sw      s11,60(sp)

    # Words 0,4,8,12 of B stay in registers for the whole function.
    la      s2, B
    lw      s11,0(s2)
    lw      s10,16(s2)
    lw      s9,32(s2)
    lw      s8,48(s2)

    # The other twelve words of B are copied into the stack frame, grouped
    # four at a time in the order they are consumed below.
    lw      a5,4(s2)
    sw      a5,0(sp)
    lw      a5,20(s2)
    sw      a5,4(sp)
    lw      a5,36(s2)
    sw      a5,8(sp)
    lw      a5,52(s2)
    sw      a5,12(sp)
    lw      a5,8(s2)
    sw      a5,16(sp)
    lw      a5,24(s2)
    sw      a5,20(sp)
    lw      a5,40(s2)
    sw      a5,24(sp)
    lw      a5,56(s2)
    sw      a5,28(sp)
    lw      a5,12(s2)
    sw      a5,32(sp)
    lw      a5,28(s2)
    sw      a5,36(sp)
    lw      a5,44(s2)
    sw      a5,40(sp)
    lw      a5,60(s2)
    sw      a5,44(sp)

    la      s0, A
    la      s1, result
    # Loop bound is fixed once, relative to A itself: 4 rows * 16 bytes.
    # It must not be recomputed inside the loop, or s0 never reaches it.
    addi    s2,s0,64

main_loop:
    # Load the current row of A.
    lw      s7,0(s0)
    lw      s6,4(s0)
    lw      s5,8(s0)
    lw      s4,12(s0)

    # result word 0 of this row
    mv      a1,s11
    mv      a2,s7
    jal     ass_mul
    mv      s3,a0
    mv      a1,s10
    mv      a2,s6
    jal     ass_mul
    add     s3,s3,a0
    mv      a1,s9
    mv      a2,s5
    jal     ass_mul
    add     s3,s3,a0
    mv      a1,s8
    mv      a2,s4
    jal     ass_mul
    add     s3,s3,a0
    sw      s3,0(s1)

    # result word 1 of this row
    lw      a1,0(sp)
    mv      a2,s7
    jal     ass_mul
    mv      s3,a0
    lw      a1,4(sp)
    mv      a2,s6
    jal     ass_mul
    add     s3,s3,a0
    lw      a1,8(sp)
    mv      a2,s5
    jal     ass_mul
    add     s3,s3,a0
    lw      a1,12(sp)
    mv      a2,s4
    jal     ass_mul
    add     s3,s3,a0
    sw      s3,4(s1)

    # result word 2 of this row
    lw      a1,16(sp)
    mv      a2,s7
    jal     ass_mul
    mv      s3,a0
    lw      a1,20(sp)
    mv      a2,s6
    jal     ass_mul
    add     s3,s3,a0
    lw      a1,24(sp)
    mv      a2,s5
    jal     ass_mul
    add     s3,s3,a0
    lw      a1,28(sp)
    mv      a2,s4
    jal     ass_mul
    add     s3,s3,a0
    sw      s3,8(s1)

    # result word 3 of this row
    lw      a1,32(sp)
    mv      a2,s7
    jal     ass_mul
    mv      s3,a0
    lw      a1,36(sp)
    mv      a2,s6
    jal     ass_mul
    add     s3,s3,a0
    lw      a1,40(sp)
    mv      a2,s5
    jal     ass_mul
    add     s3,s3,a0
    lw      a1,44(sp)
    mv      a2,s4
    jal     ass_mul
    add     s3,s3,a0
    sw      s3,12(s1)

    # Advance to the next row of A and of result.
    addi    s0,s0,16
    addi    s1,s1,16
    bne     s0,s2, main_loop

    la      a0, result
    lw      ra,108(sp)
    lw      s0,104(sp)
    lw      s1,100(sp)
    lw      s2,96(sp)
    lw      s3,92(sp)
    lw      s4,88(sp)
    lw      s5,84(sp)
    lw      s6,80(sp)
    lw      s7,76(sp)
    lw      s8,72(sp)
    lw      s9,68(sp)
    lw      s10,64(sp)
    lw      s11,60(sp)
    addi    sp,sp,112
    ret
```
* sdram refresh
- If the SDRAM is not performing a read or write operation, it needs to enter the refresh state, which is state 7 in the following waveform.


In the above example, because *refresh_ctrl_d* has reached 750, the state machine first enters the precharge state and then enters the refresh state.
```assembly=
.data 0x0000000038000300 0xc0 load address 0x0000000010000358
0x0000000038000300 . = ALIGN (0x8)
0x0000000038000300 _fdata = .
*(.data .data.* .gnu.linkonce.d.*)
.data 0x0000000038000300 0x0 counter_la.elf-crt0_vex.o
.data 0x0000000038000300 0x0 counter_la.elf-isr.o
.data 0x0000000038000300 0x0 counter_la.elf-counter_la.o
.data 0x0000000038000300 0xc0 counter_la.elf-matmul.o
0x0000000038000300 A
0x0000000038000340 B
0x0000000038000380 result
*(.data1)
0x00000000380003c0 _gp = ALIGN (0x10)
*(.sdata .sdata.* .gnu.linkonce.s.*)
0x00000000380003c0 . = ALIGN (0x8)
0x00000000380003c0 _edata = .
.bss 0x00000000380003c0 0x8 load address 0x0000000010000418
0x00000000380003c0 . = ALIGN (0x8)
0x00000000380003c0 _fbss = .
*(.dynsbss)
*(.sbss .sbss.* .gnu.linkonce.sb.*)
.sbss 0x00000000380003c0 0x2 counter_la.elf-isr.o
0x00000000380003c0 flag
*(.scommon)
*(.dynbss)
*(.bss .bss.* .gnu.linkonce.b.*)
.bss 0x00000000380003c2 0x0 counter_la.elf-crt0_vex.o
.bss 0x00000000380003c2 0x0 counter_la.elf-isr.o
.bss 0x00000000380003c2 0x0 counter_la.elf-counter_la.o
.bss 0x00000000380003c2 0x0 counter_la.elf-matmul.o
*(COMMON)
0x00000000380003c8 . = ALIGN (0x8)
*fill* 0x00000000380003c2 0x6
0x00000000380003c8 _ebss = .
0x00000000380003c8 _end = .
.mprjram 0x0000000038000000 0x258 load address 0x0000000010000418
0x0000000038000000 . = ALIGN (0x8)
0x0000000038000000 _fsram = .
```
```assembly=
Disassembly of section .data:
38000300 <B>:
38000300: 0001 .2byte 0x1
38000302: 0000 .2byte 0x0
38000304: 0002 .2byte 0x2
38000306: 0000 .2byte 0x0
38000308: 00000003 lb zero,0(zero) # 0 <__DYNAMIC>
3800030c: 0004 .2byte 0x4
3800030e: 0000 .2byte 0x0
38000310: 0005 .2byte 0x5
38000312: 0000 .2byte 0x0
38000314: 0006 .2byte 0x6
38000316: 0000 .2byte 0x0
38000318: 00000007 .4byte 0x7
3800031c: 0008 .2byte 0x8
3800031e: 0000 .2byte 0x0
38000320: 0009 .2byte 0x9
38000322: 0000 .2byte 0x0
38000324: 000a .2byte 0xa
38000326: 0000 .2byte 0x0
38000328: 0000000b .4byte 0xb
3800032c: 000c .2byte 0xc
3800032e: 0000 .2byte 0x0
38000330: 000d .2byte 0xd
38000332: 0000 .2byte 0x0
38000334: 000e .2byte 0xe
38000336: 0000 .2byte 0x0
38000338: 0000000f fence unknown,unknown
3800033c: 0010 .2byte 0x10
38000340 <A>:
38000340: 0000 .2byte 0x0
38000342: 0000 .2byte 0x0
38000344: 0001 .2byte 0x1
38000346: 0000 .2byte 0x0
38000348: 0002 .2byte 0x2
3800034a: 0000 .2byte 0x0
3800034c: 00000003 lb zero,0(zero) # 0 <__DYNAMIC>
38000350: 0000 .2byte 0x0
38000352: 0000 .2byte 0x0
38000354: 0001 .2byte 0x1
38000356: 0000 .2byte 0x0
38000358: 0002 .2byte 0x2
3800035a: 0000 .2byte 0x0
3800035c: 00000003 lb zero,0(zero) # 0 <__DYNAMIC>
38000360: 0000 .2byte 0x0
38000362: 0000 .2byte 0x0
38000364: 0001 .2byte 0x1
38000366: 0000 .2byte 0x0
38000368: 0002 .2byte 0x2
3800036a: 0000 .2byte 0x0
3800036c: 00000003 lb zero,0(zero) # 0 <__DYNAMIC>
38000370: 0000 .2byte 0x0
38000372: 0000 .2byte 0x0
38000374: 0001 .2byte 0x1
38000376: 0000 .2byte 0x0
38000378: 0002 .2byte 0x2
3800037a: 0000 .2byte 0x0
3800037c: 00000003 lb zero,0(zero) # 0 <__DYNAMIC>
Disassembly of section .bss:
...
Disassembly of section .bss:
38000380 <flag>:
38000380: 0000 .2byte 0x0
...
38000384 <result>:
...
Disassembly of section .mprjram:
38000000 <__mulsi3>:
38000000: 00050613 mv a2,a0
38000004: 00000513 li a0,0
38000008: 0015f693 andi a3,a1,1
3800000c: 00068463 beqz a3,38000014 <__mulsi3+0x14>
38000010: 00c50533 add a0,a0,a2
38000014: 0015d593 srli a1,a1,0x1
38000018: 00161613 slli a2,a2,0x1
3800001c: fe0596e3 bnez a1,38000008 <__mulsi3+0x8>
38000020: 00008067 ret
38000024 <matmul>:
38000024: f9010113 addi sp,sp,-112
38000028: 06112623 sw ra,108(sp)
3800002c: 06812423 sw s0,104(sp)
38000030: 06912223 sw s1,100(sp)
38000034: 07212023 sw s2,96(sp)
38000038: 05312e23 sw s3,92(sp)
3800003c: 05412c23 sw s4,88(sp)
38000040: 05512a23 sw s5,84(sp)
38000044: 05612823 sw s6,80(sp)
38000048: 05712623 sw s7,76(sp)
3800004c: 05812423 sw s8,72(sp)
38000050: 05912223 sw s9,68(sp)
38000054: 05a12023 sw s10,64(sp)
38000058: 03b12e23 sw s11,60(sp)
3800005c: 38000937 lui s2,0x38000
38000060: 30090913 addi s2,s2,768 # 38000300 <B>
38000064: 00092d83 lw s11,0(s2)
38000068: 01092d03 lw s10,16(s2)
3800006c: 02092c83 lw s9,32(s2)
38000070: 03092c03 lw s8,48(s2)
38000074: 00492783 lw a5,4(s2)
38000078: 00f12023 sw a5,0(sp)
3800007c: 01492783 lw a5,20(s2)
38000080: 00f12223 sw a5,4(sp)
38000084: 02492783 lw a5,36(s2)
38000088: 00f12423 sw a5,8(sp)
3800008c: 03492783 lw a5,52(s2)
38000090: 00f12623 sw a5,12(sp)
38000094: 00892783 lw a5,8(s2)
38000098: 00f12823 sw a5,16(sp)
3800009c: 01892783 lw a5,24(s2)
380000a0: 00f12a23 sw a5,20(sp)
380000a4: 02892783 lw a5,40(s2)
380000a8: 00f12c23 sw a5,24(sp)
380000ac: 03892783 lw a5,56(s2)
380000b0: 00f12e23 sw a5,28(sp)
380000b4: 00c92783 lw a5,12(s2)
380000b8: 02f12023 sw a5,32(sp)
380000bc: 01c92783 lw a5,28(s2)
380000c0: 02f12223 sw a5,36(sp)
380000c4: 02c92783 lw a5,44(s2)
380000c8: 02f12423 sw a5,40(sp)
380000cc: 03c92783 lw a5,60(s2)
380000d0: 02f12623 sw a5,44(sp)
380000d4: 04090413 addi s0,s2,64
380000d8: 380004b7 lui s1,0x38000
380000dc: 38448493 addi s1,s1,900 # 38000384 <result>
380000e0: 08090913 addi s2,s2,128
380000e4: 00042b83 lw s7,0(s0)
380000e8: 00442b03 lw s6,4(s0)
380000ec: 00842a83 lw s5,8(s0)
380000f0: 00c42a03 lw s4,12(s0)
380000f4: 000b8593 mv a1,s7
380000f8: 000d8513 mv a0,s11
380000fc: f05ff0ef jal ra,38000000 <__mulsi3>
38000100: 00050993 mv s3,a0
38000104: 000b0593 mv a1,s6
38000108: 000d0513 mv a0,s10
3800010c: ef5ff0ef jal ra,38000000 <__mulsi3>
38000110: 00a989b3 add s3,s3,a0
38000114: 000a8593 mv a1,s5
38000118: 000c8513 mv a0,s9
3800011c: ee5ff0ef jal ra,38000000 <__mulsi3>
38000120: 00a989b3 add s3,s3,a0
38000124: 000a0593 mv a1,s4
38000128: 000c0513 mv a0,s8
3800012c: ed5ff0ef jal ra,38000000 <__mulsi3>
38000130: 00a989b3 add s3,s3,a0
38000134: 0134a023 sw s3,0(s1)
38000138: 000b8593 mv a1,s7
3800013c: 00012503 lw a0,0(sp)
38000140: ec1ff0ef jal ra,38000000 <__mulsi3>
38000144: 00050993 mv s3,a0
38000148: 000b0593 mv a1,s6
```
#### Code trace for wbs_dat_i









### Successful assembly code for matrix multiplication
```assembly=
# Matrix operands and output buffer for matmul; each is 16 words (64 bytes).
# NOTE(review): the flags "ax" make this .data section allocatable and
# executable but not writable, although matmul stores into `result`.
# Confirm this is what the linker script expects.
.section .data, "ax"
.globl A
# A: four identical lines of 0, 1, 2, 3.
A:
.word 0, 1, 2, 3
.word 0, 1, 2, 3
.word 0, 1, 2, 3
.word 0, 1, 2, 3
.globl B
# B: the values 1..16 stored in steps of 4 per line (1,5,9,13 / 2,6,10,14 / ...).
# The comments in matmul label byte offsets 4, 8, 12 as B[4], B[8], B[12],
# i.e. the matrix is stored transposed relative to that numbering.
B:
.word 1, 5, 9, 13
.word 2, 6, 10, 14
.word 3, 7, 11, 15
.word 4, 8, 12, 16
.globl result
# result: 64 bytes = 16 words, written by matmul.
result:
.space 64 # Reserve space for result matrix
.section .mprjram, "ax"
.globl matmul
#-----------------------------------------------------------------------
# ass_mul: shift-and-add multiply-accumulate
#   In:    a0 = starting sum (this routine does not clear it; the caller
#               loads 0 before each call)
#          a1 = multiplier (consumed bit by bit, ends at 0)
#          a2 = multiplicand (shifted left each step)
#   Out:   a0 = a0 + a1 * a2, truncated to the register width
#   Clobb: a1, a2, a3
#-----------------------------------------------------------------------
ass_mul:
.Lmul_test_bit:
    andi    a3, a1, 1               # lowest multiplier bit set?
    beqz    a3, .Lmul_next_bit
    add     a0, a0, a2              # yes: add the shifted multiplicand
.Lmul_next_bit:
    srli    a1, a1, 1               # next multiplier bit
    slli    a2, a2, 1               # multiplicand *= 2
    bnez    a1, .Lmul_test_bit      # stop once no multiplier bits remain
    ret
#-----------------------------------------------------------------------
# matmul: result = A * B for 16-word matrices (4 rows of 4 words each).
#   Out:   a0 = address of result
#   Calls: ass_mul (a0 = 0 on entry, a1 = B element, a2 = A element)
#   Frame: 80 bytes; saved registers at 0..28(sp) and 60..76(sp)
#   Regs:  s11 = base of B          s10,s9,s8 = B words at offsets 4,8,12
#          s0  = current row of A   s1 = current row of result
#          s2  = A + 64, the loop bound (one past the last row of A)
#          s7,s6,s5,s4 = the four words of the current A row
#          s3  = running sum for the current result word
#   B[n] in the comments is the element number used by the original notes;
#   B is stored transposed, so B[n] sits at the byte offset shown in each lw.
#-----------------------------------------------------------------------
matmul:
    addi    sp,sp,-80
    sw      ra,12(sp)
    sw      s0,8(sp)
    sw      s1,4(sp)
    sw      s2,0(sp)
    sw      s3,28(sp)
    sw      s4,24(sp)
    sw      s5,20(sp)
    sw      s6,16(sp)
    sw      s7,76(sp)
    sw      s8,72(sp)
    sw      s9,68(sp)
    sw      s10,64(sp)
    sw      s11,60(sp)

    la      s11, B
    lw      s10,4(s11)              #B[4]
    lw      s9,8(s11)               #B[8]
    lw      s8,12(s11)              #B[12]
    la      s0, A
    la      s1, result
    addi    s2,s0,64                # loop bound: 4 rows * 16 bytes past A

main_loop:
    lw      a1,0(s11)               #B[0]
    lw      s6,4(s0)                #A[1]
    lw      s5,8(s0)                #A[2]
    lw      s4,12(s0)               #A[3]
    lw      s7,0(s0)                #A[0]

    # C[0] = A[0]*B[0] + A[1]*B[4] + A[2]*B[8] + A[3]*B[12]
    mv      a2,s7                   #A[0]
    li      a0, 0
    jal     ra, ass_mul
    mv      s3,a0                   #s3 = A[0]*B[0]
    mv      a1,s10                  #B[4]
    mv      a2,s6                   #A[1]
    li      a0, 0
    jal     ra, ass_mul
    add     s3,s3,a0
    mv      a1,s9                   #B[8]
    mv      a2,s5                   #A[2]
    li      a0, 0
    jal     ra, ass_mul
    add     s3,s3,a0
    mv      a1,s8                   #B[12]
    mv      a2,s4                   #A[3]
    li      a0, 0
    jal     ra, ass_mul
    add     s3,s3,a0
    sw      s3,0(s1)                #C[0] = s3

    # C[1] = A[0]*B[1] + A[1]*B[5] + A[2]*B[9] + A[3]*B[13]
    lw      a1,16(s11)              #B[1]
    mv      a2,s7                   #A[0]
    li      a0, 0
    jal     ra, ass_mul
    mv      s3,a0
    lw      a1,20(s11)              #B[5]
    mv      a2,s6
    li      a0, 0
    jal     ra, ass_mul
    add     s3,s3,a0
    lw      a1,24(s11)              #B[9]
    mv      a2,s5
    li      a0, 0
    jal     ra, ass_mul
    add     s3,s3,a0
    lw      a1,28(s11)              #B[13]
    mv      a2,s4
    li      a0, 0
    jal     ra, ass_mul
    add     s3,s3,a0
    sw      s3,4(s1)                #C[1]

    # C[2] = A[0]*B[2] + A[1]*B[6] + A[2]*B[10] + A[3]*B[14]
    # B[2] must come from the B table (s11), like every other element.
    # Loading it from 32(sp) reads a stack slot nothing has written; that
    # only went unnoticed because A[0] is 0 in the test data, which makes
    # this product 0 whatever a1 holds.
    lw      a1,32(s11)              #B[2]
    mv      a2,s7                   #A[0]
    li      a0, 0
    jal     ra, ass_mul
    mv      s3,a0                   #A[0]*B[2]
    lw      a1,36(s11)              #B[6]
    mv      a2,s6
    li      a0, 0
    jal     ra, ass_mul
    add     s3,s3,a0
    lw      a1,40(s11)              #B[10]
    mv      a2,s5
    li      a0, 0
    jal     ra, ass_mul
    add     s3,s3,a0
    lw      a1,44(s11)              #B[14]
    mv      a2,s4
    li      a0, 0
    jal     ra, ass_mul
    add     s3,s3,a0
    sw      s3,8(s1)                #C[2]

    # C[3] = A[0]*B[3] + A[1]*B[7] + A[2]*B[11] + A[3]*B[15]
    lw      a1,48(s11)              #B[3]
    mv      a2,s7                   #A[0]
    li      a0, 0
    jal     ra, ass_mul
    mv      s3,a0                   #B[3]*A[0]
    lw      a1,52(s11)              #B[7]
    mv      a2,s6
    li      a0, 0
    jal     ra, ass_mul
    add     s3,s3,a0
    lw      a1,56(s11)              #B[11]
    mv      a2,s5
    li      a0, 0
    jal     ra, ass_mul
    add     s3,s3,a0
    lw      a1,60(s11)              #B[15]
    mv      a2,s4
    li      a0, 0
    jal     ra, ass_mul
    add     s3,s3,a0
    sw      s3,12(s1)               #C[3]

    addi    s0,s0,16                #For A
    addi    s1,s1,16                #For C
    bne     s0,s2, main_loop

    la      a0,result
    lw      ra,12(sp)
    lw      s0,8(sp)
    lw      s1,4(sp)
    lw      s2,0(sp)
    lw      s3,28(sp)
    lw      s4,24(sp)
    lw      s5,20(sp)
    lw      s6,16(sp)
    lw      s7,76(sp)
    lw      s8,72(sp)
    lw      s9,68(sp)
    lw      s10,64(sp)
    lw      s11,60(sp)
    addi    sp,sp,80
    ret
```