contributed by 陳金諄 倪英智
Here is the whole source.
FemtoRV32 is a collection of minimalistic RISC-V RV32 cores that are easy to read and understand.
It is offered in several processor variants: quark (RV32I), electron (RV32IM), intermissum (RV32IM + irq), and gracilis (RV32IMC + irq).
In our project, we use quark as our core.
Here is the original femtorv32_quark.v.
femtorv32_quark implements only RV32I, but we need RV32IM compatibility, so we rewrite parts of the code below.
// Firmware generation flags for this processor.
// NRV_ARCH was "rv32i" in the original femtorv32_quark.v; it is replaced by
// "rv32im" here. The note is kept as a comment on its own line so that it
// does not become part of the macro text.
`define NRV_ARCH "rv32im"
`define NRV_ABI "ilp32"
`define NRV_OPTIMIZE "-Os"
// FemtoRV32 core: port list and parameters.
// One memory interface serves both instruction fetch and data access:
// mem_addr carries either PC or the load/store address.
module FemtoRV32(
input clk,
output [31:0] mem_addr, // address bus
output [31:0] mem_wdata, // data to be written
output [3:0] mem_wmask, // write mask for the 4 bytes of each word
input [31:0] mem_rdata, // input lines for both data and instr
output mem_rstrb, // active to initiate memory read (used by IO)
input mem_rbusy, // asserted if memory is busy reading value
input mem_wbusy, // asserted if memory is busy writing value
input reset // set to 0 to reset the processor
);
// Value loaded into PC while reset is low (truncated to ADDR_WIDTH bits).
parameter RESET_ADDR = 32'h00000000;
// Width of PC and of the address adders (PCplus4, PCplusImm, loadstore_addr).
parameter ADDR_WIDTH = 24;
/***************************************************************************/
// Instruction decoding.
/***************************************************************************/
// Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
// Reference: Table page 104 of:
// https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
// All fields are taken from the latched instruction register instr.
// The destination register
wire [4:0] rdId = instr[11:7];
// The ALU function, decoded in 1-hot form (doing so reduces LUT count)
// It is used as follows: funct3Is[val] <=> funct3 == val
(* onehot *)
wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
// The five immediate formats, see RiscV reference (link above), Fig. 2.4 p. 12
// Uimm: upper 20 bits of the instruction, low 12 bits cleared.
wire [31:0] Uimm = { instr[31], instr[30:12], {12{1'b0}}};
// Iimm: instr[31:20], sign-extended with instr[31].
wire [31:0] Iimm = {{21{instr[31]}}, instr[30:20]};
/* verilator lint_off UNUSED */ // MSBs of SBJimms are not used by addr adder.
// Simm, Bimm and Jimm are sign-extended with instr[31];
// Bimm and Jimm always have bit 0 cleared.
wire [31:0] Simm = {{21{instr[31]}}, instr[30:25],instr[11:7]};
wire [31:0] Bimm = {{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
wire [31:0] Jimm = {{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
/* verilator lint_on UNUSED */
// Base RISC-V (RV32I) has only 10 different instructions !
// Only instr[6:2] is compared: instr[1:0] is not stored in instr.
wire isLoad = (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
wire isALUimm = (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
wire isAUIPC = (instr[6:2] == 5'b00101); // rd <- PC + Uimm
wire isStore = (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
wire isALUreg = (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
wire isLUI = (instr[6:2] == 5'b01101); // rd <- Uimm
wire isBranch = (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
wire isJALR = (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
// Among the ten opcode patterns listed here, JAL is the only one with
// instr[3] set, so testing that single bit replaces the full comparison.
// Any other opcode with instr[3] set would also be decoded as JAL.
wire isJAL = instr[3]; // (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
wire isSYSTEM = (instr[6:2] == 5'b11100); // rd <- cycles
// Either form of ALU instruction (register-immediate or register-register).
wire isALU = isALUimm | isALUreg;
/***************************************************************************/
// Register file: 32 words of 32 bits, plus the two latched source operands.
/***************************************************************************/
reg [31:0] registerFile [31:0];
reg [31:0] rs1; // latched value of the first source register
reg [31:0] rs2; // latched value of the second source register
// Synchronous write port. Writes whose destination is register 0 are
// dropped, so register 0 is never modified by this block.
always @(posedge clk) begin
   if (writeBack && (rdId != 0)) begin
      registerFile[rdId] <= writeBackData;
   end
end
/***************************************************************************/
// The ALU. Does operations and tests combinatorially, except divisions,
// which take several clock cycles (aluBusy is set while one is in progress).
/***************************************************************************/
// First ALU source, always rs1
wire [31:0] aluIn1 = rs1;
// Second ALU source, depends on opcode:
// ALUreg, Branch: rs2
// ALUimm, Load, JALR: Iimm
wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
delete 1. start
The declarations below belonged to the iterative left/right shifter. RV32IM adds multiply and divide, which we group together with the shifts into one block, so these declarations are removed.
// reg [31:0] aluReg; // The internal register of the ALU, used by shift.
// reg [4:0] aluShamt; // Current shift amount.
// wire aluBusy = |aluShamt; // ALU is busy if shift amount is non-zero.
delete 1. end
wire aluWr; // ALU write strobe; together with isDivide it loads the divider registers.
// The adder is used by both arithmetic instructions and JALR.
wire [31:0] aluPlus = aluIn1 + aluIn2;
// Use a single 33 bits subtract to do subtraction and all comparisons
// (trick borrowed from swapforth/J1)
// aluMinus = aluIn1 - aluIn2 computed on 33 bits (two's complement of
// {1'b0, aluIn2} added to {1'b0, aluIn1}); bit 32 is the borrow.
wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
// Signed less-than: if the sign bits differ, the negative operand is the
// smaller one; otherwise the borrow of the subtraction decides.
wire LT = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
// Unsigned less-than: the borrow of the 33-bit subtraction.
wire LTU = aluMinus[32];
// Equality: the 32-bit difference is zero.
wire EQ = (aluMinus[31:0] == 0);
Add 1. start
// This part prepares the inputs for: 1. shift, 2. multiply, 3. divide, 4. remainder.
// Here is a tip on how the shifter works:
// First, funct3Is[1] decides whether shifter_in must be bit-reversed (left shift) or not (right shift).
// Second, one bit (instr[30] & aluIn1[31]) is concatenated in front of shifter_in as a sign bit, so that an arithmetic right shift keeps the sign of the operand.
// Last, for a left shift, the shifter output is bit-reversed again to obtain leftshift.
// Question: how is the concatenated bit handled for a left shift, given that the reversal moves the original MSB to the LSB? (For SLL/SLLI, instr[30] is 0, so the concatenated bit is 0 and zeros are shifted in.)
/***************************************************************************/
// Use the same shifter both for left and right shifts by
// applying bit reversal
// shifter_in: aluIn1 bit-reversed when funct3 == 1 (left shift),
// aluIn1 unchanged otherwise.
wire [31:0] shifter_in = funct3Is[1] ?
{aluIn1[ 0], aluIn1[ 1], aluIn1[ 2], aluIn1[ 3], aluIn1[ 4], aluIn1[ 5],
aluIn1[ 6], aluIn1[ 7], aluIn1[ 8], aluIn1[ 9], aluIn1[10], aluIn1[11],
aluIn1[12], aluIn1[13], aluIn1[14], aluIn1[15], aluIn1[16], aluIn1[17],
aluIn1[18], aluIn1[19], aluIn1[20], aluIn1[21], aluIn1[22], aluIn1[23],
aluIn1[24], aluIn1[25], aluIn1[26], aluIn1[27], aluIn1[28], aluIn1[29],
aluIn1[30], aluIn1[31]} : aluIn1;
/* verilator lint_off WIDTH */
// 33-bit arithmetic right shift by aluIn2[4:0]. The extra top bit
// (instr[30] & aluIn1[31]) is the value shifted in from the left;
// the result is truncated to 32 bits.
wire [31:0] shifter =
$signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
/* verilator lint_on WIDTH */
// Bit-reversed shifter output: the left-shift result when the input
// was bit-reversed (funct3 == 1).
wire [31:0] leftshift = {
shifter[ 0], shifter[ 1], shifter[ 2], shifter[ 3], shifter[ 4],
shifter[ 5], shifter[ 6], shifter[ 7], shifter[ 8], shifter[ 9],
shifter[10], shifter[11], shifter[12], shifter[13], shifter[14],
shifter[15], shifter[16], shifter[17], shifter[18], shifter[19],
shifter[20], shifter[21], shifter[22], shifter[23], shifter[24],
shifter[25], shifter[26], shifter[27], shifter[28], shifter[29],
shifter[30], shifter[31]};
/***************************************************************************/
// Multiply/divide decoding and the multiplier.
// funcM (instr[25]) selects the mul/div result instead of the base ALU
// result for ALUreg instructions.
wire funcM = instr[25];
// Divisions and remainders are the mul/div instructions with funct3[2] set.
wire isDivide = isALUreg & funcM & instr[14]; // |funct3Is[7:4];
wire aluBusy = |quotient_msk; // ALU is busy if division is in progress.
// funct3: 1->MULH, 2->MULHSU 3->MULHU
wire isMULH = funct3Is[1];
wire isMULHSU = funct3Is[2];
// Operand signedness for the high-word multiplications:
//   MULH   : signed rs1   x signed rs2
//   MULHSU : signed rs1   x unsigned rs2
//   MULHU  : unsigned rs1 x unsigned rs2
// So rs1 (aluIn1) is signed for MULH and MULHSU, rs2 (aluIn2) only for MULH.
wire sign1 = aluIn1[31] & (isMULH | isMULHSU);
wire sign2 = aluIn2[31] & isMULH;
// Both operands are extended to 33 bits with their effective sign bit, so
// that a single signed multiplier covers all three variants.
wire signed [32:0] signed1 = {sign1, aluIn1};
wire signed [32:0] signed2 = {sign2, aluIn2};
wire signed [63:0] multiply = signed1 * signed2;
Add 1. end
/***************************************************************************/
// Notes:
// - instr[30] is 1 for SUB and 0 for ADD
// - for SUB, need to test also instr[5] to discriminate ADDI:
// (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
// - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
// Result of the base (RV32I) ALU operations. funct3Is is one-hot, so exactly
// one term of the OR is non-zero-selected.
wire [31:0] aluOut_base =
   (funct3Is[0] ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
   (funct3Is[1] ? leftshift : 32'b0) |
   (funct3Is[2] ? {31'b0, LT} : 32'b0) |
   (funct3Is[3] ? {31'b0, LTU} : 32'b0) |
   (funct3Is[4] ? aluIn1 ^ aluIn2 : 32'b0) |
   // Add 2. start
   // The right shift is added here, which merges the shifter into aluOut.
   (funct3Is[5] ? shifter : 32'b0) |
   // Add 2. end
   (funct3Is[6] ? aluIn1 | aluIn2 : 32'b0) |
   (funct3Is[7] ? aluIn1 & aluIn2 : 32'b0) ;
Add 3. start
Implement multiply, divide and remainder.
// Result of the multiply/divide instructions, selected by funct3:
// low or high word of the product, or the (possibly negated) divider result.
wire [31:0] aluOut_muldiv =
( funct3Is[0] ? multiply[31: 0] : 32'b0) | // 0:MUL
( |funct3Is[3:1] ? multiply[63:32] : 32'b0) | // 1:MULH, 2:MULHSU, 3:MULHU
( instr[14] ? div_sign ? -divResult : divResult : 32'b0) ;
// 4:DIV, 5:DIVU, 6:REM, 7:REMU
// Final ALU output: mul/div result for ALUreg instructions with funcM set,
// base ALU result otherwise.
wire [31:0] aluOut = isALUreg & funcM ? aluOut_muldiv : aluOut_base;
Implement the dividend, divisor and quotient registers.
/***************************************************************************/
// Implementation of DIV/REM instructions, highly inspired by PicoRV32
// The divider works on magnitudes, one quotient bit per clock cycle;
// the sign is applied afterwards through div_sign.
reg [31:0] dividend; // running remainder
reg [62:0] divisor; // divisor magnitude, starts shifted left by 31 bits
reg [31:0] quotient; // quotient bits accumulated so far
reg [31:0] quotient_msk; // one-hot mask of the quotient bit being decided
// A step subtracts when the shifted divisor fits into the running remainder.
wire divstep_do = divisor <= {31'b0, dividend};
wire [31:0] dividendN = divstep_do ? dividend - divisor[31:0] : dividend;
wire [31:0] quotientN = divstep_do ? quotient | quotient_msk : quotient;
// Whether the result must be negated. Only for the signed forms
// (instr[12] == 0): the remainder (instr[13] == 1) takes the sign of
// aluIn1; the quotient is negative when the operand signs differ and
// aluIn2 is non-zero.
wire div_sign = ~instr[12] & (instr[13] ? aluIn1[31] :
(aluIn1[31] != aluIn2[31]) & |aluIn2);
always @(posedge clk) begin
// Start of a division: load the operand magnitudes and reset the quotient.
if (isDivide & aluWr) begin
dividend <= ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
divisor <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
quotient <= 0;
quotient_msk <= 1 << 31;
end else begin
// One division step per cycle; quotient_msk reaches 0 after 32 shifts,
// which clears aluBusy.
dividend <= dividendN;
divisor <= divisor >> 1;
quotient <= quotientN;
quotient_msk <= quotient_msk >> 1;
end
end
// Registered result: remainder when instr[13] is set, quotient otherwise.
reg [31:0] divResult;
always @(posedge clk) divResult <= instr[13] ? dividendN : quotientN;
Add 3. end
Delete 2. start
The part below was simplified and merged into block "Add 1.", so we remove it here.
// (funct3IsShift ? aluReg : 32'b0) ;
// wire funct3IsShift = funct3Is[1] | funct3Is[5];
// always @(posedge clk) begin
// if(aluWr) begin
// if (funct3IsShift) begin // SLL, SRA, SRL
// aluReg <= aluIn1;
// aluShamt <= aluIn2[4:0];
// end
// end
// `ifdef NRV_TWOLEVEL_SHIFTER
// else if(|aluShamt[4:2]) begin // Shift by 4
// aluShamt <= aluShamt - 4;
// aluReg <= funct3Is[1] ? aluReg << 4 :
// {{4{instr[30] & aluReg[31]}}, aluReg[31:4]};
// end else
// `endif
// // Compact form of:
// // funct3=001 -> SLL (aluReg <= aluReg << 1)
// // funct3=101 & instr[30] -> SRA (aluReg <= {aluReg[31], aluReg[31:1]})
// // funct3=101 & !instr[30] -> SRL (aluReg <= {1'b0, aluReg[31:1]})
// if (|aluShamt) begin
// aluShamt <= aluShamt - 1;
// aluReg <= funct3Is[1] ? aluReg << 1 : // SLL
// {instr[30] & aluReg[31], aluReg[31:1]}; // SRA,SRL
// end
// end
Delete 2. end
/***************************************************************************/
// Branch condition.
/***************************************************************************/
// funct3Is is one-hot, so masking it with the vector of comparison results
// and OR-reducing picks the comparison that belongs to the current funct3:
//   7:BGEU  6:BLTU  5:BGE  4:BLT  3,2:(none)  1:BNE  0:BEQ
wire predicate = |(funct3Is & {~LTU, LTU, ~LT, LT, 2'b00, ~EQ, EQ});
/***************************************************************************/
// Program counter and branch target computation.
/***************************************************************************/
reg [ADDR_WIDTH-1:0] PC; // The program counter.
reg [31:2] instr; // Latched instruction. Note that bits 0 and 1 are
// ignored (not used in RV32I base instr set).
// Address of the next sequential instruction.
wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
// An adder used to compute branch address, JAL address and AUIPC.
// branch->PC+Bimm AUIPC->PC+Uimm JAL->PC+Jimm
// Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
// Single opcode bits (instr[3], instr[4]) are tested instead of the full
// isJAL / isAUIPC comparisons.
wire [ADDR_WIDTH-1:0] PCplusImm = PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
instr[4] ? Uimm[ADDR_WIDTH-1:0] :
Bimm[ADDR_WIDTH-1:0] );
// A separate adder to compute the destination of load/store.
// testing instr[5] is equivalent to testing isStore in this context.
wire [ADDR_WIDTH-1:0] loadstore_addr = rs1[ADDR_WIDTH-1:0] +
(instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
/* verilator lint_off WIDTH */
// internal address registers and cycles counter may have less than
// 32 bits, so we deactivate width test for mem_addr and writeBackData
// The address bus carries PC while fetching (FETCH_INSTR / WAIT_INSTR)
// and the load/store address in the other states.
assign mem_addr = state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
PC : loadstore_addr ;
Add 4. start
Cycle counter: `cycles` is now 64 bits wide.
/***************************************************************************/
// Cycle Counter.
/***************************************************************************/
// Free-running 64-bit counter, incremented on every rising clock edge.
// It is not affected by reset; under BENCH an initial block sets it to 0.
reg [63:0] cycles; // Cycle counter
always @(posedge clk) cycles <= cycles + 1;
The code below defines sel_cyclesh and CSR_read. We think they may be used elsewhere, so we kept them to preserve this function.
CSR_read returns the upper or the lower 32 bits of cycles, selected by sel_cyclesh.
// sel_cyclesh is set when the CSR address field instr[31:20] is 12'hC80
// (presumably the cycleh CSR -- confirm against the RISC-V CSR map).
wire sel_cyclesh = (instr[31:20] == 12'hC80);
// CSR read value: upper word of the cycle counter when sel_cyclesh is set,
// lower word for every other CSR address.
wire [31:0] CSR_read = sel_cyclesh ? cycles[63:32] : cycles[31:0];
Add 4. end
/***************************************************************************/
// The value written back to the register file.
/***************************************************************************/
// AND-OR multiplexer: each term is zero unless its instruction class is
// decoded. SYSTEM instructions read CSR_read, so that CSR address 12'hC80
// returns the upper 32 bits of the 64-bit cycle counter and any other
// address returns the lower 32 bits. (Muxing the 64-bit cycles register
// directly would always be truncated to its lower word.)
wire [31:0] writeBackData =
   (isSYSTEM ? CSR_read : 32'b0) | // SYSTEM
   (isLUI ? Uimm : 32'b0) | // LUI
   (isALU ? aluOut : 32'b0) | // ALUreg, ALUimm
   (isAUIPC ? PCplusImm : 32'b0) | // AUIPC
   (isJALR | isJAL ? PCplus4 : 32'b0) | // JAL, JALR
   (isLoad ? LOAD_data : 32'b0) ; // Load
/* verilator lint_on WIDTH */
/***************************************************************************/
// LOAD/STORE
/***************************************************************************/
// All memory accesses are aligned on 32 bits boundary. For this
// reason, we need some circuitry that does unaligned halfword
// and byte load/store, based on:
// - funct3[1:0]: 00->byte 01->halfword 10->word
// - loadstore_addr[1:0]: indicates which byte/halfword is accessed
wire mem_byteAccess = instr[13:12] == 2'b00; // funct3[1:0] == 2'b00;
wire mem_halfwordAccess = instr[13:12] == 2'b01; // funct3[1:0] == 2'b01;
// LOAD, in addition to funct3[1:0], LOAD depends on:
// - funct3[2] (instr[14]): 0->do sign expansion 1->no sign expansion
// LOAD_sign is the bit replicated into the upper bits of the loaded value.
wire LOAD_sign =
!instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
// Byte and halfword loads are extended with LOAD_sign; word loads pass
// mem_rdata through unchanged.
wire [31:0] LOAD_data =
mem_byteAccess ? {{24{LOAD_sign}}, LOAD_byte} :
mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
mem_rdata ;
// Halfword selected by address bit 1.
wire [15:0] LOAD_halfword =
loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
// Byte selected by address bit 0 within that halfword.
wire [7:0] LOAD_byte =
loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
// STORE
// The low byte / low halfword of rs2 is replicated onto the byte lanes
// selected by loadstore_addr[1:0]; the write mask then chooses which
// lanes are actually written.
assign mem_wdata[ 7: 0] = rs2[7:0];
assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0] : rs2[15: 8];
assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0] : rs2[23:16];
assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0] :
loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
// Byte-lane write mask for a store:
//   byte     : a single lane, at position loadstore_addr[1:0]
//              (0001, 0010, 0100 or 1000)
//   halfword : lower or upper pair of lanes, chosen by loadstore_addr[1]
//              (0011 or 1100)
//   word     : all four lanes (1111)
wire [3:0] STORE_wmask =
   mem_byteAccess     ? (4'b0001 << loadstore_addr[1:0]) :
   mem_halfwordAccess ? (loadstore_addr[1] ? 4'b1100 : 4'b0011) :
                        4'b1111;
// state machine
/*************************************************************************/
// And, last but not least, the state machine.
/*************************************************************************/
// Bit index of each state in the one-hot state register.
localparam FETCH_INSTR_bit = 0;
localparam WAIT_INSTR_bit = 1;
localparam EXECUTE_bit = 2;
localparam WAIT_ALU_OR_MEM_bit = 3;
localparam NB_STATES = 4;
// One-hot encoded state values.
localparam FETCH_INSTR = 1 << FETCH_INSTR_bit;
localparam WAIT_INSTR = 1 << WAIT_INSTR_bit;
localparam EXECUTE = 1 << EXECUTE_bit;
localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
(* onehot *)
reg [NB_STATES-1:0] state;
// The signals (internal and external) that are determined
// combinatorially from state and other signals.
// register write-back enable.
// Active in EXECUTE and WAIT_ALU_OR_MEM for every instruction except
// branches and stores.
wire writeBack = ~(isBranch | isStore ) &
(state[EXECUTE_bit] | state[WAIT_ALU_OR_MEM_bit]);
// The memory-read signal.
assign mem_rstrb = state[EXECUTE_bit] & isLoad | state[FETCH_INSTR_bit];
// The mask for memory-write.
assign mem_wmask = {4{state[EXECUTE_bit] & isStore}} & STORE_wmask;
// aluWr starts computation (divisions) in the ALU.
assign aluWr = state[EXECUTE_bit] & isALU;
// PC takes PCplusImm on JAL and on taken branches.
wire jumpToPCplusImm = isJAL | (isBranch & predicate);
Delete 3. start
Because NRV_IS_IO_ADDR is not defined, the `else` branch is taken directly. A similar, but slightly different, statement replaces it below.
// `ifdef NRV_IS_IO_ADDR
// wire needToWait = isLoad |
// isStore & `NRV_IS_IO_ADDR(mem_addr) |
// isALU & funct3IsShift;
// `else
// wire needToWait = isLoad | isStore | isALU & funct3IsShift;
// `endif
Delete 3. end
Add 5. start
Here is the replacement statement mentioned above.
// Instructions that go through WAIT_ALU_OR_MEM after EXECUTE:
// loads, stores and divisions.
wire needToWait = isLoad | isStore | isDivide;
Add 5. end
// Main sequencer: FETCH_INSTR -> WAIT_INSTR -> EXECUTE -> (WAIT_ALU_OR_MEM)
// -> FETCH_INSTR. Also updates PC and latches instr, rs1 and rs2.
always @(posedge clk) begin
if(!reset) begin
// Reset is active low: restart from RESET_ADDR via the wait state.
state <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
PC <= RESET_ADDR[ADDR_WIDTH-1:0];
end else
// See note [1] at the end of this file.
// (NOTE(review): note [1] is not part of this listing.)
(* parallel_case *)
case(1'b1)
state[WAIT_INSTR_bit]: begin
if(!mem_rbusy) begin // may be high when executing from SPI flash
// The source registers are read with the register indices taken
// directly from the incoming instruction word.
rs1 <= registerFile[mem_rdata[19:15]];
rs2 <= registerFile[mem_rdata[24:20]];
instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
state <= EXECUTE; // also the declaration of instr).
end
end
state[EXECUTE_bit]: begin
// Next PC: JALR target (bit 0 cleared), jump/branch target, or PC+4.
PC <= isJALR ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
jumpToPCplusImm ? PCplusImm :
PCplus4;
state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
end
state[WAIT_ALU_OR_MEM_bit]: begin
// Stay here until the divider and the memory are both idle.
if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
end
default: begin // FETCH_INSTR
state <= WAIT_INSTR;
end
endcase
end
Delete 4. start
Here is the original cycle counter; we rewrote it and moved it further up.
/***************************************************************************/
// Cycle counter
/***************************************************************************/
// `ifdef NRV_COUNTER_WIDTH
// reg [`NRV_COUNTER_WIDTH-1:0] cycles;
// `else
// reg [31:0] cycles;
// `endif
// always @(posedge clk) cycles <= cycles + 1;
Delete 4. end
`ifdef BENCH
// Initial values used only when BENCH is defined.
initial begin
   cycles = 0;
   // The next line must stay commented out, because the iterative shifter
   // (and with it aluShamt) was merged into the ALU block further up.
   // aluShamt = 0;
   registerFile[0] = 0;
end
`endif
endmodule
// Firmware generation flags for this processor.
// NRV_ARCH was "rv32i" in the original femtorv32_quark.v; it is replaced by
// "rv32im" here. The note is kept as a comment on its own line so that it
// does not become part of the macro text.
`define NRV_ARCH "rv32im"
`define NRV_ABI "ilp32"
`define NRV_OPTIMIZE "-Os"
module FemtoRV32(
input clk,
output [31:0] mem_addr, // address bus
output [31:0] mem_wdata, // data to be written
output [3:0] mem_wmask, // write mask for the 4 bytes of each word
input [31:0] mem_rdata, // input lines for both data and instr
output mem_rstrb, // active to initiate memory read (used by IO)
input mem_rbusy, // asserted if memory is busy reading value
input mem_wbusy, // asserted if memory is busy writing value
input reset // set to 0 to reset the processor
);
parameter RESET_ADDR = 32'h00000000;
parameter ADDR_WIDTH = 24;
/***************************************************************************/
// Instruction decoding.
/***************************************************************************/
// Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
// Reference: Table page 104 of:
// https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
// The destination register
wire [4:0] rdId = instr[11:7];
// The ALU function, decoded in 1-hot form (doing so reduces LUT count)
// It is used as follows: funct3Is[val] <=> funct3 == val
(* onehot *)
wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
// The five immediate formats, see RiscV reference (link above), Fig. 2.4 p. 12
wire [31:0] Uimm = { instr[31], instr[30:12], {12{1'b0}}};
wire [31:0] Iimm = {{21{instr[31]}}, instr[30:20]};
/* verilator lint_off UNUSED */ // MSBs of SBJimms are not used by addr adder.
wire [31:0] Simm = {{21{instr[31]}}, instr[30:25],instr[11:7]};
wire [31:0] Bimm = {{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
wire [31:0] Jimm = {{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
/* verilator lint_on UNUSED */
// Base RISC-V (RV32I) has only 10 different instructions !
wire isLoad = (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
wire isALUimm = (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
wire isAUIPC = (instr[6:2] == 5'b00101); // rd <- PC + Uimm
wire isStore = (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
wire isALUreg = (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
wire isLUI = (instr[6:2] == 5'b01101); // rd <- Uimm
wire isBranch = (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
wire isJALR = (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
// only the JAL is 1's at third digit
wire isJAL = instr[3]; // (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
wire isSYSTEM = (instr[6:2] == 5'b11100); // rd <- cycles
wire isALU = isALUimm | isALUreg;
/***************************************************************************/
// The register file.
/***************************************************************************/
reg [31:0] rs1;
reg [31:0] rs2;
reg [31:0] registerFile [31:0];
always @(posedge clk) begin
if (writeBack)
if (rdId != 0)
registerFile[rdId] <= writeBackData;
end
/***************************************************************************/
// The ALU. Does operations and tests combinatorially, except shifts.
/***************************************************************************/
// First ALU source, always rs1
wire [31:0] aluIn1 = rs1;
// Second ALU source, depends on opcode:
// ALUreg, Branch: rs2
// ALUimm, Load, JALR: Iimm
wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
- reg [31:0] aluReg; // The internal register of the ALU, used by shift.
- reg [4:0] aluShamt; // Current shift amount.
- wire aluBusy = |aluShamt; // ALU is busy if shift amount is non-zero.
wire aluWr; // ALU write strobe, starts shifting.
// The adder is used by both arithmetic instructions and JALR.
wire [31:0] aluPlus = aluIn1 + aluIn2;
// Use a single 33 bits subtract to do subtraction and all comparisons
// (trick borrowed from swapforth/J1)
wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
wire LT = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
wire LTU = aluMinus[32];
wire EQ = (aluMinus[31:0] == 0);
+/***************************************************************************/
+ // Use the same shifter both for left and right shifts by
+ // applying bit reversal
+ wire [31:0] shifter_in = funct3Is[1] ?
+ {aluIn1[ 0], aluIn1[ 1], aluIn1[ 2], aluIn1[ 3], aluIn1[ 4], aluIn1[ 5],
+ aluIn1[ 6], aluIn1[ 7], aluIn1[ 8], aluIn1[ 9], aluIn1[10], aluIn1[11],
+ aluIn1[12], aluIn1[13], aluIn1[14], aluIn1[15], aluIn1[16], aluIn1[17],
+ aluIn1[18], aluIn1[19], aluIn1[20], aluIn1[21], aluIn1[22], aluIn1[23],
+ aluIn1[24], aluIn1[25], aluIn1[26], aluIn1[27], aluIn1[28], aluIn1[29],
+ aluIn1[30], aluIn1[31]} : aluIn1;
+ /* verilator lint_off WIDTH */
+ wire [31:0] shifter =
+ $signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
+ /* verilator lint_on WIDTH */
+ wire [31:0] leftshift = {
+ shifter[ 0], shifter[ 1], shifter[ 2], shifter[ 3], shifter[ 4],
+ shifter[ 5], shifter[ 6], shifter[ 7], shifter[ 8], shifter[ 9],
+ shifter[10], shifter[11], shifter[12], shifter[13], shifter[14],
+ shifter[15], shifter[16], shifter[17], shifter[18], shifter[19],
+ shifter[20], shifter[21], shifter[22], shifter[23], shifter[24],
+ shifter[25], shifter[26], shifter[27], shifter[28], shifter[29],
+ shifter[30], shifter[31]};
+ /***************************************************************************/
+ // funcM (instr[25]) selects the mul/div result for ALUreg instructions.
+ wire funcM = instr[25];
+ wire isDivide = isALUreg & funcM & instr[14]; // |funct3Is[7:4];
+ wire aluBusy = |quotient_msk; // ALU is busy if division is in progress.
+ // funct3: 1->MULH, 2->MULHSU 3->MULHU
+ wire isMULH = funct3Is[1];
+ wire isMULHSU = funct3Is[2];
+ // MULH: signed rs1 x signed rs2; MULHSU: signed rs1 x unsigned rs2;
+ // MULHU: unsigned x unsigned. So rs1 (aluIn1) is signed for MULH and
+ // MULHSU, and rs2 (aluIn2) is signed for MULH only.
+ wire sign1 = aluIn1[31] & (isMULH | isMULHSU);
+ wire sign2 = aluIn2[31] & isMULH;
+ wire signed [32:0] signed1 = {sign1, aluIn1};
+ wire signed [32:0] signed2 = {sign2, aluIn2};
+ wire signed [63:0] multiply = signed1 * signed2;
+ /***************************************************************************/
// Notes:
// - instr[30] is 1 for SUB and 0 for ADD
// - for SUB, need to test also instr[5] to discriminate ADDI:
// (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
// - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
wire [31:0] aluOut_base =
(funct3Is[0] ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
(funct3Is[1] ? leftshift : 32'b0) |
(funct3Is[2] ? {31'b0, LT} : 32'b0) |
(funct3Is[3] ? {31'b0, LTU} : 32'b0) |
(funct3Is[4] ? aluIn1 ^ aluIn2 : 32'b0) |
+ (funct3Is[5] ? shifter : 32'b0) |
(funct3Is[6] ? aluIn1 | aluIn2 : 32'b0) |
(funct3Is[7] ? aluIn1 & aluIn2 : 32'b0) ;
+ wire [31:0] aluOut_muldiv =
+ ( funct3Is[0] ? multiply[31: 0] : 32'b0) | // 0:MUL
+ ( |funct3Is[3:1] ? multiply[63:32] : 32'b0) | // 1:MULH, 2:MULHSU, 3:MULHU
+ ( instr[14] ? div_sign ? -divResult : divResult : 32'b0) ;
+ // 4:DIV, 5:DIVU, 6:REM, 7:REMU
+ wire [31:0] aluOut = isALUreg & funcM ? aluOut_muldiv : aluOut_base;
+ /***************************************************************************/
+ // Implementation of DIV/REM instructions, highly inspired by PicoRV32
+ reg [31:0] dividend;
+ reg [62:0] divisor;
+ reg [31:0] quotient;
+ reg [31:0] quotient_msk;
+ wire divstep_do = divisor <= {31'b0, dividend};
+ wire [31:0] dividendN = divstep_do ? dividend - divisor[31:0] : dividend;
+ wire [31:0] quotientN = divstep_do ? quotient | quotient_msk : quotient;
+ wire div_sign = ~instr[12] & (instr[13] ? aluIn1[31] :
+ (aluIn1[31] != aluIn2[31]) & |aluIn2);
+ always @(posedge clk) begin
+ if (isDivide & aluWr) begin
+ dividend <= ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
+ divisor <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
+ quotient <= 0;
+ quotient_msk <= 1 << 31;
+ end else begin
+ dividend <= dividendN;
+ divisor <= divisor >> 1;
+ quotient <= quotientN;
+ quotient_msk <= quotient_msk >> 1;
+ end
+ end
+ reg [31:0] divResult;
+ always @(posedge clk) divResult <= instr[13] ? dividendN : quotientN;
- (funct3IsShift ? aluReg : 32'b0) ;
- wire funct3IsShift = funct3Is[1] | funct3Is[5];
- always @(posedge clk) begin
- if(aluWr) begin
- if (funct3IsShift) begin // SLL, SRA, SRL
- aluReg <= aluIn1;
- aluShamt <= aluIn2[4:0];
- end
- end
- `ifdef NRV_TWOLEVEL_SHIFTER
- else if(|aluShamt[4:2]) begin // Shift by 4
- aluShamt <= aluShamt - 4;
- aluReg <= funct3Is[1] ? aluReg << 4 :
- {{4{instr[30] & aluReg[31]}}, aluReg[31:4]};
- end else
- `endif
- // Compact form of:
- // funct3=001 -> SLL (aluReg <= aluReg << 1)
- // funct3=101 & instr[30] -> SRA (aluReg <= {aluReg[31], aluReg[31:1]})
- // funct3=101 & !instr[30] -> SRL (aluReg <= {1'b0, aluReg[31:1]})
- if (|aluShamt) begin
- aluShamt <= aluShamt - 1;
- aluReg <= funct3Is[1] ? aluReg << 1 : // SLL
- {instr[30] & aluReg[31], aluReg[31:1]}; // SRA,SRL
- end
- end
/***************************************************************************/
// The predicate for conditional branches.
/***************************************************************************/
wire predicate =
funct3Is[0] & EQ | // BEQ
funct3Is[1] & !EQ | // BNE
funct3Is[4] & LT | // BLT
funct3Is[5] & !LT | // BGE
funct3Is[6] & LTU | // BLTU
funct3Is[7] & !LTU ; // BGEU
/***************************************************************************/
// Program counter and branch target computation.
/***************************************************************************/
reg [ADDR_WIDTH-1:0] PC; // The program counter.
reg [31:2] instr; // Latched instruction. Note that bits 0 and 1 are
// ignored (not used in RV32I base instr set).
wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
// An adder used to compute branch address, JAL address and AUIPC.
// branch->PC+Bimm AUIPC->PC+Uimm JAL->PC+Jimm
// Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
wire [ADDR_WIDTH-1:0] PCplusImm = PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
instr[4] ? Uimm[ADDR_WIDTH-1:0] :
Bimm[ADDR_WIDTH-1:0] );
// A separate adder to compute the destination of load/store.
// testing instr[5] is equivalent to testing isStore in this context.
wire [ADDR_WIDTH-1:0] loadstore_addr = rs1[ADDR_WIDTH-1:0] +
(instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
/* verilator lint_off WIDTH */
// internal address registers and cycles counter may have less than
// 32 bits, so we deactivate width test for mem_addr and writeBackData
assign mem_addr = state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
PC : loadstore_addr ;
+ /***************************************************************************/
+ // Cycle Counter.
+ /***************************************************************************/
+ reg [63:0] cycles; // Cycle counter
+ always @(posedge clk) cycles <= cycles + 1;
+ The code below defines sel_cyclesh and CSR_read. We think they may be used elsewhere, so we kept them to preserve this function.
+ CSR_read returns the upper or the lower 32 bits of cycles, selected by sel_cyclesh.
+ wire sel_cyclesh = (instr[31:20] == 12'hC80);
+ wire [31:0] CSR_read = sel_cyclesh ? cycles[63:32] : cycles[31:0];
/***************************************************************************/
// The value written back to the register file.
/***************************************************************************/
// AND-OR multiplexer: each term is zero unless its instruction class is
// decoded. SYSTEM instructions read CSR_read, so that CSR address 12'hC80
// returns the upper 32 bits of the 64-bit cycle counter and any other
// address returns the lower 32 bits. (Muxing the 64-bit cycles register
// directly would always be truncated to its lower word.)
wire [31:0] writeBackData =
   (isSYSTEM ? CSR_read : 32'b0) | // SYSTEM
   (isLUI ? Uimm : 32'b0) | // LUI
   (isALU ? aluOut : 32'b0) | // ALUreg, ALUimm
   (isAUIPC ? PCplusImm : 32'b0) | // AUIPC
   (isJALR | isJAL ? PCplus4 : 32'b0) | // JAL, JALR
   (isLoad ? LOAD_data : 32'b0) ; // Load
/* verilator lint_on WIDTH */
/***************************************************************************/
// LOAD/STORE
/***************************************************************************/
// All memory accesses are aligned on 32 bits boundary. For this
// reason, we need some circuitry that does unaligned halfword
// and byte load/store, based on:
// - funct3[1:0]: 00->byte 01->halfword 10->word
// - mem_addr[1:0]: indicates which byte/halfword is accessed
wire mem_byteAccess = instr[13:12] == 2'b00; // funct3[1:0] == 2'b00;
wire mem_halfwordAccess = instr[13:12] == 2'b01; // funct3[1:0] == 2'b01;
// LOAD, in addition to funct3[1:0], LOAD depends on:
// - funct3[2] (instr[14]): 0->do sign expansion 1->no sign expansion
wire LOAD_sign =
!instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
wire [31:0] LOAD_data =
mem_byteAccess ? {{24{LOAD_sign}}, LOAD_byte} :
mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
mem_rdata ;
wire [15:0] LOAD_halfword =
loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
wire [7:0] LOAD_byte =
loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
// STORE
assign mem_wdata[ 7: 0] = rs2[7:0];
assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0] : rs2[15: 8];
assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0] : rs2[23:16];
assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0] :
loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
// The memory write mask:
// 1111 if writing a word
// 0011 or 1100 if writing a halfword
// (depending on loadstore_addr[1])
// 0001, 0010, 0100 or 1000 if writing a byte
// (depending on loadstore_addr[1:0])
wire [3:0] STORE_wmask =
mem_byteAccess ?
(loadstore_addr[1] ?
(loadstore_addr[0] ? 4'b1000 : 4'b0100) :
(loadstore_addr[0] ? 4'b0010 : 4'b0001)
) :
mem_halfwordAccess ?
(loadstore_addr[1] ? 4'b1100 : 4'b0011) :
4'b1111;
// state machine
/*************************************************************************/
// And, last but not least, the state machine.
/*************************************************************************/
localparam FETCH_INSTR_bit = 0;
localparam WAIT_INSTR_bit = 1;
localparam EXECUTE_bit = 2;
localparam WAIT_ALU_OR_MEM_bit = 3;
localparam NB_STATES = 4;
localparam FETCH_INSTR = 1 << FETCH_INSTR_bit;
localparam WAIT_INSTR = 1 << WAIT_INSTR_bit;
localparam EXECUTE = 1 << EXECUTE_bit;
localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
(* onehot *)
reg [NB_STATES-1:0] state;
// The signals (internal and external) that are determined
// combinatorially from state and other signals.
// register write-back enable: asserted in the EXECUTE and WAIT_ALU_OR_MEM
// states for every instruction that is neither a branch nor a store.
wire writeBack = ~(isBranch | isStore ) &
(state[EXECUTE_bit] | state[WAIT_ALU_OR_MEM_bit]);
// The memory-read signal: asserted when executing a load and when
// fetching an instruction.
assign mem_rstrb = state[EXECUTE_bit] & isLoad | state[FETCH_INSTR_bit];
// The mask for memory-write: STORE_wmask, gated so that it is non-zero
// only while a store is in the EXECUTE state.
assign mem_wmask = {4{state[EXECUTE_bit] & isStore}} & STORE_wmask;
// aluWr starts computation (shifts) in the ALU.
assign aluWr = state[EXECUTE_bit] & isALU;
// True when the next PC is PCplusImm: always for JAL, and for a branch
// only when its predicate holds.
wire jumpToPCplusImm = isJAL | (isBranch & predicate);
- Because NRV_IS_IO_ADDR is not defined, the preprocessor goes directly to the `else` branch. The M-extension version uses a similar but slightly different expression, shown below.
- `ifdef NRV_IS_IO_ADDR
- wire needToWait = isLoad |
- isStore & `NRV_IS_IO_ADDR(mem_addr) |
- isALU & funct3IsShift;
- `else
- wire needToWait = isLoad | isStore | isALU & funct3IsShift;
- `endif
+ wire needToWait = isLoad | isStore | isDivide;
// Main sequential process: advances the one-hot state machine and updates
// PC, instr, rs1 and rs2 on each rising clock edge.
always @(posedge clk) begin
// reset is active low.
if(!reset) begin
state <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
PC <= RESET_ADDR[ADDR_WIDTH-1:0];
end else
// See note [1] at the end of this file.
(* parallel_case *)
case(1'b1)
// Wait for the fetched word, then latch the instruction and read both
// source registers, indexed directly from the fetched word's rs1/rs2 fields.
state[WAIT_INSTR_bit]: begin
if(!mem_rbusy) begin // may be high when executing from SPI flash
rs1 <= registerFile[mem_rdata[19:15]];
rs2 <= registerFile[mem_rdata[24:20]];
instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
state <= EXECUTE; // also the declaration of instr).
end
end
// Compute the next PC (JALR target with bit 0 cleared, PC+imm for JAL
// and taken branches, PC+4 otherwise) and decide whether to wait.
state[EXECUTE_bit]: begin
PC <= isJALR ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
jumpToPCplusImm ? PCplusImm :
PCplus4;
state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
end
// Stay here until the ALU and both memory busy flags are idle.
state[WAIT_ALU_OR_MEM_bit]: begin
if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
end
default: begin // FETCH_INSTR
state <= WAIT_INSTR;
end
endcase
end
- /***************************************************************************/
- // Cycle counter
- /***************************************************************************/
- `ifdef NRV_COUNTER_WIDTH
- reg [`NRV_COUNTER_WIDTH-1:0] cycles;
- `else
- reg [31:0] cycles;
- `endif
- always @(posedge clk) cycles <= cycles + 1;
`ifdef BENCH
initial begin
cycles = 0;
// The next line must remain commented out, because the shift logic was
// merged into the code above.
// aluShamt = 0;
registerFile[0] = 0;
end
`endif
endmodule
Here we discuss how the processor knows that an instruction is mul, div or rem.
1. wire funcM = instr[25];
2. wire isDivide = isALUreg & funcM & instr[14]; // |funct3Is[7:4];
3. wire aluBusy = |quotient_msk; // ALU is busy if division is in progress.
// funct3: 1->MULH, 2->MULHSU 3->MULHU
4. wire isMULH = funct3Is[1];
5. wire isMULHSU = funct3Is[2];
6. wire sign1 = aluIn1[31] & isMULH;
7. wire sign2 = aluIn2[31] & (isMULH | isMULHSU);
8. wire signed [32:0] signed1 = {sign1, aluIn1};
9. wire signed [32:0] signed2 = {sign2, aluIn2};
10. wire signed [63:0] multiply = signed1 * signed2;
The image above shows the binary encoding of each RV32I instruction type. To add the M standard extension, we look at the M-extension instruction format.
1. wire funcM = instr[25];
The funct7 values in this table are all 0x01. In the standard R-type format, funct7 is located at instr[31:25]. Since funct7 in the M extension is always 0x01, checking a single bit is enough, so the test can simply be wire funcM = instr[25];
2. wire isDivide = isALUreg & funcM & instr[14]; // |funct3Is[7:4];
isALUreg is assigned by wire isALUreg = (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
This opcode is shared with the base integer register-register instructions, so the decoding could overlap with them and we have to rule them out; funcM (instr[25]) does that, because it is 0 for the base instructions. The funct3 of the division instructions is 0x4 (0b100, DIV), 0x5 (0b101, DIVU), 0x6 (0b110, REM) or 0x7 (0b111, REMU), so bit 14 of the instruction is always 1 for them, which separates them from the multiplication instructions (funct3 0x0 to 0x3).
3. wire aluBusy = |quotient_msk; // ALU is busy if division is in progress.
In wire aluBusy = |quotient_msk; the register quotient_msk is written in the sequential block, and because aluBusy is a wire, it is updated immediately once quotient_msk is rewritten. Additionally, a signal does not need to be explicitly declared before it is used in a wire assignment, which is why quotient_msk can be declared after this line.
4. wire isMULH = funct3Is[1];
The funct3 code of MULH is 0x1, so isMULH can be specified by funct3Is[1]. Here is the definition of funct3Is:
wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
Since 0x1 == 1, 8'b00000001 is shifted left by 1 bit and becomes 8'b00000010, which corresponds to funct3Is[1].
5. wire isMULHSU = funct3Is[2];
As in 4., the funct3 code of MULHSU is 0x2, so isMULHSU can be specified by funct3Is[2].
6. wire sign1 = aluIn1[31] & isMULH;
sign1 is the sign of rs1; it is only taken into account for MULH.
7. wire sign2 = aluIn2[31] & (isMULH | isMULHSU);
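sign2 is the sign of rs2. In MULHSU one of the operands is signed and the other is unsigned; this code chooses rs2 as the signed one. Note that the RISC-V specification defines MULHSU as signed rs1 × unsigned rs2, so this choice should be double-checked against the specification.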
8. wire signed [32:0] signed1 = {sign1, aluIn1};
9. wire signed [32:0] signed2 = {sign2, aluIn2};
8.
10. wire signed [63:0] multiply = signed1 * signed2;
wire signed [32:0] signed1 = {sign1, aluIn1};
wire signed [32:0] signed2 = {sign2, aluIn2};
wire signed [63:0] multiply = signed1 * signed2;
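We modified the code to
wire signed [63:0] multiply = aluIn1 * aluIn2
and executed pi.c (which we use as our M-extension test) again. We found no difference in the output, so in this test Verilog handled the two's-complement multiplication correctly. Note, however, that a 32-bit by 32-bit product always fits in 64 bits and cannot overflow, and that its low 32 bits are the same whether the operands are treated as signed or unsigned. Only the upper 32 bits (MULH, MULHSU) with a negative operand depend on the explicit sign extension, so we think that the plain multiplication cannot handle that signed case, even though our test did not expose it.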
Here is the website we found
Next, we will discuss how this processor integrate multiplication and division(M extension) into ALU operations.
wire [31:0] aluOut_muldiv =
( funct3Is[0] ? multiply[31: 0] : 32'b0) | // 0:MUL
( |funct3Is[3:1] ? multiply[63:32] : 32'b0) | // 1:MULH, 2:MULHSU, 3:MULHU
( instr[14] ? div_sign ? -divResult : divResult : 32'b0) ;
// 4:DIV, 5:DIVU, 6:REM, 7:REMU
wire [31:0] aluOut = isALUreg & funcM ? aluOut_muldiv : aluOut_base;
Implement dividend、divisor and quotient
/***************************************************************************/
// Implementation of DIV/REM instructions, highly inspired by PicoRV32
// Sequential divider: one compare/subtract step per clock cycle.
// instr[12] is 0 for the signed variants (funct3 4:DIV, 6:REM) and 1 for the
// unsigned ones (5:DIVU, 7:REMU); instr[13] is 1 for REM/REMU.
reg [31:0] dividend;
reg [62:0] divisor;
reg [31:0] quotient;
reg [31:0] quotient_msk;
// A step subtracts when the (shifted) divisor fits into the current dividend.
wire divstep_do = divisor <= {31'b0, dividend};
wire [31:0] dividendN = divstep_do ? dividend - divisor[31:0] : dividend;
wire [31:0] quotientN = divstep_do ? quotient | quotient_msk : quotient;
// Whether the final result has to be negated (signed variants only):
// for the remainder it follows the sign of aluIn1; for the quotient it is
// set when the operand signs differ and aluIn2 is non-zero.
wire div_sign = ~instr[12] & (instr[13] ? aluIn1[31] :
(aluIn1[31] != aluIn2[31]) & |aluIn2);
always @(posedge clk) begin
// Start of a division: load operands (negated when negative for the signed
// variants), place the divisor 31 bits to the left, clear the quotient
// and point the quotient mask at bit 31.
if (isDivide & aluWr) begin
dividend <= ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
divisor <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
quotient <= 0;
quotient_msk <= 1 << 31;
// Every other cycle: apply one division step, then move the divisor and
// the quotient mask one bit to the right.
end else begin
dividend <= dividendN;
divisor <= divisor >> 1;
quotient <= quotientN;
quotient_msk <= quotient_msk >> 1;
end
end
// Registered result: the remainder (dividendN) when instr[13] is set,
// otherwise the quotient (quotientN).
reg [31:0] divResult;
always @(posedge clk) divResult <= instr[13] ? dividendN : quotientN;
I separate the explanation into five steps.
Determine which multiplication (MUL, MULH, MULHSU, MULHU) or division (DIV, DIVU, REM, REMU) operation it will do, based on the instruction's opcode and funct3 field.
wire [31:0] aluOut_muldiv =
( funct3Is[0] ? multiply[31: 0] : 32'b0) | // 0:MUL
( |funct3Is[3:1] ? multiply[63:32] : 32'b0) | // 1:MULH, 2:MULHSU, 3:MULHU
( instr[14] ? div_sign ? -divResult : divResult : 32'b0) ;
// 4:DIV, 5:DIVU, 6:REM, 7:REMU
Select the ALU output according to funcM. If isALUreg & funcM is true, the computation belongs to the M extension, so aluOut_muldiv is selected. Otherwise, the computation is done with core ALU instructions such as add, sub, …; consequently, aluOut_base is chosen.
wire [31:0] aluOut = isALUreg & funcM ? aluOut_muldiv : aluOut_base;
reg [31:0] dividend;
reg [62:0] divisor;
reg [31:0] quotient;
reg [31:0] quotient_msk;
When both isDivide and aluWr are true, it initializes dividend, divisor, quotient, and quotient_msk. In the else section, while the division is still in progress, it performs the regular division steps to obtain the final dividendN and quotientN.
always @(posedge clk) begin
if (isDivide & aluWr) begin
dividend <= ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
divisor <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
quotient <= 0;
quotient_msk <= 1 << 31;
end else begin
dividend <= dividendN;
divisor <= divisor >> 1;
quotient <= quotientN;
quotient_msk <= quotient_msk >> 1;
end
end
isDivide indicates that the ALU operation is a division or remainder (div/rem).
wire isDivide = isALUreg & funcM & instr[14]; // |funct3Is[7:4];
aluWr is asserted when the processor is in the EXECUTE state and the current instruction uses the ALU; for a division, this is also the moment after which the state machine has to wait (WAIT_ALU_OR_MEM).
assign aluWr = state[EXECUTE_bit] & isALU;
Now we look to the state[EXECUTE_bit]
.
state[EXECUTE_bit]: begin
PC <= isJALR ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
jumpToPCplusImm ? PCplusImm :
PCplus4;
state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
end
If needToWait is true, the state machine enters the WAIT_ALU_OR_MEM state, which means that the processor must wait for the ALU or the memory to complete its work.
Otherwise, it proceeds to the FETCH_INSTR state; in other words, the processor is free, so it fetches the next instruction.
The result is selected by instr[13]. If instr[13] is 1 (REM/REMU), divResult is assigned the value of dividendN. Otherwise (DIV/DIVU), quotientN is assigned to divResult.
reg [31:0] divResult;
always @(posedge clk) divResult <= instr[13] ? dividendN : quotientN;
Lack of diverse compatibility validations.
To assess the compatibility of our implementation with the M extension, we execute code involving multiplication and division operations.
We found a program named pi.c in the directory learn-fpga/FemtoRV/FIRMWARE/EXAMPLES/.
/* Adapted to FemtoRV32 (Bruno Levy Feb. 2021) */
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <femtorv32.h>
#include <femtoGL.h>
#include "errno_fix.h"
/* uncomment the following line to use 'long long' integers */
#define HAS_LONG_LONG
#ifdef HAS_LONG_LONG
#define mul_mod(a,b,m) (( (long long) (a) * (long long) (b) ) % (m))
#else
#define mul_mod(a,b,m) fmod( (double) a * (double) b, m)
#endif
/*
 * Return the inverse of x mod y, as a value in [0, y).
 *
 * Each iteration divides v by u (q = v / u), replaces (u, v) by
 * (v - q * u, u) and updates the coefficient pair (c, a) in the same way;
 * the loop stops when u reaches 0. The coefficient a is then reduced
 * modulo y and shifted into the non-negative range.
 *
 * NOTE(review): x == 0 would divide by zero on the first iteration;
 * presumably callers never pass 0 - confirm.
 * RV32_FASTCODE is a project macro defined outside this file.
 */
int inv_mod(int x, int y) RV32_FASTCODE;
int inv_mod(int x, int y)
{
int q, u, v, a, c, t;
u = x;
v = y;
c = 1;
a = 0;
do {
q = v / u;
t = c;
c = a - q * c;
a = t;
t = u;
u = v - q * u;
v = t;
} while (u != 0);
a = a % y;
/* C's % can yield a negative value here; bring it back into [0, y). */
if (a < 0)
a = y + a;
return a;
}
/*
 * Return (a^b) mod m.
 *
 * The bits of b are consumed from the least significant one: when the
 * current bit is set, r is multiplied by aa (mod m); aa is squared (mod m)
 * before moving on to the next bit. The loop ends once b has been shifted
 * down to 0.
 */
int pow_mod(int a, int b, int m) RV32_FASTCODE;
int pow_mod(int a, int b, int m)
{
int r, aa;
r = 1;
aa = a;
while (1) {
if (b & 1)
r = mul_mod(r, aa, m);
b = b >> 1;
/* Stop before the final, unneeded squaring. */
if (b == 0)
break;
aa = mul_mod(aa, aa, m);
}
return r;
}
/*
 * Return true (1) if n is prime, 0 otherwise.
 *
 * Every even n is reported as non-prime (this includes n == 2); odd n is
 * tested by trial division with the odd numbers from 3 up to sqrt(n).
 */
int is_prime(int n) RV32_FASTCODE;
int is_prime(int n)
{
int r, i;
if ((n % 2) == 0)
return 0;
/* Largest divisor that has to be tried. */
r = (int) (sqrt(n));
for (i = 3; i <= r; i += 2)
if ((n % i) == 0)
return 0;
return 1;
}
/*
 * Return the prime number immediately after n: n is incremented until
 * is_prime() accepts it.
 */
int next_prime(int n) RV32_FASTCODE;
int next_prime(int n)
{
do {
n++;
} while (!is_prime(n));
return n;
}
/*
 * Compute a block of decimal digits for position n and return it as an
 * integer, (int)(sum * 1e9), where sum is a fraction kept in [0, 1).
 * NOTE(review): presumably these are the 9 decimal digits of pi starting at
 * position n (main() steps n by 9 and prints each result) - confirm.
 *
 * For each prime a from 3 up to 2 * N, a partial sum s is accumulated
 * modulo av = a^vmax using only integer modular arithmetic (mul_mod,
 * inv_mod, pow_mod), and the fractional part of s / av is added to sum.
 */
int digits(int n) RV32_FASTCODE;
int digits(int n) {
int av, a, vmax, N, num, den, k, kq, kq2, t, v, s, i;
double sum;
N = (int) ((n + 20) * log(10) / log(2));
sum = 0;
for (a = 3; a <= (2 * N); a = next_prime(a)) {
/* vmax: largest integer exponent with a^vmax <= 2 * N; av = a^vmax. */
vmax = (int) (log(2 * N) / log(a));
av = 1;
for (i = 0; i < vmax; i++)
av = av * a;
s = 0;
num = 1;
den = 1;
v = 0;
kq = 1;
kq2 = 1;
for (k = 1; k <= N; k++) {
/* Strip the factors of a from k (tracked by counter kq), decreasing v. */
t = k;
if (kq >= a) {
do {
t = t / a;
v--;
} while ((t % a) == 0);
kq = 0;
}
kq++;
num = mul_mod(num, t, av);
/* Strip the factors of a from 2k - 1 (tracked by counter kq2), increasing v. */
t = (2 * k - 1);
if (kq2 >= a) {
if (kq2 == a) {
do {
t = t / a;
v++;
} while ((t % a) == 0);
}
kq2 -= a;
}
den = mul_mod(den, t, av);
kq2 += 2;
/* Only terms with v > 0 contribute to s. */
if (v > 0) {
t = inv_mod(den, av);
t = mul_mod(t, num, av);
t = mul_mod(t, k, av);
for (i = v; i < vmax; i++)
t = mul_mod(t, a, av);
s += t;
if (s >= av)
s -= av;
}
}
/* Scale s by 10^(n-1) mod av and keep only the fractional part of the sum. */
t = pow_mod(10, n - 1, av);
s = mul_mod(s, t, av);
sum = fmod(sum + (double) s / (double) av, 1.0);
}
return (int) (sum * 1e9);
}
/*
 * Entry point: initializes the tty, prints "pi = 3." and then prints the
 * value of digits(n) for n = 1, 10, 19, ... in an endless loop.
 * NOTE(review): "%d" does not zero-pad, so a group whose value is below
 * 100000000 would be printed with fewer than 9 characters - confirm whether
 * a zero-padded format is wanted and supported by this printf.
 */
int main() {
// MAX7219_tty_init(); // Uncomment to display on led matrix.
femtosoc_tty_init();
// GL_set_font(&Font3x5);
// GL_set_font(&Font8x16);
printf("pi = 3.");
for(int n=1; ;n+=9) {
printf("%d",digits(n));
}
}
It is an implementation for computing the n-th decimal digit of π with very little memory.
We chose this as our test code since it contains various multiplication, division and remainder operations.
First, we should enter the path
$ cd learn-fpga/FemtoRV/RTL/CONFIGS
then modify the bench_config.v
to use Femtorv32_quark
`define NRV_IO_LEDS
`define NRV_IO_UART
`define NRV_IO_SSD1351
`define NRV_FREQ 1
`define NRV_FEMTORV32_QUARK // RV32I (the most elementary femtorv)
//`define NRV_FEMTORV32_ELECTRON // RV32IM
//`define NRV_FEMTORV32_INTERMISSUM // RV32IMzCSR
//`define NRV_FEMTORV32_GRACILIS // RV32IMCzCSR
//`define NRV_FEMTORV32_PETITBATEAU // WIP RF32F !!
//`define NRV_FEMTORV32_TESTDRIVE
`define NRV_RESET_ADDR 0
`define NRV_RAM 65536
`define NRV_IO_HARDWARE_CONFIG
`define NRV_CONFIGURED
Second, back to learn-fpga/FemtoRV
.
Before we do the
$ make BENCH.firmware_config
we can check what it did
BENCH: BENCH.verilator
BENCH.firmware_config:
BOARD=testbench TOOLS/make_config.sh -DBENCH_VERILATOR
(cd FIRMWARE; make libs)
...
To clarify what this actually does, here is make_config.sh, located in learn-fpga/FemtoRV/TOOLS/.
# Extracts compilation flags from selected board, and
# write them to FIRMWARE/config.mk
cd RTL
iverilog -I PROCESSOR $1 -o tmp.vvp get_config.v
vvp tmp.vvp > ../FIRMWARE/config.mk
rm -f tmp.vvp
echo BOARD=$BOARD >> ../FIRMWARE/config.mk
cat ../FIRMWARE/config.mk
It executes get_config.v and puts the result in config.mk.
Here is a part of get_config.v:
`include "femtosoc_config.v"
module dummy();
initial begin
$display("ARCH=",`NRV_ARCH);
$display("OPTIMIZE=",`NRV_OPTIMIZE);
$display("ABI=",`NRV_ABI);
$display("RAM_SIZE=%d",`NRV_RAM
...
ARCH is assigned from NRV_ARCH, which can be found through femtosoc_config.v.
So let us look at femtosoc_config.v in the same directory:
...
`ifdef NRV_FEMTORV32_QUARK
`include "PROCESSOR/femtorv32_quark.v" // Minimalistic version of the processor for IceStick (RV32I)
`endif
...
Earlier, we modified bench_config.v to define NRV_FEMTORV32_QUARK, so the preprocessor enters the inner part:
`include "PROCESSOR/femtorv32_quark.v" // Minimalistic version of the processor for IceStick (RV32I)
Recall that at the beginning of femtorv32_quark.v we changed NRV_ARCH to rv32im:
`define NRV_ARCH "rv32im"
`define NRV_ABI "ilp32"
`define NRV_OPTIMIZE "-Os"
Therefore, the value assigned to NRV_ARCH is rv32im.
Eventually, the following is the configuration in learn-fpga/FemtoRV/FIRMWARE/config.mk:
ARCH=rv32im
OPTIMIZE=-Os
ABI=ilp32
RAM_SIZE= 65536
DEVICES= -DSSD1351=1
BOARD=testbench
As a result, the pi.c was compiled with rv32im
Third, we execute
$ make pi.baremetal.elf
$ make pi.hex
in learn-fpga/FemtoRV/FIRMWARE/EXAMPLES/
We then confirm that the compiled ELF uses the M extension by running
riscv-none-elf-objdump -d pi.baremetal.elf
in learn-fpga/FemtoRV/FIRMWARE/EXAMPLES/
to dump the assembly code.
The disassembled output contains mul, mulh, mulhu, div, divu, rem and remu.
Take part of code we produce as example in below:
RV32IM VER.
...
00000030 <inv_mod>:
30: 00058693 mv a3,a1
34: 00100793 li a5,1
38: 00000813 li a6,0
+ 3c: 02a6c733 div a4,a3,a0
+ 40: 02a6e633 rem a2,a3,a0
44: 00050693 mv a3,a0
+ 48: 02f70733 mul a4,a4,a5
4c: 40e80733 sub a4,a6,a4
50: 00078813 mv a6,a5
54: 00061a63 bnez a2,68 <IO_SSD1351_CMD+0x28>
+ 58: 02b7e533 rem a0,a5,a1
5c: 00055463 bgez a0,64 <IO_SSD1351_CMD+0x24>
60: 00b50533 add a0,a0,a1
64: 00008067 ret
68: 00070793 mv a5,a4
6c: 00060513 mv a0,a2
70: fcdff06f j 3c <inv_mod+0xc>
...
RV32I VER.
...
00000030 <inv_mod>:
30: fe010113 add sp,sp,-32
34: 00812c23 sw s0,24(sp)
38: 00912a23 sw s1,20(sp)
3c: 01212823 sw s2,16(sp)
40: 01312623 sw s3,12(sp)
44: 01412423 sw s4,8(sp)
48: 00112e23 sw ra,28(sp)
4c: 01512223 sw s5,4(sp)
50: 00050493 mv s1,a0
54: 00058913 mv s2,a1
58: 00058993 mv s3,a1
5c: 00100413 li s0,1
60: 00000a13 li s4,0
64: 00048593 mv a1,s1
68: 00098513 mv a0,s3
6c: 00005097 auipc ra,0x5
70: 1a0080e7 jalr 416(ra) # 520c <__divsi3>
74: 00040593 mv a1,s0
78: 00005097 auipc ra,0x5
7c: 0d8080e7 jalr 216(ra) # 5150 <__mulsi3>
80: 40aa0ab3 sub s5,s4,a0
84: 00048593 mv a1,s1
88: 00098513 mv a0,s3
8c: 00005097 auipc ra,0x5
90: 204080e7 jalr 516(ra) # 5290 <__modsi3>
94: 00048993 mv s3,s1
98: 00040a13 mv s4,s0
9c: 04051063 bnez a0,dc <IO_SSD1351_DAT+0x5c>
a0: 00090593 mv a1,s2
a4: 00040513 mv a0,s0
a8: 00005097 auipc ra,0x5
ac: 1e8080e7 jalr 488(ra) # 5290 <__modsi3>
b0: 00055463 bgez a0,b8 <IO_SSD1351_DAT+0x38>
b4: 01250533 add a0,a0,s2
b8: 01c12083 lw ra,28(sp)
bc: 01812403 lw s0,24(sp)
c0: 01412483 lw s1,20(sp)
c4: 01012903 lw s2,16(sp)
c8: 00c12983 lw s3,12(sp)
cc: 00812a03 lw s4,8(sp)
d0: 00412a83 lw s5,4(sp)
d4: 02010113 add sp,sp,32
d8: 00008067 ret
dc: 000a8413 mv s0,s5
e0: 00050493 mv s1,a0
e4: f81ff06f j 64 <IO_SSD1351_CMD+0x24>
...
You can see instructions like jalr 416(ra) # 520c <__divsi3>, jalr 216(ra) # 5150 <__mulsi3>, jalr 516(ra) # 5290 <__modsi3>, and so on. In other words, without the M extension each multiplication, division or remainder becomes a call to a software routine, and the listing becomes quite long if we expand those calls, so we put the full listing in our GitHub repository.
Last, we can show the result by executing
$ make BENCH
in learn-fpga/FemtoRV
Here is the output it should be:
In contrast, with the original design of quark the compiled ELF has no M-extension instructions at all. If we only replace rv32i with rv32im (without changing the core itself), here is the result: