王韻茨, 王柏皓
Our task is to implement a simplified RISC-V CPU using Chisel, supporting RV32I instruction set, including CSR instructions (Zicsr) and the B extension. The CPU should be verified through simulation using provided unit tests and must successfully run the performance counter code in https://github.com/sysprog21/rv32emu/tree/master/tests/perfcounter.
Additionally, we select three RISC-V programs from the course assignments, rewrite them to utilize the B extension, and ensure they run correctly on our improved processor.
Install dependent packages
$ sudo apt install build-essential verilator gtkwave
Install sbt (and Eclipse Temurin JDK 11)
- Code located in src/main/scala/riscv/core/InstructionFetch.scala
- Code filled in at the // lab3(InstructionFetch) comment section
// mycpu is freely redistributable under the MIT License. See the file
// "LICENSE" for information on usage and redistribution of this file.
package riscv.core
import chisel3._
import riscv.Parameters
/** Reset value of the program counter, taken from the project-wide parameters. */
object ProgramCounter {
  val EntryAddress = Parameters.EntryAddress
}

/** Instruction-fetch stage.
  *
  * Holds the program counter and forwards the fetched instruction word.
  * While `instruction_valid` is low the PC is frozen and the constant
  * 0x00000013 is emitted in place of the instruction.
  */
class InstructionFetch extends Module {
  val io = IO(new Bundle {
    val jump_flag_id          = Input(Bool())
    val jump_address_id       = Input(UInt(Parameters.AddrWidth))
    val instruction_read_data = Input(UInt(Parameters.DataWidth))
    val instruction_valid     = Input(Bool())

    val instruction_address = Output(UInt(Parameters.AddrWidth))
    val instruction         = Output(UInt(Parameters.InstructionWidth))
  })

  val pc = RegInit(ProgramCounter.EntryAddress)

  // lab3(InstructionFetch) begin
  // Next PC: jump target when a jump is requested, otherwise the sequential address.
  val next_pc = Mux(io.jump_flag_id, io.jump_address_id, pc + 4.U)
  // The register keeps its value on its own when no instruction is valid.
  when(io.instruction_valid) {
    pc := next_pc
  }
  // lab3(InstructionFetch) end

  io.instruction         := Mux(io.instruction_valid, io.instruction_read_data, 0x00000013.U)
  io.instruction_address := pc
}
- Code located in src/main/scala/riscv/core/InstructionDecode.scala
- Code filled in at the // lab3(InstructionDecode) comment section
- Add Zicsr code at // CSR comment section
// mycpu is freely redistributable under the MIT License. See the file
// "LICENSE" for information on usage and redistribution of this file.
package riscv.core
import scala.collection.immutable.ArraySeq
import chisel3._
import chisel3.util._
import riscv.Parameters
// 7-bit opcode values (compared against instruction bits 6..0 by the decoder)
// for instruction classes in which one opcode covers several operations.
object InstructionTypes {
// Loads: InstructionDecode asserts memory_read_enable for this opcode.
val L = "b0000011".U
// Register-immediate ALU operations.
val I = "b0010011".U
// Stores: InstructionDecode asserts memory_write_enable for this opcode.
val S = "b0100011".U
// Register-register ALU operations; the only opcode for which the decoder
// selects a register (not the immediate) as ALU operand 2.
val RM = "b0110011".U
// Conditional branches; Execute evaluates the condition from funct3.
val B = "b1100011".U
}
// 7-bit opcode values of instructions that the decoder matches individually.
object Instructions {
val lui = "b0110111".U
// NOTE(review): this value is not referenced by the decode/execute code shown
// alongside it; its purpose is unclear — confirm before relying on it.
val nop = "b0000001".U
val jal = "b1101111".U
val jalr = "b1100111".U
val auipc = "b0010111".U
// Opcode that makes InstructionDecode drive csr_addr / csr_op / csr_in.
val csr = "b1110011".U
val fence = "b0001111".U
}
// Selector values for the individual load instructions.
// NOTE(review): presumably matched against funct3 (instruction bits 14..12);
// the consumer is not visible here — confirm.
object InstructionsTypeL {
val lb = "b000".U
val lh = "b001".U
val lw = "b010".U
val lbu = "b100".U
val lhu = "b101".U
}
// Selector values for the register-immediate ALU instructions.
// `sri` is a single entry for the right-shift-immediate group.
// NOTE(review): how logical and arithmetic right shifts are told apart is
// decided by code not shown here — confirm.
object InstructionsTypeI {
val addi = 0.U
val slli = 1.U
val slti = 2.U
val sltiu = 3.U
val xori = 4.U
val sri = 5.U
val ori = 6.U
val andi = 7.U
}
// Selector values for the store instructions.
object InstructionsTypeS {
val sb = "b000".U
val sh = "b001".U
val sw = "b010".U
}
// Selector values for the register-register ALU instructions.
// `add_sub` and `sr` each stand for a pair of operations.
// NOTE(review): the pair member is presumably chosen from funct7 by the ALU
// control logic, which is not shown here — confirm.
object InstructionsTypeR {
val add_sub = 0.U
val sll = 1.U
val slt = 2.U
val sltu = 3.U
val xor = 4.U
val sr = 5.U
val or = 6.U
val and = 7.U
}
/** Selector values for the multiply/divide instruction group.
  *
  * `mulhum` is a misspelling of the RISC-V mnemonic `mulhu`. The correctly
  * spelled `mulhu` is provided, and `mulhum` is kept as an alias of the same
  * value so existing references keep compiling.
  */
object InstructionsTypeM {
  val mul    = 0.U
  val mulh   = 1.U
  val mulhsu = 2.U
  val mulhu  = 3.U
  // Legacy, misspelled name; prefer `mulhu`.
  val mulhum = mulhu
  val div    = 4.U
  val divu   = 5.U
  val rem    = 6.U
  val remu   = 7.U
}
// funct3 values of the conditional branches; Execute looks these up with
// funct3 (instruction bits 14..12) to decide whether the branch is taken.
object InstructionsTypeB {
val beq = "b000".U
val bne = "b001".U
val blt = "b100".U
val bge = "b101".U
val bltu = "b110".U
val bgeu = "b111".U
}
// funct3 values of the CSR instructions. InstructionDecode forwards funct3
// unchanged as csr_op, and uses the three *i values to decide whether csr_in
// carries the 5-bit immediate taken from the rs1 field.
object InstructionsTypeCSR {
val csrrw = "b001".U
val csrrs = "b010".U
val csrrc = "b011".U
val csrrwi = "b101".U
val csrrsi = "b110".U
val csrrci = "b111".U
}
// Full instruction words, as literals of Parameters.DataWidth bits.
// 0x00000013 is the same word InstructionFetch emits while no instruction is valid.
object InstructionsNop {
val nop = 0x00000013L.U(Parameters.DataWidth)
}
// Full instruction words for the return instructions.
object InstructionsRet {
val mret = 0x30200073L.U(Parameters.DataWidth)
val ret = 0x00008067L.U(Parameters.DataWidth)
}
// Full instruction words for the environment instructions. Both use the same
// 7-bit opcode as the CSR instructions (low bits 0x73) with funct3 = 0.
object InstructionsEnv {
val ecall = 0x00000073L.U(Parameters.DataWidth)
val ebreak = 0x00100073L.U(Parameters.DataWidth)
}
// 1-bit selector for ALU operand 1: register value or the instruction address.
object ALUOp1Source {
val Register = 0.U(1.W)
val InstructionAddress = 1.U(1.W)
}
// 1-bit selector for ALU operand 2: register value or the decoded immediate.
object ALUOp2Source {
val Register = 0.U(1.W)
val Immediate = 1.U(1.W)
}
// 2-bit selector for the value written back to the register file.
// Encoding 2 is unassigned: the CSR source is commented out, so the decoder has
// no selector value that routes a CSR read value to the register file.
object RegWriteSource {
val ALUResult = 0.U(2.W)
val Memory = 1.U(2.W)
// val CSR = 2.U(2.W)
val NextInstructionAddress = 3.U(2.W)
}
/** Instruction decoder.
  *
  * Splits the instruction word into its fields and derives the control
  * signals for the register file, the ALU operand muxes, the memory stage,
  * the write-back mux and the CSR file.
  *
  * Changes relative to the previous revision:
  *  - CSR instructions now read rs1 from the register file. Previously the
  *    final read-address selection omitted the CSR opcode, so the register
  *    forms (csrrw/csrrs/csrrc) always saw x0 as their source operand.
  *  - Every output is driven exactly once. The earlier version connected
  *    several outputs two or three times; only the last connection took
  *    effect, which hid the fact that the `reg_write_enable := false.B` in
  *    the CSR branch was dead code.
  *
  * NOTE(review): CSR instructions still do not write rd. No write-back source
  * carries the CSR read value (RegWriteSource.CSR is commented out), so
  * enabling the write here would store the ALU result instead.
  */
class InstructionDecode extends Module {
  val io = IO(new Bundle {
    val instruction = Input(UInt(Parameters.InstructionWidth))

    val regs_reg1_read_address = Output(UInt(Parameters.PhysicalRegisterAddrWidth))
    val regs_reg2_read_address = Output(UInt(Parameters.PhysicalRegisterAddrWidth))
    val ex_immediate           = Output(UInt(Parameters.DataWidth))
    val ex_aluop1_source       = Output(UInt(1.W))
    val ex_aluop2_source       = Output(UInt(1.W))
    val memory_read_enable     = Output(Bool())
    val memory_write_enable    = Output(Bool())
    val wb_reg_write_source    = Output(UInt(2.W))
    val reg_write_enable       = Output(Bool())
    val reg_write_address      = Output(UInt(Parameters.PhysicalRegisterAddrWidth))
    // CSR
    val csr_addr = Output(UInt(12.W))
    val csr_in   = Output(UInt(Parameters.DataWidth))
    val csr_op   = Output(UInt(3.W))
  })

  val opcode = io.instruction(6, 0)
  val funct3 = io.instruction(14, 12)
  val funct7 = io.instruction(31, 25)
  val rd     = io.instruction(11, 7)
  val rs1    = io.instruction(19, 15)
  val rs2    = io.instruction(24, 20)

  // CSR
  val is_csr = opcode === Instructions.csr
  // The immediate forms carry a 5-bit zero-extended value in the rs1 field.
  val csr_uses_uimm = funct3 === InstructionsTypeCSR.csrrwi ||
    funct3 === InstructionsTypeCSR.csrrsi ||
    funct3 === InstructionsTypeCSR.csrrci
  io.csr_addr := Mux(is_csr, io.instruction(31, 20), 0.U)
  io.csr_op   := Mux(is_csr, funct3, 0.U)
  io.csr_in   := Mux(is_csr && csr_uses_uimm, rs1, 0.U)

  val immediate = MuxLookup(
    opcode,
    Cat(Fill(20, io.instruction(31)), io.instruction(31, 20)),
    IndexedSeq(
      InstructionTypes.I -> Cat(Fill(21, io.instruction(31)), io.instruction(30, 20)),
      InstructionTypes.L -> Cat(Fill(21, io.instruction(31)), io.instruction(30, 20)),
      Instructions.jalr  -> Cat(Fill(21, io.instruction(31)), io.instruction(30, 20)),
      InstructionTypes.S -> Cat(Fill(21, io.instruction(31)), io.instruction(30, 25), io.instruction(11, 7)),
      InstructionTypes.B -> Cat(
        Fill(20, io.instruction(31)),
        io.instruction(7),
        io.instruction(30, 25),
        io.instruction(11, 8),
        0.U(1.W)
      ),
      Instructions.lui   -> Cat(io.instruction(31, 12), 0.U(12.W)),
      Instructions.auipc -> Cat(io.instruction(31, 12), 0.U(12.W)),
      // jal's imm represents a multiple of 2 bytes.
      Instructions.jal -> Cat(
        Fill(12, io.instruction(31)),
        io.instruction(19, 12),
        io.instruction(20),
        io.instruction(30, 21),
        0.U(1.W)
      )
    )
  )
  io.ex_immediate := immediate

  io.ex_aluop1_source := Mux(
    opcode === Instructions.auipc || opcode === InstructionTypes.B || opcode === Instructions.jal,
    ALUOp1Source.InstructionAddress,
    ALUOp1Source.Register
  )

  // ALU op2 from reg: R-type,
  // ALU op2 from imm: L-Type (I-type subtype),
  //                   I-type (nop=addi, jalr, csr-class, fence),
  //                   J-type (jal),
  //                   U-type (lui, auipc),
  //                   S-type (rs2 value sent to MemControl, ALU computes rs1 + imm.)
  //                   B-type (rs2 compares with rs1 in jump judge unit, ALU computes jump address PC+imm.)
  io.ex_aluop2_source := Mux(
    opcode === InstructionTypes.RM,
    ALUOp2Source.Register,
    ALUOp2Source.Immediate
  )

  // lab3(InstructionDecode) begin
  io.memory_read_enable  := opcode === InstructionTypes.L
  io.memory_write_enable := opcode === InstructionTypes.S

  // rs1 is read by every class that uses it as a source operand. All other
  // opcodes (lui, auipc, jal, ...) read x0. For the CSR immediate forms the rs1
  // field is an immediate, but reading that register is harmless because the
  // CSR file takes its operand from csr_in for those forms.
  val reads_rs1 = opcode === InstructionTypes.RM || opcode === InstructionTypes.I ||
    opcode === InstructionTypes.B || opcode === InstructionTypes.S ||
    opcode === InstructionTypes.L || opcode === Instructions.jalr || is_csr
  io.regs_reg1_read_address := Mux(reads_rs1, rs1, 0.U)

  val reads_rs2 = opcode === InstructionTypes.RM || opcode === InstructionTypes.S ||
    opcode === InstructionTypes.B
  io.regs_reg2_read_address := Mux(reads_rs2, rs2, 0.U)
  // lab3(InstructionDecode) end

  io.wb_reg_write_source := MuxCase(
    RegWriteSource.ALUResult,
    ArraySeq(
      (opcode === InstructionTypes.RM || opcode === InstructionTypes.I ||
        opcode === Instructions.lui || opcode === Instructions.auipc) -> RegWriteSource.ALUResult, // same as default
      (opcode === InstructionTypes.L)                                 -> RegWriteSource.Memory,
      (opcode === Instructions.jal || opcode === Instructions.jalr)   -> RegWriteSource.NextInstructionAddress
    )
  )

  // The CSR opcode is not in this list, so CSR instructions never write rd
  // (see the NOTE in the class comment).
  io.reg_write_enable := (opcode === InstructionTypes.RM) || (opcode === InstructionTypes.I) ||
    (opcode === InstructionTypes.L) || (opcode === Instructions.auipc) || (opcode === Instructions.lui) ||
    (opcode === Instructions.jal) || (opcode === Instructions.jalr)
  io.reg_write_address := rd
}
- Code located in src/main/scala/riscv/core/Execute.scala
- Code filled in at the // lab3(Execute) comment section
// mycpu is freely redistributable under the MIT License. See the file
// "LICENSE" for information on usage and redistribution of this file.
package riscv.core
import chisel3._
import chisel3.util.Cat
import chisel3.util.MuxLookup
import riscv.Parameters
/** Execute stage.
  *
  * Feeds the ALU with the operands selected by the decoder and resolves
  * jumps and branches: `if_jump_flag` is raised for jal, jalr and for taken
  * conditional branches, and `if_jump_address` is the immediate added to
  * either rs1 (jalr) or the address of the current instruction.
  */
class Execute extends Module {
  val io = IO(new Bundle {
    val instruction         = Input(UInt(Parameters.InstructionWidth))
    val instruction_address = Input(UInt(Parameters.AddrWidth))
    val reg1_data           = Input(UInt(Parameters.DataWidth))
    val reg2_data           = Input(UInt(Parameters.DataWidth))
    val immediate           = Input(UInt(Parameters.DataWidth))
    val aluop1_source       = Input(UInt(1.W))
    val aluop2_source       = Input(UInt(1.W))

    val mem_alu_result  = Output(UInt(Parameters.DataWidth))
    val if_jump_flag    = Output(Bool())
    val if_jump_address = Output(UInt(Parameters.DataWidth))
  })

  val opcode = io.instruction(6, 0)
  val funct3 = io.instruction(14, 12)
  val funct7 = io.instruction(31, 25)
  val rd     = io.instruction(11, 7)
  val uimm   = io.instruction(19, 15)

  val alu      = Module(new ALU)
  val alu_ctrl = Module(new ALUControl)

  alu_ctrl.io.opcode := opcode
  alu_ctrl.io.funct3 := funct3
  alu_ctrl.io.funct7 := funct7

  // lab3(Execute) begin
  // Operand selection as requested by the decoder.
  val op1_from_pc  = io.aluop1_source === ALUOp1Source.InstructionAddress
  val op2_from_imm = io.aluop2_source === ALUOp2Source.Immediate
  alu.io.func := alu_ctrl.io.alu_funct
  alu.io.op1  := Mux(op1_from_pc, io.instruction_address, io.reg1_data)
  alu.io.op2  := Mux(op2_from_imm, io.immediate, io.reg2_data)
  // lab3(Execute) end

  io.mem_alu_result := alu.io.result

  // Comparison results shared by the six branch conditions.
  val regs_equal    = io.reg1_data === io.reg2_data
  val less_signed   = io.reg1_data.asSInt < io.reg2_data.asSInt
  val less_unsigned = io.reg1_data < io.reg2_data

  // funct3 values that are not a defined branch never jump.
  val branch_taken = MuxLookup(
    funct3,
    false.B,
    IndexedSeq(
      InstructionsTypeB.beq  -> regs_equal,
      InstructionsTypeB.bne  -> !regs_equal,
      InstructionsTypeB.blt  -> less_signed,
      InstructionsTypeB.bge  -> !less_signed,
      InstructionsTypeB.bltu -> less_unsigned,
      InstructionsTypeB.bgeu -> !less_unsigned
    )
  )

  val is_jalr          = opcode === Instructions.jalr
  val is_unconditional = opcode === Instructions.jal || is_jalr
  val is_branch        = opcode === InstructionTypes.B
  io.if_jump_flag := is_unconditional || (is_branch && branch_taken)

  // jalr is relative to rs1; jal and branches are relative to the current PC.
  val jump_base = Mux(is_jalr, io.reg1_data, io.instruction_address)
  io.if_jump_address := io.immediate + jump_base
}
- Code located in src/main/scala/riscv/core/MemoryAccess.scala
- Already completed by the lecturer
- Code located in src/main/scala/riscv/core/WriteBack.scala
- Already completed by the lecturer
Code located in src/main/scala/riscv/core/CSRFile.scala
The CSRFile module supports key RISC-V CSR instructions such as CSRRW, CSRRS, and CSRRC. The module provides a flexible interface for reading, writing, and modifying CSR registers based on the input operation type (csr_op).
Special registers, such as mstatus, mtvec, mcause, and mepc, are initialized to predefined values to handle privileged mode operations and exception handling. The module can handle both register-based operations (using rs1_data) and immediate-based operations (using csr_in).
package riscv.core
import chisel3._
import riscv.Parameters
/** Control and status register file.
  *
  * `csr_out` always shows the current (pre-update) value of the register
  * selected by `csr_addr`. `csr_op` is the funct3 field of the CSR
  * instruction, exactly as forwarded by the decoder:
  *
  *   001 csrrw   write rs1_data
  *   010 csrrs   set the bits given in rs1_data
  *   011 csrrc   clear the bits given in rs1_data
  *   101 csrrwi  write csr_in
  *   110 csrrsi  set the bits given in csr_in
  *   111 csrrci  clear the bits given in csr_in
  *
  * Any other value (in particular 0, which the decoder drives for every
  * non-CSR instruction) leaves all registers untouched.
  *
  * Fixes relative to the previous revision:
  *  - The operation codes were compared against 0/1/2/4/5/6, which does not
  *    match the funct3 encoding. Every non-CSR instruction therefore
  *    overwrote the addressed register with rs1_data, and each real CSR
  *    operation was decoded as a different one.
  *  - mstatus/mtvec/mcause/mepc were driven to zero by an unconditional
  *    connection every cycle, so a written value survived for one cycle only.
  *    All registers, including these four, already reset to zero via RegInit.
  */
class CSRFile extends Module {
  val io = IO(new Bundle {
    val rs1_data = Input(UInt(Parameters.DataWidth))
    val csr_addr = Input(UInt(12.W))
    val csr_in   = Input(UInt(Parameters.DataWidth))
    val csr_op   = Input(UInt(3.W))
    val csr_out  = Output(UInt(Parameters.DataWidth))
  })

  // One register per 12-bit CSR address, all reset to zero
  // (this covers mstatus 0x300, mtvec 0x305, mepc 0x341 and mcause 0x342).
  val csrRegs = RegInit(VecInit(Seq.fill(4096)(0.U(Parameters.DataWidth))))

  val current = csrRegs(io.csr_addr)
  io.csr_out := current

  when(io.csr_op === "b001".U) { // csrrw
    csrRegs(io.csr_addr) := io.rs1_data
  }.elsewhen(io.csr_op === "b010".U) { // csrrs
    csrRegs(io.csr_addr) := current | io.rs1_data
  }.elsewhen(io.csr_op === "b011".U) { // csrrc
    csrRegs(io.csr_addr) := current & ~io.rs1_data
  }.elsewhen(io.csr_op === "b101".U) { // csrrwi
    csrRegs(io.csr_addr) := io.csr_in
  }.elsewhen(io.csr_op === "b110".U) { // csrrsi
    csrRegs(io.csr_addr) := current | io.csr_in
  }.elsewhen(io.csr_op === "b111".U) { // csrrci
    csrRegs(io.csr_addr) := current & ~io.csr_in
  }
}
Code located in src/main/scala/riscv/core/ALU.scala
Code filled in at the // B extention comment section
The code implements part of the RISC-V B (bit-manipulation) extension. The supported operations include the logical operations ANDN, ORN, and XNOR, the shift and rotate operations SHFL, ROL (rotate left), and ROR (rotate right), the counting operations CTZ and CPOP, and the single-bit operations BEXT, BSET, and BCLR. These instructions manipulate individual bits or groups of bits in a single instruction, improving performance for applications that require low-level data transformations.
// mycpu is freely redistributable under the MIT License. See the file
// "LICENSE" for information on usage and redistribution of this file.
package riscv.core
import chisel3._
import chisel3.experimental.ChiselEnum
import chisel3.util._
import riscv.Parameters
// Operation selector consumed by the ALU (`io.func`).
// The values are created by a single `Value` call list, so their encoding
// follows the order written here; append new operations at the end to keep
// existing encodings stable. `zero` has no case in the ALU and therefore
// yields the ALU's default result of 0.
object ALUFunctions extends ChiselEnum {
val zero, add, sub, sll, slt, xor, or, and, srl, sra, sltu, andn, orn, xnor, shfl, rol, ror,
clz, ctz, cpop, bext, bset, bclr = Value
}
/** Combinational ALU.
  *
  * `result` is the outcome of applying `func` to `op1` and `op2`. A selector
  * without a case below (e.g. `zero`) yields 0. Shift, rotate and single-bit
  * operations use only the low five bits of `op2` as the amount/index.
  *
  * Fixes relative to the previous revision:
  *  - `ctz` returned the population count of op1; it now returns the number
  *    of trailing zero bits.
  *  - `clz` was declared in ALUFunctions but had no case, so it returned 0;
  *    it now returns the number of leading zero bits.
  *  Both return the operand width (32) when op1 is zero.
  */
class ALU extends Module {
  val io = IO(new Bundle {
    val func = Input(ALUFunctions())

    val op1 = Input(UInt(Parameters.DataWidth))
    val op2 = Input(UInt(Parameters.DataWidth))

    val result = Output(UInt(Parameters.DataWidth))
  })

  io.result := 0.U

  val dataWidth = Parameters.DataWidth
  val width     = dataWidth.get
  // Shift amount / bit index shared by every shift-like operation.
  val shamt = io.op2(4, 0)

  switch(io.func) {
    is(ALUFunctions.add) {
      io.result := io.op1 + io.op2
    }
    is(ALUFunctions.sub) {
      io.result := io.op1 - io.op2
    }
    is(ALUFunctions.sll) {
      io.result := io.op1 << shamt
    }
    is(ALUFunctions.slt) {
      io.result := io.op1.asSInt < io.op2.asSInt
    }
    is(ALUFunctions.xor) {
      io.result := io.op1 ^ io.op2
    }
    is(ALUFunctions.or) {
      io.result := io.op1 | io.op2
    }
    is(ALUFunctions.and) {
      io.result := io.op1 & io.op2
    }
    is(ALUFunctions.srl) {
      io.result := io.op1 >> shamt
    }
    is(ALUFunctions.sra) {
      io.result := (io.op1.asSInt >> shamt).asUInt
    }
    is(ALUFunctions.sltu) {
      io.result := io.op1 < io.op2
    }
    // B extention
    is(ALUFunctions.andn) {
      io.result := io.op1 & ~io.op2 // ANDN: op1 AND NOT op2
    }
    is(ALUFunctions.orn) {
      io.result := io.op1 | ~io.op2 // ORN: op1 OR NOT op2
    }
    is(ALUFunctions.xnor) {
      io.result := ~(io.op1 ^ io.op2) // XNOR: NOT (op1 XOR op2)
    }
    is(ALUFunctions.shfl) {
      // Implemented as a plain logical left shift (same result as `sll`).
      // NOTE(review): this is not a bit shuffle/permutation — confirm that a
      // left shift is what callers of `shfl` expect.
      io.result := io.op1 << shamt
    }
    is(ALUFunctions.rol) {
      // ROL: rotate left. The bits shifted out at the top re-enter at the
      // bottom; for shamt = 0 the right-shift term is op1 >> 32 = 0.
      io.result := (io.op1 << shamt) | (io.op1 >> (width.U - shamt))
    }
    is(ALUFunctions.ror) {
      // ROR: rotate right. Only the low `width` bits of the left-shift term
      // survive the assignment, which are exactly the wrapped-around bits.
      io.result := (io.op1 >> shamt) | (io.op1 << (width.U - shamt))
    }
    is(ALUFunctions.clz) {
      // CLZ: index of the highest set bit counted from the top, i.e. the
      // lowest set bit of the bit-reversed operand.
      io.result := Mux(io.op1 === 0.U, width.U, PriorityEncoder(Reverse(io.op1)))
    }
    is(ALUFunctions.ctz) {
      // CTZ: index of the lowest set bit.
      io.result := Mux(io.op1 === 0.U, width.U, PriorityEncoder(io.op1))
    }
    is(ALUFunctions.cpop) {
      // CPOP: number of 1 bits in op1.
      io.result := PopCount(io.op1)
    }
    is(ALUFunctions.bext) {
      // BEXT: the single bit of op1 at index shamt, zero-extended.
      io.result := io.op1(shamt)
    }
    is(ALUFunctions.bset) {
      // BSET: op1 with bit shamt set.
      io.result := io.op1 | (1.U << shamt)
    }
    is(ALUFunctions.bclr) {
      // BCLR: op1 with bit shamt cleared.
      io.result := io.op1 & ~(1.U << shamt)
    }
  }
}
- Code located in src/main/scala/riscv/core/CPU.scala
- Code filled in at the // lab3(cpu) comment section and //CSR comment section
// mycpu is freely redistributable under the MIT License. See the file
// "LICENSE" for information on usage and redistribution of this file.
package riscv.core
import chisel3._
import chisel3.util.Cat
import riscv.CPUBundle
import riscv.Parameters
/** Top level of the single-cycle core: instantiates the stages, the register
  * file and the CSR file, and wires them to each other and to `CPUBundle`.
  *
  * Each signal is connected exactly once. The earlier version connected the
  * register-file write port and all write-back inputs twice with identical
  * sources, which was harmless but made the wiring harder to audit.
  *
  * NOTE(review): `csrFile.io.csr_out` is not connected to anything, so the
  * value read by a CSR instruction never reaches the register file.
  */
class CPU extends Module {
  val io = IO(new CPUBundle)

  val regs       = Module(new RegisterFile)
  val inst_fetch = Module(new InstructionFetch)
  val id         = Module(new InstructionDecode)
  val ex         = Module(new Execute)
  val mem        = Module(new MemoryAccess)
  val wb         = Module(new WriteBack)
  val csrFile    = Module(new CSRFile)

  // The top address bits select the target device.
  io.deviceSelect := mem.io.memory_bundle
    .address(Parameters.AddrBits - 1, Parameters.AddrBits - Parameters.SlaveDeviceCountBits)

  // Instruction fetch
  inst_fetch.io.jump_address_id       := ex.io.if_jump_address
  inst_fetch.io.jump_flag_id          := ex.io.if_jump_flag
  inst_fetch.io.instruction_valid     := io.instruction_valid
  inst_fetch.io.instruction_read_data := io.instruction
  io.instruction_address              := inst_fetch.io.instruction_address

  // Register file
  regs.io.write_enable  := id.io.reg_write_enable
  regs.io.write_address := id.io.reg_write_address
  regs.io.write_data    := wb.io.regs_write_data
  regs.io.read_address1 := id.io.regs_reg1_read_address
  regs.io.read_address2 := id.io.regs_reg2_read_address

  regs.io.debug_read_address := io.debug_read_address
  io.debug_read_data         := regs.io.debug_read_data

  // Decode
  id.io.instruction := inst_fetch.io.instruction

  // CSR
  csrFile.io.rs1_data := regs.io.read_data1
  csrFile.io.csr_addr := id.io.csr_addr
  csrFile.io.csr_in   := id.io.csr_in
  csrFile.io.csr_op   := id.io.csr_op

  // lab3(cpu) begin
  // Connect inputs of the Execute module
  ex.io.instruction         := inst_fetch.io.instruction
  ex.io.instruction_address := inst_fetch.io.instruction_address
  ex.io.reg1_data           := regs.io.read_data1
  ex.io.reg2_data           := regs.io.read_data2
  ex.io.immediate           := id.io.ex_immediate
  ex.io.aluop1_source       := id.io.ex_aluop1_source
  ex.io.aluop2_source       := id.io.ex_aluop2_source
  // lab3(cpu) end

  // Memory access
  mem.io.alu_result          := ex.io.mem_alu_result
  mem.io.reg2_data           := regs.io.read_data2
  mem.io.memory_read_enable  := id.io.memory_read_enable
  mem.io.memory_write_enable := id.io.memory_write_enable
  mem.io.funct3              := inst_fetch.io.instruction(14, 12)

  // The device-select bits are replaced by zeros on the address sent to the bus.
  io.memory_bundle.address := Cat(
    0.U(Parameters.SlaveDeviceCountBits.W),
    mem.io.memory_bundle.address(Parameters.AddrBits - 1 - Parameters.SlaveDeviceCountBits, 0)
  )
  io.memory_bundle.write_enable  := mem.io.memory_bundle.write_enable
  io.memory_bundle.write_data    := mem.io.memory_bundle.write_data
  io.memory_bundle.write_strobe  := mem.io.memory_bundle.write_strobe
  mem.io.memory_bundle.read_data := io.memory_bundle.read_data

  // Write back
  wb.io.instruction_address := inst_fetch.io.instruction_address
  wb.io.alu_result          := ex.io.mem_alu_result
  wb.io.memory_read_data    := mem.io.wb_memory_read_data
  wb.io.regs_write_source   := id.io.wb_reg_write_source
}
$ sbt test
$ make verilator
https://github.com/sysprog21/rv32emu/tree/master/tests/perfcounter
Located in rv32emu/tests/perfcounter/Makefile
# Builds the perfcounter test program and a Verilator simulation of the CPU.
#
# Changes relative to the previous revision:
#  - run-test started ./obj_dir/Vtop, but Verilator names the binary after the
#    top module (Top.v -> VTop), which is also what the build step creates.
#  - The sub-make is invoked through $(MAKE) so that flags and the jobserver
#    are inherited.
#  - The inline comment after VERILATOR_BIN was moved to its own line; a
#    trailing comment leaves whitespace at the end of the variable's value.
#  - The Verilator directory is written once (OUT) and may be overridden from
#    the command line or the environment.
#  - all and run-test are declared phony.
.PHONY: all clean run-test

include ../../mk/toolchain.mk

# Directory holding Top.v and sim_main.cpp.
OUT ?= /Users/yuncih/ca2023-lab3/verilog/verilator
# Verilator path
VERILATOR_BIN = $(OUT)

CFLAGS = -O2 -Wall
CFLAGS += -I$(OUT)/

SIM_BIN = $(OUT)/sim_main

OBJS = \
	getcycles.o \
	getinstret.o \
	sparkle.o \
	main.o

BIN = perfcount.elf

# NOTE(review): the recipe produces obj_dir/VTop, not $(SIM_BIN), so this
# target is rebuilt on every invocation — confirm whether that is intended.
# Previous command:
#   verilator --trace --exe --cc $(OUT)/sim_main.cpp $(OUT)/Top.v && make -C obj_dir -f VTop.mk
$(SIM_BIN): $(OBJS)
	verilator --trace --cc $(OUT)/Top.v --exe $(OUT)/sim_main.cpp $(CFLAGS) -Wno-lint && $(MAKE) -C obj_dir -j -f VTop.mk VTop

run-test: $(SIM_BIN)
	./obj_dir/VTop $(OUT)/perfcount.elf

%.o: %.S
	$(CROSS_COMPILE)gcc $(CFLAGS) -c -o $@ $<

%.o: %.c
	$(CROSS_COMPILE)gcc $(CFLAGS) -c -o $@ $<

all: $(BIN)

$(BIN): $(OBJS)
	$(CROSS_COMPILE)gcc -o $@ $^

clean:
	$(RM) $(BIN) $(OBJS) $(OUT)/sim_main
$ make
Where applicable, traditional arithmetic and comparison sequences (e.g. addi followed by beq) were replaced by a B-extension instruction followed by a compare-with-zero branch (bclri followed by beqz), which manipulates and tests specific bits of a register directly. This reduces the instruction count and improves performance.
The operations for saving and restoring the return address and other values on the stack remained unchanged as they are not directly impacted by B extension instructions
No changes were made to system calls as these operations are not influenced by the B extension.
# Tower of Hanoi for n disks, printing one line per move.
# NOTE(review): the ecall numbers used below (1, 4, 10, 11) look like the
# RARS/Venus print-int / print-string / exit / print-char services — confirm
# against the environment this program is run in.
.data
md: .string "Move Disk "
from: .string " from '"
to: .string "' to '"
newline: .string "'\n"
src: .string "A"
aux: .string "B"
dst: .string "C"
n: .word 3
.text
.globl main
# main: load n and the three peg names, then call hanoi(a1 = n, a2 = src,
# a3 = aux, a4 = dst).
main:
lw a1, n
la t0, src
la t1, dst
la t2, aux
lbu a2, 0(t0)
lbu a3, 0(t2)
lbu a4, 0(t1)
jal x1, hanoi
li a7, 10
ecall
# hanoi: a1 = number of disks, a2 = source peg, a3 = spare peg, a4 = target peg.
# Stack frame (20 bytes): 0 = return address, 4 = a1, 8 = a2, 12 = a3, 16 = a4.
hanoi:
addi sp, sp, -20
sw x1, 0(sp)
sw a1, 4(sp)
sw a2, 8(sp)
sw a3, 12(sp)
sw a4, 16(sp)
# Base-case test: clearing bit 0 of a1 gives zero exactly when a1 is 0 or 1.
bclri t0, a1, 0
beqz t0, return
# Step 1: move a1-1 disks from the source peg to the spare peg
# (the spare and target roles are swapped for this call).
lw a1, 4(sp)
addi a1, a1, -1
lbu a2, 8(sp)
lbu a3, 16(sp)
lbu a4, 12(sp)
jal x1, hanoi
# Step 2: report the move of disk a1 from the source peg to the target peg.
lw a1, 4(sp)
lbu a2, 8(sp)
lbu a3, 16(sp)
jal x1, print
# Step 3: move the a1-1 disks from the spare peg to the target peg.
lw a1, 4(sp)
addi a1, a1, -1
lbu a2, 12(sp)
lbu a3, 8(sp)
lbu a4, 16(sp)
jal x1, hanoi
# Restore the return address, release the frame and return.
lw x1, 0(sp)
addi sp, sp, 20
jalr x0, x1, 0
# Base case: report the single move from the source peg to the target peg.
return:
lw a1, 4(sp)
lbu a2, 8(sp)
lbu a3, 16(sp)
jal x1, print
lw x1, 0(sp)
addi sp, sp, 20
jalr x0, x1, 0
# print: emits md, the value of a1, from, the character in a2, to, the
# character in a3 and newline, in that order. Overwrites a0 and a7.
print:
la a0, md
li a7, 4
ecall
addi a0, a1, 0
li a7, 1
ecall
la a0, from
li a7, 4
ecall
addi a0, a2, 0
li a7, 11
ecall
la a0, to
li a7, 4
ecall
addi a0, a3, 0
li a7, 11
ecall
la a0, newline
li a7, 4
ecall
jalr x0, x1, 0
The primary change is the use of the B extension, specifically the clmul instruction (Zbc), which performs carry-less multiplication: the partial products are combined with XOR instead of addition.
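The intent is to replace the call to the square function with a single clmul operation, simplifying the code. Note, however, that because clmul XORs the partial products instead of adding them, clmul(n, n) equals n² only when n has a single bit set (for example, clmul(5, 5) = 17, not 25); an exact integer square still requires mul (M extension) or a shift-and-add routine.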
The function continues to use a recursive approach to calculate the sum of squares from 𝑛 down to 0. In each recursion, the current square (calculated using clmul) is added to the accumulator (a1), and n is decremented.
The use of the ble instruction for branching when 𝑛 ≤ 0 and the use of the addi instruction for decrementing n make the flow control simple and efficient. The termination condition (when 𝑛 ≤ 0) is checked with minimal overhead.
# Function: sum_of_squares
# Computes 5^2 + 4^2 + ... + 1^2 (the start value n = 5 is loaded at entry).
# The result is returned in a0.
#
# Fixes relative to the previous revision:
#  - The recursive call jumped back to sum_of_squares itself, which reloads
#    n = 5 and the accumulator = 0 on every level, so the recursion never
#    terminated. The recursion now targets sum_of_squares_rec, after the
#    initialisation.
#  - clmul is a carry-less multiply (partial products are XORed, not added),
#    so clmul n, n is not n*n in general (clmul 5, 5 = 17). The square is now
#    computed by the shift-and-add routine `square`, which needs only base
#    RV32I instructions.
.section .text
.global sum_of_squares
sum_of_squares:
    li   a0, 5                    # n
    li   a1, 0                    # accumulator
# Recursive worker: a0 = n, a1 = running sum. Returns the final sum in a0.
sum_of_squares_rec:
    ble  a0, x0, zero_case
recurse_case:
    addi sp, sp, -8
    sw   ra, 0(sp)
    sw   a0, 4(sp)                # keep n across the call to square
    jal  ra, square               # a0 = n * n (a1 is preserved)
    add  a1, a1, a0               # accumulator += n * n
    lw   t0, 4(sp)
    addi a0, t0, -1               # n - 1
    jal  ra, sum_of_squares_rec
    lw   ra, 0(sp)
    addi sp, sp, 8
    jalr x0, ra, 0
zero_case:
    add  a0, a1, x0               # result = accumulator
    jalr x0, ra, 0

# Function: square
# Computes the square of a non-negative integer (a0 = n), returns result in a0.
# Shift-and-add multiplication; clobbers t0, t1 and t2.
square:
    add  t0, a0, x0               # multiplicand, shifted left each round
    add  t1, a0, x0               # multiplier, consumed from the low bit upwards
    li   a0, 0                    # product
square_loop:
    beqz t1, square_done
    andi t2, t1, 1
    beqz t2, square_skip
    add  a0, a0, t0               # low multiplier bit set: add the multiplicand
square_skip:
    slli t0, t0, 1
    srli t1, t1, 1
    j    square_loop
square_done:
    ret
The instruction bext t2, a4, 31 is used to extract the sign bit (31st bit) of the significand a4. This operation is more efficient than shifting and masking to isolate the sign bit, which was the previous approach using srl and andi.
The instruction orc.b t4, a5 is intended to compute the sticky bit for the right shift of the significand, i.e. to record whether any set bits are present. Note that orc.b (Zbb) sets every bit of a byte to 1 if any bit of that byte is set, so it produces a per-byte 0x00/0xFF mask rather than a single sticky bit.
The subsequent or instruction efficiently propagates the sticky bit into the significand a5.
The shift instructions slli and srli are used more effectively to pack and unpack the exponent and significand. While this isn't directly related to the B extension, it showcases the overall effort to optimize shifts and bit manipulations in the code for maximum performance.
# fadd: adds two single-precision values held as raw bit patterns.
# The operands are loaded at entry: a0 = 0x3F800000 (1.0f), a1 = 0x40000000 (2.0f).
# On the main path the packed result is left in a0 and a7 is set to 0 if it
# equals 0x40400000 (3.0f), otherwise to 1.
fadd:
li a0, 0x3F800000
li a1, 0x40000000
# a2 / a3 = 8-bit exponent fields of x / y.
srl a2, a0, 23
andi a2, a2, 0xFF
srl a3, a1, 23
andi a3, a3, 0xFF
# A zero exponent field on either side returns the other operand unchanged.
beqz a2, __addsf_return_y_flushed
beqz a3, __addsf_return_x
# a4 / a5 = fraction bits of x / y, left-aligned.
slli a4, a0, 9
slli a5, a1, 9
# NOTE(review): t0 is used further down as the bound in
# `bgeu a2, t0, overflow_xe_gte_ye`. clmul yields the carry-less product of the
# exponent with itself, which is not a fixed exponent limit — confirm intent.
clmul t0, a2, a2
# Exponent field 255 on either side: return that operand as is.
li t1, 255
beq a2, t1, __addsf_x_nan_inf
beq a3, t1, __addsf_y_nan_inf
# Move the fractions down to leave 3 low bits free and insert the implicit 1
# at bit 26.
srli a4, a4, 6
srli a5, a5, 6
li t1, 1 << (23 + 3)
or a4, a4, t1
or a5, a5, t1
# If x is negative, invert its significand.
# NOTE(review): xori with -1 is a one's-complement inversion, not a
# two's-complement negation — confirm this is intended.
srl t2, a0, 31
andi t2, t2, 1
beq t2, x0, skip_neg_x
xori a4, a4, -1
skip_neg_x:
# t1 = 25: exponent difference at or above which the smaller operand is ignored.
li t1, 25
# Same sign handling for y.
srl t2, a1, 31
andi t2, t2, 1
beq t2, x0, skip_neg_y
xori a5, a5, -1
skip_neg_y:
# Make sure the operand with the larger exponent is in a0 / a2 / a4.
bltu a2, a3, __addsf_ye_gt_xe
__addsf_xe_gte_ye:
# t3 = exponent difference; align y's significand by shifting it right.
sub t3, a2, a3
bgeu t3, t1, __addsf_return_x
sra a5, a5, t3
# NOTE(review): orc.b turns every non-zero byte of a5 into 0xFF, so the
# following `or` sets whole bytes of the significand rather than a single
# sticky bit — confirm against the intended rounding behaviour.
orc.b t4, a5
or a5, a5, t4
add a4, a4, a5
beqz a4, __addsf_return_0
# t2 = bit 31 of the sum (0 or 1).
# NOTE(review): the xor/sub pair below negates a4 only when t2 is an all-ones
# mask; with t2 = 1 it flips bit 0 and subtracts 1 instead — confirm.
bext t2, a4, 31
xor a4, a4, t2
sub a4, a4, t2
# Normalise: shift a4 left until the next shift would set bit 31, counting the
# shifts in t5.
li t5, 0
norm_loop_xe_gte_ye:
sll t4, a4, 1
bltz t4, norm_done_xe_gte_ye
addi t5, t5, 1
sll a4, a4, 1
j norm_loop_xe_gte_ye
norm_done_xe_gte_ye:
# Adjust the exponent by the shift count and range-check it.
sub a2, a2, t5
bgeu a2, t0, overflow_xe_gte_ye
bltz a2, underflow_xe_gte_ye
# Pack exponent, fraction and sign into a0.
slli a2, a2, 23
srli a4, a4, 9
or a0, a2, a4
sll t2, t2, 31
or a0, a0, t2
# Self-check against 3.0f.
li t0, 0x40400000
beq a0, t0, correct
li a7, 1
j end_program
correct:
li a7, 0
end_program:
jalr x0, ra, 0
# Exponent below zero: return +0.
underflow_xe_gte_ye:
add a0, x0, x0
jalr x0, ra, 0
# Exponent at or above the bound in t0: return infinity with the sign in t2.
overflow_xe_gte_ye:
li a0, 0x7F800000
sll t2, t2, 31
or a0, a0, t2
jalr x0, ra, 0
# Swap x and y (raw value, exponent and significand) using t6 as scratch,
# then continue on the common path.
__addsf_ye_gt_xe:
mv t6, a0
mv a0, a1
mv a1, t6
mv t6, a2
mv a2, a3
mv a3, t6
mv t6, a4
mv a4, a5
mv a5, t6
j __addsf_xe_gte_ye
# Early exits: return x (already in a0), y (copied from a1) or zero.
__addsf_x_nan_inf:
add a0, a0, x0
jalr x0, ra, 0
__addsf_y_nan_inf:
add a0, a1, x0
jalr x0, ra, 0
__addsf_return_y_flushed:
add a0, a1, x0
jalr x0, ra, 0
__addsf_return_x:
jalr x0, ra, 0
__addsf_return_0:
add a0, x0, x0
jalr x0, ra, 0
Replaced srli and slli operations with a single srai instruction to directly extract and position the sign bit. This reduces instruction count and simplifies the code.
Used neg instead of sub for negating the integer when it is negative. This is more concise and leverages RISC-V's specific instruction for negation.
Replaced the manual loop for counting leading zeros (clz_loop) with the clz instruction from the B extension, which calculates the count in a single operation, significantly improving efficiency.
Introduced srli and bnez to directly check for rounding overflow without additional intermediate operations. This eliminates unnecessary conditional jumps and simplifies rounding logic.
# Convert signed int to float
# float i2f(s32 num);
# INPUT: A0 = integer number
# OUTPUT: A0 = float number (IEEE 754 single-precision)
# Clobbers: a1, t0, t1, t2, t3
#
# Fixes relative to the previous revision:
#  - The sign bit was kept in a1, which was then overwritten by the normalised
#    mantissa, so negative inputs returned a positive float. It now lives in t2
#    and is ORed into the result.
#  - The implicit leading 1 of the normalised mantissa was never removed before
#    packing, so it corrupted the exponent field (10 produced 0x41A80000 instead
#    of 0x41200000). It is now dropped with a left shift by one.
#  - Rounding overflow was detected by testing bit 31 after adding the rounding
#    constant, but bit 31 is set in the normal case and clear only after a
#    wrap-around. The carry out of the addition is now used instead.
#  - Removed a duplicated `or a0, a0, a1`.
i2f:
    srai t2, a0, 31              # 0 for non-negative, -1 for negative input
    slli t2, t2, 31              # keep only the sign bit (bit 31)
    bgez a0, no_negate
    neg  a0, a0                  # magnitude (INT_MIN stays 0x80000000, which is its magnitude)
no_negate:
    beqz a0, zero_result
    clz  t0, a0                  # number of leading zeros of the magnitude
    sll  a1, a0, t0              # normalise: leading 1 moves to bit 31
    li   t1, 158                 # 127 (bias) + 31
    sub  a0, t1, t0              # biased exponent
    addi t1, a1, 128             # round: add half of the discarded 8 bits
    sltu t3, t1, a1              # 1 if the addition wrapped (mantissa rounded up to 2.0)
    add  a0, a0, t3              # ... in which case the exponent grows by one
    slli t1, t1, 1               # drop the implicit leading 1 (after a wrap the fraction is 0)
    srli t1, t1, 9               # 23-bit fraction in bits 22..0
    slli a0, a0, 23              # exponent into bits 30..23
    or   a0, a0, t1
    or   a0, a0, t2              # sign
    jalr x0, ra, 0
zero_result:
    or   a0, x0, t2              # +0.0 (the sign is 0 for a zero input)
    jalr x0, ra, 0

# Self-test: converts 10 and returns a0 = 1 if the result is 0x41200000
# (10.0f), otherwise a0 = 0. The return address is saved around the call
# because `jal i2f` overwrites ra.
test_i2f:
    addi sp, sp, -4
    sw   ra, 0(sp)
    li   a0, 10
    jal  i2f
    lw   ra, 0(sp)
    addi sp, sp, 4
    li   t0, 0x41200000
    beq  a0, t0, correct
    j    incorrect
correct:
    li   a0, 1
    jalr x0, ra, 0
incorrect:
    li   a0, 0
    jalr x0, ra, 0
$ sudo apt update
$ sudo apt install gcc-riscv64-unknown-elf -y
Code located in csrc/Makefile
# Builds the test programs in this directory into raw binaries (*.asmbin) that
# the simulator loads as instruction memory.
#
# Changes relative to the previous revision:
#  - The last entry of BINS no longer ends in a line continuation; the
#    definition used to run on into the comment line that followed it.
#  - The link steps use $(LD), which was defined but never used.
#  - all, update and clean are declared phony.
CROSS_COMPILE ?= riscv64-unknown-elf-

# RV32I with CSR access and the bit-manipulation extensions used by the programs.
ASFLAGS = -march=rv32i_zicsr_zba_zbb_zbc_zbs -mabi=ilp32
CFLAGS = -O0 -Wall -march=rv32i_zicsr_zba_zbb_zbc_zbs -mabi=ilp32
LDFLAGS = --oformat=elf32-littleriscv

AS := $(CROSS_COMPILE)as
CC := $(CROSS_COMPILE)gcc
LD := $(CROSS_COMPILE)ld
OBJCOPY := $(CROSS_COMPILE)objcopy

.PHONY: all update clean

%.o: %.S
	$(AS) -R $(ASFLAGS) -o $@ $<

%.elf: %.S
	$(AS) -R $(ASFLAGS) -o $(@:.elf=.o) $<
	$(LD) -o $@ -T link.lds $(LDFLAGS) $(@:.elf=.o)

%.elf: %.c init.o
	$(CC) $(CFLAGS) -c -o $(@:.elf=.o) $<
	$(LD) -o $@ -T link.lds $(LDFLAGS) $(@:.elf=.o) init.o

# Only the .text and .data sections end up in the raw image.
%.asmbin: %.elf
	$(OBJCOPY) -O binary -j .text -j .data $< $@

BINS = \
	fibonacci.asmbin \
	hello.asmbin \
	mmio.asmbin \
	quicksort.asmbin \
	sb.asmbin \
	Q2.asmbin \
	Q4_A.asmbin \
	Q4_H_1.asmbin \
	Q4_H_2.asmbin

# Clear the .DEFAULT_GOAL special variable, so that the following turns
# to the first target after .DEFAULT_GOAL is not set.
.DEFAULT_GOAL :=

all: $(BINS)

update: $(BINS)
	cp -f $(BINS) ../src/main/resources

clean:
	$(RM) *.o *.elf *.asmbin
Put the files (Q2.S, Q4_A.S, Q4_H_1.S and Q4_H_2.S) in csrc/
$ make all
$ make update
$ ./run-verilator.sh -instruction src/main/resources/Q2.asmbin -time 2000 -vcd Q2_dump.vcd
Q2
Q4_A
Q4_H_1
Q4_H_2