Implement RISC-V core

王韻茨, 王柏皓

Task Description

Our task is to implement a simplified RISC-V CPU in Chisel that supports the RV32I instruction set, the CSR instructions (Zicsr), and the B extension. The CPU must be verified in simulation with the provided unit tests and must successfully run the performance-counter code in https://github.com/sysprog21/rv32emu/tree/master/tests/perfcounter.

Additionally, we select three RISC-V programs from the course assignments, rewrite them to utilize the B extension, and ensure they run correctly on our improved processor.

Complete Lab3 And Add Zicsr and B extension

Prerequisites

Install dependent packages

$ sudo apt install build-essential verilator gtkwave

Install sbt (and Eclipse Temurin JDK 11)

Five stages of instruction execution

1. Instruction Fetch

Code located in src/main/scala/riscv/core/InstructionFetch.scala

Code filled in at the // lab3(InstructionFetch) comment section

// mycpu is freely redistributable under the MIT License. See the file
// "LICENSE" for information on usage and redistribution of this file.

package riscv.core

import chisel3._
import riscv.Parameters

// Holds the reset value of the program counter, re-exported from the
// project-wide Parameters object so the fetch stage has a local name for it.
object ProgramCounter {
  // Value the PC register is initialised to (used by RegInit in InstructionFetch).
  val EntryAddress = Parameters.EntryAddress
}

/**
 * Instruction-fetch stage.
 *
 * Keeps the program counter and forwards the instruction word supplied by the
 * instruction memory.
 *
 *  - While `instruction_valid` is high the fetched word is passed through and
 *    the PC advances: to `jump_address_id` when `jump_flag_id` is set,
 *    otherwise to PC + 4.
 *  - While `instruction_valid` is low the PC holds its value and the constant
 *    0x00000013 is emitted instead of the memory data.
 */
class InstructionFetch extends Module {
  val io = IO(new Bundle {
    val jump_flag_id          = Input(Bool())
    val jump_address_id       = Input(UInt(Parameters.AddrWidth))
    val instruction_read_data = Input(UInt(Parameters.DataWidth))
    val instruction_valid     = Input(Bool())

    val instruction_address = Output(UInt(Parameters.AddrWidth))
    val instruction         = Output(UInt(Parameters.InstructionWidth))
  })
  val pc = RegInit(ProgramCounter.EntryAddress)

  // The current PC is always presented as the fetch address.
  io.instruction_address := pc

  // Default for the "no valid instruction" case: emit 0x00000013 and leave the
  // PC register untouched (a register without a connect keeps its value).
  io.instruction := 0x00000013.U

  when(io.instruction_valid) {
    io.instruction := io.instruction_read_data
    // lab3(InstructionFetch) begin
    // Next PC: jump target if a jump was requested, else the next word.
    pc := Mux(io.jump_flag_id, io.jump_address_id, pc + 4.U)
    // lab3(InstructionFetch) end
  }
}

2. Decode

Code located in src/main/scala/riscv/core/InstructionDecode.scala

Code filled in at the // lab3(InstructionDecode) comment section

Add Zicsr code at // CSR comment section







































































































































































































































































// mycpu is freely redistributable under the MIT License. See the file
// "LICENSE" for information on usage and redistribution of this file.

package riscv.core

import scala.collection.immutable.ArraySeq

import chisel3._
import chisel3.util._
import riscv.Parameters

// 7-bit major opcodes (instruction bits 6..0, see `opcode` in InstructionDecode)
// for instruction classes in which several instructions share one opcode.
object InstructionTypes {
  val L  = "b0000011".U // loads (drives memory_read_enable in the decoder)
  val I  = "b0010011".U // register-immediate ALU operations
  val S  = "b0100011".U // stores (drives memory_write_enable in the decoder)
  val RM = "b0110011".U // register-register operations; presumably "R-type and M extension"
  val B  = "b1100011".U // conditional branches
}

// 7-bit major opcodes of instructions that have an opcode of their own.
object Instructions {
  val lui   = "b0110111".U
  // NOTE(review): 0b0000001 does not look like an RV32I major opcode; the
  // NOP word actually used is InstructionsNop.nop (0x00000013) — confirm
  // whether this constant is referenced anywhere.
  val nop   = "b0000001".U
  val jal   = "b1101111".U
  val jalr  = "b1100111".U
  val auipc = "b0010111".U
  val csr   = "b1110011".U // CSR instructions; ecall/ebreak/mret below use the same low 7 bits
  val fence = "b0001111".U
}

// 3-bit selector values for load instructions (presumably funct3; the consumer
// is not part of this file).
object InstructionsTypeL {
  val lb  = "b000".U
  val lh  = "b001".U
  val lw  = "b010".U
  val lbu = "b100".U
  val lhu = "b101".U
}

// 3-bit selector values for register-immediate ALU instructions (presumably funct3).
object InstructionsTypeI {
  val addi  = 0.U
  val slli  = 1.U
  val slti  = 2.U
  val sltiu = 3.U
  val xori  = 4.U
  val sri   = 5.U // name covers both right-shift immediates — presumably srli/srai
  val ori   = 6.U
  val andi  = 7.U
}

// 3-bit selector values for store instructions (presumably funct3).
object InstructionsTypeS {
  val sb = "b000".U
  val sh = "b001".U
  val sw = "b010".U
}

// 3-bit selector values for register-register ALU instructions (presumably funct3).
object InstructionsTypeR {
  val add_sub = 0.U // name covers add and sub
  val sll     = 1.U
  val slt     = 2.U
  val sltu    = 3.U
  val xor     = 4.U
  val sr      = 5.U // name covers both right shifts — presumably srl/sra
  val or      = 6.U
  val and     = 7.U
}

// 3-bit selector values for multiply/divide instructions (presumably funct3).
object InstructionsTypeM {
  val mul    = 0.U
  val mulh   = 1.U
  val mulhsu = 2.U
  val mulhum = 3.U // NOTE(review): presumably meant "mulhu"; name kept as in the original
  val div    = 4.U
  val divu   = 5.U
  val rem    = 6.U
  val remu   = 7.U
}

// funct3 values of the conditional branches; Execute looks these up to decide
// whether a branch is taken.
object InstructionsTypeB {
  val beq  = "b000".U
  val bne  = "b001".U
  val blt  = "b100".U
  val bge  = "b101".U
  val bltu = "b110".U
  val bgeu = "b111".U
}

// funct3 values of the CSR instructions; the decoder forwards funct3 unchanged
// as `csr_op` and uses the three *i values to detect the immediate forms.
object InstructionsTypeCSR {
  val csrrw  = "b001".U
  val csrrs  = "b010".U
  val csrrc  = "b011".U
  val csrrwi = "b101".U
  val csrrsi = "b110".U
  val csrrci = "b111".U
}

// Full instruction word used as a no-op (same value InstructionFetch emits
// when no valid instruction is available).
object InstructionsNop {
  val nop = 0x00000013L.U(Parameters.DataWidth)
}

// Full instruction words of the return instructions.
object InstructionsRet {
  val mret = 0x30200073L.U(Parameters.DataWidth)
  val ret  = 0x00008067L.U(Parameters.DataWidth)
}

// Full instruction words of the environment instructions.
object InstructionsEnv {
  val ecall  = 0x00000073L.U(Parameters.DataWidth)
  val ebreak = 0x00100073L.U(Parameters.DataWidth)
}

// Selects the first ALU operand in Execute.
object ALUOp1Source {
  val Register           = 0.U(1.W) // rs1 data
  val InstructionAddress = 1.U(1.W) // address of the current instruction
}

// Selects the second ALU operand in Execute.
object ALUOp2Source {
  val Register  = 0.U(1.W) // rs2 data
  val Immediate = 1.U(1.W) // decoded immediate
}

// Selects which value is written back to the destination register.
object RegWriteSource {
  val ALUResult = 0.U(2.W)
  val Memory    = 1.U(2.W)
  // Encoding 2 is left unused; a CSR write-back source is sketched here but disabled.
  // val CSR = 2.U(2.W)
  val NextInstructionAddress = 3.U(2.W)
}

/**
 * Combinational instruction decoder.
 *
 * Splits the 32-bit instruction word into its fields and derives the control
 * signals for the register file, the execute stage, memory access, write-back
 * and the CSR file.
 *
 * Every output is driven exactly once. The previous version connected several
 * outputs two or three times and relied on "last connect wins", which hid that
 * CSR instructions never selected rs1 for reading.
 *
 * Outputs:
 *  - regs_reg1_read_address / regs_reg2_read_address: rs1 / rs2 when the
 *    instruction uses them as register operands, otherwise 0 (x0).
 *  - ex_immediate: sign-extended immediate for the instruction format.
 *  - ex_aluop1_source / ex_aluop2_source: ALU operand selectors.
 *  - memory_read_enable / memory_write_enable: set for loads / stores.
 *  - wb_reg_write_source, reg_write_enable, reg_write_address: write-back control.
 *  - csr_addr, csr_in, csr_op: CSR address (instruction bits 31..20), the
 *    zero-extended 5-bit immediate of the csrr*i forms, and the raw funct3;
 *    all three are 0 for non-CSR opcodes.
 */
class InstructionDecode extends Module {
  val io = IO(new Bundle {
    val instruction = Input(UInt(Parameters.InstructionWidth))

    val regs_reg1_read_address = Output(UInt(Parameters.PhysicalRegisterAddrWidth))
    val regs_reg2_read_address = Output(UInt(Parameters.PhysicalRegisterAddrWidth))
    val ex_immediate           = Output(UInt(Parameters.DataWidth))
    val ex_aluop1_source       = Output(UInt(1.W))
    val ex_aluop2_source       = Output(UInt(1.W))
    val memory_read_enable     = Output(Bool())
    val memory_write_enable    = Output(Bool())
    val wb_reg_write_source    = Output(UInt(2.W))
    val reg_write_enable       = Output(Bool())
    val reg_write_address      = Output(UInt(Parameters.PhysicalRegisterAddrWidth))
    // CSR
    val csr_addr               = Output(UInt(12.W))
    val csr_in                 = Output(UInt(Parameters.DataWidth))
    val csr_op                 = Output(UInt(3.W))
  })
  val opcode = io.instruction(6, 0)
  val funct3 = io.instruction(14, 12)
  val rd     = io.instruction(11, 7)
  val rs1    = io.instruction(19, 15)
  val rs2    = io.instruction(24, 20)

  // CSR
  val isCsr = opcode === Instructions.csr
  // The csrr*i forms carry a 5-bit immediate in the rs1 field. Decide from
  // funct3 directly rather than reading the value back from an output port.
  val csrUsesImmediate = funct3 === InstructionsTypeCSR.csrrwi ||
    funct3 === InstructionsTypeCSR.csrrsi ||
    funct3 === InstructionsTypeCSR.csrrci

  io.csr_addr := 0.U
  io.csr_op   := 0.U
  io.csr_in   := 0.U
  when(isCsr) {
    io.csr_addr := io.instruction(31, 20)
    io.csr_op   := funct3
    when(csrUsesImmediate) {
      io.csr_in := rs1
    }
  }

  val immediate = MuxLookup(
    opcode,
    Cat(Fill(20, io.instruction(31)), io.instruction(31, 20)),
    IndexedSeq(
      InstructionTypes.I -> Cat(Fill(21, io.instruction(31)), io.instruction(30, 20)),
      InstructionTypes.L -> Cat(Fill(21, io.instruction(31)), io.instruction(30, 20)),
      Instructions.jalr  -> Cat(Fill(21, io.instruction(31)), io.instruction(30, 20)),
      InstructionTypes.S -> Cat(Fill(21, io.instruction(31)), io.instruction(30, 25), io.instruction(11, 7)),
      InstructionTypes.B -> Cat(
        Fill(20, io.instruction(31)),
        io.instruction(7),
        io.instruction(30, 25),
        io.instruction(11, 8),
        0.U(1.W)
      ),
      Instructions.lui   -> Cat(io.instruction(31, 12), 0.U(12.W)),
      Instructions.auipc -> Cat(io.instruction(31, 12), 0.U(12.W)),
      // jal's imm represents a multiple of 2 bytes.
      Instructions.jal -> Cat(
        Fill(12, io.instruction(31)),
        io.instruction(19, 12),
        io.instruction(20),
        io.instruction(30, 21),
        0.U(1.W)
      )
    )
  )
  io.ex_immediate := immediate

  // The instruction address is the first ALU operand for PC-relative results.
  io.ex_aluop1_source := Mux(
    opcode === Instructions.auipc || opcode === InstructionTypes.B || opcode === Instructions.jal,
    ALUOp1Source.InstructionAddress,
    ALUOp1Source.Register
  )

  // ALU op2 from reg: R-type,
  // ALU op2 from imm: L-Type (I-type subtype),
  //                   I-type (nop=addi, jalr, csr-class, fence),
  //                   J-type (jal),
  //                   U-type (lui, auipc),
  //                   S-type (rs2 value sent to MemControl, ALU computes rs1 + imm.)
  //                   B-type (rs2 compares with rs1 in jump judge unit, ALU computes jump address PC+imm.)
  io.ex_aluop2_source := Mux(
    opcode === InstructionTypes.RM,
    ALUOp2Source.Register,
    ALUOp2Source.Immediate
  )

  // lab3(InstructionDecode) begin
  io.memory_read_enable  := opcode === InstructionTypes.L
  io.memory_write_enable := opcode === InstructionTypes.S

  // rs1 is read by every format that has a register first operand. The
  // register forms of the CSR instructions (csrrw/csrrs/csrrc) are included so
  // the CSR file receives the real rs1 value instead of x0.
  val readsRs1 = opcode === InstructionTypes.RM || opcode === InstructionTypes.I ||
    opcode === InstructionTypes.B || opcode === InstructionTypes.S ||
    opcode === InstructionTypes.L || opcode === Instructions.jalr ||
    (isCsr && !csrUsesImmediate)
  io.regs_reg1_read_address := Mux(readsRs1, rs1, 0.U)

  // rs2 is only a register operand for R-type, stores and branches.
  val readsRs2 = opcode === InstructionTypes.RM || opcode === InstructionTypes.S ||
    opcode === InstructionTypes.B
  io.regs_reg2_read_address := Mux(readsRs2, rs2, 0.U)
  // lab3(InstructionDecode) end

  io.wb_reg_write_source := MuxCase(
    RegWriteSource.ALUResult,
    ArraySeq(
      (opcode === InstructionTypes.RM || opcode === InstructionTypes.I ||
        opcode === Instructions.lui || opcode === Instructions.auipc) -> RegWriteSource.ALUResult, // same as default
      (opcode === InstructionTypes.L)                                 -> RegWriteSource.Memory,
      (opcode === Instructions.jal || opcode === Instructions.jalr)   -> RegWriteSource.NextInstructionAddress
    )
  )

  // CSR instructions are deliberately absent from this list: RegWriteSource has
  // no CSR entry, so enabling the write here would store the ALU result in rd.
  // NOTE(review): returning the old CSR value to rd needs a dedicated path.
  io.reg_write_enable := (opcode === InstructionTypes.RM) || (opcode === InstructionTypes.I) ||
    (opcode === InstructionTypes.L) || (opcode === Instructions.auipc) || (opcode === Instructions.lui) ||
    (opcode === Instructions.jal) || (opcode === Instructions.jalr)
  io.reg_write_address := rd
}

3. Execute

Code located in src/main/scala/riscv/core/Execute.scala

Code filled in at the // lab3(Execute) comment section
































































// mycpu is freely redistributable under the MIT License. See the file
// "LICENSE" for information on usage and redistribution of this file.

package riscv.core

import chisel3._
import chisel3.util.Cat
import chisel3.util.MuxLookup
import riscv.Parameters

/**
 * Execute stage.
 *
 * Selects the two ALU operands, lets ALUControl translate opcode/funct3/funct7
 * into an ALU function, and decides whether the instruction redirects the
 * program counter.
 *
 * Outputs:
 *  - mem_alu_result:  the ALU result.
 *  - if_jump_flag:    high for jal, jalr, or a conditional branch whose
 *                     comparison of reg1_data and reg2_data holds.
 *  - if_jump_address: immediate + reg1_data for jalr, immediate + instruction
 *                     address otherwise.
 */
class Execute extends Module {
  val io = IO(new Bundle {
    val instruction         = Input(UInt(Parameters.InstructionWidth))
    val instruction_address = Input(UInt(Parameters.AddrWidth))
    val reg1_data           = Input(UInt(Parameters.DataWidth))
    val reg2_data           = Input(UInt(Parameters.DataWidth))
    val immediate           = Input(UInt(Parameters.DataWidth))
    val aluop1_source       = Input(UInt(1.W))
    val aluop2_source       = Input(UInt(1.W))

    val mem_alu_result  = Output(UInt(Parameters.DataWidth))
    val if_jump_flag    = Output(Bool())
    val if_jump_address = Output(UInt(Parameters.DataWidth))
  })

  val opcode = io.instruction(6, 0)
  val funct3 = io.instruction(14, 12)
  val funct7 = io.instruction(31, 25)

  val alu      = Module(new ALU)
  val alu_ctrl = Module(new ALUControl)

  alu_ctrl.io.opcode := opcode
  alu_ctrl.io.funct3 := funct3
  alu_ctrl.io.funct7 := funct7

  // lab3(Execute) begin
  // First operand: register value by default, instruction address on request.
  alu.io.op1 := io.reg1_data
  when(io.aluop1_source === ALUOp1Source.InstructionAddress) {
    alu.io.op1 := io.instruction_address
  }

  // Second operand: register value by default, immediate on request.
  alu.io.op2 := io.reg2_data
  when(io.aluop2_source === ALUOp2Source.Immediate) {
    alu.io.op2 := io.immediate
  }

  // The ALU function comes straight from the control unit.
  alu.io.func := alu_ctrl.io.alu_funct
  // lab3(Execute) end

  io.mem_alu_result := alu.io.result

  // Outcome of the branch comparison selected by funct3 (false for unused encodings).
  val lhsSigned = io.reg1_data.asSInt
  val rhsSigned = io.reg2_data.asSInt
  val branchConditionHolds = MuxLookup(
    funct3,
    false.B,
    IndexedSeq(
      InstructionsTypeB.beq  -> (io.reg1_data === io.reg2_data),
      InstructionsTypeB.bne  -> (io.reg1_data =/= io.reg2_data),
      InstructionsTypeB.blt  -> (lhsSigned < rhsSigned),
      InstructionsTypeB.bge  -> (lhsSigned >= rhsSigned),
      InstructionsTypeB.bltu -> (io.reg1_data < io.reg2_data),
      InstructionsTypeB.bgeu -> (io.reg1_data >= io.reg2_data)
    )
  )

  val isJal    = opcode === Instructions.jal
  val isJalr   = opcode === Instructions.jalr
  val isBranch = opcode === InstructionTypes.B

  // Unconditional jumps always redirect; branches only when their condition holds.
  io.if_jump_flag := isJal || isJalr || (isBranch && branchConditionHolds)

  // jalr is register-relative, everything else is PC-relative.
  val jumpBase = Mux(isJalr, io.reg1_data, io.instruction_address)
  io.if_jump_address := io.immediate + jumpBase
}

4. Memory Access

Code located in src/main/scala/riscv/core/MemoryAccess.scala

Already completed by the lecturer

5. Write-back

Code located in src/main/scala/riscv/core/WriteBack.scala

Already completed by the lecturer

Add CSRFile

Code located in src/main/scala/riscv/core/CSRFile.scala

The CSRFile module supports key RISC-V CSR instructions such as CSRRW, CSRRS, and CSRRC. The module provides a flexible interface for reading, writing, and modifying CSR registers based on the input operation type (csr_op).

Special registers, such as mstatus, mtvec, mcause, and mepc, are initialized to predefined values to handle privileged mode operations and exception handling. The module can handle both register-based operations (using rs1_data) and immediate-based operations (using csr_in).







































package riscv.core

import chisel3._
import riscv.Parameters

/**
 * Control and status register file.
 *
 * Holds 4096 registers (the full 12-bit CSR address space), all reset to 0.
 * `csr_out` always shows the current (pre-write) value of the addressed
 * register. `csr_op` selects the update applied at the clock edge and uses the
 * instruction's funct3 encoding, which is what InstructionDecode forwards:
 *
 *   1 csrrw   csr := rs1_data          5 csrrwi  csr := csr_in
 *   2 csrrs   csr := csr |  rs1_data   6 csrrsi  csr := csr |  csr_in
 *   3 csrrc   csr := csr & ~rs1_data   7 csrrci  csr := csr & ~csr_in
 *
 * Any other value (0 and 4) leaves the register file unchanged. The previous
 * version compared against 0, 1, 2, 4, 5, 6 instead, so the operations were
 * shifted by one and op 0 — the value the decoder emits for every non-CSR
 * instruction — overwrote the addressed register with rs1_data.
 *
 * Inputs:
 *  - rs1_data: operand of the register forms.
 *  - csr_addr: 12-bit CSR address.
 *  - csr_in:   operand of the immediate forms.
 *  - csr_op:   operation selector as listed above.
 */
class CSRFile extends Module {
  val io = IO(new Bundle {
    val rs1_data = Input(UInt(Parameters.DataWidth))
    val csr_addr = Input(UInt(12.W))
    val csr_in = Input(UInt(Parameters.DataWidth))
    val csr_op = Input(UInt(3.W))
    val csr_out = Output(UInt(Parameters.DataWidth))
  })

  // RegInit already resets every register — including mstatus (0x300),
  // mtvec (0x305), mepc (0x341) and mcause (0x342) — to 0. The former
  // unconditional `csrRegs(0x300) := 0.U` style connects were active on every
  // cycle and therefore cleared those four registers again right after any
  // write, so they have been removed.
  val csrRegs = RegInit(VecInit(Seq.fill(4096)(0.U(Parameters.DataWidth))))

  // Value of the addressed register before this cycle's update.
  val currentValue = csrRegs(io.csr_addr)
  io.csr_out := currentValue

  when(io.csr_op === 1.U) { // csrrw
    csrRegs(io.csr_addr) := io.rs1_data
  }.elsewhen(io.csr_op === 2.U) { // csrrs
    csrRegs(io.csr_addr) := currentValue | io.rs1_data
  }.elsewhen(io.csr_op === 3.U) { // csrrc
    csrRegs(io.csr_addr) := currentValue & ~io.rs1_data
  }.elsewhen(io.csr_op === 5.U) { // csrrwi
    csrRegs(io.csr_addr) := io.csr_in
  }.elsewhen(io.csr_op === 6.U) { // csrrsi
    csrRegs(io.csr_addr) := currentValue | io.csr_in
  }.elsewhen(io.csr_op === 7.U) { // csrrci
    csrRegs(io.csr_addr) := currentValue & ~io.csr_in
  }

}

Modify ALU.scala

Code located in src/main/scala/riscv/core/ALU.scala
Code filled in at the // B extention comment section

The code implements a subset of the RISC-V B (bit-manipulation) extension. The supported operations include the logical operations ANDN, ORN, and XNOR, the rotations ROL (rotate left) and ROR (rotate right), the bit-counting operations CTZ and CPOP, and the single-bit operations BEXT, BSET, and BCLR. An operation named SHFL is also defined; in the code below it performs a logical left shift of op1 by the low five bits of op2 rather than a bit shuffle. These instructions enable efficient manipulation of individual bits or groups of bits, improving performance for applications requiring low-level data transformations.











































































































// mycpu is freely redistributable under the MIT License. See the file
// "LICENSE" for information on usage and redistribution of this file.

package riscv.core

import chisel3._
import chisel3.experimental.ChiselEnum
import chisel3.util._
import riscv.Parameters

// Operation selector for the ALU (`io.func`). `zero` is the first value; the
// names from `andn` onwards are the bit-manipulation additions. The order of
// the names defines the enum encoding, so new values should be appended.
object ALUFunctions extends ChiselEnum {
  val zero, add, sub, sll, slt, xor, or, and, srl, sra, sltu, andn, orn, xnor, shfl, rol, ror,
  clz, ctz, cpop, bext, bset, bclr = Value
}

/**
 * Combinational ALU.
 *
 * Computes `result` from `op1` and `op2` according to `func`. Any function
 * without a case below (e.g. `zero`) yields 0. Shift, rotate and single-bit
 * operations use only the low five bits of `op2`.
 *
 * Fixes relative to the previous version:
 *  - `ctz` used PopCount, i.e. it counted set bits; it now returns the index
 *    of the lowest set bit, or the operand width when `op1` is 0.
 *  - `clz` was declared in ALUFunctions but had no case and always returned 0;
 *    it now returns the number of leading zero bits (operand width for 0).
 */
class ALU extends Module {
  val io = IO(new Bundle {
    val func = Input(ALUFunctions())

    val op1 = Input(UInt(Parameters.DataWidth))
    val op2 = Input(UInt(Parameters.DataWidth))

    val result = Output(UInt(Parameters.DataWidth))
  })

  io.result := 0.U
  val dataWidth = Parameters.DataWidth
  // Operand width as a Scala Int, shared by the rotate and count operations.
  val xlen = dataWidth.get
  // Shift amount / bit index: low five bits of op2.
  val shamt = io.op2(4, 0)
  switch(io.func) {
    is(ALUFunctions.add) {
      io.result := io.op1 + io.op2
    }
    is(ALUFunctions.sub) {
      io.result := io.op1 - io.op2
    }
    is(ALUFunctions.sll) {
      io.result := io.op1 << shamt
    }
    is(ALUFunctions.slt) {
      io.result := io.op1.asSInt < io.op2.asSInt
    }
    is(ALUFunctions.xor) {
      io.result := io.op1 ^ io.op2
    }
    is(ALUFunctions.or) {
      io.result := io.op1 | io.op2
    }
    is(ALUFunctions.and) {
      io.result := io.op1 & io.op2
    }
    is(ALUFunctions.srl) {
      io.result := io.op1 >> shamt
    }
    is(ALUFunctions.sra) {
      io.result := (io.op1.asSInt >> shamt).asUInt
    }
    is(ALUFunctions.sltu) {
      io.result := io.op1 < io.op2
    }
    // B extention
    is(ALUFunctions.andn) {
      io.result := io.op1 & ~io.op2 // ANDN: op1 AND NOT op2
    }
    is(ALUFunctions.orn) {
      io.result := io.op1 | ~io.op2 // ORN: op1 OR NOT op2
    }
    is(ALUFunctions.xnor) {
      io.result := ~(io.op1 ^ io.op2) // XNOR: NOT (op1 XOR op2)
    }
    is(ALUFunctions.shfl) {
      // Implemented as a logical left shift of op1 by op2[4:0].
      // NOTE(review): the name suggests a bit shuffle; behaviour kept as it
      // was — confirm which operation is intended.
      io.result := io.op1 << shamt
    }
    is(ALUFunctions.rol) {
      // ROL: rotate left. Bits shifted out at the top re-enter at the bottom;
      // for a shift of 0 the right-shift term is a shift by xlen, i.e. 0.
      io.result := (io.op1 << shamt) | (io.op1 >> (xlen.U - shamt))
    }
    is(ALUFunctions.ror) {
      // ROR: rotate right. The left-shift term is truncated to the result
      // width on assignment, so a shift of 0 contributes nothing.
      io.result := (io.op1 >> shamt) | (io.op1 << (xlen.U - shamt))
    }
    is(ALUFunctions.clz) {
      // CLZ: count leading zeros. After reversing the bit order the highest
      // set bit becomes the lowest one, whose index is the number of leading
      // zeros. PriorityEncoder has no "none set" result, so op1 == 0 is
      // handled explicitly and yields xlen.
      io.result := Mux(io.op1 === 0.U, xlen.U, PriorityEncoder(Reverse(io.op1)))
    }
    is(ALUFunctions.ctz) {
      // CTZ: count trailing zeros = index of the lowest set bit; xlen when
      // op1 is 0.
      io.result := Mux(io.op1 === 0.U, xlen.U, PriorityEncoder(io.op1))
    }
    is(ALUFunctions.cpop) {
      // CPOP: population count (number of 1 bits in op1).
      io.result := PopCount(io.op1).asUInt
    }
    is(ALUFunctions.bext) {
      // BEXT: extract the bit of op1 selected by op2[4:0] into result bit 0.
      io.result := io.op1(shamt)
    }
    is(ALUFunctions.bset) {
      // BSET: set the bit of op1 selected by op2[4:0].
      io.result := io.op1 | (1.U << shamt)
    }
    is(ALUFunctions.bclr) {
      // BCLR: clear the bit of op1 selected by op2[4:0].
      io.result := io.op1 & ~(1.U << shamt)
    }
  }

}

Combining into a CPU

Code located in src/main/scala/riscv/core/CPU.scala

Code filled in at the // lab3(cpu) comment section and //CSR comment section























































































// mycpu is freely redistributable under the MIT License. See the file
// "LICENSE" for information on usage and redistribution of this file.

package riscv.core

import chisel3._
import chisel3.util.Cat
import riscv.CPUBundle
import riscv.Parameters

/**
 * Single-cycle CPU top level: instantiates the pipeline-stage modules, the
 * register file and the CSR file, and wires them together.
 *
 * Changes relative to the previous version:
 *  - The register-file write port and all WriteBack inputs were connected
 *    twice with identical right-hand sides; each is now connected once.
 *  - `csrFile.io.csr_out` was not connected to anything, so a CSR instruction
 *    could never deliver the CSR value to its destination register. For the
 *    six CSR operations the register file is now written with `csr_out`
 *    (the value of the CSR before the update).
 */
class CPU extends Module {
  val io = IO(new CPUBundle)

  val regs       = Module(new RegisterFile)
  val inst_fetch = Module(new InstructionFetch)
  val id         = Module(new InstructionDecode)
  val ex         = Module(new Execute)
  val mem        = Module(new MemoryAccess)
  val wb         = Module(new WriteBack)
  val csrFile    = Module(new CSRFile)

  // The top address bits select the slave device.
  io.deviceSelect := mem.io.memory_bundle
    .address(Parameters.AddrBits - 1, Parameters.AddrBits - Parameters.SlaveDeviceCountBits)

  inst_fetch.io.jump_address_id       := ex.io.if_jump_address
  inst_fetch.io.jump_flag_id          := ex.io.if_jump_flag
  inst_fetch.io.instruction_valid     := io.instruction_valid
  inst_fetch.io.instruction_read_data := io.instruction
  io.instruction_address              := inst_fetch.io.instruction_address

  id.io.instruction := inst_fetch.io.instruction

  // CSR
  csrFile.io.rs1_data := regs.io.read_data1
  csrFile.io.csr_addr := id.io.csr_addr
  csrFile.io.csr_in   := id.io.csr_in
  csrFile.io.csr_op   := id.io.csr_op

  // The decoder forwards funct3 as csr_op for the CSR opcode and 0 otherwise.
  // The six CSR operations are exactly the funct3 values whose low two bits
  // are not both zero (1, 2, 3, 5, 6, 7); 0 and 4 are not CSR accesses.
  val csrAccess = id.io.csr_op(1, 0) =/= 0.U

  // Register-file ports. A CSR access writes the old CSR value to rd; every
  // other instruction uses the decoder's enable and the WriteBack result.
  regs.io.write_enable  := id.io.reg_write_enable || csrAccess
  regs.io.write_address := id.io.reg_write_address
  regs.io.write_data    := Mux(csrAccess, csrFile.io.csr_out, wb.io.regs_write_data)
  regs.io.read_address1 := id.io.regs_reg1_read_address
  regs.io.read_address2 := id.io.regs_reg2_read_address

  regs.io.debug_read_address := io.debug_read_address
  io.debug_read_data         := regs.io.debug_read_data

  // lab3(cpu) begin
  // Connect inputs of the Execute module
  ex.io.instruction         := inst_fetch.io.instruction
  ex.io.instruction_address := inst_fetch.io.instruction_address
  ex.io.reg1_data           := regs.io.read_data1
  ex.io.reg2_data           := regs.io.read_data2
  ex.io.immediate           := id.io.ex_immediate
  ex.io.aluop1_source       := id.io.ex_aluop1_source
  ex.io.aluop2_source       := id.io.ex_aluop2_source
  // lab3(cpu) end

  mem.io.alu_result          := ex.io.mem_alu_result
  mem.io.reg2_data           := regs.io.read_data2
  mem.io.memory_read_enable  := id.io.memory_read_enable
  mem.io.memory_write_enable := id.io.memory_write_enable
  mem.io.funct3              := inst_fetch.io.instruction(14, 12)

  // The device-select bits are replaced by zeros in the address sent out.
  io.memory_bundle.address := Cat(
    0.U(Parameters.SlaveDeviceCountBits.W),
    mem.io.memory_bundle.address(Parameters.AddrBits - 1 - Parameters.SlaveDeviceCountBits, 0)
  )
  io.memory_bundle.write_enable  := mem.io.memory_bundle.write_enable
  io.memory_bundle.write_data    := mem.io.memory_bundle.write_data
  io.memory_bundle.write_strobe  := mem.io.memory_bundle.write_strobe
  mem.io.memory_bundle.read_data := io.memory_bundle.read_data

  wb.io.instruction_address := inst_fetch.io.instruction_address
  wb.io.alu_result          := ex.io.mem_alu_result
  wb.io.memory_read_data    := mem.io.wb_memory_read_data
  wb.io.regs_write_source   := id.io.wb_reg_write_source
}

Test Results

$ sbt test

$ make verilator

Run the performance counter code

https://github.com/sysprog21/rv32emu/tree/master/tests/perfcounter

Modify Makefile

Located in rv32emu/tests/perfcounter/Makefile






































# Builds the perfcounter test program and the Verilator simulation of the
# lab3 CPU, and runs the program on the simulator.
#
# Targets:
#   $(SIM_BIN)  verilate Top.v together with sim_main.cpp (first target, so a
#               plain `make` builds it)
#   run-test    run the simulator binary on perfcount.elf
#   all         build $(BIN) with the cross compiler
#   clean       remove the build products
.PHONY: all clean run-test

include ../../mk/toolchain.mk

CFLAGS = -O2 -Wall

# Root of the ca2023-lab3 checkout. Override on the command line
# (make LAB3_DIR=/path/to/ca2023-lab3) instead of editing this file.
LAB3_DIR ?= /Users/yuncih/ca2023-lab3

# Verilator path. The comment is kept on its own line: a trailing comment on
# the assignment would leave trailing whitespace in the variable's value.
VERILATOR_BIN = $(LAB3_DIR)/verilog/verilator
OUT = $(LAB3_DIR)/verilog/verilator
CFLAGS += -I$(OUT)/

SIM_BIN = $(OUT)/sim_main
OBJS = \
    getcycles.o \
    getinstret.o \
    sparkle.o \
    main.o
BIN = perfcount.elf

# Earlier form of the command, kept for reference:
# verilator --trace --exe --cc $(OUT)/sim_main.cpp $(OUT)/Top.v && make -C obj_dir -f VTop.mk
$(SIM_BIN): $(OBJS)
	verilator --trace --cc $(OUT)/Top.v --exe $(OUT)/sim_main.cpp $(CFLAGS) -Wno-lint && make -C obj_dir -j -f VTop.mk VTop

# The build step above produces obj_dir/VTop (capital T); the old spelling
# "Vtop" only resolved on case-insensitive filesystems.
run-test: $(SIM_BIN)
	./obj_dir/VTop $(OUT)/perfcount.elf

%.o: %.S
	$(CROSS_COMPILE)gcc $(CFLAGS) -c -o $@ $<

%.o: %.c
	$(CROSS_COMPILE)gcc $(CFLAGS) -c -o $@ $<

all: $(BIN)

$(BIN): $(OBJS)
	$(CROSS_COMPILE)gcc -o $@ $^

clean:
	$(RM) $(BIN) $(OBJS) $(OUT)/sim_main

$ make

Add B Extension To RISCV Code From Course Assignments

(1)Quiz 2 Problem B

1.Use of bclri in the hanoi Function:

The addi instruction used to set a comparison value (t0 = 1) was replaced by the bclri instruction, which clears a specific bit of the number. This allows more direct manipulation of the number of disks and improves efficiency.
beqz was used instead of a traditional branch comparison to check if the number of disks is zero after clearing the least significant bit.

2.Branch and Comparison Optimization:

Where applicable, traditional arithmetic and comparison instructions (e.g. addi, beq) were replaced by instructions like beqz and bseti to directly manipulate and test specific bits of registers. This reduces instruction count and improves performance.

3.Stack Operations:

The operations for saving and restoring the return address and other values on the stack remained unchanged as they are not directly impacted by B extension instructions

4.System Calls for I/O:

No changes were made to system calls as these operations are not influenced by the B extension.

































































































# Tower of Hanoi: prints one "Move Disk <n> from '<peg>' to '<peg>'" line per move.
# Register use inside hanoi/print:
#   a1 = number of disks, a2 = source peg, a3 = auxiliary peg, a4 = destination
#   peg (peg names are single characters). print receives the "to" peg in a3.
# The ecall service numbers in a7 (1, 4, 10, 11) are presumably the
# print-int / print-string / exit / print-char services of a RARS/Ripes-style
# simulator — TODO confirm against the target environment.
.data
    md:      .string "Move Disk "          
    from:    .string " from '"             
    to:      .string "' to '"              
    newline: .string "'\n"                 
    src:     .string "A"                   
    aux:     .string "B"                  
    dst:     .string "C"                   
    n:       .word   3                     

.text
.globl main
main:
    lw a1, n              # a1 = number of disks
    la t0, src            # addresses of the three peg-name strings
    la t1, dst            
    la t2, aux            
    lbu a2, 0(t0)         # a2 = 'A' (source)
    lbu a3, 0(t2)         # a3 = 'B' (auxiliary)
    lbu a4, 0(t1)         # a4 = 'C' (destination)
    jal x1, hanoi         
    li a7, 10             # service 10 (presumably exit)
    ecall                 

# hanoi(a1 = n, a2 = from, a3 = aux, a4 = to)
hanoi:
    addi sp, sp, -20      # frame: ra, n, from, aux, to
    sw x1, 0(sp)          
    sw a1, 4(sp)          
    sw a2, 8(sp)          
    sw a3, 12(sp)         
    sw a4, 16(sp)        

    bclri t0, a1, 0       # t0 = n with bit 0 cleared: zero exactly when n is 0 or 1
    beqz t0, return       # base case: a single move, no recursion

    # Step 1: move n-1 disks from "from" to "aux" (aux and to swapped).
    lw a1, 4(sp)          
    addi a1, a1, -1       
    lbu a2, 8(sp)         # from
    lbu a3, 16(sp)        # new aux = old to
    lbu a4, 12(sp)        # new to  = old aux
    jal x1, hanoi         

    # Step 2: print the move of disk n from "from" to "to".
    lw a1, 4(sp)          
    lbu a2, 8(sp)         
    lbu a3, 16(sp)        
    jal x1, print         

    # Step 3: move n-1 disks from "aux" to "to" (from and aux swapped).
    lw a1, 4(sp)          
    addi a1, a1, -1       
    lbu a2, 12(sp)        # new from = old aux
    lbu a3, 8(sp)         # new aux  = old from
    lbu a4, 16(sp)        # to
    jal x1, hanoi         

    lw x1, 0(sp)          # restore ra, drop the frame, return
    addi sp, sp, 20      
    jalr x0, x1, 0       

# Base case: print the single move and return to the caller of hanoi.
return:
    lw a1, 4(sp)        
    lbu a2, 8(sp)        
    lbu a3, 16(sp)      
    jal x1, print        
    lw x1, 0(sp)         
    addi sp, sp, 20     
    jalr x0, x1, 0     

# print(a1 = disk number, a2 = from peg, a3 = to peg); clobbers a0 and a7.
print:
    la a0, md           # "Move Disk "
    li a7, 4             
    ecall               

    addi a0, a1, 0      # disk number
    li a7, 1            
    ecall                

    la a0, from         # " from '"
    li a7, 4            
    ecall               

    addi a0, a2, 0      # source peg character
    li a7, 11          
    ecall              

    la a0, to            # "' to '"
    li a7, 4           
    ecall             

    addi a0, a3, 0      # destination peg character
    li a7, 11           
    ecall                

    la a0, newline       # closing quote and line break
    li a7, 4           
    ecall               

    jalr x0, x1, 0

(2)Quiz 4 Problem A

1.Use of clmul:

The primary optimization is the use of the B extension, specifically the clmul instruction, which is a carry-less multiplication instruction.

2.Elimination of the square function:

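The square function is replaced by a single clmul operation, with the intent of simplifying the code and improving efficiency. Note, however, that clmul is a carry-less (XOR-based) multiplication: clmul(n, n) equals the integer square n² only when n has a single set bit (for example, clmul(3, 3) = 5, not 9), so the result has to be checked against the integer sum of squares.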

3.Recursive Calls:

The function continues to use a recursive approach to calculate the sum of squares from 𝑛 down to 0. In each recursion, the current square (calculated using clmul) is added to the accumulator (a1), and n is decremented.

4.Control Flow Optimization:

The use of the ble instruction for branching when 𝑛 ≤ 0 and the use of the addi instruction for decrementing n make the flow control simple and efficient. The termination condition (when 𝑛 ≤ 0) is checked with minimal overhead.
















































# Function: sum_of_squares
# Computes n^2 + (n-1)^2 + ... + 1^2 for the hard-coded input n = 5 and
# returns the result in a0 (55 for n = 5).
#
# Fixes relative to the previous version:
#  - The recursive call jumped back to the entry label and re-executed
#    "li a0, 5 / li a1, 0", so n never decreased and the recursion never
#    ended. The recursion now targets sum_of_squares_rec, which is placed
#    after the initialisation.
#  - The square was computed with "clmul n, n". clmul is a carry-less
#    multiplication, so the result differs from n*n whenever n has more than
#    one set bit (clmul(3, 3) = 5). The square is now computed by the square
#    routine below, which uses the B-extension instructions ctz and bclr.
#
# Clobbers: a0, a1, t0-t4.

.section .text
.global sum_of_squares

sum_of_squares:
    li      a0, 5                   # n (hard-coded test input)
    li      a1, 0                   # running sum

# Recursive worker: a0 = n, a1 = sum so far; returns the total in a0.
sum_of_squares_rec:
    ble     a0, x0, zero_case       # n <= 0: nothing left to add

recurse_case:
    addi    sp, sp, -12             # save sum, n and ra across the call
    sw      a1, 0(sp)
    sw      a0, 4(sp)
    sw      ra, 8(sp)

    jal     ra, square              # a0 = n * n

    lw      a1, 0(sp)
    lw      t0, 4(sp)               # t0 = n
    lw      ra, 8(sp)
    addi    sp, sp, 12

    add     a1, a1, a0              # sum += n * n

    addi    a0, t0, -1              # n - 1

    addi    sp, sp, -4
    sw      ra, 0(sp)
    jal     ra, sum_of_squares_rec  # recurse on n - 1 (not on the entry label)
    lw      ra, 0(sp)
    addi    sp, sp, 4

    jalr    x0, ra, 0               # a0 already holds the total

zero_case:
    add     a0, a1, x0              # result = accumulated sum
    jalr    x0, ra, 0

# Function: square
# Computes the square of a non-negative integer (a0 = n), returns n*n in a0.
# For every set bit k of n it adds n << k: ctz finds the lowest set bit and
# bclr removes it, so the loop runs once per set bit. Clobbers t1-t4.
square:
    mv      t1, a0                  # t1 = multiplier bits still to process
    mv      t2, a0                  # t2 = multiplicand n
    li      a0, 0                   # a0 = product
square_loop:
    beqz    t1, square_done
    ctz     t3, t1                  # t3 = index of the lowest set bit
    sll     t4, t2, t3              # t4 = n << t3
    add     a0, a0, t4
    bclr    t1, t1, t3              # clear the bit just handled
    j       square_loop
square_done:
    ret

(3)Quiz 4 Problem H

Part 1 1.bext for Sign Magnitude Conversion:

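The instruction bext t2, a4, 31 is used to extract the sign bit (bit 31, the most significant bit) of the significand a4. Note that bext takes its bit index from a register; with a constant index the immediate form bexti t2, a4, 31 is the one to use. This operation is more efficient than shifting and masking to isolate the sign bit, which was the previous approach using srl and andi.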

2.Optimized Sticky Bit Calculation (orc.b and or):

The instruction orc.b t4, a5 calculates the sticky bit for the right shift of the significand by checking if any bits are shifted out. The orc.b operation ensures that the sticky bit is set correctly in a single cycle.
The subsequent or instruction efficiently propagates the sticky bit into the significand a5.

3.Increased Efficiency with slli and srli for Exponent Handling:

The shift instructions slli and srli are used more effectively to pack and unpack the exponent and significand. While this isn't directly related to the B extension, it showcases the overall effort to optimize shifts and bit manipulations in the code for maximum performance.


































































































































# fadd: single-precision floating-point addition in software, run as a
# self-test: the operands are hard-coded to the bit patterns 0x3F800000 (1.0f)
# and 0x40000000 (2.0f) and the result is compared with 0x40400000 (3.0f).
# On the normal path a0 holds the packed result and a7 is 0 on a match, 1
# otherwise; the early-exit paths return in a0 only.
# Register use: a0/a1 = operands x/y, a2/a3 = their exponent fields,
# a4/a5 = their significands, t2 = sign bit, t5 = normalisation shift count.
# Several steps are flagged NOTE(review) below; they look inconsistent with
# the intended arithmetic and should be confirmed by running the test.
fadd:
    li      a0, 0x3F800000      # x = 1.0f
    li      a1, 0x40000000      # y = 2.0f

    srl     a2, a0, 23         # a2 = exponent field of x (bits 30..23)
    andi    a2, a2, 0xFF       

    srl     a3, a1, 23         # a3 = exponent field of y
    andi    a3, a3, 0xFF        

    beqz    a2, __addsf_return_y_flushed   # x has exponent 0: result is y

    beqz    a3, __addsf_return_x           # y has exponent 0: result is x

    slli    a4, a0, 9          # drop sign and exponent: fraction left-aligned
    slli    a5, a1, 9        

    # NOTE(review): t0 is used further down as the exponent overflow bound.
    # clmul is a carry-less multiply, so this is not a2*a2 and not the
    # constant 255 — confirm the intended value.
    clmul   t0, a2, a2         
    li      t1, 255
    beq     a2, t1, __addsf_x_nan_inf      # exponent 255: x is NaN or infinity
    beq     a3, t1, __addsf_y_nan_inf

    srli    a4, a4, 6         # fraction now in bits 25..3, three low bits spare
    srli    a5, a5, 6      

    li      t1, 1 << (23 + 3)  # implicit leading one at bit 26
    or      a4, a4, t1          
    or      a5, a5, t1         

    srl     t2, a0, 31          # sign bit of x
    andi    t2, t2, 1
    beq     t2, x0, skip_neg_x
    xori    a4, a4, -1          # bitwise complement of the significand for negative x
skip_neg_x:
    li      t1, 25              # alignment limit used by the bgeu below

    srl     t2, a1, 31         # sign bit of y
    andi    t2, t2, 1
    beq     t2, x0, skip_neg_y
    xori    a5, a5, -1          # bitwise complement of the significand for negative y
skip_neg_y:
    bltu    a2, a3, __addsf_ye_gt_xe  # make x the operand with the larger exponent

__addsf_xe_gte_ye:
    sub     t3, a2, a3         # t3 = exponent difference

    bgeu    t3, t1, __addsf_return_x  # y too small to affect the result
    sra     a5, a5, t3          # align y's significand

    # NOTE(review): orc.b turns every non-zero byte of a5 into 0xFF; OR-ing
    # that back into a5 changes far more than a single sticky bit — confirm.
    orc.b   t4, a5             
    or      a5, a5, t4         

    add     a4, a4, a5          # add the aligned significands
    beqz    a4, __addsf_return_0

    # NOTE(review): bext takes its bit index from a register; an index of 31
    # as a constant presumably needs bexti — confirm this assembles.
    bext    t2, a4, 31        # t2 = sign of the sum (0 or 1)
    # NOTE(review): with t2 being 0 or 1 (not an all-ones mask) the xor/sub
    # pair does not produce the absolute value of a4 — confirm.
    xor     a4, a4, t2        
    sub     a4, a4, t2          

    li      t5, 0              # t5 = number of normalisation shifts
norm_loop_xe_gte_ye:
    sll     t4, a4, 1           # stop once bit 30 of a4 is set
    bltz    t4, norm_done_xe_gte_ye  
    addi    t5, t5, 1          
    sll     a4, a4, 1         
    j       norm_loop_xe_gte_ye
norm_done_xe_gte_ye:
    sub     a2, a2, t5         # adjust the exponent by the shift count

    bgeu    a2, t0, overflow_xe_gte_ye  # exponent at or above the bound in t0
    bltz    a2, underflow_xe_gte_ye    

    slli    a2, a2, 23          # pack exponent and significand
    srli    a4, a4, 9           
    or      a0, a2, a4         
    
    sll     t2, t2, 31          # insert the sign bit
    or      a0, a0, t2          

    li      t0, 0x40400000      # expected result: 3.0f
    beq     a0, t0, correct     
    li      a7, 1               # mismatch
    j       end_program

correct:
    li      a7, 0               # match

end_program:
    jalr    x0, ra, 0

underflow_xe_gte_ye:
    add     a0, x0, x0          # result 0
    jalr    x0, ra, 0

overflow_xe_gte_ye:
    li      a0, 0x7F800000     # infinity, with the sign inserted below
    sll     t2, t2, 31          
    or      a0, a0, t2         
    jalr    x0, ra, 0

# Swap x and y (operands, exponents, significands) so that x has the larger
# exponent, then continue on the common path.
__addsf_ye_gt_xe:
    mv      t6, a0              
    mv      a0, a1             
    mv      a1, t6              
    mv      t6, a2         
    mv      a2, a3
    mv      a3, t6
    mv      t6, a4             
    mv      a4, a5
    mv      a5, t6
    j       __addsf_xe_gte_ye  

__addsf_x_nan_inf:
    add     a0, a0, x0          # return x unchanged
    jalr    x0, ra, 0

__addsf_y_nan_inf:
    add     a0, a1, x0          # return y
    jalr    x0, ra, 0

__addsf_return_y_flushed:
    add     a0, a1, x0         # return y
    jalr    x0, ra, 0

__addsf_return_x:
    jalr    x0, ra, 0           # a0 already holds x

__addsf_return_0:
    add     a0, x0, x0         # return 0
    jalr    x0, ra, 0

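Part 2

1. Sign Extraction:

Replaced the earlier srli/slli sequence with an srai followed by an slli: srai replicates the sign bit across the whole register and slli moves it back to bit 31, which isolates the sign of the input in its final position. This keeps the instruction count low and simplifies the code.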

2. Absolute Value Calculation:

Used neg instead of an explicit sub to negate the integer when it is negative. neg is an assembler pseudo-instruction that expands to sub rd, x0, rs, so the generated code is the same but the source is more concise and states the intent directly.

3. Leading Zero Count:

Replaced the manual loop for counting leading zeros (clz_loop) with the clz instruction from the B extension (Zbb), which produces the count in a single instruction and significantly improves efficiency.

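4. Exponent Adjustment for Rounding:

Introduced srli and bnez to check for rounding overflow directly: srli extracts bit 31 of the significand after the rounding constant has been added, and bnez branches on that bit to decide whether the exponent needs adjusting. This avoids additional intermediate operations and simplifies the rounding logic.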















































# Self-check driver: converts 10 and compares the result with 10.0f.
# OUTPUT: A0 = 1 if i2f(10) == 0x41200000, otherwise 0
# It is placed first so that it is the code reached when execution starts at
# the beginning of the image (assumption: the image is entered at its first
# instruction - confirm against link.lds).
# ra is kept in s0 across the call because jal overwrites it and no stack is
# set up in this listing; s0 itself is not preserved.
test_i2f:
    mv      s0, ra
    li      a0, 10
    jal     i2f
    mv      ra, s0
    li      t0, 0x41200000          # 10.0f
    beq     a0, t0, correct
    j       incorrect

correct:
    li      a0, 1
    jalr    x0, ra, 0

incorrect:
    li      a0, 0
    jalr    x0, ra, 0

# Convert signed int to float
# float i2f(s32 num);
# INPUT:  A0 = integer number
# OUTPUT: A0 = float number (IEEE 754 single-precision)
# Clobbers: A1, T0, T1, T2
# Rounding: half an ULP (bit 7 of the normalised significand) is added
# before truncation, i.e. ties round away from zero.
i2f:
    srai    t2, a0, 31              # t2 = 0 or -1 depending on the sign
    slli    t2, t2, 31              # t2 = sign bit in position 31
    bgez    a0, no_negate
    neg     a0, a0                  # a0 = |num| (0x80000000 stays as is)
no_negate:
    beqz    a0, zero_result
    clz     t0, a0                  # t0 = number of leading zeros
    sll     a1, a0, t0              # normalise: leading one moves to bit 31
    li      t1, 158                 # 127 (bias) + 31
    sub     a0, t1, t0              # a0 = biased exponent
    addi    a1, a1, 128             # round: add half of the last kept bit
    srli    t1, a1, 31
    bnez    t1, adjust_mantissa     # bit 31 still set: rounding did not carry out

adjust_exponent:
    addi    a0, a0, 1               # significand wrapped to 2.0: bump exponent,
                                    # the remaining fraction bits are all zero

adjust_mantissa:
    slli    a1, a1, 1               # drop the implicit leading one
    srli    a1, a1, 9               # keep the 23 fraction bits
    slli    a0, a0, 23              # exponent field

    or      a0, a0, a1              # exponent | fraction
    or      a0, a0, t2              # | sign
    jalr    x0, ra, 0
zero_result:
    jalr    x0, ra, 0               # a0 is already 0, which is +0.0f

Run RISCV Code With B Extension

Download toolchain

$ sudo apt update 
$ sudo apt install gcc-riscv64-unknown-elf -y

Modify Makefile

Code located in csrc/Makefile














































# Toolchain prefix; override on the command line to use a different toolchain.
CROSS_COMPILE ?= riscv64-unknown-elf-

# RV32I with Zicsr and the bit-manipulation extensions Zba/Zbb/Zbc/Zbs,
# shared by the assembler and the C compiler.
ARCHFLAGS = -march=rv32i_zicsr_zba_zbb_zbc_zbs -mabi=ilp32

ASFLAGS = $(ARCHFLAGS)
CFLAGS = -O0 -Wall $(ARCHFLAGS)
LDFLAGS = --oformat=elf32-littleriscv

AS := $(CROSS_COMPILE)as
CC := $(CROSS_COMPILE)gcc
LD := $(CROSS_COMPILE)ld
OBJCOPY := $(CROSS_COMPILE)objcopy

# Assemble a stand-alone object (used for init.o).
%.o: %.S
	$(AS) -R $(ASFLAGS) -o $@ $<

# Assembly program: assemble, then link with the project linker script.
%.elf: %.S
	$(AS) -R $(ASFLAGS) -o $(@:.elf=.o) $<
	$(LD) -o $@ -T link.lds $(LDFLAGS) $(@:.elf=.o)

# C program: compile, then link together with the start-up object init.o.
%.elf: %.c init.o
	$(CC) $(CFLAGS) -c -o $(@:.elf=.o) $<
	$(LD) -o $@ -T link.lds $(LDFLAGS) $(@:.elf=.o) init.o

# Flat binary image containing only the .text and .data sections.
%.asmbin: %.elf
	$(OBJCOPY) -O binary -j .text -j .data $< $@

BINS = \
	fibonacci.asmbin \
	hello.asmbin \
	mmio.asmbin \
	quicksort.asmbin \
	sb.asmbin \
	Q2.asmbin \
	Q4_A.asmbin \
	Q4_H_1.asmbin \
	Q4_H_2.asmbin

# Clear the .DEFAULT_GOAL special variable, so that the following turns
# to the first target after .DEFAULT_GOAL is not set.
.DEFAULT_GOAL :=

# These targets do not produce files of the same name.
.PHONY: all update clean

all: $(BINS)

update: $(BINS)
	cp -f $(BINS) ../src/main/resources

clean:
	$(RM) *.o *.elf *.asmbin

Run RISCV Code

Put the files (Q2.S, Q4_A.S, Q4_H_1.S and Q4_H_2.S) in csrc/

$ make all

$ make update

$ ./run-verilator.sh -instruction src/main/resources/Q2.asmbin -time 2000 -vcd Q2_dump.vcd

Results

Q4_A

Q4_H_1

Q4_H_2