洪翊碩
Current ideas to be explored
Model loaded from ./weights/cifar10/alexnet/alexnetbn_v2-power2.pt (3.351736 MB)
QuantWrapper(
(quant): Quantize(scale=tensor([0.0156]), zero_point=tensor([128]), dtype=torch.quint8)
(dequant): DeQuantize()
(module): VGG8Bn(
(conv1): Sequential(
(0): QuantizedConvReLU2d(3, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.03125, zero_point=128, padding=(1, 1))
(1): Identity()
(2): Identity()
(3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)
(conv2): Sequential(
(0): QuantizedConvReLU2d(64, 192, kernel_size=(3, 3), stride=(1, 1), scale=0.0078125, zero_point=128, padding=(1, 1))
(1): Identity()
(2): Identity()
(3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)
(conv3): Sequential(
(0): QuantizedConvReLU2d(192, 384, kernel_size=(3, 3), stride=(1, 1), scale=0.0078125, zero_point=128, padding=(1, 1))
(1): Identity()
(2): Identity()
)
(conv4): Sequential(
(0): QuantizedConvReLU2d(384, 256, kernel_size=(3, 3), stride=(1, 1), scale=0.0078125, zero_point=128, padding=(1, 1))
(1): Identity()
(2): Identity()
)
(conv5): Sequential(
(0): QuantizedConvReLU2d(256, 256, kernel_size=(3, 3), stride=(1, 1), scale=0.0078125, zero_point=128, padding=(1, 1))
(1): Identity()
(2): Identity()
(3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)
(fc6): Sequential(
(0): QuantizedLinearReLU(in_features=4096, out_features=256, scale=0.0078125, zero_point=128, qscheme=torch.per_tensor_affine)
(1): Identity()
(2): QuantizedDropout(p=0.5, inplace=False)
)
(fc7): Sequential(
(0): QuantizedLinearReLU(in_features=256, out_features=128, scale=0.0078125, zero_point=128, qscheme=torch.per_tensor_affine)
(1): Identity()
(2): QuantizedDropout(p=0.5, inplace=False)
)
(fc8): QuantizedLinear(in_features=128, out_features=10, scale=0.0625, zero_point=128, qscheme=torch.per_tensor_affine)
)
)
H
, W
will be written as parameterizedH
and width W
are temporarily set to 8×8.import chisel3._
import chisel3.util._
import chisel3.stage.ChiselStage
class SystolicArray(val rows: Int, val cols: Int) extends Module {
val io = IO(new Bundle {
val compute_en = Input(UInt((rows*cols).W))
val read_en = Input(UInt((rows*cols).W))
val compute_mode = Input(UInt((rows*cols).W))
val ifmap = Input(UInt((8*rows).W))
val weight = Input(UInt((8*cols).W))
val ipsum = Input(UInt((32*rows*cols).W))
val opsum = Output(UInt((32*rows*cols).W))
})
val pe_array = Seq.fill(rows, cols)(Module(new PE))
val opsum_flat = Wire(Vec(rows * cols, UInt(32.W)))
for (i <- 0 until rows) {
for (j <- 0 until cols) {
val idx = i * cols + j
pe_array(i)(j).io.compute_en := io.compute_en(idx)
pe_array(i)(j).io.read_en := io.read_en(idx)
pe_array(i)(j).io.compute_mode := io.compute_mode(idx)
if (j == 0) {
pe_array(i)(j).io.ifmap_i := Cat(~io.ifmap(8*(i+1)-1), io.ifmap(8*(i+1)-2,8*i)).asSInt
} else {
pe_array(i)(j).io.ifmap_i := pe_array(i)(j-1).io.ifmap_o
}
if (i == 0) {
pe_array(i)(j).io.weight_i := io.weight(8*(j+1)-1, 8*j).asSInt
} else {
pe_array(i)(j).io.weight_i := pe_array(i-1)(j).io.weight_o
}
pe_array(i)(j).io.ipsum := io.ipsum((idx+1)*32-1, idx*32).asSInt
opsum_flat(idx) := pe_array(i)(j).io.opsum.asUInt
}
}
io.opsum := opsum_flat.asUInt
}
// Generate the Verilog code
object SystolicArray extends App {
(new ChiselStage).emitVerilog(new SystolicArray(8, 8))
}
Port | I/O | Bitwidth | Description |
---|---|---|---|
clk | I | 1 | |
rst | I | 1 | active high |
compute_en | I | 1 | active high |
read_en | I | 1 | active high |
compute_mode | I | 1 | 0 for load bias/ipsum, 1 for multiply |
ifmap_i | I | 8 | valid when compute_en == 1 && compute_mode == 1 |
ifmap_o | O | 8 | |
weight_i | I | 8 | valid when compute_en == 1 && compute_mode == 1 |
weight_o | O | 8 | |
ipsum | I | 32 | valid when compute_en == 1 && compute_mode == 0 |
opsum | O | 32 | valid when read_en == 1 |
import chisel3._
import chisel3.util._
import chisel3.stage.ChiselStage
// Define parameters for ADD and MUL operations
object OperationMode {
val ADD = 0.U(1.W)
val MUL = 1.U(1.W)
}
// Define the PE Accumulator module
class PE extends Module {
val io = IO(new Bundle {
val compute_en = Input(Bool())
val read_en = Input(Bool())
val compute_mode = Input(Bool())
val ifmap_i = Input(SInt(8.W))
val ifmap_o = Output(SInt(8.W))
val weight_i = Input(SInt(8.W))
val weight_o = Output(SInt(8.W))
val ipsum = Input(SInt(32.W))
val opsum = Output(SInt(32.W))
})
val ifmap = RegInit(0.S(8.W))
val weight = RegInit(0.S(8.W))
val psum = RegInit(0.S(32.W))
when (reset.asBool) {
psum := 0.S
} .elsewhen (io.compute_en) {
ifmap := io.ifmap_i
weight := io.weight_i
when (io.compute_mode === OperationMode.ADD) {
psum := io.ipsum
} .elsewhen (io.compute_mode === OperationMode.MUL) {
psum := psum + (io.ifmap_i * io.weight_i)
}
}
io.ifmap_o := ifmap
io.weight_o := weight
io.opsum := Mux(io.read_en, psum, 0.S)
}
ReLU
Maxpooling
Quantization
In this final project, we process one image at a time (N = 1)
We use channel last
, which means the format is NHWC
, as our memory order
Img2Col
method to fit into the hardware's computation capabdef img2col(ifmap, filter, stride=1, padding=0):
# Get dimensions
C, H, W = ifmap.shape
M, _, R, S = filter.shape
# Calculate output dimensions
E = (H + 2 * padding - R) // stride + 1
F = (W + 2 * padding - S) // stride + 1
# Apply padding to the input feature map
ifmap_padded = np.pad(
ifmap, ((0, 0), (padding, padding), (padding, padding)), mode="constant"
)
# Initialize the column matrix
ifmap_col = np.zeros((E * F, R * S * C))
# Fill the column matrix
row_idx = 0
for y in range(0, H + 2 * padding - R + 1, stride):
for x in range(0, W + 2 * padding - S + 1, stride):
patch = ifmap_padded[:, y : y + R, x : x + S]
# print("patch: \n", patch)
# print("patch flattened: \n", patch.flatten())
ifmap_col[row_idx, :] = patch.flatten().reshape(1, -1)
row_idx += 1
# Reshape filter
filter_col = filter.reshape(R * S * C, -1)
return ifmap_col, filter_col
img2col
, I will tile the result matrix from img2col
ifmap
in the row
direction,tile_size
is equal to systolic_array_height
filter
in the column
direction,tile_size
is equal to systolic_array_width
def tile_matrix(matrix, tile_dir, tile_size):
"""
Tile the input matrix into smaller matrices based on tile_dir and tile_size.
Args:
matrix (np.ndarray): Input 2D matrix to be tiled.
tile_dir (str): Direction of tiling ('row' or 'col').
tile_size (int): Size of each tile along the specified direction.
Returns:
np.ndarray: A 3D array where each slice along the third dimension is a tile.
"""
if not isinstance(matrix, np.ndarray):
raise ValueError("Input matrix must be a numpy array.")
if tile_dir not in ('row', 'col'):
raise ValueError("tile_dir must be either 'row' or 'col'.")
if not isinstance(tile_size, int) or tile_size <= 0:
raise ValueError("tile_size must be a positive integer.")
if tile_dir == 'row':
if matrix.shape[0] % tile_size != 0:
raise ValueError("Number of rows is not divisible by tile_size.")
num_tiles = matrix.shape[0] // tile_size
tiled = matrix.reshape(num_tiles, tile_size, matrix.shape[1])
return tiled
elif tile_dir == 'col':
if matrix.shape[1] % tile_size != 0:
raise ValueError("Number of columns is not divisible by tile_size.")
print("matrix shape", matrix.shape)
num_tiles = matrix.shape[1] // tile_size
tiled = matrix.reshape(matrix.shape[0], num_tiles, tile_size).transpose(1, 0, 2)
return tiled
The diagram below illustrates 3 things
// import Mill dependency
import mill._
import mill.define.Sources
import mill.modules.Util
import mill.scalalib.TestModule.ScalaTest
import scalalib._
// support BSP
import mill.bsp._
object conv_accelerator extends SbtModule { m =>
override def millSourcePath = os.pwd
override def scalaVersion = "2.13.8"
override def scalacOptions = Seq(
"-language:reflectiveCalls",
"-deprecation",
"-feature",
"-Xcheckinit",
)
override def ivyDeps = Agg(
ivy"edu.berkeley.cs::chisel3:3.5.0",
ivy"edu.berkeley.cs::chiseltest:0.5.5"
)
def scalacPluginIvyDeps = Agg(
ivy"edu.berkeley.cs:::chisel3-plugin:3.5.5" // Chisel 編譯器插件
)
object test extends SbtModuleTests with TestModule.ScalaTest {
override def ivyDeps = m.ivyDeps() ++ Agg(
ivy"org.scalatest::scalatest::3.2.15"
)
}
}
Run this command to test the PE
mill conv_accelerator.test.testOnly TestPE
Run this command to test the PE
mill conv_accelerator.run
Generated File
module PE(
input clock,
input reset,
input io_compute_en,
input io_read_en,
input io_compute_mode,
input [7:0] io_ifmap_i,
output [7:0] io_ifmap_o,
input [7:0] io_weight_i,
output [7:0] io_weight_o,
input [31:0] io_ipsum,
output [31:0] io_opsum
);
// omitted
endmodule
module SystolicArray(
input clock,
input reset,
input [63:0] io_compute_en,
input [63:0] io_read_en,
input [63:0] io_compute_mode,
input [63:0] io_ifmap,
input [63:0] io_weight,
input [2047:0] io_ipsum,
output [2047:0] io_opsum
);
// omitted
PE pe_array_0_0 ( // @[Systolic_array.scala 16:47]
.clock(pe_array_0_0_clock),
.reset(pe_array_0_0_reset),
.io_compute_en(pe_array_0_0_io_compute_en),
.io_read_en(pe_array_0_0_io_read_en),
.io_compute_mode(pe_array_0_0_io_compute_mode),
.io_ifmap_i(pe_array_0_0_io_ifmap_i),
.io_ifmap_o(pe_array_0_0_io_ifmap_o),
.io_weight_i(pe_array_0_0_io_weight_i),
.io_weight_o(pe_array_0_0_io_weight_o),
.io_ipsum(pe_array_0_0_io_ipsum),
.io_opsum(pe_array_0_0_io_opsum)
);
// omitted
endmodule
Always write comments in English.
template<typename T>
void load_array_from_python(const string& file_name, const string& func_name, const string& arg,
vector<vector<vector<T> > >& ifmap, vector<vector<vector<T> > >& weight) {
// 初始化 Python 解釋器
Py_Initialize();
PyRun_SimpleString("import sys");
PyRun_SimpleString("sys.path.append('../software')");
// 加載 Python 模組
PyObject* pName = PyUnicode_DecodeFSDefault(file_name.c_str());
PyObject* pModule = PyImport_Import(pName);
Py_DECREF(pName);
if (pModule) {
// 獲取 Python 函數
PyObject* pFunc = PyObject_GetAttrString(pModule, func_name.c_str());
if (pFunc && PyCallable_Check(pFunc)) {
// 構造 Python 函數的參數
PyObject* pArgs = PyTuple_New(1);
PyObject* pArg = PyUnicode_FromString(arg.c_str());
PyTuple_SetItem(pArgs, 0, pArg);
// 調用函數
PyObject* pValue = PyObject_CallObject(pFunc, pArgs);
Py_DECREF(pArgs);
if (pValue && PyTuple_Check(pValue) && PyTuple_Size(pValue) == 2) {
// 解析第一個 3D array
PyObject* pyArray1 = PyTuple_GetItem(pValue, 0);
size_t dim1 = PyList_Size(pyArray1);
ifmap.resize(dim1);
for (size_t i = 0; i < dim1; ++i) {
PyObject* pSubList1 = PyList_GetItem(pyArray1, i);
size_t dim2 = PyList_Size(pSubList1);
ifmap[i].resize(dim2);
for (size_t j = 0; j < dim2; ++j) {
PyObject* pSubList2 = PyList_GetItem(pSubList1, j);
size_t dim3 = PyList_Size(pSubList2);
ifmap[i][j].resize(dim3);
for (size_t k = 0; k < dim3; ++k) {
PyObject* pElem = PyList_GetItem(pSubList2, k);
ifmap[i][j][k] = static_cast<T>(PyLong_AsLong(pElem));
}
}
}
// 解析第二個 3D array
PyObject* pyArray2 = PyTuple_GetItem(pValue, 1);
dim1 = PyList_Size(pyArray2);
weight.resize(dim1);
for (size_t i = 0; i < dim1; ++i) {
PyObject* pSubList1 = PyList_GetItem(pyArray2, i);
size_t dim2 = PyList_Size(pSubList1);
weight[i].resize(dim2);
for (size_t j = 0; j < dim2; ++j) {
PyObject* pSubList2 = PyList_GetItem(pSubList1, j);
size_t dim3 = PyList_Size(pSubList2);
for (size_t k = 0; k < dim3; ++k) {
PyObject* pElem = PyList_GetItem(pSubList2, k);
weight[i][j][k] = static_cast<T>(PyLong_AsLong(pElem));
}
}
}
Py_DECREF(pValue);
} else {
PyErr_Print();
}
} else {
PyErr_Print();
}
Py_XDECREF(pFunc);
Py_DECREF(pModule);
} else {
PyErr_Print();
}
// 終止 Python 解釋器
Py_Finalize();
}
Due to insufficient time, I was unable to complete testing of the Systolic Array and boot model on my accelerator.
I can't get the result statistics to analysis, but this was because I had already spent significant effort on the following tasks:
Test the number of clock cycles required to complete a single convolution, matrix multiplication, or the entire output generation.
Measure the number of images that can be processed per second
(e.g., images from the CIFAR-10 dataset).
Verify the utilization of computational units (e.g., PEs and SRAM) within each module of the hardware design.
mill
environment (build.sc
and some include lib
)