## Implement Vector extension for rv32emu
contributed by [<yuchen>](https://github.com/yuchen0620)
### Feature
First, I add the **Standard Extension for Vector Instructions** to `feature.h` so that it can be queried with the feature test macro.
```c
/* Standard Extension for Vector Instructions.
 * Enabled (1) by default; a definition supplied earlier takes precedence.
 */
#if !defined(RV32_FEATURE_EXT_V)
#define RV32_FEATURE_EXT_V 1
#endif

/* Feature test macro: RV32_HAS(EXT_V) expands to RV32_FEATURE_EXT_V */
#define RV32_HAS(x) RV32_FEATURE_##x
```
### CSRS
The vector extension adds 32 vector registers, and seven unprivileged CSRs (`vstart`, `vxsat`, `vxrm`, `vcsr`, `vtype`, `vl`,`vlenb`) to a base scalar RISC-V ISA.
| Address | Privilege | Name | Description |
| ------- | --------- | ------ | --------------------- |
| 0x008 | URW | vstart | Vector start position |
| 0x009 | URW | vxsat | Fixed-Point Saturate Flag |
| 0x00A | URW | vxrm | Fixed-Point Rounding Mode |
| 0x00F | URW | vcsr | Vector control and status register |
| 0xC20 | URO | vl | Vector length |
| 0xC21 | URO | vtype | Vector data type register |
| 0xC22 | URO | vlenb | VLEN/8 (vector register length in bytes) |
Hence, I add the CSRs of the vector extension to `riscv_private.h` and the vector registers to `#define RV_REGS_LIST` in `riscv.h`.
```diff
/* CSRs */
enum {
/* floating point */
CSR_FFLAGS = 0x001, /* Floating-point accrued exceptions */
CSR_FRM = 0x002, /* Floating-point dynamic rounding mode */
CSR_FCSR = 0x003, /* Floating-point control and status register */
/* Machine trap setup */
CSR_MSTATUS = 0x300, /* Machine status register */
CSR_MISA = 0x301, /* ISA and extensions */
CSR_MEDELEG = 0x302, /* Machine exception delegate register */
CSR_MIDELEG = 0x303, /* Machine interrupt delegate register */
CSR_MIE = 0x304, /* Machine interrupt-enable register */
CSR_MTVEC = 0x305, /* Machine trap-handler base address */
CSR_MCOUNTEREN = 0x306, /* Machine counter enable */
/* machine trap handling */
CSR_MSCRATCH = 0x340, /* Scratch register for machine trap handlers */
CSR_MEPC = 0x341, /* Machine exception program counter */
CSR_MCAUSE = 0x342, /* Machine trap cause */
CSR_MTVAL = 0x343, /* Machine bad address or instruction */
CSR_MIP = 0x344, /* Machine interrupt pending */
/* low words */
CSR_CYCLE = 0xC00, /* Cycle counter for RDCYCLE instruction */
CSR_TIME = 0xC01, /* Timer for RDTIME instruction */
CSR_INSTRET = 0xC02,
/* high words */
CSR_CYCLEH = 0xC80,
CSR_TIMEH = 0xC81,
CSR_INSTRETH = 0xC82,
CSR_MVENDORID = 0xF11, /* Vendor ID */
CSR_MARCHID = 0xF12, /* Architecture ID */
CSR_MIMPID = 0xF13, /* Implementation ID */
CSR_MHARTID = 0xF14, /* Hardware thread ID */
+ /* vector extension (unprivileged CSRs) */
+ CSR_VSTART = 0x008, /* Vector start position */
+ CSR_VXSAT = 0x009, /* Fixed-Point Saturate Flag */
+ CSR_VXRM = 0x00A, /* Fixed-Point Rounding Mode */
+ CSR_VCSR = 0x00F, /* Vector control and status register */
+ CSR_VL = 0xC20, /* Vector length */
+ CSR_VTYPE = 0xC21, /* Vector data type register */
+ CSR_LENB = 0xC22, /* VLEN/8 (vector register length in bytes); NOTE(review): the CSR is called vlenb, consider naming this CSR_VLENB */
};
```
The size of the vector registers is typically not fixed and can be set dynamically by instructions such as `vsetvli`: the `vsetvli` instruction adjusts the vector length (VL) and can change the effective size of the vector registers during program execution.
Hence, we have to define a struct for a dynamically sized vector register.
```c
/* A vector register with dynamically assigned storage: a pointer to
 * 32-bit words together with a recorded size.
 */
typedef struct {
    uint32_t *data; /* backing storage, addressed as 32-bit words */
    size_t size;    /* extent of data; NOTE(review): unit (bytes or elements) is not shown here */
} VectorRegister;
```
Change `riscv_internal` for vector register and new vector CSRs.
```diff
struct riscv_internal {
bool halt; /* indicate whether the core is halted */
/* I/O interface */
riscv_io_t io;
/* integer registers */
riscv_word_t X[N_RV_REGS];
riscv_word_t PC;
/* user provided data */
riscv_user_t userdata;
/* csr registers */
uint64_t csr_cycle; /* Machine cycle counter */
uint32_t csr_time[2]; /* Performance counter */
uint32_t csr_mstatus; /* Machine status register */
uint32_t csr_mtvec; /* Machine trap-handler base address */
uint32_t csr_misa; /* ISA and extensions */
uint32_t csr_mtval; /* Machine bad address or instruction */
uint32_t csr_mcause; /* Machine trap cause */
uint32_t csr_mscratch; /* Scratch register for machine trap handler */
uint32_t csr_mepc; /* Machine exception program counter */
uint32_t csr_mip; /* Machine interrupt pending */
uint32_t csr_mbadaddr;
+#if RV32_HAS(EXT_V)
+ /* vector registers, one entry per architectural register */
+ VectorRegister vector_registers[N_RV_REGS];
+ uint32_t csr_vstart; /* Vector start position */
+ uint32_t csr_vxsat; /* Fixed-Point Saturate Flag */
+ uint32_t csr_vxrm; /* Fixed-Point Rounding Mode */
+ uint32_t csr_vcsr; /* Vector control and status register */
+ uint32_t csr_vl; /* Vector length */
+ uint32_t csr_vtype; /* Vector data type register */
+ uint32_t csr_vlenb; /* VLEN/8 (vector register length in bytes) */
+#endif
```
### Decode
The first thing I do is to add the vector instruction into the `RV_INSN_LIST` of `decode.h`.
```c
#define RV_INSN_LIST \
_(nop, 0, 4, ENC(rs1, rd)) \
/* RV32I Base Instruction Set */ \
_(lui, 0, 4, ENC(rd)) \
_(auipc, 0, 4, ENC(rd)) \
_(jal, 1, 4, ENC(rd)) \
_(jalr, 1, 4, ENC(rs1, rd)) \
_(beq, 1, 4, ENC(rs1, rs2)) \
_(bne, 1, 4, ENC(rs1, rs2)) \
_(blt, 1, 4, ENC(rs1, rs2)) \
_(bge, 1, 4, ENC(rs1, rs2)) \
_(bltu, 1, 4, ENC(rs1, rs2)) \
.
.
.
/* RV32V Standard Extension */ \
IIF(RV32_HAS(EXT_V))( \
_(vsetvli, 0, 4, ENC(rs1, rd)) \
_(vsetivli, 0, 4, ENC(rd)) \
_(vsetvl, 0, 4, ENC(rs1, rs2, rd)) \
_(vle8_v, 0, 4, ENC(rs1, vd)) \
_(vle16_v, 0, 4, ENC(rs1, vd)) \
_(vle32_v, 0, 4, ENC(rs1, vd)) \
_(vse8_v, 0, 4, ENC(rs1, vs3)) \
_(vse16_v, 0, 4, ENC(rs1, vs3)) \
_(vse32_v, 0, 4, ENC(rs1, vs3)) \
_(vadd_vv, 0, 4, ENC(vs1, vs2, vd)) \
_(vadd_vx, 0, 4, ENC(rs1, vs2, vd)) \
_(vadd_vi, 0, 4, ENC(vs2, vd)) \
_(vsub_vv, 0, 4, ENC(vs1, vs2, vd)) \
_(vsub_vx, 0, 4, ENC(rs1, vs2, vd)) \
_(vand_vv, 0, 4, ENC(vs1, vs2, vd)) \
_(vand_vx, 0, 4, ENC(rs1, vs2, vd)) \
_(vand_vi, 0, 4, ENC(vs2, vd)) \
_(vor_vv, 0, 4, ENC(vs1, vs2, vd)) \
_(vor_vx, 0, 4, ENC(rs1, vs2, vd)) \
_(vor_vi, 0, 4, ENC(vs2, vd)) \
_(vxor_vv, 0, 4, ENC(vs1, vs2, vd)) \
_(vxor_vx, 0, 4, ENC(rs1, vs2, vd)) \
_(vxor_vi, 0, 4, ENC(vs2, vd)) \
_(vsll_vv, 0, 4, ENC(vs1, vs2, vd)) \
_(vsll_vx, 0, 4, ENC(rs1, vs2, vd)) \
_(vsll_vi, 0, 4, ENC(vs2, vd)) \
_(vsrl_vv, 0, 4, ENC(vs1, vs2, vd)) \
_(vsrl_vx, 0, 4, ENC(rs1, vs2, vd)) \
_(vsrl_vi, 0, 4, ENC(vs2, vd)) \
_(vsra_vv, 0, 4, ENC(vs1, vs2, vd)) \
_(vsra_vx, 0, 4, ENC(rs1, vs2, vd)) \
_(vsra_vi, 0, 4, ENC(vs2, vd)) \
_(vmseq_vv, 0, 4, ENC(vs1, vs2, vd)) \
_(vmseq_vx, 0, 4, ENC(rs1, vs2, vd)) \
_(vmseq_vi, 0, 4, ENC(vs2, vd)) \
_(vmsne_vv, 0, 4, ENC(vs1, vs2, vd)) \
_(vmsne_vx, 0, 4, ENC(rs1, vs2, vd)) \
_(vmsne_vi, 0, 4, ENC(vs2, vd)) \
_(vmsltu_vv, 0, 4, ENC(vs1, vs2, vd)) \
_(vmsltu_vx, 0, 4, ENC(rs1, vs2, vd)) \
_(vmslt_vv, 0, 4, ENC(vs1, vs2, vd)) \
_(vmslt_vx, 0, 4, ENC(rs1, vs2, vd)) \
_(vmsgtu_vx, 0, 4, ENC(rs1, vs2, vd)) \
_(vmsgtu_vi, 0, 4, ENC(vs2, vd)) \
_(vmsgt_vx, 0, 4, ENC(rs1, vs2, vd)) \
_(vmsgt_vi, 0, 4, ENC(vs2, vd)) \
)
```
#### Configuration-Setting Instructions(vsetvli/vsetivli/vsetvl)

For the zimm and uimm fields, I add some masks to the instruction decode masks in `decode.h`.
```diff
/* instruction decode masks */
enum {
// ....xxxx....xxxx....xxxx....xxxx
INSN_6_2 = 0b00000000000000000000000001111100, // inst[6:2]
// ....xxxx....xxxx....xxxx....xxxx
FR_OPCODE = 0b00000000000000000000000001111111, // R-type, inst[6:0]
FR_RD = 0b00000000000000000000111110000000, // inst[11:7]
FR_FUNCT3 = 0b00000000000000000111000000000000, // inst[14:12]
FR_RS1 = 0b00000000000011111000000000000000, // inst[19:15]
FR_RS2 = 0b00000001111100000000000000000000, // inst[24:20]
FR_FUNCT7 = 0b11111110000000000000000000000000, // inst[31:25]
// ....xxxx....xxxx....xxxx....xxxx
+ VSETVLI_ZIMM = 0b01111111111100000000000000000000, // Vector: vsetvli zimm[10:0] = inst[30:20]
+ VSETIVLI_ZIMM = 0b00111111111100000000000000000000, // Vector: vsetivli zimm[9:0] = inst[29:20]
};
```
I also revise the `rv_insn` struct for zimm and uimm.
```diff
typedef struct rv_insn {
union {
int32_t imm;
uint8_t rs3;
};
uint8_t rd, rs1, rs2;
/* store IR list */
uint8_t opcode;
#if RV32_HAS(EXT_C)
uint8_t shamt;
#endif
+#if RV32_HAS(EXT_V)
+ /* immediate of vsetvli (inst[30:20]) or vsetivli (inst[29:20]) */
+ int32_t zimm;
+ /* vsetivli only: immediate taken from the rs1 field of the instruction */
+ uint8_t uimm;
+#endif
```
In order to decode `vsetvli` and `vsetivli`, I create decode function for zimm in decode.c.
```c
#if RV32_HAS(EXT_V)
/* Extract the immediate of vsetvli.
 * zimm[10:0] = inst[30:20]
 */
static inline int32_t decode_vsetvli_zimm(const uint32_t insn)
{
    const uint32_t field = insn & VSETVLI_ZIMM;
    return field >> 20;
}
/* Extract the immediate of vsetivli.
 * zimm[9:0] = inst[29:20]
 */
static inline int32_t decode_vsetivli_zimm(const uint32_t insn)
{
    const uint32_t field = insn & VSETIVLI_ZIMM;
    return field >> 20;
}
#endif
```
We have to distinguish `vsetvli`, `vsetivli` and `vsetvl` by `inst[31]` and `inst[30]`.
Hence, I define the following function to get `inst[30]` (note that, despite its name, `decode_31` returns bit 30).
```c
/* Return inst[30] as 0 or 1.
 * Note that, despite the "_31" suffix, the bit selected here is bit 30.
 */
static inline int32_t decode_31(const uint32_t insn)
{
    return (insn >> 30) & 0x1;
}
```
By using the function below and the original decode function from rv32emu we can decode `vsetvli`、`vsetivli` and `vsetvl` correctly.
```c
#if RV32_HAS(EXT_V)
/* decode vsetvli
 *  31  30         20 19   15 14  12 11   7 6      0
 * | 0 | zimm[10:0]  |  rs1  |  111 |  rd  | opcode |
 */
static inline void decode_vsetvli(rv_insn_t *ir, const uint32_t insn)
{
    /* fields are filled from the low end of the encoding upwards */
    ir->rd = decode_rd(insn);
    ir->rs1 = decode_rs1(insn);
    ir->zimm = decode_vsetvli_zimm(insn);
}
/* decode vsetivli
 *  31  30  29       20 19       15 14  12 11   7 6      0
 * | 1 | 1 | zimm[9:0] | uimm[4:0] |  111 |  rd  | opcode |
 *
 * The bits that hold rs1 in vsetvli carry an immediate here, so the rs1
 * field is stored in ir->uimm.
 */
static inline void decode_vsetivli(rv_insn_t *ir, const uint32_t insn)
{
    ir->rd = decode_rd(insn);
    ir->uimm = decode_rs1(insn);
    ir->zimm = decode_vsetivli_zimm(insn);
}
/* decode vsetvl
 *  31  30    25 24   20 19   15 14  12 11   7 6      0
 * | 1 | 000000 |  rs2  |  rs1  |  111 |  rd  | opcode |
 */
static inline void decode_vsetvl(rv_insn_t *ir, const uint32_t insn)
{
    ir->rd = decode_rd(insn);
    ir->rs1 = decode_rs1(insn);
    ir->rs2 = decode_rs2(insn);
}
/* Decode the vector configuration-setting instructions.
 *
 *   inst[31] inst[30]   instruction
 *      0        -       vsetvli
 *      1        1       vsetivli
 *      1        0       vsetvl (inst[30:25] must be 000000)
 *
 * @ir:   decoded instruction to fill in
 * @insn: raw 32-bit instruction word
 *
 * Returns true once ir has been filled in, false for an illegal encoding.
 */
static inline bool op_cs(rv_insn_t *ir, const uint32_t insn)
{
    if (!(insn >> 31)) { /* vsetvli */
        decode_vsetvli(ir, insn);
        ir->opcode = rv_insn_vsetvli;
        return true;
    }

    if (decode_31(insn)) { /* vsetivli */
        decode_vsetivli(ir, insn);
        ir->opcode = rv_insn_vsetivli;
        return true;
    }

    /* vsetvl: inst[31:25] is fixed to 1000000, any other pattern is
     * an illegal instruction
     */
    if ((insn >> 25) != 0x40) {
        return false;
    }
    decode_vsetvl(ir, insn);
    ir->opcode = rv_insn_vsetvl;
    return true;
}
#endif
```
The vsetvli instruction will update the `vector length register` (vl) and `vtype register` based on the specified values and extract SEW and LMUL values from the zimm field.

For `rs1` = x0 and `rd` != x0, we have to set `vl` to `VLMAX` (VLMAX = LMUL × VLEN / SEW).
In order to get VLMAX, changing SEW and LMUL from encoding to real number is necessary. Therefore, I define a function `get_LMUL` to meet our need.
```c
/* Translate the 3-bit vlmul encoding into the register group multiplier.
 *
 * @LMUL_ENCODE: vlmul field, i.e. the low three bits of the vtype immediate
 *
 * Returns 1, 2, 4 or 8 for the integer settings, 1/2, 1/4 or 1/8 for the
 * fractional settings, and 0 for every other value so that the caller can
 * detect an illegal setting.
 */
float get_LMUL(uint32_t LMUL_ENCODE)
{
    switch (LMUL_ENCODE) {
    case 0x0: /* 0b000 */
        return 1.0f;
    case 0x1: /* 0b001 */
        return 2.0f;
    case 0x2: /* 0b010 */
        return 4.0f;
    case 0x3: /* 0b011 */
        return 8.0f;
    case 0x7: /* 0b111 */
        return 0.5f;
    case 0x6: /* 0b110 */
        return 0.25f;
    case 0x5: /* 0b101 */
        return 0.125f;
    default: /* Illegal LMUL values */
        return 0.0f;
    }
}
```
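Now, we can implement the `RVOP` of `vsetvli` in `rv32_template.c` by using the above function and the AVL table of the vector specification, which selects the new `vl` from the `rd`/`rs1` combination. The `zimm` immediate is laid out as follows: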
| zimm[10:0] of vsetvli | [10:8] | [7] | [6] | [5:3] | [2:0] |
| --------------------- | ------ | ------------------------- | ------------------------ | ------------------------------------------ | --- |
| Text | 000 | vma | vta | vsew | vlmul |
vma = Vector mask agnostic
vta = Vector tail agnostic
vsew = Selected element width (SEW) setting
vlmul = Vector register group multiplier (LMUL) setting.
```c
#define VLEN 512 /* vector register length in bits */
/* vsetvli rd, rs1, zimm
 *
 * zimm[2:0] holds vlmul and zimm[5:3] holds vsew.
 * VLMAX = LMUL * VLEN / SEW, and the new vl is chosen as follows:
 *   rs1 != x0            : vl = min(X[rs1], VLMAX)
 *   rs1 == x0, rd != x0  : vl = VLMAX
 *   rs1 == x0, rd == x0  : vl is kept, only vtype is updated
 * rd receives the resulting vl. An unsupported vtype sets the vill bit
 * and clears vl.
 */
RVOP(vsetvli, {
    const uint32_t lmul_enc = ir->zimm & 0x7;
    const uint32_t sew_enc = (ir->zimm >> 3) & 0x7;
    const float lmul = get_LMUL(lmul_enc);
    /* get_LMUL() reports an illegal vlmul as 0; vsew encodings above
     * 0b011 are reserved
     */
    if (lmul == 0 || sew_enc > 3) {
        rv->csr_vl = 0;
        rv->csr_vtype = 0x80000000; /* Set vill bit */
        rv->X[ir->rd] = 0;
        return true;
    }
    /* SEW = 8 * 2^vsew; a shift keeps the computation in integers */
    const uint32_t sew = 8u << sew_enc;
    /* VLEN / SEW is exact for every legal SEW, so a fractional LMUL is
     * applied last
     */
    const uint32_t vlmax = (uint32_t) ((VLEN / sew) * lmul);
    if (ir->rs1 != 0) {
        /* the requested length is clamped to what the configuration holds */
        const uint32_t avl = rv->X[ir->rs1];
        rv->csr_vl = (avl < vlmax) ? avl : vlmax;
    } else if (ir->rd != 0) {
        rv->csr_vl = vlmax;
    }
    rv->csr_vtype = ir->zimm;
    rv->X[ir->rd] = rv->csr_vl;
    return true;
})
```
The difference between `vsetvli` and `vsetivli` is the source of the requested vector length (AVL) that is used for `vl` and passed on to the `rd` register.
`vsetvli` takes it from register `rs1`.
`vsetivli` takes it from the immediate `uimm`.
```diff
-rv->X[ir->rd] = rv->X[ir->rs1];
+rv->X[ir->rd] = ir->uimm;
```
The difference between `vsetvli` and `vsetvl` is the value written to the `vtype` register.
For `vsetvli` it comes from `zimm`.
For `vsetvl` it comes from register `rs2`.
```diff
-rv->csr_vtype = ir->zimm;;
+rv->csr_vtype = rv->X[ir->rs2];
```
### Testing
To check the correctness of my implementation, I refer to `hello.S` and write a test program in assembly for the `vsetvli` instruction, named `vector.S`.
```assembly
# RISC-V assembly program to print "t0 is 32" to stdout.
.org 0
# Provide program starting address to linker
.global _start
/* newlib system calls */
.set SYSEXIT, 93
.set SYSWRITE, 64
.section .rodata
str: .ascii "t0 is "
.set str_size, .-str
.section .data
# Two decimal digits followed by a newline; the digits are patched at run time.
num: .ascii "00\n"
.set num_size, .-num
.text
_start:
    li a3, 32
    vsetvli t0, a3, e32, m8   # t0 = resulting vl

    # Convert t0 into two ASCII digits by repeated subtraction, so that no
    # multiply/divide instruction is required. Assumes 0 <= t0 <= 99.
    li t1, 0                  # tens digit
    li t2, 10
1:
    blt t0, t2, 2f
    addi t0, t0, -10
    addi t1, t1, 1
    j 1b
2:
    addi t1, t1, 48           # tens digit -> ASCII ('0' = 48)
    addi t0, t0, 48           # ones digit -> ASCII
    la a1, num
    sb t1, 0(a1)
    sb t0, 1(a1)

    li a7, SYSWRITE     # "write" syscall
    li a0, 1            # 1 = standard output (stdout)
    la a1, str          # load address of string
    li a2, str_size     # length of string
    ecall               # invoke syscall to print the string

    li a7, SYSWRITE
    li a0, 1
    la a1, num          # digits of t0 and the trailing newline
    li a2, num_size
    ecall

    li a7, SYSEXIT
    add a0, x0, x0      # Use 0 return code
    ecall               # invoke syscall to terminate the program
```
Generate the ELF file by the following command
```
$ riscv-none-elf-as -march=rv32iv -mabi=ilp32 vector.S -o vector.elf
```
Modify the Makefile for the testing file.
```diff
OBJS_EXT :=
+# Vector extension instructions
+ENABLE_EXT_V ?= 1
+$(call set-feature, EXT_V)
# RISC-V Architecture Test
include mk/riscv-arch-test.mk
include mk/tests.mk
CHECK_ELF_FILES := \
hello \
puzzle \
ifeq ($(call has, EXT_M), 1)
CHECK_ELF_FILES += \
pi
endif
+ifeq ($(call has, EXT_V), 1)
+CHECK_ELF_FILES += \
+ vector
+endif
EXPECTED_hello = Hello World!
EXPECTED_puzzle = success in 2005 trials
EXPECTED_pi = 3.141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067982148086
+EXPECTED_vector = t0 is 32
check: $(BIN)
$(Q)$(foreach e,$(CHECK_ELF_FILES),\
$(PRINTF) "Running $(e).elf ... "; \
if [ "$(shell $(BIN) $(OUT)/$(e).elf | uniq)" = "$(strip $(EXPECTED_$(e))) inferior exit code 0" ]; then \
$(call notice, [OK]); \
else \
$(PRINTF) "Failed.\n"; \
exit 1; \
fi; \
)
```
I add the RV32V opcode map and the vector instruction decoder to `decode.c` so that rv32emu can determine whether an instruction is a vector instruction or not.
```c
// In riscv_private.h
/* Report whether the 7-bit opcode of insn is one of the three values that
 * are routed to the RV32V decoder: 0b0000111, 0b0100111 or 0b1010111.
 */
FORCE_INLINE bool is_vector(uint32_t insn)
{
    switch (insn & FR_OPCODE) {
    case 0b0000111:
    case 0b0100111:
    case 0b1010111:
        return true;
    default:
        return false;
    }
}
```
```c
//In decode.c
#if RV32_HAS(EXT_V)
/* RV32V opcode map
 *
 * 64 entries, looked up with (funct3 << 3) | opcode[6:4]: every row below
 * is one funct3 value (label at the end of the row) and every column one
 * opcode[6:4] value (labels in the header line). Only funct3 = 111 with
 * opcode[6:4] = 101 is populated; it dispatches to the configuration-setting
 * decoder, every other slot is unimp.
 */
/* clang-format off */
static const decode_t rvv_jump_table[] = {
// 000 001 010 011 100 101 110 111
OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), // 000
OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), // 001
OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), // 010
OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), // 011
OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), // 100
OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), // 101
OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), // 110
OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(cs), OP(unimp), OP(unimp), // 111
};
#endif
/* clang-format on */
/* Vector Extension Instruction: opcodes accepted by is_vector() are
 * dispatched through rvv_jump_table instead of the regular opcode map.
 */
#if RV32_HAS(EXT_V)
    if (is_vector(insn)) {
        /* funct3 lands in bits 5:3 of the index, opcode[6:4] in bits 2:0 */
        const uint32_t funct3_part = (insn & FR_FUNCT3) >> 9;
        const uint32_t opcode_part = (insn & FR_OPCODE) >> 4;
        const uint32_t v_index = funct3_part | opcode_part;

        /* decode instruction (vector instructions) */
        const decode_t op = rvv_jump_table[v_index];
        assert(op);
        return op(ir, insn);
    }
#endif
```
### Reference
[vector extension](https://github.com/riscv/riscv-v-spec)
[rv32emu](https://github.com/sysprog21/rv32emu)