Implement Vector extension for rv32emu

## Implement Vector extension for rv32emu contributed by [<yuchen>](https://github.com/yuchen0620) ### Feature First, I add the **Standard Extension for Vector Instructions** to `feature.h` so that it can be checked with the feature test macro. ```c /* Standard Extension for Vector Instructions */ #ifndef RV32_FEATURE_EXT_V #define RV32_FEATURE_EXT_V 1 #endif /* Feature test macro */ #define RV32_HAS(x) RV32_FEATURE_##x ``` ### CSRs The vector extension adds 32 vector registers and seven unprivileged CSRs (`vstart`, `vxsat`, `vxrm`, `vcsr`, `vtype`, `vl`, `vlenb`) to a base scalar RISC-V ISA. | Address | Privilege | Name | Description | | ------- | --------- | ------ | --------------------- | | 0x008 | URW | vstart | Vector start position | | 0x009 | URW | vxsat | Fixed-Point Saturate Flag | | 0x00A | URW | vxrm | Fixed-Point Rounding Mode | | 0x00F | URW | vcsr | Vector control and status register | | 0xC20 | URO | vl | Vector length | | 0xC21 | URO | vtype | Vector data type register | | 0xC22 | URO | vlenb | VLEN/8 (vector register length in bytes) | Hence I add the CSRs of the vector extension to `riscv_private.h` and the vector registers to `#define RV_REGS_LIST` in `riscv.h`. 
```diff /* CSRs */ enum { /* floating point */ CSR_FFLAGS = 0x001, /* Floating-point accrued exceptions */ CSR_FRM = 0x002, /* Floating-point dynamic rounding mode */ CSR_FCSR = 0x003, /* Floating-point control and status register */ /* Machine trap setup */ CSR_MSTATUS = 0x300, /* Machine status register */ CSR_MISA = 0x301, /* ISA and extensions */ CSR_MEDELEG = 0x302, /* Machine exception delegate register */ CSR_MIDELEG = 0x303, /* Machine interrupt delegate register */ CSR_MIE = 0x304, /* Machine interrupt-enable register */ CSR_MTVEC = 0x305, /* Machine trap-handler base address */ CSR_MCOUNTEREN = 0x306, /* Machine counter enable */ /* machine trap handling */ CSR_MSCRATCH = 0x340, /* Scratch register for machine trap handlers */ CSR_MEPC = 0x341, /* Machine exception program counter */ CSR_MCAUSE = 0x342, /* Machine trap cause */ CSR_MTVAL = 0x343, /* Machine bad address or instruction */ CSR_MIP = 0x344, /* Machine interrupt pending */ /* low words */ CSR_CYCLE = 0xC00, /* Cycle counter for RDCYCLE instruction */ CSR_TIME = 0xC01, /* Timer for RDTIME instruction */ CSR_INSTRET = 0xC02, /* high words */ CSR_CYCLEH = 0xC80, CSR_TIMEH = 0xC81, CSR_INSTRETH = 0xC82, CSR_MVENDORID = 0xF11, /* Vendor ID */ CSR_MARCHID = 0xF12, /* Architecture ID */ CSR_MIMPID = 0xF13, /* Implementation ID */ CSR_MHARTID = 0xF14, /* Hardware thread ID */ + /*vector extension*/ + CSR_VSTART = 0x008, + CSR_VXSAT = 0x009, + CSR_VXRM = 0x00A, + CSR_VCSR = 0x00F, + CSR_VL = 0xC20, + CSR_VTYPE = 0xC21, + CSR_LENB = 0xC22, }; ``` The size of a vector register is typically not fixed: it can be set dynamically by instructions such as `vsetvli`, which adjusts the vector length (VL) and can change the effective size of the vector registers during program execution. Hence we have to define a struct for a dynamically sized vector register. 
```c // Define a structure to represent vector registers typedef struct { uint32_t* data; size_t size; }VectorRegister; ``` Change `riscv_internal` for vector register and new vector CSRs. ```diff struct riscv_internal { bool halt; /* indicate whether the core is halted */ /* I/O interface */ riscv_io_t io; /* integer registers */ riscv_word_t X[N_RV_REGS]; riscv_word_t PC; /* user provided data */ riscv_user_t userdata; /* csr registers */ uint64_t csr_cycle; /* Machine cycle counter */ uint32_t csr_time[2]; /* Performance conter */ uint32_t csr_mstatus; /* Machine status regester */ uint32_t csr_mtvec; /* Machine trap-handler base address */ uint32_t csr_misa; /* ISA and extensions */ uint32_t csr_mtval; /* Machine bad address or instruction */ uint32_t csr_mcause; /* Machine trap cause */ uint32_t csr_mscratch; /* Scartch register for machine trap handler */ uint32_t csr_mepc; /* Machine exception program counter */ uint32_t csr_mip; /* Machine interrupt pending */ uint32_t csr_mbadaddr; +#if RV32_HAS(EXT_V) + VectorRegister +vector_registers[N_RV_REGS]; + uint32_t csr_vstart; /* +Vector start position */ + uint32_t csr_vxsat; /* +Fixed-Point Saturate Flag */ + uint32_t csr_vxrm; /* +Fixed-Point Rounding Mode */ + uint32_t csr_vcsr; /* +Vector control and status +register */ + uint32_t csr_vl; /* +Vector length */ + uint32_t csr_vtype; /* +Vector data type register */ + uint32_t csr_vlenb; /* VLEN/8 (vector register length in bytes) */ +#endif ``` ### Decode The first thing I do is to add the vector instruction into the `RV_INSN_LIST` of `decode.h`. ```c #define RV_INSN_LIST \ _(nop, 0, 4, ENC(rs1, rd)) \ /* RV32I Base Instruction Set */ \ _(lui, 0, 4, ENC(rd)) \ _(auipc, 0, 4, ENC(rd)) \ _(jal, 1, 4, ENC(rd)) \ _(jalr, 1, 4, ENC(rs1, rd)) \ _(beq, 1, 4, ENC(rs1, rs2)) \ _(bne, 1, 4, ENC(rs1, rs2)) \ _(blt, 1, 4, ENC(rs1, rs2)) \ _(bge, 1, 4, ENC(rs1, rs2)) \ _(bltu, 1, 4, ENC(rs1, rs2)) \ . . . 
/* RV32V Standard Extension */ \ IIF(RV32_HAS(EXT_V))( \ _(vsetvli, 0, 4, ENC(rs1, rd)) \ _(vsetivli, 0, 4, ENC(rd)) \ _(vsetvl, 0, 4, ENC(rs1, rs2, rd)) \ _(vle8_v, 0, 4, ENC(rs1, vd)) \ _(vle16_v, 0, 4, ENC(rs1, vd)) \ _(vle32_v, 0, 4, ENC(rs1, vd)) \ _(vse8_v, 0, 4, ENC(rs1, vs3)) \ _(vse16_v, 0, 4, ENC(rs1, vs3)) \ _(vse32_v, 0, 4, ENC(rs1, vs3)) \ _(vadd_vv, 0, 4, ENC(vs1, vs2, vd)) \ _(vadd_vx, 0, 4, ENC(rs1, vs2, vd)) \ _(vadd_vi, 0, 4, ENC(vs2, vd)) \ _(vsub_vv, 0, 4, ENC(vs1, vs2, vd)) \ _(vsub_vx, 0, 4, ENC(rs1, vs2, vd)) \ _(vand_vv, 0, 4, ENC(vs1, vs2, vd)) \ _(vand_vx, 0, 4, ENC(rs1, vs2, vd)) \ _(vand_vi, 0, 4, ENC(vs2, vd)) \ _(vor_vv, 0, 4, ENC(vs1, vs2, vd)) \ _(vor_vx, 0, 4, ENC(rs1, vs2, vd)) \ _(vor_vi, 0, 4, ENC(vs2, vd)) \ _(vxor_vv, 0, 4, ENC(vs1, vs2, vd)) \ _(vxor_vx, 0, 4, ENC(rs1, vs2, vd)) \ _(vxor_vi, 0, 4, ENC(vs2, vd)) \ _(vsll_vv, 0, 4, ENC(vs1, vs2, vd)) \ _(vsll_vx, 0, 4, ENC(rs1, vs2, vd)) \ _(vsll_vi, 0, 4, ENC(vs2, vd)) \ _(vsrl_vv, 0, 4, ENC(vs1, vs2, vd)) \ _(vsrl_vx, 0, 4, ENC(rs1, vs2, vd)) \ _(vsrl_vi, 0, 4, ENC(vs2, vd)) \ _(vsra_vv, 0, 4, ENC(vs1, vs2, vd)) \ _(vsra_vx, 0, 4, ENC(rs1, vs2, vd)) \ _(vsra_vi, 0, 4, ENC(vs2, vd)) \ _(vmseq_vv, 0, 4, ENC(vs1, vs2, vd)) \ _(vmseq_vx, 0, 4, ENC(rs1, vs2, vd)) \ _(vmseq_vi, 0, 4, ENC(vs2, vd)) \ _(vmsne_vv, 0, 4, ENC(vs1, vs2, vd)) \ _(vmsne_vx, 0, 4, ENC(rs1, vs2, vd)) \ _(vmsne_vi, 0, 4, ENC(vs2, vd)) \ _(vmsltu_vv, 0, 4, ENC(vs1, vs2, vd)) \ _(vmsltu_vx, 0, 4, ENC(rs1, vs2, vd)) \ _(vmslt_vv, 0, 4, ENC(vs1, vs2, vd)) \ _(vmslt_vx, 0, 4, ENC(rs1, vs2, vd)) \ _(vmsgtu_vx, 0, 4, ENC(rs1, vs2, vd)) \ _(vmsgtu_vi, 0, 4, ENC(vs2, vd)) \ _(vmsgt_vx, 0, 4, ENC(rs1, vs2, vd)) \ _(vmsgt_vi, 0, 4, ENC(vs2, vd)) \ ) ``` #### Configuration-Setting Instructions (vsetvli/vsetivli/vsetvl) ![image](https://hackmd.io/_uploads/BkopTapup.png) For the zimm and uimm fields, I add some masks to the instruction decode masks in `decode.h`. 
```diff /* instruction decode masks */ enum { // ....xxxx....xxxx....xxxx....xxxx INSN_6_2 = 0b00000000000000000000000001111100, // ....xxxx....xxxx....xxxx....xxxx FR_OPCODE = 0b00000000000000000000000001111111, // R-type FR_RD = 0b00000000000000000000111110000000, FR_FUNCT3 = 0b00000000000000000111000000000000, FR_RS1 = 0b00000000000011111000000000000000, FR_RS2 = 0b00000001111100000000000000000000, FR_FUNCT7 = 0b11111110000000000000000000000000, // ....xxxx....xxxx....xxxx....xxxx + VSETVLI_ZIMM = 0b01111111111100000000000000000000, //Vector + VSETIVLI_ZIMM = 0b00111111111100000000000000000000, }; ``` I also extend the `rv_insn` struct with `zimm` and `uimm` fields. ```diff typedef struct rv_insn { union { int32_t imm; uint8_t rs3; }; uint8_t rd, rs1, rs2; /* store IR list */ uint8_t opcode; #if RV32_HAS(EXT_C) uint8_t shamt; #endif +#if RV32_HAS(EXT_V) + int32_t zimm; + uint8_t uimm; +#endif ``` In order to decode `vsetvli` and `vsetivli`, I create decode functions for the zimm field in `decode.c`. ```c #if RV32_HAS(EXT_V) /* decode vsetvli zimm[10:0] field * zimm = inst[30:20] */ static inline int32_t decode_vsetvli_zimm(const uint32_t insn) { return (insn & VSETVLI_ZIMM) >> 20; } /* decode vsetivli zimm[9:0] field * zimm = inst[29:20] */ static inline int32_t decode_vsetivli_zimm(const uint32_t insn) { return (insn & VSETIVLI_ZIMM) >> 20; } #endif ``` We have to distinguish `vsetvli`, `vsetivli`, and `vsetvl` by `inst[31]` and `inst[30]`, so I define the following function to extract `inst[30]` (despite its name, `decode_31` returns bit 30). ```c static inline int32_t decode_31(const uint32_t insn) { return (insn & 0x40000000 ) >> 30; } ``` By using the functions below together with the original decode functions from rv32emu, we can decode `vsetvli`, `vsetivli`, and `vsetvl` correctly. 
```c #if RV32_HAS(EXT_V) /* decode vsetvli * 31 30 20 19 15 14 12 11 7 6 0 * |0| zimm[11:0] | rs1 | 111 | rd | opcode | */ static inline void decode_vsetvli(rv_insn_t *ir, const uint32_t insn) { ir->zimm = decode_vsetvli_zimm(insn); ir->rs1 = decode_rs1(insn); ir->rd = decode_rd(insn); } /* decode vsetivli * 31 30 29 20 19 15 14 12 11 7 6 0 * |1|1| zimm[11:0] | rs1 | 111 | rd | opcode | */ static inline void decode_vsetivli(rv_insn_t *ir, const uint32_t insn) { ir->zimm = decode_vsetivli_zimm(insn); ir->uimm = decode_rs1(insn); ir->rd = decode_rd(insn); } /* decode vsetvl * 31 30 25 24 20 19 15 14 12 11 7 6 0 * |1| 000000| rs2 | rs1 | 111 | rd | opcode | */ static inline void decode_vsetvl(rv_insn_t *ir, const uint32_t insn) { ir->rs2 = decode_rs2(insn); ir->rs1 = decode_rs1(insn); ir->rd = decode_rd(insn); } static inline bool op_cs(rv_insn_t *ir, const uint32_t insn) { //configruation-setting switch(insn>>31){ case 0: //vsetvli decode_vsetvli(ir,insn); ir->opcode = rv_insn_vsetvli; break; case 1: switch(decode_31(insn)){ case 0: //vsetvl decode_vsetvl(ir,insn); ir->opcode = rv_insn_vsetvl; break; case 1: //vsetivli decode_vsetivli(ir,insn); ir->opcode = rv_insn_vsetivli; break; } default: /* illegal instruction */ return false; } return true; } #endif ``` The vsetvli instruction will update the `vector length register` (vl) and `vtype register` based on the specified values and extract SEW and LMUL values from the zimm field. ![image](https://hackmd.io/_uploads/SyqIp_eKT.png) For `rs1` = x0 and `rd` != x0, we have to set `vl` to `VLMAX`(VLMAX = VLEN/SEW/LMUL). In order to get VLMAX, changing SEW and LMUL from encoding to real number is necessary. Therefore, I define a function `get_LMUL` to meet our need. 
```c float get_LMUL(uint32_t LMUL_ENCODE){ float LMUL = 1; //default switch(LMUL_ENCODE){ case 0b000: LMUL = 1; break; case 0b001: LMUL = 2; break; case 0b010: LMUL = 4; break; case 0b011: LMUL = 8; break; case 0b111: LMUL = 1/2; break; case 0b110: LMUL = 1/4; break; case 0b101: LMUL = 1/8; break; default: // Illegal LMUL values LMUL = 0; break; } return LMUL; } ``` Now, we can implement the `RVOP` of `vsetvli` in `rv32_template.c` by using the above function and the AVL table. | zimm[10:0] of vsetvli | [10:8] | [7] | [6] | [5:3] | [2:0] | | --------------------- | ------ | ------------------------- | ------------------------ | ------------------------------------------ | --- | | Text | 000 | vma | vta | vsew | vlmul | vma = Vector mask agnostic; vta = Vector tail agnostic; vsew = Selected element width (SEW) setting; vlmul = Vector register group multiplier (LMUL) setting. ```c #define VLEN 512 RVOP(vsetvli, { uint32_t LMUL_ENCODE = ir->zimm & 0b111; uint32_t SEW_ENCODE = ir->zimm >> 3 & 0b111; float LMUL = 1; //default uint32_t SEW = 8; //default uint32_t VLMAX; LMUL = get_LMUL(LMUL_ENCODE); if(LMUL==0){ // Illegal LMUL values rv->csr_vl = 0; rv->csr_vtype = 0x80000000; // Set vill bit return true; } SEW = SEW*pow(2,SEW_ENCODE); VLMAX = VLEN/SEW/LMUL; if(ir->rs1==0){ if(ir->rd==0){ rv->csr_vtype = ir->zimm; } else if(ir->rd!=0){ rv->csr_vl = VLMAX; rv->csr_vtype = ir->zimm; } } else{ // rs1!x0 // Update the vector length (vl) and vector type (vtype) registers rv->csr_vl = rv->X[ir->rs1]; rv->csr_vtype = ir->zimm; } rv->X[ir->rd] = rv->X[ir->rs1]; return true; }) ``` The difference between `vsetvli` and `vsetivli` is the source of the value written to the `rd` register: `vsetvli` takes it from `rs1`, whereas `vsetivli` takes it from `uimm`. ```diff -rv->X[ir->rd] = rv->X[ir->rs1]; +rv->X[ir->rd] = ir->uimm; ``` The difference between `vsetvli` and `vsetvl` is the source of the value written to the `vtype` register: `vsetvli` takes it from `zimm`, whereas `vsetvl` takes it from `rs2`. 
```diff -rv->csr_vtype = ir->zimm;; +rv->csr_vtype = rv->X[ir->rs2]; ``` ### Testing To check the correctness of my implementation, I refer to `hello.S` and write an assembly test for the `vsetvli` instruction named `vector.S`. ```assembly # RISC-V assembly program to print "t0 is 32" to stdout. .org 0 # Provide program starting address to linker .global _start /* newlib system calls */ .set SYSEXIT, 93 .set SYSWRITE, 64 .section .rodata str: .ascii "t0 is" .set str_size, .-str .text _start: li a3, 32 vsetvli t0, a3, e32, m8 li a7, SYSWRITE # "write" syscall li a0, 1 # 1 = standard output (stdout) la a1, str # load address of string li a2, str_size # length of string ecall # invoke syscall to print the string li a7, SYSWRITE li a0, 1 la a1, t0 li a2, 4 ecall li a7, SYSEXIT add a0, x0, x0 # Use 0 return code ecall # invoke syscall to terminate the program ``` Generate the ELF file with the following command: ``` $ riscv-none-elf-as -march=rv32iv -mabi=ilp32 vector.S -o vector.elf ``` Modify the Makefile to include the new test file. ```diff OBJS_EXT := +# Vector extension instructions +ENABLE_EXT_V ?= 1 +$(call set-feature, EXT_V) # RISC-V Architecture Test include mk/riscv-arch-test.mk include mk/tests.mk CHECK_ELF_FILES := \ hello \ puzzle \ ifeq ($(call has, EXT_M), 1) CHECK_ELF_FILES += \ pi endif +ifeq ($(call has, EXT_V), 1) +CHECK_ELF_FILES += \ + vector +endif EXPECTED_hello = Hello World! EXPECTED_puzzle = success in 2005 trials EXPECTED_pi = 3.141592653589793238462643383279502884197169399375105820974944592307816406286208998628034825342117067982148086 +EXPECTED_vector = t0 is 32 check: $(BIN) $(Q)$(foreach e,$(CHECK_ELF_FILES),\ $(PRINTF) "Running $(e).elf ... 
"; \ if [ "$(shell $(BIN) $(OUT)/$(e).elf | uniq)" = "$(strip $(EXPECTED_$(e))) inferior exit code 0" ]; then \ $(call notice, [OK]); \ else \ $(PRINTF) "Failed.\n"; \ exit 1; \ fi; \ ) ``` I add the rv32v opcode map and decode vector instruction in `decode.c` for rv32emu to determind the instruction is vector instruction or not. ```c // In riscv_private.h /* Detect the instruction is RV32V or not */ FORCE_INLINE bool is_vector(uint32_t insn) { uint8_t v_op = insn & FR_OPCODE; return v_op == 0b111 || v_op == 0b0100111 || v_op == 0b1010111; } ``` ```c //In decode.c #if RV32_HAS(EXT_V) /* RV32V opcode map */ /* clang-format off */ static const decode_t rvv_jump_table[] = { // 000 001 010 011 100 101 110 111 OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), // 000 OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), // 001 OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), // 010 OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), // 011 OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), // 100 OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), // 101 OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), // 110 OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(unimp), OP(cs), OP(unimp), OP(unimp), // 111 }; #endif /* clang-format on */ /* Vector Extension Instruction */ #if RV32_HAS(EXT_V) if (is_vector(insn)) { const uint32_t v_index = (insn & FR_FUNCT3) >> 9 | (insn & FR_OPCODE) >> 4; /* decode instruction (vector instructions) */ const decode_t op = rvv_jump_table[v_index]; assert(op); return op(ir, insn); } #endif ``` ### Reference [vector extension](https://github.com/riscv/riscv-v-spec) [rv32emu](https://github.com/sysprog21/rv32emu)