執行人: nelson0720j
專題解說錄影
改進 semu 並做出程式碼貢獻
semu supports MMU (Sv32)
https://github.com/sysprog21/semu/pull/28 (using LRU)
提交 pull request (進行 code reviewing) 並確認提升記憶體存取的效率
Controls supervisor-mode address translation and protection, holds the physical page number (PPN) of the root page table and an address space identifier (ASID), which facilitates address-translation fences on a per-address-space basis; and the MODE field.
We store the ASID and the page table base address in the same CSR to allow the pair to be changed atomically on a context switch. Swapping them non-atomically could pollute the old virtual address space with new translations, or vice-versa. This approach also slightly reduces the cost of a context switch.
ASID 的目的是減少 context switch 時的 TLB flush。如切換 process 1 到 process 2 時,必須把 TLB 中屬於 process 1 的轉換項目清除。而有了 ASID 就可以不用清除,只要比對 ASID 就可以知道 TLB 的內容是否可用。
We can add ASID in TLB entry
Note that writing satp does not imply any ordering constraints between page-table updates and subsequent address translations. If the new address space’s page tables have been modified, it may be necessary to execute an SFENCE.VMA instruction prior to writing satp.
Executing an SFENCE.VMA instruction guarantees that any stores in the instruction stream prior to the SFENCE.VMA are ordered before all implicit references subsequent to the SFENCE.VMA.
The SFENCE.VMA is used to flush any local hardware caches related to address translation. It is specified as a fence rather than a TLB flush to provide cleaner semantics with respect to which instructions are affected by the flush operation and to support a wider variety of dynamic caching structures and memory-management schemes. SFENCE.VMA is also used by higher privilege levels to synchronize page table writes and the address translation hardware.
Use case in xv6 trampoline.S switching user mode to kernel mode
# fetch the kernel page table address, from p->trapframe->kernel_satp.
ld t1, 0(a0)
# wait for any previous memory operations to complete, so that
# they use the user page table.
sfence.vma zero, zero
# install the kernel page table.
csrw satp, t1
# flush now-stale user entries from the TLB.
sfence.vma zero, zero
清除 TLB 中的條目,避免用到因 context switch 產生的過時資料。
The A bit indicates the virtual page has been read, written, or fetched from since the last time the A bit was cleared
/* Fetch the 32-bit instruction word at virtual address addr into *value.
 *
 * A one-entry cache (vm->cache_fetch) maps the last fetched virtual page
 * number to its host page pointer; a fetch within the same page skips the
 * full translation. On any translation or memory error, vm->error is set
 * and *value is left untouched.
 */
static void mmu_fetch(vm_t *vm, uint32_t addr, uint32_t *value)
{
    uint32_t vpn = addr >> RV_PAGE_SHIFT;
    if (unlikely(vpn != vm->cache_fetch.n_pages)) {
        /* Cache miss: translate the address. (1 << 3) requires the X
         * permission bit; (1 << 6) sets the A (accessed) bit. The privilege
         * test is NOT skipped for instruction fetches. */
        mmu_translate(vm, &addr, (1 << 3), (1 << 6), false, RV_EXC_FETCH_FAULT,
                      RV_EXC_FETCH_PFAULT);
        if (vm->error)
            return;
        uint32_t *page_addr;
        vm->mem_fetch(vm, addr >> RV_PAGE_SHIFT, &page_addr);
        if (vm->error)
            return;
        /* Refill the cache only after both steps above succeeded. */
        vm->cache_fetch.n_pages = vpn;
        vm->cache_fetch.page_addr = page_addr;
    }
    /* Index the word within the page: low page-offset bits, word-aligned. */
    *value = vm->cache_fetch.page_addr[(addr >> 2) & MASK(RV_PAGE_SHIFT - 2)];
}
vm->mem_fetch(vm, addr >> RV_PAGE_SHIFT, &page_addr);
vm->cache_fetch.page_addr[(addr >> 2) & MASK(RV_PAGE_SHIFT - 2)]
/* Translate the virtual address in *addr to a physical address in place.
 *
 * @vm:          virtual machine state
 * @addr:        in: virtual address; out: translated physical address
 * @access_bits: PTE permission bit(s) the access requires (R/W/X)
 * @set_bits:    PTE status bits to set on success (A and/or D)
 * @skip_privilege_test: when true, bypass the U/S privilege check
 * @fault:       exception raised when the page-table walk itself fails
 * @pfault:      exception raised on an invalid/forbidden PTE (page fault)
 *
 * When translation is disabled (no page table), *addr is used as-is.
 * On failure, an exception is recorded via vm_set_exception() and *addr is
 * left untranslated.
 */
static void mmu_translate(vm_t *vm,
                          uint32_t *addr,
                          const uint32_t access_bits,
                          const uint32_t set_bits,
                          const bool skip_privilege_test,
                          const uint8_t fault,
                          const uint8_t pfault)
{
    /* NOTE: save virtual address, for physical accesses, to set exception. */
    vm->exc_val = *addr;
    if (!vm->page_table)
        return;
    uint32_t *pte_ref;
    uint32_t ppn;
    /* Walk the page table; this only fails if a page-table page itself is
     * inaccessible. */
    bool ok = mmu_lookup(vm, (*addr) >> RV_PAGE_SHIFT, &pte_ref, &ppn);
    if (unlikely(!ok)) {
        vm_set_exception(vm, fault, *addr);
        return;
    }
    /* Check that the PTE is a valid, accessible leaf for this access. */
    uint32_t pte;
    if (!(pte_ref /* PTE lookup was successful */ &&
          !(ppn >> 20) /* PPN is valid */ &&
          (pte = *pte_ref, pte & access_bits) /* access type is allowed */ &&
          (!(pte & (1 << 4)) == vm->s_mode ||
           skip_privilege_test) /* privilege matches */
          )) {
        vm_set_exception(vm, pfault, *addr);
        return;
    }
    /* Set the requested A/D status bits, writing back only on change. */
    uint32_t new_pte = pte | set_bits;
    if (new_pte != pte)
        *pte_ref = new_pte;
    /* Compose the physical address: page offset | physical page number. */
    *addr = ((*addr) & MASK(RV_PAGE_SHIFT)) | (ppn << RV_PAGE_SHIFT);
}
skip_privilege_test
只有在 mmu_fetch
的時候為 false ,因為要避免存取到不允許存取的指令,所以要做特權階級的檢查。
mmu_load
和 mmu_store
則依照 riscv 中的規定去檢查 sstatus 暫存器中的 SUM bit 和 vm->s_mode
是否為 S-mode。當處於 S-mode 且 SUM 位元設置時,允許存取 U-mode 的記憶體,因此此時可以跳過特權階級檢查。
/* Assume vm->page_table is set.
 *
 * If there is an error fetching a page table, return false.
 * Otherwise return true and:
 * - in case of valid leaf: set *pte and *ppn
 * - none found (page fault): set *pte to NULL
 */
static bool mmu_lookup(const vm_t *vm,
                       uint32_t vpn,
                       uint32_t **pte,
                       uint32_t *ppn)
{
    /* Level-1 walk: classify the PTE (leaf, pointer to the next level, or
     * invalid) and verify that a superpage leaf is properly aligned. */
    PTE_ITER(vm->page_table, vpn >> 10,
             if (unlikely((*ppn) & MASK(10))) /* misaligned superpage */
                 *pte = NULL;
             else *ppn |= vpn & MASK(10);)
    uint32_t *page_table = vm->mem_page_table(vm, (**pte) >> 10);
    /* The level-0 page-table page could not be fetched. */
    if (!page_table)
        return false;
    /* Level-0 walk: no extra checks needed for a 4 KiB leaf. */
    PTE_ITER(page_table, vpn & MASK(10), )
    /* Reached only if the level-0 entry was a non-leaf pointer: that is an
     * invalid translation, reported as a page fault by the caller. */
    *pte = NULL;
    return true;
}
Any level of PTE may be a leaf PTE, so in addition to 4 KiB pages, Sv32 supports 4 MiB megapages. A megapage must be virtually and physically aligned to a 4 MiB boundary; a page-fault exception is raised if the physical address is insufficiently aligned.
為甚麼 first level pte 可能是 leaf
PTE_ITER 在找到 leaf node (line 13) 時會直接 return
第一次 PTE_ITER 查詢 level 1 page table,得到 level 0 page table,第二次查詢取得 addr
對應的 pte 與 ppn
/* Walk one level of the Sv32 page table.
 *
 * Loads (page_table)[vpn] into *pte and switches on the low four PTE bits
 * (X/W/R/V):
 * - 0b0001 (valid, non-leaf): fall through so the caller descends a level;
 * - any valid leaf encoding: store its PPN in *ppn, run additional_checks,
 *   then return true from the ENCLOSING function;
 * - reserved or invalid encodings: set *pte to NULL and return true (the
 *   caller reports a page fault).
 *
 * NOTE: this macro expands to statements containing `return`, so it may
 * only be used inside a function returning bool (see mmu_lookup).
 */
#define PTE_ITER(page_table, vpn, additional_checks) \
    *pte = &(page_table)[vpn];                       \
    switch ((**pte) & MASK(4)) {                     \
    case 0b0001:                                     \
        break; /* pointer to next level */           \
    case 0b0011:                                     \
    case 0b0111:                                     \
    case 0b1001:                                     \
    case 0b1011:                                     \
    case 0b1111:                                     \
        *ppn = (**pte) >> 10;                        \
        additional_checks return true; /* leaf entry */ \
    case 0b0101:                                     \
    case 0b1101:                                     \
    default:                                         \
        *pte = NULL;                                 \
        return true; /* not valid */                 \
    }
(**pte) & MASK(4)
: 取得 pte permission bits X、W、R、V
if ((insn >> 25) == 0b0001001 /* PRIV: SFENCE_VMA */) {
mmu_fence(vm, insn);
return;
}
/* One-entry translation cache for instruction fetch: maps a single virtual
 * page number to the host address of its backing page. */
typedef struct {
    uint32_t n_pages;    /* cached virtual page number (VPN) */
    uint32_t *page_addr; /* host pointer to the start of the mapped page */
} mmu_cache_t;
The cache in semu currently stores only one VPN and its corresponding page address.
mmu_fence
vm->cache_fetch.n_pages = 0xFFFFFFFF;
mmu_cache_lookup
中缺少「找不到目標 PTE」時的處理機制,會陷入無限迴圈。
此外,它的頁面替換機制與其說是 LRU,更像是 FIFO,會一直不斷地往回尋找 PTE。
mmu_cache_insert
則會根據 write_idx
來插入到下一個索引位址,若到底則回到第一個去插入。並沒有紀錄最近使用的功能。
注意用語:「雙鏈結串列」和「雙向鏈結串列」不同,後者才是 doubly-linked list
用雙向鏈結串列實作 LRU 。
首先,先建立串列結構。
/* Maximum number of entries kept in the software TLB. */
#define TLB_SIZE 16
/* A single cached translation: virtual page number -> pointer to its PTE. */
typedef struct {
    uint32_t vpn;  /* virtual page number */
    uint32_t *pte; /* pointer to the page-table entry for this page */
    bool valid;    /* entry holds a live translation */
    bool dirty;    /* a write went through this entry (PTE D bit was set) */
} tlb_entry_t;
/* Doubly-linked list node wrapping one TLB entry, used for LRU ordering. */
struct node_t {
    tlb_entry_t entry;
    struct node_t *prev;
    struct node_t *next;
};
/* LRU list with sentinel head/tail nodes; the most recently used entry sits
 * right after head, the eviction victim right before tail. */
typedef struct {
    struct node_t *head; /* sentinel: head->next is the MRU entry */
    struct node_t *tail; /* sentinel: tail->prev is the LRU victim */
    uint32_t size;       /* number of real entries (sentinels excluded) */
} tlb_list_t;
清空tlb
/* Flush all cached address translations: the one-entry fetch cache and
 * every entry of the software TLB. Invoked when translation state changes
 * (e.g., on SFENCE.VMA).
 *
 * Fixes vs. the previous version: the sentinel head/tail nodes are
 * allocated once and reused instead of being freed and re-malloc'd on every
 * invalidation, allocation failures are checked, and the per-call debug
 * fprintf (which ran on every fence) is removed. */
static void mmu_invalidate(vm_t *vm)
{
    /* Poison the fetch cache so the next fetch re-translates. */
    vm->cache_fetch.n_pages = 0xFFFFFFFF;
    if (!vm->tlb_list.head) {
        /* First call: allocate the sentinel nodes once. */
        vm->tlb_list.head = malloc(sizeof(struct node_t));
        vm->tlb_list.tail = malloc(sizeof(struct node_t));
        if (!vm->tlb_list.head || !vm->tlb_list.tail) {
            fprintf(stderr, "Failed to allocate TLB sentinel nodes\n");
            exit(EXIT_FAILURE);
        }
    } else {
        /* Free every real entry, keeping the sentinels for reuse. */
        for (struct node_t *current = vm->tlb_list.head->next, *next;
             current != vm->tlb_list.tail; current = next) {
            next = current->next;
            free(current);
        }
    }
    /* Reset to the empty state: head <-> tail. */
    vm->tlb_list.head->prev = NULL;
    vm->tlb_list.head->next = vm->tlb_list.tail;
    vm->tlb_list.tail->next = NULL;
    vm->tlb_list.tail->prev = vm->tlb_list.head;
    vm->tlb_list.size = 0;
}
mmu_translate
執行位址轉換時,若 skip_privilege_test 為真(即 load/store 的情況),會先嘗試查詢 TLB,命中時即可省去走訪頁表的成本。
先在 tlb_lookup
找 TLB 中是否有符合的頁面。
/* load and store fecth cache lookup. */
if (skip_privilege_test) {
pte_ref = tlb_lookup(vm, (*addr) >> RV_PAGE_SHIFT, set_bits & (1 << 7));
if (pte_ref && (*pte_ref & access_bits)) {
*addr = ((*addr) & MASK(RV_PAGE_SHIFT)) |
((*pte_ref >> 10) << RV_PAGE_SHIFT);
return;
}
}
/* Look up vpn in the software TLB.
 *
 * On a hit, the node is moved to the MRU position (right after the head
 * sentinel) and its PTE pointer is returned; for a write access the entry
 * is marked dirty and the PTE D bit (1 << 7) is set. Returns NULL on a
 * miss, in which case the caller falls back to a page-table walk.
 *
 * Fixes vs. the previous version: the per-miss debug instrumentation
 * (fprintf, TLB_LOOK counter, print_tlb) is removed — it ran on every
 * single TLB miss and dominated the cost it was meant to measure. */
static uint32_t *tlb_lookup(vm_t *vm, uint32_t vpn, bool write)
{
    for (struct node_t *node = vm->tlb_list.head->next;
         node != vm->tlb_list.tail; node = node->next) {
        if (node->entry.valid && node->entry.vpn == vpn) {
            if (write) {
                node->entry.dirty = true;
                *node->entry.pte |= (1 << 7); /* set the PTE D bit */
            }
            /* LRU maintenance: unlink and reinsert at the MRU position. */
            tlb_remove(node);
            tlb_insert(node, &vm->tlb_list);
            return node->entry.pte;
        }
    }
    return NULL;
}
/* Unlink a node from its list. Sentinels and detached nodes (those with a
 * NULL neighbor) are left untouched. The node itself is not freed and its
 * own prev/next pointers are not cleared. */
void tlb_remove(struct node_t *node)
{
    if (!node || !node->prev || !node->next)
        return;
    node->prev->next = node->next;
    node->next->prev = node->prev;
}
/* Splice a node in directly after the head sentinel (the MRU position).
 * No-op if the node or the list (or its head) is NULL. */
void tlb_insert(struct node_t *node, tlb_list_t *tlb_list)
{
    if (!node || !tlb_list || !tlb_list->head)
        return;
    struct node_t *first = tlb_list->head->next;
    node->prev = tlb_list->head;
    node->next = first;
    tlb_list->head->next = node;
    first->prev = node;
}
找到後會由 tlb_insert
來更新 TLB ,如果沒有的話再去執行 mmu_lookup
去頁表中查找。
最後把由 mmu_lookup
查找到的頁面透過 tlb_insert_entry
新增到 TLB 中。
/* Cache the translation (vpn -> pte) in the software TLB at the MRU
 * position, evicting the least recently used entry when the TLB would
 * exceed TLB_SIZE.
 *
 * Fixes vs. the previous version: the malloc result is checked (on OOM the
 * translation is simply not cached — the TLB is only a cache, so
 * correctness is unaffected), and the eviction victim pointer is read
 * exactly once. */
static void tlb_insert_entry(vm_t *vm, uint32_t vpn, uint32_t *pte)
{
    struct node_t *new_node = malloc(sizeof *new_node);
    if (!new_node)
        return; /* best effort: skip caching on allocation failure */
    new_node->entry.vpn = vpn;
    new_node->entry.pte = pte;
    new_node->entry.valid = true;
    new_node->entry.dirty = false;
    tlb_insert(new_node, &vm->tlb_list);
    vm->tlb_list.size++;
    if (vm->tlb_list.size > TLB_SIZE) {
        /* Evict the LRU entry, which sits just before the tail sentinel. */
        struct node_t *victim = vm->tlb_list.tail->prev;
        tlb_remove(victim);
        free(victim);
        vm->tlb_list.size--;
    }
}
其中,TLB 使用雙向鏈結串列管理,如果串列長度大於 TLB 大小,則代表 TLB 滿,需找替換頁面,將鏈結串列的最後一個節點移除,否則透過指標 tlb_list_t->head
來插入到串列的開頭,並將串列長度加 1。
鏈結串列可簡稱串列,但不可簡稱鏈結。
node_t *
from incompatible pointer type struct node_t *
struct node_t
解決具名結構體在自引用時錯誤的狀況。typedef struct {
tlb_entry_t entry;
struct node_t *prev;
struct node_t *next;
} node_t;
因為若命名省略,則會在末尾才定義,自引用的成員就會抓不到型別。
使用 Linux 風格的鏈結串列,善用 https://github.com/sysprog21/lab0-c/blob/master/list.h
有試過拿掉 mmu_translate
中執行 tlb_lookup
的特權檢查,但拿掉後,反而直接卡在 allocated TAP interface: tap0
allocated TAP interface: tap0
[ 0.000000] Linux version 6.1.95 (mi2s@mi2s) (riscv32-buildroot-linux-gnu-gcc.br_real (Buildroot -gd2320372) 12.3.0, GNU ld (GNU Binutils) 2.41) #1 Sat Jun 22 18:33:14 CST 2024
[ 0.000000] Machine model: semu
[ 0.000000] earlycon: ns16550 at MMIO 0xf4000000 (options '')
[ 0.000000] printk: bootconsole [ns16550] enabled
[ 0.000000] Zone ranges:
[ 0.000000] Normal [mem 0x0000000000000000-0x000000001fffffff]
[ 0.000000] Movable zone start for each node
[ 0.000000] Early memory node ranges
[ 0.000000] node 0: [mem 0x0000000000000000-0x000000001fffffff]
[ 0.000000] Initmem setup node 0 [mem 0x0000000000000000-0x000000001fffffff]
[ 0.000000] SBI specification v0.3 detected
[ 0.000000] SBI implementation ID=0x999 Version=0x1
[ 0.000000] SBI TIME extension detected
[ 0.000000] SBI SRST extension detected
[ 0.000000] riscv: base ISA extensions aim
[ 0.000000] riscv: ELF capabilities aim
[ 0.000000] Built 1 zonelists, mobility grouping on. Total pages: 130048
[ 0.000000] Kernel command line: earlycon console=ttyS0
[ 0.000000] Dentry cache hash table entries: 65536 (order: 6, 262144 bytes, linear)
[ 0.000000] Inode-cache hash table entries: 32768 (order: 5, 131072 bytes, linear)
[ 0.000000] mem auto-init: stack:off, heap alloc:off, heap free:off
[ 0.000000] Memory: 506844K/524288K available (3252K kernel code, 336K rwdata, 832K rodata, 159K init, 136K bss, 17444K reserved, 0K cma-reserved)
[ 0.000000] SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=1, Nodes=1
[ 0.000000] NR_IRQS: 64, nr_irqs: 64, preallocated irqs: 0
[ 0.000000] riscv-intc: 32 local interrupts mapped
[ 0.000000] plic: interrupt-controller@0: mapped 31 interrupts with 1 handlers for 1 contexts.
[ 0.000000] riscv-timer: riscv_timer_init_dt: Registering clocksource cpuid [0] hartid [0]
[ 0.000000] clocksource: riscv_clocksource: mask: 0xffffffffffffffff max_cycles: 0xefdb196da, max_idle_ns: 440795204367 ns
[ 0.000006] sched_clock: 64 bits at 65MHz, resolution 15ns, wraps every 2199023255550ns
[ 0.001237] Console: colour dummy device 80x25
[ 0.001516] Calibrating delay loop (skipped), value calculated using timer frequency.. 130.00 BogoMIPS (lpj=260000)
[ 0.001820] pid_max: default: 32768 minimum: 301
[ 0.003079] Mount-cache hash table entries: 1024 (order: 0, 4096 bytes, linear)
[ 0.003341] Mountpoint-cache hash table entries: 1024 (order: 0, 4096 bytes, linear)
[ 0.009254] ASID allocator disabled (0 bits)
[ 0.011342] devtmpfs: initialized
[ 0.017562] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645041785100000 ns
[ 0.017864] futex hash table entries: 256 (order: -1, 3072 bytes, linear)
[ 0.027296] NET: Registered PF_NETLINK/PF_ROUTE protocol family
[ 0.032287] platform soc@F0000000: Fixed dependency cycle(s) with /soc@F0000000/interrupt-controller@0
[ 0.059992] clocksource: Switched to clocksource riscv_clocksource
[ 0.119831] NET: Registered PF_INET protocol family
[ 0.122030] IP idents hash table entries: 8192 (order: 4, 65536 bytes, linear)
[ 0.144093] tcp_listen_portaddr_hash hash table entries: 1024 (order: 0, 4096 bytes, linear)
[ 0.144529] Table-perturb hash table entries: 65536 (order: 6, 262144 bytes, linear)
[ 0.144815] TCP established hash table entries: 4096 (order: 2, 16384 bytes, linear)
[ 0.145631] TCP bind hash table entries: 4096 (order: 3, 32768 bytes, linear)
[ 0.146557] TCP: Hash tables configured (established 4096 bind 4096)
[ 0.147060] UDP hash table entries: 256 (order: 0, 4096 bytes, linear)
[ 0.147374] UDP-Lite hash table entries: 256 (order: 0, 4096 bytes, linear)
[ 0.148581] NET: Registered PF_UNIX/PF_LOCAL protocol family
[ 0.153591] Unpacking initramfs...
[ 0.188465] workingset: timestamp_bits=30 max_order=17 bucket_order=0
[ 2.910715] Freeing initrd memory: 8188K
[ 2.970106] Serial: 8250/16550 driver, 4 ports, IRQ sharing disabled
[ 2.978573] printk: console [ttyS0] disabled
[ 2.978889] f4000000.serial: ttyS0 at MMIO 0xf4000000 (irq = 1, base_baud = 312500) is a 16550
[ 2.979259] printk: console [ttyS0] enabled
[ 2.979259] printk: console [ttyS0] enabled
[ 2.979558] printk: bootconsole [ns16550] disabled
[ 2.979558] printk: bootconsole [ns16550] disabled
[ 2.982438] virtio_blk virtio1: 1/0/0 default/read/poll queues
[ 2.985589] virtio_blk virtio1: [vda] 4800 512-byte logical blocks (2.46 MB/2.34 MiB)
[ 2.990792] virtio_net virtio0: Assigned random MAC address 1e:dc:cb:6b:13:d5
[ 3.000442] clk: Disabling unused clocks
[ 3.003016] Freeing unused kernel image (initmem) memory: 156K
[ 3.003265] Kernel memory protection not selected by kernel config.
[ 3.003530] Run /init as init process
善用 GDB 追蹤,查閱 GDB 手冊的 "Break conditions"。
https://hackmd.io/@sysprog/linux-kvm (注意 VirtIO)
當 pmem 正確建立後,使用記憶體測試的演算法確認存取行為符合預期: https://www.memtest86.com/tech_memtest-algoritm.html