Lab COW

tags: 6.1810

lab

本次作業實作 Copy-on-Write Fork,

使用 RISC-V PTE 的 RSW bit 當作 page 是否為 cow 的依據

# kernel/riscv.h #define PTE_COW (1L << 8) // cow PTE

fork() 透過 uvmcopy() 複製 parent 的 pagetable,
COW 版本不複製實體 page,而是讓 child 的 pagetable 指向與 parent 相同的 page,並將 parent 與 child 兩邊所有可寫 PTE 的 write bit 清除、改設 COW bit,同時增加該 page 的 refcount,
這樣一來,當 parent 或 child 寫入 COW page 時,就會觸發 store page fault

# kernel/vm.c uvmcopy(pagetable_t old, pagetable_t new, uint64 sz) { pte_t *pte; uint64 pa, i; uint flags; for(i = 0; i < sz; i += PGSIZE){ if((pte = walk(old, i, 0)) == 0) panic("uvmcopy: pte should exist"); if((*pte & PTE_V) == 0) panic("uvmcopy: page not present"); if ((*pte) & PTE_W) { (*pte) = ((*pte) & ~PTE_W) | PTE_COW; } pa = PTE2PA(*pte); refcount_inc(pa); flags = PTE_FLAGS(*pte); if(mappages(new, i, PGSIZE, pa, flags) != 0) { goto err; } } return 0; err: uvmunmap(new, 0, i / PGSIZE, 0); freewalk(new); for (uint64 j = 0; j <= i; j +=PGSIZE) { pte = walk(old, i, 0); if ((*pte) & PTE_COW) { (*pte) = (*pte) & ~PTE_COW; (*pte) &= PTE_W; } pa = PTE2PA(*pte); } return -1; }

接著處理 usertrap()。當 user trap 觸發,而且原因是 store page fault(scause 為 15)時,有可能是 cow page 被寫入,此時需要配置一個新的 page;若該位址不是 cow page,則將 process 終止

kernel/trap.c usertrap(void) intr_on(); syscall(); } else if (r_scause() == 15) { // Store Page fault if (get_new_page(p->pagetable, r_stval()) != 0) setkilled(p); } else if((which_dev = devintr()) != 0){ // ok } else {

get_new_page() 首先檢查 pte 的屬性,接著檢查 page 的使用情況,如果只有一人在用,那就不需要取得新的 page。
如果取得新的 page 失敗,要將 refcount 加回去,避免回收 child process 時共享的 page 也被收回

// Resolve a copy-on-write fault for the page containing user virtual
// address `va` in `pagetable`.
//
// If the page is mapped, user-accessible and marked PTE_COW, the mapping
// is made writable: in place when this page table holds the only
// reference, otherwise by pointing the PTE at a freshly allocated copy.
//
// Returns 0 on success. Returns -1 when `va` is out of range, is not
// mapped, is not a user COW page, or when no memory is available for the
// copy; in all of those cases the PTE is left untouched.
int
get_new_page(pagetable_t pagetable, uint64 va)
{
  pte_t *pte;
  uint64 old;
  uint flags;
  char *new;

  // walk() cannot handle addresses at or above MAXVA, and the address
  // comes from a user fault or syscall argument, so reject it quietly.
  if (va >= MAXVA)
    return -1;
  va = PGROUNDDOWN(va);

  // A store to an unmapped or non-COW address is a user error, not a
  // kernel bug: report failure and let the caller decide what to do.
  if ((pte = walk(pagetable, va, 0)) == 0)
    return -1;
  if ((*pte & PTE_V) == 0 || (*pte & PTE_U) == 0)
    return -1;
  if ((*pte & PTE_COW) == 0)
    return -1;

  old = PTE2PA(*pte);
  // Flags for the resolved mapping. The PTE itself is only rewritten once
  // success is certain, so a failed allocation cannot leave a writable
  // mapping to a page that is still shared.
  flags = (PTE_FLAGS(*pte) | PTE_W) & ~PTE_COW;

  // refcount_dec() returns the count before decrementing and never goes
  // below 1. A result of 1 means nobody else references the page, so it
  // can simply be made writable in place.
  // NOTE(review): between this decrement and the memmove() below, another
  // sharer could observe a count of 1 and start writing to `old` in place;
  // confirm whether that window matters for the callers.
  if (refcount_dec(old) == 1) {
    *pte = PA2PTE(old) | flags;
    return 0;
  }

  if ((new = (char*)kalloc()) == 0) {
    // Our reference was already dropped above; restore it so the count
    // still matches the unchanged mapping when the page is later freed.
    refcount_inc(old);
    return -1;
  }

  memmove(new, (char*)old, PGSIZE);
  *pte = PA2PTE((uint64)new) | flags;
  return 0;
}

kfree()時,根據 refcount 判斷需不需要真的收回 page

void @@ -51,6 +73,9 @@ kfree(void *pa) if(((uint64)pa % PGSIZE) != 0 || (char*)pa < end || (uint64)pa >= PHYSTOP) panic("kfree"); if (refcount_dec((uint64)pa) > 1) return; // Fill with junk to catch dangling refs. memset(pa, 1, PGSIZE); @@ -76,7 +101,46 @@ kalloc(void) kmem.freelist = r->next; release(&kmem.lock); - if(r) if(r) { memset((char*)r, 5, PGSIZE); // fill with junk refcount_set((uint64)r, 1); } return (void*)r; }

新增一個 array 存放 refcount

# kernel/kalloc.c struct { struct run *freelist; } kmem; +struct { + struct spinlock lock; + uint *array; + uint size; +} pg_refcount; + +void +initrefcount() +{ + for (uint i = 0; i < pg_refcount.size; i++) + pg_refcount.array[i] = 1; +} + void kinit() { initlock(&kmem.lock, "kmem"); - freerange(end, (void*)PHYSTOP); + initlock(&pg_refcount.lock, "pg_refcount"); + + uint64 range = ((PHYSTOP - (uint64)end) >> 12) * sizeof(*pg_refcount.array); + range = PGROUNDUP(range); + uint64 size = range / sizeof(*pg_refcount.array); + pg_refcount.array = (uint*)(PHYSTOP - range); + pg_refcount.size = size; + + initrefcount(); + freerange(end, (void*)(PHYSTOP-range)); } +void +refcount_inc(uint64 pa) +{ + uint64 index = (pa - (uint64)end) >> 12; + + acquire(&pg_refcount.lock); + ++pg_refcount.array[index]; + release(&pg_refcount.lock); +} + +void +refcount_set(uint64 pa, uint val) +{ + uint64 index = (pa - (uint64)end) >> 12; + + acquire(&pg_refcount.lock); + pg_refcount.array[index] = val; + release(&pg_refcount.lock); +} + +uint +refcount_dec(uint64 pa) +{ + uint64 index = (pa - (uint64)end) >> 12; + + acquire(&pg_refcount.lock); + uint *val = &pg_refcount.array[index]; + if (*val == 1) { + release(&pg_refcount.lock); + return 1; + } + uint ret = *val; + --*val; + release(&pg_refcount.lock); + return ret; +}

最後 copyout 時也要判斷有無 cow page 的情況

// Copy from kernel to user. // Copy len bytes from src to virtual address dstva in a given page table. // Return 0 on success, -1 on error. @@ -358,6 +412,11 @@ copyout(pagetable_t pagetable, uint64 dstva, char *src, uint64 len) pa0 = walkaddr(pagetable, va0); if(pa0 == 0) return -1; + pte_t *pte = walk(pagetable, va0, 0); + if ((*pte & PTE_COW) && get_new_page(pagetable, va0) != 0) + return -1; + // pa is updated. + pa0 = PTE2PA(*pte); n = PGSIZE - (dstva - va0); if(n > len) n = len;