MIT6.s081 Lab: Copy-on-Write Fork for xv6

# MIT6.s081 Lab: Copy-on-Write Fork for xv6 ###### tags: `mit6.s081` `作業系統 Operating System` 藉由 Page fault 可以實現許多特殊的操作及應用，其中一個就是 **copy-on-write fork (COW fork)** 當我們系統呼叫 fork 時，xv6 會透過 uvmcopy 將父進程完整的 memory layout 複製給子進程，並為子進程配置物理空間 ```cpp kernel/proc.c int fork(void) { int i, pid; struct proc *np; struct proc *p = myproc(); // Allocate process. if((np = allocproc()) == 0){ return -1; } // Copy user memory from parent to child. if(uvmcopy(p->pagetable, np->pagetable, p->sz) < 0){ freeproc(np); release(&np->lock); return -1; } np->sz = p->sz; np->parent = p; // copy saved user registers. *(np->trapframe) = *(p->trapframe); // Cause fork to return 0 in the child. np->trapframe->a0 = 0; // increment reference counts on open file descriptors. for(i = 0; i < NOFILE; i++) if(p->ofile[i]) np->ofile[i] = filedup(p->ofile[i]); np->cwd = idup(p->cwd); safestrcpy(np->name, p->name, sizeof(p->name)); pid = np->pid; np->state = RUNNABLE; release(&np->lock); return pid; } ``` 但思考一下這樣的作法真有其必要性嘛？例如當 shell 在處理命令時，會先 fork 一個子進程，而子進程會馬上呼叫 exec 執行其他程式。exec 在確認完 ELF header 之後，會呼叫 **proc_pagetable** 建立一個新的 page table，為每個 ELF 的程式段分配物理頁，並將對應的內容載入記憶體中，然後 exec 替子進程分配 stack 空間。最後當一切都配置完成後，exec 將呼叫 **proc_freepagetable** 將舊的 page table 及其映射的物理空間釋放掉 ```cpp kernel/exec.c int exec(char *path, char **argv) { char *s, *last; int i, off; uint64 argc, sz = 0, sp, ustack[MAXARG+1], stackbase; struct elfhdr elf; struct inode *ip; struct proghdr ph; pagetable_t pagetable = 0, oldpagetable; struct proc *p = myproc(); begin_op(); if((ip = namei(path)) == 0){ end_op(); return -1; } ilock(ip); // Check ELF header if(readi(ip, 0, (uint64)&elf, 0, sizeof(elf)) != sizeof(elf)) goto bad; if(elf.magic != ELF_MAGIC) goto bad; if((pagetable = proc_pagetable(p)) == 0) goto bad; // Load program into memory. for(i=0, off=elf.phoff; i<elf.phnum; i++, off+=sizeof(ph)){ if(readi(ip, 0, (uint64)&ph, off, sizeof(ph)) != sizeof(ph)) goto bad; if(ph.type != ELF_PROG_LOAD) continue; if(ph.memsz < ph.filesz) goto bad; if(ph.vaddr + ph.memsz < ph.vaddr) goto bad; uint64 sz1; if((sz1 = uvmalloc(pagetable, sz, ph.vaddr + ph.memsz)) == 0) goto bad; sz = sz1; if(ph.vaddr % PGSIZE != 0) goto bad; // load elf seg into memory through walkaddr and readi if(loadseg(pagetable, ph.vaddr, ip, ph.off, ph.filesz) < 0) goto bad; } iunlockput(ip); end_op(); ip = 0; p = myproc(); uint64 oldsz = p->sz; // Allocate two pages at the next page boundary. // Use the second as the user stack. sz = PGROUNDUP(sz); uint64 sz1; if((sz1 = uvmalloc(pagetable, sz, sz + 2*PGSIZE)) == 0) goto bad; sz = sz1; uvmclear(pagetable, sz-2*PGSIZE); sp = sz; stackbase = sp - PGSIZE; // Push argument strings, prepare rest of stack in ustack. for(argc = 0; argv[argc]; argc++) { if(argc >= MAXARG) goto bad; sp -= strlen(argv[argc]) + 1; sp -= sp % 16; // riscv sp must be 16-byte aligned if(sp < stackbase) goto bad; // guard page will help to detect the case with too many args if(copyout(pagetable, sp, argv[argc], strlen(argv[argc]) + 1) < 0) goto bad; ustack[argc] = sp; } ustack[argc] = 0; // push the array of argv[] pointers. sp -= (argc+1) * sizeof(uint64); sp -= sp % 16; if(sp < stackbase) goto bad; if(copyout(pagetable, sp, (char *)ustack, (argc+1)*sizeof(uint64)) < 0) goto bad; // arguments to user main(argc, argv) // argc is returned via the system call return // value, which goes in a0. p->trapframe->a1 = sp; // Save program name for debugging. for(last=s=path; *s; s++) if(*s == '/') last = s+1; safestrcpy(p->name, last, sizeof(p->name)); // Commit to the user image. oldpagetable = p->pagetable; p->pagetable = pagetable; p->sz = sz; p->trapframe->epc = elf.entry; // initial program counter = main p->trapframe->sp = sp; // initial stack pointer proc_freepagetable(oldpagetable, oldsz); return argc; // this ends up in a0, the first argument to main(argc, argv) bad: if(pagetable) proc_freepagetable(pagetable, sz); if(ip){ iunlockput(ip); end_op(); } return -1; } ``` 上面的流程很明顯是沒有效率的，fork 為子進程創建的物理空間，將立刻被 exec 給捨棄掉，複製的動作顯得非常多餘。換個方式來想，我們在創建子進程時，與其複製整個父進程的記憶體空間，乾脆直接與父進程共享不就得了？共享確實是個非常直接有效的方法，但假如父進程或子進程想要修改記憶體的內容時，這些修改、更新對於另一個進程來說應該是不可見的，我們仍然希望父子進程之間保有強隔離性這個時候就需要 page fault 的幫忙了，首先我們將父、子進程的 PTE 都設為 read-only，在某個時刻當我們要修改記憶體內容時，因為我們嘗試修改某個 read-only 的 PTE，這時會觸發 page fault ![](https://i.imgur.com/W3udz7L.png) 假設現在是子進程在執行 store 指令，在觸發 page fault 之後第一件事是分配一個新的物理頁，然後將觸發 page fault 的那一個頁的內容複製到新分配的物理頁中，然後將新分配的物理頁映射到子進程的 page table。此時，新/舊頁面只對子/父進程可見，因此我們可以將兩個頁的 PTE 設置為 read/write。當我們跳回 user space 重新執行觸發 page fault 的 store 指令時，就可以順利地修改子進程的記憶體內容，且對父進程沒有任何影響 ![](https://i.imgur.com/4FWMl5L.png) 另外一個問題是，kernel 要如何分辨這是一個 copy-on-write 應用場景，還是單純地向一個只讀的地址執行 store 指令？一個具體的作法是透過 PTE 的標示位。下圖為常見的多級 page table 的 PTE 結構，其中第 8 & 9 bit 是保留位，供 kernel 的開發者自行運用，所以我們透過第 8 位的設置來判斷觸發 page fault 的場合 ![](https://i.imgur.com/Ty3Plur.png) 最後還有一個細節需要處理，在原始的 xv6 中，除了 trapoline page 外每個物理頁都有單獨對應的進程，但在使用 copy-on-write 之後，同一個物理頁被多個進程映射。舉例來說，當子進程要退出時，父進程可能還在運行中，我們要小心判斷進程對應的物理頁能否被釋放。為此我們要對每一個物理頁的引用設置計數器，當釋放虛擬頁面時，將物理頁的計數器減 1，當引用數為 0 時，我們就可以釋放對應的物理頁 ## Implement copy-on write(hard) 首先我們在 riscv.h 添加表示 COW page 的標誌位 ```cpp kernel/riscv.h #define PTE_C (1L << 8) // COW bit ``` ### **uvmcopy** (只有 fork 會呼叫) 原先 fork() 呼叫 uvmcopy 替子進程配置新的物理頁，並複製父進程的內容。但現在我們只需要： - 將複製頁標記為 COW - 取消複製頁可寫標誌 PTE_W - 子進程的虛擬記憶體映射到該複製頁這樣在之後嘗試修改記憶體內容時， page fault 可以被觸發，且進入 trap 流程時，我們可以透過 COW 標誌位決定處理方式 ```cpp kernel/vm.c int uvmcopy(pagetable_t old, pagetable_t new, uint64 sz) { pte_t *pte; uint64 pa, i; uint flags; // char *mem; for(i = 0; i < sz; i += PGSIZE){ if((pte = walk(old, i, 0)) == 0) panic("uvmcopy: pte should exist"); if((*pte & PTE_V) == 0) panic("uvmcopy: page not present"); pa = PTE2PA(*pte); flags = PTE_FLAGS(*pte); // ** Disable PTE_W and set COW flag for both parent and child process // ** to raise page fault when each of them executes store instruction // ** Set flags only when the page is writable because it is must not a COW page if (flags & PTE_W) { flags = (flags | PTE_C) & (~PTE_W); *pte = (*pte & ~PTE_W) | PTE_C; // ** *pte = *pte | flags --> PTE_W will not be cleared!! } // **Do not need to alloc physical mem when fork()** // if((mem = kalloc()) == 0) // goto err; // memmove(mem, (char*)pa, PGSIZE); if(mappages(new, i, PGSIZE, pa, flags) != 0){ // kfree((void *)pa); goto err; } // ** add refcnt krefincr((void *)pa); } return 0; err: uvmunmap(new, 0, i / PGSIZE, 1); return -1; } ``` ### **usertrap** **usertrap** 要新增 page fault 的捕捉判斷，另外由於是 COW，我們只關心因為 store instruction 而引發的 page fault (sscause = 15)。這邊要注意一點，使用者可能會惡意的傳進超出虛擬地址空間或是落於 guard page 的非法地址，要先進行地址的合法性判斷。若為何法地址，則呼叫包裝好的執行函式 *owalloc 來進行 COW 的相關處理 ```cpp kernel/trap.c void usertrap(void) { ... } else if(r_scause() == 15){ // ** Get virtual address causing page fault and corresponding pa & pte uint64 addr = r_stval(); // ** If virtual address is over maximum va size or within guard page, kill the process if (addr >= MAXVA || (addr < p->trapframe->sp && addr >= (p->trapframe->sp - PGSIZE))) p->killed = 1; if (cowalloc(p->pagetable, PGROUNDDOWN(addr)) < 0) p->killed = 1; } ... ... } ``` **cowalloc** 首先判斷虛擬地址的合法性，接著判斷對應的 pte 是否標記為 COW page，如果標誌位存在則替子進程配置新的物理頁，複製父進程的內容，將原先的舊映射刪除，最後重新映射到新的物理頁，因為該頁面不再是 COW 頁，記得取消 PTE_C 的設置，並將物理頁設為可寫 ```cpp kernel/vm.c int cowalloc(pagetable_t pagetable, uint64 va) { // ** va must be PGSIZE aligned if ((va % PGSIZE) != 0) return -1; // ** safety check if (va >= MAXVA) return -1; pte_t *pte = walk(pagetable, va, 0); if (pte == 0) return -1; uint64 pa = PTE2PA(*pte); if (pa == 0) return -1; // ** If page fault is raised with a COW page, // ** alloc a physical page, mapped to user pagetable and set PTE_W if (*pte & PTE_C) { uint flags = PTE_FLAGS(*pte); flags = (flags & ~PTE_C) | PTE_W; // ** allocate new page char *ka = kalloc(); if (ka == 0) return -1; // ** don't forget to copy old content memmove(ka, (char*)pa, PGSIZE); // ** free old memory mapping uvmunmap(pagetable, PGROUNDUP(va), 1, 1); // ** clear COW bit and set the page writable mappages(pagetable, va, PGSIZE, (uint64)ka, flags); } return 0; } ``` ## copyout 實驗頁面特別提醒我們 copyout 也要有相同的 COW 機制，原因是 copyout 會嘗試修改覆蓋記憶體內容，假設該頁面為 COW 頁的話會觸發 page fault，但此時我們正在 kernel space 中，trap 流程無法進入 usertrap，因此這裡我們同樣呼叫 cowalloc 對虛擬頁進行處理。注意一定要先呼叫 cowalloc 再呼叫 walkaddr，因為 cowalloc 可能會修改虛擬地址映射的物理頁 ```cpp kernel/vm.c int copyout(pagetable_t pagetable, uint64 dstva, char *src, uint64 len) { uint64 n, va0, pa0; while(len > 0){ va0 = PGROUNDDOWN(dstva); if (cowalloc(pagetable, va0) < 0) return -1; pa0 = walkaddr(pagetable, va0); if(pa0 == 0) return -1; n = PGSIZE - (dstva - va0); if(n > len) n = len; memmove((void *)(pa0 + (dstva - va0)), src, n); len -= n; src += n; dstva = va0 + PGSIZE; } return 0; } ``` ## kalloc.c 如前面所述，COW 需要有計數器來紀錄物理頁是否真的要被釋放掉，這部份的修改都在 **kalloc.c** 首先我們定義新的結構體 **refcnt** 來存放物理頁引用的計數。另外因為 xv6 可多核多進程平行處理，我們還需要一個 lock 來保護對引用數的讀取及修改。KERNBASE 以下的地址為 kernel 所用，user 進程不會修改到這個部份，所以陣列的大小可以從 KERNBASE 開始計算 ```cpp kernel/kalloc.c struct { struct spinlock lock; int count[(PGROUNDUP(PHYSTOP) - KERNBASE)/PGSIZE]; } refcnt; ``` **kinit** 增加計數器以及計數器鎖的初始化。計數器初始化為 1 的原因是，freerange 會對每個 free block (也就是從 KERNBASE ~ PHYSTOP) 呼叫 kfree，kfree 將引用數減 1 變為 0，kfree 就會判斷需要釋放物理頁 ```cpp kernel/kalloc.c void kinit() { initlock(&kmem.lock, "kmem"); initlock(&refcnt.lock, "refcnt"); // ** Must reset count array before freerange for (int i = 0; i < (PGROUNDUP(PHYSTOP)-KERNBASE) / PGSIZE; i++) refcnt.count[i] = 1; freerange(end, (void*)PHYSTOP); } ``` **kfree** 在執行釋放的操作前，引用數不應該小於或等於 0。在修正引用數後若還有其他進程引用該頁面，則什麼也不做直接回傳。若此時引用數為 0 則實際釋放物理頁 ```cpp kernel/kalloc.c void kfree(void *pa) { struct run *r; if(((uint64)pa % PGSIZE) != 0 || (char*)pa < end || (uint64)pa >= PHYSTOP) panic("kfree"); // ** safety check if (krefget(pa) <= 0) panic("kfree_decr"); // ** minus refcnt when kfree is called // ** Free memory only when refcnt <= 0 krefdecr(pa); if (krefget(pa) > 0) return; memset(pa, 1, PGSIZE); r = (struct run*)pa; acquire(&kmem.lock); r->next = kmem.freelist; kmem.freelist = r; release(&kmem.lock); } ``` 分配物理頁時，要將引用數設為 1 ```cpp kernel/kalloc.c void * kalloc(void) { struct run *r; acquire(&kmem.lock); r = kmem.freelist; if(r) kmem.freelist = r->next; release(&kmem.lock); if(r){ memset((char*)r, 5, PGSIZE); // fill with junk // ** Set refcnt as 1 when allocate new page acquire(&refcnt.lock); refcnt.count[PA2IDX(r)] = 1; release(&refcnt.lock); } return (void*)r; } ``` 最後為計數器相關操作的包裝函數 ```cpp kernel/kalloc.c void krefincr(void *pa) { acquire(&refcnt.lock); refcnt.count[PA2IDX(pa)]++; release(&refcnt.lock); } void krefdecr(void *pa) { acquire(&refcnt.lock); refcnt.count[PA2IDX(pa)]--; release(&refcnt.lock); } int krefget(void *pa) { int cnt; acquire(&refcnt.lock); cnt = refcnt.count[PA2IDX(pa)]; release(&refcnt.lock); return cnt; } ``` :::success $ cowtest simple: ok simple: ok three: ok three: ok three: ok file: ok ALL COW TESTS PASSED $ usertests usertests starting test execout: OK test copyin: OK test copyout: OK test copyinstr1: OK test copyinstr2: OK test copyinstr3: OK ... ... test dirfile: OK test iref: OK test forktest: OK test bigdir: OK ALL TESTS PASSED ::: ## Reference 1. [xv6: a simple, Unix-like teaching operating system](https://pdos.csail.mit.edu/6.828/2020/xv6/book-riscv-rev1.pdf) 2. [Lab: xv6 lazy page allocation](https://pdos.csail.mit.edu/6.S081/2020/labs/cow.html) 3. [Chapter 3: Page Tables](https://zhuanlan.zhihu.com/p/351646541) 4. [MIT6.S081 課程翻譯](https://mit-public-courses-cn-translatio.gitbook.io/mit6-s081/)