Linux Lab 2 Report

# Linux Lab 2 Report

###### tags: `linux`, `linux-2022-autumn`

</br>

## Code

### `get_layout.c`

:::spoiler source code
```c=
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include "get_layout.h"
#include "my_syscall.h"

/*
 * One sample address per segment of the running process.
 * The unsigned long members are handed to my_sys_my_virt_phy() as a plain
 * array starting at `code`; `_end_member` only marks where that array ends.
 */
struct layout_var {
    unsigned long code;
    unsigned long data;
    unsigned long bss;
    unsigned long lib;
    unsigned long stack;
    char _end_member;
};

/* Uninitialised global; its address is recorded as the `bss` sample. */
char bss_var[10];
/* Initialised global; its address is recorded as the `data` sample. */
char data_var[] = "uuwuu";

/* Function of this program; its address is recorded as the `code` sample. */
void foo()
{
    puts("Foo!\n");
}

/* Print every address stored in @var as a zero-padded 14-digit hex value. */
void print_layout_var(struct layout_var *var)
{
    printf(
        "code: 0x%014lx\n"
        "bss: 0x%014lx\n"
        "data: 0x%014lx\n"
        "stack: 0x%014lx\n"
        "lib: 0x%014lx\n",
        var->code,
        var->bss,
        var->data,
        var->stack,
        var->lib
    );
}

/*
 * Collect one address per segment of the calling process.
 *
 * Returns a heap-allocated struct that the caller must free(), or NULL when
 * the allocation fails.
 */
struct layout_var *get_layout_var()
{
    char stack_var = 1;
    /* calloc so that no member (including _end_member) is left indeterminate. */
    struct layout_var *result = calloc(1, sizeof *result);

    if (!result)
        return NULL;

    result->code = (unsigned long) foo;
    result->bss = (unsigned long) bss_var;
    result->data = (unsigned long) data_var;
    result->stack = (unsigned long) &stack_var;
    result->lib = (unsigned long) puts;
    return result;
}

/*
 * Translate every address in @var through the my_virt_phy syscall.
 *
 * Returns a heap-allocated struct (caller frees) holding the translated
 * addresses, or NULL if @var is NULL, the allocation fails, or the syscall
 * wrapper returns non-zero.
 */
struct layout_var *layout_var_to_phy(struct layout_var *var)
{
    /* Number of unsigned long slots that precede _end_member. */
    const size_t len = offsetof(struct layout_var, _end_member) / sizeof(unsigned long);
    struct layout_var *result;

    if (!var)
        return NULL;

    result = calloc(1, sizeof *result);
    if (!result)
        return NULL;

    if (my_sys_my_virt_phy(&var->code, len, &result->code)) {
        free(result);
        return NULL;
    }
    return result;
}
```
:::

### `my_syscall.c` & `hello_world.c`

:::spoiler source code
```c=
/*my_syscall.c*/
#include <stdlib.h>
#include <unistd.h>
#include "my_syscall.h"

/* Invoke the hello_world syscall with @val; returns whatever syscall() returns. */
long my_sys_hello_world(int val)
{
    return syscall(NR_SYS_HELLO_WORLD, val);
}

/*
 * Invoke the my_virt_phy syscall: @user_vaddrs holds @len_vaddr virtual
 * addresses, @result receives the same number of translated addresses.
 * Returns whatever syscall() returns.
 */
long my_sys_my_virt_phy(unsigned long *user_vaddrs, size_t len_vaddr, unsigned long *result)
{
    return syscall(NR_SYS_MY_VIRT_PHY, user_vaddrs, len_vaddr, result);
}
```

```c=
// hello/hello_world.c
#include <linux/kernel.h>
#include <asm/syscalls.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/string.h>

/*
 * Translate user virtual address @vaddr of address space @mm into a physical
 * address by walking pgd -> p4d -> pud -> pmd -> pte.
 *
 * Returns the physical address, or (unsigned long)-1 when any level of the
 * walk has no present mapping for @vaddr.
 *
 * NOTE(review): the walk takes no lock itself; the caller presumably has to
 * hold the mmap lock of @mm while it runs - confirm for the target kernel.
 * NOTE(review): huge-page (leaf pud/pmd) mappings are not handled here.
 */
inline unsigned long virt2phy_inline(struct mm_struct *mm, unsigned long vaddr)
{
    pgd_t *pgd;
    p4d_t *p4d;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;
    struct page *page;
    unsigned long pfn;

    pgd = pgd_offset(mm, vaddr);
    if (pgd_none(*pgd)) {
        printk("not mapped in pgd\n");
        return -1;
    }

    p4d = p4d_offset(pgd, vaddr);
    if (p4d_none(*p4d)) {
        printk("not mapped in p4d\n");
        return -1;
    }

    pud = pud_offset(p4d, vaddr);
    if (pud_none(*pud)) {
        printk("not mapped in pud\n");
        return -1;
    }

    pmd = pmd_offset(pud, vaddr);
    if (pmd_none(*pmd)) {
        printk("not mapped in pmd\n");
        return -1;
    }

    pte = pte_offset_map(pmd, vaddr);
    /*
     * An empty or non-present PTE has no page frame behind it; bail out
     * instead of feeding it to pte_page().
     */
    if (pte_none(*pte) || !pte_present(*pte)) {
        printk("not mapped in pte\n");
        pte_unmap(pte);
        return -1;
    }

    page = pte_page(*pte);
    pte_unmap(pte);

    pfn = page_to_pfn(page);
    printk("pfn: %lx\n", pfn);

    /*
     * Physical address = frame number shifted up by PAGE_SHIFT, plus the
     * offset inside the page. (Masking the pfn with PAGE_MASK, as this
     * function used to do, throws away its low 12 bits instead.)
     * NOTE(review): the physical-address tables in this report look like
     * output of the old expression - re-run the experiments to confirm.
     */
    return (pfn << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}

/* Log a greeting together with @a; always returns 0. */
SYSCALL_DEFINE1(hello_world, int, a)
{
    printk("Hello World! %d\n", a);
    return 0;
}

/*
 * Translate @len_vaddr user virtual addresses of the calling process.
 *
 * @user_vaddrs: user array of virtual addresses to translate.
 * @len_vaddr:   number of entries in both arrays.
 * @result:      user array receiving one translated address per entry
 *               ((unsigned long)-1 for addresses that are not mapped).
 *
 * Returns 0 on success, -ENOMEM if the temporary buffers cannot be
 * allocated, -EFAULT if a user array cannot be read or written.
 */
SYSCALL_DEFINE3(my_virt_phy, unsigned long __user *, user_vaddrs,
                size_t, len_vaddr, unsigned long __user *, result)
{
    unsigned long *vaddr;
    unsigned long *paddr;
    size_t bytes;
    size_t i;
    long ret = 0;

    /* Nothing to translate; also keeps vaddr[0] below from being read. */
    if (len_vaddr == 0)
        return 0;

    /* kmalloc_array() rejects len_vaddr * sizeof() overflow. */
    vaddr = kmalloc_array(len_vaddr, sizeof(*vaddr), GFP_KERNEL);
    paddr = kmalloc_array(len_vaddr, sizeof(*paddr), GFP_KERNEL);
    if (!vaddr || !paddr) {
        ret = -ENOMEM;
        goto out;
    }
    bytes = len_vaddr * sizeof(*vaddr);

    /* copy_from_user() returns the number of bytes it could NOT copy. */
    if (copy_from_user(vaddr, user_vaddrs, bytes)) {
        ret = -EFAULT;
        goto out;
    }
    printk("test vaddr[0]: %lx", vaddr[0]);

    for (i = 0; i < len_vaddr; i++)
        paddr[i] = virt2phy_inline(current->mm, vaddr[i]);

    if (copy_to_user(result, paddr, bytes))
        ret = -EFAULT;

out:
    /* kfree(NULL) is a no-op, so both buffers can be released unconditionally. */
    kfree(paddr);
    kfree(vaddr);
    return ret;
}
```
:::

## 實驗紀錄

這次的實驗除了專題要求的 — 確認同時執行相同的程式（兩個以上相同的程式同時存在）是否會共享記憶體，也做了另外一個版本：確認連續執行相同的程式（不同時存在）是否會共享記憶體。

### 實驗 0x1：同時執行相同的程式（同時存在）

#### Virtual Address

| Process # | code | bss | data | stack | lib |
| --------- | ---------------- | ---------------- | ---------------- | ---------------- | ---------------- |
| 1 | 0x0055ccd3937301 | 0x0055ccd393a020 | 0x0055ccd393a010 | 0x007ffec9894027 | 0x007ff4164ac420 |
| 2 | 0x00559167852301 | 0x00559167855020 | 0x00559167855010 | 0x007ffe6bb42d77 | 0x007fcd43405420 |

> 從 Virtual Address 看不出來是否有共享記憶體

#### Physical Address

| Process # | code | bss | data | stack | lib |
| --------- | -------------------- | ---------------- | ---------------- | ---------------- | -------------------- |
| 1 | **0x000000005f0301** | 0x000000005cf020 | 0x000000005cf010 | 0x0000000043a027 | **0x00000000000420** |
| 2 | **0x000000005f0301** | 0x000000005db020 | 0x000000005db010 | 0x000000005fcd77 | **0x00000000000420** |

> 從 Physical Address 可以看到 `code` 和 `lib` 這兩段記憶體是相同的，由此判定 Linux 在處理相同程式時會讓 **`code` 段共用**。`lib` 段則是所有 C 程式共用，通常都會相同。

### 實驗 0x2：連續執行相同的程式（不同時存在）

#### Virtual Address

| Process # | code | bss | data | stack | lib |
| --------- | ---------------- | ---------------- | ---------------- | ---------------- | ---------------- |
| 1 | 0x0055c70d4162d2 | 0x0055c70d419020 | 0x0055c70d419010 | 0x007ffd96b0bf57 | 0x007fa29aaa0420 |
| 2 | 0x00562b48e522d2 | 0x00562b48e55020 |
0x00562b48e55010 | 0x007ffc1f9adf87 | 0x007ff1fcc98420 |
| 3 | 0x0055cf341352d2 | 0x0055cf34138020 | 0x0055cf34138010 | 0x007ffdb2989d57 | 0x007f1660d13420 |
| 4 | 0x00555ad1d042d2 | 0x00555ad1d07020 | 0x00555ad1d07010 | 0x007ffe597447f7 | 0x007f63eef85420 |
| 5 | 0x005621236482d2 | 0x0056212364b020 | 0x0056212364b010 | 0x007ffdf5c46bd7 | 0x007feca6392420 |
| 6 | 0x0055cbb5d432d2 | 0x0055cbb5d46020 | 0x0055cbb5d46010 | 0x007fffddfdf077 | 0x007f78dcdfb420 |
| 7 | 0x00557285d6d2d2 | 0x00557285d70020 | 0x00557285d70010 | 0x007ffe2e270ca7 | 0x007fb4a9579420 |
| 8 | 0x0055c39bf0e2d2 | 0x0055c39bf11020 | 0x0055c39bf11010 | 0x007ffe3bec76d7 | 0x007fd816820420 |
| 9 | 0x00564e07eec2d2 | 0x00564e07eef020 | 0x00564e07eef010 | 0x007ffdd3e5be37 | 0x007f55401d5420 |
| 10 | 0x00557a5814c2d2 | 0x00557a5814f020 | 0x00557a5814f010 | 0x007fffde9b2f17 | 0x007fc6fa53f420 |

> 一樣，從 Virtual Address 看不出來是否會共用。

#### Physical Address

| Process # | code | bss | data | stack | lib |
| --------- | -------------------- | ---------------- | ---------------- | ---------------- | -------------------- |
| 1 | **0x0000000051f2d2** | 0x00000000376020 | 0x00000000376010 | 0x000000004d7f57 | **0x00000000000420** |
| 2 | **0x0000000051f2d2** | 0x00000000397020 | 0x00000000397010 | 0x00000000444f87 | **0x00000000000420** |
| 3 | **0x0000000051f2d2** | 0x00000000398020 | 0x00000000398010 | 0x000000003bcd57 | **0x00000000000420** |
| 4 | **0x0000000051f2d2** | 0x00000000400020 | 0x00000000400010 | 0x0000000049a7f7 | **0x00000000000420** |
| 5 | **0x0000000051f2d2** | 0x0000000045c020 | 0x0000000045c010 | 0x000000002e3bd7 | **0x00000000000420** |
| 6 | **0x0000000051f2d2** | 0x000000004dd020 | 0x000000004dd010 | 0x0000000046a077 | **0x00000000000420** |
| 7 | **0x0000000051f2d2** | 0x00000000353020 | 0x00000000353010 | 0x000000003b0ca7 | **0x00000000000420** |
| 8 | **0x0000000051f2d2** | 0x00000000399020 | 0x00000000399010 | 0x000000004006d7 | **0x00000000000420** |
| 9 | **0x0000000051f2d2** |
0x00000000272020 | 0x00000000272010 | 0x000000004a1e37 | **0x00000000000420** |
| 10 | **0x0000000051f2d2** | 0x00000000498020 | 0x00000000498010 | 0x00000000528f17 | **0x00000000000420** |

> 從 Physical Address 可以看到 **`code` 段位址一樣**，但是此時程式並沒有同時執行。由此判斷 Linux 不只在同時執行相同程式時會共用 `code` 段，也會對連續執行相同的程式進行優化，不會立刻 `unmap` 實體記憶體的資料，藉此減少 IO，增強系統效能。

![](https://i.imgur.com/5zAaJnd.png =x100)

### Page-fault

When a page fault occurs in the Linux kernel, the kernel handles it by calling the page fault handler, which is a function that is responsible for resolving the page fault and allowing the process to continue execution. On x86, the page fault handler is implemented in the file `arch/x86/mm/fault.c` in the Linux kernel source code.

When a page fault occurs, the kernel first checks whether the faulting address is valid, i.e. whether it lies in a region of the address space that the process is allowed to access. If the address is not valid, the kernel sends a signal (normally `SIGSEGV`) to the process to terminate it, because the process is trying to access memory that it is not allowed to access.

If the page is valid but not currently in memory, the kernel must bring the page into memory from storage (e.g. from the disk). This is known as "demand paging". The kernel does this by allocating a new page frame in memory, reading the contents of the page from storage into the page frame, and updating the page table to map the virtual page to the physical page frame in memory.

Here is an excerpt from the page fault handler in `arch/x86/mm/fault.c` that shows how the kernel handles the case where a page is not currently in memory:

```c=
/*
 * The page fault handler. This is called by the assembly-language
 * __do_page_fault code in traps.c.
 */
static int __do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
    struct vm_area_struct *vma;
    struct task_struct *tsk;
    struct mm_struct *mm;
    unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | (error_code & PF_USER ?
FAULT_FLAG_USER : 0);
    int fault;

    tsk = current;
    mm = tsk->mm;

    /*
     * If we're in an interrupt or have no user context, we must not take the fault..
     */
    if (in_atomic() || !mm)
        goto no_context;

    /*
     * As per x86 specs, we must clear the RF flags before returning from
     * an exception caused by a page fault.
     */
    regs->flags &= ~X86_EFLAGS_RF;

    /*
     * If we're in kernel-mode and the faulting address is not in the low 1GB,
     * we can take the fault.
     */
    if (!(error_code & PF_USER) && (address >= TASK_SIZE_MAX))
        goto no_context;

    /*
     * When running in the kernel, we expect faults to occur only to
     * addresses in user space. All other faults represent errors in the
     * kernel and should generate an OOPS. Unfortunately, in the case of an
     * erroneous fault occurring in a code path which already holds mmap_sem
     * we will deadlock attempting to validate the fault against the
     * address space. Luckily the kernel only validly references user
     * space from well defined areas of code, which are listed in the
     * exceptions table.
     *
     * As the vast majority of faults will be valid we will only perform
     * the source reference check when there is a possibility of a deadlock.
     * Attempting to walk the full exception table at
```

---

![](https://i.imgur.com/UiWKXuh.png)

在 X86 處理 `page_fault` 的是 `void do_page_fault(struct pt_regs *regs, unsigned long error_code)`

```c
/* arch/x86/mm/fault.c */
static __always_inline void handle_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
    trace_page_fault_entries(regs, error_code, address);

    if (unlikely(kmmio_fault(regs, address)))
        return;

    /* Was the fault on kernel-controlled part of the address space? */
    if (unlikely(fault_in_kernel_space(address))) {
        do_kern_addr_fault(regs, error_code, address);
    } else {
        do_user_addr_fault(regs, error_code, address);
        /*
         * User address page fault handling might have reenabled
         * interrupts. Fixing up all potential exit points of
         * do_user_addr_fault() and its leaf functions is just not
         * doable w/o creating an unholy mess or turning the code
         * upside down.
         */
        local_irq_disable();
    }
}
```

從這份 code 中我們可以看到他先判斷此 fault 發生在 kernel space 還是 user space，再分別交給 `do_kern_addr_fault()` 或 `do_user_addr_fault()` 處理。其中 `address` 是發生 fault 的虛擬位址，在 x86 上由 CR2 暫存器取得。

### Reference

- [good reference](https://www.kernel.org/doc/html/v5.8/x86/exception-tables.html)