# Linux Project_1
:::info
- Score: The total score of this project is 170 points.
- Description:
    * Part 1 (100 points):
        * The kernel address space of all processes maps into the same physical address space.
        * How does the kernel maintain the above property?
        * You need to design and write code to prove your claim.
        * Hint:
            * The process list is a circular doubly linked list that links the process descriptors of all existing thread group leaders:
            * Each task_struct structure includes a tasks field of type list_head whose prev and next fields point, respectively, to the previous and to the next task_struct element's tasks field.
            * The head of the process list is the init_task task_struct descriptor; it is the process descriptor of the so-called process 0 or swapper.
    * Part 2 (70 points):
        * After a person uses Fix-Mapped Linear Addresses to map a 4K kernel address space to a 4k page frame, will existing processes get this new mapping?
        * If your answer is yes, how does the kernel complete this mapping?
        * You need to design and write code to prove your claim.
:::
## Environment
- Virtual machine : VirtualBox 6.1.16 r140961
- Distribution(Guest OS): Ubuntu16.04 i686
- kernel version : 3.9.1 **(關閉 PAE)**
- **gcc version** : 4.9.3 (Ubuntu16.04預設是5.x版 不降版會編譯不過3.x版linux kernel)
## Part1
- [task_struct](https://elixir.bootlin.com/linux/v3.9.1/source/include/linux/sched.h#L1201) ![](https://i.imgur.com/2xEOA98.png)
  透過pid取得或是current pointer取得現在執行的process task_struct
~~~c= !
struct task_struct *task = find_task_by_vpid(pid) ;
struct task_struct *task = current ;
~~~
- [task_struct](https://elixir.bootlin.com/linux/v3.9.1/source/include/linux/sched.h#L1201) 裡面有個 mm_struct ![](https://i.imgur.com/KE2Ii8n.png)
- trace [mm_struct](https://elixir.bootlin.com/linux/v3.9.1/source/include/linux/mm_types.h#L325) 然後在下面幾行 (line365) 可以找到上圖的 start_code , end_code , start_data , end_data…etc. 
![](https://i.imgur.com/7Sdxt6J.png)
- mm_struct中可以找到vm_area_struct ![](https://i.imgur.com/vQQ5UAu.png)
- [vm_area_struct](https://elixir.bootlin.com/linux/v3.9.1/source/include/linux/mm_types.h#L228) 中有vm_start,vm_end,*vm_next,可用vm_next循序跑完 ![](https://i.imgur.com/7pFC7J1.png)
~~~c= !
/* Walk the VMA list; note there must be no ';' between while(...) and '{'. */
while (mmap != NULL) {
    vm_start = mmap->vm_start;
    vm_end = mmap->vm_end;
    mmap = mmap->vm_next;
    printk("Virtual Address: %lx %lx\n", vm_start, vm_end);
}
~~~
### Part1_kernel_code
~~~c
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/pid.h>
#include <linux/highmem.h>
#include <linux/ratelimit.h>
#include <linux/rcupdate.h>
#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/pgtable.h>

/* Returned by virual2physical() when the address has no present mapping. */
#define V2P_UNMAPPED        (~0UL)

/* Kernel window that is sampled: one record every KERNEL_SCAN_STEP bytes. */
#define KERNEL_SCAN_START   0xC0000000UL
#define KERNEL_SCAN_END     0xD0000000UL
#define KERNEL_SCAN_STEP    0x10000UL

/*
 * Write cursor into result[] and number of records emitted so far.
 * These are plain globals, so two concurrent callers of ps_v2p() would
 * interleave their output.
 */
int result_count;
int vma_total;
unsigned long total_virtual_memory;

/*
 * Store one int at result[idx]. result[] is the pointer handed in by the
 * system call, i.e. user memory, so it is written with put_user(): a bad
 * pointer then produces a warning instead of a kernel oops.
 */
static void store_result(int result[], int idx, int val)
{
    if (put_user(val, &result[idx]))
        printk_ratelimited(KERN_WARNING
                           "ps_v2p: cannot write result[%d]\n", idx);
}

/*
 * Append one value to result[] and advance the write cursor.
 *
 * Layout of result[]:
 *   [0] number of records (vma_total)
 *   [1] pid  [2] map_count  [3] total_vm (KiB)  [4] rss (KiB)
 *   then, per record: vm_start, vm_end, phy_start, phy_end
 */
void write2result(int result[], unsigned long addr)
{
    store_result(result, result_count, (int)addr);
    result_count++;
}

/*
 * Translate virtual address @va through the page tables of @task->mm.
 *
 * Returns the physical address, or V2P_UNMAPPED when some level of the
 * walk has no present entry. Every level is logged with printk().
 */
static unsigned long virual2physical(struct task_struct *task, unsigned long va)
{
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;
    unsigned long pa;

    printk("virtual address = 0x%lx\n", va);

    pgd = pgd_offset(task->mm, va);
    printk("pgd = 0x%p\n", pgd);
    printk("pgd_index = %lu\n", pgd_index(va));
    if (pgd_none(*pgd)) {
        printk("not mapped in pgd\n");
        return V2P_UNMAPPED;
    }

    pud = pud_offset(pgd, va);
    printk("pud = 0x%p\n", pud);
    if (pud_none(*pud)) {
        printk("not mapped in pud\n");
        return V2P_UNMAPPED;
    }

    pmd = pmd_offset(pud, va);
    printk("pmd = 0x%p\n", pmd);
    printk("pmd_index = %lu\n", pmd_index(va));
    if (pmd_none(*pmd)) {
        printk("not mapped in pmd\n");
        return V2P_UNMAPPED;
    }

    /*
     * A large (PSE) mapping has no PTE table below it: the entry itself
     * names the frame, and the low bits of va are the offset inside it.
     * Following it as if it pointed to a PTE page would read garbage.
     */
    if (pmd_large(*pmd))
        return ((pmd_pfn(*pmd) << PAGE_SHIFT) & PMD_MASK) | (va & ~PMD_MASK);

    /* pte_offset_map() also works when the PTE page lives in highmem. */
    pte = pte_offset_map(pmd, va);
    printk("pte = 0x%p\n", pte);
    printk("pte_index = %lu\n", pte_index(va));
    if (!pte_present(*pte)) {
        /* Empty or swapped-out entry: its bits are not a frame number. */
        printk("not mapped in pte\n");
        pte_unmap(pte);
        return V2P_UNMAPPED;
    }
    pa = (pte_pfn(*pte) << PAGE_SHIFT) | (va & ~PAGE_MASK);
    pte_unmap(pte);

    return pa;
}

/*
 * Emit one record {va, va + len, pa, pa + len} and count it.
 * When va is not mapped, @errmsg is logged and the physical pair is 0, 0.
 */
static void write_record(struct task_struct *task, int result[],
                         unsigned long va, unsigned long len,
                         const char *errmsg)
{
    unsigned long pa;

    write2result(result, va);
    write2result(result, va + len);

    pa = virual2physical(task, va);
    if (pa != V2P_UNMAPPED) {
        write2result(result, pa);
        write2result(result, pa + len);
    } else {
        printk("%s", errmsg);
        write2result(result, 0);
        write2result(result, 0);
    }
    vma_total++;
}

/*
 * Fill result[] for @task: the three mm statistics, then one record per
 * KERNEL_SCAN_STEP of the kernel window, then one record per page of every
 * user VMA. The caller must have checked that task->mm is not NULL.
 *
 * NOTE(review): the VMA list is walked without holding mmap_sem, as in the
 * original experiment; observe a process that is not changing its mappings.
 */
void print_virual_memory(struct task_struct *task, int result[])
{
    struct mm_struct *mm = task->mm;
    struct vm_area_struct *vma;
    unsigned long va;

    /* map_count [2] */
    write2result(result, mm->map_count);
    /* total virtual memory [3] */
    write2result(result, mm->total_vm << (PAGE_SHIFT - 10));
    /* total physical memory [4] */
    write2result(result, get_mm_rss(mm) << (PAGE_SHIFT - 10));

    /* Half-open range: exactly as many records are written as counted. */
    for (va = KERNEL_SCAN_START; va < KERNEL_SCAN_END; va += KERNEL_SCAN_STEP)
        write_record(task, result, va, KERNEL_SCAN_STEP, "error 1\n");

    for (vma = mm->mmap; vma; vma = vma->vm_next)
        for (va = vma->vm_start; va < vma->vm_end; va += PAGE_SIZE)
            write_record(task, result, va, PAGE_SIZE, "error 2\n");
}

/*
 * System call: describe the address space of process @pid in result[].
 *
 * result[0] receives the number of records; it is 0 when @pid does not
 * exist or is a kernel thread without a user address space.
 */
asmlinkage void ps_v2p(int result[], int pid)
{
    struct task_struct *task;

    result_count = 1;
    vma_total = 0;

    /* find_task_by_vpid() needs RCU; pin the task before leaving it. */
    rcu_read_lock();
    task = find_task_by_vpid(pid);
    if (task)
        get_task_struct(task);
    rcu_read_unlock();

    if (!task || !task->mm) {
        printk("ps_v2p: pid %d not found or has no user address space\n", pid);
        store_result(result, 0, 0);
        if (task)
            put_task_struct(task);
        return;
    }

    /* task pid [1] */
    write2result(result, task->pid);
    print_virual_memory(task, result);
    store_result(result, 0, vma_total);

    put_task_struct(task);
}
~~~
### Part1_user_code
~~~c= !
#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#define MEMORY_SIZE  1000000   /* ints in the buffer handed to the kernel */
#define HEADER_INTS  5         /* [0] count [1] pid [2] map count [3] vm [4] rss */
#define RECORD_INTS  4         /* vm start, vm end, pm start, pm end */
#define SYS_PS_V2P   351       /* syscall number the kernel was patched with */

/*
 * Print the header and every record of result[] to `out`.
 * The record count comes from the kernel, so it is clamped to what the
 * buffer can actually hold before it is used as a loop bound.
 */
static void dump_process_mem_info(FILE *out, const int result[])
{
    const int max_records = (MEMORY_SIZE - HEADER_INTS) / RECORD_INTS;
    int count = result[0];
    int i;

    if (count < 0)
        count = 0;
    if (count > max_records)
        count = max_records;

    fprintf(out, "[+] PID = %d \n", result[1]);
    fprintf(out, "[+] map count = %d \n", result[2]);
    fprintf(out, "[+] total virtual mem = %d \n", result[3]);
    fprintf(out, "[+] total physical mem = %d \n", result[4]);

    for (i = 0; i < count; i++) {
        const int *rec = &result[HEADER_INTS + i * RECORD_INTS];

        /* Addresses travel as int; print them as unsigned hex, not %p. */
        fprintf(out, "[%2d] v_start : 0x%08x \t, v_end : 0x%08x \n",
                i + 1, (unsigned)rec[0], (unsigned)rec[1]);
        fprintf(out, "[%2d] p_start : 0x%08x \t, p_end : 0x%08x \n",
                i + 1, (unsigned)rec[2], (unsigned)rec[3]);
    }
}

/* Write the report for result[] to the file `name`. */
void write2file(int result[], char *name)
{
    FILE *fp = fopen(name, "w");

    if (fp == NULL) {
        perror(name);
        return;
    }
    dump_process_mem_info(fp, result);
    if (fclose(fp) != 0)
        perror(name);
}

/* Print the report for result[] to stdout. */
void show_process_mem_info(int result[])
{
    dump_process_mem_info(stdout, result);
}

/*
 * Ask for a PID, query the kernel, then print and save the report.
 * Returns 0 on success, -1 on bad input or when the syscall is missing.
 */
static int observe(int result[], const char *prompt, char *outfile)
{
    int pid;

    printf("%s", prompt);
    fflush(stdout);
    if (scanf("%d", &pid) != 1) {
        fprintf(stderr, "invalid PID\n");
        return -1;
    }

    /* Start from zeros so a failed call reports 0 records. */
    memset(result, 0, MEMORY_SIZE * sizeof(result[0]));
    errno = 0;
    if (syscall(SYS_PS_V2P, result, pid) == -1 && errno == ENOSYS) {
        fprintf(stderr, "syscall %d is not implemented by this kernel\n",
                SYS_PS_V2P);
        return -1;
    }

    show_process_mem_info(result);
    write2file(result, outfile);
    return 0;
}

int main(void)
{
    /* 4 MB buffer: keep it on the heap, not on the (typically 8 MiB) stack. */
    int *result = calloc(MEMORY_SIZE, sizeof(*result));
    int rc = EXIT_FAILURE;

    if (result == NULL) {
        perror("calloc");
        return EXIT_FAILURE;
    }

    if (observe(result,
                "Input the PID of the first bash that you want to observe:",
                "result_1.txt") == 0 &&
        observe(result,
                "Input the PID of the second bash that you want to observe:",
                "result_2.txt") == 0)
        rc = EXIT_SUCCESS;

    free(result);
    return rc;
}
~~~
### Result(Proof)
- 不同Process的User Space(<0xC0000000)的virtual轉Physical對照驗證 ![](https://i.imgur.com/z19FcHI.png)
- 
不同Process的Kernel Space(>0xC0000000)的virtual轉Physical對照驗證 ![](https://i.imgur.com/w37oqzI.png)
### How does the kernel maintain the above property?
linux在創建新的process時 在kernel的部分會copy swapper_pg_dir 流程如下
do_fork() --> copy_mm() --> mm_init() --> pgd_alloc() --> set_pgd_fast() --> get_pgd_slow() --> memcpy(&PGD + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t))
(註:上述為 2.4 系列 kernel 的呼叫路徑;在本實驗使用的 3.9.1 x86 中,對應的路徑是 do_fork() --> copy_process() --> copy_mm() --> dup_mm() --> mm_init() --> mm_alloc_pgd() --> pgd_alloc() --> pgd_ctor() --> clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS),做的事情相同。)
雖然說是複製但是其中pgd是獨立的(存放的physical位址不同)只有pmd是共享的 ![](https://i.imgur.com/sNv4h99.png)
## Part2
- After a person uses Fix-Mapped Linear Addresses to map a 4K kernel address space to a 4k page frame, will existing processes get this new mapping? **Yes!**
- Step 1: A process 調用set_fixmap分配了一段fixmap virtual address,首地址是x。這時候,所有process的頁表並沒有建立關於x的entry,唯一完整建立x地址段的頁表是init_mm的PGD(swapper_pg_dir)。
  Step 2: 當process A、B都訪問了某個virtual address x,此時A、B的頁表會與init_mm同步,A和B也都建立好了關於x這個virtual address段的頁表。但是,這裡需要強調的是:對於x virtual address段,A process 和B process 有不同的PGD,但是它們PGD中關於x地址段的entry們都是指向相同的PMD。當然PMD entry指向的PTE也是相同的。換句話說,對於x地址段,所有進程還有init mm而言,它們的PMD和PTE都是共享的。
  Step 3(假如A釋放掉x會怎樣?): Process A調用__free_page function釋放了地址x,操作的對象會是init_mm的PGD以及其下級的頁表們。需要說明的是:init_mm PGD的那些PMD和PTE的address不會回收,因此x地址段的PMD和PTE頁表本身的address 並不會釋放,只是將x地址段對應的頁表項全部清除。
  Step 4: Process B訪問x地址段的時候,PGD entry指向了大家共享的PMD,PMD的entry指向了大家共享的PTE,但是,PTE中的具體的entry已經被清除,因此產生page fault
- fixmap區域從FIXADDR_START到FIXADDR_TOP ![](https://i.imgur.com/Mux5lxb.png)
- 在Linux kernel virtual address space中,劃分了一段virtual address 此virtual address開始於FIXADDR_START,終止於FIXADDR_TOP。 FIXMAP address分配器是一個用於維護,分配和釋放FIXMAP virtual address區域的管理器。 FIXMAP管理器將這段virtual address均分成 “__end_of_permanent_fixed_addresses”個address塊,每個address塊對應的virtual address都可以通過一個索引值進行轉換。
:::info
#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
unsigned long 
__FIXADDR_TOP = 0xfffff000;
:::
- 為什麼要使用fixmap?
    - 動態分配虛擬記憶體以及建立memory映射是一個複雜的過程,在kernel完全啟動之後,記憶體管理可以提供各種豐富的API讓kernel的其他模塊可以完成虛擬記憶體分配和建立memory映射的功能,但是在kernel啟動的過程中,有些模塊需要使用 virtual address 並映射到指定的物理記憶體上,而且這些模塊也沒有辦法等待完整的記憶體管理模塊初始化之後再進行記憶體映射。因此,linux kernel固定分配了一些fixmap的虛擬記憶體,這些記憶體有固定的用途,使用該記憶體的模塊在初始化的時候,讓這些固定分配的memory映射到指定的物理記憶體上去。
- fixmap是virtual address固定 例如 index3 對應的virtual address一定是0xffffc000 但是實體記憶體是什麼就是看使用者指定
- Kmalloc分配的記憶體在上圖的直接記憶體區域,這區域的大小為896M 且與物理記憶體的關係是直接的線性映射 即0xC0000000-> C0000000 + 896M對應真實物理記憶體的0G-> 896M kmalloc()確保頁面在物理上是連續的,記憶體只有在要被DMA訪問的時候才需要物理上連續 kmalloc分配記憶體是可以調用__get_free_pages()
- 使用kmap的原因(kmap和fixmap不同!! 與本題無關):
    - 對於高端物理記憶體(896M之後)並沒有和kernel地址空間建立一一對應的關係 (即虛擬地址=物理地址+ PAGE_OFFSET這樣的關係) 所以不能使用get_free_pages()這樣的頁面分配器進行記憶體的分配 而必須使用alloc_pages()這樣的伙伴系統算法的接口得到 struct page * 結構 然後將其映射到kernel地址空間。 Kmap每次只能分配一個頁面。
- 分配函數的選擇(與本題無關):
    - 連續的物理頁面:kmalloc或者低級頁面分配器;高端記憶體分配:alloc_pages
    - 指向頁面結構指針,不是邏輯地址指針。再通過kmap()把高端地址記憶體映射到kernel的邏輯地址空間。
### Part2_kernel_code
~~~c=
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/pid.h>
#include <linux/highmem.h>
#include <linux/ratelimit.h>
#include <linux/rcupdate.h>
#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/fixmap.h>
#include <asm/pgtable.h>

/* Returned by virual2physical() when the address has no present mapping. */
#define V2P_UNMAPPED        (~0UL)

/*
 * Fixmap slot and physical frame used by the experiment.
 * NOTE(review): slot 3 is simply the index the experiment picked; confirm
 * against enum fixed_addresses that nothing else on the test kernel relies
 * on it before reusing this code.
 */
#define DEMO_FIX_IDX        3
#define DEMO_FIX_PHYS       0xdfe00000UL

/* Number of header ints in result[] before the first record. */
#define RESULT_HEADER_INTS  5

/*
 * Write cursor into result[] and number of records emitted so far.
 * Plain globals: concurrent callers of ps_v2p_v2() would interleave.
 */
int result_count_v2;
int vma_total_v2;
unsigned long total_virtual_memory_v2;

/*
 * Store one int at result[idx]. result[] is user memory, so it is written
 * with put_user(): a bad pointer yields a warning, not a kernel oops.
 */
static void store_result_v2(int result[], int idx, int val)
{
    if (put_user(val, &result[idx]))
        printk_ratelimited(KERN_WARNING
                           "ps_v2p_v2: cannot write result[%d]\n", idx);
}

/*
 * Append one value to result[] and advance the write cursor.
 *
 * Layout of result[]:
 *   [0] number of records (vma_total_v2)
 *   [1] pid  [2] map_count  [3] total_vm (KiB)  [4] rss (KiB)
 *   then, per record: vm_start, vm_end, phy_start, phy_end
 */
void write2result_v2(int result[], unsigned long addr)
{
    store_result_v2(result, result_count_v2, (int)addr);
    result_count_v2++;
}

unsigned long virual2physical(struct task_struct *task, unsigned long va);

/*
 * Emit the single fixmap record.
 *
 * va == 0 ("process A"): write the mm statistics, install the mapping
 *   DEMO_FIX_IDX -> DEMO_FIX_PHYS with set_fixmap(), and report the fixmap
 *   virtual address together with the physical address found by walking
 *   task's page tables.
 * va != 0 ("process B"): report @va, log the fixmap constants and the
 *   index of @va, and walk task's page tables for the same fixmap slot to
 *   show that the mapping made by A is visible here as well.
 *
 * In both cases the record's vm_end and phy_end fields are written as 0.
 */
void print_virual_memory_v2(struct task_struct *task, int result[],
                            unsigned long va)
{
    const unsigned long fix_va = fix_to_virt(DEMO_FIX_IDX);
    unsigned long pa;

    if (va == 0) {
        struct mm_struct *mm = task->mm;

        /* map_count [2] */
        write2result_v2(result, mm->map_count);
        /* total virtual memory [3] */
        write2result_v2(result, mm->total_vm << (PAGE_SHIFT - 10));
        /* total physical memory [4] */
        write2result_v2(result, get_mm_rss(mm) << (PAGE_SHIFT - 10));

        set_fixmap(DEMO_FIX_IDX, DEMO_FIX_PHYS);

        write2result_v2(result, fix_va);
        write2result_v2(result, 0);
    } else {
        write2result_v2(result, va);
        write2result_v2(result, 0);

        printk("FIXADDR_START = 0x%lx\n", FIXADDR_START);
        printk("FIXADDR_TOP = 0x%lx\n", FIXADDR_TOP);
        /* The FIX_* names are enum constants: cast to match %lx. */
        printk("FIX_KMAP_BEGIN = 0x%lx\n", (unsigned long)FIX_KMAP_BEGIN);
        printk("FIX_KMAP_END = 0x%lx\n", (unsigned long)FIX_KMAP_END);
        printk("FIX_APIC_BASE = 0x%lx\n", (unsigned long)FIX_APIC_BASE);
        printk("virtual address = 0x%lx\n", va);
        printk("virtual address = %lx\n", va);

        /*
         * virt_to_fix() BUG()s on an address outside the fixmap, and va
         * comes straight from user space, so check the range first.
         */
        if (va >= FIXADDR_START && va < FIXADDR_TOP)
            printk("fixmap index = %lu\n", virt_to_fix(va));
        else
            printk("0x%lx is outside the fixmap range\n", va);
    }

    pa = virual2physical(task, fix_va);
    if (pa != V2P_UNMAPPED) {
        write2result_v2(result, pa);
        write2result_v2(result, 0);
    } else {
        printk("error 2\n");
        write2result_v2(result, 0);
        write2result_v2(result, 0);
    }
    vma_total_v2++;
}

/*
 * Translate virtual address @va through the page tables of @task->mm.
 *
 * Returns the physical address, or V2P_UNMAPPED when some level of the
 * walk has no present entry. Every level is logged with printk().
 */
unsigned long virual2physical(struct task_struct *task, unsigned long va)
{
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;
    unsigned long pa;

    printk("virtual address = 0x%lx\n", va);

    pgd = pgd_offset(task->mm, va);
    printk("pgd = 0x%p\n", pgd);
    printk("pgd_index = %lu\n", pgd_index(va));
    if (pgd_none(*pgd)) {
        printk("not mapped in pgd\n");
        return V2P_UNMAPPED;
    }

    pud = pud_offset(pgd, va);
    printk("pud = 0x%p\n", pud);
    if (pud_none(*pud)) {
        printk("not mapped in pud\n");
        return V2P_UNMAPPED;
    }

    pmd = pmd_offset(pud, va);
    printk("pmd = 0x%p\n", pmd);
    printk("pmd_index = %lu\n", pmd_index(va));
    if (pmd_none(*pmd)) {
        printk("not mapped in pmd\n");
        return V2P_UNMAPPED;
    }

    /* A large (PSE) entry names the frame itself; there is no PTE page. */
    if (pmd_large(*pmd))
        return ((pmd_pfn(*pmd) << PAGE_SHIFT) & PMD_MASK) | (va & ~PMD_MASK);

    pte = pte_offset_map(pmd, va);
    printk("pte = 0x%p\n", pte);
    printk("pte_index = %lu\n", pte_index(va));
    if (!pte_present(*pte)) {
        printk("not mapped in pte\n");
        pte_unmap(pte);
        return V2P_UNMAPPED;
    }
    pa = (pte_pfn(*pte) << PAGE_SHIFT) | (va & ~PAGE_MASK);
    pte_unmap(pte);

    return pa;
}

/*
 * System call.
 *   va == 0: act on process @pid (see print_virual_memory_v2, "process A").
 *   va != 0: act on the calling process; @pid is ignored ("process B").
 *
 * result[0] receives the number of records written (1 on success, 0 when
 * @pid does not exist or has no user address space).
 */
asmlinkage void ps_v2p_v2(int result[], int pid, unsigned long va)
{
    struct task_struct *task;
    int i;

    vma_total_v2 = 0;

    if (va == 0) {
        /* find_task_by_vpid() needs RCU; pin the task before leaving it. */
        rcu_read_lock();
        task = find_task_by_vpid(pid);
        if (task)
            get_task_struct(task);
        rcu_read_unlock();

        if (!task || !task->mm) {
            printk("ps_v2p_v2: pid %d not found or has no user address space\n",
                   pid);
            store_result_v2(result, 0, 0);
            if (task)
                put_task_struct(task);
            return;
        }

        result_count_v2 = 1;
        /* task pid [1] */
        write2result_v2(result, task->pid);
        print_virual_memory_v2(task, result, 0);
        put_task_struct(task);
    } else {
        task = current;

        for (i = 0; i < RESULT_HEADER_INTS; i++)
            store_result_v2(result, i, 0);
        /* Start the record right after the header, not at a stale cursor. */
        result_count_v2 = RESULT_HEADER_INTS;
        print_virual_memory_v2(task, result, va);
    }

    store_result_v2(result, 0, vma_total_v2);
}
~~~
### Part2_User_code
~~~c=
#include<stdio.h>
#include<string.h>
#include<unistd.h>
#define MEMORY_SIZE 1000000
void write_str(FILE *fp,char *str,int result,int result2,int result3)
{
    char buf[60];
    memset(buf,0,sizeof(buf));
snprintf(buf,sizeof(buf),str,result,result2,result3); /* bounded: never overruns buf */
    fwrite(buf,sizeof(char),strlen(buf),fp);
}

/*
 * Number of records to print: result[0] as reported by the kernel,
 * clamped to what a MEMORY_SIZE-int buffer can actually hold.
 */
static int record_count(const int result[])
{
    const int max_records = (MEMORY_SIZE - 5) / 4;

    if (result[0] < 0)
        return 0;
    return result[0] > max_records ? max_records : result[0];
}

/* Write the report for result[] to the file `name`. */
void write2file(int result[],char *name)
{
    FILE *fp;
    int i;
    int count = record_count(result);

    fp = fopen(name,"w");
    if (fp == NULL) {
        perror(name);
        return;
    }
    write_str(fp,"[+] PID = %d \n",result[1],0,0);
    write_str(fp,"[+] map count = %d \n",result[2],0,0);
    write_str(fp,"[+] total virtual mem = %d \n",result[3],0,0);
    write_str(fp,"[+] total physical mem = %d \n",result[4],0,0);
    for (i = 0; i < count; i++) {
        /* Addresses travel as int; print them as unsigned hex, not %p. */
        fprintf(fp,"[%2d] v_start : 0x%08x \t, v_end : 0x%08x \n",
                i+1,(unsigned)result[5+i*4],(unsigned)result[6+i*4]);
        fprintf(fp,"[%2d] p_start : 0x%08x \t, p_end : 0x%08x \n",
                i+1,(unsigned)result[7+i*4],(unsigned)result[8+i*4]);
    }
    fclose(fp);
}

/*
 * Print the report for result[] to stdout.
 *
 * [0] result count  [1] pid  [2] map count
 * [3] total virtual memory  [4] total physical memory
 * then per record: [5] vm start [6] vm end [7] pm start [8] pm end
 */
void show_process_mem_info(int result[])
{
    int i;
    int count = record_count(result);

    printf("[+] PID = %d \n",result[1]);
    printf("[+] map count = %d \n",result[2]);
    printf("[+] total virtual mem = %d \n",result[3]);
    printf("[+] total physical mem = %d \n",result[4]);
    for (i = 0; i < count; i++) {
        printf("[%2d] v_start : 0x%08x \t, v_end : 0x%08x \n",
               i+1,(unsigned)result[5+i*4],(unsigned)result[6+i*4]);
        printf("[%2d] p_start : 0x%08x \t, p_end : 0x%08x \n",
               i+1,(unsigned)result[7+i*4],(unsigned)result[8+i*4]);
    }
}

int main()
{
    /* 4 MB buffer: static storage keeps it off the stack. */
    static int result_1[MEMORY_SIZE];
    int pid;
    unsigned long va;

    printf("Input the PID of the first bash that you want to observe:");
    if (scanf("%d", &pid) != 1) {
        fprintf(stderr, "invalid PID\n");
        return 1;
    }
    syscall(352,result_1,pid,0UL);
    /* write2file(result_1,"part2_result_1.txt"); */

    printf("Input the vaddr to free:");
    /* va is unsigned long, so the conversion must be %lx, not %x. */
    if (scanf("%lx",&va) != 1) {
        fprintf(stderr, "invalid address\n");
        return 1;
    }
    syscall(353,va);
    return 0;
}
~~~
### Fixmap 使用
1. FIXMAP 分配
:::info
set_fixmap_nocache(idx,phy)
set_fixmap(idx,phy)
kmap_atomic(struct page *page);
:::
FIXMAP 釋放
:::info
clear_fixmap
kunmap_atomic
:::
轉換函數
:::info
__fix_to_virt
__virt_to_fix
fix_to_virt
virt_to_fix
:::
2.
~~~c= !
struct page *high_page;
int idx, type;
unsigned long vaddr;

/* Allocate a physical page */
high_page = alloc_page(__GFP_HIGHMEM);
if (high_page) {
    /* Reserve the current CPU's next kmap slot inside FIX_KMAP_BEGIN.. */
    type = kmap_atomic_idx_push();
    idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();

    /*
     * Obtain fixmap virtual address by index. idx is only known at run
     * time, so use __fix_to_virt(): fix_to_virt() is meant for constant
     * indices and fails at link time otherwise.
     */
    vaddr = __fix_to_virt(idx);

    /* Associate fixmap virtual address with physical address */
    set_fixmap(idx, page_to_phys(high_page));
    printk("[*]unsignd long vaddr:Address: %#08x\n",
           (unsigned int)(unsigned long)vaddr);

    /* Remove associate with fixmap, then give back the slot and the page */
    clear_fixmap(idx);
    kmap_atomic_idx_pop();
    __free_page(high_page);
}
~~~
3.
~~~c= !
#include <linux/mm.h>
#include <linux/highmem.h>
#include <asm/fixmap.h>

/* FIX Phys Addr */
#define FIX_APIC_ADDR 0xffe00000

/*
 * Map a fixed physical address through the FIX_APIC_BASE fixmap slot and
 * read one word from it.
 * NOTE(review): this re-points a slot whose name suggests it is in use by
 * the local APIC; treat it as an illustration of the API only.
 */
static int TestCase_kmap(void)
{
    unsigned long apic_virt = fix_to_virt(FIX_APIC_BASE);
    unsigned long val;

    /* FIXMAP: the first argument is the fixmap index, not the address */
    set_fixmap_nocache(FIX_APIC_BASE, FIX_APIC_ADDR);

    /* Read/Write: apic_virt is an integer, so cast it before dereferencing */
    val = *(volatile unsigned long *)apic_virt;
    (void)val;

    return 0;
}
~~~
4.
~~~c= !
#include <linux/mm.h>
#include <linux/highmem.h>
#include <asm/fixmap.h>

/*
 * Allocate a highmem page, map it temporarily with kmap_atomic() (which
 * uses the FIX_KMAP_* fixmap slots), write a string into it and unmap.
 * Returns 0 on success, -ENOMEM if no highmem page could be obtained.
 */
static int TestCase_fixmap(void)
{
    struct page *page;
    void *addr;

    /* alloc */
    page = alloc_page(__GFP_HIGHMEM);
    if (!page) {
        printk("%s alloc_page() failed.\n", __func__);
        return -ENOMEM;
    }
    if (!PageHighMem(page)) {
        printk("%s alloc_page() failed.\n", __func__);
        __free_page(page);      /* got a lowmem page: do not leak it */
        return -ENOMEM;
    }

    /* Fixmap: kmap_atomic()/kunmap_atomic() take one argument on 3.9 */
    addr = kmap_atomic(page);
    sprintf((char *)addr, "BiscuitOS-%s", __func__);
    printk("[%#lx] %s\n", (unsigned long)addr, (char *)addr);

    /* Unmap */
    kunmap_atomic(addr);
    __free_page(page);
    return 0;
}
~~~
### Result(Proof)
1. 先透過A Process 呼叫SystemCall執行set_fixmap(3, 0xdfe00000); // 實體記憶體只要是未被使用的區域都可以
2. 再透過B Process 看看是否能夠存取及驗證fix_to_virt(3)是否是剛剛Process所使用的virtual address和virt_to_fix(x)是否是3
3. virtual address也確實是在fixmap區間 確實分配到指定的實體記憶體位址 ![](https://i.imgur.com/RkB3hyT.png) P2也可使用 ![](https://i.imgur.com/Ctv63GI.png)
# How to add system call