# 奔跑吧 CH 3.1: The Birth of a Process

## Process and thread

A process is the unit to which the OS allocates system resources (memory); a thread is the unit to which the OS allocates CPU time.

**Process:**
* An ELF binary on disk is called a user program; once it is loaded into memory and starts executing, it becomes a process.
* It has its own address space => code section, data section, bss section, stack, heap.
* Linux usually treats a process as a **task**, and the **PCB (process control block)** is usually called **struct task_struct**.

**Thread:**
* A thread is known as a lightweight process and is the smallest unit of scheduling. The difference is that a process has its own resource space, while threads share the process's address space.
* Shares the process's address space => code section, data section.
* A thread has its own **program counter, CPU registers, and stack**.

Linux has no dedicated data structure to distinguish a thread from a process; both are represented by the process's PCB.

<img src="https://i.imgur.com/i3FDa7Y.png" style="height:350px;display: block;margin:auto;">

### Process and task_struct
* Linux describes a process with a task_struct. Through task_struct->mm->mmap we can reach the vm_area_struct list; Linux uses VMAs (virtual memory areas) to describe the attributes and size of each segment.

<img src="https://i.imgur.com/hWJh8f8.png" style="height:350px;display: block;margin:auto;">

## How the kernel executes a binary file

The first user program the kernel executes => init

```clike=
//init/main.c
static int __ref kernel_init(void *unused)
{
....
1148 	if (!try_to_run_init_process("/sbin/init") ||
1149 	    !try_to_run_init_process("/etc/init") ||
1150 	    !try_to_run_init_process("/bin/init") ||
1151 	    !try_to_run_init_process("/bin/sh"))
--------------
static int run_init_process(const char *init_filename)
{
	argv_init[0] = init_filename;
	pr_info("Run %s as init process\n", init_filename);
	return do_execve(getname_kernel(init_filename),
		(const char __user *const __user *)argv_init,
		(const char __user *const __user *)envp_init);
}
```

do_execve -> do_execveat_common -> __do_execve_file

```clike=
static int __do_execve_file(int fd, struct filename *filename,
			    struct user_arg_ptr argv,
			    struct user_arg_ptr envp,
			    int flags, struct file *file)
{
	retval = bprm_mm_init(bprm); //allocate the mm_struct
	if (retval)
....
	//search_binary_handler() ends up calling load_elf_binary();
	//by the time it returns, start_thread() has already set
	//regs->sepc to the user program's entry point
	retval = exec_binprm(bprm);
	if (retval < 0)
		goto out;
```

### mm_struct

```clike=
//fs/exec.c
 359 static int bprm_mm_init(struct linux_binprm *bprm)
 360 {
......
 364 	bprm->mm = mm = mm_alloc();
 365 	err = -ENOMEM;
-------------
//kernel/fork.c
struct mm_struct *mm_alloc(void)
{
	struct mm_struct *mm;

	mm = allocate_mm(); //allocate the mm_struct
.....
	memset(mm, 0, sizeof(*mm));
	return mm_init(mm, current, current_user_ns()); //current is this task's task_struct => $tp
}
---------------
// arch/riscv/include/asm/current.h
 27 static __always_inline struct task_struct *get_current(void)
 28 {
 29 	register struct task_struct *tp __asm__("tp");
 30 	return tp;
 31 }
 32
 33 #define current get_current() //task_struct
---------------
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
	struct user_namespace *user_ns)
{
...
	mm->mmap = NULL;
	mm->mm_rb = RB_ROOT;
	mm->vmacache_seqnum = 0;
.....
	if (mm_alloc_pgd(mm)) //allocate the process's page global directory
---------------
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
......
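	/*
	 * (added note) PGD entries [0, USER_PTRS_PER_PGD) map user space and
	 * start out empty; the remaining entries map kernel space and are
	 * copied from init_mm below, so every process shares the same kernel
	 * mappings.
	 */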
	pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
	if (likely(pgd != NULL)) {
		memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
		/* Copy kernel mappings */
		//copy the kernel page table entries into the process's pgd
		memcpy(pgd + USER_PTRS_PER_PGD,
			init_mm.pgd + USER_PTRS_PER_PGD,
			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
		local_flush_tlb_all();
	}
	return pgd;
}
-------------
struct mm_struct init_mm = {
	.mm_rb		= RB_ROOT,
	.pgd		= swapper_pg_dir, //kernel page table directory
	.mm_users	= ATOMIC_INIT(2),
	.mm_count	= ATOMIC_INIT(1),
	.mmap_sem	= __RWSEM_INITIALIZER(init_mm.mmap_sem),
........
};
```

The kernel part of every process's page table is identical.

![](https://i.imgur.com/O12XKd7.png)

**task_struct**
* context_switch, schedule, ... all operate on this structure

```clike=
//include/linux/sched.h
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
	/*
	 * For reasons of header soup (see current_thread_info()), this
	 * must be the first element of task_struct.
	 */
	struct thread_info		thread_info;
#endif
	/* -1 unrunnable, 0 runnable, >0 stopped: */
	volatile long			state;
...
	struct mm_struct		*mm;
	struct mm_struct		*active_mm;
...
	/* Namespaces: */
	struct nsproxy			*nsproxy; //container-related info

#ifdef CONFIG_PERF_EVENTS //perf-related info
	struct perf_event_context	*perf_event_ctxp[perf_nr_task_contexts];
	struct mutex			perf_event_mutex;
	struct list_head		perf_event_list;
#endif
	struct sched_entity		se; //scheduler-related info
#ifdef CONFIG_NUMA_BALANCING //NUMA-related info
	int				numa_scan_seq;
	unsigned int			numa_scan_period;
	unsigned int			numa_scan_period_max;
.....
-----------
(gdb) p/x $tp
$34 = 0xffffffe000171300
(gdb) p/x *((struct task_struct*)0xffffffe000171300)
$37 = {thread_info = {flags = 0x0, preempt_count = 0x0, addr_limit = {seg = 0x4000000000},
  kernel_sp = 0xffffffe03eb91ab0, user_sp = 0xffffffe03eb91ab0, cpu = 0x0},
  state = 0x0, stack = 0xffffffe03eb90000, usage = {refs = {counter = 0x1}},
  flags = 0x400000, ptrace = 0x0, wake_entry = {next = 0x0}
----------------
(gdb) p/x *((struct task_struct*)0xffffffe000171300)->mm
$40 = {{mmap = 0xffffffe03afc1790, mm_rb = {rb_node = 0xffffffe03afc1de0},
  vmacache_seqnum = 0x0, get_unmapped_area = 0xffffffe0014a9270,
  mmap_base = 0x3ff7bc1000, mmap_legacy_base = 0x0, task_size = 0x4000000000,
  highest_vm_end = 0x3fffd6a000, pgd = 0xffffffe000010000,
  membarrier_state = {counter = 0x0}, mm_users = {counter = 0x1},
  mm_count = {counter = 0x1}, pgtables_bytes = {counter = 0x2000}, map_count = 0x3, ...
```

A kernel thread has no mm_struct.

```clike=
arch_cpu_idle () at arch/riscv/kernel/process.c:31
31              local_irq_enable();
(gdb) p/x $tp
$41 = 0xffffffe0019b1b00
(gdb) p/x *((struct task_struct*)0xffffffe0019b1b00)->mm
$42 = {{mmap = 0x0, mm_rb = {rb_node = 0x0}, vmacache_seqnum = 0x0
(gdb) p/x ((struct task_struct*)0xffffffe0019b1b00)->mm
$43 = 0x0
```

### load_elf_binary
* On a 32-bit system the user address space offers 3 GB of virtual memory, far larger than physical memory; the Linux kernel manages a process's virtual address space with vma structures.

![](https://i.imgur.com/SMlcoDD.png)

```clike=
//fs/binfmt_elf.c
static int load_elf_binary(struct linux_binprm *bprm)
{
	struct file *interpreter = NULL; /* to shut gcc up */
....
 982 		//create a VMA for this segment of the file
 983 		error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
 984 				elf_prot, elf_flags, total_size);
...
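	/*
	 * (added note) Once the loop above has mapped every PT_LOAD segment,
	 * load_elf_binary() also sets up the brk and the initial user stack
	 * (argc/argv/envp and the auxiliary vector, via create_elf_tables()),
	 * and only then records the entry point with start_thread() below.
	 */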
	//set regs->sepc = elf_entry
1154 	start_thread(regs, elf_entry, bprm->p);
----------
 348 static unsigned long elf_map(struct file *filep, unsigned long addr,
 349 		const struct elf_phdr *eppnt, int prot, int type,
 350 		unsigned long total_size)
 351 {
 352 	unsigned long map_addr;
 353 	unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
 354 	unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
 355 	addr = ELF_PAGESTART(addr);
 356 	size = ELF_PAGEALIGN(size);
...
 376 	} else
 377 		map_addr = vm_mmap(filep, addr, size, prot, type, off);
 378
----------
//elf_map->vm_mmap->vm_mmap_pgoff->do_mmap_pgoff->do_mmap
//creates the file-backed vma (every virtual region of a process needs a corresponding vma to manage it)
unsigned long do_mmap(struct file *file, unsigned long addr,
			unsigned long len, unsigned long prot,
			unsigned long flags, vm_flags_t vm_flags,
			unsigned long pgoff, unsigned long *populate,
			struct list_head *uf)
{
----------
	/* find a free range in the process's virtual address space */
	addr = get_unmapped_area(file, addr, len, pgoff, flags);
	if (offset_in_page(addr))
		return addr;
-----------
//allocate a vma for this addr
unsigned long mmap_region(struct file *file, unsigned long addr,
		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
		struct list_head *uf)
{
......
	vma = vm_area_alloc(mm);
......
	//record the mapping information for this addr range in the vma
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_flags = vm_flags;
	vma->vm_page_prot = vm_get_page_prot(vm_flags);
	vma->vm_pgoff = pgoff;
......
```

![](https://i.imgur.com/zSDvbaJ.png)

The two LOAD segments map to two separate VMAs.

![](https://i.imgur.com/SpmLkbK.png)

**start_thread** sets sepc to elf_entry (the program's entry point), so when the kernel returns to user space, execution starts there. If there is an interpreter (dynamic loader), execution starts at the interpreter's entry point instead.

```clike=
//arch/riscv/kernel/process.c
void start_thread(struct pt_regs *regs, unsigned long pc,
	unsigned long sp)
{
	regs->sstatus = SR_SPIE;
.....
	regs->sepc = pc;
	regs->sp = sp;
	set_fs(USER_DS);
}
```

At this point only the init program's VMAs have been created, describing how each section maps into its virtual address space; no physical memory has been set up yet. When the CPU starts executing at the program's elf_entry it takes a page fault, and only then is the mapping to physical pages established (a user-space sketch of this demand-paging behaviour appears after the call flow below).

Call flow:

```clike=
#0  start_thread (regs=0xffffffe03aa47ee0, pc=91625975520, sp=274873105968)
    at arch/riscv/kernel/process.c:85
#1  0xffffffe000b4d26c in load_elf_binary (bprm=0xffffffe038d59b00) at fs/binfmt_elf.c:1171
#2  0xffffffe000b0dfa0 in search_binary_handler (bprm=0xffffffe03aa47ee0) at fs/exec.c:1653
#3  0xffffffe000b0e7d8 in exec_binprm (bprm=<optimized out>) at fs/exec.c:1695
#4  do_execveat_common (fd=<optimized out>, filename=0x1555556ee0, argv=..., envp=...,
    flags=<optimized out>) at fs/exec.c:1817
#5  0xffffffe000b0e94c in do_execve (filename=<optimized out>, __argv=<optimized out>,
    __envp=<optimized out>) at fs/exec.c:1862
#6  0xffffffe000a59028 in run_init_process (init_filename=<optimized out>) at init/main.c:1004
#7  0xffffffe000e21248 in kernel_init (unused=<optimized out>) at init/main.c:1074
#8  0xffffffe000a59f22 in handle_exception () at arch/riscv/kernel/entry.S:230
```

```clike=
1706 /*
1707  * sys_execve() executes a new program.
1708  */
1709 static int do_execveat_common(int fd, struct filename *filename,
1710 			      struct user_arg_ptr argv,
1711 			      struct user_arg_ptr envp,
1712 			      int flags)
1713 {
```
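The demand-paging behaviour mentioned above can also be watched from user space: mmap() creates a VMA immediately, but resident memory only grows once the pages are first touched. The program below is just an illustrative sketch (the 64 MiB size and reading VmRSS from /proc/self/status are arbitrary choices), not kernel code:

```clike=
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

/* Print the VmRSS line (resident set size) from /proc/self/status. */
static void print_rss(const char *when)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return;
	while (fgets(line, sizeof(line), f))
		if (strncmp(line, "VmRSS", 5) == 0)
			printf("%-16s %s", when, line);
	fclose(f);
}

int main(void)
{
	size_t len = 64UL << 20;	/* 64 MiB of anonymous memory */
	unsigned char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* A VMA exists now (visible in /proc/self/maps), but no physical
	 * pages have been allocated for it yet. */
	print_rss("after mmap:");

	/* Touch every page: each first access takes a page fault, and the
	 * kernel allocates and maps a physical page on demand. */
	for (size_t off = 0; off < len; off += 4096)
		p[off] = 1;

	print_rss("after touching:");

	munmap(p, len);
	return 0;
}
```

On a typical Linux system the second VmRSS value should be roughly 64 MB larger than the first, which is exactly the gap between creating a VMA and establishing the physical mappings that the execve path above also relies on.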
## fork

In Linux, processes and threads (kernel threads too, via kernel_thread()) are all created through system calls such as fork, vfork, and clone, and all of them go through _do_fork().

* fork uses SIGCHLD: when the child terminates, SIGCHLD is sent to notify the parent.
* fork mainly creates a copy of the parent for the child. To save resources it relies on the COW (copy-on-write) mechanism: the child only copies the parent's page tables, not the page contents. Only when the child later needs to write does copy-on-write kick in and create a private copy of the page for it (a short user-space demonstration appears below, after the kernel_clone() commit note).
* vfork uses CLONE_VFORK (the parent is suspended until the child releases the virtual memory resources) and CLONE_VM (parent and child share the address space).

```clike=
//kernel thread
2444 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
2445 {
2446 	struct kernel_clone_args args = {
2447 		.flags		= ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
2448 		.exit_signal	= (flags & CSIGNAL),
2449 		.stack		= (unsigned long)fn,
2450 		.stack_size	= (unsigned long)arg,
2451 	};
2452
2453 	return _do_fork(&args);
-------------
//fork
2457 SYSCALL_DEFINE0(fork)
2458 {
2459 #ifdef CONFIG_MMU
2460 	struct kernel_clone_args args = {
2461 		.exit_signal = SIGCHLD,
2462 	};
2463
2464 	return _do_fork(&args);
---------------
//vfork
2473 SYSCALL_DEFINE0(vfork)
2474 {
2475 	struct kernel_clone_args args = {
2476 		.flags		= CLONE_VFORK | CLONE_VM,
2477 		.exit_signal	= SIGCHLD,
2478 	};
2479
2480 	return _do_fork(&args);
---------------
//clone
2502 SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2503 		 int __user *, parent_tidptr,
2504 		 int __user *, child_tidptr,
2505 		 unsigned long, tls)
....
2508 	struct kernel_clone_args args = {
2509 		.flags		= (clone_flags & ~CSIGNAL),
2510 		.pidfd		= parent_tidptr,
2511 		.child_tid	= child_tidptr,  //child pid
2512 		.parent_tid	= parent_tidptr, //parent pid
2513 		.exit_signal	= (clone_flags & CSIGNAL),
2514 		.stack		= newsp,
2515 		.tls		= tls,
2516 	};
2517
......
2521 	return _do_fork(&args);
```

For reference, get_current() on RISC-V again: the current task_struct pointer always lives in the tp register.

```clike=
 22 /*
 23  * This only works because "struct thread_info" is at offset 0 from "struct
 24  * task_struct". This constraint seems to be necessary on other architectures
 25  * as well, but __switch_to enforces it. We can't check TASK_TI here because
 26  * <asm/asm-offsets.h> includes this, and I can't get the definition of "struct
 27  * task_struct" here due to some header ordering problems.
 28  */
 29 static __always_inline struct task_struct *get_current(void)
 30 {
 31 	return riscv_current_is_tp;
 32 }
 33
 34 #define current get_current()
```

_do_fork() was later renamed to kernel_clone():

```
commit cad6967ac10843a70842cd39c7b53412901dd21f
Author: Christian Brauner <brauner@kernel.org>
Date:   Wed Aug 19 12:46:45 2020 +0200

    fork: introduce kernel_clone()

    The old _do_fork() helper doesn't follow naming conventions of in-kernel
    helpers for syscalls. The process creation cleanup in [1] didn't change the
    name to something more reasonable mainly because _do_fork() was used in
    quite a few places. So sending this as a separate series seemed the better
    strategy.

    This commit does two things:
    1. renames _do_fork() to kernel_clone() but keeps _do_fork() as a simple
       static inline wrapper around kernel_clone().
    2. Changes the return type from long to pid_t. This aligns kernel_thread()
       and kernel_clone(). Also, the return value from kernel_clone that is
       surfaced in fork(), vfork(), clone(), and clone3() is taken from
       pid_vnr() which returns a pid_t too.
```
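A minimal user-space sketch of the fork() semantics described above, using nothing beyond the standard POSIX API (the variable name and the values are arbitrary): the child starts out as a copy-on-write duplicate of the parent, so a write in the child does not change what the parent sees.

```clike=
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int shared_value = 1;	/* lives in the parent's data segment */

int main(void)
{
	pid_t pid = fork();	/* the child gets a COW copy of the address space */

	if (pid < 0) {
		perror("fork");
		return 1;
	}

	if (pid == 0) {
		/* Child: this write takes a COW fault, so the child now owns
		 * a private copy of the page holding shared_value. */
		shared_value = 100;
		printf("child : shared_value = %d\n", shared_value);
		_exit(0);
	}

	waitpid(pid, NULL, 0);
	/* Parent: still sees the original value, because the two address
	 * spaces are separate copies. */
	printf("parent: shared_value = %d\n", shared_value);
	return 0;
}
```

With CLONE_VM (threads) the parent would see 100 instead, because there is only one address space; with vfork() the child is in any case only supposed to call execve() or _exit() before the parent is resumed.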
The most relevant clone flags:

```clike=
//include/uapi/linux/sched.h
//parent and child share the address space
#define CLONE_VM	0x00000100	/* set if VM shared between processes */
//parent and child share filesystem information
#define CLONE_FS	0x00000200	/* set if fs info shared between processes */
//parent and child share the open file descriptor table
#define CLONE_FILES	0x00000400	/* set if open files shared between processes */
//parent and child share signal handlers and blocked signals
#define CLONE_SIGHAND	0x00000800	/* set if signal handlers and blocked signals shared */
//if the parent is being traced, the child is traced too
#define CLONE_PTRACE	0x00002000	/* set if we want to let tracing continue on the child too */
//the parent is suspended until the child releases the virtual memory resources
#define CLONE_VFORK	0x00004000	/* set if the parent wants the child to wake it up on mm_release */
```

### do_fork

```clike=
2339 long _do_fork(struct kernel_clone_args *args)
2340 {
......
	struct task_struct *p;
2366 	p = copy_process(NULL, trace, NUMA_NO_NODE, args);
2367 	add_latent_entropy();
----------
//copies the memory, file system, signal, I/O, etc. state
1764 static __latent_entropy struct task_struct *copy_process(
1765 					struct pid *pid,
1766 					int trace,
1767 					int node,
1768 					struct kernel_clone_args *args)
1769 {
...
1850 	retval = -ENOMEM;
	//duplicate the parent's task_struct into the child's task_struct p
1851 	p = dup_task_struct(current, node);
1852 	if (!p)
...
1981 	/* Perform scheduler related setup. Assign this task to a CPU. */
1982 	retval = sched_fork(clone_flags, p);
1983 	if (retval)
...
2000 	retval = copy_files(clone_flags, p);
..
2003 	retval = copy_fs(clone_flags, p);
.
2009 	retval = copy_signal(clone_flags, p);
.
2012 	retval = copy_mm(clone_flags, p); //copy the parent's mm
2013 	if (retval)
2015 	retval = copy_namespaces(clone_flags, p);
...
2018 	retval = copy_io(clone_flags, p);
---------
2839 int sched_fork(unsigned long clone_flags, struct task_struct *p)
2840 {
2843 	__sched_fork(clone_flags, p);
2844 	/*
2845 	 * We mark the process as NEW here. This guarantees that
2846 	 * nobody will actually run it, and a signal or other external
2847 	 * event cannot wake it up and insert it on the runqueue either.
2848 	 */
2849 	p->state = TASK_NEW;
....
2897 	 * We're setting the CPU for the first time, we don't migrate,
2898 	 * so use __set_task_cpu().
2899 	 */
	//assign the task to a CPU
	__set_task_cpu(p, smp_processor_id());
-------------------
//copy_mm->dup_mm()
1346 static struct mm_struct *dup_mm(struct task_struct *tsk,
1347 				struct mm_struct *oldmm)
1348 {
.....
1352 	mm = allocate_mm();
.....
1356 	memcpy(mm, oldmm, sizeof(*mm));
.....
1358 	if (!mm_init(mm, tsk, mm->user_ns)) //calls mm_alloc_pgd(), so the child also gets the kernel page table in its pgd
.....
1361 	err = dup_mmap(mm, oldmm); //oldmm is the parent's mm
------------------
// Walks every VMA of the parent and copies the pte entries backing each parent VMA into the corresponding pte entries of the child's VMAs
 478 static __latent_entropy int dup_mmap(struct mm_struct *mm,
 479 					struct mm_struct *oldmm)
 480 {
.....
 492 	flush_cache_dup_mm(oldmm);
 493 	uprobe_dup_mmap(oldmm, mm);
 601 		retval = copy_page_range(mm, oldmm, mpnt);
 602
-----------------
 937 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 938 		struct vm_area_struct *vma)
 939 {
.....
 986 	dst_pgd = pgd_offset(dst_mm, addr);
 987 	src_pgd = pgd_offset(src_mm, addr);
 988 	do {
 989 		next = pgd_addr_end(addr, end);
 990 		if (pgd_none_or_clear_bad(src_pgd))
 991 			continue;
 992 		if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
 993 					    vma, addr, next))) {
 994 			ret = -ENOMEM;
 995 			break;
 996 		}
 997 	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
 998
 999 	if (is_cow)
1000 		mmu_notifier_invalidate_range_end(&range);
-----------------
//copy_p4d_range->copy_pud_range->copy_pmd_range->copy_pte_range->copy_one_pte
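	/*
	 * (added note) For a private writable mapping (is_cow_mapping()),
	 * copy_one_pte() write-protects the pte in both parent and child as
	 * it copies it, so the first write from either side faults into
	 * do_wp_page(), which only then copies the underlying page: this is
	 * the copy-on-write mechanism described in the fork section above.
	 */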
 671 /*
 672  * copy one vm_area from one task to the other. Assumes the page tables
 673  * already present in the new task to be cleared in the whole range
 674  * covered by this vma.
 675  */
 676
 677 static inline unsigned long
 678 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 679 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
 680 		unsigned long addr, int *rss)
 681 {
...
 780 	set_pte_at(dst_mm, addr, dst_pte, pte); //install the pte into dst_pte
----------------
static inline void set_pte(pte_t *ptep, pte_t pteval)
{
	*ptep = pteval;
}

static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
	pte_t *ptep, pte_t pteval)
{
	if (pte_present(pteval) && pte_exec(pteval))
		flush_icache_pte(pteval);

	set_pte(ptep, pteval);
}
```

Once copy_process() completes, the child process has been successfully created.

### 3.1 Questions

How does the kernel obtain the current process's task_struct?

When is a user-space process's page table actually populated?

What are the differences between fork, vfork, and clone?

**Ref:**
[linux进程地址空间(3) 内存映射(1)mmap与do_mmap](https://blog.csdn.net/u010246947/article/details/10472587?utm_medium=distribute.pc_relevant.none-task-blog-title-2&spm=1001.2101.3001.4242)
[how-the-kernel-manages-your-memory](https://manybutfinite.com/post/how-the-kernel-manages-your-memory/)
[Copy On Write機制瞭解一下](https://www.itread01.com/content/1544029393.html)