# 奔跑吧 (Run Linux Kernel) CH 3.1: The Birth of a Process
## Process and thread
A process is the unit to which the OS allocates system resources (e.g. memory); a thread is the unit to which the OS allocates CPU time.
**Process:**
* An ELF binary on disk is just a user program; once it is loaded into memory and starts executing, it becomes a process
* Has its own address space => code (text) section, data section, BSS section, stack, heap
* Linux usually treats a process as a **task**; the **PCB (process control block)** is what **struct task_struct** implements
**Thread:**
* A thread is often called a lightweight process; it is the smallest unit of scheduling. The difference from a process is that a process has its own resource space, while threads share their process's address space
* Shares the process's address space => code section, data section
* A thread has its own **program counter, CPU registers, stack**
Linux has no dedicated data structure to distinguish a thread from a process; both threads and processes use the process's PCB (a small user-space demo follows the figure below)
<img src="https://i.imgur.com/i3FDa7Y.png" style="height:350px;display: block;margin:auto;">
### Process and task_struct
* Linux describes a process with task_struct. Through task_struct->mm->mmap you can find the vm_area_struct list; Linux uses VMAs (virtual memory areas) to describe the properties and size of each segment (a kernel-side sketch follows the figure below)
<img src="https://i.imgur.com/hWJh8f8.png" style="height:350px;display: block;margin:auto;">
## How the kernel executes a binary file
The kernel executes the first user program => init
```clike=
//init/main.c
static int __ref kernel_init(void *unused)
{
....
1148 if (!try_to_run_init_process("/sbin/init") ||
1149 !try_to_run_init_process("/etc/init") ||
1150 !try_to_run_init_process("/bin/init") ||
1151 !try_to_run_init_process("/bin/sh"))
--------------
static int run_init_process(const char *init_filename)
{
argv_init[0] = init_filename;
pr_info("Run %s as init process\n", init_filename);
return do_execve(getname_kernel(init_filename),
(const char __user *const __user *)argv_init,
(const char __user *const __user *)envp_init);
}
```
do_execve -> do_execveat_common -> __do_execve_file
```clike=
static int __do_execve_file(int fd, struct filename *filename,
struct user_arg_ptr argv,
struct user_arg_ptr envp,
int flags, struct file *file)
{
retval = bprm_mm_init(bprm);//alloc mm_struct
if (retval)
....
	//search_binary_handler() ends up calling load_elf_binary();
	//by the time it returns, start_thread() has already set
	//regs->sepc to the user program's entry point
retval = exec_binprm(bprm);
if (retval < 0)
goto out;
```
### mm_struct
```clike=
//fs/exec.c
359 static int bprm_mm_init(struct linux_binprm *bprm)
360 {
......
364 bprm->mm = mm = mm_alloc();
365 err = -ENOMEM;
-------------
//kernel/fork.c
struct mm_struct *mm_alloc(void)
{
struct mm_struct *mm;
	mm = allocate_mm();//allocate an mm_struct
.....
memset(mm, 0, sizeof(*mm));
	return mm_init(mm, current, current_user_ns());//current is this task's task_struct ($tp on RISC-V)
}
---------------
// arch/riscv/include/asm/current.h
27 static __always_inline struct task_struct *get_current(void)
28 {
29 register struct task_struct *tp __asm__("tp");
30 return tp;
31 }
32
33 #define current get_current() //task_struct
---------------
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
...
mm->mmap = NULL;
mm->mm_rb = RB_ROOT;
mm->vmacache_seqnum = 0;
.....
	if (mm_alloc_pgd(mm)) //allocate the process's page global directory (pgd)
---------------
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
......
pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
if (likely(pgd != NULL)) {
memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
		/* Copy kernel mappings */ //copy the kernel page-table entries into the process's pgd
memcpy(pgd + USER_PTRS_PER_PGD,
init_mm.pgd + USER_PTRS_PER_PGD,
(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
local_flush_tlb_all();
}
return pgd;
}
-------------
struct mm_struct init_mm = {
.mm_rb = RB_ROOT,
.pgd = swapper_pg_dir, //kernel page table directory
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
.mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
........
};
```
Every process therefore ends up with the same kernel page-table entries, copied from init_mm.pgd (swapper_pg_dir).
![](https://i.imgur.com/O12XKd7.png)
**task_struct**
* context_switch, schedule, ... all operate on this structure
```clike=
//include/linux/sched.h
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
* For reasons of header soup (see current_thread_info()), this
* must be the first element of task_struct.
*/
struct thread_info thread_info;
#endif
/* -1 unrunnable, 0 runnable, >0 stopped: */
volatile long state;
...
struct mm_struct *mm;
struct mm_struct *active_mm;
...
/* Namespaces: */
	struct nsproxy			*nsproxy; //namespace info (containers)
#ifdef CONFIG_PERF_EVENTS //perf-related info
struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
struct mutex perf_event_mutex;
struct list_head perf_event_list;
#endif
	struct sched_entity		se; //scheduler-related info
#ifdef CONFIG_NUMA_BALANCING //NUMA-related info
int numa_scan_seq;
unsigned int numa_scan_period;
unsigned int numa_scan_period_max;
.....
-----------
(gdb) p/x $tp
$34 = 0xffffffe000171300
(gdb) p/x *((struct task_struct*)0xffffffe000171300)
$37 = {thread_info = {flags = 0x0, preempt_count = 0x0,
    addr_limit = {seg = 0x4000000000}, kernel_sp = 0xffffffe03eb91ab0,
    user_sp = 0xffffffe03eb91ab0, cpu = 0x0}, state = 0x0,
    stack = 0xffffffe03eb90000, usage = {refs = {counter = 0x1}},
    flags = 0x400000, ptrace = 0x0, wake_entry = {next = 0x0}
----------------
(gdb) p/x *((struct task_struct*)0xffffffe000171300)->mm
$40 = {{mmap = 0xffffffe03afc1790, mm_rb = {rb_node = 0xffffffe03afc1de0},
    vmacache_seqnum = 0x0, get_unmapped_area = 0xffffffe0014a9270,
    mmap_base = 0x3ff7bc1000, mmap_legacy_base = 0x0,
    task_size = 0x4000000000, highest_vm_end = 0x3fffd6a000,
    pgd = 0xffffffe000010000, membarrier_state = {counter = 0x0},
    mm_users = {counter = 0x1}, mm_count = {counter = 0x1},
    pgtables_bytes = {counter = 0x2000}, map_count = 0x3,
...
```
A kernel thread, by contrast, has no mm_struct (task->mm is NULL); how the scheduler deals with that is sketched after the GDB output below.
```clike=
arch_cpu_idle () at arch/riscv/kernel/process.c:31
31 local_irq_enable();
(gdb) p/x $tp
$41 = 0xffffffe0019b1b00
(gdb) p/x *((struct task_struct*)0xffffffe0019b1b00)->mm
$42 = {{mmap = 0x0, mm_rb = {rb_node = 0x0}, vmacache_seqnum = 0x0
(gdb) p/x ((struct task_struct*)0xffffffe0019b1b00)->mm
$43 = 0x0
```
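Because task->mm is NULL for a kernel thread, the scheduler never switches user page tables when running it; it just borrows the previous task's active_mm. The following is a simplified paraphrase of that logic in context_switch() (kernel/sched/core.c), not verbatim source, and details differ between kernel versions.
```clike=
/* Paraphrased sketch of the mm handling in context_switch(). */
static void switch_mm_sketch(struct task_struct *prev, struct task_struct *next)
{
	if (!next->mm) {                            /* next is a kernel thread          */
		next->active_mm = prev->active_mm;  /* borrow the previous task's mm    */
		mmgrab(prev->active_mm);            /* hold a reference while borrowed  */
		enter_lazy_tlb(prev->active_mm, next);  /* no page-table (satp) switch  */
	} else {
		switch_mm_irqs_off(prev->active_mm, next->mm, next);  /* user task: switch page tables */
	}
}
```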
### load_elf_binary
* Viewed from 32-bit, the user address space offers 3 GB of virtual space, far larger than physical memory; the Linux kernel manages a process's virtual space with VMA structures (a lookup sketch follows the figure below)
![](https://i.imgur.com/SMlcoDD.png)
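When the kernel later needs to know which VMA covers a given user address (page faults, mmap, munmap, ...), it searches the process's VMA set. Below is a simplified, linear-walk sketch of that lookup; the real find_vma() in mm/mmap.c uses mm->mm_rb (the red-black tree visible in the earlier GDB dump of mm_struct) plus a per-task vmacache for speed. find_vma_sketch is an illustrative name only.
```clike=
/* Sketch only: return the first VMA whose vm_end is above addr.
 * Callers still check addr >= vma->vm_start to know whether addr is
 * inside the VMA or in an unmapped hole below it. */
static struct vm_area_struct *find_vma_sketch(struct mm_struct *mm,
					       unsigned long addr)
{
	struct vm_area_struct *vma;

	for (vma = mm->mmap; vma; vma = vma->vm_next)
		if (addr < vma->vm_end)
			return vma;
	return NULL;
}
```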
```clike=
//fs/binfmt_elf.c
static int load_elf_binary(struct linux_binprm *bprm)
{
struct file *interpreter = NULL; /* to shut gcc up */
....
982 	//create the VMA(s) that map this file's segments
983 error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
984 elf_prot, elf_flags, total_size);
...
	//set regs->sepc = elf_entry
1154 start_thread(regs, elf_entry, bprm->p);
----------
348 static unsigned long elf_map(struct file *filep, unsigned long addr,
349 const struct elf_phdr *eppnt, int prot, int type,
350 unsigned long total_size)
351 {
352 unsigned long map_addr;
353 unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr);
354 unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr);
355 addr = ELF_PAGESTART(addr);
356 size = ELF_PAGEALIGN(size);
...
376 } else
377 map_addr = vm_mmap(filep, addr, size, prot, type, off);
378
----------
//elf_map->vm_mmap->vm_mmap_pgoff->do_mmap_pgoff->do_mmap
//create the file's VMA (every virtual region of a process is managed by a corresponding VMA)
unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, vm_flags_t vm_flags,
unsigned long pgoff, unsigned long *populate,
struct list_head *uf)
{
----------
	/* find a free range in the process's virtual address space */
addr = get_unmapped_area(file, addr, len, pgoff, flags);
if (offset_in_page(addr))
return addr;
-----------
//allocate a VMA for this address range
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
{
......
vma = vm_area_alloc(mm);
......
	//record the mapping's range and attributes in the VMA
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
......
```
![](https://i.imgur.com/zSDvbaJ.png)
The two LOAD segments map to two separate VMAs, which can be verified from user space as shown below.
![](https://i.imgur.com/SpmLkbK.png)
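Every line of /proc/\<pid\>/maps is one VMA, so a binary with two LOAD segments shows up as (at least) two mappings of the same file with different permissions (r-x and rw-). A minimal, illustrative user-space check:
```clike=
/* Print the VMAs that map this program's own executable.
 * Build: gcc -o showmaps showmaps.c */
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char exe[4096] = {0}, line[4096];
	FILE *f;

	/* path of our own ELF file */
	if (readlink("/proc/self/exe", exe, sizeof(exe) - 1) < 0)
		return 1;

	/* each line of /proc/self/maps describes one VMA */
	f = fopen("/proc/self/maps", "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (strstr(line, exe))   /* keep only mappings of the ELF file */
			fputs(line, stdout);
	fclose(f);
	return 0;
}
```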
**start_thread**
Sets sepc to elf_entry (the program's entry point); when the kernel returns to user space, execution starts there. If the binary has an interpreter (the dynamic loader), execution starts at the interpreter's entry point instead.
```clike=
///arch/riscv/kernel/process.c
void start_thread(struct pt_regs *regs, unsigned long pc,
unsigned long sp)
{
regs->sstatus = SR_SPIE;
.....
regs->sepc = pc;
regs->sp = sp;
set_fs(USER_DS);
}
```
At this point only the init program's VMAs have been created, i.e. the mapping between virtual addresses and each section; no physical memory is backing them yet. When the CPU starts executing at the program's elf_entry, a page fault occurs, and only then is the mapping to physical pages established (demand paging; a user-space demo follows).
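The same demand-paging behavior is easy to observe from user space: an anonymous mmap() only reserves virtual address space, and RSS grows only when the pages are first touched. A minimal Linux-specific demo (it reads /proc/self/statm):
```clike=
/* Demand paging demo. Build: gcc -o demand demand.c */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

/* resident set size of this process, in pages (field 2 of /proc/self/statm) */
static long rss_pages(void)
{
	long size = 0, resident = 0;
	FILE *f = fopen("/proc/self/statm", "r");

	if (f) {
		if (fscanf(f, "%ld %ld", &size, &resident) != 2)
			resident = -1;
		fclose(f);
	}
	return resident;
}

int main(void)
{
	size_t len = 64 * 1024 * 1024;      /* 64 MB of virtual address space */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	printf("after mmap : rss = %ld pages\n", rss_pages());
	memset(p, 0xa5, len);               /* first touch -> page faults allocate pages */
	printf("after touch: rss = %ld pages\n", rss_pages());

	munmap(p, len);
	return 0;
}
```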
Call flow:
```clike=
#0 start_thread (regs=0xffffffe03aa47ee0, pc=91625975520, sp=274873105968) at arch/riscv/kernel/process.c:85
#1 0xffffffe000b4d26c in load_elf_binary (bprm=0xffffffe038d59b00) at fs/binfmt_elf.c:1171
#2 0xffffffe000b0dfa0 in search_binary_handler (bprm=0xffffffe03aa47ee0) at fs/exec.c:1653
#3 0xffffffe000b0e7d8 in exec_binprm (bprm=<optimized out>) at fs/exec.c:1695
#4  do_execveat_common (fd=<optimized out>, filename=0x1555556ee0, argv=..., envp=..., flags=<optimized out>) at fs/exec.c:1817
#5  0xffffffe000b0e94c in do_execve (filename=<optimized out>, __argv=<optimized out>, __envp=<optimized out>) at fs/exec.c:1862
#6 0xffffffe000a59028 in run_init_process (init_filename=<optimized out>) at init/main.c:1004
#7 0xffffffe000e21248 in kernel_init (unused=<optimized out>) at init/main.c:1074
#8 0xffffffe000a59f22 in handle_exception () at arch/riscv/kernel/entry.S:230
```
```clike=
1706 /*
1707 * sys_execve() executes a new program.
1708 */
1709 static int do_execveat_common(int fd, struct filename *filename,
1710 struct user_arg_ptr argv,
1711 struct user_arg_ptr envp,
1712 int flags)
1713 {
```
## fork
In Linux, processes and threads (kernel threads included) are all created through the fork, vfork, and clone system calls, and all of them end up in _do_fork
* fork uses SIGCHLD: when the child terminates, SIGCHLD is sent to notify the parent
* fork builds a copy of the parent for the child. To save resources it relies on COW (copy-on-write): only the parent's page tables are copied, not the page contents; when one side later writes to a shared page, the copy-on-write fault makes a private copy for it (a demo follows this list)
* vfork uses CLONE_VFORK (the parent is suspended until the child releases the virtual-memory resources, i.e. calls execve or exits) and CLONE_VM (parent and child share the address space)
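A minimal user-space illustration of the COW behavior just described: right after fork() the child sees the parent's data at the same virtual address; its write faults, the kernel gives it a private copy of that page, and the parent's value stays untouched.
```clike=
/* fork() + COW demo. Build: gcc -o cow cow.c */
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int value = 42;
	pid_t pid = fork();

	if (pid < 0) {
		perror("fork");
		return 1;
	}
	if (pid == 0) {                 /* child: initially shares the parent's pages */
		value = 100;            /* write -> COW fault -> child gets a private page */
		printf("child : value = %d at %p\n", value, (void *)&value);
		exit(0);
	}
	waitpid(pid, NULL, 0);
	printf("parent: value = %d at %p (unchanged)\n", value, (void *)&value);
	return 0;
}
```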
```clike=
//kernel thread
2444 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
2445 {
2446 struct kernel_clone_args args = {
2447 .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
2448 .exit_signal = (flags & CSIGNAL),
2449 .stack = (unsigned long)fn,
2450 .stack_size = (unsigned long)arg,
2451 };
2452
2453 return _do_fork(&args);
-------------
//fork
2457 SYSCALL_DEFINE0(fork)
2458 {
2459 #ifdef CONFIG_MMU
2460 struct kernel_clone_args args = {
2461 .exit_signal = SIGCHLD,
2462 };
2463
2464 return _do_fork(&args);
---------------
//vfork
2473 SYSCALL_DEFINE0(vfork)
2474 {
2475 struct kernel_clone_args args = {
2476 .flags = CLONE_VFORK | CLONE_VM,
2477 .exit_signal = SIGCHLD,
2478 };
2479
2480 return _do_fork(&args);
---------------
//clone
2502 SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2503 int __user *, parent_tidptr,
2504 int __user *, child_tidptr,
2505 unsigned long, tls)
....
2508 struct kernel_clone_args args = {
2509 .flags = (clone_flags & ~CSIGNAL),
2510 .pidfd = parent_tidptr,
2511 		.child_tid = child_tidptr, //user pointer where the child's TID is written (CLONE_CHILD_SETTID)
2512 		.parent_tid = parent_tidptr,//user pointer where the parent is told the child's TID (CLONE_PARENT_SETTID)
2513 .exit_signal = (clone_flags & CSIGNAL),
2514 .stack = newsp,
2515 .tls = tls,
2516 };
2517
......
2521 return _do_fork(&args);
```
Note: newer kernels rework RISC-V's get_current() to read tp through a global register variable, riscv_current_is_tp:
```clike=
22 /*
23 * This only works because "struct thread_info" is at offset 0 from "struct
24 * task_struct". This constraint seems to be necessary on other architectures
25 * as well, but __switch_to enforces it. We can't check TASK_TI here because
26 * <asm/asm-offsets.h> includes this, and I can't get the definition of "struct
27 * task_struct" here due to some header ordering problems.
28 */
29 static __always_inline struct task_struct *get_current(void)
30 {
31 return riscv_current_is_tp;
32 }
33
34 #define current get_current()
```
Later kernels also renamed _do_fork() to kernel_clone():
```
commit cad6967ac10843a70842cd39c7b53412901dd21f
Author: Christian Brauner <brauner@kernel.org>
Date: Wed Aug 19 12:46:45 2020 +0200
fork: introduce kernel_clone()
The old _do_fork() helper doesn't follow naming conventions of in-kernel
helpers for syscalls. The process creation cleanup in [1] didn't change the
name to something more reasonable mainly because _do_fork() was used in quite a
few places. So sending this as a separate series seemed the better strategy.
This commit does two things:
1. renames _do_fork() to kernel_clone() but keeps _do_fork() as a simple static
inline wrapper around kernel_clone().
2. Changes the return type from long to pid_t. This aligns kernel_thread() and
kernel_clone(). Also, the return value from kernel_clone that is surfaced in
fork(), vfork(), clone(), and clone3() is taken from pid_vnr() which returns
a pid_t too.
```
```clike=
//include/uapi/linux/sched.h
//parent and child share the address space
#define CLONE_VM	0x00000100	/* set if VM shared between processes */
//parent and child share filesystem info (root, cwd, umask)
#define CLONE_FS	0x00000200	/* set if fs info shared between processes */
//parent and child share the open file descriptor table
#define CLONE_FILES	0x00000400	/* set if open files shared between processes */
//parent and child share signal handlers and blocked signals
#define CLONE_SIGHAND	0x00000800	/* set if signal handlers and blocked signals shared */
//if the parent is being traced, the child is traced too
#define CLONE_PTRACE	0x00002000	/* set if we want to let tracing continue on the child too */
//the parent is suspended until the child releases its mm (execve or exit)
#define CLONE_VFORK	0x00004000	/* set if the parent wants the child to wake it up on mm_release */
```
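For illustration, these flags can be combined from user space with the raw clone(2) wrapper to build a thread-like task by hand (essentially what pthread libraries do, plus CLONE_THREAD/CLONE_SETTLS and more careful stack handling). A hedged sketch, with illustrative names:
```clike=
/* CLONE_VM makes the child run in the same address space, so its write to
 * shared_value is visible to the parent (unlike plain fork()).
 * Build: gcc -o rawclone rawclone.c */
#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

static int shared_value = 0;

static int child_fn(void *arg)
{
	shared_value = 42;      /* same address space as the parent */
	return 0;
}

int main(void)
{
	const size_t stack_size = 1024 * 1024;
	char *stack = malloc(stack_size);
	if (!stack)
		return 1;

	/* thread-like flags: share VM, fs info, fd table and signal handlers */
	int flags = CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | SIGCHLD;
	pid_t pid = clone(child_fn, stack + stack_size, flags, NULL);  /* stack grows down */
	if (pid < 0) {
		perror("clone");
		return 1;
	}
	waitpid(pid, NULL, 0);
	printf("shared_value = %d\n", shared_value);   /* prints 42 */
	free(stack);
	return 0;
}
```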
### do_fork
```clike=
2339 long _do_fork(struct kernel_clone_args *args)
2340 {
......
	struct task_struct *p;
2366 p = copy_process(NULL, trace, NUMA_NO_NODE, args);
2367 add_latent_entropy();
----------
//copies the parent's memory, file system, signal, and I/O state
1764 static __latent_entropy struct task_struct *copy_process(
1765 struct pid *pid,
1766 int trace,
1767 int node,
1768 struct kernel_clone_args *args)
1769 {
...
1850 retval = -ENOMEM;
//duplicate the parent's task_struct into the child's task_struct p
1851 p = dup_task_struct(current, node);
1852 if (!p)
...
1981 /* Perform scheduler related setup. Assign this task to a CPU. */
1982 retval = sched_fork(clone_flags, p);
1983 if (retval)
...
2000 	retval = copy_files(clone_flags, p);
..
2003 	retval = copy_fs(clone_flags, p);
.
2009 	retval = copy_signal(clone_flags, p);
.
2012 	retval = copy_mm(clone_flags, p);//copy the parent's mm
2013 	if (retval)
...
2015 	retval = copy_namespaces(clone_flags, p);
...
2018 	retval = copy_io(clone_flags, p);
---------
2839 int sched_fork(unsigned long clone_flags, struct task_struct *p)
2840 {
2843 __sched_fork(clone_flags, p);
2844 /*
2845 * We mark the process as NEW here. This guarantees that
2846 * nobody will actually run it, and a signal or other external
2847 * event cannot wake it up and insert it on the runqueue either.
2848 */
2849 p->state = TASK_NEW;
....
2897 * We're setting the CPU for the first time, we don't migrate,
2898 * so use __set_task_cpu().
2899 	 */ //assign the task to a CPU
	__set_task_cpu(p, smp_processor_id());
-------------------
//copy_mm->dup_mm()
1346 static struct mm_struct *dup_mm(struct task_struct *tsk,
1347 struct mm_struct *oldmm)
1348 {
.....
1352 mm = allocate_mm();
.....
1356 	memcpy(mm, oldmm, sizeof(*mm));
1358 	if (!mm_init(mm, tsk, mm->user_ns))//calls mm_alloc_pgd(), so the child also gets the kernel page-table mappings
.....
1361 	err = dup_mmap(mm, oldmm);//oldmm is the parent's mm
------------------
// walks every VMA of the parent and copies the PTE entries backing each parent VMA into the corresponding child VMA
478 static __latent_entropy int dup_mmap(struct mm_struct *mm,
479 struct mm_struct *oldmm)
480 {
.....
492 flush_cache_dup_mm(oldmm);
493 uprobe_dup_mmap(oldmm, mm);
601 retval = copy_page_range(mm, oldmm, mpnt);
602
-----------------
937 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
938 struct vm_area_struct *vma)
939 {
.....
986 dst_pgd = pgd_offset(dst_mm, addr);
987 src_pgd = pgd_offset(src_mm, addr);
988 do {
989 next = pgd_addr_end(addr, end);
990 if (pgd_none_or_clear_bad(src_pgd))
991 continue;
992 if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
993 vma, addr, next))) {
994 ret = -ENOMEM;
995 break;
996 }
997 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
998
999 if (is_cow)
1000 mmu_notifier_invalidate_range_end(&range);
-----------------
//copy_p4d_range->copy_pud_range->copy_pmd_range->copy_pte_range->copy_one_pte
671 /*
672 * copy one vm_area from one task to the other. Assumes the page tables
673 * already present in the new task to be cleared in the whole range
674 * covered by this vma.
675 */
676
677 static inline unsigned long
678 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
679 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
680 unsigned long addr, int *rss)
681 {
...
780 		set_pte_at(dst_mm, addr, dst_pte, pte);//install the copied pte into dst_pte
----------------
static inline void set_pte(pte_t *ptep, pte_t pteval)
{
*ptep = pteval;
}
static inline void set_pte_at(struct mm_struct *mm,
unsigned long addr, pte_t *ptep, pte_t pteval)
{
if (pte_present(pteval) && pte_exec(pteval))
flush_icache_pte(pteval);
set_pte(ptep, pteval);
}
```
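The key COW step happens while those PTEs are copied: for a private, writable mapping both the parent's and the child's PTE are write-protected, so the first write from either side takes a fault and do_wp_page() hands that side its own copy of the page. Conceptually (a paraphrased sketch, not the exact kernel lines elided above):
```clike=
/* Conceptual sketch of how COW is armed while copying one PTE. */
if (is_cow_mapping(vma->vm_flags) && pte_write(pte)) {
	ptep_set_wrprotect(src_mm, addr, src_pte);  /* parent's PTE becomes read-only */
	pte = pte_wrprotect(pte);                   /* child's PTE is read-only too   */
}
/* ... then set_pte_at(dst_mm, addr, dst_pte, pte) as shown above */
```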
Once copy_process() completes, the child has been fully constructed; what remains in _do_fork() is sketched below.
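After copy_process() returns, _do_fork() only has to publish the pid, put the new task on a runqueue and, for vfork, wait for the child. A simplified paraphrase (error handling and ptrace hooks omitted; details vary by kernel version):
```clike=
/* Paraphrased sketch of the tail of _do_fork() in kernel/fork.c. */
p = copy_process(NULL, trace, NUMA_NO_NODE, args);
pid = get_task_pid(p, PIDTYPE_PID);
nr = pid_vnr(pid);                       /* pid value returned to the caller */

if (clone_flags & CLONE_VFORK) {
	p->vfork_done = &vfork;          /* completion the child signals on mm_release */
	init_completion(&vfork);
	get_task_struct(p);
}

wake_up_new_task(p);                     /* enqueue the child; the scheduler may run it now */

if (clone_flags & CLONE_VFORK)
	wait_for_vfork_done(p, &vfork);  /* vfork(): parent sleeps until child exec/exit */

put_pid(pid);
return nr;
```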
### 3.1 Questions
How does the kernel get the current process's task_struct?
When are a user-space process's page tables populated?
What are the differences between fork, vfork, and clone?
**Ref:**
[Linux process address space (3): memory mapping (1), mmap and do_mmap](https://blog.csdn.net/u010246947/article/details/10472587?utm_medium=distribute.pc_relevant.none-task-blog-title-2&spm=1001.2101.3001.4242)
[how-the-kernel-manages-your-memory](https://manybutfinite.com/post/how-the-kernel-manages-your-memory/)
[Getting to know the Copy-On-Write mechanism](https://www.itread01.com/content/1544029393.html)