###### tags: `Linux` `Signal` # Signal Handling in Linux * 好文: * http://courses.cms.caltech.edu/cs124/lectures-wi2016/CS124Lec15.pdf * 其他篇筆記: * https://hackmd.io/@LJP/BJAb95O1c * 在 `task_struct` 有兩個 member: ```c /* Signal handlers: */ struct signal_struct *signal; struct sighand_struct __rcu *sighand; ``` * `signal` 用於存放此 task 有哪些 signal 要處理 * 若已經有同一種 signal 正在 pending, 則 signal 會被忽略 * 資料結構是 linked list * `sighand` 用於紀錄 signal handler * signal handler 可以是在 kernel 也可以是在 user mode 的 * 資料結構可以想成是 function pointer array * 處理 signal 的時間點是當 process 準備回到 user mode 時 * 只有會影響到 process state 的 signal 會立即處理 e.g. SIGKILL * 當 signal handler 位於 user mode 時, 會將 cpu context 存放至 user stack, 並設定 user signal handler 的 return address 呼叫 syscall sigreturn, 此 syscall 再從 user stack 中拿出原本的 cpu context, 以便回到發生 signal 前的狀態 # exit_to_user_mode * https://elixir.bootlin.com/linux/v5.17.1/source/arch/arm64/kernel/entry-common.c#L137 至於所謂的**處理 signal 的時間點是當 process 準備回到 user mode 時**可以從 `exit_to_user_mode()` 開始看 ```c static __always_inline void exit_to_user_mode(struct pt_regs *regs) { prepare_exit_to_user_mode(regs); mte_check_tfsr_exit(); __exit_to_user_mode(); } ``` * `prepare_exit_to_user_mode`: ```c static __always_inline void prepare_exit_to_user_mode(struct pt_regs *regs) { unsigned long flags; local_daif_mask(); flags = read_thread_flags(); if (unlikely(flags & _TIF_WORK_MASK)) do_notify_resume(regs, flags); } ``` * 當有任何工作還要做的時候, 對應 `_TIF_WORK_MASK` 的 flags 不為 0, 執行 `do_notify_resume` # do_notify_resume * https://elixir.bootlin.com/linux/v5.17.1/source/arch/arm64/kernel/signal.c#L920 ```c void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) { do { if (thread_flags & _TIF_NEED_RESCHED) { /* Unmask Debug and SError for the next task */ local_daif_restore(DAIF_PROCCTX_NOIRQ); schedule(); } else { local_daif_restore(DAIF_PROCCTX); if (thread_flags & _TIF_UPROBE) uprobe_notify_resume(regs); if (thread_flags & _TIF_MTE_ASYNC_FAULT) { clear_thread_flag(TIF_MTE_ASYNC_FAULT); send_sig_fault(SIGSEGV, SEGV_MTEAERR, (void __user *)NULL, current); } if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs); if (thread_flags & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); if (thread_flags & _TIF_FOREIGN_FPSTATE) fpsimd_restore_current_state(); } local_daif_mask(); thread_flags = read_thread_flags(); } while (thread_flags & _TIF_WORK_MASK); } ``` * 這邊可以看到外層一個迴圈, 表示工作處理完後, 還有可能有其他工作 * 若在這邊 schedule 到其他 process, 之後再 schedule 回來後, 終究還是會在迴圈裡確認工作是否都完成 * 這邊關注 signal 怎麼被處理的, 可以看到 flags `_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL` 如果開啟, 則會執行 `do_signal` * TODO: 如何處理多個 signal 的狀況? # do_signal * https://elixir.bootlin.com/linux/v5.17.1/source/arch/arm64/kernel/signal.c#L849 ```c /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. * * Note that we go through the signals twice: once to check the signals that * the kernel can handle, and then we build all the user-level signal handling * stack-frames in one go after that. */ static void do_signal(struct pt_regs *regs) { unsigned long continue_addr = 0, restart_addr = 0; int retval = 0; struct ksignal ksig; bool syscall = in_syscall(regs); /* * If we were from a system call, check for system call restarting... */ if (syscall) { continue_addr = regs->pc; restart_addr = continue_addr - (compat_thumb_mode(regs) ? 2 : 4); retval = regs->regs[0]; /* * Avoid additional syscall restarting via ret_to_user. */ forget_syscall(regs); /* * Prepare for system call restart. We do this here so that a * debugger will see the already changed PC. */ switch (retval) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: case -ERESTART_RESTARTBLOCK: regs->regs[0] = regs->orig_x0; regs->pc = restart_addr; break; } } /* * Get the signal to deliver. When running under ptrace, at this point * the debugger may change all of our registers. */ if (get_signal(&ksig)) { /* * Depending on the signal settings, we may need to revert the * decision to restart the system call, but skip this if a * debugger has chosen to restart at a different PC. */ if (regs->pc == restart_addr && (retval == -ERESTARTNOHAND || retval == -ERESTART_RESTARTBLOCK || (retval == -ERESTARTSYS && !(ksig.ka.sa.sa_flags & SA_RESTART)))) { syscall_set_return_value(current, regs, -EINTR, 0); regs->pc = continue_addr; } handle_signal(&ksig, regs); return; } /* * Handle restarting a different system call. As above, if a debugger * has chosen to restart at a different PC, ignore the restart. */ if (syscall && regs->pc == restart_addr) { if (retval == -ERESTART_RESTARTBLOCK) setup_restart_syscall(regs); user_rewind_single_step(current); } restore_saved_sigmask(); } ``` * 如果要處理 signal 時的 process 是在 syscall 中, 則有不同的處理方式 * `get_signal` 爬出下一個要處理的 signal * 呼叫 `handle_signal` # handle_signal * https://elixir.bootlin.com/linux/v5.17.1/source/arch/arm64/kernel/signal.c#L849 ```c /* * OK, we're invoking a handler */ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { sigset_t *oldset = sigmask_to_save(); int usig = ksig->sig; int ret; rseq_signal_deliver(ksig, regs); /* * Set up the stack frame */ if (is_compat_task()) { if (ksig->ka.sa.sa_flags & SA_SIGINFO) ret = compat_setup_rt_frame(usig, ksig, oldset, regs); else ret = compat_setup_frame(usig, ksig, oldset, regs); } else { ret = setup_rt_frame(usig, ksig, oldset, regs); } /* * Check that the resulting registers are actually sane. */ ret |= !valid_user_regs(&regs->user_regs, current); /* Step into the signal handler if we are stepping */ signal_setup_done(ret, ksig, test_thread_flag(TIF_SINGLESTEP)); } ``` * `setup_rt_frame` 做了主要的設定 # setup_rt_frame ```c static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct rt_sigframe_user_layout user; struct rt_sigframe __user *frame; int err = 0; fpsimd_signal_preserve_current_state(); if (get_sigframe(&user, ksig, regs)) return 1; frame = user.sigframe; __put_user_error(0, &frame->uc.uc_flags, err); __put_user_error(NULL, &frame->uc.uc_link, err); err |= __save_altstack(&frame->uc.uc_stack, regs->sp); err |= setup_sigframe(&user, regs, set); if (err == 0) { setup_return(regs, &ksig->ka, &user, usig); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { err |= copy_siginfo_to_user(&frame->info, &ksig->info); regs->regs[1] = (unsigned long)&frame->info; regs->regs[2] = (unsigned long)&frame->uc; } } return err; } ``` * 設定 user stack frame, 將 cpu context 存放上去, 設定返回位址為 sigreturn * `setup_return` 調整了待會回到 user mode 時的 user sp, 並將 user pc 設定為 user mode signal handler 位址 * 根據 signal handler flags 來決定傳參數給 signal handler 是傳 1 個還是 3 個 * 至此都設定完成, 等待返回到最上層函數後, return to user mode 就會處理 signal # syscall kill * https://elixir.bootlin.com/linux/v5.17.5/source/kernel/signal.c#L3773 * 前面關注的是在什麼時機點處理 signal * 接下來關注看看從送 signal 的角度, 發生什麼事 ```c /** * sys_kill - send a signal to a process * @pid: the PID of the process * @sig: signal to be sent */ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct kernel_siginfo info; prepare_kill_siginfo(sig, &info); return kill_something_info(sig, &info, pid); } ``` # kill_something_info * https://elixir.bootlin.com/linux/v5.17.5/source/kernel/signal.c#L1588 ```c /* * kill_something_info() interprets pid in interesting ways just like kill(2). * * POSIX specifies that kill(-1,sig) is unspecified, but what we have * is probably wrong. Should make it like BSD or SYSV. */ static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid) { int ret; if (pid > 0) return kill_proc_info(sig, info, pid); /* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */ if (pid == INT_MIN) return -ESRCH; read_lock(&tasklist_lock); if (pid != -1) { ret = __kill_pgrp_info(sig, info, pid ? find_vpid(-pid) : task_pgrp(current)); } else { int retval = 0, count = 0; struct task_struct * p; for_each_process(p) { if (task_pid_vnr(p) > 1 && !same_thread_group(p, current)) { int err = group_send_sig_info(sig, info, p, PIDTYPE_MAX); ++count; if (err != -EPERM) retval = err; } } ret = count ? retval : -ESRCH; } read_unlock(&tasklist_lock); return ret; } ``` * 根據 pid 不同, 對象 process 不同 * 這邊先假設 pid 大於 0, 就是送 signal 給 pid process # kill_proc_info * https://elixir.bootlin.com/linux/v5.17.5/source/kernel/signal.c#L1492 ```c static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid) { int error; rcu_read_lock(); error = kill_pid_info(sig, info, find_vpid(pid)); rcu_read_unlock(); return error; } ``` ```c int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) { int error = -ESRCH; struct task_struct *p; for (;;) { rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) error = group_send_sig_info(sig, info, p, PIDTYPE_TGID); rcu_read_unlock(); if (likely(!p || error != -ESRCH)) return error; /* * The task was unhashed in between, try again. If it * is dead, pid_task() will return NULL, if we race with * de_thread() it will find the new leader. */ } } ``` ```c /* * send signal info to all the members of a group */ int group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type) { int ret; rcu_read_lock(); ret = check_kill_permission(sig, info, p); rcu_read_unlock(); if (!ret && sig) ret = do_send_sig_info(sig, info, p, type); return ret; } ``` ```c /* * Bad permissions for sending the signal * - the caller must hold the RCU read lock */ static int check_kill_permission(int sig, struct kernel_siginfo *info, struct task_struct *t) { struct pid *sid; int error; if (!valid_signal(sig)) return -EINVAL; if (!si_fromuser(info)) return 0; error = audit_signal_info(sig, t); /* Let audit system see the signal */ if (error) return error; if (!same_thread_group(current, t) && !kill_ok_by_cred(t)) { switch (sig) { case SIGCONT: sid = task_session(t); /* * We don't return the error if sid == NULL. The * task was unhashed, the caller must notice this. */ if (!sid || sid == task_session(current)) break; fallthrough; default: return -EPERM; } } return security_task_kill(t, info, sig, NULL); } ``` * `valid_signal()` 檢查 signal number 有無超出範圍 ```c int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type) { unsigned long flags; int ret = -ESRCH; if (lock_task_sighand(p, &flags)) { ret = send_signal(sig, info, p, type); unlock_task_sighand(p, &flags); } return ret; } ``` ```c static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, enum pid_type type) { /* Should SIGKILL or SIGSTOP be received by a pid namespace init? */ bool force = false; if (info == SEND_SIG_NOINFO) { /* Force if sent from an ancestor pid namespace */ force = !task_pid_nr_ns(current, task_active_pid_ns(t)); } else if (info == SEND_SIG_PRIV) { /* Don't ignore kernel generated signals */ force = true; } else if (has_si_pid_and_uid(info)) { /* SIGKILL and SIGSTOP is special or has ids */ struct user_namespace *t_user_ns; rcu_read_lock(); t_user_ns = task_cred_xxx(t, user_ns); if (current_user_ns() != t_user_ns) { kuid_t uid = make_kuid(current_user_ns(), info->si_uid); info->si_uid = from_kuid_munged(t_user_ns, uid); } rcu_read_unlock(); /* A kernel generated signal? */ force = (info->si_code == SI_KERNEL); /* From an ancestor pid namespace? */ if (!task_pid_nr_ns(current, task_active_pid_ns(t))) { info->si_pid = 0; force = true; } } return __send_signal(sig, info, t, type, force); } ``` ```c static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, enum pid_type type, bool force) { struct sigpending *pending; struct sigqueue *q; int override_rlimit; int ret = 0, result; assert_spin_locked(&t->sighand->siglock); result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, force)) goto ret; pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; /* * Short-circuit ignored signals and support queuing * exactly one non-rt signal, so that we can get more * detailed information about the cause of the signal. */ result = TRACE_SIGNAL_ALREADY_PENDING; if (legacy_queue(pending, sig)) goto ret; result = TRACE_SIGNAL_DELIVERED; /* * Skip useless siginfo allocation for SIGKILL and kernel threads. */ if ((sig == SIGKILL) || (t->flags & PF_KTHREAD)) goto out_set; /* * Real-time signals must be queued if sent by sigqueue, or * some other real-time mechanism. It is implementation * defined whether kill() does so. We attempt to do so, on * the principle of least surprise, but since kill is not * allowed to fail with EAGAIN when low on memory we just * make sure at least one signal gets delivered and don't * pass on the info struct. */ if (sig < SIGRTMIN) override_rlimit = (is_si_special(info) || info->si_code >= 0); else override_rlimit = 0; q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit, 0); if (q) { list_add_tail(&q->list, &pending->list); switch ((unsigned long) info) { case (unsigned long) SEND_SIG_NOINFO: clear_siginfo(&q->info); q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; q->info.si_pid = task_tgid_nr_ns(current, task_active_pid_ns(t)); rcu_read_lock(); q->info.si_uid = from_kuid_munged(task_cred_xxx(t, user_ns), current_uid()); rcu_read_unlock(); break; case (unsigned long) SEND_SIG_PRIV: clear_siginfo(&q->info); q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_KERNEL; q->info.si_pid = 0; q->info.si_uid = 0; break; default: copy_siginfo(&q->info, info); break; } } else if (!is_si_special(info) && sig >= SIGRTMIN && info->si_code != SI_USER) { /* * Queue overflow, abort. We may abort if the * signal was rt and sent by user using something * other than kill(). */ result = TRACE_SIGNAL_OVERFLOW_FAIL; ret = -EAGAIN; goto ret; } else { /* * This is a silent loss of information. We still * send the signal, but the *info bits are lost. */ result = TRACE_SIGNAL_LOSE_INFO; } out_set: signalfd_notify(t, sig); sigaddset(&pending->signal, sig); /* Let multiprocess signals appear after on-going forks */ if (type > PIDTYPE_TGID) { struct multiprocess_signals *delayed; hlist_for_each_entry(delayed, &t->signal->multiprocess, node) { sigset_t *signal = &delayed->signal; /* Can't queue both a stop and a continue signal */ if (sig == SIGCONT) sigdelsetmask(signal, SIG_KERNEL_STOP_MASK); else if (sig_kernel_stop(sig)) sigdelset(signal, SIGCONT); sigaddset(signal, sig); } } complete_signal(sig, t, type); ret: trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result); return ret; } ```