Try   HackMD

2022q1 Quiz8 (C)

contributed by < cwl0429 >

這次測驗的目標是利用 lkm 來變更特定 Linux 行程的內部狀態,需要瞭解 workqueue, signal 等機制的原理及使用方法,以下內容皆以 Linux v5.13 為基礎

先備知識

workqueue

workqueue 是 linux 內用來處理非同步行程 execution context 的機制







queue


cluster_p

workqueue


cluster_worker_pool

workerpool



work1

work



worker_cpu1

worker



work1->worker_cpu1





work2

work



work4

...



work5

work



work6

work



worker_cpu2

worker



worker_cpu3

...



worker_cpu4

worker



worker_cpu5

worker



  • workqueue 中存放多個 work items
  • worker_pool 中存放可用的 workers,而 worker 其實就是 thread 的別名

work items

在 workqueue 中,work item 是一種簡易的結構,定義在 workqueue.h

struct work_struct {
	atomic_long_t data;
	struct list_head entry;
	work_func_t func;
};

可以觀察到 work_struct 包含一個指標 func,其指到欲執行的 function,能讓 worker 在處理 work 時能藉此找到要完成的工作

workers

worker 是 workqueue 中執行 work item 的個體,struct worker 定義在 workqueue_internal.h

struct worker {
	/* on idle list while idle, on busy hash table while busy */
	union {
		struct list_head	entry;	/* L: while idle */
		struct hlist_node	hentry;	/* L: while busy */
	};
	struct work_struct	*current_work;	/* L: work being processed */
	work_func_t		current_func;	/* L: current_work's fn */
	struct pool_workqueue	*current_pwq; /* L: current_work's pwq */
	struct list_head	scheduled;	/* L: scheduled works */
	/* 64 bytes boundary on 64bit, 32 on 32bit */
	struct task_struct	*task;		/* I: worker task */
	struct worker_pool	*pool;		/* A: the associated pool */
						/* L: for rescuers */
	struct list_head	node;		/* A: anchored at pool->workers */
						/* A: runs through worker->node */
	unsigned long		last_active;	/* L: last active timestamp */
	unsigned int		flags;		/* X: flags */
	int			id;		/* I: worker id */
	/*
	 * Opaque string set with work_set_desc().  Printed out with task
	 * dump for debugging - WARN, BUG, panic or sysrq.
	 */
	char			desc[WORKER_DESC_LEN];
	/* used only by rescuers to point to the target workqueue */
	struct workqueue_struct	*rescue_wq;	/* I: the workqueue to rescue */
	/* used by the scheduler to determine a worker's last known identity */
	work_func_t		last_func;
};

struct worker 內有多個 struct 記錄執行工作的狀態或內容,像是

  • *current_work 指向當前 work
  • *pool 指向此 worker 所屬的 worker_pool
  • *task 則紀錄當前 process 的狀態

其中 tast_struct 是本次測驗的重點

task_struct

在核心中,行程或執行緒都可歸類於 task,後者對應到一個 task_structtask_struct 可視為 process descriptor 或是恐龍書提到的 PCB,所有關於 task 的資訊皆被存放於此。
舉例來說,task_struct 有成員 __statetsk->__state 能記錄此 task 的狀態為何,可能是 TASK_RUNNING, TASK_INTERRUPTIBLETASK_UNINTERRUPTIBLE 等,state machine 變化如下圖所示







%0



n1

TASK_RUNNING
 (ready but not runnung)



n2

TASK_RUNNING
 (running)



n1->n2


Scheduler dispatches task to run:
schedule() calls context_switch().



n2->n1


Task is preempted
by higher priority task.



n5

Task is terminated.



n2->n5


Task exits via
do_exit.



n3

TASK_INTERRUPTIBLE
or
TASK_UNINTERRUPTIBLE
(waiting)



n2->n3


Task sleeps on wait queue
for a specific event.



n4

Existing task calls
fork() and creates
a new process.



n4->n1


Task forks.



n3->n1


Event occurs and task is woken up
and placed back on the run queue



其中 states of process 定義在 include/linux/sched.h

/* Used in tsk->state: */
#define TASK_RUNNING			0x0000
#define TASK_INTERRUPTIBLE		0x0001
#define TASK_UNINTERRUPTIBLE		0x0002
#define __TASK_STOPPED			0x0004
#define __TASK_TRACED			0x0008
  • TASK_RUNNING 代表 task 已經準備好執行或正在執行,也就是 task 已排入 run queue 內
  • TASK_INTERRUPTIBLETASK_UNINTERRUPTIBLE 皆代表 task 正在等待某些條件成立而處於休眠狀態,前者可被 signal 喚醒,後者不行,但二者都會定期地被 try_to_wake_up() 嘗試喚醒
  • __TASK_TRACED 代表有其他的 process 正在追蹤此 task,常用於 ptrace 和 ftrace 等 debugger
  • __TASK_STOPPED 代表 task 已停止執行且不再被排程,通常是收到 stop signal 或任何來自追蹤中程式的 signal

另外,還需特別提到 task_struct 內的兩名成員 ptracedptrace_entry

/*
* 'ptraced' is the list of tasks this task is using ptrace() on.
*
* This includes both natural children and PTRACE_ATTACH targets.
* 'ptrace_entry' is this task's link on the p->parent->ptraced list.
*/
struct list_head		ptraced;
struct list_head		ptrace_entry;

在 dont_trace lkm 中,便是利用上述二成員找出正在使用 ptrace() 的 tracer 及其 tracee

task_struct 完整程式碼
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
	/*
	 * For reasons of header soup (see current_thread_info()), this
	 * must be the first element of task_struct.
	 */
	struct thread_info		thread_info;
#endif
	unsigned int			__state;

#ifdef CONFIG_PREEMPT_RT
	/* saved state for "spinlock sleepers" */
	unsigned int			saved_state;
#endif

	/*
	 * This begins the randomizable portion of task_struct. Only
	 * scheduling-critical items should be added above here.
	 */
	randomized_struct_fields_start

	void				*stack;
	refcount_t			usage;
	/* Per task flags (PF_*), defined further below: */
	unsigned int			flags;
	unsigned int			ptrace;

#ifdef CONFIG_SMP
	int				on_cpu;
	struct __call_single_node	wake_entry;
	unsigned int			wakee_flips;
	unsigned long			wakee_flip_decay_ts;
	struct task_struct		*last_wakee;

	/*
	 * recent_used_cpu is initially set as the last CPU used by a task
	 * that wakes affine another task. Waker/wakee relationships can
	 * push tasks around a CPU where each wakeup moves to the next one.
	 * Tracking a recently used CPU allows a quick search for a recently
	 * used CPU that may be idle.
	 */
	int				recent_used_cpu;
	int				wake_cpu;
#endif
	int				on_rq;

	int				prio;
	int				static_prio;
	int				normal_prio;
	unsigned int			rt_priority;

	struct sched_entity		se;
	struct sched_rt_entity		rt;
	struct sched_dl_entity		dl;
	const struct sched_class	*sched_class;

#ifdef CONFIG_SCHED_CORE
	struct rb_node			core_node;
	unsigned long			core_cookie;
	unsigned int			core_occupation;
#endif

#ifdef CONFIG_CGROUP_SCHED
	struct task_group		*sched_task_group;
#endif

#ifdef CONFIG_UCLAMP_TASK
	/*
	 * Clamp values requested for a scheduling entity.
	 * Must be updated with task_rq_lock() held.
	 */
	struct uclamp_se		uclamp_req[UCLAMP_CNT];
	/*
	 * Effective clamp values used for a scheduling entity.
	 * Must be updated with task_rq_lock() held.
	 */
	struct uclamp_se		uclamp[UCLAMP_CNT];
#endif

	struct sched_statistics         stats;

#ifdef CONFIG_PREEMPT_NOTIFIERS
	/* List of struct preempt_notifier: */
	struct hlist_head		preempt_notifiers;
#endif

#ifdef CONFIG_BLK_DEV_IO_TRACE
	unsigned int			btrace_seq;
#endif

	unsigned int			policy;
	int				nr_cpus_allowed;
	const cpumask_t			*cpus_ptr;
	cpumask_t			*user_cpus_ptr;
	cpumask_t			cpus_mask;
	void				*migration_pending;
#ifdef CONFIG_SMP
	unsigned short			migration_disabled;
#endif
	unsigned short			migration_flags;

#ifdef CONFIG_PREEMPT_RCU
	int				rcu_read_lock_nesting;
	union rcu_special		rcu_read_unlock_special;
	struct list_head		rcu_node_entry;
	struct rcu_node			*rcu_blocked_node;
#endif /* #ifdef CONFIG_PREEMPT_RCU */

#ifdef CONFIG_TASKS_RCU
	unsigned long			rcu_tasks_nvcsw;
	u8				rcu_tasks_holdout;
	u8				rcu_tasks_idx;
	int				rcu_tasks_idle_cpu;
	struct list_head		rcu_tasks_holdout_list;
#endif /* #ifdef CONFIG_TASKS_RCU */

#ifdef CONFIG_TASKS_TRACE_RCU
	int				trc_reader_nesting;
	int				trc_ipi_to_cpu;
	union rcu_special		trc_reader_special;
	bool				trc_reader_checked;
	struct list_head		trc_holdout_list;
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */

	struct sched_info		sched_info;

	struct list_head		tasks;
#ifdef CONFIG_SMP
	struct plist_node		pushable_tasks;
	struct rb_node			pushable_dl_tasks;
#endif

	struct mm_struct		*mm;
	struct mm_struct		*active_mm;

	/* Per-thread vma caching: */
	struct vmacache			vmacache;

#ifdef SPLIT_RSS_COUNTING
	struct task_rss_stat		rss_stat;
#endif
	int				exit_state;
	int				exit_code;
	int				exit_signal;
	/* The signal sent when the parent dies: */
	int				pdeath_signal;
	/* JOBCTL_*, siglock protected: */
	unsigned long			jobctl;

	/* Used for emulating ABI behavior of previous Linux versions: */
	unsigned int			personality;

	/* Scheduler bits, serialized by scheduler locks: */
	unsigned			sched_reset_on_fork:1;
	unsigned			sched_contributes_to_load:1;
	unsigned			sched_migrated:1;
#ifdef CONFIG_PSI
	unsigned			sched_psi_wake_requeue:1;
#endif

	/* Force alignment to the next boundary: */
	unsigned			:0;

	/* Unserialized, strictly 'current' */

	/*
	 * This field must not be in the scheduler word above due to wakelist
	 * queueing no longer being serialized by p->on_cpu. However:
	 *
	 * p->XXX = X;			ttwu()
	 * schedule()			  if (p->on_rq && ..) // false
	 *   smp_mb__after_spinlock();	  if (smp_load_acquire(&p->on_cpu) && //true
	 *   deactivate_task()		      ttwu_queue_wakelist())
	 *     p->on_rq = 0;			p->sched_remote_wakeup = Y;
	 *
	 * guarantees all stores of 'current' are visible before
	 * ->sched_remote_wakeup gets used, so it can be in this word.
	 */
	unsigned			sched_remote_wakeup:1;

	/* Bit to tell LSMs we're in execve(): */
	unsigned			in_execve:1;
	unsigned			in_iowait:1;
#ifndef TIF_RESTORE_SIGMASK
	unsigned			restore_sigmask:1;
#endif
#ifdef CONFIG_MEMCG
	unsigned			in_user_fault:1;
#endif
#ifdef CONFIG_COMPAT_BRK
	unsigned			brk_randomized:1;
#endif
#ifdef CONFIG_CGROUPS
	/* disallow userland-initiated cgroup migration */
	unsigned			no_cgroup_migration:1;
	/* task is frozen/stopped (used by the cgroup freezer) */
	unsigned			frozen:1;
#endif
#ifdef CONFIG_BLK_CGROUP
	unsigned			use_memdelay:1;
#endif
#ifdef CONFIG_PSI
	/* Stalled due to lack of memory */
	unsigned			in_memstall:1;
#endif
#ifdef CONFIG_PAGE_OWNER
	/* Used by page_owner=on to detect recursion in page tracking. */
	unsigned			in_page_owner:1;
#endif
#ifdef CONFIG_EVENTFD
	/* Recursion prevention for eventfd_signal() */
	unsigned			in_eventfd_signal:1;
#endif
#ifdef CONFIG_IOMMU_SVA
	unsigned			pasid_activated:1;
#endif

	unsigned long			atomic_flags; /* Flags requiring atomic access. */

	struct restart_block		restart_block;

	pid_t				pid;
	pid_t				tgid;

#ifdef CONFIG_STACKPROTECTOR
	/* Canary value for the -fstack-protector GCC feature: */
	unsigned long			stack_canary;
#endif
	/*
	 * Pointers to the (original) parent process, youngest child, younger sibling,
	 * older sibling, respectively.  (p->father can be replaced with
	 * p->real_parent->pid)
	 */

	/* Real parent process: */
	struct task_struct __rcu	*real_parent;

	/* Recipient of SIGCHLD, wait4() reports: */
	struct task_struct __rcu	*parent;

	/*
	 * Children/sibling form the list of natural children:
	 */
	struct list_head		children;
	struct list_head		sibling;
	struct task_struct		*group_leader;

	/*
	 * 'ptraced' is the list of tasks this task is using ptrace() on.
	 *
	 * This includes both natural children and PTRACE_ATTACH targets.
	 * 'ptrace_entry' is this task's link on the p->parent->ptraced list.
	 */
	struct list_head		ptraced;
	struct list_head		ptrace_entry;

	/* PID/PID hash table linkage. */
	struct pid			*thread_pid;
	struct hlist_node		pid_links[PIDTYPE_MAX];
	struct list_head		thread_group;
	struct list_head		thread_node;

	struct completion		*vfork_done;

	/* CLONE_CHILD_SETTID: */
	int __user			*set_child_tid;

	/* CLONE_CHILD_CLEARTID: */
	int __user			*clear_child_tid;

	/* PF_KTHREAD | PF_IO_WORKER */
	void				*worker_private;

	u64				utime;
	u64				stime;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
	u64				utimescaled;
	u64				stimescaled;
#endif
	u64				gtime;
	struct prev_cputime		prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
	struct vtime			vtime;
#endif

#ifdef CONFIG_NO_HZ_FULL
	atomic_t			tick_dep_mask;
#endif
	/* Context switch counts: */
	unsigned long			nvcsw;
	unsigned long			nivcsw;

	/* Monotonic time in nsecs: */
	u64				start_time;

	/* Boot based time in nsecs: */
	u64				start_boottime;

	/* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
	unsigned long			min_flt;
	unsigned long			maj_flt;

	/* Empty if CONFIG_POSIX_CPUTIMERS=n */
	struct posix_cputimers		posix_cputimers;

#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
	struct posix_cputimers_work	posix_cputimers_work;
#endif

	/* Process credentials: */

	/* Tracer's credentials at attach: */
	const struct cred __rcu		*ptracer_cred;

	/* Objective and real subjective task credentials (COW): */
	const struct cred __rcu		*real_cred;

	/* Effective (overridable) subjective task credentials (COW): */
	const struct cred __rcu		*cred;

#ifdef CONFIG_KEYS
	/* Cached requested key. */
	struct key			*cached_requested_key;
#endif

	/*
	 * executable name, excluding path.
	 *
	 * - normally initialized setup_new_exec()
	 * - access it with [gs]et_task_comm()
	 * - lock it with task_lock()
	 */
	char				comm[TASK_COMM_LEN];

	struct nameidata		*nameidata;

#ifdef CONFIG_SYSVIPC
	struct sysv_sem			sysvsem;
	struct sysv_shm			sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
	unsigned long			last_switch_count;
	unsigned long			last_switch_time;
#endif
	/* Filesystem information: */
	struct fs_struct		*fs;

	/* Open file information: */
	struct files_struct		*files;

#ifdef CONFIG_IO_URING
	struct io_uring_task		*io_uring;
#endif

	/* Namespaces: */
	struct nsproxy			*nsproxy;

	/* Signal handlers: */
	struct signal_struct		*signal;
	struct sighand_struct __rcu		*sighand;
	sigset_t			blocked;
	sigset_t			real_blocked;
	/* Restored if set_restore_sigmask() was used: */
	sigset_t			saved_sigmask;
	struct sigpending		pending;
	unsigned long			sas_ss_sp;
	size_t				sas_ss_size;
	unsigned int			sas_ss_flags;

	struct callback_head		*task_works;

#ifdef CONFIG_AUDIT
#ifdef CONFIG_AUDITSYSCALL
	struct audit_context		*audit_context;
#endif
	kuid_t				loginuid;
	unsigned int			sessionid;
#endif
	struct seccomp			seccomp;
	struct syscall_user_dispatch	syscall_dispatch;

	/* Thread group tracking: */
	u64				parent_exec_id;
	u64				self_exec_id;

	/* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
	spinlock_t			alloc_lock;

	/* Protection of the PI data structures: */
	raw_spinlock_t			pi_lock;

	struct wake_q_node		wake_q;

#ifdef CONFIG_RT_MUTEXES
	/* PI waiters blocked on a rt_mutex held by this task: */
	struct rb_root_cached		pi_waiters;
	/* Updated under owner's pi_lock and rq lock */
	struct task_struct		*pi_top_task;
	/* Deadlock detection and priority inheritance handling: */
	struct rt_mutex_waiter		*pi_blocked_on;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
	/* Mutex deadlock detection: */
	struct mutex_waiter		*blocked_on;
#endif

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
	int				non_block_count;
#endif

#ifdef CONFIG_TRACE_IRQFLAGS
	struct irqtrace_events		irqtrace;
	unsigned int			hardirq_threaded;
	u64				hardirq_chain_key;
	int				softirqs_enabled;
	int				softirq_context;
	int				irq_config;
#endif
#ifdef CONFIG_PREEMPT_RT
	int				softirq_disable_cnt;
#endif

#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH			48UL
	u64				curr_chain_key;
	int				lockdep_depth;
	unsigned int			lockdep_recursion;
	struct held_lock		held_locks[MAX_LOCK_DEPTH];
#endif

#if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP)
	unsigned int			in_ubsan;
#endif

	/* Journalling filesystem info: */
	void				*journal_info;

	/* Stacked block device info: */
	struct bio_list			*bio_list;

	/* Stack plugging: */
	struct blk_plug			*plug;

	/* VM state: */
	struct reclaim_state		*reclaim_state;

	struct backing_dev_info		*backing_dev_info;

	struct io_context		*io_context;

#ifdef CONFIG_COMPACTION
	struct capture_control		*capture_control;
#endif
	/* Ptrace state: */
	unsigned long			ptrace_message;
	kernel_siginfo_t		*last_siginfo;

	struct task_io_accounting	ioac;
#ifdef CONFIG_PSI
	/* Pressure stall state */
	unsigned int			psi_flags;
#endif
#ifdef CONFIG_TASK_XACCT
	/* Accumulated RSS usage: */
	u64				acct_rss_mem1;
	/* Accumulated virtual memory usage: */
	u64				acct_vm_mem1;
	/* stime + utime since last update: */
	u64				acct_timexpd;
#endif
#ifdef CONFIG_CPUSETS
	/* Protected by ->alloc_lock: */
	nodemask_t			mems_allowed;
	/* Sequence number to catch updates: */
	seqcount_spinlock_t		mems_allowed_seq;
	int				cpuset_mem_spread_rotor;
	int				cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
	/* Control Group info protected by css_set_lock: */
	struct css_set __rcu		*cgroups;
	/* cg_list protected by css_set_lock and tsk->alloc_lock: */
	struct list_head		cg_list;
#endif
#ifdef CONFIG_X86_CPU_RESCTRL
	u32				closid;
	u32				rmid;
#endif
#ifdef CONFIG_FUTEX
	struct robust_list_head __user	*robust_list;
#ifdef CONFIG_COMPAT
	struct compat_robust_list_head __user *compat_robust_list;
#endif
	struct list_head		pi_state_list;
	struct futex_pi_state		*pi_state_cache;
	struct mutex			futex_exit_mutex;
	unsigned int			futex_state;
#endif
#ifdef CONFIG_PERF_EVENTS
	struct perf_event_context	*perf_event_ctxp[perf_nr_task_contexts];
	struct mutex			perf_event_mutex;
	struct list_head		perf_event_list;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
	unsigned long			preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
	/* Protected by alloc_lock: */
	struct mempolicy		*mempolicy;
	short				il_prev;
	short				pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
	int				numa_scan_seq;
	unsigned int			numa_scan_period;
	unsigned int			numa_scan_period_max;
	int				numa_preferred_nid;
	unsigned long			numa_migrate_retry;
	/* Migration stamp: */
	u64				node_stamp;
	u64				last_task_numa_placement;
	u64				last_sum_exec_runtime;
	struct callback_head		numa_work;

	/*
	 * This pointer is only modified for current in syscall and
	 * pagefault context (and for tasks being destroyed), so it can be read
	 * from any of the following contexts:
	 *  - RCU read-side critical section
	 *  - current->numa_group from everywhere
	 *  - task's runqueue locked, task not running
	 */
	struct numa_group __rcu		*numa_group;

	/*
	 * numa_faults is an array split into four regions:
	 * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
	 * in this precise order.
	 *
	 * faults_memory: Exponential decaying average of faults on a per-node
	 * basis. Scheduling placement decisions are made based on these
	 * counts. The values remain static for the duration of a PTE scan.
	 * faults_cpu: Track the nodes the process was running on when a NUMA
	 * hinting fault was incurred.
	 * faults_memory_buffer and faults_cpu_buffer: Record faults per node
	 * during the current scan window. When the scan completes, the counts
	 * in faults_memory and faults_cpu decay and these values are copied.
	 */
	unsigned long			*numa_faults;
	unsigned long			total_numa_faults;

	/*
	 * numa_faults_locality tracks if faults recorded during the last
	 * scan window were remote/local or failed to migrate. The task scan
	 * period is adapted based on the locality of the faults with different
	 * weights depending on whether they were shared or private faults
	 */
	unsigned long			numa_faults_locality[3];

	unsigned long			numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_RSEQ
	struct rseq __user *rseq;
	u32 rseq_sig;
	/*
	 * RmW on rseq_event_mask must be performed atomically
	 * with respect to preemption.
	 */
	unsigned long rseq_event_mask;
#endif

	struct tlbflush_unmap_batch	tlb_ubc;

	union {
		refcount_t		rcu_users;
		struct rcu_head		rcu;
	};

	/* Cache last used pipe for splice(): */
	struct pipe_inode_info		*splice_pipe;

	struct page_frag		task_frag;

#ifdef CONFIG_TASK_DELAY_ACCT
	struct task_delay_info		*delays;
#endif

#ifdef CONFIG_FAULT_INJECTION
	int				make_it_fail;
	unsigned int			fail_nth;
#endif
	/*
	 * When (nr_dirtied >= nr_dirtied_pause), it's time to call
	 * balance_dirty_pages() for a dirty throttling pause:
	 */
	int				nr_dirtied;
	int				nr_dirtied_pause;
	/* Start of a write-and-pause period: */
	unsigned long			dirty_paused_when;

#ifdef CONFIG_LATENCYTOP
	int				latency_record_count;
	struct latency_record		latency_record[LT_SAVECOUNT];
#endif
	/*
	 * Time slack values; these are used to round up poll() and
	 * select() etc timeout values. These are in nanoseconds.
	 */
	u64				timer_slack_ns;
	u64				default_timer_slack_ns;

#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
	unsigned int			kasan_depth;
#endif

#ifdef CONFIG_KCSAN
	struct kcsan_ctx		kcsan_ctx;
#ifdef CONFIG_TRACE_IRQFLAGS
	struct irqtrace_events		kcsan_save_irqtrace;
#endif
#ifdef CONFIG_KCSAN_WEAK_MEMORY
	int				kcsan_stack_depth;
#endif
#endif

#if IS_ENABLED(CONFIG_KUNIT)
	struct kunit			*kunit_test;
#endif

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
	/* Index of current stored address in ret_stack: */
	int				curr_ret_stack;
	int				curr_ret_depth;

	/* Stack of return addresses for return function tracing: */
	struct ftrace_ret_stack		*ret_stack;

	/* Timestamp for last schedule: */
	unsigned long long		ftrace_timestamp;

	/*
	 * Number of functions that haven't been traced
	 * because of depth overrun:
	 */
	atomic_t			trace_overrun;

	/* Pause tracing: */
	atomic_t			tracing_graph_pause;
#endif

#ifdef CONFIG_TRACING
	/* State flags for use by tracers: */
	unsigned long			trace;

	/* Bitmask and counter of trace recursion: */
	unsigned long			trace_recursion;
#endif /* CONFIG_TRACING */

#ifdef CONFIG_KCOV
	/* See kernel/kcov.c for more details. */

	/* Coverage collection mode enabled for this task (0 if disabled): */
	unsigned int			kcov_mode;

	/* Size of the kcov_area: */
	unsigned int			kcov_size;

	/* Buffer for coverage collection: */
	void				*kcov_area;

	/* KCOV descriptor wired with this task or NULL: */
	struct kcov			*kcov;

	/* KCOV common handle for remote coverage collection: */
	u64				kcov_handle;

	/* KCOV sequence number: */
	int				kcov_sequence;

	/* Collect coverage from softirq context: */
	unsigned int			kcov_softirq;
#endif

#ifdef CONFIG_MEMCG
	struct mem_cgroup		*memcg_in_oom;
	gfp_t				memcg_oom_gfp_mask;
	int				memcg_oom_order;

	/* Number of pages to reclaim on returning to userland: */
	unsigned int			memcg_nr_pages_over_high;

	/* Used by memcontrol for targeted memcg charge: */
	struct mem_cgroup		*active_memcg;
#endif

#ifdef CONFIG_BLK_CGROUP
	struct request_queue		*throttle_queue;
#endif

#ifdef CONFIG_UPROBES
	struct uprobe_task		*utask;
#endif
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
	unsigned int			sequential_io;
	unsigned int			sequential_io_avg;
#endif
	struct kmap_ctrl		kmap_ctrl;
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
	unsigned long			task_state_change;
# ifdef CONFIG_PREEMPT_RT
	unsigned long			saved_state_change;
# endif
#endif
	int				pagefault_disabled;
#ifdef CONFIG_MMU
	struct task_struct		*oom_reaper_list;
	struct timer_list		oom_reaper_timer;
#endif
#ifdef CONFIG_VMAP_STACK
	struct vm_struct		*stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
	/* A live task holds one reference: */
	refcount_t			stack_refcount;
#endif
#ifdef CONFIG_LIVEPATCH
	int patch_state;
#endif
#ifdef CONFIG_SECURITY
	/* Used by LSM modules for access restriction: */
	void				*security;
#endif
#ifdef CONFIG_BPF_SYSCALL
	/* Used by BPF task local storage */
	struct bpf_local_storage __rcu	*bpf_storage;
	/* Used for BPF run context */
	struct bpf_run_ctx		*bpf_ctx;
#endif

#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
	unsigned long			lowest_stack;
	unsigned long			prev_lowest_stack;
#endif

#ifdef CONFIG_X86_MCE
	void __user			*mce_vaddr;
	__u64				mce_kflags;
	u64				mce_addr;
	__u64				mce_ripv : 1,
					mce_whole_page : 1,
					__mce_reserved : 62;
	struct callback_head		mce_kill_me;
	int				mce_count;
#endif

#ifdef CONFIG_KRETPROBES
	struct llist_head               kretprobe_instances;
#endif
#ifdef CONFIG_RETHOOK
	struct llist_head               rethooks;
#endif

#ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH
	/*
	 * If L1D flush is supported on mm context switch
	 * then we use this callback head to queue kill work
	 * to kill tasks that are not running on SMT disabled
	 * cores
	 */
	struct callback_head		l1d_flush_kill;
#endif

	/*
	 * New fields for task_struct should be added above here, so that
	 * they are included in the randomized portion of task_struct.
	 */
	randomized_struct_fields_end

	/* CPU-specific state of this task: */
	struct thread_struct		thread;

	/*
	 * WARNING: on x86, 'thread_struct' contains a variable-sized
	 * structure.  It *MUST* be at the end of 'task_struct'.
	 *
	 * Do not put anything below here!
	 */
};

jiffy

在介紹 jiffy 前,需先瞭解 linux kernel HZ 及 tick 的定義

HZ

linux kernel 會在固定周期發出 timer interrupt (IRQ 0),HZ 是用來定義一秒內會有幾次 timer interrupt,假設 HZ 為 100 則代表每秒發生 timer interrupt 100 次,可利用 getconf CLK_TCK 得知當前裝置的 HZ

wei@wei-Z-Series:~$ getconf CLK_TCK
100

tick

而 tick 是 HZ 的倒數,也就是 timer interrupt 每過多少時間會觸發,舉例來說當 HZ 為 100 時,則 tick 為 10 毫秒

jiffies

jiffies 是個全域變數,用於紀錄開機後產生了幾次 tick。

extern unsigned long volatile __cacheline_aligned_in_smp __jiffy_arch_data jiffies;

開機時,kernel 會將 jiffies 初始成 0,並且在每次 timer interrupt 發生後,jiffies 便加一。

以 32 位元的系統為例,當 HZ 為 100,一天會產生 86400 * 100 次 timer interrupt

232/(86400100)=497.1

jiffies 約在 497.1 天產生溢位,若是 HZ 為 1000,則 jiffies 會在 49.7 天產生溢位

為了解決這個問題,linux/jiffies.h 內提供 4 個 marco

/* Same as above, but does so with platform independent 64bit types.
 * These must be used when utilizing jiffies_64 (i.e. return value of
 * get_jiffies_64() */
#define time_after64(a,b)	\
	(typecheck(__u64, a) &&	\
	 typecheck(__u64, b) && \
	 ((__s64)((b) - (a)) < 0))
#define time_before64(a,b)	time_after64(b,a)

#define time_after_eq64(a,b)	\
	(typecheck(__u64, a) && \
	 typecheck(__u64, b) && \
	 ((__s64)((a) - (b)) >= 0))
#define time_before_eq64(a,b)	time_after_eq64(b,a)

#define time_in_range64(a, b, c) \
	(time_after_eq64(a, b) && \
	 time_before_eq64(a, c))

time_after(unknown, known) 為例,

  • 假設 unknown 為 253
  • known 為 2

則可以得到下列式子
(int8_t) 253 - (int8_t) 2 == -3 - 2 == -5 < 0
利用有號數的機制判斷

為了方便計算,將 long 改為 int_8 處理

所以即便是 jiffies 發生溢位,也能藉由這些 macro 取得正確的 jiffies

最後,有了以上資訊,可以推論出 jiffies, HZ, seconds 三者之間的關係

jiffies=secondsHZ
seconds=jiffies/HZ

signal

signal 是 UNIX 及 Linux 為了回應某些狀況所生成的 event,process 可能會根據收到的 signal 做出對應行為

常見的 signal 如下表所示

Signal Name Signal Number Description
SIGHUP 1 Hang up detected on controlling terminal or death of controlling process
SIGINT 2 Issued if the user sends an interrupt signal (Ctrl + C)
SIGQUIT 3 Issued if the user sends a quit signal (Ctrl + D)
SIGFPE 8 Issued if an illegal mathematical operation is attempted
SIGKILL 9 If a process gets this signal it must quit immediately and will not perform any clean-up operations
SIGALRM 14 Alarm clock signal (used for timers)
SIGTERM 15 Software termination signal (sent by kill by default)

signal 實作手法

在 task_struct 內,包含以下成員

/* Signal handlers: */
struct signal_struct		*signal;
struct sighand_struct __rcu	*sighand;
struct sigpending		pending;

一個 thread group 內的所有 task 會使用相同的 signal 及 sighand,用途分別為:

  • signal 包含 pending signal 的相關細節
  • sighand 會明確指出對應的 signal 該如何處置
  • pending 則記錄當前 task pending 的 signal
struct sigpending {
	struct list_head list;
	sigset_t signal;
};
struct sighand_struct {
	spinlock_t		siglock;
	refcount_t		count;
	wait_queue_head_t	signalfd_wqh;
	struct k_sigaction	action[_NSIG];
};
Sending a signal

linux 不論是 system call 或 user program 發送 signal 時,最終都會使用到 __send_signal 來處理

__send_signal 完整程式碼
static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t,
			enum pid_type type, bool force)
{
	struct sigpending *pending;
	struct sigqueue *q;
	int override_rlimit;
	int ret = 0, result;

	assert_spin_locked(&t->sighand->siglock);

	result = TRACE_SIGNAL_IGNORED;
	if (!prepare_signal(sig, t, force))
		goto ret;

	pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
	/*
	 * Short-circuit ignored signals and support queuing
	 * exactly one non-rt signal, so that we can get more
	 * detailed information about the cause of the signal.
	 */
	result = TRACE_SIGNAL_ALREADY_PENDING;
	if (legacy_queue(pending, sig))
		goto ret;

	result = TRACE_SIGNAL_DELIVERED;
	/*
	 * Skip useless siginfo allocation for SIGKILL and kernel threads.
	 */
	if ((sig == SIGKILL) || (t->flags & PF_KTHREAD))
		goto out_set;

	/*
	 * Real-time signals must be queued if sent by sigqueue, or
	 * some other real-time mechanism.  It is implementation
	 * defined whether kill() does so.  We attempt to do so, on
	 * the principle of least surprise, but since kill is not
	 * allowed to fail with EAGAIN when low on memory we just
	 * make sure at least one signal gets delivered and don't
	 * pass on the info struct.
	 */
	if (sig < SIGRTMIN)
		override_rlimit = (is_si_special(info) || info->si_code >= 0);
	else
		override_rlimit = 0;

	q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit, 0);

	if (q) {
		list_add_tail(&q->list, &pending->list);
		switch ((unsigned long) info) {
		case (unsigned long) SEND_SIG_NOINFO:
			clear_siginfo(&q->info);
			q->info.si_signo = sig;
			q->info.si_errno = 0;
			q->info.si_code = SI_USER;
			q->info.si_pid = task_tgid_nr_ns(current,
							task_active_pid_ns(t));
			rcu_read_lock();
			q->info.si_uid =
				from_kuid_munged(task_cred_xxx(t, user_ns),
						 current_uid());
			rcu_read_unlock();
			break;
		case (unsigned long) SEND_SIG_PRIV:
			clear_siginfo(&q->info);
			q->info.si_signo = sig;
			q->info.si_errno = 0;
			q->info.si_code = SI_KERNEL;
			q->info.si_pid = 0;
			q->info.si_uid = 0;
			break;
		default:
			copy_siginfo(&q->info, info);
			break;
		}
	} else if (!is_si_special(info) &&
		   sig >= SIGRTMIN && info->si_code != SI_USER) {
		/*
		 * Queue overflow, abort.  We may abort if the
		 * signal was rt and sent by user using something
		 * other than kill().
		 */
		result = TRACE_SIGNAL_OVERFLOW_FAIL;
		ret = -EAGAIN;
		goto ret;
	} else {
		/*
		 * This is a silent loss of information.  We still
		 * send the signal, but the *info bits are lost.
		 */
		result = TRACE_SIGNAL_LOSE_INFO;
	}

out_set:
	signalfd_notify(t, sig);
	sigaddset(&pending->signal, sig);

	/* Let multiprocess signals appear after on-going forks */
	if (type > PIDTYPE_TGID) {
		struct multiprocess_signals *delayed;
		hlist_for_each_entry(delayed, &t->signal->multiprocess, node) {
			sigset_t *signal = &delayed->signal;
			/* Can't queue both a stop and a continue signal */
			if (sig == SIGCONT)
				sigdelsetmask(signal, SIG_KERNEL_STOP_MASK);
			else if (sig_kernel_stop(sig))
				sigdelset(signal, SIGCONT);
			sigaddset(signal, sig);
		}
	}

	complete_signal(sig, t, type);
ret:
	trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result);
	return ret;
}

首先 __send_sig 會根據 pid_type 來決定 pending 的對象

pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
/*
enum pid_type
{
	PIDTYPE_PID,
	PIDTYPE_TGID,
	PIDTYPE_PGID,
	PIDTYPE_SID,
	PIDTYPE_MAX,
};
*/

當 type 不為 PIDTYPE_PID 時,送出的 signal 會作用 thread group 的所有 thread,反之則作用於單一 thread

接著,假設要送出的 signal 為 SIGKILL,因為 SIGKILL 緣故,不需對 siginfo 作資源分配,故使用 goto 技巧,跳到 tag: out_set

siginfo 內會記錄此 signal 的相關資訊

if ((sig == SIGKILL) || (t->flags & PF_KTHREAD))
		goto out_set;

在 tag: out_set 階段

利用 macro sigaddset(&pending->signal, sig) 將 SIGKILL 存放至 peding->signal 內

當前置作業皆完成後,kernel 利用 complete_signal(sig, t, type) 通知 task 盡快處理收到的 signal

細看 complete_signal 的實際做法
首先建立二指標,*signal*t,其中 signal 會指向 p->signal,也就是先前 pending 過的 signal

static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
{
	struct signal_struct *signal = p->signal;
	struct task_struct *t;

接著會出現二種情況,分別為

  • 當前 task 有想要接收 signal
  • 當前 task 不想要接收 signal,此情況又分為兩種情境
    • 只有單一 thread 或 thread group 為空
    • 在 thread group 中尋找想要接受 signal 的 task,找不到則返回
	/*
	 * Now find a thread we can wake up to take the signal off the queue.
	 *
	 * If the main thread wants the signal, it gets first crack.
	 * Probably the least surprising to the average bear.
	 */
	if (wants_signal(sig, p))
		t = p;
	else if ((type == PIDTYPE_PID) || thread_group_empty(p))
		/*
		 * There is just one thread and it does not need to be woken.
		 * It will dequeue unblocked signals before it runs again.
		 */
		return;
	else {
		/*
		 * Otherwise try to find a suitable thread.
		 */
		t = signal->curr_target;
		while (!wants_signal(sig, t)) {
			t = next_thread(t);
			if (t == signal->curr_target)
				/*
				 * No thread needs to be woken.
				 * Any eligible threads will see
				 * the signal in the queue soon.
				 */
				return;
		}
		signal->curr_target = t;
	}

最後檢查 task 是否佇列於 runqueue 內

	/*
	 * The signal is already in the shared-pending queue.
	 * Tell the chosen thread to wake up and dequeue it.
	 */
	signal_wake_up(t, sig == SIGKILL);
	return;
}

若是 task 不在 runqueue 內,則嘗試將其放入 runqueue,否則將 task 從 runqueue 內移除,希望能在下次放入 runqueue 時做 signal 的處理

Execution of signal handlers

(待補..)

ptrace

ptrace() 是一種 system call,能使一個 process (the "tracer") 去觀察和控制另一個 process (the "tracee"),主要用於偵錯時的中斷點設置及追蹤 system call

ptrace 使用方式

追蹤 process 的方式如下

  • fork 一個 child process
  • 令 child process 呼叫 execve 執行 PTRACE_TRACEME,此時 parent process 便開始追蹤 tracee
  • 接著便等待 tracer 做其他動作,像是 PTRACE_PEEKTEXT, PTRACE_PEEKUSER 等等

以下面範例做說明,範例改自Playing with ptrace, Part I

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <sys/user.h> 
#include <sys/reg.h> 
#include <stdio.h>
int main()
{   pid_t child;
    long orig_rax;
    child = fork();
    int i = 0;
    if(child == 0) {
        ptrace(PTRACE_TRACEME, 0, NULL, NULL);
        execl("/bin/ls", "ls", NULL);
    }
    else {
        wait(NULL);
        orig_rax = ptrace(PTRACE_PEEKUSER,child, 8 * ORIG_RAX,NULL);
        printf("The child made a ""system call %ld\n", orig_rax);
        ptrace(PTRACE_CONT, child, NULL, NULL);
    }
    return 0;
}
  • 利用 fork() 創造一個 child process,令 child process 被 parent process 追蹤,此時 child process 會暫停執行
  • 接著 parent process 使用 PTRACE_PEEKUSER 觀察 child process 的 ORIG_RAX 內容為何
  • 再使用 PTRACE_CONT 使 child process 繼續執行

因此,可觀察到在 tracer 執行 PTRACE_CONT 後,ls 命令才被 child process 執行

wei@wei-Z-Series:~/linux2022/quiz8/8-3$ sudo ./ptrace_test 
The child made a system call 59
dont_trace.c	dont_trace.mod.c  Makefile	  ptrace_test
dont_trace.ko	dont_trace.mod.o  modules.order   ptrace_test.c
dont_trace.mod	dont_trace.o	  Module.symvers

Dont Trace 程式

有了上述基礎,開始探討本次考試內容

dont_trace 這個 Linux 核心模組利用 task_struct 內部成員 ptrace 來偵測給定的行程是否被除錯器或其他使用 ptrace 系統呼叫的程式所 attach,一旦偵測到其他行程嘗試追蹤該行程,dont_trace 就會主動 kill 追蹤者行程。流程:

  • Once any process starts “ptracing” another, the tracee gets added into ptraced that’s located in task_struct, which is simply a linked list that contains all the tracees that the process is “ptracing”.
  • Once a tracer is found, the module lists all its tracees and sends a SIGKILL signal to each of them including the tracer. This results in killing both the tracer and its tracees.
  • Once the module is attached to the kernel, the module’s “core” function will run periodically through the advantage of workqueues. Specifically, the module runs every JIFFIES_DELAY, which is set to 1. That is, the module will run every one jiffy. This can be changed through modifying the macro JIFFIES_DELAY defined in the module.

瞭解 module 的詳細流程後,觀察實作方式

首先,該如何找出 "ptracing" 中的程式呢?

static void check(void)
{
    struct task_struct *task;
    for_each_process (task) {
        if (!is_tracer(&task->ptraced))
            continue;

        kill_tracee(&task->ptraced);
        kill_task(task); /* Kill the tracer once all tracees are killed */
    }
}

check() 利用 macro for_each_process 逐一檢查此 task 是否為 tracer,若是找到,則用 kill_task() 對 tracee 及 tracer 送出 SIGKILL

is_tracer() 實際內容如下

/* @return true if the process has tracees */
static bool is_tracer(struct list_head *children)
{
    struct list_head *list;
    list_for_each (list, children) {
        struct task_struct *task =
            list_entry(list, struct task_struct, ptrace_entry);
        if (task)
            return true;
    }
    return false;
}

&task->ptracedlist_head,其代表 task 正在追蹤的 tracee,能藉由檢查這串 list 是否有對應 tracee task 存在,便可得知 task 是否為 tracer

kill_tracee() 將 tracee 找出並發送 SIGKILL,其找出 tracee 的方式與 is_tracer 類似

/* Traverse the element in the linked list of the ptraced proccesses and
 * finally kills them.
 */
static void kill_tracee(struct list_head *children)
{
    struct list_head *list;
    list_for_each (list, children) {
        struct task_struct *task_ptraced =
            list_entry(list, struct task_struct, ptrace_entry);
        pr_info("ptracee -> comm: %s, pid: %d, gid: %d, ptrace: %d\n",
                task_ptraced->comm, task_ptraced->pid, task_ptraced->tgid,
                task_ptraced->ptrace);
        kill_task(task_ptraced);
    }
}

瞭解如何刪除 tracer 及 tracee 的方式後,下一個問題是如何定期地執行此 lkm ?

為了定期地執行,此處引入 workqueue 機制,利用 macro DECLARE_DELAYED_WORK 建立一個 work item

static DECLARE_DELAYED_WORK(dont_trace_task, periodic_routine);
/*
 * #define DECLARE_WORK(n, f)                      \
    struct work_struct n = __WORK_INITIALIZER(n, f)*/

此 work item 的工作便是 periodic_routine

另外,在 dont_trace_init 內觀察到 queue_delayed_work 函式及 flag loaded

static int __init dont_trace_init(void)
{
    wq = create_workqueue(DONT_TRACE_WQ_NAME);
    queue_delayed_work(wq, &dont_trace_task, JIFFIES_DELAY);

    loaded = true;
    pr_info("Loaded!\n");
    return 0;
}
  • loaded 代表 dont_trace lkm 已正確載入
  • queue_delayed_work 能在指定的 jiffies 後,將 work 排入 workqueue 執行

Description

Returns false if work was already on a queue, true otherwise.

We queue the work to the CPU on which it was submitted, but if the CPU dies it can be processed by another CPU.

bool queue_delayed_work(struct workqueue_struct * wq, struct delayed_work * dwork, unsigned long delay)
queue work on a workqueue after delay

Parameters

struct workqueue_struct * wq workqueue to use
struct delayed_work * dwork delayable work to queue
unsigned long delay number of jiffies to wait before queueing

接著,觀察 periodic_routine,發現有二處需實作,根據老師提供的線索,可以推論兩件事

  • periodic_routine 必須定期被執行
  • periodic_routine 執行時必須確保 dont_trace 是否載入至 kernel
static void periodic_routine(struct work_struct *ws)
{
    if (likely(/* XXXXX: Implement */loaded))
        check();
    /* XXXXX: Implement */;
    queue_delayed_work(wq, &dont_trace_task, JIFFIES_DELAY);
}
  • 因此可以利用 loaded 確認 dont_trace 是否載入
  • 可以在 periodic_routine 內再次呼叫 queue_delayed_work,重新將 periodic_routine 排入 workqueue 內,達到定期執行的效果

最後執行結果如下

wei@wei-Z-Series:~/linux2022/quiz8/8-3$ yes >/dev/null &
[4] 12276
[2]   Killed                  yes > /dev/null
wei@wei-Z-Series:~/linux2022/quiz8/8-3$ dmesg 
...
[17387.116335] dont_trace: Loaded!
[17387.127976] dont_trace: ptracee -> comm: yes, pid: 7521, gid: 7521, ptrace: 505
dont_trace 完整程式碼
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/list.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/workqueue.h>

MODULE_AUTHOR("National Cheng Kung University, Taiwan");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("A kernel module that kills ptrace tracer and its tracees");

#define JIFFIES_DELAY 1
#define DONT_TRACE_WQ_NAME "dont_trace_worker"

static void periodic_routine(struct work_struct *);
static DECLARE_DELAYED_WORK(dont_trace_task, periodic_routine);
static struct workqueue_struct *wq;
static bool loaded;

/* Send SIGKILL from kernel space */
static void kill_task(struct task_struct *task)
{
    send_sig(SIGKILL, task, 1);
}

/* @return true if the process has tracees */
static bool is_tracer(struct list_head *children)
{
    struct list_head *list;
    list_for_each (list, children) {
        struct task_struct *task =
            list_entry(list, struct task_struct, ptrace_entry);
        if (task)
            return true;
    }
    return false;
}

/* Traverse the element in the linked list of the ptraced proccesses and
 * finally kills them.
 */
static void kill_tracee(struct list_head *children)
{
    struct list_head *list;
    list_for_each (list, children) {
        struct task_struct *task_ptraced =
            list_entry(list, struct task_struct, ptrace_entry);
        pr_info("ptracee -> comm: %s, pid: %d, gid: %d, ptrace: %d\n",
                task_ptraced->comm, task_ptraced->pid, task_ptraced->tgid,
                task_ptraced->ptrace);
        kill_task(task_ptraced);
    }
}

static void check(void)
{
    struct task_struct *task;
    for_each_process (task) {
        if (!is_tracer(&task->ptraced))
            continue;

        kill_tracee(&task->ptraced);
        kill_task(task); /* Kill the tracer once all tracees are killed */
    }
}

static void periodic_routine(struct work_struct *ws)
{
    if (likely(/* XXXXX: Implement */loaded))
        check();
    /* XXXXX: Implement */;
    queue_delayed_work(wq, &dont_trace_task, JIFFIES_DELAY);
}

static int __init dont_trace_init(void)
{
    wq = create_workqueue(DONT_TRACE_WQ_NAME);
    queue_delayed_work(wq, &dont_trace_task, JIFFIES_DELAY);

    loaded = true;
    pr_info("Loaded!\n");
    return 0;
}

static void __exit dont_trace_exit(void)
{
    loaded = false;

    /* No new routines will be queued */
    cancel_delayed_work(&dont_trace_task);

    /* Wait for the completion of all routines */
    flush_workqueue(wq);
    destroy_workqueue(wq);

    pr_info("Unloaded.\n");
}

module_init(dont_trace_init);
module_exit(dont_trace_exit);

Reference