Linux 4.9.140 Scheduling
===

# References

See https://blog.csdn.net/gatieme/article/details/51872594, which is organized around:
>- 2 schedulers
>- 6 scheduling policies
>- 5 scheduler classes
>- 3 scheduling entities

See https://tampub.uta.fi/bitstream/handle/10024/96864/GRADU-1428493916.pdf, which notes:
> The kernel decides, which tasks go to which scheduling classes based on their scheduling policy (SCHED_*) and calls the corresponding functions. Processes under SCHED_NORMAL, SCHED_BATCH and SCHED_IDLE are mapped to fair_sched_class, provided by CFS. SCHED_RR and SCHED_FIFO associate with rt_sched_class, real-time scheduler.

**This paper explains scheduling very clearly.**

See also https://www.cnblogs.com/arnoldlu/p/8465034.html

# The 2 schedulers

(Per the first reference: the main scheduler, entered through `schedule()`/`__schedule()`, and the periodic scheduler driven by `scheduler_tick()`.)

# The 6 scheduling policies

- **Scheduling policies**
- **Definition**: [/include/uapi/linux/sched.h](https://elixir.bootlin.com/linux/v4.9.140/source/include/uapi/linux/sched.h#L32)
    ```c=32
    /*
     * Scheduling policies
     */
    #define SCHED_NORMAL    0
    #define SCHED_FIFO      1
    #define SCHED_RR        2
    #define SCHED_BATCH     3
    /* SCHED_ISO: reserved but not implemented yet */
    #define SCHED_IDLE      5
    #define SCHED_DEADLINE  6
    ```
- **Mapping to sched_class**: SCHED_NORMAL, SCHED_BATCH and SCHED_IDLE map to `fair_sched_class`; SCHED_FIFO and SCHED_RR map to `rt_sched_class`; SCHED_DEADLINE maps to `dl_sched_class` (a userspace sketch follows this list).
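To see these policies from the userspace side, here is a minimal sketch (not part of the kernel sources quoted in this note) that uses the standard sched_setscheduler(2) wrapper to switch the calling thread to SCHED_FIFO and back to SCHED_OTHER (the userspace name for SCHED_NORMAL). It assumes the caller has CAP_SYS_NICE or a suitable RLIMIT_RTPRIO; otherwise the first call fails with EPERM, as enforced by __sched_setscheduler() later in this note.

```c
/* Hypothetical demo, not kernel code: request SCHED_FIFO, then revert. */
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
    struct sched_param sp = { .sched_priority = 10 };

    /* pid 0 = the calling thread; in-kernel this task moves to rt_sched_class */
    if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
        fprintf(stderr, "SCHED_FIFO: %s\n", strerror(errno));
        return 1;
    }
    printf("now SCHED_FIFO, rt_priority %d\n", sp.sched_priority);

    /* SCHED_OTHER (SCHED_NORMAL) requires sched_priority == 0 */
    sp.sched_priority = 0;
    if (sched_setscheduler(0, SCHED_OTHER, &sp) == -1) {
        fprintf(stderr, "back to SCHED_OTHER: %s\n", strerror(errno));
        return 1;
    }
    printf("back to SCHED_OTHER (SCHED_NORMAL)\n");
    return 0;
}
```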
# The 5 scheduler classes

## sched_class
[/kernel/sched/sched.h](https://elixir.bootlin.com/linux/v4.9.140/source/kernel/sched/sched.h#L1222)
```c=1222
struct sched_class {
    const struct sched_class *next;

    void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
    void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
    void (*yield_task) (struct rq *rq);
    bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);

    void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);

    /*
     * It is the responsibility of the pick_next_task() method that will
     * return the next task to call put_prev_task() on the @prev task or
     * something equivalent.
     *
     * May return RETRY_TASK when it finds a higher prio class has runnable
     * tasks.
     */
    struct task_struct * (*pick_next_task) (struct rq *rq,
                                            struct task_struct *prev,
                                            struct pin_cookie cookie);
    void (*put_prev_task) (struct rq *rq, struct task_struct *p);

#ifdef CONFIG_SMP
    int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
    void (*migrate_task_rq)(struct task_struct *p);

    void (*task_woken) (struct rq *this_rq, struct task_struct *task);

    void (*set_cpus_allowed)(struct task_struct *p,
                             const struct cpumask *newmask);

    void (*rq_online)(struct rq *rq);
    void (*rq_offline)(struct rq *rq);
#endif

    void (*set_curr_task) (struct rq *rq);
    void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
    void (*task_fork) (struct task_struct *p);
    void (*task_dead) (struct task_struct *p);

    /*
     * The switched_from() call is allowed to drop rq->lock, therefore we
     * cannot assume the switched_from/switched_to pair is serliazed by
     * rq->lock. They are however serialized by p->pi_lock.
     */
    void (*switched_from) (struct rq *this_rq, struct task_struct *task);
    void (*switched_to) (struct rq *this_rq, struct task_struct *task);
    void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
                          int oldprio);

    unsigned int (*get_rr_interval) (struct rq *rq,
                                     struct task_struct *task);

    void (*update_curr) (struct rq *rq);

#define TASK_SET_GROUP  0
#define TASK_MOVE_GROUP 1

#ifdef CONFIG_FAIR_GROUP_SCHED
    void (*task_change_group) (struct task_struct *p, int type);
#endif
};
```

## Definitions of each scheduling class
[/kernel/sched/sched.h](https://elixir.bootlin.com/linux/v4.9.140/source/kernel/sched/sched.h#L1300)
```c=1300
extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
extern const struct sched_class rt_sched_class;
extern const struct sched_class fair_sched_class;
extern const struct sched_class idle_sched_class;
```

## stop_sched_class
[/kernel/sched/stop_task.c](https://elixir.bootlin.com/linux/v4.9.140/source/kernel/sched/stop_task.c#L109)
```c=109
/*
 * Simple, special scheduling class for the per-CPU stop tasks:
 */
const struct sched_class stop_sched_class = {
    .next               = &dl_sched_class,

    .enqueue_task       = enqueue_task_stop,
    .dequeue_task       = dequeue_task_stop,
    .yield_task         = yield_task_stop,

    .check_preempt_curr = check_preempt_curr_stop,

    .pick_next_task     = pick_next_task_stop,
    .put_prev_task      = put_prev_task_stop,

#ifdef CONFIG_SMP
    .select_task_rq     = select_task_rq_stop,
    .set_cpus_allowed   = set_cpus_allowed_common,
#endif

    .set_curr_task      = set_curr_task_stop,
    .task_tick          = task_tick_stop,

    .get_rr_interval    = get_rr_interval_stop,

    .prio_changed       = prio_changed_stop,
    .switched_to        = switched_to_stop,
    .update_curr        = update_curr_stop,
};
```

## dl_sched_class
[/kernel/sched/deadline.c](https://elixir.bootlin.com/linux/v4.9.140/source/kernel/sched/deadline.c#L1903)
```c=1903
const struct sched_class dl_sched_class = {
    .next               = &rt_sched_class,
    .enqueue_task       = enqueue_task_dl,
    .dequeue_task       = dequeue_task_dl,
    .yield_task         = yield_task_dl,

    .check_preempt_curr = check_preempt_curr_dl,

    .pick_next_task     = pick_next_task_dl,
    .put_prev_task      = put_prev_task_dl,

#ifdef CONFIG_SMP
    .select_task_rq     = select_task_rq_dl,
    .set_cpus_allowed   = set_cpus_allowed_dl,
    .rq_online          = rq_online_dl,
    .rq_offline         = rq_offline_dl,
    .task_woken         = task_woken_dl,
#endif

    .set_curr_task      = set_curr_task_dl,
    .task_tick          = task_tick_dl,
    .task_fork          = task_fork_dl,
    .task_dead          = task_dead_dl,

    .prio_changed       = prio_changed_dl,
    .switched_from      = switched_from_dl,
    .switched_to        = switched_to_dl,

    .update_curr        = update_curr_dl,
};
```

## rt_sched_class
[/kernel/sched/rt.c](https://elixir.bootlin.com/linux/v4.9.140/source/kernel/sched/rt.c#L2333)
```c=2333
const struct sched_class rt_sched_class = {
    .next               = &fair_sched_class,
    .enqueue_task       = enqueue_task_rt,
    .dequeue_task       = dequeue_task_rt,
    .yield_task         = yield_task_rt,

    .check_preempt_curr = check_preempt_curr_rt,

    .pick_next_task     = pick_next_task_rt,
    .put_prev_task      = put_prev_task_rt,

#ifdef CONFIG_SMP
    .select_task_rq     = select_task_rq_rt,
    .set_cpus_allowed   = set_cpus_allowed_common,
    .rq_online          = rq_online_rt,
    .rq_offline         = rq_offline_rt,
    .task_woken         = task_woken_rt,
    .switched_from      = switched_from_rt,
#endif

    .set_curr_task      = set_curr_task_rt,
    .task_tick          = task_tick_rt,

    .get_rr_interval    = get_rr_interval_rt,

    .prio_changed       = prio_changed_rt,
    .switched_to        = switched_to_rt,

    .update_curr        = update_curr_rt,
};
```

## fair_sched_class
[/kernel/sched/fair.c](https://elixir.bootlin.com/linux/v4.9.140/source/kernel/sched/fair.c#L8997)
```c=8997
/*
 * All the scheduling class methods:
 */
const struct sched_class fair_sched_class = {
    .next               = &idle_sched_class,
    .enqueue_task       = enqueue_task_fair,
    .dequeue_task       = dequeue_task_fair,
    .yield_task         = yield_task_fair,
    .yield_to_task      = yield_to_task_fair,

    .check_preempt_curr = check_preempt_wakeup,

    .pick_next_task     = pick_next_task_fair,
    .put_prev_task      = put_prev_task_fair,

#ifdef CONFIG_SMP
    .select_task_rq     = select_task_rq_fair,
    .migrate_task_rq    = migrate_task_rq_fair,

    .rq_online          = rq_online_fair,
    .rq_offline         = rq_offline_fair,

    .task_dead          = task_dead_fair,
    .set_cpus_allowed   = set_cpus_allowed_common,
#endif

    .set_curr_task      = set_curr_task_fair,
    .task_tick          = task_tick_fair,
    .task_fork          = task_fork_fair,

    .prio_changed       = prio_changed_fair,
    .switched_from      = switched_from_fair,
    .switched_to        = switched_to_fair,

    .get_rr_interval    = get_rr_interval_fair,

    .update_curr        = update_curr_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
    .task_change_group  = task_change_group_fair,
#endif
};
```

## idle_sched_class
[/kernel/sched/idle_task.c](https://elixir.bootlin.com/linux/v4.9.140/source/kernel/sched/idle_task.c#L81)
```c=81
/*
 * Simple, special scheduling class for the per-CPU idle tasks:
 */
const struct sched_class idle_sched_class = {
    /* .next is NULL */
    /* no enqueue/yield_task for idle tasks */

    /* dequeue is not valid, we print a debug message there: */
    .dequeue_task       = dequeue_task_idle,

    .check_preempt_curr = check_preempt_curr_idle,

    .pick_next_task     = pick_next_task_idle,
    .put_prev_task      = put_prev_task_idle,

#ifdef CONFIG_SMP
    .select_task_rq     = select_task_rq_idle,
    .set_cpus_allowed   = set_cpus_allowed_common,
#endif

    .set_curr_task      = set_curr_task_idle,
    .task_tick          = task_tick_idle,

    .get_rr_interval    = get_rr_interval_idle,

    .prio_changed       = prio_changed_idle,
    .switched_to        = switched_to_idle,
    .update_curr        = update_curr_idle,
};
```

## Observations
1. **next forms a linked list, in priority order:**
   stop_sched_class -> dl_sched_class -> rt_sched_class -> fair_sched_class -> idle_sched_class -> NULL
   (A simplified sketch of how the core scheduler walks this list follows below.)
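To make this list concrete, below is a simplified sketch (not the verbatim v4.9 code) of how the core scheduler consumes it. The for_each_class()/sched_class_highest helpers come from kernel/sched/sched.h; pick_next_task_sketch() is an illustrative name for what the real pick_next_task() in kernel/sched/core.c does, minus its fast path for the all-CFS case and its handling of RETRY_TASK.

```c
/* Helpers from kernel/sched/sched.h: walk classes from highest to lowest priority. */
#define sched_class_highest (&stop_sched_class)
#define for_each_class(class) \
    for (class = sched_class_highest; class; class = class->next)

/* Illustrative sketch: the first class that has a runnable task wins. */
static struct task_struct *
pick_next_task_sketch(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{
    const struct sched_class *class;
    struct task_struct *p;

    for_each_class(class) {
        p = class->pick_next_task(rq, prev, cookie);
        if (p)
            return p;
    }

    /* Unreachable: idle_sched_class can always return the idle task. */
    BUG();
}
```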
# The 3 scheduling entities

- sched_dl_entity
- sched_rt_entity
- sched_entity

(See the task_struct sketch below.)
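Every task carries all three entities side by side; which one is actually used depends on the task's sched_class. The following is an abridged sketch of the relevant task_struct members in include/linux/sched.h (v4.9): unrelated fields and config #ifdefs are omitted and the ordering is not exact.

```c
struct task_struct {
    /* ... */
    int prio, static_prio, normal_prio;
    unsigned int rt_priority;

    const struct sched_class *sched_class;
    struct sched_entity se;        /* used by fair_sched_class (CFS) */
    struct sched_rt_entity rt;     /* used by rt_sched_class */
    struct sched_dl_entity dl;     /* used by dl_sched_class */
    /* ... */
};
```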
# __schedule
[/kernel/sched/core.c](https://elixir.bootlin.com/linux/v4.9.140/source/kernel/sched/core.c#L3296)
```c=
static void __sched notrace __schedule(bool preempt)
{
    struct task_struct *prev, *next;
    unsigned long *switch_count;
    struct pin_cookie cookie;
    struct rq *rq;
    int cpu;

    /* Get the current CPU id */
    cpu = smp_processor_id();
    /* Get this CPU's runqueue */
    rq = cpu_rq(cpu);
    /* The task currently running on this rq becomes prev */
    prev = rq->curr;

    /* Sanity-check the stack and bump this rq's sched_count statistic */
    schedule_debug(prev);

    if (sched_feat(HRTICK))
        /*
         * HRTICK uses high-resolution timers for precise preemption points.
         * See https://stackoverflow.com/questions/29999313/what-is-hrtick-clearrq-in-linux-scheduler
         */
        hrtick_clear(rq);

    /* Stop taking interrupts on this CPU */
    local_irq_disable();
    /* Tell RCU that a context switch is happening */
    rcu_note_context_switch();

    /*
     * Make sure that signal_pending_state()->signal_pending() below
     * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
     * done by the caller to avoid the race with signal_wake_up().
     */
    smp_mb__before_spinlock();
    /* Lock the runqueue */
    raw_spin_lock(&rq->lock);
    cookie = lockdep_pin_lock(&rq->lock);

    rq->clock_skip_update <<= 1; /* promote REQ to ACT */

    /* Default: count this as an involuntary context switch of the current task */
    switch_count = &prev->nivcsw;
    /* Not a preemption and the current task is no longer runnable */
    if (!preempt && prev->state) {
        /*
         * If the task's state is TASK_INTERRUPTIBLE or TASK_WAKEKILL and it
         * has a pending signal (TIF_SIGPENDING set), keep it runnable:
         * set its state back to TASK_RUNNING instead of letting it sleep.
         */
        if (unlikely(signal_pending_state(prev->state, prev))) {
            prev->state = TASK_RUNNING;
        } else {
            /* Take the current task off the runqueue */
            deactivate_task(rq, prev, DEQUEUE_SLEEP);
            /* Mark that it is no longer on a runqueue */
            prev->on_rq = 0;

            /*
             * If a worker went to sleep, notify and ask workqueue
             * whether it wants to wake up a task to maintain
             * concurrency.
             */
            /* The current task is a workqueue worker */
            if (prev->flags & PF_WQ_WORKER) {
                struct task_struct *to_wakeup;

                to_wakeup = wq_worker_sleeping(prev);
                if (to_wakeup)
                    try_to_wake_up_local(to_wakeup, cookie);
            }
        }
        /* Not a preemption, so count this as a voluntary context switch instead */
        switch_count = &prev->nvcsw;
    }

    /* If the current task is still marked TASK_ON_RQ_QUEUED */
    if (task_on_rq_queued(prev))
        /* Update rq->clock */
        update_rq_clock(rq);

    /* Pick the next task to run */
    next = pick_next_task(rq, prev, cookie);
    /* Clear TIF_NEED_RESCHED in the current task's thread_info */
    clear_tsk_need_resched(prev);
    /* Clear the PREEMPT_NEED_RESCHED flag */
    clear_preempt_need_resched();
    rq->clock_skip_update = 0;

    /* prev and next are different tasks */
    if (likely(prev != next)) {
        /* One more switch on this runqueue */
        rq->nr_switches++;
        /* next becomes the running task */
        rq->curr = next;
        /* One more (voluntary or involuntary) switch for prev */
        ++*switch_count;

        trace_sched_switch(preempt, prev, next);
        /* Switch address space and registers to next; this also releases the runqueue lock */
        rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */
    } else {
        /* Same task: no switch, just unlock the runqueue */
        lockdep_unpin_lock(&rq->lock, cookie);
        raw_spin_unlock_irq(&rq->lock);
    }

    balance_callback(rq);
}
```
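For context on the `preempt` argument: voluntary context switches come in through schedule(), which calls __schedule(false); the preemption entry points (preempt_schedule(), preempt_schedule_irq(), ...) call __schedule(true). Below is a lightly annotated sketch of schedule() based on kernel/sched/core.c in v4.9; treat it as a reading aid rather than a verbatim quote.

```c
asmlinkage __visible void __sched schedule(void)
{
    struct task_struct *tsk = current;

    /* Flush any plugged block I/O before we possibly go to sleep */
    sched_submit_work(tsk);
    do {
        preempt_disable();
        __schedule(false);              /* preempt == false: voluntary switch */
        sched_preempt_enable_no_resched();
    } while (need_resched());           /* re-run if TIF_NEED_RESCHED got set meanwhile */
}
```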
# __setscheduler
[/kernel/sched/core.c](https://elixir.bootlin.com/linux/v4.9.140/source/kernel/sched/core.c#L3971)
```c=
static void __setscheduler(struct rq *rq, struct task_struct *p,
                           const struct sched_attr *attr, bool keep_boost)
{
    /* Set p->policy and the priority-related fields from attr */
    __setscheduler_params(p, attr);

    /*
     * Keep a potential priority boosting if called from
     * sched_setscheduler().
     */
    if (keep_boost)
        p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
    else
        /* Derive prio from the policy (see normal_prio() below) */
        p->prio = normal_prio(p);

    /* dl_prio(): p->prio < MAX_DL_PRIO */
    if (dl_prio(p->prio))
        p->sched_class = &dl_sched_class;
    /* rt_prio(): p->prio < MAX_RT_PRIO */
    else if (rt_prio(p->prio))
        p->sched_class = &rt_sched_class;
    else
        p->sched_class = &fair_sched_class;
}
```

# __setscheduler_params
[/kernel/sched/core.c](https://elixir.bootlin.com/linux/v4.9.140/source/kernel/sched/core.c#L3945)
```c=
static void __setscheduler_params(struct task_struct *p,
        const struct sched_attr *attr)
{
    /* SETPARAM_POLICY means: keep task p's current policy */
    int policy = attr->sched_policy;

    if (policy == SETPARAM_POLICY)
        policy = p->policy;

    p->policy = policy;

    /* Policy is SCHED_DEADLINE */
    if (dl_policy(policy))
        /* Fill in task p's sched_dl_entity, p->dl */
        __setparam_dl(p, attr);
    /* Policy is SCHED_NORMAL or SCHED_BATCH */
    else if (fair_policy(policy))
        /*
         * Set p->static_prio to
         * attr->sched_nice + MAX_RT_PRIO + (MAX_NICE - MIN_NICE + 1) / 2.
         * As long as attr->sched_nice lies between MIN_NICE and MAX_NICE,
         * p->static_prio is at least MAX_RT_PRIO.
         */
        p->static_prio = NICE_TO_PRIO(attr->sched_nice);

    /*
     * __sched_setscheduler() ensures attr->sched_priority == 0 when
     * !rt_policy. Always setting this ensures that things like
     * getparam()/getattr() don't report silly values for !rt tasks.
     */
    p->rt_priority = attr->sched_priority;
    /* Derive normal_prio from the policy */
    p->normal_prio = normal_prio(p);
    set_load_weight(p);
}
```
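__setscheduler() and __setscheduler_params() juggle several priority constants, so here is a small standalone program that works the numbers out. The constants are copied from include/linux/sched/prio.h and include/linux/sched/deadline.h (v4.9); the program itself is only an illustration.

```c
#include <stdio.h>

/* Constants as defined in include/linux/sched/prio.h and sched/deadline.h (v4.9) */
#define MAX_NICE            19
#define MIN_NICE            (-20)
#define NICE_WIDTH          (MAX_NICE - MIN_NICE + 1)       /* 40 */
#define MAX_RT_PRIO         100
#define MAX_DL_PRIO         0
#define DEFAULT_PRIO        (MAX_RT_PRIO + NICE_WIDTH / 2)  /* 120 */
#define NICE_TO_PRIO(nice)  ((nice) + DEFAULT_PRIO)

int main(void)
{
    /* Normal tasks: nice -20..19 maps onto prio 100..139 (>= MAX_RT_PRIO) */
    printf("nice -20 -> prio %d\n", NICE_TO_PRIO(-20));   /* 100 */
    printf("nice   0 -> prio %d\n", NICE_TO_PRIO(0));     /* 120 */
    printf("nice  19 -> prio %d\n", NICE_TO_PRIO(19));    /* 139 */

    /* RT tasks: rt_priority 1..99 maps onto prio 98..0 (lower prio = more urgent) */
    printf("rt_priority  1 -> prio %d\n", MAX_RT_PRIO - 1 - 1);    /* 98 */
    printf("rt_priority 99 -> prio %d\n", MAX_RT_PRIO - 1 - 99);   /*  0 */

    /* Deadline tasks always get prio MAX_DL_PRIO - 1 */
    printf("deadline       -> prio %d\n", MAX_DL_PRIO - 1);        /* -1 */
    return 0;
}
```

This is why dl_prio()/rt_prio() in __setscheduler() can classify a task just by comparing p->prio against MAX_DL_PRIO and MAX_RT_PRIO.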
# __sched_setscheduler
[/kernel/sched/core.c](https://elixir.bootlin.com/linux/v4.9.140/source/kernel/sched/core.c#L4076)
```c=
static int __sched_setscheduler(struct task_struct *p,
                                const struct sched_attr *attr,
                                bool user, bool pi)
{
    int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
              MAX_RT_PRIO - 1 - attr->sched_priority;
    int retval, oldprio, oldpolicy = -1, queued, running;
    int new_effective_prio, policy = attr->sched_policy;
    const struct sched_class *prev_class;
    struct rq_flags rf;
    int reset_on_fork;
    int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
    struct rq *rq;

    /* may grab non-irq protected spin_locks */
    BUG_ON(in_interrupt());
recheck:
    /* double check policy once rq lock held */
    if (policy < 0) {
        reset_on_fork = p->sched_reset_on_fork;
        policy = oldpolicy = p->policy;
    } else {
        reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);

        if (!valid_policy(policy))
            return -EINVAL;
    }

    if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
        return -EINVAL;

    /*
     * Valid priorities for SCHED_FIFO and SCHED_RR are
     * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
     * SCHED_BATCH and SCHED_IDLE is 0.
     */
    if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
        (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
        return -EINVAL;
    if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
        (rt_policy(policy) != (attr->sched_priority != 0)))
        return -EINVAL;

    /*
     * Allow unprivileged RT tasks to decrease priority:
     */
    if (user && !capable(CAP_SYS_NICE)) {
        if (fair_policy(policy)) {
            if (attr->sched_nice < task_nice(p) &&
                !can_nice(p, attr->sched_nice))
                return -EPERM;
        }

        if (rt_policy(policy)) {
            unsigned long rlim_rtprio =
                    task_rlimit(p, RLIMIT_RTPRIO);

            /* can't set/change the rt policy */
            if (policy != p->policy && !rlim_rtprio)
                return -EPERM;

            /* can't increase priority */
            if (attr->sched_priority > p->rt_priority &&
                attr->sched_priority > rlim_rtprio)
                return -EPERM;
        }

        /*
         * Can't set/change SCHED_DEADLINE policy at all for now
         * (safest behavior); in the future we would like to allow
         * unprivileged DL tasks to increase their relative deadline
         * or reduce their runtime (both ways reducing utilization)
         */
        if (dl_policy(policy))
            return -EPERM;

        /*
         * Treat SCHED_IDLE as nice 20. Only allow a switch to
         * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
         */
        if (idle_policy(p->policy) && !idle_policy(policy)) {
            if (!can_nice(p, task_nice(p)))
                return -EPERM;
        }

        /* can't change other user's priorities */
        if (!check_same_owner(p))
            return -EPERM;

        /* Normal users shall not reset the sched_reset_on_fork flag */
        if (p->sched_reset_on_fork && !reset_on_fork)
            return -EPERM;
    }

    if (user) {
        retval = security_task_setscheduler(p);
        if (retval)
            return retval;
    }

    /*
     * make sure no PI-waiters arrive (or leave) while we are
     * changing the priority of the task:
     *
     * To be able to change p->policy safely, the appropriate
     * runqueue lock must be held.
     */
    rq = task_rq_lock(p, &rf);

    /*
     * Changing the policy of the stop threads its a very bad idea
     */
    if (p == rq->stop) {
        task_rq_unlock(rq, p, &rf);
        return -EINVAL;
    }

    /*
     * If not changing anything there's no need to proceed further,
     * but store a possible modification of reset_on_fork.
     */
    if (unlikely(policy == p->policy)) {
        if (fair_policy(policy) && attr->sched_nice != task_nice(p))
            goto change;
        if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
            goto change;
        if (dl_policy(policy) && dl_param_changed(p, attr))
            goto change;

        p->sched_reset_on_fork = reset_on_fork;
        task_rq_unlock(rq, p, &rf);
        return 0;
    }
change:

    if (user) {
#ifdef CONFIG_RT_GROUP_SCHED
        /*
         * Do not allow realtime tasks into groups that have no runtime
         * assigned.
         */
        if (rt_bandwidth_enabled() && rt_policy(policy) &&
                task_group(p)->rt_bandwidth.rt_runtime == 0 &&
                !task_group_is_autogroup(task_group(p))) {
            task_rq_unlock(rq, p, &rf);
            return -EPERM;
        }
#endif
#ifdef CONFIG_SMP
        if (dl_bandwidth_enabled() && dl_policy(policy)) {
            cpumask_t *span = rq->rd->span;

            /*
             * Don't allow tasks with an affinity mask smaller than
             * the entire root_domain to become SCHED_DEADLINE. We
             * will also fail if there's no bandwidth available.
             */
            if (!cpumask_subset(span, &p->cpus_allowed) ||
                rq->rd->dl_bw.bw == 0) {
                task_rq_unlock(rq, p, &rf);
                return -EPERM;
            }
        }
#endif
    }

    /* recheck policy now with rq lock held */
    if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
        policy = oldpolicy = -1;
        task_rq_unlock(rq, p, &rf);
        goto recheck;
    }

    /*
     * If setscheduling to SCHED_DEADLINE (or changing the parameters
     * of a SCHED_DEADLINE task) we need to check if enough bandwidth
     * is available.
     */
    if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
        task_rq_unlock(rq, p, &rf);
        return -EBUSY;
    }

    p->sched_reset_on_fork = reset_on_fork;
    oldprio = p->prio;

    if (pi) {
        /*
         * Take priority boosted tasks into account. If the new
         * effective priority is unchanged, we just store the new
         * normal parameters and do not touch the scheduler class and
         * the runqueue. This will be done when the task deboost
         * itself.
         */
        new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
        if (new_effective_prio == oldprio)
            queue_flags &= ~DEQUEUE_MOVE;
    }

    queued = task_on_rq_queued(p);
    running = task_current(rq, p);
    if (queued)
        dequeue_task(rq, p, queue_flags);
    if (running)
        put_prev_task(rq, p);

    prev_class = p->sched_class;
    __setscheduler(rq, p, attr, pi);

    if (queued) {
        /*
         * We enqueue to tail when the priority of a task is
         * increased (user space view).
         */
        if (oldprio < p->prio)
            queue_flags |= ENQUEUE_HEAD;

        enqueue_task(rq, p, queue_flags);
    }
    if (running)
        set_curr_task(rq, p);

    check_class_changed(rq, p, prev_class, oldprio);

    preempt_disable(); /* avoid rq from going away on us */
    task_rq_unlock(rq, p, &rf);

    if (pi)
        rt_mutex_adjust_pi(p);

    /*
     * Run balance callbacks after we've adjusted the PI chain.
     */
    balance_callback(rq);
    preempt_enable();

    return 0;
}
```

# sched_setattr
[/kernel/sched/core.c](https://elixir.bootlin.com/linux/v4.9.140/source/kernel/sched/core.c#L4349)
```c=
int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
{
    return __sched_setscheduler(p, attr, true, true);
}
```
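SCHED_DEADLINE parameters cannot be expressed through the older sched_setscheduler(2)/sched_param interface, so userspace reaches the sched_setattr() path above via the sched_setattr(2) syscall. Below is a hedged userspace sketch: glibc provides no wrapper, so the struct is hand-copied to mirror the kernel's struct sched_attr, the raw syscall is used, and the 10 ms / 100 ms reservation values are arbitrary example numbers. CAP_SYS_NICE is required, per the dl_policy() checks in __sched_setscheduler().

```c
#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Hand-copied to mirror the kernel's struct sched_attr */
struct sched_attr {
    uint32_t size;
    uint32_t sched_policy;
    uint64_t sched_flags;
    int32_t  sched_nice;        /* SCHED_NORMAL / SCHED_BATCH */
    uint32_t sched_priority;    /* SCHED_FIFO / SCHED_RR */
    uint64_t sched_runtime;     /* SCHED_DEADLINE, nanoseconds */
    uint64_t sched_deadline;
    uint64_t sched_period;
};

#define SCHED_DEADLINE 6

int main(void)
{
    struct sched_attr attr = {
        .size           = sizeof(attr),
        .sched_policy   = SCHED_DEADLINE,
        .sched_runtime  =  10 * 1000 * 1000,    /*  10 ms of runtime ...      */
        .sched_deadline = 100 * 1000 * 1000,    /* ... every 100 ms period,   */
        .sched_period   = 100 * 1000 * 1000,    /*     deadline == period     */
    };

    /* pid 0 = the calling thread; the flags argument is currently always 0 */
    if (syscall(__NR_sched_setattr, 0, &attr, 0) == -1) {
        fprintf(stderr, "sched_setattr: %s\n", strerror(errno));
        return 1;
    }
    printf("running under SCHED_DEADLINE\n");
    return 0;
}
```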
# sched_setparam

# normal_prio
[/kernel/sched/core.c](https://elixir.bootlin.com/linux/v4.9.140/source/kernel/sched/core.c#L886)
```c=
static inline int normal_prio(struct task_struct *p)
{
    int prio;

    /*
     * If p's policy is SCHED_DEADLINE:
     *   return MAX_DL_PRIO - 1.
     *
     * If p's policy is SCHED_FIFO or SCHED_RR:
     *   return MAX_RT_PRIO - 1 - p->rt_priority;
     *   for 0 < p->rt_priority < MAX_RT_PRIO the result satisfies
     *   MAX_DL_PRIO <= result < MAX_RT_PRIO.
     *
     * For any other policy:
     *   return p->static_prio.
     */
    if (task_has_dl_policy(p))
        prio = MAX_DL_PRIO-1;
    else if (task_has_rt_policy(p))
        prio = MAX_RT_PRIO-1 - p->rt_priority;
    else
        prio = __normal_prio(p);
    return prio;
}
```

# rt_mutex_get_effective_prio
[/include/linux/sched/rt.h](https://elixir.bootlin.com/linux/v4.9.140/source/include/linux/sched/rt.h#L34)
```c=
static inline int rt_mutex_get_effective_prio(struct task_struct *task,
                                              int newprio)
{
    return newprio;
}
```
This is the !CONFIG_RT_MUTEXES stub, which simply returns the requested priority; with rt-mutexes enabled, the implementation in kernel/locking/rtmutex.c also takes priority-inheritance boosting into account.

###### tags: `OS` `Linux` `Scheduling`