X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=kernel%2Fkernel%2Fsched%2Fcore.c;h=94827a59301e60c4979dae3f03629dd0a27229be;hb=e09b41010ba33a20a87472ee821fa407a5b8da36;hp=799b75b273a29e3b7f5caf692b045cb797ea9ba8;hpb=ddbae00816a243e16209f291af562d9f1bb3d4c2;p=kvmfornfv.git diff --git a/kernel/kernel/sched/core.c b/kernel/kernel/sched/core.c index 799b75b27..94827a593 100644 --- a/kernel/kernel/sched/core.c +++ b/kernel/kernel/sched/core.c @@ -90,26 +90,6 @@ #define CREATE_TRACE_POINTS #include -void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) -{ - unsigned long delta; - ktime_t soft, hard, now; - - for (;;) { - if (hrtimer_active(period_timer)) - break; - - now = hrtimer_cb_get_time(period_timer); - hrtimer_forward(period_timer, now, period); - - soft = hrtimer_get_softexpires(period_timer); - hard = hrtimer_get_expires(period_timer); - delta = ktime_to_ns(ktime_sub(hard, soft)); - __hrtimer_start_range_ns(period_timer, soft, delta, - HRTIMER_MODE_ABS_PINNED, 0); - } -} - DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -184,14 +164,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { static void sched_feat_disable(int i) { - if (static_key_enabled(&sched_feat_keys[i])) - static_key_slow_dec(&sched_feat_keys[i]); + static_key_disable(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { - if (!static_key_enabled(&sched_feat_keys[i])) - static_key_slow_inc(&sched_feat_keys[i]); + static_key_enable(&sched_feat_keys[i]); } #else static void sched_feat_disable(int i) { }; @@ -359,12 +337,11 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) #ifdef CONFIG_SMP -static int __hrtick_restart(struct rq *rq) +static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time = hrtimer_get_softexpires(timer); - return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); } /* @@ -444,8 +421,8 @@ void hrtick_start(struct rq *rq, u64 delay) * doesn't make sense. Rely on vruntime for fairness. */ delay = max_t(u64, delay, 10000LL); - __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, - HRTIMER_MODE_REL_PINNED, 0); + hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), + HRTIMER_MODE_REL_PINNED); } static inline void init_hrtick(void) @@ -516,7 +493,7 @@ static bool set_nr_and_not_polling(struct task_struct *p) static bool set_nr_if_polling(struct task_struct *p) { struct thread_info *ti = task_thread_info(p); - typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); + typeof(ti->flags) old, val = READ_ONCE(ti->flags); for (;;) { if (!(val & _TIF_POLLING_NRFLAG)) @@ -570,7 +547,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) head->lastp = &node->next; } -void wake_up_q(struct wake_q_head *head) +void __wake_up_q(struct wake_q_head *head, bool sleeper) { struct wake_q_node *node = head->first; @@ -587,7 +564,10 @@ void wake_up_q(struct wake_q_head *head) * wake_up_process() implies a wmb() to pair with the queueing * in wake_q_add() so as not to miss wakeups. */ - wake_up_process(task); + if (sleeper) + wake_up_lock_sleeper(task); + else + wake_up_process(task); put_task_struct(task); } } @@ -676,26 +656,29 @@ void resched_cpu(int cpu) * selecting an idle cpu will add more delays to the timers than intended * (as that cpu's timer base may not be uptodate wrt jiffies etc). 
*/ -int get_nohz_timer_target(int pinned) +int get_nohz_timer_target(void) { - int cpu; - int i; + int i, cpu; struct sched_domain *sd; preempt_disable_rt(); cpu = smp_processor_id(); - if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) + + if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu)) goto preempt_en_rt; rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { - if (!idle_cpu(i)) { + if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) { cpu = i; goto unlock; } } } + + if (!is_housekeeping_cpu(cpu)) + cpu = housekeeping_any_cpu(); unlock: rcu_read_unlock(); preempt_en_rt: @@ -879,7 +862,7 @@ static void set_load_weight(struct task_struct *p) /* * SCHED_IDLE tasks get minimal weight: */ - if (p->policy == SCHED_IDLE) { + if (idle_policy(p->policy)) { load->weight = scale_load(WEIGHT_IDLEPRIO); load->inv_weight = WMULT_IDLEPRIO; return; @@ -889,17 +872,19 @@ static void set_load_weight(struct task_struct *p) load->inv_weight = prio_to_wmult[prio]; } -static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) +static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_queued(rq, p); + if (!(flags & ENQUEUE_RESTORE)) + sched_info_queued(rq, p); p->sched_class->enqueue_task(rq, p, flags); } -static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) +static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_dequeued(rq, p); + if (!(flags & DEQUEUE_SAVE)) + sched_info_dequeued(rq, p); p->sched_class->dequeue_task(rq, p, flags); } @@ -1063,7 +1048,11 @@ inline int task_curr(const struct task_struct *p) } /* - * Can drop rq->lock because from sched_class::switched_from() methods drop it. + * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, + * use the balance_callback list if you want balancing. + * + * this means any call to check_class_changed() must be followed by a call to + * balance_callback(). */ static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, @@ -1072,7 +1061,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, if (prev_class != p->sched_class) { if (prev_class->switched_from) prev_class->switched_from(rq, p); - /* Possble rq->lock 'hole'. */ + p->sched_class->switched_to(rq, p); } else if (oldprio != p->prio || dl_task(p)) p->sched_class->prio_changed(rq, p, oldprio); @@ -1104,3636 +1093,3675 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP -void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -{ -#ifdef CONFIG_SCHED_DEBUG - /* - * We should never call set_task_cpu() on a blocked task, - * ttwu() will sort out the placement. - */ - WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !p->on_rq); - -#ifdef CONFIG_LOCKDEP - /* - * The caller should hold either p->pi_lock or rq->lock, when changing - * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. - * - * sched_move_task() holds both and thus holding either pins the cgroup, - * see task_group(). - * - * Furthermore, all task_rq users should acquire both locks, see - * task_rq_lock(). 
- */ - WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&task_rq(p)->lock))); -#endif -#endif - - trace_sched_migrate_task(p, new_cpu); +/* + * This is how migration works: + * + * 1) we invoke migration_cpu_stop() on the target CPU using + * stop_one_cpu(). + * 2) stopper starts to run (implicitly forcing the migrated thread + * off the CPU) + * 3) it checks whether the migrated task is still in the wrong runqueue. + * 4) if it's in the wrong runqueue then the migration thread removes + * it and puts it into the right queue. + * 5) stopper completes and stop_one_cpu() returns and the migration + * is done. + */ - if (task_cpu(p) != new_cpu) { - if (p->sched_class->migrate_task_rq) - p->sched_class->migrate_task_rq(p, new_cpu); - p->se.nr_migrations++; - perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); - } +/* + * move_queued_task - move a queued task to new rq. + * + * Returns (locked) new rq. Old rq's lock is released. + */ +static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu) +{ + lockdep_assert_held(&rq->lock); - __set_task_cpu(p, new_cpu); -} + dequeue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; + set_task_cpu(p, new_cpu); + raw_spin_unlock(&rq->lock); -static void __migrate_swap_task(struct task_struct *p, int cpu) -{ - if (task_on_rq_queued(p)) { - struct rq *src_rq, *dst_rq; + rq = cpu_rq(new_cpu); - src_rq = task_rq(p); - dst_rq = cpu_rq(cpu); + raw_spin_lock(&rq->lock); + BUG_ON(task_cpu(p) != new_cpu); + p->on_rq = TASK_ON_RQ_QUEUED; + enqueue_task(rq, p, 0); + check_preempt_curr(rq, p, 0); - deactivate_task(src_rq, p, 0); - set_task_cpu(p, cpu); - activate_task(dst_rq, p, 0); - check_preempt_curr(dst_rq, p, 0); - } else { - /* - * Task isn't running anymore; make it appear like we migrated - * it before it went to sleep. This means on wakeup we make the - * previous cpu our targer instead of where it really is. - */ - p->wake_cpu = cpu; - } + return rq; } -struct migration_swap_arg { - struct task_struct *src_task, *dst_task; - int src_cpu, dst_cpu; +struct migration_arg { + struct task_struct *task; + int dest_cpu; }; -static int migrate_swap_stop(void *data) +/* + * Move (not current) task off this cpu, onto dest cpu. We're doing + * this because either it can't run here any more (set_cpus_allowed() + * away from this CPU, or CPU going down), or because we're + * attempting to rebalance this task on exec (sched_exec). + * + * So we race with normal scheduler movements, but that's OK, as long + * as the task is no longer on this CPU. + */ +static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu) { - struct migration_swap_arg *arg = data; - struct rq *src_rq, *dst_rq; - int ret = -EAGAIN; - - src_rq = cpu_rq(arg->src_cpu); - dst_rq = cpu_rq(arg->dst_cpu); - - double_raw_lock(&arg->src_task->pi_lock, - &arg->dst_task->pi_lock); - double_rq_lock(src_rq, dst_rq); - if (task_cpu(arg->dst_task) != arg->dst_cpu) - goto unlock; + if (unlikely(!cpu_active(dest_cpu))) + return rq; - if (task_cpu(arg->src_task) != arg->src_cpu) - goto unlock; + /* Affinity changed (again). 
*/ + if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + return rq; - if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) - goto unlock; + rq = move_queued_task(rq, p, dest_cpu); - if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) - goto unlock; + return rq; +} - __migrate_swap_task(arg->src_task, arg->dst_cpu); - __migrate_swap_task(arg->dst_task, arg->src_cpu); +/* + * migration_cpu_stop - this will be executed by a highprio stopper thread + * and performs thread migration by bumping thread off CPU then + * 'pushing' onto another runqueue. + */ +static int migration_cpu_stop(void *data) +{ + struct migration_arg *arg = data; + struct task_struct *p = arg->task; + struct rq *rq = this_rq(); - ret = 0; + /* + * The original target cpu might have gone down and we might + * be on another cpu but it doesn't matter. + */ + local_irq_disable(); + /* + * We need to explicitly wake pending tasks before running + * __migrate_task() such that we will not miss enforcing cpus_allowed + * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. + */ + sched_ttwu_pending(); -unlock: - double_rq_unlock(src_rq, dst_rq); - raw_spin_unlock(&arg->dst_task->pi_lock); - raw_spin_unlock(&arg->src_task->pi_lock); + raw_spin_lock(&p->pi_lock); + raw_spin_lock(&rq->lock); + /* + * If task_rq(p) != rq, it cannot be migrated here, because we're + * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because + * we're holding p->pi_lock. + */ + if (task_rq(p) == rq && task_on_rq_queued(p)) + rq = __migrate_task(rq, p, arg->dest_cpu); + raw_spin_unlock(&rq->lock); + raw_spin_unlock(&p->pi_lock); - return ret; + local_irq_enable(); + return 0; } /* - * Cross migrate two tasks + * sched_class::set_cpus_allowed must do the below, but is not required to + * actually call this function. */ -int migrate_swap(struct task_struct *cur, struct task_struct *p) +void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) { - struct migration_swap_arg arg; - int ret = -EINVAL; + cpumask_copy(&p->cpus_allowed, new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); +} - arg = (struct migration_swap_arg){ - .src_task = cur, - .src_cpu = task_cpu(cur), - .dst_task = p, - .dst_cpu = task_cpu(p), - }; +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + struct rq *rq = task_rq(p); + bool queued, running; - if (arg.src_cpu == arg.dst_cpu) - goto out; + lockdep_assert_held(&p->pi_lock); - /* - * These three tests are all lockless; this is OK since all of them - * will be re-checked with proper locks held further down the line. - */ - if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) - goto out; + if (__migrate_disabled(p)) { + cpumask_copy(&p->cpus_allowed, new_mask); + return; + } - if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) - goto out; + queued = task_on_rq_queued(p); + running = task_current(rq, p); - if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) - goto out; + if (queued) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. 
+ */ + lockdep_assert_held(&rq->lock); + dequeue_task(rq, p, DEQUEUE_SAVE); + } + if (running) + put_prev_task(rq, p); - trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); - ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); + p->sched_class->set_cpus_allowed(p, new_mask); -out: - return ret; + if (running) + p->sched_class->set_curr_task(rq); + if (queued) + enqueue_task(rq, p, ENQUEUE_RESTORE); } -struct migration_arg { - struct task_struct *task; - int dest_cpu; -}; - -static int migration_cpu_stop(void *data); +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks); +static DEFINE_MUTEX(sched_down_mutex); +static cpumask_t sched_down_cpumask; -static bool check_task_state(struct task_struct *p, long match_state) +void tell_sched_cpu_down_begin(int cpu) { - bool match = false; - - raw_spin_lock_irq(&p->pi_lock); - if (p->state == match_state || p->saved_state == match_state) - match = true; - raw_spin_unlock_irq(&p->pi_lock); + mutex_lock(&sched_down_mutex); + cpumask_set_cpu(cpu, &sched_down_cpumask); + mutex_unlock(&sched_down_mutex); +} - return match; +void tell_sched_cpu_down_done(int cpu) +{ + mutex_lock(&sched_down_mutex); + cpumask_clear_cpu(cpu, &sched_down_cpumask); + mutex_unlock(&sched_down_mutex); } -/* - * wait_task_inactive - wait for a thread to unschedule. - * - * If @match_state is nonzero, it's the @p->state value just checked and - * not expected to change. If it changes, i.e. @p might have woken up, - * then return zero. When we succeed in waiting for @p to be off its CPU, - * we return a positive number (its total switch count). If a second call - * a short while later returns the same number, the caller can be sure that - * @p has remained unscheduled the whole time. +/** + * migrate_me - try to move the current task off this cpu * - * The caller must ensure that the task *will* unschedule sometime soon, - * else this function might spin for a *long* time. This function can't - * be called with interrupts off, or it may introduce deadlock with - * smp_call_function() if an IPI is sent by the same process we are - * waiting to become inactive. + * Used by the pin_current_cpu() code to try to get tasks + * to move off the current CPU as it is going down. + * It will only move the task if the task isn't pinned to + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY) + * and the task has to be in a RUNNING state. Otherwise the + * movement of the task will wake it up (change its state + * to running) when the task did not expect it. + * + * Returns 1 if it succeeded in moving the current task + * 0 otherwise. */ -unsigned long wait_task_inactive(struct task_struct *p, long match_state) +int migrate_me(void) { + struct task_struct *p = current; + struct migration_arg arg; + struct cpumask *cpumask; + struct cpumask *mask; unsigned long flags; - int running, queued; - unsigned long ncsw; + unsigned int dest_cpu; struct rq *rq; - for (;;) { - /* - * We do the initial early heuristics without holding - * any task-queue locks at all. We'll only try to get - * the runqueue lock when things look like they will - * work out! - */ - rq = task_rq(p); - - /* - * If the task is actively running on another CPU - * still, just relax and busy-wait without holding - * any locks. - * - * NOTE! Since we don't hold any locks, it's not - * even sure that "rq" stays as the right runqueue! - * But we don't care, since "task_running()" will - * return false if the runqueue has changed and p - * is actually now running somewhere else! 
- */ - while (task_running(rq, p)) { - if (match_state && !check_task_state(p, match_state)) - return 0; - cpu_relax(); - } + /* + * We can not migrate tasks bounded to a CPU or tasks not + * running. The movement of the task will wake it up. + */ + if (p->flags & PF_NO_SETAFFINITY || p->state) + return 0; - /* - * Ok, time to look more closely! We need the rq - * lock now, to be *sure*. If we're wrong, we'll - * just go back and repeat. - */ - rq = task_rq_lock(p, &flags); - trace_sched_wait_task(p); - running = task_running(rq, p); - queued = task_on_rq_queued(p); - ncsw = 0; - if (!match_state || p->state == match_state || - p->saved_state == match_state) - ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_rq_unlock(rq, p, &flags); + mutex_lock(&sched_down_mutex); + rq = task_rq_lock(p, &flags); - /* - * If it changed from the expected state, bail out now. - */ - if (unlikely(!ncsw)) - break; + cpumask = this_cpu_ptr(&sched_cpumasks); + mask = &p->cpus_allowed; - /* - * Was it really running after all now that we - * checked with the proper locks actually held? - * - * Oops. Go back and try again.. - */ - if (unlikely(running)) { - cpu_relax(); - continue; - } + cpumask_andnot(cpumask, mask, &sched_down_cpumask); - /* - * It's not enough that it's not actively running, - * it must be off the runqueue _entirely_, and not - * preempted! - * - * So if it was still runnable (but just not actively - * running right now), it's preempted, and we should - * yield - it could be a while. - */ - if (unlikely(queued)) { - ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); + if (!cpumask_weight(cpumask)) { + /* It's only on this CPU? */ + task_rq_unlock(rq, p, &flags); + mutex_unlock(&sched_down_mutex); + return 0; + } - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_hrtimeout(&to, HRTIMER_MODE_REL); - continue; - } + dest_cpu = cpumask_any_and(cpu_active_mask, cpumask); - /* - * Ahh, all good. It wasn't running, and it wasn't - * runnable, which means that it will never become - * running in the future either. We're all done! - */ - break; - } + arg.task = p; + arg.dest_cpu = dest_cpu; - return ncsw; -} + task_rq_unlock(rq, p, &flags); -/*** - * kick_process - kick a running thread to enter/exit the kernel - * @p: the to-be-kicked thread - * - * Cause a process which is running on another CPU to enter - * kernel-mode, without any delay. (to get signals handled.) - * - * NOTE: this function doesn't have to take the runqueue lock, - * because all it wants to ensure is that the remote task enters - * the kernel. If the IPI races and the task has been migrated - * to another CPU then no harm is done and the purpose has been - * achieved as well. - */ -void kick_process(struct task_struct *p) -{ - int cpu; + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + tlb_migrate_finish(p->mm); + mutex_unlock(&sched_down_mutex); - preempt_disable(); - cpu = task_cpu(p); - if ((cpu != smp_processor_id()) && task_curr(p)) - smp_send_reschedule(cpu); - preempt_enable(); + return 1; } -EXPORT_SYMBOL_GPL(kick_process); -#endif /* CONFIG_SMP */ -#ifdef CONFIG_SMP /* - * ->cpus_allowed is protected by both rq->lock and p->pi_lock + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. 
*/ -static int select_fallback_rq(int cpu, struct task_struct *p) +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) { - int nid = cpu_to_node(cpu); - const struct cpumask *nodemask = NULL; - enum { cpuset, possible, fail } state = cpuset; - int dest_cpu; + unsigned long flags; + struct rq *rq; + unsigned int dest_cpu; + int ret = 0; + + rq = task_rq_lock(p, &flags); /* - * If the node that the cpu is on has been offlined, cpu_to_node() - * will return -1. There is no cpu on the node, and we should - * select the cpu on the other node. + * Must re-check here, to close a race against __kthread_bind(), + * sched_setaffinity() is not guaranteed to observe the flag. */ - if (nid != -1) { - nodemask = cpumask_of_node(nid); - - /* Look for allowed, online CPU in same node. */ - for_each_cpu(dest_cpu, nodemask) { - if (!cpu_online(dest_cpu)) - continue; - if (!cpu_active(dest_cpu)) - continue; - if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) - return dest_cpu; - } + if (check && (p->flags & PF_NO_SETAFFINITY)) { + ret = -EINVAL; + goto out; } - for (;;) { - /* Any allowed, online CPU? */ - for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { - if (!cpu_online(dest_cpu)) - continue; - if (!cpu_active(dest_cpu)) - continue; - goto out; - } + if (cpumask_equal(&p->cpus_allowed, new_mask)) + goto out; - switch (state) { - case cpuset: - /* No more Mr. Nice Guy. */ - cpuset_cpus_allowed_fallback(p); - state = possible; - break; + if (!cpumask_intersects(new_mask, cpu_active_mask)) { + ret = -EINVAL; + goto out; + } - case possible: - do_set_cpus_allowed(p, cpu_possible_mask); - state = fail; - break; + do_set_cpus_allowed(p, new_mask); - case fail: - BUG(); - break; - } - } + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p)) + goto out; -out: - if (state != cpuset) { + dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + if (task_running(rq, p) || p->state == TASK_WAKING) { + struct migration_arg arg = { p, dest_cpu }; + /* Need help from migration thread: drop lock and wait. */ + task_rq_unlock(rq, p, &flags); + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + tlb_migrate_finish(p->mm); + return 0; + } else if (task_on_rq_queued(p)) { /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. + * OK, since we're going to drop the lock immediately + * afterwards anyway. */ - if (p->mm && printk_ratelimit()) { - printk_deferred("process %d (%s) no longer affine to cpu%d\n", - task_pid_nr(p), p->comm, cpu); - } + lockdep_unpin_lock(&rq->lock); + rq = move_queued_task(rq, p, dest_cpu); + lockdep_pin_lock(&rq->lock); } +out: + task_rq_unlock(rq, p, &flags); - return dest_cpu; + return ret; } -/* - * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. - */ -static inline -int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { - if (p->nr_cpus_allowed > 1) - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + return __set_cpus_allowed_ptr(p, new_mask, false); +} +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); +void set_task_cpu(struct task_struct *p, unsigned int new_cpu) +{ +#ifdef CONFIG_SCHED_DEBUG /* - * In order not to call set_task_cpu() on a blocking task we need - * to rely on ttwu() to place the task on a valid ->cpus_allowed - * cpu. 
- * - * Since this is common to all placement strategies, this lives here. - * - * [ this allows ->select_task() to simply return task_cpu(p) and - * not worry about this generic constraint ] + * We should never call set_task_cpu() on a blocked task, + * ttwu() will sort out the placement. */ - if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || - !cpu_online(cpu))) - cpu = select_fallback_rq(task_cpu(p), p); + WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && + !p->on_rq); - return cpu; -} +#ifdef CONFIG_LOCKDEP + /* + * The caller should hold either p->pi_lock or rq->lock, when changing + * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. + * + * sched_move_task() holds both and thus holding either pins the cgroup, + * see task_group(). + * + * Furthermore, all task_rq users should acquire both locks, see + * task_rq_lock(). + */ + WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || + lockdep_is_held(&task_rq(p)->lock))); +#endif +#endif -static void update_avg(u64 *avg, u64 sample) -{ - s64 diff = sample - *avg; - *avg += diff >> 3; + trace_sched_migrate_task(p, new_cpu); + + if (task_cpu(p) != new_cpu) { + if (p->sched_class->migrate_task_rq) + p->sched_class->migrate_task_rq(p); + p->se.nr_migrations++; + perf_event_task_migrate(p); + } + + __set_task_cpu(p, new_cpu); } -#endif -static void -ttwu_stat(struct task_struct *p, int cpu, int wake_flags) +static void __migrate_swap_task(struct task_struct *p, int cpu) { -#ifdef CONFIG_SCHEDSTATS - struct rq *rq = this_rq(); + if (task_on_rq_queued(p)) { + struct rq *src_rq, *dst_rq; -#ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); + src_rq = task_rq(p); + dst_rq = cpu_rq(cpu); - if (cpu == this_cpu) { - schedstat_inc(rq, ttwu_local); - schedstat_inc(p, se.statistics.nr_wakeups_local); + deactivate_task(src_rq, p, 0); + set_task_cpu(p, cpu); + activate_task(dst_rq, p, 0); + check_preempt_curr(dst_rq, p, 0); } else { - struct sched_domain *sd; - - schedstat_inc(p, se.statistics.nr_wakeups_remote); - rcu_read_lock(); - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd, ttwu_wake_remote); - break; - } - } - rcu_read_unlock(); + /* + * Task isn't running anymore; make it appear like we migrated + * it before it went to sleep. This means on wakeup we make the + * previous cpu our targer instead of where it really is. + */ + p->wake_cpu = cpu; } +} - if (wake_flags & WF_MIGRATED) - schedstat_inc(p, se.statistics.nr_wakeups_migrate); - -#endif /* CONFIG_SMP */ +struct migration_swap_arg { + struct task_struct *src_task, *dst_task; + int src_cpu, dst_cpu; +}; - schedstat_inc(rq, ttwu_count); - schedstat_inc(p, se.statistics.nr_wakeups); +static int migrate_swap_stop(void *data) +{ + struct migration_swap_arg *arg = data; + struct rq *src_rq, *dst_rq; + int ret = -EAGAIN; - if (wake_flags & WF_SYNC) - schedstat_inc(p, se.statistics.nr_wakeups_sync); + if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu)) + return -EAGAIN; -#endif /* CONFIG_SCHEDSTATS */ -} + src_rq = cpu_rq(arg->src_cpu); + dst_rq = cpu_rq(arg->dst_cpu); -static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) -{ - activate_task(rq, p, en_flags); - p->on_rq = TASK_ON_RQ_QUEUED; -} + double_raw_lock(&arg->src_task->pi_lock, + &arg->dst_task->pi_lock); + double_rq_lock(src_rq, dst_rq); -/* - * Mark the task runnable and perform wakeup-preemption. 
- */ -static void -ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) -{ - check_preempt_curr(rq, p, wake_flags); - trace_sched_wakeup(p, true); + if (task_cpu(arg->dst_task) != arg->dst_cpu) + goto unlock; - p->state = TASK_RUNNING; -#ifdef CONFIG_SMP - if (p->sched_class->task_woken) - p->sched_class->task_woken(rq, p); + if (task_cpu(arg->src_task) != arg->src_cpu) + goto unlock; - if (rq->idle_stamp) { - u64 delta = rq_clock(rq) - rq->idle_stamp; - u64 max = 2*rq->max_idle_balance_cost; + if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) + goto unlock; - update_avg(&rq->avg_idle, delta); + if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) + goto unlock; - if (rq->avg_idle > max) - rq->avg_idle = max; + __migrate_swap_task(arg->src_task, arg->dst_cpu); + __migrate_swap_task(arg->dst_task, arg->src_cpu); - rq->idle_stamp = 0; - } -#endif -} + ret = 0; -static void -ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) -{ -#ifdef CONFIG_SMP - if (p->sched_contributes_to_load) - rq->nr_uninterruptible--; -#endif +unlock: + double_rq_unlock(src_rq, dst_rq); + raw_spin_unlock(&arg->dst_task->pi_lock); + raw_spin_unlock(&arg->src_task->pi_lock); - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); - ttwu_do_wakeup(rq, p, wake_flags); + return ret; } /* - * Called in case the task @p isn't fully descheduled from its runqueue, - * in this case we must do a remote wakeup. Its a 'light' wakeup though, - * since all we need to do is flip p->state to TASK_RUNNING, since - * the task is still ->on_rq. + * Cross migrate two tasks */ -static int ttwu_remote(struct task_struct *p, int wake_flags) +int migrate_swap(struct task_struct *cur, struct task_struct *p) { - struct rq *rq; - int ret = 0; + struct migration_swap_arg arg; + int ret = -EINVAL; - rq = __task_rq_lock(p); - if (task_on_rq_queued(p)) { - /* check_preempt_curr() may use rq clock */ - update_rq_clock(rq); - ttwu_do_wakeup(rq, p, wake_flags); - ret = 1; - } - __task_rq_unlock(rq); + arg = (struct migration_swap_arg){ + .src_task = cur, + .src_cpu = task_cpu(cur), + .dst_task = p, + .dst_cpu = task_cpu(p), + }; - return ret; -} + if (arg.src_cpu == arg.dst_cpu) + goto out; -#ifdef CONFIG_SMP -void sched_ttwu_pending(void) -{ - struct rq *rq = this_rq(); - struct llist_node *llist = llist_del_all(&rq->wake_list); - struct task_struct *p; - unsigned long flags; + /* + * These three tests are all lockless; this is OK since all of them + * will be re-checked with proper locks held further down the line. + */ + if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) + goto out; - if (!llist) - return; + if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) + goto out; - raw_spin_lock_irqsave(&rq->lock, flags); + if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) + goto out; - while (llist) { - p = llist_entry(llist, struct task_struct, wake_entry); - llist = llist_next(llist); - ttwu_do_activate(rq, p, 0); - } + trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); + ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); - raw_spin_unlock_irqrestore(&rq->lock, flags); +out: + return ret; } -void scheduler_ipi(void) +static bool check_task_state(struct task_struct *p, long match_state) { - /* - * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting - * TIF_NEED_RESCHED remotely (for the first time) will also send - * this IPI. 
- */ - preempt_fold_need_resched(); - - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) - return; + bool match = false; - /* - * Not all reschedule IPI handlers call irq_enter/irq_exit, since - * traditionally all their work was done from the interrupt return - * path. Now that we actually do some work, we need to make sure - * we do call them. - * - * Some archs already do call them, luckily irq_enter/exit nest - * properly. - * - * Arguably we should visit all archs and update all handlers, - * however a fair share of IPIs are still resched only so this would - * somewhat pessimize the simple resched case. - */ - irq_enter(); - sched_ttwu_pending(); + raw_spin_lock_irq(&p->pi_lock); + if (p->state == match_state || p->saved_state == match_state) + match = true; + raw_spin_unlock_irq(&p->pi_lock); - /* - * Check if someone kicked us for doing the nohz idle load balance. - */ - if (unlikely(got_nohz_idle_kick())) { - this_rq()->idle_balance = 1; - raise_softirq_irqoff(SCHED_SOFTIRQ); - } - irq_exit(); + return match; } -static void ttwu_queue_remote(struct task_struct *p, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { - if (!set_nr_if_polling(rq->idle)) - smp_send_reschedule(cpu); - else - trace_sched_wake_idle_without_ipi(cpu); - } -} - -void wake_up_if_idle(int cpu) +/* + * wait_task_inactive - wait for a thread to unschedule. + * + * If @match_state is nonzero, it's the @p->state value just checked and + * not expected to change. If it changes, i.e. @p might have woken up, + * then return zero. When we succeed in waiting for @p to be off its CPU, + * we return a positive number (its total switch count). If a second call + * a short while later returns the same number, the caller can be sure that + * @p has remained unscheduled the whole time. + * + * The caller must ensure that the task *will* unschedule sometime soon, + * else this function might spin for a *long* time. This function can't + * be called with interrupts off, or it may introduce deadlock with + * smp_call_function() if an IPI is sent by the same process we are + * waiting to become inactive. + */ +unsigned long wait_task_inactive(struct task_struct *p, long match_state) { - struct rq *rq = cpu_rq(cpu); unsigned long flags; + int running, queued; + unsigned long ncsw; + struct rq *rq; - rcu_read_lock(); + for (;;) { + /* + * We do the initial early heuristics without holding + * any task-queue locks at all. We'll only try to get + * the runqueue lock when things look like they will + * work out! + */ + rq = task_rq(p); - if (!is_idle_task(rcu_dereference(rq->curr))) - goto out; + /* + * If the task is actively running on another CPU + * still, just relax and busy-wait without holding + * any locks. + * + * NOTE! Since we don't hold any locks, it's not + * even sure that "rq" stays as the right runqueue! + * But we don't care, since "task_running()" will + * return false if the runqueue has changed and p + * is actually now running somewhere else! + */ + while (task_running(rq, p)) { + if (match_state && !check_task_state(p, match_state)) + return 0; + cpu_relax(); + } - if (set_nr_if_polling(rq->idle)) { - trace_sched_wake_idle_without_ipi(cpu); - } else { - raw_spin_lock_irqsave(&rq->lock, flags); - if (is_idle_task(rq->curr)) - smp_send_reschedule(cpu); - /* Else cpu is not in idle, do nothing here */ - raw_spin_unlock_irqrestore(&rq->lock, flags); - } + /* + * Ok, time to look more closely! We need the rq + * lock now, to be *sure*. 
If we're wrong, we'll + * just go back and repeat. + */ + rq = task_rq_lock(p, &flags); + trace_sched_wait_task(p); + running = task_running(rq, p); + queued = task_on_rq_queued(p); + ncsw = 0; + if (!match_state || p->state == match_state || + p->saved_state == match_state) + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ + task_rq_unlock(rq, p, &flags); -out: - rcu_read_unlock(); -} + /* + * If it changed from the expected state, bail out now. + */ + if (unlikely(!ncsw)) + break; -bool cpus_share_cache(int this_cpu, int that_cpu) -{ - return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); -} -#endif /* CONFIG_SMP */ + /* + * Was it really running after all now that we + * checked with the proper locks actually held? + * + * Oops. Go back and try again.. + */ + if (unlikely(running)) { + cpu_relax(); + continue; + } -static void ttwu_queue(struct task_struct *p, int cpu) -{ - struct rq *rq = cpu_rq(cpu); + /* + * It's not enough that it's not actively running, + * it must be off the runqueue _entirely_, and not + * preempted! + * + * So if it was still runnable (but just not actively + * running right now), it's preempted, and we should + * yield - it could be a while. + */ + if (unlikely(queued)) { + ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); -#if defined(CONFIG_SMP) - if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { - sched_clock_cpu(cpu); /* sync clocks x-cpu */ - ttwu_queue_remote(p, cpu); - return; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&to, HRTIMER_MODE_REL); + continue; + } + + /* + * Ahh, all good. It wasn't running, and it wasn't + * runnable, which means that it will never become + * running in the future either. We're all done! + */ + break; } -#endif - raw_spin_lock(&rq->lock); - ttwu_do_activate(rq, p, 0); - raw_spin_unlock(&rq->lock); + return ncsw; } -/** - * try_to_wake_up - wake up a thread - * @p: the thread to be awakened - * @state: the mask of task states that can be woken - * @wake_flags: wake modifier flags (WF_*) +/*** + * kick_process - kick a running thread to enter/exit the kernel + * @p: the to-be-kicked thread * - * Put it on the run-queue if it's not already there. The "current" - * thread is always on the run-queue (except when the actual - * re-schedule is in progress), and as such you're allowed to do - * the simpler "current->state = TASK_RUNNING" to mark yourself - * runnable without the overhead of this. + * Cause a process which is running on another CPU to enter + * kernel-mode, without any delay. (to get signals handled.) * - * Return: %true if @p was woken up, %false if it was already running. - * or @state didn't match @p's state. + * NOTE: this function doesn't have to take the runqueue lock, + * because all it wants to ensure is that the remote task enters + * the kernel. If the IPI races and the task has been migrated + * to another CPU then no harm is done and the purpose has been + * achieved as well. */ -static int -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +void kick_process(struct task_struct *p) { - unsigned long flags; - int cpu, success = 0; - - /* - * If we are going to wake up a thread waiting for CONDITION we - * need to ensure that CONDITION=1 done by the caller can not be - * reordered with p->state check below. This pairs with mb() in - * set_current_state() the waiting thread does. 
- */ - smp_mb__before_spinlock(); - raw_spin_lock_irqsave(&p->pi_lock, flags); - if (!(p->state & state)) { - /* - * The task might be running due to a spinlock sleeper - * wakeup. Check the saved state and set it to running - * if the wakeup condition is true. - */ - if (!(wake_flags & WF_LOCK_SLEEPER)) { - if (p->saved_state & state) { - p->saved_state = TASK_RUNNING; - success = 1; - } - } - goto out; - } - - /* - * If this is a regular wakeup, then we can unconditionally - * clear the saved state of a "lock sleeper". - */ - if (!(wake_flags & WF_LOCK_SLEEPER)) - p->saved_state = TASK_RUNNING; + int cpu; - success = 1; /* we're going to change ->state */ + preempt_disable(); cpu = task_cpu(p); + if ((cpu != smp_processor_id()) && task_curr(p)) + smp_send_reschedule(cpu); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(kick_process); - if (p->on_rq && ttwu_remote(p, wake_flags)) - goto stat; +/* + * ->cpus_allowed is protected by both rq->lock and p->pi_lock + */ +static int select_fallback_rq(int cpu, struct task_struct *p) +{ + int nid = cpu_to_node(cpu); + const struct cpumask *nodemask = NULL; + enum { cpuset, possible, fail } state = cpuset; + int dest_cpu; -#ifdef CONFIG_SMP - /* - * If the owning (remote) cpu is still in the middle of schedule() with - * this task as prev, wait until its done referencing the task. - */ - while (p->on_cpu) - cpu_relax(); /* - * Pairs with the smp_wmb() in finish_lock_switch(). + * If the node that the cpu is on has been offlined, cpu_to_node() + * will return -1. There is no cpu on the node, and we should + * select the cpu on the other node. */ - smp_rmb(); + if (nid != -1) { + nodemask = cpumask_of_node(nid); - p->sched_contributes_to_load = !!task_contributes_to_load(p); - p->state = TASK_WAKING; + /* Look for allowed, online CPU in same node. */ + for_each_cpu(dest_cpu, nodemask) { + if (!cpu_online(dest_cpu)) + continue; + if (!cpu_active(dest_cpu)) + continue; + if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + return dest_cpu; + } + } - if (p->sched_class->task_waking) - p->sched_class->task_waking(p); + for (;;) { + /* Any allowed, online CPU? */ + for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { + if (!cpu_online(dest_cpu)) + continue; + if (!cpu_active(dest_cpu)) + continue; + goto out; + } - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); - if (task_cpu(p) != cpu) { - wake_flags |= WF_MIGRATED; - set_task_cpu(p, cpu); + /* No more Mr. Nice Guy. */ + switch (state) { + case cpuset: + if (IS_ENABLED(CONFIG_CPUSETS)) { + cpuset_cpus_allowed_fallback(p); + state = possible; + break; + } + /* fall-through */ + case possible: + do_set_cpus_allowed(p, cpu_possible_mask); + state = fail; + break; + + case fail: + BUG(); + break; + } } -#endif /* CONFIG_SMP */ - ttwu_queue(p, cpu); -stat: - ttwu_stat(p, cpu, wake_flags); out: - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + if (state != cpuset) { + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk_deferred("process %d (%s) no longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); + } + } - return success; + return dest_cpu; } -/** - * wake_up_process - Wake up a specific process - * @p: The process to be woken up. - * - * Attempt to wake up the nominated process and move it to the set of runnable - * processes. - * - * Return: 1 if the process was woken up, 0 if it was already running. 
- * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. +/* + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. */ -int wake_up_process(struct task_struct *p) +static inline +int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { - WARN_ON(__task_is_stopped_or_traced(p)); - return try_to_wake_up(p, TASK_NORMAL, 0); -} -EXPORT_SYMBOL(wake_up_process); + lockdep_assert_held(&p->pi_lock); -/** - * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock" - * @p: The process to be woken up. - * - * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate - * the nature of the wakeup. - */ -int wake_up_lock_sleeper(struct task_struct *p) -{ - return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER); -} + if (tsk_nr_cpus_allowed(p) > 1) + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + + /* + * In order not to call set_task_cpu() on a blocking task we need + * to rely on ttwu() to place the task on a valid ->cpus_allowed + * cpu. + * + * Since this is common to all placement strategies, this lives here. + * + * [ this allows ->select_task() to simply return task_cpu(p) and + * not worry about this generic constraint ] + */ + if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || + !cpu_online(cpu))) + cpu = select_fallback_rq(task_cpu(p), p); -int wake_up_state(struct task_struct *p, unsigned int state) -{ - return try_to_wake_up(p, state, 0); + return cpu; } -/* - * This function clears the sched_dl_entity static params. - */ -void __dl_clear_params(struct task_struct *p) +static void update_avg(u64 *avg, u64 sample) { - struct sched_dl_entity *dl_se = &p->dl; - - dl_se->dl_runtime = 0; - dl_se->dl_deadline = 0; - dl_se->dl_period = 0; - dl_se->flags = 0; - dl_se->dl_bw = 0; - - dl_se->dl_throttled = 0; - dl_se->dl_new = 1; - dl_se->dl_yielded = 0; + s64 diff = sample - *avg; + *avg += diff >> 3; } -/* - * Perform scheduler related setup for a newly forked process p. - * p is forked by current. 
- * - * __sched_fork() is basic setup used by init_idle() too: - */ -static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +#else + +static inline int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, bool check) { - p->on_rq = 0; + return set_cpus_allowed_ptr(p, new_mask); +} - p->se.on_rq = 0; - p->se.exec_start = 0; - p->se.sum_exec_runtime = 0; - p->se.prev_sum_exec_runtime = 0; - p->se.nr_migrations = 0; - p->se.vruntime = 0; -#ifdef CONFIG_SMP - p->se.avg.decay_count = 0; -#endif - INIT_LIST_HEAD(&p->se.group_node); +#endif /* CONFIG_SMP */ +static void +ttwu_stat(struct task_struct *p, int cpu, int wake_flags) +{ #ifdef CONFIG_SCHEDSTATS - memset(&p->se.statistics, 0, sizeof(p->se.statistics)); -#endif - - RB_CLEAR_NODE(&p->dl.rb_node); - init_dl_task_timer(&p->dl); - __dl_clear_params(p); + struct rq *rq = this_rq(); - INIT_LIST_HEAD(&p->rt.run_list); +#ifdef CONFIG_SMP + int this_cpu = smp_processor_id(); -#ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&p->preempt_notifiers); -#endif + if (cpu == this_cpu) { + schedstat_inc(rq, ttwu_local); + schedstat_inc(p, se.statistics.nr_wakeups_local); + } else { + struct sched_domain *sd; -#ifdef CONFIG_NUMA_BALANCING - if (p->mm && atomic_read(&p->mm->mm_users) == 1) { - p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); - p->mm->numa_scan_seq = 0; + schedstat_inc(p, se.statistics.nr_wakeups_remote); + rcu_read_lock(); + for_each_domain(this_cpu, sd) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { + schedstat_inc(sd, ttwu_wake_remote); + break; + } + } + rcu_read_unlock(); } - if (clone_flags & CLONE_VM) - p->numa_preferred_nid = current->numa_preferred_nid; - else - p->numa_preferred_nid = -1; + if (wake_flags & WF_MIGRATED) + schedstat_inc(p, se.statistics.nr_wakeups_migrate); - p->node_stamp = 0ULL; - p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; - p->numa_scan_period = sysctl_numa_balancing_scan_delay; - p->numa_work.next = &p->numa_work; - p->numa_faults = NULL; - p->last_task_numa_placement = 0; - p->last_sum_exec_runtime = 0; +#endif /* CONFIG_SMP */ - p->numa_group = NULL; -#endif /* CONFIG_NUMA_BALANCING */ -} + schedstat_inc(rq, ttwu_count); + schedstat_inc(p, se.statistics.nr_wakeups); -#ifdef CONFIG_NUMA_BALANCING -#ifdef CONFIG_SCHED_DEBUG -void set_numabalancing_state(bool enabled) -{ - if (enabled) - sched_feat_set("NUMA"); - else - sched_feat_set("NO_NUMA"); -} -#else -__read_mostly bool numabalancing_enabled; + if (wake_flags & WF_SYNC) + schedstat_inc(p, se.statistics.nr_wakeups_sync); -void set_numabalancing_state(bool enabled) -{ - numabalancing_enabled = enabled; +#endif /* CONFIG_SCHEDSTATS */ } -#endif /* CONFIG_SCHED_DEBUG */ -#ifdef CONFIG_PROC_SYSCTL -int sysctl_numa_balancing(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { - struct ctl_table t; - int err; - int state = numabalancing_enabled; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - t = *table; - t.data = &state; - err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); - if (err < 0) - return err; - if (write) - set_numabalancing_state(state); - return err; + activate_task(rq, p, en_flags); + p->on_rq = TASK_ON_RQ_QUEUED; } -#endif -#endif /* - * fork()/clone()-time setup: + * Mark the task runnable and perform wakeup-preemption. 
*/ -int sched_fork(unsigned long clone_flags, struct task_struct *p) +static void +ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { - unsigned long flags; - int cpu = get_cpu(); - - __sched_fork(clone_flags, p); - /* - * We mark the process as running here. This guarantees that - * nobody will actually run it, and a signal or other external - * event cannot wake it up and insert it on the runqueue either. - */ + check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; + trace_sched_wakeup(p); - /* - * Make sure we do not leak PI boosting priority to the child. - */ - p->prio = current->normal_prio; - - /* - * Revert to default priority/policy on fork if requested. - */ - if (unlikely(p->sched_reset_on_fork)) { - if (task_has_dl_policy(p) || task_has_rt_policy(p)) { - p->policy = SCHED_NORMAL; - p->static_prio = NICE_TO_PRIO(0); - p->rt_priority = 0; - } else if (PRIO_TO_NICE(p->static_prio) < 0) - p->static_prio = NICE_TO_PRIO(0); - - p->prio = p->normal_prio = __normal_prio(p); - set_load_weight(p); - +#ifdef CONFIG_SMP + if (p->sched_class->task_woken) { /* - * We don't need the reset flag anymore after the fork. It has - * fulfilled its duty: + * Our task @p is fully woken up and running; so its safe to + * drop the rq->lock, hereafter rq is only used for statistics. */ - p->sched_reset_on_fork = 0; + lockdep_unpin_lock(&rq->lock); + p->sched_class->task_woken(rq, p); + lockdep_pin_lock(&rq->lock); } - if (dl_prio(p->prio)) { - put_cpu(); - return -EAGAIN; - } else if (rt_prio(p->prio)) { - p->sched_class = &rt_sched_class; - } else { - p->sched_class = &fair_sched_class; - } + if (rq->idle_stamp) { + u64 delta = rq_clock(rq) - rq->idle_stamp; + u64 max = 2*rq->max_idle_balance_cost; - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); + update_avg(&rq->avg_idle, delta); - /* - * The child is not yet in the pid-hash so no cgroup attach races, - * and the cgroup is pinned to this child due to cgroup_fork() - * is ran before sched_fork(). - * - * Silence PROVE_RCU. - */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - set_task_cpu(p, cpu); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + if (rq->avg_idle > max) + rq->avg_idle = max; -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) - if (likely(sched_info_on())) - memset(&p->sched_info, 0, sizeof(p->sched_info)); -#endif -#if defined(CONFIG_SMP) - p->on_cpu = 0; -#endif - init_task_preempt_count(p); -#ifdef CONFIG_HAVE_PREEMPT_LAZY - task_thread_info(p)->preempt_lazy_count = 0; + rq->idle_stamp = 0; + } #endif +} + +static void +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +{ + lockdep_assert_held(&rq->lock); + #ifdef CONFIG_SMP - plist_node_init(&p->pushable_tasks, MAX_PRIO); - RB_CLEAR_NODE(&p->pushable_dl_tasks); + if (p->sched_contributes_to_load) + rq->nr_uninterruptible--; #endif - put_cpu(); - return 0; + ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); + ttwu_do_wakeup(rq, p, wake_flags); +} + +/* + * Called in case the task @p isn't fully descheduled from its runqueue, + * in this case we must do a remote wakeup. Its a 'light' wakeup though, + * since all we need to do is flip p->state to TASK_RUNNING, since + * the task is still ->on_rq. 
+ */ +static int ttwu_remote(struct task_struct *p, int wake_flags) +{ + struct rq *rq; + int ret = 0; + + rq = __task_rq_lock(p); + if (task_on_rq_queued(p)) { + /* check_preempt_curr() may use rq clock */ + update_rq_clock(rq); + ttwu_do_wakeup(rq, p, wake_flags); + ret = 1; + } + __task_rq_unlock(rq); + + return ret; +} + +#ifdef CONFIG_SMP +void sched_ttwu_pending(void) +{ + struct rq *rq = this_rq(); + struct llist_node *llist = llist_del_all(&rq->wake_list); + struct task_struct *p; + unsigned long flags; + + if (!llist) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); + lockdep_pin_lock(&rq->lock); + + while (llist) { + p = llist_entry(llist, struct task_struct, wake_entry); + llist = llist_next(llist); + ttwu_do_activate(rq, p, 0); + } + + lockdep_unpin_lock(&rq->lock); + raw_spin_unlock_irqrestore(&rq->lock, flags); } -unsigned long to_ratio(u64 period, u64 runtime) +void scheduler_ipi(void) { - if (runtime == RUNTIME_INF) - return 1ULL << 20; + /* + * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting + * TIF_NEED_RESCHED remotely (for the first time) will also send + * this IPI. + */ + preempt_fold_need_resched(); + + if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) + return; /* - * Doing this here saves a lot of checks in all - * the calling paths, and returning zero seems - * safe for them anyway. + * Not all reschedule IPI handlers call irq_enter/irq_exit, since + * traditionally all their work was done from the interrupt return + * path. Now that we actually do some work, we need to make sure + * we do call them. + * + * Some archs already do call them, luckily irq_enter/exit nest + * properly. + * + * Arguably we should visit all archs and update all handlers, + * however a fair share of IPIs are still resched only so this would + * somewhat pessimize the simple resched case. */ - if (period == 0) - return 0; + irq_enter(); + sched_ttwu_pending(); - return div64_u64(runtime << 20, period); + /* + * Check if someone kicked us for doing the nohz idle load balance. 
+ */ + if (unlikely(got_nohz_idle_kick())) { + this_rq()->idle_balance = 1; + raise_softirq_irqoff(SCHED_SOFTIRQ); + } + irq_exit(); } -#ifdef CONFIG_SMP -inline struct dl_bw *dl_bw_of(int i) +static void ttwu_queue_remote(struct task_struct *p, int cpu) { - rcu_lockdep_assert(rcu_read_lock_sched_held(), - "sched RCU must be held"); - return &cpu_rq(i)->rd->dl_bw; + struct rq *rq = cpu_rq(cpu); + + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { + if (!set_nr_if_polling(rq->idle)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); + } } -static inline int dl_bw_cpus(int i) +void wake_up_if_idle(int cpu) { - struct root_domain *rd = cpu_rq(i)->rd; - int cpus = 0; + struct rq *rq = cpu_rq(cpu); + unsigned long flags; - rcu_lockdep_assert(rcu_read_lock_sched_held(), - "sched RCU must be held"); - for_each_cpu_and(i, rd->span, cpu_active_mask) - cpus++; + rcu_read_lock(); - return cpus; + if (!is_idle_task(rcu_dereference(rq->curr))) + goto out; + + if (set_nr_if_polling(rq->idle)) { + trace_sched_wake_idle_without_ipi(cpu); + } else { + raw_spin_lock_irqsave(&rq->lock, flags); + if (is_idle_task(rq->curr)) + smp_send_reschedule(cpu); + /* Else cpu is not in idle, do nothing here */ + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + +out: + rcu_read_unlock(); } -#else -inline struct dl_bw *dl_bw_of(int i) + +bool cpus_share_cache(int this_cpu, int that_cpu) { - return &cpu_rq(i)->dl.dl_bw; + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } +#endif /* CONFIG_SMP */ -static inline int dl_bw_cpus(int i) +static void ttwu_queue(struct task_struct *p, int cpu) { - return 1; -} + struct rq *rq = cpu_rq(cpu); + +#if defined(CONFIG_SMP) + if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { + sched_clock_cpu(cpu); /* sync clocks x-cpu */ + ttwu_queue_remote(p, cpu); + return; + } #endif -/* - * We must be sure that accepting a new task (or allowing changing the - * parameters of an existing one) is consistent with the bandwidth - * constraints. If yes, this function also accordingly updates the currently - * allocated bandwidth to reflect the new situation. + raw_spin_lock(&rq->lock); + lockdep_pin_lock(&rq->lock); + ttwu_do_activate(rq, p, 0); + lockdep_unpin_lock(&rq->lock); + raw_spin_unlock(&rq->lock); +} + +/** + * try_to_wake_up - wake up a thread + * @p: the thread to be awakened + * @state: the mask of task states that can be woken + * @wake_flags: wake modifier flags (WF_*) * - * This function is called while holding p's rq->lock. + * Put it on the run-queue if it's not already there. The "current" + * thread is always on the run-queue (except when the actual + * re-schedule is in progress), and as such you're allowed to do + * the simpler "current->state = TASK_RUNNING" to mark yourself + * runnable without the overhead of this. * - * XXX we should delay bw change until the task's 0-lag point, see - * __setparam_dl(). + * Return: %true if @p was woken up, %false if it was already running. + * or @state didn't match @p's state. */ -static int dl_overflow(struct task_struct *p, int policy, - const struct sched_attr *attr) +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { - - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - u64 period = attr->sched_period ?: attr->sched_deadline; - u64 runtime = attr->sched_runtime; - u64 new_bw = dl_policy(policy) ? 
to_ratio(period, runtime) : 0; - int cpus, err = -1; - - if (new_bw == p->dl.dl_bw) - return 0; + unsigned long flags; + int cpu, success = 0; /* - * Either if a task, enters, leave, or stays -deadline but changes - * its parameters, we may need to update accordingly the total - * allocated bandwidth of the container. + * If we are going to wake up a thread waiting for CONDITION we + * need to ensure that CONDITION=1 done by the caller can not be + * reordered with p->state check below. This pairs with mb() in + * set_current_state() the waiting thread does. */ - raw_spin_lock(&dl_b->lock); - cpus = dl_bw_cpus(task_cpu(p)); - if (dl_policy(policy) && !task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, 0, new_bw)) { - __dl_add(dl_b, new_bw); - err = 0; - } else if (dl_policy(policy) && task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { - __dl_clear(dl_b, p->dl.dl_bw); - __dl_add(dl_b, new_bw); - err = 0; - } else if (!dl_policy(policy) && task_has_dl_policy(p)) { - __dl_clear(dl_b, p->dl.dl_bw); - err = 0; + smp_mb__before_spinlock(); + raw_spin_lock_irqsave(&p->pi_lock, flags); + if (!(p->state & state)) { + /* + * The task might be running due to a spinlock sleeper + * wakeup. Check the saved state and set it to running + * if the wakeup condition is true. + */ + if (!(wake_flags & WF_LOCK_SLEEPER)) { + if (p->saved_state & state) { + p->saved_state = TASK_RUNNING; + success = 1; + } + } + goto out; } - raw_spin_unlock(&dl_b->lock); - return err; -} + /* + * If this is a regular wakeup, then we can unconditionally + * clear the saved state of a "lock sleeper". + */ + if (!(wake_flags & WF_LOCK_SLEEPER)) + p->saved_state = TASK_RUNNING; -extern void init_dl_bw(struct dl_bw *dl_b); + trace_sched_waking(p); -/* - * wake_up_new_task - wake up a newly created task for the first time. - * - * This function will do some initial scheduler statistics housekeeping - * that must be done for every newly created context, then puts the task - * on the runqueue and wakes it. - */ -void wake_up_new_task(struct task_struct *p) -{ - unsigned long flags; - struct rq *rq; + success = 1; /* we're going to change ->state */ + cpu = task_cpu(p); + + if (p->on_rq && ttwu_remote(p, wake_flags)) + goto stat; - raw_spin_lock_irqsave(&p->pi_lock, flags); #ifdef CONFIG_SMP /* - * Fork balancing, do it here and not earlier because: - * - cpus_allowed can change in the fork path - * - any previously selected cpu might disappear through hotplug + * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be + * possible to, falsely, observe p->on_cpu == 0. + * + * One must be running (->on_cpu == 1) in order to remove oneself + * from the runqueue. + * + * [S] ->on_cpu = 1; [L] ->on_rq + * UNLOCK rq->lock + * RMB + * LOCK rq->lock + * [S] ->on_rq = 0; [L] ->on_cpu + * + * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock + * from the consecutive calls to schedule(); the first switching to our + * task, the second putting it to sleep. */ - set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); -#endif + smp_rmb(); + + /* + * If the owning (remote) cpu is still in the middle of schedule() with + * this task as prev, wait until its done referencing the task. + */ + while (p->on_cpu) + cpu_relax(); + /* + * Combined with the control dependency above, we have an effective + * smp_load_acquire() without the need for full barriers. + * + * Pairs with the smp_store_release() in finish_lock_switch(). 
+ * + * This ensures that tasks getting woken will be fully ordered against + * their previous state and preserve Program Order. + */ + smp_rmb(); + + p->sched_contributes_to_load = !!task_contributes_to_load(p); + p->state = TASK_WAKING; + + if (p->sched_class->task_waking) + p->sched_class->task_waking(p); + + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); + if (task_cpu(p) != cpu) { + wake_flags |= WF_MIGRATED; + set_task_cpu(p, cpu); + } +#endif /* CONFIG_SMP */ + + ttwu_queue(p, cpu); +stat: + ttwu_stat(p, cpu, wake_flags); +out: + raw_spin_unlock_irqrestore(&p->pi_lock, flags); - /* Initialize new task's runnable average */ - init_task_runnable_average(p); - rq = __task_rq_lock(p); - activate_task(rq, p, 0); - p->on_rq = TASK_ON_RQ_QUEUED; - trace_sched_wakeup_new(p, true); - check_preempt_curr(rq, p, WF_FORK); -#ifdef CONFIG_SMP - if (p->sched_class->task_woken) - p->sched_class->task_woken(rq, p); -#endif - task_rq_unlock(rq, p, &flags); + return success; } -#ifdef CONFIG_PREEMPT_NOTIFIERS - /** - * preempt_notifier_register - tell me when current is being preempted & rescheduled - * @notifier: notifier struct to register + * wake_up_process - Wake up a specific process + * @p: The process to be woken up. + * + * Attempt to wake up the nominated process and move it to the set of runnable + * processes. + * + * Return: 1 if the process was woken up, 0 if it was already running. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ -void preempt_notifier_register(struct preempt_notifier *notifier) +int wake_up_process(struct task_struct *p) { - hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); + return try_to_wake_up(p, TASK_NORMAL, 0); } -EXPORT_SYMBOL_GPL(preempt_notifier_register); +EXPORT_SYMBOL(wake_up_process); /** - * preempt_notifier_unregister - no longer interested in preemption notifications - * @notifier: notifier struct to unregister + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock" + * @p: The process to be woken up. * - * This is safe to call from within a preemption notifier. + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate + * the nature of the wakeup. */ -void preempt_notifier_unregister(struct preempt_notifier *notifier) -{ - hlist_del(¬ifier->link); -} -EXPORT_SYMBOL_GPL(preempt_notifier_unregister); - -static void fire_sched_in_preempt_notifiers(struct task_struct *curr) -{ - struct preempt_notifier *notifier; - - hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) - notifier->ops->sched_in(notifier, raw_smp_processor_id()); -} - -static void -fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) +int wake_up_lock_sleeper(struct task_struct *p) { - struct preempt_notifier *notifier; - - hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) - notifier->ops->sched_out(notifier, next); + return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER); } -#else /* !CONFIG_PREEMPT_NOTIFIERS */ - -static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +int wake_up_state(struct task_struct *p, unsigned int state) { + return try_to_wake_up(p, state, 0); } -static void -fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) +/* + * This function clears the sched_dl_entity static params. 
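try_to_wake_up() above matches @state as a bitmask against p->state, which is why wake_up_process() (TASK_NORMAL) wakes both interruptible and uninterruptible sleepers while leaving an already-running task untouched. A small sketch of that check, with state values assumed to mirror the usual kernel definitions:

#include <stdio.h>

/* Conventional task-state bits (assumed values mirroring the kernel headers). */
#define TASK_RUNNING            0x0000
#define TASK_INTERRUPTIBLE      0x0001
#define TASK_UNINTERRUPTIBLE    0x0002
#define TASK_NORMAL             (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

/* Core of the wake-up filter: wake only a task whose state is in the mask. */
static int would_wake(unsigned int task_state, unsigned int wake_mask)
{
        return (task_state & wake_mask) != 0;
}

int main(void)
{
        printf("INTERRUPTIBLE woken by TASK_NORMAL:   %d\n",
               would_wake(TASK_INTERRUPTIBLE, TASK_NORMAL));
        printf("UNINTERRUPTIBLE woken by TASK_NORMAL: %d\n",
               would_wake(TASK_UNINTERRUPTIBLE, TASK_NORMAL));
        printf("RUNNING woken by TASK_NORMAL:         %d\n",
               would_wake(TASK_RUNNING, TASK_NORMAL));
        return 0;
}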
+ */ +void __dl_clear_params(struct task_struct *p) { -} + struct sched_dl_entity *dl_se = &p->dl; -#endif /* CONFIG_PREEMPT_NOTIFIERS */ + dl_se->dl_runtime = 0; + dl_se->dl_deadline = 0; + dl_se->dl_period = 0; + dl_se->flags = 0; + dl_se->dl_bw = 0; -/** - * prepare_task_switch - prepare to switch tasks - * @rq: the runqueue preparing to switch - * @prev: the current task that is being switched out - * @next: the task we are going to switch to. - * - * This is called with the rq lock held and interrupts off. It must - * be paired with a subsequent finish_task_switch after the context - * switch. - * - * prepare_task_switch sets up locking and calls architecture specific - * hooks. - */ -static inline void -prepare_task_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) -{ - trace_sched_switch(prev, next); - sched_info_switch(rq, prev, next); - perf_event_task_sched_out(prev, next); - fire_sched_out_preempt_notifiers(prev, next); - prepare_lock_switch(rq, next); - prepare_arch_switch(next); + dl_se->dl_throttled = 0; + dl_se->dl_new = 1; + dl_se->dl_yielded = 0; } -/** - * finish_task_switch - clean up after a task-switch - * @prev: the thread we just switched away from. - * - * finish_task_switch must be called after the context switch, paired - * with a prepare_task_switch call before the context switch. - * finish_task_switch will reconcile locking set up by prepare_task_switch, - * and do any other architecture-specific cleanup actions. - * - * Note that we may have delayed dropping an mm in context_switch(). If - * so, we finish that here outside of the runqueue lock. (Doing it - * with the lock held can cause deadlocks; see schedule() for - * details.) +/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. * - * The context switch have flipped the stack from under us and restored the - * local variables which were saved when this task called schedule() in the - * past. prev == current is still correct but we need to recalculate this_rq - * because prev may have moved to another CPU. + * __sched_fork() is basic setup used by init_idle() too: */ -static struct rq *finish_task_switch(struct task_struct *prev) - __releases(rq->lock) +static void __sched_fork(unsigned long clone_flags, struct task_struct *p) { - struct rq *rq = this_rq(); - struct mm_struct *mm = rq->prev_mm; - long prev_state; + p->on_rq = 0; - rq->prev_mm = NULL; + p->se.on_rq = 0; + p->se.exec_start = 0; + p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; + p->se.vruntime = 0; + INIT_LIST_HEAD(&p->se.group_node); - /* - * A task struct has one reference for the use as "current". - * If a task dies, then it sets TASK_DEAD in tsk->state and calls - * schedule one last time. The schedule call will never return, and - * the scheduled task must drop that reference. - * The test for TASK_DEAD must occur while the runqueue locks are - * still held, otherwise prev could be scheduled on another cpu, die - * there before we look at prev->state, and then the reference would - * be dropped twice. 
- * Manfred Spraul - */ - prev_state = prev->state; - vtime_task_switch(prev); - finish_arch_switch(prev); - perf_event_task_sched_in(prev, current); - finish_lock_switch(rq, prev); - finish_arch_post_lock_switch(); +#ifdef CONFIG_SCHEDSTATS + memset(&p->se.statistics, 0, sizeof(p->se.statistics)); +#endif - fire_sched_in_preempt_notifiers(current); - /* - * We use mmdrop_delayed() here so we don't have to do the - * full __mmdrop() when we are the last user. - */ - if (mm) - mmdrop_delayed(mm); - if (unlikely(prev_state == TASK_DEAD)) { - if (prev->sched_class->task_dead) - prev->sched_class->task_dead(prev); + RB_CLEAR_NODE(&p->dl.rb_node); + init_dl_task_timer(&p->dl); + __dl_clear_params(p); - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. - */ - kprobe_flush_task(prev); - put_task_struct(prev); - } + INIT_LIST_HEAD(&p->rt.run_list); - tick_nohz_task_switch(current); - return rq; -} +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&p->preempt_notifiers); +#endif -#ifdef CONFIG_SMP +#ifdef CONFIG_NUMA_BALANCING + if (p->mm && atomic_read(&p->mm->mm_users) == 1) { + p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + p->mm->numa_scan_seq = 0; + } -/* rq->lock is NOT held, but preemption is disabled */ -static inline void post_schedule(struct rq *rq) -{ - if (rq->post_schedule) { - unsigned long flags; + if (clone_flags & CLONE_VM) + p->numa_preferred_nid = current->numa_preferred_nid; + else + p->numa_preferred_nid = -1; - raw_spin_lock_irqsave(&rq->lock, flags); - if (rq->curr->sched_class->post_schedule) - rq->curr->sched_class->post_schedule(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); + p->node_stamp = 0ULL; + p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; + p->numa_scan_period = sysctl_numa_balancing_scan_delay; + p->numa_work.next = &p->numa_work; + p->numa_faults = NULL; + p->last_task_numa_placement = 0; + p->last_sum_exec_runtime = 0; - rq->post_schedule = 0; - } + p->numa_group = NULL; +#endif /* CONFIG_NUMA_BALANCING */ } -#else +DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); + +#ifdef CONFIG_NUMA_BALANCING -static inline void post_schedule(struct rq *rq) +void set_numabalancing_state(bool enabled) { + if (enabled) + static_branch_enable(&sched_numa_balancing); + else + static_branch_disable(&sched_numa_balancing); } -#endif - -/** - * schedule_tail - first thing a freshly forked thread must call. - * @prev: the thread we just switched away from. - */ -asmlinkage __visible void schedule_tail(struct task_struct *prev) - __releases(rq->lock) +#ifdef CONFIG_PROC_SYSCTL +int sysctl_numa_balancing(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { - struct rq *rq; + struct ctl_table t; + int err; + int state = static_branch_likely(&sched_numa_balancing); - /* finish_task_switch() drops rq->lock and enables preemtion */ - preempt_disable(); - rq = finish_task_switch(prev); - post_schedule(rq); - preempt_enable(); + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; - if (current->set_child_tid) - put_user(task_pid_vnr(current), current->set_child_tid); + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + set_numabalancing_state(state); + return err; } +#endif +#endif /* - * context_switch - switch to the new MM and the new thread's register state. 
+ * fork()/clone()-time setup: */ -static inline struct rq * -context_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next) +int sched_fork(unsigned long clone_flags, struct task_struct *p) { - struct mm_struct *mm, *oldmm; + unsigned long flags; + int cpu = get_cpu(); + + __sched_fork(clone_flags, p); + /* + * We mark the process as running here. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_RUNNING; - prepare_task_switch(rq, prev, next); + /* + * Make sure we do not leak PI boosting priority to the child. + */ + p->prio = current->normal_prio; - mm = next->mm; - oldmm = prev->active_mm; /* - * For paravirt, this is coupled with an exit in switch_to to - * combine the page table reload and the switch backend into - * one hypercall. + * Revert to default priority/policy on fork if requested. */ - arch_start_context_switch(prev); + if (unlikely(p->sched_reset_on_fork)) { + if (task_has_dl_policy(p) || task_has_rt_policy(p)) { + p->policy = SCHED_NORMAL; + p->static_prio = NICE_TO_PRIO(0); + p->rt_priority = 0; + } else if (PRIO_TO_NICE(p->static_prio) < 0) + p->static_prio = NICE_TO_PRIO(0); - if (!mm) { - next->active_mm = oldmm; - atomic_inc(&oldmm->mm_count); - enter_lazy_tlb(oldmm, next); - } else - switch_mm(oldmm, mm, next); + p->prio = p->normal_prio = __normal_prio(p); + set_load_weight(p); - if (!prev->mm) { - prev->active_mm = NULL; - rq->prev_mm = oldmm; + /* + * We don't need the reset flag anymore after the fork. It has + * fulfilled its duty: + */ + p->sched_reset_on_fork = 0; + } + + if (dl_prio(p->prio)) { + put_cpu(); + return -EAGAIN; + } else if (rt_prio(p->prio)) { + p->sched_class = &rt_sched_class; + } else { + p->sched_class = &fair_sched_class; } + + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); + /* - * Since the runqueue lock will be released by the next - * task (which is an invalid locking op but in the case - * of the scheduler it's an obvious special-case), so we - * do an early lockdep release here: + * The child is not yet in the pid-hash so no cgroup attach races, + * and the cgroup is pinned to this child due to cgroup_fork() + * is ran before sched_fork(). + * + * Silence PROVE_RCU. */ - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + raw_spin_lock_irqsave(&p->pi_lock, flags); + set_task_cpu(p, cpu); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); - context_tracking_task_switch(prev, next); - /* Here we just switch the register state and the stack. */ - switch_to(prev, next, prev); - barrier(); +#ifdef CONFIG_SCHED_INFO + if (likely(sched_info_on())) + memset(&p->sched_info, 0, sizeof(p->sched_info)); +#endif +#if defined(CONFIG_SMP) + p->on_cpu = 0; +#endif + init_task_preempt_count(p); +#ifdef CONFIG_HAVE_PREEMPT_LAZY + task_thread_info(p)->preempt_lazy_count = 0; +#endif +#ifdef CONFIG_SMP + plist_node_init(&p->pushable_tasks, MAX_PRIO); + RB_CLEAR_NODE(&p->pushable_dl_tasks); +#endif - return finish_task_switch(prev); + put_cpu(); + return 0; } -/* - * nr_running and nr_context_switches: - * - * externally visible scheduler statistics: current number of runnable - * threads, total number of context switches performed since bootup. 
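sched_fork() above undoes priority boosting and, with sched_reset_on_fork, pushes an elevated nice back to 0 via NICE_TO_PRIO()/PRIO_TO_NICE(). A standalone sketch of that mapping, using the conventional priority layout (assumed: 0-99 for RT, 100-139 for normal tasks):

#include <stdio.h>

/* Usual kernel priority layout (assumed): nice 0 lands on static_prio 120. */
#define MAX_RT_PRIO             100
#define NICE_WIDTH              40
#define DEFAULT_PRIO            (MAX_RT_PRIO + NICE_WIDTH / 2)
#define NICE_TO_PRIO(nice)      ((nice) + DEFAULT_PRIO)
#define PRIO_TO_NICE(prio)      ((prio) - DEFAULT_PRIO)

int main(void)
{
        int nice;

        for (nice = -20; nice <= 19; nice += 13)
                printf("nice %3d -> static_prio %d\n", nice, NICE_TO_PRIO(nice));

        /* The reset-on-fork path: a boosted (negative) nice goes back to 0. */
        int static_prio = NICE_TO_PRIO(-5);
        if (PRIO_TO_NICE(static_prio) < 0)
                static_prio = NICE_TO_PRIO(0);
        printf("after reset: static_prio %d (nice %d)\n",
               static_prio, PRIO_TO_NICE(static_prio));
        return 0;
}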
- */ -unsigned long nr_running(void) +unsigned long to_ratio(u64 period, u64 runtime) { - unsigned long i, sum = 0; - - for_each_online_cpu(i) - sum += cpu_rq(i)->nr_running; + if (runtime == RUNTIME_INF) + return 1ULL << 20; - return sum; -} + /* + * Doing this here saves a lot of checks in all + * the calling paths, and returning zero seems + * safe for them anyway. + */ + if (period == 0) + return 0; -/* - * Check if only the current task is running on the cpu. - */ -bool single_task_running(void) -{ - if (cpu_rq(smp_processor_id())->nr_running == 1) - return true; - else - return false; + return div64_u64(runtime << 20, period); } -EXPORT_SYMBOL(single_task_running); -unsigned long long nr_context_switches(void) +#ifdef CONFIG_SMP +inline struct dl_bw *dl_bw_of(int i) { - int i; - unsigned long long sum = 0; - - for_each_possible_cpu(i) - sum += cpu_rq(i)->nr_switches; - - return sum; + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + return &cpu_rq(i)->rd->dl_bw; } -unsigned long nr_iowait(void) +static inline int dl_bw_cpus(int i) { - unsigned long i, sum = 0; + struct root_domain *rd = cpu_rq(i)->rd; + int cpus = 0; - for_each_possible_cpu(i) - sum += atomic_read(&cpu_rq(i)->nr_iowait); + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + for_each_cpu_and(i, rd->span, cpu_active_mask) + cpus++; - return sum; + return cpus; } - -unsigned long nr_iowait_cpu(int cpu) +#else +inline struct dl_bw *dl_bw_of(int i) { - struct rq *this = cpu_rq(cpu); - return atomic_read(&this->nr_iowait); + return &cpu_rq(i)->dl.dl_bw; } -void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) +static inline int dl_bw_cpus(int i) { - struct rq *this = this_rq(); - *nr_waiters = atomic_read(&this->nr_iowait); - *load = this->cpu_load[0]; + return 1; } - -#ifdef CONFIG_SMP +#endif /* - * sched_exec - execve() is a valuable balancing opportunity, because at - * this point the task has the smallest effective memory and cache footprint. + * We must be sure that accepting a new task (or allowing changing the + * parameters of an existing one) is consistent with the bandwidth + * constraints. If yes, this function also accordingly updates the currently + * allocated bandwidth to reflect the new situation. + * + * This function is called while holding p's rq->lock. + * + * XXX we should delay bw change until the task's 0-lag point, see + * __setparam_dl(). */ -void sched_exec(void) +static int dl_overflow(struct task_struct *p, int policy, + const struct sched_attr *attr) { - struct task_struct *p = current; - unsigned long flags; - int dest_cpu; - raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); - if (dest_cpu == smp_processor_id()) - goto unlock; + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + u64 period = attr->sched_period ?: attr->sched_deadline; + u64 runtime = attr->sched_runtime; + u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; + int cpus, err = -1; - if (likely(cpu_active(dest_cpu))) { - struct migration_arg arg = { p, dest_cpu }; + if (new_bw == p->dl.dl_bw) + return 0; - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); - return; + /* + * Either if a task, enters, leave, or stays -deadline but changes + * its parameters, we may need to update accordingly the total + * allocated bandwidth of the container. 
+ */ + raw_spin_lock(&dl_b->lock); + cpus = dl_bw_cpus(task_cpu(p)); + if (dl_policy(policy) && !task_has_dl_policy(p) && + !__dl_overflow(dl_b, cpus, 0, new_bw)) { + __dl_add(dl_b, new_bw); + err = 0; + } else if (dl_policy(policy) && task_has_dl_policy(p) && + !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + __dl_clear(dl_b, p->dl.dl_bw); + __dl_add(dl_b, new_bw); + err = 0; + } else if (!dl_policy(policy) && task_has_dl_policy(p)) { + __dl_clear(dl_b, p->dl.dl_bw); + err = 0; } -unlock: - raw_spin_unlock_irqrestore(&p->pi_lock, flags); -} - -#endif + raw_spin_unlock(&dl_b->lock); -DEFINE_PER_CPU(struct kernel_stat, kstat); -DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); + return err; +} -EXPORT_PER_CPU_SYMBOL(kstat); -EXPORT_PER_CPU_SYMBOL(kernel_cpustat); +extern void init_dl_bw(struct dl_bw *dl_b); /* - * Return accounted runtime for the task. - * In case the task is currently running, return the runtime plus current's - * pending runtime that have not been accounted yet. + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. */ -unsigned long long task_sched_runtime(struct task_struct *p) +void wake_up_new_task(struct task_struct *p) { unsigned long flags; struct rq *rq; - u64 ns; -#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) + raw_spin_lock_irqsave(&p->pi_lock, flags); + /* Initialize new task's runnable average */ + init_entity_runnable_average(&p->se); +#ifdef CONFIG_SMP /* - * 64-bit doesn't need locks to atomically read a 64bit value. - * So we have a optimization chance when the task's delta_exec is 0. - * Reading ->on_cpu is racy, but this is ok. - * - * If we race with it leaving cpu, we'll take a lock. So we're correct. - * If we race with it entering cpu, unaccounted time is 0. This is - * indistinguishable from the read occurring a few cycles earlier. - * If we see ->on_cpu without ->on_rq, the task is leaving, and has - * been accounted, so we're correct here as well. + * Fork balancing, do it here and not earlier because: + * - cpus_allowed can change in the fork path + * - any previously selected cpu might disappear through hotplug */ - if (!p->on_cpu || !task_on_rq_queued(p)) - return p->se.sum_exec_runtime; + set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif - - rq = task_rq_lock(p, &flags); - /* - * Must be ->curr _and_ ->on_rq. If dequeued, we would - * project cycles that may never be accounted to this - * thread, breaking clock_gettime(). - */ - if (task_current(rq, p) && task_on_rq_queued(p)) { - update_rq_clock(rq); - p->sched_class->update_curr(rq); + + rq = __task_rq_lock(p); + activate_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; + trace_sched_wakeup_new(p); + check_preempt_curr(rq, p, WF_FORK); +#ifdef CONFIG_SMP + if (p->sched_class->task_woken) { + /* + * Nothing relies on rq->lock after this, so its fine to + * drop it. + */ + lockdep_unpin_lock(&rq->lock); + p->sched_class->task_woken(rq, p); + lockdep_pin_lock(&rq->lock); } - ns = p->se.sum_exec_runtime; +#endif task_rq_unlock(rq, p, &flags); - - return ns; } -/* - * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. 
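to_ratio() above encodes runtime/period as a fixed-point fraction with 20 fractional bits, and dl_overflow() sums such fractions against the available cpus. The following sketch reproduces the arithmetic with made-up reservations; the single-inequality admission test is a simplification of __dl_overflow(), not the exact kernel check.

#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT 20     /* to_ratio() uses a 1 << 20 fixed-point scale */

static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
        if (period == 0)
                return 0;
        return (runtime << BW_SHIFT) / period;
}

int main(void)
{
        /* Illustrative reservations: runtime/period in nanoseconds. */
        uint64_t bw1 = to_ratio(100000000ULL, 10000000ULL);     /* 10ms / 100ms */
        uint64_t bw2 = to_ratio( 50000000ULL, 20000000ULL);     /* 20ms /  50ms */
        int cpus = 2;

        printf("bw1 = %llu (~%.2f of a CPU)\n",
               (unsigned long long)bw1, (double)bw1 / (1 << BW_SHIFT));
        printf("bw2 = %llu (~%.2f of a CPU)\n",
               (unsigned long long)bw2, (double)bw2 / (1 << BW_SHIFT));

        /* Simplified admission check: total utilization must fit the cpus. */
        uint64_t total = bw1 + bw2;
        uint64_t budget = (uint64_t)cpus << BW_SHIFT;
        printf("admit: %s\n", total <= budget ? "yes" : "no");
        return 0;
}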
- */ -void scheduler_tick(void) -{ - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - struct task_struct *curr = rq->curr; - - sched_clock_tick(); +#ifdef CONFIG_PREEMPT_NOTIFIERS - raw_spin_lock(&rq->lock); - update_rq_clock(rq); - curr->sched_class->task_tick(rq, curr, 0); - update_cpu_load_active(rq); - raw_spin_unlock(&rq->lock); +static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; - perf_event_task_tick(); +void preempt_notifier_inc(void) +{ + static_key_slow_inc(&preempt_notifier_key); +} +EXPORT_SYMBOL_GPL(preempt_notifier_inc); -#ifdef CONFIG_SMP - rq->idle_balance = idle_cpu(cpu); - trigger_load_balance(rq); -#endif - rq_last_tick_reset(rq); +void preempt_notifier_dec(void) +{ + static_key_slow_dec(&preempt_notifier_key); } +EXPORT_SYMBOL_GPL(preempt_notifier_dec); -#ifdef CONFIG_NO_HZ_FULL /** - * scheduler_tick_max_deferment - * - * Keep at least one tick per second when a single - * active task is running because the scheduler doesn't - * yet completely support full dynticks environment. - * - * This makes sure that uptime, CFS vruntime, load - * balancing, etc... continue to move forward, even - * with a very low granularity. - * - * Return: Maximum deferment in nanoseconds. + * preempt_notifier_register - tell me when current is being preempted & rescheduled + * @notifier: notifier struct to register */ -u64 scheduler_tick_max_deferment(void) +void preempt_notifier_register(struct preempt_notifier *notifier) { - struct rq *rq = this_rq(); - unsigned long next, now = ACCESS_ONCE(jiffies); + if (!static_key_false(&preempt_notifier_key)) + WARN(1, "registering preempt_notifier while notifiers disabled\n"); - next = rq->last_sched_tick + HZ; - - if (time_before_eq(next, now)) - return 0; - - return jiffies_to_nsecs(next - now); + hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); } -#endif +EXPORT_SYMBOL_GPL(preempt_notifier_register); -notrace unsigned long get_parent_ip(unsigned long addr) +/** + * preempt_notifier_unregister - no longer interested in preemption notifications + * @notifier: notifier struct to unregister + * + * This is *not* safe to call from within a preemption notifier. + */ +void preempt_notifier_unregister(struct preempt_notifier *notifier) { - if (in_lock_functions(addr)) { - addr = CALLER_ADDR2; - if (in_lock_functions(addr)) - addr = CALLER_ADDR3; - } - return addr; + hlist_del(¬ifier->link); } +EXPORT_SYMBOL_GPL(preempt_notifier_unregister); -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ - defined(CONFIG_PREEMPT_TRACER)) +static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ + struct preempt_notifier *notifier; -void preempt_count_add(int val) + hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) + notifier->ops->sched_in(notifier, raw_smp_processor_id()); +} + +static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) { -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Underflow? - */ - if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) - return; -#endif - __preempt_count_add(val); -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Spinlock count overflowing soon? 
- */ - DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= - PREEMPT_MASK - 10); -#endif - if (preempt_count() == val) { - unsigned long ip = get_parent_ip(CALLER_ADDR1); -#ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = ip; -#endif - trace_preempt_off(CALLER_ADDR0, ip); - } + if (static_key_false(&preempt_notifier_key)) + __fire_sched_in_preempt_notifiers(curr); } -EXPORT_SYMBOL(preempt_count_add); -NOKPROBE_SYMBOL(preempt_count_add); -void preempt_count_sub(int val) +static void +__fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) { -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Underflow? - */ - if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) - return; - /* - * Is the spinlock portion underflowing? - */ - if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && - !(preempt_count() & PREEMPT_MASK))) - return; -#endif + struct preempt_notifier *notifier; - if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); - __preempt_count_sub(val); + hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) + notifier->ops->sched_out(notifier, next); } -EXPORT_SYMBOL(preempt_count_sub); -NOKPROBE_SYMBOL(preempt_count_sub); -#endif - -/* - * Print scheduling while atomic bug: - */ -static noinline void __schedule_bug(struct task_struct *prev) +static __always_inline void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) { - if (oops_in_progress) - return; + if (static_key_false(&preempt_notifier_key)) + __fire_sched_out_preempt_notifiers(curr, next); +} - printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", - prev->comm, prev->pid, preempt_count()); +#else /* !CONFIG_PREEMPT_NOTIFIERS */ - debug_show_held_locks(prev); - print_modules(); - if (irqs_disabled()) - print_irqtrace_events(prev); -#ifdef CONFIG_DEBUG_PREEMPT - if (in_atomic_preempt_off()) { - pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); - pr_cont("\n"); - } -#endif - dump_stack(); - add_taint(TAINT_WARN, LOCKDEP_STILL_OK); +static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ } -/* - * Various schedule()-time debugging checks and statistics: - */ -static inline void schedule_debug(struct task_struct *prev) +static inline void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) { -#ifdef CONFIG_SCHED_STACK_END_CHECK - BUG_ON(unlikely(task_stack_end_corrupted(prev))); -#endif - /* - * Test if we are atomic. Since do_exit() needs to call into - * schedule() atomically, we ignore that path. Otherwise whine - * if we are scheduling when we should not. - */ - if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) - __schedule_bug(prev); - rcu_sleep_check(); +} - profile_hit(SCHED_PROFILING, __builtin_return_address(0)); +#endif /* CONFIG_PREEMPT_NOTIFIERS */ - schedstat_inc(this_rq(), sched_count); +/** + * prepare_task_switch - prepare to switch tasks + * @rq: the runqueue preparing to switch + * @prev: the current task that is being switched out + * @next: the task we are going to switch to. + * + * This is called with the rq lock held and interrupts off. It must + * be paired with a subsequent finish_task_switch after the context + * switch. + * + * prepare_task_switch sets up locking and calls architecture specific + * hooks. 
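The preempt_notifier_key above lets the sched_in/sched_out fast paths skip the notifier list walk entirely while nothing is registered. Below is a plain-C model of that gate using an atomic registration count; real static keys patch the instruction stream instead, so this only illustrates the intent.

#include <stdio.h>
#include <stdatomic.h>

/* Model of preempt_notifier_key: taken only while someone is registered. */
static atomic_int notifier_users;

static void fire_notifiers(const char *ev)
{
        printf("notifier fired: %s\n", ev);     /* stands in for the hlist walk */
}

static inline void maybe_fire(const char *ev)
{
        /* Fast path: a single read, no list walk when nothing is registered. */
        if (atomic_load_explicit(&notifier_users, memory_order_relaxed))
                fire_notifiers(ev);
}

int main(void)
{
        maybe_fire("sched_out");                /* nothing registered: skipped */
        atomic_fetch_add(&notifier_users, 1);   /* like preempt_notifier_inc() */
        maybe_fire("sched_in");                 /* now the slow path runs      */
        atomic_fetch_sub(&notifier_users, 1);   /* like preempt_notifier_dec() */
        return 0;
}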
+ */ +static inline void +prepare_task_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next) +{ + sched_info_switch(rq, prev, next); + perf_event_task_sched_out(prev, next); + fire_sched_out_preempt_notifiers(prev, next); + prepare_lock_switch(rq, next); + prepare_arch_switch(next); } -#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP) -#define MIGRATE_DISABLE_SET_AFFIN (1<<30) /* Can't make a negative */ -#define migrate_disabled_updated(p) ((p)->migrate_disable & MIGRATE_DISABLE_SET_AFFIN) -#define migrate_disable_count(p) ((p)->migrate_disable & ~MIGRATE_DISABLE_SET_AFFIN) - -static inline void update_migrate_disable(struct task_struct *p) +/** + * finish_task_switch - clean up after a task-switch + * @prev: the thread we just switched away from. + * + * finish_task_switch must be called after the context switch, paired + * with a prepare_task_switch call before the context switch. + * finish_task_switch will reconcile locking set up by prepare_task_switch, + * and do any other architecture-specific cleanup actions. + * + * Note that we may have delayed dropping an mm in context_switch(). If + * so, we finish that here outside of the runqueue lock. (Doing it + * with the lock held can cause deadlocks; see schedule() for + * details.) + * + * The context switch have flipped the stack from under us and restored the + * local variables which were saved when this task called schedule() in the + * past. prev == current is still correct but we need to recalculate this_rq + * because prev may have moved to another CPU. + */ +static struct rq *finish_task_switch(struct task_struct *prev) + __releases(rq->lock) { - const struct cpumask *mask; + struct rq *rq = this_rq(); + struct mm_struct *mm = rq->prev_mm; + long prev_state; - if (likely(!p->migrate_disable)) - return; + /* + * The previous task will have left us with a preempt_count of 2 + * because it left us after: + * + * schedule() + * preempt_disable(); // 1 + * __schedule() + * raw_spin_lock_irq(&rq->lock) // 2 + * + * Also, see FORK_PREEMPT_COUNT. + */ + if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, + "corrupted preempt_count: %s/%d/0x%x\n", + current->comm, current->pid, preempt_count())) + preempt_count_set(FORK_PREEMPT_COUNT); - /* Did we already update affinity? */ - if (unlikely(migrate_disabled_updated(p))) - return; + rq->prev_mm = NULL; /* - * Since this is always current we can get away with only locking - * rq->lock, the ->cpus_allowed value can normally only be changed - * while holding both p->pi_lock and rq->lock, but seeing that this - * is current, we cannot actually be waking up, so all code that - * relies on serialization against p->pi_lock is out of scope. + * A task struct has one reference for the use as "current". + * If a task dies, then it sets TASK_DEAD in tsk->state and calls + * schedule one last time. The schedule call will never return, and + * the scheduled task must drop that reference. * - * Having rq->lock serializes us against things like - * set_cpus_allowed_ptr() that can still happen concurrently. + * We must observe prev->state before clearing prev->on_cpu (in + * finish_lock_switch), otherwise a concurrent wakeup can get prev + * running on another CPU and we could rave with its RUNNING -> DEAD + * transition, resulting in a double drop. 
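finish_task_switch() above insists on exactly two preemption-disable levels at this point: one from schedule()'s preempt_disable() and one from the irq-disabled rq->lock. A counter-based sketch of that invariant, assuming PREEMPT_DISABLE_OFFSET is 1 as on PREEMPT_COUNT kernels:

#include <stdio.h>
#include <assert.h>

#define PREEMPT_DISABLE_OFFSET 1        /* assumed: one unit per preempt_disable() */

static int preempt_count;

static void preempt_disable(void)       { preempt_count += PREEMPT_DISABLE_OFFSET; }
static void preempt_enable(void)        { preempt_count -= PREEMPT_DISABLE_OFFSET; }
static void raw_spin_lock_irq(void)     { preempt_count += PREEMPT_DISABLE_OFFSET; }
static void raw_spin_unlock_irq(void)   { preempt_count -= PREEMPT_DISABLE_OFFSET; }

static void finish_task_switch(void)
{
        /* The invariant checked above: schedule() + rq->lock == 2 levels. */
        assert(preempt_count == 2 * PREEMPT_DISABLE_OFFSET);
        printf("preempt_count = %d, as expected\n", preempt_count);
}

int main(void)
{
        preempt_disable();      /* schedule() disables preemption     */
        raw_spin_lock_irq();    /* __schedule() then takes rq->lock   */
        finish_task_switch();   /* runs on the next task's stack      */
        raw_spin_unlock_irq();
        preempt_enable();
        return 0;
}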
+ */ + prev_state = prev->state; + vtime_task_switch(prev); + perf_event_task_sched_in(prev, current); + finish_lock_switch(rq, prev); + finish_arch_post_lock_switch(); + + fire_sched_in_preempt_notifiers(current); + /* + * We use mmdrop_delayed() here so we don't have to do the + * full __mmdrop() when we are the last user. */ - mask = tsk_cpus_allowed(p); + if (mm) + mmdrop_delayed(mm); + if (unlikely(prev_state == TASK_DEAD)) { + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, mask); - /* mask==cpumask_of(task_cpu(p)) which has a cpumask_weight==1 */ - p->nr_cpus_allowed = 1; + /* + * Remove function-return probe instances associated with this + * task and put them back on the free list. + */ + kprobe_flush_task(prev); + put_task_struct(prev); + } - /* Let migrate_enable know to fix things back up */ - p->migrate_disable |= MIGRATE_DISABLE_SET_AFFIN; + tick_nohz_task_switch(); + return rq; } -void migrate_disable(void) -{ - struct task_struct *p = current; +#ifdef CONFIG_SMP - if (in_atomic()) { -#ifdef CONFIG_SCHED_DEBUG - p->migrate_disable_atomic++; -#endif - return; - } +/* rq->lock is NOT held, but preemption is disabled */ +static void __balance_callback(struct rq *rq) +{ + struct callback_head *head, *next; + void (*func)(struct rq *rq); + unsigned long flags; -#ifdef CONFIG_SCHED_DEBUG - if (unlikely(p->migrate_disable_atomic)) { - tracing_off(); - WARN_ON_ONCE(1); - } -#endif + raw_spin_lock_irqsave(&rq->lock, flags); + head = rq->balance_callback; + rq->balance_callback = NULL; + while (head) { + func = (void (*)(struct rq *))head->func; + next = head->next; + head->next = NULL; + head = next; - if (p->migrate_disable) { - p->migrate_disable++; - return; + func(rq); } - - preempt_disable(); - preempt_lazy_disable(); - pin_current_cpu(); - p->migrate_disable = 1; - preempt_enable(); + raw_spin_unlock_irqrestore(&rq->lock, flags); } -EXPORT_SYMBOL(migrate_disable); -void migrate_enable(void) +static inline void balance_callback(struct rq *rq) { - struct task_struct *p = current; - const struct cpumask *mask; - unsigned long flags; - struct rq *rq; + if (unlikely(rq->balance_callback)) + __balance_callback(rq); +} - if (in_atomic()) { -#ifdef CONFIG_SCHED_DEBUG - p->migrate_disable_atomic--; -#endif - return; - } +#else -#ifdef CONFIG_SCHED_DEBUG - if (unlikely(p->migrate_disable_atomic)) { - tracing_off(); - WARN_ON_ONCE(1); - } -#endif - WARN_ON_ONCE(p->migrate_disable <= 0); +static inline void balance_callback(struct rq *rq) +{ +} - if (migrate_disable_count(p) > 1) { - p->migrate_disable--; - return; - } +#endif - preempt_disable(); - if (unlikely(migrate_disabled_updated(p))) { - /* - * Undo whatever update_migrate_disable() did, also see there - * about locking. - */ - rq = this_rq(); - raw_spin_lock_irqsave(&rq->lock, flags); +/** + * schedule_tail - first thing a freshly forked thread must call. + * @prev: the thread we just switched away from. + */ +asmlinkage __visible void schedule_tail(struct task_struct *prev) + __releases(rq->lock) +{ + struct rq *rq; - /* - * Clearing migrate_disable causes tsk_cpus_allowed to - * show the tasks original cpu affinity. 
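__balance_callback() above detaches the rq's singly-linked callback list and runs each entry in turn. A minimal model of queueing and draining such a list; the queue helper is simplified (no "already queued" check) and the callbacks only print their order.

#include <stdio.h>
#include <stddef.h>

/* Minimal model of the balance_callback list: LIFO push, drained in one go. */
struct callback_head {
        struct callback_head *next;
        void (*func)(struct callback_head *head);
};

static struct callback_head *balance_callback_list;

static void queue_balance_callback(struct callback_head *head,
                                   void (*func)(struct callback_head *))
{
        head->func = func;
        head->next = balance_callback_list;     /* push at the head */
        balance_callback_list = head;
}

static void cb_one(struct callback_head *head) { (void)head; printf("cb_one\n"); }
static void cb_two(struct callback_head *head) { (void)head; printf("cb_two\n"); }

static void balance_callbacks(void)
{
        struct callback_head *head = balance_callback_list;

        balance_callback_list = NULL;           /* detach the whole list */
        while (head) {
                struct callback_head *next = head->next;

                head->next = NULL;
                head->func(head);
                head = next;
        }
}

int main(void)
{
        static struct callback_head a, b;

        queue_balance_callback(&a, cb_one);
        queue_balance_callback(&b, cb_two);
        balance_callbacks();                    /* prints cb_two, then cb_one */
        return 0;
}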
- */ - p->migrate_disable = 0; - mask = tsk_cpus_allowed(p); - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, mask); - p->nr_cpus_allowed = cpumask_weight(mask); - raw_spin_unlock_irqrestore(&rq->lock, flags); - } else - p->migrate_disable = 0; + /* + * New tasks start with FORK_PREEMPT_COUNT, see there and + * finish_task_switch() for details. + * + * finish_task_switch() will drop rq->lock() and lower preempt_count + * and the preempt_enable() will end up enabling preemption (on + * PREEMPT_COUNT kernels). + */ - unpin_current_cpu(); + rq = finish_task_switch(prev); + balance_callback(rq); preempt_enable(); - preempt_lazy_enable(); + + if (current->set_child_tid) + put_user(task_pid_vnr(current), current->set_child_tid); } -EXPORT_SYMBOL(migrate_enable); -#else -static inline void update_migrate_disable(struct task_struct *p) { } -#define migrate_disabled_updated(p) 0 -#endif /* - * Pick up the highest-prio task: + * context_switch - switch to the new MM and the new thread's register state. */ -static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev) +static inline struct rq * +context_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next) { - const struct sched_class *class = &fair_sched_class; - struct task_struct *p; + struct mm_struct *mm, *oldmm; + prepare_task_switch(rq, prev, next); + + mm = next->mm; + oldmm = prev->active_mm; /* - * Optimization: we know that if all tasks are in - * the fair class we can call that function directly: + * For paravirt, this is coupled with an exit in switch_to to + * combine the page table reload and the switch backend into + * one hypercall. */ - if (likely(prev->sched_class == class && - rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq, prev); - if (unlikely(p == RETRY_TASK)) - goto again; + arch_start_context_switch(prev); - /* assumes fair_sched_class->next == idle_sched_class */ - if (unlikely(!p)) - p = idle_sched_class.pick_next_task(rq, prev); + if (!mm) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next); + } else + switch_mm(oldmm, mm, next); - return p; + if (!prev->mm) { + prev->active_mm = NULL; + rq->prev_mm = oldmm; } + /* + * Since the runqueue lock will be released by the next + * task (which is an invalid locking op but in the case + * of the scheduler it's an obvious special-case), so we + * do an early lockdep release here: + */ + lockdep_unpin_lock(&rq->lock); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); -again: - for_each_class(class) { - p = class->pick_next_task(rq, prev); - if (p) { - if (unlikely(p == RETRY_TASK)) - goto again; - return p; - } - } + /* Here we just switch the register state and the stack. */ + switch_to(prev, next, prev); + barrier(); - BUG(); /* the idle class will always have a runnable task */ + return finish_task_switch(prev); } /* - * __schedule() is the main scheduler function. - * - * The main means of driving the scheduler and thus entering this function are: - * - * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. - * - * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return - * paths. For example, see arch/x86/entry_64.S. - * - * To drive preemption between tasks, the scheduler sets the flag in timer - * interrupt handler scheduler_tick(). - * - * 3. Wakeups don't really cause entry into schedule(). They add a - * task to the run-queue and that's it. 
- * - * Now, if the new task added to the run-queue preempts the current - * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets - * called on the nearest possible occasion: - * - * - If the kernel is preemptible (CONFIG_PREEMPT=y): + * nr_running and nr_context_switches: * - * - in syscall or exception context, at the next outmost - * preempt_enable(). (this might be as soon as the wake_up()'s - * spin_unlock()!) + * externally visible scheduler statistics: current number of runnable + * threads, total number of context switches performed since bootup. + */ +unsigned long nr_running(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->nr_running; + + return sum; +} + +/* + * Check if only the current task is running on the cpu. * - * - in IRQ context, return from interrupt-handler to - * preemptible context + * Caution: this function does not check that the caller has disabled + * preemption, thus the result might have a time-of-check-to-time-of-use + * race. The caller is responsible to use it correctly, for example: * - * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) - * then at the next: + * - from a non-preemptable section (of course) * - * - cond_resched() call - * - explicit schedule() call - * - return from syscall or exception to user-space - * - return from interrupt-handler to user-space + * - from a thread that is bound to a single CPU * - * WARNING: all callers must re-check need_resched() afterward and reschedule - * accordingly in case an event triggered the need for rescheduling (such as - * an interrupt waking up a task) while preemption was disabled in __schedule(). + * - in a loop with very short iterations (e.g. a polling loop) */ -static void __sched __schedule(void) +bool single_task_running(void) { - struct task_struct *prev, *next; - unsigned long *switch_count; - struct rq *rq; - int cpu; + return raw_rq()->nr_running == 1; +} +EXPORT_SYMBOL(single_task_running); - preempt_disable(); - cpu = smp_processor_id(); - rq = cpu_rq(cpu); - rcu_note_context_switch(); - prev = rq->curr; +unsigned long long nr_context_switches(void) +{ + int i; + unsigned long long sum = 0; - schedule_debug(prev); + for_each_possible_cpu(i) + sum += cpu_rq(i)->nr_switches; - if (sched_feat(HRTICK)) - hrtick_clear(rq); + return sum; +} - /* - * Make sure that signal_pending_state()->signal_pending() below - * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) - * done by the caller to avoid the race with signal_wake_up(). 
- */ - smp_mb__before_spinlock(); - raw_spin_lock_irq(&rq->lock); +unsigned long nr_iowait(void) +{ + unsigned long i, sum = 0; - update_migrate_disable(prev); + for_each_possible_cpu(i) + sum += atomic_read(&cpu_rq(i)->nr_iowait); - rq->clock_skip_update <<= 1; /* promote REQ to ACT */ + return sum; +} - switch_count = &prev->nivcsw; - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely(signal_pending_state(prev->state, prev))) { - prev->state = TASK_RUNNING; - } else { - deactivate_task(rq, prev, DEQUEUE_SLEEP); - prev->on_rq = 0; - } - switch_count = &prev->nvcsw; - } +unsigned long nr_iowait_cpu(int cpu) +{ + struct rq *this = cpu_rq(cpu); + return atomic_read(&this->nr_iowait); +} + +void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) +{ + struct rq *rq = this_rq(); + *nr_waiters = atomic_read(&rq->nr_iowait); + *load = rq->load.weight; +} + +#ifdef CONFIG_SMP + +/* + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint. + */ +void sched_exec(void) +{ + struct task_struct *p = current; + unsigned long flags; + int dest_cpu; - if (task_on_rq_queued(prev)) - update_rq_clock(rq); + raw_spin_lock_irqsave(&p->pi_lock, flags); + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); + if (dest_cpu == smp_processor_id()) + goto unlock; - next = pick_next_task(rq, prev); - clear_tsk_need_resched(prev); - clear_tsk_need_resched_lazy(prev); - clear_preempt_need_resched(); - rq->clock_skip_update = 0; + if (likely(cpu_active(dest_cpu))) { + struct migration_arg arg = { p, dest_cpu }; - if (likely(prev != next)) { - rq->nr_switches++; - rq->curr = next; - ++*switch_count; + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); + return; + } +unlock: + raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} - rq = context_switch(rq, prev, next); /* unlocks the rq */ - cpu = cpu_of(rq); - } else - raw_spin_unlock_irq(&rq->lock); +#endif - post_schedule(rq); +DEFINE_PER_CPU(struct kernel_stat, kstat); +DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); - sched_preempt_enable_no_resched(); -} +EXPORT_PER_CPU_SYMBOL(kstat); +EXPORT_PER_CPU_SYMBOL(kernel_cpustat); -static inline void sched_submit_work(struct task_struct *tsk) +/* + * Return accounted runtime for the task. + * In case the task is currently running, return the runtime plus current's + * pending runtime that have not been accounted yet. + */ +unsigned long long task_sched_runtime(struct task_struct *p) { - if (!tsk->state) - return; + unsigned long flags; + struct rq *rq; + u64 ns; + +#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) /* - * If a worker went to sleep, notify and ask workqueue whether - * it wants to wake up a task to maintain concurrency. + * 64-bit doesn't need locks to atomically read a 64bit value. + * So we have a optimization chance when the task's delta_exec is 0. + * Reading ->on_cpu is racy, but this is ok. + * + * If we race with it leaving cpu, we'll take a lock. So we're correct. + * If we race with it entering cpu, unaccounted time is 0. This is + * indistinguishable from the read occurring a few cycles earlier. + * If we see ->on_cpu without ->on_rq, the task is leaving, and has + * been accounted, so we're correct here as well. 
*/ - if (tsk->flags & PF_WQ_WORKER) - wq_worker_sleeping(tsk); - - - if (tsk_is_pi_blocked(tsk)) - return; + if (!p->on_cpu || !task_on_rq_queued(p)) + return p->se.sum_exec_runtime; +#endif + rq = task_rq_lock(p, &flags); /* - * If we are going to sleep and we have plugged IO queued, - * make sure to submit it to avoid deadlocks. + * Must be ->curr _and_ ->on_rq. If dequeued, we would + * project cycles that may never be accounted to this + * thread, breaking clock_gettime(). */ - if (blk_needs_flush_plug(tsk)) - blk_schedule_flush_plug(tsk); -} + if (task_current(rq, p) && task_on_rq_queued(p)) { + update_rq_clock(rq); + p->sched_class->update_curr(rq); + } + ns = p->se.sum_exec_runtime; + task_rq_unlock(rq, p, &flags); -static void sched_update_worker(struct task_struct *tsk) -{ - if (tsk->flags & PF_WQ_WORKER) - wq_worker_running(tsk); + return ns; } -asmlinkage __visible void __sched schedule(void) +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + */ +void scheduler_tick(void) { - struct task_struct *tsk = current; + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + struct task_struct *curr = rq->curr; - sched_submit_work(tsk); - do { - __schedule(); - } while (need_resched()); - sched_update_worker(tsk); -} -EXPORT_SYMBOL(schedule); + sched_clock_tick(); -#ifdef CONFIG_CONTEXT_TRACKING -asmlinkage __visible void __sched schedule_user(void) -{ - /* - * If we come here after a random call to set_need_resched(), - * or we have been woken up remotely but the IPI has not yet arrived, - * we haven't yet exited the RCU idle mode. Do it here manually until - * we find a better solution. - * - * NB: There are buggy callers of this function. Ideally we - * should warn if prev_state != CONTEXT_USER, but that will trigger - * too frequently to make sense yet. - */ - enum ctx_state prev_state = exception_enter(); - schedule(); - exception_exit(prev_state); -} + raw_spin_lock(&rq->lock); + update_rq_clock(rq); + curr->sched_class->task_tick(rq, curr, 0); + update_cpu_load_active(rq); + calc_global_load_tick(rq); + raw_spin_unlock(&rq->lock); + + perf_event_task_tick(); + +#ifdef CONFIG_SMP + rq->idle_balance = idle_cpu(cpu); + trigger_load_balance(rq); #endif + rq_last_tick_reset(rq); +} +#ifdef CONFIG_NO_HZ_FULL /** - * schedule_preempt_disabled - called with preemption disabled + * scheduler_tick_max_deferment * - * Returns with preemption disabled. Note: preempt_count must be 1 + * Keep at least one tick per second when a single + * active task is running because the scheduler doesn't + * yet completely support full dynticks environment. + * + * This makes sure that uptime, CFS vruntime, load + * balancing, etc... continue to move forward, even + * with a very low granularity. + * + * Return: Maximum deferment in nanoseconds. */ -void __sched schedule_preempt_disabled(void) +u64 scheduler_tick_max_deferment(void) { - sched_preempt_enable_no_resched(); - schedule(); - preempt_disable(); -} + struct rq *rq = this_rq(); + unsigned long next, now = READ_ONCE(jiffies); -static void __sched notrace preempt_schedule_common(void) -{ - do { - __preempt_count_add(PREEMPT_ACTIVE); - __schedule(); - __preempt_count_sub(PREEMPT_ACTIVE); + next = rq->last_sched_tick + HZ; - /* - * Check again in case we missed a preemption opportunity - * between schedule and now. 
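scheduler_tick_max_deferment() above allows at most one second between ticks and compares jiffies with time_before_eq(), which stays correct across counter wrap. A standalone sketch of both pieces, with HZ and the ns-per-jiffy conversion assumed for the example:

#include <stdio.h>

#define HZ 250                                  /* assumed tick rate for the example */
#define NSEC_PER_JIFFY (1000000000ULL / HZ)

/* Wrap-safe "a <= b" for free-running counters, as time_before_eq() does. */
static int time_before_eq(unsigned long a, unsigned long b)
{
        return (long)(a - b) <= 0;
}

static unsigned long long max_deferment(unsigned long now,
                                        unsigned long last_sched_tick)
{
        unsigned long next = last_sched_tick + HZ;      /* at most 1s without a tick */

        if (time_before_eq(next, now))
                return 0;
        return (unsigned long long)(next - now) * NSEC_PER_JIFFY;
}

int main(void)
{
        printf("%llu ns\n", max_deferment(1000, 900));  /* 150 jiffies of headroom */
        printf("%llu ns\n", max_deferment(1000, 500));  /* already overdue: 0      */
        /* still correct right after the jiffies counter wraps */
        printf("%llu ns\n", max_deferment(5, (unsigned long)-100));
        return 0;
}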
- */ - barrier(); - } while (need_resched()); + if (time_before_eq(next, now)) + return 0; + + return jiffies_to_nsecs(next - now); } +#endif -#ifdef CONFIG_PREEMPT -/* - * this is the entry point to schedule() from in-kernel preemption - * off of preempt_enable. Kernel preemptions off return from interrupt - * occur there and call schedule directly. - */ -asmlinkage __visible void __sched notrace preempt_schedule(void) +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + +void preempt_count_add(int val) { +#ifdef CONFIG_DEBUG_PREEMPT /* - * If there is a non-zero preempt_count or interrupts are disabled, - * we do not want to preempt the current task. Just return.. + * Underflow? */ - if (likely(!preemptible())) + if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; - - preempt_schedule_common(); +#endif + __preempt_count_add(val); +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Spinlock count overflowing soon? + */ + DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= + PREEMPT_MASK - 10); +#endif + if (preempt_count() == val) { + unsigned long ip = get_lock_parent_ip(); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); + } } -NOKPROBE_SYMBOL(preempt_schedule); -EXPORT_SYMBOL(preempt_schedule); +EXPORT_SYMBOL(preempt_count_add); +NOKPROBE_SYMBOL(preempt_count_add); -#ifdef CONFIG_CONTEXT_TRACKING -/** - * preempt_schedule_context - preempt_schedule called by tracing - * - * The tracing infrastructure uses preempt_enable_notrace to prevent - * recursion and tracing preempt enabling caused by the tracing - * infrastructure itself. But as tracing can happen in areas coming - * from userspace or just about to enter userspace, a preempt enable - * can occur before user_exit() is called. This will cause the scheduler - * to be called when the system is still in usermode. - * - * To prevent this, the preempt_enable_notrace will use this function - * instead of preempt_schedule() to exit user context if needed before - * calling the scheduler. - */ -asmlinkage __visible void __sched notrace preempt_schedule_context(void) +void preempt_count_sub(int val) { - enum ctx_state prev_ctx; - - if (likely(!preemptible())) - return; - -#ifdef CONFIG_PREEMPT_LAZY +#ifdef CONFIG_DEBUG_PREEMPT /* - * Check for lazy preemption + * Underflow? */ - if (current_thread_info()->preempt_lazy_count && - !test_thread_flag(TIF_NEED_RESCHED)) + if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) return; -#endif - do { - __preempt_count_add(PREEMPT_ACTIVE); - /* - * Needs preempt disabled in case user_exit() is traced - * and the tracer calls preempt_enable_notrace() causing - * an infinite recursion. - */ - prev_ctx = exception_enter(); - /* - * The add/subtract must not be traced by the function - * tracer. But we still want to account for the - * preempt off latency tracer. Since the _notrace versions - * of add/subtract skip the accounting for latency tracer - * we must force it manually. - */ - start_critical_timings(); - __schedule(); - stop_critical_timings(); - exception_exit(prev_ctx); + /* + * Is the spinlock portion underflowing? 
+ */ + if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && + !(preempt_count() & PREEMPT_MASK))) + return; +#endif - __preempt_count_sub(PREEMPT_ACTIVE); - barrier(); - } while (need_resched()); + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); + __preempt_count_sub(val); } -EXPORT_SYMBOL_GPL(preempt_schedule_context); -#endif /* CONFIG_CONTEXT_TRACKING */ +EXPORT_SYMBOL(preempt_count_sub); +NOKPROBE_SYMBOL(preempt_count_sub); -#endif /* CONFIG_PREEMPT */ +#endif /* - * this is the entry point to schedule() from kernel preemption - * off of irq context. - * Note, that this is called and return with irqs disabled. This will - * protect us against recursive calling from irq. + * Print scheduling while atomic bug: */ -asmlinkage __visible void __sched preempt_schedule_irq(void) +static noinline void __schedule_bug(struct task_struct *prev) { - enum ctx_state prev_state; - - /* Catch callers which need to be fixed */ - BUG_ON(preempt_count() || !irqs_disabled()); - - prev_state = exception_enter(); - - do { - __preempt_count_add(PREEMPT_ACTIVE); - local_irq_enable(); - __schedule(); - local_irq_disable(); - __preempt_count_sub(PREEMPT_ACTIVE); - - /* - * Check again in case we missed a preemption opportunity - * between schedule and now. - */ - barrier(); - } while (need_resched()); + if (oops_in_progress) + return; - exception_exit(prev_state); -} + printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", + prev->comm, prev->pid, preempt_count()); -int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, - void *key) -{ - return try_to_wake_up(curr->private, mode, wake_flags); + debug_show_held_locks(prev); + print_modules(); + if (irqs_disabled()) + print_irqtrace_events(prev); +#ifdef CONFIG_DEBUG_PREEMPT + if (in_atomic_preempt_off()) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } -EXPORT_SYMBOL(default_wake_function); - -#ifdef CONFIG_RT_MUTEXES /* - * rt_mutex_setprio - set the current priority of a task - * @p: task - * @prio: prio value (kernel-internal form) - * - * This function changes the 'effective' priority of a task. It does - * not touch ->normal_prio like __setscheduler(). - * - * Used by the rt_mutex code to implement priority inheritance - * logic. Call site only calls if the priority of the task changed. + * Various schedule()-time debugging checks and statistics: */ -void rt_mutex_setprio(struct task_struct *p, int prio) +static inline void schedule_debug(struct task_struct *prev) { - int oldprio, queued, running, enqueue_flag = 0; - struct rq *rq; - const struct sched_class *prev_class; - - BUG_ON(prio > MAX_PRIO); - - rq = __task_rq_lock(p); - - /* - * Idle task boosting is a nono in general. There is one - * exception, when PREEMPT_RT and NOHZ is active: - * - * The idle task calls get_next_timer_interrupt() and holds - * the timer wheel base->lock on the CPU and another CPU wants - * to access the timer (probably to cancel it). We can safely - * ignore the boosting request, as the idle CPU runs this code - * with interrupts disabled and will complete the lock - * protected section without being interrupted. So there is no - * real need to boost. 
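The DEBUG_PREEMPT checks above guard preempt_count against underflow and against the nesting depth overflowing its low bits. A sketch of the "overflowing soon" threshold, assuming the conventional layout where the preempt/spinlock depth occupies the low 8 bits:

#include <stdio.h>

/* Conventional preempt_count layout (assumed): low 8 bits count the nesting. */
#define PREEMPT_BITS    8
#define PREEMPT_MASK    ((1U << PREEMPT_BITS) - 1)      /* 0x000000ff */

static unsigned int preempt_count_val;

static void preempt_count_add(unsigned int val)
{
        preempt_count_val += val;
        /* The same "spinlock count overflowing soon?" check as above. */
        if ((preempt_count_val & PREEMPT_MASK) >= PREEMPT_MASK - 10)
                printf("warning: preempt depth %u is close to overflowing 8 bits\n",
                       preempt_count_val & PREEMPT_MASK);
}

int main(void)
{
        unsigned int i;

        for (i = 0; i < 244; i++)
                preempt_count_add(1);   /* depth 244: still silent             */
        preempt_count_add(1);           /* depth 245 crosses PREEMPT_MASK - 10 */
        return 0;
}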
- */ - if (unlikely(p == rq->idle)) { - WARN_ON(p != rq->curr); - WARN_ON(p->pi_blocked_on); - goto out_unlock; - } - - trace_sched_pi_setprio(p, prio); - oldprio = p->prio; - prev_class = p->sched_class; - queued = task_on_rq_queued(p); - running = task_current(rq, p); - if (queued) - dequeue_task(rq, p, 0); - if (running) - put_prev_task(rq, p); +#ifdef CONFIG_SCHED_STACK_END_CHECK + BUG_ON(task_stack_end_corrupted(prev)); +#endif - /* - * Boosting condition are: - * 1. -rt task is running and holds mutex A - * --> -dl task blocks on mutex A - * - * 2. -dl task is running and holds mutex A - * --> -dl task blocks on mutex A and could preempt the - * running task - */ - if (dl_prio(prio)) { - struct task_struct *pi_task = rt_mutex_get_top_task(p); - if (!dl_prio(p->normal_prio) || - (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { - p->dl.dl_boosted = 1; - p->dl.dl_throttled = 0; - enqueue_flag = ENQUEUE_REPLENISH; - } else - p->dl.dl_boosted = 0; - p->sched_class = &dl_sched_class; - } else if (rt_prio(prio)) { - if (dl_prio(oldprio)) - p->dl.dl_boosted = 0; - if (oldprio < prio) - enqueue_flag = ENQUEUE_HEAD; - p->sched_class = &rt_sched_class; - } else { - if (dl_prio(oldprio)) - p->dl.dl_boosted = 0; - if (rt_prio(oldprio)) - p->rt.timeout = 0; - p->sched_class = &fair_sched_class; + if (unlikely(in_atomic_preempt_off())) { + __schedule_bug(prev); + preempt_count_set(PREEMPT_DISABLED); } + rcu_sleep_check(); - p->prio = prio; - - if (running) - p->sched_class->set_curr_task(rq); - if (queued) - enqueue_task(rq, p, enqueue_flag); + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); - check_class_changed(rq, p, prev_class, oldprio); -out_unlock: - __task_rq_unlock(rq); + schedstat_inc(this_rq(), sched_count); } -#endif -void set_user_nice(struct task_struct *p, long nice) +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP) + +void migrate_disable(void) { - int old_prio, delta, queued; - unsigned long flags; - struct rq *rq; + struct task_struct *p = current; - if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) + if (in_atomic() || irqs_disabled()) { +#ifdef CONFIG_SCHED_DEBUG + p->migrate_disable_atomic++; +#endif return; - /* - * We have to be careful, if called from sys_setpriority(), - * the task might be in the middle of scheduling on another CPU. 
- */ - rq = task_rq_lock(p, &flags); - /* - * The RT priorities are set via sched_setscheduler(), but we still - * allow the 'normal' nice value to be set - but as expected - * it wont have any effect on scheduling until the task is - * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: - */ - if (task_has_dl_policy(p) || task_has_rt_policy(p)) { - p->static_prio = NICE_TO_PRIO(nice); - goto out_unlock; } - queued = task_on_rq_queued(p); - if (queued) - dequeue_task(rq, p, 0); - - p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p); - old_prio = p->prio; - p->prio = effective_prio(p); - delta = p->prio - old_prio; - if (queued) { - enqueue_task(rq, p, 0); - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - if (delta < 0 || (delta > 0 && task_running(rq, p))) - resched_curr(rq); +#ifdef CONFIG_SCHED_DEBUG + if (unlikely(p->migrate_disable_atomic)) { + tracing_off(); + WARN_ON_ONCE(1); } -out_unlock: - task_rq_unlock(rq, p, &flags); -} -EXPORT_SYMBOL(set_user_nice); +#endif -/* - * can_nice - check if a task can reduce its nice value - * @p: task - * @nice: nice value - */ -int can_nice(const struct task_struct *p, const int nice) -{ - /* convert nice value [19,-20] to rlimit style value [1,40] */ - int nice_rlim = nice_to_rlimit(nice); + if (p->migrate_disable) { + p->migrate_disable++; + return; + } - return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || - capable(CAP_SYS_NICE)); + preempt_disable(); + preempt_lazy_disable(); + pin_current_cpu(); + p->migrate_disable = 1; + preempt_enable(); } +EXPORT_SYMBOL(migrate_disable); -#ifdef __ARCH_WANT_SYS_NICE - -/* - * sys_nice - change the priority of the current process. - * @increment: priority increment - * - * sys_setpriority is a more generic, but much slower function that - * does similar things. - */ -SYSCALL_DEFINE1(nice, int, increment) +void migrate_enable(void) { - long nice, retval; + struct task_struct *p = current; - /* - * Setpriority might change our priority at the same moment. - * We don't have to worry. Conceptually one call occurs first - * and we have a single winner. - */ - increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); - nice = task_nice(current) + increment; + if (in_atomic() || irqs_disabled()) { +#ifdef CONFIG_SCHED_DEBUG + p->migrate_disable_atomic--; +#endif + return; + } - nice = clamp_val(nice, MIN_NICE, MAX_NICE); - if (increment < 0 && !can_nice(current, nice)) - return -EPERM; +#ifdef CONFIG_SCHED_DEBUG + if (unlikely(p->migrate_disable_atomic)) { + tracing_off(); + WARN_ON_ONCE(1); + } +#endif + WARN_ON_ONCE(p->migrate_disable <= 0); + + if (p->migrate_disable > 1) { + p->migrate_disable--; + return; + } - retval = security_task_setnice(current, nice); - if (retval) - return retval; + preempt_disable(); + /* + * Clearing migrate_disable causes tsk_cpus_allowed to + * show the tasks original cpu affinity. + */ + p->migrate_disable = 0; - set_user_nice(current, nice); - return 0; + unpin_current_cpu(); + preempt_enable(); + preempt_lazy_enable(); } - +EXPORT_SYMBOL(migrate_enable); #endif -/** - * task_prio - return the priority value of a given task. - * @p: the task in question. - * - * Return: The priority value as seen by users in /proc. - * RT tasks are offset by -200. Normal tasks are centered - * around 0, value goes from -16 to +15. 
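migrate_disable()/migrate_enable() above nest: only the outermost disable pins the task to its cpu and only the matching outermost enable unpins it. A small model of that reference counting; pin_current_cpu()/unpin_current_cpu() are stand-ins for the kernel's hotplug pinning.

#include <stdio.h>

static int migrate_disable_depth;       /* models p->migrate_disable */

static void pin_current_cpu(void)   { printf("pinned to current cpu\n"); }
static void unpin_current_cpu(void) { printf("unpinned\n"); }

static void migrate_disable(void)
{
        if (migrate_disable_depth++)    /* nested call: just count it   */
                return;
        pin_current_cpu();              /* outermost call does the work */
}

static void migrate_enable(void)
{
        if (--migrate_disable_depth)    /* still nested: nothing to undo */
                return;
        unpin_current_cpu();            /* last enable drops the pin     */
}

int main(void)
{
        migrate_disable();
        migrate_disable();      /* nested        */
        migrate_enable();       /* still pinned  */
        migrate_enable();       /* now unpinned  */
        return 0;
}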
+/* + * Pick up the highest-prio task: */ -int task_prio(const struct task_struct *p) +static inline struct task_struct * +pick_next_task(struct rq *rq, struct task_struct *prev) { - return p->prio - MAX_RT_PRIO; -} + const struct sched_class *class = &fair_sched_class; + struct task_struct *p; -/** - * idle_cpu - is a given cpu idle currently? - * @cpu: the processor in question. - * - * Return: 1 if the CPU is currently idle. 0 otherwise. - */ -int idle_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); + /* + * Optimization: we know that if all tasks are in + * the fair class we can call that function directly: + */ + if (likely(prev->sched_class == class && + rq->nr_running == rq->cfs.h_nr_running)) { + p = fair_sched_class.pick_next_task(rq, prev); + if (unlikely(p == RETRY_TASK)) + goto again; - if (rq->curr != rq->idle) - return 0; + /* assumes fair_sched_class->next == idle_sched_class */ + if (unlikely(!p)) + p = idle_sched_class.pick_next_task(rq, prev); - if (rq->nr_running) - return 0; + return p; + } -#ifdef CONFIG_SMP - if (!llist_empty(&rq->wake_list)) - return 0; -#endif +again: + for_each_class(class) { + p = class->pick_next_task(rq, prev); + if (p) { + if (unlikely(p == RETRY_TASK)) + goto again; + return p; + } + } - return 1; + BUG(); /* the idle class will always have a runnable task */ } -/** - * idle_task - return the idle task for a given cpu. - * @cpu: the processor in question. +/* + * __schedule() is the main scheduler function. * - * Return: The idle task for the cpu @cpu. - */ -struct task_struct *idle_task(int cpu) -{ - return cpu_rq(cpu)->idle; -} - -/** - * find_process_by_pid - find a process with a matching PID value. - * @pid: the pid in question. + * The main means of driving the scheduler and thus entering this function are: * - * The task of @pid, if found. %NULL otherwise. - */ -static struct task_struct *find_process_by_pid(pid_t pid) -{ - return pid ? find_task_by_vpid(pid) : current; -} - -/* - * This function initializes the sched_dl_entity of a newly becoming - * SCHED_DEADLINE task. + * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. * - * Only the static values are considered here, the actual runtime and the - * absolute deadline will be properly calculated when the task is enqueued - * for the first time with its new policy. + * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return + * paths. For example, see arch/x86/entry_64.S. + * + * To drive preemption between tasks, the scheduler sets the flag in timer + * interrupt handler scheduler_tick(). + * + * 3. Wakeups don't really cause entry into schedule(). They add a + * task to the run-queue and that's it. + * + * Now, if the new task added to the run-queue preempts the current + * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets + * called on the nearest possible occasion: + * + * - If the kernel is preemptible (CONFIG_PREEMPT=y): + * + * - in syscall or exception context, at the next outmost + * preempt_enable(). (this might be as soon as the wake_up()'s + * spin_unlock()!) + * + * - in IRQ context, return from interrupt-handler to + * preemptible context + * + * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) + * then at the next: + * + * - cond_resched() call + * - explicit schedule() call + * - return from syscall or exception to user-space + * - return from interrupt-handler to user-space + * + * WARNING: must be called with preemption disabled! 
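/*
 * A minimal sketch, using made-up names, of the "explicit schedule() /
 * cond_resched()" entry point listed in the comment above, as it might
 * appear in a long-running kernel loop on a non-preemptible kernel.
 */
#include <linux/list.h>
#include <linux/sched.h>

struct example_item {
        struct list_head node;
        void (*work)(struct example_item *item);
};

static void example_process_items(struct list_head *items)
{
        struct example_item *item;

        list_for_each_entry(item, items, node) {
                item->work(item);
                cond_resched(); /* voluntary preemption point on !CONFIG_PREEMPT */
        }
}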
*/ -static void -__setparam_dl(struct task_struct *p, const struct sched_attr *attr) +static void __sched notrace __schedule(bool preempt) { - struct sched_dl_entity *dl_se = &p->dl; + struct task_struct *prev, *next; + unsigned long *switch_count; + struct rq *rq; + int cpu; - dl_se->dl_runtime = attr->sched_runtime; - dl_se->dl_deadline = attr->sched_deadline; - dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; - dl_se->flags = attr->sched_flags; - dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); + cpu = smp_processor_id(); + rq = cpu_rq(cpu); + rcu_note_context_switch(); + prev = rq->curr; /* - * Changing the parameters of a task is 'tricky' and we're not doing - * the correct thing -- also see task_dead_dl() and switched_from_dl(). - * - * What we SHOULD do is delay the bandwidth release until the 0-lag - * point. This would include retaining the task_struct until that time - * and change dl_overflow() to not immediately decrement the current - * amount. - * - * Instead we retain the current runtime/deadline and let the new - * parameters take effect after the current reservation period lapses. - * This is safe (albeit pessimistic) because the 0-lag point is always - * before the current scheduling deadline. + * do_exit() calls schedule() with preemption disabled as an exception; + * however we must fix that up, otherwise the next task will see an + * inconsistent (higher) preempt count. * - * We can still have temporary overloads because we do not delay the - * change in bandwidth until that time; so admission control is - * not on the safe side. It does however guarantee tasks will never - * consume more than promised. + * It also avoids the below schedule_debug() test from complaining + * about this. */ -} + if (unlikely(prev->state == TASK_DEAD)) + preempt_enable_no_resched_notrace(); -/* - * sched_setparam() passes in -1 for its policy, to let the functions - * it calls know not to change it. - */ -#define SETPARAM_POLICY -1 + schedule_debug(prev); -static void __setscheduler_params(struct task_struct *p, - const struct sched_attr *attr) -{ - int policy = attr->sched_policy; + if (sched_feat(HRTICK)) + hrtick_clear(rq); - if (policy == SETPARAM_POLICY) - policy = p->policy; + /* + * Make sure that signal_pending_state()->signal_pending() below + * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) + * done by the caller to avoid the race with signal_wake_up(). + */ + smp_mb__before_spinlock(); + raw_spin_lock_irq(&rq->lock); + lockdep_pin_lock(&rq->lock); - p->policy = policy; + rq->clock_skip_update <<= 1; /* promote REQ to ACT */ - if (dl_policy(policy)) - __setparam_dl(p, attr); - else if (fair_policy(policy)) - p->static_prio = NICE_TO_PRIO(attr->sched_nice); + switch_count = &prev->nivcsw; + if (!preempt && prev->state) { + if (unlikely(signal_pending_state(prev->state, prev))) { + prev->state = TASK_RUNNING; + } else { + deactivate_task(rq, prev, DEQUEUE_SLEEP); + prev->on_rq = 0; + } + switch_count = &prev->nvcsw; + } - /* - * __sched_setscheduler() ensures attr->sched_priority == 0 when - * !rt_policy. Always setting this ensures that things like - * getparam()/getattr() don't report silly values for !rt tasks. 
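/*
 * A minimal sketch, with made-up names, of the caller-side pattern the
 * smp_mb__before_spinlock() comment in __schedule() refers to: the task
 * publishes TASK_INTERRUPTIBLE before testing its condition, so a racing
 * wake_up()/signal_wake_up() cannot be missed.
 */
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/wait.h>

static int example_wait_for(bool *example_cond, wait_queue_head_t *example_wq)
{
        DEFINE_WAIT(wait);
        int ret = 0;

        for (;;) {
                prepare_to_wait(example_wq, &wait, TASK_INTERRUPTIBLE);
                if (READ_ONCE(*example_cond))
                        break;
                if (signal_pending(current)) {
                        ret = -ERESTARTSYS;
                        break;
                }
                schedule();     /* ends up in __schedule(false) above */
        }
        finish_wait(example_wq, &wait);
        return ret;
}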
- */ - p->rt_priority = attr->sched_priority; - p->normal_prio = normal_prio(p); - set_load_weight(p); + if (task_on_rq_queued(prev)) + update_rq_clock(rq); + + next = pick_next_task(rq, prev); + clear_tsk_need_resched(prev); + clear_tsk_need_resched_lazy(prev); + clear_preempt_need_resched(); + rq->clock_skip_update = 0; + + if (likely(prev != next)) { + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + + trace_sched_switch(preempt, prev, next); + rq = context_switch(rq, prev, next); /* unlocks the rq */ + cpu = cpu_of(rq); + } else { + lockdep_unpin_lock(&rq->lock); + raw_spin_unlock_irq(&rq->lock); + } + + balance_callback(rq); } -/* Actually do priority change: must hold pi & rq lock. */ -static void __setscheduler(struct rq *rq, struct task_struct *p, - const struct sched_attr *attr, bool keep_boost) +static inline void sched_submit_work(struct task_struct *tsk) { - __setscheduler_params(p, attr); + if (!tsk->state) + return; + /* + * If a worker went to sleep, notify and ask workqueue whether + * it wants to wake up a task to maintain concurrency. + */ + if (tsk->flags & PF_WQ_WORKER) + wq_worker_sleeping(tsk); + + + if (tsk_is_pi_blocked(tsk)) + return; /* - * Keep a potential priority boosting if called from - * sched_setscheduler(). + * If we are going to sleep and we have plugged IO queued, + * make sure to submit it to avoid deadlocks. */ - if (keep_boost) - p->prio = rt_mutex_get_effective_prio(p, normal_prio(p)); - else - p->prio = normal_prio(p); + if (blk_needs_flush_plug(tsk)) + blk_schedule_flush_plug(tsk); +} - if (dl_prio(p->prio)) - p->sched_class = &dl_sched_class; - else if (rt_prio(p->prio)) - p->sched_class = &rt_sched_class; - else - p->sched_class = &fair_sched_class; +static void sched_update_worker(struct task_struct *tsk) +{ + if (tsk->flags & PF_WQ_WORKER) + wq_worker_running(tsk); } -static void -__getparam_dl(struct task_struct *p, struct sched_attr *attr) +asmlinkage __visible void __sched schedule(void) { - struct sched_dl_entity *dl_se = &p->dl; + struct task_struct *tsk = current; - attr->sched_priority = p->rt_priority; - attr->sched_runtime = dl_se->dl_runtime; - attr->sched_deadline = dl_se->dl_deadline; - attr->sched_period = dl_se->dl_period; - attr->sched_flags = dl_se->flags; + sched_submit_work(tsk); + do { + preempt_disable(); + __schedule(false); + sched_preempt_enable_no_resched(); + } while (need_resched()); + sched_update_worker(tsk); } +EXPORT_SYMBOL(schedule); -/* - * This function validates the new parameters of a -deadline task. - * We ask for the deadline not being zero, and greater or equal - * than the runtime, as well as the period of being zero or - * greater than deadline. Furthermore, we have to be sure that - * user parameters are above the internal resolution of 1us (we - * check sched_runtime only since it is always the smaller one) and - * below 2^63 ns (we have to check both sched_deadline and - * sched_period, as the latter can be zero). - */ -static bool -__checkparam_dl(const struct sched_attr *attr) +#ifdef CONFIG_CONTEXT_TRACKING +asmlinkage __visible void __sched schedule_user(void) { - /* deadline != 0 */ - if (attr->sched_deadline == 0) - return false; - /* - * Since we truncate DL_SCALE bits, make sure we're at least - * that big. + * If we come here after a random call to set_need_resched(), + * or we have been woken up remotely but the IPI has not yet arrived, + * we haven't yet exited the RCU idle mode. Do it here manually until + * we find a better solution. 
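/*
 * A minimal sketch, assuming a made-up per-CPU counter, of how kernel code
 * reaches preempt_schedule() above on CONFIG_PREEMPT: if TIF_NEED_RESCHED
 * was set while the preempt count was non-zero, the closing
 * preempt_enable() is the preemption point.
 */
#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(unsigned long, example_events_seen);

static void example_mark_seen(void)
{
        preempt_disable();                      /* short non-preemptible section */
        __this_cpu_inc(example_events_seen);
        preempt_enable();                       /* may call preempt_schedule(),
                                                 * i.e. __schedule(true) */
}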
+ * + * NB: There are buggy callers of this function. Ideally we + * should warn if prev_state != CONTEXT_USER, but that will trigger + * too frequently to make sense yet. */ - if (attr->sched_runtime < (1ULL << DL_SCALE)) - return false; + enum ctx_state prev_state = exception_enter(); + schedule(); + exception_exit(prev_state); +} +#endif - /* - * Since we use the MSB for wrap-around and sign issues, make - * sure it's not set (mind that period can be equal to zero). - */ - if (attr->sched_deadline & (1ULL << 63) || - attr->sched_period & (1ULL << 63)) - return false; +/** + * schedule_preempt_disabled - called with preemption disabled + * + * Returns with preemption disabled. Note: preempt_count must be 1 + */ +void __sched schedule_preempt_disabled(void) +{ + sched_preempt_enable_no_resched(); + schedule(); + preempt_disable(); +} - /* runtime <= deadline <= period (if period != 0) */ - if ((attr->sched_period != 0 && - attr->sched_period < attr->sched_deadline) || - attr->sched_deadline < attr->sched_runtime) - return false; +static void __sched notrace preempt_schedule_common(void) +{ + do { + preempt_disable_notrace(); + __schedule(true); + preempt_enable_no_resched_notrace(); - return true; + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + } while (need_resched()); } +#ifdef CONFIG_PREEMPT_LAZY /* - * check the target process has a UID that matches the current process's + * If TIF_NEED_RESCHED is then we allow to be scheduled away since this is + * set by a RT task. Oterwise we try to avoid beeing scheduled out as long as + * preempt_lazy_count counter >0. */ -static bool check_same_owner(struct task_struct *p) +static __always_inline int preemptible_lazy(void) { - const struct cred *cred = current_cred(), *pcred; - bool match; - - rcu_read_lock(); - pcred = __task_cred(p); - match = (uid_eq(cred->euid, pcred->euid) || - uid_eq(cred->euid, pcred->uid)); - rcu_read_unlock(); - return match; + if (test_thread_flag(TIF_NEED_RESCHED)) + return 1; + if (current_thread_info()->preempt_lazy_count) + return 0; + return 1; } -static bool dl_param_changed(struct task_struct *p, - const struct sched_attr *attr) +#else + +static int preemptible_lazy(void) { - struct sched_dl_entity *dl_se = &p->dl; + return 1; +} - if (dl_se->dl_runtime != attr->sched_runtime || - dl_se->dl_deadline != attr->sched_deadline || - dl_se->dl_period != attr->sched_period || - dl_se->flags != attr->sched_flags) - return true; +#endif - return false; +#ifdef CONFIG_PREEMPT +/* + * this is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. Kernel preemptions off return from interrupt + * occur there and call schedule directly. + */ +asmlinkage __visible void __sched notrace preempt_schedule(void) +{ + /* + * If there is a non-zero preempt_count or interrupts are disabled, + * we do not want to preempt the current task. Just return.. + */ + if (likely(!preemptible())) + return; + if (!preemptible_lazy()) + return; + + preempt_schedule_common(); } +NOKPROBE_SYMBOL(preempt_schedule); +EXPORT_SYMBOL(preempt_schedule); -static int __sched_setscheduler(struct task_struct *p, - const struct sched_attr *attr, - bool user) +/** + * preempt_schedule_notrace - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. 
But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. + * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. + */ +asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) { - int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : - MAX_RT_PRIO - 1 - attr->sched_priority; - int retval, oldprio, oldpolicy = -1, queued, running; - int new_effective_prio, policy = attr->sched_policy; - unsigned long flags; - const struct sched_class *prev_class; - struct rq *rq; - int reset_on_fork; - - /* may grab non-irq protected spin_locks */ - BUG_ON(in_interrupt()); -recheck: - /* double check policy once rq lock held */ - if (policy < 0) { - reset_on_fork = p->sched_reset_on_fork; - policy = oldpolicy = p->policy; - } else { - reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); + enum ctx_state prev_ctx; - if (policy != SCHED_DEADLINE && - policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) - return -EINVAL; - } + if (likely(!preemptible())) + return; + if (!preemptible_lazy()) + return; - if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) - return -EINVAL; + do { + preempt_disable_notrace(); + /* + * Needs preempt disabled in case user_exit() is traced + * and the tracer calls preempt_enable_notrace() causing + * an infinite recursion. + */ + prev_ctx = exception_enter(); + /* + * The add/subtract must not be traced by the function + * tracer. But we still want to account for the + * preempt off latency tracer. Since the _notrace versions + * of add/subtract skip the accounting for latency tracer + * we must force it manually. + */ + start_critical_timings(); + __schedule(true); + stop_critical_timings(); + exception_exit(prev_ctx); - /* - * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, - * SCHED_BATCH and SCHED_IDLE is 0. - */ - if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || - (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) - return -EINVAL; - if ((dl_policy(policy) && !__checkparam_dl(attr)) || - (rt_policy(policy) != (attr->sched_priority != 0))) - return -EINVAL; + preempt_enable_no_resched_notrace(); + } while (need_resched()); +} +EXPORT_SYMBOL_GPL(preempt_schedule_notrace); - /* - * Allow unprivileged RT tasks to decrease priority: - */ - if (user && !capable(CAP_SYS_NICE)) { - if (fair_policy(policy)) { - if (attr->sched_nice < task_nice(p) && - !can_nice(p, attr->sched_nice)) - return -EPERM; - } +#endif /* CONFIG_PREEMPT */ - if (rt_policy(policy)) { - unsigned long rlim_rtprio = - task_rlimit(p, RLIMIT_RTPRIO); +/* + * this is the entry point to schedule() from kernel preemption + * off of irq context. + * Note, that this is called and return with irqs disabled. This will + * protect us against recursive calling from irq. 
+ */ +asmlinkage __visible void __sched preempt_schedule_irq(void) +{ + enum ctx_state prev_state; - /* can't set/change the rt policy */ - if (policy != p->policy && !rlim_rtprio) - return -EPERM; + /* Catch callers which need to be fixed */ + BUG_ON(preempt_count() || !irqs_disabled()); - /* can't increase priority */ - if (attr->sched_priority > p->rt_priority && - attr->sched_priority > rlim_rtprio) - return -EPERM; - } + prev_state = exception_enter(); - /* - * Can't set/change SCHED_DEADLINE policy at all for now - * (safest behavior); in the future we would like to allow - * unprivileged DL tasks to increase their relative deadline - * or reduce their runtime (both ways reducing utilization) - */ - if (dl_policy(policy)) - return -EPERM; + do { + preempt_disable(); + local_irq_enable(); + __schedule(true); + local_irq_disable(); + sched_preempt_enable_no_resched(); + } while (need_resched()); - /* - * Treat SCHED_IDLE as nice 20. Only allow a switch to - * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. - */ - if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { - if (!can_nice(p, task_nice(p))) - return -EPERM; - } + exception_exit(prev_state); +} - /* can't change other user's priorities */ - if (!check_same_owner(p)) - return -EPERM; +int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, + void *key) +{ + return try_to_wake_up(curr->private, mode, wake_flags); +} +EXPORT_SYMBOL(default_wake_function); - /* Normal users shall not reset the sched_reset_on_fork flag */ - if (p->sched_reset_on_fork && !reset_on_fork) - return -EPERM; - } +#ifdef CONFIG_RT_MUTEXES - if (user) { - retval = security_task_setscheduler(p); - if (retval) - return retval; - } +/* + * rt_mutex_setprio - set the current priority of a task + * @p: task + * @prio: prio value (kernel-internal form) + * + * This function changes the 'effective' priority of a task. It does + * not touch ->normal_prio like __setscheduler(). + * + * Used by the rt_mutex code to implement priority inheritance + * logic. Call site only calls if the priority of the task changed. + */ +void rt_mutex_setprio(struct task_struct *p, int prio) +{ + int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; + struct rq *rq; + const struct sched_class *prev_class; + + BUG_ON(prio > MAX_PRIO); + + rq = __task_rq_lock(p); /* - * make sure no PI-waiters arrive (or leave) while we are - * changing the priority of the task: + * Idle task boosting is a nono in general. There is one + * exception, when PREEMPT_RT and NOHZ is active: * - * To be able to change p->policy safely, the appropriate - * runqueue lock must be held. + * The idle task calls get_next_timer_interrupt() and holds + * the timer wheel base->lock on the CPU and another CPU wants + * to access the timer (probably to cancel it). We can safely + * ignore the boosting request, as the idle CPU runs this code + * with interrupts disabled and will complete the lock + * protected section without being interrupted. So there is no + * real need to boost. */ - rq = task_rq_lock(p, &flags); + if (unlikely(p == rq->idle)) { + WARN_ON(p != rq->curr); + WARN_ON(p->pi_blocked_on); + goto out_unlock; + } + + trace_sched_pi_setprio(p, prio); + oldprio = p->prio; + prev_class = p->sched_class; + queued = task_on_rq_queued(p); + running = task_current(rq, p); + if (queued) + dequeue_task(rq, p, DEQUEUE_SAVE); + if (running) + put_prev_task(rq, p); /* - * Changing the policy of the stop threads its a very bad idea + * Boosting condition are: + * 1. 
-rt task is running and holds mutex A + * --> -dl task blocks on mutex A + * + * 2. -dl task is running and holds mutex A + * --> -dl task blocks on mutex A and could preempt the + * running task */ - if (p == rq->stop) { - task_rq_unlock(rq, p, &flags); - return -EINVAL; + if (dl_prio(prio)) { + struct task_struct *pi_task = rt_mutex_get_top_task(p); + if (!dl_prio(p->normal_prio) || + (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { + p->dl.dl_boosted = 1; + enqueue_flag |= ENQUEUE_REPLENISH; + } else + p->dl.dl_boosted = 0; + p->sched_class = &dl_sched_class; + } else if (rt_prio(prio)) { + if (dl_prio(oldprio)) + p->dl.dl_boosted = 0; + if (oldprio < prio) + enqueue_flag |= ENQUEUE_HEAD; + p->sched_class = &rt_sched_class; + } else { + if (dl_prio(oldprio)) + p->dl.dl_boosted = 0; + if (rt_prio(oldprio)) + p->rt.timeout = 0; + p->sched_class = &fair_sched_class; } - /* - * If not changing anything there's no need to proceed further, - * but store a possible modification of reset_on_fork. - */ - if (unlikely(policy == p->policy)) { - if (fair_policy(policy) && attr->sched_nice != task_nice(p)) - goto change; - if (rt_policy(policy) && attr->sched_priority != p->rt_priority) - goto change; - if (dl_policy(policy) && dl_param_changed(p, attr)) - goto change; + p->prio = prio; - p->sched_reset_on_fork = reset_on_fork; - task_rq_unlock(rq, p, &flags); - return 0; - } -change: + if (running) + p->sched_class->set_curr_task(rq); + if (queued) + enqueue_task(rq, p, enqueue_flag); - if (user) { -#ifdef CONFIG_RT_GROUP_SCHED - /* - * Do not allow realtime tasks into groups that have no runtime - * assigned. - */ - if (rt_bandwidth_enabled() && rt_policy(policy) && - task_group(p)->rt_bandwidth.rt_runtime == 0 && - !task_group_is_autogroup(task_group(p))) { - task_rq_unlock(rq, p, &flags); - return -EPERM; - } -#endif -#ifdef CONFIG_SMP - if (dl_bandwidth_enabled() && dl_policy(policy)) { - cpumask_t *span = rq->rd->span; + check_class_changed(rq, p, prev_class, oldprio); +out_unlock: + preempt_disable(); /* avoid rq from going away on us */ + __task_rq_unlock(rq); - /* - * Don't allow tasks with an affinity mask smaller than - * the entire root_domain to become SCHED_DEADLINE. We - * will also fail if there's no bandwidth available. - */ - if (!cpumask_subset(span, &p->cpus_allowed) || - rq->rd->dl_bw.bw == 0) { - task_rq_unlock(rq, p, &flags); - return -EPERM; - } - } + balance_callback(rq); + preempt_enable(); +} #endif - } - /* recheck policy now with rq lock held */ - if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { - policy = oldpolicy = -1; - task_rq_unlock(rq, p, &flags); - goto recheck; - } +void set_user_nice(struct task_struct *p, long nice) +{ + int old_prio, delta, queued; + unsigned long flags; + struct rq *rq; + if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) + return; /* - * If setscheduling to SCHED_DEADLINE (or changing the parameters - * of a SCHED_DEADLINE task) we need to check if enough bandwidth - * is available. + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. */ - if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { - task_rq_unlock(rq, p, &flags); - return -EBUSY; - } - - p->sched_reset_on_fork = reset_on_fork; - oldprio = p->prio; - + rq = task_rq_lock(p, &flags); /* - * Take priority boosted tasks into account. 
If the new - * effective priority is unchanged, we just store the new - * normal parameters and do not touch the scheduler class and - * the runqueue. This will be done when the task deboost - * itself. + * The RT priorities are set via sched_setscheduler(), but we still + * allow the 'normal' nice value to be set - but as expected + * it wont have any effect on scheduling until the task is + * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: */ - new_effective_prio = rt_mutex_get_effective_prio(p, newprio); - if (new_effective_prio == oldprio) { - __setscheduler_params(p, attr); - task_rq_unlock(rq, p, &flags); - return 0; + if (task_has_dl_policy(p) || task_has_rt_policy(p)) { + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; } - queued = task_on_rq_queued(p); - running = task_current(rq, p); if (queued) - dequeue_task(rq, p, 0); - if (running) - put_prev_task(rq, p); + dequeue_task(rq, p, DEQUEUE_SAVE); - prev_class = p->sched_class; - __setscheduler(rq, p, attr, true); + p->static_prio = NICE_TO_PRIO(nice); + set_load_weight(p); + old_prio = p->prio; + p->prio = effective_prio(p); + delta = p->prio - old_prio; - if (running) - p->sched_class->set_curr_task(rq); if (queued) { + enqueue_task(rq, p, ENQUEUE_RESTORE); /* - * We enqueue to tail when the priority of a task is - * increased (user space view). + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: */ - enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); + if (delta < 0 || (delta > 0 && task_running(rq, p))) + resched_curr(rq); } - - check_class_changed(rq, p, prev_class, oldprio); +out_unlock: task_rq_unlock(rq, p, &flags); +} +EXPORT_SYMBOL(set_user_nice); + +/* + * can_nice - check if a task can reduce its nice value + * @p: task + * @nice: nice value + */ +int can_nice(const struct task_struct *p, const int nice) +{ + /* convert nice value [19,-20] to rlimit style value [1,40] */ + int nice_rlim = nice_to_rlimit(nice); + + return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || + capable(CAP_SYS_NICE)); +} + +#ifdef __ARCH_WANT_SYS_NICE + +/* + * sys_nice - change the priority of the current process. + * @increment: priority increment + * + * sys_setpriority is a more generic, but much slower function that + * does similar things. + */ +SYSCALL_DEFINE1(nice, int, increment) +{ + long nice, retval; + + /* + * Setpriority might change our priority at the same moment. + * We don't have to worry. Conceptually one call occurs first + * and we have a single winner. + */ + increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); + nice = task_nice(current) + increment; + + nice = clamp_val(nice, MIN_NICE, MAX_NICE); + if (increment < 0 && !can_nice(current, nice)) + return -EPERM; - rt_mutex_adjust_pi(p); + retval = security_task_setnice(current, nice); + if (retval) + return retval; + set_user_nice(current, nice); return 0; } -static int _sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param, bool check) -{ - struct sched_attr attr = { - .sched_policy = policy, - .sched_priority = param->sched_priority, - .sched_nice = PRIO_TO_NICE(p->static_prio), - }; - - /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ - if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { - attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - policy &= ~SCHED_RESET_ON_FORK; - attr.sched_policy = policy; - } +#endif - return __sched_setscheduler(p, &attr, check); -} /** - * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 
+ * task_prio - return the priority value of a given task. * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * Return: 0 on success. An error code otherwise. * - * NOTE that the task may be already dead. + * Return: The priority value as seen by users in /proc. + * RT tasks are offset by -200. Normal tasks are centered + * around 0, value goes from -16 to +15. */ -int sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param) +int task_prio(const struct task_struct *p) { - return _sched_setscheduler(p, policy, param, true); + return p->prio - MAX_RT_PRIO; } -EXPORT_SYMBOL_GPL(sched_setscheduler); -int sched_setattr(struct task_struct *p, const struct sched_attr *attr) +/** + * idle_cpu - is a given cpu idle currently? + * @cpu: the processor in question. + * + * Return: 1 if the CPU is currently idle. 0 otherwise. + */ +int idle_cpu(int cpu) { - return __sched_setscheduler(p, attr, true); + struct rq *rq = cpu_rq(cpu); + + if (rq->curr != rq->idle) + return 0; + + if (rq->nr_running) + return 0; + +#ifdef CONFIG_SMP + if (!llist_empty(&rq->wake_list)) + return 0; +#endif + + return 1; } -EXPORT_SYMBOL_GPL(sched_setattr); /** - * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * Just like sched_setscheduler, only don't bother checking if the - * current context has permission. For example, this is needed in - * stop_machine(): we create temporary high priority worker threads, - * but our caller might not have that capability. + * idle_task - return the idle task for a given cpu. + * @cpu: the processor in question. * - * Return: 0 on success. An error code otherwise. + * Return: The idle task for the cpu @cpu. */ -int sched_setscheduler_nocheck(struct task_struct *p, int policy, - const struct sched_param *param) +struct task_struct *idle_task(int cpu) { - return _sched_setscheduler(p, policy, param, false); + return cpu_rq(cpu)->idle; } -static int -do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +/** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. + * + * The task of @pid, if found. %NULL otherwise. + */ +static struct task_struct *find_process_by_pid(pid_t pid) { - struct sched_param lparam; - struct task_struct *p; - int retval; - - if (!param || pid < 0) - return -EINVAL; - if (copy_from_user(&lparam, param, sizeof(struct sched_param))) - return -EFAULT; - - rcu_read_lock(); - retval = -ESRCH; - p = find_process_by_pid(pid); - if (p != NULL) - retval = sched_setscheduler(p, policy, &lparam); - rcu_read_unlock(); - - return retval; + return pid ? find_task_by_vpid(pid) : current; } /* - * Mimics kernel/events/core.c perf_copy_attr(). + * This function initializes the sched_dl_entity of a newly becoming + * SCHED_DEADLINE task. + * + * Only the static values are considered here, the actual runtime and the + * absolute deadline will be properly calculated when the task is enqueued + * for the first time with its new policy. 
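/*
 * A minimal sketch, with a made-up helper name and no CPU-hotplug
 * handling, of a typical idle_cpu() caller, loosely modelled on the
 * nohz/housekeeping target selection earlier in this file: prefer a CPU
 * that is already busy.
 */
#include <linux/cpumask.h>
#include <linux/sched.h>

static int example_pick_busy_cpu(void)
{
        int cpu;

        for_each_online_cpu(cpu) {
                if (!idle_cpu(cpu))
                        return cpu;
        }
        return cpumask_first(cpu_online_mask);  /* everything idle: pick one */
}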
*/ -static int sched_copy_attr(struct sched_attr __user *uattr, - struct sched_attr *attr) +static void +__setparam_dl(struct task_struct *p, const struct sched_attr *attr) { - u32 size; - int ret; + struct sched_dl_entity *dl_se = &p->dl; - if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) - return -EFAULT; + dl_se->dl_runtime = attr->sched_runtime; + dl_se->dl_deadline = attr->sched_deadline; + dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; + dl_se->flags = attr->sched_flags; + dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); /* - * zero the full structure, so that a short copy will be nice. + * Changing the parameters of a task is 'tricky' and we're not doing + * the correct thing -- also see task_dead_dl() and switched_from_dl(). + * + * What we SHOULD do is delay the bandwidth release until the 0-lag + * point. This would include retaining the task_struct until that time + * and change dl_overflow() to not immediately decrement the current + * amount. + * + * Instead we retain the current runtime/deadline and let the new + * parameters take effect after the current reservation period lapses. + * This is safe (albeit pessimistic) because the 0-lag point is always + * before the current scheduling deadline. + * + * We can still have temporary overloads because we do not delay the + * change in bandwidth until that time; so admission control is + * not on the safe side. It does however guarantee tasks will never + * consume more than promised. */ - memset(attr, 0, sizeof(*attr)); - - ret = get_user(size, &uattr->size); - if (ret) - return ret; - - if (size > PAGE_SIZE) /* silly large */ - goto err_size; - - if (!size) /* abi compat */ - size = SCHED_ATTR_SIZE_VER0; +} - if (size < SCHED_ATTR_SIZE_VER0) - goto err_size; +/* + * sched_setparam() passes in -1 for its policy, to let the functions + * it calls know not to change it. + */ +#define SETPARAM_POLICY -1 - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; +static void __setscheduler_params(struct task_struct *p, + const struct sched_attr *attr) +{ + int policy = attr->sched_policy; - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; + if (policy == SETPARAM_POLICY) + policy = p->policy; - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); - } + p->policy = policy; - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; + if (dl_policy(policy)) + __setparam_dl(p, attr); + else if (fair_policy(policy)) + p->static_prio = NICE_TO_PRIO(attr->sched_nice); /* - * XXX: do we want to be lenient like existing syscalls; or do we want - * to be strict and return an error on out-of-bounds values? + * __sched_setscheduler() ensures attr->sched_priority == 0 when + * !rt_policy. Always setting this ensures that things like + * getparam()/getattr() don't report silly values for !rt tasks. 
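/*
 * A hedged compile-time illustration, in a made-up helper, of the static
 * priority mapping that __setscheduler_params()/set_user_nice() rely on:
 * NICE_TO_PRIO() folds the nice range [-20, 19] into [100, 139], directly
 * above the RT range [0, 99].
 */
#include <linux/bug.h>
#include <linux/sched/prio.h>

static inline void example_check_prio_mapping(void)
{
        BUILD_BUG_ON(NICE_TO_PRIO(-20) != MAX_RT_PRIO);         /* 100 */
        BUILD_BUG_ON(NICE_TO_PRIO(0)   != DEFAULT_PRIO);        /* 120 */
        BUILD_BUG_ON(NICE_TO_PRIO(19)  != MAX_PRIO - 1);        /* 139 */
}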
*/ - attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); - - return 0; - -err_size: - put_user(sizeof(*attr), &uattr->size); - return -E2BIG; + p->rt_priority = attr->sched_priority; + p->normal_prio = normal_prio(p); + set_load_weight(p); } -/** - * sys_sched_setscheduler - set/change the scheduler policy and RT priority - * @pid: the pid in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * Return: 0 on success. An error code otherwise. - */ -SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, - struct sched_param __user *, param) +/* Actually do priority change: must hold pi & rq lock. */ +static void __setscheduler(struct rq *rq, struct task_struct *p, + const struct sched_attr *attr, bool keep_boost) { - /* negative values for policy are not valid */ - if (policy < 0) - return -EINVAL; + __setscheduler_params(p, attr); - return do_sched_setscheduler(pid, policy, param); -} + /* + * Keep a potential priority boosting if called from + * sched_setscheduler(). + */ + if (keep_boost) + p->prio = rt_mutex_get_effective_prio(p, normal_prio(p)); + else + p->prio = normal_prio(p); -/** - * sys_sched_setparam - set/change the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the new RT priority. - * - * Return: 0 on success. An error code otherwise. - */ -SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) + if (dl_prio(p->prio)) + p->sched_class = &dl_sched_class; + else if (rt_prio(p->prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; +} + +static void +__getparam_dl(struct task_struct *p, struct sched_attr *attr) { - return do_sched_setscheduler(pid, SETPARAM_POLICY, param); + struct sched_dl_entity *dl_se = &p->dl; + + attr->sched_priority = p->rt_priority; + attr->sched_runtime = dl_se->dl_runtime; + attr->sched_deadline = dl_se->dl_deadline; + attr->sched_period = dl_se->dl_period; + attr->sched_flags = dl_se->flags; } -/** - * sys_sched_setattr - same as above, but with extended sched_attr - * @pid: the pid in question. - * @uattr: structure containing the extended parameters. - * @flags: for future extension. +/* + * This function validates the new parameters of a -deadline task. + * We ask for the deadline not being zero, and greater or equal + * than the runtime, as well as the period of being zero or + * greater than deadline. Furthermore, we have to be sure that + * user parameters are above the internal resolution of 1us (we + * check sched_runtime only since it is always the smaller one) and + * below 2^63 ns (we have to check both sched_deadline and + * sched_period, as the latter can be zero). */ -SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, flags) +static bool +__checkparam_dl(const struct sched_attr *attr) { - struct sched_attr attr; - struct task_struct *p; - int retval; - - if (!uattr || pid < 0 || flags) - return -EINVAL; + /* deadline != 0 */ + if (attr->sched_deadline == 0) + return false; - retval = sched_copy_attr(uattr, &attr); - if (retval) - return retval; + /* + * Since we truncate DL_SCALE bits, make sure we're at least + * that big. + */ + if (attr->sched_runtime < (1ULL << DL_SCALE)) + return false; - if ((int)attr.sched_policy < 0) - return -EINVAL; + /* + * Since we use the MSB for wrap-around and sign issues, make + * sure it's not set (mind that period can be equal to zero). 
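/*
 * A minimal userspace sketch of parameters that satisfy the
 * __checkparam_dl() rules above: runtime <= deadline <= period, all of
 * them at least 1 us and below 2^63 ns.  struct sched_attr is not
 * exported by older libc headers, so the layout is spelled out by hand;
 * __NR_sched_setattr is assumed to be provided by the architecture, and
 * all names prefixed with "example_" are made up.
 */
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

struct example_sched_attr {             /* mirrors the kernel's struct sched_attr */
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;
        uint64_t sched_deadline;
        uint64_t sched_period;
};

#define EXAMPLE_SCHED_DEADLINE  6       /* SCHED_DEADLINE policy number */

static int example_become_deadline(void)
{
        struct example_sched_attr attr = {
                .size           = sizeof(attr),
                .sched_policy   = EXAMPLE_SCHED_DEADLINE,
                .sched_runtime  = 10ULL  * 1000 * 1000,  /*  10 ms */
                .sched_deadline = 30ULL  * 1000 * 1000,  /*  30 ms */
                .sched_period   = 100ULL * 1000 * 1000,  /* 100 ms */
        };

        return syscall(__NR_sched_setattr, 0, &attr, 0);
}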
+ */ + if (attr->sched_deadline & (1ULL << 63) || + attr->sched_period & (1ULL << 63)) + return false; - rcu_read_lock(); - retval = -ESRCH; - p = find_process_by_pid(pid); - if (p != NULL) - retval = sched_setattr(p, &attr); - rcu_read_unlock(); + /* runtime <= deadline <= period (if period != 0) */ + if ((attr->sched_period != 0 && + attr->sched_period < attr->sched_deadline) || + attr->sched_deadline < attr->sched_runtime) + return false; - return retval; + return true; } -/** - * sys_sched_getscheduler - get the policy (scheduling class) of a thread - * @pid: the pid in question. - * - * Return: On success, the policy of the thread. Otherwise, a negative error - * code. +/* + * check the target process has a UID that matches the current process's */ -SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) +static bool check_same_owner(struct task_struct *p) { - struct task_struct *p; - int retval; - - if (pid < 0) - return -EINVAL; + const struct cred *cred = current_cred(), *pcred; + bool match; - retval = -ESRCH; rcu_read_lock(); - p = find_process_by_pid(pid); - if (p) { - retval = security_task_getscheduler(p); - if (!retval) - retval = p->policy - | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); - } + pcred = __task_cred(p); + match = (uid_eq(cred->euid, pcred->euid) || + uid_eq(cred->euid, pcred->uid)); rcu_read_unlock(); - return retval; + return match; } -/** - * sys_sched_getparam - get the RT priority of a thread - * @pid: the pid in question. - * @param: structure containing the RT priority. - * - * Return: On success, 0 and the RT priority is in @param. Otherwise, an error - * code. - */ -SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) +static bool dl_param_changed(struct task_struct *p, + const struct sched_attr *attr) { - struct sched_param lp = { .sched_priority = 0 }; - struct task_struct *p; - int retval; - - if (!param || pid < 0) - return -EINVAL; - - rcu_read_lock(); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - if (task_has_rt_policy(p)) - lp.sched_priority = p->rt_priority; - rcu_read_unlock(); - - /* - * This one might sleep, we cannot do it with a spinlock held ... - */ - retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; + struct sched_dl_entity *dl_se = &p->dl; - return retval; + if (dl_se->dl_runtime != attr->sched_runtime || + dl_se->dl_deadline != attr->sched_deadline || + dl_se->dl_period != attr->sched_period || + dl_se->flags != attr->sched_flags) + return true; -out_unlock: - rcu_read_unlock(); - return retval; + return false; } -static int sched_read_attr(struct sched_attr __user *uattr, - struct sched_attr *attr, - unsigned int usize) +static int __sched_setscheduler(struct task_struct *p, + const struct sched_attr *attr, + bool user, bool pi) { - int ret; + int newprio = dl_policy(attr->sched_policy) ? 
MAX_DL_PRIO - 1 : + MAX_RT_PRIO - 1 - attr->sched_priority; + int retval, oldprio, oldpolicy = -1, queued, running; + int new_effective_prio, policy = attr->sched_policy; + unsigned long flags; + const struct sched_class *prev_class; + struct rq *rq; + int reset_on_fork; - if (!access_ok(VERIFY_WRITE, uattr, usize)) - return -EFAULT; + /* may grab non-irq protected spin_locks */ + BUG_ON(in_interrupt()); +recheck: + /* double check policy once rq lock held */ + if (policy < 0) { + reset_on_fork = p->sched_reset_on_fork; + policy = oldpolicy = p->policy; + } else { + reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); + + if (!valid_policy(policy)) + return -EINVAL; + } + + if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) + return -EINVAL; /* - * If we're handed a smaller struct than we know of, - * ensure all the unknown bits are 0 - i.e. old - * user-space does not get uncomplete information. + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, + * SCHED_BATCH and SCHED_IDLE is 0. */ - if (usize < sizeof(*attr)) { - unsigned char *addr; - unsigned char *end; - - addr = (void *)attr + usize; - end = (void *)attr + sizeof(*attr); + if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || + (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) + return -EINVAL; + if ((dl_policy(policy) && !__checkparam_dl(attr)) || + (rt_policy(policy) != (attr->sched_priority != 0))) + return -EINVAL; - for (; addr < end; addr++) { - if (*addr) - return -EFBIG; + /* + * Allow unprivileged RT tasks to decrease priority: + */ + if (user && !capable(CAP_SYS_NICE)) { + if (fair_policy(policy)) { + if (attr->sched_nice < task_nice(p) && + !can_nice(p, attr->sched_nice)) + return -EPERM; } - attr->size = usize; - } - - ret = copy_to_user(uattr, attr, attr->size); - if (ret) - return -EFAULT; - - return 0; -} + if (rt_policy(policy)) { + unsigned long rlim_rtprio = + task_rlimit(p, RLIMIT_RTPRIO); -/** - * sys_sched_getattr - similar to sched_getparam, but with sched_attr - * @pid: the pid in question. - * @uattr: structure containing the extended parameters. - * @size: sizeof(attr) for fwd/bwd comp. - * @flags: for future extension. - */ -SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, size, unsigned int, flags) -{ - struct sched_attr attr = { - .size = sizeof(struct sched_attr), - }; - struct task_struct *p; - int retval; + /* can't set/change the rt policy */ + if (policy != p->policy && !rlim_rtprio) + return -EPERM; - if (!uattr || pid < 0 || size > PAGE_SIZE || - size < SCHED_ATTR_SIZE_VER0 || flags) - return -EINVAL; + /* can't increase priority */ + if (attr->sched_priority > p->rt_priority && + attr->sched_priority > rlim_rtprio) + return -EPERM; + } - rcu_read_lock(); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; + /* + * Can't set/change SCHED_DEADLINE policy at all for now + * (safest behavior); in the future we would like to allow + * unprivileged DL tasks to increase their relative deadline + * or reduce their runtime (both ways reducing utilization) + */ + if (dl_policy(policy)) + return -EPERM; - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; + /* + * Treat SCHED_IDLE as nice 20. Only allow a switch to + * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 
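/*
 * A minimal userspace sketch, with error handling trimmed and a made-up
 * helper name, of the unprivileged-RT rule checked above: without
 * CAP_SYS_NICE, SCHED_FIFO/SCHED_RR requests are capped by RLIMIT_RTPRIO.
 */
#include <sched.h>
#include <sys/resource.h>

static int example_try_fifo(int prio)
{
        struct rlimit rl;
        struct sched_param sp = { .sched_priority = prio };

        if (getrlimit(RLIMIT_RTPRIO, &rl) == 0 &&
            (rlim_t)prio > rl.rlim_cur)
                return -1;      /* the kernel would reject this with -EPERM */

        return sched_setscheduler(0, SCHED_FIFO, &sp);
}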
+ */ + if (idle_policy(p->policy) && !idle_policy(policy)) { + if (!can_nice(p, task_nice(p))) + return -EPERM; + } - attr.sched_policy = p->policy; - if (p->sched_reset_on_fork) - attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - if (task_has_dl_policy(p)) - __getparam_dl(p, &attr); - else if (task_has_rt_policy(p)) - attr.sched_priority = p->rt_priority; - else - attr.sched_nice = task_nice(p); + /* can't change other user's priorities */ + if (!check_same_owner(p)) + return -EPERM; - rcu_read_unlock(); + /* Normal users shall not reset the sched_reset_on_fork flag */ + if (p->sched_reset_on_fork && !reset_on_fork) + return -EPERM; + } - retval = sched_read_attr(uattr, &attr, size); - return retval; + if (user) { + retval = security_task_setscheduler(p); + if (retval) + return retval; + } -out_unlock: - rcu_read_unlock(); - return retval; -} + /* + * make sure no PI-waiters arrive (or leave) while we are + * changing the priority of the task: + * + * To be able to change p->policy safely, the appropriate + * runqueue lock must be held. + */ + rq = task_rq_lock(p, &flags); -long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -{ - cpumask_var_t cpus_allowed, new_mask; - struct task_struct *p; - int retval; + /* + * Changing the policy of the stop threads its a very bad idea + */ + if (p == rq->stop) { + task_rq_unlock(rq, p, &flags); + return -EINVAL; + } - rcu_read_lock(); + /* + * If not changing anything there's no need to proceed further, + * but store a possible modification of reset_on_fork. + */ + if (unlikely(policy == p->policy)) { + if (fair_policy(policy) && attr->sched_nice != task_nice(p)) + goto change; + if (rt_policy(policy) && attr->sched_priority != p->rt_priority) + goto change; + if (dl_policy(policy) && dl_param_changed(p, attr)) + goto change; - p = find_process_by_pid(pid); - if (!p) { - rcu_read_unlock(); - return -ESRCH; + p->sched_reset_on_fork = reset_on_fork; + task_rq_unlock(rq, p, &flags); + return 0; } +change: - /* Prevent p going away */ - get_task_struct(p); - rcu_read_unlock(); + if (user) { +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. + */ + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0 && + !task_group_is_autogroup(task_group(p))) { + task_rq_unlock(rq, p, &flags); + return -EPERM; + } +#endif +#ifdef CONFIG_SMP + if (dl_bandwidth_enabled() && dl_policy(policy)) { + cpumask_t *span = rq->rd->span; - if (p->flags & PF_NO_SETAFFINITY) { - retval = -EINVAL; - goto out_put_task; + /* + * Don't allow tasks with an affinity mask smaller than + * the entire root_domain to become SCHED_DEADLINE. We + * will also fail if there's no bandwidth available. + */ + if (!cpumask_subset(span, &p->cpus_allowed) || + rq->rd->dl_bw.bw == 0) { + task_rq_unlock(rq, p, &flags); + return -EPERM; + } + } +#endif } - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_put_task; + + /* recheck policy now with rq lock held */ + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { + policy = oldpolicy = -1; + task_rq_unlock(rq, p, &flags); + goto recheck; } - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_free_cpus_allowed; + + /* + * If setscheduling to SCHED_DEADLINE (or changing the parameters + * of a SCHED_DEADLINE task) we need to check if enough bandwidth + * is available. 
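/*
 * A minimal userspace sketch of the SCHED_DEADLINE admission rules
 * enforced above: a task whose affinity is narrower than the root domain
 * is refused (-EPERM from the span check), and a request that does not
 * fit the remaining bandwidth fails with -EBUSY from dl_overflow().
 * example_pin_then_deadline() is a made-up name, and the sched_setattr()
 * call reuses the example_become_deadline() sketch shown after
 * __checkparam_dl() above.
 */
#define _GNU_SOURCE
#include <sched.h>

static int example_pin_then_deadline(void)
{
        cpu_set_t one;

        CPU_ZERO(&one);
        CPU_SET(0, &one);
        if (sched_setaffinity(0, sizeof(one), &one))
                return -1;

        /* now narrower than the root domain: the deadline switch fails */
        return example_become_deadline();
}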
+ */ + if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { + task_rq_unlock(rq, p, &flags); + return -EBUSY; } - retval = -EPERM; - if (!check_same_owner(p)) { - rcu_read_lock(); - if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { - rcu_read_unlock(); - goto out_free_new_mask; + + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; + + if (pi) { + /* + * Take priority boosted tasks into account. If the new + * effective priority is unchanged, we just store the new + * normal parameters and do not touch the scheduler class and + * the runqueue. This will be done when the task deboost + * itself. + */ + new_effective_prio = rt_mutex_get_effective_prio(p, newprio); + if (new_effective_prio == oldprio) { + __setscheduler_params(p, attr); + task_rq_unlock(rq, p, &flags); + return 0; } - rcu_read_unlock(); } - retval = security_task_setscheduler(p); - if (retval) - goto out_free_new_mask; + queued = task_on_rq_queued(p); + running = task_current(rq, p); + if (queued) + dequeue_task(rq, p, DEQUEUE_SAVE); + if (running) + put_prev_task(rq, p); + prev_class = p->sched_class; + __setscheduler(rq, p, attr, pi); - cpuset_cpus_allowed(p, cpus_allowed); - cpumask_and(new_mask, in_mask, cpus_allowed); + if (running) + p->sched_class->set_curr_task(rq); + if (queued) { + int enqueue_flags = ENQUEUE_RESTORE; + /* + * We enqueue to tail when the priority of a task is + * increased (user space view). + */ + if (oldprio <= p->prio) + enqueue_flags |= ENQUEUE_HEAD; + + enqueue_task(rq, p, enqueue_flags); + } + + check_class_changed(rq, p, prev_class, oldprio); + preempt_disable(); /* avoid rq from going away on us */ + task_rq_unlock(rq, p, &flags); + + if (pi) + rt_mutex_adjust_pi(p); /* - * Since bandwidth control happens on root_domain basis, - * if admission test is enabled, we only admit -deadline - * tasks allowed to run on all the CPUs in the task's - * root_domain. + * Run balance callbacks after we've adjusted the PI chain. */ -#ifdef CONFIG_SMP - if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { - rcu_read_lock(); - if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { - retval = -EBUSY; - rcu_read_unlock(); - goto out_free_new_mask; - } - rcu_read_unlock(); - } -#endif -again: - retval = set_cpus_allowed_ptr(p, new_mask); + balance_callback(rq); + preempt_enable(); - if (!retval) { - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset - * update. Just reset the cpus_allowed to the - * cpuset's cpus_allowed - */ - cpumask_copy(new_mask, cpus_allowed); - goto again; - } - } -out_free_new_mask: - free_cpumask_var(new_mask); -out_free_cpus_allowed: - free_cpumask_var(cpus_allowed); -out_put_task: - put_task_struct(p); - return retval; + return 0; } -static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, - struct cpumask *new_mask) +static int _sched_setscheduler(struct task_struct *p, int policy, + const struct sched_param *param, bool check) { - if (len < cpumask_size()) - cpumask_clear(new_mask); - else if (len > cpumask_size()) - len = cpumask_size(); + struct sched_attr attr = { + .sched_policy = policy, + .sched_priority = param->sched_priority, + .sched_nice = PRIO_TO_NICE(p->static_prio), + }; - return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -} + /* Fixup the legacy SCHED_RESET_ON_FORK hack. 
*/ + if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + policy &= ~SCHED_RESET_ON_FORK; + attr.sched_policy = policy; + } + return __sched_setscheduler(p, &attr, check, true); +} /** - * sys_sched_setaffinity - set the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to the new cpu mask + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Return: 0 on success. An error code otherwise. + * + * NOTE that the task may be already dead. + */ +int sched_setscheduler(struct task_struct *p, int policy, + const struct sched_param *param) +{ + return _sched_setscheduler(p, policy, param, true); +} +EXPORT_SYMBOL_GPL(sched_setscheduler); + +int sched_setattr(struct task_struct *p, const struct sched_attr *attr) +{ + return __sched_setscheduler(p, attr, true, true); +} +EXPORT_SYMBOL_GPL(sched_setattr); + +/** + * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Just like sched_setscheduler, only don't bother checking if the + * current context has permission. For example, this is needed in + * stop_machine(): we create temporary high priority worker threads, + * but our caller might not have that capability. * * Return: 0 on success. An error code otherwise. */ -SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, - unsigned long __user *, user_mask_ptr) +int sched_setscheduler_nocheck(struct task_struct *p, int policy, + const struct sched_param *param) { - cpumask_var_t new_mask; - int retval; - - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) - return -ENOMEM; - - retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); - if (retval == 0) - retval = sched_setaffinity(pid, new_mask); - free_cpumask_var(new_mask); - return retval; + return _sched_setscheduler(p, policy, param, false); } +EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); -long sched_getaffinity(pid_t pid, struct cpumask *mask) +static int +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { + struct sched_param lparam; struct task_struct *p; - unsigned long flags; int retval; - rcu_read_lock(); + if (!param || pid < 0) + return -EINVAL; + if (copy_from_user(&lparam, param, sizeof(struct sched_param))) + return -EFAULT; + rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); - if (!p) - goto out_unlock; - - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; - - raw_spin_lock_irqsave(&p->pi_lock, flags); - cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - -out_unlock: + if (p != NULL) + retval = sched_setscheduler(p, policy, &lparam); rcu_read_unlock(); return retval; } -/** - * sys_sched_getaffinity - get the cpu affinity of a process - * @pid: pid of the process - * @len: length in bytes of the bitmask pointed to by user_mask_ptr - * @user_mask_ptr: user-space pointer to hold the current cpu mask - * - * Return: 0 on success. An error code otherwise. +/* + * Mimics kernel/events/core.c perf_copy_attr(). 
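/*
 * A minimal userspace sketch, with a made-up helper name, of the legacy
 * SCHED_RESET_ON_FORK usage handled by _sched_setscheduler() above: the
 * flag is ORed into the policy so that children of this task fork back to
 * SCHED_NORMAL.  The fallback #define matches the UAPI value for older
 * libc headers.
 */
#include <sched.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK     0x40000000
#endif

static int example_fifo_no_inherit(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        return sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp);
}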
*/ -SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, - unsigned long __user *, user_mask_ptr) +static int sched_copy_attr(struct sched_attr __user *uattr, + struct sched_attr *attr) { + u32 size; int ret; - cpumask_var_t mask; - - if ((len * BITS_PER_BYTE) < nr_cpu_ids) - return -EINVAL; - if (len & (sizeof(unsigned long)-1)) - return -EINVAL; - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; + if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) + return -EFAULT; - ret = sched_getaffinity(pid, mask); - if (ret == 0) { - size_t retlen = min_t(size_t, len, cpumask_size()); + /* + * zero the full structure, so that a short copy will be nice. + */ + memset(attr, 0, sizeof(*attr)); - if (copy_to_user(user_mask_ptr, mask, retlen)) - ret = -EFAULT; - else - ret = retlen; - } - free_cpumask_var(mask); + ret = get_user(size, &uattr->size); + if (ret) + return ret; - return ret; -} + if (size > PAGE_SIZE) /* silly large */ + goto err_size; -/** - * sys_sched_yield - yield the current processor to other threads. - * - * This function yields the current CPU to other tasks. If there are no - * other threads running on this CPU then this function will return. - * - * Return: 0. - */ -SYSCALL_DEFINE0(sched_yield) -{ - struct rq *rq = this_rq_lock(); + if (!size) /* abi compat */ + size = SCHED_ATTR_SIZE_VER0; - schedstat_inc(rq, yld_count); - current->sched_class->yield_task(rq); + if (size < SCHED_ATTR_SIZE_VER0) + goto err_size; /* - * Since we are going to call schedule() anyway, there's - * no need to preempt or enable interrupts: + * If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. */ - __release(rq->lock); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - do_raw_spin_unlock(&rq->lock); - sched_preempt_enable_no_resched(); + if (size > sizeof(*attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; - schedule(); + addr = (void __user *)uattr + sizeof(*attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + ret = get_user(val, addr); + if (ret) + return ret; + if (val) + goto err_size; + } + size = sizeof(*attr); + } + + ret = copy_from_user(attr, uattr, size); + if (ret) + return -EFAULT; + + /* + * XXX: do we want to be lenient like existing syscalls; or do we want + * to be strict and return an error on out-of-bounds values? + */ + attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); return 0; + +err_size: + put_user(sizeof(*attr), &uattr->size); + return -E2BIG; } -int __sched _cond_resched(void) +/** + * sys_sched_setscheduler - set/change the scheduler policy and RT priority + * @pid: the pid in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Return: 0 on success. An error code otherwise. + */ +SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, + struct sched_param __user *, param) { - if (should_resched()) { - preempt_schedule_common(); - return 1; - } - return 0; + /* negative values for policy are not valid */ + if (policy < 0) + return -EINVAL; + + return do_sched_setscheduler(pid, policy, param); } -EXPORT_SYMBOL(_cond_resched); -/* - * __cond_resched_lock() - if a reschedule is pending, drop the given lock, - * call schedule, and on return reacquire the lock. +/** + * sys_sched_setparam - set/change the RT priority of a thread + * @pid: the pid in question. 
+ * @param: structure containing the new RT priority. * - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level - * operations here to prevent schedule() from being called twice (once via - * spin_unlock(), once by hand). + * Return: 0 on success. An error code otherwise. */ -int __cond_resched_lock(spinlock_t *lock) +SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) { - int resched = should_resched(); - int ret = 0; + return do_sched_setscheduler(pid, SETPARAM_POLICY, param); +} - lockdep_assert_held(lock); +/** + * sys_sched_setattr - same as above, but with extended sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + * @flags: for future extension. + */ +SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, flags) +{ + struct sched_attr attr; + struct task_struct *p; + int retval; - if (spin_needbreak(lock) || resched) { - spin_unlock(lock); - if (resched) - preempt_schedule_common(); - else - cpu_relax(); - ret = 1; - spin_lock(lock); - } - return ret; + if (!uattr || pid < 0 || flags) + return -EINVAL; + + retval = sched_copy_attr(uattr, &attr); + if (retval) + return retval; + + if ((int)attr.sched_policy < 0) + return -EINVAL; + + rcu_read_lock(); + retval = -ESRCH; + p = find_process_by_pid(pid); + if (p != NULL) + retval = sched_setattr(p, &attr); + rcu_read_unlock(); + + return retval; } -EXPORT_SYMBOL(__cond_resched_lock); -#ifndef CONFIG_PREEMPT_RT_FULL -int __sched __cond_resched_softirq(void) +/** + * sys_sched_getscheduler - get the policy (scheduling class) of a thread + * @pid: the pid in question. + * + * Return: On success, the policy of the thread. Otherwise, a negative error + * code. + */ +SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) { - BUG_ON(!in_softirq()); + struct task_struct *p; + int retval; - if (should_resched()) { - local_bh_enable(); - preempt_schedule_common(); - local_bh_disable(); - return 1; + if (pid < 0) + return -EINVAL; + + retval = -ESRCH; + rcu_read_lock(); + p = find_process_by_pid(pid); + if (p) { + retval = security_task_getscheduler(p); + if (!retval) + retval = p->policy + | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); } - return 0; + rcu_read_unlock(); + return retval; } -EXPORT_SYMBOL(__cond_resched_softirq); -#endif /** - * yield - yield the current processor to other threads. - * - * Do not ever use this function, there's a 99% chance you're doing it wrong. - * - * The scheduler is at all times free to pick the calling task as the most - * eligible task to run, if removing the yield() call from your code breaks - * it, its already broken. - * - * Typical broken usage is: - * - * while (!event) - * yield(); - * - * where one assumes that yield() will let 'the other' process run that will - * make event true. If the current task is a SCHED_FIFO task that will never - * happen. Never use yield() as a progress guarantee!! + * sys_sched_getparam - get the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the RT priority. * - * If you want to use yield() to wait for something, use wait_event(). - * If you want to use yield() to be 'nice' for others, use cond_resched(). - * If you still want to use yield(), do not! + * Return: On success, 0 and the RT priority is in @param. Otherwise, an error + * code. 
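sys_sched_setattr() above has no glibc wrapper, so userspace reaches it through syscall(2). A minimal sketch follows, assuming __NR_sched_setattr is exposed by <sys/syscall.h>; the struct mirrors the VER0 layout consumed by sched_copy_attr(), and the 10ms/30ms/30ms SCHED_DEADLINE parameters are arbitrary example values. The read-back counterpart, sys_sched_getattr(), takes the same arguments plus a size, i.e. syscall(__NR_sched_getattr, 0, &attr, sizeof(attr), 0).

/* Illustrative only -- not shipped with this patch. */
#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE  6
#endif

struct sched_attr {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        /* SCHED_DEADLINE fields, in nanoseconds */
        uint64_t sched_runtime;
        uint64_t sched_deadline;
        uint64_t sched_period;
};

int main(void)
{
        struct sched_attr attr = {
                .size           = sizeof(attr),
                .sched_policy   = SCHED_DEADLINE,
                .sched_runtime  = 10 * 1000 * 1000,     /* 10ms */
                .sched_deadline = 30 * 1000 * 1000,     /* 30ms */
                .sched_period   = 30 * 1000 * 1000,     /* 30ms */
        };

        /* pid 0 == calling thread; the final argument is 'flags'. */
        if (syscall(__NR_sched_setattr, 0, &attr, 0) == -1) {
                fprintf(stderr, "sched_setattr: %s\n", strerror(errno));
                return 1;
        }
        return 0;
}

Unprivileged callers are rejected, and the deadline admission test may refuse parameters that overcommit the root domain.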
*/ -void __sched yield(void) +SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) { - set_current_state(TASK_RUNNING); - sys_sched_yield(); + struct sched_param lp = { .sched_priority = 0 }; + struct task_struct *p; + int retval; + + if (!param || pid < 0) + return -EINVAL; + + rcu_read_lock(); + p = find_process_by_pid(pid); + retval = -ESRCH; + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + if (task_has_rt_policy(p)) + lp.sched_priority = p->rt_priority; + rcu_read_unlock(); + + /* + * This one might sleep, we cannot do it with a spinlock held ... + */ + retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; + + return retval; + +out_unlock: + rcu_read_unlock(); + return retval; } -EXPORT_SYMBOL(yield); -/** - * yield_to - yield the current processor to another thread in - * your thread group, or accelerate that thread toward the - * processor it's on. - * @p: target task - * @preempt: whether task preemption is allowed or not - * - * It's the caller's job to ensure that the target task struct - * can't go away on us before we can do any checks. - * - * Return: - * true (>0) if we indeed boosted the target task. - * false (0) if we failed to boost the target. - * -ESRCH if there's no task to yield to. - */ -int __sched yield_to(struct task_struct *p, bool preempt) +static int sched_read_attr(struct sched_attr __user *uattr, + struct sched_attr *attr, + unsigned int usize) { - struct task_struct *curr = current; - struct rq *rq, *p_rq; - unsigned long flags; - int yielded = 0; + int ret; - local_irq_save(flags); - rq = this_rq(); + if (!access_ok(VERIFY_WRITE, uattr, usize)) + return -EFAULT; -again: - p_rq = task_rq(p); /* - * If we're the only runnable task on the rq and target rq also - * has only one task, there's absolutely no point in yielding. + * If we're handed a smaller struct than we know of, + * ensure all the unknown bits are 0 - i.e. old + * user-space does not get uncomplete information. */ - if (rq->nr_running == 1 && p_rq->nr_running == 1) { - yielded = -ESRCH; - goto out_irq; - } + if (usize < sizeof(*attr)) { + unsigned char *addr; + unsigned char *end; - double_rq_lock(rq, p_rq); - if (task_rq(p) != p_rq) { - double_rq_unlock(rq, p_rq); - goto again; + addr = (void *)attr + usize; + end = (void *)attr + sizeof(*attr); + + for (; addr < end; addr++) { + if (*addr) + return -EFBIG; + } + + attr->size = usize; } - if (!curr->sched_class->yield_to_task) - goto out_unlock; + ret = copy_to_user(uattr, attr, attr->size); + if (ret) + return -EFAULT; - if (curr->sched_class != p->sched_class) + return 0; +} + +/** + * sys_sched_getattr - similar to sched_getparam, but with sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + * @size: sizeof(attr) for fwd/bwd comp. + * @flags: for future extension. 
+ */ +SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, size, unsigned int, flags) +{ + struct sched_attr attr = { + .size = sizeof(struct sched_attr), + }; + struct task_struct *p; + int retval; + + if (!uattr || pid < 0 || size > PAGE_SIZE || + size < SCHED_ATTR_SIZE_VER0 || flags) + return -EINVAL; + + rcu_read_lock(); + p = find_process_by_pid(pid); + retval = -ESRCH; + if (!p) goto out_unlock; - if (task_running(p_rq, p) || p->state) + retval = security_task_getscheduler(p); + if (retval) goto out_unlock; - yielded = curr->sched_class->yield_to_task(rq, p, preempt); - if (yielded) { - schedstat_inc(rq, yld_count); - /* - * Make p's CPU reschedule; pick_next_entity takes care of - * fairness. - */ - if (preempt && rq != p_rq) - resched_curr(p_rq); - } + attr.sched_policy = p->policy; + if (p->sched_reset_on_fork) + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + if (task_has_dl_policy(p)) + __getparam_dl(p, &attr); + else if (task_has_rt_policy(p)) + attr.sched_priority = p->rt_priority; + else + attr.sched_nice = task_nice(p); -out_unlock: - double_rq_unlock(rq, p_rq); -out_irq: - local_irq_restore(flags); + rcu_read_unlock(); - if (yielded > 0) - schedule(); + retval = sched_read_attr(uattr, &attr, size); + return retval; - return yielded; +out_unlock: + rcu_read_unlock(); + return retval; } -EXPORT_SYMBOL_GPL(yield_to); -/* - * This task is about to go to sleep on IO. Increment rq->nr_iowait so - * that process accounting knows that this is a task in IO wait state. - */ -long __sched io_schedule_timeout(long timeout) +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { - int old_iowait = current->in_iowait; - struct rq *rq; - long ret; + cpumask_var_t cpus_allowed, new_mask; + struct task_struct *p; + int retval; - current->in_iowait = 1; - blk_schedule_flush_plug(current); + rcu_read_lock(); - delayacct_blkio_start(); - rq = raw_rq(); - atomic_inc(&rq->nr_iowait); - ret = schedule_timeout(timeout); - current->in_iowait = old_iowait; - atomic_dec(&rq->nr_iowait); - delayacct_blkio_end(); + p = find_process_by_pid(pid); + if (!p) { + rcu_read_unlock(); + return -ESRCH; + } + + /* Prevent p going away */ + get_task_struct(p); + rcu_read_unlock(); + + if (p->flags & PF_NO_SETAFFINITY) { + retval = -EINVAL; + goto out_put_task; + } + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_put_task; + } + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } + retval = -EPERM; + if (!check_same_owner(p)) { + rcu_read_lock(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { + rcu_read_unlock(); + goto out_free_new_mask; + } + rcu_read_unlock(); + } + + retval = security_task_setscheduler(p); + if (retval) + goto out_free_new_mask; + + + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, in_mask, cpus_allowed); - return ret; + /* + * Since bandwidth control happens on root_domain basis, + * if admission test is enabled, we only admit -deadline + * tasks allowed to run on all the CPUs in the task's + * root_domain. 
+ */ +#ifdef CONFIG_SMP + if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { + rcu_read_lock(); + if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { + retval = -EBUSY; + rcu_read_unlock(); + goto out_free_new_mask; + } + rcu_read_unlock(); + } +#endif +again: + retval = __set_cpus_allowed_ptr(p, new_mask, true); + + if (!retval) { + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset + * update. Just reset the cpus_allowed to the + * cpuset's cpus_allowed + */ + cpumask_copy(new_mask, cpus_allowed); + goto again; + } + } +out_free_new_mask: + free_cpumask_var(new_mask); +out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); +out_put_task: + put_task_struct(p); + return retval; } -EXPORT_SYMBOL(io_schedule_timeout); -/** - * sys_sched_get_priority_max - return maximum RT priority. - * @policy: scheduling class. - * - * Return: On success, this syscall returns the maximum - * rt_priority that can be used by a given scheduling class. - * On failure, a negative error code is returned. - */ -SYSCALL_DEFINE1(sched_get_priority_max, int, policy) +static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, + struct cpumask *new_mask) { - int ret = -EINVAL; + if (len < cpumask_size()) + cpumask_clear(new_mask); + else if (len > cpumask_size()) + len = cpumask_size(); - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = MAX_USER_RT_PRIO-1; - break; - case SCHED_DEADLINE: - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - ret = 0; - break; - } - return ret; + return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; } /** - * sys_sched_get_priority_min - return minimum RT priority. - * @policy: scheduling class. + * sys_sched_setaffinity - set the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to the new cpu mask * - * Return: On success, this syscall returns the minimum - * rt_priority that can be used by a given scheduling class. - * On failure, a negative error code is returned. + * Return: 0 on success. An error code otherwise. */ -SYSCALL_DEFINE1(sched_get_priority_min, int, policy) +SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) { - int ret = -EINVAL; + cpumask_var_t new_mask; + int retval; - switch (policy) { - case SCHED_FIFO: - case SCHED_RR: - ret = 1; - break; - case SCHED_DEADLINE: - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - ret = 0; - } - return ret; + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; + + retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); + if (retval == 0) + retval = sched_setaffinity(pid, new_mask); + free_cpumask_var(new_mask); + return retval; } -/** - * sys_sched_rr_get_interval - return the default timeslice of a process. - * @pid: pid of the process. - * @interval: userspace pointer to the timeslice value. - * - * this syscall writes the default timeslice value of a given process - * into the user-space timespec buffer. A value of '0' means infinity. - * - * Return: On success, 0 and the timeslice is in @interval. Otherwise, - * an error code. 
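The affinity syscalls above map directly onto the glibc cpu_set_t helpers. A minimal sketch (not part of this patch) that pins the caller to CPU 0 and reads the mask back; note that the raw sys_sched_getaffinity() returns the number of mask bytes copied, which the glibc wrapper folds into the usual 0/-1 convention.

/* Illustrative only -- not shipped with this patch. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);               /* allow CPU 0 only */

        if (sched_setaffinity(0, sizeof(set), &set) == -1) {
                fprintf(stderr, "sched_setaffinity: %s\n", strerror(errno));
                return 1;
        }

        CPU_ZERO(&set);
        if (sched_getaffinity(0, sizeof(set), &set) == -1) {
                fprintf(stderr, "sched_getaffinity: %s\n", strerror(errno));
                return 1;
        }

        printf("allowed to run on %d CPU(s)\n", CPU_COUNT(&set));
        return 0;
}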
- */ -SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct timespec __user *, interval) +long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; - unsigned int time_slice; unsigned long flags; - struct rq *rq; int retval; - struct timespec t; - if (pid < 0) - return -EINVAL; + rcu_read_lock(); retval = -ESRCH; - rcu_read_lock(); p = find_process_by_pid(pid); if (!p) goto out_unlock; @@ -4742,461 +4770,592 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, if (retval) goto out_unlock; - rq = task_rq_lock(p, &flags); - time_slice = 0; - if (p->sched_class->get_rr_interval) - time_slice = p->sched_class->get_rr_interval(rq, p); - task_rq_unlock(rq, p, &flags); - - rcu_read_unlock(); - jiffies_to_timespec(time_slice, &t); - retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; - return retval; + raw_spin_lock_irqsave(&p->pi_lock, flags); + cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: rcu_read_unlock(); + return retval; } -static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; +/** + * sys_sched_getaffinity - get the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current cpu mask + * + * Return: 0 on success. An error code otherwise. + */ +SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, + unsigned long __user *, user_mask_ptr) +{ + int ret; + cpumask_var_t mask; -void sched_show_task(struct task_struct *p) + if ((len * BITS_PER_BYTE) < nr_cpu_ids) + return -EINVAL; + if (len & (sizeof(unsigned long)-1)) + return -EINVAL; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + ret = sched_getaffinity(pid, mask); + if (ret == 0) { + size_t retlen = min_t(size_t, len, cpumask_size()); + + if (copy_to_user(user_mask_ptr, mask, retlen)) + ret = -EFAULT; + else + ret = retlen; + } + free_cpumask_var(mask); + + return ret; +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU to other tasks. If there are no + * other threads running on this CPU then this function will return. + * + * Return: 0. + */ +SYSCALL_DEFINE0(sched_yield) { - unsigned long free = 0; - int ppid; - unsigned long state = p->state; + struct rq *rq = this_rq_lock(); - if (state) - state = __ffs(state) + 1; - printk(KERN_INFO "%-15.15s %c", p->comm, - state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); -#if BITS_PER_LONG == 32 - if (state == TASK_RUNNING) - printk(KERN_CONT " running "); - else - printk(KERN_CONT " %08lx ", thread_saved_pc(p)); -#else - if (state == TASK_RUNNING) - printk(KERN_CONT " running task "); - else - printk(KERN_CONT " %016lx ", thread_saved_pc(p)); -#endif -#ifdef CONFIG_DEBUG_STACK_USAGE - free = stack_not_used(p); -#endif - ppid = 0; - rcu_read_lock(); - if (pid_alive(p)) - ppid = task_pid_nr(rcu_dereference(p->real_parent)); - rcu_read_unlock(); - printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, - task_pid_nr(p), ppid, - (unsigned long)task_thread_info(p)->flags); + schedstat_inc(rq, yld_count); + current->sched_class->yield_task(rq); + + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: + */ + __release(rq->lock); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + do_raw_spin_unlock(&rq->lock); + sched_preempt_enable_no_resched(); + + schedule(); + + return 0; +} + +int __sched _cond_resched(void) +{ + if (should_resched(0)) { + preempt_schedule_common(); + return 1; + } + return 0; +} +EXPORT_SYMBOL(_cond_resched); + +/* + * __cond_resched_lock() - if a reschedule is pending, drop the given lock, + * call schedule, and on return reacquire the lock. + * + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * operations here to prevent schedule() from being called twice (once via + * spin_unlock(), once by hand). + */ +int __cond_resched_lock(spinlock_t *lock) +{ + int resched = should_resched(PREEMPT_LOCK_OFFSET); + int ret = 0; + + lockdep_assert_held(lock); - print_worker_info(KERN_INFO, p); - show_stack(p, NULL); + if (spin_needbreak(lock) || resched) { + spin_unlock(lock); + if (resched) + preempt_schedule_common(); + else + cpu_relax(); + ret = 1; + spin_lock(lock); + } + return ret; } +EXPORT_SYMBOL(__cond_resched_lock); -void show_state_filter(unsigned long state_filter) +#ifndef CONFIG_PREEMPT_RT_FULL +int __sched __cond_resched_softirq(void) { - struct task_struct *g, *p; + BUG_ON(!in_softirq()); -#if BITS_PER_LONG == 32 - printk(KERN_INFO - " task PC stack pid father\n"); -#else - printk(KERN_INFO - " task PC stack pid father\n"); -#endif - rcu_read_lock(); - for_each_process_thread(g, p) { - /* - * reset the NMI-timeout, listing all files on a slow - * console might take a lot of time: - */ - touch_nmi_watchdog(); - if (!state_filter || (p->state & state_filter)) - sched_show_task(p); + if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { + local_bh_enable(); + preempt_schedule_common(); + local_bh_disable(); + return 1; } - - touch_all_softlockup_watchdogs(); - -#ifdef CONFIG_SCHED_DEBUG - sysrq_sched_debug_show(); -#endif - rcu_read_unlock(); - /* - * Only show locks if all tasks are dumped: - */ - if (!state_filter) - debug_show_all_locks(); + return 0; } +EXPORT_SYMBOL(__cond_resched_softirq); +#endif -void init_idle_bootup_task(struct task_struct *idle) +/** + * yield - yield the current processor to other threads. + * + * Do not ever use this function, there's a 99% chance you're doing it wrong. + * + * The scheduler is at all times free to pick the calling task as the most + * eligible task to run, if removing the yield() call from your code breaks + * it, its already broken. + * + * Typical broken usage is: + * + * while (!event) + * yield(); + * + * where one assumes that yield() will let 'the other' process run that will + * make event true. If the current task is a SCHED_FIFO task that will never + * happen. 
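The _cond_resched()/__cond_resched_lock() helpers above are what the cond_resched() family expands to; the usual in-kernel pattern is a long, non-atomic loop that offers the CPU on every iteration so non-preemptible kernels stay responsive. A minimal module sketch, assuming nothing beyond a stock module build environment (the iteration count is a placeholder):

/* Illustrative only -- not shipped with this patch. */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>

static int __init cond_resched_demo_init(void)
{
        unsigned long i;

        for (i = 0; i < (1UL << 24); i++) {
                /* ... one unit of work per iteration ... */
                cond_resched();         /* give other tasks a chance if a reschedule is pending */
        }
        return 0;
}

static void __exit cond_resched_demo_exit(void)
{
}

module_init(cond_resched_demo_init);
module_exit(cond_resched_demo_exit);
MODULE_LICENSE("GPL");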
Never use yield() as a progress guarantee!! + * + * If you want to use yield() to wait for something, use wait_event(). + * If you want to use yield() to be 'nice' for others, use cond_resched(). + * If you still want to use yield(), do not! + */ +void __sched yield(void) { - idle->sched_class = &idle_sched_class; + set_current_state(TASK_RUNNING); + sys_sched_yield(); } +EXPORT_SYMBOL(yield); /** - * init_idle - set up an idle thread for a given CPU - * @idle: task in question - * @cpu: cpu the idle task belongs to + * yield_to - yield the current processor to another thread in + * your thread group, or accelerate that thread toward the + * processor it's on. + * @p: target task + * @preempt: whether task preemption is allowed or not * - * NOTE: this function does not set the idle thread's NEED_RESCHED - * flag, to make booting more robust. + * It's the caller's job to ensure that the target task struct + * can't go away on us before we can do any checks. + * + * Return: + * true (>0) if we indeed boosted the target task. + * false (0) if we failed to boost the target. + * -ESRCH if there's no task to yield to. */ -void init_idle(struct task_struct *idle, int cpu) +int __sched yield_to(struct task_struct *p, bool preempt) { - struct rq *rq = cpu_rq(cpu); + struct task_struct *curr = current; + struct rq *rq, *p_rq; unsigned long flags; + int yielded = 0; - raw_spin_lock_irqsave(&rq->lock, flags); - - __sched_fork(0, idle); - idle->state = TASK_RUNNING; - idle->se.exec_start = sched_clock(); + local_irq_save(flags); + rq = this_rq(); - do_set_cpus_allowed(idle, cpumask_of(cpu)); +again: + p_rq = task_rq(p); /* - * We're having a chicken and egg problem, even though we are - * holding rq->lock, the cpu isn't yet set to this cpu so the - * lockdep check in task_group() will fail. - * - * Similar case to sched_fork(). / Alternatively we could - * use task_rq_lock() here and obtain the other rq->lock. - * - * Silence PROVE_RCU + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. */ - rcu_read_lock(); - __set_task_cpu(idle, cpu); - rcu_read_unlock(); + if (rq->nr_running == 1 && p_rq->nr_running == 1) { + yielded = -ESRCH; + goto out_irq; + } - rq->curr = rq->idle = idle; - idle->on_rq = TASK_ON_RQ_QUEUED; -#if defined(CONFIG_SMP) - idle->on_cpu = 1; -#endif - raw_spin_unlock_irqrestore(&rq->lock, flags); + double_rq_lock(rq, p_rq); + if (task_rq(p) != p_rq) { + double_rq_unlock(rq, p_rq); + goto again; + } - /* Set the preempt count _outside_ the spinlocks! 
*/ - init_idle_preempt_count(idle, cpu); -#ifdef CONFIG_HAVE_PREEMPT_LAZY - task_thread_info(idle)->preempt_lazy_count = 0; -#endif - /* - * The idle tasks have their own, simple scheduling class: - */ - idle->sched_class = &idle_sched_class; - ftrace_graph_init_idle_task(idle, cpu); - vtime_init_idle(idle, cpu); -#if defined(CONFIG_SMP) - sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -#endif -} + if (!curr->sched_class->yield_to_task) + goto out_unlock; -int cpuset_cpumask_can_shrink(const struct cpumask *cur, - const struct cpumask *trial) -{ - int ret = 1, trial_cpus; - struct dl_bw *cur_dl_b; - unsigned long flags; + if (curr->sched_class != p->sched_class) + goto out_unlock; - if (!cpumask_weight(cur)) - return ret; + if (task_running(p_rq, p) || p->state) + goto out_unlock; - rcu_read_lock_sched(); - cur_dl_b = dl_bw_of(cpumask_any(cur)); - trial_cpus = cpumask_weight(trial); + yielded = curr->sched_class->yield_to_task(rq, p, preempt); + if (yielded) { + schedstat_inc(rq, yld_count); + /* + * Make p's CPU reschedule; pick_next_entity takes care of + * fairness. + */ + if (preempt && rq != p_rq) + resched_curr(p_rq); + } - raw_spin_lock_irqsave(&cur_dl_b->lock, flags); - if (cur_dl_b->bw != -1 && - cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) - ret = 0; - raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); - rcu_read_unlock_sched(); +out_unlock: + double_rq_unlock(rq, p_rq); +out_irq: + local_irq_restore(flags); - return ret; + if (yielded > 0) + schedule(); + + return yielded; } +EXPORT_SYMBOL_GPL(yield_to); -int task_can_attach(struct task_struct *p, - const struct cpumask *cs_cpus_allowed) +/* + * This task is about to go to sleep on IO. Increment rq->nr_iowait so + * that process accounting knows that this is a task in IO wait state. + */ +long __sched io_schedule_timeout(long timeout) { - int ret = 0; - - /* - * Kthreads which disallow setaffinity shouldn't be moved - * to a new cpuset; we don't want to change their cpu - * affinity and isolating such threads by their set of - * allowed nodes is unnecessary. Thus, cpusets are not - * applicable for such threads. This prevents checking for - * success of set_cpus_allowed_ptr() on all attached tasks - * before cpus_allowed may be changed. - */ - if (p->flags & PF_NO_SETAFFINITY) { - ret = -EINVAL; - goto out; - } + int old_iowait = current->in_iowait; + struct rq *rq; + long ret; -#ifdef CONFIG_SMP - if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, - cs_cpus_allowed)) { - unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, - cs_cpus_allowed); - struct dl_bw *dl_b; - bool overflow; - int cpus; - unsigned long flags; + current->in_iowait = 1; + blk_schedule_flush_plug(current); - rcu_read_lock_sched(); - dl_b = dl_bw_of(dest_cpu); - raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(dest_cpu); - overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); - if (overflow) - ret = -EBUSY; - else { - /* - * We reserve space for this task in the destination - * root_domain, as we can't fail after this point. - * We will free resources in the source root_domain - * later on (see set_cpus_allowed_dl()). 
- */ - __dl_add(dl_b, p->dl.dl_bw); - } - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - rcu_read_unlock_sched(); + delayacct_blkio_start(); + rq = raw_rq(); + atomic_inc(&rq->nr_iowait); + ret = schedule_timeout(timeout); + current->in_iowait = old_iowait; + atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); - } -#endif -out: return ret; } +EXPORT_SYMBOL(io_schedule_timeout); -#ifdef CONFIG_SMP -/* - * move_queued_task - move a queued task to new rq. +/** + * sys_sched_get_priority_max - return maximum RT priority. + * @policy: scheduling class. * - * Returns (locked) new rq. Old rq's lock is released. + * Return: On success, this syscall returns the maximum + * rt_priority that can be used by a given scheduling class. + * On failure, a negative error code is returned. */ -static struct rq *move_queued_task(struct task_struct *p, int new_cpu) +SYSCALL_DEFINE1(sched_get_priority_max, int, policy) { - struct rq *rq = task_rq(p); - - lockdep_assert_held(&rq->lock); - - dequeue_task(rq, p, 0); - p->on_rq = TASK_ON_RQ_MIGRATING; - set_task_cpu(p, new_cpu); - raw_spin_unlock(&rq->lock); - - rq = cpu_rq(new_cpu); - - raw_spin_lock(&rq->lock); - BUG_ON(task_cpu(p) != new_cpu); - p->on_rq = TASK_ON_RQ_QUEUED; - enqueue_task(rq, p, 0); - check_preempt_curr(rq, p, 0); - - return rq; -} + int ret = -EINVAL; -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -{ - if (!migrate_disabled_updated(p)) { - if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, new_mask); - p->nr_cpus_allowed = cpumask_weight(new_mask); + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = MAX_USER_RT_PRIO-1; + break; + case SCHED_DEADLINE: + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: + ret = 0; + break; } - - cpumask_copy(&p->cpus_allowed, new_mask); + return ret; } -static DEFINE_PER_CPU(struct cpumask, sched_cpumasks); -static DEFINE_MUTEX(sched_down_mutex); -static cpumask_t sched_down_cpumask; - -void tell_sched_cpu_down_begin(int cpu) +/** + * sys_sched_get_priority_min - return minimum RT priority. + * @policy: scheduling class. + * + * Return: On success, this syscall returns the minimum + * rt_priority that can be used by a given scheduling class. + * On failure, a negative error code is returned. + */ +SYSCALL_DEFINE1(sched_get_priority_min, int, policy) { - mutex_lock(&sched_down_mutex); - cpumask_set_cpu(cpu, &sched_down_cpumask); - mutex_unlock(&sched_down_mutex); -} + int ret = -EINVAL; -void tell_sched_cpu_down_done(int cpu) -{ - mutex_lock(&sched_down_mutex); - cpumask_clear_cpu(cpu, &sched_down_cpumask); - mutex_unlock(&sched_down_mutex); + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = 1; + break; + case SCHED_DEADLINE: + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: + ret = 0; + } + return ret; } /** - * migrate_me - try to move the current task off this cpu + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. * - * Used by the pin_current_cpu() code to try to get tasks - * to move off the current CPU as it is going down. - * It will only move the task if the task isn't pinned to - * the CPU (with migrate_disable, affinity or NO_SETAFFINITY) - * and the task has to be in a RUNNING state. Otherwise the - * movement of the task will wake it up (change its state - * to running) when the task did not expect it. 
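The priority-range syscalls above, and sys_sched_rr_get_interval() whose kernel-doc begins here, all have plain glibc wrappers. A small sketch, not part of this patch, with SCHED_RR as the example policy:

/* Illustrative only -- not shipped with this patch. */
#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        printf("SCHED_RR priority range: %d..%d\n",
               sched_get_priority_min(SCHED_RR),
               sched_get_priority_max(SCHED_RR));

        /* 0 == calling process; per the kernel-doc, a zero timeslice means infinity. */
        if (sched_rr_get_interval(0, &ts) == 0)
                printf("RR timeslice: %ld.%09ld s\n",
                       (long)ts.tv_sec, ts.tv_nsec);

        return 0;
}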
+ * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. * - * Returns 1 if it succeeded in moving the current task - * 0 otherwise. + * Return: On success, 0 and the timeslice is in @interval. Otherwise, + * an error code. */ -int migrate_me(void) +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, + struct timespec __user *, interval) { - struct task_struct *p = current; - struct migration_arg arg; - struct cpumask *cpumask; - struct cpumask *mask; + struct task_struct *p; + unsigned int time_slice; unsigned long flags; - unsigned int dest_cpu; struct rq *rq; + int retval; + struct timespec t; - /* - * We can not migrate tasks bounded to a CPU or tasks not - * running. The movement of the task will wake it up. - */ - if (p->flags & PF_NO_SETAFFINITY || p->state) - return 0; + if (pid < 0) + return -EINVAL; + + retval = -ESRCH; + rcu_read_lock(); + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; - mutex_lock(&sched_down_mutex); rq = task_rq_lock(p, &flags); + time_slice = 0; + if (p->sched_class->get_rr_interval) + time_slice = p->sched_class->get_rr_interval(rq, p); + task_rq_unlock(rq, p, &flags); - cpumask = this_cpu_ptr(&sched_cpumasks); - mask = &p->cpus_allowed; + rcu_read_unlock(); + jiffies_to_timespec(time_slice, &t); + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; + return retval; - cpumask_andnot(cpumask, mask, &sched_down_cpumask); +out_unlock: + rcu_read_unlock(); + return retval; +} - if (!cpumask_weight(cpumask)) { - /* It's only on this CPU? */ - task_rq_unlock(rq, p, &flags); - mutex_unlock(&sched_down_mutex); - return 0; - } +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; - dest_cpu = cpumask_any_and(cpu_active_mask, cpumask); +void sched_show_task(struct task_struct *p) +{ + unsigned long free = 0; + int ppid; + unsigned long state = p->state; - arg.task = p; - arg.dest_cpu = dest_cpu; + if (state) + state = __ffs(state) + 1; + printk(KERN_INFO "%-15.15s %c", p->comm, + state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); +#if BITS_PER_LONG == 32 + if (state == TASK_RUNNING) + printk(KERN_CONT " running "); + else + printk(KERN_CONT " %08lx ", thread_saved_pc(p)); +#else + if (state == TASK_RUNNING) + printk(KERN_CONT " running task "); + else + printk(KERN_CONT " %016lx ", thread_saved_pc(p)); +#endif +#ifdef CONFIG_DEBUG_STACK_USAGE + free = stack_not_used(p); +#endif + ppid = 0; + rcu_read_lock(); + if (pid_alive(p)) + ppid = task_pid_nr(rcu_dereference(p->real_parent)); + rcu_read_unlock(); + printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, + task_pid_nr(p), ppid, + (unsigned long)task_thread_info(p)->flags); - task_rq_unlock(rq, p, &flags); + print_worker_info(KERN_INFO, p); + show_stack(p, NULL); +} - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - tlb_migrate_finish(p->mm); - mutex_unlock(&sched_down_mutex); +void show_state_filter(unsigned long state_filter) +{ + struct task_struct *g, *p; - return 1; +#if BITS_PER_LONG == 32 + printk(KERN_INFO + " task PC stack pid father\n"); +#else + printk(KERN_INFO + " task PC stack pid father\n"); +#endif + rcu_read_lock(); + for_each_process_thread(g, p) { + /* + * reset the NMI-timeout, listing all files on a slow + * console might take a lot of time: + */ + touch_nmi_watchdog(); + if (!state_filter || (p->state & state_filter)) + sched_show_task(p); + } + + touch_all_softlockup_watchdogs(); + +#ifdef CONFIG_SCHED_DEBUG + sysrq_sched_debug_show(); +#endif + rcu_read_unlock(); + /* + * Only show locks if all tasks are dumped: + */ + if (!state_filter) + debug_show_all_locks(); } -/* - * This is how migration works: - * - * 1) we invoke migration_cpu_stop() on the target CPU using - * stop_one_cpu(). - * 2) stopper starts to run (implicitly forcing the migrated thread - * off the CPU) - * 3) it checks whether the migrated task is still in the wrong runqueue. - * 4) if it's in the wrong runqueue then the migration thread removes - * it and puts it into the right queue. - * 5) stopper completes and stop_one_cpu() returns and the migration - * is done. - */ +void init_idle_bootup_task(struct task_struct *idle) +{ + idle->sched_class = &idle_sched_class; +} -/* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on - * is removed from the allowed bitmask. +/** + * init_idle - set up an idle thread for a given CPU + * @idle: task in question + * @cpu: cpu the idle task belongs to * - * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. The - * call is not atomic; no spinlocks may be held. + * NOTE: this function does not set the idle thread's NEED_RESCHED + * flag, to make booting more robust. */ -int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +void init_idle(struct task_struct *idle, int cpu) { + struct rq *rq = cpu_rq(cpu); unsigned long flags; - struct rq *rq; - unsigned int dest_cpu; - int ret = 0; - - rq = task_rq_lock(p, &flags); - - if (cpumask_equal(&p->cpus_allowed, new_mask)) - goto out; - if (!cpumask_intersects(new_mask, cpu_active_mask)) { - ret = -EINVAL; - goto out; - } + raw_spin_lock_irqsave(&idle->pi_lock, flags); + raw_spin_lock(&rq->lock); - do_set_cpus_allowed(p, new_mask); + __sched_fork(0, idle); + idle->state = TASK_RUNNING; + idle->se.exec_start = sched_clock(); - /* Can the task run on the task's current CPU? 
If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p)) - goto out; +#ifdef CONFIG_SMP + /* + * Its possible that init_idle() gets called multiple times on a task, + * in that case do_set_cpus_allowed() will not do the right thing. + * + * And since this is boot we can forgo the serialization. + */ + set_cpus_allowed_common(idle, cpumask_of(cpu)); +#endif + /* + * We're having a chicken and egg problem, even though we are + * holding rq->lock, the cpu isn't yet set to this cpu so the + * lockdep check in task_group() will fail. + * + * Similar case to sched_fork(). / Alternatively we could + * use task_rq_lock() here and obtain the other rq->lock. + * + * Silence PROVE_RCU + */ + rcu_read_lock(); + __set_task_cpu(idle, cpu); + rcu_read_unlock(); - dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (task_running(rq, p) || p->state == TASK_WAKING) { - struct migration_arg arg = { p, dest_cpu }; - /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, p, &flags); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - tlb_migrate_finish(p->mm); - return 0; - } else if (task_on_rq_queued(p)) - rq = move_queued_task(p, dest_cpu); -out: - task_rq_unlock(rq, p, &flags); + rq->curr = rq->idle = idle; + idle->on_rq = TASK_ON_RQ_QUEUED; +#ifdef CONFIG_SMP + idle->on_cpu = 1; +#endif + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&idle->pi_lock, flags); - return ret; + /* Set the preempt count _outside_ the spinlocks! */ + init_idle_preempt_count(idle, cpu); +#ifdef CONFIG_HAVE_PREEMPT_LAZY + task_thread_info(idle)->preempt_lazy_count = 0; +#endif + /* + * The idle tasks have their own, simple scheduling class: + */ + idle->sched_class = &idle_sched_class; + ftrace_graph_init_idle_task(idle, cpu); + vtime_init_idle(idle, cpu); +#ifdef CONFIG_SMP + sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); +#endif } -EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -/* - * Move (not current) task off this cpu, onto dest cpu. We're doing - * this because either it can't run here any more (set_cpus_allowed() - * away from this CPU, or CPU going down), or because we're - * attempting to rebalance this task on exec (sched_exec). - * - * So we race with normal scheduler movements, but that's OK, as long - * as the task is no longer on this CPU. - * - * Returns non-zero if task was successfully migrated. - */ -static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) +int cpuset_cpumask_can_shrink(const struct cpumask *cur, + const struct cpumask *trial) { - struct rq *rq; - int ret = 0; + int ret = 1, trial_cpus; + struct dl_bw *cur_dl_b; + unsigned long flags; - if (unlikely(!cpu_active(dest_cpu))) + if (!cpumask_weight(cur)) return ret; - rq = cpu_rq(src_cpu); + rcu_read_lock_sched(); + cur_dl_b = dl_bw_of(cpumask_any(cur)); + trial_cpus = cpumask_weight(trial); - raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); - /* Already moved. */ - if (task_cpu(p) != src_cpu) - goto done; + raw_spin_lock_irqsave(&cur_dl_b->lock, flags); + if (cur_dl_b->bw != -1 && + cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) + ret = 0; + raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); + rcu_read_unlock_sched(); - /* Affinity changed (again). */ - if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) - goto fail; + return ret; +} + +int task_can_attach(struct task_struct *p, + const struct cpumask *cs_cpus_allowed) +{ + int ret = 0; /* - * If we're not on a rq, the next wake-up will ensure we're - * placed properly. 
+ * Kthreads which disallow setaffinity shouldn't be moved + * to a new cpuset; we don't want to change their cpu + * affinity and isolating such threads by their set of + * allowed nodes is unnecessary. Thus, cpusets are not + * applicable for such threads. This prevents checking for + * success of set_cpus_allowed_ptr() on all attached tasks + * before cpus_allowed may be changed. */ - if (task_on_rq_queued(p)) - rq = move_queued_task(p, dest_cpu); -done: - ret = 1; -fail: - raw_spin_unlock(&rq->lock); - raw_spin_unlock(&p->pi_lock); + if (p->flags & PF_NO_SETAFFINITY) { + ret = -EINVAL; + goto out; + } + +#ifdef CONFIG_SMP + if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, + cs_cpus_allowed)) { + unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, + cs_cpus_allowed); + struct dl_bw *dl_b; + bool overflow; + int cpus; + unsigned long flags; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(dest_cpu); + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(dest_cpu); + overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); + if (overflow) + ret = -EBUSY; + else { + /* + * We reserve space for this task in the destination + * root_domain, as we can't fail after this point. + * We will free resources in the source root_domain + * later on (see set_cpus_allowed_dl()). + */ + __dl_add(dl_b, p->dl.dl_bw); + } + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + + } +#endif +out: return ret; } +#ifdef CONFIG_SMP + #ifdef CONFIG_NUMA_BALANCING /* Migrate current task p to target_cpu */ int migrate_task_to(struct task_struct *p, int target_cpu) @@ -5231,7 +5390,7 @@ void sched_setnuma(struct task_struct *p, int nid) running = task_current(rq, p); if (queued) - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_SAVE); if (running) put_prev_task(rq, p); @@ -5240,38 +5399,12 @@ void sched_setnuma(struct task_struct *p, int nid) if (running) p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, 0); + enqueue_task(rq, p, ENQUEUE_RESTORE); task_rq_unlock(rq, p, &flags); } -#endif - -/* - * migration_cpu_stop - this will be executed by a highprio stopper thread - * and performs thread migration by bumping thread off CPU then - * 'pushing' onto another runqueue. - */ -static int migration_cpu_stop(void *data) -{ - struct migration_arg *arg = data; - - /* - * The original target cpu might have gone down and we might - * be on another cpu but it doesn't matter. - */ - local_irq_disable(); - /* - * We need to explicitly wake pending tasks before running - * __migrate_task() such that we will not miss enforcing cpus_allowed - * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. - */ - sched_ttwu_pending(); - __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); - local_irq_enable(); - return 0; -} +#endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_HOTPLUG_CPU - static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm); /* @@ -5333,9 +5466,9 @@ static struct task_struct fake_task = { * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. */ -static void migrate_tasks(unsigned int dead_cpu) +static void migrate_tasks(struct rq *dead_rq) { - struct rq *rq = cpu_rq(dead_cpu); + struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; int dest_cpu; @@ -5357,7 +5490,7 @@ static void migrate_tasks(unsigned int dead_cpu) */ update_rq_clock(rq); - for ( ; ; ) { + for (;;) { /* * There's this thread running, bail when that's the only * remaining thread. 
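migrate_tasks() above now takes the runqueue of the dying CPU; the simplest way to exercise that path from userspace is a sysfs CPU offline/online cycle. A sketch assuming root privileges and an existing cpu1 (both are assumptions about the test machine, not requirements of the patch):

/* Illustrative only -- not shipped with this patch. */
#include <stdio.h>
#include <string.h>
#include <errno.h>

static int write_str(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                fprintf(stderr, "%s: %s\n", path, strerror(errno));
                return -1;
        }
        fputs(val, f);
        fclose(f);
        return 0;
}

int main(void)
{
        const char *online = "/sys/devices/system/cpu/cpu1/online";

        /* Offlining pushes every task still queued on cpu1 to a fallback CPU. */
        if (write_str(online, "0"))
                return 1;
        write_str(online, "1");         /* bring the CPU back up */
        return 0;
}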
@@ -5365,22 +5498,52 @@ static void migrate_tasks(unsigned int dead_cpu) if (rq->nr_running == 1) break; + /* + * pick_next_task assumes pinned rq->lock. + */ + lockdep_pin_lock(&rq->lock); next = pick_next_task(rq, &fake_task); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); - /* Find suitable destination for @next, with force if needed. */ - dest_cpu = select_fallback_rq(dead_cpu, next); + /* + * Rules for changing task_struct::cpus_allowed are holding + * both pi_lock and rq->lock, such that holding either + * stabilizes the mask. + * + * Drop rq->lock is not quite as disastrous as it usually is + * because !cpu_active at this point, which means load-balance + * will not interfere. Also, stop-machine. + */ + lockdep_unpin_lock(&rq->lock); raw_spin_unlock(&rq->lock); + raw_spin_lock(&next->pi_lock); + raw_spin_lock(&rq->lock); + + /* + * Since we're inside stop-machine, _nothing_ should have + * changed the task, WARN if weird stuff happened, because in + * that case the above rq->lock drop is a fail too. + */ + if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { + raw_spin_unlock(&next->pi_lock); + continue; + } - __migrate_task(next, dead_cpu, dest_cpu); + /* Find suitable destination for @next, with force if needed. */ + dest_cpu = select_fallback_rq(dead_rq->cpu, next); - raw_spin_lock(&rq->lock); + rq = __migrate_task(rq, next, dest_cpu); + if (rq != dead_rq) { + raw_spin_unlock(&rq->lock); + rq = dead_rq; + raw_spin_lock(&rq->lock); + } + raw_spin_unlock(&next->pi_lock); } rq->stop = stop; } - #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -5546,8 +5709,7 @@ static void register_sched_domain_sysctl(void) /* may be called multiple times per register */ static void unregister_sched_domain_sysctl(void) { - if (sd_sysctl_header) - unregister_sysctl_table(sd_sysctl_header); + unregister_sysctl_table(sd_sysctl_header); sd_sysctl_header = NULL; if (sd_ctl_dir[0].child) sd_free_ctl_entry(&sd_ctl_dir[0].child); @@ -5559,7 +5721,7 @@ static void register_sched_domain_sysctl(void) static void unregister_sched_domain_sysctl(void) { } -#endif +#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */ static void set_rq_online(struct rq *rq) { @@ -5628,7 +5790,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(cpu); + migrate_tasks(rq); BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); break; @@ -5658,7 +5820,7 @@ static struct notifier_block migration_notifier = { .priority = CPU_PRI_MIGRATION, }; -static void __cpuinit set_cpu_rq_start_time(void) +static void set_cpu_rq_start_time(void) { int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); @@ -5668,21 +5830,27 @@ static void __cpuinit set_cpu_rq_start_time(void) static int sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { + int cpu = (long)hcpu; + switch (action & ~CPU_TASKS_FROZEN) { case CPU_STARTING: set_cpu_rq_start_time(); return NOTIFY_OK; + case CPU_ONLINE: /* * At this point a starting CPU has marked itself as online via * set_cpu_online(). But it might not yet have marked itself * as active, which is essential from here on. - * - * Thus, fall-through and help the starting CPU along. 
*/ + set_cpu_active(cpu, true); + stop_machine_unpark(cpu); + return NOTIFY_OK; + case CPU_DOWN_FAILED: - set_cpu_active((long)hcpu, true); + set_cpu_active(cpu, true); return NOTIFY_OK; + default: return NOTIFY_DONE; } @@ -5718,9 +5886,6 @@ static int __init migration_init(void) return 0; } early_initcall(migration_init); -#endif - -#ifdef CONFIG_SMP static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ @@ -5953,13 +6118,13 @@ static int init_rootdomain(struct root_domain *rd) { memset(rd, 0, sizeof(*rd)); - if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) goto out; - if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; - if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) goto free_online; - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_dlo_mask; init_dl_bw(&rd->dl_bw); @@ -6617,7 +6782,8 @@ static struct sched_domain_topology_level default_topology[] = { { NULL, }, }; -struct sched_domain_topology_level *sched_domain_topology = default_topology; +static struct sched_domain_topology_level *sched_domain_topology = + default_topology; #define for_each_sd_topology(tl) \ for (tl = sched_domain_topology; tl->mask; tl++) @@ -6695,8 +6861,10 @@ static void init_numa_topology_type(void) n = sched_max_numa_distance; - if (n <= 1) + if (sched_domains_numa_levels <= 1) { sched_numa_topology_type = NUMA_DIRECT; + return; + } for_each_online_node(a) { for_each_online_node(b) { @@ -6817,7 +6985,7 @@ static void sched_init_numa(void) sched_domains_numa_masks[i][j] = mask; - for (k = 0; k < nr_node_ids; k++) { + for_each_node(k) { if (node_distance(j, k) > sched_domains_numa_distance[i]) continue; @@ -6946,7 +7114,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) struct sched_group *sg; struct sched_group_capacity *sgc; - sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), + sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sd) return -ENOMEM; @@ -7420,8 +7588,6 @@ void __init sched_init_smp(void) } #endif /* CONFIG_SMP */ -const_debug unsigned int sysctl_timer_migration = 1; - int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || @@ -7551,7 +7717,7 @@ void __init sched_init(void) rq->sd = NULL; rq->rd = NULL; rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; - rq->post_schedule = 0; + rq->balance_callback = NULL; rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; @@ -7618,8 +7784,7 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = (preempt_count() & ~PREEMPT_ACTIVE) + - sched_rcu_preempt_depth(); + int nested = preempt_count() + sched_rcu_preempt_depth(); return (nested == preempt_offset); } @@ -7682,32 +7847,12 @@ EXPORT_SYMBOL(___might_sleep); #endif #ifdef CONFIG_MAGIC_SYSRQ -static void normalize_task(struct rq *rq, struct task_struct *p) +void normalize_rt_tasks(void) { - const struct sched_class *prev_class = p->sched_class; + struct task_struct *g, *p; struct sched_attr attr = { .sched_policy = SCHED_NORMAL, }; - int old_prio = p->prio; - int queued; - - queued = task_on_rq_queued(p); - if (queued) - dequeue_task(rq, p, 0); - __setscheduler(rq, p, &attr, false); - if (queued) { - enqueue_task(rq, p, 0); - resched_curr(rq); - } - - 
check_class_changed(rq, p, prev_class, old_prio); -} - -void normalize_rt_tasks(void) -{ - struct task_struct *g, *p; - unsigned long flags; - struct rq *rq; read_lock(&tasklist_lock); for_each_process_thread(g, p) { @@ -7734,9 +7879,7 @@ void normalize_rt_tasks(void) continue; } - rq = task_rq_lock(p, &flags); - normalize_task(rq, p); - task_rq_unlock(rq, p, &flags); + __sched_setscheduler(p, &attr, false, false); } read_unlock(&tasklist_lock); } @@ -7888,7 +8031,7 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, 0); + dequeue_task(rq, tsk, DEQUEUE_SAVE); if (unlikely(running)) put_prev_task(rq, tsk); @@ -7904,7 +8047,7 @@ void sched_move_task(struct task_struct *tsk) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk, queued); + tsk->sched_class->task_move_group(tsk); else #endif set_task_rq(tsk, task_cpu(tsk)); @@ -7912,7 +8055,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, tsk, 0); + enqueue_task(rq, tsk, ENQUEUE_RESTORE); task_rq_unlock(rq, tsk, &flags); } @@ -8087,11 +8230,11 @@ static long sched_group_rt_runtime(struct task_group *tg) return rt_runtime_us; } -static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) { u64 rt_runtime, rt_period; - rt_period = (u64)rt_period_us * NSEC_PER_USEC; + rt_period = rt_period_us * NSEC_PER_USEC; rt_runtime = tg->rt_bandwidth.rt_runtime; return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); @@ -8340,17 +8483,17 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) sched_offline_group(tg); } -static void cpu_cgroup_fork(struct task_struct *task) +static void cpu_cgroup_fork(struct task_struct *task, void *private) { sched_move_task(task); } -static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -8363,30 +8506,15 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, return 0; } -static void cpu_cgroup_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpu_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) + cgroup_taskset_for_each(task, css, tset) sched_move_task(task); } -static void cpu_cgroup_exit(struct cgroup_subsys_state *css, - struct cgroup_subsys_state *old_css, - struct task_struct *task) -{ - /* - * cgroup_exit() is called in the copy_process() failure path. - * Ignore this case since the task hasn't ran yet, this avoids - * trying to poke a half freed task state from generic code. 
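The cgroup attach path above (cpu_cgroup_attach() -> sched_move_task()) is what a plain write to a cpu cgroup's tasks file ends up calling. A sketch assuming a v1 cpu controller mounted at /sys/fs/cgroup/cpu and root privileges; the group name "demo" is arbitrary:

/* Illustrative only -- not shipped with this patch. */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        const char *dir = "/sys/fs/cgroup/cpu/demo";
        char path[128];
        FILE *f;

        if (mkdir(dir, 0755) && errno != EEXIST) {
                fprintf(stderr, "mkdir %s: %s\n", dir, strerror(errno));
                return 1;
        }

        snprintf(path, sizeof(path), "%s/tasks", dir);
        f = fopen(path, "w");
        if (!f) {
                fprintf(stderr, "%s: %s\n", path, strerror(errno));
                return 1;
        }
        /* Writing our PID to "tasks" attaches the calling thread; the attach path runs sched_move_task(). */
        fprintf(f, "%d\n", (int)getpid());
        fclose(f);
        return 0;
}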
- */ - if (!(task->flags & PF_EXITING)) - return; - - sched_move_task(task); -} - #ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) @@ -8458,10 +8586,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) __refill_cfs_bandwidth_runtime(cfs_b); /* restart the period timer (if active) to handle new period expiry */ - if (runtime_enabled && cfs_b->timer_active) { - /* force a reprogram */ - __start_cfs_bandwidth(cfs_b, true); - } + if (runtime_enabled) + start_cfs_bandwidth(cfs_b); raw_spin_unlock_irq(&cfs_b->lock); for_each_online_cpu(i) { @@ -8720,7 +8846,6 @@ struct cgroup_subsys cpu_cgrp_subsys = { .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, - .exit = cpu_cgroup_exit, .legacy_cftypes = cpu_files, .early_init = 1, };
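Finally, tg_set_cfs_bandwidth() above is reached from the v1 cpu.cfs_quota_us / cpu.cfs_period_us files; with this patch an enabled quota simply (re)starts the period timer via start_cfs_bandwidth() instead of force-reprogramming it. A sketch assuming the same v1 mount point and root; 25ms per 100ms (25% of one CPU) is an example value:

/* Illustrative only -- not shipped with this patch. */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/stat.h>

static int write_file(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                fprintf(stderr, "%s: %s\n", path, strerror(errno));
                return -1;
        }
        fputs(val, f);
        return fclose(f) ? -1 : 0;
}

int main(void)
{
        const char *dir = "/sys/fs/cgroup/cpu/demo";

        if (mkdir(dir, 0755) && errno != EEXIST) {
                fprintf(stderr, "mkdir %s: %s\n", dir, strerror(errno));
                return 1;
        }

        /* 100ms period, 25ms quota: the group may use at most 25% of one CPU. */
        if (write_file("/sys/fs/cgroup/cpu/demo/cpu.cfs_period_us", "100000"))
                return 1;
        if (write_file("/sys/fs/cgroup/cpu/demo/cpu.cfs_quota_us", "25000"))
                return 1;

        return 0;
}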