Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * kernel/sched/core.c
4 : *
5 : * Core kernel scheduler code and related syscalls
6 : *
7 : * Copyright (C) 1991-2002 Linus Torvalds
8 : */
9 : #include <linux/highmem.h>
10 : #include <linux/hrtimer_api.h>
11 : #include <linux/ktime_api.h>
12 : #include <linux/sched/signal.h>
13 : #include <linux/syscalls_api.h>
14 : #include <linux/debug_locks.h>
15 : #include <linux/prefetch.h>
16 : #include <linux/capability.h>
17 : #include <linux/pgtable_api.h>
18 : #include <linux/wait_bit.h>
19 : #include <linux/jiffies.h>
20 : #include <linux/spinlock_api.h>
21 : #include <linux/cpumask_api.h>
22 : #include <linux/lockdep_api.h>
23 : #include <linux/hardirq.h>
24 : #include <linux/softirq.h>
25 : #include <linux/refcount_api.h>
26 : #include <linux/topology.h>
27 : #include <linux/sched/clock.h>
28 : #include <linux/sched/cond_resched.h>
29 : #include <linux/sched/debug.h>
30 : #include <linux/sched/isolation.h>
31 : #include <linux/sched/loadavg.h>
32 : #include <linux/sched/mm.h>
33 : #include <linux/sched/nohz.h>
34 : #include <linux/sched/rseq_api.h>
35 : #include <linux/sched/rt.h>
36 :
37 : #include <linux/blkdev.h>
38 : #include <linux/context_tracking.h>
39 : #include <linux/cpuset.h>
40 : #include <linux/delayacct.h>
41 : #include <linux/init_task.h>
42 : #include <linux/interrupt.h>
43 : #include <linux/ioprio.h>
44 : #include <linux/kallsyms.h>
45 : #include <linux/kcov.h>
46 : #include <linux/kprobes.h>
47 : #include <linux/llist_api.h>
48 : #include <linux/mmu_context.h>
49 : #include <linux/mmzone.h>
50 : #include <linux/mutex_api.h>
51 : #include <linux/nmi.h>
52 : #include <linux/nospec.h>
53 : #include <linux/perf_event_api.h>
54 : #include <linux/profile.h>
55 : #include <linux/psi.h>
56 : #include <linux/rcuwait_api.h>
57 : #include <linux/sched/wake_q.h>
58 : #include <linux/scs.h>
59 : #include <linux/slab.h>
60 : #include <linux/syscalls.h>
61 : #include <linux/vtime.h>
62 : #include <linux/wait_api.h>
63 : #include <linux/workqueue_api.h>
64 :
65 : #ifdef CONFIG_PREEMPT_DYNAMIC
66 : # ifdef CONFIG_GENERIC_ENTRY
67 : # include <linux/entry-common.h>
68 : # endif
69 : #endif
70 :
71 : #include <uapi/linux/sched/types.h>
72 :
73 : #include <asm/switch_to.h>
74 : #include <asm/tlb.h>
75 :
76 : #define CREATE_TRACE_POINTS
77 : #include <linux/sched/rseq_api.h>
78 : #include <trace/events/sched.h>
79 : #undef CREATE_TRACE_POINTS
80 :
81 : #include "sched.h"
82 : #include "stats.h"
83 : #include "autogroup.h"
84 :
85 : #include "autogroup.h"
86 : #include "pelt.h"
87 : #include "smp.h"
88 : #include "stats.h"
89 :
90 : #include "../workqueue_internal.h"
91 : #include "../../fs/io-wq.h"
92 : #include "../smpboot.h"
93 :
94 : /*
95 : * Export tracepoints that act as a bare tracehook (ie: have no trace event
96 : * associated with them) to allow external modules to probe them.
97 : */
98 : EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
99 : EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
100 : EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
101 : EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
102 : EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
103 : EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
104 : EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
105 : EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
106 : EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
107 : EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
108 : EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
109 :
110 : DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
111 :
112 : #ifdef CONFIG_SCHED_DEBUG
113 : /*
114 : * Debugging: various feature bits
115 : *
116 : * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
117 : * sysctl_sched_features, defined in sched.h, to allow constants propagation
118 : * at compile time and compiler optimization based on features default.
119 : */
120 : #define SCHED_FEAT(name, enabled) \
121 : (1UL << __SCHED_FEAT_##name) * enabled |
122 : const_debug unsigned int sysctl_sched_features =
123 : #include "features.h"
124 : 0;
125 : #undef SCHED_FEAT
126 :
127 : /*
128 : * Print a warning if need_resched is set for the given duration (if
129 : * LATENCY_WARN is enabled).
130 : *
131 : * If sysctl_resched_latency_warn_once is set, only one warning will be shown
132 : * per boot.
133 : */
134 : __read_mostly int sysctl_resched_latency_warn_ms = 100;
135 : __read_mostly int sysctl_resched_latency_warn_once = 1;
136 : #endif /* CONFIG_SCHED_DEBUG */
137 :
138 : /*
139 : * Number of tasks to iterate in a single balance run.
140 : * Limited because this is done with IRQs disabled.
141 : */
142 : #ifdef CONFIG_PREEMPT_RT
143 : const_debug unsigned int sysctl_sched_nr_migrate = 8;
144 : #else
145 : const_debug unsigned int sysctl_sched_nr_migrate = 32;
146 : #endif
147 :
148 : /*
149 : * period over which we measure -rt task CPU usage in us.
150 : * default: 1s
151 : */
152 : unsigned int sysctl_sched_rt_period = 1000000;
153 :
154 : __read_mostly int scheduler_running;
155 :
156 : #ifdef CONFIG_SCHED_CORE
157 :
158 : DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
159 :
160 : /* kernel prio, less is more */
161 : static inline int __task_prio(struct task_struct *p)
162 : {
163 : if (p->sched_class == &stop_sched_class) /* trumps deadline */
164 : return -2;
165 :
166 : if (rt_prio(p->prio)) /* includes deadline */
167 : return p->prio; /* [-1, 99] */
168 :
169 : if (p->sched_class == &idle_sched_class)
170 : return MAX_RT_PRIO + NICE_WIDTH; /* 140 */
171 :
172 : return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */
173 : }
174 :
175 : /*
176 : * l(a,b)
177 : * le(a,b) := !l(b,a)
178 : * g(a,b) := l(b,a)
179 : * ge(a,b) := !l(a,b)
180 : */
181 :
182 : /* real prio, less is less */
183 : static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
184 : {
185 :
186 : int pa = __task_prio(a), pb = __task_prio(b);
187 :
188 : if (-pa < -pb)
189 : return true;
190 :
191 : if (-pb < -pa)
192 : return false;
193 :
194 : if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
195 : return !dl_time_before(a->dl.deadline, b->dl.deadline);
196 :
197 : if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
198 : return cfs_prio_less(a, b, in_fi);
199 :
200 : return false;
201 : }
202 :
203 : static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
204 : {
205 : if (a->core_cookie < b->core_cookie)
206 : return true;
207 :
208 : if (a->core_cookie > b->core_cookie)
209 : return false;
210 :
211 : /* flip prio, so high prio is leftmost */
212 : if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
213 : return true;
214 :
215 : return false;
216 : }
217 :
218 : #define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)
219 :
220 : static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
221 : {
222 : return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
223 : }
224 :
225 : static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
226 : {
227 : const struct task_struct *p = __node_2_sc(node);
228 : unsigned long cookie = (unsigned long)key;
229 :
230 : if (cookie < p->core_cookie)
231 : return -1;
232 :
233 : if (cookie > p->core_cookie)
234 : return 1;
235 :
236 : return 0;
237 : }
238 :
239 : void sched_core_enqueue(struct rq *rq, struct task_struct *p)
240 : {
241 : rq->core->core_task_seq++;
242 :
243 : if (!p->core_cookie)
244 : return;
245 :
246 : rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
247 : }
248 :
249 : void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
250 : {
251 : rq->core->core_task_seq++;
252 :
253 : if (sched_core_enqueued(p)) {
254 : rb_erase(&p->core_node, &rq->core_tree);
255 : RB_CLEAR_NODE(&p->core_node);
256 : }
257 :
258 : /*
259 : * Migrating the last task off the cpu, with the cpu in forced idle
260 : * state. Reschedule to create an accounting edge for forced idle,
261 : * and re-examine whether the core is still in forced idle state.
262 : */
263 : if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
264 : rq->core->core_forceidle_count && rq->curr == rq->idle)
265 : resched_curr(rq);
266 : }
267 :
268 : /*
269 : * Find left-most (aka, highest priority) task matching @cookie.
270 : */
271 : static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
272 : {
273 : struct rb_node *node;
274 :
275 : node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
276 : /*
277 : * The idle task always matches any cookie!
278 : */
279 : if (!node)
280 : return idle_sched_class.pick_task(rq);
281 :
282 : return __node_2_sc(node);
283 : }
284 :
285 : static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
286 : {
287 : struct rb_node *node = &p->core_node;
288 :
289 : node = rb_next(node);
290 : if (!node)
291 : return NULL;
292 :
293 : p = container_of(node, struct task_struct, core_node);
294 : if (p->core_cookie != cookie)
295 : return NULL;
296 :
297 : return p;
298 : }
299 :
300 : /*
301 : * Magic required such that:
302 : *
303 : * raw_spin_rq_lock(rq);
304 : * ...
305 : * raw_spin_rq_unlock(rq);
306 : *
307 : * ends up locking and unlocking the _same_ lock, and all CPUs
308 : * always agree on what rq has what lock.
309 : *
310 : * XXX entirely possible to selectively enable cores, don't bother for now.
311 : */
312 :
313 : static DEFINE_MUTEX(sched_core_mutex);
314 : static atomic_t sched_core_count;
315 : static struct cpumask sched_core_mask;
316 :
317 : static void sched_core_lock(int cpu, unsigned long *flags)
318 : {
319 : const struct cpumask *smt_mask = cpu_smt_mask(cpu);
320 : int t, i = 0;
321 :
322 : local_irq_save(*flags);
323 : for_each_cpu(t, smt_mask)
324 : raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
325 : }
326 :
327 : static void sched_core_unlock(int cpu, unsigned long *flags)
328 : {
329 : const struct cpumask *smt_mask = cpu_smt_mask(cpu);
330 : int t;
331 :
332 : for_each_cpu(t, smt_mask)
333 : raw_spin_unlock(&cpu_rq(t)->__lock);
334 : local_irq_restore(*flags);
335 : }
336 :
337 : static void __sched_core_flip(bool enabled)
338 : {
339 : unsigned long flags;
340 : int cpu, t;
341 :
342 : cpus_read_lock();
343 :
344 : /*
345 : * Toggle the online cores, one by one.
346 : */
347 : cpumask_copy(&sched_core_mask, cpu_online_mask);
348 : for_each_cpu(cpu, &sched_core_mask) {
349 : const struct cpumask *smt_mask = cpu_smt_mask(cpu);
350 :
351 : sched_core_lock(cpu, &flags);
352 :
353 : for_each_cpu(t, smt_mask)
354 : cpu_rq(t)->core_enabled = enabled;
355 :
356 : cpu_rq(cpu)->core->core_forceidle_start = 0;
357 :
358 : sched_core_unlock(cpu, &flags);
359 :
360 : cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
361 : }
362 :
363 : /*
364 : * Toggle the offline CPUs.
365 : */
366 : cpumask_copy(&sched_core_mask, cpu_possible_mask);
367 : cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
368 :
369 : for_each_cpu(cpu, &sched_core_mask)
370 : cpu_rq(cpu)->core_enabled = enabled;
371 :
372 : cpus_read_unlock();
373 : }
374 :
375 : static void sched_core_assert_empty(void)
376 : {
377 : int cpu;
378 :
379 : for_each_possible_cpu(cpu)
380 : WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
381 : }
382 :
383 : static void __sched_core_enable(void)
384 : {
385 : static_branch_enable(&__sched_core_enabled);
386 : /*
387 : * Ensure all previous instances of raw_spin_rq_*lock() have finished
388 : * and future ones will observe !sched_core_disabled().
389 : */
390 : synchronize_rcu();
391 : __sched_core_flip(true);
392 : sched_core_assert_empty();
393 : }
394 :
395 : static void __sched_core_disable(void)
396 : {
397 : sched_core_assert_empty();
398 : __sched_core_flip(false);
399 : static_branch_disable(&__sched_core_enabled);
400 : }
401 :
402 : void sched_core_get(void)
403 : {
404 : if (atomic_inc_not_zero(&sched_core_count))
405 : return;
406 :
407 : mutex_lock(&sched_core_mutex);
408 : if (!atomic_read(&sched_core_count))
409 : __sched_core_enable();
410 :
411 : smp_mb__before_atomic();
412 : atomic_inc(&sched_core_count);
413 : mutex_unlock(&sched_core_mutex);
414 : }
415 :
416 : static void __sched_core_put(struct work_struct *work)
417 : {
418 : if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) {
419 : __sched_core_disable();
420 : mutex_unlock(&sched_core_mutex);
421 : }
422 : }
423 :
424 : void sched_core_put(void)
425 : {
426 : static DECLARE_WORK(_work, __sched_core_put);
427 :
428 : /*
429 : * "There can be only one"
430 : *
431 : * Either this is the last one, or we don't actually need to do any
432 : * 'work'. If it is the last *again*, we rely on
433 : * WORK_STRUCT_PENDING_BIT.
434 : */
435 : if (!atomic_add_unless(&sched_core_count, -1, 1))
436 : schedule_work(&_work);
437 : }
438 :
439 : #else /* !CONFIG_SCHED_CORE */
440 :
441 : static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
442 : static inline void
443 : sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
444 :
445 : #endif /* CONFIG_SCHED_CORE */
446 :
447 : /*
448 : * part of the period that we allow rt tasks to run in us.
449 : * default: 0.95s
450 : */
451 : int sysctl_sched_rt_runtime = 950000;
452 :
453 :
454 : /*
455 : * Serialization rules:
456 : *
457 : * Lock order:
458 : *
459 : * p->pi_lock
460 : * rq->lock
461 : * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
462 : *
463 : * rq1->lock
464 : * rq2->lock where: rq1 < rq2
465 : *
466 : * Regular state:
467 : *
468 : * Normal scheduling state is serialized by rq->lock. __schedule() takes the
469 : * local CPU's rq->lock, it optionally removes the task from the runqueue and
470 : * always looks at the local rq data structures to find the most eligible task
471 : * to run next.
472 : *
473 : * Task enqueue is also under rq->lock, possibly taken from another CPU.
474 : * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
475 : * the local CPU to avoid bouncing the runqueue state around [ see
476 : * ttwu_queue_wakelist() ]
477 : *
478 : * Task wakeup, specifically wakeups that involve migration, are horribly
479 : * complicated to avoid having to take two rq->locks.
480 : *
481 : * Special state:
482 : *
483 : * System-calls and anything external will use task_rq_lock() which acquires
484 : * both p->pi_lock and rq->lock. As a consequence the state they change is
485 : * stable while holding either lock:
486 : *
487 : * - sched_setaffinity()/
488 : * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
489 : * - set_user_nice(): p->se.load, p->*prio
490 : * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
491 : * p->se.load, p->rt_priority,
492 : * p->dl.dl_{runtime, deadline, period, flags, bw, density}
493 : * - sched_setnuma(): p->numa_preferred_nid
494 : * - sched_move_task()/
495 : * cpu_cgroup_fork(): p->sched_task_group
496 : * - uclamp_update_active() p->uclamp*
497 : *
498 : * p->state <- TASK_*:
499 : *
500 : * is changed locklessly using set_current_state(), __set_current_state() or
501 : * set_special_state(), see their respective comments, or by
502 : * try_to_wake_up(). This latter uses p->pi_lock to serialize against
503 : * concurrent self.
504 : *
505 : * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
506 : *
507 : * is set by activate_task() and cleared by deactivate_task(), under
508 : * rq->lock. Non-zero indicates the task is runnable, the special
509 : * ON_RQ_MIGRATING state is used for migration without holding both
510 : * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
511 : *
512 : * p->on_cpu <- { 0, 1 }:
513 : *
514 : * is set by prepare_task() and cleared by finish_task() such that it will be
515 : * set before p is scheduled-in and cleared after p is scheduled-out, both
516 : * under rq->lock. Non-zero indicates the task is running on its CPU.
517 : *
518 : * [ The astute reader will observe that it is possible for two tasks on one
519 : * CPU to have ->on_cpu = 1 at the same time. ]
520 : *
521 : * task_cpu(p): is changed by set_task_cpu(), the rules are:
522 : *
523 : * - Don't call set_task_cpu() on a blocked task:
524 : *
525 : * We don't care what CPU we're not running on, this simplifies hotplug,
526 : * the CPU assignment of blocked tasks isn't required to be valid.
527 : *
528 : * - for try_to_wake_up(), called under p->pi_lock:
529 : *
530 : * This allows try_to_wake_up() to only take one rq->lock, see its comment.
531 : *
532 : * - for migration called under rq->lock:
533 : * [ see task_on_rq_migrating() in task_rq_lock() ]
534 : *
535 : * o move_queued_task()
536 : * o detach_task()
537 : *
538 : * - for migration called under double_rq_lock():
539 : *
540 : * o __migrate_swap_task()
541 : * o push_rt_task() / pull_rt_task()
542 : * o push_dl_task() / pull_dl_task()
543 : * o dl_task_offline_migration()
544 : *
545 : */
546 :
547 107 : void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
548 : {
549 : raw_spinlock_t *lock;
550 :
551 : /* Matches synchronize_rcu() in __sched_core_enable() */
552 1465 : preempt_disable();
553 : if (sched_core_disabled()) {
554 1465 : raw_spin_lock_nested(&rq->__lock, subclass);
555 : /* preempt_count *MUST* be > 1 */
556 1465 : preempt_enable_no_resched();
557 : return;
558 : }
559 :
560 : for (;;) {
561 : lock = __rq_lockp(rq);
562 : raw_spin_lock_nested(lock, subclass);
563 : if (likely(lock == __rq_lockp(rq))) {
564 : /* preempt_count *MUST* be > 1 */
565 : preempt_enable_no_resched();
566 : return;
567 : }
568 : raw_spin_unlock(lock);
569 : }
570 : }
571 :
572 0 : bool raw_spin_rq_trylock(struct rq *rq)
573 : {
574 : raw_spinlock_t *lock;
575 : bool ret;
576 :
577 : /* Matches synchronize_rcu() in __sched_core_enable() */
578 0 : preempt_disable();
579 : if (sched_core_disabled()) {
580 0 : ret = raw_spin_trylock(&rq->__lock);
581 0 : preempt_enable();
582 : return ret;
583 : }
584 :
585 : for (;;) {
586 : lock = __rq_lockp(rq);
587 : ret = raw_spin_trylock(lock);
588 : if (!ret || (likely(lock == __rq_lockp(rq)))) {
589 : preempt_enable();
590 : return ret;
591 : }
592 : raw_spin_unlock(lock);
593 : }
594 : }
595 :
596 107 : void raw_spin_rq_unlock(struct rq *rq)
597 : {
598 1465 : raw_spin_unlock(rq_lockp(rq));
599 107 : }
600 :
601 : #ifdef CONFIG_SMP
602 : /*
603 : * double_rq_lock - safely lock two runqueues
604 : */
605 : void double_rq_lock(struct rq *rq1, struct rq *rq2)
606 : {
607 : lockdep_assert_irqs_disabled();
608 :
609 : if (rq_order_less(rq2, rq1))
610 : swap(rq1, rq2);
611 :
612 : raw_spin_rq_lock(rq1);
613 : if (__rq_lockp(rq1) == __rq_lockp(rq2))
614 : return;
615 :
616 : raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
617 : }
618 : #endif
619 :
620 : /*
621 : * __task_rq_lock - lock the rq @p resides on.
622 : */
623 0 : struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
624 : __acquires(rq->lock)
625 : {
626 : struct rq *rq;
627 :
628 : lockdep_assert_held(&p->pi_lock);
629 :
630 : for (;;) {
631 107 : rq = task_rq(p);
632 107 : raw_spin_rq_lock(rq);
633 214 : if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
634 107 : rq_pin_lock(rq, rf);
635 0 : return rq;
636 : }
637 : raw_spin_rq_unlock(rq);
638 :
639 0 : while (unlikely(task_on_rq_migrating(p)))
640 : cpu_relax();
641 : }
642 : }
643 :
644 : /*
645 : * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
646 : */
647 109 : struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
648 : __acquires(p->pi_lock)
649 : __acquires(rq->lock)
650 : {
651 : struct rq *rq;
652 :
653 : for (;;) {
654 109 : raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
655 109 : rq = task_rq(p);
656 109 : raw_spin_rq_lock(rq);
657 : /*
658 : * move_queued_task() task_rq_lock()
659 : *
660 : * ACQUIRE (rq->lock)
661 : * [S] ->on_rq = MIGRATING [L] rq = task_rq()
662 : * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
663 : * [S] ->cpu = new_cpu [L] task_rq()
664 : * [L] ->on_rq
665 : * RELEASE (rq->lock)
666 : *
667 : * If we observe the old CPU in task_rq_lock(), the acquire of
668 : * the old rq->lock will fully serialize against the stores.
669 : *
670 : * If we observe the new CPU in task_rq_lock(), the address
671 : * dependency headed by '[L] rq = task_rq()' and the acquire
672 : * will pair with the WMB to ensure we then also see migrating.
673 : */
674 218 : if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
675 109 : rq_pin_lock(rq, rf);
676 109 : return rq;
677 : }
678 0 : raw_spin_rq_unlock(rq);
679 0 : raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
680 :
681 0 : while (unlikely(task_on_rq_migrating(p)))
682 : cpu_relax();
683 : }
684 : }
685 :
686 : /*
687 : * RQ-clock updating methods:
688 : */
689 :
690 : static void update_rq_clock_task(struct rq *rq, s64 delta)
691 : {
692 : /*
693 : * In theory, the compile should just see 0 here, and optimize out the call
694 : * to sched_rt_avg_update. But I don't trust it...
695 : */
696 1259 : s64 __maybe_unused steal = 0, irq_delta = 0;
697 :
698 : #ifdef CONFIG_IRQ_TIME_ACCOUNTING
699 : irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
700 :
701 : /*
702 : * Since irq_time is only updated on {soft,}irq_exit, we might run into
703 : * this case when a previous update_rq_clock() happened inside a
704 : * {soft,}irq region.
705 : *
706 : * When this happens, we stop ->clock_task and only update the
707 : * prev_irq_time stamp to account for the part that fit, so that a next
708 : * update will consume the rest. This ensures ->clock_task is
709 : * monotonic.
710 : *
711 : * It does however cause some slight miss-attribution of {soft,}irq
712 : * time, a more accurate solution would be to update the irq_time using
713 : * the current rq->clock timestamp, except that would require using
714 : * atomic ops.
715 : */
716 : if (irq_delta > delta)
717 : irq_delta = delta;
718 :
719 : rq->prev_irq_time += irq_delta;
720 : delta -= irq_delta;
721 : #endif
722 : #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
723 : if (static_key_false((¶virt_steal_rq_enabled))) {
724 : steal = paravirt_steal_clock(cpu_of(rq));
725 : steal -= rq->prev_steal_time_rq;
726 :
727 : if (unlikely(steal > delta))
728 : steal = delta;
729 :
730 : rq->prev_steal_time_rq += steal;
731 : delta -= steal;
732 : }
733 : #endif
734 :
735 1259 : rq->clock_task += delta;
736 :
737 : #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
738 : if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
739 : update_irq_load_avg(rq, irq_delta + steal);
740 : #endif
741 1259 : update_rq_clock_pelt(rq, delta);
742 : }
743 :
744 1464 : void update_rq_clock(struct rq *rq)
745 : {
746 : s64 delta;
747 :
748 1464 : lockdep_assert_rq_held(rq);
749 :
750 1464 : if (rq->clock_update_flags & RQCF_ACT_SKIP)
751 : return;
752 :
753 : #ifdef CONFIG_SCHED_DEBUG
754 1259 : if (sched_feat(WARN_DOUBLE_CLOCK))
755 0 : SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
756 1259 : rq->clock_update_flags |= RQCF_UPDATED;
757 : #endif
758 :
759 1259 : delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
760 1259 : if (delta < 0)
761 : return;
762 1259 : rq->clock += delta;
763 1259 : update_rq_clock_task(rq, delta);
764 : }
765 :
766 : #ifdef CONFIG_SCHED_HRTICK
767 : /*
768 : * Use HR-timers to deliver accurate preemption points.
769 : */
770 :
771 : static void hrtick_clear(struct rq *rq)
772 : {
773 : if (hrtimer_active(&rq->hrtick_timer))
774 : hrtimer_cancel(&rq->hrtick_timer);
775 : }
776 :
777 : /*
778 : * High-resolution timer tick.
779 : * Runs from hardirq context with interrupts disabled.
780 : */
781 : static enum hrtimer_restart hrtick(struct hrtimer *timer)
782 : {
783 : struct rq *rq = container_of(timer, struct rq, hrtick_timer);
784 : struct rq_flags rf;
785 :
786 : WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
787 :
788 : rq_lock(rq, &rf);
789 : update_rq_clock(rq);
790 : rq->curr->sched_class->task_tick(rq, rq->curr, 1);
791 : rq_unlock(rq, &rf);
792 :
793 : return HRTIMER_NORESTART;
794 : }
795 :
796 : #ifdef CONFIG_SMP
797 :
798 : static void __hrtick_restart(struct rq *rq)
799 : {
800 : struct hrtimer *timer = &rq->hrtick_timer;
801 : ktime_t time = rq->hrtick_time;
802 :
803 : hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
804 : }
805 :
806 : /*
807 : * called from hardirq (IPI) context
808 : */
809 : static void __hrtick_start(void *arg)
810 : {
811 : struct rq *rq = arg;
812 : struct rq_flags rf;
813 :
814 : rq_lock(rq, &rf);
815 : __hrtick_restart(rq);
816 : rq_unlock(rq, &rf);
817 : }
818 :
819 : /*
820 : * Called to set the hrtick timer state.
821 : *
822 : * called with rq->lock held and irqs disabled
823 : */
824 : void hrtick_start(struct rq *rq, u64 delay)
825 : {
826 : struct hrtimer *timer = &rq->hrtick_timer;
827 : s64 delta;
828 :
829 : /*
830 : * Don't schedule slices shorter than 10000ns, that just
831 : * doesn't make sense and can cause timer DoS.
832 : */
833 : delta = max_t(s64, delay, 10000LL);
834 : rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
835 :
836 : if (rq == this_rq())
837 : __hrtick_restart(rq);
838 : else
839 : smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
840 : }
841 :
842 : #else
843 : /*
844 : * Called to set the hrtick timer state.
845 : *
846 : * called with rq->lock held and irqs disabled
847 : */
848 : void hrtick_start(struct rq *rq, u64 delay)
849 : {
850 : /*
851 : * Don't schedule slices shorter than 10000ns, that just
852 : * doesn't make sense. Rely on vruntime for fairness.
853 : */
854 : delay = max_t(u64, delay, 10000LL);
855 : hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
856 : HRTIMER_MODE_REL_PINNED_HARD);
857 : }
858 :
859 : #endif /* CONFIG_SMP */
860 :
861 : static void hrtick_rq_init(struct rq *rq)
862 : {
863 : #ifdef CONFIG_SMP
864 : INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
865 : #endif
866 : hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
867 : rq->hrtick_timer.function = hrtick;
868 : }
869 : #else /* CONFIG_SCHED_HRTICK */
870 : static inline void hrtick_clear(struct rq *rq)
871 : {
872 : }
873 :
874 : static inline void hrtick_rq_init(struct rq *rq)
875 : {
876 : }
877 : #endif /* CONFIG_SCHED_HRTICK */
878 :
879 : /*
880 : * cmpxchg based fetch_or, macro so it works for different integer types
881 : */
882 : #define fetch_or(ptr, mask) \
883 : ({ \
884 : typeof(ptr) _ptr = (ptr); \
885 : typeof(mask) _mask = (mask); \
886 : typeof(*_ptr) _old, _val = *_ptr; \
887 : \
888 : for (;;) { \
889 : _old = cmpxchg(_ptr, _val, _val | _mask); \
890 : if (_old == _val) \
891 : break; \
892 : _val = _old; \
893 : } \
894 : _old; \
895 : })
896 :
897 : #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
898 : /*
899 : * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
900 : * this avoids any races wrt polling state changes and thereby avoids
901 : * spurious IPIs.
902 : */
903 : static bool set_nr_and_not_polling(struct task_struct *p)
904 : {
905 : struct thread_info *ti = task_thread_info(p);
906 : return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
907 : }
908 :
909 : /*
910 : * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
911 : *
912 : * If this returns true, then the idle task promises to call
913 : * sched_ttwu_pending() and reschedule soon.
914 : */
915 : static bool set_nr_if_polling(struct task_struct *p)
916 : {
917 : struct thread_info *ti = task_thread_info(p);
918 : typeof(ti->flags) old, val = READ_ONCE(ti->flags);
919 :
920 : for (;;) {
921 : if (!(val & _TIF_POLLING_NRFLAG))
922 : return false;
923 : if (val & _TIF_NEED_RESCHED)
924 : return true;
925 : old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
926 : if (old == val)
927 : break;
928 : val = old;
929 : }
930 : return true;
931 : }
932 :
933 : #else
934 : static bool set_nr_and_not_polling(struct task_struct *p)
935 : {
936 : set_tsk_need_resched(p);
937 : return true;
938 : }
939 :
940 : #ifdef CONFIG_SMP
941 : static bool set_nr_if_polling(struct task_struct *p)
942 : {
943 : return false;
944 : }
945 : #endif
946 : #endif
947 :
948 : static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
949 : {
950 0 : struct wake_q_node *node = &task->wake_q;
951 :
952 : /*
953 : * Atomically grab the task, if ->wake_q is !nil already it means
954 : * it's already queued (either by us or someone else) and will get the
955 : * wakeup due to that.
956 : *
957 : * In order to ensure that a pending wakeup will observe our pending
958 : * state, even in the failed case, an explicit smp_mb() must be used.
959 : */
960 0 : smp_mb__before_atomic();
961 0 : if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
962 : return false;
963 :
964 : /*
965 : * The head is context local, there can be no concurrency.
966 : */
967 0 : *head->lastp = node;
968 0 : head->lastp = &node->next;
969 : return true;
970 : }
971 :
972 : /**
973 : * wake_q_add() - queue a wakeup for 'later' waking.
974 : * @head: the wake_q_head to add @task to
975 : * @task: the task to queue for 'later' wakeup
976 : *
977 : * Queue a task for later wakeup, most likely by the wake_up_q() call in the
978 : * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
979 : * instantly.
980 : *
981 : * This function must be used as-if it were wake_up_process(); IOW the task
982 : * must be ready to be woken at this location.
983 : */
984 0 : void wake_q_add(struct wake_q_head *head, struct task_struct *task)
985 : {
986 0 : if (__wake_q_add(head, task))
987 : get_task_struct(task);
988 0 : }
989 :
990 : /**
991 : * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
992 : * @head: the wake_q_head to add @task to
993 : * @task: the task to queue for 'later' wakeup
994 : *
995 : * Queue a task for later wakeup, most likely by the wake_up_q() call in the
996 : * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
997 : * instantly.
998 : *
999 : * This function must be used as-if it were wake_up_process(); IOW the task
1000 : * must be ready to be woken at this location.
1001 : *
1002 : * This function is essentially a task-safe equivalent to wake_q_add(). Callers
1003 : * that already hold reference to @task can call the 'safe' version and trust
1004 : * wake_q to do the right thing depending whether or not the @task is already
1005 : * queued for wakeup.
1006 : */
1007 0 : void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
1008 : {
1009 0 : if (!__wake_q_add(head, task))
1010 0 : put_task_struct(task);
1011 0 : }
1012 :
1013 0 : void wake_up_q(struct wake_q_head *head)
1014 : {
1015 0 : struct wake_q_node *node = head->first;
1016 :
1017 0 : while (node != WAKE_Q_TAIL) {
1018 : struct task_struct *task;
1019 :
1020 0 : task = container_of(node, struct task_struct, wake_q);
1021 : /* Task can safely be re-inserted now: */
1022 0 : node = node->next;
1023 0 : task->wake_q.next = NULL;
1024 :
1025 : /*
1026 : * wake_up_process() executes a full barrier, which pairs with
1027 : * the queueing in wake_q_add() so as not to miss wakeups.
1028 : */
1029 0 : wake_up_process(task);
1030 0 : put_task_struct(task);
1031 : }
1032 0 : }
1033 :
1034 : /*
1035 : * resched_curr - mark rq's current task 'to be rescheduled now'.
1036 : *
1037 : * On UP this means the setting of the need_resched flag, on SMP it
1038 : * might also involve a cross-CPU call to trigger the scheduler on
1039 : * the target CPU.
1040 : */
1041 203 : void resched_curr(struct rq *rq)
1042 : {
1043 206 : struct task_struct *curr = rq->curr;
1044 : int cpu;
1045 :
1046 412 : lockdep_assert_rq_held(rq);
1047 :
1048 206 : if (test_tsk_need_resched(curr))
1049 : return;
1050 :
1051 205 : cpu = cpu_of(rq);
1052 :
1053 : if (cpu == smp_processor_id()) {
1054 : set_tsk_need_resched(curr);
1055 : set_preempt_need_resched();
1056 : return;
1057 : }
1058 :
1059 : if (set_nr_and_not_polling(curr))
1060 : smp_send_reschedule(cpu);
1061 : else
1062 : trace_sched_wake_idle_without_ipi(cpu);
1063 : }
1064 :
1065 0 : void resched_cpu(int cpu)
1066 : {
1067 0 : struct rq *rq = cpu_rq(cpu);
1068 : unsigned long flags;
1069 :
1070 0 : raw_spin_rq_lock_irqsave(rq, flags);
1071 0 : if (cpu_online(cpu) || cpu == smp_processor_id())
1072 : resched_curr(rq);
1073 0 : raw_spin_rq_unlock_irqrestore(rq, flags);
1074 0 : }
1075 :
1076 : #ifdef CONFIG_SMP
1077 : #ifdef CONFIG_NO_HZ_COMMON
1078 : /*
1079 : * In the semi idle case, use the nearest busy CPU for migrating timers
1080 : * from an idle CPU. This is good for power-savings.
1081 : *
1082 : * We don't do similar optimization for completely idle system, as
1083 : * selecting an idle CPU will add more delays to the timers than intended
1084 : * (as that CPU's timer base may not be uptodate wrt jiffies etc).
1085 : */
1086 : int get_nohz_timer_target(void)
1087 : {
1088 : int i, cpu = smp_processor_id(), default_cpu = -1;
1089 : struct sched_domain *sd;
1090 : const struct cpumask *hk_mask;
1091 :
1092 : if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
1093 : if (!idle_cpu(cpu))
1094 : return cpu;
1095 : default_cpu = cpu;
1096 : }
1097 :
1098 : hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
1099 :
1100 : rcu_read_lock();
1101 : for_each_domain(cpu, sd) {
1102 : for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
1103 : if (cpu == i)
1104 : continue;
1105 :
1106 : if (!idle_cpu(i)) {
1107 : cpu = i;
1108 : goto unlock;
1109 : }
1110 : }
1111 : }
1112 :
1113 : if (default_cpu == -1)
1114 : default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
1115 : cpu = default_cpu;
1116 : unlock:
1117 : rcu_read_unlock();
1118 : return cpu;
1119 : }
1120 :
1121 : /*
1122 : * When add_timer_on() enqueues a timer into the timer wheel of an
1123 : * idle CPU then this timer might expire before the next timer event
1124 : * which is scheduled to wake up that CPU. In case of a completely
1125 : * idle system the next event might even be infinite time into the
1126 : * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1127 : * leaves the inner idle loop so the newly added timer is taken into
1128 : * account when the CPU goes back to idle and evaluates the timer
1129 : * wheel for the next timer event.
1130 : */
1131 : static void wake_up_idle_cpu(int cpu)
1132 : {
1133 : struct rq *rq = cpu_rq(cpu);
1134 :
1135 : if (cpu == smp_processor_id())
1136 : return;
1137 :
1138 : if (set_nr_and_not_polling(rq->idle))
1139 : smp_send_reschedule(cpu);
1140 : else
1141 : trace_sched_wake_idle_without_ipi(cpu);
1142 : }
1143 :
1144 : static bool wake_up_full_nohz_cpu(int cpu)
1145 : {
1146 : /*
1147 : * We just need the target to call irq_exit() and re-evaluate
1148 : * the next tick. The nohz full kick at least implies that.
1149 : * If needed we can still optimize that later with an
1150 : * empty IRQ.
1151 : */
1152 : if (cpu_is_offline(cpu))
1153 : return true; /* Don't try to wake offline CPUs. */
1154 : if (tick_nohz_full_cpu(cpu)) {
1155 : if (cpu != smp_processor_id() ||
1156 : tick_nohz_tick_stopped())
1157 : tick_nohz_full_kick_cpu(cpu);
1158 : return true;
1159 : }
1160 :
1161 : return false;
1162 : }
1163 :
1164 : /*
1165 : * Wake up the specified CPU. If the CPU is going offline, it is the
1166 : * caller's responsibility to deal with the lost wakeup, for example,
1167 : * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
1168 : */
1169 : void wake_up_nohz_cpu(int cpu)
1170 : {
1171 : if (!wake_up_full_nohz_cpu(cpu))
1172 : wake_up_idle_cpu(cpu);
1173 : }
1174 :
1175 : static void nohz_csd_func(void *info)
1176 : {
1177 : struct rq *rq = info;
1178 : int cpu = cpu_of(rq);
1179 : unsigned int flags;
1180 :
1181 : /*
1182 : * Release the rq::nohz_csd.
1183 : */
1184 : flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu));
1185 : WARN_ON(!(flags & NOHZ_KICK_MASK));
1186 :
1187 : rq->idle_balance = idle_cpu(cpu);
1188 : if (rq->idle_balance && !need_resched()) {
1189 : rq->nohz_idle_balance = flags;
1190 : raise_softirq_irqoff(SCHED_SOFTIRQ);
1191 : }
1192 : }
1193 :
1194 : #endif /* CONFIG_NO_HZ_COMMON */
1195 :
1196 : #ifdef CONFIG_NO_HZ_FULL
1197 : bool sched_can_stop_tick(struct rq *rq)
1198 : {
1199 : int fifo_nr_running;
1200 :
1201 : /* Deadline tasks, even if single, need the tick */
1202 : if (rq->dl.dl_nr_running)
1203 : return false;
1204 :
1205 : /*
1206 : * If there are more than one RR tasks, we need the tick to affect the
1207 : * actual RR behaviour.
1208 : */
1209 : if (rq->rt.rr_nr_running) {
1210 : if (rq->rt.rr_nr_running == 1)
1211 : return true;
1212 : else
1213 : return false;
1214 : }
1215 :
1216 : /*
1217 : * If there's no RR tasks, but FIFO tasks, we can skip the tick, no
1218 : * forced preemption between FIFO tasks.
1219 : */
1220 : fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
1221 : if (fifo_nr_running)
1222 : return true;
1223 :
1224 : /*
1225 : * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
1226 : * if there's more than one we need the tick for involuntary
1227 : * preemption.
1228 : */
1229 : if (rq->nr_running > 1)
1230 : return false;
1231 :
1232 : return true;
1233 : }
1234 : #endif /* CONFIG_NO_HZ_FULL */
1235 : #endif /* CONFIG_SMP */
1236 :
1237 : #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1238 : (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1239 : /*
1240 : * Iterate task_group tree rooted at *from, calling @down when first entering a
1241 : * node and @up when leaving it for the final time.
1242 : *
1243 : * Caller must hold rcu_lock or sufficient equivalent.
1244 : */
1245 : int walk_tg_tree_from(struct task_group *from,
1246 : tg_visitor down, tg_visitor up, void *data)
1247 : {
1248 : struct task_group *parent, *child;
1249 : int ret;
1250 :
1251 : parent = from;
1252 :
1253 : down:
1254 : ret = (*down)(parent, data);
1255 : if (ret)
1256 : goto out;
1257 : list_for_each_entry_rcu(child, &parent->children, siblings) {
1258 : parent = child;
1259 : goto down;
1260 :
1261 : up:
1262 : continue;
1263 : }
1264 : ret = (*up)(parent, data);
1265 : if (ret || parent == from)
1266 : goto out;
1267 :
1268 : child = parent;
1269 : parent = parent->parent;
1270 : if (parent)
1271 : goto up;
1272 : out:
1273 : return ret;
1274 : }
1275 :
1276 : int tg_nop(struct task_group *tg, void *data)
1277 : {
1278 : return 0;
1279 : }
1280 : #endif
1281 :
1282 5 : static void set_load_weight(struct task_struct *p, bool update_load)
1283 : {
1284 5 : int prio = p->static_prio - MAX_RT_PRIO;
1285 5 : struct load_weight *load = &p->se.load;
1286 :
1287 : /*
1288 : * SCHED_IDLE tasks get minimal weight:
1289 : */
1290 10 : if (task_has_idle_policy(p)) {
1291 0 : load->weight = scale_load(WEIGHT_IDLEPRIO);
1292 0 : load->inv_weight = WMULT_IDLEPRIO;
1293 0 : return;
1294 : }
1295 :
1296 : /*
1297 : * SCHED_OTHER tasks have to update their load when changing their
1298 : * weight
1299 : */
1300 5 : if (update_load && p->sched_class == &fair_sched_class) {
1301 4 : reweight_task(p, prio);
1302 : } else {
1303 1 : load->weight = scale_load(sched_prio_to_weight[prio]);
1304 1 : load->inv_weight = sched_prio_to_wmult[prio];
1305 : }
1306 : }
1307 :
1308 : #ifdef CONFIG_UCLAMP_TASK
1309 : /*
1310 : * Serializes updates of utilization clamp values
1311 : *
1312 : * The (slow-path) user-space triggers utilization clamp value updates which
1313 : * can require updates on (fast-path) scheduler's data structures used to
1314 : * support enqueue/dequeue operations.
1315 : * While the per-CPU rq lock protects fast-path update operations, user-space
1316 : * requests are serialized using a mutex to reduce the risk of conflicting
1317 : * updates or API abuses.
1318 : */
1319 : static DEFINE_MUTEX(uclamp_mutex);
1320 :
1321 : /* Max allowed minimum utilization */
1322 : unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
1323 :
1324 : /* Max allowed maximum utilization */
1325 : unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
1326 :
1327 : /*
1328 : * By default RT tasks run at the maximum performance point/capacity of the
1329 : * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
1330 : * SCHED_CAPACITY_SCALE.
1331 : *
1332 : * This knob allows admins to change the default behavior when uclamp is being
1333 : * used. In battery powered devices, particularly, running at the maximum
1334 : * capacity and frequency will increase energy consumption and shorten the
1335 : * battery life.
1336 : *
1337 : * This knob only affects RT tasks that their uclamp_se->user_defined == false.
1338 : *
1339 : * This knob will not override the system default sched_util_clamp_min defined
1340 : * above.
1341 : */
1342 : unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
1343 :
1344 : /* All clamps are required to be less or equal than these values */
1345 : static struct uclamp_se uclamp_default[UCLAMP_CNT];
1346 :
1347 : /*
1348 : * This static key is used to reduce the uclamp overhead in the fast path. It
1349 : * primarily disables the call to uclamp_rq_{inc, dec}() in
1350 : * enqueue/dequeue_task().
1351 : *
1352 : * This allows users to continue to enable uclamp in their kernel config with
1353 : * minimum uclamp overhead in the fast path.
1354 : *
1355 : * As soon as userspace modifies any of the uclamp knobs, the static key is
1356 : * enabled, since we have an actual users that make use of uclamp
1357 : * functionality.
1358 : *
1359 : * The knobs that would enable this static key are:
1360 : *
1361 : * * A task modifying its uclamp value with sched_setattr().
1362 : * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
1363 : * * An admin modifying the cgroup cpu.uclamp.{min, max}
1364 : */
1365 : DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
1366 :
1367 : /* Integer rounded range for each bucket */
1368 : #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
1369 :
1370 : #define for_each_clamp_id(clamp_id) \
1371 : for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
1372 :
1373 : static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
1374 : {
1375 : return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
1376 : }
1377 :
1378 : static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
1379 : {
1380 : if (clamp_id == UCLAMP_MIN)
1381 : return 0;
1382 : return SCHED_CAPACITY_SCALE;
1383 : }
1384 :
1385 : static inline void uclamp_se_set(struct uclamp_se *uc_se,
1386 : unsigned int value, bool user_defined)
1387 : {
1388 : uc_se->value = value;
1389 : uc_se->bucket_id = uclamp_bucket_id(value);
1390 : uc_se->user_defined = user_defined;
1391 : }
1392 :
1393 : static inline unsigned int
1394 : uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
1395 : unsigned int clamp_value)
1396 : {
1397 : /*
1398 : * Avoid blocked utilization pushing up the frequency when we go
1399 : * idle (which drops the max-clamp) by retaining the last known
1400 : * max-clamp.
1401 : */
1402 : if (clamp_id == UCLAMP_MAX) {
1403 : rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
1404 : return clamp_value;
1405 : }
1406 :
1407 : return uclamp_none(UCLAMP_MIN);
1408 : }
1409 :
1410 : static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
1411 : unsigned int clamp_value)
1412 : {
1413 : /* Reset max-clamp retention only on idle exit */
1414 : if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
1415 : return;
1416 :
1417 : WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
1418 : }
1419 :
1420 : static inline
1421 : unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
1422 : unsigned int clamp_value)
1423 : {
1424 : struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
1425 : int bucket_id = UCLAMP_BUCKETS - 1;
1426 :
1427 : /*
1428 : * Since both min and max clamps are max aggregated, find the
1429 : * top most bucket with tasks in.
1430 : */
1431 : for ( ; bucket_id >= 0; bucket_id--) {
1432 : if (!bucket[bucket_id].tasks)
1433 : continue;
1434 : return bucket[bucket_id].value;
1435 : }
1436 :
1437 : /* No tasks -- default clamp values */
1438 : return uclamp_idle_value(rq, clamp_id, clamp_value);
1439 : }
1440 :
1441 : static void __uclamp_update_util_min_rt_default(struct task_struct *p)
1442 : {
1443 : unsigned int default_util_min;
1444 : struct uclamp_se *uc_se;
1445 :
1446 : lockdep_assert_held(&p->pi_lock);
1447 :
1448 : uc_se = &p->uclamp_req[UCLAMP_MIN];
1449 :
1450 : /* Only sync if user didn't override the default */
1451 : if (uc_se->user_defined)
1452 : return;
1453 :
1454 : default_util_min = sysctl_sched_uclamp_util_min_rt_default;
1455 : uclamp_se_set(uc_se, default_util_min, false);
1456 : }
1457 :
1458 : static void uclamp_update_util_min_rt_default(struct task_struct *p)
1459 : {
1460 : struct rq_flags rf;
1461 : struct rq *rq;
1462 :
1463 : if (!rt_task(p))
1464 : return;
1465 :
1466 : /* Protect updates to p->uclamp_* */
1467 : rq = task_rq_lock(p, &rf);
1468 : __uclamp_update_util_min_rt_default(p);
1469 : task_rq_unlock(rq, p, &rf);
1470 : }
1471 :
1472 : static void uclamp_sync_util_min_rt_default(void)
1473 : {
1474 : struct task_struct *g, *p;
1475 :
1476 : /*
1477 : * copy_process() sysctl_uclamp
1478 : * uclamp_min_rt = X;
1479 : * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
1480 : * // link thread smp_mb__after_spinlock()
1481 : * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
1482 : * sched_post_fork() for_each_process_thread()
1483 : * __uclamp_sync_rt() __uclamp_sync_rt()
1484 : *
1485 : * Ensures that either sched_post_fork() will observe the new
1486 : * uclamp_min_rt or for_each_process_thread() will observe the new
1487 : * task.
1488 : */
1489 : read_lock(&tasklist_lock);
1490 : smp_mb__after_spinlock();
1491 : read_unlock(&tasklist_lock);
1492 :
1493 : rcu_read_lock();
1494 : for_each_process_thread(g, p)
1495 : uclamp_update_util_min_rt_default(p);
1496 : rcu_read_unlock();
1497 : }
1498 :
1499 : static inline struct uclamp_se
1500 : uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
1501 : {
1502 : /* Copy by value as we could modify it */
1503 : struct uclamp_se uc_req = p->uclamp_req[clamp_id];
1504 : #ifdef CONFIG_UCLAMP_TASK_GROUP
1505 : unsigned int tg_min, tg_max, value;
1506 :
1507 : /*
1508 : * Tasks in autogroups or root task group will be
1509 : * restricted by system defaults.
1510 : */
1511 : if (task_group_is_autogroup(task_group(p)))
1512 : return uc_req;
1513 : if (task_group(p) == &root_task_group)
1514 : return uc_req;
1515 :
1516 : tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
1517 : tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
1518 : value = uc_req.value;
1519 : value = clamp(value, tg_min, tg_max);
1520 : uclamp_se_set(&uc_req, value, false);
1521 : #endif
1522 :
1523 : return uc_req;
1524 : }
1525 :
1526 : /*
1527 : * The effective clamp bucket index of a task depends on, by increasing
1528 : * priority:
1529 : * - the task specific clamp value, when explicitly requested from userspace
1530 : * - the task group effective clamp value, for tasks not either in the root
1531 : * group or in an autogroup
1532 : * - the system default clamp value, defined by the sysadmin
1533 : */
1534 : static inline struct uclamp_se
1535 : uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
1536 : {
1537 : struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
1538 : struct uclamp_se uc_max = uclamp_default[clamp_id];
1539 :
1540 : /* System default restrictions always apply */
1541 : if (unlikely(uc_req.value > uc_max.value))
1542 : return uc_max;
1543 :
1544 : return uc_req;
1545 : }
1546 :
1547 : unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
1548 : {
1549 : struct uclamp_se uc_eff;
1550 :
1551 : /* Task currently refcounted: use back-annotated (effective) value */
1552 : if (p->uclamp[clamp_id].active)
1553 : return (unsigned long)p->uclamp[clamp_id].value;
1554 :
1555 : uc_eff = uclamp_eff_get(p, clamp_id);
1556 :
1557 : return (unsigned long)uc_eff.value;
1558 : }
1559 :
1560 : /*
1561 : * When a task is enqueued on a rq, the clamp bucket currently defined by the
1562 : * task's uclamp::bucket_id is refcounted on that rq. This also immediately
1563 : * updates the rq's clamp value if required.
1564 : *
1565 : * Tasks can have a task-specific value requested from user-space, track
1566 : * within each bucket the maximum value for tasks refcounted in it.
1567 : * This "local max aggregation" allows to track the exact "requested" value
1568 : * for each bucket when all its RUNNABLE tasks require the same clamp.
1569 : */
1570 : static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
1571 : enum uclamp_id clamp_id)
1572 : {
1573 : struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1574 : struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1575 : struct uclamp_bucket *bucket;
1576 :
1577 : lockdep_assert_rq_held(rq);
1578 :
1579 : /* Update task effective clamp */
1580 : p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
1581 :
1582 : bucket = &uc_rq->bucket[uc_se->bucket_id];
1583 : bucket->tasks++;
1584 : uc_se->active = true;
1585 :
1586 : uclamp_idle_reset(rq, clamp_id, uc_se->value);
1587 :
1588 : /*
1589 : * Local max aggregation: rq buckets always track the max
1590 : * "requested" clamp value of its RUNNABLE tasks.
1591 : */
1592 : if (bucket->tasks == 1 || uc_se->value > bucket->value)
1593 : bucket->value = uc_se->value;
1594 :
1595 : if (uc_se->value > READ_ONCE(uc_rq->value))
1596 : WRITE_ONCE(uc_rq->value, uc_se->value);
1597 : }
1598 :
1599 : /*
1600 : * When a task is dequeued from a rq, the clamp bucket refcounted by the task
1601 : * is released. If this is the last task reference counting the rq's max
1602 : * active clamp value, then the rq's clamp value is updated.
1603 : *
1604 : * Both refcounted tasks and rq's cached clamp values are expected to be
1605 : * always valid. If it's detected they are not, as defensive programming,
1606 : * enforce the expected state and warn.
1607 : */
1608 : static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
1609 : enum uclamp_id clamp_id)
1610 : {
1611 : struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1612 : struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1613 : struct uclamp_bucket *bucket;
1614 : unsigned int bkt_clamp;
1615 : unsigned int rq_clamp;
1616 :
1617 : lockdep_assert_rq_held(rq);
1618 :
1619 : /*
1620 : * If sched_uclamp_used was enabled after task @p was enqueued,
1621 : * we could end up with unbalanced call to uclamp_rq_dec_id().
1622 : *
1623 : * In this case the uc_se->active flag should be false since no uclamp
1624 : * accounting was performed at enqueue time and we can just return
1625 : * here.
1626 : *
1627 : * Need to be careful of the following enqueue/dequeue ordering
1628 : * problem too
1629 : *
1630 : * enqueue(taskA)
1631 : * // sched_uclamp_used gets enabled
1632 : * enqueue(taskB)
1633 : * dequeue(taskA)
1634 : * // Must not decrement bucket->tasks here
1635 : * dequeue(taskB)
1636 : *
1637 : * where we could end up with stale data in uc_se and
1638 : * bucket[uc_se->bucket_id].
1639 : *
1640 : * The following check here eliminates the possibility of such race.
1641 : */
1642 : if (unlikely(!uc_se->active))
1643 : return;
1644 :
1645 : bucket = &uc_rq->bucket[uc_se->bucket_id];
1646 :
1647 : SCHED_WARN_ON(!bucket->tasks);
1648 : if (likely(bucket->tasks))
1649 : bucket->tasks--;
1650 :
1651 : uc_se->active = false;
1652 :
1653 : /*
1654 : * Keep "local max aggregation" simple and accept to (possibly)
1655 : * overboost some RUNNABLE tasks in the same bucket.
1656 : * The rq clamp bucket value is reset to its base value whenever
1657 : * there are no more RUNNABLE tasks refcounting it.
1658 : */
1659 : if (likely(bucket->tasks))
1660 : return;
1661 :
1662 : rq_clamp = READ_ONCE(uc_rq->value);
1663 : /*
1664 : * Defensive programming: this should never happen. If it happens,
1665 : * e.g. due to future modification, warn and fixup the expected value.
1666 : */
1667 : SCHED_WARN_ON(bucket->value > rq_clamp);
1668 : if (bucket->value >= rq_clamp) {
1669 : bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
1670 : WRITE_ONCE(uc_rq->value, bkt_clamp);
1671 : }
1672 : }
1673 :
1674 : static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
1675 : {
1676 : enum uclamp_id clamp_id;
1677 :
1678 : /*
1679 : * Avoid any overhead until uclamp is actually used by the userspace.
1680 : *
1681 : * The condition is constructed such that a NOP is generated when
1682 : * sched_uclamp_used is disabled.
1683 : */
1684 : if (!static_branch_unlikely(&sched_uclamp_used))
1685 : return;
1686 :
1687 : if (unlikely(!p->sched_class->uclamp_enabled))
1688 : return;
1689 :
1690 : for_each_clamp_id(clamp_id)
1691 : uclamp_rq_inc_id(rq, p, clamp_id);
1692 :
1693 : /* Reset clamp idle holding when there is one RUNNABLE task */
1694 : if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
1695 : rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1696 : }
1697 :
1698 : static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
1699 : {
1700 : enum uclamp_id clamp_id;
1701 :
1702 : /*
1703 : * Avoid any overhead until uclamp is actually used by the userspace.
1704 : *
1705 : * The condition is constructed such that a NOP is generated when
1706 : * sched_uclamp_used is disabled.
1707 : */
1708 : if (!static_branch_unlikely(&sched_uclamp_used))
1709 : return;
1710 :
1711 : if (unlikely(!p->sched_class->uclamp_enabled))
1712 : return;
1713 :
1714 : for_each_clamp_id(clamp_id)
1715 : uclamp_rq_dec_id(rq, p, clamp_id);
1716 : }
1717 :
1718 : static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
1719 : enum uclamp_id clamp_id)
1720 : {
1721 : if (!p->uclamp[clamp_id].active)
1722 : return;
1723 :
1724 : uclamp_rq_dec_id(rq, p, clamp_id);
1725 : uclamp_rq_inc_id(rq, p, clamp_id);
1726 :
1727 : /*
1728 : * Make sure to clear the idle flag if we've transiently reached 0
1729 : * active tasks on rq.
1730 : */
1731 : if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
1732 : rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1733 : }
1734 :
1735 : static inline void
1736 : uclamp_update_active(struct task_struct *p)
1737 : {
1738 : enum uclamp_id clamp_id;
1739 : struct rq_flags rf;
1740 : struct rq *rq;
1741 :
1742 : /*
1743 : * Lock the task and the rq where the task is (or was) queued.
1744 : *
1745 : * We might lock the (previous) rq of a !RUNNABLE task, but that's the
1746 : * price to pay to safely serialize util_{min,max} updates with
1747 : * enqueues, dequeues and migration operations.
1748 : * This is the same locking schema used by __set_cpus_allowed_ptr().
1749 : */
1750 : rq = task_rq_lock(p, &rf);
1751 :
1752 : /*
1753 : * Setting the clamp bucket is serialized by task_rq_lock().
1754 : * If the task is not yet RUNNABLE and its task_struct is not
1755 : * affecting a valid clamp bucket, the next time it's enqueued,
1756 : * it will already see the updated clamp bucket value.
1757 : */
1758 : for_each_clamp_id(clamp_id)
1759 : uclamp_rq_reinc_id(rq, p, clamp_id);
1760 :
1761 : task_rq_unlock(rq, p, &rf);
1762 : }
1763 :
1764 : #ifdef CONFIG_UCLAMP_TASK_GROUP
1765 : static inline void
1766 : uclamp_update_active_tasks(struct cgroup_subsys_state *css)
1767 : {
1768 : struct css_task_iter it;
1769 : struct task_struct *p;
1770 :
1771 : css_task_iter_start(css, 0, &it);
1772 : while ((p = css_task_iter_next(&it)))
1773 : uclamp_update_active(p);
1774 : css_task_iter_end(&it);
1775 : }
1776 :
1777 : static void cpu_util_update_eff(struct cgroup_subsys_state *css);
1778 : static void uclamp_update_root_tg(void)
1779 : {
1780 : struct task_group *tg = &root_task_group;
1781 :
1782 : uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
1783 : sysctl_sched_uclamp_util_min, false);
1784 : uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
1785 : sysctl_sched_uclamp_util_max, false);
1786 :
1787 : rcu_read_lock();
1788 : cpu_util_update_eff(&root_task_group.css);
1789 : rcu_read_unlock();
1790 : }
1791 : #else
1792 : static void uclamp_update_root_tg(void) { }
1793 : #endif
1794 :
1795 : int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1796 : void *buffer, size_t *lenp, loff_t *ppos)
1797 : {
1798 : bool update_root_tg = false;
1799 : int old_min, old_max, old_min_rt;
1800 : int result;
1801 :
1802 : mutex_lock(&uclamp_mutex);
1803 : old_min = sysctl_sched_uclamp_util_min;
1804 : old_max = sysctl_sched_uclamp_util_max;
1805 : old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
1806 :
1807 : result = proc_dointvec(table, write, buffer, lenp, ppos);
1808 : if (result)
1809 : goto undo;
1810 : if (!write)
1811 : goto done;
1812 :
1813 : if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1814 : sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
1815 : sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1816 :
1817 : result = -EINVAL;
1818 : goto undo;
1819 : }
1820 :
1821 : if (old_min != sysctl_sched_uclamp_util_min) {
1822 : uclamp_se_set(&uclamp_default[UCLAMP_MIN],
1823 : sysctl_sched_uclamp_util_min, false);
1824 : update_root_tg = true;
1825 : }
1826 : if (old_max != sysctl_sched_uclamp_util_max) {
1827 : uclamp_se_set(&uclamp_default[UCLAMP_MAX],
1828 : sysctl_sched_uclamp_util_max, false);
1829 : update_root_tg = true;
1830 : }
1831 :
1832 : if (update_root_tg) {
1833 : static_branch_enable(&sched_uclamp_used);
1834 : uclamp_update_root_tg();
1835 : }
1836 :
1837 : if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1838 : static_branch_enable(&sched_uclamp_used);
1839 : uclamp_sync_util_min_rt_default();
1840 : }
1841 :
1842 : /*
1843 : * We update all RUNNABLE tasks only when task groups are in use.
1844 : * Otherwise, keep it simple and do just a lazy update at each next
1845 : * task enqueue time.
1846 : */
1847 :
1848 : goto done;
1849 :
1850 : undo:
1851 : sysctl_sched_uclamp_util_min = old_min;
1852 : sysctl_sched_uclamp_util_max = old_max;
1853 : sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
1854 : done:
1855 : mutex_unlock(&uclamp_mutex);
1856 :
1857 : return result;
1858 : }
1859 :
1860 : static int uclamp_validate(struct task_struct *p,
1861 : const struct sched_attr *attr)
1862 : {
1863 : int util_min = p->uclamp_req[UCLAMP_MIN].value;
1864 : int util_max = p->uclamp_req[UCLAMP_MAX].value;
1865 :
1866 : if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1867 : util_min = attr->sched_util_min;
1868 :
1869 : if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
1870 : return -EINVAL;
1871 : }
1872 :
1873 : if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1874 : util_max = attr->sched_util_max;
1875 :
1876 : if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
1877 : return -EINVAL;
1878 : }
1879 :
1880 : if (util_min != -1 && util_max != -1 && util_min > util_max)
1881 : return -EINVAL;
1882 :
1883 : /*
1884 : * We have valid uclamp attributes; make sure uclamp is enabled.
1885 : *
1886 : * We need to do that here, because enabling static branches is a
1887 : * blocking operation which obviously cannot be done while holding
1888 : * scheduler locks.
1889 : */
1890 : static_branch_enable(&sched_uclamp_used);
1891 :
1892 : return 0;
1893 : }
1894 :
1895 : static bool uclamp_reset(const struct sched_attr *attr,
1896 : enum uclamp_id clamp_id,
1897 : struct uclamp_se *uc_se)
1898 : {
1899 : /* Reset on sched class change for a non user-defined clamp value. */
1900 : if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
1901 : !uc_se->user_defined)
1902 : return true;
1903 :
1904 : /* Reset on sched_util_{min,max} == -1. */
1905 : if (clamp_id == UCLAMP_MIN &&
1906 : attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1907 : attr->sched_util_min == -1) {
1908 : return true;
1909 : }
1910 :
1911 : if (clamp_id == UCLAMP_MAX &&
1912 : attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1913 : attr->sched_util_max == -1) {
1914 : return true;
1915 : }
1916 :
1917 : return false;
1918 : }
1919 :
1920 : static void __setscheduler_uclamp(struct task_struct *p,
1921 : const struct sched_attr *attr)
1922 : {
1923 : enum uclamp_id clamp_id;
1924 :
1925 : for_each_clamp_id(clamp_id) {
1926 : struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1927 : unsigned int value;
1928 :
1929 : if (!uclamp_reset(attr, clamp_id, uc_se))
1930 : continue;
1931 :
1932 : /*
1933 : * RT by default have a 100% boost value that could be modified
1934 : * at runtime.
1935 : */
1936 : if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1937 : value = sysctl_sched_uclamp_util_min_rt_default;
1938 : else
1939 : value = uclamp_none(clamp_id);
1940 :
1941 : uclamp_se_set(uc_se, value, false);
1942 :
1943 : }
1944 :
1945 : if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
1946 : return;
1947 :
1948 : if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1949 : attr->sched_util_min != -1) {
1950 : uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
1951 : attr->sched_util_min, true);
1952 : }
1953 :
1954 : if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1955 : attr->sched_util_max != -1) {
1956 : uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
1957 : attr->sched_util_max, true);
1958 : }
1959 : }
1960 :
1961 : static void uclamp_fork(struct task_struct *p)
1962 : {
1963 : enum uclamp_id clamp_id;
1964 :
1965 : /*
1966 : * We don't need to hold task_rq_lock() when updating p->uclamp_* here
1967 : * as the task is still at its early fork stages.
1968 : */
1969 : for_each_clamp_id(clamp_id)
1970 : p->uclamp[clamp_id].active = false;
1971 :
1972 : if (likely(!p->sched_reset_on_fork))
1973 : return;
1974 :
1975 : for_each_clamp_id(clamp_id) {
1976 : uclamp_se_set(&p->uclamp_req[clamp_id],
1977 : uclamp_none(clamp_id), false);
1978 : }
1979 : }
1980 :
1981 : static void uclamp_post_fork(struct task_struct *p)
1982 : {
1983 : uclamp_update_util_min_rt_default(p);
1984 : }
1985 :
1986 : static void __init init_uclamp_rq(struct rq *rq)
1987 : {
1988 : enum uclamp_id clamp_id;
1989 : struct uclamp_rq *uc_rq = rq->uclamp;
1990 :
1991 : for_each_clamp_id(clamp_id) {
1992 : uc_rq[clamp_id] = (struct uclamp_rq) {
1993 : .value = uclamp_none(clamp_id)
1994 : };
1995 : }
1996 :
1997 : rq->uclamp_flags = UCLAMP_FLAG_IDLE;
1998 : }
1999 :
2000 : static void __init init_uclamp(void)
2001 : {
2002 : struct uclamp_se uc_max = {};
2003 : enum uclamp_id clamp_id;
2004 : int cpu;
2005 :
2006 : for_each_possible_cpu(cpu)
2007 : init_uclamp_rq(cpu_rq(cpu));
2008 :
2009 : for_each_clamp_id(clamp_id) {
2010 : uclamp_se_set(&init_task.uclamp_req[clamp_id],
2011 : uclamp_none(clamp_id), false);
2012 : }
2013 :
2014 : /* System defaults allow max clamp values for both indexes */
2015 : uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
2016 : for_each_clamp_id(clamp_id) {
2017 : uclamp_default[clamp_id] = uc_max;
2018 : #ifdef CONFIG_UCLAMP_TASK_GROUP
2019 : root_task_group.uclamp_req[clamp_id] = uc_max;
2020 : root_task_group.uclamp[clamp_id] = uc_max;
2021 : #endif
2022 : }
2023 : }
2024 :
2025 : #else /* CONFIG_UCLAMP_TASK */
2026 : static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
2027 : static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
2028 : static inline int uclamp_validate(struct task_struct *p,
2029 : const struct sched_attr *attr)
2030 : {
2031 : return -EOPNOTSUPP;
2032 : }
2033 : static void __setscheduler_uclamp(struct task_struct *p,
2034 : const struct sched_attr *attr) { }
2035 : static inline void uclamp_fork(struct task_struct *p) { }
2036 : static inline void uclamp_post_fork(struct task_struct *p) { }
2037 : static inline void init_uclamp(void) { }
2038 : #endif /* CONFIG_UCLAMP_TASK */
2039 :
2040 0 : bool sched_task_on_rq(struct task_struct *p)
2041 : {
2042 0 : return task_on_rq_queued(p);
2043 : }
2044 :
2045 0 : unsigned long get_wchan(struct task_struct *p)
2046 : {
2047 0 : unsigned long ip = 0;
2048 : unsigned int state;
2049 :
2050 0 : if (!p || p == current)
2051 : return 0;
2052 :
2053 : /* Only get wchan if task is blocked and we can keep it that way. */
2054 0 : raw_spin_lock_irq(&p->pi_lock);
2055 0 : state = READ_ONCE(p->__state);
2056 0 : smp_rmb(); /* see try_to_wake_up() */
2057 0 : if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq)
2058 0 : ip = __get_wchan(p);
2059 0 : raw_spin_unlock_irq(&p->pi_lock);
2060 :
2061 0 : return ip;
2062 : }
2063 :
2064 617 : static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
2065 : {
2066 617 : if (!(flags & ENQUEUE_NOCLOCK))
2067 0 : update_rq_clock(rq);
2068 :
2069 : if (!(flags & ENQUEUE_RESTORE)) {
2070 : sched_info_enqueue(rq, p);
2071 : psi_enqueue(p, flags & ENQUEUE_WAKEUP);
2072 : }
2073 :
2074 620 : uclamp_rq_inc(rq, p);
2075 620 : p->sched_class->enqueue_task(rq, p, flags);
2076 :
2077 620 : if (sched_core_enabled(rq))
2078 : sched_core_enqueue(rq, p);
2079 617 : }
2080 :
2081 615 : static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
2082 : {
2083 618 : if (sched_core_enabled(rq))
2084 : sched_core_dequeue(rq, p, flags);
2085 :
2086 615 : if (!(flags & DEQUEUE_NOCLOCK))
2087 0 : update_rq_clock(rq);
2088 :
2089 : if (!(flags & DEQUEUE_SAVE)) {
2090 : sched_info_dequeue(rq, p);
2091 : psi_dequeue(p, flags & DEQUEUE_SLEEP);
2092 : }
2093 :
2094 618 : uclamp_rq_dec(rq, p);
2095 618 : p->sched_class->dequeue_task(rq, p, flags);
2096 615 : }
2097 :
2098 0 : void activate_task(struct rq *rq, struct task_struct *p, int flags)
2099 : {
2100 617 : enqueue_task(rq, p, flags);
2101 :
2102 617 : p->on_rq = TASK_ON_RQ_QUEUED;
2103 0 : }
2104 :
2105 0 : void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
2106 : {
2107 615 : p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
2108 :
2109 615 : dequeue_task(rq, p, flags);
2110 0 : }
2111 :
2112 : static inline int __normal_prio(int policy, int rt_prio, int nice)
2113 : {
2114 : int prio;
2115 :
2116 4 : if (dl_policy(policy))
2117 : prio = MAX_DL_PRIO - 1;
2118 4 : else if (rt_policy(policy))
2119 0 : prio = MAX_RT_PRIO - 1 - rt_prio;
2120 : else
2121 0 : prio = NICE_TO_PRIO(nice);
2122 :
2123 : return prio;
2124 : }
2125 :
2126 : /*
2127 : * Calculate the expected normal priority: i.e. priority
2128 : * without taking RT-inheritance into account. Might be
2129 : * boosted by interactivity modifiers. Changes upon fork,
2130 : * setprio syscalls, and whenever the interactivity
2131 : * estimator recalculates.
2132 : */
2133 : static inline int normal_prio(struct task_struct *p)
2134 : {
2135 8 : return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
2136 : }
2137 :
2138 : /*
2139 : * Calculate the current priority, i.e. the priority
2140 : * taken into account by the scheduler. This value might
2141 : * be boosted by RT tasks, or might be boosted by
2142 : * interactivity modifiers. Will be RT if the task got
2143 : * RT-boosted. If not then it returns p->normal_prio.
2144 : */
2145 : static int effective_prio(struct task_struct *p)
2146 : {
2147 8 : p->normal_prio = normal_prio(p);
2148 : /*
2149 : * If we are RT tasks or we were boosted to RT priority,
2150 : * keep the priority unchanged. Otherwise, update priority
2151 : * to the normal priority:
2152 : */
2153 8 : if (!rt_prio(p->prio))
2154 : return p->normal_prio;
2155 : return p->prio;
2156 : }
2157 :
2158 : /**
2159 : * task_curr - is this task currently executing on a CPU?
2160 : * @p: the task in question.
2161 : *
2162 : * Return: 1 if the task is currently executing. 0 otherwise.
2163 : */
2164 0 : inline int task_curr(const struct task_struct *p)
2165 : {
2166 0 : return cpu_curr(task_cpu(p)) == p;
2167 : }
2168 :
2169 : /*
2170 : * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
2171 : * use the balance_callback list if you want balancing.
2172 : *
2173 : * this means any call to check_class_changed() must be followed by a call to
2174 : * balance_callback().
2175 : */
2176 0 : static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2177 : const struct sched_class *prev_class,
2178 : int oldprio)
2179 : {
2180 0 : if (prev_class != p->sched_class) {
2181 0 : if (prev_class->switched_from)
2182 0 : prev_class->switched_from(rq, p);
2183 :
2184 0 : p->sched_class->switched_to(rq, p);
2185 0 : } else if (oldprio != p->prio || dl_task(p))
2186 0 : p->sched_class->prio_changed(rq, p, oldprio);
2187 0 : }
2188 :
2189 617 : void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2190 : {
2191 617 : if (p->sched_class == rq->curr->sched_class)
2192 614 : rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2193 3 : else if (p->sched_class > rq->curr->sched_class)
2194 : resched_curr(rq);
2195 :
2196 : /*
2197 : * A queue event has occurred, and we're going to schedule. In
2198 : * this case, we can save a useless back to back clock update.
2199 : */
2200 1234 : if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
2201 301 : rq_clock_skip_update(rq);
2202 617 : }
2203 :
2204 : #ifdef CONFIG_SMP
2205 :
2206 : static void
2207 : __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
2208 :
2209 : static int __set_cpus_allowed_ptr(struct task_struct *p,
2210 : const struct cpumask *new_mask,
2211 : u32 flags);
2212 :
2213 : static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
2214 : {
2215 : if (likely(!p->migration_disabled))
2216 : return;
2217 :
2218 : if (p->cpus_ptr != &p->cpus_mask)
2219 : return;
2220 :
2221 : /*
2222 : * Violates locking rules! see comment in __do_set_cpus_allowed().
2223 : */
2224 : __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
2225 : }
2226 :
2227 : void migrate_disable(void)
2228 : {
2229 : struct task_struct *p = current;
2230 :
2231 : if (p->migration_disabled) {
2232 : p->migration_disabled++;
2233 : return;
2234 : }
2235 :
2236 : preempt_disable();
2237 : this_rq()->nr_pinned++;
2238 : p->migration_disabled = 1;
2239 : preempt_enable();
2240 : }
2241 : EXPORT_SYMBOL_GPL(migrate_disable);
2242 :
2243 : void migrate_enable(void)
2244 : {
2245 : struct task_struct *p = current;
2246 :
2247 : if (p->migration_disabled > 1) {
2248 : p->migration_disabled--;
2249 : return;
2250 : }
2251 :
2252 : if (WARN_ON_ONCE(!p->migration_disabled))
2253 : return;
2254 :
2255 : /*
2256 : * Ensure stop_task runs either before or after this, and that
2257 : * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
2258 : */
2259 : preempt_disable();
2260 : if (p->cpus_ptr != &p->cpus_mask)
2261 : __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
2262 : /*
2263 : * Mustn't clear migration_disabled() until cpus_ptr points back at the
2264 : * regular cpus_mask, otherwise things that race (eg.
2265 : * select_fallback_rq) get confused.
2266 : */
2267 : barrier();
2268 : p->migration_disabled = 0;
2269 : this_rq()->nr_pinned--;
2270 : preempt_enable();
2271 : }
2272 : EXPORT_SYMBOL_GPL(migrate_enable);
2273 :
2274 : static inline bool rq_has_pinned_tasks(struct rq *rq)
2275 : {
2276 : return rq->nr_pinned;
2277 : }
2278 :
2279 : /*
2280 : * Per-CPU kthreads are allowed to run on !active && online CPUs, see
2281 : * __set_cpus_allowed_ptr() and select_fallback_rq().
2282 : */
2283 : static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
2284 : {
2285 : /* When not in the task's cpumask, no point in looking further. */
2286 : if (!cpumask_test_cpu(cpu, p->cpus_ptr))
2287 : return false;
2288 :
2289 : /* migrate_disabled() must be allowed to finish. */
2290 : if (is_migration_disabled(p))
2291 : return cpu_online(cpu);
2292 :
2293 : /* Non kernel threads are not allowed during either online or offline. */
2294 : if (!(p->flags & PF_KTHREAD))
2295 : return cpu_active(cpu) && task_cpu_possible(cpu, p);
2296 :
2297 : /* KTHREAD_IS_PER_CPU is always allowed. */
2298 : if (kthread_is_per_cpu(p))
2299 : return cpu_online(cpu);
2300 :
2301 : /* Regular kernel threads don't get to stay during offline. */
2302 : if (cpu_dying(cpu))
2303 : return false;
2304 :
2305 : /* But are allowed during online. */
2306 : return cpu_online(cpu);
2307 : }
2308 :
2309 : /*
2310 : * This is how migration works:
2311 : *
2312 : * 1) we invoke migration_cpu_stop() on the target CPU using
2313 : * stop_one_cpu().
2314 : * 2) stopper starts to run (implicitly forcing the migrated thread
2315 : * off the CPU)
2316 : * 3) it checks whether the migrated task is still in the wrong runqueue.
2317 : * 4) if it's in the wrong runqueue then the migration thread removes
2318 : * it and puts it into the right queue.
2319 : * 5) stopper completes and stop_one_cpu() returns and the migration
2320 : * is done.
2321 : */
2322 :
2323 : /*
2324 : * move_queued_task - move a queued task to new rq.
2325 : *
2326 : * Returns (locked) new rq. Old rq's lock is released.
2327 : */
2328 : static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
2329 : struct task_struct *p, int new_cpu)
2330 : {
2331 : lockdep_assert_rq_held(rq);
2332 :
2333 : deactivate_task(rq, p, DEQUEUE_NOCLOCK);
2334 : set_task_cpu(p, new_cpu);
2335 : rq_unlock(rq, rf);
2336 :
2337 : rq = cpu_rq(new_cpu);
2338 :
2339 : rq_lock(rq, rf);
2340 : BUG_ON(task_cpu(p) != new_cpu);
2341 : activate_task(rq, p, 0);
2342 : check_preempt_curr(rq, p, 0);
2343 :
2344 : return rq;
2345 : }
2346 :
2347 : struct migration_arg {
2348 : struct task_struct *task;
2349 : int dest_cpu;
2350 : struct set_affinity_pending *pending;
2351 : };
2352 :
2353 : /*
2354 : * @refs: number of wait_for_completion()
2355 : * @stop_pending: is @stop_work in use
2356 : */
2357 : struct set_affinity_pending {
2358 : refcount_t refs;
2359 : unsigned int stop_pending;
2360 : struct completion done;
2361 : struct cpu_stop_work stop_work;
2362 : struct migration_arg arg;
2363 : };
2364 :
2365 : /*
2366 : * Move (not current) task off this CPU, onto the destination CPU. We're doing
2367 : * this because either it can't run here any more (set_cpus_allowed()
2368 : * away from this CPU, or CPU going down), or because we're
2369 : * attempting to rebalance this task on exec (sched_exec).
2370 : *
2371 : * So we race with normal scheduler movements, but that's OK, as long
2372 : * as the task is no longer on this CPU.
2373 : */
2374 : static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
2375 : struct task_struct *p, int dest_cpu)
2376 : {
2377 : /* Affinity changed (again). */
2378 : if (!is_cpu_allowed(p, dest_cpu))
2379 : return rq;
2380 :
2381 : update_rq_clock(rq);
2382 : rq = move_queued_task(rq, rf, p, dest_cpu);
2383 :
2384 : return rq;
2385 : }
2386 :
2387 : /*
2388 : * migration_cpu_stop - this will be executed by a highprio stopper thread
2389 : * and performs thread migration by bumping thread off CPU then
2390 : * 'pushing' onto another runqueue.
2391 : */
2392 : static int migration_cpu_stop(void *data)
2393 : {
2394 : struct migration_arg *arg = data;
2395 : struct set_affinity_pending *pending = arg->pending;
2396 : struct task_struct *p = arg->task;
2397 : struct rq *rq = this_rq();
2398 : bool complete = false;
2399 : struct rq_flags rf;
2400 :
2401 : /*
2402 : * The original target CPU might have gone down and we might
2403 : * be on another CPU but it doesn't matter.
2404 : */
2405 : local_irq_save(rf.flags);
2406 : /*
2407 : * We need to explicitly wake pending tasks before running
2408 : * __migrate_task() such that we will not miss enforcing cpus_ptr
2409 : * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
2410 : */
2411 : flush_smp_call_function_from_idle();
2412 :
2413 : raw_spin_lock(&p->pi_lock);
2414 : rq_lock(rq, &rf);
2415 :
2416 : /*
2417 : * If we were passed a pending, then ->stop_pending was set, thus
2418 : * p->migration_pending must have remained stable.
2419 : */
2420 : WARN_ON_ONCE(pending && pending != p->migration_pending);
2421 :
2422 : /*
2423 : * If task_rq(p) != rq, it cannot be migrated here, because we're
2424 : * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
2425 : * we're holding p->pi_lock.
2426 : */
2427 : if (task_rq(p) == rq) {
2428 : if (is_migration_disabled(p))
2429 : goto out;
2430 :
2431 : if (pending) {
2432 : p->migration_pending = NULL;
2433 : complete = true;
2434 :
2435 : if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
2436 : goto out;
2437 : }
2438 :
2439 : if (task_on_rq_queued(p))
2440 : rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
2441 : else
2442 : p->wake_cpu = arg->dest_cpu;
2443 :
2444 : /*
2445 : * XXX __migrate_task() can fail, at which point we might end
2446 : * up running on a dodgy CPU, AFAICT this can only happen
2447 : * during CPU hotplug, at which point we'll get pushed out
2448 : * anyway, so it's probably not a big deal.
2449 : */
2450 :
2451 : } else if (pending) {
2452 : /*
2453 : * This happens when we get migrated between migrate_enable()'s
2454 : * preempt_enable() and scheduling the stopper task. At that
2455 : * point we're a regular task again and not current anymore.
2456 : *
2457 : * A !PREEMPT kernel has a giant hole here, which makes it far
2458 : * more likely.
2459 : */
2460 :
2461 : /*
2462 : * The task moved before the stopper got to run. We're holding
2463 : * ->pi_lock, so the allowed mask is stable - if it got
2464 : * somewhere allowed, we're done.
2465 : */
2466 : if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
2467 : p->migration_pending = NULL;
2468 : complete = true;
2469 : goto out;
2470 : }
2471 :
2472 : /*
2473 : * When migrate_enable() hits a rq mis-match we can't reliably
2474 : * determine is_migration_disabled() and so have to chase after
2475 : * it.
2476 : */
2477 : WARN_ON_ONCE(!pending->stop_pending);
2478 : task_rq_unlock(rq, p, &rf);
2479 : stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
2480 : &pending->arg, &pending->stop_work);
2481 : return 0;
2482 : }
2483 : out:
2484 : if (pending)
2485 : pending->stop_pending = false;
2486 : task_rq_unlock(rq, p, &rf);
2487 :
2488 : if (complete)
2489 : complete_all(&pending->done);
2490 :
2491 : return 0;
2492 : }
2493 :
2494 : int push_cpu_stop(void *arg)
2495 : {
2496 : struct rq *lowest_rq = NULL, *rq = this_rq();
2497 : struct task_struct *p = arg;
2498 :
2499 : raw_spin_lock_irq(&p->pi_lock);
2500 : raw_spin_rq_lock(rq);
2501 :
2502 : if (task_rq(p) != rq)
2503 : goto out_unlock;
2504 :
2505 : if (is_migration_disabled(p)) {
2506 : p->migration_flags |= MDF_PUSH;
2507 : goto out_unlock;
2508 : }
2509 :
2510 : p->migration_flags &= ~MDF_PUSH;
2511 :
2512 : if (p->sched_class->find_lock_rq)
2513 : lowest_rq = p->sched_class->find_lock_rq(p, rq);
2514 :
2515 : if (!lowest_rq)
2516 : goto out_unlock;
2517 :
2518 : // XXX validate p is still the highest prio task
2519 : if (task_rq(p) == rq) {
2520 : deactivate_task(rq, p, 0);
2521 : set_task_cpu(p, lowest_rq->cpu);
2522 : activate_task(lowest_rq, p, 0);
2523 : resched_curr(lowest_rq);
2524 : }
2525 :
2526 : double_unlock_balance(rq, lowest_rq);
2527 :
2528 : out_unlock:
2529 : rq->push_busy = false;
2530 : raw_spin_rq_unlock(rq);
2531 : raw_spin_unlock_irq(&p->pi_lock);
2532 :
2533 : put_task_struct(p);
2534 : return 0;
2535 : }
2536 :
2537 : /*
2538 : * sched_class::set_cpus_allowed must do the below, but is not required to
2539 : * actually call this function.
2540 : */
2541 : void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
2542 : {
2543 : if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
2544 : p->cpus_ptr = new_mask;
2545 : return;
2546 : }
2547 :
2548 : cpumask_copy(&p->cpus_mask, new_mask);
2549 : p->nr_cpus_allowed = cpumask_weight(new_mask);
2550 : }
2551 :
2552 : static void
2553 : __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
2554 : {
2555 : struct rq *rq = task_rq(p);
2556 : bool queued, running;
2557 :
2558 : /*
2559 : * This here violates the locking rules for affinity, since we're only
2560 : * supposed to change these variables while holding both rq->lock and
2561 : * p->pi_lock.
2562 : *
2563 : * HOWEVER, it magically works, because ttwu() is the only code that
2564 : * accesses these variables under p->pi_lock and only does so after
2565 : * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
2566 : * before finish_task().
2567 : *
2568 : * XXX do further audits, this smells like something putrid.
2569 : */
2570 : if (flags & SCA_MIGRATE_DISABLE)
2571 : SCHED_WARN_ON(!p->on_cpu);
2572 : else
2573 : lockdep_assert_held(&p->pi_lock);
2574 :
2575 : queued = task_on_rq_queued(p);
2576 : running = task_current(rq, p);
2577 :
2578 : if (queued) {
2579 : /*
2580 : * Because __kthread_bind() calls this on blocked tasks without
2581 : * holding rq->lock.
2582 : */
2583 : lockdep_assert_rq_held(rq);
2584 : dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
2585 : }
2586 : if (running)
2587 : put_prev_task(rq, p);
2588 :
2589 : p->sched_class->set_cpus_allowed(p, new_mask, flags);
2590 :
2591 : if (queued)
2592 : enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
2593 : if (running)
2594 : set_next_task(rq, p);
2595 : }
2596 :
2597 : void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
2598 : {
2599 : __do_set_cpus_allowed(p, new_mask, 0);
2600 : }
2601 :
2602 : int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
2603 : int node)
2604 : {
2605 : if (!src->user_cpus_ptr)
2606 : return 0;
2607 :
2608 : dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
2609 : if (!dst->user_cpus_ptr)
2610 : return -ENOMEM;
2611 :
2612 : cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
2613 : return 0;
2614 : }
2615 :
2616 : static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
2617 : {
2618 : struct cpumask *user_mask = NULL;
2619 :
2620 : swap(p->user_cpus_ptr, user_mask);
2621 :
2622 : return user_mask;
2623 : }
2624 :
2625 : void release_user_cpus_ptr(struct task_struct *p)
2626 : {
2627 : kfree(clear_user_cpus_ptr(p));
2628 : }
2629 :
2630 : /*
2631 : * This function is wildly self concurrent; here be dragons.
2632 : *
2633 : *
2634 : * When given a valid mask, __set_cpus_allowed_ptr() must block until the
2635 : * designated task is enqueued on an allowed CPU. If that task is currently
2636 : * running, we have to kick it out using the CPU stopper.
2637 : *
2638 : * Migrate-Disable comes along and tramples all over our nice sandcastle.
2639 : * Consider:
2640 : *
2641 : * Initial conditions: P0->cpus_mask = [0, 1]
2642 : *
2643 : * P0@CPU0 P1
2644 : *
2645 : * migrate_disable();
2646 : * <preempted>
2647 : * set_cpus_allowed_ptr(P0, [1]);
2648 : *
2649 : * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
2650 : * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
2651 : * This means we need the following scheme:
2652 : *
2653 : * P0@CPU0 P1
2654 : *
2655 : * migrate_disable();
2656 : * <preempted>
2657 : * set_cpus_allowed_ptr(P0, [1]);
2658 : * <blocks>
2659 : * <resumes>
2660 : * migrate_enable();
2661 : * __set_cpus_allowed_ptr();
2662 : * <wakes local stopper>
2663 : * `--> <woken on migration completion>
2664 : *
2665 : * Now the fun stuff: there may be several P1-like tasks, i.e. multiple
2666 : * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
2667 : * task p are serialized by p->pi_lock, which we can leverage: the one that
2668 : * should come into effect at the end of the Migrate-Disable region is the last
2669 : * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
2670 : * but we still need to properly signal those waiting tasks at the appropriate
2671 : * moment.
2672 : *
2673 : * This is implemented using struct set_affinity_pending. The first
2674 : * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
2675 : * setup an instance of that struct and install it on the targeted task_struct.
2676 : * Any and all further callers will reuse that instance. Those then wait for
2677 : * a completion signaled at the tail of the CPU stopper callback (1), triggered
2678 : * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
2679 : *
2680 : *
2681 : * (1) In the cases covered above. There is one more where the completion is
2682 : * signaled within affine_move_task() itself: when a subsequent affinity request
2683 : * occurs after the stopper bailed out due to the targeted task still being
2684 : * Migrate-Disable. Consider:
2685 : *
2686 : * Initial conditions: P0->cpus_mask = [0, 1]
2687 : *
2688 : * CPU0 P1 P2
2689 : * <P0>
2690 : * migrate_disable();
2691 : * <preempted>
2692 : * set_cpus_allowed_ptr(P0, [1]);
2693 : * <blocks>
2694 : * <migration/0>
2695 : * migration_cpu_stop()
2696 : * is_migration_disabled()
2697 : * <bails>
2698 : * set_cpus_allowed_ptr(P0, [0, 1]);
2699 : * <signal completion>
2700 : * <awakes>
2701 : *
2702 : * Note that the above is safe vs a concurrent migrate_enable(), as any
2703 : * pending affinity completion is preceded by an uninstallation of
2704 : * p->migration_pending done with p->pi_lock held.
2705 : */
2706 : static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
2707 : int dest_cpu, unsigned int flags)
2708 : {
2709 : struct set_affinity_pending my_pending = { }, *pending = NULL;
2710 : bool stop_pending, complete = false;
2711 :
2712 : /* Can the task run on the task's current CPU? If so, we're done */
2713 : if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
2714 : struct task_struct *push_task = NULL;
2715 :
2716 : if ((flags & SCA_MIGRATE_ENABLE) &&
2717 : (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
2718 : rq->push_busy = true;
2719 : push_task = get_task_struct(p);
2720 : }
2721 :
2722 : /*
2723 : * If there are pending waiters, but no pending stop_work,
2724 : * then complete now.
2725 : */
2726 : pending = p->migration_pending;
2727 : if (pending && !pending->stop_pending) {
2728 : p->migration_pending = NULL;
2729 : complete = true;
2730 : }
2731 :
2732 : task_rq_unlock(rq, p, rf);
2733 :
2734 : if (push_task) {
2735 : stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
2736 : p, &rq->push_work);
2737 : }
2738 :
2739 : if (complete)
2740 : complete_all(&pending->done);
2741 :
2742 : return 0;
2743 : }
2744 :
2745 : if (!(flags & SCA_MIGRATE_ENABLE)) {
2746 : /* serialized by p->pi_lock */
2747 : if (!p->migration_pending) {
2748 : /* Install the request */
2749 : refcount_set(&my_pending.refs, 1);
2750 : init_completion(&my_pending.done);
2751 : my_pending.arg = (struct migration_arg) {
2752 : .task = p,
2753 : .dest_cpu = dest_cpu,
2754 : .pending = &my_pending,
2755 : };
2756 :
2757 : p->migration_pending = &my_pending;
2758 : } else {
2759 : pending = p->migration_pending;
2760 : refcount_inc(&pending->refs);
2761 : /*
2762 : * Affinity has changed, but we've already installed a
2763 : * pending. migration_cpu_stop() *must* see this, else
2764 : * we risk a completion of the pending despite having a
2765 : * task on a disallowed CPU.
2766 : *
2767 : * Serialized by p->pi_lock, so this is safe.
2768 : */
2769 : pending->arg.dest_cpu = dest_cpu;
2770 : }
2771 : }
2772 : pending = p->migration_pending;
2773 : /*
2774 : * - !MIGRATE_ENABLE:
2775 : * we'll have installed a pending if there wasn't one already.
2776 : *
2777 : * - MIGRATE_ENABLE:
2778 : * we're here because the current CPU isn't matching anymore,
2779 : * the only way that can happen is because of a concurrent
2780 : * set_cpus_allowed_ptr() call, which should then still be
2781 : * pending completion.
2782 : *
2783 : * Either way, we really should have a @pending here.
2784 : */
2785 : if (WARN_ON_ONCE(!pending)) {
2786 : task_rq_unlock(rq, p, rf);
2787 : return -EINVAL;
2788 : }
2789 :
2790 : if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
2791 : /*
2792 : * MIGRATE_ENABLE gets here because 'p == current', but for
2793 : * anything else we cannot do is_migration_disabled(), punt
2794 : * and have the stopper function handle it all race-free.
2795 : */
2796 : stop_pending = pending->stop_pending;
2797 : if (!stop_pending)
2798 : pending->stop_pending = true;
2799 :
2800 : if (flags & SCA_MIGRATE_ENABLE)
2801 : p->migration_flags &= ~MDF_PUSH;
2802 :
2803 : task_rq_unlock(rq, p, rf);
2804 :
2805 : if (!stop_pending) {
2806 : stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
2807 : &pending->arg, &pending->stop_work);
2808 : }
2809 :
2810 : if (flags & SCA_MIGRATE_ENABLE)
2811 : return 0;
2812 : } else {
2813 :
2814 : if (!is_migration_disabled(p)) {
2815 : if (task_on_rq_queued(p))
2816 : rq = move_queued_task(rq, rf, p, dest_cpu);
2817 :
2818 : if (!pending->stop_pending) {
2819 : p->migration_pending = NULL;
2820 : complete = true;
2821 : }
2822 : }
2823 : task_rq_unlock(rq, p, rf);
2824 :
2825 : if (complete)
2826 : complete_all(&pending->done);
2827 : }
2828 :
2829 : wait_for_completion(&pending->done);
2830 :
2831 : if (refcount_dec_and_test(&pending->refs))
2832 : wake_up_var(&pending->refs); /* No UaF, just an address */
2833 :
2834 : /*
2835 : * Block the original owner of &pending until all subsequent callers
2836 : * have seen the completion and decremented the refcount
2837 : */
2838 : wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
2839 :
2840 : /* ARGH */
2841 : WARN_ON_ONCE(my_pending.stop_pending);
2842 :
2843 : return 0;
2844 : }
2845 :
2846 : /*
2847 : * Called with both p->pi_lock and rq->lock held; drops both before returning.
2848 : */
2849 : static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
2850 : const struct cpumask *new_mask,
2851 : u32 flags,
2852 : struct rq *rq,
2853 : struct rq_flags *rf)
2854 : __releases(rq->lock)
2855 : __releases(p->pi_lock)
2856 : {
2857 : const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
2858 : const struct cpumask *cpu_valid_mask = cpu_active_mask;
2859 : bool kthread = p->flags & PF_KTHREAD;
2860 : struct cpumask *user_mask = NULL;
2861 : unsigned int dest_cpu;
2862 : int ret = 0;
2863 :
2864 : update_rq_clock(rq);
2865 :
2866 : if (kthread || is_migration_disabled(p)) {
2867 : /*
2868 : * Kernel threads are allowed on online && !active CPUs,
2869 : * however, during cpu-hot-unplug, even these might get pushed
2870 : * away if not KTHREAD_IS_PER_CPU.
2871 : *
2872 : * Specifically, migration_disabled() tasks must not fail the
2873 : * cpumask_any_and_distribute() pick below, esp. so on
2874 : * SCA_MIGRATE_ENABLE, otherwise we'll not call
2875 : * set_cpus_allowed_common() and actually reset p->cpus_ptr.
2876 : */
2877 : cpu_valid_mask = cpu_online_mask;
2878 : }
2879 :
2880 : if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) {
2881 : ret = -EINVAL;
2882 : goto out;
2883 : }
2884 :
2885 : /*
2886 : * Must re-check here, to close a race against __kthread_bind(),
2887 : * sched_setaffinity() is not guaranteed to observe the flag.
2888 : */
2889 : if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
2890 : ret = -EINVAL;
2891 : goto out;
2892 : }
2893 :
2894 : if (!(flags & SCA_MIGRATE_ENABLE)) {
2895 : if (cpumask_equal(&p->cpus_mask, new_mask))
2896 : goto out;
2897 :
2898 : if (WARN_ON_ONCE(p == current &&
2899 : is_migration_disabled(p) &&
2900 : !cpumask_test_cpu(task_cpu(p), new_mask))) {
2901 : ret = -EBUSY;
2902 : goto out;
2903 : }
2904 : }
2905 :
2906 : /*
2907 : * Picking a ~random cpu helps in cases where we are changing affinity
2908 : * for groups of tasks (ie. cpuset), so that load balancing is not
2909 : * immediately required to distribute the tasks within their new mask.
2910 : */
2911 : dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
2912 : if (dest_cpu >= nr_cpu_ids) {
2913 : ret = -EINVAL;
2914 : goto out;
2915 : }
2916 :
2917 : __do_set_cpus_allowed(p, new_mask, flags);
2918 :
2919 : if (flags & SCA_USER)
2920 : user_mask = clear_user_cpus_ptr(p);
2921 :
2922 : ret = affine_move_task(rq, p, rf, dest_cpu, flags);
2923 :
2924 : kfree(user_mask);
2925 :
2926 : return ret;
2927 :
2928 : out:
2929 : task_rq_unlock(rq, p, rf);
2930 :
2931 : return ret;
2932 : }
2933 :
2934 : /*
2935 : * Change a given task's CPU affinity. Migrate the thread to a
2936 : * proper CPU and schedule it away if the CPU it's executing on
2937 : * is removed from the allowed bitmask.
2938 : *
2939 : * NOTE: the caller must have a valid reference to the task, the
2940 : * task must not exit() & deallocate itself prematurely. The
2941 : * call is not atomic; no spinlocks may be held.
2942 : */
2943 : static int __set_cpus_allowed_ptr(struct task_struct *p,
2944 : const struct cpumask *new_mask, u32 flags)
2945 : {
2946 : struct rq_flags rf;
2947 : struct rq *rq;
2948 :
2949 : rq = task_rq_lock(p, &rf);
2950 : return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
2951 : }
2952 :
2953 : int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
2954 : {
2955 : return __set_cpus_allowed_ptr(p, new_mask, 0);
2956 : }
2957 : EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
2958 :
2959 : /*
2960 : * Change a given task's CPU affinity to the intersection of its current
2961 : * affinity mask and @subset_mask, writing the resulting mask to @new_mask
2962 : * and pointing @p->user_cpus_ptr to a copy of the old mask.
2963 : * If the resulting mask is empty, leave the affinity unchanged and return
2964 : * -EINVAL.
2965 : */
2966 : static int restrict_cpus_allowed_ptr(struct task_struct *p,
2967 : struct cpumask *new_mask,
2968 : const struct cpumask *subset_mask)
2969 : {
2970 : struct cpumask *user_mask = NULL;
2971 : struct rq_flags rf;
2972 : struct rq *rq;
2973 : int err;
2974 :
2975 : if (!p->user_cpus_ptr) {
2976 : user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
2977 : if (!user_mask)
2978 : return -ENOMEM;
2979 : }
2980 :
2981 : rq = task_rq_lock(p, &rf);
2982 :
2983 : /*
2984 : * Forcefully restricting the affinity of a deadline task is
2985 : * likely to cause problems, so fail and noisily override the
2986 : * mask entirely.
2987 : */
2988 : if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
2989 : err = -EPERM;
2990 : goto err_unlock;
2991 : }
2992 :
2993 : if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
2994 : err = -EINVAL;
2995 : goto err_unlock;
2996 : }
2997 :
2998 : /*
2999 : * We're about to butcher the task affinity, so keep track of what
3000 : * the user asked for in case we're able to restore it later on.
3001 : */
3002 : if (user_mask) {
3003 : cpumask_copy(user_mask, p->cpus_ptr);
3004 : p->user_cpus_ptr = user_mask;
3005 : }
3006 :
3007 : return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
3008 :
3009 : err_unlock:
3010 : task_rq_unlock(rq, p, &rf);
3011 : kfree(user_mask);
3012 : return err;
3013 : }
3014 :
3015 : /*
3016 : * Restrict the CPU affinity of task @p so that it is a subset of
3017 : * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the
3018 : * old affinity mask. If the resulting mask is empty, we warn and walk
3019 : * up the cpuset hierarchy until we find a suitable mask.
3020 : */
3021 : void force_compatible_cpus_allowed_ptr(struct task_struct *p)
3022 : {
3023 : cpumask_var_t new_mask;
3024 : const struct cpumask *override_mask = task_cpu_possible_mask(p);
3025 :
3026 : alloc_cpumask_var(&new_mask, GFP_KERNEL);
3027 :
3028 : /*
3029 : * __migrate_task() can fail silently in the face of concurrent
3030 : * offlining of the chosen destination CPU, so take the hotplug
3031 : * lock to ensure that the migration succeeds.
3032 : */
3033 : cpus_read_lock();
3034 : if (!cpumask_available(new_mask))
3035 : goto out_set_mask;
3036 :
3037 : if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
3038 : goto out_free_mask;
3039 :
3040 : /*
3041 : * We failed to find a valid subset of the affinity mask for the
3042 : * task, so override it based on its cpuset hierarchy.
3043 : */
3044 : cpuset_cpus_allowed(p, new_mask);
3045 : override_mask = new_mask;
3046 :
3047 : out_set_mask:
3048 : if (printk_ratelimit()) {
3049 : printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
3050 : task_pid_nr(p), p->comm,
3051 : cpumask_pr_args(override_mask));
3052 : }
3053 :
3054 : WARN_ON(set_cpus_allowed_ptr(p, override_mask));
3055 : out_free_mask:
3056 : cpus_read_unlock();
3057 : free_cpumask_var(new_mask);
3058 : }
3059 :
3060 : static int
3061 : __sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
3062 :
3063 : /*
3064 : * Restore the affinity of a task @p which was previously restricted by a
3065 : * call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
3066 : * @p->user_cpus_ptr.
3067 : *
3068 : * It is the caller's responsibility to serialise this with any calls to
3069 : * force_compatible_cpus_allowed_ptr(@p).
3070 : */
3071 : void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
3072 : {
3073 : struct cpumask *user_mask = p->user_cpus_ptr;
3074 : unsigned long flags;
3075 :
3076 : /*
3077 : * Try to restore the old affinity mask. If this fails, then
3078 : * we free the mask explicitly to avoid it being inherited across
3079 : * a subsequent fork().
3080 : */
3081 : if (!user_mask || !__sched_setaffinity(p, user_mask))
3082 : return;
3083 :
3084 : raw_spin_lock_irqsave(&p->pi_lock, flags);
3085 : user_mask = clear_user_cpus_ptr(p);
3086 : raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3087 :
3088 : kfree(user_mask);
3089 : }
3090 :
3091 : void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
3092 : {
3093 : #ifdef CONFIG_SCHED_DEBUG
3094 : unsigned int state = READ_ONCE(p->__state);
3095 :
3096 : /*
3097 : * We should never call set_task_cpu() on a blocked task,
3098 : * ttwu() will sort out the placement.
3099 : */
3100 : WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);
3101 :
3102 : /*
3103 : * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
3104 : * because schedstat_wait_{start,end} rebase migrating task's wait_start
3105 : * time relying on p->on_rq.
3106 : */
3107 : WARN_ON_ONCE(state == TASK_RUNNING &&
3108 : p->sched_class == &fair_sched_class &&
3109 : (p->on_rq && !task_on_rq_migrating(p)));
3110 :
3111 : #ifdef CONFIG_LOCKDEP
3112 : /*
3113 : * The caller should hold either p->pi_lock or rq->lock, when changing
3114 : * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
3115 : *
3116 : * sched_move_task() holds both and thus holding either pins the cgroup,
3117 : * see task_group().
3118 : *
3119 : * Furthermore, all task_rq users should acquire both locks, see
3120 : * task_rq_lock().
3121 : */
3122 : WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
3123 : lockdep_is_held(__rq_lockp(task_rq(p)))));
3124 : #endif
3125 : /*
3126 : * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
3127 : */
3128 : WARN_ON_ONCE(!cpu_online(new_cpu));
3129 :
3130 : WARN_ON_ONCE(is_migration_disabled(p));
3131 : #endif
3132 :
3133 : trace_sched_migrate_task(p, new_cpu);
3134 :
3135 : if (task_cpu(p) != new_cpu) {
3136 : if (p->sched_class->migrate_task_rq)
3137 : p->sched_class->migrate_task_rq(p, new_cpu);
3138 : p->se.nr_migrations++;
3139 : rseq_migrate(p);
3140 : perf_event_task_migrate(p);
3141 : }
3142 :
3143 : __set_task_cpu(p, new_cpu);
3144 : }
3145 :
3146 : #ifdef CONFIG_NUMA_BALANCING
3147 : static void __migrate_swap_task(struct task_struct *p, int cpu)
3148 : {
3149 : if (task_on_rq_queued(p)) {
3150 : struct rq *src_rq, *dst_rq;
3151 : struct rq_flags srf, drf;
3152 :
3153 : src_rq = task_rq(p);
3154 : dst_rq = cpu_rq(cpu);
3155 :
3156 : rq_pin_lock(src_rq, &srf);
3157 : rq_pin_lock(dst_rq, &drf);
3158 :
3159 : deactivate_task(src_rq, p, 0);
3160 : set_task_cpu(p, cpu);
3161 : activate_task(dst_rq, p, 0);
3162 : check_preempt_curr(dst_rq, p, 0);
3163 :
3164 : rq_unpin_lock(dst_rq, &drf);
3165 : rq_unpin_lock(src_rq, &srf);
3166 :
3167 : } else {
3168 : /*
3169 : * Task isn't running anymore; make it appear like we migrated
3170 : * it before it went to sleep. This means on wakeup we make the
3171 : * previous CPU our target instead of where it really is.
3172 : */
3173 : p->wake_cpu = cpu;
3174 : }
3175 : }
3176 :
3177 : struct migration_swap_arg {
3178 : struct task_struct *src_task, *dst_task;
3179 : int src_cpu, dst_cpu;
3180 : };
3181 :
3182 : static int migrate_swap_stop(void *data)
3183 : {
3184 : struct migration_swap_arg *arg = data;
3185 : struct rq *src_rq, *dst_rq;
3186 : int ret = -EAGAIN;
3187 :
3188 : if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
3189 : return -EAGAIN;
3190 :
3191 : src_rq = cpu_rq(arg->src_cpu);
3192 : dst_rq = cpu_rq(arg->dst_cpu);
3193 :
3194 : double_raw_lock(&arg->src_task->pi_lock,
3195 : &arg->dst_task->pi_lock);
3196 : double_rq_lock(src_rq, dst_rq);
3197 :
3198 : if (task_cpu(arg->dst_task) != arg->dst_cpu)
3199 : goto unlock;
3200 :
3201 : if (task_cpu(arg->src_task) != arg->src_cpu)
3202 : goto unlock;
3203 :
3204 : if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
3205 : goto unlock;
3206 :
3207 : if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
3208 : goto unlock;
3209 :
3210 : __migrate_swap_task(arg->src_task, arg->dst_cpu);
3211 : __migrate_swap_task(arg->dst_task, arg->src_cpu);
3212 :
3213 : ret = 0;
3214 :
3215 : unlock:
3216 : double_rq_unlock(src_rq, dst_rq);
3217 : raw_spin_unlock(&arg->dst_task->pi_lock);
3218 : raw_spin_unlock(&arg->src_task->pi_lock);
3219 :
3220 : return ret;
3221 : }
3222 :
3223 : /*
3224 : * Cross migrate two tasks
3225 : */
3226 : int migrate_swap(struct task_struct *cur, struct task_struct *p,
3227 : int target_cpu, int curr_cpu)
3228 : {
3229 : struct migration_swap_arg arg;
3230 : int ret = -EINVAL;
3231 :
3232 : arg = (struct migration_swap_arg){
3233 : .src_task = cur,
3234 : .src_cpu = curr_cpu,
3235 : .dst_task = p,
3236 : .dst_cpu = target_cpu,
3237 : };
3238 :
3239 : if (arg.src_cpu == arg.dst_cpu)
3240 : goto out;
3241 :
3242 : /*
3243 : * These three tests are all lockless; this is OK since all of them
3244 : * will be re-checked with proper locks held further down the line.
3245 : */
3246 : if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
3247 : goto out;
3248 :
3249 : if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
3250 : goto out;
3251 :
3252 : if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
3253 : goto out;
3254 :
3255 : trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
3256 : ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
3257 :
3258 : out:
3259 : return ret;
3260 : }
3261 : #endif /* CONFIG_NUMA_BALANCING */
3262 :
3263 : /*
3264 : * wait_task_inactive - wait for a thread to unschedule.
3265 : *
3266 : * If @match_state is nonzero, it's the @p->state value just checked and
3267 : * not expected to change. If it changes, i.e. @p might have woken up,
3268 : * then return zero. When we succeed in waiting for @p to be off its CPU,
3269 : * we return a positive number (its total switch count). If a second call
3270 : * a short while later returns the same number, the caller can be sure that
3271 : * @p has remained unscheduled the whole time.
3272 : *
3273 : * The caller must ensure that the task *will* unschedule sometime soon,
3274 : * else this function might spin for a *long* time. This function can't
3275 : * be called with interrupts off, or it may introduce deadlock with
3276 : * smp_call_function() if an IPI is sent by the same process we are
3277 : * waiting to become inactive.
3278 : */
3279 : unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
3280 : {
3281 : int running, queued;
3282 : struct rq_flags rf;
3283 : unsigned long ncsw;
3284 : struct rq *rq;
3285 :
3286 : for (;;) {
3287 : /*
3288 : * We do the initial early heuristics without holding
3289 : * any task-queue locks at all. We'll only try to get
3290 : * the runqueue lock when things look like they will
3291 : * work out!
3292 : */
3293 : rq = task_rq(p);
3294 :
3295 : /*
3296 : * If the task is actively running on another CPU
3297 : * still, just relax and busy-wait without holding
3298 : * any locks.
3299 : *
3300 : * NOTE! Since we don't hold any locks, it's not
3301 : * even sure that "rq" stays as the right runqueue!
3302 : * But we don't care, since "task_running()" will
3303 : * return false if the runqueue has changed and p
3304 : * is actually now running somewhere else!
3305 : */
3306 : while (task_running(rq, p)) {
3307 : if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
3308 : return 0;
3309 : cpu_relax();
3310 : }
3311 :
3312 : /*
3313 : * Ok, time to look more closely! We need the rq
3314 : * lock now, to be *sure*. If we're wrong, we'll
3315 : * just go back and repeat.
3316 : */
3317 : rq = task_rq_lock(p, &rf);
3318 : trace_sched_wait_task(p);
3319 : running = task_running(rq, p);
3320 : queued = task_on_rq_queued(p);
3321 : ncsw = 0;
3322 : if (!match_state || READ_ONCE(p->__state) == match_state)
3323 : ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
3324 : task_rq_unlock(rq, p, &rf);
3325 :
3326 : /*
3327 : * If it changed from the expected state, bail out now.
3328 : */
3329 : if (unlikely(!ncsw))
3330 : break;
3331 :
3332 : /*
3333 : * Was it really running after all now that we
3334 : * checked with the proper locks actually held?
3335 : *
3336 : * Oops. Go back and try again..
3337 : */
3338 : if (unlikely(running)) {
3339 : cpu_relax();
3340 : continue;
3341 : }
3342 :
3343 : /*
3344 : * It's not enough that it's not actively running,
3345 : * it must be off the runqueue _entirely_, and not
3346 : * preempted!
3347 : *
3348 : * So if it was still runnable (but just not actively
3349 : * running right now), it's preempted, and we should
3350 : * yield - it could be a while.
3351 : */
3352 : if (unlikely(queued)) {
3353 : ktime_t to = NSEC_PER_SEC / HZ;
3354 :
3355 : set_current_state(TASK_UNINTERRUPTIBLE);
3356 : schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
3357 : continue;
3358 : }
3359 :
3360 : /*
3361 : * Ahh, all good. It wasn't running, and it wasn't
3362 : * runnable, which means that it will never become
3363 : * running in the future either. We're all done!
3364 : */
3365 : break;
3366 : }
3367 :
3368 : return ncsw;
3369 : }
3370 :
3371 : /***
3372 : * kick_process - kick a running thread to enter/exit the kernel
3373 : * @p: the to-be-kicked thread
3374 : *
3375 : * Cause a process which is running on another CPU to enter
3376 : * kernel-mode, without any delay. (to get signals handled.)
3377 : *
3378 : * NOTE: this function doesn't have to take the runqueue lock,
3379 : * because all it wants to ensure is that the remote task enters
3380 : * the kernel. If the IPI races and the task has been migrated
3381 : * to another CPU then no harm is done and the purpose has been
3382 : * achieved as well.
3383 : */
3384 : void kick_process(struct task_struct *p)
3385 : {
3386 : int cpu;
3387 :
3388 : preempt_disable();
3389 : cpu = task_cpu(p);
3390 : if ((cpu != smp_processor_id()) && task_curr(p))
3391 : smp_send_reschedule(cpu);
3392 : preempt_enable();
3393 : }
3394 : EXPORT_SYMBOL_GPL(kick_process);
3395 :
3396 : /*
3397 : * ->cpus_ptr is protected by both rq->lock and p->pi_lock
3398 : *
3399 : * A few notes on cpu_active vs cpu_online:
3400 : *
3401 : * - cpu_active must be a subset of cpu_online
3402 : *
3403 : * - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
3404 : * see __set_cpus_allowed_ptr(). At this point the newly online
3405 : * CPU isn't yet part of the sched domains, and balancing will not
3406 : * see it.
3407 : *
3408 : * - on CPU-down we clear cpu_active() to mask the sched domains and
3409 : * avoid the load balancer to place new tasks on the to be removed
3410 : * CPU. Existing tasks will remain running there and will be taken
3411 : * off.
3412 : *
3413 : * This means that fallback selection must not select !active CPUs.
3414 : * And can assume that any active CPU must be online. Conversely
3415 : * select_task_rq() below may allow selection of !active CPUs in order
3416 : * to satisfy the above rules.
3417 : */
3418 : static int select_fallback_rq(int cpu, struct task_struct *p)
3419 : {
3420 : int nid = cpu_to_node(cpu);
3421 : const struct cpumask *nodemask = NULL;
3422 : enum { cpuset, possible, fail } state = cpuset;
3423 : int dest_cpu;
3424 :
3425 : /*
3426 : * If the node that the CPU is on has been offlined, cpu_to_node()
3427 : * will return -1. There is no CPU on the node, and we should
3428 : * select the CPU on the other node.
3429 : */
3430 : if (nid != -1) {
3431 : nodemask = cpumask_of_node(nid);
3432 :
3433 : /* Look for allowed, online CPU in same node. */
3434 : for_each_cpu(dest_cpu, nodemask) {
3435 : if (is_cpu_allowed(p, dest_cpu))
3436 : return dest_cpu;
3437 : }
3438 : }
3439 :
3440 : for (;;) {
3441 : /* Any allowed, online CPU? */
3442 : for_each_cpu(dest_cpu, p->cpus_ptr) {
3443 : if (!is_cpu_allowed(p, dest_cpu))
3444 : continue;
3445 :
3446 : goto out;
3447 : }
3448 :
3449 : /* No more Mr. Nice Guy. */
3450 : switch (state) {
3451 : case cpuset:
3452 : if (cpuset_cpus_allowed_fallback(p)) {
3453 : state = possible;
3454 : break;
3455 : }
3456 : fallthrough;
3457 : case possible:
3458 : /*
3459 : * XXX When called from select_task_rq() we only
3460 : * hold p->pi_lock and again violate locking order.
3461 : *
3462 : * More yuck to audit.
3463 : */
3464 : do_set_cpus_allowed(p, task_cpu_possible_mask(p));
3465 : state = fail;
3466 : break;
3467 : case fail:
3468 : BUG();
3469 : break;
3470 : }
3471 : }
3472 :
3473 : out:
3474 : if (state != cpuset) {
3475 : /*
3476 : * Don't tell them about moving exiting tasks or
3477 : * kernel threads (both mm NULL), since they never
3478 : * leave kernel.
3479 : */
3480 : if (p->mm && printk_ratelimit()) {
3481 : printk_deferred("process %d (%s) no longer affine to cpu%d\n",
3482 : task_pid_nr(p), p->comm, cpu);
3483 : }
3484 : }
3485 :
3486 : return dest_cpu;
3487 : }
3488 :
3489 : /*
3490 : * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
3491 : */
3492 : static inline
3493 : int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
3494 : {
3495 : lockdep_assert_held(&p->pi_lock);
3496 :
3497 : if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
3498 : cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
3499 : else
3500 : cpu = cpumask_any(p->cpus_ptr);
3501 :
3502 : /*
3503 : * In order not to call set_task_cpu() on a blocking task we need
3504 : * to rely on ttwu() to place the task on a valid ->cpus_ptr
3505 : * CPU.
3506 : *
3507 : * Since this is common to all placement strategies, this lives here.
3508 : *
3509 : * [ this allows ->select_task() to simply return task_cpu(p) and
3510 : * not worry about this generic constraint ]
3511 : */
3512 : if (unlikely(!is_cpu_allowed(p, cpu)))
3513 : cpu = select_fallback_rq(task_cpu(p), p);
3514 :
3515 : return cpu;
3516 : }
3517 :
3518 : void sched_set_stop_task(int cpu, struct task_struct *stop)
3519 : {
3520 : static struct lock_class_key stop_pi_lock;
3521 : struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
3522 : struct task_struct *old_stop = cpu_rq(cpu)->stop;
3523 :
3524 : if (stop) {
3525 : /*
3526 : * Make it appear like a SCHED_FIFO task, its something
3527 : * userspace knows about and won't get confused about.
3528 : *
3529 : * Also, it will make PI more or less work without too
3530 : * much confusion -- but then, stop work should not
3531 : * rely on PI working anyway.
3532 : */
3533 : sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m);
3534 :
3535 : stop->sched_class = &stop_sched_class;
3536 :
3537 : /*
3538 : * The PI code calls rt_mutex_setprio() with ->pi_lock held to
3539 : * adjust the effective priority of a task. As a result,
3540 : * rt_mutex_setprio() can trigger (RT) balancing operations,
3541 : * which can then trigger wakeups of the stop thread to push
3542 : * around the current task.
3543 : *
3544 : * The stop task itself will never be part of the PI-chain, it
3545 : * never blocks, therefore that ->pi_lock recursion is safe.
3546 : * Tell lockdep about this by placing the stop->pi_lock in its
3547 : * own class.
3548 : */
3549 : lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
3550 : }
3551 :
3552 : cpu_rq(cpu)->stop = stop;
3553 :
3554 : if (old_stop) {
3555 : /*
3556 : * Reset it back to a normal scheduling class so that
3557 : * it can die in pieces.
3558 : */
3559 : old_stop->sched_class = &rt_sched_class;
3560 : }
3561 : }
3562 :
3563 : #else /* CONFIG_SMP */
3564 :
3565 : static inline int __set_cpus_allowed_ptr(struct task_struct *p,
3566 : const struct cpumask *new_mask,
3567 : u32 flags)
3568 : {
3569 0 : return set_cpus_allowed_ptr(p, new_mask);
3570 : }
3571 :
3572 : static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
3573 :
3574 : static inline bool rq_has_pinned_tasks(struct rq *rq)
3575 : {
3576 : return false;
3577 : }
3578 :
3579 : #endif /* !CONFIG_SMP */
3580 :
3581 : static void
3582 : ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
3583 : {
3584 : struct rq *rq;
3585 :
3586 : if (!schedstat_enabled())
3587 : return;
3588 :
3589 : rq = this_rq();
3590 :
3591 : #ifdef CONFIG_SMP
3592 : if (cpu == rq->cpu) {
3593 : __schedstat_inc(rq->ttwu_local);
3594 : __schedstat_inc(p->stats.nr_wakeups_local);
3595 : } else {
3596 : struct sched_domain *sd;
3597 :
3598 : __schedstat_inc(p->stats.nr_wakeups_remote);
3599 : rcu_read_lock();
3600 : for_each_domain(rq->cpu, sd) {
3601 : if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
3602 : __schedstat_inc(sd->ttwu_wake_remote);
3603 : break;
3604 : }
3605 : }
3606 : rcu_read_unlock();
3607 : }
3608 :
3609 : if (wake_flags & WF_MIGRATED)
3610 : __schedstat_inc(p->stats.nr_wakeups_migrate);
3611 : #endif /* CONFIG_SMP */
3612 :
3613 : __schedstat_inc(rq->ttwu_count);
3614 : __schedstat_inc(p->stats.nr_wakeups);
3615 :
3616 : if (wake_flags & WF_SYNC)
3617 : __schedstat_inc(p->stats.nr_wakeups_sync);
3618 : }
3619 :
3620 : /*
3621 : * Mark the task runnable and perform wakeup-preemption.
3622 : */
3623 : static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
3624 : struct rq_flags *rf)
3625 : {
3626 510 : check_preempt_curr(rq, p, wake_flags);
3627 510 : WRITE_ONCE(p->__state, TASK_RUNNING);
3628 510 : trace_sched_wakeup(p);
3629 :
3630 : #ifdef CONFIG_SMP
3631 : if (p->sched_class->task_woken) {
3632 : /*
3633 : * Our task @p is fully woken up and running; so it's safe to
3634 : * drop the rq->lock, hereafter rq is only used for statistics.
3635 : */
3636 : rq_unpin_lock(rq, rf);
3637 : p->sched_class->task_woken(rq, p);
3638 : rq_repin_lock(rq, rf);
3639 : }
3640 :
3641 : if (rq->idle_stamp) {
3642 : u64 delta = rq_clock(rq) - rq->idle_stamp;
3643 : u64 max = 2*rq->max_idle_balance_cost;
3644 :
3645 : update_avg(&rq->avg_idle, delta);
3646 :
3647 : if (rq->avg_idle > max)
3648 : rq->avg_idle = max;
3649 :
3650 : rq->wake_stamp = jiffies;
3651 : rq->wake_avg_idle = rq->avg_idle / 2;
3652 :
3653 : rq->idle_stamp = 0;
3654 : }
3655 : #endif
3656 : }
3657 :
3658 : static void
3659 510 : ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
3660 : struct rq_flags *rf)
3661 : {
3662 510 : int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
3663 :
3664 510 : lockdep_assert_rq_held(rq);
3665 :
3666 510 : if (p->sched_contributes_to_load)
3667 307 : rq->nr_uninterruptible--;
3668 :
3669 : #ifdef CONFIG_SMP
3670 : if (wake_flags & WF_MIGRATED)
3671 : en_flags |= ENQUEUE_MIGRATED;
3672 : else
3673 : #endif
3674 510 : if (p->in_iowait) {
3675 0 : delayacct_blkio_end(p);
3676 0 : atomic_dec(&task_rq(p)->nr_iowait);
3677 : }
3678 :
3679 510 : activate_task(rq, p, en_flags);
3680 1020 : ttwu_do_wakeup(rq, p, wake_flags, rf);
3681 510 : }
3682 :
3683 : /*
3684 : * Consider @p being inside a wait loop:
3685 : *
3686 : * for (;;) {
3687 : * set_current_state(TASK_UNINTERRUPTIBLE);
3688 : *
3689 : * if (CONDITION)
3690 : * break;
3691 : *
3692 : * schedule();
3693 : * }
3694 : * __set_current_state(TASK_RUNNING);
3695 : *
3696 : * between set_current_state() and schedule(). In this case @p is still
3697 : * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
3698 : * an atomic manner.
3699 : *
3700 : * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
3701 : * then schedule() must still happen and p->state can be changed to
3702 : * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
3703 : * need to do a full wakeup with enqueue.
3704 : *
3705 : * Returns: %true when the wakeup is done,
3706 : * %false otherwise.
3707 : */
3708 0 : static int ttwu_runnable(struct task_struct *p, int wake_flags)
3709 : {
3710 : struct rq_flags rf;
3711 : struct rq *rq;
3712 0 : int ret = 0;
3713 :
3714 0 : rq = __task_rq_lock(p, &rf);
3715 0 : if (task_on_rq_queued(p)) {
3716 : /* check_preempt_curr() may use rq clock */
3717 0 : update_rq_clock(rq);
3718 0 : ttwu_do_wakeup(rq, p, wake_flags, &rf);
3719 0 : ret = 1;
3720 : }
3721 0 : __task_rq_unlock(rq, &rf);
3722 :
3723 0 : return ret;
3724 : }
3725 :
3726 : #ifdef CONFIG_SMP
3727 : void sched_ttwu_pending(void *arg)
3728 : {
3729 : struct llist_node *llist = arg;
3730 : struct rq *rq = this_rq();
3731 : struct task_struct *p, *t;
3732 : struct rq_flags rf;
3733 :
3734 : if (!llist)
3735 : return;
3736 :
3737 : /*
3738 : * rq::ttwu_pending racy indication of out-standing wakeups.
3739 : * Races such that false-negatives are possible, since they
3740 : * are shorter lived that false-positives would be.
3741 : */
3742 : WRITE_ONCE(rq->ttwu_pending, 0);
3743 :
3744 : rq_lock_irqsave(rq, &rf);
3745 : update_rq_clock(rq);
3746 :
3747 : llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
3748 : if (WARN_ON_ONCE(p->on_cpu))
3749 : smp_cond_load_acquire(&p->on_cpu, !VAL);
3750 :
3751 : if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
3752 : set_task_cpu(p, cpu_of(rq));
3753 :
3754 : ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
3755 : }
3756 :
3757 : rq_unlock_irqrestore(rq, &rf);
3758 : }
3759 :
3760 : void send_call_function_single_ipi(int cpu)
3761 : {
3762 : struct rq *rq = cpu_rq(cpu);
3763 :
3764 : if (!set_nr_if_polling(rq->idle))
3765 : arch_send_call_function_single_ipi(cpu);
3766 : else
3767 : trace_sched_wake_idle_without_ipi(cpu);
3768 : }
3769 :
3770 : /*
3771 : * Queue a task on the target CPUs wake_list and wake the CPU via IPI if
3772 : * necessary. The wakee CPU on receipt of the IPI will queue the task
3773 : * via sched_ttwu_wakeup() for activation so the wakee incurs the cost
3774 : * of the wakeup instead of the waker.
3775 : */
3776 : static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3777 : {
3778 : struct rq *rq = cpu_rq(cpu);
3779 :
3780 : p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
3781 :
3782 : WRITE_ONCE(rq->ttwu_pending, 1);
3783 : __smp_call_single_queue(cpu, &p->wake_entry.llist);
3784 : }
3785 :
3786 : void wake_up_if_idle(int cpu)
3787 : {
3788 : struct rq *rq = cpu_rq(cpu);
3789 : struct rq_flags rf;
3790 :
3791 : rcu_read_lock();
3792 :
3793 : if (!is_idle_task(rcu_dereference(rq->curr)))
3794 : goto out;
3795 :
3796 : rq_lock_irqsave(rq, &rf);
3797 : if (is_idle_task(rq->curr))
3798 : resched_curr(rq);
3799 : /* Else CPU is not idle, do nothing here: */
3800 : rq_unlock_irqrestore(rq, &rf);
3801 :
3802 : out:
3803 : rcu_read_unlock();
3804 : }
3805 :
3806 : bool cpus_share_cache(int this_cpu, int that_cpu)
3807 : {
3808 : if (this_cpu == that_cpu)
3809 : return true;
3810 :
3811 : return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
3812 : }
3813 :
3814 : static inline bool ttwu_queue_cond(int cpu, int wake_flags)
3815 : {
3816 : /*
3817 : * Do not complicate things with the async wake_list while the CPU is
3818 : * in hotplug state.
3819 : */
3820 : if (!cpu_active(cpu))
3821 : return false;
3822 :
3823 : /*
3824 : * If the CPU does not share cache, then queue the task on the
3825 : * remote rqs wakelist to avoid accessing remote data.
3826 : */
3827 : if (!cpus_share_cache(smp_processor_id(), cpu))
3828 : return true;
3829 :
3830 : /*
3831 : * If the task is descheduling and the only running task on the
3832 : * CPU then use the wakelist to offload the task activation to
3833 : * the soon-to-be-idle CPU as the current CPU is likely busy.
3834 : * nr_running is checked to avoid unnecessary task stacking.
3835 : */
3836 : if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
3837 : return true;
3838 :
3839 : return false;
3840 : }
3841 :
3842 : static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3843 : {
3844 : if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
3845 : if (WARN_ON_ONCE(cpu == smp_processor_id()))
3846 : return false;
3847 :
3848 : sched_clock_cpu(cpu); /* Sync clocks across CPUs */
3849 : __ttwu_queue_wakelist(p, cpu, wake_flags);
3850 : return true;
3851 : }
3852 :
3853 : return false;
3854 : }
3855 :
3856 : #else /* !CONFIG_SMP */
3857 :
3858 : static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3859 : {
3860 : return false;
3861 : }
3862 :
3863 : #endif /* CONFIG_SMP */
3864 :
3865 510 : static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
3866 : {
3867 510 : struct rq *rq = cpu_rq(cpu);
3868 : struct rq_flags rf;
3869 :
3870 510 : if (ttwu_queue_wakelist(p, cpu, wake_flags))
3871 : return;
3872 :
3873 510 : rq_lock(rq, &rf);
3874 510 : update_rq_clock(rq);
3875 510 : ttwu_do_activate(rq, p, wake_flags, &rf);
3876 510 : rq_unlock(rq, &rf);
3877 : }
3878 :
3879 : /*
3880 : * Invoked from try_to_wake_up() to check whether the task can be woken up.
3881 : *
3882 : * The caller holds p::pi_lock if p != current or has preemption
3883 : * disabled when p == current.
3884 : *
3885 : * The rules of PREEMPT_RT saved_state:
3886 : *
3887 : * The related locking code always holds p::pi_lock when updating
3888 : * p::saved_state, which means the code is fully serialized in both cases.
3889 : *
3890 : * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
3891 : * bits set. This allows to distinguish all wakeup scenarios.
3892 : */
3893 : static __always_inline
3894 : bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
3895 : {
3896 : if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
3897 : WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
3898 : state != TASK_RTLOCK_WAIT);
3899 : }
3900 :
3901 606 : if (READ_ONCE(p->__state) & state) {
3902 510 : *success = 1;
3903 : return true;
3904 : }
3905 :
3906 : #ifdef CONFIG_PREEMPT_RT
3907 : /*
3908 : * Saved state preserves the task state across blocking on
3909 : * an RT lock. If the state matches, set p::saved_state to
3910 : * TASK_RUNNING, but do not wake the task because it waits
3911 : * for a lock wakeup. Also indicate success because from
3912 : * the regular waker's point of view this has succeeded.
3913 : *
3914 : * After acquiring the lock the task will restore p::__state
3915 : * from p::saved_state which ensures that the regular
3916 : * wakeup is not lost. The restore will also set
3917 : * p::saved_state to TASK_RUNNING so any further tests will
3918 : * not result in false positives vs. @success
3919 : */
3920 : if (p->saved_state & state) {
3921 : p->saved_state = TASK_RUNNING;
3922 : *success = 1;
3923 : }
3924 : #endif
3925 : return false;
3926 : }
3927 :
3928 : /*
3929 : * Notes on Program-Order guarantees on SMP systems.
3930 : *
3931 : * MIGRATION
3932 : *
3933 : * The basic program-order guarantee on SMP systems is that when a task [t]
3934 : * migrates, all its activity on its old CPU [c0] happens-before any subsequent
3935 : * execution on its new CPU [c1].
3936 : *
3937 : * For migration (of runnable tasks) this is provided by the following means:
3938 : *
3939 : * A) UNLOCK of the rq(c0)->lock scheduling out task t
3940 : * B) migration for t is required to synchronize *both* rq(c0)->lock and
3941 : * rq(c1)->lock (if not at the same time, then in that order).
3942 : * C) LOCK of the rq(c1)->lock scheduling in task
3943 : *
3944 : * Release/acquire chaining guarantees that B happens after A and C after B.
3945 : * Note: the CPU doing B need not be c0 or c1
3946 : *
3947 : * Example:
3948 : *
3949 : * CPU0 CPU1 CPU2
3950 : *
3951 : * LOCK rq(0)->lock
3952 : * sched-out X
3953 : * sched-in Y
3954 : * UNLOCK rq(0)->lock
3955 : *
3956 : * LOCK rq(0)->lock // orders against CPU0
3957 : * dequeue X
3958 : * UNLOCK rq(0)->lock
3959 : *
3960 : * LOCK rq(1)->lock
3961 : * enqueue X
3962 : * UNLOCK rq(1)->lock
3963 : *
3964 : * LOCK rq(1)->lock // orders against CPU2
3965 : * sched-out Z
3966 : * sched-in X
3967 : * UNLOCK rq(1)->lock
3968 : *
3969 : *
3970 : * BLOCKING -- aka. SLEEP + WAKEUP
3971 : *
3972 : * For blocking we (obviously) need to provide the same guarantee as for
3973 : * migration. However the means are completely different as there is no lock
3974 : * chain to provide order. Instead we do:
3975 : *
3976 : * 1) smp_store_release(X->on_cpu, 0) -- finish_task()
3977 : * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
3978 : *
3979 : * Example:
3980 : *
3981 : * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule)
3982 : *
3983 : * LOCK rq(0)->lock LOCK X->pi_lock
3984 : * dequeue X
3985 : * sched-out X
3986 : * smp_store_release(X->on_cpu, 0);
3987 : *
3988 : * smp_cond_load_acquire(&X->on_cpu, !VAL);
3989 : * X->state = WAKING
3990 : * set_task_cpu(X,2)
3991 : *
3992 : * LOCK rq(2)->lock
3993 : * enqueue X
3994 : * X->state = RUNNING
3995 : * UNLOCK rq(2)->lock
3996 : *
3997 : * LOCK rq(2)->lock // orders against CPU1
3998 : * sched-out Z
3999 : * sched-in X
4000 : * UNLOCK rq(2)->lock
4001 : *
4002 : * UNLOCK X->pi_lock
4003 : * UNLOCK rq(0)->lock
4004 : *
4005 : *
4006 : * However, for wakeups there is a second guarantee we must provide, namely we
4007 : * must ensure that CONDITION=1 done by the caller can not be reordered with
4008 : * accesses to the task state; see try_to_wake_up() and set_current_state().
4009 : */
4010 :
4011 : /**
4012 : * try_to_wake_up - wake up a thread
4013 : * @p: the thread to be awakened
4014 : * @state: the mask of task states that can be woken
4015 : * @wake_flags: wake modifier flags (WF_*)
4016 : *
4017 : * Conceptually does:
4018 : *
4019 : * If (@state & @p->state) @p->state = TASK_RUNNING.
4020 : *
4021 : * If the task was not queued/runnable, also place it back on a runqueue.
4022 : *
4023 : * This function is atomic against schedule() which would dequeue the task.
4024 : *
4025 : * It issues a full memory barrier before accessing @p->state, see the comment
4026 : * with set_current_state().
4027 : *
4028 : * Uses p->pi_lock to serialize against concurrent wake-ups.
4029 : *
4030 : * Relies on p->pi_lock stabilizing:
4031 : * - p->sched_class
4032 : * - p->cpus_ptr
4033 : * - p->sched_task_group
4034 : * in order to do migration, see its use of select_task_rq()/set_task_cpu().
4035 : *
4036 : * Tries really hard to only take one task_rq(p)->lock for performance.
4037 : * Takes rq->lock in:
4038 : * - ttwu_runnable() -- old rq, unavoidable, see comment there;
4039 : * - ttwu_queue() -- new rq, for enqueue of the task;
4040 : * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
4041 : *
4042 : * As a consequence we race really badly with just about everything. See the
4043 : * many memory barriers and their comments for details.
4044 : *
4045 : * Return: %true if @p->state changes (an actual wakeup was done),
4046 : * %false otherwise.
4047 : */
4048 : static int
4049 606 : try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
4050 : {
4051 : unsigned long flags;
4052 606 : int cpu, success = 0;
4053 :
4054 606 : preempt_disable();
4055 606 : if (p == current) {
4056 : /*
4057 : * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
4058 : * == smp_processor_id()'. Together this means we can special
4059 : * case the whole 'p->on_rq && ttwu_runnable()' case below
4060 : * without taking any locks.
4061 : *
4062 : * In particular:
4063 : * - we rely on Program-Order guarantees for all the ordering,
4064 : * - we're serialized against set_special_state() by virtue of
4065 : * it disabling IRQs (this allows not taking ->pi_lock).
4066 : */
4067 1 : if (!ttwu_state_match(p, state, &success))
4068 : goto out;
4069 :
4070 0 : trace_sched_waking(p);
4071 0 : WRITE_ONCE(p->__state, TASK_RUNNING);
4072 0 : trace_sched_wakeup(p);
4073 : goto out;
4074 : }
4075 :
4076 : /*
4077 : * If we are going to wake up a thread waiting for CONDITION we
4078 : * need to ensure that CONDITION=1 done by the caller can not be
4079 : * reordered with p->state check below. This pairs with smp_store_mb()
4080 : * in set_current_state() that the waiting thread does.
4081 : */
4082 605 : raw_spin_lock_irqsave(&p->pi_lock, flags);
4083 : smp_mb__after_spinlock();
4084 605 : if (!ttwu_state_match(p, state, &success))
4085 : goto unlock;
4086 :
4087 510 : trace_sched_waking(p);
4088 :
4089 : /*
4090 : * Ensure we load p->on_rq _after_ p->state, otherwise it would
4091 : * be possible to, falsely, observe p->on_rq == 0 and get stuck
4092 : * in smp_cond_load_acquire() below.
4093 : *
4094 : * sched_ttwu_pending() try_to_wake_up()
4095 : * STORE p->on_rq = 1 LOAD p->state
4096 : * UNLOCK rq->lock
4097 : *
4098 : * __schedule() (switch to task 'p')
4099 : * LOCK rq->lock smp_rmb();
4100 : * smp_mb__after_spinlock();
4101 : * UNLOCK rq->lock
4102 : *
4103 : * [task p]
4104 : * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
4105 : *
4106 : * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
4107 : * __schedule(). See the comment for smp_mb__after_spinlock().
4108 : *
4109 : * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
4110 : */
4111 510 : smp_rmb();
4112 510 : if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
4113 : goto unlock;
4114 :
4115 : #ifdef CONFIG_SMP
4116 : /*
4117 : * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
4118 : * possible to, falsely, observe p->on_cpu == 0.
4119 : *
4120 : * One must be running (->on_cpu == 1) in order to remove oneself
4121 : * from the runqueue.
4122 : *
4123 : * __schedule() (switch to task 'p') try_to_wake_up()
4124 : * STORE p->on_cpu = 1 LOAD p->on_rq
4125 : * UNLOCK rq->lock
4126 : *
4127 : * __schedule() (put 'p' to sleep)
4128 : * LOCK rq->lock smp_rmb();
4129 : * smp_mb__after_spinlock();
4130 : * STORE p->on_rq = 0 LOAD p->on_cpu
4131 : *
4132 : * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
4133 : * __schedule(). See the comment for smp_mb__after_spinlock().
4134 : *
4135 : * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
4136 : * schedule()'s deactivate_task() has 'happened' and p will no longer
4137 : * care about it's own p->state. See the comment in __schedule().
4138 : */
4139 : smp_acquire__after_ctrl_dep();
4140 :
4141 : /*
4142 : * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
4143 : * == 0), which means we need to do an enqueue, change p->state to
4144 : * TASK_WAKING such that we can unlock p->pi_lock before doing the
4145 : * enqueue, such as ttwu_queue_wakelist().
4146 : */
4147 : WRITE_ONCE(p->__state, TASK_WAKING);
4148 :
4149 : /*
4150 : * If the owning (remote) CPU is still in the middle of schedule() with
4151 : * this task as prev, considering queueing p on the remote CPUs wake_list
4152 : * which potentially sends an IPI instead of spinning on p->on_cpu to
4153 : * let the waker make forward progress. This is safe because IRQs are
4154 : * disabled and the IPI will deliver after on_cpu is cleared.
4155 : *
4156 : * Ensure we load task_cpu(p) after p->on_cpu:
4157 : *
4158 : * set_task_cpu(p, cpu);
4159 : * STORE p->cpu = @cpu
4160 : * __schedule() (switch to task 'p')
4161 : * LOCK rq->lock
4162 : * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
4163 : * STORE p->on_cpu = 1 LOAD p->cpu
4164 : *
4165 : * to ensure we observe the correct CPU on which the task is currently
4166 : * scheduling.
4167 : */
4168 : if (smp_load_acquire(&p->on_cpu) &&
4169 : ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
4170 : goto unlock;
4171 :
4172 : /*
4173 : * If the owning (remote) CPU is still in the middle of schedule() with
4174 : * this task as prev, wait until it's done referencing the task.
4175 : *
4176 : * Pairs with the smp_store_release() in finish_task().
4177 : *
4178 : * This ensures that tasks getting woken will be fully ordered against
4179 : * their previous state and preserve Program Order.
4180 : */
4181 : smp_cond_load_acquire(&p->on_cpu, !VAL);
4182 :
4183 : cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
4184 : if (task_cpu(p) != cpu) {
4185 : if (p->in_iowait) {
4186 : delayacct_blkio_end(p);
4187 : atomic_dec(&task_rq(p)->nr_iowait);
4188 : }
4189 :
4190 : wake_flags |= WF_MIGRATED;
4191 : psi_ttwu_dequeue(p);
4192 : set_task_cpu(p, cpu);
4193 : }
4194 : #else
4195 510 : cpu = task_cpu(p);
4196 : #endif /* CONFIG_SMP */
4197 :
4198 510 : ttwu_queue(p, cpu, wake_flags);
4199 : unlock:
4200 1210 : raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4201 : out:
4202 : if (success)
4203 : ttwu_stat(p, task_cpu(p), wake_flags);
4204 606 : preempt_enable();
4205 :
4206 606 : return success;
4207 : }
4208 :
4209 : /**
4210 : * task_call_func - Invoke a function on task in fixed state
4211 : * @p: Process for which the function is to be invoked, can be @current.
4212 : * @func: Function to invoke.
4213 : * @arg: Argument to function.
4214 : *
4215 : * Fix the task in it's current state by avoiding wakeups and or rq operations
4216 : * and call @func(@arg) on it. This function can use ->on_rq and task_curr()
4217 : * to work out what the state is, if required. Given that @func can be invoked
4218 : * with a runqueue lock held, it had better be quite lightweight.
4219 : *
4220 : * Returns:
4221 : * Whatever @func returns
4222 : */
4223 0 : int task_call_func(struct task_struct *p, task_call_f func, void *arg)
4224 : {
4225 0 : struct rq *rq = NULL;
4226 : unsigned int state;
4227 : struct rq_flags rf;
4228 : int ret;
4229 :
4230 0 : raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
4231 :
4232 0 : state = READ_ONCE(p->__state);
4233 :
4234 : /*
4235 : * Ensure we load p->on_rq after p->__state, otherwise it would be
4236 : * possible to, falsely, observe p->on_rq == 0.
4237 : *
4238 : * See try_to_wake_up() for a longer comment.
4239 : */
4240 0 : smp_rmb();
4241 :
4242 : /*
4243 : * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when
4244 : * the task is blocked. Make sure to check @state since ttwu() can drop
4245 : * locks at the end, see ttwu_queue_wakelist().
4246 : */
4247 0 : if (state == TASK_RUNNING || state == TASK_WAKING || p->on_rq)
4248 0 : rq = __task_rq_lock(p, &rf);
4249 :
4250 : /*
4251 : * At this point the task is pinned; either:
4252 : * - blocked and we're holding off wakeups (pi->lock)
4253 : * - woken, and we're holding off enqueue (rq->lock)
4254 : * - queued, and we're holding off schedule (rq->lock)
4255 : * - running, and we're holding off de-schedule (rq->lock)
4256 : *
4257 : * The called function (@func) can use: task_curr(), p->on_rq and
4258 : * p->__state to differentiate between these states.
4259 : */
4260 0 : ret = func(p, arg);
4261 :
4262 0 : if (rq)
4263 : rq_unlock(rq, &rf);
4264 :
4265 0 : raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
4266 0 : return ret;
4267 : }
4268 :
4269 : /**
4270 : * wake_up_process - Wake up a specific process
4271 : * @p: The process to be woken up.
4272 : *
4273 : * Attempt to wake up the nominated process and move it to the set of runnable
4274 : * processes.
4275 : *
4276 : * Return: 1 if the process was woken up, 0 if it was already running.
4277 : *
4278 : * This function executes a full memory barrier before accessing the task state.
4279 : */
4280 605 : int wake_up_process(struct task_struct *p)
4281 : {
4282 605 : return try_to_wake_up(p, TASK_NORMAL, 0);
4283 : }
4284 : EXPORT_SYMBOL(wake_up_process);
4285 :
4286 1 : int wake_up_state(struct task_struct *p, unsigned int state)
4287 : {
4288 1 : return try_to_wake_up(p, state, 0);
4289 : }
4290 :
4291 : /*
4292 : * Perform scheduler related setup for a newly forked process p.
4293 : * p is forked by current.
4294 : *
4295 : * __sched_fork() is basic setup used by init_idle() too:
4296 : */
4297 108 : static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
4298 : {
4299 108 : p->on_rq = 0;
4300 :
4301 108 : p->se.on_rq = 0;
4302 108 : p->se.exec_start = 0;
4303 108 : p->se.sum_exec_runtime = 0;
4304 108 : p->se.prev_sum_exec_runtime = 0;
4305 108 : p->se.nr_migrations = 0;
4306 108 : p->se.vruntime = 0;
4307 216 : INIT_LIST_HEAD(&p->se.group_node);
4308 :
4309 : #ifdef CONFIG_FAIR_GROUP_SCHED
4310 : p->se.cfs_rq = NULL;
4311 : #endif
4312 :
4313 : #ifdef CONFIG_SCHEDSTATS
4314 : /* Even if schedstat is disabled, there should not be garbage */
4315 : memset(&p->stats, 0, sizeof(p->stats));
4316 : #endif
4317 :
4318 108 : RB_CLEAR_NODE(&p->dl.rb_node);
4319 108 : init_dl_task_timer(&p->dl);
4320 108 : init_dl_inactive_task_timer(&p->dl);
4321 108 : __dl_clear_params(p);
4322 :
4323 216 : INIT_LIST_HEAD(&p->rt.run_list);
4324 108 : p->rt.timeout = 0;
4325 108 : p->rt.time_slice = sched_rr_timeslice;
4326 108 : p->rt.on_rq = 0;
4327 108 : p->rt.on_list = 0;
4328 :
4329 : #ifdef CONFIG_PREEMPT_NOTIFIERS
4330 : INIT_HLIST_HEAD(&p->preempt_notifiers);
4331 : #endif
4332 :
4333 : #ifdef CONFIG_COMPACTION
4334 108 : p->capture_control = NULL;
4335 : #endif
4336 108 : init_numa_balancing(clone_flags, p);
4337 : #ifdef CONFIG_SMP
4338 : p->wake_entry.u_flags = CSD_TYPE_TTWU;
4339 : p->migration_pending = NULL;
4340 : #endif
4341 108 : }
4342 :
4343 : DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
4344 :
4345 : #ifdef CONFIG_NUMA_BALANCING
4346 :
4347 : int sysctl_numa_balancing_mode;
4348 :
4349 : static void __set_numabalancing_state(bool enabled)
4350 : {
4351 : if (enabled)
4352 : static_branch_enable(&sched_numa_balancing);
4353 : else
4354 : static_branch_disable(&sched_numa_balancing);
4355 : }
4356 :
4357 : void set_numabalancing_state(bool enabled)
4358 : {
4359 : if (enabled)
4360 : sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL;
4361 : else
4362 : sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED;
4363 : __set_numabalancing_state(enabled);
4364 : }
4365 :
4366 : #ifdef CONFIG_PROC_SYSCTL
4367 : int sysctl_numa_balancing(struct ctl_table *table, int write,
4368 : void *buffer, size_t *lenp, loff_t *ppos)
4369 : {
4370 : struct ctl_table t;
4371 : int err;
4372 : int state = sysctl_numa_balancing_mode;
4373 :
4374 : if (write && !capable(CAP_SYS_ADMIN))
4375 : return -EPERM;
4376 :
4377 : t = *table;
4378 : t.data = &state;
4379 : err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
4380 : if (err < 0)
4381 : return err;
4382 : if (write) {
4383 : sysctl_numa_balancing_mode = state;
4384 : __set_numabalancing_state(state);
4385 : }
4386 : return err;
4387 : }
4388 : #endif
4389 : #endif
4390 :
4391 : #ifdef CONFIG_SCHEDSTATS
4392 :
4393 : DEFINE_STATIC_KEY_FALSE(sched_schedstats);
4394 :
4395 : static void set_schedstats(bool enabled)
4396 : {
4397 : if (enabled)
4398 : static_branch_enable(&sched_schedstats);
4399 : else
4400 : static_branch_disable(&sched_schedstats);
4401 : }
4402 :
4403 : void force_schedstat_enabled(void)
4404 : {
4405 : if (!schedstat_enabled()) {
4406 : pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
4407 : static_branch_enable(&sched_schedstats);
4408 : }
4409 : }
4410 :
4411 : static int __init setup_schedstats(char *str)
4412 : {
4413 : int ret = 0;
4414 : if (!str)
4415 : goto out;
4416 :
4417 : if (!strcmp(str, "enable")) {
4418 : set_schedstats(true);
4419 : ret = 1;
4420 : } else if (!strcmp(str, "disable")) {
4421 : set_schedstats(false);
4422 : ret = 1;
4423 : }
4424 : out:
4425 : if (!ret)
4426 : pr_warn("Unable to parse schedstats=\n");
4427 :
4428 : return ret;
4429 : }
4430 : __setup("schedstats=", setup_schedstats);
4431 :
4432 : #ifdef CONFIG_PROC_SYSCTL
4433 : int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
4434 : size_t *lenp, loff_t *ppos)
4435 : {
4436 : struct ctl_table t;
4437 : int err;
4438 : int state = static_branch_likely(&sched_schedstats);
4439 :
4440 : if (write && !capable(CAP_SYS_ADMIN))
4441 : return -EPERM;
4442 :
4443 : t = *table;
4444 : t.data = &state;
4445 : err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
4446 : if (err < 0)
4447 : return err;
4448 : if (write)
4449 : set_schedstats(state);
4450 : return err;
4451 : }
4452 : #endif /* CONFIG_PROC_SYSCTL */
4453 : #endif /* CONFIG_SCHEDSTATS */
4454 :
4455 : /*
4456 : * fork()/clone()-time setup:
4457 : */
4458 107 : int sched_fork(unsigned long clone_flags, struct task_struct *p)
4459 : {
4460 107 : __sched_fork(clone_flags, p);
4461 : /*
4462 : * We mark the process as NEW here. This guarantees that
4463 : * nobody will actually run it, and a signal or other external
4464 : * event cannot wake it up and insert it on the runqueue either.
4465 : */
4466 107 : p->__state = TASK_NEW;
4467 :
4468 : /*
4469 : * Make sure we do not leak PI boosting priority to the child.
4470 : */
4471 107 : p->prio = current->normal_prio;
4472 :
4473 107 : uclamp_fork(p);
4474 :
4475 : /*
4476 : * Revert to default priority/policy on fork if requested.
4477 : */
4478 107 : if (unlikely(p->sched_reset_on_fork)) {
4479 0 : if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
4480 0 : p->policy = SCHED_NORMAL;
4481 0 : p->static_prio = NICE_TO_PRIO(0);
4482 0 : p->rt_priority = 0;
4483 0 : } else if (PRIO_TO_NICE(p->static_prio) < 0)
4484 0 : p->static_prio = NICE_TO_PRIO(0);
4485 :
4486 0 : p->prio = p->normal_prio = p->static_prio;
4487 0 : set_load_weight(p, false);
4488 :
4489 : /*
4490 : * We don't need the reset flag anymore after the fork. It has
4491 : * fulfilled its duty:
4492 : */
4493 0 : p->sched_reset_on_fork = 0;
4494 : }
4495 :
4496 214 : if (dl_prio(p->prio))
4497 : return -EAGAIN;
4498 214 : else if (rt_prio(p->prio))
4499 0 : p->sched_class = &rt_sched_class;
4500 : else
4501 107 : p->sched_class = &fair_sched_class;
4502 :
4503 107 : init_entity_runnable_average(&p->se);
4504 :
4505 :
4506 : #ifdef CONFIG_SCHED_INFO
4507 : if (likely(sched_info_on()))
4508 : memset(&p->sched_info, 0, sizeof(p->sched_info));
4509 : #endif
4510 : #if defined(CONFIG_SMP)
4511 : p->on_cpu = 0;
4512 : #endif
4513 107 : init_task_preempt_count(p);
4514 : #ifdef CONFIG_SMP
4515 : plist_node_init(&p->pushable_tasks, MAX_PRIO);
4516 : RB_CLEAR_NODE(&p->pushable_dl_tasks);
4517 : #endif
4518 107 : return 0;
4519 : }
4520 :
4521 107 : void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
4522 : {
4523 : unsigned long flags;
4524 :
4525 : /*
4526 : * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
4527 : * required yet, but lockdep gets upset if rules are violated.
4528 : */
4529 107 : raw_spin_lock_irqsave(&p->pi_lock, flags);
4530 : #ifdef CONFIG_CGROUP_SCHED
4531 : if (1) {
4532 : struct task_group *tg;
4533 : tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
4534 : struct task_group, css);
4535 : tg = autogroup_task_group(p, tg);
4536 : p->sched_task_group = tg;
4537 : }
4538 : #endif
4539 107 : rseq_migrate(p);
4540 : /*
4541 : * We're setting the CPU for the first time, we don't migrate,
4542 : * so use __set_task_cpu().
4543 : */
4544 107 : __set_task_cpu(p, smp_processor_id());
4545 107 : if (p->sched_class->task_fork)
4546 107 : p->sched_class->task_fork(p);
4547 214 : raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4548 107 : }
4549 :
4550 107 : void sched_post_fork(struct task_struct *p)
4551 : {
4552 107 : uclamp_post_fork(p);
4553 107 : }
4554 :
4555 3 : unsigned long to_ratio(u64 period, u64 runtime)
4556 : {
4557 3 : if (runtime == RUNTIME_INF)
4558 : return BW_UNIT;
4559 :
4560 : /*
4561 : * Doing this here saves a lot of checks in all
4562 : * the calling paths, and returning zero seems
4563 : * safe for them anyway.
4564 : */
4565 3 : if (period == 0)
4566 : return 0;
4567 :
4568 6 : return div64_u64(runtime << BW_SHIFT, period);
4569 : }
4570 :
4571 : /*
4572 : * wake_up_new_task - wake up a newly created task for the first time.
4573 : *
4574 : * This function will do some initial scheduler statistics housekeeping
4575 : * that must be done for every newly created context, then puts the task
4576 : * on the runqueue and wakes it.
4577 : */
4578 107 : void wake_up_new_task(struct task_struct *p)
4579 : {
4580 : struct rq_flags rf;
4581 : struct rq *rq;
4582 :
4583 107 : raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
4584 107 : WRITE_ONCE(p->__state, TASK_RUNNING);
4585 : #ifdef CONFIG_SMP
4586 : /*
4587 : * Fork balancing, do it here and not earlier because:
4588 : * - cpus_ptr can change in the fork path
4589 : * - any previously selected CPU might disappear through hotplug
4590 : *
4591 : * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
4592 : * as we're not fully set-up yet.
4593 : */
4594 : p->recent_used_cpu = task_cpu(p);
4595 : rseq_migrate(p);
4596 : __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK));
4597 : #endif
4598 107 : rq = __task_rq_lock(p, &rf);
4599 107 : update_rq_clock(rq);
4600 107 : post_init_entity_util_avg(p);
4601 :
4602 107 : activate_task(rq, p, ENQUEUE_NOCLOCK);
4603 107 : trace_sched_wakeup_new(p);
4604 107 : check_preempt_curr(rq, p, WF_FORK);
4605 : #ifdef CONFIG_SMP
4606 : if (p->sched_class->task_woken) {
4607 : /*
4608 : * Nothing relies on rq->lock after this, so it's fine to
4609 : * drop it.
4610 : */
4611 : rq_unpin_lock(rq, &rf);
4612 : p->sched_class->task_woken(rq, p);
4613 : rq_repin_lock(rq, &rf);
4614 : }
4615 : #endif
4616 214 : task_rq_unlock(rq, p, &rf);
4617 107 : }
4618 :
4619 : #ifdef CONFIG_PREEMPT_NOTIFIERS
4620 :
4621 : static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
4622 :
4623 : void preempt_notifier_inc(void)
4624 : {
4625 : static_branch_inc(&preempt_notifier_key);
4626 : }
4627 : EXPORT_SYMBOL_GPL(preempt_notifier_inc);
4628 :
4629 : void preempt_notifier_dec(void)
4630 : {
4631 : static_branch_dec(&preempt_notifier_key);
4632 : }
4633 : EXPORT_SYMBOL_GPL(preempt_notifier_dec);
4634 :
4635 : /**
4636 : * preempt_notifier_register - tell me when current is being preempted & rescheduled
4637 : * @notifier: notifier struct to register
4638 : */
4639 : void preempt_notifier_register(struct preempt_notifier *notifier)
4640 : {
4641 : if (!static_branch_unlikely(&preempt_notifier_key))
4642 : WARN(1, "registering preempt_notifier while notifiers disabled\n");
4643 :
4644 : hlist_add_head(¬ifier->link, ¤t->preempt_notifiers);
4645 : }
4646 : EXPORT_SYMBOL_GPL(preempt_notifier_register);
4647 :
4648 : /**
4649 : * preempt_notifier_unregister - no longer interested in preemption notifications
4650 : * @notifier: notifier struct to unregister
4651 : *
4652 : * This is *not* safe to call from within a preemption notifier.
4653 : */
4654 : void preempt_notifier_unregister(struct preempt_notifier *notifier)
4655 : {
4656 : hlist_del(¬ifier->link);
4657 : }
4658 : EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
4659 :
4660 : static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
4661 : {
4662 : struct preempt_notifier *notifier;
4663 :
4664 : hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
4665 : notifier->ops->sched_in(notifier, raw_smp_processor_id());
4666 : }
4667 :
4668 : static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
4669 : {
4670 : if (static_branch_unlikely(&preempt_notifier_key))
4671 : __fire_sched_in_preempt_notifiers(curr);
4672 : }
4673 :
4674 : static void
4675 : __fire_sched_out_preempt_notifiers(struct task_struct *curr,
4676 : struct task_struct *next)
4677 : {
4678 : struct preempt_notifier *notifier;
4679 :
4680 : hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
4681 : notifier->ops->sched_out(notifier, next);
4682 : }
4683 :
4684 : static __always_inline void
4685 : fire_sched_out_preempt_notifiers(struct task_struct *curr,
4686 : struct task_struct *next)
4687 : {
4688 : if (static_branch_unlikely(&preempt_notifier_key))
4689 : __fire_sched_out_preempt_notifiers(curr, next);
4690 : }
4691 :
4692 : #else /* !CONFIG_PREEMPT_NOTIFIERS */
4693 :
4694 : static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
4695 : {
4696 : }
4697 :
4698 : static inline void
4699 : fire_sched_out_preempt_notifiers(struct task_struct *curr,
4700 : struct task_struct *next)
4701 : {
4702 : }
4703 :
4704 : #endif /* CONFIG_PREEMPT_NOTIFIERS */
4705 :
4706 : static inline void prepare_task(struct task_struct *next)
4707 : {
4708 : #ifdef CONFIG_SMP
4709 : /*
4710 : * Claim the task as running, we do this before switching to it
4711 : * such that any running task will have this set.
4712 : *
4713 : * See the ttwu() WF_ON_CPU case and its ordering comment.
4714 : */
4715 : WRITE_ONCE(next->on_cpu, 1);
4716 : #endif
4717 : }
4718 :
4719 : static inline void finish_task(struct task_struct *prev)
4720 : {
4721 : #ifdef CONFIG_SMP
4722 : /*
4723 : * This must be the very last reference to @prev from this CPU. After
4724 : * p->on_cpu is cleared, the task can be moved to a different CPU. We
4725 : * must ensure this doesn't happen until the switch is completely
4726 : * finished.
4727 : *
4728 : * In particular, the load of prev->state in finish_task_switch() must
4729 : * happen before this.
4730 : *
4731 : * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
4732 : */
4733 : smp_store_release(&prev->on_cpu, 0);
4734 : #endif
4735 : }
4736 :
4737 : #ifdef CONFIG_SMP
4738 :
4739 : static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
4740 : {
4741 : void (*func)(struct rq *rq);
4742 : struct callback_head *next;
4743 :
4744 : lockdep_assert_rq_held(rq);
4745 :
4746 : while (head) {
4747 : func = (void (*)(struct rq *))head->func;
4748 : next = head->next;
4749 : head->next = NULL;
4750 : head = next;
4751 :
4752 : func(rq);
4753 : }
4754 : }
4755 :
4756 : static void balance_push(struct rq *rq);
4757 :
4758 : struct callback_head balance_push_callback = {
4759 : .next = NULL,
4760 : .func = (void (*)(struct callback_head *))balance_push,
4761 : };
4762 :
4763 : static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4764 : {
4765 : struct callback_head *head = rq->balance_callback;
4766 :
4767 : lockdep_assert_rq_held(rq);
4768 : if (head)
4769 : rq->balance_callback = NULL;
4770 :
4771 : return head;
4772 : }
4773 :
4774 : static void __balance_callbacks(struct rq *rq)
4775 : {
4776 : do_balance_callbacks(rq, splice_balance_callbacks(rq));
4777 : }
4778 :
4779 : static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4780 : {
4781 : unsigned long flags;
4782 :
4783 : if (unlikely(head)) {
4784 : raw_spin_rq_lock_irqsave(rq, flags);
4785 : do_balance_callbacks(rq, head);
4786 : raw_spin_rq_unlock_irqrestore(rq, flags);
4787 : }
4788 : }
4789 :
4790 : #else
4791 :
4792 : static inline void __balance_callbacks(struct rq *rq)
4793 : {
4794 : }
4795 :
4796 : static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
4797 : {
4798 : return NULL;
4799 : }
4800 :
4801 : static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
4802 : {
4803 : }
4804 :
4805 : #endif
4806 :
4807 : static inline void
4808 : prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
4809 : {
4810 : /*
4811 : * Since the runqueue lock will be released by the next
4812 : * task (which is an invalid locking op but in the case
4813 : * of the scheduler it's an obvious special-case), so we
4814 : * do an early lockdep release here:
4815 : */
4816 618 : rq_unpin_lock(rq, rf);
4817 : spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_);
4818 : #ifdef CONFIG_DEBUG_SPINLOCK
4819 : /* this is a valid case when another task releases the spinlock */
4820 : rq_lockp(rq)->owner = next;
4821 : #endif
4822 : }
4823 :
4824 : static inline void finish_lock_switch(struct rq *rq)
4825 : {
4826 : /*
4827 : * If we are tracking spinlock dependencies then we have to
4828 : * fix up the runqueue lock - which gets 'carried over' from
4829 : * prev into current:
4830 : */
4831 : spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
4832 618 : __balance_callbacks(rq);
4833 618 : raw_spin_rq_unlock_irq(rq);
4834 : }
4835 :
4836 : /*
4837 : * NOP if the arch has not defined these:
4838 : */
4839 :
4840 : #ifndef prepare_arch_switch
4841 : # define prepare_arch_switch(next) do { } while (0)
4842 : #endif
4843 :
4844 : #ifndef finish_arch_post_lock_switch
4845 : # define finish_arch_post_lock_switch() do { } while (0)
4846 : #endif
4847 :
4848 : static inline void kmap_local_sched_out(void)
4849 : {
4850 : #ifdef CONFIG_KMAP_LOCAL
4851 : if (unlikely(current->kmap_ctrl.idx))
4852 : __kmap_local_sched_out();
4853 : #endif
4854 : }
4855 :
4856 : static inline void kmap_local_sched_in(void)
4857 : {
4858 : #ifdef CONFIG_KMAP_LOCAL
4859 : if (unlikely(current->kmap_ctrl.idx))
4860 : __kmap_local_sched_in();
4861 : #endif
4862 : }
4863 :
4864 : /**
4865 : * prepare_task_switch - prepare to switch tasks
4866 : * @rq: the runqueue preparing to switch
4867 : * @prev: the current task that is being switched out
4868 : * @next: the task we are going to switch to.
4869 : *
4870 : * This is called with the rq lock held and interrupts off. It must
4871 : * be paired with a subsequent finish_task_switch after the context
4872 : * switch.
4873 : *
4874 : * prepare_task_switch sets up locking and calls architecture specific
4875 : * hooks.
4876 : */
4877 : static inline void
4878 : prepare_task_switch(struct rq *rq, struct task_struct *prev,
4879 : struct task_struct *next)
4880 : {
4881 : kcov_prepare_switch(prev);
4882 : sched_info_switch(rq, prev, next);
4883 : perf_event_task_sched_out(prev, next);
4884 : rseq_preempt(prev);
4885 : fire_sched_out_preempt_notifiers(prev, next);
4886 : kmap_local_sched_out();
4887 : prepare_task(next);
4888 : prepare_arch_switch(next);
4889 : }
4890 :
4891 : /**
4892 : * finish_task_switch - clean up after a task-switch
4893 : * @prev: the thread we just switched away from.
4894 : *
4895 : * finish_task_switch must be called after the context switch, paired
4896 : * with a prepare_task_switch call before the context switch.
4897 : * finish_task_switch will reconcile locking set up by prepare_task_switch,
4898 : * and do any other architecture-specific cleanup actions.
4899 : *
4900 : * Note that we may have delayed dropping an mm in context_switch(). If
4901 : * so, we finish that here outside of the runqueue lock. (Doing it
4902 : * with the lock held can cause deadlocks; see schedule() for
4903 : * details.)
4904 : *
4905 : * The context switch have flipped the stack from under us and restored the
4906 : * local variables which were saved when this task called schedule() in the
4907 : * past. prev == current is still correct but we need to recalculate this_rq
4908 : * because prev may have moved to another CPU.
4909 : */
4910 618 : static struct rq *finish_task_switch(struct task_struct *prev)
4911 : __releases(rq->lock)
4912 : {
4913 618 : struct rq *rq = this_rq();
4914 618 : struct mm_struct *mm = rq->prev_mm;
4915 : unsigned int prev_state;
4916 :
4917 : /*
4918 : * The previous task will have left us with a preempt_count of 2
4919 : * because it left us after:
4920 : *
4921 : * schedule()
4922 : * preempt_disable(); // 1
4923 : * __schedule()
4924 : * raw_spin_lock_irq(&rq->lock) // 2
4925 : *
4926 : * Also, see FORK_PREEMPT_COUNT.
4927 : */
4928 618 : if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
4929 : "corrupted preempt_count: %s/%d/0x%x\n",
4930 : current->comm, current->pid, preempt_count()))
4931 : preempt_count_set(FORK_PREEMPT_COUNT);
4932 :
4933 618 : rq->prev_mm = NULL;
4934 :
4935 : /*
4936 : * A task struct has one reference for the use as "current".
4937 : * If a task dies, then it sets TASK_DEAD in tsk->state and calls
4938 : * schedule one last time. The schedule call will never return, and
4939 : * the scheduled task must drop that reference.
4940 : *
4941 : * We must observe prev->state before clearing prev->on_cpu (in
4942 : * finish_task), otherwise a concurrent wakeup can get prev
4943 : * running on another CPU and we could rave with its RUNNING -> DEAD
4944 : * transition, resulting in a double drop.
4945 : */
4946 618 : prev_state = READ_ONCE(prev->__state);
4947 618 : vtime_task_switch(prev);
4948 618 : perf_event_task_sched_in(prev, current);
4949 618 : finish_task(prev);
4950 : tick_nohz_task_switch();
4951 618 : finish_lock_switch(rq);
4952 : finish_arch_post_lock_switch();
4953 618 : kcov_finish_switch(current);
4954 : /*
4955 : * kmap_local_sched_out() is invoked with rq::lock held and
4956 : * interrupts disabled. There is no requirement for that, but the
4957 : * sched out code does not have an interrupt enabled section.
4958 : * Restoring the maps on sched in does not require interrupts being
4959 : * disabled either.
4960 : */
4961 : kmap_local_sched_in();
4962 :
4963 618 : fire_sched_in_preempt_notifiers(current);
4964 : /*
4965 : * When switching through a kernel thread, the loop in
4966 : * membarrier_{private,global}_expedited() may have observed that
4967 : * kernel thread and not issued an IPI. It is therefore possible to
4968 : * schedule between user->kernel->user threads without passing though
4969 : * switch_mm(). Membarrier requires a barrier after storing to
4970 : * rq->curr, before returning to userspace, so provide them here:
4971 : *
4972 : * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
4973 : * provided by mmdrop(),
4974 : * - a sync_core for SYNC_CORE.
4975 : */
4976 618 : if (mm) {
4977 0 : membarrier_mm_sync_core_before_usermode(mm);
4978 : mmdrop_sched(mm);
4979 : }
4980 618 : if (unlikely(prev_state == TASK_DEAD)) {
4981 93 : if (prev->sched_class->task_dead)
4982 0 : prev->sched_class->task_dead(prev);
4983 :
4984 : /* Task is done with its stack. */
4985 93 : put_task_stack(prev);
4986 :
4987 93 : put_task_struct_rcu_user(prev);
4988 : }
4989 :
4990 618 : return rq;
4991 : }
4992 :
4993 : /**
4994 : * schedule_tail - first thing a freshly forked thread must call.
4995 : * @prev: the thread we just switched away from.
4996 : */
4997 107 : asmlinkage __visible void schedule_tail(struct task_struct *prev)
4998 : __releases(rq->lock)
4999 : {
5000 : /*
5001 : * New tasks start with FORK_PREEMPT_COUNT, see there and
5002 : * finish_task_switch() for details.
5003 : *
5004 : * finish_task_switch() will drop rq->lock() and lower preempt_count
5005 : * and the preempt_enable() will end up enabling preemption (on
5006 : * PREEMPT_COUNT kernels).
5007 : */
5008 :
5009 107 : finish_task_switch(prev);
5010 107 : preempt_enable();
5011 :
5012 107 : if (current->set_child_tid)
5013 0 : put_user(task_pid_vnr(current), current->set_child_tid);
5014 :
5015 107 : calculate_sigpending();
5016 107 : }
5017 :
5018 : /*
5019 : * context_switch - switch to the new MM and the new thread's register state.
5020 : */
5021 : static __always_inline struct rq *
5022 : context_switch(struct rq *rq, struct task_struct *prev,
5023 : struct task_struct *next, struct rq_flags *rf)
5024 : {
5025 618 : prepare_task_switch(rq, prev, next);
5026 :
5027 : /*
5028 : * For paravirt, this is coupled with an exit in switch_to to
5029 : * combine the page table reload and the switch backend into
5030 : * one hypercall.
5031 : */
5032 : arch_start_context_switch(prev);
5033 :
5034 : /*
5035 : * kernel -> kernel lazy + transfer active
5036 : * user -> kernel lazy + mmgrab() active
5037 : *
5038 : * kernel -> user switch + mmdrop() active
5039 : * user -> user switch
5040 : */
5041 618 : if (!next->mm) { // to kernel
5042 618 : enter_lazy_tlb(prev->active_mm, next);
5043 :
5044 618 : next->active_mm = prev->active_mm;
5045 618 : if (prev->mm) // from user
5046 0 : mmgrab(prev->active_mm);
5047 : else
5048 618 : prev->active_mm = NULL;
5049 : } else { // to user
5050 0 : membarrier_switch_mm(rq, prev->active_mm, next->mm);
5051 : /*
5052 : * sys_membarrier() requires an smp_mb() between setting
5053 : * rq->curr / membarrier_switch_mm() and returning to userspace.
5054 : *
5055 : * The below provides this either through switch_mm(), or in
5056 : * case 'prev->active_mm == next->mm' through
5057 : * finish_task_switch()'s mmdrop().
5058 : */
5059 0 : switch_mm_irqs_off(prev->active_mm, next->mm, next);
5060 :
5061 0 : if (!prev->mm) { // from kernel
5062 : /* will mmdrop() in finish_task_switch(). */
5063 0 : rq->prev_mm = prev->active_mm;
5064 0 : prev->active_mm = NULL;
5065 : }
5066 : }
5067 :
5068 618 : rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
5069 :
5070 1236 : prepare_lock_switch(rq, next, rf);
5071 :
5072 : /* Here we just switch the register state and the stack. */
5073 618 : switch_to(prev, next, prev);
5074 511 : barrier();
5075 :
5076 511 : return finish_task_switch(prev);
5077 : }
5078 :
5079 : /*
5080 : * nr_running and nr_context_switches:
5081 : *
5082 : * externally visible scheduler statistics: current number of runnable
5083 : * threads, total number of context switches performed since bootup.
5084 : */
5085 0 : unsigned int nr_running(void)
5086 : {
5087 0 : unsigned int i, sum = 0;
5088 :
5089 0 : for_each_online_cpu(i)
5090 0 : sum += cpu_rq(i)->nr_running;
5091 :
5092 0 : return sum;
5093 : }
5094 :
5095 : /*
5096 : * Check if only the current task is running on the CPU.
5097 : *
5098 : * Caution: this function does not check that the caller has disabled
5099 : * preemption, thus the result might have a time-of-check-to-time-of-use
5100 : * race. The caller is responsible to use it correctly, for example:
5101 : *
5102 : * - from a non-preemptible section (of course)
5103 : *
5104 : * - from a thread that is bound to a single CPU
5105 : *
5106 : * - in a loop with very short iterations (e.g. a polling loop)
5107 : */
5108 0 : bool single_task_running(void)
5109 : {
5110 0 : return raw_rq()->nr_running == 1;
5111 : }
5112 : EXPORT_SYMBOL(single_task_running);
5113 :
5114 0 : unsigned long long nr_context_switches(void)
5115 : {
5116 : int i;
5117 0 : unsigned long long sum = 0;
5118 :
5119 0 : for_each_possible_cpu(i)
5120 0 : sum += cpu_rq(i)->nr_switches;
5121 :
5122 0 : return sum;
5123 : }
5124 :
5125 : /*
5126 : * Consumers of these two interfaces, like for example the cpuidle menu
5127 : * governor, are using nonsensical data. Preferring shallow idle state selection
5128 : * for a CPU that has IO-wait which might not even end up running the task when
5129 : * it does become runnable.
5130 : */
5131 :
5132 0 : unsigned int nr_iowait_cpu(int cpu)
5133 : {
5134 0 : return atomic_read(&cpu_rq(cpu)->nr_iowait);
5135 : }
5136 :
5137 : /*
5138 : * IO-wait accounting, and how it's mostly bollocks (on SMP).
5139 : *
5140 : * The idea behind IO-wait account is to account the idle time that we could
5141 : * have spend running if it were not for IO. That is, if we were to improve the
5142 : * storage performance, we'd have a proportional reduction in IO-wait time.
5143 : *
5144 : * This all works nicely on UP, where, when a task blocks on IO, we account
5145 : * idle time as IO-wait, because if the storage were faster, it could've been
5146 : * running and we'd not be idle.
5147 : *
5148 : * This has been extended to SMP, by doing the same for each CPU. This however
5149 : * is broken.
5150 : *
5151 : * Imagine for instance the case where two tasks block on one CPU, only the one
5152 : * CPU will have IO-wait accounted, while the other has regular idle. Even
5153 : * though, if the storage were faster, both could've ran at the same time,
5154 : * utilising both CPUs.
5155 : *
5156 : * This means, that when looking globally, the current IO-wait accounting on
5157 : * SMP is a lower bound, by reason of under accounting.
5158 : *
5159 : * Worse, since the numbers are provided per CPU, they are sometimes
5160 : * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
5161 : * associated with any one particular CPU, it can wake to another CPU than it
5162 : * blocked on. This means the per CPU IO-wait number is meaningless.
5163 : *
5164 : * Task CPU affinities can make all that even more 'interesting'.
5165 : */
5166 :
5167 0 : unsigned int nr_iowait(void)
5168 : {
5169 0 : unsigned int i, sum = 0;
5170 :
5171 0 : for_each_possible_cpu(i)
5172 0 : sum += nr_iowait_cpu(i);
5173 :
5174 0 : return sum;
5175 : }
5176 :
5177 : #ifdef CONFIG_SMP
5178 :
5179 : /*
5180 : * sched_exec - execve() is a valuable balancing opportunity, because at
5181 : * this point the task has the smallest effective memory and cache footprint.
5182 : */
5183 : void sched_exec(void)
5184 : {
5185 : struct task_struct *p = current;
5186 : unsigned long flags;
5187 : int dest_cpu;
5188 :
5189 : raw_spin_lock_irqsave(&p->pi_lock, flags);
5190 : dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
5191 : if (dest_cpu == smp_processor_id())
5192 : goto unlock;
5193 :
5194 : if (likely(cpu_active(dest_cpu))) {
5195 : struct migration_arg arg = { p, dest_cpu };
5196 :
5197 : raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5198 : stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
5199 : return;
5200 : }
5201 : unlock:
5202 : raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5203 : }
5204 :
5205 : #endif
5206 :
5207 : DEFINE_PER_CPU(struct kernel_stat, kstat);
5208 : DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
5209 :
5210 : EXPORT_PER_CPU_SYMBOL(kstat);
5211 : EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
5212 :
5213 : /*
5214 : * The function fair_sched_class.update_curr accesses the struct curr
5215 : * and its field curr->exec_start; when called from task_sched_runtime(),
5216 : * we observe a high rate of cache misses in practice.
5217 : * Prefetching this data results in improved performance.
5218 : */
5219 : static inline void prefetch_curr_exec_start(struct task_struct *p)
5220 : {
5221 : #ifdef CONFIG_FAIR_GROUP_SCHED
5222 : struct sched_entity *curr = (&p->se)->cfs_rq->curr;
5223 : #else
5224 0 : struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
5225 : #endif
5226 0 : prefetch(curr);
5227 0 : prefetch(&curr->exec_start);
5228 : }
5229 :
5230 : /*
5231 : * Return accounted runtime for the task.
5232 : * In case the task is currently running, return the runtime plus current's
5233 : * pending runtime that have not been accounted yet.
5234 : */
5235 0 : unsigned long long task_sched_runtime(struct task_struct *p)
5236 : {
5237 : struct rq_flags rf;
5238 : struct rq *rq;
5239 : u64 ns;
5240 :
5241 : #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
5242 : /*
5243 : * 64-bit doesn't need locks to atomically read a 64-bit value.
5244 : * So we have a optimization chance when the task's delta_exec is 0.
5245 : * Reading ->on_cpu is racy, but this is ok.
5246 : *
5247 : * If we race with it leaving CPU, we'll take a lock. So we're correct.
5248 : * If we race with it entering CPU, unaccounted time is 0. This is
5249 : * indistinguishable from the read occurring a few cycles earlier.
5250 : * If we see ->on_cpu without ->on_rq, the task is leaving, and has
5251 : * been accounted, so we're correct here as well.
5252 : */
5253 : if (!p->on_cpu || !task_on_rq_queued(p))
5254 : return p->se.sum_exec_runtime;
5255 : #endif
5256 :
5257 0 : rq = task_rq_lock(p, &rf);
5258 : /*
5259 : * Must be ->curr _and_ ->on_rq. If dequeued, we would
5260 : * project cycles that may never be accounted to this
5261 : * thread, breaking clock_gettime().
5262 : */
5263 0 : if (task_current(rq, p) && task_on_rq_queued(p)) {
5264 0 : prefetch_curr_exec_start(p);
5265 0 : update_rq_clock(rq);
5266 0 : p->sched_class->update_curr(rq);
5267 : }
5268 0 : ns = p->se.sum_exec_runtime;
5269 0 : task_rq_unlock(rq, p, &rf);
5270 :
5271 0 : return ns;
5272 : }
5273 :
5274 : #ifdef CONFIG_SCHED_DEBUG
5275 0 : static u64 cpu_resched_latency(struct rq *rq)
5276 : {
5277 0 : int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
5278 0 : u64 resched_latency, now = rq_clock(rq);
5279 : static bool warned_once;
5280 :
5281 0 : if (sysctl_resched_latency_warn_once && warned_once)
5282 : return 0;
5283 :
5284 0 : if (!need_resched() || !latency_warn_ms)
5285 : return 0;
5286 :
5287 0 : if (system_state == SYSTEM_BOOTING)
5288 : return 0;
5289 :
5290 0 : if (!rq->last_seen_need_resched_ns) {
5291 0 : rq->last_seen_need_resched_ns = now;
5292 0 : rq->ticks_without_resched = 0;
5293 0 : return 0;
5294 : }
5295 :
5296 0 : rq->ticks_without_resched++;
5297 0 : resched_latency = now - rq->last_seen_need_resched_ns;
5298 0 : if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
5299 : return 0;
5300 :
5301 0 : warned_once = true;
5302 :
5303 0 : return resched_latency;
5304 : }
5305 :
5306 0 : static int __init setup_resched_latency_warn_ms(char *str)
5307 : {
5308 : long val;
5309 :
5310 0 : if ((kstrtol(str, 0, &val))) {
5311 0 : pr_warn("Unable to set resched_latency_warn_ms\n");
5312 0 : return 1;
5313 : }
5314 :
5315 0 : sysctl_resched_latency_warn_ms = val;
5316 0 : return 1;
5317 : }
5318 : __setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
5319 : #else
5320 : static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
5321 : #endif /* CONFIG_SCHED_DEBUG */
5322 :
5323 : /*
5324 : * This function gets called by the timer code, with HZ frequency.
5325 : * We call it with interrupts disabled.
5326 : */
5327 13 : void scheduler_tick(void)
5328 : {
5329 13 : int cpu = smp_processor_id();
5330 13 : struct rq *rq = cpu_rq(cpu);
5331 13 : struct task_struct *curr = rq->curr;
5332 : struct rq_flags rf;
5333 : unsigned long thermal_pressure;
5334 : u64 resched_latency;
5335 :
5336 : arch_scale_freq_tick();
5337 13 : sched_clock_tick();
5338 :
5339 13 : rq_lock(rq, &rf);
5340 :
5341 13 : update_rq_clock(rq);
5342 13 : thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
5343 13 : update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
5344 13 : curr->sched_class->task_tick(rq, curr, 0);
5345 13 : if (sched_feat(LATENCY_WARN))
5346 0 : resched_latency = cpu_resched_latency(rq);
5347 13 : calc_global_load_tick(rq);
5348 13 : sched_core_tick(rq);
5349 :
5350 13 : rq_unlock(rq, &rf);
5351 :
5352 13 : if (sched_feat(LATENCY_WARN) && resched_latency)
5353 0 : resched_latency_warn(cpu, resched_latency);
5354 :
5355 : perf_event_task_tick();
5356 :
5357 : #ifdef CONFIG_SMP
5358 : rq->idle_balance = idle_cpu(cpu);
5359 : trigger_load_balance(rq);
5360 : #endif
5361 13 : }
5362 :
5363 : #ifdef CONFIG_NO_HZ_FULL
5364 :
5365 : struct tick_work {
5366 : int cpu;
5367 : atomic_t state;
5368 : struct delayed_work work;
5369 : };
5370 : /* Values for ->state, see diagram below. */
5371 : #define TICK_SCHED_REMOTE_OFFLINE 0
5372 : #define TICK_SCHED_REMOTE_OFFLINING 1
5373 : #define TICK_SCHED_REMOTE_RUNNING 2
5374 :
5375 : /*
5376 : * State diagram for ->state:
5377 : *
5378 : *
5379 : * TICK_SCHED_REMOTE_OFFLINE
5380 : * | ^
5381 : * | |
5382 : * | | sched_tick_remote()
5383 : * | |
5384 : * | |
5385 : * +--TICK_SCHED_REMOTE_OFFLINING
5386 : * | ^
5387 : * | |
5388 : * sched_tick_start() | | sched_tick_stop()
5389 : * | |
5390 : * V |
5391 : * TICK_SCHED_REMOTE_RUNNING
5392 : *
5393 : *
5394 : * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
5395 : * and sched_tick_start() are happy to leave the state in RUNNING.
5396 : */
5397 :
5398 : static struct tick_work __percpu *tick_work_cpu;
5399 :
5400 : static void sched_tick_remote(struct work_struct *work)
5401 : {
5402 : struct delayed_work *dwork = to_delayed_work(work);
5403 : struct tick_work *twork = container_of(dwork, struct tick_work, work);
5404 : int cpu = twork->cpu;
5405 : struct rq *rq = cpu_rq(cpu);
5406 : struct task_struct *curr;
5407 : struct rq_flags rf;
5408 : u64 delta;
5409 : int os;
5410 :
5411 : /*
5412 : * Handle the tick only if it appears the remote CPU is running in full
5413 : * dynticks mode. The check is racy by nature, but missing a tick or
5414 : * having one too much is no big deal because the scheduler tick updates
5415 : * statistics and checks timeslices in a time-independent way, regardless
5416 : * of when exactly it is running.
5417 : */
5418 : if (!tick_nohz_tick_stopped_cpu(cpu))
5419 : goto out_requeue;
5420 :
5421 : rq_lock_irq(rq, &rf);
5422 : curr = rq->curr;
5423 : if (cpu_is_offline(cpu))
5424 : goto out_unlock;
5425 :
5426 : update_rq_clock(rq);
5427 :
5428 : if (!is_idle_task(curr)) {
5429 : /*
5430 : * Make sure the next tick runs within a reasonable
5431 : * amount of time.
5432 : */
5433 : delta = rq_clock_task(rq) - curr->se.exec_start;
5434 : WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
5435 : }
5436 : curr->sched_class->task_tick(rq, curr, 0);
5437 :
5438 : calc_load_nohz_remote(rq);
5439 : out_unlock:
5440 : rq_unlock_irq(rq, &rf);
5441 : out_requeue:
5442 :
5443 : /*
5444 : * Run the remote tick once per second (1Hz). This arbitrary
5445 : * frequency is large enough to avoid overload but short enough
5446 : * to keep scheduler internal stats reasonably up to date. But
5447 : * first update state to reflect hotplug activity if required.
5448 : */
5449 : os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
5450 : WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
5451 : if (os == TICK_SCHED_REMOTE_RUNNING)
5452 : queue_delayed_work(system_unbound_wq, dwork, HZ);
5453 : }
5454 :
5455 : static void sched_tick_start(int cpu)
5456 : {
5457 : int os;
5458 : struct tick_work *twork;
5459 :
5460 : if (housekeeping_cpu(cpu, HK_TYPE_TICK))
5461 : return;
5462 :
5463 : WARN_ON_ONCE(!tick_work_cpu);
5464 :
5465 : twork = per_cpu_ptr(tick_work_cpu, cpu);
5466 : os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
5467 : WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
5468 : if (os == TICK_SCHED_REMOTE_OFFLINE) {
5469 : twork->cpu = cpu;
5470 : INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
5471 : queue_delayed_work(system_unbound_wq, &twork->work, HZ);
5472 : }
5473 : }
5474 :
5475 : #ifdef CONFIG_HOTPLUG_CPU
5476 : static void sched_tick_stop(int cpu)
5477 : {
5478 : struct tick_work *twork;
5479 : int os;
5480 :
5481 : if (housekeeping_cpu(cpu, HK_TYPE_TICK))
5482 : return;
5483 :
5484 : WARN_ON_ONCE(!tick_work_cpu);
5485 :
5486 : twork = per_cpu_ptr(tick_work_cpu, cpu);
5487 : /* There cannot be competing actions, but don't rely on stop-machine. */
5488 : os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
5489 : WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
5490 : /* Don't cancel, as this would mess up the state machine. */
5491 : }
5492 : #endif /* CONFIG_HOTPLUG_CPU */
5493 :
5494 : int __init sched_tick_offload_init(void)
5495 : {
5496 : tick_work_cpu = alloc_percpu(struct tick_work);
5497 : BUG_ON(!tick_work_cpu);
5498 : return 0;
5499 : }
5500 :
5501 : #else /* !CONFIG_NO_HZ_FULL */
5502 : static inline void sched_tick_start(int cpu) { }
5503 : static inline void sched_tick_stop(int cpu) { }
5504 : #endif
5505 :
5506 : #if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
5507 : defined(CONFIG_TRACE_PREEMPT_TOGGLE))
5508 : /*
5509 : * If the value passed in is equal to the current preempt count
5510 : * then we just disabled preemption. Start timing the latency.
5511 : */
5512 : static inline void preempt_latency_start(int val)
5513 : {
5514 : if (preempt_count() == val) {
5515 : unsigned long ip = get_lock_parent_ip();
5516 : #ifdef CONFIG_DEBUG_PREEMPT
5517 : current->preempt_disable_ip = ip;
5518 : #endif
5519 : trace_preempt_off(CALLER_ADDR0, ip);
5520 : }
5521 : }
5522 :
5523 : void preempt_count_add(int val)
5524 : {
5525 : #ifdef CONFIG_DEBUG_PREEMPT
5526 : /*
5527 : * Underflow?
5528 : */
5529 : if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
5530 : return;
5531 : #endif
5532 : __preempt_count_add(val);
5533 : #ifdef CONFIG_DEBUG_PREEMPT
5534 : /*
5535 : * Spinlock count overflowing soon?
5536 : */
5537 : DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
5538 : PREEMPT_MASK - 10);
5539 : #endif
5540 : preempt_latency_start(val);
5541 : }
5542 : EXPORT_SYMBOL(preempt_count_add);
5543 : NOKPROBE_SYMBOL(preempt_count_add);
5544 :
5545 : /*
5546 : * If the value passed in equals to the current preempt count
5547 : * then we just enabled preemption. Stop timing the latency.
5548 : */
5549 : static inline void preempt_latency_stop(int val)
5550 : {
5551 : if (preempt_count() == val)
5552 : trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
5553 : }
5554 :
5555 : void preempt_count_sub(int val)
5556 : {
5557 : #ifdef CONFIG_DEBUG_PREEMPT
5558 : /*
5559 : * Underflow?
5560 : */
5561 : if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
5562 : return;
5563 : /*
5564 : * Is the spinlock portion underflowing?
5565 : */
5566 : if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
5567 : !(preempt_count() & PREEMPT_MASK)))
5568 : return;
5569 : #endif
5570 :
5571 : preempt_latency_stop(val);
5572 : __preempt_count_sub(val);
5573 : }
5574 : EXPORT_SYMBOL(preempt_count_sub);
5575 : NOKPROBE_SYMBOL(preempt_count_sub);
5576 :
5577 : #else
5578 : static inline void preempt_latency_start(int val) { }
5579 : static inline void preempt_latency_stop(int val) { }
5580 : #endif
5581 :
5582 : static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
5583 : {
5584 : #ifdef CONFIG_DEBUG_PREEMPT
5585 : return p->preempt_disable_ip;
5586 : #else
5587 : return 0;
5588 : #endif
5589 : }
5590 :
5591 : /*
5592 : * Print scheduling while atomic bug:
5593 : */
5594 0 : static noinline void __schedule_bug(struct task_struct *prev)
5595 : {
5596 : /* Save this before calling printk(), since that will clobber it */
5597 0 : unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
5598 :
5599 0 : if (oops_in_progress)
5600 : return;
5601 :
5602 0 : printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
5603 : prev->comm, prev->pid, preempt_count());
5604 :
5605 0 : debug_show_held_locks(prev);
5606 : print_modules();
5607 0 : if (irqs_disabled())
5608 : print_irqtrace_events(prev);
5609 : if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
5610 : && in_atomic_preempt_off()) {
5611 : pr_err("Preemption disabled at:");
5612 : print_ip_sym(KERN_ERR, preempt_disable_ip);
5613 : }
5614 0 : if (panic_on_warn)
5615 0 : panic("scheduling while atomic\n");
5616 :
5617 0 : dump_stack();
5618 0 : add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
5619 : }
5620 :
5621 : /*
5622 : * Various schedule()-time debugging checks and statistics:
5623 : */
5624 : static inline void schedule_debug(struct task_struct *prev, bool preempt)
5625 : {
5626 : #ifdef CONFIG_SCHED_STACK_END_CHECK
5627 : if (task_stack_end_corrupted(prev))
5628 : panic("corrupted stack end detected inside scheduler\n");
5629 :
5630 : if (task_scs_end_corrupted(prev))
5631 : panic("corrupted shadow stack detected inside scheduler\n");
5632 : #endif
5633 :
5634 : #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
5635 : if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) {
5636 : printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
5637 : prev->comm, prev->pid, prev->non_block_count);
5638 : dump_stack();
5639 : add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
5640 : }
5641 : #endif
5642 :
5643 618 : if (unlikely(in_atomic_preempt_off())) {
5644 0 : __schedule_bug(prev);
5645 : preempt_count_set(PREEMPT_DISABLED);
5646 : }
5647 : rcu_sleep_check();
5648 618 : SCHED_WARN_ON(ct_state() == CONTEXT_USER);
5649 :
5650 618 : profile_hit(SCHED_PROFILING, __builtin_return_address(0));
5651 :
5652 : schedstat_inc(this_rq()->sched_count);
5653 : }
5654 :
5655 : static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
5656 : struct rq_flags *rf)
5657 : {
5658 : #ifdef CONFIG_SMP
5659 : const struct sched_class *class;
5660 : /*
5661 : * We must do the balancing pass before put_prev_task(), such
5662 : * that when we release the rq->lock the task is in the same
5663 : * state as before we took rq->lock.
5664 : *
5665 : * We can terminate the balance pass as soon as we know there is
5666 : * a runnable task of @class priority or higher.
5667 : */
5668 : for_class_range(class, prev->sched_class, &idle_sched_class) {
5669 : if (class->balance(rq, prev, rf))
5670 : break;
5671 : }
5672 : #endif
5673 :
5674 0 : put_prev_task(rq, prev);
5675 : }
5676 :
5677 : /*
5678 : * Pick up the highest-prio task:
5679 : */
5680 : static inline struct task_struct *
5681 618 : __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
5682 : {
5683 : const struct sched_class *class;
5684 : struct task_struct *p;
5685 :
5686 : /*
5687 : * Optimization: we know that if all tasks are in the fair class we can
5688 : * call that function directly, but only if the @prev task wasn't of a
5689 : * higher scheduling class, because otherwise those lose the
5690 : * opportunity to pull in more work from other CPUs.
5691 : */
5692 618 : if (likely(prev->sched_class <= &fair_sched_class &&
5693 : rq->nr_running == rq->cfs.h_nr_running)) {
5694 :
5695 618 : p = pick_next_task_fair(rq, prev, rf);
5696 618 : if (unlikely(p == RETRY_TASK))
5697 : goto restart;
5698 :
5699 : /* Assume the next prioritized class is idle_sched_class */
5700 618 : if (!p) {
5701 1 : put_prev_task(rq, prev);
5702 1 : p = pick_next_task_idle(rq);
5703 : }
5704 :
5705 : return p;
5706 : }
5707 :
5708 : restart:
5709 0 : put_prev_task_balance(rq, prev, rf);
5710 :
5711 0 : for_each_class(class) {
5712 0 : p = class->pick_next_task(rq);
5713 0 : if (p)
5714 : return p;
5715 : }
5716 :
5717 0 : BUG(); /* The idle class should always have a runnable task. */
5718 : }
5719 :
5720 : #ifdef CONFIG_SCHED_CORE
5721 : static inline bool is_task_rq_idle(struct task_struct *t)
5722 : {
5723 : return (task_rq(t)->idle == t);
5724 : }
5725 :
5726 : static inline bool cookie_equals(struct task_struct *a, unsigned long cookie)
5727 : {
5728 : return is_task_rq_idle(a) || (a->core_cookie == cookie);
5729 : }
5730 :
5731 : static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
5732 : {
5733 : if (is_task_rq_idle(a) || is_task_rq_idle(b))
5734 : return true;
5735 :
5736 : return a->core_cookie == b->core_cookie;
5737 : }
5738 :
5739 : static inline struct task_struct *pick_task(struct rq *rq)
5740 : {
5741 : const struct sched_class *class;
5742 : struct task_struct *p;
5743 :
5744 : for_each_class(class) {
5745 : p = class->pick_task(rq);
5746 : if (p)
5747 : return p;
5748 : }
5749 :
5750 : BUG(); /* The idle class should always have a runnable task. */
5751 : }
5752 :
5753 : extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
5754 :
5755 : static void queue_core_balance(struct rq *rq);
5756 :
5757 : static struct task_struct *
5758 : pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
5759 : {
5760 : struct task_struct *next, *p, *max = NULL;
5761 : const struct cpumask *smt_mask;
5762 : bool fi_before = false;
5763 : bool core_clock_updated = (rq == rq->core);
5764 : unsigned long cookie;
5765 : int i, cpu, occ = 0;
5766 : struct rq *rq_i;
5767 : bool need_sync;
5768 :
5769 : if (!sched_core_enabled(rq))
5770 : return __pick_next_task(rq, prev, rf);
5771 :
5772 : cpu = cpu_of(rq);
5773 :
5774 : /* Stopper task is switching into idle, no need core-wide selection. */
5775 : if (cpu_is_offline(cpu)) {
5776 : /*
5777 : * Reset core_pick so that we don't enter the fastpath when
5778 : * coming online. core_pick would already be migrated to
5779 : * another cpu during offline.
5780 : */
5781 : rq->core_pick = NULL;
5782 : return __pick_next_task(rq, prev, rf);
5783 : }
5784 :
5785 : /*
5786 : * If there were no {en,de}queues since we picked (IOW, the task
5787 : * pointers are all still valid), and we haven't scheduled the last
5788 : * pick yet, do so now.
5789 : *
5790 : * rq->core_pick can be NULL if no selection was made for a CPU because
5791 : * it was either offline or went offline during a sibling's core-wide
5792 : * selection. In this case, do a core-wide selection.
5793 : */
5794 : if (rq->core->core_pick_seq == rq->core->core_task_seq &&
5795 : rq->core->core_pick_seq != rq->core_sched_seq &&
5796 : rq->core_pick) {
5797 : WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
5798 :
5799 : next = rq->core_pick;
5800 : if (next != prev) {
5801 : put_prev_task(rq, prev);
5802 : set_next_task(rq, next);
5803 : }
5804 :
5805 : rq->core_pick = NULL;
5806 : goto out;
5807 : }
5808 :
5809 : put_prev_task_balance(rq, prev, rf);
5810 :
5811 : smt_mask = cpu_smt_mask(cpu);
5812 : need_sync = !!rq->core->core_cookie;
5813 :
5814 : /* reset state */
5815 : rq->core->core_cookie = 0UL;
5816 : if (rq->core->core_forceidle_count) {
5817 : if (!core_clock_updated) {
5818 : update_rq_clock(rq->core);
5819 : core_clock_updated = true;
5820 : }
5821 : sched_core_account_forceidle(rq);
5822 : /* reset after accounting force idle */
5823 : rq->core->core_forceidle_start = 0;
5824 : rq->core->core_forceidle_count = 0;
5825 : rq->core->core_forceidle_occupation = 0;
5826 : need_sync = true;
5827 : fi_before = true;
5828 : }
5829 :
5830 : /*
5831 : * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq
5832 : *
5833 : * @task_seq guards the task state ({en,de}queues)
5834 : * @pick_seq is the @task_seq we did a selection on
5835 : * @sched_seq is the @pick_seq we scheduled
5836 : *
5837 : * However, preemptions can cause multiple picks on the same task set.
5838 : * 'Fix' this by also increasing @task_seq for every pick.
5839 : */
5840 : rq->core->core_task_seq++;
5841 :
5842 : /*
5843 : * Optimize for common case where this CPU has no cookies
5844 : * and there are no cookied tasks running on siblings.
5845 : */
5846 : if (!need_sync) {
5847 : next = pick_task(rq);
5848 : if (!next->core_cookie) {
5849 : rq->core_pick = NULL;
5850 : /*
5851 : * For robustness, update the min_vruntime_fi for
5852 : * unconstrained picks as well.
5853 : */
5854 : WARN_ON_ONCE(fi_before);
5855 : task_vruntime_update(rq, next, false);
5856 : goto out_set_next;
5857 : }
5858 : }
5859 :
5860 : /*
5861 : * For each thread: do the regular task pick and find the max prio task
5862 : * amongst them.
5863 : *
5864 : * Tie-break prio towards the current CPU
5865 : */
5866 : for_each_cpu_wrap(i, smt_mask, cpu) {
5867 : rq_i = cpu_rq(i);
5868 :
5869 : /*
5870 : * Current cpu always has its clock updated on entrance to
5871 : * pick_next_task(). If the current cpu is not the core,
5872 : * the core may also have been updated above.
5873 : */
5874 : if (i != cpu && (rq_i != rq->core || !core_clock_updated))
5875 : update_rq_clock(rq_i);
5876 :
5877 : p = rq_i->core_pick = pick_task(rq_i);
5878 : if (!max || prio_less(max, p, fi_before))
5879 : max = p;
5880 : }
5881 :
5882 : cookie = rq->core->core_cookie = max->core_cookie;
5883 :
5884 : /*
5885 : * For each thread: try and find a runnable task that matches @max or
5886 : * force idle.
5887 : */
5888 : for_each_cpu(i, smt_mask) {
5889 : rq_i = cpu_rq(i);
5890 : p = rq_i->core_pick;
5891 :
5892 : if (!cookie_equals(p, cookie)) {
5893 : p = NULL;
5894 : if (cookie)
5895 : p = sched_core_find(rq_i, cookie);
5896 : if (!p)
5897 : p = idle_sched_class.pick_task(rq_i);
5898 : }
5899 :
5900 : rq_i->core_pick = p;
5901 :
5902 : if (p == rq_i->idle) {
5903 : if (rq_i->nr_running) {
5904 : rq->core->core_forceidle_count++;
5905 : if (!fi_before)
5906 : rq->core->core_forceidle_seq++;
5907 : }
5908 : } else {
5909 : occ++;
5910 : }
5911 : }
5912 :
5913 : if (schedstat_enabled() && rq->core->core_forceidle_count) {
5914 : rq->core->core_forceidle_start = rq_clock(rq->core);
5915 : rq->core->core_forceidle_occupation = occ;
5916 : }
5917 :
5918 : rq->core->core_pick_seq = rq->core->core_task_seq;
5919 : next = rq->core_pick;
5920 : rq->core_sched_seq = rq->core->core_pick_seq;
5921 :
5922 : /* Something should have been selected for current CPU */
5923 : WARN_ON_ONCE(!next);
5924 :
5925 : /*
5926 : * Reschedule siblings
5927 : *
5928 : * NOTE: L1TF -- at this point we're no longer running the old task and
5929 : * sending an IPI (below) ensures the sibling will no longer be running
5930 : * their task. This ensures there is no inter-sibling overlap between
5931 : * non-matching user state.
5932 : */
5933 : for_each_cpu(i, smt_mask) {
5934 : rq_i = cpu_rq(i);
5935 :
5936 : /*
5937 : * An online sibling might have gone offline before a task
5938 : * could be picked for it, or it might be offline but later
5939 : * happen to come online, but its too late and nothing was
5940 : * picked for it. That's Ok - it will pick tasks for itself,
5941 : * so ignore it.
5942 : */
5943 : if (!rq_i->core_pick)
5944 : continue;
5945 :
5946 : /*
5947 : * Update for new !FI->FI transitions, or if continuing to be in !FI:
5948 : * fi_before fi update?
5949 : * 0 0 1
5950 : * 0 1 1
5951 : * 1 0 1
5952 : * 1 1 0
5953 : */
5954 : if (!(fi_before && rq->core->core_forceidle_count))
5955 : task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);
5956 :
5957 : rq_i->core_pick->core_occupation = occ;
5958 :
5959 : if (i == cpu) {
5960 : rq_i->core_pick = NULL;
5961 : continue;
5962 : }
5963 :
5964 : /* Did we break L1TF mitigation requirements? */
5965 : WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
5966 :
5967 : if (rq_i->curr == rq_i->core_pick) {
5968 : rq_i->core_pick = NULL;
5969 : continue;
5970 : }
5971 :
5972 : resched_curr(rq_i);
5973 : }
5974 :
5975 : out_set_next:
5976 : set_next_task(rq, next);
5977 : out:
5978 : if (rq->core->core_forceidle_count && next == rq->idle)
5979 : queue_core_balance(rq);
5980 :
5981 : return next;
5982 : }
5983 :
5984 : static bool try_steal_cookie(int this, int that)
5985 : {
5986 : struct rq *dst = cpu_rq(this), *src = cpu_rq(that);
5987 : struct task_struct *p;
5988 : unsigned long cookie;
5989 : bool success = false;
5990 :
5991 : local_irq_disable();
5992 : double_rq_lock(dst, src);
5993 :
5994 : cookie = dst->core->core_cookie;
5995 : if (!cookie)
5996 : goto unlock;
5997 :
5998 : if (dst->curr != dst->idle)
5999 : goto unlock;
6000 :
6001 : p = sched_core_find(src, cookie);
6002 : if (p == src->idle)
6003 : goto unlock;
6004 :
6005 : do {
6006 : if (p == src->core_pick || p == src->curr)
6007 : goto next;
6008 :
6009 : if (!is_cpu_allowed(p, this))
6010 : goto next;
6011 :
6012 : if (p->core_occupation > dst->idle->core_occupation)
6013 : goto next;
6014 :
6015 : deactivate_task(src, p, 0);
6016 : set_task_cpu(p, this);
6017 : activate_task(dst, p, 0);
6018 :
6019 : resched_curr(dst);
6020 :
6021 : success = true;
6022 : break;
6023 :
6024 : next:
6025 : p = sched_core_next(p, cookie);
6026 : } while (p);
6027 :
6028 : unlock:
6029 : double_rq_unlock(dst, src);
6030 : local_irq_enable();
6031 :
6032 : return success;
6033 : }
6034 :
6035 : static bool steal_cookie_task(int cpu, struct sched_domain *sd)
6036 : {
6037 : int i;
6038 :
6039 : for_each_cpu_wrap(i, sched_domain_span(sd), cpu) {
6040 : if (i == cpu)
6041 : continue;
6042 :
6043 : if (need_resched())
6044 : break;
6045 :
6046 : if (try_steal_cookie(cpu, i))
6047 : return true;
6048 : }
6049 :
6050 : return false;
6051 : }
6052 :
6053 : static void sched_core_balance(struct rq *rq)
6054 : {
6055 : struct sched_domain *sd;
6056 : int cpu = cpu_of(rq);
6057 :
6058 : preempt_disable();
6059 : rcu_read_lock();
6060 : raw_spin_rq_unlock_irq(rq);
6061 : for_each_domain(cpu, sd) {
6062 : if (need_resched())
6063 : break;
6064 :
6065 : if (steal_cookie_task(cpu, sd))
6066 : break;
6067 : }
6068 : raw_spin_rq_lock_irq(rq);
6069 : rcu_read_unlock();
6070 : preempt_enable();
6071 : }
6072 :
6073 : static DEFINE_PER_CPU(struct callback_head, core_balance_head);
6074 :
6075 : static void queue_core_balance(struct rq *rq)
6076 : {
6077 : if (!sched_core_enabled(rq))
6078 : return;
6079 :
6080 : if (!rq->core->core_cookie)
6081 : return;
6082 :
6083 : if (!rq->nr_running) /* not forced idle */
6084 : return;
6085 :
6086 : queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
6087 : }
6088 :
6089 : static void sched_core_cpu_starting(unsigned int cpu)
6090 : {
6091 : const struct cpumask *smt_mask = cpu_smt_mask(cpu);
6092 : struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
6093 : unsigned long flags;
6094 : int t;
6095 :
6096 : sched_core_lock(cpu, &flags);
6097 :
6098 : WARN_ON_ONCE(rq->core != rq);
6099 :
6100 : /* if we're the first, we'll be our own leader */
6101 : if (cpumask_weight(smt_mask) == 1)
6102 : goto unlock;
6103 :
6104 : /* find the leader */
6105 : for_each_cpu(t, smt_mask) {
6106 : if (t == cpu)
6107 : continue;
6108 : rq = cpu_rq(t);
6109 : if (rq->core == rq) {
6110 : core_rq = rq;
6111 : break;
6112 : }
6113 : }
6114 :
6115 : if (WARN_ON_ONCE(!core_rq)) /* whoopsie */
6116 : goto unlock;
6117 :
6118 : /* install and validate core_rq */
6119 : for_each_cpu(t, smt_mask) {
6120 : rq = cpu_rq(t);
6121 :
6122 : if (t == cpu)
6123 : rq->core = core_rq;
6124 :
6125 : WARN_ON_ONCE(rq->core != core_rq);
6126 : }
6127 :
6128 : unlock:
6129 : sched_core_unlock(cpu, &flags);
6130 : }
6131 :
6132 : static void sched_core_cpu_deactivate(unsigned int cpu)
6133 : {
6134 : const struct cpumask *smt_mask = cpu_smt_mask(cpu);
6135 : struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
6136 : unsigned long flags;
6137 : int t;
6138 :
6139 : sched_core_lock(cpu, &flags);
6140 :
6141 : /* if we're the last man standing, nothing to do */
6142 : if (cpumask_weight(smt_mask) == 1) {
6143 : WARN_ON_ONCE(rq->core != rq);
6144 : goto unlock;
6145 : }
6146 :
6147 : /* if we're not the leader, nothing to do */
6148 : if (rq->core != rq)
6149 : goto unlock;
6150 :
6151 : /* find a new leader */
6152 : for_each_cpu(t, smt_mask) {
6153 : if (t == cpu)
6154 : continue;
6155 : core_rq = cpu_rq(t);
6156 : break;
6157 : }
6158 :
6159 : if (WARN_ON_ONCE(!core_rq)) /* impossible */
6160 : goto unlock;
6161 :
6162 : /* copy the shared state to the new leader */
6163 : core_rq->core_task_seq = rq->core_task_seq;
6164 : core_rq->core_pick_seq = rq->core_pick_seq;
6165 : core_rq->core_cookie = rq->core_cookie;
6166 : core_rq->core_forceidle_count = rq->core_forceidle_count;
6167 : core_rq->core_forceidle_seq = rq->core_forceidle_seq;
6168 : core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
6169 :
6170 : /*
6171 : * Accounting edge for forced idle is handled in pick_next_task().
6172 : * Don't need another one here, since the hotplug thread shouldn't
6173 : * have a cookie.
6174 : */
6175 : core_rq->core_forceidle_start = 0;
6176 :
6177 : /* install new leader */
6178 : for_each_cpu(t, smt_mask) {
6179 : rq = cpu_rq(t);
6180 : rq->core = core_rq;
6181 : }
6182 :
6183 : unlock:
6184 : sched_core_unlock(cpu, &flags);
6185 : }
6186 :
6187 : static inline void sched_core_cpu_dying(unsigned int cpu)
6188 : {
6189 : struct rq *rq = cpu_rq(cpu);
6190 :
6191 : if (rq->core != rq)
6192 : rq->core = rq;
6193 : }
6194 :
6195 : #else /* !CONFIG_SCHED_CORE */
6196 :
6197 : static inline void sched_core_cpu_starting(unsigned int cpu) {}
6198 : static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
6199 : static inline void sched_core_cpu_dying(unsigned int cpu) {}
6200 :
6201 : static struct task_struct *
6202 : pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
6203 : {
6204 618 : return __pick_next_task(rq, prev, rf);
6205 : }
6206 :
6207 : #endif /* CONFIG_SCHED_CORE */
6208 :
6209 : /*
6210 : * Constants for the sched_mode argument of __schedule().
6211 : *
6212 : * The mode argument allows RT enabled kernels to differentiate a
6213 : * preemption from blocking on an 'sleeping' spin/rwlock. Note that
6214 : * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
6215 : * optimize the AND operation out and just check for zero.
6216 : */
6217 : #define SM_NONE 0x0
6218 : #define SM_PREEMPT 0x1
6219 : #define SM_RTLOCK_WAIT 0x2
6220 :
6221 : #ifndef CONFIG_PREEMPT_RT
6222 : # define SM_MASK_PREEMPT (~0U)
6223 : #else
6224 : # define SM_MASK_PREEMPT SM_PREEMPT
6225 : #endif
6226 :
6227 : /*
6228 : * __schedule() is the main scheduler function.
6229 : *
6230 : * The main means of driving the scheduler and thus entering this function are:
6231 : *
6232 : * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
6233 : *
6234 : * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
6235 : * paths. For example, see arch/x86/entry_64.S.
6236 : *
6237 : * To drive preemption between tasks, the scheduler sets the flag in timer
6238 : * interrupt handler scheduler_tick().
6239 : *
6240 : * 3. Wakeups don't really cause entry into schedule(). They add a
6241 : * task to the run-queue and that's it.
6242 : *
6243 : * Now, if the new task added to the run-queue preempts the current
6244 : * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
6245 : * called on the nearest possible occasion:
6246 : *
6247 : * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
6248 : *
6249 : * - in syscall or exception context, at the next outmost
6250 : * preempt_enable(). (this might be as soon as the wake_up()'s
6251 : * spin_unlock()!)
6252 : *
6253 : * - in IRQ context, return from interrupt-handler to
6254 : * preemptible context
6255 : *
6256 : * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
6257 : * then at the next:
6258 : *
6259 : * - cond_resched() call
6260 : * - explicit schedule() call
6261 : * - return from syscall or exception to user-space
6262 : * - return from interrupt-handler to user-space
6263 : *
6264 : * WARNING: must be called with preemption disabled!
6265 : */
6266 618 : static void __sched notrace __schedule(unsigned int sched_mode)
6267 : {
6268 : struct task_struct *prev, *next;
6269 : unsigned long *switch_count;
6270 : unsigned long prev_state;
6271 : struct rq_flags rf;
6272 : struct rq *rq;
6273 : int cpu;
6274 :
6275 618 : cpu = smp_processor_id();
6276 618 : rq = cpu_rq(cpu);
6277 618 : prev = rq->curr;
6278 :
6279 1236 : schedule_debug(prev, !!sched_mode);
6280 :
6281 : if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
6282 : hrtick_clear(rq);
6283 :
6284 : local_irq_disable();
6285 618 : rcu_note_context_switch(!!sched_mode);
6286 :
6287 : /*
6288 : * Make sure that signal_pending_state()->signal_pending() below
6289 : * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
6290 : * done by the caller to avoid the race with signal_wake_up():
6291 : *
6292 : * __set_current_state(@state) signal_wake_up()
6293 : * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING)
6294 : * wake_up_state(p, state)
6295 : * LOCK rq->lock LOCK p->pi_state
6296 : * smp_mb__after_spinlock() smp_mb__after_spinlock()
6297 : * if (signal_pending_state()) if (p->state & @state)
6298 : *
6299 : * Also, the membarrier system call requires a full memory barrier
6300 : * after coming from user-space, before storing to rq->curr.
6301 : */
6302 618 : rq_lock(rq, &rf);
6303 : smp_mb__after_spinlock();
6304 :
6305 : /* Promote REQ to ACT */
6306 618 : rq->clock_update_flags <<= 1;
6307 618 : update_rq_clock(rq);
6308 :
6309 618 : switch_count = &prev->nivcsw;
6310 :
6311 : /*
6312 : * We must load prev->state once (task_struct::state is volatile), such
6313 : * that:
6314 : *
6315 : * - we form a control dependency vs deactivate_task() below.
6316 : * - ptrace_{,un}freeze_traced() can change ->state underneath us.
6317 : */
6318 618 : prev_state = READ_ONCE(prev->__state);
6319 618 : if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
6320 615 : if (signal_pending_state(prev_state, prev)) {
6321 0 : WRITE_ONCE(prev->__state, TASK_RUNNING);
6322 : } else {
6323 615 : prev->sched_contributes_to_load =
6324 : (prev_state & TASK_UNINTERRUPTIBLE) &&
6325 922 : !(prev_state & TASK_NOLOAD) &&
6326 307 : !(prev->flags & PF_FROZEN);
6327 :
6328 615 : if (prev->sched_contributes_to_load)
6329 307 : rq->nr_uninterruptible++;
6330 :
6331 : /*
6332 : * __schedule() ttwu()
6333 : * prev_state = prev->state; if (p->on_rq && ...)
6334 : * if (prev_state) goto out;
6335 : * p->on_rq = 0; smp_acquire__after_ctrl_dep();
6336 : * p->state = TASK_WAKING
6337 : *
6338 : * Where __schedule() and ttwu() have matching control dependencies.
6339 : *
6340 : * After this, schedule() must not care about p->state any more.
6341 : */
6342 615 : deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
6343 :
6344 615 : if (prev->in_iowait) {
6345 0 : atomic_inc(&rq->nr_iowait);
6346 : delayacct_blkio_start();
6347 : }
6348 : }
6349 615 : switch_count = &prev->nvcsw;
6350 : }
6351 :
6352 618 : next = pick_next_task(rq, prev, &rf);
6353 618 : clear_tsk_need_resched(prev);
6354 : clear_preempt_need_resched();
6355 : #ifdef CONFIG_SCHED_DEBUG
6356 618 : rq->last_seen_need_resched_ns = 0;
6357 : #endif
6358 :
6359 618 : if (likely(prev != next)) {
6360 618 : rq->nr_switches++;
6361 : /*
6362 : * RCU users of rcu_dereference(rq->curr) may not see
6363 : * changes to task_struct made by pick_next_task().
6364 : */
6365 618 : RCU_INIT_POINTER(rq->curr, next);
6366 : /*
6367 : * The membarrier system call requires each architecture
6368 : * to have a full memory barrier after updating
6369 : * rq->curr, before returning to user-space.
6370 : *
6371 : * Here are the schemes providing that barrier on the
6372 : * various architectures:
6373 : * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
6374 : * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
6375 : * - finish_lock_switch() for weakly-ordered
6376 : * architectures where spin_unlock is a full barrier,
6377 : * - switch_to() for arm64 (weakly-ordered, spin_unlock
6378 : * is a RELEASE barrier),
6379 : */
6380 618 : ++*switch_count;
6381 :
6382 618 : migrate_disable_switch(rq, prev);
6383 618 : psi_sched_switch(prev, next, !task_on_rq_queued(prev));
6384 :
6385 618 : trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
6386 :
6387 : /* Also unlocks the rq: */
6388 511 : rq = context_switch(rq, prev, next, &rf);
6389 : } else {
6390 0 : rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
6391 :
6392 0 : rq_unpin_lock(rq, &rf);
6393 0 : __balance_callbacks(rq);
6394 0 : raw_spin_rq_unlock_irq(rq);
6395 : }
6396 511 : }
6397 :
6398 93 : void __noreturn do_task_dead(void)
6399 : {
6400 : /* Causes final put_task_struct in finish_task_switch(): */
6401 465 : set_special_state(TASK_DEAD);
6402 :
6403 : /* Tell freezer to ignore us: */
6404 93 : current->flags |= PF_NOFREEZE;
6405 :
6406 93 : __schedule(SM_NONE);
6407 0 : BUG();
6408 :
6409 : /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
6410 : for (;;)
6411 : cpu_relax();
6412 : }
6413 :
6414 523 : static inline void sched_submit_work(struct task_struct *tsk)
6415 : {
6416 : unsigned int task_flags;
6417 :
6418 523 : if (task_is_running(tsk))
6419 : return;
6420 :
6421 522 : task_flags = tsk->flags;
6422 : /*
6423 : * If a worker goes to sleep, notify and ask workqueue whether it
6424 : * wants to wake up a task to maintain concurrency.
6425 : */
6426 522 : if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
6427 14 : if (task_flags & PF_WQ_WORKER)
6428 14 : wq_worker_sleeping(tsk);
6429 : else
6430 0 : io_wq_worker_sleeping(tsk);
6431 : }
6432 :
6433 522 : if (tsk_is_pi_blocked(tsk))
6434 : return;
6435 :
6436 : /*
6437 : * If we are going to sleep and we have plugged IO queued,
6438 : * make sure to submit it to avoid deadlocks.
6439 : */
6440 522 : blk_flush_plug(tsk->plug, true);
6441 : }
6442 :
6443 509 : static void sched_update_worker(struct task_struct *tsk)
6444 : {
6445 509 : if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
6446 6 : if (tsk->flags & PF_WQ_WORKER)
6447 6 : wq_worker_running(tsk);
6448 : else
6449 0 : io_wq_worker_running(tsk);
6450 : }
6451 509 : }
6452 :
6453 523 : asmlinkage __visible void __sched schedule(void)
6454 : {
6455 523 : struct task_struct *tsk = current;
6456 :
6457 523 : sched_submit_work(tsk);
6458 : do {
6459 523 : preempt_disable();
6460 523 : __schedule(SM_NONE);
6461 509 : sched_preempt_enable_no_resched();
6462 509 : } while (need_resched());
6463 509 : sched_update_worker(tsk);
6464 509 : }
6465 : EXPORT_SYMBOL(schedule);
6466 :
6467 : /*
6468 : * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
6469 : * state (have scheduled out non-voluntarily) by making sure that all
6470 : * tasks have either left the run queue or have gone into user space.
6471 : * As idle tasks do not do either, they must not ever be preempted
6472 : * (schedule out non-voluntarily).
6473 : *
6474 : * schedule_idle() is similar to schedule_preempt_disable() except that it
6475 : * never enables preemption because it does not call sched_submit_work().
6476 : */
6477 0 : void __sched schedule_idle(void)
6478 : {
6479 : /*
6480 : * As this skips calling sched_submit_work(), which the idle task does
6481 : * regardless because that function is a nop when the task is in a
6482 : * TASK_RUNNING state, make sure this isn't used someplace that the
6483 : * current task can be in any other state. Note, idle is always in the
6484 : * TASK_RUNNING state.
6485 : */
6486 0 : WARN_ON_ONCE(current->__state);
6487 : do {
6488 0 : __schedule(SM_NONE);
6489 0 : } while (need_resched());
6490 0 : }
6491 :
6492 : #if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK)
6493 : asmlinkage __visible void __sched schedule_user(void)
6494 : {
6495 : /*
6496 : * If we come here after a random call to set_need_resched(),
6497 : * or we have been woken up remotely but the IPI has not yet arrived,
6498 : * we haven't yet exited the RCU idle mode. Do it here manually until
6499 : * we find a better solution.
6500 : *
6501 : * NB: There are buggy callers of this function. Ideally we
6502 : * should warn if prev_state != CONTEXT_USER, but that will trigger
6503 : * too frequently to make sense yet.
6504 : */
6505 : enum ctx_state prev_state = exception_enter();
6506 : schedule();
6507 : exception_exit(prev_state);
6508 : }
6509 : #endif
6510 :
6511 : /**
6512 : * schedule_preempt_disabled - called with preemption disabled
6513 : *
6514 : * Returns with preemption disabled. Note: preempt_count must be 1
6515 : */
6516 107 : void __sched schedule_preempt_disabled(void)
6517 : {
6518 107 : sched_preempt_enable_no_resched();
6519 107 : schedule();
6520 106 : preempt_disable();
6521 106 : }
6522 :
6523 : #ifdef CONFIG_PREEMPT_RT
6524 : void __sched notrace schedule_rtlock(void)
6525 : {
6526 : do {
6527 : preempt_disable();
6528 : __schedule(SM_RTLOCK_WAIT);
6529 : sched_preempt_enable_no_resched();
6530 : } while (need_resched());
6531 : }
6532 : NOKPROBE_SYMBOL(schedule_rtlock);
6533 : #endif
6534 :
6535 : static void __sched notrace preempt_schedule_common(void)
6536 : {
6537 : do {
6538 : /*
6539 : * Because the function tracer can trace preempt_count_sub()
6540 : * and it also uses preempt_enable/disable_notrace(), if
6541 : * NEED_RESCHED is set, the preempt_enable_notrace() called
6542 : * by the function tracer will call this function again and
6543 : * cause infinite recursion.
6544 : *
6545 : * Preemption must be disabled here before the function
6546 : * tracer can trace. Break up preempt_disable() into two
6547 : * calls. One to disable preemption without fear of being
6548 : * traced. The other to still record the preemption latency,
6549 : * which can also be traced by the function tracer.
6550 : */
6551 2 : preempt_disable_notrace();
6552 2 : preempt_latency_start(1);
6553 2 : __schedule(SM_PREEMPT);
6554 2 : preempt_latency_stop(1);
6555 2 : preempt_enable_no_resched_notrace();
6556 :
6557 : /*
6558 : * Check again in case we missed a preemption opportunity
6559 : * between schedule and now.
6560 : */
6561 2 : } while (need_resched());
6562 : }
6563 :
6564 : #ifdef CONFIG_PREEMPTION
6565 : /*
6566 : * This is the entry point to schedule() from in-kernel preemption
6567 : * off of preempt_enable.
6568 : */
6569 : asmlinkage __visible void __sched notrace preempt_schedule(void)
6570 : {
6571 : /*
6572 : * If there is a non-zero preempt_count or interrupts are disabled,
6573 : * we do not want to preempt the current task. Just return..
6574 : */
6575 : if (likely(!preemptible()))
6576 : return;
6577 : preempt_schedule_common();
6578 : }
6579 : NOKPROBE_SYMBOL(preempt_schedule);
6580 : EXPORT_SYMBOL(preempt_schedule);
6581 :
6582 : #ifdef CONFIG_PREEMPT_DYNAMIC
6583 : #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
6584 : #ifndef preempt_schedule_dynamic_enabled
6585 : #define preempt_schedule_dynamic_enabled preempt_schedule
6586 : #define preempt_schedule_dynamic_disabled NULL
6587 : #endif
6588 : DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
6589 : EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
6590 : #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
6591 : static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule);
6592 : void __sched notrace dynamic_preempt_schedule(void)
6593 : {
6594 : if (!static_branch_unlikely(&sk_dynamic_preempt_schedule))
6595 : return;
6596 : preempt_schedule();
6597 : }
6598 : NOKPROBE_SYMBOL(dynamic_preempt_schedule);
6599 : EXPORT_SYMBOL(dynamic_preempt_schedule);
6600 : #endif
6601 : #endif
6602 :
6603 : /**
6604 : * preempt_schedule_notrace - preempt_schedule called by tracing
6605 : *
6606 : * The tracing infrastructure uses preempt_enable_notrace to prevent
6607 : * recursion and tracing preempt enabling caused by the tracing
6608 : * infrastructure itself. But as tracing can happen in areas coming
6609 : * from userspace or just about to enter userspace, a preempt enable
6610 : * can occur before user_exit() is called. This will cause the scheduler
6611 : * to be called when the system is still in usermode.
6612 : *
6613 : * To prevent this, the preempt_enable_notrace will use this function
6614 : * instead of preempt_schedule() to exit user context if needed before
6615 : * calling the scheduler.
6616 : */
6617 : asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
6618 : {
6619 : enum ctx_state prev_ctx;
6620 :
6621 : if (likely(!preemptible()))
6622 : return;
6623 :
6624 : do {
6625 : /*
6626 : * Because the function tracer can trace preempt_count_sub()
6627 : * and it also uses preempt_enable/disable_notrace(), if
6628 : * NEED_RESCHED is set, the preempt_enable_notrace() called
6629 : * by the function tracer will call this function again and
6630 : * cause infinite recursion.
6631 : *
6632 : * Preemption must be disabled here before the function
6633 : * tracer can trace. Break up preempt_disable() into two
6634 : * calls. One to disable preemption without fear of being
6635 : * traced. The other to still record the preemption latency,
6636 : * which can also be traced by the function tracer.
6637 : */
6638 : preempt_disable_notrace();
6639 : preempt_latency_start(1);
6640 : /*
6641 : * Needs preempt disabled in case user_exit() is traced
6642 : * and the tracer calls preempt_enable_notrace() causing
6643 : * an infinite recursion.
6644 : */
6645 : prev_ctx = exception_enter();
6646 : __schedule(SM_PREEMPT);
6647 : exception_exit(prev_ctx);
6648 :
6649 : preempt_latency_stop(1);
6650 : preempt_enable_no_resched_notrace();
6651 : } while (need_resched());
6652 : }
6653 : EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
6654 :
6655 : #ifdef CONFIG_PREEMPT_DYNAMIC
6656 : #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
6657 : #ifndef preempt_schedule_notrace_dynamic_enabled
6658 : #define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace
6659 : #define preempt_schedule_notrace_dynamic_disabled NULL
6660 : #endif
6661 : DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
6662 : EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
6663 : #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
6664 : static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace);
6665 : void __sched notrace dynamic_preempt_schedule_notrace(void)
6666 : {
6667 : if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace))
6668 : return;
6669 : preempt_schedule_notrace();
6670 : }
6671 : NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace);
6672 : EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
6673 : #endif
6674 : #endif
6675 :
6676 : #endif /* CONFIG_PREEMPTION */
6677 :
6678 : /*
6679 : * This is the entry point to schedule() from kernel preemption
6680 : * off of irq context.
6681 : * Note, that this is called and return with irqs disabled. This will
6682 : * protect us against recursive calling from irq.
6683 : */
6684 0 : asmlinkage __visible void __sched preempt_schedule_irq(void)
6685 : {
6686 : enum ctx_state prev_state;
6687 :
6688 : /* Catch callers which need to be fixed */
6689 0 : BUG_ON(preempt_count() || !irqs_disabled());
6690 :
6691 : prev_state = exception_enter();
6692 :
6693 : do {
6694 0 : preempt_disable();
6695 : local_irq_enable();
6696 0 : __schedule(SM_PREEMPT);
6697 : local_irq_disable();
6698 0 : sched_preempt_enable_no_resched();
6699 0 : } while (need_resched());
6700 :
6701 : exception_exit(prev_state);
6702 0 : }
6703 :
6704 0 : int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
6705 : void *key)
6706 : {
6707 0 : WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC);
6708 0 : return try_to_wake_up(curr->private, mode, wake_flags);
6709 : }
6710 : EXPORT_SYMBOL(default_wake_function);
6711 :
6712 : static void __setscheduler_prio(struct task_struct *p, int prio)
6713 : {
6714 0 : if (dl_prio(prio))
6715 0 : p->sched_class = &dl_sched_class;
6716 0 : else if (rt_prio(prio))
6717 0 : p->sched_class = &rt_sched_class;
6718 : else
6719 0 : p->sched_class = &fair_sched_class;
6720 :
6721 0 : p->prio = prio;
6722 : }
6723 :
6724 : #ifdef CONFIG_RT_MUTEXES
6725 :
6726 : static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
6727 : {
6728 0 : if (pi_task)
6729 0 : prio = min(prio, pi_task->prio);
6730 :
6731 : return prio;
6732 : }
6733 :
6734 : static inline int rt_effective_prio(struct task_struct *p, int prio)
6735 : {
6736 0 : struct task_struct *pi_task = rt_mutex_get_top_task(p);
6737 :
6738 0 : return __rt_effective_prio(pi_task, prio);
6739 : }
6740 :
6741 : /*
6742 : * rt_mutex_setprio - set the current priority of a task
6743 : * @p: task to boost
6744 : * @pi_task: donor task
6745 : *
6746 : * This function changes the 'effective' priority of a task. It does
6747 : * not touch ->normal_prio like __setscheduler().
6748 : *
6749 : * Used by the rt_mutex code to implement priority inheritance
6750 : * logic. Call site only calls if the priority of the task changed.
6751 : */
6752 0 : void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
6753 : {
6754 0 : int prio, oldprio, queued, running, queue_flag =
6755 : DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
6756 : const struct sched_class *prev_class;
6757 : struct rq_flags rf;
6758 : struct rq *rq;
6759 :
6760 : /* XXX used to be waiter->prio, not waiter->task->prio */
6761 0 : prio = __rt_effective_prio(pi_task, p->normal_prio);
6762 :
6763 : /*
6764 : * If nothing changed; bail early.
6765 : */
6766 0 : if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
6767 : return;
6768 :
6769 0 : rq = __task_rq_lock(p, &rf);
6770 0 : update_rq_clock(rq);
6771 : /*
6772 : * Set under pi_lock && rq->lock, such that the value can be used under
6773 : * either lock.
6774 : *
6775 : * Note that there is loads of tricky to make this pointer cache work
6776 : * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
6777 : * ensure a task is de-boosted (pi_task is set to NULL) before the
6778 : * task is allowed to run again (and can exit). This ensures the pointer
6779 : * points to a blocked task -- which guarantees the task is present.
6780 : */
6781 0 : p->pi_top_task = pi_task;
6782 :
6783 : /*
6784 : * For FIFO/RR we only need to set prio, if that matches we're done.
6785 : */
6786 0 : if (prio == p->prio && !dl_prio(prio))
6787 : goto out_unlock;
6788 :
6789 : /*
6790 : * Idle task boosting is a nono in general. There is one
6791 : * exception, when PREEMPT_RT and NOHZ is active:
6792 : *
6793 : * The idle task calls get_next_timer_interrupt() and holds
6794 : * the timer wheel base->lock on the CPU and another CPU wants
6795 : * to access the timer (probably to cancel it). We can safely
6796 : * ignore the boosting request, as the idle CPU runs this code
6797 : * with interrupts disabled and will complete the lock
6798 : * protected section without being interrupted. So there is no
6799 : * real need to boost.
6800 : */
6801 0 : if (unlikely(p == rq->idle)) {
6802 0 : WARN_ON(p != rq->curr);
6803 0 : WARN_ON(p->pi_blocked_on);
6804 : goto out_unlock;
6805 : }
6806 :
6807 0 : trace_sched_pi_setprio(p, pi_task);
6808 0 : oldprio = p->prio;
6809 :
6810 0 : if (oldprio == prio)
6811 0 : queue_flag &= ~DEQUEUE_MOVE;
6812 :
6813 0 : prev_class = p->sched_class;
6814 0 : queued = task_on_rq_queued(p);
6815 0 : running = task_current(rq, p);
6816 0 : if (queued)
6817 0 : dequeue_task(rq, p, queue_flag);
6818 0 : if (running)
6819 0 : put_prev_task(rq, p);
6820 :
6821 : /*
6822 : * Boosting condition are:
6823 : * 1. -rt task is running and holds mutex A
6824 : * --> -dl task blocks on mutex A
6825 : *
6826 : * 2. -dl task is running and holds mutex A
6827 : * --> -dl task blocks on mutex A and could preempt the
6828 : * running task
6829 : */
6830 0 : if (dl_prio(prio)) {
6831 0 : if (!dl_prio(p->normal_prio) ||
6832 0 : (pi_task && dl_prio(pi_task->prio) &&
6833 0 : dl_entity_preempt(&pi_task->dl, &p->dl))) {
6834 0 : p->dl.pi_se = pi_task->dl.pi_se;
6835 0 : queue_flag |= ENQUEUE_REPLENISH;
6836 : } else {
6837 0 : p->dl.pi_se = &p->dl;
6838 : }
6839 0 : } else if (rt_prio(prio)) {
6840 0 : if (dl_prio(oldprio))
6841 0 : p->dl.pi_se = &p->dl;
6842 0 : if (oldprio < prio)
6843 0 : queue_flag |= ENQUEUE_HEAD;
6844 : } else {
6845 0 : if (dl_prio(oldprio))
6846 0 : p->dl.pi_se = &p->dl;
6847 0 : if (rt_prio(oldprio))
6848 0 : p->rt.timeout = 0;
6849 : }
6850 :
6851 0 : __setscheduler_prio(p, prio);
6852 :
6853 0 : if (queued)
6854 0 : enqueue_task(rq, p, queue_flag);
6855 0 : if (running)
6856 : set_next_task(rq, p);
6857 :
6858 0 : check_class_changed(rq, p, prev_class, oldprio);
6859 : out_unlock:
6860 : /* Avoid rq from going away on us: */
6861 0 : preempt_disable();
6862 :
6863 0 : rq_unpin_lock(rq, &rf);
6864 0 : __balance_callbacks(rq);
6865 0 : raw_spin_rq_unlock(rq);
6866 :
6867 0 : preempt_enable();
6868 : }
6869 : #else
6870 : static inline int rt_effective_prio(struct task_struct *p, int prio)
6871 : {
6872 : return prio;
6873 : }
6874 : #endif
6875 :
6876 8 : void set_user_nice(struct task_struct *p, long nice)
6877 : {
6878 : bool queued, running;
6879 : int old_prio;
6880 : struct rq_flags rf;
6881 : struct rq *rq;
6882 :
6883 16 : if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
6884 4 : return;
6885 : /*
6886 : * We have to be careful, if called from sys_setpriority(),
6887 : * the task might be in the middle of scheduling on another CPU.
6888 : */
6889 4 : rq = task_rq_lock(p, &rf);
6890 4 : update_rq_clock(rq);
6891 :
6892 : /*
6893 : * The RT priorities are set via sched_setscheduler(), but we still
6894 : * allow the 'normal' nice value to be set - but as expected
6895 : * it won't have any effect on scheduling until the task is
6896 : * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
6897 : */
6898 12 : if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
6899 0 : p->static_prio = NICE_TO_PRIO(nice);
6900 0 : goto out_unlock;
6901 : }
6902 4 : queued = task_on_rq_queued(p);
6903 4 : running = task_current(rq, p);
6904 4 : if (queued)
6905 : dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
6906 4 : if (running)
6907 3 : put_prev_task(rq, p);
6908 :
6909 4 : p->static_prio = NICE_TO_PRIO(nice);
6910 4 : set_load_weight(p, true);
6911 4 : old_prio = p->prio;
6912 4 : p->prio = effective_prio(p);
6913 :
6914 4 : if (queued)
6915 : enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
6916 4 : if (running)
6917 : set_next_task(rq, p);
6918 :
6919 : /*
6920 : * If the task increased its priority or is running and
6921 : * lowered its priority, then reschedule its CPU:
6922 : */
6923 4 : p->sched_class->prio_changed(rq, p, old_prio);
6924 :
6925 : out_unlock:
6926 8 : task_rq_unlock(rq, p, &rf);
6927 : }
6928 : EXPORT_SYMBOL(set_user_nice);
6929 :
6930 : /*
6931 : * can_nice - check if a task can reduce its nice value
6932 : * @p: task
6933 : * @nice: nice value
6934 : */
6935 0 : int can_nice(const struct task_struct *p, const int nice)
6936 : {
6937 : /* Convert nice value [19,-20] to rlimit style value [1,40]: */
6938 0 : int nice_rlim = nice_to_rlimit(nice);
6939 :
6940 0 : return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
6941 0 : capable(CAP_SYS_NICE));
6942 : }
6943 :
6944 : #ifdef __ARCH_WANT_SYS_NICE
6945 :
6946 : /*
6947 : * sys_nice - change the priority of the current process.
6948 : * @increment: priority increment
6949 : *
6950 : * sys_setpriority is a more generic, but much slower function that
6951 : * does similar things.
6952 : */
6953 0 : SYSCALL_DEFINE1(nice, int, increment)
6954 : {
6955 : long nice, retval;
6956 :
6957 : /*
6958 : * Setpriority might change our priority at the same moment.
6959 : * We don't have to worry. Conceptually one call occurs first
6960 : * and we have a single winner.
6961 : */
6962 0 : increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
6963 0 : nice = task_nice(current) + increment;
6964 :
6965 0 : nice = clamp_val(nice, MIN_NICE, MAX_NICE);
6966 0 : if (increment < 0 && !can_nice(current, nice))
6967 : return -EPERM;
6968 :
6969 0 : retval = security_task_setnice(current, nice);
6970 0 : if (retval)
6971 : return retval;
6972 :
6973 0 : set_user_nice(current, nice);
6974 0 : return 0;
6975 : }
6976 :
6977 : #endif
6978 :
6979 : /**
6980 : * task_prio - return the priority value of a given task.
6981 : * @p: the task in question.
6982 : *
6983 : * Return: The priority value as seen by users in /proc.
6984 : *
6985 : * sched policy return value kernel prio user prio/nice
6986 : *
6987 : * normal, batch, idle [0 ... 39] [100 ... 139] 0/[-20 ... 19]
6988 : * fifo, rr [-2 ... -100] [98 ... 0] [1 ... 99]
6989 : * deadline -101 -1 0
6990 : */
6991 0 : int task_prio(const struct task_struct *p)
6992 : {
6993 0 : return p->prio - MAX_RT_PRIO;
6994 : }
6995 :
6996 : /**
6997 : * idle_cpu - is a given CPU idle currently?
6998 : * @cpu: the processor in question.
6999 : *
7000 : * Return: 1 if the CPU is currently idle. 0 otherwise.
7001 : */
7002 0 : int idle_cpu(int cpu)
7003 : {
7004 0 : struct rq *rq = cpu_rq(cpu);
7005 :
7006 0 : if (rq->curr != rq->idle)
7007 : return 0;
7008 :
7009 0 : if (rq->nr_running)
7010 : return 0;
7011 :
7012 : #ifdef CONFIG_SMP
7013 : if (rq->ttwu_pending)
7014 : return 0;
7015 : #endif
7016 :
7017 0 : return 1;
7018 : }
7019 :
7020 : /**
7021 : * available_idle_cpu - is a given CPU idle for enqueuing work.
7022 : * @cpu: the CPU in question.
7023 : *
7024 : * Return: 1 if the CPU is currently idle. 0 otherwise.
7025 : */
7026 0 : int available_idle_cpu(int cpu)
7027 : {
7028 0 : if (!idle_cpu(cpu))
7029 : return 0;
7030 :
7031 0 : if (vcpu_is_preempted(cpu))
7032 : return 0;
7033 :
7034 0 : return 1;
7035 : }
7036 :
7037 : /**
7038 : * idle_task - return the idle task for a given CPU.
7039 : * @cpu: the processor in question.
7040 : *
7041 : * Return: The idle task for the CPU @cpu.
7042 : */
7043 0 : struct task_struct *idle_task(int cpu)
7044 : {
7045 0 : return cpu_rq(cpu)->idle;
7046 : }
7047 :
7048 : #ifdef CONFIG_SMP
7049 : /*
7050 : * This function computes an effective utilization for the given CPU, to be
7051 : * used for frequency selection given the linear relation: f = u * f_max.
7052 : *
7053 : * The scheduler tracks the following metrics:
7054 : *
7055 : * cpu_util_{cfs,rt,dl,irq}()
7056 : * cpu_bw_dl()
7057 : *
7058 : * Where the cfs,rt and dl util numbers are tracked with the same metric and
7059 : * synchronized windows and are thus directly comparable.
7060 : *
7061 : * The cfs,rt,dl utilization are the running times measured with rq->clock_task
7062 : * which excludes things like IRQ and steal-time. These latter are then accrued
7063 : * in the irq utilization.
7064 : *
7065 : * The DL bandwidth number otoh is not a measured metric but a value computed
7066 : * based on the task model parameters and gives the minimal utilization
7067 : * required to meet deadlines.
7068 : */
7069 : unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
7070 : unsigned long max, enum cpu_util_type type,
7071 : struct task_struct *p)
7072 : {
7073 : unsigned long dl_util, util, irq;
7074 : struct rq *rq = cpu_rq(cpu);
7075 :
7076 : if (!uclamp_is_used() &&
7077 : type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
7078 : return max;
7079 : }
7080 :
7081 : /*
7082 : * Early check to see if IRQ/steal time saturates the CPU, can be
7083 : * because of inaccuracies in how we track these -- see
7084 : * update_irq_load_avg().
7085 : */
7086 : irq = cpu_util_irq(rq);
7087 : if (unlikely(irq >= max))
7088 : return max;
7089 :
7090 : /*
7091 : * Because the time spend on RT/DL tasks is visible as 'lost' time to
7092 : * CFS tasks and we use the same metric to track the effective
7093 : * utilization (PELT windows are synchronized) we can directly add them
7094 : * to obtain the CPU's actual utilization.
7095 : *
7096 : * CFS and RT utilization can be boosted or capped, depending on
7097 : * utilization clamp constraints requested by currently RUNNABLE
7098 : * tasks.
7099 : * When there are no CFS RUNNABLE tasks, clamps are released and
7100 : * frequency will be gracefully reduced with the utilization decay.
7101 : */
7102 : util = util_cfs + cpu_util_rt(rq);
7103 : if (type == FREQUENCY_UTIL)
7104 : util = uclamp_rq_util_with(rq, util, p);
7105 :
7106 : dl_util = cpu_util_dl(rq);
7107 :
7108 : /*
7109 : * For frequency selection we do not make cpu_util_dl() a permanent part
7110 : * of this sum because we want to use cpu_bw_dl() later on, but we need
7111 : * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
7112 : * that we select f_max when there is no idle time.
7113 : *
7114 : * NOTE: numerical errors or stop class might cause us to not quite hit
7115 : * saturation when we should -- something for later.
7116 : */
7117 : if (util + dl_util >= max)
7118 : return max;
7119 :
7120 : /*
7121 : * OTOH, for energy computation we need the estimated running time, so
7122 : * include util_dl and ignore dl_bw.
7123 : */
7124 : if (type == ENERGY_UTIL)
7125 : util += dl_util;
7126 :
7127 : /*
7128 : * There is still idle time; further improve the number by using the
7129 : * irq metric. Because IRQ/steal time is hidden from the task clock we
7130 : * need to scale the task numbers:
7131 : *
7132 : * max - irq
7133 : * U' = irq + --------- * U
7134 : * max
7135 : */
7136 : util = scale_irq_capacity(util, irq, max);
7137 : util += irq;
7138 :
7139 : /*
7140 : * Bandwidth required by DEADLINE must always be granted while, for
7141 : * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
7142 : * to gracefully reduce the frequency when no tasks show up for longer
7143 : * periods of time.
7144 : *
7145 : * Ideally we would like to set bw_dl as min/guaranteed freq and util +
7146 : * bw_dl as requested freq. However, cpufreq is not yet ready for such
7147 : * an interface. So, we only do the latter for now.
7148 : */
7149 : if (type == FREQUENCY_UTIL)
7150 : util += cpu_bw_dl(rq);
7151 :
7152 : return min(max, util);
7153 : }
7154 :
7155 : unsigned long sched_cpu_util(int cpu, unsigned long max)
7156 : {
7157 : return effective_cpu_util(cpu, cpu_util_cfs(cpu), max,
7158 : ENERGY_UTIL, NULL);
7159 : }
7160 : #endif /* CONFIG_SMP */
7161 :
7162 : /**
7163 : * find_process_by_pid - find a process with a matching PID value.
7164 : * @pid: the pid in question.
7165 : *
7166 : * The task of @pid, if found. %NULL otherwise.
7167 : */
7168 : static struct task_struct *find_process_by_pid(pid_t pid)
7169 : {
7170 0 : return pid ? find_task_by_vpid(pid) : current;
7171 : }
7172 :
7173 : /*
7174 : * sched_setparam() passes in -1 for its policy, to let the functions
7175 : * it calls know not to change it.
7176 : */
7177 : #define SETPARAM_POLICY -1
7178 :
7179 0 : static void __setscheduler_params(struct task_struct *p,
7180 : const struct sched_attr *attr)
7181 : {
7182 0 : int policy = attr->sched_policy;
7183 :
7184 0 : if (policy == SETPARAM_POLICY)
7185 0 : policy = p->policy;
7186 :
7187 0 : p->policy = policy;
7188 :
7189 0 : if (dl_policy(policy))
7190 0 : __setparam_dl(p, attr);
7191 0 : else if (fair_policy(policy))
7192 0 : p->static_prio = NICE_TO_PRIO(attr->sched_nice);
7193 :
7194 : /*
7195 : * __sched_setscheduler() ensures attr->sched_priority == 0 when
7196 : * !rt_policy. Always setting this ensures that things like
7197 : * getparam()/getattr() don't report silly values for !rt tasks.
7198 : */
7199 0 : p->rt_priority = attr->sched_priority;
7200 0 : p->normal_prio = normal_prio(p);
7201 0 : set_load_weight(p, true);
7202 0 : }
7203 :
7204 : /*
7205 : * Check the target process has a UID that matches the current process's:
7206 : */
7207 : static bool check_same_owner(struct task_struct *p)
7208 : {
7209 0 : const struct cred *cred = current_cred(), *pcred;
7210 : bool match;
7211 :
7212 : rcu_read_lock();
7213 0 : pcred = __task_cred(p);
7214 0 : match = (uid_eq(cred->euid, pcred->euid) ||
7215 0 : uid_eq(cred->euid, pcred->uid));
7216 : rcu_read_unlock();
7217 : return match;
7218 : }
7219 :
7220 105 : static int __sched_setscheduler(struct task_struct *p,
7221 : const struct sched_attr *attr,
7222 : bool user, bool pi)
7223 : {
7224 105 : int oldpolicy = -1, policy = attr->sched_policy;
7225 : int retval, oldprio, newprio, queued, running;
7226 : const struct sched_class *prev_class;
7227 : struct callback_head *head;
7228 : struct rq_flags rf;
7229 : int reset_on_fork;
7230 105 : int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
7231 : struct rq *rq;
7232 :
7233 : /* The pi code expects interrupts enabled */
7234 210 : BUG_ON(pi && in_interrupt());
7235 : recheck:
7236 : /* Double check policy once rq lock held: */
7237 105 : if (policy < 0) {
7238 0 : reset_on_fork = p->sched_reset_on_fork;
7239 0 : policy = oldpolicy = p->policy;
7240 : } else {
7241 105 : reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
7242 :
7243 105 : if (!valid_policy(policy))
7244 : return -EINVAL;
7245 : }
7246 :
7247 105 : if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
7248 : return -EINVAL;
7249 :
7250 : /*
7251 : * Valid priorities for SCHED_FIFO and SCHED_RR are
7252 : * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
7253 : * SCHED_BATCH and SCHED_IDLE is 0.
7254 : */
7255 105 : if (attr->sched_priority > MAX_RT_PRIO-1)
7256 : return -EINVAL;
7257 210 : if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
7258 105 : (rt_policy(policy) != (attr->sched_priority != 0)))
7259 : return -EINVAL;
7260 :
7261 : /*
7262 : * Allow unprivileged RT tasks to decrease priority:
7263 : */
7264 105 : if (user && !capable(CAP_SYS_NICE)) {
7265 0 : if (fair_policy(policy)) {
7266 0 : if (attr->sched_nice < task_nice(p) &&
7267 0 : !can_nice(p, attr->sched_nice))
7268 : return -EPERM;
7269 : }
7270 :
7271 0 : if (rt_policy(policy)) {
7272 0 : unsigned long rlim_rtprio =
7273 : task_rlimit(p, RLIMIT_RTPRIO);
7274 :
7275 : /* Can't set/change the rt policy: */
7276 0 : if (policy != p->policy && !rlim_rtprio)
7277 : return -EPERM;
7278 :
7279 : /* Can't increase priority: */
7280 0 : if (attr->sched_priority > p->rt_priority &&
7281 0 : attr->sched_priority > rlim_rtprio)
7282 : return -EPERM;
7283 : }
7284 :
7285 : /*
7286 : * Can't set/change SCHED_DEADLINE policy at all for now
7287 : * (safest behavior); in the future we would like to allow
7288 : * unprivileged DL tasks to increase their relative deadline
7289 : * or reduce their runtime (both ways reducing utilization)
7290 : */
7291 0 : if (dl_policy(policy))
7292 : return -EPERM;
7293 :
7294 : /*
7295 : * Treat SCHED_IDLE as nice 20. Only allow a switch to
7296 : * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
7297 : */
7298 0 : if (task_has_idle_policy(p) && !idle_policy(policy)) {
7299 0 : if (!can_nice(p, task_nice(p)))
7300 : return -EPERM;
7301 : }
7302 :
7303 : /* Can't change other user's priorities: */
7304 0 : if (!check_same_owner(p))
7305 : return -EPERM;
7306 :
7307 : /* Normal users shall not reset the sched_reset_on_fork flag: */
7308 0 : if (p->sched_reset_on_fork && !reset_on_fork)
7309 : return -EPERM;
7310 : }
7311 :
7312 105 : if (user) {
7313 0 : if (attr->sched_flags & SCHED_FLAG_SUGOV)
7314 : return -EINVAL;
7315 :
7316 0 : retval = security_task_setscheduler(p);
7317 0 : if (retval)
7318 : return retval;
7319 : }
7320 :
7321 : /* Update task specific "requested" clamps */
7322 105 : if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
7323 : retval = uclamp_validate(p, attr);
7324 : if (retval)
7325 : return retval;
7326 : }
7327 :
7328 : if (pi)
7329 : cpuset_read_lock();
7330 :
7331 : /*
7332 : * Make sure no PI-waiters arrive (or leave) while we are
7333 : * changing the priority of the task:
7334 : *
7335 : * To be able to change p->policy safely, the appropriate
7336 : * runqueue lock must be held.
7337 : */
7338 105 : rq = task_rq_lock(p, &rf);
7339 105 : update_rq_clock(rq);
7340 :
7341 : /*
7342 : * Changing the policy of the stop threads its a very bad idea:
7343 : */
7344 105 : if (p == rq->stop) {
7345 : retval = -EINVAL;
7346 : goto unlock;
7347 : }
7348 :
7349 : /*
7350 : * If not changing anything there's no need to proceed further,
7351 : * but store a possible modification of reset_on_fork.
7352 : */
7353 105 : if (unlikely(policy == p->policy)) {
7354 210 : if (fair_policy(policy) && attr->sched_nice != task_nice(p))
7355 : goto change;
7356 105 : if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
7357 : goto change;
7358 105 : if (dl_policy(policy) && dl_param_changed(p, attr))
7359 : goto change;
7360 105 : if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
7361 : goto change;
7362 :
7363 105 : p->sched_reset_on_fork = reset_on_fork;
7364 105 : retval = 0;
7365 105 : goto unlock;
7366 : }
7367 : change:
7368 :
7369 : if (user) {
7370 : #ifdef CONFIG_RT_GROUP_SCHED
7371 : /*
7372 : * Do not allow realtime tasks into groups that have no runtime
7373 : * assigned.
7374 : */
7375 : if (rt_bandwidth_enabled() && rt_policy(policy) &&
7376 : task_group(p)->rt_bandwidth.rt_runtime == 0 &&
7377 : !task_group_is_autogroup(task_group(p))) {
7378 : retval = -EPERM;
7379 : goto unlock;
7380 : }
7381 : #endif
7382 : #ifdef CONFIG_SMP
7383 : if (dl_bandwidth_enabled() && dl_policy(policy) &&
7384 : !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
7385 : cpumask_t *span = rq->rd->span;
7386 :
7387 : /*
7388 : * Don't allow tasks with an affinity mask smaller than
7389 : * the entire root_domain to become SCHED_DEADLINE. We
7390 : * will also fail if there's no bandwidth available.
7391 : */
7392 : if (!cpumask_subset(span, p->cpus_ptr) ||
7393 : rq->rd->dl_bw.bw == 0) {
7394 : retval = -EPERM;
7395 : goto unlock;
7396 : }
7397 : }
7398 : #endif
7399 : }
7400 :
7401 : /* Re-check policy now with rq lock held: */
7402 0 : if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
7403 0 : policy = oldpolicy = -1;
7404 0 : task_rq_unlock(rq, p, &rf);
7405 : if (pi)
7406 : cpuset_read_unlock();
7407 : goto recheck;
7408 : }
7409 :
7410 : /*
7411 : * If setscheduling to SCHED_DEADLINE (or changing the parameters
7412 : * of a SCHED_DEADLINE task) we need to check if enough bandwidth
7413 : * is available.
7414 : */
7415 0 : if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
7416 : retval = -EBUSY;
7417 : goto unlock;
7418 : }
7419 :
7420 0 : p->sched_reset_on_fork = reset_on_fork;
7421 0 : oldprio = p->prio;
7422 :
7423 0 : newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
7424 0 : if (pi) {
7425 : /*
7426 : * Take priority boosted tasks into account. If the new
7427 : * effective priority is unchanged, we just store the new
7428 : * normal parameters and do not touch the scheduler class and
7429 : * the runqueue. This will be done when the task deboost
7430 : * itself.
7431 : */
7432 0 : newprio = rt_effective_prio(p, newprio);
7433 0 : if (newprio == oldprio)
7434 0 : queue_flags &= ~DEQUEUE_MOVE;
7435 : }
7436 :
7437 0 : queued = task_on_rq_queued(p);
7438 0 : running = task_current(rq, p);
7439 0 : if (queued)
7440 0 : dequeue_task(rq, p, queue_flags);
7441 0 : if (running)
7442 0 : put_prev_task(rq, p);
7443 :
7444 0 : prev_class = p->sched_class;
7445 :
7446 0 : if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
7447 0 : __setscheduler_params(p, attr);
7448 0 : __setscheduler_prio(p, newprio);
7449 : }
7450 0 : __setscheduler_uclamp(p, attr);
7451 :
7452 0 : if (queued) {
7453 : /*
7454 : * We enqueue to tail when the priority of a task is
7455 : * increased (user space view).
7456 : */
7457 0 : if (oldprio < p->prio)
7458 0 : queue_flags |= ENQUEUE_HEAD;
7459 :
7460 0 : enqueue_task(rq, p, queue_flags);
7461 : }
7462 0 : if (running)
7463 : set_next_task(rq, p);
7464 :
7465 0 : check_class_changed(rq, p, prev_class, oldprio);
7466 :
7467 : /* Avoid rq from going away on us: */
7468 0 : preempt_disable();
7469 0 : head = splice_balance_callbacks(rq);
7470 0 : task_rq_unlock(rq, p, &rf);
7471 :
7472 0 : if (pi) {
7473 : cpuset_read_unlock();
7474 0 : rt_mutex_adjust_pi(p);
7475 : }
7476 :
7477 : /* Run balance callbacks after we've adjusted the PI chain: */
7478 0 : balance_callbacks(rq, head);
7479 0 : preempt_enable();
7480 :
7481 0 : return 0;
7482 :
7483 : unlock:
7484 210 : task_rq_unlock(rq, p, &rf);
7485 : if (pi)
7486 : cpuset_read_unlock();
7487 105 : return retval;
7488 : }
7489 :
7490 105 : static int _sched_setscheduler(struct task_struct *p, int policy,
7491 : const struct sched_param *param, bool check)
7492 : {
7493 315 : struct sched_attr attr = {
7494 : .sched_policy = policy,
7495 105 : .sched_priority = param->sched_priority,
7496 105 : .sched_nice = PRIO_TO_NICE(p->static_prio),
7497 : };
7498 :
7499 : /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
7500 105 : if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
7501 0 : attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
7502 0 : policy &= ~SCHED_RESET_ON_FORK;
7503 0 : attr.sched_policy = policy;
7504 : }
7505 :
7506 105 : return __sched_setscheduler(p, &attr, check, true);
7507 : }
7508 : /**
7509 : * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
7510 : * @p: the task in question.
7511 : * @policy: new policy.
7512 : * @param: structure containing the new RT priority.
7513 : *
7514 : * Use sched_set_fifo(), read its comment.
7515 : *
7516 : * Return: 0 on success. An error code otherwise.
7517 : *
7518 : * NOTE that the task may be already dead.
7519 : */
7520 0 : int sched_setscheduler(struct task_struct *p, int policy,
7521 : const struct sched_param *param)
7522 : {
7523 0 : return _sched_setscheduler(p, policy, param, true);
7524 : }
7525 :
7526 0 : int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
7527 : {
7528 0 : return __sched_setscheduler(p, attr, true, true);
7529 : }
7530 :
7531 0 : int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
7532 : {
7533 0 : return __sched_setscheduler(p, attr, false, true);
7534 : }
7535 : EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
7536 :
7537 : /**
7538 : * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
7539 : * @p: the task in question.
7540 : * @policy: new policy.
7541 : * @param: structure containing the new RT priority.
7542 : *
7543 : * Just like sched_setscheduler, only don't bother checking if the
7544 : * current context has permission. For example, this is needed in
7545 : * stop_machine(): we create temporary high priority worker threads,
7546 : * but our caller might not have that capability.
7547 : *
7548 : * Return: 0 on success. An error code otherwise.
7549 : */
7550 105 : int sched_setscheduler_nocheck(struct task_struct *p, int policy,
7551 : const struct sched_param *param)
7552 : {
7553 105 : return _sched_setscheduler(p, policy, param, false);
7554 : }
7555 :
7556 : /*
7557 : * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
7558 : * incapable of resource management, which is the one thing an OS really should
7559 : * be doing.
7560 : *
7561 : * This is of course the reason it is limited to privileged users only.
7562 : *
7563 : * Worse still; it is fundamentally impossible to compose static priority
7564 : * workloads. You cannot take two correctly working static prio workloads
7565 : * and smash them together and still expect them to work.
7566 : *
7567 : * For this reason 'all' FIFO tasks the kernel creates are basically at:
7568 : *
7569 : * MAX_RT_PRIO / 2
7570 : *
7571 : * The administrator _MUST_ configure the system, the kernel simply doesn't
7572 : * know enough information to make a sensible choice.
7573 : */
7574 0 : void sched_set_fifo(struct task_struct *p)
7575 : {
7576 0 : struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
7577 0 : WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
7578 0 : }
7579 : EXPORT_SYMBOL_GPL(sched_set_fifo);
7580 :
7581 : /*
7582 : * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
7583 : */
7584 0 : void sched_set_fifo_low(struct task_struct *p)
7585 : {
7586 0 : struct sched_param sp = { .sched_priority = 1 };
7587 0 : WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
7588 0 : }
7589 : EXPORT_SYMBOL_GPL(sched_set_fifo_low);
7590 :
7591 0 : void sched_set_normal(struct task_struct *p, int nice)
7592 : {
7593 0 : struct sched_attr attr = {
7594 : .sched_policy = SCHED_NORMAL,
7595 : .sched_nice = nice,
7596 : };
7597 0 : WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
7598 0 : }
7599 : EXPORT_SYMBOL_GPL(sched_set_normal);
7600 :
7601 : static int
7602 0 : do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
7603 : {
7604 : struct sched_param lparam;
7605 : struct task_struct *p;
7606 : int retval;
7607 :
7608 0 : if (!param || pid < 0)
7609 : return -EINVAL;
7610 0 : if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
7611 : return -EFAULT;
7612 :
7613 : rcu_read_lock();
7614 0 : retval = -ESRCH;
7615 0 : p = find_process_by_pid(pid);
7616 0 : if (likely(p))
7617 : get_task_struct(p);
7618 : rcu_read_unlock();
7619 :
7620 0 : if (likely(p)) {
7621 0 : retval = sched_setscheduler(p, policy, &lparam);
7622 0 : put_task_struct(p);
7623 : }
7624 :
7625 : return retval;
7626 : }
7627 :
7628 : /*
7629 : * Mimics kernel/events/core.c perf_copy_attr().
7630 : */
7631 0 : static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
7632 : {
7633 : u32 size;
7634 : int ret;
7635 :
7636 : /* Zero the full structure, so that a short copy will be nice: */
7637 0 : memset(attr, 0, sizeof(*attr));
7638 :
7639 0 : ret = get_user(size, &uattr->size);
7640 0 : if (ret)
7641 : return ret;
7642 :
7643 : /* ABI compatibility quirk: */
7644 0 : if (!size)
7645 0 : size = SCHED_ATTR_SIZE_VER0;
7646 0 : if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
7647 : goto err_size;
7648 :
7649 0 : ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
7650 0 : if (ret) {
7651 0 : if (ret == -E2BIG)
7652 : goto err_size;
7653 : return ret;
7654 : }
7655 :
7656 0 : if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
7657 : size < SCHED_ATTR_SIZE_VER1)
7658 : return -EINVAL;
7659 :
7660 : /*
7661 : * XXX: Do we want to be lenient like existing syscalls; or do we want
7662 : * to be strict and return an error on out-of-bounds values?
7663 : */
7664 0 : attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
7665 :
7666 0 : return 0;
7667 :
7668 : err_size:
7669 0 : put_user(sizeof(*attr), &uattr->size);
7670 : return -E2BIG;
7671 : }
7672 :
7673 0 : static void get_params(struct task_struct *p, struct sched_attr *attr)
7674 : {
7675 0 : if (task_has_dl_policy(p))
7676 0 : __getparam_dl(p, attr);
7677 0 : else if (task_has_rt_policy(p))
7678 0 : attr->sched_priority = p->rt_priority;
7679 : else
7680 0 : attr->sched_nice = task_nice(p);
7681 0 : }
7682 :
7683 : /**
7684 : * sys_sched_setscheduler - set/change the scheduler policy and RT priority
7685 : * @pid: the pid in question.
7686 : * @policy: new policy.
7687 : * @param: structure containing the new RT priority.
7688 : *
7689 : * Return: 0 on success. An error code otherwise.
7690 : */
7691 0 : SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
7692 : {
7693 0 : if (policy < 0)
7694 : return -EINVAL;
7695 :
7696 0 : return do_sched_setscheduler(pid, policy, param);
7697 : }
7698 :
7699 : /**
7700 : * sys_sched_setparam - set/change the RT priority of a thread
7701 : * @pid: the pid in question.
7702 : * @param: structure containing the new RT priority.
7703 : *
7704 : * Return: 0 on success. An error code otherwise.
7705 : */
7706 0 : SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
7707 : {
7708 0 : return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
7709 : }
7710 :
7711 : /**
7712 : * sys_sched_setattr - same as above, but with extended sched_attr
7713 : * @pid: the pid in question.
7714 : * @uattr: structure containing the extended parameters.
7715 : * @flags: for future extension.
7716 : */
7717 0 : SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
7718 : unsigned int, flags)
7719 : {
7720 : struct sched_attr attr;
7721 : struct task_struct *p;
7722 : int retval;
7723 :
7724 0 : if (!uattr || pid < 0 || flags)
7725 : return -EINVAL;
7726 :
7727 0 : retval = sched_copy_attr(uattr, &attr);
7728 0 : if (retval)
7729 0 : return retval;
7730 :
7731 0 : if ((int)attr.sched_policy < 0)
7732 : return -EINVAL;
7733 0 : if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
7734 0 : attr.sched_policy = SETPARAM_POLICY;
7735 :
7736 : rcu_read_lock();
7737 0 : retval = -ESRCH;
7738 0 : p = find_process_by_pid(pid);
7739 0 : if (likely(p))
7740 : get_task_struct(p);
7741 : rcu_read_unlock();
7742 :
7743 0 : if (likely(p)) {
7744 0 : if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
7745 0 : get_params(p, &attr);
7746 0 : retval = sched_setattr(p, &attr);
7747 0 : put_task_struct(p);
7748 : }
7749 :
7750 0 : return retval;
7751 : }
7752 :
7753 : /**
7754 : * sys_sched_getscheduler - get the policy (scheduling class) of a thread
7755 : * @pid: the pid in question.
7756 : *
7757 : * Return: On success, the policy of the thread. Otherwise, a negative error
7758 : * code.
7759 : */
7760 0 : SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
7761 : {
7762 : struct task_struct *p;
7763 : int retval;
7764 :
7765 0 : if (pid < 0)
7766 : return -EINVAL;
7767 :
7768 0 : retval = -ESRCH;
7769 : rcu_read_lock();
7770 0 : p = find_process_by_pid(pid);
7771 0 : if (p) {
7772 0 : retval = security_task_getscheduler(p);
7773 : if (!retval)
7774 0 : retval = p->policy
7775 0 : | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
7776 : }
7777 : rcu_read_unlock();
7778 0 : return retval;
7779 : }
7780 :
7781 : /**
7782 : * sys_sched_getparam - get the RT priority of a thread
7783 : * @pid: the pid in question.
7784 : * @param: structure containing the RT priority.
7785 : *
7786 : * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
7787 : * code.
7788 : */
7789 0 : SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
7790 : {
7791 0 : struct sched_param lp = { .sched_priority = 0 };
7792 : struct task_struct *p;
7793 : int retval;
7794 :
7795 0 : if (!param || pid < 0)
7796 : return -EINVAL;
7797 :
7798 : rcu_read_lock();
7799 0 : p = find_process_by_pid(pid);
7800 0 : retval = -ESRCH;
7801 0 : if (!p)
7802 : goto out_unlock;
7803 :
7804 0 : retval = security_task_getscheduler(p);
7805 : if (retval)
7806 : goto out_unlock;
7807 :
7808 0 : if (task_has_rt_policy(p))
7809 0 : lp.sched_priority = p->rt_priority;
7810 0 : rcu_read_unlock();
7811 :
7812 : /*
7813 : * This one might sleep, we cannot do it with a spinlock held ...
7814 : */
7815 0 : retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
7816 :
7817 0 : return retval;
7818 :
7819 : out_unlock:
7820 : rcu_read_unlock();
7821 0 : return retval;
7822 : }
7823 :
7824 : /*
7825 : * Copy the kernel size attribute structure (which might be larger
7826 : * than what user-space knows about) to user-space.
7827 : *
7828 : * Note that all cases are valid: user-space buffer can be larger or
7829 : * smaller than the kernel-space buffer. The usual case is that both
7830 : * have the same size.
7831 : */
7832 : static int
7833 0 : sched_attr_copy_to_user(struct sched_attr __user *uattr,
7834 : struct sched_attr *kattr,
7835 : unsigned int usize)
7836 : {
7837 0 : unsigned int ksize = sizeof(*kattr);
7838 :
7839 0 : if (!access_ok(uattr, usize))
7840 : return -EFAULT;
7841 :
7842 : /*
7843 : * sched_getattr() ABI forwards and backwards compatibility:
7844 : *
7845 : * If usize == ksize then we just copy everything to user-space and all is good.
7846 : *
7847 : * If usize < ksize then we only copy as much as user-space has space for,
7848 : * this keeps ABI compatibility as well. We skip the rest.
7849 : *
7850 : * If usize > ksize then user-space is using a newer version of the ABI,
7851 : * which part the kernel doesn't know about. Just ignore it - tooling can
7852 : * detect the kernel's knowledge of attributes from the attr->size value
7853 : * which is set to ksize in this case.
7854 : */
7855 0 : kattr->size = min(usize, ksize);
7856 :
7857 0 : if (copy_to_user(uattr, kattr, kattr->size))
7858 : return -EFAULT;
7859 :
7860 0 : return 0;
7861 : }
7862 :
7863 : /**
7864 : * sys_sched_getattr - similar to sched_getparam, but with sched_attr
7865 : * @pid: the pid in question.
7866 : * @uattr: structure containing the extended parameters.
7867 : * @usize: sizeof(attr) for fwd/bwd comp.
7868 : * @flags: for future extension.
7869 : */
7870 0 : SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
7871 : unsigned int, usize, unsigned int, flags)
7872 : {
7873 0 : struct sched_attr kattr = { };
7874 : struct task_struct *p;
7875 : int retval;
7876 :
7877 0 : if (!uattr || pid < 0 || usize > PAGE_SIZE ||
7878 0 : usize < SCHED_ATTR_SIZE_VER0 || flags)
7879 : return -EINVAL;
7880 :
7881 : rcu_read_lock();
7882 0 : p = find_process_by_pid(pid);
7883 0 : retval = -ESRCH;
7884 0 : if (!p)
7885 : goto out_unlock;
7886 :
7887 0 : retval = security_task_getscheduler(p);
7888 : if (retval)
7889 : goto out_unlock;
7890 :
7891 0 : kattr.sched_policy = p->policy;
7892 0 : if (p->sched_reset_on_fork)
7893 0 : kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
7894 0 : get_params(p, &kattr);
7895 0 : kattr.sched_flags &= SCHED_FLAG_ALL;
7896 :
7897 : #ifdef CONFIG_UCLAMP_TASK
7898 : /*
7899 : * This could race with another potential updater, but this is fine
7900 : * because it'll correctly read the old or the new value. We don't need
7901 : * to guarantee who wins the race as long as it doesn't return garbage.
7902 : */
7903 : kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
7904 : kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
7905 : #endif
7906 :
7907 : rcu_read_unlock();
7908 :
7909 0 : return sched_attr_copy_to_user(uattr, &kattr, usize);
7910 :
7911 : out_unlock:
7912 : rcu_read_unlock();
7913 0 : return retval;
7914 : }
7915 :
7916 : #ifdef CONFIG_SMP
7917 : int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
7918 : {
7919 : int ret = 0;
7920 :
7921 : /*
7922 : * If the task isn't a deadline task or admission control is
7923 : * disabled then we don't care about affinity changes.
7924 : */
7925 : if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
7926 : return 0;
7927 :
7928 : /*
7929 : * Since bandwidth control happens on root_domain basis,
7930 : * if admission test is enabled, we only admit -deadline
7931 : * tasks allowed to run on all the CPUs in the task's
7932 : * root_domain.
7933 : */
7934 : rcu_read_lock();
7935 : if (!cpumask_subset(task_rq(p)->rd->span, mask))
7936 : ret = -EBUSY;
7937 : rcu_read_unlock();
7938 : return ret;
7939 : }
7940 : #endif
7941 :
7942 : static int
7943 0 : __sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
7944 : {
7945 : int retval;
7946 : cpumask_var_t cpus_allowed, new_mask;
7947 :
7948 0 : if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
7949 : return -ENOMEM;
7950 :
7951 0 : if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
7952 : retval = -ENOMEM;
7953 : goto out_free_cpus_allowed;
7954 : }
7955 :
7956 0 : cpuset_cpus_allowed(p, cpus_allowed);
7957 0 : cpumask_and(new_mask, mask, cpus_allowed);
7958 :
7959 0 : retval = dl_task_check_affinity(p, new_mask);
7960 : if (retval)
7961 : goto out_free_new_mask;
7962 : again:
7963 0 : retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
7964 0 : if (retval)
7965 : goto out_free_new_mask;
7966 :
7967 0 : cpuset_cpus_allowed(p, cpus_allowed);
7968 0 : if (!cpumask_subset(new_mask, cpus_allowed)) {
7969 : /*
7970 : * We must have raced with a concurrent cpuset update.
7971 : * Just reset the cpumask to the cpuset's cpus_allowed.
7972 : */
7973 : cpumask_copy(new_mask, cpus_allowed);
7974 : goto again;
7975 : }
7976 :
7977 : out_free_new_mask:
7978 0 : free_cpumask_var(new_mask);
7979 : out_free_cpus_allowed:
7980 0 : free_cpumask_var(cpus_allowed);
7981 : return retval;
7982 : }
7983 :
7984 0 : long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
7985 : {
7986 : struct task_struct *p;
7987 : int retval;
7988 :
7989 : rcu_read_lock();
7990 :
7991 0 : p = find_process_by_pid(pid);
7992 0 : if (!p) {
7993 : rcu_read_unlock();
7994 0 : return -ESRCH;
7995 : }
7996 :
7997 : /* Prevent p going away */
7998 0 : get_task_struct(p);
7999 : rcu_read_unlock();
8000 :
8001 0 : if (p->flags & PF_NO_SETAFFINITY) {
8002 : retval = -EINVAL;
8003 : goto out_put_task;
8004 : }
8005 :
8006 0 : if (!check_same_owner(p)) {
8007 : rcu_read_lock();
8008 0 : if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
8009 : rcu_read_unlock();
8010 0 : retval = -EPERM;
8011 0 : goto out_put_task;
8012 : }
8013 : rcu_read_unlock();
8014 : }
8015 :
8016 0 : retval = security_task_setscheduler(p);
8017 0 : if (retval)
8018 : goto out_put_task;
8019 :
8020 0 : retval = __sched_setaffinity(p, in_mask);
8021 : out_put_task:
8022 0 : put_task_struct(p);
8023 0 : return retval;
8024 : }
8025 :
8026 0 : static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
8027 : struct cpumask *new_mask)
8028 : {
8029 0 : if (len < cpumask_size())
8030 : cpumask_clear(new_mask);
8031 0 : else if (len > cpumask_size())
8032 0 : len = cpumask_size();
8033 :
8034 0 : return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
8035 : }
8036 :
8037 : /**
8038 : * sys_sched_setaffinity - set the CPU affinity of a process
8039 : * @pid: pid of the process
8040 : * @len: length in bytes of the bitmask pointed to by user_mask_ptr
8041 : * @user_mask_ptr: user-space pointer to the new CPU mask
8042 : *
8043 : * Return: 0 on success. An error code otherwise.
8044 : */
8045 0 : SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
8046 : unsigned long __user *, user_mask_ptr)
8047 : {
8048 : cpumask_var_t new_mask;
8049 : int retval;
8050 :
8051 0 : if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
8052 : return -ENOMEM;
8053 :
8054 0 : retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
8055 0 : if (retval == 0)
8056 0 : retval = sched_setaffinity(pid, new_mask);
8057 0 : free_cpumask_var(new_mask);
8058 0 : return retval;
8059 : }
8060 :
8061 0 : long sched_getaffinity(pid_t pid, struct cpumask *mask)
8062 : {
8063 : struct task_struct *p;
8064 : unsigned long flags;
8065 : int retval;
8066 :
8067 : rcu_read_lock();
8068 :
8069 0 : retval = -ESRCH;
8070 0 : p = find_process_by_pid(pid);
8071 0 : if (!p)
8072 : goto out_unlock;
8073 :
8074 0 : retval = security_task_getscheduler(p);
8075 : if (retval)
8076 : goto out_unlock;
8077 :
8078 0 : raw_spin_lock_irqsave(&p->pi_lock, flags);
8079 0 : cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
8080 0 : raw_spin_unlock_irqrestore(&p->pi_lock, flags);
8081 :
8082 : out_unlock:
8083 : rcu_read_unlock();
8084 :
8085 0 : return retval;
8086 : }
8087 :
8088 : /**
8089 : * sys_sched_getaffinity - get the CPU affinity of a process
8090 : * @pid: pid of the process
8091 : * @len: length in bytes of the bitmask pointed to by user_mask_ptr
8092 : * @user_mask_ptr: user-space pointer to hold the current CPU mask
8093 : *
8094 : * Return: size of CPU mask copied to user_mask_ptr on success. An
8095 : * error code otherwise.
8096 : */
8097 0 : SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
8098 : unsigned long __user *, user_mask_ptr)
8099 : {
8100 : int ret;
8101 : cpumask_var_t mask;
8102 :
8103 0 : if ((len * BITS_PER_BYTE) < nr_cpu_ids)
8104 : return -EINVAL;
8105 0 : if (len & (sizeof(unsigned long)-1))
8106 : return -EINVAL;
8107 :
8108 0 : if (!alloc_cpumask_var(&mask, GFP_KERNEL))
8109 : return -ENOMEM;
8110 :
8111 0 : ret = sched_getaffinity(pid, mask);
8112 0 : if (ret == 0) {
8113 0 : unsigned int retlen = min(len, cpumask_size());
8114 :
8115 0 : if (copy_to_user(user_mask_ptr, mask, retlen))
8116 : ret = -EFAULT;
8117 : else
8118 0 : ret = retlen;
8119 : }
8120 0 : free_cpumask_var(mask);
8121 :
8122 0 : return ret;
8123 : }
8124 :
8125 0 : static void do_sched_yield(void)
8126 : {
8127 : struct rq_flags rf;
8128 : struct rq *rq;
8129 :
8130 0 : rq = this_rq_lock_irq(&rf);
8131 :
8132 : schedstat_inc(rq->yld_count);
8133 0 : current->sched_class->yield_task(rq);
8134 :
8135 0 : preempt_disable();
8136 0 : rq_unlock_irq(rq, &rf);
8137 0 : sched_preempt_enable_no_resched();
8138 :
8139 0 : schedule();
8140 0 : }
8141 :
8142 : /**
8143 : * sys_sched_yield - yield the current processor to other threads.
8144 : *
8145 : * This function yields the current CPU to other tasks. If there are no
8146 : * other threads running on this CPU then this function will return.
8147 : *
8148 : * Return: 0.
8149 : */
8150 0 : SYSCALL_DEFINE0(sched_yield)
8151 : {
8152 0 : do_sched_yield();
8153 0 : return 0;
8154 : }
8155 :
8156 : #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
8157 739 : int __sched __cond_resched(void)
8158 : {
8159 739 : if (should_resched(0)) {
8160 : preempt_schedule_common();
8161 : return 1;
8162 : }
8163 : /*
8164 : * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
8165 : * whether the current CPU is in an RCU read-side critical section,
8166 : * so the tick can report quiescent states even for CPUs looping
8167 : * in kernel context. In contrast, in non-preemptible kernels,
8168 : * RCU readers leave no in-memory hints, which means that CPU-bound
8169 : * processes executing in kernel context might never report an
8170 : * RCU quiescent state. Therefore, the following code causes
8171 : * cond_resched() to report a quiescent state, but only when RCU
8172 : * is in urgent need of one.
8173 : */
8174 : #ifndef CONFIG_PREEMPT_RCU
8175 : rcu_all_qs();
8176 : #endif
8177 737 : return 0;
8178 : }
8179 : EXPORT_SYMBOL(__cond_resched);
8180 : #endif
8181 :
8182 : #ifdef CONFIG_PREEMPT_DYNAMIC
8183 : #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
8184 : #define cond_resched_dynamic_enabled __cond_resched
8185 : #define cond_resched_dynamic_disabled ((void *)&__static_call_return0)
8186 : DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
8187 : EXPORT_STATIC_CALL_TRAMP(cond_resched);
8188 :
8189 : #define might_resched_dynamic_enabled __cond_resched
8190 : #define might_resched_dynamic_disabled ((void *)&__static_call_return0)
8191 : DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
8192 : EXPORT_STATIC_CALL_TRAMP(might_resched);
8193 : #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
8194 : static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
8195 : int __sched dynamic_cond_resched(void)
8196 : {
8197 : if (!static_branch_unlikely(&sk_dynamic_cond_resched))
8198 : return 0;
8199 : return __cond_resched();
8200 : }
8201 : EXPORT_SYMBOL(dynamic_cond_resched);
8202 :
8203 : static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched);
8204 : int __sched dynamic_might_resched(void)
8205 : {
8206 : if (!static_branch_unlikely(&sk_dynamic_might_resched))
8207 : return 0;
8208 : return __cond_resched();
8209 : }
8210 : EXPORT_SYMBOL(dynamic_might_resched);
8211 : #endif
8212 : #endif
8213 :
8214 : /*
8215 : * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
8216 : * call schedule, and on return reacquire the lock.
8217 : *
8218 : * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
8219 : * operations here to prevent schedule() from being called twice (once via
8220 : * spin_unlock(), once by hand).
8221 : */
8222 0 : int __cond_resched_lock(spinlock_t *lock)
8223 : {
8224 0 : int resched = should_resched(PREEMPT_LOCK_OFFSET);
8225 0 : int ret = 0;
8226 :
8227 : lockdep_assert_held(lock);
8228 :
8229 0 : if (spin_needbreak(lock) || resched) {
8230 0 : spin_unlock(lock);
8231 0 : if (!_cond_resched())
8232 : cpu_relax();
8233 0 : ret = 1;
8234 : spin_lock(lock);
8235 : }
8236 0 : return ret;
8237 : }
8238 : EXPORT_SYMBOL(__cond_resched_lock);
8239 :
8240 0 : int __cond_resched_rwlock_read(rwlock_t *lock)
8241 : {
8242 0 : int resched = should_resched(PREEMPT_LOCK_OFFSET);
8243 0 : int ret = 0;
8244 :
8245 : lockdep_assert_held_read(lock);
8246 :
8247 0 : if (rwlock_needbreak(lock) || resched) {
8248 0 : read_unlock(lock);
8249 0 : if (!_cond_resched())
8250 : cpu_relax();
8251 0 : ret = 1;
8252 0 : read_lock(lock);
8253 : }
8254 0 : return ret;
8255 : }
8256 : EXPORT_SYMBOL(__cond_resched_rwlock_read);
8257 :
8258 0 : int __cond_resched_rwlock_write(rwlock_t *lock)
8259 : {
8260 0 : int resched = should_resched(PREEMPT_LOCK_OFFSET);
8261 0 : int ret = 0;
8262 :
8263 : lockdep_assert_held_write(lock);
8264 :
8265 0 : if (rwlock_needbreak(lock) || resched) {
8266 0 : write_unlock(lock);
8267 0 : if (!_cond_resched())
8268 : cpu_relax();
8269 0 : ret = 1;
8270 0 : write_lock(lock);
8271 : }
8272 0 : return ret;
8273 : }
8274 : EXPORT_SYMBOL(__cond_resched_rwlock_write);
8275 :
8276 : #ifdef CONFIG_PREEMPT_DYNAMIC
8277 :
8278 : #ifdef CONFIG_GENERIC_ENTRY
8279 : #include <linux/entry-common.h>
8280 : #endif
8281 :
8282 : /*
8283 : * SC:cond_resched
8284 : * SC:might_resched
8285 : * SC:preempt_schedule
8286 : * SC:preempt_schedule_notrace
8287 : * SC:irqentry_exit_cond_resched
8288 : *
8289 : *
8290 : * NONE:
8291 : * cond_resched <- __cond_resched
8292 : * might_resched <- RET0
8293 : * preempt_schedule <- NOP
8294 : * preempt_schedule_notrace <- NOP
8295 : * irqentry_exit_cond_resched <- NOP
8296 : *
8297 : * VOLUNTARY:
8298 : * cond_resched <- __cond_resched
8299 : * might_resched <- __cond_resched
8300 : * preempt_schedule <- NOP
8301 : * preempt_schedule_notrace <- NOP
8302 : * irqentry_exit_cond_resched <- NOP
8303 : *
8304 : * FULL:
8305 : * cond_resched <- RET0
8306 : * might_resched <- RET0
8307 : * preempt_schedule <- preempt_schedule
8308 : * preempt_schedule_notrace <- preempt_schedule_notrace
8309 : * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
8310 : */
8311 :
8312 : enum {
8313 : preempt_dynamic_undefined = -1,
8314 : preempt_dynamic_none,
8315 : preempt_dynamic_voluntary,
8316 : preempt_dynamic_full,
8317 : };
8318 :
8319 : int preempt_dynamic_mode = preempt_dynamic_undefined;
8320 :
8321 : int sched_dynamic_mode(const char *str)
8322 : {
8323 : if (!strcmp(str, "none"))
8324 : return preempt_dynamic_none;
8325 :
8326 : if (!strcmp(str, "voluntary"))
8327 : return preempt_dynamic_voluntary;
8328 :
8329 : if (!strcmp(str, "full"))
8330 : return preempt_dynamic_full;
8331 :
8332 : return -EINVAL;
8333 : }
8334 :
8335 : #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
8336 : #define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
8337 : #define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
8338 : #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
8339 : #define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key)
8340 : #define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key)
8341 : #else
8342 : #error "Unsupported PREEMPT_DYNAMIC mechanism"
8343 : #endif
8344 :
8345 : void sched_dynamic_update(int mode)
8346 : {
8347 : /*
8348 : * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
8349 : * the ZERO state, which is invalid.
8350 : */
8351 : preempt_dynamic_enable(cond_resched);
8352 : preempt_dynamic_enable(might_resched);
8353 : preempt_dynamic_enable(preempt_schedule);
8354 : preempt_dynamic_enable(preempt_schedule_notrace);
8355 : preempt_dynamic_enable(irqentry_exit_cond_resched);
8356 :
8357 : switch (mode) {
8358 : case preempt_dynamic_none:
8359 : preempt_dynamic_enable(cond_resched);
8360 : preempt_dynamic_disable(might_resched);
8361 : preempt_dynamic_disable(preempt_schedule);
8362 : preempt_dynamic_disable(preempt_schedule_notrace);
8363 : preempt_dynamic_disable(irqentry_exit_cond_resched);
8364 : pr_info("Dynamic Preempt: none\n");
8365 : break;
8366 :
8367 : case preempt_dynamic_voluntary:
8368 : preempt_dynamic_enable(cond_resched);
8369 : preempt_dynamic_enable(might_resched);
8370 : preempt_dynamic_disable(preempt_schedule);
8371 : preempt_dynamic_disable(preempt_schedule_notrace);
8372 : preempt_dynamic_disable(irqentry_exit_cond_resched);
8373 : pr_info("Dynamic Preempt: voluntary\n");
8374 : break;
8375 :
8376 : case preempt_dynamic_full:
8377 : preempt_dynamic_disable(cond_resched);
8378 : preempt_dynamic_disable(might_resched);
8379 : preempt_dynamic_enable(preempt_schedule);
8380 : preempt_dynamic_enable(preempt_schedule_notrace);
8381 : preempt_dynamic_enable(irqentry_exit_cond_resched);
8382 : pr_info("Dynamic Preempt: full\n");
8383 : break;
8384 : }
8385 :
8386 : preempt_dynamic_mode = mode;
8387 : }
8388 :
8389 : static int __init setup_preempt_mode(char *str)
8390 : {
8391 : int mode = sched_dynamic_mode(str);
8392 : if (mode < 0) {
8393 : pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
8394 : return 0;
8395 : }
8396 :
8397 : sched_dynamic_update(mode);
8398 : return 1;
8399 : }
8400 : __setup("preempt=", setup_preempt_mode);
8401 :
8402 : static void __init preempt_dynamic_init(void)
8403 : {
8404 : if (preempt_dynamic_mode == preempt_dynamic_undefined) {
8405 : if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
8406 : sched_dynamic_update(preempt_dynamic_none);
8407 : } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
8408 : sched_dynamic_update(preempt_dynamic_voluntary);
8409 : } else {
8410 : /* Default static call setting, nothing to do */
8411 : WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
8412 : preempt_dynamic_mode = preempt_dynamic_full;
8413 : pr_info("Dynamic Preempt: full\n");
8414 : }
8415 : }
8416 : }
8417 :
8418 : #else /* !CONFIG_PREEMPT_DYNAMIC */
8419 :
8420 : static inline void preempt_dynamic_init(void) { }
8421 :
8422 : #endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */
8423 :
8424 : /**
8425 : * yield - yield the current processor to other threads.
8426 : *
8427 : * Do not ever use this function, there's a 99% chance you're doing it wrong.
8428 : *
8429 : * The scheduler is at all times free to pick the calling task as the most
8430 : * eligible task to run, if removing the yield() call from your code breaks
8431 : * it, it's already broken.
8432 : *
8433 : * Typical broken usage is:
8434 : *
8435 : * while (!event)
8436 : * yield();
8437 : *
8438 : * where one assumes that yield() will let 'the other' process run that will
8439 : * make event true. If the current task is a SCHED_FIFO task that will never
8440 : * happen. Never use yield() as a progress guarantee!!
8441 : *
8442 : * If you want to use yield() to wait for something, use wait_event().
8443 : * If you want to use yield() to be 'nice' for others, use cond_resched().
8444 : * If you still want to use yield(), do not!
8445 : */
8446 0 : void __sched yield(void)
8447 : {
8448 0 : set_current_state(TASK_RUNNING);
8449 0 : do_sched_yield();
8450 0 : }
8451 : EXPORT_SYMBOL(yield);
8452 :
8453 : /**
8454 : * yield_to - yield the current processor to another thread in
8455 : * your thread group, or accelerate that thread toward the
8456 : * processor it's on.
8457 : * @p: target task
8458 : * @preempt: whether task preemption is allowed or not
8459 : *
8460 : * It's the caller's job to ensure that the target task struct
8461 : * can't go away on us before we can do any checks.
8462 : *
8463 : * Return:
8464 : * true (>0) if we indeed boosted the target task.
8465 : * false (0) if we failed to boost the target.
8466 : * -ESRCH if there's no task to yield to.
8467 : */
8468 0 : int __sched yield_to(struct task_struct *p, bool preempt)
8469 : {
8470 0 : struct task_struct *curr = current;
8471 : struct rq *rq, *p_rq;
8472 : unsigned long flags;
8473 0 : int yielded = 0;
8474 :
8475 0 : local_irq_save(flags);
8476 0 : rq = this_rq();
8477 :
8478 : again:
8479 0 : p_rq = task_rq(p);
8480 : /*
8481 : * If we're the only runnable task on the rq and target rq also
8482 : * has only one task, there's absolutely no point in yielding.
8483 : */
8484 0 : if (rq->nr_running == 1 && p_rq->nr_running == 1) {
8485 : yielded = -ESRCH;
8486 : goto out_irq;
8487 : }
8488 :
8489 0 : double_rq_lock(rq, p_rq);
8490 0 : if (task_rq(p) != p_rq) {
8491 : double_rq_unlock(rq, p_rq);
8492 : goto again;
8493 : }
8494 :
8495 0 : if (!curr->sched_class->yield_to_task)
8496 : goto out_unlock;
8497 :
8498 0 : if (curr->sched_class != p->sched_class)
8499 : goto out_unlock;
8500 :
8501 0 : if (task_running(p_rq, p) || !task_is_running(p))
8502 : goto out_unlock;
8503 :
8504 0 : yielded = curr->sched_class->yield_to_task(rq, p);
8505 : if (yielded) {
8506 : schedstat_inc(rq->yld_count);
8507 : /*
8508 : * Make p's CPU reschedule; pick_next_entity takes care of
8509 : * fairness.
8510 : */
8511 : if (preempt && rq != p_rq)
8512 : resched_curr(p_rq);
8513 : }
8514 :
8515 : out_unlock:
8516 0 : double_rq_unlock(rq, p_rq);
8517 : out_irq:
8518 0 : local_irq_restore(flags);
8519 :
8520 0 : if (yielded > 0)
8521 0 : schedule();
8522 :
8523 0 : return yielded;
8524 : }
8525 : EXPORT_SYMBOL_GPL(yield_to);
8526 :
8527 0 : int io_schedule_prepare(void)
8528 : {
8529 0 : int old_iowait = current->in_iowait;
8530 :
8531 0 : current->in_iowait = 1;
8532 0 : blk_flush_plug(current->plug, true);
8533 0 : return old_iowait;
8534 : }
8535 :
8536 0 : void io_schedule_finish(int token)
8537 : {
8538 0 : current->in_iowait = token;
8539 0 : }
8540 :
8541 : /*
8542 : * This task is about to go to sleep on IO. Increment rq->nr_iowait so
8543 : * that process accounting knows that this is a task in IO wait state.
8544 : */
8545 0 : long __sched io_schedule_timeout(long timeout)
8546 : {
8547 : int token;
8548 : long ret;
8549 :
8550 0 : token = io_schedule_prepare();
8551 0 : ret = schedule_timeout(timeout);
8552 0 : io_schedule_finish(token);
8553 :
8554 0 : return ret;
8555 : }
8556 : EXPORT_SYMBOL(io_schedule_timeout);
8557 :
8558 0 : void __sched io_schedule(void)
8559 : {
8560 : int token;
8561 :
8562 0 : token = io_schedule_prepare();
8563 0 : schedule();
8564 0 : io_schedule_finish(token);
8565 0 : }
8566 : EXPORT_SYMBOL(io_schedule);
8567 :
8568 : /**
8569 : * sys_sched_get_priority_max - return maximum RT priority.
8570 : * @policy: scheduling class.
8571 : *
8572 : * Return: On success, this syscall returns the maximum
8573 : * rt_priority that can be used by a given scheduling class.
8574 : * On failure, a negative error code is returned.
8575 : */
8576 0 : SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
8577 : {
8578 0 : int ret = -EINVAL;
8579 :
8580 : switch (policy) {
8581 : case SCHED_FIFO:
8582 : case SCHED_RR:
8583 0 : ret = MAX_RT_PRIO-1;
8584 : break;
8585 : case SCHED_DEADLINE:
8586 : case SCHED_NORMAL:
8587 : case SCHED_BATCH:
8588 : case SCHED_IDLE:
8589 : ret = 0;
8590 : break;
8591 : }
8592 0 : return ret;
8593 : }
8594 :
8595 : /**
8596 : * sys_sched_get_priority_min - return minimum RT priority.
8597 : * @policy: scheduling class.
8598 : *
8599 : * Return: On success, this syscall returns the minimum
8600 : * rt_priority that can be used by a given scheduling class.
8601 : * On failure, a negative error code is returned.
8602 : */
8603 0 : SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
8604 : {
8605 0 : int ret = -EINVAL;
8606 :
8607 : switch (policy) {
8608 : case SCHED_FIFO:
8609 : case SCHED_RR:
8610 0 : ret = 1;
8611 : break;
8612 : case SCHED_DEADLINE:
8613 : case SCHED_NORMAL:
8614 : case SCHED_BATCH:
8615 : case SCHED_IDLE:
8616 : ret = 0;
8617 : }
8618 0 : return ret;
8619 : }
8620 :
8621 0 : static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
8622 : {
8623 : struct task_struct *p;
8624 : unsigned int time_slice;
8625 : struct rq_flags rf;
8626 : struct rq *rq;
8627 : int retval;
8628 :
8629 0 : if (pid < 0)
8630 : return -EINVAL;
8631 :
8632 0 : retval = -ESRCH;
8633 : rcu_read_lock();
8634 0 : p = find_process_by_pid(pid);
8635 0 : if (!p)
8636 : goto out_unlock;
8637 :
8638 0 : retval = security_task_getscheduler(p);
8639 : if (retval)
8640 : goto out_unlock;
8641 :
8642 0 : rq = task_rq_lock(p, &rf);
8643 0 : time_slice = 0;
8644 0 : if (p->sched_class->get_rr_interval)
8645 0 : time_slice = p->sched_class->get_rr_interval(rq, p);
8646 0 : task_rq_unlock(rq, p, &rf);
8647 :
8648 : rcu_read_unlock();
8649 0 : jiffies_to_timespec64(time_slice, t);
8650 0 : return 0;
8651 :
8652 : out_unlock:
8653 : rcu_read_unlock();
8654 0 : return retval;
8655 : }
8656 :
8657 : /**
8658 : * sys_sched_rr_get_interval - return the default timeslice of a process.
8659 : * @pid: pid of the process.
8660 : * @interval: userspace pointer to the timeslice value.
8661 : *
8662 : * this syscall writes the default timeslice value of a given process
8663 : * into the user-space timespec buffer. A value of '0' means infinity.
8664 : *
8665 : * Return: On success, 0 and the timeslice is in @interval. Otherwise,
8666 : * an error code.
8667 : */
8668 0 : SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
8669 : struct __kernel_timespec __user *, interval)
8670 : {
8671 : struct timespec64 t;
8672 0 : int retval = sched_rr_get_interval(pid, &t);
8673 :
8674 0 : if (retval == 0)
8675 0 : retval = put_timespec64(&t, interval);
8676 :
8677 0 : return retval;
8678 : }
8679 :
8680 : #ifdef CONFIG_COMPAT_32BIT_TIME
8681 : SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
8682 : struct old_timespec32 __user *, interval)
8683 : {
8684 : struct timespec64 t;
8685 : int retval = sched_rr_get_interval(pid, &t);
8686 :
8687 : if (retval == 0)
8688 : retval = put_old_timespec32(&t, interval);
8689 : return retval;
8690 : }
8691 : #endif
8692 :
8693 0 : void sched_show_task(struct task_struct *p)
8694 : {
8695 0 : unsigned long free = 0;
8696 : int ppid;
8697 :
8698 0 : if (!try_get_task_stack(p))
8699 : return;
8700 :
8701 0 : pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
8702 :
8703 0 : if (task_is_running(p))
8704 0 : pr_cont(" running task ");
8705 : #ifdef CONFIG_DEBUG_STACK_USAGE
8706 : free = stack_not_used(p);
8707 : #endif
8708 0 : ppid = 0;
8709 : rcu_read_lock();
8710 0 : if (pid_alive(p))
8711 0 : ppid = task_pid_nr(rcu_dereference(p->real_parent));
8712 : rcu_read_unlock();
8713 0 : pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
8714 : free, task_pid_nr(p), ppid,
8715 : read_task_thread_flags(p));
8716 :
8717 0 : print_worker_info(KERN_INFO, p);
8718 0 : print_stop_info(KERN_INFO, p);
8719 0 : show_stack(p, NULL, KERN_INFO);
8720 0 : put_task_stack(p);
8721 : }
8722 : EXPORT_SYMBOL_GPL(sched_show_task);
8723 :
8724 : static inline bool
8725 : state_filter_match(unsigned long state_filter, struct task_struct *p)
8726 : {
8727 0 : unsigned int state = READ_ONCE(p->__state);
8728 :
8729 : /* no filter, everything matches */
8730 0 : if (!state_filter)
8731 : return true;
8732 :
8733 : /* filter, but doesn't match */
8734 0 : if (!(state & state_filter))
8735 : return false;
8736 :
8737 : /*
8738 : * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
8739 : * TASK_KILLABLE).
8740 : */
8741 0 : if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE)
8742 : return false;
8743 :
8744 : return true;
8745 : }
8746 :
8747 :
8748 0 : void show_state_filter(unsigned int state_filter)
8749 : {
8750 : struct task_struct *g, *p;
8751 :
8752 : rcu_read_lock();
8753 0 : for_each_process_thread(g, p) {
8754 : /*
8755 : * reset the NMI-timeout, listing all files on a slow
8756 : * console might take a lot of time:
8757 : * Also, reset softlockup watchdogs on all CPUs, because
8758 : * another CPU might be blocked waiting for us to process
8759 : * an IPI.
8760 : */
8761 : touch_nmi_watchdog();
8762 : touch_all_softlockup_watchdogs();
8763 0 : if (state_filter_match(state_filter, p))
8764 0 : sched_show_task(p);
8765 : }
8766 :
8767 : #ifdef CONFIG_SCHED_DEBUG
8768 0 : if (!state_filter)
8769 0 : sysrq_sched_debug_show();
8770 : #endif
8771 : rcu_read_unlock();
8772 : /*
8773 : * Only show locks if all tasks are dumped:
8774 : */
8775 : if (!state_filter)
8776 : debug_show_all_locks();
8777 0 : }
8778 :
8779 : /**
8780 : * init_idle - set up an idle thread for a given CPU
8781 : * @idle: task in question
8782 : * @cpu: CPU the idle task belongs to
8783 : *
8784 : * NOTE: this function does not set the idle thread's NEED_RESCHED
8785 : * flag, to make booting more robust.
8786 : */
8787 1 : void __init init_idle(struct task_struct *idle, int cpu)
8788 : {
8789 1 : struct rq *rq = cpu_rq(cpu);
8790 : unsigned long flags;
8791 :
8792 1 : __sched_fork(0, idle);
8793 :
8794 1 : raw_spin_lock_irqsave(&idle->pi_lock, flags);
8795 1 : raw_spin_rq_lock(rq);
8796 :
8797 1 : idle->__state = TASK_RUNNING;
8798 1 : idle->se.exec_start = sched_clock();
8799 : /*
8800 : * PF_KTHREAD should already be set at this point; regardless, make it
8801 : * look like a proper per-CPU kthread.
8802 : */
8803 1 : idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
8804 1 : kthread_set_per_cpu(idle, cpu);
8805 :
8806 : #ifdef CONFIG_SMP
8807 : /*
8808 : * It's possible that init_idle() gets called multiple times on a task,
8809 : * in that case do_set_cpus_allowed() will not do the right thing.
8810 : *
8811 : * And since this is boot we can forgo the serialization.
8812 : */
8813 : set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
8814 : #endif
8815 : /*
8816 : * We're having a chicken and egg problem, even though we are
8817 : * holding rq->lock, the CPU isn't yet set to this CPU so the
8818 : * lockdep check in task_group() will fail.
8819 : *
8820 : * Similar case to sched_fork(). / Alternatively we could
8821 : * use task_rq_lock() here and obtain the other rq->lock.
8822 : *
8823 : * Silence PROVE_RCU
8824 : */
8825 : rcu_read_lock();
8826 1 : __set_task_cpu(idle, cpu);
8827 : rcu_read_unlock();
8828 :
8829 1 : rq->idle = idle;
8830 1 : rcu_assign_pointer(rq->curr, idle);
8831 1 : idle->on_rq = TASK_ON_RQ_QUEUED;
8832 : #ifdef CONFIG_SMP
8833 : idle->on_cpu = 1;
8834 : #endif
8835 1 : raw_spin_rq_unlock(rq);
8836 2 : raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
8837 :
8838 : /* Set the preempt count _outside_ the spinlocks! */
8839 1 : init_idle_preempt_count(idle, cpu);
8840 :
8841 : /*
8842 : * The idle tasks have their own, simple scheduling class:
8843 : */
8844 1 : idle->sched_class = &idle_sched_class;
8845 1 : ftrace_graph_init_idle_task(idle, cpu);
8846 1 : vtime_init_idle(idle, cpu);
8847 : #ifdef CONFIG_SMP
8848 : sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
8849 : #endif
8850 1 : }
8851 :
8852 : #ifdef CONFIG_SMP
8853 :
8854 : int cpuset_cpumask_can_shrink(const struct cpumask *cur,
8855 : const struct cpumask *trial)
8856 : {
8857 : int ret = 1;
8858 :
8859 : if (cpumask_empty(cur))
8860 : return ret;
8861 :
8862 : ret = dl_cpuset_cpumask_can_shrink(cur, trial);
8863 :
8864 : return ret;
8865 : }
8866 :
8867 : int task_can_attach(struct task_struct *p,
8868 : const struct cpumask *cs_cpus_allowed)
8869 : {
8870 : int ret = 0;
8871 :
8872 : /*
8873 : * Kthreads which disallow setaffinity shouldn't be moved
8874 : * to a new cpuset; we don't want to change their CPU
8875 : * affinity and isolating such threads by their set of
8876 : * allowed nodes is unnecessary. Thus, cpusets are not
8877 : * applicable for such threads. This prevents checking for
8878 : * success of set_cpus_allowed_ptr() on all attached tasks
8879 : * before cpus_mask may be changed.
8880 : */
8881 : if (p->flags & PF_NO_SETAFFINITY) {
8882 : ret = -EINVAL;
8883 : goto out;
8884 : }
8885 :
8886 : if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
8887 : cs_cpus_allowed)) {
8888 : int cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
8889 :
8890 : ret = dl_cpu_busy(cpu, p);
8891 : }
8892 :
8893 : out:
8894 : return ret;
8895 : }
8896 :
8897 : bool sched_smp_initialized __read_mostly;
8898 :
8899 : #ifdef CONFIG_NUMA_BALANCING
8900 : /* Migrate current task p to target_cpu */
8901 : int migrate_task_to(struct task_struct *p, int target_cpu)
8902 : {
8903 : struct migration_arg arg = { p, target_cpu };
8904 : int curr_cpu = task_cpu(p);
8905 :
8906 : if (curr_cpu == target_cpu)
8907 : return 0;
8908 :
8909 : if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
8910 : return -EINVAL;
8911 :
8912 : /* TODO: This is not properly updating schedstats */
8913 :
8914 : trace_sched_move_numa(p, curr_cpu, target_cpu);
8915 : return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
8916 : }
8917 :
8918 : /*
8919 : * Requeue a task on a given node and accurately track the number of NUMA
8920 : * tasks on the runqueues
8921 : */
8922 : void sched_setnuma(struct task_struct *p, int nid)
8923 : {
8924 : bool queued, running;
8925 : struct rq_flags rf;
8926 : struct rq *rq;
8927 :
8928 : rq = task_rq_lock(p, &rf);
8929 : queued = task_on_rq_queued(p);
8930 : running = task_current(rq, p);
8931 :
8932 : if (queued)
8933 : dequeue_task(rq, p, DEQUEUE_SAVE);
8934 : if (running)
8935 : put_prev_task(rq, p);
8936 :
8937 : p->numa_preferred_nid = nid;
8938 :
8939 : if (queued)
8940 : enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
8941 : if (running)
8942 : set_next_task(rq, p);
8943 : task_rq_unlock(rq, p, &rf);
8944 : }
8945 : #endif /* CONFIG_NUMA_BALANCING */
8946 :
8947 : #ifdef CONFIG_HOTPLUG_CPU
8948 : /*
8949 : * Ensure that the idle task is using init_mm right before its CPU goes
8950 : * offline.
8951 : */
8952 : void idle_task_exit(void)
8953 : {
8954 : struct mm_struct *mm = current->active_mm;
8955 :
8956 : BUG_ON(cpu_online(smp_processor_id()));
8957 : BUG_ON(current != this_rq()->idle);
8958 :
8959 : if (mm != &init_mm) {
8960 : switch_mm(mm, &init_mm, current);
8961 : finish_arch_post_lock_switch();
8962 : }
8963 :
8964 : /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
8965 : }
8966 :
8967 : static int __balance_push_cpu_stop(void *arg)
8968 : {
8969 : struct task_struct *p = arg;
8970 : struct rq *rq = this_rq();
8971 : struct rq_flags rf;
8972 : int cpu;
8973 :
8974 : raw_spin_lock_irq(&p->pi_lock);
8975 : rq_lock(rq, &rf);
8976 :
8977 : update_rq_clock(rq);
8978 :
8979 : if (task_rq(p) == rq && task_on_rq_queued(p)) {
8980 : cpu = select_fallback_rq(rq->cpu, p);
8981 : rq = __migrate_task(rq, &rf, p, cpu);
8982 : }
8983 :
8984 : rq_unlock(rq, &rf);
8985 : raw_spin_unlock_irq(&p->pi_lock);
8986 :
8987 : put_task_struct(p);
8988 :
8989 : return 0;
8990 : }
8991 :
8992 : static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
8993 :
8994 : /*
8995 : * Ensure we only run per-cpu kthreads once the CPU goes !active.
8996 : *
8997 : * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only
8998 : * effective when the hotplug motion is down.
8999 : */
9000 : static void balance_push(struct rq *rq)
9001 : {
9002 : struct task_struct *push_task = rq->curr;
9003 :
9004 : lockdep_assert_rq_held(rq);
9005 :
9006 : /*
9007 : * Ensure the thing is persistent until balance_push_set(.on = false);
9008 : */
9009 : rq->balance_callback = &balance_push_callback;
9010 :
9011 : /*
9012 : * Only active while going offline and when invoked on the outgoing
9013 : * CPU.
9014 : */
9015 : if (!cpu_dying(rq->cpu) || rq != this_rq())
9016 : return;
9017 :
9018 : /*
9019 : * Both the cpu-hotplug and stop task are in this case and are
9020 : * required to complete the hotplug process.
9021 : */
9022 : if (kthread_is_per_cpu(push_task) ||
9023 : is_migration_disabled(push_task)) {
9024 :
9025 : /*
9026 : * If this is the idle task on the outgoing CPU try to wake
9027 : * up the hotplug control thread which might wait for the
9028 : * last task to vanish. The rcuwait_active() check is
9029 : * accurate here because the waiter is pinned on this CPU
9030 : * and can't obviously be running in parallel.
9031 : *
9032 : * On RT kernels this also has to check whether there are
9033 : * pinned and scheduled out tasks on the runqueue. They
9034 : * need to leave the migrate disabled section first.
9035 : */
9036 : if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
9037 : rcuwait_active(&rq->hotplug_wait)) {
9038 : raw_spin_rq_unlock(rq);
9039 : rcuwait_wake_up(&rq->hotplug_wait);
9040 : raw_spin_rq_lock(rq);
9041 : }
9042 : return;
9043 : }
9044 :
9045 : get_task_struct(push_task);
9046 : /*
9047 : * Temporarily drop rq->lock such that we can wake-up the stop task.
9048 : * Both preemption and IRQs are still disabled.
9049 : */
9050 : raw_spin_rq_unlock(rq);
9051 : stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
9052 : this_cpu_ptr(&push_work));
9053 : /*
9054 : * At this point need_resched() is true and we'll take the loop in
9055 : * schedule(). The next pick is obviously going to be the stop task
9056 : * which kthread_is_per_cpu() and will push this task away.
9057 : */
9058 : raw_spin_rq_lock(rq);
9059 : }
9060 :
9061 : static void balance_push_set(int cpu, bool on)
9062 : {
9063 : struct rq *rq = cpu_rq(cpu);
9064 : struct rq_flags rf;
9065 :
9066 : rq_lock_irqsave(rq, &rf);
9067 : if (on) {
9068 : WARN_ON_ONCE(rq->balance_callback);
9069 : rq->balance_callback = &balance_push_callback;
9070 : } else if (rq->balance_callback == &balance_push_callback) {
9071 : rq->balance_callback = NULL;
9072 : }
9073 : rq_unlock_irqrestore(rq, &rf);
9074 : }
9075 :
9076 : /*
9077 : * Invoked from a CPUs hotplug control thread after the CPU has been marked
9078 : * inactive. All tasks which are not per CPU kernel threads are either
9079 : * pushed off this CPU now via balance_push() or placed on a different CPU
9080 : * during wakeup. Wait until the CPU is quiescent.
9081 : */
9082 : static void balance_hotplug_wait(void)
9083 : {
9084 : struct rq *rq = this_rq();
9085 :
9086 : rcuwait_wait_event(&rq->hotplug_wait,
9087 : rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
9088 : TASK_UNINTERRUPTIBLE);
9089 : }
9090 :
9091 : #else
9092 :
9093 : static inline void balance_push(struct rq *rq)
9094 : {
9095 : }
9096 :
9097 : static inline void balance_push_set(int cpu, bool on)
9098 : {
9099 : }
9100 :
9101 : static inline void balance_hotplug_wait(void)
9102 : {
9103 : }
9104 :
9105 : #endif /* CONFIG_HOTPLUG_CPU */
9106 :
9107 : void set_rq_online(struct rq *rq)
9108 : {
9109 : if (!rq->online) {
9110 : const struct sched_class *class;
9111 :
9112 : cpumask_set_cpu(rq->cpu, rq->rd->online);
9113 : rq->online = 1;
9114 :
9115 : for_each_class(class) {
9116 : if (class->rq_online)
9117 : class->rq_online(rq);
9118 : }
9119 : }
9120 : }
9121 :
9122 : void set_rq_offline(struct rq *rq)
9123 : {
9124 : if (rq->online) {
9125 : const struct sched_class *class;
9126 :
9127 : for_each_class(class) {
9128 : if (class->rq_offline)
9129 : class->rq_offline(rq);
9130 : }
9131 :
9132 : cpumask_clear_cpu(rq->cpu, rq->rd->online);
9133 : rq->online = 0;
9134 : }
9135 : }
9136 :
9137 : /*
9138 : * used to mark begin/end of suspend/resume:
9139 : */
9140 : static int num_cpus_frozen;
9141 :
9142 : /*
9143 : * Update cpusets according to cpu_active mask. If cpusets are
9144 : * disabled, cpuset_update_active_cpus() becomes a simple wrapper
9145 : * around partition_sched_domains().
9146 : *
9147 : * If we come here as part of a suspend/resume, don't touch cpusets because we
9148 : * want to restore it back to its original state upon resume anyway.
9149 : */
9150 : static void cpuset_cpu_active(void)
9151 : {
9152 : if (cpuhp_tasks_frozen) {
9153 : /*
9154 : * num_cpus_frozen tracks how many CPUs are involved in suspend
9155 : * resume sequence. As long as this is not the last online
9156 : * operation in the resume sequence, just build a single sched
9157 : * domain, ignoring cpusets.
9158 : */
9159 : partition_sched_domains(1, NULL, NULL);
9160 : if (--num_cpus_frozen)
9161 : return;
9162 : /*
9163 : * This is the last CPU online operation. So fall through and
9164 : * restore the original sched domains by considering the
9165 : * cpuset configurations.
9166 : */
9167 : cpuset_force_rebuild();
9168 : }
9169 : cpuset_update_active_cpus();
9170 : }
9171 :
9172 : static int cpuset_cpu_inactive(unsigned int cpu)
9173 : {
9174 : if (!cpuhp_tasks_frozen) {
9175 : int ret = dl_cpu_busy(cpu, NULL);
9176 :
9177 : if (ret)
9178 : return ret;
9179 : cpuset_update_active_cpus();
9180 : } else {
9181 : num_cpus_frozen++;
9182 : partition_sched_domains(1, NULL, NULL);
9183 : }
9184 : return 0;
9185 : }
9186 :
9187 : int sched_cpu_activate(unsigned int cpu)
9188 : {
9189 : struct rq *rq = cpu_rq(cpu);
9190 : struct rq_flags rf;
9191 :
9192 : /*
9193 : * Clear the balance_push callback and prepare to schedule
9194 : * regular tasks.
9195 : */
9196 : balance_push_set(cpu, false);
9197 :
9198 : #ifdef CONFIG_SCHED_SMT
9199 : /*
9200 : * When going up, increment the number of cores with SMT present.
9201 : */
9202 : if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
9203 : static_branch_inc_cpuslocked(&sched_smt_present);
9204 : #endif
9205 : set_cpu_active(cpu, true);
9206 :
9207 : if (sched_smp_initialized) {
9208 : sched_update_numa(cpu, true);
9209 : sched_domains_numa_masks_set(cpu);
9210 : cpuset_cpu_active();
9211 : }
9212 :
9213 : /*
9214 : * Put the rq online, if not already. This happens:
9215 : *
9216 : * 1) In the early boot process, because we build the real domains
9217 : * after all CPUs have been brought up.
9218 : *
9219 : * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
9220 : * domains.
9221 : */
9222 : rq_lock_irqsave(rq, &rf);
9223 : if (rq->rd) {
9224 : BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
9225 : set_rq_online(rq);
9226 : }
9227 : rq_unlock_irqrestore(rq, &rf);
9228 :
9229 : return 0;
9230 : }
9231 :
9232 : int sched_cpu_deactivate(unsigned int cpu)
9233 : {
9234 : struct rq *rq = cpu_rq(cpu);
9235 : struct rq_flags rf;
9236 : int ret;
9237 :
9238 : /*
9239 : * Remove CPU from nohz.idle_cpus_mask to prevent participating in
9240 : * load balancing when not active
9241 : */
9242 : nohz_balance_exit_idle(rq);
9243 :
9244 : set_cpu_active(cpu, false);
9245 :
9246 : /*
9247 : * From this point forward, this CPU will refuse to run any task that
9248 : * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
9249 : * push those tasks away until this gets cleared, see
9250 : * sched_cpu_dying().
9251 : */
9252 : balance_push_set(cpu, true);
9253 :
9254 : /*
9255 : * We've cleared cpu_active_mask / set balance_push, wait for all
9256 : * preempt-disabled and RCU users of this state to go away such that
9257 : * all new such users will observe it.
9258 : *
9259 : * Specifically, we rely on ttwu to no longer target this CPU, see
9260 : * ttwu_queue_cond() and is_cpu_allowed().
9261 : *
9262 : * Do sync before park smpboot threads to take care the rcu boost case.
9263 : */
9264 : synchronize_rcu();
9265 :
9266 : rq_lock_irqsave(rq, &rf);
9267 : if (rq->rd) {
9268 : update_rq_clock(rq);
9269 : BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
9270 : set_rq_offline(rq);
9271 : }
9272 : rq_unlock_irqrestore(rq, &rf);
9273 :
9274 : #ifdef CONFIG_SCHED_SMT
9275 : /*
9276 : * When going down, decrement the number of cores with SMT present.
9277 : */
9278 : if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
9279 : static_branch_dec_cpuslocked(&sched_smt_present);
9280 :
9281 : sched_core_cpu_deactivate(cpu);
9282 : #endif
9283 :
9284 : if (!sched_smp_initialized)
9285 : return 0;
9286 :
9287 : sched_update_numa(cpu, false);
9288 : ret = cpuset_cpu_inactive(cpu);
9289 : if (ret) {
9290 : balance_push_set(cpu, false);
9291 : set_cpu_active(cpu, true);
9292 : sched_update_numa(cpu, true);
9293 : return ret;
9294 : }
9295 : sched_domains_numa_masks_clear(cpu);
9296 : return 0;
9297 : }
9298 :
9299 : static void sched_rq_cpu_starting(unsigned int cpu)
9300 : {
9301 : struct rq *rq = cpu_rq(cpu);
9302 :
9303 : rq->calc_load_update = calc_load_update;
9304 : update_max_interval();
9305 : }
9306 :
9307 : int sched_cpu_starting(unsigned int cpu)
9308 : {
9309 : sched_core_cpu_starting(cpu);
9310 : sched_rq_cpu_starting(cpu);
9311 : sched_tick_start(cpu);
9312 : return 0;
9313 : }
9314 :
9315 : #ifdef CONFIG_HOTPLUG_CPU
9316 :
9317 : /*
9318 : * Invoked immediately before the stopper thread is invoked to bring the
9319 : * CPU down completely. At this point all per CPU kthreads except the
9320 : * hotplug thread (current) and the stopper thread (inactive) have been
9321 : * either parked or have been unbound from the outgoing CPU. Ensure that
9322 : * any of those which might be on the way out are gone.
9323 : *
9324 : * If after this point a bound task is being woken on this CPU then the
9325 : * responsible hotplug callback has failed to do it's job.
9326 : * sched_cpu_dying() will catch it with the appropriate fireworks.
9327 : */
9328 : int sched_cpu_wait_empty(unsigned int cpu)
9329 : {
9330 : balance_hotplug_wait();
9331 : return 0;
9332 : }
9333 :
9334 : /*
9335 : * Since this CPU is going 'away' for a while, fold any nr_active delta we
9336 : * might have. Called from the CPU stopper task after ensuring that the
9337 : * stopper is the last running task on the CPU, so nr_active count is
9338 : * stable. We need to take the teardown thread which is calling this into
9339 : * account, so we hand in adjust = 1 to the load calculation.
9340 : *
9341 : * Also see the comment "Global load-average calculations".
9342 : */
9343 : static void calc_load_migrate(struct rq *rq)
9344 : {
9345 : long delta = calc_load_fold_active(rq, 1);
9346 :
9347 : if (delta)
9348 : atomic_long_add(delta, &calc_load_tasks);
9349 : }
9350 :
9351 : static void dump_rq_tasks(struct rq *rq, const char *loglvl)
9352 : {
9353 : struct task_struct *g, *p;
9354 : int cpu = cpu_of(rq);
9355 :
9356 : lockdep_assert_rq_held(rq);
9357 :
9358 : printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
9359 : for_each_process_thread(g, p) {
9360 : if (task_cpu(p) != cpu)
9361 : continue;
9362 :
9363 : if (!task_on_rq_queued(p))
9364 : continue;
9365 :
9366 : printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
9367 : }
9368 : }
9369 :
9370 : int sched_cpu_dying(unsigned int cpu)
9371 : {
9372 : struct rq *rq = cpu_rq(cpu);
9373 : struct rq_flags rf;
9374 :
9375 : /* Handle pending wakeups and then migrate everything off */
9376 : sched_tick_stop(cpu);
9377 :
9378 : rq_lock_irqsave(rq, &rf);
9379 : if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
9380 : WARN(true, "Dying CPU not properly vacated!");
9381 : dump_rq_tasks(rq, KERN_WARNING);
9382 : }
9383 : rq_unlock_irqrestore(rq, &rf);
9384 :
9385 : calc_load_migrate(rq);
9386 : update_max_interval();
9387 : hrtick_clear(rq);
9388 : sched_core_cpu_dying(cpu);
9389 : return 0;
9390 : }
9391 : #endif
9392 :
9393 : void __init sched_init_smp(void)
9394 : {
9395 : sched_init_numa(NUMA_NO_NODE);
9396 :
9397 : /*
9398 : * There's no userspace yet to cause hotplug operations; hence all the
9399 : * CPU masks are stable and all blatant races in the below code cannot
9400 : * happen.
9401 : */
9402 : mutex_lock(&sched_domains_mutex);
9403 : sched_init_domains(cpu_active_mask);
9404 : mutex_unlock(&sched_domains_mutex);
9405 :
9406 : /* Move init over to a non-isolated CPU */
9407 : if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0)
9408 : BUG();
9409 : current->flags &= ~PF_NO_SETAFFINITY;
9410 : sched_init_granularity();
9411 :
9412 : init_sched_rt_class();
9413 : init_sched_dl_class();
9414 :
9415 : sched_smp_initialized = true;
9416 : }
9417 :
9418 : static int __init migration_init(void)
9419 : {
9420 : sched_cpu_starting(smp_processor_id());
9421 : return 0;
9422 : }
9423 : early_initcall(migration_init);
9424 :
9425 : #else
9426 1 : void __init sched_init_smp(void)
9427 : {
9428 1 : sched_init_granularity();
9429 1 : }
9430 : #endif /* CONFIG_SMP */
9431 :
9432 0 : int in_sched_functions(unsigned long addr)
9433 : {
9434 0 : return in_lock_functions(addr) ||
9435 0 : (addr >= (unsigned long)__sched_text_start
9436 0 : && addr < (unsigned long)__sched_text_end);
9437 : }
9438 :
9439 : #ifdef CONFIG_CGROUP_SCHED
9440 : /*
9441 : * Default task group.
9442 : * Every task in system belongs to this group at bootup.
9443 : */
9444 : struct task_group root_task_group;
9445 : LIST_HEAD(task_groups);
9446 :
9447 : /* Cacheline aligned slab cache for task_group */
9448 : static struct kmem_cache *task_group_cache __read_mostly;
9449 : #endif
9450 :
9451 : DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
9452 : DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
9453 :
9454 1 : void __init sched_init(void)
9455 : {
9456 1 : unsigned long ptr = 0;
9457 : int i;
9458 :
9459 : /* Make sure the linker didn't screw up */
9460 1 : BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
9461 : &fair_sched_class + 1 != &rt_sched_class ||
9462 : &rt_sched_class + 1 != &dl_sched_class);
9463 : #ifdef CONFIG_SMP
9464 : BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
9465 : #endif
9466 :
9467 1 : wait_bit_init();
9468 :
9469 : #ifdef CONFIG_FAIR_GROUP_SCHED
9470 : ptr += 2 * nr_cpu_ids * sizeof(void **);
9471 : #endif
9472 : #ifdef CONFIG_RT_GROUP_SCHED
9473 : ptr += 2 * nr_cpu_ids * sizeof(void **);
9474 : #endif
9475 : if (ptr) {
9476 : ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
9477 :
9478 : #ifdef CONFIG_FAIR_GROUP_SCHED
9479 : root_task_group.se = (struct sched_entity **)ptr;
9480 : ptr += nr_cpu_ids * sizeof(void **);
9481 :
9482 : root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9483 : ptr += nr_cpu_ids * sizeof(void **);
9484 :
9485 : root_task_group.shares = ROOT_TASK_GROUP_LOAD;
9486 : init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
9487 : #endif /* CONFIG_FAIR_GROUP_SCHED */
9488 : #ifdef CONFIG_RT_GROUP_SCHED
9489 : root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9490 : ptr += nr_cpu_ids * sizeof(void **);
9491 :
9492 : root_task_group.rt_rq = (struct rt_rq **)ptr;
9493 : ptr += nr_cpu_ids * sizeof(void **);
9494 :
9495 : #endif /* CONFIG_RT_GROUP_SCHED */
9496 : }
9497 : #ifdef CONFIG_CPUMASK_OFFSTACK
9498 : for_each_possible_cpu(i) {
9499 : per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
9500 : cpumask_size(), GFP_KERNEL, cpu_to_node(i));
9501 : per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
9502 : cpumask_size(), GFP_KERNEL, cpu_to_node(i));
9503 : }
9504 : #endif /* CONFIG_CPUMASK_OFFSTACK */
9505 :
9506 1 : init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
9507 :
9508 : #ifdef CONFIG_SMP
9509 : init_defrootdomain();
9510 : #endif
9511 :
9512 : #ifdef CONFIG_RT_GROUP_SCHED
9513 : init_rt_bandwidth(&root_task_group.rt_bandwidth,
9514 : global_rt_period(), global_rt_runtime());
9515 : #endif /* CONFIG_RT_GROUP_SCHED */
9516 :
9517 : #ifdef CONFIG_CGROUP_SCHED
9518 : task_group_cache = KMEM_CACHE(task_group, 0);
9519 :
9520 : list_add(&root_task_group.list, &task_groups);
9521 : INIT_LIST_HEAD(&root_task_group.children);
9522 : INIT_LIST_HEAD(&root_task_group.siblings);
9523 : autogroup_init(&init_task);
9524 : #endif /* CONFIG_CGROUP_SCHED */
9525 :
9526 2 : for_each_possible_cpu(i) {
9527 : struct rq *rq;
9528 :
9529 1 : rq = cpu_rq(i);
9530 : raw_spin_lock_init(&rq->__lock);
9531 1 : rq->nr_running = 0;
9532 1 : rq->calc_load_active = 0;
9533 1 : rq->calc_load_update = jiffies + LOAD_FREQ;
9534 1 : init_cfs_rq(&rq->cfs);
9535 1 : init_rt_rq(&rq->rt);
9536 1 : init_dl_rq(&rq->dl);
9537 : #ifdef CONFIG_FAIR_GROUP_SCHED
9538 : INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
9539 : rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
9540 : /*
9541 : * How much CPU bandwidth does root_task_group get?
9542 : *
9543 : * In case of task-groups formed thr' the cgroup filesystem, it
9544 : * gets 100% of the CPU resources in the system. This overall
9545 : * system CPU resource is divided among the tasks of
9546 : * root_task_group and its child task-groups in a fair manner,
9547 : * based on each entity's (task or task-group's) weight
9548 : * (se->load.weight).
9549 : *
9550 : * In other words, if root_task_group has 10 tasks of weight
9551 : * 1024) and two child groups A0 and A1 (of weight 1024 each),
9552 : * then A0's share of the CPU resource is:
9553 : *
9554 : * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
9555 : *
9556 : * We achieve this by letting root_task_group's tasks sit
9557 : * directly in rq->cfs (i.e root_task_group->se[] = NULL).
9558 : */
9559 : init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
9560 : #endif /* CONFIG_FAIR_GROUP_SCHED */
9561 :
9562 1 : rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
9563 : #ifdef CONFIG_RT_GROUP_SCHED
9564 : init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
9565 : #endif
9566 : #ifdef CONFIG_SMP
9567 : rq->sd = NULL;
9568 : rq->rd = NULL;
9569 : rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
9570 : rq->balance_callback = &balance_push_callback;
9571 : rq->active_balance = 0;
9572 : rq->next_balance = jiffies;
9573 : rq->push_cpu = 0;
9574 : rq->cpu = i;
9575 : rq->online = 0;
9576 : rq->idle_stamp = 0;
9577 : rq->avg_idle = 2*sysctl_sched_migration_cost;
9578 : rq->wake_stamp = jiffies;
9579 : rq->wake_avg_idle = rq->avg_idle;
9580 : rq->max_idle_balance_cost = sysctl_sched_migration_cost;
9581 :
9582 : INIT_LIST_HEAD(&rq->cfs_tasks);
9583 :
9584 : rq_attach_root(rq, &def_root_domain);
9585 : #ifdef CONFIG_NO_HZ_COMMON
9586 : rq->last_blocked_load_update_tick = jiffies;
9587 : atomic_set(&rq->nohz_flags, 0);
9588 :
9589 : INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
9590 : #endif
9591 : #ifdef CONFIG_HOTPLUG_CPU
9592 : rcuwait_init(&rq->hotplug_wait);
9593 : #endif
9594 : #endif /* CONFIG_SMP */
9595 1 : hrtick_rq_init(rq);
9596 2 : atomic_set(&rq->nr_iowait, 0);
9597 :
9598 : #ifdef CONFIG_SCHED_CORE
9599 : rq->core = rq;
9600 : rq->core_pick = NULL;
9601 : rq->core_enabled = 0;
9602 : rq->core_tree = RB_ROOT;
9603 : rq->core_forceidle_count = 0;
9604 : rq->core_forceidle_occupation = 0;
9605 : rq->core_forceidle_start = 0;
9606 :
9607 : rq->core_cookie = 0UL;
9608 : #endif
9609 : }
9610 :
9611 1 : set_load_weight(&init_task, false);
9612 :
9613 : /*
9614 : * The boot idle thread does lazy MMU switching as well:
9615 : */
9616 1 : mmgrab(&init_mm);
9617 1 : enter_lazy_tlb(&init_mm, current);
9618 :
9619 : /*
9620 : * The idle task doesn't need the kthread struct to function, but it
9621 : * is dressed up as a per-CPU kthread and thus needs to play the part
9622 : * if we want to avoid special-casing it in code that deals with per-CPU
9623 : * kthreads.
9624 : */
9625 1 : WARN_ON(!set_kthread_struct(current));
9626 :
9627 : /*
9628 : * Make us the idle thread. Technically, schedule() should not be
9629 : * called from this thread, however somewhere below it might be,
9630 : * but because we are the idle thread, we just pick up running again
9631 : * when this runqueue becomes "idle".
9632 : */
9633 1 : init_idle(current, smp_processor_id());
9634 :
9635 1 : calc_load_update = jiffies + LOAD_FREQ;
9636 :
9637 : #ifdef CONFIG_SMP
9638 : idle_thread_set_boot_cpu();
9639 : balance_push_set(smp_processor_id(), false);
9640 : #endif
9641 1 : init_sched_fair_class();
9642 :
9643 : psi_init();
9644 :
9645 : init_uclamp();
9646 :
9647 : preempt_dynamic_init();
9648 :
9649 1 : scheduler_running = 1;
9650 1 : }
9651 :
9652 : #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
9653 :
9654 : void __might_sleep(const char *file, int line)
9655 : {
9656 : unsigned int state = get_current_state();
9657 : /*
9658 : * Blocking primitives will set (and therefore destroy) current->state,
9659 : * since we will exit with TASK_RUNNING make sure we enter with it,
9660 : * otherwise we will destroy state.
9661 : */
9662 : WARN_ONCE(state != TASK_RUNNING && current->task_state_change,
9663 : "do not call blocking ops when !TASK_RUNNING; "
9664 : "state=%x set at [<%p>] %pS\n", state,
9665 : (void *)current->task_state_change,
9666 : (void *)current->task_state_change);
9667 :
9668 : __might_resched(file, line, 0);
9669 : }
9670 : EXPORT_SYMBOL(__might_sleep);
9671 :
9672 : static void print_preempt_disable_ip(int preempt_offset, unsigned long ip)
9673 : {
9674 : if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT))
9675 : return;
9676 :
9677 : if (preempt_count() == preempt_offset)
9678 : return;
9679 :
9680 : pr_err("Preemption disabled at:");
9681 : print_ip_sym(KERN_ERR, ip);
9682 : }
9683 :
9684 : static inline bool resched_offsets_ok(unsigned int offsets)
9685 : {
9686 : unsigned int nested = preempt_count();
9687 :
9688 : nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT;
9689 :
9690 : return nested == offsets;
9691 : }
9692 :
9693 : void __might_resched(const char *file, int line, unsigned int offsets)
9694 : {
9695 : /* Ratelimiting timestamp: */
9696 : static unsigned long prev_jiffy;
9697 :
9698 : unsigned long preempt_disable_ip;
9699 :
9700 : /* WARN_ON_ONCE() by default, no rate limit required: */
9701 : rcu_sleep_check();
9702 :
9703 : if ((resched_offsets_ok(offsets) && !irqs_disabled() &&
9704 : !is_idle_task(current) && !current->non_block_count) ||
9705 : system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
9706 : oops_in_progress)
9707 : return;
9708 :
9709 : if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9710 : return;
9711 : prev_jiffy = jiffies;
9712 :
9713 : /* Save this before calling printk(), since that will clobber it: */
9714 : preempt_disable_ip = get_preempt_disable_ip(current);
9715 :
9716 : pr_err("BUG: sleeping function called from invalid context at %s:%d\n",
9717 : file, line);
9718 : pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
9719 : in_atomic(), irqs_disabled(), current->non_block_count,
9720 : current->pid, current->comm);
9721 : pr_err("preempt_count: %x, expected: %x\n", preempt_count(),
9722 : offsets & MIGHT_RESCHED_PREEMPT_MASK);
9723 :
9724 : if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
9725 : pr_err("RCU nest depth: %d, expected: %u\n",
9726 : rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT);
9727 : }
9728 :
9729 : if (task_stack_end_corrupted(current))
9730 : pr_emerg("Thread overran stack, or stack corrupted\n");
9731 :
9732 : debug_show_held_locks(current);
9733 : if (irqs_disabled())
9734 : print_irqtrace_events(current);
9735 :
9736 : print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK,
9737 : preempt_disable_ip);
9738 :
9739 : dump_stack();
9740 : add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
9741 : }
9742 : EXPORT_SYMBOL(__might_resched);
9743 :
9744 : void __cant_sleep(const char *file, int line, int preempt_offset)
9745 : {
9746 : static unsigned long prev_jiffy;
9747 :
9748 : if (irqs_disabled())
9749 : return;
9750 :
9751 : if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
9752 : return;
9753 :
9754 : if (preempt_count() > preempt_offset)
9755 : return;
9756 :
9757 : if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9758 : return;
9759 : prev_jiffy = jiffies;
9760 :
9761 : printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
9762 : printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
9763 : in_atomic(), irqs_disabled(),
9764 : current->pid, current->comm);
9765 :
9766 : debug_show_held_locks(current);
9767 : dump_stack();
9768 : add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
9769 : }
9770 : EXPORT_SYMBOL_GPL(__cant_sleep);
9771 :
9772 : #ifdef CONFIG_SMP
9773 : void __cant_migrate(const char *file, int line)
9774 : {
9775 : static unsigned long prev_jiffy;
9776 :
9777 : if (irqs_disabled())
9778 : return;
9779 :
9780 : if (is_migration_disabled(current))
9781 : return;
9782 :
9783 : if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
9784 : return;
9785 :
9786 : if (preempt_count() > 0)
9787 : return;
9788 :
9789 : if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9790 : return;
9791 : prev_jiffy = jiffies;
9792 :
9793 : pr_err("BUG: assuming non migratable context at %s:%d\n", file, line);
9794 : pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n",
9795 : in_atomic(), irqs_disabled(), is_migration_disabled(current),
9796 : current->pid, current->comm);
9797 :
9798 : debug_show_held_locks(current);
9799 : dump_stack();
9800 : add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
9801 : }
9802 : EXPORT_SYMBOL_GPL(__cant_migrate);
9803 : #endif
9804 : #endif
9805 :
9806 : #ifdef CONFIG_MAGIC_SYSRQ
9807 : void normalize_rt_tasks(void)
9808 : {
9809 : struct task_struct *g, *p;
9810 : struct sched_attr attr = {
9811 : .sched_policy = SCHED_NORMAL,
9812 : };
9813 :
9814 : read_lock(&tasklist_lock);
9815 : for_each_process_thread(g, p) {
9816 : /*
9817 : * Only normalize user tasks:
9818 : */
9819 : if (p->flags & PF_KTHREAD)
9820 : continue;
9821 :
9822 : p->se.exec_start = 0;
9823 : schedstat_set(p->stats.wait_start, 0);
9824 : schedstat_set(p->stats.sleep_start, 0);
9825 : schedstat_set(p->stats.block_start, 0);
9826 :
9827 : if (!dl_task(p) && !rt_task(p)) {
9828 : /*
9829 : * Renice negative nice level userspace
9830 : * tasks back to 0:
9831 : */
9832 : if (task_nice(p) < 0)
9833 : set_user_nice(p, 0);
9834 : continue;
9835 : }
9836 :
9837 : __sched_setscheduler(p, &attr, false, false);
9838 : }
9839 : read_unlock(&tasklist_lock);
9840 : }
9841 :
9842 : #endif /* CONFIG_MAGIC_SYSRQ */
9843 :
9844 : #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
9845 : /*
9846 : * These functions are only useful for the IA64 MCA handling, or kdb.
9847 : *
9848 : * They can only be called when the whole system has been
9849 : * stopped - every CPU needs to be quiescent, and no scheduling
9850 : * activity can take place. Using them for anything else would
9851 : * be a serious bug, and as a result, they aren't even visible
9852 : * under any other configuration.
9853 : */
9854 :
9855 : /**
9856 : * curr_task - return the current task for a given CPU.
9857 : * @cpu: the processor in question.
9858 : *
9859 : * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
9860 : *
9861 : * Return: The current task for @cpu.
9862 : */
9863 : struct task_struct *curr_task(int cpu)
9864 : {
9865 : return cpu_curr(cpu);
9866 : }
9867 :
9868 : #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
9869 :
9870 : #ifdef CONFIG_IA64
9871 : /**
9872 : * ia64_set_curr_task - set the current task for a given CPU.
9873 : * @cpu: the processor in question.
9874 : * @p: the task pointer to set.
9875 : *
9876 : * Description: This function must only be used when non-maskable interrupts
9877 : * are serviced on a separate stack. It allows the architecture to switch the
9878 : * notion of the current task on a CPU in a non-blocking manner. This function
9879 : * must be called with all CPU's synchronized, and interrupts disabled, the
9880 : * and caller must save the original value of the current task (see
9881 : * curr_task() above) and restore that value before reenabling interrupts and
9882 : * re-starting the system.
9883 : *
9884 : * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
9885 : */
9886 : void ia64_set_curr_task(int cpu, struct task_struct *p)
9887 : {
9888 : cpu_curr(cpu) = p;
9889 : }
9890 :
9891 : #endif
9892 :
9893 : #ifdef CONFIG_CGROUP_SCHED
9894 : /* task_group_lock serializes the addition/removal of task groups */
9895 : static DEFINE_SPINLOCK(task_group_lock);
9896 :
9897 : static inline void alloc_uclamp_sched_group(struct task_group *tg,
9898 : struct task_group *parent)
9899 : {
9900 : #ifdef CONFIG_UCLAMP_TASK_GROUP
9901 : enum uclamp_id clamp_id;
9902 :
9903 : for_each_clamp_id(clamp_id) {
9904 : uclamp_se_set(&tg->uclamp_req[clamp_id],
9905 : uclamp_none(clamp_id), false);
9906 : tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
9907 : }
9908 : #endif
9909 : }
9910 :
9911 : static void sched_free_group(struct task_group *tg)
9912 : {
9913 : free_fair_sched_group(tg);
9914 : free_rt_sched_group(tg);
9915 : autogroup_free(tg);
9916 : kmem_cache_free(task_group_cache, tg);
9917 : }
9918 :
9919 : static void sched_free_group_rcu(struct rcu_head *rcu)
9920 : {
9921 : sched_free_group(container_of(rcu, struct task_group, rcu));
9922 : }
9923 :
9924 : static void sched_unregister_group(struct task_group *tg)
9925 : {
9926 : unregister_fair_sched_group(tg);
9927 : unregister_rt_sched_group(tg);
9928 : /*
9929 : * We have to wait for yet another RCU grace period to expire, as
9930 : * print_cfs_stats() might run concurrently.
9931 : */
9932 : call_rcu(&tg->rcu, sched_free_group_rcu);
9933 : }
9934 :
9935 : /* allocate runqueue etc for a new task group */
9936 : struct task_group *sched_create_group(struct task_group *parent)
9937 : {
9938 : struct task_group *tg;
9939 :
9940 : tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
9941 : if (!tg)
9942 : return ERR_PTR(-ENOMEM);
9943 :
9944 : if (!alloc_fair_sched_group(tg, parent))
9945 : goto err;
9946 :
9947 : if (!alloc_rt_sched_group(tg, parent))
9948 : goto err;
9949 :
9950 : alloc_uclamp_sched_group(tg, parent);
9951 :
9952 : return tg;
9953 :
9954 : err:
9955 : sched_free_group(tg);
9956 : return ERR_PTR(-ENOMEM);
9957 : }
9958 :
9959 : void sched_online_group(struct task_group *tg, struct task_group *parent)
9960 : {
9961 : unsigned long flags;
9962 :
9963 : spin_lock_irqsave(&task_group_lock, flags);
9964 : list_add_rcu(&tg->list, &task_groups);
9965 :
9966 : /* Root should already exist: */
9967 : WARN_ON(!parent);
9968 :
9969 : tg->parent = parent;
9970 : INIT_LIST_HEAD(&tg->children);
9971 : list_add_rcu(&tg->siblings, &parent->children);
9972 : spin_unlock_irqrestore(&task_group_lock, flags);
9973 :
9974 : online_fair_sched_group(tg);
9975 : }
9976 :
9977 : /* rcu callback to free various structures associated with a task group */
9978 : static void sched_unregister_group_rcu(struct rcu_head *rhp)
9979 : {
9980 : /* Now it should be safe to free those cfs_rqs: */
9981 : sched_unregister_group(container_of(rhp, struct task_group, rcu));
9982 : }
9983 :
9984 : void sched_destroy_group(struct task_group *tg)
9985 : {
9986 : /* Wait for possible concurrent references to cfs_rqs complete: */
9987 : call_rcu(&tg->rcu, sched_unregister_group_rcu);
9988 : }
9989 :
9990 : void sched_release_group(struct task_group *tg)
9991 : {
9992 : unsigned long flags;
9993 :
9994 : /*
9995 : * Unlink first, to avoid walk_tg_tree_from() from finding us (via
9996 : * sched_cfs_period_timer()).
9997 : *
9998 : * For this to be effective, we have to wait for all pending users of
9999 : * this task group to leave their RCU critical section to ensure no new
10000 : * user will see our dying task group any more. Specifically ensure
10001 : * that tg_unthrottle_up() won't add decayed cfs_rq's to it.
10002 : *
10003 : * We therefore defer calling unregister_fair_sched_group() to
10004 : * sched_unregister_group() which is guarantied to get called only after the
10005 : * current RCU grace period has expired.
10006 : */
10007 : spin_lock_irqsave(&task_group_lock, flags);
10008 : list_del_rcu(&tg->list);
10009 : list_del_rcu(&tg->siblings);
10010 : spin_unlock_irqrestore(&task_group_lock, flags);
10011 : }
10012 :
10013 : static void sched_change_group(struct task_struct *tsk, int type)
10014 : {
10015 : struct task_group *tg;
10016 :
10017 : /*
10018 : * All callers are synchronized by task_rq_lock(); we do not use RCU
10019 : * which is pointless here. Thus, we pass "true" to task_css_check()
10020 : * to prevent lockdep warnings.
10021 : */
10022 : tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
10023 : struct task_group, css);
10024 : tg = autogroup_task_group(tsk, tg);
10025 : tsk->sched_task_group = tg;
10026 :
10027 : #ifdef CONFIG_FAIR_GROUP_SCHED
10028 : if (tsk->sched_class->task_change_group)
10029 : tsk->sched_class->task_change_group(tsk, type);
10030 : else
10031 : #endif
10032 : set_task_rq(tsk, task_cpu(tsk));
10033 : }
10034 :
10035 : /*
10036 : * Change task's runqueue when it moves between groups.
10037 : *
10038 : * The caller of this function should have put the task in its new group by
10039 : * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
10040 : * its new group.
10041 : */
10042 : void sched_move_task(struct task_struct *tsk)
10043 : {
10044 : int queued, running, queue_flags =
10045 : DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
10046 : struct rq_flags rf;
10047 : struct rq *rq;
10048 :
10049 : rq = task_rq_lock(tsk, &rf);
10050 : update_rq_clock(rq);
10051 :
10052 : running = task_current(rq, tsk);
10053 : queued = task_on_rq_queued(tsk);
10054 :
10055 : if (queued)
10056 : dequeue_task(rq, tsk, queue_flags);
10057 : if (running)
10058 : put_prev_task(rq, tsk);
10059 :
10060 : sched_change_group(tsk, TASK_MOVE_GROUP);
10061 :
10062 : if (queued)
10063 : enqueue_task(rq, tsk, queue_flags);
10064 : if (running) {
10065 : set_next_task(rq, tsk);
10066 : /*
10067 : * After changing group, the running task may have joined a
10068 : * throttled one but it's still the running task. Trigger a
10069 : * resched to make sure that task can still run.
10070 : */
10071 : resched_curr(rq);
10072 : }
10073 :
10074 : task_rq_unlock(rq, tsk, &rf);
10075 : }
10076 :
10077 : static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
10078 : {
10079 : return css ? container_of(css, struct task_group, css) : NULL;
10080 : }
10081 :
10082 : static struct cgroup_subsys_state *
10083 : cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
10084 : {
10085 : struct task_group *parent = css_tg(parent_css);
10086 : struct task_group *tg;
10087 :
10088 : if (!parent) {
10089 : /* This is early initialization for the top cgroup */
10090 : return &root_task_group.css;
10091 : }
10092 :
10093 : tg = sched_create_group(parent);
10094 : if (IS_ERR(tg))
10095 : return ERR_PTR(-ENOMEM);
10096 :
10097 : return &tg->css;
10098 : }
10099 :
10100 : /* Expose task group only after completing cgroup initialization */
10101 : static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
10102 : {
10103 : struct task_group *tg = css_tg(css);
10104 : struct task_group *parent = css_tg(css->parent);
10105 :
10106 : if (parent)
10107 : sched_online_group(tg, parent);
10108 :
10109 : #ifdef CONFIG_UCLAMP_TASK_GROUP
10110 : /* Propagate the effective uclamp value for the new group */
10111 : mutex_lock(&uclamp_mutex);
10112 : rcu_read_lock();
10113 : cpu_util_update_eff(css);
10114 : rcu_read_unlock();
10115 : mutex_unlock(&uclamp_mutex);
10116 : #endif
10117 :
10118 : return 0;
10119 : }
10120 :
10121 : static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
10122 : {
10123 : struct task_group *tg = css_tg(css);
10124 :
10125 : sched_release_group(tg);
10126 : }
10127 :
10128 : static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
10129 : {
10130 : struct task_group *tg = css_tg(css);
10131 :
10132 : /*
10133 : * Relies on the RCU grace period between css_released() and this.
10134 : */
10135 : sched_unregister_group(tg);
10136 : }
10137 :
10138 : /*
10139 : * This is called before wake_up_new_task(), therefore we really only
10140 : * have to set its group bits, all the other stuff does not apply.
10141 : */
10142 : static void cpu_cgroup_fork(struct task_struct *task)
10143 : {
10144 : struct rq_flags rf;
10145 : struct rq *rq;
10146 :
10147 : rq = task_rq_lock(task, &rf);
10148 :
10149 : update_rq_clock(rq);
10150 : sched_change_group(task, TASK_SET_GROUP);
10151 :
10152 : task_rq_unlock(rq, task, &rf);
10153 : }
10154 :
10155 : static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
10156 : {
10157 : struct task_struct *task;
10158 : struct cgroup_subsys_state *css;
10159 : int ret = 0;
10160 :
10161 : cgroup_taskset_for_each(task, css, tset) {
10162 : #ifdef CONFIG_RT_GROUP_SCHED
10163 : if (!sched_rt_can_attach(css_tg(css), task))
10164 : return -EINVAL;
10165 : #endif
10166 : /*
10167 : * Serialize against wake_up_new_task() such that if it's
10168 : * running, we're sure to observe its full state.
10169 : */
10170 : raw_spin_lock_irq(&task->pi_lock);
10171 : /*
10172 : * Avoid calling sched_move_task() before wake_up_new_task()
10173 : * has happened. This would lead to problems with PELT, due to
10174 : * move wanting to detach+attach while we're not attached yet.
10175 : */
10176 : if (READ_ONCE(task->__state) == TASK_NEW)
10177 : ret = -EINVAL;
10178 : raw_spin_unlock_irq(&task->pi_lock);
10179 :
10180 : if (ret)
10181 : break;
10182 : }
10183 : return ret;
10184 : }
10185 :
10186 : static void cpu_cgroup_attach(struct cgroup_taskset *tset)
10187 : {
10188 : struct task_struct *task;
10189 : struct cgroup_subsys_state *css;
10190 :
10191 : cgroup_taskset_for_each(task, css, tset)
10192 : sched_move_task(task);
10193 : }
10194 :
10195 : #ifdef CONFIG_UCLAMP_TASK_GROUP
10196 : static void cpu_util_update_eff(struct cgroup_subsys_state *css)
10197 : {
10198 : struct cgroup_subsys_state *top_css = css;
10199 : struct uclamp_se *uc_parent = NULL;
10200 : struct uclamp_se *uc_se = NULL;
10201 : unsigned int eff[UCLAMP_CNT];
10202 : enum uclamp_id clamp_id;
10203 : unsigned int clamps;
10204 :
10205 : lockdep_assert_held(&uclamp_mutex);
10206 : SCHED_WARN_ON(!rcu_read_lock_held());
10207 :
10208 : css_for_each_descendant_pre(css, top_css) {
10209 : uc_parent = css_tg(css)->parent
10210 : ? css_tg(css)->parent->uclamp : NULL;
10211 :
10212 : for_each_clamp_id(clamp_id) {
10213 : /* Assume effective clamps matches requested clamps */
10214 : eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
10215 : /* Cap effective clamps with parent's effective clamps */
10216 : if (uc_parent &&
10217 : eff[clamp_id] > uc_parent[clamp_id].value) {
10218 : eff[clamp_id] = uc_parent[clamp_id].value;
10219 : }
10220 : }
10221 : /* Ensure protection is always capped by limit */
10222 : eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
10223 :
10224 : /* Propagate most restrictive effective clamps */
10225 : clamps = 0x0;
10226 : uc_se = css_tg(css)->uclamp;
10227 : for_each_clamp_id(clamp_id) {
10228 : if (eff[clamp_id] == uc_se[clamp_id].value)
10229 : continue;
10230 : uc_se[clamp_id].value = eff[clamp_id];
10231 : uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
10232 : clamps |= (0x1 << clamp_id);
10233 : }
10234 : if (!clamps) {
10235 : css = css_rightmost_descendant(css);
10236 : continue;
10237 : }
10238 :
10239 : /* Immediately update descendants RUNNABLE tasks */
10240 : uclamp_update_active_tasks(css);
10241 : }
10242 : }
10243 :
10244 : /*
10245 : * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
10246 : * C expression. Since there is no way to convert a macro argument (N) into a
10247 : * character constant, use two levels of macros.
10248 : */
10249 : #define _POW10(exp) ((unsigned int)1e##exp)
10250 : #define POW10(exp) _POW10(exp)
10251 :
10252 : struct uclamp_request {
10253 : #define UCLAMP_PERCENT_SHIFT 2
10254 : #define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT))
10255 : s64 percent;
10256 : u64 util;
10257 : int ret;
10258 : };
10259 :
10260 : static inline struct uclamp_request
10261 : capacity_from_percent(char *buf)
10262 : {
10263 : struct uclamp_request req = {
10264 : .percent = UCLAMP_PERCENT_SCALE,
10265 : .util = SCHED_CAPACITY_SCALE,
10266 : .ret = 0,
10267 : };
10268 :
10269 : buf = strim(buf);
10270 : if (strcmp(buf, "max")) {
10271 : req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
10272 : &req.percent);
10273 : if (req.ret)
10274 : return req;
10275 : if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
10276 : req.ret = -ERANGE;
10277 : return req;
10278 : }
10279 :
10280 : req.util = req.percent << SCHED_CAPACITY_SHIFT;
10281 : req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
10282 : }
10283 :
10284 : return req;
10285 : }
10286 :
10287 : static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
10288 : size_t nbytes, loff_t off,
10289 : enum uclamp_id clamp_id)
10290 : {
10291 : struct uclamp_request req;
10292 : struct task_group *tg;
10293 :
10294 : req = capacity_from_percent(buf);
10295 : if (req.ret)
10296 : return req.ret;
10297 :
10298 : static_branch_enable(&sched_uclamp_used);
10299 :
10300 : mutex_lock(&uclamp_mutex);
10301 : rcu_read_lock();
10302 :
10303 : tg = css_tg(of_css(of));
10304 : if (tg->uclamp_req[clamp_id].value != req.util)
10305 : uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
10306 :
10307 : /*
10308 : * Because of not recoverable conversion rounding we keep track of the
10309 : * exact requested value
10310 : */
10311 : tg->uclamp_pct[clamp_id] = req.percent;
10312 :
10313 : /* Update effective clamps to track the most restrictive value */
10314 : cpu_util_update_eff(of_css(of));
10315 :
10316 : rcu_read_unlock();
10317 : mutex_unlock(&uclamp_mutex);
10318 :
10319 : return nbytes;
10320 : }
10321 :
10322 : static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
10323 : char *buf, size_t nbytes,
10324 : loff_t off)
10325 : {
10326 : return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
10327 : }
10328 :
10329 : static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
10330 : char *buf, size_t nbytes,
10331 : loff_t off)
10332 : {
10333 : return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
10334 : }
10335 :
10336 : static inline void cpu_uclamp_print(struct seq_file *sf,
10337 : enum uclamp_id clamp_id)
10338 : {
10339 : struct task_group *tg;
10340 : u64 util_clamp;
10341 : u64 percent;
10342 : u32 rem;
10343 :
10344 : rcu_read_lock();
10345 : tg = css_tg(seq_css(sf));
10346 : util_clamp = tg->uclamp_req[clamp_id].value;
10347 : rcu_read_unlock();
10348 :
10349 : if (util_clamp == SCHED_CAPACITY_SCALE) {
10350 : seq_puts(sf, "max\n");
10351 : return;
10352 : }
10353 :
10354 : percent = tg->uclamp_pct[clamp_id];
10355 : percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
10356 : seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
10357 : }
10358 :
10359 : static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
10360 : {
10361 : cpu_uclamp_print(sf, UCLAMP_MIN);
10362 : return 0;
10363 : }
10364 :
10365 : static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
10366 : {
10367 : cpu_uclamp_print(sf, UCLAMP_MAX);
10368 : return 0;
10369 : }
10370 : #endif /* CONFIG_UCLAMP_TASK_GROUP */
10371 :
10372 : #ifdef CONFIG_FAIR_GROUP_SCHED
10373 : static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
10374 : struct cftype *cftype, u64 shareval)
10375 : {
10376 : if (shareval > scale_load_down(ULONG_MAX))
10377 : shareval = MAX_SHARES;
10378 : return sched_group_set_shares(css_tg(css), scale_load(shareval));
10379 : }
10380 :
10381 : static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
10382 : struct cftype *cft)
10383 : {
10384 : struct task_group *tg = css_tg(css);
10385 :
10386 : return (u64) scale_load_down(tg->shares);
10387 : }
10388 :
10389 : #ifdef CONFIG_CFS_BANDWIDTH
10390 : static DEFINE_MUTEX(cfs_constraints_mutex);
10391 :
10392 : const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
10393 : static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
10394 : /* More than 203 days if BW_SHIFT equals 20. */
10395 : static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
10396 :
10397 : static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
10398 :
10399 : static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
10400 : u64 burst)
10401 : {
10402 : int i, ret = 0, runtime_enabled, runtime_was_enabled;
10403 : struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
10404 :
10405 : if (tg == &root_task_group)
10406 : return -EINVAL;
10407 :
10408 : /*
10409 : * Ensure we have at some amount of bandwidth every period. This is
10410 : * to prevent reaching a state of large arrears when throttled via
10411 : * entity_tick() resulting in prolonged exit starvation.
10412 : */
10413 : if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
10414 : return -EINVAL;
10415 :
10416 : /*
10417 : * Likewise, bound things on the other side by preventing insane quota
10418 : * periods. This also allows us to normalize in computing quota
10419 : * feasibility.
10420 : */
10421 : if (period > max_cfs_quota_period)
10422 : return -EINVAL;
10423 :
10424 : /*
10425 : * Bound quota to defend quota against overflow during bandwidth shift.
10426 : */
10427 : if (quota != RUNTIME_INF && quota > max_cfs_runtime)
10428 : return -EINVAL;
10429 :
10430 : if (quota != RUNTIME_INF && (burst > quota ||
10431 : burst + quota > max_cfs_runtime))
10432 : return -EINVAL;
10433 :
10434 : /*
10435 : * Prevent race between setting of cfs_rq->runtime_enabled and
10436 : * unthrottle_offline_cfs_rqs().
10437 : */
10438 : cpus_read_lock();
10439 : mutex_lock(&cfs_constraints_mutex);
10440 : ret = __cfs_schedulable(tg, period, quota);
10441 : if (ret)
10442 : goto out_unlock;
10443 :
10444 : runtime_enabled = quota != RUNTIME_INF;
10445 : runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
10446 : /*
10447 : * If we need to toggle cfs_bandwidth_used, off->on must occur
10448 : * before making related changes, and on->off must occur afterwards
10449 : */
10450 : if (runtime_enabled && !runtime_was_enabled)
10451 : cfs_bandwidth_usage_inc();
10452 : raw_spin_lock_irq(&cfs_b->lock);
10453 : cfs_b->period = ns_to_ktime(period);
10454 : cfs_b->quota = quota;
10455 : cfs_b->burst = burst;
10456 :
10457 : __refill_cfs_bandwidth_runtime(cfs_b);
10458 :
10459 : /* Restart the period timer (if active) to handle new period expiry: */
10460 : if (runtime_enabled)
10461 : start_cfs_bandwidth(cfs_b);
10462 :
10463 : raw_spin_unlock_irq(&cfs_b->lock);
10464 :
10465 : for_each_online_cpu(i) {
10466 : struct cfs_rq *cfs_rq = tg->cfs_rq[i];
10467 : struct rq *rq = cfs_rq->rq;
10468 : struct rq_flags rf;
10469 :
10470 : rq_lock_irq(rq, &rf);
10471 : cfs_rq->runtime_enabled = runtime_enabled;
10472 : cfs_rq->runtime_remaining = 0;
10473 :
10474 : if (cfs_rq->throttled)
10475 : unthrottle_cfs_rq(cfs_rq);
10476 : rq_unlock_irq(rq, &rf);
10477 : }
10478 : if (runtime_was_enabled && !runtime_enabled)
10479 : cfs_bandwidth_usage_dec();
10480 : out_unlock:
10481 : mutex_unlock(&cfs_constraints_mutex);
10482 : cpus_read_unlock();
10483 :
10484 : return ret;
10485 : }
10486 :
10487 : static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
10488 : {
10489 : u64 quota, period, burst;
10490 :
10491 : period = ktime_to_ns(tg->cfs_bandwidth.period);
10492 : burst = tg->cfs_bandwidth.burst;
10493 : if (cfs_quota_us < 0)
10494 : quota = RUNTIME_INF;
10495 : else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
10496 : quota = (u64)cfs_quota_us * NSEC_PER_USEC;
10497 : else
10498 : return -EINVAL;
10499 :
10500 : return tg_set_cfs_bandwidth(tg, period, quota, burst);
10501 : }
10502 :
10503 : static long tg_get_cfs_quota(struct task_group *tg)
10504 : {
10505 : u64 quota_us;
10506 :
10507 : if (tg->cfs_bandwidth.quota == RUNTIME_INF)
10508 : return -1;
10509 :
10510 : quota_us = tg->cfs_bandwidth.quota;
10511 : do_div(quota_us, NSEC_PER_USEC);
10512 :
10513 : return quota_us;
10514 : }
10515 :
10516 : static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
10517 : {
10518 : u64 quota, period, burst;
10519 :
10520 : if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
10521 : return -EINVAL;
10522 :
10523 : period = (u64)cfs_period_us * NSEC_PER_USEC;
10524 : quota = tg->cfs_bandwidth.quota;
10525 : burst = tg->cfs_bandwidth.burst;
10526 :
10527 : return tg_set_cfs_bandwidth(tg, period, quota, burst);
10528 : }
10529 :
10530 : static long tg_get_cfs_period(struct task_group *tg)
10531 : {
10532 : u64 cfs_period_us;
10533 :
10534 : cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
10535 : do_div(cfs_period_us, NSEC_PER_USEC);
10536 :
10537 : return cfs_period_us;
10538 : }
10539 :
10540 : static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us)
10541 : {
10542 : u64 quota, period, burst;
10543 :
10544 : if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC)
10545 : return -EINVAL;
10546 :
10547 : burst = (u64)cfs_burst_us * NSEC_PER_USEC;
10548 : period = ktime_to_ns(tg->cfs_bandwidth.period);
10549 : quota = tg->cfs_bandwidth.quota;
10550 :
10551 : return tg_set_cfs_bandwidth(tg, period, quota, burst);
10552 : }
10553 :
10554 : static long tg_get_cfs_burst(struct task_group *tg)
10555 : {
10556 : u64 burst_us;
10557 :
10558 : burst_us = tg->cfs_bandwidth.burst;
10559 : do_div(burst_us, NSEC_PER_USEC);
10560 :
10561 : return burst_us;
10562 : }
10563 :
10564 : static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
10565 : struct cftype *cft)
10566 : {
10567 : return tg_get_cfs_quota(css_tg(css));
10568 : }
10569 :
10570 : static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
10571 : struct cftype *cftype, s64 cfs_quota_us)
10572 : {
10573 : return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
10574 : }
10575 :
10576 : static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
10577 : struct cftype *cft)
10578 : {
10579 : return tg_get_cfs_period(css_tg(css));
10580 : }
10581 :
10582 : static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
10583 : struct cftype *cftype, u64 cfs_period_us)
10584 : {
10585 : return tg_set_cfs_period(css_tg(css), cfs_period_us);
10586 : }
10587 :
10588 : static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css,
10589 : struct cftype *cft)
10590 : {
10591 : return tg_get_cfs_burst(css_tg(css));
10592 : }
10593 :
10594 : static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css,
10595 : struct cftype *cftype, u64 cfs_burst_us)
10596 : {
10597 : return tg_set_cfs_burst(css_tg(css), cfs_burst_us);
10598 : }
10599 :
10600 : struct cfs_schedulable_data {
10601 : struct task_group *tg;
10602 : u64 period, quota;
10603 : };
10604 :
10605 : /*
10606 : * normalize group quota/period to be quota/max_period
10607 : * note: units are usecs
10608 : */
10609 : static u64 normalize_cfs_quota(struct task_group *tg,
10610 : struct cfs_schedulable_data *d)
10611 : {
10612 : u64 quota, period;
10613 :
10614 : if (tg == d->tg) {
10615 : period = d->period;
10616 : quota = d->quota;
10617 : } else {
10618 : period = tg_get_cfs_period(tg);
10619 : quota = tg_get_cfs_quota(tg);
10620 : }
10621 :
10622 : /* note: these should typically be equivalent */
10623 : if (quota == RUNTIME_INF || quota == -1)
10624 : return RUNTIME_INF;
10625 :
10626 : return to_ratio(period, quota);
10627 : }
10628 :
10629 : static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
10630 : {
10631 : struct cfs_schedulable_data *d = data;
10632 : struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
10633 : s64 quota = 0, parent_quota = -1;
10634 :
10635 : if (!tg->parent) {
10636 : quota = RUNTIME_INF;
10637 : } else {
10638 : struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
10639 :
10640 : quota = normalize_cfs_quota(tg, d);
10641 : parent_quota = parent_b->hierarchical_quota;
10642 :
10643 : /*
10644 : * Ensure max(child_quota) <= parent_quota. On cgroup2,
10645 : * always take the min. On cgroup1, only inherit when no
10646 : * limit is set:
10647 : */
10648 : if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
10649 : quota = min(quota, parent_quota);
10650 : } else {
10651 : if (quota == RUNTIME_INF)
10652 : quota = parent_quota;
10653 : else if (parent_quota != RUNTIME_INF && quota > parent_quota)
10654 : return -EINVAL;
10655 : }
10656 : }
10657 : cfs_b->hierarchical_quota = quota;
10658 :
10659 : return 0;
10660 : }
10661 :
10662 : static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
10663 : {
10664 : int ret;
10665 : struct cfs_schedulable_data data = {
10666 : .tg = tg,
10667 : .period = period,
10668 : .quota = quota,
10669 : };
10670 :
10671 : if (quota != RUNTIME_INF) {
10672 : do_div(data.period, NSEC_PER_USEC);
10673 : do_div(data.quota, NSEC_PER_USEC);
10674 : }
10675 :
10676 : rcu_read_lock();
10677 : ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
10678 : rcu_read_unlock();
10679 :
10680 : return ret;
10681 : }
10682 :
10683 : static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
10684 : {
10685 : struct task_group *tg = css_tg(seq_css(sf));
10686 : struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
10687 :
10688 : seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
10689 : seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
10690 : seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
10691 :
10692 : if (schedstat_enabled() && tg != &root_task_group) {
10693 : struct sched_statistics *stats;
10694 : u64 ws = 0;
10695 : int i;
10696 :
10697 : for_each_possible_cpu(i) {
10698 : stats = __schedstats_from_se(tg->se[i]);
10699 : ws += schedstat_val(stats->wait_sum);
10700 : }
10701 :
10702 : seq_printf(sf, "wait_sum %llu\n", ws);
10703 : }
10704 :
10705 : seq_printf(sf, "nr_bursts %d\n", cfs_b->nr_burst);
10706 : seq_printf(sf, "burst_time %llu\n", cfs_b->burst_time);
10707 :
10708 : return 0;
10709 : }
10710 : #endif /* CONFIG_CFS_BANDWIDTH */
10711 : #endif /* CONFIG_FAIR_GROUP_SCHED */
10712 :
10713 : #ifdef CONFIG_RT_GROUP_SCHED
10714 : static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
10715 : struct cftype *cft, s64 val)
10716 : {
10717 : return sched_group_set_rt_runtime(css_tg(css), val);
10718 : }
10719 :
10720 : static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
10721 : struct cftype *cft)
10722 : {
10723 : return sched_group_rt_runtime(css_tg(css));
10724 : }
10725 :
10726 : static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
10727 : struct cftype *cftype, u64 rt_period_us)
10728 : {
10729 : return sched_group_set_rt_period(css_tg(css), rt_period_us);
10730 : }
10731 :
10732 : static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
10733 : struct cftype *cft)
10734 : {
10735 : return sched_group_rt_period(css_tg(css));
10736 : }
10737 : #endif /* CONFIG_RT_GROUP_SCHED */
10738 :
10739 : #ifdef CONFIG_FAIR_GROUP_SCHED
10740 : static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
10741 : struct cftype *cft)
10742 : {
10743 : return css_tg(css)->idle;
10744 : }
10745 :
10746 : static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
10747 : struct cftype *cft, s64 idle)
10748 : {
10749 : return sched_group_set_idle(css_tg(css), idle);
10750 : }
10751 : #endif
10752 :
10753 : static struct cftype cpu_legacy_files[] = {
10754 : #ifdef CONFIG_FAIR_GROUP_SCHED
10755 : {
10756 : .name = "shares",
10757 : .read_u64 = cpu_shares_read_u64,
10758 : .write_u64 = cpu_shares_write_u64,
10759 : },
10760 : {
10761 : .name = "idle",
10762 : .read_s64 = cpu_idle_read_s64,
10763 : .write_s64 = cpu_idle_write_s64,
10764 : },
10765 : #endif
10766 : #ifdef CONFIG_CFS_BANDWIDTH
10767 : {
10768 : .name = "cfs_quota_us",
10769 : .read_s64 = cpu_cfs_quota_read_s64,
10770 : .write_s64 = cpu_cfs_quota_write_s64,
10771 : },
10772 : {
10773 : .name = "cfs_period_us",
10774 : .read_u64 = cpu_cfs_period_read_u64,
10775 : .write_u64 = cpu_cfs_period_write_u64,
10776 : },
10777 : {
10778 : .name = "cfs_burst_us",
10779 : .read_u64 = cpu_cfs_burst_read_u64,
10780 : .write_u64 = cpu_cfs_burst_write_u64,
10781 : },
10782 : {
10783 : .name = "stat",
10784 : .seq_show = cpu_cfs_stat_show,
10785 : },
10786 : #endif
10787 : #ifdef CONFIG_RT_GROUP_SCHED
10788 : {
10789 : .name = "rt_runtime_us",
10790 : .read_s64 = cpu_rt_runtime_read,
10791 : .write_s64 = cpu_rt_runtime_write,
10792 : },
10793 : {
10794 : .name = "rt_period_us",
10795 : .read_u64 = cpu_rt_period_read_uint,
10796 : .write_u64 = cpu_rt_period_write_uint,
10797 : },
10798 : #endif
10799 : #ifdef CONFIG_UCLAMP_TASK_GROUP
10800 : {
10801 : .name = "uclamp.min",
10802 : .flags = CFTYPE_NOT_ON_ROOT,
10803 : .seq_show = cpu_uclamp_min_show,
10804 : .write = cpu_uclamp_min_write,
10805 : },
10806 : {
10807 : .name = "uclamp.max",
10808 : .flags = CFTYPE_NOT_ON_ROOT,
10809 : .seq_show = cpu_uclamp_max_show,
10810 : .write = cpu_uclamp_max_write,
10811 : },
10812 : #endif
10813 : { } /* Terminate */
10814 : };
10815 :
10816 : static int cpu_extra_stat_show(struct seq_file *sf,
10817 : struct cgroup_subsys_state *css)
10818 : {
10819 : #ifdef CONFIG_CFS_BANDWIDTH
10820 : {
10821 : struct task_group *tg = css_tg(css);
10822 : struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
10823 : u64 throttled_usec, burst_usec;
10824 :
10825 : throttled_usec = cfs_b->throttled_time;
10826 : do_div(throttled_usec, NSEC_PER_USEC);
10827 : burst_usec = cfs_b->burst_time;
10828 : do_div(burst_usec, NSEC_PER_USEC);
10829 :
10830 : seq_printf(sf, "nr_periods %d\n"
10831 : "nr_throttled %d\n"
10832 : "throttled_usec %llu\n"
10833 : "nr_bursts %d\n"
10834 : "burst_usec %llu\n",
10835 : cfs_b->nr_periods, cfs_b->nr_throttled,
10836 : throttled_usec, cfs_b->nr_burst, burst_usec);
10837 : }
10838 : #endif
10839 : return 0;
10840 : }
10841 :
10842 : #ifdef CONFIG_FAIR_GROUP_SCHED
10843 : static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
10844 : struct cftype *cft)
10845 : {
10846 : struct task_group *tg = css_tg(css);
10847 : u64 weight = scale_load_down(tg->shares);
10848 :
10849 : return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
10850 : }
10851 :
10852 : static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
10853 : struct cftype *cft, u64 weight)
10854 : {
10855 : /*
10856 : * cgroup weight knobs should use the common MIN, DFL and MAX
10857 : * values which are 1, 100 and 10000 respectively. While it loses
10858 : * a bit of range on both ends, it maps pretty well onto the shares
10859 : * value used by scheduler and the round-trip conversions preserve
10860 : * the original value over the entire range.
10861 : */
10862 : if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
10863 : return -ERANGE;
10864 :
10865 : weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
10866 :
10867 : return sched_group_set_shares(css_tg(css), scale_load(weight));
10868 : }
10869 :
10870 : static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
10871 : struct cftype *cft)
10872 : {
10873 : unsigned long weight = scale_load_down(css_tg(css)->shares);
10874 : int last_delta = INT_MAX;
10875 : int prio, delta;
10876 :
10877 : /* find the closest nice value to the current weight */
10878 : for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
10879 : delta = abs(sched_prio_to_weight[prio] - weight);
10880 : if (delta >= last_delta)
10881 : break;
10882 : last_delta = delta;
10883 : }
10884 :
10885 : return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
10886 : }
10887 :
10888 : static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
10889 : struct cftype *cft, s64 nice)
10890 : {
10891 : unsigned long weight;
10892 : int idx;
10893 :
10894 : if (nice < MIN_NICE || nice > MAX_NICE)
10895 : return -ERANGE;
10896 :
10897 : idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
10898 : idx = array_index_nospec(idx, 40);
10899 : weight = sched_prio_to_weight[idx];
10900 :
10901 : return sched_group_set_shares(css_tg(css), scale_load(weight));
10902 : }
10903 : #endif
10904 :
10905 : static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
10906 : long period, long quota)
10907 : {
10908 : if (quota < 0)
10909 : seq_puts(sf, "max");
10910 : else
10911 : seq_printf(sf, "%ld", quota);
10912 :
10913 : seq_printf(sf, " %ld\n", period);
10914 : }
10915 :
10916 : /* caller should put the current value in *@periodp before calling */
10917 : static int __maybe_unused cpu_period_quota_parse(char *buf,
10918 : u64 *periodp, u64 *quotap)
10919 : {
10920 : char tok[21]; /* U64_MAX */
10921 :
10922 : if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
10923 : return -EINVAL;
10924 :
10925 : *periodp *= NSEC_PER_USEC;
10926 :
10927 : if (sscanf(tok, "%llu", quotap))
10928 : *quotap *= NSEC_PER_USEC;
10929 : else if (!strcmp(tok, "max"))
10930 : *quotap = RUNTIME_INF;
10931 : else
10932 : return -EINVAL;
10933 :
10934 : return 0;
10935 : }
10936 :
10937 : #ifdef CONFIG_CFS_BANDWIDTH
10938 : static int cpu_max_show(struct seq_file *sf, void *v)
10939 : {
10940 : struct task_group *tg = css_tg(seq_css(sf));
10941 :
10942 : cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
10943 : return 0;
10944 : }
10945 :
10946 : static ssize_t cpu_max_write(struct kernfs_open_file *of,
10947 : char *buf, size_t nbytes, loff_t off)
10948 : {
10949 : struct task_group *tg = css_tg(of_css(of));
10950 : u64 period = tg_get_cfs_period(tg);
10951 : u64 burst = tg_get_cfs_burst(tg);
10952 : u64 quota;
10953 : int ret;
10954 :
10955 : ret = cpu_period_quota_parse(buf, &period, "a);
10956 : if (!ret)
10957 : ret = tg_set_cfs_bandwidth(tg, period, quota, burst);
10958 : return ret ?: nbytes;
10959 : }
10960 : #endif
10961 :
10962 : static struct cftype cpu_files[] = {
10963 : #ifdef CONFIG_FAIR_GROUP_SCHED
10964 : {
10965 : .name = "weight",
10966 : .flags = CFTYPE_NOT_ON_ROOT,
10967 : .read_u64 = cpu_weight_read_u64,
10968 : .write_u64 = cpu_weight_write_u64,
10969 : },
10970 : {
10971 : .name = "weight.nice",
10972 : .flags = CFTYPE_NOT_ON_ROOT,
10973 : .read_s64 = cpu_weight_nice_read_s64,
10974 : .write_s64 = cpu_weight_nice_write_s64,
10975 : },
10976 : {
10977 : .name = "idle",
10978 : .flags = CFTYPE_NOT_ON_ROOT,
10979 : .read_s64 = cpu_idle_read_s64,
10980 : .write_s64 = cpu_idle_write_s64,
10981 : },
10982 : #endif
10983 : #ifdef CONFIG_CFS_BANDWIDTH
10984 : {
10985 : .name = "max",
10986 : .flags = CFTYPE_NOT_ON_ROOT,
10987 : .seq_show = cpu_max_show,
10988 : .write = cpu_max_write,
10989 : },
10990 : {
10991 : .name = "max.burst",
10992 : .flags = CFTYPE_NOT_ON_ROOT,
10993 : .read_u64 = cpu_cfs_burst_read_u64,
10994 : .write_u64 = cpu_cfs_burst_write_u64,
10995 : },
10996 : #endif
10997 : #ifdef CONFIG_UCLAMP_TASK_GROUP
10998 : {
10999 : .name = "uclamp.min",
11000 : .flags = CFTYPE_NOT_ON_ROOT,
11001 : .seq_show = cpu_uclamp_min_show,
11002 : .write = cpu_uclamp_min_write,
11003 : },
11004 : {
11005 : .name = "uclamp.max",
11006 : .flags = CFTYPE_NOT_ON_ROOT,
11007 : .seq_show = cpu_uclamp_max_show,
11008 : .write = cpu_uclamp_max_write,
11009 : },
11010 : #endif
11011 : { } /* terminate */
11012 : };
11013 :
11014 : struct cgroup_subsys cpu_cgrp_subsys = {
11015 : .css_alloc = cpu_cgroup_css_alloc,
11016 : .css_online = cpu_cgroup_css_online,
11017 : .css_released = cpu_cgroup_css_released,
11018 : .css_free = cpu_cgroup_css_free,
11019 : .css_extra_stat_show = cpu_extra_stat_show,
11020 : .fork = cpu_cgroup_fork,
11021 : .can_attach = cpu_cgroup_can_attach,
11022 : .attach = cpu_cgroup_attach,
11023 : .legacy_cftypes = cpu_legacy_files,
11024 : .dfl_cftypes = cpu_files,
11025 : .early_init = true,
11026 : .threaded = true,
11027 : };
11028 :
11029 : #endif /* CONFIG_CGROUP_SCHED */
11030 :
11031 0 : void dump_cpu_task(int cpu)
11032 : {
11033 0 : pr_info("Task dump for CPU %d:\n", cpu);
11034 0 : sched_show_task(cpu_curr(cpu));
11035 0 : }
11036 :
11037 : /*
11038 : * Nice levels are multiplicative, with a gentle 10% change for every
11039 : * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
11040 : * nice 1, it will get ~10% less CPU time than another CPU-bound task
11041 : * that remained on nice 0.
11042 : *
11043 : * The "10% effect" is relative and cumulative: from _any_ nice level,
11044 : * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
11045 : * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
11046 : * If a task goes up by ~10% and another task goes down by ~10% then
11047 : * the relative distance between them is ~25%.)
11048 : */
11049 : const int sched_prio_to_weight[40] = {
11050 : /* -20 */ 88761, 71755, 56483, 46273, 36291,
11051 : /* -15 */ 29154, 23254, 18705, 14949, 11916,
11052 : /* -10 */ 9548, 7620, 6100, 4904, 3906,
11053 : /* -5 */ 3121, 2501, 1991, 1586, 1277,
11054 : /* 0 */ 1024, 820, 655, 526, 423,
11055 : /* 5 */ 335, 272, 215, 172, 137,
11056 : /* 10 */ 110, 87, 70, 56, 45,
11057 : /* 15 */ 36, 29, 23, 18, 15,
11058 : };
11059 :
11060 : /*
11061 : * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
11062 : *
11063 : * In cases where the weight does not change often, we can use the
11064 : * precalculated inverse to speed up arithmetics by turning divisions
11065 : * into multiplications:
11066 : */
11067 : const u32 sched_prio_to_wmult[40] = {
11068 : /* -20 */ 48388, 59856, 76040, 92818, 118348,
11069 : /* -15 */ 147320, 184698, 229616, 287308, 360437,
11070 : /* -10 */ 449829, 563644, 704093, 875809, 1099582,
11071 : /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
11072 : /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
11073 : /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
11074 : /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
11075 : /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
11076 : };
11077 :
11078 0 : void call_trace_sched_update_nr_running(struct rq *rq, int count)
11079 : {
11080 0 : trace_sched_update_nr_running_tp(rq, count);
11081 0 : }
|