rt.c source code [Linux/kernel/sched/rt.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
4	* policies)
5	*/
6
7	#include "sched.h"
8	#include "pelt.h"
9
10	int sched_rr_timeslice = RR_TIMESLICE;
11	/ More than 4 hours if BW_SHIFT equals 20. /
12	static const u64 max_rt_runtime = MAX_BW;
13
14	/*
15	* period over which we measure -rt task CPU usage in us.
16	* default: 1s
17	*/
18	int sysctl_sched_rt_period = `1000000`;
19
20	/*
21	* part of the period that we allow rt tasks to run in us.
22	* default: 0.95s
23	*/
24	int sysctl_sched_rt_runtime = `950000`;
25
#ifdef CONFIG_SYSCTL
/* sysctl view of the RR timeslice: RR_TIMESLICE scaled by MSEC_PER_SEC / HZ. */
static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos);
static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos);

/*
 * RT scheduler tunables registered under "kernel":
 *  - sched_rt_period_us:    bounded to [1, INT_MAX]
 *  - sched_rt_runtime_us:   bounded to [-1, sysctl_sched_rt_period]
 *  - sched_rr_timeslice_ms: no bounds given in the table; handled by
 *                           sched_rr_handler()
 */
static const struct ctl_table sched_rt_sysctls[] = {
	{
		.procname       = "sched_rt_period_us",
		.data           = &sysctl_sched_rt_period,
		.maxlen         = sizeof(int),
		.mode           = 0644,
		.proc_handler   = sched_rt_handler,
		.extra1         = SYSCTL_ONE,
		.extra2         = SYSCTL_INT_MAX,
	},
	{
		.procname       = "sched_rt_runtime_us",
		.data           = &sysctl_sched_rt_runtime,
		.maxlen         = sizeof(int),
		.mode           = 0644,
		.proc_handler   = sched_rt_handler,
		.extra1         = SYSCTL_NEG_ONE,
		.extra2         = (void *)&sysctl_sched_rt_period,
	},
	{
		.procname       = "sched_rr_timeslice_ms",
		.data           = &sysctl_sched_rr_timeslice,
		.maxlen         = sizeof(int),
		.mode           = 0644,
		.proc_handler   = sched_rr_handler,
	},
};

/* Register the RT sysctl table at late_initcall time. Always returns 0. */
static int __init sched_rt_sysctl_init(void)
{
	register_sysctl_init("kernel", sched_rt_sysctls);
	return 0;
}
late_initcall(sched_rt_sysctl_init);
#endif /* CONFIG_SYSCTL */
67
68	void init_rt_rq(struct rt_rq *rt_rq)
69	{
70	struct rt_prio_array *array;
71	int i;
72
73	array = &rt_rq->active;
74	for (i = `0`; i < MAX_RT_PRIO; i++) {
75	INIT_LIST_HEAD(list: array->queue + i);
76	__clear_bit(i, array->bitmap);
77	}
78	/ delimiter for bitsearch: /
79	__set_bit(MAX_RT_PRIO, array->bitmap);
80
81	rt_rq->highest_prio.curr = MAX_RT_PRIO-`1`;
82	rt_rq->highest_prio.next = MAX_RT_PRIO-`1`;
83	rt_rq->overloaded = `0`;
84	plist_head_init(head: &rt_rq->pushable_tasks);
85	/ We start is dequeued state, because no RT tasks are queued /
86	rt_rq->rt_queued = `0`;
87
88	#ifdef CONFIG_RT_GROUP_SCHED
89	rt_rq->rt_time = `0`;
90	rt_rq->rt_throttled = `0`;
91	rt_rq->rt_runtime = `0`;
92	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
93	rt_rq->tg = &root_task_group;
94	#endif
95	}
96
97	#ifdef CONFIG_RT_GROUP_SCHED
98
99	static int do_sched_rt_period_timer(struct rt_bandwidth rt_b, int* overrun);
100
101	static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
102	{
103	struct rt_bandwidth *rt_b =
104	container_of(timer, struct rt_bandwidth, rt_period_timer);
105	int idle = `0`;
106	int overrun;
107
108	raw_spin_lock(&rt_b->rt_runtime_lock);
109	for (;;) {
110	overrun = hrtimer_forward_now(timer, rt_b->rt_period);
111	if (!overrun)
112	break;
113
114	raw_spin_unlock(&rt_b->rt_runtime_lock);
115	idle = do_sched_rt_period_timer(rt_b, overrun);
116	raw_spin_lock(&rt_b->rt_runtime_lock);
117	}
118	if (idle)
119	rt_b->rt_period_active = `0`;
120	raw_spin_unlock(&rt_b->rt_runtime_lock);
121
122	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
123	}
124
125	void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
126	{
127	rt_b->rt_period = ns_to_ktime(period);
128	rt_b->rt_runtime = runtime;
129
130	raw_spin_lock_init(&rt_b->rt_runtime_lock);
131
132	hrtimer_setup(&rt_b->rt_period_timer, sched_rt_period_timer, CLOCK_MONOTONIC,
133	HRTIMER_MODE_REL_HARD);
134	}
135
136	static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
137	{
138	raw_spin_lock(&rt_b->rt_runtime_lock);
139	if (!rt_b->rt_period_active) {
140	rt_b->rt_period_active = `1`;
141	/*
142	* SCHED_DEADLINE updates the bandwidth, as a run away
143	* RT task with a DL task could hog a CPU. But DL does
144	* not reset the period. If a deadline task was running
145	* without an RT task running, it can cause RT tasks to
146	* throttle when they start up. Kick the timer right away
147	* to update the period.
148	*/
149	hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(`0`));
150	hrtimer_start_expires(&rt_b->rt_period_timer,
151	HRTIMER_MODE_ABS_PINNED_HARD);
152	}
153	raw_spin_unlock(&rt_b->rt_runtime_lock);
154	}
155
156	static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
157	{
158	if (!rt_bandwidth_enabled() \|\| rt_b->rt_runtime == RUNTIME_INF)
159	return;
160
161	do_start_rt_bandwidth(rt_b);
162	}
163
164	static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
165	{
166	hrtimer_cancel(&rt_b->rt_period_timer);
167	}
168
169	#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
170
171	static inline struct task_struct rt_task_of(struct* sched_rt_entity *rt_se)
172	{
173	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
174
175	return container_of(rt_se, struct task_struct, rt);
176	}
177
178	static inline struct rq rq_of_rt_rq(struct* rt_rq *rt_rq)
179	{
180	/ Cannot fold with non-CONFIG_RT_GROUP_SCHED version, layout /
181	WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group);
182	return rt_rq->rq;
183	}
184
185	static inline struct rt_rq rt_rq_of_se(struct* sched_rt_entity *rt_se)
186	{
187	WARN_ON(!rt_group_sched_enabled() && rt_se->rt_rq->tg != &root_task_group);
188	return rt_se->rt_rq;
189	}
190
191	static inline struct rq rq_of_rt_se(struct* sched_rt_entity *rt_se)
192	{
193	struct rt_rq *rt_rq = rt_se->rt_rq;
194
195	WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group);
196	return rt_rq->rq;
197	}
198
199	void unregister_rt_sched_group(struct task_group *tg)
200	{
201	if (!rt_group_sched_enabled())
202	return;
203
204	if (tg->rt_se)
205	destroy_rt_bandwidth(&tg->rt_bandwidth);
206	}
207
208	void free_rt_sched_group(struct task_group *tg)
209	{
210	int i;
211
212	if (!rt_group_sched_enabled())
213	return;
214
215	for_each_possible_cpu(i) {
216	if (tg->rt_rq)
217	kfree(tg->rt_rq[i]);
218	if (tg->rt_se)
219	kfree(tg->rt_se[i]);
220	}
221
222	kfree(tg->rt_rq);
223	kfree(tg->rt_se);
224	}
225
226	void init_tg_rt_entry(struct task_group tg, struct* rt_rq *rt_rq,
227	struct sched_rt_entity rt_se, int* cpu,
228	struct sched_rt_entity *parent)
229	{
230	struct rq *rq = cpu_rq(cpu);
231
232	rt_rq->highest_prio.curr = MAX_RT_PRIO-`1`;
233	rt_rq->rt_nr_boosted = `0`;
234	rt_rq->rq = rq;
235	rt_rq->tg = tg;
236
237	tg->rt_rq[cpu] = rt_rq;
238	tg->rt_se[cpu] = rt_se;
239
240	if (!rt_se)
241	return;
242
243	if (!parent)
244	rt_se->rt_rq = &rq->rt;
245	else
246	rt_se->rt_rq = parent->my_q;
247
248	rt_se->my_q = rt_rq;
249	rt_se->parent = parent;
250	INIT_LIST_HEAD(&rt_se->run_list);
251	}
252
253	int alloc_rt_sched_group(struct task_group tg, struct* task_group *parent)
254	{
255	struct rt_rq *rt_rq;
256	struct sched_rt_entity *rt_se;
257	int i;
258
259	if (!rt_group_sched_enabled())
260	return `1`;
261
262	tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
263	if (!tg->rt_rq)
264	goto err;
265	tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
266	if (!tg->rt_se)
267	goto err;
268
269	init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(global_rt_period()), `0`);
270
271	for_each_possible_cpu(i) {
272	rt_rq = kzalloc_node(sizeof(struct rt_rq),
273	GFP_KERNEL, cpu_to_node(i));
274	if (!rt_rq)
275	goto err;
276
277	rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
278	GFP_KERNEL, cpu_to_node(i));
279	if (!rt_se)
280	goto err_free_rq;
281
282	init_rt_rq(rt_rq);
283	rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
284	init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
285	}
286
287	return `1`;
288
289	err_free_rq:
290	kfree(rt_rq);
291	err:
292	return `0`;
293	}
294
295	#else /* !CONFIG_RT_GROUP_SCHED: */
296
297	#define rt_entity_is_task(rt_se) (1)
298
299	static inline struct task_struct rt_task_of(struct* sched_rt_entity *rt_se)
300	{
301	return container_of(rt_se, struct task_struct, rt);
302	}
303
304	static inline struct rq rq_of_rt_rq(struct* rt_rq *rt_rq)
305	{
306	return container_of(rt_rq, struct rq, rt);
307	}
308
/* Return the rq of the task that owns @rt_se. */
static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
	struct task_struct *p = rt_task_of(rt_se);

	return task_rq(p);
}
315
316	static inline struct rt_rq rt_rq_of_se(struct* sched_rt_entity *rt_se)
317	{
318	struct rq *rq = rq_of_rt_se(rt_se);
319
320	return &rq->rt;
321	}
322
/* !CONFIG_RT_GROUP_SCHED: there is no per-group RT state to manage. */
void unregister_rt_sched_group(struct task_group *tg) { }

void free_rt_sched_group(struct task_group *tg) { }

/* Nothing to allocate; always reports success (1). */
int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}
331	#endif /* !CONFIG_RT_GROUP_SCHED */
332
333	static inline bool need_pull_rt_task(struct rq rq, struct* task_struct *prev)
334	{
335	/ Try to pull RT tasks here if we lower this rq's prio /
336	return rq->online && rq->rt.highest_prio.curr > prev->prio;
337	}
338
339	static inline int rt_overloaded(struct rq *rq)
340	{
341	return atomic_read(v: &rq->rd->rto_count);
342	}
343
344	static inline void rt_set_overload(struct rq *rq)
345	{
346	if (!rq->online)
347	return;
348
349	cpumask_set_cpu(cpu: rq->cpu, dstp: rq->rd->rto_mask);
350	/*
351	* Make sure the mask is visible before we set
352	* the overload count. That is checked to determine
353	* if we should look at the mask. It would be a shame
354	* if we looked at the mask, but the mask was not
355	* updated yet.
356	*
357	* Matched by the barrier in pull_rt_task().
358	*/
359	smp_wmb();
360	atomic_inc(v: &rq->rd->rto_count);
361	}
362
363	static inline void rt_clear_overload(struct rq *rq)
364	{
365	if (!rq->online)
366	return;
367
368	/ the order here really doesn't matter /
369	atomic_dec(v: &rq->rd->rto_count);
370	cpumask_clear_cpu(cpu: rq->cpu, dstp: rq->rd->rto_mask);
371	}
372
373	static inline int has_pushable_tasks(struct rq *rq)
374	{
375	return !plist_head_empty(head: &rq->rt.pushable_tasks);
376	}
377
378	static DEFINE_PER_CPU(struct balance_callback, rt_push_head);
379	static DEFINE_PER_CPU(struct balance_callback, rt_pull_head);
380
381	static void push_rt_tasks(struct rq *);
382	static void pull_rt_task(struct rq *);
383
384	static inline void rt_queue_push_tasks(struct rq *rq)
385	{
386	if (!has_pushable_tasks(rq))
387	return;
388
389	queue_balance_callback(rq, head: &per_cpu(rt_push_head, rq->cpu), func: push_rt_tasks);
390	}
391
392	static inline void rt_queue_pull_task(struct rq *rq)
393	{
394	queue_balance_callback(rq, head: &per_cpu(rt_pull_head, rq->cpu), func: pull_rt_task);
395	}
396
397	static void enqueue_pushable_task(struct rq rq, struct* task_struct *p)
398	{
399	plist_del(node: &p->pushable_tasks, head: &rq->rt.pushable_tasks);
400	plist_node_init(node: &p->pushable_tasks, prio: p->prio);
401	plist_add(node: &p->pushable_tasks, head: &rq->rt.pushable_tasks);
402
403	/ Update the highest prio pushable task /
404	if (p->prio < rq->rt.highest_prio.next)
405	rq->rt.highest_prio.next = p->prio;
406
407	if (!rq->rt.overloaded) {
408	rt_set_overload(rq);
409	rq->rt.overloaded = `1`;
410	}
411	}
412
413	static void dequeue_pushable_task(struct rq rq, struct* task_struct *p)
414	{
415	plist_del(node: &p->pushable_tasks, head: &rq->rt.pushable_tasks);
416
417	/ Update the new highest prio pushable task /
418	if (has_pushable_tasks(rq)) {
419	p = plist_first_entry(&rq->rt.pushable_tasks,
420	struct task_struct, pushable_tasks);
421	rq->rt.highest_prio.next = p->prio;
422	} else {
423	rq->rt.highest_prio.next = MAX_RT_PRIO-`1`;
424
425	if (rq->rt.overloaded) {
426	rt_clear_overload(rq);
427	rq->rt.overloaded = `0`;
428	}
429	}
430	}
431
/* Forward declarations; both are defined further down in this file. */
static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
434
435	static inline int on_rt_rq(struct sched_rt_entity *rt_se)
436	{
437	return rt_se->on_rq;
438	}
439
440	#ifdef CONFIG_UCLAMP_TASK
441	/*
442	* Verify the fitness of task @p to run on @cpu taking into account the uclamp
443	* settings.
444	*
445	* This check is only important for heterogeneous systems where uclamp_min value
446	* is higher than the capacity of a @cpu. For non-heterogeneous system this
447	* function will always return true.
448	*
449	* The function will return true if the capacity of the @cpu is >= the
450	* uclamp_min and false otherwise.
451	*
452	* Note that uclamp_min will be clamped to uclamp_max if uclamp_min
453	* > uclamp_max.
454	*/
455	static inline bool rt_task_fits_capacity(struct task_struct p, int* cpu)
456	{
457	unsigned int min_cap;
458	unsigned int max_cap;
459	unsigned int cpu_cap;
460
461	/ Only heterogeneous systems can benefit from this check /
462	if (!sched_asym_cpucap_active())
463	return true;
464
465	min_cap = uclamp_eff_value(p, UCLAMP_MIN);
466	max_cap = uclamp_eff_value(p, UCLAMP_MAX);
467
468	cpu_cap = arch_scale_cpu_capacity(cpu);
469
470	return cpu_cap >= min(min_cap, max_cap);
471	}
472	#else /* !CONFIG_UCLAMP_TASK: */
/* Without CONFIG_UCLAMP_TASK every task fits every CPU. */
static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
{
	return true;
}
477	#endif /* !CONFIG_UCLAMP_TASK */
478
479	#ifdef CONFIG_RT_GROUP_SCHED
480
481	static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
482	{
483	return rt_rq->rt_runtime;
484	}
485
486	static inline u64 sched_rt_period(struct rt_rq *rt_rq)
487	{
488	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
489	}
490
/* Cursor type for for_each_rt_rq(): the iteration walks task groups. */
typedef struct task_group *rt_rq_iter_t;
492
493	static inline struct task_group next_task_group(struct* task_group *tg)
494	{
495	if (!rt_group_sched_enabled()) {
496	WARN_ON(tg != &root_task_group);
497	return NULL;
498	}
499
500	do {
501	tg = list_entry_rcu(tg->list.next,
502	typeof(struct task_group), list);
503	} while (&tg->list != &task_groups && task_group_is_autogroup(tg));
504
505	if (&tg->list == &task_groups)
506	tg = NULL;
507
508	return tg;
509	}
510
/*
 * Visit the rt_rq of every task group on @rq's CPU, starting at the root
 * group. Stops at the end of the group list or at the first group without
 * an rt_rq for that CPU.
 */
#define for_each_rt_rq(rt_rq, iter, rq)					\
	for (iter = &root_task_group;					\
		iter && (rt_rq = iter->rt_rq[cpu_of(rq)]);		\
		iter = next_task_group(iter))

/* Walk @rt_se and then each of its ancestors via ->parent. */
#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = rt_se->parent)
518
519	static inline struct rt_rq group_rt_rq(struct* sched_rt_entity *rt_se)
520	{
521	return rt_se->my_q;
522	}
523
/* Forward declarations for the entity enqueue/dequeue helpers used below. */
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
526
527	static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
528	{
529	struct task_struct *donor = rq_of_rt_rq(rt_rq)->donor;
530	struct rq *rq = rq_of_rt_rq(rt_rq);
531	struct sched_rt_entity *rt_se;
532
533	int cpu = cpu_of(rq);
534
535	rt_se = rt_rq->tg->rt_se[cpu];
536
537	if (rt_rq->rt_nr_running) {
538	if (!rt_se)
539	enqueue_top_rt_rq(rt_rq);
540	else if (!on_rt_rq(rt_se))
541	enqueue_rt_entity(rt_se, `0`);
542
543	if (rt_rq->highest_prio.curr < donor->prio)
544	resched_curr(rq);
545	}
546	}
547
548	static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
549	{
550	struct sched_rt_entity *rt_se;
551	int cpu = cpu_of(rq_of_rt_rq(rt_rq));
552
553	rt_se = rt_rq->tg->rt_se[cpu];
554
555	if (!rt_se) {
556	dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
557	/ Kick cpufreq (see the comment in kernel/sched/sched.h). /
558	cpufreq_update_util(rq_of_rt_rq(rt_rq), `0`);
559	}
560	else if (on_rt_rq(rt_se))
561	dequeue_rt_entity(rt_se, `0`);
562	}
563
564	static inline int rt_rq_throttled(struct rt_rq *rt_rq)
565	{
566	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
567	}
568
569	static int rt_se_boosted(struct sched_rt_entity *rt_se)
570	{
571	struct rt_rq *rt_rq = group_rt_rq(rt_se);
572	struct task_struct *p;
573
574	if (rt_rq)
575	return !!rt_rq->rt_nr_boosted;
576
577	p = rt_task_of(rt_se);
578	return p->prio != p->normal_prio;
579	}
580
581	static inline const struct cpumask sched_rt_period_mask(void*)
582	{
583	return this_rq()->rd->span;
584	}
585
586	static inline
587	struct rt_rq sched_rt_period_rt_rq(struct* rt_bandwidth rt_b, int* cpu)
588	{
589	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
590	}
591
592	static inline struct rt_bandwidth sched_rt_bandwidth(struct* rt_rq *rt_rq)
593	{
594	return &rt_rq->tg->rt_bandwidth;
595	}
596
597	bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
598	{
599	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
600
601	return (hrtimer_active(&rt_b->rt_period_timer) \|\|
602	rt_rq->rt_time < rt_b->rt_runtime);
603	}
604
605	/*
606	* We ran out of runtime, see if we can borrow some from our neighbours.
607	*/
608	static void do_balance_runtime(struct rt_rq *rt_rq)
609	{
610	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
611	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
612	int i, weight;
613	u64 rt_period;
614
615	weight = cpumask_weight(rd->span);
616
617	raw_spin_lock(&rt_b->rt_runtime_lock);
618	rt_period = ktime_to_ns(rt_b->rt_period);
619	for_each_cpu(i, rd->span) {
620	struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
621	s64 diff;
622
623	if (iter == rt_rq)
624	continue;
625
626	raw_spin_lock(&iter->rt_runtime_lock);
627	/*
628	* Either all rqs have inf runtime and there's nothing to steal
629	* or __disable_runtime() below sets a specific rq to inf to
630	* indicate its been disabled and disallow stealing.
631	*/
632	if (iter->rt_runtime == RUNTIME_INF)
633	goto next;
634
635	/*
636	* From runqueues with spare time, take 1/n part of their
637	* spare time, but no more than our period.
638	*/
639	diff = iter->rt_runtime - iter->rt_time;
640	if (diff > `0`) {
641	diff = div_u64((u64)diff, weight);
642	if (rt_rq->rt_runtime + diff > rt_period)
643	diff = rt_period - rt_rq->rt_runtime;
644	iter->rt_runtime -= diff;
645	rt_rq->rt_runtime += diff;
646	if (rt_rq->rt_runtime == rt_period) {
647	raw_spin_unlock(&iter->rt_runtime_lock);
648	break;
649	}
650	}
651	next:
652	raw_spin_unlock(&iter->rt_runtime_lock);
653	}
654	raw_spin_unlock(&rt_b->rt_runtime_lock);
655	}
656
657	/*
658	* Ensure this RQ takes back all the runtime it lend to its neighbours.
659	*/
660	static void __disable_runtime(struct rq *rq)
661	{
662	struct root_domain *rd = rq->rd;
663	rt_rq_iter_t iter;
664	struct rt_rq *rt_rq;
665
666	if (unlikely(!scheduler_running))
667	return;
668
669	for_each_rt_rq(rt_rq, iter, rq) {
670	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
671	s64 want;
672	int i;
673
674	raw_spin_lock(&rt_b->rt_runtime_lock);
675	raw_spin_lock(&rt_rq->rt_runtime_lock);
676	/*
677	* Either we're all inf and nobody needs to borrow, or we're
678	* already disabled and thus have nothing to do, or we have
679	* exactly the right amount of runtime to take out.
680	*/
681	if (rt_rq->rt_runtime == RUNTIME_INF \|\|
682	rt_rq->rt_runtime == rt_b->rt_runtime)
683	goto balanced;
684	raw_spin_unlock(&rt_rq->rt_runtime_lock);
685
686	/*
687	* Calculate the difference between what we started out with
688	* and what we current have, that's the amount of runtime
689	* we lend and now have to reclaim.
690	*/
691	want = rt_b->rt_runtime - rt_rq->rt_runtime;
692
693	/*
694	* Greedy reclaim, take back as much as we can.
695	*/
696	for_each_cpu(i, rd->span) {
697	struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
698	s64 diff;
699
700	/*
701	* Can't reclaim from ourselves or disabled runqueues.
702	*/
703	if (iter == rt_rq \|\| iter->rt_runtime == RUNTIME_INF)
704	continue;
705
706	raw_spin_lock(&iter->rt_runtime_lock);
707	if (want > `0`) {
708	diff = min_t(s64, iter->rt_runtime, want);
709	iter->rt_runtime -= diff;
710	want -= diff;
711	} else {
712	iter->rt_runtime -= want;
713	want -= want;
714	}
715	raw_spin_unlock(&iter->rt_runtime_lock);
716
717	if (!want)
718	break;
719	}
720
721	raw_spin_lock(&rt_rq->rt_runtime_lock);
722	/*
723	* We cannot be left wanting - that would mean some runtime
724	* leaked out of the system.
725	*/
726	WARN_ON_ONCE(want);
727	balanced:
728	/*
729	* Disable all the borrow logic by pretending we have inf
730	* runtime - in which case borrowing doesn't make sense.
731	*/
732	rt_rq->rt_runtime = RUNTIME_INF;
733	rt_rq->rt_throttled = `0`;
734	raw_spin_unlock(&rt_rq->rt_runtime_lock);
735	raw_spin_unlock(&rt_b->rt_runtime_lock);
736
737	/ Make rt_rq available for pick_next_task() /
738	sched_rt_rq_enqueue(rt_rq);
739	}
740	}
741
742	static void __enable_runtime(struct rq *rq)
743	{
744	rt_rq_iter_t iter;
745	struct rt_rq *rt_rq;
746
747	if (unlikely(!scheduler_running))
748	return;
749
750	/*
751	* Reset each runqueue's bandwidth settings
752	*/
753	for_each_rt_rq(rt_rq, iter, rq) {
754	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
755
756	raw_spin_lock(&rt_b->rt_runtime_lock);
757	raw_spin_lock(&rt_rq->rt_runtime_lock);
758	rt_rq->rt_runtime = rt_b->rt_runtime;
759	rt_rq->rt_time = `0`;
760	rt_rq->rt_throttled = `0`;
761	raw_spin_unlock(&rt_rq->rt_runtime_lock);
762	raw_spin_unlock(&rt_b->rt_runtime_lock);
763	}
764	}
765
766	static void balance_runtime(struct rt_rq *rt_rq)
767	{
768	if (!sched_feat(RT_RUNTIME_SHARE))
769	return;
770
771	if (rt_rq->rt_time > rt_rq->rt_runtime) {
772	raw_spin_unlock(&rt_rq->rt_runtime_lock);
773	do_balance_runtime(rt_rq);
774	raw_spin_lock(&rt_rq->rt_runtime_lock);
775	}
776	}
777
778	static int do_sched_rt_period_timer(struct rt_bandwidth rt_b, int* overrun)
779	{
780	int i, idle = `1`, throttled = `0`;
781	const struct cpumask *span;
782
783	span = sched_rt_period_mask();
784
785	/*
786	* FIXME: isolated CPUs should really leave the root task group,
787	* whether they are isolcpus or were isolated via cpusets, lest
788	* the timer run on a CPU which does not service all runqueues,
789	* potentially leaving other CPUs indefinitely throttled. If
790	* isolation is really required, the user will turn the throttle
791	* off to kill the perturbations it causes anyway. Meanwhile,
792	* this maintains functionality for boot and/or troubleshooting.
793	*/
794	if (rt_b == &root_task_group.rt_bandwidth)
795	span = cpu_online_mask;
796
797	for_each_cpu(i, span) {
798	int enqueue = `0`;
799	struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
800	struct rq *rq = rq_of_rt_rq(rt_rq);
801	struct rq_flags rf;
802	int skip;
803
804	/*
805	* When span == cpu_online_mask, taking each rq->lock
806	* can be time-consuming. Try to avoid it when possible.
807	*/
808	raw_spin_lock(&rt_rq->rt_runtime_lock);
809	if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
810	rt_rq->rt_runtime = rt_b->rt_runtime;
811	skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
812	raw_spin_unlock(&rt_rq->rt_runtime_lock);
813	if (skip)
814	continue;
815
816	rq_lock(rq, &rf);
817	update_rq_clock(rq);
818
819	if (rt_rq->rt_time) {
820	u64 runtime;
821
822	raw_spin_lock(&rt_rq->rt_runtime_lock);
823	if (rt_rq->rt_throttled)
824	balance_runtime(rt_rq);
825	runtime = rt_rq->rt_runtime;
826	rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
827	if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
828	rt_rq->rt_throttled = `0`;
829	enqueue = `1`;
830
831	/*
832	* When we're idle and a woken (rt) task is
833	* throttled wakeup_preempt() will set
834	* skip_update and the time between the wakeup
835	* and this unthrottle will get accounted as
836	* 'runtime'.
837	*/
838	if (rt_rq->rt_nr_running && rq->curr == rq->idle)
839	rq_clock_cancel_skipupdate(rq);
840	}
841	if (rt_rq->rt_time \|\| rt_rq->rt_nr_running)
842	idle = `0`;
843	raw_spin_unlock(&rt_rq->rt_runtime_lock);
844	} else if (rt_rq->rt_nr_running) {
845	idle = `0`;
846	if (!rt_rq_throttled(rt_rq))
847	enqueue = `1`;
848	}
849	if (rt_rq->rt_throttled)
850	throttled = `1`;
851
852	if (enqueue)
853	sched_rt_rq_enqueue(rt_rq);
854	rq_unlock(rq, &rf);
855	}
856
857	if (!throttled && (!rt_bandwidth_enabled() \|\| rt_b->rt_runtime == RUNTIME_INF))
858	return `1`;
859
860	return idle;
861	}
862
863	static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
864	{
865	u64 runtime = sched_rt_runtime(rt_rq);
866
867	if (rt_rq->rt_throttled)
868	return rt_rq_throttled(rt_rq);
869
870	if (runtime >= sched_rt_period(rt_rq))
871	return `0`;
872
873	balance_runtime(rt_rq);
874	runtime = sched_rt_runtime(rt_rq);
875	if (runtime == RUNTIME_INF)
876	return `0`;
877
878	if (rt_rq->rt_time > runtime) {
879	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
880
881	/*
882	* Don't actually throttle groups that have no runtime assigned
883	* but accrue some time due to boosting.
884	*/
885	if (likely(rt_b->rt_runtime)) {
886	rt_rq->rt_throttled = `1`;
887	printk_deferred_once("sched: RT throttling activated\n");
888	} else {
889	/*
890	* In case we did anyway, make it go away,
891	* replenishment is a joke, since it will replenish us
892	* with exactly 0 ns.
893	*/
894	rt_rq->rt_time = `0`;
895	}
896
897	if (rt_rq_throttled(rt_rq)) {
898	sched_rt_rq_dequeue(rt_rq);
899	return `1`;
900	}
901	}
902
903	return `0`;
904	}
905
906	#else /* !CONFIG_RT_GROUP_SCHED: */
907
/* Without group scheduling the iterator is unused; only rq->rt exists. */
typedef struct rt_rq *rt_rq_iter_t;

/* Visit exactly one runqueue: the rq's own rt_rq. @iter is only evaluated. */
#define for_each_rt_rq(rt_rq, iter, rq) \
	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

/* Visit @rt_se once; there is no parent chain to follow. */
#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = NULL)
915
916	static inline struct rt_rq group_rt_rq(struct* sched_rt_entity *rt_se)
917	{
918	return NULL;
919	}
920
921	static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
922	{
923	struct rq *rq = rq_of_rt_rq(rt_rq);
924
925	if (!rt_rq->rt_nr_running)
926	return;
927
928	enqueue_top_rt_rq(rt_rq);
929	resched_curr(rq);
930	}
931
932	static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
933	{
934	dequeue_top_rt_rq(rt_rq, count: rt_rq->rt_nr_running);
935	}
936
/* This configuration never reports a runqueue as throttled. */
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return false;
}
941
942	static inline const struct cpumask sched_rt_period_mask(void*)
943	{
944	return cpu_online_mask;
945	}
946
947	static inline
948	struct rt_rq sched_rt_period_rt_rq(struct* rt_bandwidth rt_b, int* cpu)
949	{
950	return &cpu_rq(cpu)->rt;
951	}
952
/* No per-group runtime to hand back or restore in this configuration. */
static void __enable_runtime(struct rq *rq) { }
static void __disable_runtime(struct rq *rq) { }
955
956	#endif /* !CONFIG_RT_GROUP_SCHED */
957
958	static inline int rt_se_prio(struct sched_rt_entity *rt_se)
959	{
960	#ifdef CONFIG_RT_GROUP_SCHED
961	struct rt_rq *rt_rq = group_rt_rq(rt_se);
962
963	if (rt_rq)
964	return rt_rq->highest_prio.curr;
965	#endif
966
967	return rt_task_of(rt_se)->prio;
968	}
969
970	/*
971	* Update the current task's runtime statistics. Skip current tasks that
972	* are not in our scheduling class.
973	*/
974	static void update_curr_rt(struct rq *rq)
975	{
976	struct task_struct *donor = rq->donor;
977	s64 delta_exec;
978
979	if (donor->sched_class != &rt_sched_class)
980	return;
981
982	delta_exec = update_curr_common(rq);
983	if (unlikely(delta_exec <= `0`))
984	return;
985
986	#ifdef CONFIG_RT_GROUP_SCHED
987	struct sched_rt_entity *rt_se = &donor->rt;
988
989	if (!rt_bandwidth_enabled())
990	return;
991
992	for_each_sched_rt_entity(rt_se) {
993	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
994	int exceeded;
995
996	if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
997	raw_spin_lock(&rt_rq->rt_runtime_lock);
998	rt_rq->rt_time += delta_exec;
999	exceeded = sched_rt_runtime_exceeded(rt_rq);
1000	if (exceeded)
1001	resched_curr(rq);
1002	raw_spin_unlock(&rt_rq->rt_runtime_lock);
1003	if (exceeded)
1004	do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
1005	}
1006	}
1007	#endif /* CONFIG_RT_GROUP_SCHED */
1008	}
1009
1010	static void
1011	dequeue_top_rt_rq(struct rt_rq rt_rq, unsigned* int count)
1012	{
1013	struct rq *rq = rq_of_rt_rq(rt_rq);
1014
1015	BUG_ON(&rq->rt != rt_rq);
1016
1017	if (!rt_rq->rt_queued)
1018	return;
1019
1020	BUG_ON(!rq->nr_running);
1021
1022	sub_nr_running(rq, count);
1023	rt_rq->rt_queued = `0`;
1024
1025	}
1026
1027	static void
1028	enqueue_top_rt_rq(struct rt_rq *rt_rq)
1029	{
1030	struct rq *rq = rq_of_rt_rq(rt_rq);
1031
1032	BUG_ON(&rq->rt != rt_rq);
1033
1034	if (rt_rq->rt_queued)
1035	return;
1036
1037	if (rt_rq_throttled(rt_rq))
1038	return;
1039
1040	if (rt_rq->rt_nr_running) {
1041	add_nr_running(rq, count: rt_rq->rt_nr_running);
1042	rt_rq->rt_queued = `1`;
1043	}
1044
1045	/ Kick cpufreq (see the comment in kernel/sched/sched.h). /
1046	cpufreq_update_util(rq, flags: `0`);
1047	}
1048
1049	static void
1050	inc_rt_prio_smp(struct rt_rq rt_rq, int* prio, int prev_prio)
1051	{
1052	struct rq *rq = rq_of_rt_rq(rt_rq);
1053
1054	/*
1055	* Change rq's cpupri only if rt_rq is the top queue.
1056	*/
1057	if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq)
1058	return;
1059
1060	if (rq->online && prio < prev_prio)
1061	cpupri_set(cp: &rq->rd->cpupri, cpu: rq->cpu, pri: prio);
1062	}
1063
1064	static void
1065	dec_rt_prio_smp(struct rt_rq rt_rq, int* prio, int prev_prio)
1066	{
1067	struct rq *rq = rq_of_rt_rq(rt_rq);
1068
1069	/*
1070	* Change rq's cpupri only if rt_rq is the top queue.
1071	*/
1072	if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq)
1073	return;
1074
1075	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1076	cpupri_set(cp: &rq->rd->cpupri, cpu: rq->cpu, pri: rt_rq->highest_prio.curr);
1077	}
1078
1079	static void
1080	inc_rt_prio(struct rt_rq rt_rq, int* prio)
1081	{
1082	int prev_prio = rt_rq->highest_prio.curr;
1083
1084	if (prio < prev_prio)
1085	rt_rq->highest_prio.curr = prio;
1086
1087	inc_rt_prio_smp(rt_rq, prio, prev_prio);
1088	}
1089
1090	static void
1091	dec_rt_prio(struct rt_rq rt_rq, int* prio)
1092	{
1093	int prev_prio = rt_rq->highest_prio.curr;
1094
1095	if (rt_rq->rt_nr_running) {
1096
1097	WARN_ON(prio < prev_prio);
1098
1099	/*
1100	* This may have been our highest task, and therefore
1101	* we may have some re-computation to do
1102	*/
1103	if (prio == prev_prio) {
1104	struct rt_prio_array *array = &rt_rq->active;
1105
1106	rt_rq->highest_prio.curr =
1107	sched_find_first_bit(b: array->bitmap);
1108	}
1109
1110	} else {
1111	rt_rq->highest_prio.curr = MAX_RT_PRIO-`1`;
1112	}
1113
1114	dec_rt_prio_smp(rt_rq, prio, prev_prio);
1115	}
1116
1117	#ifdef CONFIG_RT_GROUP_SCHED
1118
1119	static void
1120	inc_rt_group(struct sched_rt_entity rt_se, struct* rt_rq *rt_rq)
1121	{
1122	if (rt_se_boosted(rt_se))
1123	rt_rq->rt_nr_boosted++;
1124
1125	start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1126	}
1127
1128	static void
1129	dec_rt_group(struct sched_rt_entity rt_se, struct* rt_rq *rt_rq)
1130	{
1131	if (rt_se_boosted(rt_se))
1132	rt_rq->rt_nr_boosted--;
1133
1134	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1135	}
1136
1137	#else /* !CONFIG_RT_GROUP_SCHED: */
1138
/* No group bookkeeping without CONFIG_RT_GROUP_SCHED. */
static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}
1143
/* No group bookkeeping without CONFIG_RT_GROUP_SCHED. */
static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1146
1147	#endif /* !CONFIG_RT_GROUP_SCHED */
1148
1149	static inline
1150	unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1151	{
1152	struct rt_rq *group_rq = group_rt_rq(rt_se);
1153
1154	if (group_rq)
1155	return group_rq->rt_nr_running;
1156	else
1157	return `1`;
1158	}
1159
1160	static inline
1161	unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
1162	{
1163	struct rt_rq *group_rq = group_rt_rq(rt_se);
1164	struct task_struct *tsk;
1165
1166	if (group_rq)
1167	return group_rq->rr_nr_running;
1168
1169	tsk = rt_task_of(rt_se);
1170
1171	return (tsk->policy == SCHED_RR) ? `1` : `0`;
1172	}
1173
1174	static inline
1175	void inc_rt_tasks(struct sched_rt_entity rt_se, struct* rt_rq *rt_rq)
1176	{
1177	int prio = rt_se_prio(rt_se);
1178
1179	WARN_ON(!rt_prio(prio));
1180	rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1181	rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
1182
1183	inc_rt_prio(rt_rq, prio);
1184	inc_rt_group(rt_se, rt_rq);
1185	}
1186
1187	static inline
1188	void dec_rt_tasks(struct sched_rt_entity rt_se, struct* rt_rq *rt_rq)
1189	{
1190	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1191	WARN_ON(!rt_rq->rt_nr_running);
1192	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1193	rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
1194
1195	dec_rt_prio(rt_rq, prio: rt_se_prio(rt_se));
1196	dec_rt_group(rt_se, rt_rq);
1197	}
1198
1199	/*
1200	* Change rt_se->run_list location unless SAVE && !MOVE
1201	*
1202	* assumes ENQUEUE/DEQUEUE flags match
1203	*/
1204	static inline bool move_entity(unsigned int flags)
1205	{
1206	if ((flags & (DEQUEUE_SAVE \| DEQUEUE_MOVE)) == DEQUEUE_SAVE)
1207	return false;
1208
1209	return true;
1210	}
1211
1212	static void __delist_rt_entity(struct sched_rt_entity rt_se, struct* rt_prio_array *array)
1213	{
1214	list_del_init(entry: &rt_se->run_list);
1215
1216	if (list_empty(head: array->queue + rt_se_prio(rt_se)))
1217	__clear_bit(rt_se_prio(rt_se), array->bitmap);
1218
1219	rt_se->on_list = `0`;
1220	}
1221
1222	static inline struct sched_statistics *
1223	__schedstats_from_rt_se(struct sched_rt_entity *rt_se)
1224	{
1225	/ schedstats is not supported for rt group. /
1226	if (!rt_entity_is_task(rt_se))
1227	return NULL;
1228
1229	return &rt_task_of(rt_se)->stats;
1230	}
1231
1232	static inline void
1233	update_stats_wait_start_rt(struct rt_rq rt_rq, struct* sched_rt_entity *rt_se)
1234	{
1235	struct sched_statistics *stats;
1236	struct task_struct *p = NULL;
1237
1238	if (!schedstat_enabled())
1239	return;
1240
1241	if (rt_entity_is_task(rt_se))
1242	p = rt_task_of(rt_se);
1243
1244	stats = __schedstats_from_rt_se(rt_se);
1245	if (!stats)
1246	return;
1247
1248	__update_stats_wait_start(rq: rq_of_rt_rq(rt_rq), p, stats);
1249	}
1250
1251	static inline void
1252	update_stats_enqueue_sleeper_rt(struct rt_rq rt_rq, struct* sched_rt_entity *rt_se)
1253	{
1254	struct sched_statistics *stats;
1255	struct task_struct *p = NULL;
1256
1257	if (!schedstat_enabled())
1258	return;
1259
1260	if (rt_entity_is_task(rt_se))
1261	p = rt_task_of(rt_se);
1262
1263	stats = __schedstats_from_rt_se(rt_se);
1264	if (!stats)
1265	return;
1266
1267	__update_stats_enqueue_sleeper(rq: rq_of_rt_rq(rt_rq), p, stats);
1268	}
1269
1270	static inline void
1271	update_stats_enqueue_rt(struct rt_rq rt_rq, struct* sched_rt_entity *rt_se,
1272	int flags)
1273	{
1274	if (!schedstat_enabled())
1275	return;
1276
1277	if (flags & ENQUEUE_WAKEUP)
1278	update_stats_enqueue_sleeper_rt(rt_rq, rt_se);
1279	}
1280
1281	static inline void
1282	update_stats_wait_end_rt(struct rt_rq rt_rq, struct* sched_rt_entity *rt_se)
1283	{
1284	struct sched_statistics *stats;
1285	struct task_struct *p = NULL;
1286
1287	if (!schedstat_enabled())
1288	return;
1289
1290	if (rt_entity_is_task(rt_se))
1291	p = rt_task_of(rt_se);
1292
1293	stats = __schedstats_from_rt_se(rt_se);
1294	if (!stats)
1295	return;
1296
1297	__update_stats_wait_end(rq: rq_of_rt_rq(rt_rq), p, stats);
1298	}
1299
1300	static inline void
1301	update_stats_dequeue_rt(struct rt_rq rt_rq, struct* sched_rt_entity *rt_se,
1302	int flags)
1303	{
1304	struct task_struct *p = NULL;
1305
1306	if (!schedstat_enabled())
1307	return;
1308
1309	if (rt_entity_is_task(rt_se))
1310	p = rt_task_of(rt_se);
1311
1312	if ((flags & DEQUEUE_SLEEP) && p) {
1313	unsigned int state;
1314
1315	state = READ_ONCE(p->__state);
1316	if (state & TASK_INTERRUPTIBLE)
1317	__schedstat_set(p->stats.sleep_start,
1318	rq_clock(rq_of_rt_rq(rt_rq)));
1319
1320	if (state & TASK_UNINTERRUPTIBLE)
1321	__schedstat_set(p->stats.block_start,
1322	rq_clock(rq_of_rt_rq(rt_rq)));
1323	}
1324	}
1325
1326	static void __enqueue_rt_entity(struct sched_rt_entity rt_se, unsigned* int flags)
1327	{
1328	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1329	struct rt_prio_array *array = &rt_rq->active;
1330	struct rt_rq *group_rq = group_rt_rq(rt_se);
1331	struct list_head *queue = array->queue + rt_se_prio(rt_se);
1332
1333	/*
1334	* Don't enqueue the group if its throttled, or when empty.
1335	* The latter is a consequence of the former when a child group
1336	* get throttled and the current group doesn't have any other
1337	* active members.
1338	*/
1339	if (group_rq && (rt_rq_throttled(rt_rq: group_rq) \|\| !group_rq->rt_nr_running)) {
1340	if (rt_se->on_list)
1341	__delist_rt_entity(rt_se, array);
1342	return;
1343	}
1344
1345	if (move_entity(flags)) {
1346	WARN_ON_ONCE(rt_se->on_list);
1347	if (flags & ENQUEUE_HEAD)
1348	list_add(new: &rt_se->run_list, head: queue);
1349	else
1350	list_add_tail(new: &rt_se->run_list, head: queue);
1351
1352	__set_bit(rt_se_prio(rt_se), array->bitmap);
1353	rt_se->on_list = `1`;
1354	}
1355	rt_se->on_rq = `1`;
1356
1357	inc_rt_tasks(rt_se, rt_rq);
1358	}
1359
1360	static void __dequeue_rt_entity(struct sched_rt_entity rt_se, unsigned* int flags)
1361	{
1362	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1363	struct rt_prio_array *array = &rt_rq->active;
1364
1365	if (move_entity(flags)) {
1366	WARN_ON_ONCE(!rt_se->on_list);
1367	__delist_rt_entity(rt_se, array);
1368	}
1369	rt_se->on_rq = `0`;
1370
1371	dec_rt_tasks(rt_se, rt_rq);
1372	}
1373
1374	/*
1375	* Because the prio of an upper entry depends on the lower
1376	* entries, we must remove entries top - down.
1377	*/
1378	static void dequeue_rt_stack(struct sched_rt_entity rt_se, unsigned* int flags)
1379	{
1380	struct sched_rt_entity *back = NULL;
1381	unsigned int rt_nr_running;
1382
1383	for_each_sched_rt_entity(rt_se) {
1384	rt_se->back = back;
1385	back = rt_se;
1386	}
1387
1388	rt_nr_running = rt_rq_of_se(rt_se: back)->rt_nr_running;
1389
1390	for (rt_se = back; rt_se; rt_se = rt_se->back) {
1391	if (on_rt_rq(rt_se))
1392	__dequeue_rt_entity(rt_se, flags);
1393	}
1394
1395	dequeue_top_rt_rq(rt_rq: rt_rq_of_se(rt_se: back), count: rt_nr_running);
1396	}
1397
1398	static void enqueue_rt_entity(struct sched_rt_entity rt_se, unsigned* int flags)
1399	{
1400	struct rq *rq = rq_of_rt_se(rt_se);
1401
1402	update_stats_enqueue_rt(rt_rq: rt_rq_of_se(rt_se), rt_se, flags);
1403
1404	dequeue_rt_stack(rt_se, flags);
1405	for_each_sched_rt_entity(rt_se)
1406	__enqueue_rt_entity(rt_se, flags);
1407	enqueue_top_rt_rq(rt_rq: &rq->rt);
1408	}
1409
1410	static void dequeue_rt_entity(struct sched_rt_entity rt_se, unsigned* int flags)
1411	{
1412	struct rq *rq = rq_of_rt_se(rt_se);
1413
1414	update_stats_dequeue_rt(rt_rq: rt_rq_of_se(rt_se), rt_se, flags);
1415
1416	dequeue_rt_stack(rt_se, flags);
1417
1418	for_each_sched_rt_entity(rt_se) {
1419	struct rt_rq *rt_rq = group_rt_rq(rt_se);
1420
1421	if (rt_rq && rt_rq->rt_nr_running)
1422	__enqueue_rt_entity(rt_se, flags);
1423	}
1424	enqueue_top_rt_rq(rt_rq: &rq->rt);
1425	}
1426
1427	/*
1428	* Adding/removing a task to/from a priority array:
1429	*/
1430	static void
1431	enqueue_task_rt(struct rq rq, struct* task_struct p, int* flags)
1432	{
1433	struct sched_rt_entity *rt_se = &p->rt;
1434
1435	if (flags & ENQUEUE_WAKEUP)
1436	rt_se->timeout = `0`;
1437
1438	check_schedstat_required();
1439	update_stats_wait_start_rt(rt_rq: rt_rq_of_se(rt_se), rt_se);
1440
1441	enqueue_rt_entity(rt_se, flags);
1442
1443	if (task_is_blocked(p))
1444	return;
1445
1446	if (!task_current(rq, p) && p->nr_cpus_allowed > `1`)
1447	enqueue_pushable_task(rq, p);
1448	}
1449
1450	static bool dequeue_task_rt(struct rq rq, struct* task_struct p, int* flags)
1451	{
1452	struct sched_rt_entity *rt_se = &p->rt;
1453
1454	update_curr_rt(rq);
1455	dequeue_rt_entity(rt_se, flags);
1456
1457	dequeue_pushable_task(rq, p);
1458
1459	return true;
1460	}
1461
1462	/*
1463	* Put task to the head or the end of the run list without the overhead of
1464	* dequeue followed by enqueue.
1465	*/
1466	static void
1467	requeue_rt_entity(struct rt_rq rt_rq, struct* sched_rt_entity rt_se, int* head)
1468	{
1469	if (on_rt_rq(rt_se)) {
1470	struct rt_prio_array *array = &rt_rq->active;
1471	struct list_head *queue = array->queue + rt_se_prio(rt_se);
1472
1473	if (head)
1474	list_move(list: &rt_se->run_list, head: queue);
1475	else
1476	list_move_tail(list: &rt_se->run_list, head: queue);
1477	}
1478	}
1479
1480	static void requeue_task_rt(struct rq rq, struct* task_struct p, int* head)
1481	{
1482	struct sched_rt_entity *rt_se = &p->rt;
1483	struct rt_rq *rt_rq;
1484
1485	for_each_sched_rt_entity(rt_se) {
1486	rt_rq = rt_rq_of_se(rt_se);
1487	requeue_rt_entity(rt_rq, rt_se, head);
1488	}
1489	}
1490
1491	static void yield_task_rt(struct rq *rq)
1492	{
1493	requeue_task_rt(rq, p: rq->curr, head: `0`);
1494	}
1495
1496	static int find_lowest_rq(struct task_struct *task);
1497
1498	static int
1499	select_task_rq_rt(struct task_struct p, int* cpu, int flags)
1500	{
1501	struct task_struct curr, donor;
1502	struct rq *rq;
1503	bool test;
1504
1505	/ For anything but wake ups, just return the task_cpu /
1506	if (!(flags & (WF_TTWU \| WF_FORK)))
1507	goto out;
1508
1509	rq = cpu_rq(cpu);
1510
1511	rcu_read_lock();
1512	curr = READ_ONCE(rq->curr); / unlocked access /
1513	donor = READ_ONCE(rq->donor);
1514
1515	/*
1516	* If the current task on @p's runqueue is an RT task, then
1517	* try to see if we can wake this RT task up on another
1518	* runqueue. Otherwise simply start this RT task
1519	* on its current runqueue.
1520	*
1521	* We want to avoid overloading runqueues. If the woken
1522	* task is a higher priority, then it will stay on this CPU
1523	* and the lower prio task should be moved to another CPU.
1524	* Even though this will probably make the lower prio task
1525	* lose its cache, we do not want to bounce a higher task
1526	* around just because it gave up its CPU, perhaps for a
1527	* lock?
1528	*
1529	* For equal prio tasks, we just let the scheduler sort it out.
1530	*
1531	* Otherwise, just let it ride on the affine RQ and the
1532	* post-schedule router will push the preempted task away
1533	*
1534	* This test is optimistic, if we get it wrong the load-balancer
1535	* will have to sort it out.
1536	*
1537	* We take into account the capacity of the CPU to ensure it fits the
1538	* requirement of the task - which is only important on heterogeneous
1539	* systems like big.LITTLE.
1540	*/
1541	test = curr &&
1542	unlikely(rt_task(donor)) &&
1543	(curr->nr_cpus_allowed < `2` \|\| donor->prio <= p->prio);
1544
1545	if (test \|\| !rt_task_fits_capacity(p, cpu)) {
1546	int target = find_lowest_rq(task: p);
1547
1548	/*
1549	* Bail out if we were forcing a migration to find a better
1550	* fitting CPU but our search failed.
1551	*/
1552	if (!test && target != -`1` && !rt_task_fits_capacity(p, cpu: target))
1553	goto out_unlock;
1554
1555	/*
1556	* Don't bother moving it if the destination CPU is
1557	* not running a lower priority task.
1558	*/
1559	if (target != -`1` &&
1560	p->prio < cpu_rq(target)->rt.highest_prio.curr)
1561	cpu = target;
1562	}
1563
1564	out_unlock:
1565	rcu_read_unlock();
1566
1567	out:
1568	return cpu;
1569	}
1570
1571	static void check_preempt_equal_prio(struct rq rq, struct* task_struct *p)
1572	{
1573	if (rq->curr->nr_cpus_allowed == `1` \|\|
1574	!cpupri_find(cp: &rq->rd->cpupri, p: rq->donor, NULL))
1575	return;
1576
1577	/*
1578	* p is migratable, so let's not schedule it and
1579	* see if it is pushed or pulled somewhere else.
1580	*/
1581	if (p->nr_cpus_allowed != `1` &&
1582	cpupri_find(cp: &rq->rd->cpupri, p, NULL))
1583	return;
1584
1585	/*
1586	* There appear to be other CPUs that can accept
1587	* the current task but none can run 'p', so lets reschedule
1588	* to try and push the current task away:
1589	*/
1590	requeue_task_rt(rq, p, head: `1`);
1591	resched_curr(rq);
1592	}
1593
1594	static int balance_rt(struct rq rq, struct* task_struct p, struct* rq_flags *rf)
1595	{
1596	if (!on_rt_rq(rt_se: &p->rt) && need_pull_rt_task(rq, prev: p)) {
1597	/*
1598	* This is OK, because current is on_cpu, which avoids it being
1599	* picked for load-balance and preemption/IRQs are still
1600	* disabled avoiding further scheduler activity on it and we've
1601	* not yet started the picking loop.
1602	*/
1603	rq_unpin_lock(rq, rf);
1604	pull_rt_task(rq);
1605	rq_repin_lock(rq, rf);
1606	}
1607
1608	return sched_stop_runnable(rq) \|\| sched_dl_runnable(rq) \|\| sched_rt_runnable(rq);
1609	}
1610
1611	/*
1612	* Preempt the current task with a newly woken task if needed:
1613	*/
1614	static void wakeup_preempt_rt(struct rq rq, struct* task_struct p, int* flags)
1615	{
1616	struct task_struct *donor = rq->donor;
1617
1618	if (p->prio < donor->prio) {
1619	resched_curr(rq);
1620	return;
1621	}
1622
1623	/*
1624	* If:
1625	*
1626	* - the newly woken task is of equal priority to the current task
1627	* - the newly woken task is non-migratable while current is migratable
1628	* - current will be preempted on the next reschedule
1629	*
1630	* we should check to see if current can readily move to a different
1631	* cpu. If so, we will reschedule to allow the push logic to try
1632	* to move current somewhere else, making room for our non-migratable
1633	* task.
1634	*/
1635	if (p->prio == donor->prio && !test_tsk_need_resched(tsk: rq->curr))
1636	check_preempt_equal_prio(rq, p);
1637	}
1638
1639	static inline void set_next_task_rt(struct rq rq, struct* task_struct *p, bool first)
1640	{
1641	struct sched_rt_entity *rt_se = &p->rt;
1642	struct rt_rq *rt_rq = &rq->rt;
1643
1644	p->se.exec_start = rq_clock_task(rq);
1645	if (on_rt_rq(rt_se: &p->rt))
1646	update_stats_wait_end_rt(rt_rq, rt_se);
1647
1648	/ The running task is never eligible for pushing /
1649	dequeue_pushable_task(rq, p);
1650
1651	if (!first)
1652	return;
1653
1654	/*
1655	* If prev task was rt, put_prev_task() has already updated the
1656	* utilization. We only care of the case where we start to schedule a
1657	* rt task
1658	*/
1659	if (rq->donor->sched_class != &rt_sched_class)
1660	update_rt_rq_load_avg(now: rq_clock_pelt(rq), rq, running: `0`);
1661
1662	rt_queue_push_tasks(rq);
1663	}
1664
1665	static struct sched_rt_entity pick_next_rt_entity(struct* rt_rq *rt_rq)
1666	{
1667	struct rt_prio_array *array = &rt_rq->active;
1668	struct sched_rt_entity *next = NULL;
1669	struct list_head *queue;
1670	int idx;
1671
1672	idx = sched_find_first_bit(b: array->bitmap);
1673	BUG_ON(idx >= MAX_RT_PRIO);
1674
1675	queue = array->queue + idx;
1676	if (WARN_ON_ONCE(list_empty(queue)))
1677	return NULL;
1678	next = list_entry(queue->next, struct sched_rt_entity, run_list);
1679
1680	return next;
1681	}
1682
1683	static struct task_struct _pick_next_task_rt(struct* rq *rq)
1684	{
1685	struct sched_rt_entity *rt_se;
1686	struct rt_rq *rt_rq = &rq->rt;
1687
1688	do {
1689	rt_se = pick_next_rt_entity(rt_rq);
1690	if (unlikely(!rt_se))
1691	return NULL;
1692	rt_rq = group_rt_rq(rt_se);
1693	} while (rt_rq);
1694
1695	return rt_task_of(rt_se);
1696	}
1697
1698	static struct task_struct pick_task_rt(struct* rq *rq)
1699	{
1700	struct task_struct *p;
1701
1702	if (!sched_rt_runnable(rq))
1703	return NULL;
1704
1705	p = _pick_next_task_rt(rq);
1706
1707	return p;
1708	}
1709
1710	static void put_prev_task_rt(struct rq rq, struct* task_struct p, struct* task_struct *next)
1711	{
1712	struct sched_rt_entity *rt_se = &p->rt;
1713	struct rt_rq *rt_rq = &rq->rt;
1714
1715	if (on_rt_rq(rt_se: &p->rt))
1716	update_stats_wait_start_rt(rt_rq, rt_se);
1717
1718	update_curr_rt(rq);
1719
1720	update_rt_rq_load_avg(now: rq_clock_pelt(rq), rq, running: `1`);
1721
1722	if (task_is_blocked(p))
1723	return;
1724	/*
1725	* The previous task needs to be made eligible for pushing
1726	* if it is still active
1727	*/
1728	if (on_rt_rq(rt_se: &p->rt) && p->nr_cpus_allowed > `1`)
1729	enqueue_pushable_task(rq, p);
1730	}
1731
/* Only try algorithms three times */
#define RT_MAX_TRIES 3
1734
1735	/*
1736	* Return the highest pushable rq's task, which is suitable to be executed
1737	* on the CPU, NULL otherwise
1738	*/
1739	static struct task_struct pick_highest_pushable_task(struct* rq rq, int* cpu)
1740	{
1741	struct plist_head *head = &rq->rt.pushable_tasks;
1742	struct task_struct *p;
1743
1744	if (!has_pushable_tasks(rq))
1745	return NULL;
1746
1747	plist_for_each_entry(p, head, pushable_tasks) {
1748	if (task_is_pushable(rq, p, cpu))
1749	return p;
1750	}
1751
1752	return NULL;
1753	}
1754
1755	static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1756
1757	static int find_lowest_rq(struct task_struct *task)
1758	{
1759	struct sched_domain *sd;
1760	struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1761	int this_cpu = smp_processor_id();
1762	int cpu = task_cpu(p: task);
1763	int ret;
1764
1765	/ Make sure the mask is initialized first /
1766	if (unlikely(!lowest_mask))
1767	return -`1`;
1768
1769	if (task->nr_cpus_allowed == `1`)
1770	return -`1`; / No other targets possible /
1771
1772	/*
1773	* If we're on asym system ensure we consider the different capacities
1774	* of the CPUs when searching for the lowest_mask.
1775	*/
1776	if (sched_asym_cpucap_active()) {
1777
1778	ret = cpupri_find_fitness(cp: &task_rq(task)->rd->cpupri,
1779	p: task, lowest_mask,
1780	fitness_fn: rt_task_fits_capacity);
1781	} else {
1782
1783	ret = cpupri_find(cp: &task_rq(task)->rd->cpupri,
1784	p: task, lowest_mask);
1785	}
1786
1787	if (!ret)
1788	return -`1`; / No targets found /
1789
1790	/*
1791	* At this point we have built a mask of CPUs representing the
1792	* lowest priority tasks in the system. Now we want to elect
1793	* the best one based on our affinity and topology.
1794	*
1795	* We prioritize the last CPU that the task executed on since
1796	* it is most likely cache-hot in that location.
1797	*/
1798	if (cpumask_test_cpu(cpu, cpumask: lowest_mask))
1799	return cpu;
1800
1801	/*
1802	* Otherwise, we consult the sched_domains span maps to figure
1803	* out which CPU is logically closest to our hot cache data.
1804	*/
1805	if (!cpumask_test_cpu(cpu: this_cpu, cpumask: lowest_mask))
1806	this_cpu = -`1`; / Skip this_cpu opt if not among lowest /
1807
1808	rcu_read_lock();
1809	for_each_domain(cpu, sd) {
1810	if (sd->flags & SD_WAKE_AFFINE) {
1811	int best_cpu;
1812
1813	/*
1814	* "this_cpu" is cheaper to preempt than a
1815	* remote processor.
1816	*/
1817	if (this_cpu != -`1` &&
1818	cpumask_test_cpu(cpu: this_cpu, cpumask: sched_domain_span(sd))) {
1819	rcu_read_unlock();
1820	return this_cpu;
1821	}
1822
1823	best_cpu = cpumask_any_and_distribute(src1p: lowest_mask,
1824	src2p: sched_domain_span(sd));
1825	if (best_cpu < nr_cpu_ids) {
1826	rcu_read_unlock();
1827	return best_cpu;
1828	}
1829	}
1830	}
1831	rcu_read_unlock();
1832
1833	/*
1834	* And finally, if there were no matches within the domains
1835	* just give the caller something to work with from the compatible
1836	* locations.
1837	*/
1838	if (this_cpu != -`1`)
1839	return this_cpu;
1840
1841	cpu = cpumask_any_distribute(srcp: lowest_mask);
1842	if (cpu < nr_cpu_ids)
1843	return cpu;
1844
1845	return -`1`;
1846	}
1847
1848	static struct task_struct pick_next_pushable_task(struct* rq *rq)
1849	{
1850	struct task_struct *p;
1851
1852	if (!has_pushable_tasks(rq))
1853	return NULL;
1854
1855	p = plist_first_entry(&rq->rt.pushable_tasks,
1856	struct task_struct, pushable_tasks);
1857
1858	BUG_ON(rq->cpu != task_cpu(p));
1859	BUG_ON(task_current(rq, p));
1860	BUG_ON(task_current_donor(rq, p));
1861	BUG_ON(p->nr_cpus_allowed <= `1`);
1862
1863	BUG_ON(!task_on_rq_queued(p));
1864	BUG_ON(!rt_task(p));
1865
1866	return p;
1867	}
1868
1869	/ Will lock the rq it finds /
1870	static struct rq find_lock_lowest_rq(struct* task_struct task, struct* rq *rq)
1871	{
1872	struct rq *lowest_rq = NULL;
1873	int tries;
1874	int cpu;
1875
1876	for (tries = `0`; tries < RT_MAX_TRIES; tries++) {
1877	cpu = find_lowest_rq(task);
1878
1879	if ((cpu == -`1`) \|\| (cpu == rq->cpu))
1880	break;
1881
1882	lowest_rq = cpu_rq(cpu);
1883
1884	if (lowest_rq->rt.highest_prio.curr <= task->prio) {
1885	/*
1886	* Target rq has tasks of equal or higher priority,
1887	* retrying does not release any lock and is unlikely
1888	* to yield a different result.
1889	*/
1890	lowest_rq = NULL;
1891	break;
1892	}
1893
1894	/ if the prio of this runqueue changed, try again /
1895	if (double_lock_balance(this_rq: rq, busiest: lowest_rq)) {
1896	/*
1897	* We had to unlock the run queue. In
1898	* the mean time, task could have
1899	* migrated already or had its affinity changed,
1900	* therefore check if the task is still at the
1901	* head of the pushable tasks list.
1902	* It is possible the task was scheduled, set
1903	* "migrate_disabled" and then got preempted, so we must
1904	* check the task migration disable flag here too.
1905	*/
1906	if (unlikely(is_migration_disabled(task) \|\|
1907	!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) \|\|
1908	task != pick_next_pushable_task(rq))) {
1909
1910	double_unlock_balance(this_rq: rq, busiest: lowest_rq);
1911	lowest_rq = NULL;
1912	break;
1913	}
1914	}
1915
1916	/ If this rq is still suitable use it. /
1917	if (lowest_rq->rt.highest_prio.curr > task->prio)
1918	break;
1919
1920	/ try again /
1921	double_unlock_balance(this_rq: rq, busiest: lowest_rq);
1922	lowest_rq = NULL;
1923	}
1924
1925	return lowest_rq;
1926	}
1927
1928	/*
1929	* If the current CPU has more than one RT task, see if the non
1930	* running task can migrate over to a CPU that is running a task
1931	* of lesser priority.
1932	*/
1933	static int push_rt_task(struct rq *rq, bool pull)
1934	{
1935	struct task_struct *next_task;
1936	struct rq *lowest_rq;
1937	int ret = `0`;
1938
1939	if (!rq->rt.overloaded)
1940	return `0`;
1941
1942	next_task = pick_next_pushable_task(rq);
1943	if (!next_task)
1944	return `0`;
1945
1946	retry:
1947	/*
1948	* It's possible that the next_task slipped in of
1949	* higher priority than current. If that's the case
1950	* just reschedule current.
1951	*/
1952	if (unlikely(next_task->prio < rq->donor->prio)) {
1953	resched_curr(rq);
1954	return `0`;
1955	}
1956
1957	if (is_migration_disabled(p: next_task)) {
1958	struct task_struct *push_task = NULL;
1959	int cpu;
1960
1961	if (!pull \|\| rq->push_busy)
1962	return `0`;
1963
1964	/*
1965	* Invoking find_lowest_rq() on anything but an RT task doesn't
1966	* make sense. Per the above priority check, curr has to
1967	* be of higher priority than next_task, so no need to
1968	* reschedule when bailing out.
1969	*
1970	* Note that the stoppers are masqueraded as SCHED_FIFO
1971	* (cf. sched_set_stop_task()), so we can't rely on rt_task().
1972	*/
1973	if (rq->donor->sched_class != &rt_sched_class)
1974	return `0`;
1975
1976	cpu = find_lowest_rq(task: rq->curr);
1977	if (cpu == -`1` \|\| cpu == rq->cpu)
1978	return `0`;
1979
1980	/*
1981	* Given we found a CPU with lower priority than @next_task,
1982	* therefore it should be running. However we cannot migrate it
1983	* to this other CPU, instead attempt to push the current
1984	* running task on this CPU away.
1985	*/
1986	push_task = get_push_task(rq);
1987	if (push_task) {
1988	preempt_disable();
1989	raw_spin_rq_unlock(rq);
1990	stop_one_cpu_nowait(cpu: rq->cpu, fn: push_cpu_stop,
1991	arg: push_task, work_buf: &rq->push_work);
1992	preempt_enable();
1993	raw_spin_rq_lock(rq);
1994	}
1995
1996	return `0`;
1997	}
1998
1999	if (WARN_ON(next_task == rq->curr))
2000	return `0`;
2001
2002	/ We might release rq lock /
2003	get_task_struct(t: next_task);
2004
2005	/ find_lock_lowest_rq locks the rq if found /
2006	lowest_rq = find_lock_lowest_rq(task: next_task, rq);
2007	if (!lowest_rq) {
2008	struct task_struct *task;
2009	/*
2010	* find_lock_lowest_rq releases rq->lock
2011	* so it is possible that next_task has migrated.
2012	*
2013	* We need to make sure that the task is still on the same
2014	* run-queue and is also still the next task eligible for
2015	* pushing.
2016	*/
2017	task = pick_next_pushable_task(rq);
2018	if (task == next_task) {
2019	/*
2020	* The task hasn't migrated, and is still the next
2021	* eligible task, but we failed to find a run-queue
2022	* to push it to. Do not retry in this case, since
2023	* other CPUs will pull from us when ready.
2024	*/
2025	goto out;
2026	}
2027
2028	if (!task)
2029	/ No more tasks, just exit /
2030	goto out;
2031
2032	/*
2033	* Something has shifted, try again.
2034	*/
2035	put_task_struct(t: next_task);
2036	next_task = task;
2037	goto retry;
2038	}
2039
2040	move_queued_task_locked(src_rq: rq, dst_rq: lowest_rq, task: next_task);
2041	resched_curr(rq: lowest_rq);
2042	ret = `1`;
2043
2044	double_unlock_balance(this_rq: rq, busiest: lowest_rq);
2045	out:
2046	put_task_struct(t: next_task);
2047
2048	return ret;
2049	}
2050
2051	static void push_rt_tasks(struct rq *rq)
2052	{
2053	/ push_rt_task will return true if it moved an RT /
2054	while (push_rt_task(rq, pull: false))
2055	;
2056	}
2057
2058	#ifdef HAVE_RT_PUSH_IPI
2059
2060	/*
2061	* When a high priority task schedules out from a CPU and a lower priority
2062	* task is scheduled in, a check is made to see if there's any RT tasks
2063	* on other CPUs that are waiting to run because a higher priority RT task
2064	* is currently running on its CPU. In this case, the CPU with multiple RT
2065	* tasks queued on it (overloaded) needs to be notified that a CPU has opened
2066	* up that may be able to run one of its non-running queued RT tasks.
2067	*
2068	* All CPUs with overloaded RT tasks need to be notified as there is currently
2069	* no way to know which of these CPUs have the highest priority task waiting
2070	* to run. Instead of trying to take a spinlock on each of these CPUs,
2071	* which has shown to cause large latency when done on machines with many
2072	* CPUs, sending an IPI to the CPUs to have them push off the overloaded
2073	* RT tasks waiting to run.
2074	*
2075	* Just sending an IPI to each of the CPUs is also an issue, as on large
2076	* count CPU machines, this can cause an IPI storm on a CPU, especially
2077	* if its the only CPU with multiple RT tasks queued, and a large number
2078	* of CPUs scheduling a lower priority task at the same time.
2079	*
2080	* Each root domain has its own IRQ work function that can iterate over
2081	* all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
2082	* task must be checked if there's one or many CPUs that are lowering
2083	* their priority, there's a single IRQ work iterator that will try to
2084	* push off RT tasks that are waiting to run.
2085	*
2086	* When a CPU schedules a lower priority task, it will kick off the
2087	* IRQ work iterator that will jump to each CPU with overloaded RT tasks.
2088	* As it only takes the first CPU that schedules a lower priority task
2089	* to start the process, the rto_start variable is incremented and if
2090	* the atomic result is one, then that CPU will try to take the rto_lock.
2091	* This prevents high contention on the lock as the process handles all
2092	* CPUs scheduling lower priority tasks.
2093	*
2094	* All CPUs that are scheduling a lower priority task will increment the
2095	* rt_loop_next variable. This will make sure that the IRQ work iterator
2096	* checks all RT overloaded CPUs whenever a CPU schedules a new lower
2097	* priority task, even if the iterator is in the middle of a scan. Incrementing
2098	* the rt_loop_next will cause the iterator to perform another scan.
2099	*
2100	*/
2101	static int rto_next_cpu(struct root_domain *rd)
2102	{
2103	int next;
2104	int cpu;
2105
2106	/*
2107	* When starting the IPI RT pushing, the rto_cpu is set to -1,
2108	* rt_next_cpu() will simply return the first CPU found in
2109	* the rto_mask.
2110	*
2111	* If rto_next_cpu() is called with rto_cpu is a valid CPU, it
2112	* will return the next CPU found in the rto_mask.
2113	*
2114	* If there are no more CPUs left in the rto_mask, then a check is made
2115	* against rto_loop and rto_loop_next. rto_loop is only updated with
2116	* the rto_lock held, but any CPU may increment the rto_loop_next
2117	* without any locking.
2118	*/
2119	for (;;) {
2120
2121	/ When rto_cpu is -1 this acts like cpumask_first() /
2122	cpu = cpumask_next(n: rd->rto_cpu, srcp: rd->rto_mask);
2123
2124	rd->rto_cpu = cpu;
2125
2126	if (cpu < nr_cpu_ids)
2127	return cpu;
2128
2129	rd->rto_cpu = -`1`;
2130
2131	/*
2132	* ACQUIRE ensures we see the @rto_mask changes
2133	* made prior to the @next value observed.
2134	*
2135	* Matches WMB in rt_set_overload().
2136	*/
2137	next = atomic_read_acquire(v: &rd->rto_loop_next);
2138
2139	if (rd->rto_loop == next)
2140	break;
2141
2142	rd->rto_loop = next;
2143	}
2144
2145	return -`1`;
2146	}
2147
2148	static inline bool rto_start_trylock(atomic_t *v)
2149	{
2150	return !atomic_cmpxchg_acquire(v, old: `0`, new: `1`);
2151	}
2152
2153	static inline void rto_start_unlock(atomic_t *v)
2154	{
2155	atomic_set_release(v, i: `0`);
2156	}
2157
2158	static void tell_cpu_to_push(struct rq *rq)
2159	{
2160	int cpu = -`1`;
2161
2162	/ Keep the loop going if the IPI is currently active /
2163	atomic_inc(v: &rq->rd->rto_loop_next);
2164
2165	/ Only one CPU can initiate a loop at a time /
2166	if (!rto_start_trylock(v: &rq->rd->rto_loop_start))
2167	return;
2168
2169	raw_spin_lock(&rq->rd->rto_lock);
2170
2171	/*
2172	* The rto_cpu is updated under the lock, if it has a valid CPU
2173	* then the IPI is still running and will continue due to the
2174	* update to loop_next, and nothing needs to be done here.
2175	* Otherwise it is finishing up and an IPI needs to be sent.
2176	*/
2177	if (rq->rd->rto_cpu < `0`)
2178	cpu = rto_next_cpu(rd: rq->rd);
2179
2180	raw_spin_unlock(&rq->rd->rto_lock);
2181
2182	rto_start_unlock(v: &rq->rd->rto_loop_start);
2183
2184	if (cpu >= `0`) {
2185	/ Make sure the rd does not get freed while pushing /
2186	sched_get_rd(rd: rq->rd);
2187	irq_work_queue_on(work: &rq->rd->rto_push_work, cpu);
2188	}
2189	}
2190
2191	/ Called from hardirq context /
2192	void rto_push_irq_work_func(struct irq_work *work)
2193	{
2194	struct root_domain *rd =
2195	container_of(work, struct root_domain, rto_push_work);
2196	struct rq *rq;
2197	int cpu;
2198
2199	rq = this_rq();
2200
2201	/*
2202	* We do not need to grab the lock to check for has_pushable_tasks.
2203	* When it gets updated, a check is made if a push is possible.
2204	*/
2205	if (has_pushable_tasks(rq)) {
2206	raw_spin_rq_lock(rq);
2207	while (push_rt_task(rq, pull: true))
2208	;
2209	raw_spin_rq_unlock(rq);
2210	}
2211
2212	raw_spin_lock(&rd->rto_lock);
2213
2214	/ Pass the IPI to the next rt overloaded queue /
2215	cpu = rto_next_cpu(rd);
2216
2217	raw_spin_unlock(&rd->rto_lock);
2218
2219	if (cpu < `0`) {
2220	sched_put_rd(rd);
2221	return;
2222	}
2223
2224	/ Try the next RT overloaded CPU /
2225	irq_work_queue_on(work: &rd->rto_push_work, cpu);
2226	}
2227	#endif /* HAVE_RT_PUSH_IPI */
2228
2229	static void pull_rt_task(struct rq *this_rq)
2230	{
2231	int this_cpu = this_rq->cpu, cpu;
2232	bool resched = false;
2233	struct task_struct p, push_task;
2234	struct rq *src_rq;
2235	int rt_overload_count = rt_overloaded(rq: this_rq);
2236
2237	if (likely(!rt_overload_count))
2238	return;
2239
2240	/*
2241	* Match the barrier from rt_set_overloaded; this guarantees that if we
2242	* see overloaded we must also see the rto_mask bit.
2243	*/
2244	smp_rmb();
2245
2246	/ If we are the only overloaded CPU do nothing /
2247	if (rt_overload_count == `1` &&
2248	cpumask_test_cpu(cpu: this_rq->cpu, cpumask: this_rq->rd->rto_mask))
2249	return;
2250
2251	#ifdef HAVE_RT_PUSH_IPI
2252	if (sched_feat(RT_PUSH_IPI)) {
2253	tell_cpu_to_push(rq: this_rq);
2254	return;
2255	}
2256	#endif
2257
2258	for_each_cpu(cpu, this_rq->rd->rto_mask) {
2259	if (this_cpu == cpu)
2260	continue;
2261
2262	src_rq = cpu_rq(cpu);
2263
2264	/*
2265	* Don't bother taking the src_rq->lock if the next highest
2266	* task is known to be lower-priority than our current task.
2267	* This may look racy, but if this value is about to go
2268	* logically higher, the src_rq will push this task away.
2269	* And if its going logically lower, we do not care
2270	*/
2271	if (src_rq->rt.highest_prio.next >=
2272	this_rq->rt.highest_prio.curr)
2273	continue;
2274
2275	/*
2276	* We can potentially drop this_rq's lock in
2277	* double_lock_balance, and another CPU could
2278	* alter this_rq
2279	*/
2280	push_task = NULL;
2281	double_lock_balance(this_rq, busiest: src_rq);
2282
2283	/*
2284	* We can pull only a task, which is pushable
2285	* on its rq, and no others.
2286	*/
2287	p = pick_highest_pushable_task(rq: src_rq, cpu: this_cpu);
2288
2289	/*
2290	* Do we have an RT task that preempts
2291	* the to-be-scheduled task?
2292	*/
2293	if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2294	WARN_ON(p == src_rq->curr);
2295	WARN_ON(!task_on_rq_queued(p));
2296
2297	/*
2298	* There's a chance that p is higher in priority
2299	* than what's currently running on its CPU.
2300	* This is just that p is waking up and hasn't
2301	* had a chance to schedule. We only pull
2302	* p if it is lower in priority than the
2303	* current task on the run queue
2304	*/
2305	if (p->prio < src_rq->donor->prio)
2306	goto skip;
2307
2308	if (is_migration_disabled(p)) {
2309	push_task = get_push_task(rq: src_rq);
2310	} else {
2311	move_queued_task_locked(src_rq, dst_rq: this_rq, task: p);
2312	resched = true;
2313	}
2314	/*
2315	* We continue with the search, just in
2316	* case there's an even higher prio task
2317	* in another runqueue. (low likelihood
2318	* but possible)
2319	*/
2320	}
2321	skip:
2322	double_unlock_balance(this_rq, busiest: src_rq);
2323
2324	if (push_task) {
2325	preempt_disable();
2326	raw_spin_rq_unlock(rq: this_rq);
2327	stop_one_cpu_nowait(cpu: src_rq->cpu, fn: push_cpu_stop,
2328	arg: push_task, work_buf: &src_rq->push_work);
2329	preempt_enable();
2330	raw_spin_rq_lock(rq: this_rq);
2331	}
2332	}
2333
2334	if (resched)
2335	resched_curr(rq: this_rq);
2336	}
2337
2338	/*
2339	* If we are not running and we are not going to reschedule soon, we should
2340	* try to push tasks away now
2341	*/
2342	static void task_woken_rt(struct rq rq, struct* task_struct *p)
2343	{
2344	bool need_to_push = !task_on_cpu(rq, p) &&
2345	!test_tsk_need_resched(tsk: rq->curr) &&
2346	p->nr_cpus_allowed > `1` &&
2347	(dl_task(p: rq->donor) \|\| rt_task(p: rq->donor)) &&
2348	(rq->curr->nr_cpus_allowed < `2` \|\|
2349	rq->donor->prio <= p->prio);
2350
2351	if (need_to_push)
2352	push_rt_tasks(rq);
2353	}
2354
2355	/ Assumes rq->lock is held /
2356	static void rq_online_rt(struct rq *rq)
2357	{
2358	if (rq->rt.overloaded)
2359	rt_set_overload(rq);
2360
2361	__enable_runtime(rq);
2362
2363	cpupri_set(cp: &rq->rd->cpupri, cpu: rq->cpu, pri: rq->rt.highest_prio.curr);
2364	}
2365
2366	/ Assumes rq->lock is held /
2367	static void rq_offline_rt(struct rq *rq)
2368	{
2369	if (rq->rt.overloaded)
2370	rt_clear_overload(rq);
2371
2372	__disable_runtime(rq);
2373
2374	cpupri_set(cp: &rq->rd->cpupri, cpu: rq->cpu, CPUPRI_INVALID);
2375	}
2376
2377	/*
2378	* When switch from the rt queue, we bring ourselves to a position
2379	* that we might want to pull RT tasks from other runqueues.
2380	*/
2381	static void switched_from_rt(struct rq rq, struct* task_struct *p)
2382	{
2383	/*
2384	* If there are other RT tasks then we will reschedule
2385	* and the scheduling of the other RT tasks will handle
2386	* the balancing. But if we are the last RT task
2387	* we may need to handle the pulling of RT tasks
2388	* now.
2389	*/
2390	if (!task_on_rq_queued(p) \|\| rq->rt.rt_nr_running)
2391	return;
2392
2393	rt_queue_pull_task(rq);
2394	}
2395
2396	void __init init_sched_rt_class(void)
2397	{
2398	unsigned int i;
2399
2400	for_each_possible_cpu(i) {
2401	zalloc_cpumask_var_node(mask: &per_cpu(local_cpu_mask, i),
2402	GFP_KERNEL, node: cpu_to_node(cpu: i));
2403	}
2404	}
2405
2406	/*
2407	* When switching a task to RT, we may overload the runqueue
2408	* with RT tasks. In this case we try to push them off to
2409	* other runqueues.
2410	*/
2411	static void switched_to_rt(struct rq rq, struct* task_struct *p)
2412	{
2413	/*
2414	* If we are running, update the avg_rt tracking, as the running time
2415	* will now on be accounted into the latter.
2416	*/
2417	if (task_current(rq, p)) {
2418	update_rt_rq_load_avg(now: rq_clock_pelt(rq), rq, running: `0`);
2419	return;
2420	}
2421
2422	/*
2423	* If we are not running we may need to preempt the current
2424	* running task. If that current running task is also an RT task
2425	* then see if we can move to another run queue.
2426	*/
2427	if (task_on_rq_queued(p)) {
2428	if (p->nr_cpus_allowed > `1` && rq->rt.overloaded)
2429	rt_queue_push_tasks(rq);
2430	if (p->prio < rq->donor->prio && cpu_online(cpu: cpu_of(rq)))
2431	resched_curr(rq);
2432	}
2433	}
2434
2435	/*
2436	* Priority of the task has changed. This may cause
2437	* us to initiate a push or pull.
2438	*/
2439	static void
2440	prio_changed_rt(struct rq rq, struct* task_struct p, int* oldprio)
2441	{
2442	if (!task_on_rq_queued(p))
2443	return;
2444
2445	if (task_current_donor(rq, p)) {
2446	/*
2447	* If our priority decreases while running, we
2448	* may need to pull tasks to this runqueue.
2449	*/
2450	if (oldprio < p->prio)
2451	rt_queue_pull_task(rq);
2452
2453	/*
2454	* If there's a higher priority task waiting to run
2455	* then reschedule.
2456	*/
2457	if (p->prio > rq->rt.highest_prio.curr)
2458	resched_curr(rq);
2459	} else {
2460	/*
2461	* This task is not running, but if it is
2462	* greater than the current running task
2463	* then reschedule.
2464	*/
2465	if (p->prio < rq->donor->prio)
2466	resched_curr(rq);
2467	}
2468	}
2469
#ifdef CONFIG_POSIX_TIMERS
/*
 * RLIMIT_RTTIME watchdog, invoked from task_tick_rt().
 *
 * When the task has a finite soft limit, p->rt.timeout is bumped at most
 * once per jiffy (tracked through p->rt.watchdog_stamp). Once it exceeds
 * min(soft, hard) divided by USEC_PER_SEC/HZ, rounded up, the POSIX CPU
 * timer code is notified through posix_cputimers_rt_watchdog().
 * The division suggests the limit is expressed in microseconds - confirm
 * against the RLIMIT_RTTIME definition.
 *
 * @rq is not used by this function.
 */
static void watchdog(struct rq *rq, struct task_struct *p)
{
	unsigned long soft, hard;

	/* max may change after cur was read, this will be fixed next tick */
	soft = task_rlimit(p, RLIMIT_RTTIME);
	hard = task_rlimit_max(p, RLIMIT_RTTIME);

	if (soft != RLIM_INFINITY) {
		unsigned long next;

		/* Count each jiffy only once, however often we are called. */
		if (p->rt.watchdog_stamp != jiffies) {
			p->rt.timeout++;
			p->rt.watchdog_stamp = jiffies;
		}

		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
		if (p->rt.timeout > next) {
			posix_cputimers_rt_watchdog(&p->posix_cputimers,
						    p->se.sum_exec_runtime);
		}
	}
}
#else /* !CONFIG_POSIX_TIMERS: */
/* Without POSIX timers there is nothing to watch; the hook is a no-op. */
static inline void watchdog(struct rq *rq, struct task_struct *p) { }
#endif /* !CONFIG_POSIX_TIMERS */
2497
2498	/*
2499	* scheduler tick hitting a task of our scheduling class.
2500	*
2501	* NOTE: This function can be called remotely by the tick offload that
2502	* goes along full dynticks. Therefore no local assumption can be made
2503	* and everything must be accessed through the @rq and @curr passed in
2504	* parameters.
2505	*/
2506	static void task_tick_rt(struct rq rq, struct* task_struct p, int* queued)
2507	{
2508	struct sched_rt_entity *rt_se = &p->rt;
2509
2510	update_curr_rt(rq);
2511	update_rt_rq_load_avg(now: rq_clock_pelt(rq), rq, running: `1`);
2512
2513	watchdog(rq, p);
2514
2515	/*
2516	* RR tasks need a special form of time-slice management.
2517	* FIFO tasks have no timeslices.
2518	*/
2519	if (p->policy != SCHED_RR)
2520	return;
2521
2522	if (--p->rt.time_slice)
2523	return;
2524
2525	p->rt.time_slice = sched_rr_timeslice;
2526
2527	/*
2528	* Requeue to the end of queue if we (and all of our ancestors) are not
2529	* the only element on the queue
2530	*/
2531	for_each_sched_rt_entity(rt_se) {
2532	if (rt_se->run_list.prev != rt_se->run_list.next) {
2533	requeue_task_rt(rq, p, head: `0`);
2534	resched_curr(rq);
2535	return;
2536	}
2537	}
2538	}
2539
2540	static unsigned int get_rr_interval_rt(struct rq rq, struct* task_struct *task)
2541	{
2542	/*
2543	* Time slice is 0 for SCHED_FIFO tasks
2544	*/
2545	if (task->policy == SCHED_RR)
2546	return sched_rr_timeslice;
2547	else
2548	return `0`;
2549	}
2550
#ifdef CONFIG_SCHED_CORE
/*
 * Tell whether the rt_rq that @p would use on @cpu is throttled.
 *
 * With CONFIG_RT_GROUP_SCHED the rt_rq is taken from @p's task group;
 * otherwise it is the per-CPU rq's embedded rt_rq. Returns the result of
 * rt_rq_throttled() for that rt_rq.
 */
static int task_is_throttled_rt(struct task_struct *p, int cpu)
{
	struct rt_rq *rt_rq;

#ifdef CONFIG_RT_GROUP_SCHED // XXX maybe add task_rt_rq(), see also sched_rt_period_rt_rq
	rt_rq = task_group(p)->rt_rq[cpu];
	/* With group scheduling disabled only the root group is expected. */
	WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group);
#else
	rt_rq = &cpu_rq(cpu)->rt;
#endif

	return rt_rq_throttled(rt_rq);
}
#endif /* CONFIG_SCHED_CORE */
2566
2567	DEFINE_SCHED_CLASS(rt) = {
2568
2569	.enqueue_task = enqueue_task_rt,
2570	.dequeue_task = dequeue_task_rt,
2571	.yield_task = yield_task_rt,
2572
2573	.wakeup_preempt = wakeup_preempt_rt,
2574
2575	.pick_task = pick_task_rt,
2576	.put_prev_task = put_prev_task_rt,
2577	.set_next_task = set_next_task_rt,
2578
2579	.balance = balance_rt,
2580	.select_task_rq = select_task_rq_rt,
2581	.set_cpus_allowed = set_cpus_allowed_common,
2582	.rq_online = rq_online_rt,
2583	.rq_offline = rq_offline_rt,
2584	.task_woken = task_woken_rt,
2585	.switched_from = switched_from_rt,
2586	.find_lock_rq = find_lock_lowest_rq,
2587
2588	.task_tick = task_tick_rt,
2589
2590	.get_rr_interval = get_rr_interval_rt,
2591
2592	.prio_changed = prio_changed_rt,
2593	.switched_to = switched_to_rt,
2594
2595	.update_curr = update_curr_rt,
2596
2597	#ifdef CONFIG_SCHED_CORE
2598	.task_is_throttled = task_is_throttled_rt,
2599	#endif
2600
2601	#ifdef CONFIG_UCLAMP_TASK
2602	.uclamp_enabled = `1`,
2603	#endif
2604	};
2605
2606	#ifdef CONFIG_RT_GROUP_SCHED
2607	/*
2608	* Ensure that the real time constraints are schedulable.
2609	*/
2610	static DEFINE_MUTEX(rt_constraints_mutex);
2611
2612	static inline int tg_has_rt_tasks(struct task_group *tg)
2613	{
2614	struct task_struct *task;
2615	struct css_task_iter it;
2616	int ret = `0`;
2617
2618	/*
2619	* Autogroups do not have RT tasks; see autogroup_create().
2620	*/
2621	if (task_group_is_autogroup(tg))
2622	return `0`;
2623
2624	css_task_iter_start(&tg->css, `0`, &it);
2625	while (!ret && (task = css_task_iter_next(&it)))
2626	ret \|= rt_task(task);
2627	css_task_iter_end(&it);
2628
2629	return ret;
2630	}
2631
2632	struct rt_schedulable_data {
2633	struct task_group *tg;
2634	u64 rt_period;
2635	u64 rt_runtime;
2636	};
2637
2638	static int tg_rt_schedulable(struct task_group tg, void* *data)
2639	{
2640	struct rt_schedulable_data *d = data;
2641	struct task_group *child;
2642	unsigned long total, sum = `0`;
2643	u64 period, runtime;
2644
2645	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2646	runtime = tg->rt_bandwidth.rt_runtime;
2647
2648	if (tg == d->tg) {
2649	period = d->rt_period;
2650	runtime = d->rt_runtime;
2651	}
2652
2653	/*
2654	* Cannot have more runtime than the period.
2655	*/
2656	if (runtime > period && runtime != RUNTIME_INF)
2657	return -EINVAL;
2658
2659	/*
2660	* Ensure we don't starve existing RT tasks if runtime turns zero.
2661	*/
2662	if (rt_bandwidth_enabled() && !runtime &&
2663	tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
2664	return -EBUSY;
2665
2666	if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group))
2667	return -EBUSY;
2668
2669	total = to_ratio(period, runtime);
2670
2671	/*
2672	* Nobody can have more than the global setting allows.
2673	*/
2674	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
2675	return -EINVAL;
2676
2677	/*
2678	* The sum of our children's runtime should not exceed our own.
2679	*/
2680	list_for_each_entry_rcu(child, &tg->children, siblings) {
2681	period = ktime_to_ns(child->rt_bandwidth.rt_period);
2682	runtime = child->rt_bandwidth.rt_runtime;
2683
2684	if (child == d->tg) {
2685	period = d->rt_period;
2686	runtime = d->rt_runtime;
2687	}
2688
2689	sum += to_ratio(period, runtime);
2690	}
2691
2692	if (sum > total)
2693	return -EINVAL;
2694
2695	return `0`;
2696	}
2697
2698	static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2699	{
2700	int ret;
2701
2702	struct rt_schedulable_data data = {
2703	.tg = tg,
2704	.rt_period = period,
2705	.rt_runtime = runtime,
2706	};
2707
2708	rcu_read_lock();
2709	ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
2710	rcu_read_unlock();
2711
2712	return ret;
2713	}
2714
2715	static int tg_set_rt_bandwidth(struct task_group *tg,
2716	u64 rt_period, u64 rt_runtime)
2717	{
2718	int i, err = `0`;
2719
2720	/*
2721	* Disallowing the root group RT runtime is BAD, it would disallow the
2722	* kernel creating (and or operating) RT threads.
2723	*/
2724	if (tg == &root_task_group && rt_runtime == `0`)
2725	return -EINVAL;
2726
2727	/ No period doesn't make any sense. /
2728	if (rt_period == `0`)
2729	return -EINVAL;
2730
2731	/*
2732	* Bound quota to defend quota against overflow during bandwidth shift.
2733	*/
2734	if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
2735	return -EINVAL;
2736
2737	mutex_lock(&rt_constraints_mutex);
2738	err = __rt_schedulable(tg, rt_period, rt_runtime);
2739	if (err)
2740	goto unlock;
2741
2742	raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2743	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
2744	tg->rt_bandwidth.rt_runtime = rt_runtime;
2745
2746	for_each_possible_cpu(i) {
2747	struct rt_rq *rt_rq = tg->rt_rq[i];
2748
2749	raw_spin_lock(&rt_rq->rt_runtime_lock);
2750	rt_rq->rt_runtime = rt_runtime;
2751	raw_spin_unlock(&rt_rq->rt_runtime_lock);
2752	}
2753	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2754	unlock:
2755	mutex_unlock(&rt_constraints_mutex);
2756
2757	return err;
2758	}
2759
2760	int sched_group_set_rt_runtime(struct task_group tg, long* rt_runtime_us)
2761	{
2762	u64 rt_runtime, rt_period;
2763
2764	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2765	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
2766	if (rt_runtime_us < `0`)
2767	rt_runtime = RUNTIME_INF;
2768	else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
2769	return -EINVAL;
2770
2771	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2772	}
2773
2774	long sched_group_rt_runtime(struct task_group *tg)
2775	{
2776	u64 rt_runtime_us;
2777
2778	if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
2779	return -`1`;
2780
2781	rt_runtime_us = tg->rt_bandwidth.rt_runtime;
2782	do_div(rt_runtime_us, NSEC_PER_USEC);
2783	return rt_runtime_us;
2784	}
2785
2786	int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
2787	{
2788	u64 rt_runtime, rt_period;
2789
2790	if (rt_period_us > U64_MAX / NSEC_PER_USEC)
2791	return -EINVAL;
2792
2793	rt_period = rt_period_us * NSEC_PER_USEC;
2794	rt_runtime = tg->rt_bandwidth.rt_runtime;
2795
2796	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2797	}
2798
2799	long sched_group_rt_period(struct task_group *tg)
2800	{
2801	u64 rt_period_us;
2802
2803	rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
2804	do_div(rt_period_us, NSEC_PER_USEC);
2805	return rt_period_us;
2806	}
2807
#ifdef CONFIG_SYSCTL
/*
 * Re-check the schedulability of the whole task-group tree under
 * rt_constraints_mutex. No group is being changed here, so a NULL group and
 * zero period/runtime are passed to __rt_schedulable().
 *
 * Returns the result of __rt_schedulable().
 */
static int sched_rt_global_constraints(void)
{
	int ret = 0;

	mutex_lock(&rt_constraints_mutex);
	ret = __rt_schedulable(NULL, 0, 0);
	mutex_unlock(&rt_constraints_mutex);

	return ret;
}
#endif /* CONFIG_SYSCTL */
2820
2821	int sched_rt_can_attach(struct task_group tg, struct* task_struct *tsk)
2822	{
2823	/ Don't accept real-time tasks when there is no way for them to run /
2824	if (rt_group_sched_enabled() && rt_task(tsk) && tg->rt_bandwidth.rt_runtime == `0`)
2825	return `0`;
2826
2827	return `1`;
2828	}
2829
2830	#else /* !CONFIG_RT_GROUP_SCHED: */
2831
#ifdef CONFIG_SYSCTL
/*
 * Without RT group scheduling there are no per-group constraints to check;
 * always returns 0.
 */
static int sched_rt_global_constraints(void)
{
	return 0;
}
#endif /* CONFIG_SYSCTL */
2838	#endif /* !CONFIG_RT_GROUP_SCHED */
2839
2840	#ifdef CONFIG_SYSCTL
2841	static int sched_rt_global_validate(void)
2842	{
2843	if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
2844	((sysctl_sched_rt_runtime > sysctl_sched_rt_period) \|\|
2845	((u64)sysctl_sched_rt_runtime *
2846	NSEC_PER_USEC > max_rt_runtime)))
2847	return -EINVAL;
2848
2849	return `0`;
2850	}
2851
/*
 * Apply the validated global RT settings. Currently there is nothing to do
 * for the RT class; sched_rt_handler() calls this next to
 * sched_dl_do_global().
 */
static void sched_rt_do_global(void)
{
	/* Intentionally empty. */
}
2855
2856	static int sched_rt_handler(const struct ctl_table table, int* write, void *buffer,
2857	size_t lenp, loff_t ppos)
2858	{
2859	int old_period, old_runtime;
2860	static DEFINE_MUTEX(mutex);
2861	int ret;
2862
2863	mutex_lock(lock: &mutex);
2864	sched_domains_mutex_lock();
2865	old_period = sysctl_sched_rt_period;
2866	old_runtime = sysctl_sched_rt_runtime;
2867
2868	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2869
2870	if (!ret && write) {
2871	ret = sched_rt_global_validate();
2872	if (ret)
2873	goto undo;
2874
2875	ret = sched_dl_global_validate();
2876	if (ret)
2877	goto undo;
2878
2879	ret = sched_rt_global_constraints();
2880	if (ret)
2881	goto undo;
2882
2883	sched_rt_do_global();
2884	sched_dl_do_global();
2885	}
2886	if (`0`) {
2887	undo:
2888	sysctl_sched_rt_period = old_period;
2889	sysctl_sched_rt_runtime = old_runtime;
2890	}
2891	sched_domains_mutex_unlock();
2892	mutex_unlock(lock: &mutex);
2893
2894	/*
2895	* After changing maximum available bandwidth for DEADLINE, we need to
2896	* recompute per root domain and per cpus variables accordingly.
2897	*/
2898	rebuild_sched_domains();
2899
2900	return ret;
2901	}
2902
2903	static int sched_rr_handler(const struct ctl_table table, int* write, void *buffer,
2904	size_t lenp, loff_t ppos)
2905	{
2906	int ret;
2907	static DEFINE_MUTEX(mutex);
2908
2909	mutex_lock(lock: &mutex);
2910	ret = proc_dointvec(table, write, buffer, lenp, ppos);
2911	/*
2912	* Make sure that internally we keep jiffies.
2913	* Also, writing zero resets the time-slice to default:
2914	*/
2915	if (!ret && write) {
2916	sched_rr_timeslice =
2917	sysctl_sched_rr_timeslice <= `0` ? RR_TIMESLICE :
2918	msecs_to_jiffies(m: sysctl_sched_rr_timeslice);
2919
2920	if (sysctl_sched_rr_timeslice <= `0`)
2921	sysctl_sched_rr_timeslice = jiffies_to_msecs(RR_TIMESLICE);
2922	}
2923	mutex_unlock(lock: &mutex);
2924
2925	return ret;
2926	}
2927	#endif /* CONFIG_SYSCTL */
2928
2929	void print_rt_stats(struct seq_file m, int* cpu)
2930	{
2931	rt_rq_iter_t iter;
2932	struct rt_rq *rt_rq;
2933
2934	rcu_read_lock();
2935	for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
2936	print_rt_rq(m, cpu, rt_rq);
2937	rcu_read_unlock();
2938	}
2939

Browse the source code of Linux/kernel/sched/rt.c