fair.c source code [Linux/kernel/sched/fair.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
4	*
5	* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6	*
7	* Interactivity improvements by Mike Galbraith
8	* (C) 2007 Mike Galbraith <efault@gmx.de>
9	*
10	* Various enhancements by Dmitry Adamushko.
11	* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
12	*
13	* Group scheduling enhancements by Srivatsa Vaddagiri
14	* Copyright IBM Corporation, 2007
15	* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
16	*
17	* Scaled math optimizations by Thomas Gleixner
18	* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
19	*
20	* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
21	* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
22	*/
23	#include <linux/energy_model.h>
24	#include <linux/mmap_lock.h>
25	#include <linux/hugetlb_inline.h>
26	#include <linux/jiffies.h>
27	#include <linux/mm_api.h>
28	#include <linux/highmem.h>
29	#include <linux/spinlock_api.h>
30	#include <linux/cpumask_api.h>
31	#include <linux/lockdep_api.h>
32	#include <linux/softirq.h>
33	#include <linux/refcount_api.h>
34	#include <linux/topology.h>
35	#include <linux/sched/clock.h>
36	#include <linux/sched/cond_resched.h>
37	#include <linux/sched/cputime.h>
38	#include <linux/sched/isolation.h>
39	#include <linux/sched/nohz.h>
40	#include <linux/sched/prio.h>
41
42	#include <linux/cpuidle.h>
43	#include <linux/interrupt.h>
44	#include <linux/memory-tiers.h>
45	#include <linux/mempolicy.h>
46	#include <linux/mutex_api.h>
47	#include <linux/profile.h>
48	#include <linux/psi.h>
49	#include <linux/ratelimit.h>
50	#include <linux/task_work.h>
51	#include <linux/rbtree_augmented.h>
52
53	#include <asm/switch_to.h>
54
55	#include <uapi/linux/sched/types.h>
56
57	#include "sched.h"
58	#include "stats.h"
59	#include "autogroup.h"
60
61	/*
62	* The initial- and re-scaling of tunables is configurable
63	*
64	* Options are:
65	*
66	* SCHED_TUNABLESCALING_NONE - unscaled, always *1
67	* SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
68	* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
69	*
70	* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
71	*/
72	unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
73
74	/*
75	* Minimal preemption granularity for CPU-bound tasks:
76	*
77	* (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds)
78	*/
79	unsigned int sysctl_sched_base_slice = `700000ULL`;
80	static unsigned int normalized_sysctl_sched_base_slice = `700000ULL`;
81
82	__read_mostly unsigned int sysctl_sched_migration_cost = `500000UL`;
83
84	static int __init setup_sched_thermal_decay_shift(char *str)
85	{
86	pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n");
87	return `1`;
88	}
89	__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
90
91	/*
92	* For asym packing, by default the lower numbered CPU has higher priority.
93	*/
94	int __weak arch_asym_cpu_priority(int cpu)
95	{
96	return -cpu;
97	}
98
/*
 * The margin used when comparing utilization with CPU capacity.
 *
 * Evaluates to true when cap * 1280 < max * 1024, i.e. when 'cap' is
 * below 80% of 'max'.
 *
 * (default: ~20%)
 */
#define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024)
105
/*
 * The margin used when comparing CPU capacities.
 * is 'cap1' noticeably greater than 'cap2'
 *
 * Evaluates to true when cap1 * 1024 > cap2 * 1078.
 *
 * (default: ~5%)
 */
#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
113
#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * (default: 5 msec, units: microseconds)
 */
static unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
#endif
127
#ifdef CONFIG_NUMA_BALANCING
/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
#endif
132
#ifdef CONFIG_SYSCTL
/*
 * Sysctl entries owned by the fair class; registered under "kernel" by
 * sched_fair_sysctl_init(). Both entries are mode 0644 and go through
 * proc_dointvec_minmax with only .extra1 set (SYSCTL_ONE resp.
 * SYSCTL_ZERO) -- presumably the lower bound; confirm against the
 * sysctl handler documentation.
 */
static const struct ctl_table sched_fair_sysctls[] = {
#ifdef CONFIG_CFS_BANDWIDTH
	{
		.procname       = "sched_cfs_bandwidth_slice_us",
		.data           = &sysctl_sched_cfs_bandwidth_slice,
		.maxlen         = sizeof(unsigned int),
		.mode           = 0644,
		.proc_handler   = proc_dointvec_minmax,
		.extra1         = SYSCTL_ONE,
	},
#endif
#ifdef CONFIG_NUMA_BALANCING
	{
		.procname	= "numa_balancing_promote_rate_limit_MBps",
		.data		= &sysctl_numa_balancing_promote_rate_limit,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
#endif /* CONFIG_NUMA_BALANCING */
};

/* Register the table above at late_initcall time; always returns 0. */
static int __init sched_fair_sysctl_init(void)
{
	register_sysctl_init("kernel", sched_fair_sysctls);
	return 0;
}
late_initcall(sched_fair_sysctl_init);
#endif /* CONFIG_SYSCTL */
164
165	static inline void update_load_add(struct load_weight lw, unsigned* long inc)
166	{
167	lw->weight += inc;
168	lw->inv_weight = `0`;
169	}
170
171	static inline void update_load_sub(struct load_weight lw, unsigned* long dec)
172	{
173	lw->weight -= dec;
174	lw->inv_weight = `0`;
175	}
176
177	static inline void update_load_set(struct load_weight lw, unsigned* long w)
178	{
179	lw->weight = w;
180	lw->inv_weight = `0`;
181	}
182
183	/*
184	* Increase the granularity value when there are more CPUs,
185	* because with more CPUs the 'effective latency' as visible
186	* to users decreases. But the relationship is not linear,
187	* so pick a second-best guess by going with the log2 of the
188	* number of CPUs.
189	*
190	* This idea comes from the SD scheduler of Con Kolivas:
191	*/
192	static unsigned int get_update_sysctl_factor(void)
193	{
194	unsigned int cpus = min_t(unsigned int, num_online_cpus(), `8`);
195	unsigned int factor;
196
197	switch (sysctl_sched_tunable_scaling) {
198	case SCHED_TUNABLESCALING_NONE:
199	factor = `1`;
200	break;
201	case SCHED_TUNABLESCALING_LINEAR:
202	factor = cpus;
203	break;
204	case SCHED_TUNABLESCALING_LOG:
205	default:
206	factor = `1` + ilog2(cpus);
207	break;
208	}
209
210	return factor;
211	}
212
213	static void update_sysctl(void)
214	{
215	unsigned int factor = get_update_sysctl_factor();
216
217	#define SET_SYSCTL(name) \
218	(sysctl_##name = (factor) * normalized_sysctl_##name)
219	SET_SYSCTL(sched_base_slice);
220	#undef SET_SYSCTL
221	}
222
223	void __init sched_init_granularity(void)
224	{
225	update_sysctl();
226	}
227
228	#define WMULT_CONST (~0U)
229	#define WMULT_SHIFT 32
230
231	static void __update_inv_weight(struct load_weight *lw)
232	{
233	unsigned long w;
234
235	if (likely(lw->inv_weight))
236	return;
237
238	w = scale_load_down(lw->weight);
239
240	if (BITS_PER_LONG > `32` && unlikely(w >= WMULT_CONST))
241	lw->inv_weight = `1`;
242	else if (unlikely(!w))
243	lw->inv_weight = WMULT_CONST;
244	else
245	lw->inv_weight = WMULT_CONST / w;
246	}
247
248	/*
249	* delta_exec * weight / lw.weight
250	* OR
251	* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
252	*
253	* Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
254	* we're guaranteed shift stays positive because inv_weight is guaranteed to
255	* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
256	*
257	* Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
258	* weight/lw.weight <= 1, and therefore our shift will also be positive.
259	*/
260	static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
261	{
262	u64 fact = scale_load_down(weight);
263	u32 fact_hi = (u32)(fact >> `32`);
264	int shift = WMULT_SHIFT;
265	int fs;
266
267	__update_inv_weight(lw);
268
269	if (unlikely(fact_hi)) {
270	fs = fls(x: fact_hi);
271	shift -= fs;
272	fact >>= fs;
273	}
274
275	fact = mul_u32_u32(a: fact, b: lw->inv_weight);
276
277	fact_hi = (u32)(fact >> `32`);
278	if (fact_hi) {
279	fs = fls(x: fact_hi);
280	shift -= fs;
281	fact >>= fs;
282	}
283
284	return mul_u64_u32_shr(a: delta_exec, mul: fact, shift);
285	}
286
287	/*
288	* delta /= w
289	*/
290	static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
291	{
292	if (unlikely(se->load.weight != NICE_0_LOAD))
293	delta = __calc_delta(delta_exec: delta, NICE_0_LOAD, lw: &se->load);
294
295	return delta;
296	}
297
298	const struct sched_class fair_sched_class;
299
300	/**************************************************************
301	* CFS operations on generic schedulable entities:
302	*/
303
304	#ifdef CONFIG_FAIR_GROUP_SCHED
305
/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)
309
310	static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
311	{
312	struct rq *rq = rq_of(cfs_rq);
313	int cpu = cpu_of(rq);
314
315	if (cfs_rq->on_list)
316	return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
317
318	cfs_rq->on_list = `1`;
319
320	/*
321	* Ensure we either appear before our parent (if already
322	* enqueued) or force our parent to appear after us when it is
323	* enqueued. The fact that we always enqueue bottom-up
324	* reduces this to two cases and a special case for the root
325	* cfs_rq. Furthermore, it also means that we will always reset
326	* tmp_alone_branch either when the branch is connected
327	* to a tree or when we reach the top of the tree
328	*/
329	if (cfs_rq->tg->parent &&
330	cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
331	/*
332	* If parent is already on the list, we add the child
333	* just before. Thanks to circular linked property of
334	* the list, this means to put the child at the tail
335	* of the list that starts by parent.
336	*/
337	list_add_tail_rcu(new: &cfs_rq->leaf_cfs_rq_list,
338	head: &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
339	/*
340	* The branch is now connected to its tree so we can
341	* reset tmp_alone_branch to the beginning of the
342	* list.
343	*/
344	rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
345	return true;
346	}
347
348	if (!cfs_rq->tg->parent) {
349	/*
350	* cfs rq without parent should be put
351	* at the tail of the list.
352	*/
353	list_add_tail_rcu(new: &cfs_rq->leaf_cfs_rq_list,
354	head: &rq->leaf_cfs_rq_list);
355	/*
356	* We have reach the top of a tree so we can reset
357	* tmp_alone_branch to the beginning of the list.
358	*/
359	rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
360	return true;
361	}
362
363	/*
364	* The parent has not already been added so we want to
365	* make sure that it will be put after us.
366	* tmp_alone_branch points to the begin of the branch
367	* where we will add parent.
368	*/
369	list_add_rcu(new: &cfs_rq->leaf_cfs_rq_list, head: rq->tmp_alone_branch);
370	/*
371	* update tmp_alone_branch to points to the new begin
372	* of the branch
373	*/
374	rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
375	return false;
376	}
377
378	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
379	{
380	if (cfs_rq->on_list) {
381	struct rq *rq = rq_of(cfs_rq);
382
383	/*
384	* With cfs_rq being unthrottled/throttled during an enqueue,
385	* it can happen the tmp_alone_branch points to the leaf that
386	* we finally want to delete. In this case, tmp_alone_branch moves
387	* to the prev element but it will point to rq->leaf_cfs_rq_list
388	* at the end of the enqueue.
389	*/
390	if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
391	rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
392
393	list_del_rcu(entry: &cfs_rq->leaf_cfs_rq_list);
394	cfs_rq->on_list = `0`;
395	}
396	}
397
398	static inline void assert_list_leaf_cfs_rq(struct rq *rq)
399	{
400	WARN_ON_ONCE(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
401	}
402
/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
				 leaf_cfs_rq_list)
407
408	/ Do the two (enqueued) entities belong to the same group ? /
409	static inline struct cfs_rq *
410	is_same_group(struct sched_entity se, struct* sched_entity *pse)
411	{
412	if (se->cfs_rq == pse->cfs_rq)
413	return se->cfs_rq;
414
415	return NULL;
416	}
417
418	static inline struct sched_entity parent_entity(const* struct sched_entity *se)
419	{
420	return se->parent;
421	}
422
423	static void
424	find_matching_se(struct sched_entity se, struct sched_entity pse)
425	{
426	int se_depth, pse_depth;
427
428	/*
429	* preemption test can be made between sibling entities who are in the
430	* same cfs_rq i.e who have a common parent. Walk up the hierarchy of
431	* both tasks until we find their ancestors who are siblings of common
432	* parent.
433	*/
434
435	/ First walk up until both entities are at same depth /
436	se_depth = (*se)->depth;
437	pse_depth = (*pse)->depth;
438
439	while (se_depth > pse_depth) {
440	se_depth--;
441	se = parent_entity(se: se);
442	}
443
444	while (pse_depth > se_depth) {
445	pse_depth--;
446	pse = parent_entity(se: pse);
447	}
448
449	while (!is_same_group(se: se, pse: pse)) {
450	se = parent_entity(se: se);
451	pse = parent_entity(se: pse);
452	}
453	}
454
455	static int tg_is_idle(struct task_group *tg)
456	{
457	return tg->idle > `0`;
458	}
459
460	static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
461	{
462	return cfs_rq->idle > `0`;
463	}
464
/*
 * Idle test for an entity: a task entity is judged by its policy, a group
 * entity by the cfs_rq it owns.
 */
static int se_is_idle(struct sched_entity *se)
{
	if (entity_is_task(se))
		return task_has_idle_policy(task_of(se));
	return cfs_rq_is_idle(group_cfs_rq(se));
}
471
472	#else /* !CONFIG_FAIR_GROUP_SCHED: */
473
/* Without group scheduling there is no hierarchy: visit only 'se' itself. */
#define for_each_sched_entity(se) \
		for (; se; se = NULL)
476
477	static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
478	{
479	return true;
480	}
481
/* !CONFIG_FAIR_GROUP_SCHED stub: nothing to remove. */
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
485
/* !CONFIG_FAIR_GROUP_SCHED stub: nothing to check. */
static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
}
489
/* Single iteration over the rq's embedded cfs_rq ('pos' is set to NULL). */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
492
493	static inline struct sched_entity parent_entity(struct* sched_entity *se)
494	{
495	return NULL;
496	}
497
/* !CONFIG_FAIR_GROUP_SCHED stub: *se and *pse are left untouched. */
static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}
502
/* !CONFIG_FAIR_GROUP_SCHED stub: always 0. */
static inline int tg_is_idle(struct task_group *tg)
{
	return 0;
}
507
/* !CONFIG_FAIR_GROUP_SCHED stub: always 0. */
static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
{
	return 0;
}
512
/* !CONFIG_FAIR_GROUP_SCHED: the entity is treated as a task; use its policy. */
static int se_is_idle(struct sched_entity *se)
{
	return task_has_idle_policy(task_of(se));
}
517
518	#endif /* !CONFIG_FAIR_GROUP_SCHED */
519
520	static __always_inline
521	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
522
523	/**************************************************************
524	* Scheduling class tree data structure manipulation methods:
525	*/
526
527	static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime)
528	{
529	s64 delta = (s64)(vruntime - max_vruntime);
530	if (delta > `0`)
531	max_vruntime = vruntime;
532
533	return max_vruntime;
534	}
535
536	static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime)
537	{
538	s64 delta = (s64)(vruntime - min_vruntime);
539	if (delta < `0`)
540	min_vruntime = vruntime;
541
542	return min_vruntime;
543	}
544
545	static inline bool entity_before(const struct sched_entity *a,
546	const struct sched_entity *b)
547	{
548	/*
549	* Tiebreak on vruntime seems unnecessary since it can
550	* hardly happen.
551	*/
552	return (s64)(a->deadline - b->deadline) < `0`;
553	}
554
555	static inline s64 entity_key(struct cfs_rq cfs_rq, struct* sched_entity *se)
556	{
557	return (s64)(se->vruntime - cfs_rq->min_vruntime);
558	}
559
/* Map an rb_node embedded as sched_entity::run_node back to its entity. */
#define __node_2_se(node) \
	rb_entry((node), struct sched_entity, run_node)
562
563	/*
564	* Compute virtual time from the per-task service numbers:
565	*
566	* Fair schedulers conserve lag:
567	*
568	* \Sum lag_i = 0
569	*
570	* Where lag_i is given by:
571	*
572	* lag_i = S - s_i = w_i * (V - v_i)
573	*
574	* Where S is the ideal service time and V is it's virtual time counterpart.
575	* Therefore:
576	*
577	* \Sum lag_i = 0
578	* \Sum w_i * (V - v_i) = 0
579	* \Sum w_i * V - w_i * v_i = 0
580	*
581	* From which we can solve an expression for V in v_i (which we have in
582	* se->vruntime):
583	*
584	* \Sum v_i * w_i \Sum v_i * w_i
585	* V = -------------- = --------------
586	* \Sum w_i W
587	*
588	* Specifically, this is the weighted average of all entity virtual runtimes.
589	*
590	* [[ NOTE: this is only equal to the ideal scheduler under the condition
591	* that join/leave operations happen at lag_i = 0, otherwise the
592	* virtual time has non-contiguous motion equivalent to:
593	*
594	* V +-= lag_i / W
595	*
596	* Also see the comment in place_entity() that deals with this. ]]
597	*
598	* However, since v_i is u64, and the multiplication could easily overflow
599	* transform it into a relative form that uses smaller quantities:
600	*
601	* Substitute: v_i == (v_i - v0) + v0
602	*
603	* \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i
604	* V = ---------------------------- = --------------------- + v0
605	* W W
606	*
607	* Which we track using:
608	*
609	* v0 := cfs_rq->min_vruntime
610	* \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
611	* \Sum w_i := cfs_rq->avg_load
612	*
613	* Since min_vruntime is a monotonic increasing variable that closely tracks
614	* the per-task service, these deltas: (v_i - v), will be in the order of the
615	* maximal (virtual) lag induced in the system due to quantisation.
616	*
617	* Also, we use scale_load_down() to reduce the size.
618	*
619	* As measured, the max (key * weight) value was ~44 bits for a kernel build.
620	*/
621	static void
622	avg_vruntime_add(struct cfs_rq cfs_rq, struct* sched_entity *se)
623	{
624	unsigned long weight = scale_load_down(se->load.weight);
625	s64 key = entity_key(cfs_rq, se);
626
627	cfs_rq->avg_vruntime += key * weight;
628	cfs_rq->avg_load += weight;
629	}
630
631	static void
632	avg_vruntime_sub(struct cfs_rq cfs_rq, struct* sched_entity *se)
633	{
634	unsigned long weight = scale_load_down(se->load.weight);
635	s64 key = entity_key(cfs_rq, se);
636
637	cfs_rq->avg_vruntime -= key * weight;
638	cfs_rq->avg_load -= weight;
639	}
640
641	static inline
642	void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
643	{
644	/*
645	* v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load
646	*/
647	cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
648	}
649
650	/*
651	* Specifically: avg_runtime() + 0 must result in entity_eligible() := true
652	* For this to be so, the result of this function must have a left bias.
653	*/
654	u64 avg_vruntime(struct cfs_rq *cfs_rq)
655	{
656	struct sched_entity *curr = cfs_rq->curr;
657	s64 avg = cfs_rq->avg_vruntime;
658	long load = cfs_rq->avg_load;
659
660	if (curr && curr->on_rq) {
661	unsigned long weight = scale_load_down(curr->load.weight);
662
663	avg += entity_key(cfs_rq, se: curr) * weight;
664	load += weight;
665	}
666
667	if (load) {
668	/ sign flips effective floor / ceiling /
669	if (avg < `0`)
670	avg -= (load - `1`);
671	avg = div_s64(dividend: avg, divisor: load);
672	}
673
674	return cfs_rq->min_vruntime + avg;
675	}
676
677	/*
678	* lag_i = S - s_i = w_i * (V - v_i)
679	*
680	* However, since V is approximated by the weighted average of all entities it
681	* is possible -- by addition/removal/reweight to the tree -- to move V around
682	* and end up with a larger lag than we started with.
683	*
684	* Limit this to either double the slice length with a minimum of TICK_NSEC
685	* since that is the timing granularity.
686	*
687	* EEVDF gives the following limit for a steady state system:
688	*
689	* -r_max < lag < max(r_max, q)
690	*
691	* XXX could add max_slice to the augmented data to track this.
692	*/
693	static void update_entity_lag(struct cfs_rq cfs_rq, struct* sched_entity *se)
694	{
695	s64 vlag, limit;
696
697	WARN_ON_ONCE(!se->on_rq);
698
699	vlag = avg_vruntime(cfs_rq) - se->vruntime;
700	limit = calc_delta_fair(max_t(u64, `2`*se->slice, TICK_NSEC), se);
701
702	se->vlag = clamp(vlag, -limit, limit);
703	}
704
705	/*
706	* Entity is eligible once it received less service than it ought to have,
707	* eg. lag >= 0.
708	*
709	* lag_i = S - s_i = w_i*(V - v_i)
710	*
711	* lag_i >= 0 -> V >= v_i
712	*
713	* \Sum (v_i - v)*w_i
714	* V = ------------------ + v
715	* \Sum w_i
716	*
717	* lag_i >= 0 -> \Sum (v_i - v)w_i >= (v_i - v)(\Sum w_i)
718	*
719	* Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
720	* to the loss in precision caused by the division.
721	*/
722	static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
723	{
724	struct sched_entity *curr = cfs_rq->curr;
725	s64 avg = cfs_rq->avg_vruntime;
726	long load = cfs_rq->avg_load;
727
728	if (curr && curr->on_rq) {
729	unsigned long weight = scale_load_down(curr->load.weight);
730
731	avg += entity_key(cfs_rq, se: curr) * weight;
732	load += weight;
733	}
734
735	return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
736	}
737
738	int entity_eligible(struct cfs_rq cfs_rq, struct* sched_entity *se)
739	{
740	return vruntime_eligible(cfs_rq, vruntime: se->vruntime);
741	}
742
743	static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
744	{
745	u64 min_vruntime = cfs_rq->min_vruntime;
746	/*
747	* open coded max_vruntime() to allow updating avg_vruntime
748	*/
749	s64 delta = (s64)(vruntime - min_vruntime);
750	if (delta > `0`) {
751	avg_vruntime_update(cfs_rq, delta);
752	min_vruntime = vruntime;
753	}
754	return min_vruntime;
755	}
756
757	static void update_min_vruntime(struct cfs_rq *cfs_rq)
758	{
759	struct sched_entity *se = __pick_root_entity(cfs_rq);
760	struct sched_entity *curr = cfs_rq->curr;
761	u64 vruntime = cfs_rq->min_vruntime;
762
763	if (curr) {
764	if (curr->on_rq)
765	vruntime = curr->vruntime;
766	else
767	curr = NULL;
768	}
769
770	if (se) {
771	if (!curr)
772	vruntime = se->min_vruntime;
773	else
774	vruntime = min_vruntime(min_vruntime: vruntime, vruntime: se->min_vruntime);
775	}
776
777	/ ensure we never gain time by being placed backwards. /
778	cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime);
779	}
780
781	static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
782	{
783	struct sched_entity *root = __pick_root_entity(cfs_rq);
784	struct sched_entity *curr = cfs_rq->curr;
785	u64 min_slice = ~`0ULL`;
786
787	if (curr && curr->on_rq)
788	min_slice = curr->slice;
789
790	if (root)
791	min_slice = min(min_slice, root->min_slice);
792
793	return min_slice;
794	}
795
796	static inline bool __entity_less(struct rb_node a, const* struct rb_node *b)
797	{
798	return entity_before(__node_2_se(a), __node_2_se(b));
799	}
800
/* Wrap-safe "lse->field is later than rse->field" for u64 time fields. */
#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
802
803	static inline void __min_vruntime_update(struct sched_entity se, struct* rb_node *node)
804	{
805	if (node) {
806	struct sched_entity *rse = __node_2_se(node);
807	if (vruntime_gt(min_vruntime, se, rse))
808	se->min_vruntime = rse->min_vruntime;
809	}
810	}
811
812	static inline void __min_slice_update(struct sched_entity se, struct* rb_node *node)
813	{
814	if (node) {
815	struct sched_entity *rse = __node_2_se(node);
816	if (rse->min_slice < se->min_slice)
817	se->min_slice = rse->min_slice;
818	}
819	}
820
821	/*
822	* se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
823	*/
824	static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
825	{
826	u64 old_min_vruntime = se->min_vruntime;
827	u64 old_min_slice = se->min_slice;
828	struct rb_node *node = &se->run_node;
829
830	se->min_vruntime = se->vruntime;
831	__min_vruntime_update(se, node: node->rb_right);
832	__min_vruntime_update(se, node: node->rb_left);
833
834	se->min_slice = se->slice;
835	__min_slice_update(se, node: node->rb_right);
836	__min_slice_update(se, node: node->rb_left);
837
838	return se->min_vruntime == old_min_vruntime &&
839	se->min_slice == old_min_slice;
840	}
841
842	RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
843	run_node, min_vruntime, min_vruntime_update);
844
845	/*
846	* Enqueue an entity into the rb-tree:
847	*/
848	static void __enqueue_entity(struct cfs_rq cfs_rq, struct* sched_entity *se)
849	{
850	avg_vruntime_add(cfs_rq, se);
851	se->min_vruntime = se->vruntime;
852	se->min_slice = se->slice;
853	rb_add_augmented_cached(node: &se->run_node, tree: &cfs_rq->tasks_timeline,
854	less: __entity_less, augment: &min_vruntime_cb);
855	}
856
857	static void __dequeue_entity(struct cfs_rq cfs_rq, struct* sched_entity *se)
858	{
859	rb_erase_augmented_cached(node: &se->run_node, root: &cfs_rq->tasks_timeline,
860	augment: &min_vruntime_cb);
861	avg_vruntime_sub(cfs_rq, se);
862	}
863
864	struct sched_entity __pick_root_entity(struct* cfs_rq *cfs_rq)
865	{
866	struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node;
867
868	if (!root)
869	return NULL;
870
871	return __node_2_se(root);
872	}
873
874	struct sched_entity __pick_first_entity(struct* cfs_rq *cfs_rq)
875	{
876	struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
877
878	if (!left)
879	return NULL;
880
881	return __node_2_se(left);
882	}
883
884	/*
885	* Set the vruntime up to which an entity can run before looking
886	* for another entity to pick.
887	* In case of run to parity, we use the shortest slice of the enqueued
888	* entities to set the protected period.
889	* When run to parity is disabled, we give a minimum quantum to the running
890	* entity to ensure progress.
891	*/
892	static inline void set_protect_slice(struct cfs_rq cfs_rq, struct* sched_entity *se)
893	{
894	u64 slice = normalized_sysctl_sched_base_slice;
895	u64 vprot = se->deadline;
896
897	if (sched_feat(RUN_TO_PARITY))
898	slice = cfs_rq_min_slice(cfs_rq);
899
900	slice = min(slice, se->slice);
901	if (slice != se->slice)
902	vprot = min_vruntime(min_vruntime: vprot, vruntime: se->vruntime + calc_delta_fair(delta: slice, se));
903
904	se->vprot = vprot;
905	}
906
907	static inline void update_protect_slice(struct cfs_rq cfs_rq, struct* sched_entity *se)
908	{
909	u64 slice = cfs_rq_min_slice(cfs_rq);
910
911	se->vprot = min_vruntime(min_vruntime: se->vprot, vruntime: se->vruntime + calc_delta_fair(delta: slice, se));
912	}
913
914	static inline bool protect_slice(struct sched_entity *se)
915	{
916	return ((s64)(se->vprot - se->vruntime) > `0`);
917	}
918
919	static inline void cancel_protect_slice(struct sched_entity *se)
920	{
921	if (protect_slice(se))
922	se->vprot = se->vruntime;
923	}
924
925	/*
926	* Earliest Eligible Virtual Deadline First
927	*
928	* In order to provide latency guarantees for different request sizes
929	* EEVDF selects the best runnable task from two criteria:
930	*
931	* 1) the task must be eligible (must be owed service)
932	*
933	* 2) from those tasks that meet 1), we select the one
934	* with the earliest virtual deadline.
935	*
936	* We can do this in O(log n) time due to an augmented RB-tree. The
937	* tree keeps the entries sorted on deadline, but also functions as a
938	* heap based on the vruntime by keeping:
939	*
940	* se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime)
941	*
942	* Which allows tree pruning through eligibility.
943	*/
944	static struct sched_entity __pick_eevdf(struct* cfs_rq *cfs_rq, bool protect)
945	{
946	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
947	struct sched_entity *se = __pick_first_entity(cfs_rq);
948	struct sched_entity *curr = cfs_rq->curr;
949	struct sched_entity *best = NULL;
950
951	/*
952	* We can safely skip eligibility check if there is only one entity
953	* in this cfs_rq, saving some cycles.
954	*/
955	if (cfs_rq->nr_queued == `1`)
956	return curr && curr->on_rq ? curr : se;
957
958	if (curr && (!curr->on_rq \|\| !entity_eligible(cfs_rq, se: curr)))
959	curr = NULL;
960
961	if (curr && protect && protect_slice(se: curr))
962	return curr;
963
964	/ Pick the leftmost entity if it's eligible /
965	if (se && entity_eligible(cfs_rq, se)) {
966	best = se;
967	goto found;
968	}
969
970	/ Heap search for the EEVD entity /
971	while (node) {
972	struct rb_node *left = node->rb_left;
973
974	/*
975	* Eligible entities in left subtree are always better
976	* choices, since they have earlier deadlines.
977	*/
978	if (left && vruntime_eligible(cfs_rq,
979	__node_2_se(left)->min_vruntime)) {
980	node = left;
981	continue;
982	}
983
984	se = __node_2_se(node);
985
986	/*
987	* The left subtree either is empty or has no eligible
988	* entity, so check the current node since it is the one
989	* with earliest deadline that might be eligible.
990	*/
991	if (entity_eligible(cfs_rq, se)) {
992	best = se;
993	break;
994	}
995
996	node = node->rb_right;
997	}
998	found:
999	if (!best \|\| (curr && entity_before(a: curr, b: best)))
1000	best = curr;
1001
1002	return best;
1003	}
1004
1005	static struct sched_entity pick_eevdf(struct* cfs_rq *cfs_rq)
1006	{
1007	return __pick_eevdf(cfs_rq, protect: true);
1008	}
1009
1010	struct sched_entity __pick_last_entity(struct* cfs_rq *cfs_rq)
1011	{
1012	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
1013
1014	if (!last)
1015	return NULL;
1016
1017	return __node_2_se(last);
1018	}
1019
1020	/**************************************************************
1021	* Scheduling class statistics methods:
1022	*/
1023	int sched_update_scaling(void)
1024	{
1025	unsigned int factor = get_update_sysctl_factor();
1026
1027	#define WRT_SYSCTL(name) \
1028	(normalized_sysctl_##name = sysctl_##name / (factor))
1029	WRT_SYSCTL(sched_base_slice);
1030	#undef WRT_SYSCTL
1031
1032	return `0`;
1033	}
1034
1035	static void clear_buddies(struct cfs_rq cfs_rq, struct* sched_entity *se);
1036
1037	/*
1038	* XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
1039	* this is probably good enough.
1040	*/
1041	static bool update_deadline(struct cfs_rq cfs_rq, struct* sched_entity *se)
1042	{
1043	if ((s64)(se->vruntime - se->deadline) < `0`)
1044	return false;
1045
1046	/*
1047	* For EEVDF the virtual time slope is determined by w_i (iow.
1048	* nice) while the request time r_i is determined by
1049	* sysctl_sched_base_slice.
1050	*/
1051	if (!se->custom_slice)
1052	se->slice = sysctl_sched_base_slice;
1053
1054	/*
1055	* EEVDF: vd_i = ve_i + r_i / w_i
1056	*/
1057	se->deadline = se->vruntime + calc_delta_fair(delta: se->slice, se);
1058
1059	/*
1060	* The task has consumed its request, reschedule.
1061	*/
1062	return true;
1063	}
1064
1065	#include "pelt.h"
1066
1067	static int select_idle_sibling(struct task_struct p, int* prev_cpu, int cpu);
1068	static unsigned long task_h_load(struct task_struct *p);
1069	static unsigned long capacity_of(int cpu);
1070
1071	/ Give new sched_entity start runnable values to heavy its load in infant time /
1072	void init_entity_runnable_average(struct sched_entity *se)
1073	{
1074	struct sched_avg *sa = &se->avg;
1075
1076	memset(s: sa, c: `0`, n: sizeof(*sa));
1077
1078	/*
1079	* Tasks are initialized with full load to be seen as heavy tasks until
1080	* they get a chance to stabilize to their real load level.
1081	* Group entities are initialized with zero load to reflect the fact that
1082	* nothing has been attached to the task group yet.
1083	*/
1084	if (entity_is_task(se))
1085	sa->load_avg = scale_load_down(se->load.weight);
1086
1087	/ when this task is enqueued, it will contribute to its cfs_rq's load_avg /
1088	}
1089
1090	/*
1091	* With new tasks being created, their initial util_avgs are extrapolated
1092	* based on the cfs_rq's current util_avg:
1093	*
1094	* util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1)
1095	* * se_weight(se)
1096	*
1097	* However, in many cases, the above util_avg does not give a desired
1098	* value. Moreover, the sum of the util_avgs may be divergent, such
1099	* as when the series is a harmonic series.
1100	*
1101	* To solve this problem, we also cap the util_avg of successive tasks to
1102	* only 1/2 of the left utilization budget:
1103	*
1104	* util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
1105	*
1106	* where n denotes the nth task and cpu_scale the CPU capacity.
1107	*
1108	* For example, for a CPU with 1024 of capacity, a simplest series from
1109	* the beginning would be like:
1110	*
1111	* task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
1112	* cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
1113	*
1114	* Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
1115	* if util_avg > util_avg_cap.
1116	*/
1117	void post_init_entity_util_avg(struct task_struct *p)
1118	{
1119	struct sched_entity *se = &p->se;
1120	struct cfs_rq *cfs_rq = cfs_rq_of(se);
1121	struct sched_avg *sa = &se->avg;
1122	long cpu_scale = arch_scale_cpu_capacity(cpu: cpu_of(rq: rq_of(cfs_rq)));
1123	long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / `2`;
1124
1125	if (p->sched_class != &fair_sched_class) {
1126	/*
1127	* For !fair tasks do:
1128	*
1129	update_cfs_rq_load_avg(now, cfs_rq);
1130	attach_entity_load_avg(cfs_rq, se);
1131	switched_from_fair(rq, p);
1132	*
1133	* such that the next switched_to_fair() has the
1134	* expected state.
1135	*/
1136	se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
1137	return;
1138	}
1139
1140	if (cap > `0`) {
1141	if (cfs_rq->avg.util_avg != `0`) {
1142	sa->util_avg = cfs_rq->avg.util_avg * se_weight(se);
1143	sa->util_avg /= (cfs_rq->avg.load_avg + `1`);
1144
1145	if (sa->util_avg > cap)
1146	sa->util_avg = cap;
1147	} else {
1148	sa->util_avg = cap;
1149	}
1150	}
1151
1152	sa->runnable_avg = sa->util_avg;
1153	}
1154
1155	static s64 update_se(struct rq rq, struct* sched_entity *se)
1156	{
1157	u64 now = rq_clock_task(rq);
1158	s64 delta_exec;
1159
1160	delta_exec = now - se->exec_start;
1161	if (unlikely(delta_exec <= `0`))
1162	return delta_exec;
1163
1164	se->exec_start = now;
1165	if (entity_is_task(se)) {
1166	struct task_struct *donor = task_of(se);
1167	struct task_struct *running = rq->curr;
1168	/*
1169	* If se is a task, we account the time against the running
1170	* task, as w/ proxy-exec they may not be the same.
1171	*/
1172	running->se.exec_start = now;
1173	running->se.sum_exec_runtime += delta_exec;
1174
1175	trace_sched_stat_runtime(tsk: running, runtime: delta_exec);
1176	account_group_exec_runtime(tsk: running, ns: delta_exec);
1177
1178	/ cgroup time is always accounted against the donor /
1179	cgroup_account_cputime(task: donor, delta_exec);
1180	} else {
1181	/ If not task, account the time against donor se /
1182	se->sum_exec_runtime += delta_exec;
1183	}
1184
1185	if (schedstat_enabled()) {
1186	struct sched_statistics *stats;
1187
1188	stats = __schedstats_from_se(se);
1189	__schedstat_set(stats->exec_max,
1190	max(delta_exec, stats->exec_max));
1191	}
1192
1193	return delta_exec;
1194	}
1195
1196	/*
1197	* Used by other classes to account runtime.
1198	*/
1199	s64 update_curr_common(struct rq *rq)
1200	{
1201	return update_se(rq, se: &rq->donor->se);
1202	}
1203
1204	/*
1205	* Update the current task's runtime statistics.
1206	*/
1207	static void update_curr(struct cfs_rq *cfs_rq)
1208	{
1209	/*
1210	* Note: cfs_rq->curr corresponds to the task picked to
1211	* run (ie: rq->donor.se) which due to proxy-exec may
1212	* not necessarily be the actual task running
1213	* (rq->curr.se). This is easy to confuse!
1214	*/
1215	struct sched_entity *curr = cfs_rq->curr;
1216	struct rq *rq = rq_of(cfs_rq);
1217	s64 delta_exec;
1218	bool resched;
1219
1220	if (unlikely(!curr))
1221	return;
1222
1223	delta_exec = update_se(rq, se: curr);
1224	if (unlikely(delta_exec <= `0`))
1225	return;
1226
1227	curr->vruntime += calc_delta_fair(delta: delta_exec, se: curr);
1228	resched = update_deadline(cfs_rq, se: curr);
1229	update_min_vruntime(cfs_rq);
1230
1231	if (entity_is_task(curr)) {
1232	/*
1233	* If the fair_server is active, we need to account for the
1234	* fair_server time whether or not the task is running on
1235	* behalf of fair_server or not:
1236	* - If the task is running on behalf of fair_server, we need
1237	* to limit its time based on the assigned runtime.
1238	* - Fair task that runs outside of fair_server should account
1239	* against fair_server such that it can account for this time
1240	* and possibly avoid running this period.
1241	*/
1242	if (dl_server_active(dl_se: &rq->fair_server))
1243	dl_server_update(dl_se: &rq->fair_server, delta_exec);
1244	}
1245
1246	account_cfs_rq_runtime(cfs_rq, delta_exec);
1247
1248	if (cfs_rq->nr_queued == `1`)
1249	return;
1250
1251	if (resched \|\| !protect_slice(se: curr)) {
1252	resched_curr_lazy(rq);
1253	clear_buddies(cfs_rq, se: curr);
1254	}
1255	}
1256
1257	static void update_curr_fair(struct rq *rq)
1258	{
1259	update_curr(cfs_rq: cfs_rq_of(se: &rq->donor->se));
1260	}
1261
1262	static inline void
1263	update_stats_wait_start_fair(struct cfs_rq cfs_rq, struct* sched_entity *se)
1264	{
1265	struct sched_statistics *stats;
1266	struct task_struct *p = NULL;
1267
1268	if (!schedstat_enabled())
1269	return;
1270
1271	stats = __schedstats_from_se(se);
1272
1273	if (entity_is_task(se))
1274	p = task_of(se);
1275
1276	__update_stats_wait_start(rq: rq_of(cfs_rq), p, stats);
1277	}
1278
1279	static inline void
1280	update_stats_wait_end_fair(struct cfs_rq cfs_rq, struct* sched_entity *se)
1281	{
1282	struct sched_statistics *stats;
1283	struct task_struct *p = NULL;
1284
1285	if (!schedstat_enabled())
1286	return;
1287
1288	stats = __schedstats_from_se(se);
1289
1290	/*
1291	* When the sched_schedstat changes from 0 to 1, some sched se
1292	* maybe already in the runqueue, the se->statistics.wait_start
1293	* will be 0.So it will let the delta wrong. We need to avoid this
1294	* scenario.
1295	*/
1296	if (unlikely(!schedstat_val(stats->wait_start)))
1297	return;
1298
1299	if (entity_is_task(se))
1300	p = task_of(se);
1301
1302	__update_stats_wait_end(rq: rq_of(cfs_rq), p, stats);
1303	}
1304
1305	static inline void
1306	update_stats_enqueue_sleeper_fair(struct cfs_rq cfs_rq, struct* sched_entity *se)
1307	{
1308	struct sched_statistics *stats;
1309	struct task_struct *tsk = NULL;
1310
1311	if (!schedstat_enabled())
1312	return;
1313
1314	stats = __schedstats_from_se(se);
1315
1316	if (entity_is_task(se))
1317	tsk = task_of(se);
1318
1319	__update_stats_enqueue_sleeper(rq: rq_of(cfs_rq), p: tsk, stats);
1320	}
1321
1322	/*
1323	* Task is being enqueued - update stats:
1324	*/
1325	static inline void
1326	update_stats_enqueue_fair(struct cfs_rq cfs_rq, struct* sched_entity se, int* flags)
1327	{
1328	if (!schedstat_enabled())
1329	return;
1330
1331	/*
1332	* Are we enqueueing a waiting task? (for current tasks
1333	* a dequeue/enqueue event is a NOP)
1334	*/
1335	if (se != cfs_rq->curr)
1336	update_stats_wait_start_fair(cfs_rq, se);
1337
1338	if (flags & ENQUEUE_WAKEUP)
1339	update_stats_enqueue_sleeper_fair(cfs_rq, se);
1340	}
1341
1342	static inline void
1343	update_stats_dequeue_fair(struct cfs_rq cfs_rq, struct* sched_entity se, int* flags)
1344	{
1345
1346	if (!schedstat_enabled())
1347	return;
1348
1349	/*
1350	* Mark the end of the wait period if dequeueing a
1351	* waiting task:
1352	*/
1353	if (se != cfs_rq->curr)
1354	update_stats_wait_end_fair(cfs_rq, se);
1355
1356	if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
1357	struct task_struct *tsk = task_of(se);
1358	unsigned int state;
1359
1360	/ XXX racy against TTWU /
1361	state = READ_ONCE(tsk->__state);
1362	if (state & TASK_INTERRUPTIBLE)
1363	__schedstat_set(tsk->stats.sleep_start,
1364	rq_clock(rq_of(cfs_rq)));
1365	if (state & TASK_UNINTERRUPTIBLE)
1366	__schedstat_set(tsk->stats.block_start,
1367	rq_clock(rq_of(cfs_rq)));
1368	}
1369	}
1370
1371	/*
1372	* We are picking a new current task - update its stats:
1373	*/
1374	static inline void
1375	update_stats_curr_start(struct cfs_rq cfs_rq, struct* sched_entity *se)
1376	{
1377	/*
1378	* We are starting a new run period:
1379	*/
1380	se->exec_start = rq_clock_task(rq: rq_of(cfs_rq));
1381	}
1382
1383	/**************************************************
1384	* Scheduling class queueing methods:
1385	*/
1386
/*
 * is_core_idle - report whether all SMT siblings of @cpu are idle.
 *
 * @cpu itself is skipped; only the other CPUs in cpu_smt_mask(@cpu) are
 * checked with idle_cpu(). Without CONFIG_SCHED_SMT there is nothing to
 * check and the function always returns true.
 */
static inline bool is_core_idle(int cpu)
{
#ifdef CONFIG_SCHED_SMT
	int sibling;

	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
		if (cpu == sibling)
			continue;

		if (!idle_cpu(sibling))
			return false;
	}
#endif

	return true;
}
1403
1404	#ifdef CONFIG_NUMA
#define NUMA_IMBALANCE_MIN 2

/*
 * adjust_numa_imbalance - decide whether a NUMA imbalance should be ignored.
 * @imbalance: the imbalance computed by the caller.
 * @dst_running: number of tasks running on the destination.
 * @imb_numa_nr: threshold of running tasks up to which an imbalance is allowed.
 *
 * Returns 0 when the imbalance is tolerated, otherwise @imbalance unchanged.
 */
static inline long
adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
{
	/*
	 * Allow a NUMA imbalance if busy CPUs is less than the maximum
	 * threshold. Above this threshold, individual tasks may be contending
	 * for both memory bandwidth and any shared HT resources. This is an
	 * approximation as the number of running tasks may not be related to
	 * the number of busy CPUs due to sched_setaffinity.
	 */
	if (dst_running > imb_numa_nr)
		return imbalance;

	/*
	 * Allow a small imbalance based on a simple pair of communicating
	 * tasks that remain local when the destination is lightly loaded.
	 */
	if (imbalance <= NUMA_IMBALANCE_MIN)
		return 0;

	return imbalance;
}
1429	#endif /* CONFIG_NUMA */
1430
1431	#ifdef CONFIG_NUMA_BALANCING
1432	/*
1433	* Approximate time to scan a full NUMA task in ms. The task scan period is
1434	* calculated based on the tasks virtual memory size and
1435	* numa_balancing_scan_size.
1436	*/
1437	unsigned int sysctl_numa_balancing_scan_period_min = `1000`;
1438	unsigned int sysctl_numa_balancing_scan_period_max = `60000`;
1439
1440	/ Portion of address space to scan in MB /
1441	unsigned int sysctl_numa_balancing_scan_size = `256`;
1442
1443	/ Scan @scan_size MB every @scan_period after an initial @scan_delay in ms /
1444	unsigned int sysctl_numa_balancing_scan_delay = `1000`;
1445
1446	/ The page with hint page fault latency < threshold in ms is considered hot /
1447	unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
1448
1449	struct numa_group {
1450	refcount_t refcount;
1451
1452	spinlock_t lock; / nr_tasks, tasks /
1453	int nr_tasks;
1454	pid_t gid;
1455	int active_nodes;
1456
1457	struct rcu_head rcu;
1458	unsigned long total_faults;
1459	unsigned long max_faults_cpu;
1460	/*
1461	* faults[] array is split into two regions: faults_mem and faults_cpu.
1462	*
1463	* Faults_cpu is used to decide whether memory should move
1464	* towards the CPU. As a consequence, these stats are weighted
1465	* more by CPU use than by memory faults.
1466	*/
1467	unsigned long faults[];
1468	};
1469
1470	/*
1471	* For functions that can be called in multiple contexts that permit reading
1472	* ->numa_group (see struct task_struct for locking rules).
1473	*/
1474	static struct numa_group deref_task_numa_group(struct* task_struct *p)
1475	{
1476	return rcu_dereference_check(p->numa_group, p == current \|\|
1477	(lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
1478	}
1479
1480	static struct numa_group deref_curr_numa_group(struct* task_struct *p)
1481	{
1482	return rcu_dereference_protected(p->numa_group, p == current);
1483	}
1484
1485	static inline unsigned long group_faults_priv(struct numa_group *ng);
1486	static inline unsigned long group_faults_shared(struct numa_group *ng);
1487
1488	static unsigned int task_nr_scan_windows(struct task_struct *p)
1489	{
1490	unsigned long rss = `0`;
1491	unsigned long nr_scan_pages;
1492
1493	/*
1494	* Calculations based on RSS as non-present and empty pages are skipped
1495	* by the PTE scanner and NUMA hinting faults should be trapped based
1496	* on resident pages
1497	*/
1498	nr_scan_pages = MB_TO_PAGES(sysctl_numa_balancing_scan_size);
1499	rss = get_mm_rss(p->mm);
1500	if (!rss)
1501	rss = nr_scan_pages;
1502
1503	rss = round_up(rss, nr_scan_pages);
1504	return rss / nr_scan_pages;
1505	}
1506
1507	/ For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. /
1508	#define MAX_SCAN_WINDOW 2560
1509
1510	static unsigned int task_scan_min(struct task_struct *p)
1511	{
1512	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1513	unsigned int scan, floor;
1514	unsigned int windows = `1`;
1515
1516	if (scan_size < MAX_SCAN_WINDOW)
1517	windows = MAX_SCAN_WINDOW / scan_size;
1518	floor = `1000` / windows;
1519
1520	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1521	return max_t(unsigned int, floor, scan);
1522	}
1523
1524	static unsigned int task_scan_start(struct task_struct *p)
1525	{
1526	unsigned long smin = task_scan_min(p);
1527	unsigned long period = smin;
1528	struct numa_group *ng;
1529
1530	/ Scale the maximum scan period with the amount of shared memory. /
1531	rcu_read_lock();
1532	ng = rcu_dereference(p->numa_group);
1533	if (ng) {
1534	unsigned long shared = group_faults_shared(ng);
1535	unsigned long private = group_faults_priv(ng);
1536
1537	period *= refcount_read(&ng->refcount);
1538	period *= shared + `1`;
1539	period /= private + shared + `1`;
1540	}
1541	rcu_read_unlock();
1542
1543	return max(smin, period);
1544	}
1545
1546	static unsigned int task_scan_max(struct task_struct *p)
1547	{
1548	unsigned long smin = task_scan_min(p);
1549	unsigned long smax;
1550	struct numa_group *ng;
1551
1552	/ Watch for min being lower than max due to floor calculations /
1553	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1554
1555	/ Scale the maximum scan period with the amount of shared memory. /
1556	ng = deref_curr_numa_group(p);
1557	if (ng) {
1558	unsigned long shared = group_faults_shared(ng);
1559	unsigned long private = group_faults_priv(ng);
1560	unsigned long period = smax;
1561
1562	period *= refcount_read(&ng->refcount);
1563	period *= shared + `1`;
1564	period /= private + shared + `1`;
1565
1566	smax = max(smax, period);
1567	}
1568
1569	return max(smin, smax);
1570	}
1571
1572	static void account_numa_enqueue(struct rq rq, struct* task_struct *p)
1573	{
1574	rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
1575	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1576	}
1577
1578	static void account_numa_dequeue(struct rq rq, struct* task_struct *p)
1579	{
1580	rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
1581	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1582	}
1583
/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2

/* Memory and CPU locality */
#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)

/* Averaged statistics, and temporary buffers. */
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1592
1593	pid_t task_numa_group_id(struct task_struct *p)
1594	{
1595	struct numa_group *ng;
1596	pid_t gid = `0`;
1597
1598	rcu_read_lock();
1599	ng = rcu_dereference(p->numa_group);
1600	if (ng)
1601	gid = ng->gid;
1602	rcu_read_unlock();
1603
1604	return gid;
1605	}
1606
1607	/*
1608	* The averaged statistics, shared & private, memory & CPU,
1609	* occupy the first half of the array. The second half of the
1610	* array is for current counters, which are averaged into the
1611	* first set by task_numa_placement.
1612	*/
1613	static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1614	{
1615	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1616	}
1617
1618	static inline unsigned long task_faults(struct task_struct p, int* nid)
1619	{
1620	if (!p->numa_faults)
1621	return `0`;
1622
1623	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, `0`)] +
1624	p->numa_faults[task_faults_idx(NUMA_MEM, nid, `1`)];
1625	}
1626
1627	static inline unsigned long group_faults(struct task_struct p, int* nid)
1628	{
1629	struct numa_group *ng = deref_task_numa_group(p);
1630
1631	if (!ng)
1632	return `0`;
1633
1634	return ng->faults[task_faults_idx(NUMA_MEM, nid, `0`)] +
1635	ng->faults[task_faults_idx(NUMA_MEM, nid, `1`)];
1636	}
1637
1638	static inline unsigned long group_faults_cpu(struct numa_group group, int* nid)
1639	{
1640	return group->faults[task_faults_idx(NUMA_CPU, nid, `0`)] +
1641	group->faults[task_faults_idx(NUMA_CPU, nid, `1`)];
1642	}
1643
1644	static inline unsigned long group_faults_priv(struct numa_group *ng)
1645	{
1646	unsigned long faults = `0`;
1647	int node;
1648
1649	for_each_online_node(node) {
1650	faults += ng->faults[task_faults_idx(NUMA_MEM, node, `1`)];
1651	}
1652
1653	return faults;
1654	}
1655
1656	static inline unsigned long group_faults_shared(struct numa_group *ng)
1657	{
1658	unsigned long faults = `0`;
1659	int node;
1660
1661	for_each_online_node(node) {
1662	faults += ng->faults[task_faults_idx(NUMA_MEM, node, `0`)];
1663	}
1664
1665	return faults;
1666	}
1667
1668	/*
1669	* A node triggering more than 1/3 as many NUMA faults as the maximum is
1670	* considered part of a numa group's pseudo-interleaving set. Migrations
1671	* between these nodes are slowed down, to allow things to settle down.
1672	*/
1673	#define ACTIVE_NODE_FRACTION 3
1674
1675	static bool numa_is_active_node(int nid, struct numa_group *ng)
1676	{
1677	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1678	}
1679
1680	/ Handle placement on systems where not all nodes are directly connected. /
1681	static unsigned long score_nearby_nodes(struct task_struct p, int* nid,
1682	int lim_dist, bool task)
1683	{
1684	unsigned long score = `0`;
1685	int node, max_dist;
1686
1687	/*
1688	* All nodes are directly connected, and the same distance
1689	* from each other. No need for fancy placement algorithms.
1690	*/
1691	if (sched_numa_topology_type == NUMA_DIRECT)
1692	return `0`;
1693
1694	/ sched_max_numa_distance may be changed in parallel. /
1695	max_dist = READ_ONCE(sched_max_numa_distance);
1696	/*
1697	* This code is called for each node, introducing N^2 complexity,
1698	* which should be OK given the number of nodes rarely exceeds 8.
1699	*/
1700	for_each_online_node(node) {
1701	unsigned long faults;
1702	int dist = node_distance(nid, node);
1703
1704	/*
1705	* The furthest away nodes in the system are not interesting
1706	* for placement; nid was already counted.
1707	*/
1708	if (dist >= max_dist \|\| node == nid)
1709	continue;
1710
1711	/*
1712	* On systems with a backplane NUMA topology, compare groups
1713	* of nodes, and move tasks towards the group with the most
1714	* memory accesses. When comparing two nodes at distance
1715	* "hoplimit", only nodes closer by than "hoplimit" are part
1716	* of each group. Skip other nodes.
1717	*/
1718	if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
1719	continue;
1720
1721	/ Add up the faults from nearby nodes. /
1722	if (task)
1723	faults = task_faults(p, node);
1724	else
1725	faults = group_faults(p, node);
1726
1727	/*
1728	* On systems with a glueless mesh NUMA topology, there are
1729	* no fixed "groups of nodes". Instead, nodes that are not
1730	* directly connected bounce traffic through intermediate
1731	* nodes; a numa_group can occupy any set of nodes.
1732	* The further away a node is, the less the faults count.
1733	* This seems to result in good task placement.
1734	*/
1735	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1736	faults *= (max_dist - dist);
1737	faults /= (max_dist - LOCAL_DISTANCE);
1738	}
1739
1740	score += faults;
1741	}
1742
1743	return score;
1744	}
1745
1746	/*
1747	* These return the fraction of accesses done by a particular task, or
1748	* task group, on a particular numa node. The group weight is given a
1749	* larger multiplier, in order to group tasks together that are almost
1750	* evenly spread out between numa nodes.
1751	*/
1752	static inline unsigned long task_weight(struct task_struct p, int* nid,
1753	int dist)
1754	{
1755	unsigned long faults, total_faults;
1756
1757	if (!p->numa_faults)
1758	return `0`;
1759
1760	total_faults = p->total_numa_faults;
1761
1762	if (!total_faults)
1763	return `0`;
1764
1765	faults = task_faults(p, nid);
1766	faults += score_nearby_nodes(p, nid, dist, true);
1767
1768	return `1000` * faults / total_faults;
1769	}
1770
1771	static inline unsigned long group_weight(struct task_struct p, int* nid,
1772	int dist)
1773	{
1774	struct numa_group *ng = deref_task_numa_group(p);
1775	unsigned long faults, total_faults;
1776
1777	if (!ng)
1778	return `0`;
1779
1780	total_faults = ng->total_faults;
1781
1782	if (!total_faults)
1783	return `0`;
1784
1785	faults = group_faults(p, nid);
1786	faults += score_nearby_nodes(p, nid, dist, false);
1787
1788	return `1000` * faults / total_faults;
1789	}
1790
1791	/*
1792	* If memory tiering mode is enabled, cpupid of slow memory page is
1793	* used to record scan time instead of CPU and PID. When tiering mode
1794	* is disabled at run time, the scan time (in cpupid) will be
1795	* interpreted as CPU and PID. So CPU needs to be checked to avoid to
1796	* access out of array bound.
1797	*/
1798	static inline bool cpupid_valid(int cpupid)
1799	{
1800	return cpupid_to_cpu(cpupid) < nr_cpu_ids;
1801	}
1802
1803	/*
1804	* For memory tiering mode, if there are enough free pages (more than
1805	* enough watermark defined here) in fast memory node, to take full
1806	* advantage of fast memory capacity, all recently accessed slow
1807	* memory pages will be migrated to fast memory node without
1808	* considering hot threshold.
1809	*/
1810	static bool pgdat_free_space_enough(struct pglist_data *pgdat)
1811	{
1812	int z;
1813	unsigned long enough_wmark;
1814
1815	enough_wmark = max(`1UL` * `1024` * `1024` * `1024` >> PAGE_SHIFT,
1816	pgdat->node_present_pages >> `4`);
1817	for (z = pgdat->nr_zones - `1`; z >= `0`; z--) {
1818	struct zone *zone = pgdat->node_zones + z;
1819
1820	if (!populated_zone(zone))
1821	continue;
1822
1823	if (zone_watermark_ok(zone, `0`,
1824	promo_wmark_pages(zone) + enough_wmark,
1825	ZONE_MOVABLE, `0`))
1826	return true;
1827	}
1828	return false;
1829	}
1830
1831	/*
1832	* For memory tiering mode, when page tables are scanned, the scan
1833	* time will be recorded in struct page in addition to make page
1834	* PROT_NONE for slow memory page. So when the page is accessed, in
1835	* hint page fault handler, the hint page fault latency is calculated
1836	* via,
1837	*
1838	* hint page fault latency = hint page fault time - scan time
1839	*
1840	* The smaller the hint page fault latency, the higher the possibility
1841	* for the page to be hot.
1842	*/
1843	static int numa_hint_fault_latency(struct folio *folio)
1844	{
1845	int last_time, time;
1846
1847	time = jiffies_to_msecs(jiffies);
1848	last_time = folio_xchg_access_time(folio, time);
1849
1850	return (time - last_time) & PAGE_ACCESS_TIME_MASK;
1851	}
1852
1853	/*
1854	* For memory tiering mode, too high promotion/demotion throughput may
1855	* hurt application latency. So we provide a mechanism to rate limit
1856	* the number of pages that are tried to be promoted.
1857	*/
1858	static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
1859	unsigned long rate_limit, int nr)
1860	{
1861	unsigned long nr_cand;
1862	unsigned int now, start;
1863
1864	now = jiffies_to_msecs(jiffies);
1865	mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
1866	nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
1867	start = pgdat->nbp_rl_start;
1868	if (now - start > MSEC_PER_SEC &&
1869	cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
1870	pgdat->nbp_rl_nr_cand = nr_cand;
1871	if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
1872	return true;
1873	return false;
1874	}
1875
1876	#define NUMA_MIGRATION_ADJUST_STEPS 16
1877
1878	static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
1879	unsigned long rate_limit,
1880	unsigned int ref_th)
1881	{
1882	unsigned int now, start, th_period, unit_th, th;
1883	unsigned long nr_cand, ref_cand, diff_cand;
1884
1885	now = jiffies_to_msecs(jiffies);
1886	th_period = sysctl_numa_balancing_scan_period_max;
1887	start = pgdat->nbp_th_start;
1888	if (now - start > th_period &&
1889	cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
1890	ref_cand = rate_limit *
1891	sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
1892	nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
1893	diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
1894	unit_th = ref_th * `2` / NUMA_MIGRATION_ADJUST_STEPS;
1895	th = pgdat->nbp_threshold ? : ref_th;
1896	if (diff_cand > ref_cand * `11` / `10`)
1897	th = max(th - unit_th, unit_th);
1898	else if (diff_cand < ref_cand * `9` / `10`)
1899	th = min(th + unit_th, ref_th * `2`);
1900	pgdat->nbp_th_nr_cand = nr_cand;
1901	pgdat->nbp_threshold = th;
1902	}
1903	}
1904
1905	bool should_numa_migrate_memory(struct task_struct p, struct* folio *folio,
1906	int src_nid, int dst_cpu)
1907	{
1908	struct numa_group *ng = deref_curr_numa_group(p);
1909	int dst_nid = cpu_to_node(dst_cpu);
1910	int last_cpupid, this_cpupid;
1911
1912	/*
1913	* Cannot migrate to memoryless nodes.
1914	*/
1915	if (!node_state(dst_nid, N_MEMORY))
1916	return false;
1917
1918	/*
1919	* The pages in slow memory node should be migrated according
1920	* to hot/cold instead of private/shared.
1921	*/
1922	if (folio_use_access_time(folio)) {
1923	struct pglist_data *pgdat;
1924	unsigned long rate_limit;
1925	unsigned int latency, th, def_th;
1926	long nr = folio_nr_pages(folio);
1927
1928	pgdat = NODE_DATA(dst_nid);
1929	if (pgdat_free_space_enough(pgdat)) {
1930	/ workload changed, reset hot threshold /
1931	pgdat->nbp_threshold = `0`;
1932	mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE_NRL, nr);
1933	return true;
1934	}
1935
1936	def_th = sysctl_numa_balancing_hot_threshold;
1937	rate_limit = MB_TO_PAGES(sysctl_numa_balancing_promote_rate_limit);
1938	numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
1939
1940	th = pgdat->nbp_threshold ? : def_th;
1941	latency = numa_hint_fault_latency(folio);
1942	if (latency >= th)
1943	return false;
1944
1945	return !numa_promotion_rate_limit(pgdat, rate_limit, nr);
1946	}
1947
1948	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1949	last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid);
1950
1951	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
1952	!node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
1953	return false;
1954
1955	/*
1956	* Allow first faults or private faults to migrate immediately early in
1957	* the lifetime of a task. The magic number 4 is based on waiting for
1958	* two full passes of the "multi-stage node selection" test that is
1959	* executed below.
1960	*/
1961	if ((p->numa_preferred_nid == NUMA_NO_NODE \|\| p->numa_scan_seq <= `4`) &&
1962	(cpupid_pid_unset(last_cpupid) \|\| cpupid_match_pid(p, last_cpupid)))
1963	return true;
1964
1965	/*
1966	* Multi-stage node selection is used in conjunction with a periodic
1967	* migration fault to build a temporal task<->page relation. By using
1968	* a two-stage filter we remove short/unlikely relations.
1969	*
1970	* Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1971	* a task's usage of a particular page (n_p) per total usage of this
1972	* page (n_t) (in a given time-span) to a probability.
1973	*
1974	* Our periodic faults will sample this probability and getting the
1975	* same result twice in a row, given these samples are fully
1976	* independent, is then given by P(n)^2, provided our sample period
1977	* is sufficiently short compared to the usage pattern.
1978	*
1979	* This quadric squishes small probabilities, making it less likely we
1980	* act on an unlikely task<->page relation.
1981	*/
1982	if (!cpupid_pid_unset(last_cpupid) &&
1983	cpupid_to_nid(last_cpupid) != dst_nid)
1984	return false;
1985
1986	/ Always allow migrate on private faults /
1987	if (cpupid_match_pid(p, last_cpupid))
1988	return true;
1989
1990	/ A shared fault, but p->numa_group has not been set up yet. /
1991	if (!ng)
1992	return true;
1993
1994	/*
1995	* Destination node is much more heavily used than the source
1996	* node? Allow migration.
1997	*/
1998	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1999	ACTIVE_NODE_FRACTION)
2000	return true;
2001
2002	/*
2003	* Distribute memory according to CPU & memory use on each node,
2004	* with 3/4 hysteresis to avoid unnecessary memory migrations:
2005	*
2006	* faults_cpu(dst) 3 faults_cpu(src)
2007	* --------------- * - > ---------------
2008	* faults_mem(dst) 4 faults_mem(src)
2009	*/
2010	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * `3` >
2011	group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * `4`;
2012	}
2013
/*
 * 'numa_type' describes the node at the moment of load balancing.
 */
enum numa_type {
	/* The node has spare capacity that can be used to run more tasks.  */
	node_has_spare = 0,
	/*
	 * The node is fully used and the tasks don't compete for more CPU
	 * cycles. Nevertheless, some tasks might wait before running.
	 */
	node_fully_busy,
	/*
	 * The node is overloaded and can't provide expected CPU cycles to all
	 * tasks.
	 */
	node_overloaded
};
2031
2032	/ Cached statistics for all CPUs within a node /
2033	struct numa_stats {
2034	unsigned long load;
2035	unsigned long runnable;
2036	unsigned long util;
2037	/ Total compute capacity of CPUs on a node /
2038	unsigned long compute_capacity;
2039	unsigned int nr_running;
2040	unsigned int weight;
2041	enum numa_type node_type;
2042	int idle_cpu;
2043	};
2044
2045	struct task_numa_env {
2046	struct task_struct *p;
2047
2048	int src_cpu, src_nid;
2049	int dst_cpu, dst_nid;
2050	int imb_numa_nr;
2051
2052	struct numa_stats src_stats, dst_stats;
2053
2054	int imbalance_pct;
2055	int dist;
2056
2057	struct task_struct *best_task;
2058	long best_imp;
2059	int best_cpu;
2060	};
2061
2062	static unsigned long cpu_load(struct rq *rq);
2063	static unsigned long cpu_runnable(struct rq *rq);
2064
2065	static inline enum
2066	numa_type numa_classify(unsigned int imbalance_pct,
2067	struct numa_stats *ns)
2068	{
2069	if ((ns->nr_running > ns->weight) &&
2070	(((ns->compute_capacity * `100`) < (ns->util * imbalance_pct)) \|\|
2071	((ns->compute_capacity * imbalance_pct) < (ns->runnable * `100`))))
2072	return node_overloaded;
2073
2074	if ((ns->nr_running < ns->weight) \|\|
2075	(((ns->compute_capacity * `100`) > (ns->util * imbalance_pct)) &&
2076	((ns->compute_capacity * imbalance_pct) > (ns->runnable * `100`))))
2077	return node_has_spare;
2078
2079	return node_fully_busy;
2080	}
2081
#ifdef CONFIG_SCHED_SMT
/* Forward declarations of select_idle_sibling helpers */
static inline bool test_idle_cores(int cpu);

/*
 * numa_idle_core - remember @cpu as the idle core if none is known yet.
 * @idle_core: idle core found so far, negative when none.
 * @cpu: candidate CPU.
 *
 * Returns @idle_core unchanged when SMT is not present, a core was already
 * chosen, or test_idle_cores() fails for @cpu; otherwise returns @cpu if
 * its whole core is idle.
 */
static inline int numa_idle_core(int idle_core, int cpu)
{
	if (!static_branch_likely(&sched_smt_present) ||
	    idle_core >= 0 || !test_idle_cores(cpu))
		return idle_core;

	/*
	 * Prefer cores instead of packing HT siblings
	 * and triggering future load balancing.
	 */
	if (is_core_idle(cpu))
		idle_core = cpu;

	return idle_core;
}
#else /* !CONFIG_SCHED_SMT: */
/* Without SMT there are no sibling threads to consider: pass @idle_core through. */
static inline int numa_idle_core(int idle_core, int cpu)
{
	return idle_core;
}
#endif /* !CONFIG_SCHED_SMT */
2106
2107	/*
2108	* Gather all necessary information to make NUMA balancing placement
2109	* decisions that are compatible with standard load balancer. This
2110	* borrows code and logic from update_sg_lb_stats but sharing a
2111	* common implementation is impractical.
2112	*/
2113	static void update_numa_stats(struct task_numa_env *env,
2114	struct numa_stats ns, int* nid,
2115	bool find_idle)
2116	{
2117	int cpu, idle_core = -`1`;
2118
2119	memset(ns, `0`, sizeof(*ns));
2120	ns->idle_cpu = -`1`;
2121
2122	rcu_read_lock();
2123	for_each_cpu(cpu, cpumask_of_node(nid)) {
2124	struct rq *rq = cpu_rq(cpu);
2125
2126	ns->load += cpu_load(rq);
2127	ns->runnable += cpu_runnable(rq);
2128	ns->util += cpu_util_cfs(cpu);
2129	ns->nr_running += rq->cfs.h_nr_runnable;
2130	ns->compute_capacity += capacity_of(cpu);
2131
2132	if (find_idle && idle_core < `0` && !rq->nr_running && idle_cpu(cpu)) {
2133	if (READ_ONCE(rq->numa_migrate_on) \|\|
2134	!cpumask_test_cpu(cpu, env->p->cpus_ptr))
2135	continue;
2136
2137	if (ns->idle_cpu == -`1`)
2138	ns->idle_cpu = cpu;
2139
2140	idle_core = numa_idle_core(idle_core, cpu);
2141	}
2142	}
2143	rcu_read_unlock();
2144
2145	ns->weight = cpumask_weight(cpumask_of_node(nid));
2146
2147	ns->node_type = numa_classify(env->imbalance_pct, ns);
2148
2149	if (idle_core >= `0`)
2150	ns->idle_cpu = idle_core;
2151	}
2152
2153	static void task_numa_assign(struct task_numa_env *env,
2154	struct task_struct p, long* imp)
2155	{
2156	struct rq *rq = cpu_rq(env->dst_cpu);
2157
2158	/ Check if run-queue part of active NUMA balance. /
2159	if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, `1`)) {
2160	int cpu;
2161	int start = env->dst_cpu;
2162
2163	/ Find alternative idle CPU. /
2164	for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + `1`) {
2165	if (cpu == env->best_cpu \|\| !idle_cpu(cpu) \|\|
2166	!cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
2167	continue;
2168	}
2169
2170	env->dst_cpu = cpu;
2171	rq = cpu_rq(env->dst_cpu);
2172	if (!xchg(&rq->numa_migrate_on, `1`))
2173	goto assign;
2174	}
2175
2176	/ Failed to find an alternative idle CPU /
2177	return;
2178	}
2179
2180	assign:
2181	/*
2182	* Clear previous best_cpu/rq numa-migrate flag, since task now
2183	* found a better CPU to move/swap.
2184	*/
2185	if (env->best_cpu != -`1` && env->best_cpu != env->dst_cpu) {
2186	rq = cpu_rq(env->best_cpu);
2187	WRITE_ONCE(rq->numa_migrate_on, `0`);
2188	}
2189
2190	if (env->best_task)
2191	put_task_struct(env->best_task);
2192	if (p)
2193	get_task_struct(p);
2194
2195	env->best_task = p;
2196	env->best_imp = imp;
2197	env->best_cpu = env->dst_cpu;
2198	}
2199
2200	static bool load_too_imbalanced(long src_load, long dst_load,
2201	struct task_numa_env *env)
2202	{
2203	long imb, old_imb;
2204	long orig_src_load, orig_dst_load;
2205	long src_capacity, dst_capacity;
2206
2207	/*
2208	* The load is corrected for the CPU capacity available on each node.
2209	*
2210	* src_load dst_load
2211	* ------------ vs ---------
2212	* src_capacity dst_capacity
2213	*/
2214	src_capacity = env->src_stats.compute_capacity;
2215	dst_capacity = env->dst_stats.compute_capacity;
2216
2217	imb = abs(dst_load * src_capacity - src_load * dst_capacity);
2218
2219	orig_src_load = env->src_stats.load;
2220	orig_dst_load = env->dst_stats.load;
2221
2222	old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
2223
2224	/ Would this change make things worse? /
2225	return (imb > old_imb);
2226	}
2227
2228	/*
2229	* Maximum NUMA importance can be 1998 (2*999);
2230	* SMALLIMP @ 30 would be close to 1998/64.
2231	* Used to deter task migration.
2232	*/
2233	#define SMALLIMP 30
2234
2235	/*
2236	* This checks if the overall compute and NUMA accesses of the system would
2237	* be improved if the source tasks was migrated to the target dst_cpu taking
2238	* into account that it might be best if task running on the dst_cpu should
2239	* be exchanged with the source task
2240	*/
2241	static bool task_numa_compare(struct task_numa_env *env,
2242	long taskimp, long groupimp, bool maymove)
2243	{
2244	struct numa_group cur_ng, p_ng = deref_curr_numa_group(env->p);
2245	struct rq *dst_rq = cpu_rq(env->dst_cpu);
2246	long imp = p_ng ? groupimp : taskimp;
2247	struct task_struct *cur;
2248	long src_load, dst_load;
2249	int dist = env->dist;
2250	long moveimp = imp;
2251	long load;
2252	bool stopsearch = false;
2253
2254	if (READ_ONCE(dst_rq->numa_migrate_on))
2255	return false;
2256
2257	rcu_read_lock();
2258	cur = rcu_dereference(dst_rq->curr);
2259	if (cur && ((cur->flags & (PF_EXITING \| PF_KTHREAD)) \|\|
2260	!cur->mm))
2261	cur = NULL;
2262
2263	/*
2264	* Because we have preemption enabled we can get migrated around and
2265	* end try selecting ourselves (current == env->p) as a swap candidate.
2266	*/
2267	if (cur == env->p) {
2268	stopsearch = true;
2269	goto unlock;
2270	}
2271
2272	if (!cur) {
2273	if (maymove && moveimp >= env->best_imp)
2274	goto assign;
2275	else
2276	goto unlock;
2277	}
2278
2279	/ Skip this swap candidate if cannot move to the source cpu. /
2280	if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
2281	goto unlock;
2282
2283	/*
2284	* Skip this swap candidate if it is not moving to its preferred
2285	* node and the best task is.
2286	*/
2287	if (env->best_task &&
2288	env->best_task->numa_preferred_nid == env->src_nid &&
2289	cur->numa_preferred_nid != env->src_nid) {
2290	goto unlock;
2291	}
2292
2293	/*
2294	* "imp" is the fault differential for the source task between the
2295	* source and destination node. Calculate the total differential for
2296	* the source task and potential destination task. The more negative
2297	* the value is, the more remote accesses that would be expected to
2298	* be incurred if the tasks were swapped.
2299	*
2300	* If dst and source tasks are in the same NUMA group, or not
2301	* in any group then look only at task weights.
2302	*/
2303	cur_ng = rcu_dereference(cur->numa_group);
2304	if (cur_ng == p_ng) {
2305	/*
2306	* Do not swap within a group or between tasks that have
2307	* no group if there is spare capacity. Swapping does
2308	* not address the load imbalance and helps one task at
2309	* the cost of punishing another.
2310	*/
2311	if (env->dst_stats.node_type == node_has_spare)
2312	goto unlock;
2313
2314	imp = taskimp + task_weight(cur, env->src_nid, dist) -
2315	task_weight(cur, env->dst_nid, dist);
2316	/*
2317	* Add some hysteresis to prevent swapping the
2318	* tasks within a group over tiny differences.
2319	*/
2320	if (cur_ng)
2321	imp -= imp / `16`;
2322	} else {
2323	/*
2324	* Compare the group weights. If a task is all by itself
2325	* (not part of a group), use the task weight instead.
2326	*/
2327	if (cur_ng && p_ng)
2328	imp += group_weight(cur, env->src_nid, dist) -
2329	group_weight(cur, env->dst_nid, dist);
2330	else
2331	imp += task_weight(cur, env->src_nid, dist) -
2332	task_weight(cur, env->dst_nid, dist);
2333	}
2334
2335	/ Discourage picking a task already on its preferred node /
2336	if (cur->numa_preferred_nid == env->dst_nid)
2337	imp -= imp / `16`;
2338
2339	/*
2340	* Encourage picking a task that moves to its preferred node.
2341	* This potentially makes imp larger than it's maximum of
2342	* 1998 (see SMALLIMP and task_weight for why) but in this
2343	* case, it does not matter.
2344	*/
2345	if (cur->numa_preferred_nid == env->src_nid)
2346	imp += imp / `8`;
2347
2348	if (maymove && moveimp > imp && moveimp > env->best_imp) {
2349	imp = moveimp;
2350	cur = NULL;
2351	goto assign;
2352	}
2353
2354	/*
2355	* Prefer swapping with a task moving to its preferred node over a
2356	* task that is not.
2357	*/
2358	if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
2359	env->best_task->numa_preferred_nid != env->src_nid) {
2360	goto assign;
2361	}
2362
2363	/*
2364	* If the NUMA importance is less than SMALLIMP,
2365	* task migration might only result in ping pong
2366	* of tasks and also hurt performance due to cache
2367	* misses.
2368	*/
2369	if (imp < SMALLIMP \|\| imp <= env->best_imp + SMALLIMP / `2`)
2370	goto unlock;
2371
2372	/*
2373	* In the overloaded case, try and keep the load balanced.
2374	*/
2375	load = task_h_load(env->p) - task_h_load(cur);
2376	if (!load)
2377	goto assign;
2378
2379	dst_load = env->dst_stats.load + load;
2380	src_load = env->src_stats.load - load;
2381
2382	if (load_too_imbalanced(src_load, dst_load, env))
2383	goto unlock;
2384
2385	assign:
2386	/ Evaluate an idle CPU for a task numa move. /
2387	if (!cur) {
2388	int cpu = env->dst_stats.idle_cpu;
2389
2390	/ Nothing cached so current CPU went idle since the search. /
2391	if (cpu < `0`)
2392	cpu = env->dst_cpu;
2393
2394	/*
2395	* If the CPU is no longer truly idle and the previous best CPU
2396	* is, keep using it.
2397	*/
2398	if (!idle_cpu(cpu) && env->best_cpu >= `0` &&
2399	idle_cpu(env->best_cpu)) {
2400	cpu = env->best_cpu;
2401	}
2402
2403	env->dst_cpu = cpu;
2404	}
2405
2406	task_numa_assign(env, cur, imp);
2407
2408	/*
2409	* If a move to idle is allowed because there is capacity or load
2410	* balance improves then stop the search. While a better swap
2411	* candidate may exist, a search is not free.
2412	*/
2413	if (maymove && !cur && env->best_cpu >= `0` && idle_cpu(env->best_cpu))
2414	stopsearch = true;
2415
2416	/*
2417	* If a swap candidate must be identified and the current best task
2418	* moves its preferred node then stop the search.
2419	*/
2420	if (!maymove && env->best_task &&
2421	env->best_task->numa_preferred_nid == env->src_nid) {
2422	stopsearch = true;
2423	}
2424	unlock:
2425	rcu_read_unlock();
2426
2427	return stopsearch;
2428	}
2429
2430	static void task_numa_find_cpu(struct task_numa_env *env,
2431	long taskimp, long groupimp)
2432	{
2433	bool maymove = false;
2434	int cpu;
2435
2436	/*
2437	* If dst node has spare capacity, then check if there is an
2438	* imbalance that would be overruled by the load balancer.
2439	*/
2440	if (env->dst_stats.node_type == node_has_spare) {
2441	unsigned int imbalance;
2442	int src_running, dst_running;
2443
2444	/*
2445	* Would movement cause an imbalance? Note that if src has
2446	* more running tasks that the imbalance is ignored as the
2447	* move improves the imbalance from the perspective of the
2448	* CPU load balancer.
2449	* */
2450	src_running = env->src_stats.nr_running - `1`;
2451	dst_running = env->dst_stats.nr_running + `1`;
2452	imbalance = max(`0`, dst_running - src_running);
2453	imbalance = adjust_numa_imbalance(imbalance, dst_running,
2454	env->imb_numa_nr);
2455
2456	/ Use idle CPU if there is no imbalance /
2457	if (!imbalance) {
2458	maymove = true;
2459	if (env->dst_stats.idle_cpu >= `0`) {
2460	env->dst_cpu = env->dst_stats.idle_cpu;
2461	task_numa_assign(env, NULL, `0`);
2462	return;
2463	}
2464	}
2465	} else {
2466	long src_load, dst_load, load;
2467	/*
2468	* If the improvement from just moving env->p direction is better
2469	* than swapping tasks around, check if a move is possible.
2470	*/
2471	load = task_h_load(env->p);
2472	dst_load = env->dst_stats.load + load;
2473	src_load = env->src_stats.load - load;
2474	maymove = !load_too_imbalanced(src_load, dst_load, env);
2475	}
2476
2477	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
2478	/ Skip this CPU if the source task cannot migrate /
2479	if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
2480	continue;
2481
2482	env->dst_cpu = cpu;
2483	if (task_numa_compare(env, taskimp, groupimp, maymove))
2484	break;
2485	}
2486	}
2487
2488	static int task_numa_migrate(struct task_struct *p)
2489	{
2490	struct task_numa_env env = {
2491	.p = p,
2492
2493	.src_cpu = task_cpu(p),
2494	.src_nid = task_node(p),
2495
2496	.imbalance_pct = `112`,
2497
2498	.best_task = NULL,
2499	.best_imp = `0`,
2500	.best_cpu = -`1`,
2501	};
2502	unsigned long taskweight, groupweight;
2503	struct sched_domain *sd;
2504	long taskimp, groupimp;
2505	struct numa_group *ng;
2506	struct rq *best_rq;
2507	int nid, ret, dist;
2508
2509	/*
2510	* Pick the lowest SD_NUMA domain, as that would have the smallest
2511	* imbalance and would be the first to start moving tasks about.
2512	*
2513	* And we want to avoid any moving of tasks about, as that would create
2514	* random movement of tasks -- counter the numa conditions we're trying
2515	* to satisfy here.
2516	*/
2517	rcu_read_lock();
2518	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
2519	if (sd) {
2520	env.imbalance_pct = `100` + (sd->imbalance_pct - `100`) / `2`;
2521	env.imb_numa_nr = sd->imb_numa_nr;
2522	}
2523	rcu_read_unlock();
2524
2525	/*
2526	* Cpusets can break the scheduler domain tree into smaller
2527	* balance domains, some of which do not cross NUMA boundaries.
2528	* Tasks that are "trapped" in such domains cannot be migrated
2529	* elsewhere, so there is no point in (re)trying.
2530	*/
2531	if (unlikely(!sd)) {
2532	sched_setnuma(p, task_node(p));
2533	return -EINVAL;
2534	}
2535
2536	env.dst_nid = p->numa_preferred_nid;
2537	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
2538	taskweight = task_weight(p, env.src_nid, dist);
2539	groupweight = group_weight(p, env.src_nid, dist);
2540	update_numa_stats(&env, &env.src_stats, env.src_nid, false);
2541	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
2542	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
2543	update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2544
2545	/ Try to find a spot on the preferred nid. /
2546	task_numa_find_cpu(&env, taskimp, groupimp);
2547
2548	/*
2549	* Look at other nodes in these cases:
2550	* - there is no space available on the preferred_nid
2551	* - the task is part of a numa_group that is interleaved across
2552	* multiple NUMA nodes; in order to better consolidate the group,
2553	* we need to check other locations.
2554	*/
2555	ng = deref_curr_numa_group(p);
2556	if (env.best_cpu == -`1` \|\| (ng && ng->active_nodes > `1`)) {
2557	for_each_node_state(nid, N_CPU) {
2558	if (nid == env.src_nid \|\| nid == p->numa_preferred_nid)
2559	continue;
2560
2561	dist = node_distance(env.src_nid, env.dst_nid);
2562	if (sched_numa_topology_type == NUMA_BACKPLANE &&
2563	dist != env.dist) {
2564	taskweight = task_weight(p, env.src_nid, dist);
2565	groupweight = group_weight(p, env.src_nid, dist);
2566	}
2567
2568	/ Only consider nodes where both task and groups benefit /
2569	taskimp = task_weight(p, nid, dist) - taskweight;
2570	groupimp = group_weight(p, nid, dist) - groupweight;
2571	if (taskimp < `0` && groupimp < `0`)
2572	continue;
2573
2574	env.dist = dist;
2575	env.dst_nid = nid;
2576	update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
2577	task_numa_find_cpu(&env, taskimp, groupimp);
2578	}
2579	}
2580
2581	/*
2582	* If the task is part of a workload that spans multiple NUMA nodes,
2583	* and is migrating into one of the workload's active nodes, remember
2584	* this node as the task's preferred numa node, so the workload can
2585	* settle down.
2586	* A task that migrated to a second choice node will be better off
2587	* trying for a better one later. Do not set the preferred node here.
2588	*/
2589	if (ng) {
2590	if (env.best_cpu == -`1`)
2591	nid = env.src_nid;
2592	else
2593	nid = cpu_to_node(env.best_cpu);
2594
2595	if (nid != p->numa_preferred_nid)
2596	sched_setnuma(p, nid);
2597	}
2598
2599	/ No better CPU than the current one was found. /
2600	if (env.best_cpu == -`1`) {
2601	trace_sched_stick_numa(p, env.src_cpu, NULL, -`1`);
2602	return -EAGAIN;
2603	}
2604
2605	best_rq = cpu_rq(env.best_cpu);
2606	if (env.best_task == NULL) {
2607	ret = migrate_task_to(p, env.best_cpu);
2608	WRITE_ONCE(best_rq->numa_migrate_on, `0`);
2609	if (ret != `0`)
2610	trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
2611	return ret;
2612	}
2613
2614	ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
2615	WRITE_ONCE(best_rq->numa_migrate_on, `0`);
2616
2617	if (ret != `0`)
2618	trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
2619	put_task_struct(env.best_task);
2620	return ret;
2621	}
2622
2623	/ Attempt to migrate a task to a CPU on the preferred node. /
2624	static void numa_migrate_preferred(struct task_struct *p)
2625	{
2626	unsigned long interval = HZ;
2627
2628	/ This task has no NUMA fault statistics yet /
2629	if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE \|\| !p->numa_faults))
2630	return;
2631
2632	/ Periodically retry migrating the task to the preferred node /
2633	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / `16`);
2634	p->numa_migrate_retry = jiffies + interval;
2635
2636	/ Success if task is already running on preferred CPU /
2637	if (task_node(p) == p->numa_preferred_nid)
2638	return;
2639
2640	/ Otherwise, try migrate to a CPU on the preferred node /
2641	task_numa_migrate(p);
2642	}
2643
2644	/*
2645	* Find out how many nodes the workload is actively running on. Do this by
2646	* tracking the nodes from which NUMA hinting faults are triggered. This can
2647	* be different from the set of nodes where the workload's memory is currently
2648	* located.
2649	*/
2650	static void numa_group_count_active_nodes(struct numa_group *numa_group)
2651	{
2652	unsigned long faults, max_faults = `0`;
2653	int nid, active_nodes = `0`;
2654
2655	for_each_node_state(nid, N_CPU) {
2656	faults = group_faults_cpu(numa_group, nid);
2657	if (faults > max_faults)
2658	max_faults = faults;
2659	}
2660
2661	for_each_node_state(nid, N_CPU) {
2662	faults = group_faults_cpu(numa_group, nid);
2663	if (faults * ACTIVE_NODE_FRACTION > max_faults)
2664	active_nodes++;
2665	}
2666
2667	numa_group->max_faults_cpu = max_faults;
2668	numa_group->active_nodes = active_nodes;
2669	}
2670
2671	/*
2672	* When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
2673	* increments. The more local the fault statistics are, the higher the scan
2674	* period will be for the next scan window. If local/(local+remote) ratio is
2675	* below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
2676	* the scan period will decrease. Aim for 70% local accesses.
2677	*/
2678	#define NUMA_PERIOD_SLOTS 10
2679	#define NUMA_PERIOD_THRESHOLD 7
2680
2681	/*
2682	* Increase the scan period (slow down scanning) if the majority of
2683	* our memory is already on our local node, or if the majority of
2684	* the page accesses are shared with other processes.
2685	* Otherwise, decrease the scan period.
2686	*/
2687	static void update_task_scan_period(struct task_struct *p,
2688	unsigned long shared, unsigned long private)
2689	{
2690	unsigned int period_slot;
2691	int lr_ratio, ps_ratio;
2692	int diff;
2693
2694	unsigned long remote = p->numa_faults_locality[`0`];
2695	unsigned long local = p->numa_faults_locality[`1`];
2696
2697	/*
2698	* If there were no record hinting faults then either the task is
2699	* completely idle or all activity is in areas that are not of interest
2700	* to automatic numa balancing. Related to that, if there were failed
2701	* migration then it implies we are migrating too quickly or the local
2702	* node is overloaded. In either case, scan slower
2703	*/
2704	if (local + shared == `0` \|\| p->numa_faults_locality[`2`]) {
2705	p->numa_scan_period = min(p->numa_scan_period_max,
2706	p->numa_scan_period << `1`);
2707
2708	p->mm->numa_next_scan = jiffies +
2709	msecs_to_jiffies(p->numa_scan_period);
2710
2711	return;
2712	}
2713
2714	/*
2715	* Prepare to scale scan period relative to the current period.
2716	* == NUMA_PERIOD_THRESHOLD scan period stays the same
2717	* < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
2718	* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
2719	*/
2720	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
2721	lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
2722	ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
2723
2724	if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
2725	/*
2726	* Most memory accesses are local. There is no need to
2727	* do fast NUMA scanning, since memory is already local.
2728	*/
2729	int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
2730	if (!slot)
2731	slot = `1`;
2732	diff = slot * period_slot;
2733	} else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
2734	/*
2735	* Most memory accesses are shared with other tasks.
2736	* There is no point in continuing fast NUMA scanning,
2737	* since other tasks may just move the memory elsewhere.
2738	*/
2739	int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
2740	if (!slot)
2741	slot = `1`;
2742	diff = slot * period_slot;
2743	} else {
2744	/*
2745	* Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
2746	* yet they are not on the local NUMA node. Speed up
2747	* NUMA scanning to get the memory moved over.
2748	*/
2749	int ratio = max(lr_ratio, ps_ratio);
2750	diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
2751	}
2752
2753	p->numa_scan_period = clamp(p->numa_scan_period + diff,
2754	task_scan_min(p), task_scan_max(p));
2755	memset(p->numa_faults_locality, `0`, sizeof(p->numa_faults_locality));
2756	}
2757
2758	/*
2759	* Get the fraction of time the task has been running since the last
2760	* NUMA placement cycle. The scheduler keeps similar statistics, but
2761	* decays those on a 32ms period, which is orders of magnitude off
2762	* from the dozens-of-seconds NUMA balancing period. Use the scheduler
2763	* stats only if the task is so new there are no NUMA statistics yet.
2764	*/
2765	static u64 numa_get_avg_runtime(struct task_struct p, u64 period)
2766	{
2767	u64 runtime, delta, now;
2768	/ Use the start of this time slice to avoid calculations. /
2769	now = p->se.exec_start;
2770	runtime = p->se.sum_exec_runtime;
2771
2772	if (p->last_task_numa_placement) {
2773	delta = runtime - p->last_sum_exec_runtime;
2774	*period = now - p->last_task_numa_placement;
2775
2776	/ Avoid time going backwards, prevent potential divide error: /
2777	if (unlikely((s64)*period < `0`))
2778	*period = `0`;
2779	} else {
2780	delta = p->se.avg.load_sum;
2781	*period = LOAD_AVG_MAX;
2782	}
2783
2784	p->last_sum_exec_runtime = runtime;
2785	p->last_task_numa_placement = now;
2786
2787	return delta;
2788	}
2789
2790	/*
2791	* Determine the preferred nid for a task in a numa_group. This needs to
2792	* be done in a way that produces consistent results with group_weight,
2793	* otherwise workloads might not converge.
2794	*/
2795	static int preferred_group_nid(struct task_struct p, int* nid)
2796	{
2797	nodemask_t nodes;
2798	int dist;
2799
2800	/ Direct connections between all NUMA nodes. /
2801	if (sched_numa_topology_type == NUMA_DIRECT)
2802	return nid;
2803
2804	/*
2805	* On a system with glueless mesh NUMA topology, group_weight
2806	* scores nodes according to the number of NUMA hinting faults on
2807	* both the node itself, and on nearby nodes.
2808	*/
2809	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2810	unsigned long score, max_score = `0`;
2811	int node, max_node = nid;
2812
2813	dist = sched_max_numa_distance;
2814
2815	for_each_node_state(node, N_CPU) {
2816	score = group_weight(p, node, dist);
2817	if (score > max_score) {
2818	max_score = score;
2819	max_node = node;
2820	}
2821	}
2822	return max_node;
2823	}
2824
2825	/*
2826	* Finding the preferred nid in a system with NUMA backplane
2827	* interconnect topology is more involved. The goal is to locate
2828	* tasks from numa_groups near each other in the system, and
2829	* untangle workloads from different sides of the system. This requires
2830	* searching down the hierarchy of node groups, recursively searching
2831	* inside the highest scoring group of nodes. The nodemask tricks
2832	* keep the complexity of the search down.
2833	*/
2834	nodes = node_states[N_CPU];
2835	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2836	unsigned long max_faults = `0`;
2837	nodemask_t max_group = NODE_MASK_NONE;
2838	int a, b;
2839
2840	/ Are there nodes at this distance from each other? /
2841	if (!find_numa_distance(dist))
2842	continue;
2843
2844	for_each_node_mask(a, nodes) {
2845	unsigned long faults = `0`;
2846	nodemask_t this_group;
2847	nodes_clear(this_group);
2848
2849	/ Sum group's NUMA faults; includes a==b case. /
2850	for_each_node_mask(b, nodes) {
2851	if (node_distance(a, b) < dist) {
2852	faults += group_faults(p, b);
2853	node_set(b, this_group);
2854	node_clear(b, nodes);
2855	}
2856	}
2857
2858	/ Remember the top group. /
2859	if (faults > max_faults) {
2860	max_faults = faults;
2861	max_group = this_group;
2862	/*
2863	* subtle: at the smallest distance there is
2864	* just one node left in each "group", the
2865	* winner is the preferred nid.
2866	*/
2867	nid = a;
2868	}
2869	}
2870	/ Next round, evaluate the nodes within max_group. /
2871	if (!max_faults)
2872	break;
2873	nodes = max_group;
2874	}
2875	return nid;
2876	}
2877
2878	static void task_numa_placement(struct task_struct *p)
2879	{
2880	int seq, nid, max_nid = NUMA_NO_NODE;
2881	unsigned long max_faults = `0`;
2882	unsigned long fault_types[`2`] = { `0`, `0` };
2883	unsigned long total_faults;
2884	u64 runtime, period;
2885	spinlock_t *group_lock = NULL;
2886	struct numa_group *ng;
2887
2888	/*
2889	* The p->mm->numa_scan_seq field gets updated without
2890	* exclusive access. Use READ_ONCE() here to ensure
2891	* that the field is read in a single access:
2892	*/
2893	seq = READ_ONCE(p->mm->numa_scan_seq);
2894	if (p->numa_scan_seq == seq)
2895	return;
2896	p->numa_scan_seq = seq;
2897	p->numa_scan_period_max = task_scan_max(p);
2898
2899	total_faults = p->numa_faults_locality[`0`] +
2900	p->numa_faults_locality[`1`];
2901	runtime = numa_get_avg_runtime(p, &period);
2902
2903	/ If the task is part of a group prevent parallel updates to group stats /
2904	ng = deref_curr_numa_group(p);
2905	if (ng) {
2906	group_lock = &ng->lock;
2907	spin_lock_irq(group_lock);
2908	}
2909
2910	/ Find the node with the highest number of faults /
2911	for_each_online_node(nid) {
2912	/ Keep track of the offsets in numa_faults array /
2913	int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2914	unsigned long faults = `0`, group_faults = `0`;
2915	int priv;
2916
2917	for (priv = `0`; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2918	long diff, f_diff, f_weight;
2919
2920	mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2921	membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2922	cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2923	cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2924
2925	/ Decay existing window, copy faults since last scan /
2926	diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / `2`;
2927	fault_types[priv] += p->numa_faults[membuf_idx];
2928	p->numa_faults[membuf_idx] = `0`;
2929
2930	/*
2931	* Normalize the faults_from, so all tasks in a group
2932	* count according to CPU use, instead of by the raw
2933	* number of faults. Tasks with little runtime have
2934	* little over-all impact on throughput, and thus their
2935	* faults are less important.
2936	*/
2937	f_weight = div64_u64(runtime << `16`, period + `1`);
2938	f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2939	(total_faults + `1`);
2940	f_diff = f_weight - p->numa_faults[cpu_idx] / `2`;
2941	p->numa_faults[cpubuf_idx] = `0`;
2942
2943	p->numa_faults[mem_idx] += diff;
2944	p->numa_faults[cpu_idx] += f_diff;
2945	faults += p->numa_faults[mem_idx];
2946	p->total_numa_faults += diff;
2947	if (ng) {
2948	/*
2949	* safe because we can only change our own group
2950	*
2951	* mem_idx represents the offset for a given
2952	* nid and priv in a specific region because it
2953	* is at the beginning of the numa_faults array.
2954	*/
2955	ng->faults[mem_idx] += diff;
2956	ng->faults[cpu_idx] += f_diff;
2957	ng->total_faults += diff;
2958	group_faults += ng->faults[mem_idx];
2959	}
2960	}
2961
2962	if (!ng) {
2963	if (faults > max_faults) {
2964	max_faults = faults;
2965	max_nid = nid;
2966	}
2967	} else if (group_faults > max_faults) {
2968	max_faults = group_faults;
2969	max_nid = nid;
2970	}
2971	}
2972
2973	/ Cannot migrate task to CPU-less node /
2974	max_nid = numa_nearest_node(max_nid, N_CPU);
2975
2976	if (ng) {
2977	numa_group_count_active_nodes(ng);
2978	spin_unlock_irq(group_lock);
2979	max_nid = preferred_group_nid(p, max_nid);
2980	}
2981
2982	if (max_faults) {
2983	/ Set the new preferred node /
2984	if (max_nid != p->numa_preferred_nid)
2985	sched_setnuma(p, max_nid);
2986	}
2987
2988	update_task_scan_period(p, fault_types[`0`], fault_types[`1`]);
2989	}
2990
2991	static inline int get_numa_group(struct numa_group *grp)
2992	{
2993	return refcount_inc_not_zero(&grp->refcount);
2994	}
2995
2996	static inline void put_numa_group(struct numa_group *grp)
2997	{
2998	if (refcount_dec_and_test(&grp->refcount))
2999	kfree_rcu(grp, rcu);
3000	}
3001
3002	static void task_numa_group(struct task_struct p, int* cpupid, int flags,
3003	int *priv)
3004	{
3005	struct numa_group grp, my_grp;
3006	struct task_struct *tsk;
3007	bool join = false;
3008	int cpu = cpupid_to_cpu(cpupid);
3009	int i;
3010
3011	if (unlikely(!deref_curr_numa_group(p))) {
3012	unsigned int size = sizeof(struct numa_group) +
3013	NR_NUMA_HINT_FAULT_STATS *
3014	nr_node_ids * sizeof(unsigned long);
3015
3016	grp = kzalloc(size, GFP_KERNEL \| __GFP_NOWARN);
3017	if (!grp)
3018	return;
3019
3020	refcount_set(&grp->refcount, `1`);
3021	grp->active_nodes = `1`;
3022	grp->max_faults_cpu = `0`;
3023	spin_lock_init(&grp->lock);
3024	grp->gid = p->pid;
3025
3026	for (i = `0`; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
3027	grp->faults[i] = p->numa_faults[i];
3028
3029	grp->total_faults = p->total_numa_faults;
3030
3031	grp->nr_tasks++;
3032	rcu_assign_pointer(p->numa_group, grp);
3033	}
3034
3035	rcu_read_lock();
3036	tsk = READ_ONCE(cpu_rq(cpu)->curr);
3037
3038	if (!cpupid_match_pid(tsk, cpupid))
3039	goto no_join;
3040
3041	grp = rcu_dereference(tsk->numa_group);
3042	if (!grp)
3043	goto no_join;
3044
3045	my_grp = deref_curr_numa_group(p);
3046	if (grp == my_grp)
3047	goto no_join;
3048
3049	/*
3050	* Only join the other group if its bigger; if we're the bigger group,
3051	* the other task will join us.
3052	*/
3053	if (my_grp->nr_tasks > grp->nr_tasks)
3054	goto no_join;
3055
3056	/*
3057	* Tie-break on the grp address.
3058	*/
3059	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
3060	goto no_join;
3061
3062	/ Always join threads in the same process. /
3063	if (tsk->mm == current->mm)
3064	join = true;
3065
3066	/ Simple filter to avoid false positives due to PID collisions /
3067	if (flags & TNF_SHARED)
3068	join = true;
3069
3070	/ Update priv based on whether false sharing was detected /
3071	*priv = !join;
3072
3073	if (join && !get_numa_group(grp))
3074	goto no_join;
3075
3076	rcu_read_unlock();
3077
3078	if (!join)
3079	return;
3080
3081	WARN_ON_ONCE(irqs_disabled());
3082	double_lock_irq(&my_grp->lock, &grp->lock);
3083
3084	for (i = `0`; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
3085	my_grp->faults[i] -= p->numa_faults[i];
3086	grp->faults[i] += p->numa_faults[i];
3087	}
3088	my_grp->total_faults -= p->total_numa_faults;
3089	grp->total_faults += p->total_numa_faults;
3090
3091	my_grp->nr_tasks--;
3092	grp->nr_tasks++;
3093
3094	spin_unlock(&my_grp->lock);
3095	spin_unlock_irq(&grp->lock);
3096
3097	rcu_assign_pointer(p->numa_group, grp);
3098
3099	put_numa_group(my_grp);
3100	return;
3101
3102	no_join:
3103	rcu_read_unlock();
3104	return;
3105	}
3106
3107	/*
3108	* Get rid of NUMA statistics associated with a task (either current or dead).
3109	* If @final is set, the task is dead and has reached refcount zero, so we can
3110	* safely free all relevant data structures. Otherwise, there might be
3111	* concurrent reads from places like load balancing and procfs, and we should
3112	* reset the data back to default state without freeing ->numa_faults.
3113	*/
3114	void task_numa_free(struct task_struct *p, bool final)
3115	{
3116	/ safe: p either is current or is being freed by current /
3117	struct numa_group *grp = rcu_dereference_raw(p->numa_group);
3118	unsigned long *numa_faults = p->numa_faults;
3119	unsigned long flags;
3120	int i;
3121
3122	if (!numa_faults)
3123	return;
3124
3125	if (grp) {
3126	spin_lock_irqsave(&grp->lock, flags);
3127	for (i = `0`; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
3128	grp->faults[i] -= p->numa_faults[i];
3129	grp->total_faults -= p->total_numa_faults;
3130
3131	grp->nr_tasks--;
3132	spin_unlock_irqrestore(&grp->lock, flags);
3133	RCU_INIT_POINTER(p->numa_group, NULL);
3134	put_numa_group(grp);
3135	}
3136
3137	if (final) {
3138	p->numa_faults = NULL;
3139	kfree(numa_faults);
3140	} else {
3141	p->total_numa_faults = `0`;
3142	for (i = `0`; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
3143	numa_faults[i] = `0`;
3144	}
3145	}
3146
3147	/*
3148	* Got a PROT_NONE fault for a page on @node.
3149	*/
3150	void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
3151	{
3152	struct task_struct *p = current;
3153	bool migrated = flags & TNF_MIGRATED;
3154	int cpu_node = task_node(current);
3155	int local = !!(flags & TNF_FAULT_LOCAL);
3156	struct numa_group *ng;
3157	int priv;
3158
3159	if (!static_branch_likely(&sched_numa_balancing))
3160	return;
3161
3162	/ for example, ksmd faulting in a user's mm /
3163	if (!p->mm)
3164	return;
3165
3166	/*
3167	* NUMA faults statistics are unnecessary for the slow memory
3168	* node for memory tiering mode.
3169	*/
3170	if (!node_is_toptier(mem_node) &&
3171	(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING \|\|
3172	!cpupid_valid(last_cpupid)))
3173	return;
3174
3175	/ Allocate buffer to track faults on a per-node basis /
3176	if (unlikely(!p->numa_faults)) {
3177	int size = sizeof(p->numa_faults)
3178	NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
3179
3180	p->numa_faults = kzalloc(size, GFP_KERNEL\|__GFP_NOWARN);
3181	if (!p->numa_faults)
3182	return;
3183
3184	p->total_numa_faults = `0`;
3185	memset(p->numa_faults_locality, `0`, sizeof(p->numa_faults_locality));
3186	}
3187
3188	/*
3189	* First accesses are treated as private, otherwise consider accesses
3190	* to be private if the accessing pid has not changed
3191	*/
3192	if (unlikely(last_cpupid == (-`1` & LAST_CPUPID_MASK))) {
3193	priv = `1`;
3194	} else {
3195	priv = cpupid_match_pid(p, last_cpupid);
3196	if (!priv && !(flags & TNF_NO_GROUP))
3197	task_numa_group(p, last_cpupid, flags, &priv);
3198	}
3199
3200	/*
3201	* If a workload spans multiple NUMA nodes, a shared fault that
3202	* occurs wholly within the set of nodes that the workload is
3203	* actively using should be counted as local. This allows the
3204	* scan rate to slow down when a workload has settled down.
3205	*/
3206	ng = deref_curr_numa_group(p);
3207	if (!priv && !local && ng && ng->active_nodes > `1` &&
3208	numa_is_active_node(cpu_node, ng) &&
3209	numa_is_active_node(mem_node, ng))
3210	local = `1`;
3211
3212	/*
3213	* Retry to migrate task to preferred node periodically, in case it
3214	* previously failed, or the scheduler moved us.
3215	*/
3216	if (time_after(jiffies, p->numa_migrate_retry)) {
3217	task_numa_placement(p);
3218	numa_migrate_preferred(p);
3219	}
3220
3221	if (migrated)
3222	p->numa_pages_migrated += pages;
3223	if (flags & TNF_MIGRATE_FAIL)
3224	p->numa_faults_locality[`2`] += pages;
3225
3226	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
3227	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
3228	p->numa_faults_locality[local] += pages;
3229	}
3230
3231	static void reset_ptenuma_scan(struct task_struct *p)
3232	{
3233	/*
3234	* We only did a read acquisition of the mmap sem, so
3235	* p->mm->numa_scan_seq is written to without exclusive access
3236	* and the update is not guaranteed to be atomic. That's not
3237	* much of an issue though, since this is just used for
3238	* statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
3239	* expensive, to avoid any form of compiler optimizations:
3240	*/
3241	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + `1`);
3242	p->mm->numa_scan_offset = `0`;
3243	}
3244
3245	static bool vma_is_accessed(struct mm_struct mm, struct* vm_area_struct *vma)
3246	{
3247	unsigned long pids;
3248	/*
3249	* Allow unconditional access first two times, so that all the (pages)
3250	* of VMAs get prot_none fault introduced irrespective of accesses.
3251	* This is also done to avoid any side effect of task scanning
3252	* amplifying the unfairness of disjoint set of VMAs' access.
3253	*/
3254	if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < `2`)
3255	return true;
3256
3257	pids = vma->numab_state->pids_active[`0`] \| vma->numab_state->pids_active[`1`];
3258	if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
3259	return true;
3260
3261	/*
3262	* Complete a scan that has already started regardless of PID access, or
3263	* some VMAs may never be scanned in multi-threaded applications:
3264	*/
3265	if (mm->numa_scan_offset > vma->vm_start) {
3266	trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID);
3267	return true;
3268	}
3269
3270	/*
3271	* This vma has not been accessed for a while, and if the number
3272	* the threads in the same process is low, which means no other
3273	* threads can help scan this vma, force a vma scan.
3274	*/
3275	if (READ_ONCE(mm->numa_scan_seq) >
3276	(vma->numab_state->prev_scan_seq + get_nr_threads(current)))
3277	return true;
3278
3279	return false;
3280	}
3281
3282	#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
3283
3284	/*
3285	* The expensive part of numa migration is done from task_work context.
3286	* Triggered from task_tick_numa().
3287	*/
3288	static void task_numa_work(struct callback_head *work)
3289	{
3290	unsigned long migrate, next_scan, now = jiffies;
3291	struct task_struct *p = current;
3292	struct mm_struct *mm = p->mm;
3293	u64 runtime = p->se.sum_exec_runtime;
3294	struct vm_area_struct *vma;
3295	unsigned long start, end;
3296	unsigned long nr_pte_updates = `0`;
3297	long pages, virtpages;
3298	struct vma_iterator vmi;
3299	bool vma_pids_skipped;
3300	bool vma_pids_forced = false;
3301
3302	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
3303
3304	work->next = work;
3305	/*
3306	* Who cares about NUMA placement when they're dying.
3307	*
3308	* NOTE: make sure not to dereference p->mm before this check,
3309	* exit_task_work() happens _after_ exit_mm() so we could be called
3310	* without p->mm even though we still had it when we enqueued this
3311	* work.
3312	*/
3313	if (p->flags & PF_EXITING)
3314	return;
3315
3316	/*
3317	* Memory is pinned to only one NUMA node via cpuset.mems, naturally
3318	* no page can be migrated.
3319	*/
3320	if (cpusets_enabled() && nodes_weight(cpuset_current_mems_allowed) == `1`) {
3321	trace_sched_skip_cpuset_numa(current, &cpuset_current_mems_allowed);
3322	return;
3323	}
3324
3325	if (!mm->numa_next_scan) {
3326	mm->numa_next_scan = now +
3327	msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
3328	}
3329
3330	/*
3331	* Enforce maximal scan/migration frequency..
3332	*/
3333	migrate = mm->numa_next_scan;
3334	if (time_before(now, migrate))
3335	return;
3336
3337	if (p->numa_scan_period == `0`) {
3338	p->numa_scan_period_max = task_scan_max(p);
3339	p->numa_scan_period = task_scan_start(p);
3340	}
3341
3342	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
3343	if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan))
3344	return;
3345
3346	/*
3347	* Delay this task enough that another task of this mm will likely win
3348	* the next time around.
3349	*/
3350	p->node_stamp += `2` * TICK_NSEC;
3351
3352	pages = sysctl_numa_balancing_scan_size;
3353	pages <<= `20` - PAGE_SHIFT; / MB in pages /
3354	virtpages = pages * `8`; / Scan up to this much virtual space /
3355	if (!pages)
3356	return;
3357
3358
3359	if (!mmap_read_trylock(mm))
3360	return;
3361
3362	/*
3363	* VMAs are skipped if the current PID has not trapped a fault within
3364	* the VMA recently. Allow scanning to be forced if there is no
3365	* suitable VMA remaining.
3366	*/
3367	vma_pids_skipped = false;
3368
3369	retry_pids:
3370	start = mm->numa_scan_offset;
3371	vma_iter_init(&vmi, mm, start);
3372	vma = vma_next(&vmi);
3373	if (!vma) {
3374	reset_ptenuma_scan(p);
3375	start = `0`;
3376	vma_iter_set(&vmi, start);
3377	vma = vma_next(&vmi);
3378	}
3379
3380	for (; vma; vma = vma_next(&vmi)) {
3381	if (!vma_migratable(vma) \|\| !vma_policy_mof(vma) \|\|
3382	is_vm_hugetlb_page(vma) \|\| (vma->vm_flags & VM_MIXEDMAP)) {
3383	trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
3384	continue;
3385	}
3386
3387	/*
3388	* Shared library pages mapped by multiple processes are not
3389	* migrated as it is expected they are cache replicated. Avoid
3390	* hinting faults in read-only file-backed mappings or the vDSO
3391	* as migrating the pages will be of marginal benefit.
3392	*/
3393	if (!vma->vm_mm \|\|
3394	(vma->vm_file && (vma->vm_flags & (VM_READ\|VM_WRITE)) == (VM_READ))) {
3395	trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO);
3396	continue;
3397	}
3398
3399	/*
3400	* Skip inaccessible VMAs to avoid any confusion between
3401	* PROT_NONE and NUMA hinting PTEs
3402	*/
3403	if (!vma_is_accessible(vma)) {
3404	trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
3405	continue;
3406	}
3407
3408	/ Initialise new per-VMA NUMAB state. /
3409	if (!vma->numab_state) {
3410	struct vma_numab_state *ptr;
3411
3412	ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
3413	if (!ptr)
3414	continue;
3415
3416	if (cmpxchg(&vma->numab_state, NULL, ptr)) {
3417	kfree(ptr);
3418	continue;
3419	}
3420
3421	vma->numab_state->start_scan_seq = mm->numa_scan_seq;
3422
3423	vma->numab_state->next_scan = now +
3424	msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
3425
3426	/ Reset happens after 4 times scan delay of scan start /
3427	vma->numab_state->pids_active_reset = vma->numab_state->next_scan +
3428	msecs_to_jiffies(VMA_PID_RESET_PERIOD);
3429
3430	/*
3431	* Ensure prev_scan_seq does not match numa_scan_seq,
3432	* to prevent VMAs being skipped prematurely on the
3433	* first scan:
3434	*/
3435	vma->numab_state->prev_scan_seq = mm->numa_scan_seq - `1`;
3436	}
3437
3438	/*
3439	* Scanning the VMAs of short lived tasks add more overhead. So
3440	* delay the scan for new VMAs.
3441	*/
3442	if (mm->numa_scan_seq && time_before(jiffies,
3443	vma->numab_state->next_scan)) {
3444	trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
3445	continue;
3446	}
3447
3448	/ RESET access PIDs regularly for old VMAs. /
3449	if (mm->numa_scan_seq &&
3450	time_after(jiffies, vma->numab_state->pids_active_reset)) {
3451	vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
3452	msecs_to_jiffies(VMA_PID_RESET_PERIOD);
3453	vma->numab_state->pids_active[`0`] = READ_ONCE(vma->numab_state->pids_active[`1`]);
3454	vma->numab_state->pids_active[`1`] = `0`;
3455	}
3456
3457	/ Do not rescan VMAs twice within the same sequence. /
3458	if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
3459	mm->numa_scan_offset = vma->vm_end;
3460	trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED);
3461	continue;
3462	}
3463
3464	/*
3465	* Do not scan the VMA if task has not accessed it, unless no other
3466	* VMA candidate exists.
3467	*/
3468	if (!vma_pids_forced && !vma_is_accessed(mm, vma)) {
3469	vma_pids_skipped = true;
3470	trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
3471	continue;
3472	}
3473
3474	do {
3475	start = max(start, vma->vm_start);
3476	end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
3477	end = min(end, vma->vm_end);
3478	nr_pte_updates = change_prot_numa(vma, start, end);
3479
3480	/*
3481	* Try to scan sysctl_numa_balancing_size worth of
3482	* hpages that have at least one present PTE that
3483	* is not already PTE-numa. If the VMA contains
3484	* areas that are unused or already full of prot_numa
3485	* PTEs, scan up to virtpages, to skip through those
3486	* areas faster.
3487	*/
3488	if (nr_pte_updates)
3489	pages -= (end - start) >> PAGE_SHIFT;
3490	virtpages -= (end - start) >> PAGE_SHIFT;
3491
3492	start = end;
3493	if (pages <= `0` \|\| virtpages <= `0`)
3494	goto out;
3495
3496	cond_resched();
3497	} while (end != vma->vm_end);
3498
3499	/ VMA scan is complete, do not scan until next sequence. /
3500	vma->numab_state->prev_scan_seq = mm->numa_scan_seq;
3501
3502	/*
3503	* Only force scan within one VMA at a time, to limit the
3504	* cost of scanning a potentially uninteresting VMA.
3505	*/
3506	if (vma_pids_forced)
3507	break;
3508	}
3509
3510	/*
3511	* If no VMAs are remaining and VMAs were skipped due to the PID
3512	* not accessing the VMA previously, then force a scan to ensure
3513	* forward progress:
3514	*/
3515	if (!vma && !vma_pids_forced && vma_pids_skipped) {
3516	vma_pids_forced = true;
3517	goto retry_pids;
3518	}
3519
3520	out:
3521	/*
3522	* It is possible to reach the end of the VMA list but the last few
3523	* VMAs are not guaranteed to the vma_migratable. If they are not, we
3524	* would find the !migratable VMA on the next scan but not reset the
3525	* scanner to the start so check it now.
3526	*/
3527	if (vma)
3528	mm->numa_scan_offset = start;
3529	else
3530	reset_ptenuma_scan(p);
3531	mmap_read_unlock(mm);
3532
3533	/*
3534	* Make sure tasks use at least 32x as much time to run other code
3535	* than they used here, to limit NUMA PTE scanning overhead to 3% max.
3536	* Usually update_task_scan_period slows down scanning enough; on an
3537	* overloaded system we need to limit overhead on a per task basis.
3538	*/
3539	if (unlikely(p->se.sum_exec_runtime != runtime)) {
3540	u64 diff = p->se.sum_exec_runtime - runtime;
3541	p->node_stamp += `32` * diff;
3542	}
3543	}
3544
3545	void init_numa_balancing(u64 clone_flags, struct task_struct *p)
3546	{
3547	int mm_users = `0`;
3548	struct mm_struct *mm = p->mm;
3549
3550	if (mm) {
3551	mm_users = atomic_read(&mm->mm_users);
3552	if (mm_users == `1`) {
3553	mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
3554	mm->numa_scan_seq = `0`;
3555	}
3556	}
3557	p->node_stamp = `0`;
3558	p->numa_scan_seq = mm ? mm->numa_scan_seq : `0`;
3559	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
3560	p->numa_migrate_retry = `0`;
3561	/ Protect against double add, see task_tick_numa and task_numa_work /
3562	p->numa_work.next = &p->numa_work;
3563	p->numa_faults = NULL;
3564	p->numa_pages_migrated = `0`;
3565	p->total_numa_faults = `0`;
3566	RCU_INIT_POINTER(p->numa_group, NULL);
3567	p->last_task_numa_placement = `0`;
3568	p->last_sum_exec_runtime = `0`;
3569
3570	init_task_work(&p->numa_work, task_numa_work);
3571
3572	/ New address space, reset the preferred nid /
3573	if (!(clone_flags & CLONE_VM)) {
3574	p->numa_preferred_nid = NUMA_NO_NODE;
3575	return;
3576	}
3577
3578	/*
3579	* New thread, keep existing numa_preferred_nid which should be copied
3580	* already by arch_dup_task_struct but stagger when scans start.
3581	*/
3582	if (mm) {
3583	unsigned int delay;
3584
3585	delay = min_t(unsigned int, task_scan_max(current),
3586	current->numa_scan_period * mm_users * NSEC_PER_MSEC);
3587	delay += `2` * TICK_NSEC;
3588	p->node_stamp = delay;
3589	}
3590	}
3591
3592	/*
3593	* Drive the periodic memory faults..
3594	*/
3595	static void task_tick_numa(struct rq rq, struct* task_struct *curr)
3596	{
3597	struct callback_head *work = &curr->numa_work;
3598	u64 period, now;
3599
3600	/*
3601	* We don't care about NUMA placement if we don't have memory.
3602	*/
3603	if (!curr->mm \|\| (curr->flags & (PF_EXITING \| PF_KTHREAD)) \|\| work->next != work)
3604	return;
3605
3606	/*
3607	* Using runtime rather than walltime has the dual advantage that
3608	* we (mostly) drive the selection from busy threads and that the
3609	* task needs to have done some actual work before we bother with
3610	* NUMA placement.
3611	*/
3612	now = curr->se.sum_exec_runtime;
3613	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
3614
3615	if (now > curr->node_stamp + period) {
3616	if (!curr->node_stamp)
3617	curr->numa_scan_period = task_scan_start(curr);
3618	curr->node_stamp += period;
3619
3620	if (!time_before(jiffies, curr->mm->numa_next_scan))
3621	task_work_add(curr, work, TWA_RESUME);
3622	}
3623	}
3624
3625	static void update_scan_period(struct task_struct p, int* new_cpu)
3626	{
3627	int src_nid = cpu_to_node(task_cpu(p));
3628	int dst_nid = cpu_to_node(new_cpu);
3629
3630	if (!static_branch_likely(&sched_numa_balancing))
3631	return;
3632
3633	if (!p->mm \|\| !p->numa_faults \|\| (p->flags & PF_EXITING))
3634	return;
3635
3636	if (src_nid == dst_nid)
3637	return;
3638
3639	/*
3640	* Allow resets if faults have been trapped before one scan
3641	* has completed. This is most likely due to a new task that
3642	* is pulled cross-node due to wakeups or load balancing.
3643	*/
3644	if (p->numa_scan_seq) {
3645	/*
3646	* Avoid scan adjustments if moving to the preferred
3647	* node or if the task was not previously running on
3648	* the preferred node.
3649	*/
3650	if (dst_nid == p->numa_preferred_nid \|\|
3651	(p->numa_preferred_nid != NUMA_NO_NODE &&
3652	src_nid != p->numa_preferred_nid))
3653	return;
3654	}
3655
3656	p->numa_scan_period = task_scan_start(p);
3657	}
3658
3659	#else /* !CONFIG_NUMA_BALANCING: */
3660
/* NUMA balancing is compiled out: the tick hook does nothing. */
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
3664
/* NUMA balancing is compiled out: no per-rq NUMA accounting on enqueue. */
static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
}
3668
/* NUMA balancing is compiled out: no per-rq NUMA accounting on dequeue. */
static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}
3672
/* NUMA balancing is compiled out: there is no scan period to update. */
static inline void update_scan_period(struct task_struct *p, int new_cpu)
{
}
3676
3677	#endif /* !CONFIG_NUMA_BALANCING */
3678
3679	static void
3680	account_entity_enqueue(struct cfs_rq cfs_rq, struct* sched_entity *se)
3681	{
3682	update_load_add(lw: &cfs_rq->load, inc: se->load.weight);
3683	if (entity_is_task(se)) {
3684	struct rq *rq = rq_of(cfs_rq);
3685
3686	account_numa_enqueue(rq, p: task_of(se));
3687	list_add(new: &se->group_node, head: &rq->cfs_tasks);
3688	}
3689	cfs_rq->nr_queued++;
3690	}
3691
3692	static void
3693	account_entity_dequeue(struct cfs_rq cfs_rq, struct* sched_entity *se)
3694	{
3695	update_load_sub(lw: &cfs_rq->load, dec: se->load.weight);
3696	if (entity_is_task(se)) {
3697	account_numa_dequeue(rq: rq_of(cfs_rq), p: task_of(se));
3698	list_del_init(entry: &se->group_node);
3699	}
3700	cfs_rq->nr_queued--;
3701	}
3702
/*
 * Signed add and clamp on underflow.
 *
 * Explicitly do a load-store to ensure the intermediate value never hits
 * memory. This allows lockless observations without ever seeing the negative
 * values.
 *
 * @_ptr: pointer to the value to update
 * @_val: signed amount to add; a negative amount that would wrap the
 *        result below zero stores 0 instead
 */
#define add_positive(_ptr, _val) do {                           \
	typeof(_ptr) ptr = (_ptr);                              \
	typeof(_val) val = (_val);                              \
	typeof(*ptr) res, var = READ_ONCE(*ptr);                \
								\
	res = var + val;                                        \
								\
	if (val < 0 && res > var)                               \
		res = 0;                                        \
								\
	WRITE_ONCE(*ptr, res);                                  \
} while (0)
3722
/*
 * Unsigned subtract and clamp on underflow.
 *
 * Explicitly do a load-store to ensure the intermediate value never hits
 * memory. This allows lockless observations without ever seeing the negative
 * values.
 *
 * @_ptr: pointer to the value to update
 * @_val: amount to subtract; if it exceeds the current value, 0 is stored
 */
#define sub_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	typeof(*ptr) val = (_val);				\
	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
	res = var - val;					\
	if (res > var)						\
		res = 0;					\
	WRITE_ONCE(*ptr, res);					\
} while (0)
3739
/*
 * Remove and clamp on negative, from a local variable.
 *
 * A variant of sub_positive(), which does not use explicit load-store
 * and is thus optimized for local variable updates.
 *
 * @_ptr: pointer to the value to update
 * @_val: amount to subtract; at most the current value is removed
 */
#define lsub_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	*ptr -= min_t(typeof(*ptr), *ptr, _val);		\
} while (0)
3750
3751	static inline void
3752	enqueue_load_avg(struct cfs_rq cfs_rq, struct* sched_entity *se)
3753	{
3754	cfs_rq->avg.load_avg += se->avg.load_avg;
3755	cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
3756	}
3757
3758	static inline void
3759	dequeue_load_avg(struct cfs_rq cfs_rq, struct* sched_entity *se)
3760	{
3761	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3762	sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
3763	/ See update_cfs_rq_load_avg() /
3764	cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
3765	cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
3766	}
3767
3768	static void place_entity(struct cfs_rq cfs_rq, struct* sched_entity se, int* flags);
3769
3770	static void reweight_entity(struct cfs_rq cfs_rq, struct* sched_entity *se,
3771	unsigned long weight)
3772	{
3773	bool curr = cfs_rq->curr == se;
3774
3775	if (se->on_rq) {
3776	/ commit outstanding execution time /
3777	update_curr(cfs_rq);
3778	update_entity_lag(cfs_rq, se);
3779	se->deadline -= se->vruntime;
3780	se->rel_deadline = `1`;
3781	cfs_rq->nr_queued--;
3782	if (!curr)
3783	__dequeue_entity(cfs_rq, se);
3784	update_load_sub(lw: &cfs_rq->load, dec: se->load.weight);
3785	}
3786	dequeue_load_avg(cfs_rq, se);
3787
3788	/*
3789	* Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
3790	* we need to scale se->vlag when w_i changes.
3791	*/
3792	se->vlag = div_s64(dividend: se->vlag * se->load.weight, divisor: weight);
3793	if (se->rel_deadline)
3794	se->deadline = div_s64(dividend: se->deadline * se->load.weight, divisor: weight);
3795
3796	update_load_set(lw: &se->load, w: weight);
3797
3798	do {
3799	u32 divider = get_pelt_divider(avg: &se->avg);
3800
3801	se->avg.load_avg = div_u64(dividend: se_weight(se) * se->avg.load_sum, divisor: divider);
3802	} while (`0`);
3803
3804	enqueue_load_avg(cfs_rq, se);
3805	if (se->on_rq) {
3806	place_entity(cfs_rq, se, flags: `0`);
3807	update_load_add(lw: &cfs_rq->load, inc: se->load.weight);
3808	if (!curr)
3809	__enqueue_entity(cfs_rq, se);
3810	cfs_rq->nr_queued++;
3811
3812	/*
3813	* The entity's vruntime has been adjusted, so let's check
3814	* whether the rq-wide min_vruntime needs updated too. Since
3815	* the calculations above require stable min_vruntime rather
3816	* than up-to-date one, we do the update at the end of the
3817	* reweight process.
3818	*/
3819	update_min_vruntime(cfs_rq);
3820	}
3821	}
3822
3823	static void reweight_task_fair(struct rq rq, struct* task_struct *p,
3824	const struct load_weight *lw)
3825	{
3826	struct sched_entity *se = &p->se;
3827	struct cfs_rq *cfs_rq = cfs_rq_of(se);
3828	struct load_weight *load = &se->load;
3829
3830	reweight_entity(cfs_rq, se, weight: lw->weight);
3831	load->inv_weight = lw->inv_weight;
3832	}
3833
3834	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3835
3836	#ifdef CONFIG_FAIR_GROUP_SCHED
3837	/*
3838	* All this does is approximate the hierarchical proportion which includes that
3839	* global sum we all love to hate.
3840	*
3841	* That is, the weight of a group entity, is the proportional share of the
3842	* group weight based on the group runqueue weights. That is:
3843	*
3844	* tg->weight * grq->load.weight
3845	* ge->load.weight = ----------------------------- (1)
3846	* \Sum grq->load.weight
3847	*
3848	* Now, because computing that sum is prohibitively expensive to compute (been
3849	* there, done that) we approximate it with this average stuff. The average
3850	* moves slower and therefore the approximation is cheaper and more stable.
3851	*
3852	* So instead of the above, we substitute:
3853	*
3854	* grq->load.weight -> grq->avg.load_avg (2)
3855	*
3856	* which yields the following:
3857	*
3858	* tg->weight * grq->avg.load_avg
3859	* ge->load.weight = ------------------------------ (3)
3860	* tg->load_avg
3861	*
3862	* Where: tg->load_avg ~= \Sum grq->avg.load_avg
3863	*
3864	* That is shares_avg, and it is right (given the approximation (2)).
3865	*
3866	* The problem with it is that because the average is slow -- it was designed
3867	* to be exactly that of course -- this leads to transients in boundary
3868	* conditions. In specific, the case where the group was idle and we start the
3869	* one task. It takes time for our CPU's grq->avg.load_avg to build up,
3870	* yielding bad latency etc..
3871	*
3872	* Now, in that special case (1) reduces to:
3873	*
3874	* tg->weight * grq->load.weight
3875	* ge->load.weight = ----------------------------- = tg->weight (4)
3876	* grp->load.weight
3877	*
3878	* That is, the sum collapses because all other CPUs are idle; the UP scenario.
3879	*
3880	* So what we do is modify our approximation (3) to approach (4) in the (near)
3881	* UP case, like:
3882	*
3883	* ge->load.weight =
3884	*
3885	* tg->weight * grq->load.weight
3886	* --------------------------------------------------- (5)
3887	* tg->load_avg - grq->avg.load_avg + grq->load.weight
3888	*
3889	* But because grq->load.weight can drop to 0, resulting in a divide by zero,
3890	* we need to use grq->avg.load_avg as its lower bound, which then gives:
3891	*
3892	*
3893	* tg->weight * grq->load.weight
3894	* ge->load.weight = ----------------------------- (6)
3895	* tg_load_avg'
3896	*
3897	* Where:
3898	*
3899	* tg_load_avg' = tg->load_avg - grq->avg.load_avg +
3900	* max(grq->load.weight, grq->avg.load_avg)
3901	*
3902	* And that is shares_weight and is icky. In the (near) UP case it approaches
3903	* (4) while in the normal case it approaches (3). It consistently
3904	* overestimates the ge->load.weight and therefore:
3905	*
3906	* \Sum ge->load.weight >= tg->weight
3907	*
3908	* hence icky!
3909	*/
3910	static long calc_group_shares(struct cfs_rq *cfs_rq)
3911	{
3912	long tg_weight, tg_shares, load, shares;
3913	struct task_group *tg = cfs_rq->tg;
3914
3915	tg_shares = READ_ONCE(tg->shares);
3916
3917	load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
3918
3919	tg_weight = atomic_long_read(v: &tg->load_avg);
3920
3921	/ Ensure tg_weight >= load /
3922	tg_weight -= cfs_rq->tg_load_avg_contrib;
3923	tg_weight += load;
3924
3925	shares = (tg_shares * load);
3926	if (tg_weight)
3927	shares /= tg_weight;
3928
3929	/*
3930	* MIN_SHARES has to be unscaled here to support per-CPU partitioning
3931	* of a group with small tg->shares value. It is a floor value which is
3932	* assigned as a minimum load.weight to the sched_entity representing
3933	* the group on a CPU.
3934	*
3935	* E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
3936	* on an 8-core system with 8 tasks each runnable on one CPU shares has
3937	* to be 1510241/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
3938	* case no task is runnable on a CPU MIN_SHARES=2 should be returned
3939	* instead of 0.
3940	*/
3941	return clamp_t(long, shares, MIN_SHARES, tg_shares);
3942	}
3943
3944	/*
3945	* Recomputes the group entity based on the current state of its group
3946	* runqueue.
3947	*/
3948	static void update_cfs_group(struct sched_entity *se)
3949	{
3950	struct cfs_rq *gcfs_rq = group_cfs_rq(grp: se);
3951	long shares;
3952
3953	/*
3954	* When a group becomes empty, preserve its weight. This matters for
3955	* DELAY_DEQUEUE.
3956	*/
3957	if (!gcfs_rq \|\| !gcfs_rq->load.weight)
3958	return;
3959
3960	shares = calc_group_shares(cfs_rq: gcfs_rq);
3961	if (unlikely(se->load.weight != shares))
3962	reweight_entity(cfs_rq: cfs_rq_of(se), se, weight: shares);
3963	}
3964
3965	#else /* !CONFIG_FAIR_GROUP_SCHED: */
/* Group scheduling is compiled out: there is no group entity to recompute. */
static inline void update_cfs_group(struct sched_entity *se)
{
}
3969	#endif /* !CONFIG_FAIR_GROUP_SCHED */
3970
3971	static inline void cfs_rq_util_change(struct cfs_rq cfs_rq, int* flags)
3972	{
3973	struct rq *rq = rq_of(cfs_rq);
3974
3975	if (&rq->cfs == cfs_rq) {
3976	/*
3977	* There are a few boundary cases this might miss but it should
3978	* get called often enough that that should (hopefully) not be
3979	* a real problem.
3980	*
3981	* It will not get called when we go idle, because the idle
3982	* thread is a different class (!fair), nor will the utilization
3983	* number include things like RT tasks.
3984	*
3985	* As is, the util number is not freq-invariant (we'd have to
3986	* implement arch_scale_freq_capacity() for that).
3987	*
3988	* See cpu_util_cfs().
3989	*/
3990	cpufreq_update_util(rq, flags);
3991	}
3992	}
3993
3994	static inline bool load_avg_is_decayed(struct sched_avg *sa)
3995	{
3996	if (sa->load_sum)
3997	return false;
3998
3999	if (sa->util_sum)
4000	return false;
4001
4002	if (sa->runnable_sum)
4003	return false;
4004
4005	/*
4006	* _avg must be null when _sum are null because _avg = _sum / divider
4007	* Make sure that rounding and/or propagation of PELT values never
4008	* break this.
4009	*/
4010	WARN_ON_ONCE(sa->load_avg \|\|
4011	sa->util_avg \|\|
4012	sa->runnable_avg);
4013
4014	return true;
4015	}
4016
4017	static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
4018	{
4019	return u64_u32_load_copy(cfs_rq->avg.last_update_time,
4020	cfs_rq->last_update_time_copy);
4021	}
4022	#ifdef CONFIG_FAIR_GROUP_SCHED
4023	/*
4024	* Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
4025	* immediately before a parent cfs_rq, and cfs_rqs are removed from the list
4026	* bottom-up, we only have to test whether the cfs_rq before us on the list
4027	* is our child.
4028	* If cfs_rq is not on the list, test whether a child needs its to be added to
4029	* connect a branch to the tree * (see list_add_leaf_cfs_rq() for details).
4030	*/
4031	static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
4032	{
4033	struct cfs_rq *prev_cfs_rq;
4034	struct list_head *prev;
4035	struct rq *rq = rq_of(cfs_rq);
4036
4037	if (cfs_rq->on_list) {
4038	prev = cfs_rq->leaf_cfs_rq_list.prev;
4039	} else {
4040	prev = rq->tmp_alone_branch;
4041	}
4042
4043	if (prev == &rq->leaf_cfs_rq_list)
4044	return false;
4045
4046	prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
4047
4048	return (prev_cfs_rq->tg->parent == cfs_rq->tg);
4049	}
4050
4051	static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
4052	{
4053	if (cfs_rq->load.weight)
4054	return false;
4055
4056	if (!load_avg_is_decayed(sa: &cfs_rq->avg))
4057	return false;
4058
4059	if (child_cfs_rq_on_list(cfs_rq))
4060	return false;
4061
4062	return true;
4063	}
4064
4065	/**
4066	* update_tg_load_avg - update the tg's load avg
4067	* @cfs_rq: the cfs_rq whose avg changed
4068	*
4069	* This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
4070	* However, because tg->load_avg is a global value there are performance
4071	* considerations.
4072	*
4073	* In order to avoid having to look at the other cfs_rq's, we use a
4074	* differential update where we store the last value we propagated. This in
4075	* turn allows skipping updates if the differential is 'small'.
4076	*
4077	* Updating tg's load_avg is necessary before update_cfs_share().
4078	*/
4079	static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
4080	{
4081	long delta;
4082	u64 now;
4083
4084	/*
4085	* No need to update load_avg for root_task_group as it is not used.
4086	*/
4087	if (cfs_rq->tg == &root_task_group)
4088	return;
4089
4090	/ rq has been offline and doesn't contribute to the share anymore: /
4091	if (!cpu_active(cpu: cpu_of(rq: rq_of(cfs_rq))))
4092	return;
4093
4094	/*
4095	* For migration heavy workloads, access to tg->load_avg can be
4096	* unbound. Limit the update rate to at most once per ms.
4097	*/
4098	now = sched_clock_cpu(cpu: cpu_of(rq: rq_of(cfs_rq)));
4099	if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC)
4100	return;
4101
4102	delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
4103	if (abs(delta) > cfs_rq->tg_load_avg_contrib / `64`) {
4104	atomic_long_add(i: delta, v: &cfs_rq->tg->load_avg);
4105	cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
4106	cfs_rq->last_update_tg_load_avg = now;
4107	}
4108	}
4109
4110	static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
4111	{
4112	long delta;
4113	u64 now;
4114
4115	/*
4116	* No need to update load_avg for root_task_group, as it is not used.
4117	*/
4118	if (cfs_rq->tg == &root_task_group)
4119	return;
4120
4121	now = sched_clock_cpu(cpu: cpu_of(rq: rq_of(cfs_rq)));
4122	delta = `0` - cfs_rq->tg_load_avg_contrib;
4123	atomic_long_add(i: delta, v: &cfs_rq->tg->load_avg);
4124	cfs_rq->tg_load_avg_contrib = `0`;
4125	cfs_rq->last_update_tg_load_avg = now;
4126	}
4127
4128	/ CPU offline callback: /
4129	static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
4130	{
4131	struct task_group *tg;
4132
4133	lockdep_assert_rq_held(rq);
4134
4135	/*
4136	* The rq clock has already been updated in
4137	* set_rq_offline(), so we should skip updating
4138	* the rq clock again in unthrottle_cfs_rq().
4139	*/
4140	rq_clock_start_loop_update(rq);
4141
4142	rcu_read_lock();
4143	list_for_each_entry_rcu(tg, &task_groups, list) {
4144	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4145
4146	clear_tg_load_avg(cfs_rq);
4147	}
4148	rcu_read_unlock();
4149
4150	rq_clock_stop_loop_update(rq);
4151	}
4152
4153	/*
4154	* Called within set_task_rq() right before setting a task's CPU. The
4155	* caller only guarantees p->pi_lock is held; no other assumptions,
4156	* including the state of rq->lock, should be made.
4157	*/
4158	void set_task_rq_fair(struct sched_entity *se,
4159	struct cfs_rq prev, struct* cfs_rq *next)
4160	{
4161	u64 p_last_update_time;
4162	u64 n_last_update_time;
4163
4164	if (!sched_feat(ATTACH_AGE_LOAD))
4165	return;
4166
4167	/*
4168	* We are supposed to update the task to "current" time, then its up to
4169	* date and ready to go to new CPU/cfs_rq. But we have difficulty in
4170	* getting what current time is, so simply throw away the out-of-date
4171	* time. This will result in the wakee task is less decayed, but giving
4172	* the wakee more load sounds not bad.
4173	*/
4174	if (!(se->avg.last_update_time && prev))
4175	return;
4176
4177	p_last_update_time = cfs_rq_last_update_time(cfs_rq: prev);
4178	n_last_update_time = cfs_rq_last_update_time(cfs_rq: next);
4179
4180	__update_load_avg_blocked_se(now: p_last_update_time, se);
4181	se->avg.last_update_time = n_last_update_time;
4182	}
4183
4184	/*
4185	* When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
4186	* propagate its contribution. The key to this propagation is the invariant
4187	* that for each group:
4188	*
4189	* ge->avg == grq->avg (1)
4190	*
4191	* _IFF_ we look at the pure running and runnable sums. Because they
4192	* represent the very same entity, just at different points in the hierarchy.
4193	*
4194	* Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
4195	* and simply copies the running/runnable sum over (but still wrong, because
4196	* the group entity and group rq do not have their PELT windows aligned).
4197	*
4198	* However, update_tg_cfs_load() is more complex. So we have:
4199	*
4200	* ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
4201	*
4202	* And since, like util, the runnable part should be directly transferable,
4203	* the following would _appear_ to be the straight forward approach:
4204	*
4205	* grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
4206	*
4207	* And per (1) we have:
4208	*
4209	* ge->avg.runnable_avg == grq->avg.runnable_avg
4210	*
4211	* Which gives:
4212	*
 *                      ge->load.weight * grq->avg.load_avg
 *   ge->avg.load_avg = -----------------------------------	(4)
 *                               grq->load.weight
4216	*
4217	* Except that is wrong!
4218	*
4219	* Because while for entities historical weight is not important and we
4220	* really only care about our future and therefore can consider a pure
4221	* runnable sum, runqueues can NOT do this.
4222	*
4223	* We specifically want runqueues to have a load_avg that includes
4224	* historical weights. Those represent the blocked load, the load we expect
4225	* to (shortly) return to us. This only works by keeping the weights as
4226	* integral part of the sum. We therefore cannot decompose as per (3).
4227	*
4228	* Another reason this doesn't work is that runnable isn't a 0-sum entity.
4229	* Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
4230	* rq itself is runnable anywhere between 2/3 and 1 depending on how the
4231	* runnable section of these tasks overlap (or not). If they were to perfectly
4232	* align the rq as a whole would be runnable 2/3 of the time. If however we
4233	* always have at least 1 runnable task, the rq as a whole is always runnable.
4234	*
4235	* So we'll have to approximate.. :/
4236	*
4237	* Given the constraint:
4238	*
4239	* ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
4240	*
4241	* We can construct a rule that adds runnable to a rq by assuming minimal
4242	* overlap.
4243	*
4244	* On removal, we'll assume each task is equally runnable; which yields:
4245	*
4246	* grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
4247	*
4248	* XXX: only do this for the part of runnable > running ?
4249	*
4250	*/
4251	static inline void
4252	update_tg_cfs_util(struct cfs_rq cfs_rq, struct* sched_entity se, struct* cfs_rq *gcfs_rq)
4253	{
4254	long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
4255	u32 new_sum, divider;
4256
4257	/ Nothing to update /
4258	if (!delta_avg)
4259	return;
4260
4261	/*
4262	* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4263	* See ___update_load_avg() for details.
4264	*/
4265	divider = get_pelt_divider(avg: &cfs_rq->avg);
4266
4267
4268	/ Set new sched_entity's utilization /
4269	se->avg.util_avg = gcfs_rq->avg.util_avg;
4270	new_sum = se->avg.util_avg * divider;
4271	delta_sum = (long)new_sum - (long)se->avg.util_sum;
4272	se->avg.util_sum = new_sum;
4273
4274	/ Update parent cfs_rq utilization /
4275	add_positive(&cfs_rq->avg.util_avg, delta_avg);
4276	add_positive(&cfs_rq->avg.util_sum, delta_sum);
4277
4278	/ See update_cfs_rq_load_avg() /
4279	cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
4280	cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
4281	}
4282
4283	static inline void
4284	update_tg_cfs_runnable(struct cfs_rq cfs_rq, struct* sched_entity se, struct* cfs_rq *gcfs_rq)
4285	{
4286	long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
4287	u32 new_sum, divider;
4288
4289	/ Nothing to update /
4290	if (!delta_avg)
4291	return;
4292
4293	/*
4294	* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4295	* See ___update_load_avg() for details.
4296	*/
4297	divider = get_pelt_divider(avg: &cfs_rq->avg);
4298
4299	/ Set new sched_entity's runnable /
4300	se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
4301	new_sum = se->avg.runnable_avg * divider;
4302	delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
4303	se->avg.runnable_sum = new_sum;
4304
4305	/ Update parent cfs_rq runnable /
4306	add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
4307	add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
4308	/ See update_cfs_rq_load_avg() /
4309	cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
4310	cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
4311	}
4312
4313	static inline void
4314	update_tg_cfs_load(struct cfs_rq cfs_rq, struct* sched_entity se, struct* cfs_rq *gcfs_rq)
4315	{
4316	long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
4317	unsigned long load_avg;
4318	u64 load_sum = `0`;
4319	s64 delta_sum;
4320	u32 divider;
4321
4322	if (!runnable_sum)
4323	return;
4324
4325	gcfs_rq->prop_runnable_sum = `0`;
4326
4327	/*
4328	* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4329	* See ___update_load_avg() for details.
4330	*/
4331	divider = get_pelt_divider(avg: &cfs_rq->avg);
4332
4333	if (runnable_sum >= `0`) {
4334	/*
4335	* Add runnable; clip at LOAD_AVG_MAX. Reflects that until
4336	* the CPU is saturated running == runnable.
4337	*/
4338	runnable_sum += se->avg.load_sum;
4339	runnable_sum = min_t(long, runnable_sum, divider);
4340	} else {
4341	/*
4342	* Estimate the new unweighted runnable_sum of the gcfs_rq by
4343	* assuming all tasks are equally runnable.
4344	*/
4345	if (scale_load_down(gcfs_rq->load.weight)) {
4346	load_sum = div_u64(dividend: gcfs_rq->avg.load_sum,
4347	scale_load_down(gcfs_rq->load.weight));
4348	}
4349
4350	/ But make sure to not inflate se's runnable /
4351	runnable_sum = min(se->avg.load_sum, load_sum);
4352	}
4353
4354	/*
4355	* runnable_sum can't be lower than running_sum
4356	* Rescale running sum to be in the same range as runnable sum
4357	* running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
4358	* runnable_sum is in [0 : LOAD_AVG_MAX]
4359	*/
4360	running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
4361	runnable_sum = max(runnable_sum, running_sum);
4362
4363	load_sum = se_weight(se) * runnable_sum;
4364	load_avg = div_u64(dividend: load_sum, divisor: divider);
4365
4366	delta_avg = load_avg - se->avg.load_avg;
4367	if (!delta_avg)
4368	return;
4369
4370	delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
4371
4372	se->avg.load_sum = runnable_sum;
4373	se->avg.load_avg = load_avg;
4374	add_positive(&cfs_rq->avg.load_avg, delta_avg);
4375	add_positive(&cfs_rq->avg.load_sum, delta_sum);
4376	/ See update_cfs_rq_load_avg() /
4377	cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
4378	cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
4379	}
4380
4381	static inline void add_tg_cfs_propagate(struct cfs_rq cfs_rq, long* runnable_sum)
4382	{
4383	cfs_rq->propagate = `1`;
4384	cfs_rq->prop_runnable_sum += runnable_sum;
4385	}
4386
4387	/ Update task and its cfs_rq load average /
4388	static inline int propagate_entity_load_avg(struct sched_entity *se)
4389	{
4390	struct cfs_rq cfs_rq, gcfs_rq;
4391
4392	if (entity_is_task(se))
4393	return `0`;
4394
4395	gcfs_rq = group_cfs_rq(grp: se);
4396	if (!gcfs_rq->propagate)
4397	return `0`;
4398
4399	gcfs_rq->propagate = `0`;
4400
4401	cfs_rq = cfs_rq_of(se);
4402
4403	add_tg_cfs_propagate(cfs_rq, runnable_sum: gcfs_rq->prop_runnable_sum);
4404
4405	update_tg_cfs_util(cfs_rq, se, gcfs_rq);
4406	update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
4407	update_tg_cfs_load(cfs_rq, se, gcfs_rq);
4408
4409	trace_pelt_cfs_tp(cfs_rq);
4410	trace_pelt_se_tp(se);
4411
4412	return `1`;
4413	}
4414
4415	/*
4416	* Check if we need to update the load and the utilization of a blocked
4417	* group_entity:
4418	*/
4419	static inline bool skip_blocked_update(struct sched_entity *se)
4420	{
4421	struct cfs_rq *gcfs_rq = group_cfs_rq(grp: se);
4422
4423	/*
4424	* If sched_entity still have not zero load or utilization, we have to
4425	* decay it:
4426	*/
4427	if (se->avg.load_avg \|\| se->avg.util_avg)
4428	return false;
4429
4430	/*
4431	* If there is a pending propagation, we have to update the load and
4432	* the utilization of the sched_entity:
4433	*/
4434	if (gcfs_rq->propagate)
4435	return false;
4436
4437	/*
4438	* Otherwise, the load and the utilization of the sched_entity is
4439	* already zero and there is no pending propagation, so it will be a
4440	* waste of time to try to decay it:
4441	*/
4442	return true;
4443	}
4444
4445	#else /* !CONFIG_FAIR_GROUP_SCHED: */
4446
/* !CONFIG_FAIR_GROUP_SCHED: no task-group load to update, so this is a no-op. */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
}
4448
/* !CONFIG_FAIR_GROUP_SCHED: nothing to clear when a CPU goes offline. */
static inline void clear_tg_offline_cfs_rqs(struct rq *rq)
{
}
4450
/* !CONFIG_FAIR_GROUP_SCHED: nothing to propagate; always reports 0. */
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
	return 0;
}
4455
/* !CONFIG_FAIR_GROUP_SCHED: no propagation state to record. */
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
4457
4458	#endif /* !CONFIG_FAIR_GROUP_SCHED */
4459
4460	#ifdef CONFIG_NO_HZ_COMMON
4461	static inline void migrate_se_pelt_lag(struct sched_entity *se)
4462	{
4463	u64 throttled = `0`, now, lut;
4464	struct cfs_rq *cfs_rq;
4465	struct rq *rq;
4466	bool is_idle;
4467
4468	if (load_avg_is_decayed(sa: &se->avg))
4469	return;
4470
4471	cfs_rq = cfs_rq_of(se);
4472	rq = rq_of(cfs_rq);
4473
4474	rcu_read_lock();
4475	is_idle = is_idle_task(rcu_dereference(rq->curr));
4476	rcu_read_unlock();
4477
4478	/*
4479	* The lag estimation comes with a cost we don't want to pay all the
4480	* time. Hence, limiting to the case where the source CPU is idle and
4481	* we know we are at the greatest risk to have an outdated clock.
4482	*/
4483	if (!is_idle)
4484	return;
4485
4486	/*
4487	* Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, where:
4488	*
4489	* last_update_time (the cfs_rq's last_update_time)
4490	* = cfs_rq_clock_pelt()@cfs_rq_idle
4491	* = rq_clock_pelt()@cfs_rq_idle
4492	* - cfs->throttled_clock_pelt_time@cfs_rq_idle
4493	*
4494	* cfs_idle_lag (delta between rq's update and cfs_rq's update)
4495	* = rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle
4496	*
4497	* rq_idle_lag (delta between now and rq's update)
4498	* = sched_clock_cpu() - rq_clock()@rq_idle
4499	*
4500	* We can then write:
4501	*
4502	* now = rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time +
4503	* sched_clock_cpu() - rq_clock()@rq_idle
4504	* Where:
4505	* rq_clock_pelt()@rq_idle is rq->clock_pelt_idle
4506	* rq_clock()@rq_idle is rq->clock_idle
4507	* cfs->throttled_clock_pelt_time@cfs_rq_idle
4508	* is cfs_rq->throttled_pelt_idle
4509	*/
4510
4511	#ifdef CONFIG_CFS_BANDWIDTH
4512	throttled = u64_u32_load(cfs_rq->throttled_pelt_idle);
4513	/ The clock has been stopped for throttling /
4514	if (throttled == U64_MAX)
4515	return;
4516	#endif
4517	now = u64_u32_load(rq->clock_pelt_idle);
4518	/*
4519	* Paired with _update_idle_rq_clock_pelt(). It ensures at the worst case
4520	* is observed the old clock_pelt_idle value and the new clock_idle,
4521	* which lead to an underestimation. The opposite would lead to an
4522	* overestimation.
4523	*/
4524	smp_rmb();
4525	lut = cfs_rq_last_update_time(cfs_rq);
4526
4527	now -= throttled;
4528	if (now < lut)
4529	/*
4530	* cfs_rq->avg.last_update_time is more recent than our
4531	* estimation, let's use it.
4532	*/
4533	now = lut;
4534	else
4535	now += sched_clock_cpu(cpu: cpu_of(rq)) - u64_u32_load(rq->clock_idle);
4536
4537	__update_load_avg_blocked_se(now, se);
4538	}
4539	#else /* !CONFIG_NO_HZ_COMMON: */
/* !CONFIG_NO_HZ_COMMON: no lag compensation is done, so this is a no-op. */
static void migrate_se_pelt_lag(struct sched_entity *se)
{
}
4541	#endif /* !CONFIG_NO_HZ_COMMON */
4542
4543	/**
4544	* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
4545	* @now: current time, as per cfs_rq_clock_pelt()
4546	* @cfs_rq: cfs_rq to update
4547	*
4548	* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
4549	* avg. The immediate corollary is that all (fair) tasks must be attached.
4550	*
4551	* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
4552	*
4553	* Return: true if the load decayed or we removed load.
4554	*
4555	* Since both these conditions indicate a changed cfs_rq->avg.load we should
4556	* call update_tg_load_avg() when this function returns true.
4557	*/
4558	static inline int
4559	update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
4560	{
4561	unsigned long removed_load = `0`, removed_util = `0`, removed_runnable = `0`;
4562	struct sched_avg *sa = &cfs_rq->avg;
4563	int decayed = `0`;
4564
4565	if (cfs_rq->removed.nr) {
4566	unsigned long r;
4567	u32 divider = get_pelt_divider(avg: &cfs_rq->avg);
4568
4569	raw_spin_lock(&cfs_rq->removed.lock);
4570	swap(cfs_rq->removed.util_avg, removed_util);
4571	swap(cfs_rq->removed.load_avg, removed_load);
4572	swap(cfs_rq->removed.runnable_avg, removed_runnable);
4573	cfs_rq->removed.nr = `0`;
4574	raw_spin_unlock(&cfs_rq->removed.lock);
4575
4576	r = removed_load;
4577	sub_positive(&sa->load_avg, r);
4578	sub_positive(&sa->load_sum, r * divider);
4579	/ See sa->util_sum below /
4580	sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
4581
4582	r = removed_util;
4583	sub_positive(&sa->util_avg, r);
4584	sub_positive(&sa->util_sum, r * divider);
4585	/*
4586	* Because of rounding, se->util_sum might ends up being +1 more than
4587	* cfs->util_sum. Although this is not a problem by itself, detaching
4588	* a lot of tasks with the rounding problem between 2 updates of
4589	* util_avg (~1ms) can make cfs->util_sum becoming null whereas
4590	* cfs_util_avg is not.
4591	* Check that util_sum is still above its lower bound for the new
4592	* util_avg. Given that period_contrib might have moved since the last
4593	* sync, we are only sure that util_sum must be above or equal to
4594	* util_avg * minimum possible divider
4595	*/
4596	sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
4597
4598	r = removed_runnable;
4599	sub_positive(&sa->runnable_avg, r);
4600	sub_positive(&sa->runnable_sum, r * divider);
4601	/ See sa->util_sum above /
4602	sa->runnable_sum = max_t(u32, sa->runnable_sum,
4603	sa->runnable_avg * PELT_MIN_DIVIDER);
4604
4605	/*
4606	* removed_runnable is the unweighted version of removed_load so we
4607	* can use it to estimate removed_load_sum.
4608	*/
4609	add_tg_cfs_propagate(cfs_rq,
4610	runnable_sum: -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
4611
4612	decayed = `1`;
4613	}
4614
4615	decayed \|= __update_load_avg_cfs_rq(now, cfs_rq);
4616	u64_u32_store_copy(sa->last_update_time,
4617	cfs_rq->last_update_time_copy,
4618	sa->last_update_time);
4619	return decayed;
4620	}
4621
4622	/**
4623	* attach_entity_load_avg - attach this entity to its cfs_rq load avg
4624	* @cfs_rq: cfs_rq to attach to
4625	* @se: sched_entity to attach
4626	*
4627	* Must call update_cfs_rq_load_avg() before this, since we rely on
4628	* cfs_rq->avg.last_update_time being current.
4629	*/
4630	static void attach_entity_load_avg(struct cfs_rq cfs_rq, struct* sched_entity *se)
4631	{
4632	/*
4633	* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4634	* See ___update_load_avg() for details.
4635	*/
4636	u32 divider = get_pelt_divider(avg: &cfs_rq->avg);
4637
4638	/*
4639	* When we attach the @se to the @cfs_rq, we must align the decay
4640	* window because without that, really weird and wonderful things can
4641	* happen.
4642	*
4643	* XXX illustrate
4644	*/
4645	se->avg.last_update_time = cfs_rq->avg.last_update_time;
4646	se->avg.period_contrib = cfs_rq->avg.period_contrib;
4647
4648	/*
4649	* Hell(o) Nasty stuff.. we need to recompute _sum based on the new
4650	* period_contrib. This isn't strictly correct, but since we're
4651	* entirely outside of the PELT hierarchy, nobody cares if we truncate
4652	* _sum a little.
4653	*/
4654	se->avg.util_sum = se->avg.util_avg * divider;
4655
4656	se->avg.runnable_sum = se->avg.runnable_avg * divider;
4657
4658	se->avg.load_sum = se->avg.load_avg * divider;
4659	if (se_weight(se) < se->avg.load_sum)
4660	se->avg.load_sum = div_u64(dividend: se->avg.load_sum, divisor: se_weight(se));
4661	else
4662	se->avg.load_sum = `1`;
4663
4664	enqueue_load_avg(cfs_rq, se);
4665	cfs_rq->avg.util_avg += se->avg.util_avg;
4666	cfs_rq->avg.util_sum += se->avg.util_sum;
4667	cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
4668	cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
4669
4670	add_tg_cfs_propagate(cfs_rq, runnable_sum: se->avg.load_sum);
4671
4672	cfs_rq_util_change(cfs_rq, flags: `0`);
4673
4674	trace_pelt_cfs_tp(cfs_rq);
4675	}
4676
4677	/**
4678	* detach_entity_load_avg - detach this entity from its cfs_rq load avg
4679	* @cfs_rq: cfs_rq to detach from
4680	* @se: sched_entity to detach
4681	*
4682	* Must call update_cfs_rq_load_avg() before this, since we rely on
4683	* cfs_rq->avg.last_update_time being current.
4684	*/
4685	static void detach_entity_load_avg(struct cfs_rq cfs_rq, struct* sched_entity *se)
4686	{
4687	dequeue_load_avg(cfs_rq, se);
4688	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
4689	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
4690	/ See update_cfs_rq_load_avg() /
4691	cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
4692	cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
4693
4694	sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
4695	sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
4696	/ See update_cfs_rq_load_avg() /
4697	cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
4698	cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
4699
4700	add_tg_cfs_propagate(cfs_rq, runnable_sum: -se->avg.load_sum);
4701
4702	cfs_rq_util_change(cfs_rq, flags: `0`);
4703
4704	trace_pelt_cfs_tp(cfs_rq);
4705	}
4706
/*
 * Optional action to be done while updating the load average.
 *
 * These are bit flags, tested with '&' on the flags argument of
 * update_load_avg().
 */
#define UPDATE_TG 0x1 /* when the averages decayed, also call update_tg_load_avg() */
#define SKIP_AGE_LOAD 0x2 /* do not age the entity's own average */
#define DO_ATTACH 0x4 /* attach the entity if it has no last_update_time */
#define DO_DETACH 0x8 /* detach the entity from the cfs_rq */
4714
4715	/ Update task and its cfs_rq load average /
4716	static inline void update_load_avg(struct cfs_rq cfs_rq, struct* sched_entity se, int* flags)
4717	{
4718	u64 now = cfs_rq_clock_pelt(cfs_rq);
4719	int decayed;
4720
4721	/*
4722	* Track task load average for carrying it to new CPU after migrated, and
4723	* track group sched_entity load average for task_h_load calculation in migration
4724	*/
4725	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
4726	__update_load_avg_se(now, cfs_rq, se);
4727
4728	decayed = update_cfs_rq_load_avg(now, cfs_rq);
4729	decayed \|= propagate_entity_load_avg(se);
4730
4731	if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
4732
4733	/*
4734	* DO_ATTACH means we're here from enqueue_entity().
4735	* !last_update_time means we've passed through
4736	* migrate_task_rq_fair() indicating we migrated.
4737	*
4738	* IOW we're enqueueing a task on a new CPU.
4739	*/
4740	attach_entity_load_avg(cfs_rq, se);
4741	update_tg_load_avg(cfs_rq);
4742
4743	} else if (flags & DO_DETACH) {
4744	/*
4745	* DO_DETACH means we're here from dequeue_entity()
4746	* and we are migrating task out of the CPU.
4747	*/
4748	detach_entity_load_avg(cfs_rq, se);
4749	update_tg_load_avg(cfs_rq);
4750	} else if (decayed) {
4751	cfs_rq_util_change(cfs_rq, flags: `0`);
4752
4753	if (flags & UPDATE_TG)
4754	update_tg_load_avg(cfs_rq);
4755	}
4756	}
4757
4758	/*
4759	* Synchronize entity load avg of dequeued entity without locking
4760	* the previous rq.
4761	*/
4762	static void sync_entity_load_avg(struct sched_entity *se)
4763	{
4764	struct cfs_rq *cfs_rq = cfs_rq_of(se);
4765	u64 last_update_time;
4766
4767	last_update_time = cfs_rq_last_update_time(cfs_rq);
4768	__update_load_avg_blocked_se(now: last_update_time, se);
4769	}
4770
4771	/*
4772	* Task first catches up with cfs_rq, and then subtract
4773	* itself from the cfs_rq (task must be off the queue now).
4774	*/
4775	static void remove_entity_load_avg(struct sched_entity *se)
4776	{
4777	struct cfs_rq *cfs_rq = cfs_rq_of(se);
4778	unsigned long flags;
4779
4780	/*
4781	* tasks cannot exit without having gone through wake_up_new_task() ->
4782	* enqueue_task_fair() which will have added things to the cfs_rq,
4783	* so we can remove unconditionally.
4784	*/
4785
4786	sync_entity_load_avg(se);
4787
4788	raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
4789	++cfs_rq->removed.nr;
4790	cfs_rq->removed.util_avg += se->avg.util_avg;
4791	cfs_rq->removed.load_avg += se->avg.load_avg;
4792	cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
4793	raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
4794	}
4795
4796	static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
4797	{
4798	return cfs_rq->avg.runnable_avg;
4799	}
4800
4801	static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
4802	{
4803	return cfs_rq->avg.load_avg;
4804	}
4805
/* Forward declaration; the definition is not in this part of the file. */
static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf);
4807
4808	static inline unsigned long task_util(struct task_struct *p)
4809	{
4810	return READ_ONCE(p->se.avg.util_avg);
4811	}
4812
4813	static inline unsigned long task_runnable(struct task_struct *p)
4814	{
4815	return READ_ONCE(p->se.avg.runnable_avg);
4816	}
4817
4818	static inline unsigned long _task_util_est(struct task_struct *p)
4819	{
4820	return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
4821	}
4822
/* The larger of the task's current utilization and its estimated utilization. */
static inline unsigned long task_util_est(struct task_struct *p)
{
	unsigned long util = task_util(p);
	unsigned long est = _task_util_est(p);

	return max(util, est);
}
4827
4828	static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
4829	struct task_struct *p)
4830	{
4831	unsigned int enqueued;
4832
4833	if (!sched_feat(UTIL_EST))
4834	return;
4835
4836	/ Update root cfs_rq's estimated utilization /
4837	enqueued = cfs_rq->avg.util_est;
4838	enqueued += _task_util_est(p);
4839	WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
4840
4841	trace_sched_util_est_cfs_tp(cfs_rq);
4842	}
4843
4844	static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
4845	struct task_struct *p)
4846	{
4847	unsigned int enqueued;
4848
4849	if (!sched_feat(UTIL_EST))
4850	return;
4851
4852	/ Update root cfs_rq's estimated utilization /
4853	enqueued = cfs_rq->avg.util_est;
4854	enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
4855	WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
4856
4857	trace_sched_util_est_cfs_tp(cfs_rq);
4858	}
4859
/* 1/100 of SCHED_CAPACITY_SCALE: the "~1%" margin used by util_est_update(). */
#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
4861
4862	static inline void util_est_update(struct cfs_rq *cfs_rq,
4863	struct task_struct *p,
4864	bool task_sleep)
4865	{
4866	unsigned int ewma, dequeued, last_ewma_diff;
4867
4868	if (!sched_feat(UTIL_EST))
4869	return;
4870
4871	/*
4872	* Skip update of task's estimated utilization when the task has not
4873	* yet completed an activation, e.g. being migrated.
4874	*/
4875	if (!task_sleep)
4876	return;
4877
4878	/ Get current estimate of utilization /
4879	ewma = READ_ONCE(p->se.avg.util_est);
4880
4881	/*
4882	* If the PELT values haven't changed since enqueue time,
4883	* skip the util_est update.
4884	*/
4885	if (ewma & UTIL_AVG_UNCHANGED)
4886	return;
4887
4888	/ Get utilization at dequeue /
4889	dequeued = task_util(p);
4890
4891	/*
4892	* Reset EWMA on utilization increases, the moving average is used only
4893	* to smooth utilization decreases.
4894	*/
4895	if (ewma <= dequeued) {
4896	ewma = dequeued;
4897	goto done;
4898	}
4899
4900	/*
4901	* Skip update of task's estimated utilization when its members are
4902	* already ~1% close to its last activation value.
4903	*/
4904	last_ewma_diff = ewma - dequeued;
4905	if (last_ewma_diff < UTIL_EST_MARGIN)
4906	goto done;
4907
4908	/*
4909	* To avoid underestimate of task utilization, skip updates of EWMA if
4910	* we cannot grant that thread got all CPU time it wanted.
4911	*/
4912	if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
4913	goto done;
4914
4915
4916	/*
4917	* Update Task's estimated utilization
4918	*
4919	* When *p completes an activation we can consolidate another sample
4920	* of the task size. This is done by using this value to update the
4921	* Exponential Weighted Moving Average (EWMA):
4922	*
4923	* ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
4924	* = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
4925	* = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
4926	* = w * ( -last_ewma_diff ) + ewma(t-1)
4927	* = w * (-last_ewma_diff + ewma(t-1) / w)
4928	*
4929	* Where 'w' is the weight of new samples, which is configured to be
4930	* 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
4931	*/
4932	ewma <<= UTIL_EST_WEIGHT_SHIFT;
4933	ewma -= last_ewma_diff;
4934	ewma >>= UTIL_EST_WEIGHT_SHIFT;
4935	done:
4936	ewma \|= UTIL_AVG_UNCHANGED;
4937	WRITE_ONCE(p->se.avg.util_est, ewma);
4938
4939	trace_sched_util_est_se_tp(se: &p->se);
4940	}
4941
/*
 * Capacity of @cpu as reported by arch_scale_cpu_capacity(), reduced by the
 * larger of hw_load_avg() for its rq and cpufreq_get_pressure().
 */
static inline unsigned long get_actual_cpu_capacity(int cpu)
{
	unsigned long max_capacity = arch_scale_cpu_capacity(cpu);

	return max_capacity - max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
}
4950
4951	static inline int util_fits_cpu(unsigned long util,
4952	unsigned long uclamp_min,
4953	unsigned long uclamp_max,
4954	int cpu)
4955	{
4956	unsigned long capacity = capacity_of(cpu);
4957	unsigned long capacity_orig;
4958	bool fits, uclamp_max_fits;
4959
4960	/*
4961	* Check if the real util fits without any uclamp boost/cap applied.
4962	*/
4963	fits = fits_capacity(util, capacity);
4964
4965	if (!uclamp_is_used())
4966	return fits;
4967
4968	/*
4969	* We must use arch_scale_cpu_capacity() for comparing against uclamp_min and
4970	* uclamp_max. We only care about capacity pressure (by using
4971	* capacity_of()) for comparing against the real util.
4972	*
4973	* If a task is boosted to 1024 for example, we don't want a tiny
4974	* pressure to skew the check whether it fits a CPU or not.
4975	*
4976	* Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
4977	* should fit a little cpu even if there's some pressure.
4978	*
4979	* Only exception is for HW or cpufreq pressure since it has a direct impact
4980	* on available OPP of the system.
4981	*
4982	* We honour it for uclamp_min only as a drop in performance level
4983	* could result in not getting the requested minimum performance level.
4984	*
4985	* For uclamp_max, we can tolerate a drop in performance level as the
4986	* goal is to cap the task. So it's okay if it's getting less.
4987	*/
4988	capacity_orig = arch_scale_cpu_capacity(cpu);
4989
4990	/*
4991	* We want to force a task to fit a cpu as implied by uclamp_max.
4992	* But we do have some corner cases to cater for..
4993	*
4994	*
4995	* C=z
4996	* \| ___
4997	* \| C=y \| \|
4998	* \|_ _ _ _ _ _ _ _ _ ___ _ _ _ \| _ \| _ _ _ _ _ uclamp_max
4999	* \| C=x \| \| \| \|
5000	* \| ___ \| \| \| \|
5001	* \| \| \| \| \| \| \| (util somewhere in this region)
5002	* \| \| \| \| \| \| \|
5003	* \| \| \| \| \| \| \|
5004	* +----------------------------------------
5005	* CPU0 CPU1 CPU2
5006	*
5007	* In the above example if a task is capped to a specific performance
5008	* point, y, then when:
5009	*
5010	* * util = 80% of x then it does not fit on CPU0 and should migrate
5011	* to CPU1
5012	* * util = 80% of y then it is forced to fit on CPU1 to honour
5013	* uclamp_max request.
5014	*
5015	* which is what we're enforcing here. A task always fits if
5016	* uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
5017	* the normal upmigration rules should withhold still.
5018	*
5019	* Only exception is when we are on max capacity, then we need to be
5020	* careful not to block overutilized state. This is so because:
5021	*
5022	* 1. There's no concept of capping at max_capacity! We can't go
5023	* beyond this performance level anyway.
5024	* 2. The system is being saturated when we're operating near
5025	* max capacity, it doesn't make sense to block overutilized.
5026	*/
5027	uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
5028	uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
5029	fits = fits \|\| uclamp_max_fits;
5030
5031	/*
5032	*
5033	* C=z
5034	* \| ___ (region a, capped, util >= uclamp_max)
5035	* \| C=y \| \|
5036	* \|_ _ _ _ _ _ _ _ _ ___ _ _ _ \| _ \| _ _ _ _ _ uclamp_max
5037	* \| C=x \| \| \| \|
5038	* \| ___ \| \| \| \| (region b, uclamp_min <= util <= uclamp_max)
5039	* \|_ _ _\|_ _\|_ _ _ _\| _ \| _ _ _\| _ \| _ _ _ _ _ uclamp_min
5040	* \| \| \| \| \| \| \|
5041	* \| \| \| \| \| \| \| (region c, boosted, util < uclamp_min)
5042	* +----------------------------------------
5043	* CPU0 CPU1 CPU2
5044	*
5045	* a) If util > uclamp_max, then we're capped, we don't care about
5046	* actual fitness value here. We only care if uclamp_max fits
5047	* capacity without taking margin/pressure into account.
5048	* See comment above.
5049	*
5050	* b) If uclamp_min <= util <= uclamp_max, then the normal
5051	* fits_capacity() rules apply. Except we need to ensure that we
5052	* enforce we remain within uclamp_max, see comment above.
5053	*
5054	* c) If util < uclamp_min, then we are boosted. Same as (b) but we
5055	* need to take into account the boosted value fits the CPU without
5056	* taking margin/pressure into account.
5057	*
5058	* Cases (a) and (b) are handled in the 'fits' variable already. We
5059	* just need to consider an extra check for case (c) after ensuring we
5060	* handle the case uclamp_min > uclamp_max.
5061	*/
5062	uclamp_min = min(uclamp_min, uclamp_max);
5063	if (fits && (util < uclamp_min) &&
5064	(uclamp_min > get_actual_cpu_capacity(cpu)))
5065	return -`1`;
5066
5067	return fits;
5068	}
5069
5070	static inline int task_fits_cpu(struct task_struct p, int* cpu)
5071	{
5072	unsigned long uclamp_min = uclamp_eff_value(p, clamp_id: UCLAMP_MIN);
5073	unsigned long uclamp_max = uclamp_eff_value(p, clamp_id: UCLAMP_MAX);
5074	unsigned long util = task_util_est(p);
5075	/*
5076	* Return true only if the cpu fully fits the task requirements, which
5077	* include the utilization but also the performance hints.
5078	*/
5079	return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > `0`);
5080	}
5081
5082	static inline void update_misfit_status(struct task_struct p, struct* rq *rq)
5083	{
5084	int cpu = cpu_of(rq);
5085
5086	if (!sched_asym_cpucap_active())
5087	return;
5088
5089	/*
5090	* Affinity allows us to go somewhere higher? Or are we on biggest
5091	* available CPU already? Or do we fit into this CPU ?
5092	*/
5093	if (!p \|\| (p->nr_cpus_allowed == `1`) \|\|
5094	(arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) \|\|
5095	task_fits_cpu(p, cpu)) {
5096
5097	rq->misfit_task_load = `0`;
5098	return;
5099	}
5100
5101	/*
5102	* Make sure that misfit_task_load will not be null even if
5103	* task_h_load() returns 0.
5104	*/
5105	rq->misfit_task_load = max_t(unsigned long, task_h_load(p), `1`);
5106	}
5107
5108	void __setparam_fair(struct task_struct p, const* struct sched_attr *attr)
5109	{
5110	struct sched_entity *se = &p->se;
5111
5112	p->static_prio = NICE_TO_PRIO(attr->sched_nice);
5113	if (attr->sched_runtime) {
5114	se->custom_slice = `1`;
5115	se->slice = clamp_t(u64, attr->sched_runtime,
5116	NSEC_PER_MSEC/`10`, / HZ=1000 * 10 /
5117	NSEC_PER_MSEC`100`); /* HZ=100 / 10 /
5118	} else {
5119	se->custom_slice = `0`;
5120	se->slice = sysctl_sched_base_slice;
5121	}
5122	}
5123
5124	static void
5125	place_entity(struct cfs_rq cfs_rq, struct* sched_entity se, int* flags)
5126	{
5127	u64 vslice, vruntime = avg_vruntime(cfs_rq);
5128	s64 lag = `0`;
5129
5130	if (!se->custom_slice)
5131	se->slice = sysctl_sched_base_slice;
5132	vslice = calc_delta_fair(delta: se->slice, se);
5133
5134	/*
5135	* Due to how V is constructed as the weighted average of entities,
5136	* adding tasks with positive lag, or removing tasks with negative lag
5137	* will move 'time' backwards, this can screw around with the lag of
5138	* other tasks.
5139	*
5140	* EEVDF: placement strategy #1 / #2
5141	*/
5142	if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
5143	struct sched_entity *curr = cfs_rq->curr;
5144	unsigned long load;
5145
5146	lag = se->vlag;
5147
5148	/*
5149	* If we want to place a task and preserve lag, we have to
5150	* consider the effect of the new entity on the weighted
5151	* average and compensate for this, otherwise lag can quickly
5152	* evaporate.
5153	*
5154	* Lag is defined as:
5155	*
5156	* lag_i = S - s_i = w_i * (V - v_i)
5157	*
5158	* To avoid the 'w_i' term all over the place, we only track
5159	* the virtual lag:
5160	*
5161	* vl_i = V - v_i <=> v_i = V - vl_i
5162	*
5163	* And we take V to be the weighted average of all v:
5164	*
5165	* V = (\Sum w_j*v_j) / W
5166	*
5167	* Where W is: \Sum w_j
5168	*
5169	* Then, the weighted average after adding an entity with lag
5170	* vl_i is given by:
5171	*
5172	* V' = (\Sum w_jv_j + w_iv_i) / (W + w_i)
5173	* = (WV + w_i(V - vl_i)) / (W + w_i)
5174	* = (WV + w_iV - w_i*vl_i) / (W + w_i)
5175	* = (V(W + w_i) - w_ivl_i) / (W + w_i)
5176	* = V - w_i*vl_i / (W + w_i)
5177	*
5178	* And the actual lag after adding an entity with vl_i is:
5179	*
5180	* vl'_i = V' - v_i
5181	* = V - w_i*vl_i / (W + w_i) - (V - vl_i)
5182	* = vl_i - w_i*vl_i / (W + w_i)
5183	*
5184	* Which is strictly less than vl_i. So in order to preserve lag
5185	* we should inflate the lag before placement such that the
5186	* effective lag after placement comes out right.
5187	*
5188	* As such, invert the above relation for vl'_i to get the vl_i
5189	* we need to use such that the lag after placement is the lag
5190	* we computed before dequeue.
5191	*
5192	* vl'_i = vl_i - w_i*vl_i / (W + w_i)
5193	* = ((W + w_i)vl_i - w_ivl_i) / (W + w_i)
5194	*
5195	* (W + w_i)vl'_i = (W + w_i)vl_i - w_i*vl_i
5196	* = W*vl_i
5197	*
5198	* vl_i = (W + w_i)*vl'_i / W
5199	*/
5200	load = cfs_rq->avg_load;
5201	if (curr && curr->on_rq)
5202	load += scale_load_down(curr->load.weight);
5203
5204	lag *= load + scale_load_down(se->load.weight);
5205	if (WARN_ON_ONCE(!load))
5206	load = `1`;
5207	lag = div_s64(dividend: lag, divisor: load);
5208	}
5209
5210	se->vruntime = vruntime - lag;
5211
5212	if (se->rel_deadline) {
5213	se->deadline += se->vruntime;
5214	se->rel_deadline = `0`;
5215	return;
5216	}
5217
5218	/*
5219	* When joining the competition; the existing tasks will be,
5220	* on average, halfway through their slice, as such start tasks
5221	* off with half a slice to ease into the competition.
5222	*/
5223	if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
5224	vslice /= `2`;
5225
5226	/*
5227	* EEVDF: vd_i = ve_i + r_i/w_i
5228	*/
5229	se->deadline = se->vruntime + vslice;
5230	}
5231
5232	static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
5233	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
5234
5235	static void
5236	requeue_delayed_entity(struct sched_entity *se);
5237
5238	static void
5239	enqueue_entity(struct cfs_rq cfs_rq, struct* sched_entity se, int* flags)
5240	{
5241	bool curr = cfs_rq->curr == se;
5242
5243	/*
5244	* If we're the current task, we must renormalise before calling
5245	* update_curr().
5246	*/
5247	if (curr)
5248	place_entity(cfs_rq, se, flags);
5249
5250	update_curr(cfs_rq);
5251
5252	/*
5253	* When enqueuing a sched_entity, we must:
5254	* - Update loads to have both entity and cfs_rq synced with now.
5255	* - For group_entity, update its runnable_weight to reflect the new
5256	* h_nr_runnable of its group cfs_rq.
5257	* - For group_entity, update its weight to reflect the new share of
5258	* its group cfs_rq
5259	* - Add its new weight to cfs_rq->load.weight
5260	*/
5261	update_load_avg(cfs_rq, se, UPDATE_TG \| DO_ATTACH);
5262	se_update_runnable(se);
5263	/*
5264	* XXX update_load_avg() above will have attached us to the pelt sum;
5265	* but update_cfs_group() here will re-adjust the weight and have to
5266	* undo/redo all that. Seems wasteful.
5267	*/
5268	update_cfs_group(se);
5269
5270	/*
5271	* XXX now that the entity has been re-weighted, and it's lag adjusted,
5272	* we can place the entity.
5273	*/
5274	if (!curr)
5275	place_entity(cfs_rq, se, flags);
5276
5277	account_entity_enqueue(cfs_rq, se);
5278
5279	/ Entity has migrated, no longer consider this task hot /
5280	if (flags & ENQUEUE_MIGRATED)
5281	se->exec_start = `0`;
5282
5283	check_schedstat_required();
5284	update_stats_enqueue_fair(cfs_rq, se, flags);
5285	if (!curr)
5286	__enqueue_entity(cfs_rq, se);
5287	se->on_rq = `1`;
5288
5289	if (cfs_rq->nr_queued == `1`) {
5290	check_enqueue_throttle(cfs_rq);
5291	list_add_leaf_cfs_rq(cfs_rq);
5292	#ifdef CONFIG_CFS_BANDWIDTH
5293	if (cfs_rq->pelt_clock_throttled) {
5294	struct rq *rq = rq_of(cfs_rq);
5295
5296	cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
5297	cfs_rq->throttled_clock_pelt;
5298	cfs_rq->pelt_clock_throttled = `0`;
5299	}
5300	#endif
5301	}
5302	}
5303
5304	static void __clear_buddies_next(struct sched_entity *se)
5305	{
5306	for_each_sched_entity(se) {
5307	struct cfs_rq *cfs_rq = cfs_rq_of(se);
5308	if (cfs_rq->next != se)
5309	break;
5310
5311	cfs_rq->next = NULL;
5312	}
5313	}
5314
5315	static void clear_buddies(struct cfs_rq cfs_rq, struct* sched_entity *se)
5316	{
5317	if (cfs_rq->next == se)
5318	__clear_buddies_next(se);
5319	}
5320
5321	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
5322
5323	static void set_delayed(struct sched_entity *se)
5324	{
5325	se->sched_delayed = `1`;
5326
5327	/*
5328	* Delayed se of cfs_rq have no tasks queued on them.
5329	* Do not adjust h_nr_runnable since dequeue_entities()
5330	* will account it for blocked tasks.
5331	*/
5332	if (!entity_is_task(se))
5333	return;
5334
5335	for_each_sched_entity(se) {
5336	struct cfs_rq *cfs_rq = cfs_rq_of(se);
5337
5338	cfs_rq->h_nr_runnable--;
5339	}
5340	}
5341
5342	static void clear_delayed(struct sched_entity *se)
5343	{
5344	se->sched_delayed = `0`;
5345
5346	/*
5347	* Delayed se of cfs_rq have no tasks queued on them.
5348	* Do not adjust h_nr_runnable since a dequeue has
5349	* already accounted for it or an enqueue of a task
5350	* below it will account for it in enqueue_task_fair().
5351	*/
5352	if (!entity_is_task(se))
5353	return;
5354
5355	for_each_sched_entity(se) {
5356	struct cfs_rq *cfs_rq = cfs_rq_of(se);
5357
5358	cfs_rq->h_nr_runnable++;
5359	}
5360	}
5361
5362	static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
5363	{
5364	clear_delayed(se);
5365	if (sched_feat(DELAY_ZERO) && se->vlag > `0`)
5366	se->vlag = `0`;
5367	}
5368
5369	static bool
5370	dequeue_entity(struct cfs_rq cfs_rq, struct* sched_entity se, int* flags)
5371	{
5372	bool sleep = flags & DEQUEUE_SLEEP;
5373	int action = UPDATE_TG;
5374
5375	update_curr(cfs_rq);
5376	clear_buddies(cfs_rq, se);
5377
5378	if (flags & DEQUEUE_DELAYED) {
5379	WARN_ON_ONCE(!se->sched_delayed);
5380	} else {
5381	bool delay = sleep;
5382	/*
5383	* DELAY_DEQUEUE relies on spurious wakeups, special task
5384	* states must not suffer spurious wakeups, excempt them.
5385	*/
5386	if (flags & (DEQUEUE_SPECIAL \| DEQUEUE_THROTTLE))
5387	delay = false;
5388
5389	WARN_ON_ONCE(delay && se->sched_delayed);
5390
5391	if (sched_feat(DELAY_DEQUEUE) && delay &&
5392	!entity_eligible(cfs_rq, se)) {
5393	update_load_avg(cfs_rq, se, flags: `0`);
5394	set_delayed(se);
5395	return false;
5396	}
5397	}
5398
5399	if (entity_is_task(se) && task_on_rq_migrating(p: task_of(se)))
5400	action \|= DO_DETACH;
5401
5402	/*
5403	* When dequeuing a sched_entity, we must:
5404	* - Update loads to have both entity and cfs_rq synced with now.
5405	* - For group_entity, update its runnable_weight to reflect the new
5406	* h_nr_runnable of its group cfs_rq.
5407	* - Subtract its previous weight from cfs_rq->load.weight.
5408	* - For group entity, update its weight to reflect the new share
5409	* of its group cfs_rq.
5410	*/
5411	update_load_avg(cfs_rq, se, flags: action);
5412	se_update_runnable(se);
5413
5414	update_stats_dequeue_fair(cfs_rq, se, flags);
5415
5416	update_entity_lag(cfs_rq, se);
5417	if (sched_feat(PLACE_REL_DEADLINE) && !sleep) {
5418	se->deadline -= se->vruntime;
5419	se->rel_deadline = `1`;
5420	}
5421
5422	if (se != cfs_rq->curr)
5423	__dequeue_entity(cfs_rq, se);
5424	se->on_rq = `0`;
5425	account_entity_dequeue(cfs_rq, se);
5426
5427	/ return excess runtime on last dequeue /
5428	return_cfs_rq_runtime(cfs_rq);
5429
5430	update_cfs_group(se);
5431
5432	/*
5433	* Now advance min_vruntime if @se was the entity holding it back,
5434	* except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
5435	* put back on, and if we advance min_vruntime, we'll be placed back
5436	* further than we started -- i.e. we'll be penalized.
5437	*/
5438	if ((flags & (DEQUEUE_SAVE \| DEQUEUE_MOVE)) != DEQUEUE_SAVE)
5439	update_min_vruntime(cfs_rq);
5440
5441	if (flags & DEQUEUE_DELAYED)
5442	finish_delayed_dequeue_entity(se);
5443
5444	if (cfs_rq->nr_queued == `0`) {
5445	update_idle_cfs_rq_clock_pelt(cfs_rq);
5446	#ifdef CONFIG_CFS_BANDWIDTH
5447	if (throttled_hierarchy(cfs_rq)) {
5448	struct rq *rq = rq_of(cfs_rq);
5449
5450	list_del_leaf_cfs_rq(cfs_rq);
5451	cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
5452	cfs_rq->pelt_clock_throttled = `1`;
5453	}
5454	#endif
5455	}
5456
5457	return true;
5458	}
5459
5460	static void
5461	set_next_entity(struct cfs_rq cfs_rq, struct* sched_entity *se)
5462	{
5463	clear_buddies(cfs_rq, se);
5464
5465	/ 'current' is not kept within the tree. /
5466	if (se->on_rq) {
5467	/*
5468	* Any task has to be enqueued before it get to execute on
5469	* a CPU. So account for the time it spent waiting on the
5470	* runqueue.
5471	*/
5472	update_stats_wait_end_fair(cfs_rq, se);
5473	__dequeue_entity(cfs_rq, se);
5474	update_load_avg(cfs_rq, se, UPDATE_TG);
5475
5476	set_protect_slice(cfs_rq, se);
5477	}
5478
5479	update_stats_curr_start(cfs_rq, se);
5480	WARN_ON_ONCE(cfs_rq->curr);
5481	cfs_rq->curr = se;
5482
5483	/*
5484	* Track our maximum slice length, if the CPU's load is at
5485	* least twice that of our own weight (i.e. don't track it
5486	* when there are only lesser-weight tasks around):
5487	*/
5488	if (schedstat_enabled() &&
5489	rq_of(cfs_rq)->cfs.load.weight >= `2`*se->load.weight) {
5490	struct sched_statistics *stats;
5491
5492	stats = __schedstats_from_se(se);
5493	__schedstat_set(stats->slice_max,
5494	max((u64)stats->slice_max,
5495	se->sum_exec_runtime - se->prev_sum_exec_runtime));
5496	}
5497
5498	se->prev_sum_exec_runtime = se->sum_exec_runtime;
5499	}
5500
5501	static int dequeue_entities(struct rq rq, struct* sched_entity se, int* flags);
5502
5503	/*
5504	* Pick the next process, keeping these things in mind, in this order:
5505	* 1) keep things fair between processes/task groups
5506	* 2) pick the "next" process, since someone really wants that to run
5507	* 3) pick the "last" process, for cache locality
5508	* 4) do not run the "skip" process, if something else is available
5509	*/
5510	static struct sched_entity *
5511	pick_next_entity(struct rq rq, struct* cfs_rq *cfs_rq)
5512	{
5513	struct sched_entity *se;
5514
5515	/*
5516	* Picking the ->next buddy will affect latency but not fairness.
5517	*/
5518	if (sched_feat(PICK_BUDDY) &&
5519	cfs_rq->next && entity_eligible(cfs_rq, se: cfs_rq->next)) {
5520	/ ->next will never be delayed /
5521	WARN_ON_ONCE(cfs_rq->next->sched_delayed);
5522	return cfs_rq->next;
5523	}
5524
5525	se = pick_eevdf(cfs_rq);
5526	if (se->sched_delayed) {
5527	dequeue_entities(rq, se, DEQUEUE_SLEEP \| DEQUEUE_DELAYED);
5528	/*
5529	* Must not reference @se again, see __block_task().
5530	*/
5531	return NULL;
5532	}
5533	return se;
5534	}
5535
5536	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
5537
5538	static void put_prev_entity(struct cfs_rq cfs_rq, struct* sched_entity *prev)
5539	{
5540	/*
5541	* If still on the runqueue then deactivate_task()
5542	* was not called and update_curr() has to be done:
5543	*/
5544	if (prev->on_rq)
5545	update_curr(cfs_rq);
5546
5547	/ throttle cfs_rqs exceeding runtime /
5548	check_cfs_rq_runtime(cfs_rq);
5549
5550	if (prev->on_rq) {
5551	update_stats_wait_start_fair(cfs_rq, se: prev);
5552	/ Put 'current' back into the tree. /
5553	__enqueue_entity(cfs_rq, se: prev);
5554	/ in !on_rq case, update occurred at dequeue /
5555	update_load_avg(cfs_rq, se: prev, flags: `0`);
5556	}
5557	WARN_ON_ONCE(cfs_rq->curr != prev);
5558	cfs_rq->curr = NULL;
5559	}
5560
5561	static void
5562	entity_tick(struct cfs_rq cfs_rq, struct* sched_entity curr, int* queued)
5563	{
5564	/*
5565	* Update run-time statistics of the 'current'.
5566	*/
5567	update_curr(cfs_rq);
5568
5569	/*
5570	* Ensure that runnable average is periodically updated.
5571	*/
5572	update_load_avg(cfs_rq, se: curr, UPDATE_TG);
5573	update_cfs_group(se: curr);
5574
5575	#ifdef CONFIG_SCHED_HRTICK
5576	/*
5577	* queued ticks are scheduled to match the slice, so don't bother
5578	* validating it and just reschedule.
5579	*/
5580	if (queued) {
5581	resched_curr_lazy(rq: rq_of(cfs_rq));
5582	return;
5583	}
5584	#endif
5585	}
5586
5587
5588	/**************************************************
5589	* CFS bandwidth control machinery
5590	*/
5591
5592	#ifdef CONFIG_CFS_BANDWIDTH
5593
5594	#ifdef CONFIG_JUMP_LABEL
5595	static struct static_key __cfs_bandwidth_used;
5596
5597	static inline bool cfs_bandwidth_used(void)
5598	{
5599	return static_key_false(&__cfs_bandwidth_used);
5600	}
5601
5602	void cfs_bandwidth_usage_inc(void)
5603	{
5604	static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
5605	}
5606
5607	void cfs_bandwidth_usage_dec(void)
5608	{
5609	static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
5610	}
5611	#else /* !CONFIG_JUMP_LABEL: */
5612	static bool cfs_bandwidth_used(void)
5613	{
5614	return true;
5615	}
5616
5617	void cfs_bandwidth_usage_inc(void) {}
5618	void cfs_bandwidth_usage_dec(void) {}
5619	#endif /* !CONFIG_JUMP_LABEL */
5620
5621	static inline u64 sched_cfs_bandwidth_slice(void)
5622	{
5623	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
5624	}
5625
5626	/*
5627	* Replenish runtime according to assigned quota. We use sched_clock_cpu
5628	* directly instead of rq->clock to avoid adding additional synchronization
5629	* around rq->lock.
5630	*
5631	* requires cfs_b->lock
5632	*/
5633	void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
5634	{
5635	s64 runtime;
5636
5637	if (unlikely(cfs_b->quota == RUNTIME_INF))
5638	return;
5639
5640	cfs_b->runtime += cfs_b->quota;
5641	runtime = cfs_b->runtime_snap - cfs_b->runtime;
5642	if (runtime > `0`) {
5643	cfs_b->burst_time += runtime;
5644	cfs_b->nr_burst++;
5645	}
5646
5647	cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
5648	cfs_b->runtime_snap = cfs_b->runtime;
5649	}
5650
5651	static inline struct cfs_bandwidth tg_cfs_bandwidth(struct* task_group *tg)
5652	{
5653	return &tg->cfs_bandwidth;
5654	}
5655
5656	/ returns 0 on failure to allocate runtime /
5657	static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
5658	struct cfs_rq *cfs_rq, u64 target_runtime)
5659	{
5660	u64 min_amount, amount = `0`;
5661
5662	lockdep_assert_held(&cfs_b->lock);
5663
5664	/ note: this is a positive sum as runtime_remaining <= 0 /
5665	min_amount = target_runtime - cfs_rq->runtime_remaining;
5666
5667	if (cfs_b->quota == RUNTIME_INF)
5668	amount = min_amount;
5669	else {
5670	start_cfs_bandwidth(cfs_b);
5671
5672	if (cfs_b->runtime > `0`) {
5673	amount = min(cfs_b->runtime, min_amount);
5674	cfs_b->runtime -= amount;
5675	cfs_b->idle = `0`;
5676	}
5677	}
5678
5679	cfs_rq->runtime_remaining += amount;
5680
5681	return cfs_rq->runtime_remaining > `0`;
5682	}
5683
5684	/ returns 0 on failure to allocate runtime /
5685	static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5686	{
5687	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5688	int ret;
5689
5690	raw_spin_lock(&cfs_b->lock);
5691	ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
5692	raw_spin_unlock(&cfs_b->lock);
5693
5694	return ret;
5695	}
5696
5697	static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5698	{
5699	/ dock delta_exec before expiring quota (as it could span periods) /
5700	cfs_rq->runtime_remaining -= delta_exec;
5701
5702	if (likely(cfs_rq->runtime_remaining > `0`))
5703	return;
5704
5705	if (cfs_rq->throttled)
5706	return;
5707	/*
5708	* if we're unable to extend our runtime we resched so that the active
5709	* hierarchy can be throttled
5710	*/
5711	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
5712	resched_curr(rq_of(cfs_rq));
5713	}
5714
5715	static __always_inline
5716	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5717	{
5718	if (!cfs_bandwidth_used() \|\| !cfs_rq->runtime_enabled)
5719	return;
5720
5721	__account_cfs_rq_runtime(cfs_rq, delta_exec);
5722	}
5723
5724	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5725	{
5726	return cfs_bandwidth_used() && cfs_rq->throttled;
5727	}
5728
5729	static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
5730	{
5731	return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled;
5732	}
5733
5734	/ check whether cfs_rq, or any parent, is throttled /
5735	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5736	{
5737	return cfs_bandwidth_used() && cfs_rq->throttle_count;
5738	}
5739
5740	static inline int lb_throttled_hierarchy(struct task_struct p, int* dst_cpu)
5741	{
5742	return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]);
5743	}
5744
5745	static inline bool task_is_throttled(struct task_struct *p)
5746	{
5747	return cfs_bandwidth_used() && p->throttled;
5748	}
5749
5750	static bool dequeue_task_fair(struct rq rq, struct* task_struct p, int* flags);
5751	static void throttle_cfs_rq_work(struct callback_head *work)
5752	{
5753	struct task_struct p = container_of(work, struct* task_struct, sched_throttle_work);
5754	struct sched_entity *se;
5755	struct cfs_rq *cfs_rq;
5756	struct rq *rq;
5757
5758	WARN_ON_ONCE(p != current);
5759	p->sched_throttle_work.next = &p->sched_throttle_work;
5760
5761	/*
5762	* If task is exiting, then there won't be a return to userspace, so we
5763	* don't have to bother with any of this.
5764	*/
5765	if ((p->flags & PF_EXITING))
5766	return;
5767
5768	scoped_guard(task_rq_lock, p) {
5769	se = &p->se;
5770	cfs_rq = cfs_rq_of(se);
5771
5772	/ Raced, forget /
5773	if (p->sched_class != &fair_sched_class)
5774	return;
5775
5776	/*
5777	* If not in limbo, then either replenish has happened or this
5778	* task got migrated out of the throttled cfs_rq, move along.
5779	*/
5780	if (!cfs_rq->throttle_count)
5781	return;
5782	rq = scope.rq;
5783	update_rq_clock(rq);
5784	WARN_ON_ONCE(p->throttled \|\| !list_empty(&p->throttle_node));
5785	dequeue_task_fair(rq, p, DEQUEUE_SLEEP \| DEQUEUE_THROTTLE);
5786	list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
5787	/*
5788	* Must not set throttled before dequeue or dequeue will
5789	* mistakenly regard this task as an already throttled one.
5790	*/
5791	p->throttled = true;
5792	resched_curr(rq);
5793	}
5794	}
5795
5796	void init_cfs_throttle_work(struct task_struct *p)
5797	{
5798	init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work);
5799	/ Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() /
5800	p->sched_throttle_work.next = &p->sched_throttle_work;
5801	INIT_LIST_HEAD(&p->throttle_node);
5802	}
5803
5804	/*
5805	* Task is throttled and someone wants to dequeue it again:
5806	* it could be sched/core when core needs to do things like
5807	* task affinity change, task group change, task sched class
5808	* change etc. and in these cases, DEQUEUE_SLEEP is not set;
5809	* or the task is blocked after throttled due to freezer etc.
5810	* and in these cases, DEQUEUE_SLEEP is set.
5811	*/
5812	static void detach_task_cfs_rq(struct task_struct *p);
5813	static void dequeue_throttled_task(struct task_struct p, int* flags)
5814	{
5815	WARN_ON_ONCE(p->se.on_rq);
5816	list_del_init(&p->throttle_node);
5817
5818	/ task blocked after throttled /
5819	if (flags & DEQUEUE_SLEEP) {
5820	p->throttled = false;
5821	return;
5822	}
5823
5824	/*
5825	* task is migrating off its old cfs_rq, detach
5826	* the task's load from its old cfs_rq.
5827	*/
5828	if (task_on_rq_migrating(p))
5829	detach_task_cfs_rq(p);
5830	}
5831
5832	static bool enqueue_throttled_task(struct task_struct *p)
5833	{
5834	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
5835
5836	/ @p should have gone through dequeue_throttled_task() first /
5837	WARN_ON_ONCE(!list_empty(&p->throttle_node));
5838
5839	/*
5840	* If the throttled task @p is enqueued to a throttled cfs_rq,
5841	* take the fast path by directly putting the task on the
5842	* target cfs_rq's limbo list.
5843	*
5844	* Do not do that when @p is current because the following race can
5845	* cause @p's group_node to be incorectly re-insterted in its rq's
5846	* cfs_tasks list, despite being throttled:
5847	*
5848	* cpuX cpuY
5849	* p ret2user
5850	* throttle_cfs_rq_work() sched_move_task(p)
5851	* LOCK task_rq_lock
5852	* dequeue_task_fair(p)
5853	* UNLOCK task_rq_lock
5854	* LOCK task_rq_lock
5855	* task_current_donor(p) == true
5856	* task_on_rq_queued(p) == true
5857	* dequeue_task(p)
5858	* put_prev_task(p)
5859	* sched_change_group()
5860	* enqueue_task(p) -> p's new cfs_rq
5861	* is throttled, go
5862	* fast path and skip
5863	* actual enqueue
5864	* set_next_task(p)
5865	* list_move(&se->group_node, &rq->cfs_tasks); // bug
5866	* schedule()
5867	*
5868	* In the above race case, @p current cfs_rq is in the same rq as
5869	* its previous cfs_rq because sched_move_task() only moves a task
5870	* to a different group from the same rq, so we can use its current
5871	* cfs_rq to derive rq and test if the task is current.
5872	*/
5873	if (throttled_hierarchy(cfs_rq) &&
5874	!task_current_donor(rq_of(cfs_rq), p)) {
5875	list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
5876	return true;
5877	}
5878
5879	/ we can't take the fast path, do an actual enqueue/
5880	p->throttled = false;
5881	return false;
5882	}
5883
5884	static void enqueue_task_fair(struct rq rq, struct* task_struct p, int* flags);
5885	static int tg_unthrottle_up(struct task_group tg, void* *data)
5886	{
5887	struct rq *rq = data;
5888	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5889	struct task_struct p, tmp;
5890
5891	if (--cfs_rq->throttle_count)
5892	return `0`;
5893
5894	if (cfs_rq->pelt_clock_throttled) {
5895	cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
5896	cfs_rq->throttled_clock_pelt;
5897	cfs_rq->pelt_clock_throttled = `0`;
5898	}
5899
5900	if (cfs_rq->throttled_clock_self) {
5901	u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
5902
5903	cfs_rq->throttled_clock_self = `0`;
5904
5905	if (WARN_ON_ONCE((s64)delta < `0`))
5906	delta = `0`;
5907
5908	cfs_rq->throttled_clock_self_time += delta;
5909	}
5910
5911	/ Re-enqueue the tasks that have been throttled at this level. /
5912	list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
5913	list_del_init(&p->throttle_node);
5914	p->throttled = false;
5915	enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
5916	}
5917
5918	/ Add cfs_rq with load or one or more already running entities to the list /
5919	if (!cfs_rq_is_decayed(cfs_rq))
5920	list_add_leaf_cfs_rq(cfs_rq);
5921
5922	return `0`;
5923	}
5924
5925	static inline bool task_has_throttle_work(struct task_struct *p)
5926	{
5927	return p->sched_throttle_work.next != &p->sched_throttle_work;
5928	}
5929
5930	static inline void task_throttle_setup_work(struct task_struct *p)
5931	{
5932	if (task_has_throttle_work(p))
5933	return;
5934
5935	/*
5936	* Kthreads and exiting tasks don't return to userspace, so adding the
5937	* work is pointless
5938	*/
5939	if ((p->flags & (PF_EXITING \| PF_KTHREAD)))
5940	return;
5941
5942	task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
5943	}
5944
5945	static void record_throttle_clock(struct cfs_rq *cfs_rq)
5946	{
5947	struct rq *rq = rq_of(cfs_rq);
5948
5949	if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
5950	cfs_rq->throttled_clock = rq_clock(rq);
5951
5952	if (!cfs_rq->throttled_clock_self)
5953	cfs_rq->throttled_clock_self = rq_clock(rq);
5954	}
5955
5956	static int tg_throttle_down(struct task_group tg, void* *data)
5957	{
5958	struct rq *rq = data;
5959	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5960
5961	if (cfs_rq->throttle_count++)
5962	return `0`;
5963
5964	/*
5965	* For cfs_rqs that still have entities enqueued, PELT clock
5966	* stop happens at dequeue time when all entities are dequeued.
5967	*/
5968	if (!cfs_rq->nr_queued) {
5969	list_del_leaf_cfs_rq(cfs_rq);
5970	cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
5971	cfs_rq->pelt_clock_throttled = `1`;
5972	}
5973
5974	WARN_ON_ONCE(cfs_rq->throttled_clock_self);
5975	WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
5976	return `0`;
5977	}
5978
5979	static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
5980	{
5981	struct rq *rq = rq_of(cfs_rq);
5982	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5983	int dequeue = `1`;
5984
5985	raw_spin_lock(&cfs_b->lock);
5986	/ This will start the period timer if necessary /
5987	if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, `1`)) {
5988	/*
5989	* We have raced with bandwidth becoming available, and if we
5990	* actually throttled the timer might not unthrottle us for an
5991	* entire period. We additionally needed to make sure that any
5992	* subsequent check_cfs_rq_runtime calls agree not to throttle
5993	* us, as we may commit to do cfs put_prev+pick_next, so we ask
5994	* for 1ns of runtime rather than just check cfs_b.
5995	*/
5996	dequeue = `0`;
5997	} else {
5998	list_add_tail_rcu(&cfs_rq->throttled_list,
5999	&cfs_b->throttled_cfs_rq);
6000	}
6001	raw_spin_unlock(&cfs_b->lock);
6002
6003	if (!dequeue)
6004	return false; / Throttle no longer required. /
6005
6006	/ freeze hierarchy runnable averages while throttled /
6007	rcu_read_lock();
6008	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
6009	rcu_read_unlock();
6010
6011	/*
6012	* Note: distribution will already see us throttled via the
6013	* throttled-list. rq->lock protects completion.
6014	*/
6015	cfs_rq->throttled = `1`;
6016	WARN_ON_ONCE(cfs_rq->throttled_clock);
6017	return true;
6018	}
6019
6020	void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
6021	{
6022	struct rq *rq = rq_of(cfs_rq);
6023	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6024	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
6025
6026	/*
6027	* It's possible we are called with !runtime_remaining due to things
6028	* like user changed quota setting(see tg_set_cfs_bandwidth()) or async
6029	* unthrottled us with a positive runtime_remaining but other still
6030	* running entities consumed those runtime before we reached here.
6031	*
6032	* Anyway, we can't unthrottle this cfs_rq without any runtime remaining
6033	* because any enqueue in tg_unthrottle_up() will immediately trigger a
6034	* throttle, which is not supposed to happen on unthrottle path.
6035	*/
6036	if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= `0`)
6037	return;
6038
6039	se = cfs_rq->tg->se[cpu_of(rq)];
6040
6041	cfs_rq->throttled = `0`;
6042
6043	update_rq_clock(rq);
6044
6045	raw_spin_lock(&cfs_b->lock);
6046	if (cfs_rq->throttled_clock) {
6047	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
6048	cfs_rq->throttled_clock = `0`;
6049	}
6050	list_del_rcu(&cfs_rq->throttled_list);
6051	raw_spin_unlock(&cfs_b->lock);
6052
6053	/ update hierarchical throttle state /
6054	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
6055
6056	if (!cfs_rq->load.weight) {
6057	if (!cfs_rq->on_list)
6058	return;
6059	/*
6060	* Nothing to run but something to decay (on_list)?
6061	* Complete the branch.
6062	*/
6063	for_each_sched_entity(se) {
6064	if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
6065	break;
6066	}
6067	}
6068
6069	assert_list_leaf_cfs_rq(rq);
6070
6071	/ Determine whether we need to wake up potentially idle CPU: /
6072	if (rq->curr == rq->idle && rq->cfs.nr_queued)
6073	resched_curr(rq);
6074	}
6075
6076	static void __cfsb_csd_unthrottle(void *arg)
6077	{
6078	struct cfs_rq cursor, tmp;
6079	struct rq *rq = arg;
6080	struct rq_flags rf;
6081
6082	rq_lock(rq, &rf);
6083
6084	/*
6085	* Iterating over the list can trigger several call to
6086	* update_rq_clock() in unthrottle_cfs_rq().
6087	* Do it once and skip the potential next ones.
6088	*/
6089	update_rq_clock(rq);
6090	rq_clock_start_loop_update(rq);
6091
6092	/*
6093	* Since we hold rq lock we're safe from concurrent manipulation of
6094	* the CSD list. However, this RCU critical section annotates the
6095	* fact that we pair with sched_free_group_rcu(), so that we cannot
6096	* race with group being freed in the window between removing it
6097	* from the list and advancing to the next entry in the list.
6098	*/
6099	rcu_read_lock();
6100
6101	list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
6102	throttled_csd_list) {
6103	list_del_init(&cursor->throttled_csd_list);
6104
6105	if (cfs_rq_throttled(cursor))
6106	unthrottle_cfs_rq(cursor);
6107	}
6108
6109	rcu_read_unlock();
6110
6111	rq_clock_stop_loop_update(rq);
6112	rq_unlock(rq, &rf);
6113	}
6114
6115	static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
6116	{
6117	struct rq *rq = rq_of(cfs_rq);
6118	bool first;
6119
6120	if (rq == this_rq()) {
6121	unthrottle_cfs_rq(cfs_rq);
6122	return;
6123	}
6124
6125	/ Already enqueued /
6126	if (WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_csd_list)))
6127	return;
6128
6129	first = list_empty(&rq->cfsb_csd_list);
6130	list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
6131	if (first)
6132	smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
6133	}
6134
6135	static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
6136	{
6137	lockdep_assert_rq_held(rq_of(cfs_rq));
6138
6139	if (WARN_ON_ONCE(!cfs_rq_throttled(cfs_rq) \|\|
6140	cfs_rq->runtime_remaining <= `0`))
6141	return;
6142
6143	__unthrottle_cfs_rq_async(cfs_rq);
6144	}
6145
6146	static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
6147	{
6148	int this_cpu = smp_processor_id();
6149	u64 runtime, remaining = `1`;
6150	bool throttled = false;
6151	struct cfs_rq cfs_rq, tmp;
6152	struct rq_flags rf;
6153	struct rq *rq;
6154	LIST_HEAD(local_unthrottle);
6155
6156	rcu_read_lock();
6157	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
6158	throttled_list) {
6159	rq = rq_of(cfs_rq);
6160
6161	if (!remaining) {
6162	throttled = true;
6163	break;
6164	}
6165
6166	rq_lock_irqsave(rq, &rf);
6167	if (!cfs_rq_throttled(cfs_rq))
6168	goto next;
6169
6170	/ Already queued for async unthrottle /
6171	if (!list_empty(&cfs_rq->throttled_csd_list))
6172	goto next;
6173
6174	/ By the above checks, this should never be true /
6175	WARN_ON_ONCE(cfs_rq->runtime_remaining > `0`);
6176
6177	raw_spin_lock(&cfs_b->lock);
6178	runtime = -cfs_rq->runtime_remaining + `1`;
6179	if (runtime > cfs_b->runtime)
6180	runtime = cfs_b->runtime;
6181	cfs_b->runtime -= runtime;
6182	remaining = cfs_b->runtime;
6183	raw_spin_unlock(&cfs_b->lock);
6184
6185	cfs_rq->runtime_remaining += runtime;
6186
6187	/ we check whether we're throttled above /
6188	if (cfs_rq->runtime_remaining > `0`) {
6189	if (cpu_of(rq) != this_cpu) {
6190	unthrottle_cfs_rq_async(cfs_rq);
6191	} else {
6192	/*
6193	* We currently only expect to be unthrottling
6194	* a single cfs_rq locally.
6195	*/
6196	WARN_ON_ONCE(!list_empty(&local_unthrottle));
6197	list_add_tail(&cfs_rq->throttled_csd_list,
6198	&local_unthrottle);
6199	}
6200	} else {
6201	throttled = true;
6202	}
6203
6204	next:
6205	rq_unlock_irqrestore(rq, &rf);
6206	}
6207
6208	list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle,
6209	throttled_csd_list) {
6210	struct rq *rq = rq_of(cfs_rq);
6211
6212	rq_lock_irqsave(rq, &rf);
6213
6214	list_del_init(&cfs_rq->throttled_csd_list);
6215
6216	if (cfs_rq_throttled(cfs_rq))
6217	unthrottle_cfs_rq(cfs_rq);
6218
6219	rq_unlock_irqrestore(rq, &rf);
6220	}
6221	WARN_ON_ONCE(!list_empty(&local_unthrottle));
6222
6223	rcu_read_unlock();
6224
6225	return throttled;
6226	}
6227
6228	/*
6229	* Responsible for refilling a task_group's bandwidth and unthrottling its
6230	* cfs_rqs as appropriate. If there has been no activity within the last
6231	* period the timer is deactivated until scheduling resumes; cfs_b->idle is
6232	* used to track this state.
6233	*/
6234	static int do_sched_cfs_period_timer(struct cfs_bandwidth cfs_b, int* overrun, unsigned long flags)
6235	{
6236	int throttled;
6237
6238	/ no need to continue the timer with no bandwidth constraint /
6239	if (cfs_b->quota == RUNTIME_INF)
6240	goto out_deactivate;
6241
6242	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
6243	cfs_b->nr_periods += overrun;
6244
6245	/ Refill extra burst quota even if cfs_b->idle /
6246	__refill_cfs_bandwidth_runtime(cfs_b);
6247
6248	/*
6249	* idle depends on !throttled (for the case of a large deficit), and if
6250	* we're going inactive then everything else can be deferred
6251	*/
6252	if (cfs_b->idle && !throttled)
6253	goto out_deactivate;
6254
6255	if (!throttled) {
6256	/ mark as potentially idle for the upcoming period /
6257	cfs_b->idle = `1`;
6258	return `0`;
6259	}
6260
6261	/ account preceding periods in which throttling occurred /
6262	cfs_b->nr_throttled += overrun;
6263
6264	/*
6265	* This check is repeated as we release cfs_b->lock while we unthrottle.
6266	*/
6267	while (throttled && cfs_b->runtime > `0`) {
6268	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6269	/ we can't nest cfs_b->lock while distributing bandwidth /
6270	throttled = distribute_cfs_runtime(cfs_b);
6271	raw_spin_lock_irqsave(&cfs_b->lock, flags);
6272	}
6273
6274	/*
6275	* While we are ensured activity in the period following an
6276	* unthrottle, this also covers the case in which the new bandwidth is
6277	* insufficient to cover the existing bandwidth deficit. (Forcing the
6278	* timer to remain active while there are any throttled entities.)
6279	*/
6280	cfs_b->idle = `0`;
6281
6282	return `0`;
6283
6284	out_deactivate:
6285	return `1`;
6286	}
6287
6288	/ a cfs_rq won't donate quota below this amount /
6289	static const u64 min_cfs_rq_runtime = `1` * NSEC_PER_MSEC;
6290	/ minimum remaining period time to redistribute slack quota /
6291	static const u64 min_bandwidth_expiration = `2` * NSEC_PER_MSEC;
6292	/ how long we wait to gather additional slack before distributing /
6293	static const u64 cfs_bandwidth_slack_period = `5` * NSEC_PER_MSEC;
6294
6295	/*
6296	* Are we near the end of the current quota period?
6297	*
6298	* Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
6299	* hrtimer base being cleared by hrtimer_start. In the case of
6300	* migrate_hrtimers, base is never cleared, so we are fine.
6301	*/
6302	static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
6303	{
6304	struct hrtimer *refresh_timer = &cfs_b->period_timer;
6305	s64 remaining;
6306
6307	/ if the call-back is running a quota refresh is already occurring /
6308	if (hrtimer_callback_running(refresh_timer))
6309	return `1`;
6310
6311	/ is a quota refresh about to occur? /
6312	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
6313	if (remaining < (s64)min_expire)
6314	return `1`;
6315
6316	return `0`;
6317	}
6318
6319	static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
6320	{
6321	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
6322
6323	/ if there's a quota refresh soon don't bother with slack /
6324	if (runtime_refresh_within(cfs_b, min_left))
6325	return;
6326
6327	/ don't push forwards an existing deferred unthrottle /
6328	if (cfs_b->slack_started)
6329	return;
6330	cfs_b->slack_started = true;
6331
6332	hrtimer_start(&cfs_b->slack_timer,
6333	ns_to_ktime(cfs_bandwidth_slack_period),
6334	HRTIMER_MODE_REL);
6335	}
6336
6337	/ we know any runtime found here is valid as update_curr() precedes return /
6338	static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6339	{
6340	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6341	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
6342
6343	if (slack_runtime <= `0`)
6344	return;
6345
6346	raw_spin_lock(&cfs_b->lock);
6347	if (cfs_b->quota != RUNTIME_INF) {
6348	cfs_b->runtime += slack_runtime;
6349
6350	/ we are under rq->lock, defer unthrottling using a timer /
6351	if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
6352	!list_empty(&cfs_b->throttled_cfs_rq))
6353	start_cfs_slack_bandwidth(cfs_b);
6354	}
6355	raw_spin_unlock(&cfs_b->lock);
6356
6357	/ even if it's not valid for return we don't want to try again /
6358	cfs_rq->runtime_remaining -= slack_runtime;
6359	}
6360
6361	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6362	{
6363	if (!cfs_bandwidth_used())
6364	return;
6365
6366	if (!cfs_rq->runtime_enabled \|\| cfs_rq->nr_queued)
6367	return;
6368
6369	__return_cfs_rq_runtime(cfs_rq);
6370	}
6371
6372	/*
6373	* This is done with a timer (instead of inline with bandwidth return) since
6374	* it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
6375	*/
6376	static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
6377	{
6378	u64 runtime = `0`, slice = sched_cfs_bandwidth_slice();
6379	unsigned long flags;
6380
6381	/ confirm we're still not at a refresh boundary /
6382	raw_spin_lock_irqsave(&cfs_b->lock, flags);
6383	cfs_b->slack_started = false;
6384
6385	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
6386	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6387	return;
6388	}
6389
6390	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
6391	runtime = cfs_b->runtime;
6392
6393	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6394
6395	if (!runtime)
6396	return;
6397
6398	distribute_cfs_runtime(cfs_b);
6399	}
6400
6401	/*
6402	* When a group wakes up we want to make sure that its quota is not already
6403	* expired/exceeded, otherwise it may be allowed to steal additional ticks of
6404	* runtime as update_curr() throttling can not trigger until it's on-rq.
6405	*/
6406	static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
6407	{
6408	if (!cfs_bandwidth_used())
6409	return;
6410
6411	/ an active group must be handled by the update_curr()->put() path /
6412	if (!cfs_rq->runtime_enabled \|\| cfs_rq->curr)
6413	return;
6414
6415	/ ensure the group is not already throttled /
6416	if (cfs_rq_throttled(cfs_rq))
6417	return;
6418
6419	/ update runtime allocation /
6420	account_cfs_rq_runtime(cfs_rq, `0`);
6421	if (cfs_rq->runtime_remaining <= `0`)
6422	throttle_cfs_rq(cfs_rq);
6423	}
6424
6425	static void sync_throttle(struct task_group tg, int* cpu)
6426	{
6427	struct cfs_rq pcfs_rq, cfs_rq;
6428
6429	if (!cfs_bandwidth_used())
6430	return;
6431
6432	if (!tg->parent)
6433	return;
6434
6435	cfs_rq = tg->cfs_rq[cpu];
6436	pcfs_rq = tg->parent->cfs_rq[cpu];
6437
6438	cfs_rq->throttle_count = pcfs_rq->throttle_count;
6439	cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
6440	}
6441
6442	/ conditionally throttle active cfs_rq's from put_prev_entity() /
6443	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6444	{
6445	if (!cfs_bandwidth_used())
6446	return false;
6447
6448	if (likely(!cfs_rq->runtime_enabled \|\| cfs_rq->runtime_remaining > `0`))
6449	return false;
6450
6451	/*
6452	* it's possible for a throttled entity to be forced into a running
6453	* state (e.g. set_curr_task), in this case we're finished.
6454	*/
6455	if (cfs_rq_throttled(cfs_rq))
6456	return true;
6457
6458	return throttle_cfs_rq(cfs_rq);
6459	}
6460
6461	static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
6462	{
6463	struct cfs_bandwidth *cfs_b =
6464	container_of(timer, struct cfs_bandwidth, slack_timer);
6465
6466	do_sched_cfs_slack_timer(cfs_b);
6467
6468	return HRTIMER_NORESTART;
6469	}
6470
6471	static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
6472	{
6473	struct cfs_bandwidth *cfs_b =
6474	container_of(timer, struct cfs_bandwidth, period_timer);
6475	unsigned long flags;
6476	int overrun;
6477	int idle = `0`;
6478	int count = `0`;
6479
6480	raw_spin_lock_irqsave(&cfs_b->lock, flags);
6481	for (;;) {
6482	overrun = hrtimer_forward_now(timer, cfs_b->period);
6483	if (!overrun)
6484	break;
6485
6486	idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
6487
6488	if (++count > `3`) {
6489	u64 new, old = ktime_to_ns(cfs_b->period);
6490
6491	/*
6492	* Grow period by a factor of 2 to avoid losing precision.
6493	* Precision loss in the quota/period ratio can cause __cfs_schedulable
6494	* to fail.
6495	*/
6496	new = old * `2`;
6497	if (new < max_bw_quota_period_us * NSEC_PER_USEC) {
6498	cfs_b->period = ns_to_ktime(new);
6499	cfs_b->quota *= `2`;
6500	cfs_b->burst *= `2`;
6501
6502	pr_warn_ratelimited(
6503	"cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
6504	smp_processor_id(),
6505	div_u64(new, NSEC_PER_USEC),
6506	div_u64(cfs_b->quota, NSEC_PER_USEC));
6507	} else {
6508	pr_warn_ratelimited(
6509	"cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
6510	smp_processor_id(),
6511	div_u64(old, NSEC_PER_USEC),
6512	div_u64(cfs_b->quota, NSEC_PER_USEC));
6513	}
6514
6515	/ reset count so we don't come right back in here /
6516	count = `0`;
6517	}
6518	}
6519	if (idle)
6520	cfs_b->period_active = `0`;
6521	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6522
6523	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
6524	}
6525
6526	void init_cfs_bandwidth(struct cfs_bandwidth cfs_b, struct* cfs_bandwidth *parent)
6527	{
6528	raw_spin_lock_init(&cfs_b->lock);
6529	cfs_b->runtime = `0`;
6530	cfs_b->quota = RUNTIME_INF;
6531	cfs_b->period = us_to_ktime(default_bw_period_us());
6532	cfs_b->burst = `0`;
6533	cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
6534
6535	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
6536	hrtimer_setup(&cfs_b->period_timer, sched_cfs_period_timer, CLOCK_MONOTONIC,
6537	HRTIMER_MODE_ABS_PINNED);
6538
6539	/ Add a random offset so that timers interleave /
6540	hrtimer_set_expires(&cfs_b->period_timer,
6541	get_random_u32_below(cfs_b->period));
6542	hrtimer_setup(&cfs_b->slack_timer, sched_cfs_slack_timer, CLOCK_MONOTONIC,
6543	HRTIMER_MODE_REL);
6544	cfs_b->slack_started = false;
6545	}
6546
6547	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6548	{
6549	cfs_rq->runtime_enabled = `0`;
6550	INIT_LIST_HEAD(&cfs_rq->throttled_list);
6551	INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
6552	INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list);
6553	}
6554
6555	void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
6556	{
6557	lockdep_assert_held(&cfs_b->lock);
6558
6559	if (cfs_b->period_active)
6560	return;
6561
6562	cfs_b->period_active = `1`;
6563	hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
6564	hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
6565	}
6566
6567	static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
6568	{
6569	int __maybe_unused i;
6570
6571	/ init_cfs_bandwidth() was not called /
6572	if (!cfs_b->throttled_cfs_rq.next)
6573	return;
6574
6575	hrtimer_cancel(&cfs_b->period_timer);
6576	hrtimer_cancel(&cfs_b->slack_timer);
6577
6578	/*
6579	* It is possible that we still have some cfs_rq's pending on a CSD
6580	* list, though this race is very rare. In order for this to occur, we
6581	* must have raced with the last task leaving the group while there
6582	* exist throttled cfs_rq(s), and the period_timer must have queued the
6583	* CSD item but the remote cpu has not yet processed it. To handle this,
6584	* we can simply flush all pending CSD work inline here. We're
6585	* guaranteed at this point that no additional cfs_rq of this group can
6586	* join a CSD list.
6587	*/
6588	for_each_possible_cpu(i) {
6589	struct rq *rq = cpu_rq(i);
6590	unsigned long flags;
6591
6592	if (list_empty(&rq->cfsb_csd_list))
6593	continue;
6594
6595	local_irq_save(flags);
6596	__cfsb_csd_unthrottle(rq);
6597	local_irq_restore(flags);
6598	}
6599	}
6600
6601	/*
6602	* Both these CPU hotplug callbacks race against unregister_fair_sched_group()
6603	*
6604	* The race is harmless, since modifying bandwidth settings of unhooked group
6605	* bits doesn't do much.
6606	*/
6607
6608	/ cpu online callback /
6609	static void __maybe_unused update_runtime_enabled(struct rq *rq)
6610	{
6611	struct task_group *tg;
6612
6613	lockdep_assert_rq_held(rq);
6614
6615	rcu_read_lock();
6616	list_for_each_entry_rcu(tg, &task_groups, list) {
6617	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6618	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
6619
6620	raw_spin_lock(&cfs_b->lock);
6621	cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
6622	raw_spin_unlock(&cfs_b->lock);
6623	}
6624	rcu_read_unlock();
6625	}
6626
6627	/ cpu offline callback /
6628	static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
6629	{
6630	struct task_group *tg;
6631
6632	lockdep_assert_rq_held(rq);
6633
6634	// Do not unthrottle for an active CPU
6635	if (cpumask_test_cpu(cpu_of(rq), cpu_active_mask))
6636	return;
6637
6638	/*
6639	* The rq clock has already been updated in the
6640	* set_rq_offline(), so we should skip updating
6641	* the rq clock again in unthrottle_cfs_rq().
6642	*/
6643	rq_clock_start_loop_update(rq);
6644
6645	rcu_read_lock();
6646	list_for_each_entry_rcu(tg, &task_groups, list) {
6647	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
6648
6649	if (!cfs_rq->runtime_enabled)
6650	continue;
6651
6652	/*
6653	* Offline rq is schedulable till CPU is completely disabled
6654	* in take_cpu_down(), so we prevent new cfs throttling here.
6655	*/
6656	cfs_rq->runtime_enabled = `0`;
6657
6658	if (!cfs_rq_throttled(cfs_rq))
6659	continue;
6660
6661	/*
6662	* clock_task is not advancing so we just need to make sure
6663	* there's some valid quota amount
6664	*/
6665	cfs_rq->runtime_remaining = `1`;
6666	unthrottle_cfs_rq(cfs_rq);
6667	}
6668	rcu_read_unlock();
6669
6670	rq_clock_stop_loop_update(rq);
6671	}
6672
6673	bool cfs_task_bw_constrained(struct task_struct *p)
6674	{
6675	struct cfs_rq *cfs_rq = task_cfs_rq(p);
6676
6677	if (!cfs_bandwidth_used())
6678	return false;
6679
6680	if (cfs_rq->runtime_enabled \|\|
6681	tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF)
6682	return true;
6683
6684	return false;
6685	}
6686
6687	#ifdef CONFIG_NO_HZ_FULL
6688	/ called from pick_next_task_fair() /
6689	static void sched_fair_update_stop_tick(struct rq rq, struct* task_struct *p)
6690	{
6691	int cpu = cpu_of(rq);
6692
6693	if (!cfs_bandwidth_used())
6694	return;
6695
6696	if (!tick_nohz_full_cpu(cpu))
6697	return;
6698
6699	if (rq->nr_running != `1`)
6700	return;
6701
6702	/*
6703	* We know there is only one task runnable and we've just picked it. The
6704	* normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we will
6705	* be otherwise able to stop the tick. Just need to check if we are using
6706	* bandwidth control.
6707	*/
6708	if (cfs_task_bw_constrained(p))
6709	tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
6710	}
6711	#endif /* CONFIG_NO_HZ_FULL */
6712
6713	#else /* !CONFIG_CFS_BANDWIDTH: */
6714
6715	static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
6716	static bool check_cfs_rq_runtime(struct cfs_rq cfs_rq) { return* false; }
6717	static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
6718	static inline void sync_throttle(struct task_group tg, int* cpu) {}
6719	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
6720	static void task_throttle_setup_work(struct task_struct *p) {}
6721	static bool task_is_throttled(struct task_struct p) { return* false; }
6722	static void dequeue_throttled_task(struct task_struct p, int* flags) {}
6723	static bool enqueue_throttled_task(struct task_struct p) { return* false; }
6724	static void record_throttle_clock(struct cfs_rq *cfs_rq) {}
6725
6726	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
6727	{
6728	return `0`;
6729	}
6730
6731	static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
6732	{
6733	return false;
6734	}
6735
6736	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
6737	{
6738	return `0`;
6739	}
6740
6741	static inline int lb_throttled_hierarchy(struct task_struct p, int* dst_cpu)
6742	{
6743	return `0`;
6744	}
6745
6746	#ifdef CONFIG_FAIR_GROUP_SCHED
6747	void init_cfs_bandwidth(struct cfs_bandwidth cfs_b, struct* cfs_bandwidth *parent) {}
6748	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
6749	#endif
6750
6751	static inline struct cfs_bandwidth tg_cfs_bandwidth(struct* task_group *tg)
6752	{
6753	return NULL;
6754	}
6755	static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
6756	static inline void update_runtime_enabled(struct rq *rq) {}
6757	static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6758	#ifdef CONFIG_CGROUP_SCHED
6759	bool cfs_task_bw_constrained(struct task_struct *p)
6760	{
6761	return false;
6762	}
6763	#endif
6764	#endif /* !CONFIG_CFS_BANDWIDTH */
6765
/* Stub for builds lacking either CONFIG_CFS_BANDWIDTH or CONFIG_NO_HZ_FULL. */
#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
#endif
6769
6770	/**************************************************
6771	* CFS operations on tasks:
6772	*/
6773
6774	#ifdef CONFIG_SCHED_HRTICK
6775	static void hrtick_start_fair(struct rq rq, struct* task_struct *p)
6776	{
6777	struct sched_entity *se = &p->se;
6778
6779	WARN_ON_ONCE(task_rq(p) != rq);
6780
6781	if (rq->cfs.h_nr_queued > `1`) {
6782	u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
6783	u64 slice = se->slice;
6784	s64 delta = slice - ran;
6785
6786	if (delta < `0`) {
6787	if (task_current_donor(rq, p))
6788	resched_curr(rq);
6789	return;
6790	}
6791	hrtick_start(rq, delay: delta);
6792	}
6793	}
6794
6795	/*
6796	* called from enqueue/dequeue and updates the hrtick when the
6797	* current task is from our class and nr_running is low enough
6798	* to matter.
6799	*/
6800	static void hrtick_update(struct rq *rq)
6801	{
6802	struct task_struct *donor = rq->donor;
6803
6804	if (!hrtick_enabled_fair(rq) \|\| donor->sched_class != &fair_sched_class)
6805	return;
6806
6807	hrtick_start_fair(rq, p: donor);
6808	}
6809	#else /* !CONFIG_SCHED_HRTICK: */
/* No-op variants used when CONFIG_SCHED_HRTICK is disabled. */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}

static inline void hrtick_update(struct rq *rq)
{
}
6818	#endif /* !CONFIG_SCHED_HRTICK */
6819
6820	static inline bool cpu_overutilized(int cpu)
6821	{
6822	unsigned long rq_util_min, rq_util_max;
6823
6824	if (!sched_energy_enabled())
6825	return false;
6826
6827	rq_util_min = uclamp_rq_get(cpu_rq(cpu), clamp_id: UCLAMP_MIN);
6828	rq_util_max = uclamp_rq_get(cpu_rq(cpu), clamp_id: UCLAMP_MAX);
6829
6830	/ Return true only if the utilization doesn't fit CPU's capacity /
6831	return !util_fits_cpu(util: cpu_util_cfs(cpu), uclamp_min: rq_util_min, uclamp_max: rq_util_max, cpu);
6832	}
6833
6834	/*
6835	* overutilized value make sense only if EAS is enabled
6836	*/
6837	static inline bool is_rd_overutilized(struct root_domain *rd)
6838	{
6839	return !sched_energy_enabled() \|\| READ_ONCE(rd->overutilized);
6840	}
6841
6842	static inline void set_rd_overutilized(struct root_domain *rd, bool flag)
6843	{
6844	if (!sched_energy_enabled())
6845	return;
6846
6847	WRITE_ONCE(rd->overutilized, flag);
6848	trace_sched_overutilized_tp(rd, overutilized: flag);
6849	}
6850
6851	static inline void check_update_overutilized_status(struct rq *rq)
6852	{
6853	/*
6854	* overutilized field is used for load balancing decisions only
6855	* if energy aware scheduler is being used
6856	*/
6857
6858	if (!is_rd_overutilized(rd: rq->rd) && cpu_overutilized(cpu: rq->cpu))
6859	set_rd_overutilized(rd: rq->rd, flag: `1`);
6860	}
6861
6862	/ Runqueue only has SCHED_IDLE tasks enqueued /
6863	static int sched_idle_rq(struct rq *rq)
6864	{
6865	return unlikely(rq->nr_running == rq->cfs.h_nr_idle &&
6866	rq->nr_running);
6867	}
6868
/* Per-CPU wrapper: does @cpu's runqueue hold nothing but SCHED_IDLE tasks? */
static int sched_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	return sched_idle_rq(rq);
}
6873
6874	static void
6875	requeue_delayed_entity(struct sched_entity *se)
6876	{
6877	struct cfs_rq *cfs_rq = cfs_rq_of(se);
6878
6879	/*
6880	* se->sched_delayed should imply: se->on_rq == 1.
6881	* Because a delayed entity is one that is still on
6882	* the runqueue competing until elegibility.
6883	*/
6884	WARN_ON_ONCE(!se->sched_delayed);
6885	WARN_ON_ONCE(!se->on_rq);
6886
6887	if (sched_feat(DELAY_ZERO)) {
6888	update_entity_lag(cfs_rq, se);
6889	if (se->vlag > `0`) {
6890	cfs_rq->nr_queued--;
6891	if (se != cfs_rq->curr)
6892	__dequeue_entity(cfs_rq, se);
6893	se->vlag = `0`;
6894	place_entity(cfs_rq, se, flags: `0`);
6895	if (se != cfs_rq->curr)
6896	__enqueue_entity(cfs_rq, se);
6897	cfs_rq->nr_queued++;
6898	}
6899	}
6900
6901	update_load_avg(cfs_rq, se, flags: `0`);
6902	clear_delayed(se);
6903	}
6904
6905	/*
6906	* The enqueue_task method is called before nr_running is
6907	* increased. Here we update the fair scheduling stats and
6908	* then put the task into the rbtree:
6909	*/
6910	static void
6911	enqueue_task_fair(struct rq rq, struct* task_struct p, int* flags)
6912	{
6913	struct cfs_rq *cfs_rq;
6914	struct sched_entity *se = &p->se;
6915	int h_nr_idle = task_has_idle_policy(p);
6916	int h_nr_runnable = `1`;
6917	int task_new = !(flags & ENQUEUE_WAKEUP);
6918	int rq_h_nr_queued = rq->cfs.h_nr_queued;
6919	u64 slice = `0`;
6920
6921	if (task_is_throttled(p) && enqueue_throttled_task(p))
6922	return;
6923
6924	/*
6925	* The code below (indirectly) updates schedutil which looks at
6926	* the cfs_rq utilization to select a frequency.
6927	* Let's add the task's estimated utilization to the cfs_rq's
6928	* estimated utilization, before we update schedutil.
6929	*/
6930	if (!p->se.sched_delayed \|\| (flags & ENQUEUE_DELAYED))
6931	util_est_enqueue(cfs_rq: &rq->cfs, p);
6932
6933	if (flags & ENQUEUE_DELAYED) {
6934	requeue_delayed_entity(se);
6935	return;
6936	}
6937
6938	/*
6939	* If in_iowait is set, the code below may not trigger any cpufreq
6940	* utilization updates, so do it here explicitly with the IOWAIT flag
6941	* passed.
6942	*/
6943	if (p->in_iowait)
6944	cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
6945
6946	if (task_new && se->sched_delayed)
6947	h_nr_runnable = `0`;
6948
6949	for_each_sched_entity(se) {
6950	if (se->on_rq) {
6951	if (se->sched_delayed)
6952	requeue_delayed_entity(se);
6953	break;
6954	}
6955	cfs_rq = cfs_rq_of(se);
6956
6957	/*
6958	* Basically set the slice of group entries to the min_slice of
6959	* their respective cfs_rq. This ensures the group can service
6960	* its entities in the desired time-frame.
6961	*/
6962	if (slice) {
6963	se->slice = slice;
6964	se->custom_slice = `1`;
6965	}
6966	enqueue_entity(cfs_rq, se, flags);
6967	slice = cfs_rq_min_slice(cfs_rq);
6968
6969	cfs_rq->h_nr_runnable += h_nr_runnable;
6970	cfs_rq->h_nr_queued++;
6971	cfs_rq->h_nr_idle += h_nr_idle;
6972
6973	if (cfs_rq_is_idle(cfs_rq))
6974	h_nr_idle = `1`;
6975
6976	flags = ENQUEUE_WAKEUP;
6977	}
6978
6979	for_each_sched_entity(se) {
6980	cfs_rq = cfs_rq_of(se);
6981
6982	update_load_avg(cfs_rq, se, UPDATE_TG);
6983	se_update_runnable(se);
6984	update_cfs_group(se);
6985
6986	se->slice = slice;
6987	if (se != cfs_rq->curr)
6988	min_vruntime_cb_propagate(rb: &se->run_node, NULL);
6989	slice = cfs_rq_min_slice(cfs_rq);
6990
6991	cfs_rq->h_nr_runnable += h_nr_runnable;
6992	cfs_rq->h_nr_queued++;
6993	cfs_rq->h_nr_idle += h_nr_idle;
6994
6995	if (cfs_rq_is_idle(cfs_rq))
6996	h_nr_idle = `1`;
6997	}
6998
6999	if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
7000	/ Account for idle runtime /
7001	if (!rq->nr_running)
7002	dl_server_update_idle_time(rq, p: rq->curr);
7003	dl_server_start(dl_se: &rq->fair_server);
7004	}
7005
7006	/ At this point se is NULL and we are at root level/
7007	add_nr_running(rq, count: `1`);
7008
7009	/*
7010	* Since new tasks are assigned an initial util_avg equal to
7011	* half of the spare capacity of their CPU, tiny tasks have the
7012	* ability to cross the overutilized threshold, which will
7013	* result in the load balancer ruining all the task placement
7014	* done by EAS. As a way to mitigate that effect, do not account
7015	* for the first enqueue operation of new tasks during the
7016	* overutilized flag detection.
7017	*
7018	* A better way of solving this problem would be to wait for
7019	* the PELT signals of tasks to converge before taking them
7020	* into account, but that is not straightforward to implement,
7021	* and the following generally works well enough in practice.
7022	*/
7023	if (!task_new)
7024	check_update_overutilized_status(rq);
7025
7026	assert_list_leaf_cfs_rq(rq);
7027
7028	hrtick_update(rq);
7029	}
7030
7031	static void set_next_buddy(struct sched_entity *se);
7032
7033	/*
7034	* Basically dequeue_task_fair(), except it can deal with dequeue_entity()
7035	* failing half-way through and resume the dequeue later.
7036	*
7037	* Returns:
7038	* -1 - dequeue delayed
7039	* 0 - dequeue throttled
7040	* 1 - dequeue complete
7041	*/
7042	static int dequeue_entities(struct rq rq, struct* sched_entity se, int* flags)
7043	{
7044	bool was_sched_idle = sched_idle_rq(rq);
7045	bool task_sleep = flags & DEQUEUE_SLEEP;
7046	bool task_delayed = flags & DEQUEUE_DELAYED;
7047	bool task_throttled = flags & DEQUEUE_THROTTLE;
7048	struct task_struct *p = NULL;
7049	int h_nr_idle = `0`;
7050	int h_nr_queued = `0`;
7051	int h_nr_runnable = `0`;
7052	struct cfs_rq *cfs_rq;
7053	u64 slice = `0`;
7054
7055	if (entity_is_task(se)) {
7056	p = task_of(se);
7057	h_nr_queued = `1`;
7058	h_nr_idle = task_has_idle_policy(p);
7059	if (task_sleep \|\| task_delayed \|\| !se->sched_delayed)
7060	h_nr_runnable = `1`;
7061	}
7062
7063	for_each_sched_entity(se) {
7064	cfs_rq = cfs_rq_of(se);
7065
7066	if (!dequeue_entity(cfs_rq, se, flags)) {
7067	if (p && &p->se == se)
7068	return -`1`;
7069
7070	slice = cfs_rq_min_slice(cfs_rq);
7071	break;
7072	}
7073
7074	cfs_rq->h_nr_runnable -= h_nr_runnable;
7075	cfs_rq->h_nr_queued -= h_nr_queued;
7076	cfs_rq->h_nr_idle -= h_nr_idle;
7077
7078	if (cfs_rq_is_idle(cfs_rq))
7079	h_nr_idle = h_nr_queued;
7080
7081	if (throttled_hierarchy(cfs_rq) && task_throttled)
7082	record_throttle_clock(cfs_rq);
7083
7084	/ Don't dequeue parent if it has other entities besides us /
7085	if (cfs_rq->load.weight) {
7086	slice = cfs_rq_min_slice(cfs_rq);
7087
7088	/ Avoid re-evaluating load for this entity: /
7089	se = parent_entity(se);
7090	/*
7091	* Bias pick_next to pick a task from this cfs_rq, as
7092	* p is sleeping when it is within its sched_slice.
7093	*/
7094	if (task_sleep && se)
7095	set_next_buddy(se);
7096	break;
7097	}
7098	flags \|= DEQUEUE_SLEEP;
7099	flags &= ~(DEQUEUE_DELAYED \| DEQUEUE_SPECIAL);
7100	}
7101
7102	for_each_sched_entity(se) {
7103	cfs_rq = cfs_rq_of(se);
7104
7105	update_load_avg(cfs_rq, se, UPDATE_TG);
7106	se_update_runnable(se);
7107	update_cfs_group(se);
7108
7109	se->slice = slice;
7110	if (se != cfs_rq->curr)
7111	min_vruntime_cb_propagate(rb: &se->run_node, NULL);
7112	slice = cfs_rq_min_slice(cfs_rq);
7113
7114	cfs_rq->h_nr_runnable -= h_nr_runnable;
7115	cfs_rq->h_nr_queued -= h_nr_queued;
7116	cfs_rq->h_nr_idle -= h_nr_idle;
7117
7118	if (cfs_rq_is_idle(cfs_rq))
7119	h_nr_idle = h_nr_queued;
7120
7121	if (throttled_hierarchy(cfs_rq) && task_throttled)
7122	record_throttle_clock(cfs_rq);
7123	}
7124
7125	sub_nr_running(rq, count: h_nr_queued);
7126
7127	/ balance early to pull high priority tasks /
7128	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
7129	rq->next_balance = jiffies;
7130
7131	if (p && task_delayed) {
7132	WARN_ON_ONCE(!task_sleep);
7133	WARN_ON_ONCE(p->on_rq != `1`);
7134
7135	/ Fix-up what dequeue_task_fair() skipped /
7136	hrtick_update(rq);
7137
7138	/*
7139	* Fix-up what block_task() skipped.
7140	*
7141	* Must be last, @p might not be valid after this.
7142	*/
7143	__block_task(rq, p);
7144	}
7145
7146	return `1`;
7147	}
7148
7149	/*
7150	* The dequeue_task method is called before nr_running is
7151	* decreased. We remove the task from the rbtree and
7152	* update the fair scheduling stats:
7153	*/
7154	static bool dequeue_task_fair(struct rq rq, struct* task_struct p, int* flags)
7155	{
7156	if (task_is_throttled(p)) {
7157	dequeue_throttled_task(p, flags);
7158	return true;
7159	}
7160
7161	if (!p->se.sched_delayed)
7162	util_est_dequeue(cfs_rq: &rq->cfs, p);
7163
7164	util_est_update(cfs_rq: &rq->cfs, p, task_sleep: flags & DEQUEUE_SLEEP);
7165	if (dequeue_entities(rq, se: &p->se, flags) < `0`)
7166	return false;
7167
7168	/*
7169	* Must not reference @p after dequeue_entities(DEQUEUE_DELAYED).
7170	*/
7171
7172	hrtick_update(rq);
7173	return true;
7174	}
7175
7176	static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
7177	{
7178	return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable);
7179	}
7180
/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
7185
7186	#ifdef CONFIG_NO_HZ_COMMON
7187
7188	static struct {
7189	cpumask_var_t idle_cpus_mask;
7190	atomic_t nr_cpus;
7191	int has_blocked; / Idle CPUS has blocked load /
7192	int needs_update; / Newly idle CPUs need their next_balance collated /
7193	unsigned long next_balance; / in jiffy units /
7194	unsigned long next_blocked; / Next update of blocked load in jiffies /
7195	} nohz ____cacheline_aligned;
7196
7197	#endif /* CONFIG_NO_HZ_COMMON */
7198
7199	static unsigned long cpu_load(struct rq *rq)
7200	{
7201	return cfs_rq_load_avg(cfs_rq: &rq->cfs);
7202	}
7203
7204	/*
7205	* cpu_load_without - compute CPU load without any contributions from *p
7206	* @cpu: the CPU which load is requested
7207	* @p: the task which load should be discounted
7208	*
7209	* The load of a CPU is defined by the load of tasks currently enqueued on that
7210	* CPU as well as tasks which are currently sleeping after an execution on that
7211	* CPU.
7212	*
7213	* This method returns the load of the specified CPU by discounting the load of
7214	* the specified task, whenever the task is currently contributing to the CPU
7215	* load.
7216	*/
7217	static unsigned long cpu_load_without(struct rq rq, struct* task_struct *p)
7218	{
7219	struct cfs_rq *cfs_rq;
7220	unsigned int load;
7221
7222	/ Task has no contribution or is new /
7223	if (cpu_of(rq) != task_cpu(p) \|\| !READ_ONCE(p->se.avg.last_update_time))
7224	return cpu_load(rq);
7225
7226	cfs_rq = &rq->cfs;
7227	load = READ_ONCE(cfs_rq->avg.load_avg);
7228
7229	/ Discount task's util from CPU's util /
7230	lsub_positive(&load, task_h_load(p));
7231
7232	return load;
7233	}
7234
7235	static unsigned long cpu_runnable(struct rq *rq)
7236	{
7237	return cfs_rq_runnable_avg(cfs_rq: &rq->cfs);
7238	}
7239
7240	static unsigned long cpu_runnable_without(struct rq rq, struct* task_struct *p)
7241	{
7242	struct cfs_rq *cfs_rq;
7243	unsigned int runnable;
7244
7245	/ Task has no contribution or is new /
7246	if (cpu_of(rq) != task_cpu(p) \|\| !READ_ONCE(p->se.avg.last_update_time))
7247	return cpu_runnable(rq);
7248
7249	cfs_rq = &rq->cfs;
7250	runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
7251
7252	/ Discount task's runnable from CPU's runnable /
7253	lsub_positive(&runnable, p->se.avg.runnable_avg);
7254
7255	return runnable;
7256	}
7257
7258	static unsigned long capacity_of(int cpu)
7259	{
7260	return cpu_rq(cpu)->cpu_capacity;
7261	}
7262
7263	static void record_wakee(struct task_struct *p)
7264	{
7265	/*
7266	* Only decay a single time; tasks that have less then 1 wakeup per
7267	* jiffy will not have built up many flips.
7268	*/
7269	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
7270	current->wakee_flips >>= `1`;
7271	current->wakee_flip_decay_ts = jiffies;
7272	}
7273
7274	if (current->last_wakee != p) {
7275	current->last_wakee = p;
7276	current->wakee_flips++;
7277	}
7278	}
7279
7280	/*
7281	* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
7282	*
7283	* A waker of many should wake a different task than the one last awakened
7284	* at a frequency roughly N times higher than one of its wakees.
7285	*
7286	* In order to determine whether we should let the load spread vs consolidating
7287	* to shared cache, we look for a minimum 'flip' frequency of llc_size in one
7288	* partner, and a factor of lls_size higher frequency in the other.
7289	*
7290	* With both conditions met, we can be relatively sure that the relationship is
7291	* non-monogamous, with partner count exceeding socket size.
7292	*
7293	* Waker/wakee being client/server, worker/dispatcher, interrupt source or
7294	* whatever is irrelevant, spread criteria is apparent partner count exceeds
7295	* socket size.
7296	*/
7297	static int wake_wide(struct task_struct *p)
7298	{
7299	unsigned int master = current->wakee_flips;
7300	unsigned int slave = p->wakee_flips;
7301	int factor = __this_cpu_read(sd_llc_size);
7302
7303	if (master < slave)
7304	swap(master, slave);
7305	if (slave < factor \|\| master < slave * factor)
7306	return `0`;
7307	return `1`;
7308	}
7309
7310	/*
7311	* The purpose of wake_affine() is to quickly determine on which CPU we can run
7312	* soonest. For the purpose of speed we only consider the waking and previous
7313	* CPU.
7314	*
7315	* wake_affine_idle() - only considers 'now', it check if the waking CPU is
7316	* cache-affine and is (or will be) idle.
7317	*
7318	* wake_affine_weight() - considers the weight to reflect the average
7319	* scheduling latency of the CPUs. This seems to work
7320	* for the overloaded case.
7321	*/
7322	static int
7323	wake_affine_idle(int this_cpu, int prev_cpu, int sync)
7324	{
7325	/*
7326	* If this_cpu is idle, it implies the wakeup is from interrupt
7327	* context. Only allow the move if cache is shared. Otherwise an
7328	* interrupt intensive workload could force all tasks onto one
7329	* node depending on the IO topology or IRQ affinity settings.
7330	*
7331	* If the prev_cpu is idle and cache affine then avoid a migration.
7332	* There is no guarantee that the cache hot data from an interrupt
7333	* is more important than cache hot data on the prev_cpu and from
7334	* a cpufreq perspective, it's better to have higher utilisation
7335	* on one CPU.
7336	*/
7337	if (available_idle_cpu(cpu: this_cpu) && cpus_share_cache(this_cpu, that_cpu: prev_cpu))
7338	return available_idle_cpu(cpu: prev_cpu) ? prev_cpu : this_cpu;
7339
7340	if (sync) {
7341	struct rq *rq = cpu_rq(this_cpu);
7342
7343	if ((rq->nr_running - cfs_h_nr_delayed(rq)) == `1`)
7344	return this_cpu;
7345	}
7346
7347	if (available_idle_cpu(cpu: prev_cpu))
7348	return prev_cpu;
7349
7350	return nr_cpumask_bits;
7351	}
7352
7353	static int
7354	wake_affine_weight(struct sched_domain sd, struct* task_struct *p,
7355	int this_cpu, int prev_cpu, int sync)
7356	{
7357	s64 this_eff_load, prev_eff_load;
7358	unsigned long task_load;
7359
7360	this_eff_load = cpu_load(cpu_rq(this_cpu));
7361
7362	if (sync) {
7363	unsigned long current_load = task_h_load(current);
7364
7365	if (current_load > this_eff_load)
7366	return this_cpu;
7367
7368	this_eff_load -= current_load;
7369	}
7370
7371	task_load = task_h_load(p);
7372
7373	this_eff_load += task_load;
7374	if (sched_feat(WA_BIAS))
7375	this_eff_load *= `100`;
7376	this_eff_load *= capacity_of(cpu: prev_cpu);
7377
7378	prev_eff_load = cpu_load(cpu_rq(prev_cpu));
7379	prev_eff_load -= task_load;
7380	if (sched_feat(WA_BIAS))
7381	prev_eff_load *= `100` + (sd->imbalance_pct - `100`) / `2`;
7382	prev_eff_load *= capacity_of(cpu: this_cpu);
7383
7384	/*
7385	* If sync, adjust the weight of prev_eff_load such that if
7386	* prev_eff == this_eff that select_idle_sibling() will consider
7387	* stacking the wakee on top of the waker if no other CPU is
7388	* idle.
7389	*/
7390	if (sync)
7391	prev_eff_load += `1`;
7392
7393	return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
7394	}
7395
7396	static int wake_affine(struct sched_domain sd, struct* task_struct *p,
7397	int this_cpu, int prev_cpu, int sync)
7398	{
7399	int target = nr_cpumask_bits;
7400
7401	if (sched_feat(WA_IDLE))
7402	target = wake_affine_idle(this_cpu, prev_cpu, sync);
7403
7404	if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
7405	target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
7406
7407	schedstat_inc(p->stats.nr_wakeups_affine_attempts);
7408	if (target != this_cpu)
7409	return prev_cpu;
7410
7411	schedstat_inc(sd->ttwu_move_affine);
7412	schedstat_inc(p->stats.nr_wakeups_affine);
7413	return target;
7414	}
7415
/* Forward declaration; used by sched_balance_find_dst_cpu() below. */
static struct sched_group *
sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
7418
7419	/*
7420	* sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group.
7421	*/
7422	static int
7423	sched_balance_find_dst_group_cpu(struct sched_group group, struct* task_struct p, int* this_cpu)
7424	{
7425	unsigned long load, min_load = ULONG_MAX;
7426	unsigned int min_exit_latency = UINT_MAX;
7427	u64 latest_idle_timestamp = `0`;
7428	int least_loaded_cpu = this_cpu;
7429	int shallowest_idle_cpu = -`1`;
7430	int i;
7431
7432	/ Check if we have any choice: /
7433	if (group->group_weight == `1`)
7434	return cpumask_first(srcp: sched_group_span(sg: group));
7435
7436	/ Traverse only the allowed CPUs /
7437	for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
7438	struct rq *rq = cpu_rq(i);
7439
7440	if (!sched_core_cookie_match(rq, p))
7441	continue;
7442
7443	if (sched_idle_cpu(cpu: i))
7444	return i;
7445
7446	if (available_idle_cpu(cpu: i)) {
7447	struct cpuidle_state *idle = idle_get_state(rq);
7448	if (idle && idle->exit_latency < min_exit_latency) {
7449	/*
7450	* We give priority to a CPU whose idle state
7451	* has the smallest exit latency irrespective
7452	* of any idle timestamp.
7453	*/
7454	min_exit_latency = idle->exit_latency;
7455	latest_idle_timestamp = rq->idle_stamp;
7456	shallowest_idle_cpu = i;
7457	} else if ((!idle \|\| idle->exit_latency == min_exit_latency) &&
7458	rq->idle_stamp > latest_idle_timestamp) {
7459	/*
7460	* If equal or no active idle state, then
7461	* the most recently idled CPU might have
7462	* a warmer cache.
7463	*/
7464	latest_idle_timestamp = rq->idle_stamp;
7465	shallowest_idle_cpu = i;
7466	}
7467	} else if (shallowest_idle_cpu == -`1`) {
7468	load = cpu_load(cpu_rq(i));
7469	if (load < min_load) {
7470	min_load = load;
7471	least_loaded_cpu = i;
7472	}
7473	}
7474	}
7475
7476	return shallowest_idle_cpu != -`1` ? shallowest_idle_cpu : least_loaded_cpu;
7477	}
7478
7479	static inline int sched_balance_find_dst_cpu(struct sched_domain sd, struct* task_struct *p,
7480	int cpu, int prev_cpu, int sd_flag)
7481	{
7482	int new_cpu = cpu;
7483
7484	if (!cpumask_intersects(src1p: sched_domain_span(sd), src2p: p->cpus_ptr))
7485	return prev_cpu;
7486
7487	/*
7488	* We need task's util for cpu_util_without, sync it up to
7489	* prev_cpu's last_update_time.
7490	*/
7491	if (!(sd_flag & SD_BALANCE_FORK))
7492	sync_entity_load_avg(se: &p->se);
7493
7494	while (sd) {
7495	struct sched_group *group;
7496	struct sched_domain *tmp;
7497	int weight;
7498
7499	if (!(sd->flags & sd_flag)) {
7500	sd = sd->child;
7501	continue;
7502	}
7503
7504	group = sched_balance_find_dst_group(sd, p, this_cpu: cpu);
7505	if (!group) {
7506	sd = sd->child;
7507	continue;
7508	}
7509
7510	new_cpu = sched_balance_find_dst_group_cpu(group, p, this_cpu: cpu);
7511	if (new_cpu == cpu) {
7512	/ Now try balancing at a lower domain level of 'cpu': /
7513	sd = sd->child;
7514	continue;
7515	}
7516
7517	/ Now try balancing at a lower domain level of 'new_cpu': /
7518	cpu = new_cpu;
7519	weight = sd->span_weight;
7520	sd = NULL;
7521	for_each_domain(cpu, tmp) {
7522	if (weight <= tmp->span_weight)
7523	break;
7524	if (tmp->flags & sd_flag)
7525	sd = tmp;
7526	}
7527	}
7528
7529	return new_cpu;
7530	}
7531
/*
 * Return @cpu if it is idle (or sched-idle) and passes the core-scheduling
 * cookie check for @p; otherwise return -1.
 */
static inline int __select_idle_cpu(int cpu, struct task_struct *p)
{
	if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
	    sched_cpu_cookie_match(cpu_rq(cpu), p))
		return cpu;

	return -1;
}
7540
7541	#ifdef CONFIG_SCHED_SMT
/*
 * Static key, initially false, exported to GPL modules.
 * NOTE(review): presumably enabled once SMT siblings are present -- confirm
 * against the code that flips it.
 */
DEFINE_STATIC_KEY_FALSE(sched_smt_present);
EXPORT_SYMBOL_GPL(sched_smt_present);
7544
7545	static inline void set_idle_cores(int cpu, int val)
7546	{
7547	struct sched_domain_shared *sds;
7548
7549	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
7550	if (sds)
7551	WRITE_ONCE(sds->has_idle_cores, val);
7552	}
7553
7554	static inline bool test_idle_cores(int cpu)
7555	{
7556	struct sched_domain_shared *sds;
7557
7558	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
7559	if (sds)
7560	return READ_ONCE(sds->has_idle_cores);
7561
7562	return false;
7563	}
7564
7565	/*
7566	* Scans the local SMT mask to see if the entire core is idle, and records this
7567	* information in sd_llc_shared->has_idle_cores.
7568	*
7569	* Since SMT siblings share all cache levels, inspecting this limited remote
7570	* state should be fairly cheap.
7571	*/
7572	void __update_idle_core(struct rq *rq)
7573	{
7574	int core = cpu_of(rq);
7575	int cpu;
7576
7577	rcu_read_lock();
7578	if (test_idle_cores(cpu: core))
7579	goto unlock;
7580
7581	for_each_cpu(cpu, cpu_smt_mask(core)) {
7582	if (cpu == core)
7583	continue;
7584
7585	if (!available_idle_cpu(cpu))
7586	goto unlock;
7587	}
7588
7589	set_idle_cores(cpu: core, val: `1`);
7590	unlock:
7591	rcu_read_unlock();
7592	}
7593
7594	/*
7595	* Scan the entire LLC domain for idle cores; this dynamically switches off if
7596	* there are no idle cores left in the system; tracked through
7597	* sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
7598	*/
7599	static int select_idle_core(struct task_struct p, int* core, struct cpumask cpus, int* *idle_cpu)
7600	{
7601	bool idle = true;
7602	int cpu;
7603
7604	for_each_cpu(cpu, cpu_smt_mask(core)) {
7605	if (!available_idle_cpu(cpu)) {
7606	idle = false;
7607	if (*idle_cpu == -`1`) {
7608	if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpumask: cpus)) {
7609	*idle_cpu = cpu;
7610	break;
7611	}
7612	continue;
7613	}
7614	break;
7615	}
7616	if (*idle_cpu == -`1` && cpumask_test_cpu(cpu, cpumask: cpus))
7617	*idle_cpu = cpu;
7618	}
7619
7620	if (idle)
7621	return core;
7622
7623	cpumask_andnot(dstp: cpus, src1p: cpus, src2p: cpu_smt_mask(cpu: core));
7624	return -`1`;
7625	}
7626
7627	/*
7628	* Scan the local SMT mask for idle CPUs.
7629	*/
7630	static int select_idle_smt(struct task_struct p, struct* sched_domain sd, int* target)
7631	{
7632	int cpu;
7633
7634	for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
7635	if (cpu == target)
7636	continue;
7637	/*
7638	* Check if the CPU is in the LLC scheduling domain of @target.
7639	* Due to isolcpus, there is no guarantee that all the siblings are in the domain.
7640	*/
7641	if (!cpumask_test_cpu(cpu, cpumask: sched_domain_span(sd)))
7642	continue;
7643	if (available_idle_cpu(cpu) \|\| sched_idle_cpu(cpu))
7644	return cpu;
7645	}
7646
7647	return -`1`;
7648	}
7649
7650	#else /* !CONFIG_SCHED_SMT: */
7651
/* !CONFIG_SCHED_SMT stub: there is no idle-core state to record. */
static inline void set_idle_cores(int cpu, int val)
{
}
7655
/* !CONFIG_SCHED_SMT stub: never reports an idle core. */
static inline bool test_idle_cores(int cpu)
{
	return false;
}
7660
/*
 * !CONFIG_SCHED_SMT stub: a "core" is a single CPU, so just test @core
 * itself. @cpus and @idle_cpu are unused.
 */
static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
{
	return __select_idle_cpu(core, p);
}
7665
/* !CONFIG_SCHED_SMT stub: there are no SMT siblings to scan. */
static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
	return -1;
}
7670
7671	#endif /* !CONFIG_SCHED_SMT */
7672
7673	/*
7674	* Scan the LLC domain for idle CPUs; this is dynamically regulated by
7675	* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
7676	* average idle time for this rq (as found in rq->avg_idle).
7677	*/
7678	static int select_idle_cpu(struct task_struct p, struct* sched_domain sd, bool has_idle_core, int* target)
7679	{
7680	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
7681	int i, cpu, idle_cpu = -`1`, nr = INT_MAX;
7682	struct sched_domain_shared *sd_share;
7683
7684	cpumask_and(dstp: cpus, src1p: sched_domain_span(sd), src2p: p->cpus_ptr);
7685
7686	if (sched_feat(SIS_UTIL)) {
7687	sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
7688	if (sd_share) {
7689	/ because !--nr is the condition to stop scan /
7690	nr = READ_ONCE(sd_share->nr_idle_scan) + `1`;
7691	/ overloaded LLC is unlikely to have idle cpu/core /
7692	if (nr == `1`)
7693	return -`1`;
7694	}
7695	}
7696
7697	if (static_branch_unlikely(&sched_cluster_active)) {
7698	struct sched_group *sg = sd->groups;
7699
7700	if (sg->flags & SD_CLUSTER) {
7701	for_each_cpu_wrap(cpu, sched_group_span(sg), target + `1`) {
7702	if (!cpumask_test_cpu(cpu, cpumask: cpus))
7703	continue;
7704
7705	if (has_idle_core) {
7706	i = select_idle_core(p, core: cpu, cpus, idle_cpu: &idle_cpu);
7707	if ((unsigned int)i < nr_cpumask_bits)
7708	return i;
7709	} else {
7710	if (--nr <= `0`)
7711	return -`1`;
7712	idle_cpu = __select_idle_cpu(cpu, p);
7713	if ((unsigned int)idle_cpu < nr_cpumask_bits)
7714	return idle_cpu;
7715	}
7716	}
7717	cpumask_andnot(dstp: cpus, src1p: cpus, src2p: sched_group_span(sg));
7718	}
7719	}
7720
7721	for_each_cpu_wrap(cpu, cpus, target + `1`) {
7722	if (has_idle_core) {
7723	i = select_idle_core(p, core: cpu, cpus, idle_cpu: &idle_cpu);
7724	if ((unsigned int)i < nr_cpumask_bits)
7725	return i;
7726
7727	} else {
7728	if (--nr <= `0`)
7729	return -`1`;
7730	idle_cpu = __select_idle_cpu(cpu, p);
7731	if ((unsigned int)idle_cpu < nr_cpumask_bits)
7732	break;
7733	}
7734	}
7735
7736	if (has_idle_core)
7737	set_idle_cores(cpu: target, val: false);
7738
7739	return idle_cpu;
7740	}
7741
7742	/*
7743	* Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
7744	* the task fits. If no CPU is big enough, but there are idle ones, try to
7745	* maximize capacity.
7746	*/
7747	static int
7748	select_idle_capacity(struct task_struct p, struct* sched_domain sd, int* target)
7749	{
7750	unsigned long task_util, util_min, util_max, best_cap = `0`;
7751	int fits, best_fits = `0`;
7752	int cpu, best_cpu = -`1`;
7753	struct cpumask *cpus;
7754
7755	cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
7756	cpumask_and(dstp: cpus, src1p: sched_domain_span(sd), src2p: p->cpus_ptr);
7757
7758	task_util = task_util_est(p);
7759	util_min = uclamp_eff_value(p, clamp_id: UCLAMP_MIN);
7760	util_max = uclamp_eff_value(p, clamp_id: UCLAMP_MAX);
7761
7762	for_each_cpu_wrap(cpu, cpus, target) {
7763	unsigned long cpu_cap = capacity_of(cpu);
7764
7765	if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
7766	continue;
7767
7768	fits = util_fits_cpu(util: task_util, uclamp_min: util_min, uclamp_max: util_max, cpu);
7769
7770	/ This CPU fits with all requirements /
7771	if (fits > `0`)
7772	return cpu;
7773	/*
7774	* Only the min performance hint (i.e. uclamp_min) doesn't fit.
7775	* Look for the CPU with best capacity.
7776	*/
7777	else if (fits < `0`)
7778	cpu_cap = get_actual_cpu_capacity(cpu);
7779
7780	/*
7781	* First, select CPU which fits better (-1 being better than 0).
7782	* Then, select the one with best capacity at same level.
7783	*/
7784	if ((fits < best_fits) \|\|
7785	((fits == best_fits) && (cpu_cap > best_cap))) {
7786	best_cap = cpu_cap;
7787	best_cpu = cpu;
7788	best_fits = fits;
7789	}
7790	}
7791
7792	return best_cpu;
7793	}
7794
7795	static inline bool asym_fits_cpu(unsigned long util,
7796	unsigned long util_min,
7797	unsigned long util_max,
7798	int cpu)
7799	{
7800	if (sched_asym_cpucap_active())
7801	/*
7802	* Return true only if the cpu fully fits the task requirements
7803	* which include the utilization and the performance hints.
7804	*/
7805	return (util_fits_cpu(util, uclamp_min: util_min, uclamp_max: util_max, cpu) > `0`);
7806
7807	return true;
7808	}
7809
7810	/*
7811	* Try and locate an idle core/thread in the LLC cache domain.
7812	*/
7813	static int select_idle_sibling(struct task_struct p, int* prev, int target)
7814	{
7815	bool has_idle_core = false;
7816	struct sched_domain *sd;
7817	unsigned long task_util, util_min, util_max;
7818	int i, recent_used_cpu, prev_aff = -`1`;
7819
7820	/*
7821	* On asymmetric system, update task utilization because we will check
7822	* that the task fits with CPU's capacity.
7823	*/
7824	if (sched_asym_cpucap_active()) {
7825	sync_entity_load_avg(se: &p->se);
7826	task_util = task_util_est(p);
7827	util_min = uclamp_eff_value(p, clamp_id: UCLAMP_MIN);
7828	util_max = uclamp_eff_value(p, clamp_id: UCLAMP_MAX);
7829	}
7830
7831	/*
7832	* per-cpu select_rq_mask usage
7833	*/
7834	lockdep_assert_irqs_disabled();
7835
7836	if ((available_idle_cpu(cpu: target) \|\| sched_idle_cpu(cpu: target)) &&
7837	asym_fits_cpu(util: task_util, util_min, util_max, cpu: target))
7838	return target;
7839
7840	/*
7841	* If the previous CPU is cache affine and idle, don't be stupid:
7842	*/
7843	if (prev != target && cpus_share_cache(this_cpu: prev, that_cpu: target) &&
7844	(available_idle_cpu(cpu: prev) \|\| sched_idle_cpu(cpu: prev)) &&
7845	asym_fits_cpu(util: task_util, util_min, util_max, cpu: prev)) {
7846
7847	if (!static_branch_unlikely(&sched_cluster_active) \|\|
7848	cpus_share_resources(this_cpu: prev, that_cpu: target))
7849	return prev;
7850
7851	prev_aff = prev;
7852	}
7853
7854	/*
7855	* Allow a per-cpu kthread to stack with the wakee if the
7856	* kworker thread and the tasks previous CPUs are the same.
7857	* The assumption is that the wakee queued work for the
7858	* per-cpu kthread that is now complete and the wakeup is
7859	* essentially a sync wakeup. An obvious example of this
7860	* pattern is IO completions.
7861	*/
7862	if (is_per_cpu_kthread(current) &&
7863	in_task() &&
7864	prev == smp_processor_id() &&
7865	this_rq()->nr_running <= `1` &&
7866	asym_fits_cpu(util: task_util, util_min, util_max, cpu: prev)) {
7867	return prev;
7868	}
7869
7870	/ Check a recently used CPU as a potential idle candidate: /
7871	recent_used_cpu = p->recent_used_cpu;
7872	p->recent_used_cpu = prev;
7873	if (recent_used_cpu != prev &&
7874	recent_used_cpu != target &&
7875	cpus_share_cache(this_cpu: recent_used_cpu, that_cpu: target) &&
7876	(available_idle_cpu(cpu: recent_used_cpu) \|\| sched_idle_cpu(cpu: recent_used_cpu)) &&
7877	cpumask_test_cpu(cpu: recent_used_cpu, cpumask: p->cpus_ptr) &&
7878	asym_fits_cpu(util: task_util, util_min, util_max, cpu: recent_used_cpu)) {
7879
7880	if (!static_branch_unlikely(&sched_cluster_active) \|\|
7881	cpus_share_resources(this_cpu: recent_used_cpu, that_cpu: target))
7882	return recent_used_cpu;
7883
7884	} else {
7885	recent_used_cpu = -`1`;
7886	}
7887
7888	/*
7889	* For asymmetric CPU capacity systems, our domain of interest is
7890	* sd_asym_cpucapacity rather than sd_llc.
7891	*/
7892	if (sched_asym_cpucap_active()) {
7893	sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
7894	/*
7895	* On an asymmetric CPU capacity system where an exclusive
7896	* cpuset defines a symmetric island (i.e. one unique
7897	* capacity_orig value through the cpuset), the key will be set
7898	* but the CPUs within that cpuset will not have a domain with
7899	* SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
7900	* capacity path.
7901	*/
7902	if (sd) {
7903	i = select_idle_capacity(p, sd, target);
7904	return ((unsigned)i < nr_cpumask_bits) ? i : target;
7905	}
7906	}
7907
7908	sd = rcu_dereference(per_cpu(sd_llc, target));
7909	if (!sd)
7910	return target;
7911
7912	if (sched_smt_active()) {
7913	has_idle_core = test_idle_cores(cpu: target);
7914
7915	if (!has_idle_core && cpus_share_cache(this_cpu: prev, that_cpu: target)) {
7916	i = select_idle_smt(p, sd, target: prev);
7917	if ((unsigned int)i < nr_cpumask_bits)
7918	return i;
7919	}
7920	}
7921
7922	i = select_idle_cpu(p, sd, has_idle_core, target);
7923	if ((unsigned)i < nr_cpumask_bits)
7924	return i;
7925
7926	/*
7927	* For cluster machines which have lower sharing cache like L2 or
7928	* LLC Tag, we tend to find an idle CPU in the target's cluster
7929	* first. But prev_cpu or recent_used_cpu may also be a good candidate,
7930	* use them if possible when no idle CPU found in select_idle_cpu().
7931	*/
7932	if ((unsigned int)prev_aff < nr_cpumask_bits)
7933	return prev_aff;
7934	if ((unsigned int)recent_used_cpu < nr_cpumask_bits)
7935	return recent_used_cpu;
7936
7937	return target;
7938	}
7939
7940	/**
7941	* cpu_util() - Estimates the amount of CPU capacity used by CFS tasks.
7942	* @cpu: the CPU to get the utilization for
7943	* @p: task for which the CPU utilization should be predicted or NULL
7944	* @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL
7945	* @boost: 1 to enable boosting, otherwise 0
7946	*
7947	* The unit of the return value must be the same as the one of CPU capacity
7948	* so that CPU utilization can be compared with CPU capacity.
7949	*
7950	* CPU utilization is the sum of running time of runnable tasks plus the
7951	* recent utilization of currently non-runnable tasks on that CPU.
7952	* It represents the amount of CPU capacity currently used by CFS tasks in
7953	* the range [0..max CPU capacity] with max CPU capacity being the CPU
7954	* capacity at f_max.
7955	*
7956	* The estimated CPU utilization is defined as the maximum between CPU
7957	* utilization and sum of the estimated utilization of the currently
7958	* runnable tasks on that CPU. It preserves a utilization "snapshot" of
7959	* previously-executed tasks, which helps better deduce how busy a CPU will
7960	* be when a long-sleeping task wakes up. The contribution to CPU utilization
7961	* of such a task would be significantly decayed at this point of time.
7962	*
7963	* Boosted CPU utilization is defined as max(CPU runnable, CPU utilization).
7964	* CPU contention for CFS tasks can be detected by CPU runnable > CPU
7965	* utilization. Boosting is implemented in cpu_util() so that internal
7966	* users (e.g. EAS) can use it next to external users (e.g. schedutil),
7967	* latter via cpu_util_cfs_boost().
7968	*
7969	* CPU utilization can be higher than the current CPU capacity
7970	* (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
7971	* of rounding errors as well as task migrations or wakeups of new tasks.
7972	* CPU utilization has to be capped to fit into the [0..max CPU capacity]
7973	* range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
7974	* could be seen as over-utilized even though CPU1 has 20% of spare CPU
7975	* capacity. CPU utilization is allowed to overshoot current CPU capacity
7976	* though since this is useful for predicting the CPU capacity required
7977	* after task migrations (scheduler-driven DVFS).
7978	*
7979	* Return: (Boosted) (estimated) utilization for the specified CPU.
7980	*/
7981	static unsigned long
7982	cpu_util(int cpu, struct task_struct p, int* dst_cpu, int boost)
7983	{
7984	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
7985	unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
7986	unsigned long runnable;
7987
7988	if (boost) {
7989	runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
7990	util = max(util, runnable);
7991	}
7992
7993	/*
7994	* If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
7995	* contribution. If @p migrates from another CPU to @cpu add its
7996	* contribution. In all the other cases @cpu is not impacted by the
7997	* migration so its util_avg is already correct.
7998	*/
7999	if (p && task_cpu(p) == cpu && dst_cpu != cpu)
8000	lsub_positive(&util, task_util(p));
8001	else if (p && task_cpu(p) != cpu && dst_cpu == cpu)
8002	util += task_util(p);
8003
8004	if (sched_feat(UTIL_EST)) {
8005	unsigned long util_est;
8006
8007	util_est = READ_ONCE(cfs_rq->avg.util_est);
8008
8009	/*
8010	* During wake-up @p isn't enqueued yet and doesn't contribute
8011	* to any cpu_rq(cpu)->cfs.avg.util_est.
8012	* If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
8013	* has been enqueued.
8014	*
8015	* During exec (@dst_cpu = -1) @p is enqueued and does
8016	* contribute to cpu_rq(cpu)->cfs.util_est.
8017	* Remove it to "simulate" cpu_util without @p's contribution.
8018	*
8019	* Despite the task_on_rq_queued(@p) check there is still a
8020	* small window for a possible race when an exec
8021	* select_task_rq_fair() races with LB's detach_task().
8022	*
8023	* detach_task()
8024	* deactivate_task()
8025	* p->on_rq = TASK_ON_RQ_MIGRATING;
8026	* -------------------------------- A
8027	* dequeue_task() \
8028	* dequeue_task_fair() + Race Time
8029	* util_est_dequeue() /
8030	* -------------------------------- B
8031	*
8032	* The additional check "current == p" is required to further
8033	* reduce the race window.
8034	*/
8035	if (dst_cpu == cpu)
8036	util_est += _task_util_est(p);
8037	else if (p && unlikely(task_on_rq_queued(p) \|\| current == p))
8038	lsub_positive(&util_est, _task_util_est(p));
8039
8040	util = max(util, util_est);
8041	}
8042
8043	return min(util, arch_scale_cpu_capacity(cpu));
8044	}
8045
8046	unsigned long cpu_util_cfs(int cpu)
8047	{
8048	return cpu_util(cpu, NULL, dst_cpu: -`1`, boost: `0`);
8049	}
8050
8051	unsigned long cpu_util_cfs_boost(int cpu)
8052	{
8053	return cpu_util(cpu, NULL, dst_cpu: -`1`, boost: `1`);
8054	}
8055
8056	/*
8057	* cpu_util_without: compute cpu utilization without any contributions from *p
8058	* @cpu: the CPU which utilization is requested
8059	* @p: the task which utilization should be discounted
8060	*
8061	* The utilization of a CPU is defined by the utilization of tasks currently
8062	* enqueued on that CPU as well as tasks which are currently sleeping after an
8063	* execution on that CPU.
8064	*
8065	* This method returns the utilization of the specified CPU by discounting the
8066	* utilization of the specified task, whenever the task is currently
8067	* contributing to the CPU utilization.
8068	*/
8069	static unsigned long cpu_util_without(int cpu, struct task_struct *p)
8070	{
8071	/ Task has no contribution or is new /
8072	if (cpu != task_cpu(p) \|\| !READ_ONCE(p->se.avg.last_update_time))
8073	p = NULL;
8074
8075	return cpu_util(cpu, p, dst_cpu: -`1`, boost: `0`);
8076	}
8077
8078	/*
8079	* This function computes an effective utilization for the given CPU, to be
8080	* used for frequency selection given the linear relation: f = u * f_max.
8081	*
8082	* The scheduler tracks the following metrics:
8083	*
8084	* cpu_util_{cfs,rt,dl,irq}()
8085	* cpu_bw_dl()
8086	*
8087	* Where the cfs,rt and dl util numbers are tracked with the same metric and
8088	* synchronized windows and are thus directly comparable.
8089	*
8090	* The cfs,rt,dl utilization are the running times measured with rq->clock_task
8091	* which excludes things like IRQ and steal-time. These latter are then accrued
8092	* in the IRQ utilization.
8093	*
8094	* The DL bandwidth number OTOH is not a measured metric but a value computed
8095	* based on the task model parameters and gives the minimal utilization
8096	* required to meet deadlines.
8097	*/
8098	unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
8099	unsigned long *min,
8100	unsigned long *max)
8101	{
8102	unsigned long util, irq, scale;
8103	struct rq *rq = cpu_rq(cpu);
8104
8105	scale = arch_scale_cpu_capacity(cpu);
8106
8107	/*
8108	* Early check to see if IRQ/steal time saturates the CPU, can be
8109	* because of inaccuracies in how we track these -- see
8110	* update_irq_load_avg().
8111	*/
8112	irq = cpu_util_irq(rq);
8113	if (unlikely(irq >= scale)) {
8114	if (min)
8115	*min = scale;
8116	if (max)
8117	*max = scale;
8118	return scale;
8119	}
8120
8121	if (min) {
8122	/*
8123	* The minimum utilization returns the highest level between:
8124	* - the computed DL bandwidth needed with the IRQ pressure which
8125	* steals time to the deadline task.
8126	* - The minimum performance requirement for CFS and/or RT.
8127	*/
8128	*min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
8129
8130	/*
8131	* When an RT task is runnable and uclamp is not used, we must
8132	* ensure that the task will run at maximum compute capacity.
8133	*/
8134	if (!uclamp_is_used() && rt_rq_is_runnable(rt_rq: &rq->rt))
8135	min = max(min, scale);
8136	}
8137
8138	/*
8139	* Because the time spend on RT/DL tasks is visible as 'lost' time to
8140	* CFS tasks and we use the same metric to track the effective
8141	* utilization (PELT windows are synchronized) we can directly add them
8142	* to obtain the CPU's actual utilization.
8143	*/
8144	util = util_cfs + cpu_util_rt(rq);
8145	util += cpu_util_dl(rq);
8146
8147	/*
8148	* The maximum hint is a soft bandwidth requirement, which can be lower
8149	* than the actual utilization because of uclamp_max requirements.
8150	*/
8151	if (max)
8152	*max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
8153
8154	if (util >= scale)
8155	return scale;
8156
8157	/*
8158	* There is still idle time; further improve the number by using the
8159	* IRQ metric. Because IRQ/steal time is hidden from the task clock we
8160	* need to scale the task numbers:
8161	*
8162	* max - irq
8163	* U' = irq + --------- * U
8164	* max
8165	*/
8166	util = scale_irq_capacity(util, irq, max: scale);
8167	util += irq;
8168
8169	return min(scale, util);
8170	}
8171
8172	unsigned long sched_cpu_util(int cpu)
8173	{
8174	return effective_cpu_util(cpu, util_cfs: cpu_util_cfs(cpu), NULL, NULL);
8175	}
8176
/*
 * energy_env - Utilization landscape for energy estimation.
 * @task_busy_time: Utilization contribution by the task for which we test the
 *                  placement. Given by eenv_task_busy_time().
 * @pd_busy_time:   Utilization of the whole perf domain without the task
 *                  contribution. Given by eenv_pd_busy_time().
 * @cpu_cap:        Maximum CPU capacity for the perf domain.
 * @pd_cap:         Entire perf domain capacity. (pd->nr_cpus * cpu_cap).
 */
struct energy_env {
	unsigned long task_busy_time;
	unsigned long pd_busy_time;
	unsigned long cpu_cap;
	unsigned long pd_cap;
};
8192
8193	/*
8194	* Compute the task busy time for compute_energy(). This time cannot be
8195	* injected directly into effective_cpu_util() because of the IRQ scaling.
8196	* The latter only makes sense with the most recent CPUs where the task has
8197	* run.
8198	*/
8199	static inline void eenv_task_busy_time(struct energy_env *eenv,
8200	struct task_struct p, int* prev_cpu)
8201	{
8202	unsigned long busy_time, max_cap = arch_scale_cpu_capacity(cpu: prev_cpu);
8203	unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu));
8204
8205	if (unlikely(irq >= max_cap))
8206	busy_time = max_cap;
8207	else
8208	busy_time = scale_irq_capacity(util: task_util_est(p), irq, max: max_cap);
8209
8210	eenv->task_busy_time = busy_time;
8211	}
8212
8213	/*
8214	* Compute the perf_domain (PD) busy time for compute_energy(). Based on the
8215	* utilization for each @pd_cpus, it however doesn't take into account
8216	* clamping since the ratio (utilization / cpu_capacity) is already enough to
8217	* scale the EM reported power consumption at the (eventually clamped)
8218	* cpu_capacity.
8219	*
8220	* The contribution of the task @p for which we want to estimate the
8221	* energy cost is removed (by cpu_util()) and must be calculated
8222	* separately (see eenv_task_busy_time). This ensures:
8223	*
8224	* - A stable PD utilization, no matter which CPU of that PD we want to place
8225	* the task on.
8226	*
8227	* - A fair comparison between CPUs as the task contribution (task_util())
8228	* will always be the same no matter which CPU utilization we rely on
8229	* (util_avg or util_est).
8230	*
8231	* Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't
8232	* exceed @eenv->pd_cap.
8233	*/
8234	static inline void eenv_pd_busy_time(struct energy_env *eenv,
8235	struct cpumask *pd_cpus,
8236	struct task_struct *p)
8237	{
8238	unsigned long busy_time = `0`;
8239	int cpu;
8240
8241	for_each_cpu(cpu, pd_cpus) {
8242	unsigned long util = cpu_util(cpu, p, dst_cpu: -`1`, boost: `0`);
8243
8244	busy_time += effective_cpu_util(cpu, util_cfs: util, NULL, NULL);
8245	}
8246
8247	eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
8248	}
8249
8250	/*
8251	* Compute the maximum utilization for compute_energy() when the task @p
8252	* is placed on the cpu @dst_cpu.
8253	*
8254	* Returns the maximum utilization among @eenv->cpus. This utilization can't
8255	* exceed @eenv->cpu_cap.
8256	*/
8257	static inline unsigned long
8258	eenv_pd_max_util(struct energy_env eenv, struct* cpumask *pd_cpus,
8259	struct task_struct p, int* dst_cpu)
8260	{
8261	unsigned long max_util = `0`;
8262	int cpu;
8263
8264	for_each_cpu(cpu, pd_cpus) {
8265	struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
8266	unsigned long util = cpu_util(cpu, p, dst_cpu, boost: `1`);
8267	unsigned long eff_util, min, max;
8268
8269	/*
8270	* Performance domain frequency: utilization clamping
8271	* must be considered since it affects the selection
8272	* of the performance domain frequency.
8273	* NOTE: in case RT tasks are running, by default the min
8274	* utilization can be max OPP.
8275	*/
8276	eff_util = effective_cpu_util(cpu, util_cfs: util, min: &min, max: &max);
8277
8278	/ Task's uclamp can modify min and max value /
8279	if (tsk && uclamp_is_used()) {
8280	min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
8281
8282	/*
8283	* If there is no active max uclamp constraint,
8284	* directly use task's one, otherwise keep max.
8285	*/
8286	if (uclamp_rq_is_idle(cpu_rq(cpu)))
8287	max = uclamp_eff_value(p, clamp_id: UCLAMP_MAX);
8288	else
8289	max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
8290	}
8291
8292	eff_util = sugov_effective_cpu_perf(cpu, actual: eff_util, min, max);
8293	max_util = max(max_util, eff_util);
8294	}
8295
8296	return min(max_util, eenv->cpu_cap);
8297	}
8298
8299	/*
8300	* compute_energy(): Use the Energy Model to estimate the energy that @pd would
8301	* consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
8302	* contribution is ignored.
8303	*/
8304	static inline unsigned long
8305	compute_energy(struct energy_env eenv, struct* perf_domain *pd,
8306	struct cpumask pd_cpus, struct* task_struct p, int* dst_cpu)
8307	{
8308	unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
8309	unsigned long busy_time = eenv->pd_busy_time;
8310	unsigned long energy;
8311
8312	if (dst_cpu >= `0`)
8313	busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
8314
8315	energy = em_cpu_energy(pd: pd->em_pd, max_util, sum_util: busy_time, allowed_cpu_cap: eenv->cpu_cap);
8316
8317	trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
8318
8319	return energy;
8320	}
8321
8322	/*
8323	* find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
8324	* waking task. find_energy_efficient_cpu() looks for the CPU with maximum
8325	* spare capacity in each performance domain and uses it as a potential
8326	* candidate to execute the task. Then, it uses the Energy Model to figure
8327	* out which of the CPU candidates is the most energy-efficient.
8328	*
8329	* The rationale for this heuristic is as follows. In a performance domain,
8330	* all the most energy efficient CPU candidates (according to the Energy
8331	* Model) are those for which we'll request a low frequency. When there are
8332	* several CPUs for which the frequency request will be the same, we don't
8333	* have enough data to break the tie between them, because the Energy Model
8334	* only includes active power costs. With this model, if we assume that
8335	* frequency requests follow utilization (e.g. using schedutil), the CPU with
8336	* the maximum spare capacity in a performance domain is guaranteed to be among
8337	* the best candidates of the performance domain.
8338	*
8339	* In practice, it could be preferable from an energy standpoint to pack
8340	* small tasks on a CPU in order to let other CPUs go in deeper idle states,
8341	* but that could also hurt our chances to go cluster idle, and we have no
8342	* ways to tell with the current Energy Model if this is actually a good
8343	* idea or not. So, find_energy_efficient_cpu() basically favors
8344	* cluster-packing, and spreading inside a cluster. That should at least be
8345	* a good thing for latency, and this is consistent with the idea that most
8346	* of the energy savings of EAS come from the asymmetry of the system, and
8347	* not so much from breaking the tie between identical CPUs. That's also the
8348	* reason why EAS is enabled in the topology code only for systems where
8349	* SD_ASYM_CPUCAPACITY is set.
8350	*
8351	* NOTE: Forkees are not accepted in the energy-aware wake-up path because
8352	* they don't have any useful utilization data yet and it's not possible to
8353	* forecast their impact on energy consumption. Consequently, they will be
8354	* placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out
8355	* to be energy-inefficient in some use-cases. The alternative would be to
8356	* bias new tasks towards specific types of CPUs first, or to try to infer
8357	* their util_avg from the parent task, but those heuristics could hurt
8358	* other use-cases too. So, until someone finds a better way to solve this,
8359	* let's keep things simple by re-using the existing slow path.
8360	*/
8361	static int find_energy_efficient_cpu(struct task_struct p, int* prev_cpu)
8362	{
8363	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
8364	unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
8365	unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, clamp_id: UCLAMP_MIN) : `0`;
8366	unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, clamp_id: UCLAMP_MAX) : `1024`;
8367	struct root_domain *rd = this_rq()->rd;
8368	int cpu, best_energy_cpu, target = -`1`;
8369	int prev_fits = -`1`, best_fits = -`1`;
8370	unsigned long best_actual_cap = `0`;
8371	unsigned long prev_actual_cap = `0`;
8372	struct sched_domain *sd;
8373	struct perf_domain *pd;
8374	struct energy_env eenv;
8375
8376	rcu_read_lock();
8377	pd = rcu_dereference(rd->pd);
8378	if (!pd)
8379	goto unlock;
8380
8381	/*
8382	* Energy-aware wake-up happens on the lowest sched_domain starting
8383	* from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
8384	*/
8385	sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
8386	while (sd && !cpumask_test_cpu(cpu: prev_cpu, cpumask: sched_domain_span(sd)))
8387	sd = sd->parent;
8388	if (!sd)
8389	goto unlock;
8390
8391	target = prev_cpu;
8392
8393	sync_entity_load_avg(se: &p->se);
8394	if (!task_util_est(p) && p_util_min == `0`)
8395	goto unlock;
8396
8397	eenv_task_busy_time(eenv: &eenv, p, prev_cpu);
8398
8399	for (; pd; pd = pd->next) {
8400	unsigned long util_min = p_util_min, util_max = p_util_max;
8401	unsigned long cpu_cap, cpu_actual_cap, util;
8402	long prev_spare_cap = -`1`, max_spare_cap = -`1`;
8403	unsigned long rq_util_min, rq_util_max;
8404	unsigned long cur_delta, base_energy;
8405	int max_spare_cap_cpu = -`1`;
8406	int fits, max_fits = -`1`;
8407
8408	cpumask_and(dstp: cpus, perf_domain_span(pd), cpu_online_mask);
8409
8410	if (cpumask_empty(srcp: cpus))
8411	continue;
8412
8413	/ Account external pressure for the energy estimation /
8414	cpu = cpumask_first(srcp: cpus);
8415	cpu_actual_cap = get_actual_cpu_capacity(cpu);
8416
8417	eenv.cpu_cap = cpu_actual_cap;
8418	eenv.pd_cap = `0`;
8419
8420	for_each_cpu(cpu, cpus) {
8421	struct rq *rq = cpu_rq(cpu);
8422
8423	eenv.pd_cap += cpu_actual_cap;
8424
8425	if (!cpumask_test_cpu(cpu, cpumask: sched_domain_span(sd)))
8426	continue;
8427
8428	if (!cpumask_test_cpu(cpu, cpumask: p->cpus_ptr))
8429	continue;
8430
8431	util = cpu_util(cpu, p, dst_cpu: cpu, boost: `0`);
8432	cpu_cap = capacity_of(cpu);
8433
8434	/*
8435	* Skip CPUs that cannot satisfy the capacity request.
8436	* IOW, placing the task there would make the CPU
8437	* overutilized. Take uclamp into account to see how
8438	* much capacity we can get out of the CPU; this is
8439	* aligned with sched_cpu_util().
8440	*/
8441	if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
8442	/*
8443	* Open code uclamp_rq_util_with() except for
8444	* the clamp() part. I.e.: apply max aggregation
8445	* only. util_fits_cpu() logic requires to
8446	* operate on non clamped util but must use the
8447	* max-aggregated uclamp_{min, max}.
8448	*/
8449	rq_util_min = uclamp_rq_get(rq, clamp_id: UCLAMP_MIN);
8450	rq_util_max = uclamp_rq_get(rq, clamp_id: UCLAMP_MAX);
8451
8452	util_min = max(rq_util_min, p_util_min);
8453	util_max = max(rq_util_max, p_util_max);
8454	}
8455
8456	fits = util_fits_cpu(util, uclamp_min: util_min, uclamp_max: util_max, cpu);
8457	if (!fits)
8458	continue;
8459
8460	lsub_positive(&cpu_cap, util);
8461
8462	if (cpu == prev_cpu) {
8463	/ Always use prev_cpu as a candidate. /
8464	prev_spare_cap = cpu_cap;
8465	prev_fits = fits;
8466	} else if ((fits > max_fits) \|\|
8467	((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) {
8468	/*
8469	* Find the CPU with the maximum spare capacity
8470	* among the remaining CPUs in the performance
8471	* domain.
8472	*/
8473	max_spare_cap = cpu_cap;
8474	max_spare_cap_cpu = cpu;
8475	max_fits = fits;
8476	}
8477	}
8478
8479	if (max_spare_cap_cpu < `0` && prev_spare_cap < `0`)
8480	continue;
8481
8482	eenv_pd_busy_time(eenv: &eenv, pd_cpus: cpus, p);
8483	/ Compute the 'base' energy of the pd, without @p /
8484	base_energy = compute_energy(eenv: &eenv, pd, pd_cpus: cpus, p, dst_cpu: -`1`);
8485
8486	/ Evaluate the energy impact of using prev_cpu. /
8487	if (prev_spare_cap > -`1`) {
8488	prev_delta = compute_energy(eenv: &eenv, pd, pd_cpus: cpus, p,
8489	dst_cpu: prev_cpu);
8490	/ CPU utilization has changed /
8491	if (prev_delta < base_energy)
8492	goto unlock;
8493	prev_delta -= base_energy;
8494	prev_actual_cap = cpu_actual_cap;
8495	best_delta = min(best_delta, prev_delta);
8496	}
8497
8498	/ Evaluate the energy impact of using max_spare_cap_cpu. /
8499	if (max_spare_cap_cpu >= `0` && max_spare_cap > prev_spare_cap) {
8500	/ Current best energy cpu fits better /
8501	if (max_fits < best_fits)
8502	continue;
8503
8504	/*
8505	* Both don't fit performance hint (i.e. uclamp_min)
8506	* but best energy cpu has better capacity.
8507	*/
8508	if ((max_fits < `0`) &&
8509	(cpu_actual_cap <= best_actual_cap))
8510	continue;
8511
8512	cur_delta = compute_energy(eenv: &eenv, pd, pd_cpus: cpus, p,
8513	dst_cpu: max_spare_cap_cpu);
8514	/ CPU utilization has changed /
8515	if (cur_delta < base_energy)
8516	goto unlock;
8517	cur_delta -= base_energy;
8518
8519	/*
8520	* Both fit for the task but best energy cpu has lower
8521	* energy impact.
8522	*/
8523	if ((max_fits > `0`) && (best_fits > `0`) &&
8524	(cur_delta >= best_delta))
8525	continue;
8526
8527	best_delta = cur_delta;
8528	best_energy_cpu = max_spare_cap_cpu;
8529	best_fits = max_fits;
8530	best_actual_cap = cpu_actual_cap;
8531	}
8532	}
8533	rcu_read_unlock();
8534
8535	if ((best_fits > prev_fits) \|\|
8536	((best_fits > `0`) && (best_delta < prev_delta)) \|\|
8537	((best_fits < `0`) && (best_actual_cap > prev_actual_cap)))
8538	target = best_energy_cpu;
8539
8540	return target;
8541
8542	unlock:
8543	rcu_read_unlock();
8544
8545	return target;
8546	}
8547
8548	/*
8549	* select_task_rq_fair: Select target runqueue for the waking task in domains
8550	* that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
8551	* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
8552	*
8553	* Balances load by selecting the idlest CPU in the idlest group, or under
8554	* certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
8555	*
8556	* Returns the target CPU number.
8557	*/
8558	static int
8559	select_task_rq_fair(struct task_struct p, int* prev_cpu, int wake_flags)
8560	{
8561	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
8562	struct sched_domain tmp, sd = NULL;
8563	int cpu = smp_processor_id();
8564	int new_cpu = prev_cpu;
8565	int want_affine = `0`;
8566	/ SD_flags and WF_flags share the first nibble /
8567	int sd_flag = wake_flags & `0xF`;
8568
8569	/*
8570	* required for stable ->cpus_allowed
8571	*/
8572	lockdep_assert_held(&p->pi_lock);
8573	if (wake_flags & WF_TTWU) {
8574	record_wakee(p);
8575
8576	if ((wake_flags & WF_CURRENT_CPU) &&
8577	cpumask_test_cpu(cpu, cpumask: p->cpus_ptr))
8578	return cpu;
8579
8580	if (!is_rd_overutilized(this_rq()->rd)) {
8581	new_cpu = find_energy_efficient_cpu(p, prev_cpu);
8582	if (new_cpu >= `0`)
8583	return new_cpu;
8584	new_cpu = prev_cpu;
8585	}
8586
8587	want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, cpumask: p->cpus_ptr);
8588	}
8589
8590	rcu_read_lock();
8591	for_each_domain(cpu, tmp) {
8592	/*
8593	* If both 'cpu' and 'prev_cpu' are part of this domain,
8594	* cpu is a valid SD_WAKE_AFFINE target.
8595	*/
8596	if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
8597	cpumask_test_cpu(cpu: prev_cpu, cpumask: sched_domain_span(sd: tmp))) {
8598	if (cpu != prev_cpu)
8599	new_cpu = wake_affine(sd: tmp, p, this_cpu: cpu, prev_cpu, sync);
8600
8601	sd = NULL; / Prefer wake_affine over balance flags /
8602	break;
8603	}
8604
8605	/*
8606	* Usually only true for WF_EXEC and WF_FORK, as sched_domains
8607	* usually do not have SD_BALANCE_WAKE set. That means wakeup
8608	* will usually go to the fast path.
8609	*/
8610	if (tmp->flags & sd_flag)
8611	sd = tmp;
8612	else if (!want_affine)
8613	break;
8614	}
8615
8616	if (unlikely(sd)) {
8617	/ Slow path /
8618	new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
8619	} else if (wake_flags & WF_TTWU) { / XXX always ? /
8620	/ Fast path /
8621	new_cpu = select_idle_sibling(p, prev: prev_cpu, target: new_cpu);
8622	}
8623	rcu_read_unlock();
8624
8625	return new_cpu;
8626	}
8627
8628	/*
8629	* Called immediately before a task is migrated to a new CPU; task_cpu(p) and
8630	* cfs_rq_of(p) references at time of call are still valid and identify the
8631	* previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
8632	*/
8633	static void migrate_task_rq_fair(struct task_struct p, int* new_cpu)
8634	{
8635	struct sched_entity *se = &p->se;
8636
8637	if (!task_on_rq_migrating(p)) {
8638	remove_entity_load_avg(se);
8639
8640	/*
8641	* Here, the task's PELT values have been updated according to
8642	* the current rq's clock. But if that clock hasn't been
8643	* updated in a while, a substantial idle time will be missed,
8644	* leading to an inflation after wake-up on the new rq.
8645	*
8646	* Estimate the missing time from the cfs_rq last_update_time
8647	* and update sched_avg to improve the PELT continuity after
8648	* migration.
8649	*/
8650	migrate_se_pelt_lag(se);
8651	}
8652
8653	/ Tell new CPU we are migrated /
8654	se->avg.last_update_time = `0`;
8655
8656	update_scan_period(p, new_cpu);
8657	}
8658
8659	static void task_dead_fair(struct task_struct *p)
8660	{
8661	struct sched_entity *se = &p->se;
8662
8663	if (se->sched_delayed) {
8664	struct rq_flags rf;
8665	struct rq *rq;
8666
8667	rq = task_rq_lock(p, rf: &rf);
8668	if (se->sched_delayed) {
8669	update_rq_clock(rq);
8670	dequeue_entities(rq, se, DEQUEUE_SLEEP \| DEQUEUE_DELAYED);
8671	}
8672	task_rq_unlock(rq, p, rf: &rf);
8673	}
8674
8675	remove_entity_load_avg(se);
8676	}
8677
8678	/*
8679	* Set the max capacity the task is allowed to run at for misfit detection.
8680	*/
8681	static void set_task_max_allowed_capacity(struct task_struct *p)
8682	{
8683	struct asym_cap_data *entry;
8684
8685	if (!sched_asym_cpucap_active())
8686	return;
8687
8688	rcu_read_lock();
8689	list_for_each_entry_rcu(entry, &asym_cap_list, link) {
8690	cpumask_t *cpumask;
8691
8692	cpumask = cpu_capacity_span(entry);
8693	if (!cpumask_intersects(src1p: p->cpus_ptr, src2p: cpumask))
8694	continue;
8695
8696	p->max_allowed_capacity = entry->capacity;
8697	break;
8698	}
8699	rcu_read_unlock();
8700	}
8701
/*
 * Apply the affinity change described by @ctx to @p through the common
 * helper, then refresh @p's max allowed capacity, which depends on the
 * CPUs the task may run on.
 */
static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
{
	set_cpus_allowed_common(p, ctx);
	set_task_max_allowed_capacity(p);
}
8707
/*
 * Returns 1 if @rq already has runnable fair work according to
 * sched_fair_runnable(); otherwise attempts a newidle balance and returns
 * whether sched_balance_newidle() reported a non-zero result. @prev is unused.
 */
static int
balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	if (sched_fair_runnable(rq))
		return 1;

	return sched_balance_newidle(rq, rf) != 0;
}
8716
8717	static void set_next_buddy(struct sched_entity *se)
8718	{
8719	for_each_sched_entity(se) {
8720	if (WARN_ON_ONCE(!se->on_rq))
8721	return;
8722	if (se_is_idle(se))
8723	return;
8724	cfs_rq_of(se)->next = se;
8725	}
8726	}
8727
8728	/*
8729	* Preempt the current task with a newly woken task if needed:
8730	*/
8731	static void check_preempt_wakeup_fair(struct rq rq, struct* task_struct p, int* wake_flags)
8732	{
8733	struct task_struct *donor = rq->donor;
8734	struct sched_entity se = &donor->se, pse = &p->se;
8735	struct cfs_rq *cfs_rq = task_cfs_rq(p: donor);
8736	int cse_is_idle, pse_is_idle;
8737	bool do_preempt_short = false;
8738
8739	if (unlikely(se == pse))
8740	return;
8741
8742	/*
8743	* This is possible from callers such as attach_tasks(), in which we
8744	* unconditionally wakeup_preempt() after an enqueue (which may have
8745	* lead to a throttle). This both saves work and prevents false
8746	* next-buddy nomination below.
8747	*/
8748	if (task_is_throttled(p))
8749	return;
8750
8751	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) {
8752	set_next_buddy(pse);
8753	}
8754
8755	/*
8756	* We can come here with TIF_NEED_RESCHED already set from new task
8757	* wake up path.
8758	*
8759	* Note: this also catches the edge-case of curr being in a throttled
8760	* group (e.g. via set_curr_task), since update_curr() (in the
8761	* enqueue of curr) will have resulted in resched being set. This
8762	* prevents us from potentially nominating it as a false LAST_BUDDY
8763	* below.
8764	*/
8765	if (test_tsk_need_resched(tsk: rq->curr))
8766	return;
8767
8768	if (!sched_feat(WAKEUP_PREEMPTION))
8769	return;
8770
8771	find_matching_se(se: &se, pse: &pse);
8772	WARN_ON_ONCE(!pse);
8773
8774	cse_is_idle = se_is_idle(se);
8775	pse_is_idle = se_is_idle(se: pse);
8776
8777	/*
8778	* Preempt an idle entity in favor of a non-idle entity (and don't preempt
8779	* in the inverse case).
8780	*/
8781	if (cse_is_idle && !pse_is_idle) {
8782	/*
8783	* When non-idle entity preempt an idle entity,
8784	* don't give idle entity slice protection.
8785	*/
8786	do_preempt_short = true;
8787	goto preempt;
8788	}
8789
8790	if (cse_is_idle != pse_is_idle)
8791	return;
8792
8793	/*
8794	* BATCH and IDLE tasks do not preempt others.
8795	*/
8796	if (unlikely(!normal_policy(p->policy)))
8797	return;
8798
8799	cfs_rq = cfs_rq_of(se);
8800	update_curr(cfs_rq);
8801	/*
8802	* If @p has a shorter slice than current and @p is eligible, override
8803	* current's slice protection in order to allow preemption.
8804	*/
8805	do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice);
8806
8807	/*
8808	* If @p has become the most eligible task, force preemption.
8809	*/
8810	if (__pick_eevdf(cfs_rq, protect: !do_preempt_short) == pse)
8811	goto preempt;
8812
8813	if (sched_feat(RUN_TO_PARITY) && do_preempt_short)
8814	update_protect_slice(cfs_rq, se);
8815
8816	return;
8817
8818	preempt:
8819	if (do_preempt_short)
8820	cancel_protect_slice(se);
8821
8822	resched_curr_lazy(rq);
8823	}
8824
8825	static struct task_struct pick_task_fair(struct* rq *rq)
8826	{
8827	struct sched_entity *se;
8828	struct cfs_rq *cfs_rq;
8829	struct task_struct *p;
8830	bool throttled;
8831
8832	again:
8833	cfs_rq = &rq->cfs;
8834	if (!cfs_rq->nr_queued)
8835	return NULL;
8836
8837	throttled = false;
8838
8839	do {
8840	/ Might not have done put_prev_entity() /
8841	if (cfs_rq->curr && cfs_rq->curr->on_rq)
8842	update_curr(cfs_rq);
8843
8844	throttled \|= check_cfs_rq_runtime(cfs_rq);
8845
8846	se = pick_next_entity(rq, cfs_rq);
8847	if (!se)
8848	goto again;
8849	cfs_rq = group_cfs_rq(grp: se);
8850	} while (cfs_rq);
8851
8852	p = task_of(se);
8853	if (unlikely(throttled))
8854	task_throttle_setup_work(p);
8855	return p;
8856	}
8857
/* Forward declarations: both are referenced before their definitions. */
static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
8860
8861	struct task_struct *
8862	pick_next_task_fair(struct rq rq, struct* task_struct prev, struct* rq_flags *rf)
8863	{
8864	struct sched_entity *se;
8865	struct task_struct *p;
8866	int new_tasks;
8867
8868	again:
8869	p = pick_task_fair(rq);
8870	if (!p)
8871	goto idle;
8872	se = &p->se;
8873
8874	#ifdef CONFIG_FAIR_GROUP_SCHED
8875	if (prev->sched_class != &fair_sched_class)
8876	goto simple;
8877
8878	__put_prev_set_next_dl_server(rq, prev, next: p);
8879
8880	/*
8881	* Because of the set_next_buddy() in dequeue_task_fair() it is rather
8882	* likely that a next task is from the same cgroup as the current.
8883	*
8884	* Therefore attempt to avoid putting and setting the entire cgroup
8885	* hierarchy, only change the part that actually changes.
8886	*
8887	* Since we haven't yet done put_prev_entity and if the selected task
8888	* is a different task than we started out with, try and touch the
8889	* least amount of cfs_rqs.
8890	*/
8891	if (prev != p) {
8892	struct sched_entity *pse = &prev->se;
8893	struct cfs_rq *cfs_rq;
8894
8895	while (!(cfs_rq = is_same_group(se, pse))) {
8896	int se_depth = se->depth;
8897	int pse_depth = pse->depth;
8898
8899	if (se_depth <= pse_depth) {
8900	put_prev_entity(cfs_rq: cfs_rq_of(se: pse), prev: pse);
8901	pse = parent_entity(se: pse);
8902	}
8903	if (se_depth >= pse_depth) {
8904	set_next_entity(cfs_rq: cfs_rq_of(se), se);
8905	se = parent_entity(se);
8906	}
8907	}
8908
8909	put_prev_entity(cfs_rq, prev: pse);
8910	set_next_entity(cfs_rq, se);
8911
8912	__set_next_task_fair(rq, p, first: true);
8913	}
8914
8915	return p;
8916
8917	simple:
8918	#endif /* CONFIG_FAIR_GROUP_SCHED */
8919	put_prev_set_next_task(rq, prev, next: p);
8920	return p;
8921
8922	idle:
8923	if (!rf)
8924	return NULL;
8925
8926	new_tasks = sched_balance_newidle(this_rq: rq, rf);
8927
8928	/*
8929	* Because sched_balance_newidle() releases (and re-acquires) rq->lock, it is
8930	* possible for any higher priority task to appear. In that case we
8931	* must re-start the pick_next_entity() loop.
8932	*/
8933	if (new_tasks < `0`)
8934	return RETRY_TASK;
8935
8936	if (new_tasks > `0`)
8937	goto again;
8938
8939	/*
8940	* rq is about to be idle, check if we need to update the
8941	* lost_idle_time of clock_pelt
8942	*/
8943	update_idle_rq_clock_pelt(rq);
8944
8945	return NULL;
8946	}
8947
/*
 * Two-argument wrapper around pick_next_task_fair() that passes a NULL
 * rq_flags pointer, so no newidle balance is attempted when @rq has no fair
 * task to run.
 */
static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev)
{
	return pick_next_task_fair(rq, prev, NULL);
}
8952
8953	static struct task_struct fair_server_pick_task(struct* sched_dl_entity *dl_se)
8954	{
8955	return pick_task_fair(rq: dl_se->rq);
8956	}
8957
8958	void fair_server_init(struct rq *rq)
8959	{
8960	struct sched_dl_entity *dl_se = &rq->fair_server;
8961
8962	init_dl_entity(dl_se);
8963
8964	dl_server_init(dl_se, rq, pick_task: fair_server_pick_task);
8965	}
8966
8967	/*
8968	* Account for a descheduled task:
8969	*/
8970	static void put_prev_task_fair(struct rq rq, struct* task_struct prev, struct* task_struct *next)
8971	{
8972	struct sched_entity *se = &prev->se;
8973	struct cfs_rq *cfs_rq;
8974
8975	for_each_sched_entity(se) {
8976	cfs_rq = cfs_rq_of(se);
8977	put_prev_entity(cfs_rq, prev: se);
8978	}
8979	}
8980
8981	/*
8982	* sched_yield() is very simple
8983	*/
8984	static void yield_task_fair(struct rq *rq)
8985	{
8986	struct task_struct *curr = rq->curr;
8987	struct cfs_rq *cfs_rq = task_cfs_rq(p: curr);
8988	struct sched_entity *se = &curr->se;
8989
8990	/*
8991	* Are we the only task in the tree?
8992	*/
8993	if (unlikely(rq->nr_running == `1`))
8994	return;
8995
8996	clear_buddies(cfs_rq, se);
8997
8998	update_rq_clock(rq);
8999	/*
9000	* Update run-time statistics of the 'current'.
9001	*/
9002	update_curr(cfs_rq);
9003	/*
9004	* Tell update_rq_clock() that we've just updated,
9005	* so we don't do microscopic update in schedule()
9006	* and double the fastpath cost.
9007	*/
9008	rq_clock_skip_update(rq);
9009
9010	se->deadline += calc_delta_fair(delta: se->slice, se);
9011	}
9012
9013	static bool yield_to_task_fair(struct rq rq, struct* task_struct *p)
9014	{
9015	struct sched_entity *se = &p->se;
9016
9017	/ !se->on_rq also covers throttled task /
9018	if (!se->on_rq)
9019	return false;
9020
9021	/ Tell the scheduler that we'd really like se to run next. /
9022	set_next_buddy(se);
9023
9024	yield_task_fair(rq);
9025
9026	return true;
9027	}
9028
9029	/**************************************************
9030	* Fair scheduling class load-balancing methods.
9031	*
9032	* BASICS
9033	*
9034	* The purpose of load-balancing is to achieve the same basic fairness the
9035	* per-CPU scheduler provides, namely provide a proportional amount of compute
9036	* time to each task. This is expressed in the following equation:
9037	*
9038	*   W_i,n/C_i == W_j,n/C_j for all i,j                               (1)
9039	*
9040	* Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
9041	* W_i,0 is defined as:
9042	*
9043	* W_i,0 = \Sum_j w_i,j (2)
9044	*
9045	* Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
9046	* is derived from the nice value as per sched_prio_to_weight[].
9047	*
9048	* The weight average is an exponential decay average of the instantaneous
9049	* weight:
9050	*
9051	* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
9052	*
9053	* C_i is the compute capacity of CPU i, typically it is the
9054	* fraction of 'recent' time available for SCHED_OTHER task execution. But it
9055	* can also include other factors [XXX].
9056	*
9057	* To achieve this balance we define a measure of imbalance which follows
9058	* directly from (1):
9059	*
9060	* imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
9061	*
9062	* We then move tasks around to minimize the imbalance. In the continuous
9063	* function space it is obvious this converges, in the discrete case we get
9064	* a few fun cases generally called infeasible weight scenarios.
9065	*
9066	* [XXX expand on:
9067	* - infeasible weights;
9068	* - local vs global optima in the discrete case. ]
9069	*
9070	*
9071	* SCHED DOMAINS
9072	*
9073	* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
9074	* for all i,j solution, we create a tree of CPUs that follows the hardware
9075	* topology where each level pairs two lower groups (or better). This results
9076	* in O(log n) layers. Furthermore we reduce the number of CPUs going up the
9077	* tree to only the first of the previous level and we decrease the frequency
9078	* of load-balance at each level inversely proportional to the number of CPUs in
9079	* the groups.
9080	*
9081	* This yields:
9082	*
9083	*   log_2 n       1     n
9084	*   \Sum       { --- * --- * 2^i } = O(n)                            (5)
9085	*   i = 0        2^i   2^i
9086	*                             `- size of each group
9087	*    |            |     `- number of CPUs doing load-balance
9088	*    |            `- freq
9089	*    `- sum over all levels
9090	*
9091	* Coupled with a limit on how many tasks we can migrate every balance pass,
9092	* this makes (5) the runtime complexity of the balancer.
9093	*
9094	* An important property here is that each CPU is still (indirectly) connected
9095	* to every other CPU in at most O(log n) steps:
9096	*
9097	* The adjacency matrix of the resulting graph is given by:
9098	*
9099	*           log_2 n
9100	*   A_i,j = \Union   (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)    (6)
9101	*            k = 0
9102	*
9103	* And you'll find that:
9104	*
9105	* A^(log_2 n)_i,j != 0 for all i,j (7)
9106	*
9107	* Showing there's indeed a path between every CPU in at most O(log n) steps.
9108	* The task movement gives a factor of O(m), giving a convergence complexity
9109	* of:
9110	*
9111	* O(nm log n), n := nr_cpus, m := nr_tasks (8)
9112	*
9113	*
9114	* WORK CONSERVING
9115	*
9116	* In order to avoid CPUs going idle while there's still work to do, new idle
9117	* balancing is more aggressive and has the newly idle CPU iterate up the domain
9118	* tree itself instead of relying on other CPUs to bring it work.
9119	*
9120	* This adds some complexity to both (5) and (8) but it reduces the total idle
9121	* time.
9122	*
9123	* [XXX more?]
9124	*
9125	*
9126	* CGROUPS
9127	*
9128	* Cgroups make a horror show out of (2), instead of a simple sum we get:
9129	*
9130	*                                s_k,i
9131	*   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
9132	*                                 S_k
9133	*
9134	* Where
9135	*
9136	* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
9137	*
9138	* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
9139	*
9140	* The big problem is S_k, it's a global sum needed to compute a local (W_i)
9141	* property.
9142	*
9143	* [XXX write more on how we solve this.. _after_ merging pjt's patches that
9144	* rewrite all of this once again.]
9145	*/
9146
9147	static unsigned long __read_mostly max_load_balance_interval = HZ/`10`;
9148
/*
 * Classification stored in lb_env::fbq_type.
 * NOTE(review): the meaning of each value is defined by its users, which are
 * outside this excerpt.
 */
enum fbq_type { regular, remote, all };
9150
/*
 * 'group_type' describes the group of CPUs at the moment of load balancing.
 *
 * The enum is ordered by pulling priority, with the group with lowest priority
 * first so the group_type can simply be compared when selecting the busiest
 * group. See update_sd_pick_busiest().
 */
enum group_type {
	/* The group has spare capacity that can be used to run more tasks.  */
	group_has_spare = 0,
	/*
	 * The group is fully used and the tasks don't compete for more CPU
	 * cycles. Nevertheless, some tasks might wait before running.
	 */
	group_fully_busy,
	/*
	 * One task doesn't fit with CPU's capacity and must be migrated to a
	 * more powerful CPU.
	 */
	group_misfit_task,
	/*
	 * Balance SMT group that's fully busy. Can benefit from migration
	 * a task on SMT with busy sibling to another CPU on idle core.
	 */
	group_smt_balance,
	/*
	 * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
	 * and the task should be migrated to it instead of running on the
	 * current CPU.
	 */
	group_asym_packing,
	/*
	 * The tasks' affinity constraints previously prevented the scheduler
	 * from balancing the load across the system.
	 */
	group_imbalanced,
	/*
	 * The CPU is overloaded and can't provide expected CPU cycles to all
	 * tasks.
	 */
	group_overloaded
};
9193
/*
 * What quantity lb_env::imbalance is expressed in; detach_tasks() switches on
 * this to decide how much each detached task reduces the imbalance.
 */
enum migration_type {
	/* imbalance is reduced by task_h_load() of each detached task */
	migrate_load = 0,
	/* imbalance is reduced by task_util_est() of each detached task */
	migrate_util,
	/* imbalance is a task count, decremented per detached task */
	migrate_task,
	/* only tasks that do not fit the source CPU are detached */
	migrate_misfit
};
9200
/* Bits of lb_env::flags. */
/* Cleared by can_migrate_task() once a task that may run on dst_cpu is seen */
#define LBF_ALL_PINNED	0x01
/* Set by detach_tasks() when the scan went past lb_env::loop_break */
#define LBF_NEED_BREAK	0x02
/* Set by can_migrate_task() when lb_env::new_dst_cpu has been recorded */
#define LBF_DST_PINNED	0x04
/* Set by can_migrate_task() when a task's cpus_ptr excludes dst_cpu */
#define LBF_SOME_PINNED	0x08
/* The balance operation is an active balance */
#define LBF_ACTIVE_LB	0x10
9206
9207	struct lb_env {
9208	struct sched_domain *sd;
9209
9210	struct rq *src_rq;
9211	int src_cpu;
9212
9213	int dst_cpu;
9214	struct rq *dst_rq;
9215
9216	struct cpumask *dst_grpmask;
9217	int new_dst_cpu;
9218	enum cpu_idle_type idle;
9219	long imbalance;
9220	/ The set of CPUs under consideration for load-balancing /
9221	struct cpumask *cpus;
9222
9223	unsigned int flags;
9224
9225	unsigned int loop;
9226	unsigned int loop_break;
9227	unsigned int loop_max;
9228
9229	enum fbq_type fbq_type;
9230	enum migration_type migration_type;
9231	struct list_head tasks;
9232	};
9233
9234	/*
9235	* Is this task likely cache-hot:
9236	*/
9237	static int task_hot(struct task_struct p, struct* lb_env *env)
9238	{
9239	s64 delta;
9240
9241	lockdep_assert_rq_held(rq: env->src_rq);
9242
9243	if (p->sched_class != &fair_sched_class)
9244	return `0`;
9245
9246	if (unlikely(task_has_idle_policy(p)))
9247	return `0`;
9248
9249	/ SMT siblings share cache /
9250	if (env->sd->flags & SD_SHARE_CPUCAPACITY)
9251	return `0`;
9252
9253	/*
9254	* Buddy candidates are cache hot:
9255	*/
9256	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
9257	(&p->se == cfs_rq_of(se: &p->se)->next))
9258	return `1`;
9259
9260	if (sysctl_sched_migration_cost == -`1`)
9261	return `1`;
9262
9263	/*
9264	* Don't migrate task if the task's cookie does not match
9265	* with the destination CPU's core cookie.
9266	*/
9267	if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
9268	return `1`;
9269
9270	if (sysctl_sched_migration_cost == `0`)
9271	return `0`;
9272
9273	delta = rq_clock_task(rq: env->src_rq) - p->se.exec_start;
9274
9275	return delta < (s64)sysctl_sched_migration_cost;
9276	}
9277
9278	#ifdef CONFIG_NUMA_BALANCING
9279	/*
9280	* Returns a positive value, if task migration degrades locality.
9281	* Returns 0, if task migration is not affected by locality.
9282	* Returns a negative value, if task migration improves locality i.e migration preferred.
9283	*/
9284	static long migrate_degrades_locality(struct task_struct p, struct* lb_env *env)
9285	{
9286	struct numa_group *numa_group = rcu_dereference(p->numa_group);
9287	unsigned long src_weight, dst_weight;
9288	int src_nid, dst_nid, dist;
9289
9290	if (!static_branch_likely(&sched_numa_balancing))
9291	return `0`;
9292
9293	if (!p->numa_faults \|\| !(env->sd->flags & SD_NUMA))
9294	return `0`;
9295
9296	src_nid = cpu_to_node(env->src_cpu);
9297	dst_nid = cpu_to_node(env->dst_cpu);
9298
9299	if (src_nid == dst_nid)
9300	return `0`;
9301
9302	/ Migrating away from the preferred node is always bad. /
9303	if (src_nid == p->numa_preferred_nid) {
9304	if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
9305	return `1`;
9306	else
9307	return `0`;
9308	}
9309
9310	/ Encourage migration to the preferred node. /
9311	if (dst_nid == p->numa_preferred_nid)
9312	return -`1`;
9313
9314	/ Leaving a core idle is often worse than degrading locality. /
9315	if (env->idle == CPU_IDLE)
9316	return `0`;
9317
9318	dist = node_distance(src_nid, dst_nid);
9319	if (numa_group) {
9320	src_weight = group_weight(p, src_nid, dist);
9321	dst_weight = group_weight(p, dst_nid, dist);
9322	} else {
9323	src_weight = task_weight(p, src_nid, dist);
9324	dst_weight = task_weight(p, dst_nid, dist);
9325	}
9326
9327	return src_weight - dst_weight;
9328	}
9329
9330	#else /* !CONFIG_NUMA_BALANCING: */
/* Without NUMA balancing, migration never affects locality. */
static inline long migrate_degrades_locality(struct task_struct *p,
					     struct lb_env *env)
{
	return 0;
}
9336	#endif /* !CONFIG_NUMA_BALANCING */
9337
9338	/*
9339	* Check whether the task is ineligible on the destination cpu
9340	*
9341	* When the PLACE_LAG scheduling feature is enabled and
9342	* dst_cfs_rq->nr_queued is greater than 1, if the task
9343	* is ineligible, it will also be ineligible when
9344	* it is migrated to the destination cpu.
9345	*/
9346	static inline int task_is_ineligible_on_dst_cpu(struct task_struct p, int* dest_cpu)
9347	{
9348	struct cfs_rq *dst_cfs_rq;
9349
9350	#ifdef CONFIG_FAIR_GROUP_SCHED
9351	dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu];
9352	#else
9353	dst_cfs_rq = &cpu_rq(dest_cpu)->cfs;
9354	#endif
9355	if (sched_feat(PLACE_LAG) && dst_cfs_rq->nr_queued &&
9356	!entity_eligible(cfs_rq: task_cfs_rq(p), se: &p->se))
9357	return `1`;
9358
9359	return `0`;
9360	}
9361
9362	/*
9363	* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
9364	*/
9365	static
9366	int can_migrate_task(struct task_struct p, struct* lb_env *env)
9367	{
9368	long degrades, hot;
9369
9370	lockdep_assert_rq_held(rq: env->src_rq);
9371	if (p->sched_task_hot)
9372	p->sched_task_hot = `0`;
9373
9374	/*
9375	* We do not migrate tasks that are:
9376	* 1) delayed dequeued unless we migrate load, or
9377	* 2) target cfs_rq is in throttled hierarchy, or
9378	* 3) cannot be migrated to this CPU due to cpus_ptr, or
9379	* 4) running (obviously), or
9380	* 5) are cache-hot on their current CPU, or
9381	* 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
9382	*/
9383	if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
9384	return `0`;
9385
9386	if (lb_throttled_hierarchy(p, dst_cpu: env->dst_cpu))
9387	return `0`;
9388
9389	/*
9390	* We want to prioritize the migration of eligible tasks.
9391	* For ineligible tasks we soft-limit them and only allow
9392	* them to migrate when nr_balance_failed is non-zero to
9393	* avoid load-balancing trying very hard to balance the load.
9394	*/
9395	if (!env->sd->nr_balance_failed &&
9396	task_is_ineligible_on_dst_cpu(p, dest_cpu: env->dst_cpu))
9397	return `0`;
9398
9399	/ Disregard percpu kthreads; they are where they need to be. /
9400	if (kthread_is_per_cpu(k: p))
9401	return `0`;
9402
9403	if (task_is_blocked(p))
9404	return `0`;
9405
9406	if (!cpumask_test_cpu(cpu: env->dst_cpu, cpumask: p->cpus_ptr)) {
9407	int cpu;
9408
9409	schedstat_inc(p->stats.nr_failed_migrations_affine);
9410
9411	env->flags \|= LBF_SOME_PINNED;
9412
9413	/*
9414	* Remember if this task can be migrated to any other CPU in
9415	* our sched_group. We may want to revisit it if we couldn't
9416	* meet load balance goals by pulling other tasks on src_cpu.
9417	*
9418	* Avoid computing new_dst_cpu
9419	* - for NEWLY_IDLE
9420	* - if we have already computed one in current iteration
9421	* - if it's an active balance
9422	*/
9423	if (env->idle == CPU_NEWLY_IDLE \|\|
9424	env->flags & (LBF_DST_PINNED \| LBF_ACTIVE_LB))
9425	return `0`;
9426
9427	/ Prevent to re-select dst_cpu via env's CPUs: /
9428	cpu = cpumask_first_and_and(srcp1: env->dst_grpmask, srcp2: env->cpus, srcp3: p->cpus_ptr);
9429
9430	if (cpu < nr_cpu_ids) {
9431	env->flags \|= LBF_DST_PINNED;
9432	env->new_dst_cpu = cpu;
9433	}
9434
9435	return `0`;
9436	}
9437
9438	/ Record that we found at least one task that could run on dst_cpu /
9439	env->flags &= ~LBF_ALL_PINNED;
9440
9441	if (task_on_cpu(rq: env->src_rq, p) \|\|
9442	task_current_donor(rq: env->src_rq, p)) {
9443	schedstat_inc(p->stats.nr_failed_migrations_running);
9444	return `0`;
9445	}
9446
9447	/*
9448	* Aggressive migration if:
9449	* 1) active balance
9450	* 2) destination numa is preferred
9451	* 3) task is cache cold, or
9452	* 4) too many balance attempts have failed.
9453	*/
9454	if (env->flags & LBF_ACTIVE_LB)
9455	return `1`;
9456
9457	degrades = migrate_degrades_locality(p, env);
9458	if (!degrades)
9459	hot = task_hot(p, env);
9460	else
9461	hot = degrades > `0`;
9462
9463	if (!hot \|\| env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
9464	if (hot)
9465	p->sched_task_hot = `1`;
9466	return `1`;
9467	}
9468
9469	schedstat_inc(p->stats.nr_failed_migrations_hot);
9470	return `0`;
9471	}
9472
9473	/*
9474	* detach_task() -- detach the task for the migration specified in env
9475	*/
9476	static void detach_task(struct task_struct p, struct* lb_env *env)
9477	{
9478	lockdep_assert_rq_held(rq: env->src_rq);
9479
9480	if (p->sched_task_hot) {
9481	p->sched_task_hot = `0`;
9482	schedstat_inc(env->sd->lb_hot_gained[env->idle]);
9483	schedstat_inc(p->stats.nr_forced_migrations);
9484	}
9485
9486	WARN_ON(task_current(env->src_rq, p));
9487	WARN_ON(task_current_donor(env->src_rq, p));
9488
9489	deactivate_task(rq: env->src_rq, p, DEQUEUE_NOCLOCK);
9490	set_task_cpu(p, cpu: env->dst_cpu);
9491	}
9492
9493	/*
9494	* detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
9495	* part of active balancing operations within "domain".
9496	*
9497	* Returns a task if successful and NULL otherwise.
9498	*/
9499	static struct task_struct detach_one_task(struct* lb_env *env)
9500	{
9501	struct task_struct *p;
9502
9503	lockdep_assert_rq_held(rq: env->src_rq);
9504
9505	list_for_each_entry_reverse(p,
9506	&env->src_rq->cfs_tasks, se.group_node) {
9507	if (!can_migrate_task(p, env))
9508	continue;
9509
9510	detach_task(p, env);
9511
9512	/*
9513	* Right now, this is only the second place where
9514	* lb_gained[env->idle] is updated (other is detach_tasks)
9515	* so we can safely collect stats here rather than
9516	* inside detach_tasks().
9517	*/
9518	schedstat_inc(env->sd->lb_gained[env->idle]);
9519	return p;
9520	}
9521	return NULL;
9522	}
9523
9524	/*
9525	* detach_tasks() -- tries to detach up to imbalance load/util/tasks from
9526	* busiest_rq, as part of a balancing operation within domain "sd".
9527	*
9528	* Returns number of detached tasks if successful and 0 otherwise.
9529	*/
9530	static int detach_tasks(struct lb_env *env)
9531	{
9532	struct list_head *tasks = &env->src_rq->cfs_tasks;
9533	unsigned long util, load;
9534	struct task_struct *p;
9535	int detached = `0`;
9536
9537	lockdep_assert_rq_held(rq: env->src_rq);
9538
9539	/*
9540	* Source run queue has been emptied by another CPU, clear
9541	* LBF_ALL_PINNED flag as we will not test any task.
9542	*/
9543	if (env->src_rq->nr_running <= `1`) {
9544	env->flags &= ~LBF_ALL_PINNED;
9545	return `0`;
9546	}
9547
9548	if (env->imbalance <= `0`)
9549	return `0`;
9550
9551	while (!list_empty(head: tasks)) {
9552	/*
9553	* We don't want to steal all, otherwise we may be treated likewise,
9554	* which could at worst lead to a livelock crash.
9555	*/
9556	if (env->idle && env->src_rq->nr_running <= `1`)
9557	break;
9558
9559	env->loop++;
9560	/ We've more or less seen every task there is, call it quits /
9561	if (env->loop > env->loop_max)
9562	break;
9563
9564	/ take a breather every nr_migrate tasks /
9565	if (env->loop > env->loop_break) {
9566	env->loop_break += SCHED_NR_MIGRATE_BREAK;
9567	env->flags \|= LBF_NEED_BREAK;
9568	break;
9569	}
9570
9571	p = list_last_entry(tasks, struct task_struct, se.group_node);
9572
9573	if (!can_migrate_task(p, env))
9574	goto next;
9575
9576	switch (env->migration_type) {
9577	case migrate_load:
9578	/*
9579	* Depending of the number of CPUs and tasks and the
9580	* cgroup hierarchy, task_h_load() can return a null
9581	* value. Make sure that env->imbalance decreases
9582	* otherwise detach_tasks() will stop only after
9583	* detaching up to loop_max tasks.
9584	*/
9585	load = max_t(unsigned long, task_h_load(p), `1`);
9586
9587	if (sched_feat(LB_MIN) &&
9588	load < `16` && !env->sd->nr_balance_failed)
9589	goto next;
9590
9591	/*
9592	* Make sure that we don't migrate too much load.
9593	* Nevertheless, let relax the constraint if
9594	* scheduler fails to find a good waiting task to
9595	* migrate.
9596	*/
9597	if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
9598	goto next;
9599
9600	env->imbalance -= load;
9601	break;
9602
9603	case migrate_util:
9604	util = task_util_est(p);
9605
9606	if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance)
9607	goto next;
9608
9609	env->imbalance -= util;
9610	break;
9611
9612	case migrate_task:
9613	env->imbalance--;
9614	break;
9615
9616	case migrate_misfit:
9617	/ This is not a misfit task /
9618	if (task_fits_cpu(p, cpu: env->src_cpu))
9619	goto next;
9620
9621	env->imbalance = `0`;
9622	break;
9623	}
9624
9625	detach_task(p, env);
9626	list_add(new: &p->se.group_node, head: &env->tasks);
9627
9628	detached++;
9629
9630	#ifdef CONFIG_PREEMPTION
9631	/*
9632	* NEWIDLE balancing is a source of latency, so preemptible
9633	* kernels will stop after the first task is detached to minimize
9634	* the critical section.
9635	*/
9636	if (env->idle == CPU_NEWLY_IDLE)
9637	break;
9638	#endif
9639
9640	/*
9641	* We only want to steal up to the prescribed amount of
9642	* load/util/tasks.
9643	*/
9644	if (env->imbalance <= `0`)
9645	break;
9646
9647	continue;
9648	next:
9649	if (p->sched_task_hot)
9650	schedstat_inc(p->stats.nr_failed_migrations_hot);
9651
9652	list_move(list: &p->se.group_node, head: tasks);
9653	}
9654
9655	/*
9656	* Right now, this is one of only two places we collect this stat
9657	* so we can safely collect detach_one_task() stats here rather
9658	* than inside detach_one_task().
9659	*/
9660	schedstat_add(env->sd->lb_gained[env->idle], detached);
9661
9662	return detached;
9663	}
9664
9665	/*
9666	* attach_task() -- attach the task detached by detach_task() to its new rq.
9667	*/
9668	static void attach_task(struct rq rq, struct* task_struct *p)
9669	{
9670	lockdep_assert_rq_held(rq);
9671
9672	WARN_ON_ONCE(task_rq(p) != rq);
9673	activate_task(rq, p, ENQUEUE_NOCLOCK);
9674	wakeup_preempt(rq, p, flags: `0`);
9675	}
9676
9677	/*
9678	* attach_one_task() -- attaches the task returned from detach_one_task() to
9679	* its new rq.
9680	*/
9681	static void attach_one_task(struct rq rq, struct* task_struct *p)
9682	{
9683	struct rq_flags rf;
9684
9685	rq_lock(rq, rf: &rf);
9686	update_rq_clock(rq);
9687	attach_task(rq, p);
9688	rq_unlock(rq, rf: &rf);
9689	}
9690
9691	/*
9692	* attach_tasks() -- attaches all tasks detached by detach_tasks() to their
9693	* new rq.
9694	*/
9695	static void attach_tasks(struct lb_env *env)
9696	{
9697	struct list_head *tasks = &env->tasks;
9698	struct task_struct *p;
9699	struct rq_flags rf;
9700
9701	rq_lock(rq: env->dst_rq, rf: &rf);
9702	update_rq_clock(rq: env->dst_rq);
9703
9704	while (!list_empty(head: tasks)) {
9705	p = list_first_entry(tasks, struct task_struct, se.group_node);
9706	list_del_init(entry: &p->se.group_node);
9707
9708	attach_task(rq: env->dst_rq, p);
9709	}
9710
9711	rq_unlock(rq: env->dst_rq, rf: &rf);
9712	}
9713
9714	#ifdef CONFIG_NO_HZ_COMMON
9715	static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
9716	{
9717	if (cfs_rq->avg.load_avg)
9718	return true;
9719
9720	if (cfs_rq->avg.util_avg)
9721	return true;
9722
9723	return false;
9724	}
9725
9726	static inline bool others_have_blocked(struct rq *rq)
9727	{
9728	if (cpu_util_rt(rq))
9729	return true;
9730
9731	if (cpu_util_dl(rq))
9732	return true;
9733
9734	if (hw_load_avg(rq))
9735	return true;
9736
9737	if (cpu_util_irq(rq))
9738	return true;
9739
9740	return false;
9741	}
9742
9743	static inline void update_blocked_load_tick(struct rq *rq)
9744	{
9745	WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
9746	}
9747
9748	static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
9749	{
9750	if (!has_blocked)
9751	rq->has_blocked_load = `0`;
9752	}
9753	#else /* !CONFIG_NO_HZ_COMMON: */
9754	static inline bool cfs_rq_has_blocked(struct cfs_rq cfs_rq) { return* false; }
9755	static inline bool others_have_blocked(struct rq rq) { return* false; }
9756	static inline void update_blocked_load_tick(struct rq *rq) {}
9757	static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
9758	#endif /* !CONFIG_NO_HZ_COMMON */
9759
9760	static bool __update_blocked_others(struct rq rq, bool done)
9761	{
9762	bool updated;
9763
9764	/*
9765	* update_load_avg() can call cpufreq_update_util(). Make sure that RT,
9766	* DL and IRQ signals have been updated before updating CFS.
9767	*/
9768	updated = update_other_load_avgs(rq);
9769
9770	if (others_have_blocked(rq))
9771	*done = false;
9772
9773	return updated;
9774	}
9775
9776	#ifdef CONFIG_FAIR_GROUP_SCHED
9777
9778	static bool __update_blocked_fair(struct rq rq, bool done)
9779	{
9780	struct cfs_rq cfs_rq, pos;
9781	bool decayed = false;
9782	int cpu = cpu_of(rq);
9783
9784	/*
9785	* Iterates the task_group tree in a bottom up fashion, see
9786	* list_add_leaf_cfs_rq() for details.
9787	*/
9788	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
9789	struct sched_entity *se;
9790
9791	if (update_cfs_rq_load_avg(now: cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
9792	update_tg_load_avg(cfs_rq);
9793
9794	if (cfs_rq->nr_queued == `0`)
9795	update_idle_cfs_rq_clock_pelt(cfs_rq);
9796
9797	if (cfs_rq == &rq->cfs)
9798	decayed = true;
9799	}
9800
9801	/ Propagate pending load changes to the parent, if any: /
9802	se = cfs_rq->tg->se[cpu];
9803	if (se && !skip_blocked_update(se))
9804	update_load_avg(cfs_rq: cfs_rq_of(se), se, UPDATE_TG);
9805
9806	/*
9807	* There can be a lot of idle CPU cgroups. Don't let fully
9808	* decayed cfs_rqs linger on the list.
9809	*/
9810	if (cfs_rq_is_decayed(cfs_rq))
9811	list_del_leaf_cfs_rq(cfs_rq);
9812
9813	/ Don't need periodic decay once load/util_avg are null /
9814	if (cfs_rq_has_blocked(cfs_rq))
9815	*done = false;
9816	}
9817
9818	return decayed;
9819	}
9820
9821	/*
9822	* Compute the hierarchical load factor for cfs_rq and all its ascendants.
9823	* This needs to be done in a top-down fashion because the load of a child
9824	* group is a fraction of its parents load.
9825	*/
9826	static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
9827	{
9828	struct rq *rq = rq_of(cfs_rq);
9829	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
9830	unsigned long now = jiffies;
9831	unsigned long load;
9832
9833	if (cfs_rq->last_h_load_update == now)
9834	return;
9835
9836	WRITE_ONCE(cfs_rq->h_load_next, NULL);
9837	for_each_sched_entity(se) {
9838	cfs_rq = cfs_rq_of(se);
9839	WRITE_ONCE(cfs_rq->h_load_next, se);
9840	if (cfs_rq->last_h_load_update == now)
9841	break;
9842	}
9843
9844	if (!se) {
9845	cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
9846	cfs_rq->last_h_load_update = now;
9847	}
9848
9849	while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
9850	load = cfs_rq->h_load;
9851	load = div64_ul(load * se->avg.load_avg,
9852	cfs_rq_load_avg(cfs_rq) + `1`);
9853	cfs_rq = group_cfs_rq(grp: se);
9854	cfs_rq->h_load = load;
9855	cfs_rq->last_h_load_update = now;
9856	}
9857	}
9858
9859	static unsigned long task_h_load(struct task_struct *p)
9860	{
9861	struct cfs_rq *cfs_rq = task_cfs_rq(p);
9862
9863	update_cfs_rq_h_load(cfs_rq);
9864	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
9865	cfs_rq_load_avg(cfs_rq) + `1`);
9866	}
9867	#else /* !CONFIG_FAIR_GROUP_SCHED: */
9868	static bool __update_blocked_fair(struct rq rq, bool done)
9869	{
9870	struct cfs_rq *cfs_rq = &rq->cfs;
9871	bool decayed;
9872
9873	decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
9874	if (cfs_rq_has_blocked(cfs_rq))
9875	*done = false;
9876
9877	return decayed;
9878	}
9879
9880	static unsigned long task_h_load(struct task_struct *p)
9881	{
9882	return p->se.avg.load_avg;
9883	}
9884	#endif /* !CONFIG_FAIR_GROUP_SCHED */
9885
9886	static void sched_balance_update_blocked_averages(int cpu)
9887	{
9888	bool decayed = false, done = true;
9889	struct rq *rq = cpu_rq(cpu);
9890	struct rq_flags rf;
9891
9892	rq_lock_irqsave(rq, rf: &rf);
9893	update_blocked_load_tick(rq);
9894	update_rq_clock(rq);
9895
9896	decayed \|= __update_blocked_others(rq, done: &done);
9897	decayed \|= __update_blocked_fair(rq, done: &done);
9898
9899	update_blocked_load_status(rq, has_blocked: !done);
9900	if (decayed)
9901	cpufreq_update_util(rq, flags: `0`);
9902	rq_unlock_irqrestore(rq, rf: &rf);
9903	}
9904
9905	/******* Helpers for sched_balance_find_src_group *********************/
9906
9907	/*
9908	* sg_lb_stats - stats of a sched_group required for load-balancing:
9909	*/
9910	struct sg_lb_stats {
9911	unsigned long avg_load; / Avg load over the CPUs of the group /
9912	unsigned long group_load; / Total load over the CPUs of the group /
9913	unsigned long group_capacity; / Capacity over the CPUs of the group /
9914	unsigned long group_util; / Total utilization over the CPUs of the group /
9915	unsigned long group_runnable; / Total runnable time over the CPUs of the group /
9916	unsigned int sum_nr_running; / Nr of all tasks running in the group /
9917	unsigned int sum_h_nr_running; / Nr of CFS tasks running in the group /
9918	unsigned int idle_cpus; / Nr of idle CPUs in the group /
9919	unsigned int group_weight;
9920	enum group_type group_type;
9921	unsigned int group_asym_packing; / Tasks should be moved to preferred CPU /
9922	unsigned int group_smt_balance; / Task on busy SMT be moved /
9923	unsigned long group_misfit_task_load; / A CPU has a task too big for its capacity /
9924	#ifdef CONFIG_NUMA_BALANCING
9925	unsigned int nr_numa_running;
9926	unsigned int nr_preferred_running;
9927	#endif
9928	};
9929
9930	/*
9931	* sd_lb_stats - stats of a sched_domain required for load-balancing:
9932	*/
9933	struct sd_lb_stats {
9934	struct sched_group busiest; /* Busiest group in this sd /
9935	struct sched_group local; /* Local group in this sd /
9936	unsigned long total_load; / Total load of all groups in sd /
9937	unsigned long total_capacity; / Total capacity of all groups in sd /
9938	unsigned long avg_load; / Average load across all groups in sd /
9939	unsigned int prefer_sibling; / Tasks should go to sibling first /
9940
9941	struct sg_lb_stats busiest_stat; / Statistics of the busiest group /
9942	struct sg_lb_stats local_stat; / Statistics of the local group /
9943	};
9944
9945	static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
9946	{
9947	/*
9948	* Skimp on the clearing to avoid duplicate work. We can avoid clearing
9949	* local_stat because update_sg_lb_stats() does a full clear/assignment.
9950	* We must however set busiest_stat::group_type and
9951	* busiest_stat::idle_cpus to the worst busiest group because
9952	* update_sd_pick_busiest() reads these before assignment.
9953	*/
9954	sds = (struct* sd_lb_stats){
9955	.busiest = NULL,
9956	.local = NULL,
9957	.total_load = `0UL`,
9958	.total_capacity = `0UL`,
9959	.busiest_stat = {
9960	.idle_cpus = UINT_MAX,
9961	.group_type = group_has_spare,
9962	},
9963	};
9964	}
9965
/*
 * Capacity of @cpu left after subtracting RT and DL utilization and scaling
 * for IRQ pressure. Returns 1 (rather than 0) when IRQ or RT+DL utilization
 * alone reaches the CPU's actual capacity.
 */
static unsigned long scale_rt_capacity(int cpu)
{
	unsigned long max = get_actual_cpu_capacity(cpu);
	struct rq *rq = cpu_rq(cpu);
	unsigned long used, free;
	unsigned long irq;

	irq = cpu_util_irq(rq);

	if (unlikely(irq >= max))
		return 1;

	/*
	 * avg_rt.util_avg and avg_dl.util_avg track binary signals
	 * (running and not running) with weights 0 and 1024 respectively.
	 */
	used = cpu_util_rt(rq);
	used += cpu_util_dl(rq);

	if (unlikely(used >= max))
		return 1;

	free = max - used;

	return scale_irq_capacity(free, irq, max);
}
9992
9993	static void update_cpu_capacity(struct sched_domain sd, int* cpu)
9994	{
9995	unsigned long capacity = scale_rt_capacity(cpu);
9996	struct sched_group *sdg = sd->groups;
9997
9998	if (!capacity)
9999	capacity = `1`;
10000
10001	cpu_rq(cpu)->cpu_capacity = capacity;
10002	trace_sched_cpu_capacity_tp(cpu_rq(cpu));
10003
10004	sdg->sgc->capacity = capacity;
10005	sdg->sgc->min_capacity = capacity;
10006	sdg->sgc->max_capacity = capacity;
10007	}
10008
10009	void update_group_capacity(struct sched_domain sd, int* cpu)
10010	{
10011	struct sched_domain *child = sd->child;
10012	struct sched_group group, sdg = sd->groups;
10013	unsigned long capacity, min_capacity, max_capacity;
10014	unsigned long interval;
10015
10016	interval = msecs_to_jiffies(m: sd->balance_interval);
10017	interval = clamp(interval, `1UL`, max_load_balance_interval);
10018	sdg->sgc->next_update = jiffies + interval;
10019
10020	if (!child) {
10021	update_cpu_capacity(sd, cpu);
10022	return;
10023	}
10024
10025	capacity = `0`;
10026	min_capacity = ULONG_MAX;
10027	max_capacity = `0`;
10028
10029	if (child->flags & SD_NUMA) {
10030	/*
10031	* SD_NUMA domains cannot assume that child groups
10032	* span the current group.
10033	*/
10034
10035	for_each_cpu(cpu, sched_group_span(sdg)) {
10036	unsigned long cpu_cap = capacity_of(cpu);
10037
10038	capacity += cpu_cap;
10039	min_capacity = min(cpu_cap, min_capacity);
10040	max_capacity = max(cpu_cap, max_capacity);
10041	}
10042	} else {
10043	/*
10044	* !SD_NUMA domains can assume that child groups
10045	* span the current group.
10046	*/
10047
10048	group = child->groups;
10049	do {
10050	struct sched_group_capacity *sgc = group->sgc;
10051
10052	capacity += sgc->capacity;
10053	min_capacity = min(sgc->min_capacity, min_capacity);
10054	max_capacity = max(sgc->max_capacity, max_capacity);
10055	group = group->next;
10056	} while (group != child->groups);
10057	}
10058
10059	sdg->sgc->capacity = capacity;
10060	sdg->sgc->min_capacity = min_capacity;
10061	sdg->sgc->max_capacity = max_capacity;
10062	}
10063
10064	/*
10065	* Check whether the capacity of the rq has been noticeably reduced by side
10066	* activity. The imbalance_pct is used for the threshold.
10067	* Return true is the capacity is reduced
10068	*/
10069	static inline int
10070	check_cpu_capacity(struct rq rq, struct* sched_domain *sd)
10071	{
10072	return ((rq->cpu_capacity * sd->imbalance_pct) <
10073	(arch_scale_cpu_capacity(cpu: cpu_of(rq)) * `100`));
10074	}
10075
10076	/ Check if the rq has a misfit task /
10077	static inline bool check_misfit_status(struct rq *rq)
10078	{
10079	return rq->misfit_task_load;
10080	}
10081
10082	/*
10083	* Group imbalance indicates (and tries to solve) the problem where balancing
10084	* groups is inadequate due to ->cpus_ptr constraints.
10085	*
10086	* Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
10087	* cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
10088	* Something like:
10089	*
 *	{ 0 1 2 3 } { 4 5 6 7 }
 *	        *     * * *
10092	*
10093	* If we were to balance group-wise we'd place two tasks in the first group and
10094	* two tasks in the second group. Clearly this is undesired as it will overload
10095	* cpu 3 and leave one of the CPUs in the second group unused.
10096	*
10097	* The current solution to this issue is detecting the skew in the first group
10098	* by noticing the lower domain failed to reach balance and had difficulty
10099	* moving tasks due to affinity constraints.
10100	*
10101	* When this is so detected; this group becomes a candidate for busiest; see
10102	* update_sd_pick_busiest(). And calculate_imbalance() and
10103	* sched_balance_find_src_group() avoid some of the usual balance conditions to allow it
10104	* to create an effective group imbalance.
10105	*
10106	* This is a somewhat tricky proposition since the next run might not find the
10107	* group imbalance and decide the groups need to be balanced again. A most
10108	* subtle and fragile situation.
10109	*/
10110
10111	static inline int sg_imbalanced(struct sched_group *group)
10112	{
10113	return group->sgc->imbalance;
10114	}
10115
10116	/*
10117	* group_has_capacity returns true if the group has spare capacity that could
10118	* be used by some tasks.
10119	* We consider that a group has spare capacity if the number of task is
10120	* smaller than the number of CPUs or if the utilization is lower than the
10121	* available capacity for CFS tasks.
10122	* For the latter, we use a threshold to stabilize the state, to take into
10123	* account the variance of the tasks' load and to return true if the available
10124	* capacity in meaningful for the load balancer.
10125	* As an example, an available capacity of 1% can appear but it doesn't make
10126	* any benefit for the load balance.
10127	*/
10128	static inline bool
10129	group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
10130	{
10131	if (sgs->sum_nr_running < sgs->group_weight)
10132	return true;
10133
10134	if ((sgs->group_capacity * imbalance_pct) <
10135	(sgs->group_runnable * `100`))
10136	return false;
10137
10138	if ((sgs->group_capacity * `100`) >
10139	(sgs->group_util * imbalance_pct))
10140	return true;
10141
10142	return false;
10143	}
10144
10145	/*
10146	* group_is_overloaded returns true if the group has more tasks than it can
10147	* handle.
10148	* group_is_overloaded is not equals to !group_has_capacity because a group
10149	* with the exact right number of tasks, has no more spare capacity but is not
10150	* overloaded so both group_has_capacity and group_is_overloaded return
10151	* false.
10152	*/
10153	static inline bool
10154	group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
10155	{
10156	if (sgs->sum_nr_running <= sgs->group_weight)
10157	return false;
10158
10159	if ((sgs->group_capacity * `100`) <
10160	(sgs->group_util * imbalance_pct))
10161	return true;
10162
10163	if ((sgs->group_capacity * imbalance_pct) <
10164	(sgs->group_runnable * `100`))
10165	return true;
10166
10167	return false;
10168	}
10169
10170	static inline enum
10171	group_type group_classify(unsigned int imbalance_pct,
10172	struct sched_group *group,
10173	struct sg_lb_stats *sgs)
10174	{
10175	if (group_is_overloaded(imbalance_pct, sgs))
10176	return group_overloaded;
10177
10178	if (sg_imbalanced(group))
10179	return group_imbalanced;
10180
10181	if (sgs->group_asym_packing)
10182	return group_asym_packing;
10183
10184	if (sgs->group_smt_balance)
10185	return group_smt_balance;
10186
10187	if (sgs->group_misfit_task_load)
10188	return group_misfit_task;
10189
10190	if (!group_has_capacity(imbalance_pct, sgs))
10191	return group_fully_busy;
10192
10193	return group_has_spare;
10194	}
10195
10196	/**
10197	* sched_use_asym_prio - Check whether asym_packing priority must be used
10198	* @sd: The scheduling domain of the load balancing
10199	* @cpu: A CPU
10200	*
10201	* Always use CPU priority when balancing load between SMT siblings. When
10202	* balancing load between cores, it is not sufficient that @cpu is idle. Only
10203	* use CPU priority if the whole core is idle.
10204	*
10205	* Returns: True if the priority of @cpu must be followed. False otherwise.
10206	*/
10207	static bool sched_use_asym_prio(struct sched_domain sd, int* cpu)
10208	{
10209	if (!(sd->flags & SD_ASYM_PACKING))
10210	return false;
10211
10212	if (!sched_smt_active())
10213	return true;
10214
10215	return sd->flags & SD_SHARE_CPUCAPACITY \|\| is_core_idle(cpu);
10216	}
10217
10218	static inline bool sched_asym(struct sched_domain sd, int* dst_cpu, int src_cpu)
10219	{
10220	/*
10221	* First check if @dst_cpu can do asym_packing load balance. Only do it
10222	* if it has higher priority than @src_cpu.
10223	*/
10224	return sched_use_asym_prio(sd, cpu: dst_cpu) &&
10225	sched_asym_prefer(a: dst_cpu, b: src_cpu);
10226	}
10227
10228	/**
10229	* sched_group_asym - Check if the destination CPU can do asym_packing balance
10230	* @env: The load balancing environment
10231	* @sgs: Load-balancing statistics of the candidate busiest group
10232	* @group: The candidate busiest group
10233	*
10234	* @env::dst_cpu can do asym_packing if it has higher priority than the
10235	* preferred CPU of @group.
10236	*
10237	* Return: true if @env::dst_cpu can do with asym_packing load balance. False
10238	* otherwise.
10239	*/
10240	static inline bool
10241	sched_group_asym(struct lb_env env, struct* sg_lb_stats sgs, struct* sched_group *group)
10242	{
10243	/*
10244	* CPU priorities do not make sense for SMT cores with more than one
10245	* busy sibling.
10246	*/
10247	if ((group->flags & SD_SHARE_CPUCAPACITY) &&
10248	(sgs->group_weight - sgs->idle_cpus != `1`))
10249	return false;
10250
10251	return sched_asym(sd: env->sd, dst_cpu: env->dst_cpu, READ_ONCE(group->asym_prefer_cpu));
10252	}
10253
10254	/ One group has more than one SMT CPU while the other group does not /
10255	static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
10256	struct sched_group *sg2)
10257	{
10258	if (!sg1 \|\| !sg2)
10259	return false;
10260
10261	return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
10262	(sg2->flags & SD_SHARE_CPUCAPACITY);
10263	}
10264
10265	static inline bool smt_balance(struct lb_env env, struct* sg_lb_stats *sgs,
10266	struct sched_group *group)
10267	{
10268	if (!env->idle)
10269	return false;
10270
10271	/*
10272	* For SMT source group, it is better to move a task
10273	* to a CPU that doesn't have multiple tasks sharing its CPU capacity.
10274	* Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY
10275	* will not be on.
10276	*/
10277	if (group->flags & SD_SHARE_CPUCAPACITY &&
10278	sgs->sum_h_nr_running > `1`)
10279	return true;
10280
10281	return false;
10282	}
10283
10284	static inline long sibling_imbalance(struct lb_env *env,
10285	struct sd_lb_stats *sds,
10286	struct sg_lb_stats *busiest,
10287	struct sg_lb_stats *local)
10288	{
10289	int ncores_busiest, ncores_local;
10290	long imbalance;
10291
10292	if (!env->idle \|\| !busiest->sum_nr_running)
10293	return `0`;
10294
10295	ncores_busiest = sds->busiest->cores;
10296	ncores_local = sds->local->cores;
10297
10298	if (ncores_busiest == ncores_local) {
10299	imbalance = busiest->sum_nr_running;
10300	lsub_positive(&imbalance, local->sum_nr_running);
10301	return imbalance;
10302	}
10303
10304	/ Balance such that nr_running/ncores ratio are same on both groups /
10305	imbalance = ncores_local * busiest->sum_nr_running;
10306	lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running);
10307	/ Normalize imbalance and do rounding on normalization /
10308	imbalance = `2` * imbalance + ncores_local + ncores_busiest;
10309	imbalance /= ncores_local + ncores_busiest;
10310
10311	/ Take advantage of resource in an empty sched group /
10312	if (imbalance <= `1` && local->sum_nr_running == `0` &&
10313	busiest->sum_nr_running > `1`)
10314	imbalance = `2`;
10315
10316	return imbalance;
10317	}
10318
10319	static inline bool
10320	sched_reduced_capacity(struct rq rq, struct* sched_domain *sd)
10321	{
10322	/*
10323	* When there is more than 1 task, the group_overloaded case already
10324	* takes care of cpu with reduced capacity
10325	*/
10326	if (rq->cfs.h_nr_runnable != `1`)
10327	return false;
10328
10329	return check_cpu_capacity(rq, sd);
10330	}
10331
10332	/**
10333	* update_sg_lb_stats - Update sched_group's statistics for load balancing.
10334	* @env: The load balancing environment.
10335	* @sds: Load-balancing data with statistics of the local group.
10336	* @group: sched_group whose statistics are to be updated.
10337	* @sgs: variable to hold the statistics for this group.
10338	* @sg_overloaded: sched_group is overloaded
10339	* @sg_overutilized: sched_group is overutilized
10340	*/
10341	static inline void update_sg_lb_stats(struct lb_env *env,
10342	struct sd_lb_stats *sds,
10343	struct sched_group *group,
10344	struct sg_lb_stats *sgs,
10345	bool *sg_overloaded,
10346	bool *sg_overutilized)
10347	{
10348	int i, nr_running, local_group, sd_flags = env->sd->flags;
10349	bool balancing_at_rd = !env->sd->parent;
10350
10351	memset(s: sgs, c: `0`, n: sizeof(*sgs));
10352
10353	local_group = group == sds->local;
10354
10355	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
10356	struct rq *rq = cpu_rq(i);
10357	unsigned long load = cpu_load(rq);
10358
10359	sgs->group_load += load;
10360	sgs->group_util += cpu_util_cfs(cpu: i);
10361	sgs->group_runnable += cpu_runnable(rq);
10362	sgs->sum_h_nr_running += rq->cfs.h_nr_runnable;
10363
10364	nr_running = rq->nr_running;
10365	sgs->sum_nr_running += nr_running;
10366
10367	if (cpu_overutilized(cpu: i))
10368	*sg_overutilized = `1`;
10369
10370	/*
10371	* No need to call idle_cpu() if nr_running is not 0
10372	*/
10373	if (!nr_running && idle_cpu(cpu: i)) {
10374	sgs->idle_cpus++;
10375	/ Idle cpu can't have misfit task /
10376	continue;
10377	}
10378
10379	/ Overload indicator is only updated at root domain /
10380	if (balancing_at_rd && nr_running > `1`)
10381	*sg_overloaded = `1`;
10382
10383	#ifdef CONFIG_NUMA_BALANCING
10384	/ Only fbq_classify_group() uses this to classify NUMA groups /
10385	if (sd_flags & SD_NUMA) {
10386	sgs->nr_numa_running += rq->nr_numa_running;
10387	sgs->nr_preferred_running += rq->nr_preferred_running;
10388	}
10389	#endif
10390	if (local_group)
10391	continue;
10392
10393	if (sd_flags & SD_ASYM_CPUCAPACITY) {
10394	/ Check for a misfit task on the cpu /
10395	if (sgs->group_misfit_task_load < rq->misfit_task_load) {
10396	sgs->group_misfit_task_load = rq->misfit_task_load;
10397	*sg_overloaded = `1`;
10398	}
10399	} else if (env->idle && sched_reduced_capacity(rq, sd: env->sd)) {
10400	/ Check for a task running on a CPU with reduced capacity /
10401	if (sgs->group_misfit_task_load < load)
10402	sgs->group_misfit_task_load = load;
10403	}
10404	}
10405
10406	sgs->group_capacity = group->sgc->capacity;
10407
10408	sgs->group_weight = group->group_weight;
10409
10410	/ Check if dst CPU is idle and preferred to this group /
10411	if (!local_group && env->idle && sgs->sum_h_nr_running &&
10412	sched_group_asym(env, sgs, group))
10413	sgs->group_asym_packing = `1`;
10414
10415	/ Check for loaded SMT group to be balanced to dst CPU /
10416	if (!local_group && smt_balance(env, sgs, group))
10417	sgs->group_smt_balance = `1`;
10418
10419	sgs->group_type = group_classify(imbalance_pct: env->sd->imbalance_pct, group, sgs);
10420
10421	/ Computing avg_load makes sense only when group is overloaded /
10422	if (sgs->group_type == group_overloaded)
10423	sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
10424	sgs->group_capacity;
10425	}
10426
10427	/**
10428	* update_sd_pick_busiest - return 1 on busiest group
10429	* @env: The load balancing environment.
10430	* @sds: sched_domain statistics
10431	* @sg: sched_group candidate to be checked for being the busiest
10432	* @sgs: sched_group statistics
10433	*
10434	* Determine if @sg is a busier group than the previously selected
10435	* busiest group.
10436	*
10437	* Return: %true if @sg is a busier group than the previously selected
10438	* busiest group. %false otherwise.
10439	*/
10440	static bool update_sd_pick_busiest(struct lb_env *env,
10441	struct sd_lb_stats *sds,
10442	struct sched_group *sg,
10443	struct sg_lb_stats *sgs)
10444	{
10445	struct sg_lb_stats *busiest = &sds->busiest_stat;
10446
10447	/ Make sure that there is at least one task to pull /
10448	if (!sgs->sum_h_nr_running)
10449	return false;
10450
10451	/*
10452	* Don't try to pull misfit tasks we can't help.
10453	* We can use max_capacity here as reduction in capacity on some
10454	* CPUs in the group should either be possible to resolve
10455	* internally or be covered by avg_load imbalance (eventually).
10456	*/
10457	if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
10458	(sgs->group_type == group_misfit_task) &&
10459	(!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) \|\|
10460	sds->local_stat.group_type != group_has_spare))
10461	return false;
10462
10463	if (sgs->group_type > busiest->group_type)
10464	return true;
10465
10466	if (sgs->group_type < busiest->group_type)
10467	return false;
10468
10469	/*
10470	* The candidate and the current busiest group are the same type of
10471	* group. Let check which one is the busiest according to the type.
10472	*/
10473
10474	switch (sgs->group_type) {
10475	case group_overloaded:
10476	/ Select the overloaded group with highest avg_load. /
10477	return sgs->avg_load > busiest->avg_load;
10478
10479	case group_imbalanced:
10480	/*
10481	* Select the 1st imbalanced group as we don't have any way to
10482	* choose one more than another.
10483	*/
10484	return false;
10485
10486	case group_asym_packing:
10487	/ Prefer to move from lowest priority CPU's work /
10488	return sched_asym_prefer(READ_ONCE(sds->busiest->asym_prefer_cpu),
10489	READ_ONCE(sg->asym_prefer_cpu));
10490
10491	case group_misfit_task:
10492	/*
10493	* If we have more than one misfit sg go with the biggest
10494	* misfit.
10495	*/
10496	return sgs->group_misfit_task_load > busiest->group_misfit_task_load;
10497
10498	case group_smt_balance:
10499	/*
10500	* Check if we have spare CPUs on either SMT group to
10501	* choose has spare or fully busy handling.
10502	*/
10503	if (sgs->idle_cpus != `0` \|\| busiest->idle_cpus != `0`)
10504	goto has_spare;
10505
10506	fallthrough;
10507
10508	case group_fully_busy:
10509	/*
10510	* Select the fully busy group with highest avg_load. In
10511	* theory, there is no need to pull task from such kind of
10512	* group because tasks have all compute capacity that they need
10513	* but we can still improve the overall throughput by reducing
10514	* contention when accessing shared HW resources.
10515	*
10516	* XXX for now avg_load is not computed and always 0 so we
10517	* select the 1st one, except if @sg is composed of SMT
10518	* siblings.
10519	*/
10520
10521	if (sgs->avg_load < busiest->avg_load)
10522	return false;
10523
10524	if (sgs->avg_load == busiest->avg_load) {
10525	/*
10526	* SMT sched groups need more help than non-SMT groups.
10527	* If @sg happens to also be SMT, either choice is good.
10528	*/
10529	if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
10530	return false;
10531	}
10532
10533	break;
10534
10535	case group_has_spare:
10536	/*
10537	* Do not pick sg with SMT CPUs over sg with pure CPUs,
10538	* as we do not want to pull task off SMT core with one task
10539	* and make the core idle.
10540	*/
10541	if (smt_vs_nonsmt_groups(sg1: sds->busiest, sg2: sg)) {
10542	if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= `1`)
10543	return false;
10544	else
10545	return true;
10546	}
10547	has_spare:
10548
10549	/*
10550	* Select not overloaded group with lowest number of idle CPUs
10551	* and highest number of running tasks. We could also compare
10552	* the spare capacity which is more stable but it can end up
10553	* that the group has less spare capacity but finally more idle
10554	* CPUs which means less opportunity to pull tasks.
10555	*/
10556	if (sgs->idle_cpus > busiest->idle_cpus)
10557	return false;
10558	else if ((sgs->idle_cpus == busiest->idle_cpus) &&
10559	(sgs->sum_nr_running <= busiest->sum_nr_running))
10560	return false;
10561
10562	break;
10563	}
10564
10565	/*
10566	* Candidate sg has no more than one task per CPU and has higher
10567	* per-CPU capacity. Migrating tasks to less capable CPUs may harm
10568	* throughput. Maximize throughput, power/energy consequences are not
10569	* considered.
10570	*/
10571	if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
10572	(sgs->group_type <= group_fully_busy) &&
10573	(capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
10574	return false;
10575
10576	return true;
10577	}
10578
10579	#ifdef CONFIG_NUMA_BALANCING
10580	static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
10581	{
10582	if (sgs->sum_h_nr_running > sgs->nr_numa_running)
10583	return regular;
10584	if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
10585	return remote;
10586	return all;
10587	}
10588
10589	static inline enum fbq_type fbq_classify_rq(struct rq *rq)
10590	{
10591	if (rq->nr_running > rq->nr_numa_running)
10592	return regular;
10593	if (rq->nr_running > rq->nr_preferred_running)
10594	return remote;
10595	return all;
10596	}
10597	#else /* !CONFIG_NUMA_BALANCING: */
10598	static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
10599	{
10600	return all;
10601	}
10602
10603	static inline enum fbq_type fbq_classify_rq(struct rq *rq)
10604	{
10605	return regular;
10606	}
10607	#endif /* !CONFIG_NUMA_BALANCING */
10608
10609
10610	struct sg_lb_stats;
10611
10612	/*
10613	* task_running_on_cpu - return 1 if @p is running on @cpu.
10614	*/
10615
10616	static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
10617	{
10618	/ Task has no contribution or is new /
10619	if (cpu != task_cpu(p) \|\| !READ_ONCE(p->se.avg.last_update_time))
10620	return `0`;
10621
10622	if (task_on_rq_queued(p))
10623	return `1`;
10624
10625	return `0`;
10626	}
10627
10628	/**
10629	* idle_cpu_without - would a given CPU be idle without p ?
10630	* @cpu: the processor on which idleness is tested.
10631	* @p: task which should be ignored.
10632	*
10633	* Return: 1 if the CPU would be idle. 0 otherwise.
10634	*/
10635	static int idle_cpu_without(int cpu, struct task_struct *p)
10636	{
10637	struct rq *rq = cpu_rq(cpu);
10638
10639	if (rq->curr != rq->idle && rq->curr != p)
10640	return `0`;
10641
10642	/*
10643	* rq->nr_running can't be used but an updated version without the
10644	* impact of p on cpu must be used instead. The updated nr_running
10645	* be computed and tested before calling idle_cpu_without().
10646	*/
10647
10648	if (rq->ttwu_pending)
10649	return `0`;
10650
10651	return `1`;
10652	}
10653
10654	/*
10655	* update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
10656	* @sd: The sched_domain level to look for idlest group.
10657	* @group: sched_group whose statistics are to be updated.
10658	* @sgs: variable to hold the statistics for this group.
10659	* @p: The task for which we look for the idlest group/CPU.
10660	*/
10661	static inline void update_sg_wakeup_stats(struct sched_domain *sd,
10662	struct sched_group *group,
10663	struct sg_lb_stats *sgs,
10664	struct task_struct *p)
10665	{
10666	int i, nr_running;
10667
10668	memset(s: sgs, c: `0`, n: sizeof(*sgs));
10669
10670	/ Assume that task can't fit any CPU of the group /
10671	if (sd->flags & SD_ASYM_CPUCAPACITY)
10672	sgs->group_misfit_task_load = `1`;
10673
10674	for_each_cpu(i, sched_group_span(group)) {
10675	struct rq *rq = cpu_rq(i);
10676	unsigned int local;
10677
10678	sgs->group_load += cpu_load_without(rq, p);
10679	sgs->group_util += cpu_util_without(cpu: i, p);
10680	sgs->group_runnable += cpu_runnable_without(rq, p);
10681	local = task_running_on_cpu(cpu: i, p);
10682	sgs->sum_h_nr_running += rq->cfs.h_nr_runnable - local;
10683
10684	nr_running = rq->nr_running - local;
10685	sgs->sum_nr_running += nr_running;
10686
10687	/*
10688	* No need to call idle_cpu_without() if nr_running is not 0
10689	*/
10690	if (!nr_running && idle_cpu_without(cpu: i, p))
10691	sgs->idle_cpus++;
10692
10693	/ Check if task fits in the CPU /
10694	if (sd->flags & SD_ASYM_CPUCAPACITY &&
10695	sgs->group_misfit_task_load &&
10696	task_fits_cpu(p, cpu: i))
10697	sgs->group_misfit_task_load = `0`;
10698
10699	}
10700
10701	sgs->group_capacity = group->sgc->capacity;
10702
10703	sgs->group_weight = group->group_weight;
10704
10705	sgs->group_type = group_classify(imbalance_pct: sd->imbalance_pct, group, sgs);
10706
10707	/*
10708	* Computing avg_load makes sense only when group is fully busy or
10709	* overloaded
10710	*/
10711	if (sgs->group_type == group_fully_busy \|\|
10712	sgs->group_type == group_overloaded)
10713	sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
10714	sgs->group_capacity;
10715	}
10716
10717	static bool update_pick_idlest(struct sched_group *idlest,
10718	struct sg_lb_stats *idlest_sgs,
10719	struct sched_group *group,
10720	struct sg_lb_stats *sgs)
10721	{
10722	if (sgs->group_type < idlest_sgs->group_type)
10723	return true;
10724
10725	if (sgs->group_type > idlest_sgs->group_type)
10726	return false;
10727
10728	/*
10729	* The candidate and the current idlest group are the same type of
10730	* group. Let check which one is the idlest according to the type.
10731	*/
10732
10733	switch (sgs->group_type) {
10734	case group_overloaded:
10735	case group_fully_busy:
10736	/ Select the group with lowest avg_load. /
10737	if (idlest_sgs->avg_load <= sgs->avg_load)
10738	return false;
10739	break;
10740
10741	case group_imbalanced:
10742	case group_asym_packing:
10743	case group_smt_balance:
10744	/ Those types are not used in the slow wakeup path /
10745	return false;
10746
10747	case group_misfit_task:
10748	/ Select group with the highest max capacity /
10749	if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
10750	return false;
10751	break;
10752
10753	case group_has_spare:
10754	/ Select group with most idle CPUs /
10755	if (idlest_sgs->idle_cpus > sgs->idle_cpus)
10756	return false;
10757
10758	/ Select group with lowest group_util /
10759	if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
10760	idlest_sgs->group_util <= sgs->group_util)
10761	return false;
10762
10763	break;
10764	}
10765
10766	return true;
10767	}
10768
10769	/*
10770	* sched_balance_find_dst_group() finds and returns the least busy CPU group within the
10771	* domain.
10772	*
10773	* Assumes p is allowed on at least one CPU in sd.
10774	*/
10775	static struct sched_group *
10776	sched_balance_find_dst_group(struct sched_domain sd, struct* task_struct p, int* this_cpu)
10777	{
10778	struct sched_group idlest = NULL, local = NULL, *group = sd->groups;
10779	struct sg_lb_stats local_sgs, tmp_sgs;
10780	struct sg_lb_stats *sgs;
10781	unsigned long imbalance;
10782	struct sg_lb_stats idlest_sgs = {
10783	.avg_load = UINT_MAX,
10784	.group_type = group_overloaded,
10785	};
10786
10787	do {
10788	int local_group;
10789
10790	/ Skip over this group if it has no CPUs allowed /
10791	if (!cpumask_intersects(src1p: sched_group_span(sg: group),
10792	src2p: p->cpus_ptr))
10793	continue;
10794
10795	/ Skip over this group if no cookie matched /
10796	if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
10797	continue;
10798
10799	local_group = cpumask_test_cpu(cpu: this_cpu,
10800	cpumask: sched_group_span(sg: group));
10801
10802	if (local_group) {
10803	sgs = &local_sgs;
10804	local = group;
10805	} else {
10806	sgs = &tmp_sgs;
10807	}
10808
10809	update_sg_wakeup_stats(sd, group, sgs, p);
10810
10811	if (!local_group && update_pick_idlest(idlest, idlest_sgs: &idlest_sgs, group, sgs)) {
10812	idlest = group;
10813	idlest_sgs = *sgs;
10814	}
10815
10816	} while (group = group->next, group != sd->groups);
10817
10818
10819	/ There is no idlest group to push tasks to /
10820	if (!idlest)
10821	return NULL;
10822
10823	/ The local group has been skipped because of CPU affinity /
10824	if (!local)
10825	return idlest;
10826
10827	/*
10828	* If the local group is idler than the selected idlest group
10829	* don't try and push the task.
10830	*/
10831	if (local_sgs.group_type < idlest_sgs.group_type)
10832	return NULL;
10833
10834	/*
10835	* If the local group is busier than the selected idlest group
10836	* try and push the task.
10837	*/
10838	if (local_sgs.group_type > idlest_sgs.group_type)
10839	return idlest;
10840
10841	switch (local_sgs.group_type) {
10842	case group_overloaded:
10843	case group_fully_busy:
10844
10845	/ Calculate allowed imbalance based on load /
10846	imbalance = scale_load_down(NICE_0_LOAD) *
10847	(sd->imbalance_pct-`100`) / `100`;
10848
10849	/*
10850	* When comparing groups across NUMA domains, it's possible for
10851	* the local domain to be very lightly loaded relative to the
10852	* remote domains but "imbalance" skews the comparison making
10853	* remote CPUs look much more favourable. When considering
10854	* cross-domain, add imbalance to the load on the remote node
10855	* and consider staying local.
10856	*/
10857
10858	if ((sd->flags & SD_NUMA) &&
10859	((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
10860	return NULL;
10861
10862	/*
10863	* If the local group is less loaded than the selected
10864	* idlest group don't try and push any tasks.
10865	*/
10866	if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
10867	return NULL;
10868
10869	if (`100` * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
10870	return NULL;
10871	break;
10872
10873	case group_imbalanced:
10874	case group_asym_packing:
10875	case group_smt_balance:
10876	/ Those type are not used in the slow wakeup path /
10877	return NULL;
10878
10879	case group_misfit_task:
10880	/ Select group with the highest max capacity /
10881	if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
10882	return NULL;
10883	break;
10884
10885	case group_has_spare:
10886	#ifdef CONFIG_NUMA
10887	if (sd->flags & SD_NUMA) {
10888	int imb_numa_nr = sd->imb_numa_nr;
10889	#ifdef CONFIG_NUMA_BALANCING
10890	int idlest_cpu;
10891	/*
10892	* If there is spare capacity at NUMA, try to select
10893	* the preferred node
10894	*/
10895	if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
10896	return NULL;
10897
10898	idlest_cpu = cpumask_first(sched_group_span(idlest));
10899	if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
10900	return idlest;
10901	#endif /* CONFIG_NUMA_BALANCING */
10902	/*
10903	* Otherwise, keep the task close to the wakeup source
10904	* and improve locality if the number of running tasks
10905	* would remain below threshold where an imbalance is
10906	* allowed while accounting for the possibility the
10907	* task is pinned to a subset of CPUs. If there is a
10908	* real need of migration, periodic load balance will
10909	* take care of it.
10910	*/
10911	if (p->nr_cpus_allowed != NR_CPUS) {
10912	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
10913
10914	cpumask_and(dstp: cpus, src1p: sched_group_span(sg: local), src2p: p->cpus_ptr);
10915	imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr);
10916	}
10917
10918	imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
10919	if (!adjust_numa_imbalance(imbalance,
10920	dst_running: local_sgs.sum_nr_running + `1`,
10921	imb_numa_nr)) {
10922	return NULL;
10923	}
10924	}
10925	#endif /* CONFIG_NUMA */
10926
10927	/*
10928	* Select group with highest number of idle CPUs. We could also
10929	* compare the utilization which is more stable but it can end
10930	* up that the group has less spare capacity but finally more
10931	* idle CPUs which means more opportunity to run task.
10932	*/
10933	if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
10934	return NULL;
10935	break;
10936	}
10937
10938	return idlest;
10939	}
10940
10941	static void update_idle_cpu_scan(struct lb_env *env,
10942	unsigned long sum_util)
10943	{
10944	struct sched_domain_shared *sd_share;
10945	int llc_weight, pct;
10946	u64 x, y, tmp;
10947	/*
10948	* Update the number of CPUs to scan in LLC domain, which could
10949	* be used as a hint in select_idle_cpu(). The update of sd_share
10950	* could be expensive because it is within a shared cache line.
10951	* So the write of this hint only occurs during periodic load
10952	* balancing, rather than CPU_NEWLY_IDLE, because the latter
10953	* can fire way more frequently than the former.
10954	*/
10955	if (!sched_feat(SIS_UTIL) \|\| env->idle == CPU_NEWLY_IDLE)
10956	return;
10957
10958	llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
10959	if (env->sd->span_weight != llc_weight)
10960	return;
10961
10962	sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
10963	if (!sd_share)
10964	return;
10965
10966	/*
10967	* The number of CPUs to search drops as sum_util increases, when
10968	* sum_util hits 85% or above, the scan stops.
10969	* The reason to choose 85% as the threshold is because this is the
10970	* imbalance_pct(117) when a LLC sched group is overloaded.
10971	*
10972	* let y = SCHED_CAPACITY_SCALE - p * x^2 [1]
10973	* and y'= y / SCHED_CAPACITY_SCALE
10974	*
10975	* x is the ratio of sum_util compared to the CPU capacity:
10976	* x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE)
10977	* y' is the ratio of CPUs to be scanned in the LLC domain,
10978	* and the number of CPUs to scan is calculated by:
10979	*
10980	* nr_scan = llc_weight * y' [2]
10981	*
10982	* When x hits the threshold of overloaded, AKA, when
10983	* x = 100 / pct, y drops to 0. According to [1],
10984	* p should be SCHED_CAPACITY_SCALE * pct^2 / 10000
10985	*
10986	* Scale x by SCHED_CAPACITY_SCALE:
10987	* x' = sum_util / llc_weight; [3]
10988	*
10989	* and finally [1] becomes:
10990	* y = SCHED_CAPACITY_SCALE -
10991	* x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) [4]
10992	*
10993	*/
10994	/ equation [3] /
10995	x = sum_util;
10996	do_div(x, llc_weight);
10997
10998	/ equation [4] /
10999	pct = env->sd->imbalance_pct;
11000	tmp = x * x * pct * pct;
11001	do_div(tmp, `10000` * SCHED_CAPACITY_SCALE);
11002	tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
11003	y = SCHED_CAPACITY_SCALE - tmp;
11004
11005	/ equation [2] /
11006	y *= llc_weight;
11007	do_div(y, SCHED_CAPACITY_SCALE);
11008	if ((int)y != sd_share->nr_idle_scan)
11009	WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
11010	}
11011
11012	/**
11013	* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
11014	* @env: The load balancing environment.
11015	* @sds: variable to hold the statistics for this sched_domain.
11016	*/
11017
11018	static inline void update_sd_lb_stats(struct lb_env env, struct* sd_lb_stats *sds)
11019	{
11020	struct sched_group *sg = env->sd->groups;
11021	struct sg_lb_stats *local = &sds->local_stat;
11022	struct sg_lb_stats tmp_sgs;
11023	unsigned long sum_util = `0`;
11024	bool sg_overloaded = `0`, sg_overutilized = `0`;
11025
11026	do {
11027	struct sg_lb_stats *sgs = &tmp_sgs;
11028	int local_group;
11029
11030	local_group = cpumask_test_cpu(cpu: env->dst_cpu, cpumask: sched_group_span(sg));
11031	if (local_group) {
11032	sds->local = sg;
11033	sgs = local;
11034
11035	if (env->idle != CPU_NEWLY_IDLE \|\|
11036	time_after_eq(jiffies, sg->sgc->next_update))
11037	update_group_capacity(sd: env->sd, cpu: env->dst_cpu);
11038	}
11039
11040	update_sg_lb_stats(env, sds, group: sg, sgs, sg_overloaded: &sg_overloaded, sg_overutilized: &sg_overutilized);
11041
11042	if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
11043	sds->busiest = sg;
11044	sds->busiest_stat = *sgs;
11045	}
11046
11047	/ Now, start updating sd_lb_stats /
11048	sds->total_load += sgs->group_load;
11049	sds->total_capacity += sgs->group_capacity;
11050
11051	sum_util += sgs->group_util;
11052	sg = sg->next;
11053	} while (sg != env->sd->groups);
11054
11055	/*
11056	* Indicate that the child domain of the busiest group prefers tasks
11057	* go to a child's sibling domains first. NB the flags of a sched group
11058	* are those of the child domain.
11059	*/
11060	if (sds->busiest)
11061	sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING);
11062
11063
11064	if (env->sd->flags & SD_NUMA)
11065	env->fbq_type = fbq_classify_group(sgs: &sds->busiest_stat);
11066
11067	if (!env->sd->parent) {
11068	/ update overload indicator if we are at root domain /
11069	set_rd_overloaded(rd: env->dst_rq->rd, status: sg_overloaded);
11070
11071	/ Update over-utilization (tipping point, U >= 0) indicator /
11072	set_rd_overutilized(rd: env->dst_rq->rd, flag: sg_overutilized);
11073	} else if (sg_overutilized) {
11074	set_rd_overutilized(rd: env->dst_rq->rd, flag: sg_overutilized);
11075	}
11076
11077	update_idle_cpu_scan(env, sum_util);
11078	}
11079
11080	/**
11081	* calculate_imbalance - Calculate the amount of imbalance present within the
11082	* groups of a given sched_domain during load balance.
11083	* @env: load balance environment
11084	* @sds: statistics of the sched_domain whose imbalance is to be calculated.
11085	*/
11086	static inline void calculate_imbalance(struct lb_env env, struct* sd_lb_stats *sds)
11087	{
11088	struct sg_lb_stats local, busiest;
11089
11090	local = &sds->local_stat;
11091	busiest = &sds->busiest_stat;
11092
11093	if (busiest->group_type == group_misfit_task) {
11094	if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
11095	/ Set imbalance to allow misfit tasks to be balanced. /
11096	env->migration_type = migrate_misfit;
11097	env->imbalance = `1`;
11098	} else {
11099	/*
11100	* Set load imbalance to allow moving task from cpu
11101	* with reduced capacity.
11102	*/
11103	env->migration_type = migrate_load;
11104	env->imbalance = busiest->group_misfit_task_load;
11105	}
11106	return;
11107	}
11108
11109	if (busiest->group_type == group_asym_packing) {
11110	/*
11111	* In case of asym capacity, we will try to migrate all load to
11112	* the preferred CPU.
11113	*/
11114	env->migration_type = migrate_task;
11115	env->imbalance = busiest->sum_h_nr_running;
11116	return;
11117	}
11118
11119	if (busiest->group_type == group_smt_balance) {
11120	/ Reduce number of tasks sharing CPU capacity /
11121	env->migration_type = migrate_task;
11122	env->imbalance = `1`;
11123	return;
11124	}
11125
11126	if (busiest->group_type == group_imbalanced) {
11127	/*
11128	* In the group_imb case we cannot rely on group-wide averages
11129	* to ensure CPU-load equilibrium, try to move any task to fix
11130	* the imbalance. The next load balance will take care of
11131	* balancing back the system.
11132	*/
11133	env->migration_type = migrate_task;
11134	env->imbalance = `1`;
11135	return;
11136	}
11137
11138	/*
11139	* Try to use spare capacity of local group without overloading it or
11140	* emptying busiest.
11141	*/
11142	if (local->group_type == group_has_spare) {
11143	if ((busiest->group_type > group_fully_busy) &&
11144	!(env->sd->flags & SD_SHARE_LLC)) {
11145	/*
11146	* If busiest is overloaded, try to fill spare
11147	* capacity. This might end up creating spare capacity
11148	* in busiest or busiest still being overloaded but
11149	* there is no simple way to directly compute the
11150	* amount of load to migrate in order to balance the
11151	* system.
11152	*/
11153	env->migration_type = migrate_util;
11154	env->imbalance = max(local->group_capacity, local->group_util) -
11155	local->group_util;
11156
11157	/*
11158	* In some cases, the group's utilization is max or even
11159	* higher than capacity because of migrations but the
11160	* local CPU is (newly) idle. There is at least one
11161	* waiting task in this overloaded busiest group. Let's
11162	* try to pull it.
11163	*/
11164	if (env->idle && env->imbalance == `0`) {
11165	env->migration_type = migrate_task;
11166	env->imbalance = `1`;
11167	}
11168
11169	return;
11170	}
11171
11172	if (busiest->group_weight == `1` \|\| sds->prefer_sibling) {
11173	/*
11174	* When prefer sibling, evenly spread running tasks on
11175	* groups.
11176	*/
11177	env->migration_type = migrate_task;
11178	env->imbalance = sibling_imbalance(env, sds, busiest, local);
11179	} else {
11180
11181	/*
11182	* If there is no overload, we just want to even the number of
11183	* idle CPUs.
11184	*/
11185	env->migration_type = migrate_task;
11186	env->imbalance = max_t(long, `0`,
11187	(local->idle_cpus - busiest->idle_cpus));
11188	}
11189
11190	#ifdef CONFIG_NUMA
11191	/ Consider allowing a small imbalance between NUMA groups /
11192	if (env->sd->flags & SD_NUMA) {
11193	env->imbalance = adjust_numa_imbalance(imbalance: env->imbalance,
11194	dst_running: local->sum_nr_running + `1`,
11195	imb_numa_nr: env->sd->imb_numa_nr);
11196	}
11197	#endif
11198
11199	/ Number of tasks to move to restore balance /
11200	env->imbalance >>= `1`;
11201
11202	return;
11203	}
11204
11205	/*
11206	* Local is fully busy but has to take more load to relieve the
11207	* busiest group
11208	*/
11209	if (local->group_type < group_overloaded) {
11210	/*
11211	* Local will become overloaded so the avg_load metrics are
11212	* finally needed.
11213	*/
11214
11215	local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
11216	local->group_capacity;
11217
11218	/*
11219	* If the local group is more loaded than the selected
11220	* busiest group don't try to pull any tasks.
11221	*/
11222	if (local->avg_load >= busiest->avg_load) {
11223	env->imbalance = `0`;
11224	return;
11225	}
11226
11227	sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
11228	sds->total_capacity;
11229
11230	/*
11231	* If the local group is more loaded than the average system
11232	* load, don't try to pull any tasks.
11233	*/
11234	if (local->avg_load >= sds->avg_load) {
11235	env->imbalance = `0`;
11236	return;
11237	}
11238
11239	}
11240
11241	/*
11242	* Both group are or will become overloaded and we're trying to get all
11243	* the CPUs to the average_load, so we don't want to push ourselves
11244	* above the average load, nor do we wish to reduce the max loaded CPU
11245	* below the average load. At the same time, we also don't want to
11246	* reduce the group load below the group capacity. Thus we look for
11247	* the minimum possible imbalance.
11248	*/
11249	env->migration_type = migrate_load;
11250	env->imbalance = min(
11251	(busiest->avg_load - sds->avg_load) * busiest->group_capacity,
11252	(sds->avg_load - local->avg_load) * local->group_capacity
11253	) / SCHED_CAPACITY_SCALE;
11254	}
11255
11256	/**** sched_balance_find_src_group() helpers end here ******************/
11257
/*
 * Decision matrix according to the local and busiest group type:
 *
 * busiest \ local | has_spare | fully_busy | misfit | asym | imbalanced | overloaded
 * ----------------+-----------+------------+--------+------+------------+-----------
 * has_spare       | nr_idle   | balanced   | N/A    | N/A  | balanced   | balanced
 * fully_busy      | nr_idle   | nr_idle    | N/A    | N/A  | balanced   | balanced
 * misfit_task     | force     | N/A        | N/A    | N/A  | N/A        | N/A
 * asym_packing    | force     | force      | N/A    | N/A  | force      | force
 * imbalanced      | force     | force      | N/A    | N/A  | force      | force
 * overloaded      | force     | force      | N/A    | N/A  | force      | avg_load
 *
 * N/A :      Not Applicable because already filtered while updating
 *            statistics.
 * balanced : The system is balanced for these 2 groups.
 * force :    Calculate the imbalance as load migration is probably needed.
 * avg_load : Only if imbalance is significant enough.
 * nr_idle :  dst_cpu is not busy and the number of idle CPUs is quite
 *            different in groups.
 */
11277
11278	/**
11279	* sched_balance_find_src_group - Returns the busiest group within the sched_domain
11280	* if there is an imbalance.
11281	* @env: The load balancing environment.
11282	*
11283	* Also calculates the amount of runnable load which should be moved
11284	* to restore balance.
11285	*
11286	* Return: - The busiest group if imbalance exists.
11287	*/
11288	static struct sched_group sched_balance_find_src_group(struct* lb_env *env)
11289	{
11290	struct sg_lb_stats local, busiest;
11291	struct sd_lb_stats sds;
11292
11293	init_sd_lb_stats(sds: &sds);
11294
11295	/*
11296	* Compute the various statistics relevant for load balancing at
11297	* this level.
11298	*/
11299	update_sd_lb_stats(env, sds: &sds);
11300
11301	/ There is no busy sibling group to pull tasks from /
11302	if (!sds.busiest)
11303	goto out_balanced;
11304
11305	busiest = &sds.busiest_stat;
11306
11307	/ Misfit tasks should be dealt with regardless of the avg load /
11308	if (busiest->group_type == group_misfit_task)
11309	goto force_balance;
11310
11311	if (!is_rd_overutilized(rd: env->dst_rq->rd) &&
11312	rcu_dereference(env->dst_rq->rd->pd))
11313	goto out_balanced;
11314
11315	/ ASYM feature bypasses nice load balance check /
11316	if (busiest->group_type == group_asym_packing)
11317	goto force_balance;
11318
11319	/*
11320	* If the busiest group is imbalanced the below checks don't
11321	* work because they assume all things are equal, which typically
11322	* isn't true due to cpus_ptr constraints and the like.
11323	*/
11324	if (busiest->group_type == group_imbalanced)
11325	goto force_balance;
11326
11327	local = &sds.local_stat;
11328	/*
11329	* If the local group is busier than the selected busiest group
11330	* don't try and pull any tasks.
11331	*/
11332	if (local->group_type > busiest->group_type)
11333	goto out_balanced;
11334
11335	/*
11336	* When groups are overloaded, use the avg_load to ensure fairness
11337	* between tasks.
11338	*/
11339	if (local->group_type == group_overloaded) {
11340	/*
11341	* If the local group is more loaded than the selected
11342	* busiest group don't try to pull any tasks.
11343	*/
11344	if (local->avg_load >= busiest->avg_load)
11345	goto out_balanced;
11346
11347	/ XXX broken for overlapping NUMA groups /
11348	sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
11349	sds.total_capacity;
11350
11351	/*
11352	* Don't pull any tasks if this group is already above the
11353	* domain average load.
11354	*/
11355	if (local->avg_load >= sds.avg_load)
11356	goto out_balanced;
11357
11358	/*
11359	* If the busiest group is more loaded, use imbalance_pct to be
11360	* conservative.
11361	*/
11362	if (`100` * busiest->avg_load <=
11363	env->sd->imbalance_pct * local->avg_load)
11364	goto out_balanced;
11365	}
11366
11367	/*
11368	* Try to move all excess tasks to a sibling domain of the busiest
11369	* group's child domain.
11370	*/
11371	if (sds.prefer_sibling && local->group_type == group_has_spare &&
11372	sibling_imbalance(env, sds: &sds, busiest, local) > `1`)
11373	goto force_balance;
11374
11375	if (busiest->group_type != group_overloaded) {
11376	if (!env->idle) {
11377	/*
11378	* If the busiest group is not overloaded (and as a
11379	* result the local one too) but this CPU is already
11380	* busy, let another idle CPU try to pull task.
11381	*/
11382	goto out_balanced;
11383	}
11384
11385	if (busiest->group_type == group_smt_balance &&
11386	smt_vs_nonsmt_groups(sg1: sds.local, sg2: sds.busiest)) {
11387	/ Let non SMT CPU pull from SMT CPU sharing with sibling /
11388	goto force_balance;
11389	}
11390
11391	if (busiest->group_weight > `1` &&
11392	local->idle_cpus <= (busiest->idle_cpus + `1`)) {
11393	/*
11394	* If the busiest group is not overloaded
11395	* and there is no imbalance between this and busiest
11396	* group wrt idle CPUs, it is balanced. The imbalance
11397	* becomes significant if the diff is greater than 1
11398	* otherwise we might end up to just move the imbalance
11399	* on another group. Of course this applies only if
11400	* there is more than 1 CPU per group.
11401	*/
11402	goto out_balanced;
11403	}
11404
11405	if (busiest->sum_h_nr_running == `1`) {
11406	/*
11407	* busiest doesn't have any tasks waiting to run
11408	*/
11409	goto out_balanced;
11410	}
11411	}
11412
11413	force_balance:
11414	/ Looks like there is an imbalance. Compute it /
11415	calculate_imbalance(env, sds: &sds);
11416	return env->imbalance ? sds.busiest : NULL;
11417
11418	out_balanced:
11419	env->imbalance = `0`;
11420	return NULL;
11421	}
11422
11423	/*
11424	* sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group.
11425	*/
11426	static struct rq sched_balance_find_src_rq(struct* lb_env *env,
11427	struct sched_group *group)
11428	{
11429	struct rq busiest = NULL, rq;
11430	unsigned long busiest_util = `0`, busiest_load = `0`, busiest_capacity = `1`;
11431	unsigned int busiest_nr = `0`;
11432	int i;
11433
11434	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
11435	unsigned long capacity, load, util;
11436	unsigned int nr_running;
11437	enum fbq_type rt;
11438
11439	rq = cpu_rq(i);
11440	rt = fbq_classify_rq(rq);
11441
11442	/*
11443	* We classify groups/runqueues into three groups:
11444	* - regular: there are !numa tasks
11445	* - remote: there are numa tasks that run on the 'wrong' node
11446	* - all: there is no distinction
11447	*
11448	* In order to avoid migrating ideally placed numa tasks,
11449	* ignore those when there's better options.
11450	*
11451	* If we ignore the actual busiest queue to migrate another
11452	* task, the next balance pass can still reduce the busiest
11453	* queue by moving tasks around inside the node.
11454	*
11455	* If we cannot move enough load due to this classification
11456	* the next pass will adjust the group classification and
11457	* allow migration of more tasks.
11458	*
11459	* Both cases only affect the total convergence complexity.
11460	*/
11461	if (rt > env->fbq_type)
11462	continue;
11463
11464	nr_running = rq->cfs.h_nr_runnable;
11465	if (!nr_running)
11466	continue;
11467
11468	capacity = capacity_of(cpu: i);
11469
11470	/*
11471	* For ASYM_CPUCAPACITY domains, don't pick a CPU that could
11472	* eventually lead to active_balancing high->low capacity.
11473	* Higher per-CPU capacity is considered better than balancing
11474	* average load.
11475	*/
11476	if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
11477	!capacity_greater(capacity_of(env->dst_cpu), capacity) &&
11478	nr_running == `1`)
11479	continue;
11480
11481	/*
11482	* Make sure we only pull tasks from a CPU of lower priority
11483	* when balancing between SMT siblings.
11484	*
11485	* If balancing between cores, let lower priority CPUs help
11486	* SMT cores with more than one busy sibling.
11487	*/
11488	if (sched_asym(sd: env->sd, dst_cpu: i, src_cpu: env->dst_cpu) && nr_running == `1`)
11489	continue;
11490
11491	switch (env->migration_type) {
11492	case migrate_load:
11493	/*
11494	* When comparing with load imbalance, use cpu_load()
11495	* which is not scaled with the CPU capacity.
11496	*/
11497	load = cpu_load(rq);
11498
11499	if (nr_running == `1` && load > env->imbalance &&
11500	!check_cpu_capacity(rq, sd: env->sd))
11501	break;
11502
11503	/*
11504	* For the load comparisons with the other CPUs,
11505	* consider the cpu_load() scaled with the CPU
11506	* capacity, so that the load can be moved away
11507	* from the CPU that is potentially running at a
11508	* lower capacity.
11509	*
11510	* Thus we're looking for max(load_i / capacity_i),
11511	* crosswise multiplication to rid ourselves of the
11512	* division works out to:
11513	* load_i * capacity_j > load_j * capacity_i;
11514	* where j is our previous maximum.
11515	*/
11516	if (load * busiest_capacity > busiest_load * capacity) {
11517	busiest_load = load;
11518	busiest_capacity = capacity;
11519	busiest = rq;
11520	}
11521	break;
11522
11523	case migrate_util:
11524	util = cpu_util_cfs_boost(cpu: i);
11525
11526	/*
11527	* Don't try to pull utilization from a CPU with one
11528	* running task. Whatever its utilization, we will fail
11529	* detach the task.
11530	*/
11531	if (nr_running <= `1`)
11532	continue;
11533
11534	if (busiest_util < util) {
11535	busiest_util = util;
11536	busiest = rq;
11537	}
11538	break;
11539
11540	case migrate_task:
11541	if (busiest_nr < nr_running) {
11542	busiest_nr = nr_running;
11543	busiest = rq;
11544	}
11545	break;
11546
11547	case migrate_misfit:
11548	/*
11549	* For ASYM_CPUCAPACITY domains with misfit tasks we
11550	* simply seek the "biggest" misfit task.
11551	*/
11552	if (rq->misfit_task_load > busiest_load) {
11553	busiest_load = rq->misfit_task_load;
11554	busiest = rq;
11555	}
11556
11557	break;
11558
11559	}
11560	}
11561
11562	return busiest;
11563	}
11564
/*
 * Max backoff if we encounter pinned tasks. While LBF_ALL_PINNED is set,
 * sched_balance_rq() keeps doubling sd->balance_interval as long as it is
 * below this value. Pretty arbitrary value; it only has to be large enough.
 */
#define MAX_PINNED_INTERVAL	512
11570
11571	static inline bool
11572	asym_active_balance(struct lb_env *env)
11573	{
11574	/*
11575	* ASYM_PACKING needs to force migrate tasks from busy but lower
11576	* priority CPUs in order to pack all tasks in the highest priority
11577	* CPUs. When done between cores, do it only if the whole core if the
11578	* whole core is idle.
11579	*
11580	* If @env::src_cpu is an SMT core with busy siblings, let
11581	* the lower priority @env::dst_cpu help it. Do not follow
11582	* CPU priority.
11583	*/
11584	return env->idle && sched_use_asym_prio(sd: env->sd, cpu: env->dst_cpu) &&
11585	(sched_asym_prefer(a: env->dst_cpu, b: env->src_cpu) \|\|
11586	!sched_use_asym_prio(sd: env->sd, cpu: env->src_cpu));
11587	}
11588
11589	static inline bool
11590	imbalanced_active_balance(struct lb_env *env)
11591	{
11592	struct sched_domain *sd = env->sd;
11593
11594	/*
11595	* The imbalanced case includes the case of pinned tasks preventing a fair
11596	* distribution of the load on the system but also the even distribution of the
11597	* threads on a system with spare capacity
11598	*/
11599	if ((env->migration_type == migrate_task) &&
11600	(sd->nr_balance_failed > sd->cache_nice_tries+`2`))
11601	return `1`;
11602
11603	return `0`;
11604	}
11605
11606	static int need_active_balance(struct lb_env *env)
11607	{
11608	struct sched_domain *sd = env->sd;
11609
11610	if (asym_active_balance(env))
11611	return `1`;
11612
11613	if (imbalanced_active_balance(env))
11614	return `1`;
11615
11616	/*
11617	* The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
11618	* It's worth migrating the task if the src_cpu's capacity is reduced
11619	* because of other sched_class or IRQs if more capacity stays
11620	* available on dst_cpu.
11621	*/
11622	if (env->idle &&
11623	(env->src_rq->cfs.h_nr_runnable == `1`)) {
11624	if ((check_cpu_capacity(rq: env->src_rq, sd)) &&
11625	(capacity_of(cpu: env->src_cpu)sd->imbalance_pct < capacity_of(cpu: env->dst_cpu)`100`))
11626	return `1`;
11627	}
11628
11629	if (env->migration_type == migrate_misfit)
11630	return `1`;
11631
11632	return `0`;
11633	}
11634
/* Forward declaration: sched_balance_rq() queues this stopper callback before its definition. */
static int active_load_balance_cpu_stop(void *data);
11636
11637	static int should_we_balance(struct lb_env *env)
11638	{
11639	struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask);
11640	struct sched_group *sg = env->sd->groups;
11641	int cpu, idle_smt = -`1`;
11642
11643	/*
11644	* Ensure the balancing environment is consistent; can happen
11645	* when the softirq triggers 'during' hotplug.
11646	*/
11647	if (!cpumask_test_cpu(cpu: env->dst_cpu, cpumask: env->cpus))
11648	return `0`;
11649
11650	/*
11651	* In the newly idle case, we will allow all the CPUs
11652	* to do the newly idle load balance.
11653	*
11654	* However, we bail out if we already have tasks or a wakeup pending,
11655	* to optimize wakeup latency.
11656	*/
11657	if (env->idle == CPU_NEWLY_IDLE) {
11658	if (env->dst_rq->nr_running > `0` \|\| env->dst_rq->ttwu_pending)
11659	return `0`;
11660	return `1`;
11661	}
11662
11663	cpumask_copy(dstp: swb_cpus, srcp: group_balance_mask(sg));
11664	/ Try to find first idle CPU /
11665	for_each_cpu_and(cpu, swb_cpus, env->cpus) {
11666	if (!idle_cpu(cpu))
11667	continue;
11668
11669	/*
11670	* Don't balance to idle SMT in busy core right away when
11671	* balancing cores, but remember the first idle SMT CPU for
11672	* later consideration. Find CPU on an idle core first.
11673	*/
11674	if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
11675	if (idle_smt == -`1`)
11676	idle_smt = cpu;
11677	/*
11678	* If the core is not idle, and first SMT sibling which is
11679	* idle has been found, then its not needed to check other
11680	* SMT siblings for idleness:
11681	*/
11682	#ifdef CONFIG_SCHED_SMT
11683	cpumask_andnot(dstp: swb_cpus, src1p: swb_cpus, src2p: cpu_smt_mask(cpu));
11684	#endif
11685	continue;
11686	}
11687
11688	/*
11689	* Are we the first idle core in a non-SMT domain or higher,
11690	* or the first idle CPU in a SMT domain?
11691	*/
11692	return cpu == env->dst_cpu;
11693	}
11694
11695	/ Are we the first idle CPU with busy siblings? /
11696	if (idle_smt != -`1`)
11697	return idle_smt == env->dst_cpu;
11698
11699	/ Are we the first CPU of this group ? /
11700	return group_balance_cpu(sg) == env->dst_cpu;
11701	}
11702
11703	static void update_lb_imbalance_stat(struct lb_env env, struct* sched_domain *sd,
11704	enum cpu_idle_type idle)
11705	{
11706	if (!schedstat_enabled())
11707	return;
11708
11709	switch (env->migration_type) {
11710	case migrate_load:
11711	__schedstat_add(sd->lb_imbalance_load[idle], env->imbalance);
11712	break;
11713	case migrate_util:
11714	__schedstat_add(sd->lb_imbalance_util[idle], env->imbalance);
11715	break;
11716	case migrate_task:
11717	__schedstat_add(sd->lb_imbalance_task[idle], env->imbalance);
11718	break;
11719	case migrate_misfit:
11720	__schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance);
11721	break;
11722	}
11723	}
11724
11725	/*
11726	* Check this_cpu to ensure it is balanced within domain. Attempt to move
11727	* tasks if there is an imbalance.
11728	*/
11729	static int sched_balance_rq(int this_cpu, struct rq *this_rq,
11730	struct sched_domain sd, enum* cpu_idle_type idle,
11731	int *continue_balancing)
11732	{
11733	int ld_moved, cur_ld_moved, active_balance = `0`;
11734	struct sched_domain *sd_parent = sd->parent;
11735	struct sched_group *group;
11736	struct rq *busiest;
11737	struct rq_flags rf;
11738	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
11739	struct lb_env env = {
11740	.sd = sd,
11741	.dst_cpu = this_cpu,
11742	.dst_rq = this_rq,
11743	.dst_grpmask = group_balance_mask(sg: sd->groups),
11744	.idle = idle,
11745	.loop_break = SCHED_NR_MIGRATE_BREAK,
11746	.cpus = cpus,
11747	.fbq_type = all,
11748	.tasks = LIST_HEAD_INIT(env.tasks),
11749	};
11750
11751	cpumask_and(dstp: cpus, src1p: sched_domain_span(sd), cpu_active_mask);
11752
11753	schedstat_inc(sd->lb_count[idle]);
11754
11755	redo:
11756	if (!should_we_balance(env: &env)) {
11757	*continue_balancing = `0`;
11758	goto out_balanced;
11759	}
11760
11761	group = sched_balance_find_src_group(env: &env);
11762	if (!group) {
11763	schedstat_inc(sd->lb_nobusyg[idle]);
11764	goto out_balanced;
11765	}
11766
11767	busiest = sched_balance_find_src_rq(env: &env, group);
11768	if (!busiest) {
11769	schedstat_inc(sd->lb_nobusyq[idle]);
11770	goto out_balanced;
11771	}
11772
11773	WARN_ON_ONCE(busiest == env.dst_rq);
11774
11775	update_lb_imbalance_stat(env: &env, sd, idle);
11776
11777	env.src_cpu = busiest->cpu;
11778	env.src_rq = busiest;
11779
11780	ld_moved = `0`;
11781	/ Clear this flag as soon as we find a pullable task /
11782	env.flags \|= LBF_ALL_PINNED;
11783	if (busiest->nr_running > `1`) {
11784	/*
11785	* Attempt to move tasks. If sched_balance_find_src_group has found
11786	* an imbalance but busiest->nr_running <= 1, the group is
11787	* still unbalanced. ld_moved simply stays zero, so it is
11788	* correctly treated as an imbalance.
11789	*/
11790	env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
11791
11792	more_balance:
11793	rq_lock_irqsave(rq: busiest, rf: &rf);
11794	update_rq_clock(rq: busiest);
11795
11796	/*
11797	* cur_ld_moved - load moved in current iteration
11798	* ld_moved - cumulative load moved across iterations
11799	*/
11800	cur_ld_moved = detach_tasks(env: &env);
11801
11802	/*
11803	* We've detached some tasks from busiest_rq. Every
11804	* task is masked "TASK_ON_RQ_MIGRATING", so we can safely
11805	* unlock busiest->lock, and we are able to be sure
11806	* that nobody can manipulate the tasks in parallel.
11807	* See task_rq_lock() family for the details.
11808	*/
11809
11810	rq_unlock(rq: busiest, rf: &rf);
11811
11812	if (cur_ld_moved) {
11813	attach_tasks(env: &env);
11814	ld_moved += cur_ld_moved;
11815	}
11816
11817	local_irq_restore(rf.flags);
11818
11819	if (env.flags & LBF_NEED_BREAK) {
11820	env.flags &= ~LBF_NEED_BREAK;
11821	goto more_balance;
11822	}
11823
11824	/*
11825	* Revisit (affine) tasks on src_cpu that couldn't be moved to
11826	* us and move them to an alternate dst_cpu in our sched_group
11827	* where they can run. The upper limit on how many times we
11828	* iterate on same src_cpu is dependent on number of CPUs in our
11829	* sched_group.
11830	*
11831	* This changes load balance semantics a bit on who can move
11832	* load to a given_cpu. In addition to the given_cpu itself
11833	* (or a ilb_cpu acting on its behalf where given_cpu is
11834	* nohz-idle), we now have balance_cpu in a position to move
11835	* load to given_cpu. In rare situations, this may cause
11836	* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
11837	* _independently_ and at _same_ time to move some load to
11838	* given_cpu) causing excess load to be moved to given_cpu.
11839	* This however should not happen so much in practice and
11840	* moreover subsequent load balance cycles should correct the
11841	* excess load moved.
11842	*/
11843	if ((env.flags & LBF_DST_PINNED) && env.imbalance > `0`) {
11844
11845	/ Prevent to re-select dst_cpu via env's CPUs /
11846	__cpumask_clear_cpu(cpu: env.dst_cpu, dstp: env.cpus);
11847
11848	env.dst_rq = cpu_rq(env.new_dst_cpu);
11849	env.dst_cpu = env.new_dst_cpu;
11850	env.flags &= ~LBF_DST_PINNED;
11851	env.loop = `0`;
11852	env.loop_break = SCHED_NR_MIGRATE_BREAK;
11853
11854	/*
11855	* Go back to "more_balance" rather than "redo" since we
11856	* need to continue with same src_cpu.
11857	*/
11858	goto more_balance;
11859	}
11860
11861	/*
11862	* We failed to reach balance because of affinity.
11863	*/
11864	if (sd_parent) {
11865	int *group_imbalance = &sd_parent->groups->sgc->imbalance;
11866
11867	if ((env.flags & LBF_SOME_PINNED) && env.imbalance > `0`)
11868	*group_imbalance = `1`;
11869	}
11870
11871	/ All tasks on this runqueue were pinned by CPU affinity /
11872	if (unlikely(env.flags & LBF_ALL_PINNED)) {
11873	__cpumask_clear_cpu(cpu: cpu_of(rq: busiest), dstp: cpus);
11874	/*
11875	* Attempting to continue load balancing at the current
11876	* sched_domain level only makes sense if there are
11877	* active CPUs remaining as possible busiest CPUs to
11878	* pull load from which are not contained within the
11879	* destination group that is receiving any migrated
11880	* load.
11881	*/
11882	if (!cpumask_subset(src1p: cpus, src2p: env.dst_grpmask)) {
11883	env.loop = `0`;
11884	env.loop_break = SCHED_NR_MIGRATE_BREAK;
11885	goto redo;
11886	}
11887	goto out_all_pinned;
11888	}
11889	}
11890
11891	if (!ld_moved) {
11892	schedstat_inc(sd->lb_failed[idle]);
11893	/*
11894	* Increment the failure counter only on periodic balance.
11895	* We do not want newidle balance, which can be very
11896	* frequent, pollute the failure counter causing
11897	* excessive cache_hot migrations and active balances.
11898	*
11899	* Similarly for migration_misfit which is not related to
11900	* load/util migration, don't pollute nr_balance_failed.
11901	*/
11902	if (idle != CPU_NEWLY_IDLE &&
11903	env.migration_type != migrate_misfit)
11904	sd->nr_balance_failed++;
11905
11906	if (need_active_balance(env: &env)) {
11907	unsigned long flags;
11908
11909	raw_spin_rq_lock_irqsave(busiest, flags);
11910
11911	/*
11912	* Don't kick the active_load_balance_cpu_stop,
11913	* if the curr task on busiest CPU can't be
11914	* moved to this_cpu:
11915	*/
11916	if (!cpumask_test_cpu(cpu: this_cpu, cpumask: busiest->curr->cpus_ptr)) {
11917	raw_spin_rq_unlock_irqrestore(rq: busiest, flags);
11918	goto out_one_pinned;
11919	}
11920
11921	/ Record that we found at least one task that could run on this_cpu /
11922	env.flags &= ~LBF_ALL_PINNED;
11923
11924	/*
11925	* ->active_balance synchronizes accesses to
11926	* ->active_balance_work. Once set, it's cleared
11927	* only after active load balance is finished.
11928	*/
11929	if (!busiest->active_balance) {
11930	busiest->active_balance = `1`;
11931	busiest->push_cpu = this_cpu;
11932	active_balance = `1`;
11933	}
11934
11935	preempt_disable();
11936	raw_spin_rq_unlock_irqrestore(rq: busiest, flags);
11937	if (active_balance) {
11938	stop_one_cpu_nowait(cpu: cpu_of(rq: busiest),
11939	fn: active_load_balance_cpu_stop, arg: busiest,
11940	work_buf: &busiest->active_balance_work);
11941	}
11942	preempt_enable();
11943	}
11944	} else {
11945	sd->nr_balance_failed = `0`;
11946	}
11947
11948	if (likely(!active_balance) \|\| need_active_balance(env: &env)) {
11949	/ We were unbalanced, so reset the balancing interval /
11950	sd->balance_interval = sd->min_interval;
11951	}
11952
11953	goto out;
11954
11955	out_balanced:
11956	/*
11957	* We reach balance although we may have faced some affinity
11958	* constraints. Clear the imbalance flag only if other tasks got
11959	* a chance to move and fix the imbalance.
11960	*/
11961	if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
11962	int *group_imbalance = &sd_parent->groups->sgc->imbalance;
11963
11964	if (*group_imbalance)
11965	*group_imbalance = `0`;
11966	}
11967
11968	out_all_pinned:
11969	/*
11970	* We reach balance because all tasks are pinned at this level so
11971	* we can't migrate them. Let the imbalance flag set so parent level
11972	* can try to migrate them.
11973	*/
11974	schedstat_inc(sd->lb_balanced[idle]);
11975
11976	sd->nr_balance_failed = `0`;
11977
11978	out_one_pinned:
11979	ld_moved = `0`;
11980
11981	/*
11982	* sched_balance_newidle() disregards balance intervals, so we could
11983	* repeatedly reach this code, which would lead to balance_interval
11984	* skyrocketing in a short amount of time. Skip the balance_interval
11985	* increase logic to avoid that.
11986	*
11987	* Similarly misfit migration which is not necessarily an indication of
11988	* the system being busy and requires lb to backoff to let it settle
11989	* down.
11990	*/
11991	if (env.idle == CPU_NEWLY_IDLE \|\|
11992	env.migration_type == migrate_misfit)
11993	goto out;
11994
11995	/ tune up the balancing interval /
11996	if ((env.flags & LBF_ALL_PINNED &&
11997	sd->balance_interval < MAX_PINNED_INTERVAL) \|\|
11998	sd->balance_interval < sd->max_interval)
11999	sd->balance_interval *= `2`;
12000	out:
12001	return ld_moved;
12002	}
12003
12004	static inline unsigned long
12005	get_sd_balance_interval(struct sched_domain sd, int* cpu_busy)
12006	{
12007	unsigned long interval = sd->balance_interval;
12008
12009	if (cpu_busy)
12010	interval *= sd->busy_factor;
12011
12012	/ scale ms to jiffies /
12013	interval = msecs_to_jiffies(m: interval);
12014
12015	/*
12016	* Reduce likelihood of busy balancing at higher domains racing with
12017	* balancing at lower domains by preventing their balancing periods
12018	* from being multiples of each other.
12019	*/
12020	if (cpu_busy)
12021	interval -= `1`;
12022
12023	interval = clamp(interval, `1UL`, max_load_balance_interval);
12024
12025	return interval;
12026	}
12027
12028	static inline void
12029	update_next_balance(struct sched_domain sd, unsigned* long *next_balance)
12030	{
12031	unsigned long interval, next;
12032
12033	/ used by idle balance, so cpu_busy = 0 /
12034	interval = get_sd_balance_interval(sd, cpu_busy: `0`);
12035	next = sd->last_balance + interval;
12036
12037	if (time_after(*next_balance, next))
12038	*next_balance = next;
12039	}
12040
12041	/*
12042	* active_load_balance_cpu_stop is run by the CPU stopper. It pushes
12043	* running tasks off the busiest CPU onto idle CPUs. It requires at
12044	* least 1 task to be running on each physical CPU where possible, and
12045	* avoids physical / logical imbalances.
12046	*/
12047	static int active_load_balance_cpu_stop(void *data)
12048	{
12049	struct rq *busiest_rq = data;
12050	int busiest_cpu = cpu_of(rq: busiest_rq);
12051	int target_cpu = busiest_rq->push_cpu;
12052	struct rq *target_rq = cpu_rq(target_cpu);
12053	struct sched_domain *sd;
12054	struct task_struct *p = NULL;
12055	struct rq_flags rf;
12056
12057	rq_lock_irq(rq: busiest_rq, rf: &rf);
12058	/*
12059	* Between queueing the stop-work and running it is a hole in which
12060	* CPUs can become inactive. We should not move tasks from or to
12061	* inactive CPUs.
12062	*/
12063	if (!cpu_active(cpu: busiest_cpu) \|\| !cpu_active(cpu: target_cpu))
12064	goto out_unlock;
12065
12066	/ Make sure the requested CPU hasn't gone down in the meantime: /
12067	if (unlikely(busiest_cpu != smp_processor_id() \|\|
12068	!busiest_rq->active_balance))
12069	goto out_unlock;
12070
12071	/ Is there any task to move? /
12072	if (busiest_rq->nr_running <= `1`)
12073	goto out_unlock;
12074
12075	/*
12076	* This condition is "impossible", if it occurs
12077	* we need to fix it. Originally reported by
12078	* Bjorn Helgaas on a 128-CPU setup.
12079	*/
12080	WARN_ON_ONCE(busiest_rq == target_rq);
12081
12082	/ Search for an sd spanning us and the target CPU. /
12083	rcu_read_lock();
12084	for_each_domain(target_cpu, sd) {
12085	if (cpumask_test_cpu(cpu: busiest_cpu, cpumask: sched_domain_span(sd)))
12086	break;
12087	}
12088
12089	if (likely(sd)) {
12090	struct lb_env env = {
12091	.sd = sd,
12092	.dst_cpu = target_cpu,
12093	.dst_rq = target_rq,
12094	.src_cpu = busiest_rq->cpu,
12095	.src_rq = busiest_rq,
12096	.idle = CPU_IDLE,
12097	.flags = LBF_ACTIVE_LB,
12098	};
12099
12100	schedstat_inc(sd->alb_count);
12101	update_rq_clock(rq: busiest_rq);
12102
12103	p = detach_one_task(env: &env);
12104	if (p) {
12105	schedstat_inc(sd->alb_pushed);
12106	/ Active balancing done, reset the failure counter. /
12107	sd->nr_balance_failed = `0`;
12108	} else {
12109	schedstat_inc(sd->alb_failed);
12110	}
12111	}
12112	rcu_read_unlock();
12113	out_unlock:
12114	busiest_rq->active_balance = `0`;
12115	rq_unlock(rq: busiest_rq, rf: &rf);
12116
12117	if (p)
12118	attach_one_task(rq: target_rq, p);
12119
12120	local_irq_enable();
12121
12122	return `0`;
12123	}
12124
12125	/*
12126	* This flag serializes load-balancing passes over large domains
12127	* (above the NODE topology level) - only one load-balancing instance
12128	* may run at a time, to reduce overhead on very large systems with
12129	* lots of CPUs and large NUMA distances.
12130	*
12131	* - Note that load-balancing passes triggered while another one
12132	* is executing are skipped and not re-tried.
12133	*
12134	* - Also note that this does not serialize rebalance_domains()
12135	* execution, as non-SD_SERIALIZE domains will still be
12136	* load-balanced in parallel.
12137	*/
12138	static atomic_t sched_balance_running = ATOMIC_INIT(`0`);
12139
12140	/*
12141	* Scale the max sched_balance_rq interval with the number of CPUs in the system.
12142	* This trades load-balance latency on larger machines for less cross talk.
12143	*/
12144	void update_max_interval(void)
12145	{
12146	max_load_balance_interval = HZ*num_online_cpus()/`10`;
12147	}
12148
12149	static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
12150	{
12151	if (cost > sd->max_newidle_lb_cost) {
12152	/*
12153	* Track max cost of a domain to make sure to not delay the
12154	* next wakeup on the CPU.
12155	*
12156	* sched_balance_newidle() bumps the cost whenever newidle
12157	* balance fails, and we don't want things to grow out of
12158	* control. Use the sysctl_sched_migration_cost as the upper
12159	* limit, plus a litle extra to avoid off by ones.
12160	*/
12161	sd->max_newidle_lb_cost =
12162	min(cost, sysctl_sched_migration_cost + `200`);
12163	sd->last_decay_max_lb_cost = jiffies;
12164	} else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
12165	/*
12166	* Decay the newidle max times by ~1% per second to ensure that
12167	* it is not outdated and the current max cost is actually
12168	* shorter.
12169	*/
12170	sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * `253`) / `256`;
12171	sd->last_decay_max_lb_cost = jiffies;
12172
12173	return true;
12174	}
12175
12176	return false;
12177	}
12178
12179	/*
12180	* It checks each scheduling domain to see if it is due to be balanced,
12181	* and initiates a balancing operation if so.
12182	*
12183	* Balancing parameters are set up in init_sched_domains.
12184	*/
12185	static void sched_balance_domains(struct rq rq, enum* cpu_idle_type idle)
12186	{
12187	int continue_balancing = `1`;
12188	int cpu = rq->cpu;
12189	int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
12190	unsigned long interval;
12191	struct sched_domain *sd;
12192	/ Earliest time when we have to do rebalance again /
12193	unsigned long next_balance = jiffies + `60`*HZ;
12194	int update_next_balance = `0`;
12195	int need_serialize, need_decay = `0`;
12196	u64 max_cost = `0`;
12197
12198	rcu_read_lock();
12199	for_each_domain(cpu, sd) {
12200	/*
12201	* Decay the newidle max times here because this is a regular
12202	* visit to all the domains.
12203	*/
12204	need_decay = update_newidle_cost(sd, cost: `0`);
12205	max_cost += sd->max_newidle_lb_cost;
12206
12207	/*
12208	* Stop the load balance at this level. There is another
12209	* CPU in our sched group which is doing load balancing more
12210	* actively.
12211	*/
12212	if (!continue_balancing) {
12213	if (need_decay)
12214	continue;
12215	break;
12216	}
12217
12218	interval = get_sd_balance_interval(sd, cpu_busy: busy);
12219
12220	need_serialize = sd->flags & SD_SERIALIZE;
12221	if (need_serialize) {
12222	if (atomic_cmpxchg_acquire(v: &sched_balance_running, old: `0`, new: `1`))
12223	goto out;
12224	}
12225
12226	if (time_after_eq(jiffies, sd->last_balance + interval)) {
12227	if (sched_balance_rq(this_cpu: cpu, this_rq: rq, sd, idle, continue_balancing: &continue_balancing)) {
12228	/*
12229	* The LBF_DST_PINNED logic could have changed
12230	* env->dst_cpu, so we can't know our idle
12231	* state even if we migrated tasks. Update it.
12232	*/
12233	idle = idle_cpu(cpu);
12234	busy = !idle && !sched_idle_cpu(cpu);
12235	}
12236	sd->last_balance = jiffies;
12237	interval = get_sd_balance_interval(sd, cpu_busy: busy);
12238	}
12239	if (need_serialize)
12240	atomic_set_release(v: &sched_balance_running, i: `0`);
12241	out:
12242	if (time_after(next_balance, sd->last_balance + interval)) {
12243	next_balance = sd->last_balance + interval;
12244	update_next_balance = `1`;
12245	}
12246	}
12247	if (need_decay) {
12248	/*
12249	* Ensure the rq-wide value also decays but keep it at a
12250	* reasonable floor to avoid funnies with rq->avg_idle.
12251	*/
12252	rq->max_idle_balance_cost =
12253	max((u64)sysctl_sched_migration_cost, max_cost);
12254	}
12255	rcu_read_unlock();
12256
12257	/*
12258	* next_balance will be updated only when there is a need.
12259	* When the cpu is attached to null domain for ex, it will not be
12260	* updated.
12261	*/
12262	if (likely(update_next_balance))
12263	rq->next_balance = next_balance;
12264
12265	}
12266
12267	static inline int on_null_domain(struct rq *rq)
12268	{
12269	return unlikely(!rcu_dereference_sched(rq->sd));
12270	}
12271
12272	#ifdef CONFIG_NO_HZ_COMMON
12273	/*
12274	* NOHZ idle load balancing (ILB) details:
12275	*
12276	* - When one of the busy CPUs notices that there may be an idle rebalancing
12277	* needed, they will kick the idle load balancer, which then does idle
12278	* load balancing for all the idle CPUs.
12279	*/
12280	static inline int find_new_ilb(void)
12281	{
12282	const struct cpumask *hk_mask;
12283	int ilb_cpu;
12284
12285	hk_mask = housekeeping_cpumask(type: HK_TYPE_KERNEL_NOISE);
12286
12287	for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
12288
12289	if (ilb_cpu == smp_processor_id())
12290	continue;
12291
12292	if (idle_cpu(cpu: ilb_cpu))
12293	return ilb_cpu;
12294	}
12295
12296	return -`1`;
12297	}
12298
12299	/*
12300	* Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
12301	* SMP function call (IPI).
12302	*
12303	* We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set
12304	* (if there is one).
12305	*/
12306	static void kick_ilb(unsigned int flags)
12307	{
12308	int ilb_cpu;
12309
12310	/*
12311	* Increase nohz.next_balance only when if full ilb is triggered but
12312	* not if we only update stats.
12313	*/
12314	if (flags & NOHZ_BALANCE_KICK)
12315	nohz.next_balance = jiffies+`1`;
12316
12317	ilb_cpu = find_new_ilb();
12318	if (ilb_cpu < `0`)
12319	return;
12320
12321	/*
12322	* Don't bother if no new NOHZ balance work items for ilb_cpu,
12323	* i.e. all bits in flags are already set in ilb_cpu.
12324	*/
12325	if ((atomic_read(nohz_flags(ilb_cpu)) & flags) == flags)
12326	return;
12327
12328	/*
12329	* Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
12330	* the first flag owns it; cleared by nohz_csd_func().
12331	*/
12332	flags = atomic_fetch_or(i: flags, nohz_flags(ilb_cpu));
12333	if (flags & NOHZ_KICK_MASK)
12334	return;
12335
12336	/*
12337	* This way we generate an IPI on the target CPU which
12338	* is idle, and the softirq performing NOHZ idle load balancing
12339	* will be run before returning from the IPI.
12340	*/
12341	smp_call_function_single_async(cpu: ilb_cpu, csd: &cpu_rq(ilb_cpu)->nohz_csd);
12342	}
12343
12344	/*
12345	* Current decision point for kicking the idle load balancer in the presence
12346	* of idle CPUs in the system.
12347	*/
12348	static void nohz_balancer_kick(struct rq *rq)
12349	{
12350	unsigned long now = jiffies;
12351	struct sched_domain_shared *sds;
12352	struct sched_domain *sd;
12353	int nr_busy, i, cpu = rq->cpu;
12354	unsigned int flags = `0`;
12355
12356	if (unlikely(rq->idle_balance))
12357	return;
12358
12359	/*
12360	* We may be recently in ticked or tickless idle mode. At the first
12361	* busy tick after returning from idle, we will update the busy stats.
12362	*/
12363	nohz_balance_exit_idle(rq);
12364
12365	/*
12366	* None are in tickless mode and hence no need for NOHZ idle load
12367	* balancing:
12368	*/
12369	if (likely(!atomic_read(&nohz.nr_cpus)))
12370	return;
12371
12372	if (READ_ONCE(nohz.has_blocked) &&
12373	time_after(now, READ_ONCE(nohz.next_blocked)))
12374	flags = NOHZ_STATS_KICK;
12375
12376	if (time_before(now, nohz.next_balance))
12377	goto out;
12378
12379	if (rq->nr_running >= `2`) {
12380	flags = NOHZ_STATS_KICK \| NOHZ_BALANCE_KICK;
12381	goto out;
12382	}
12383
12384	rcu_read_lock();
12385
12386	sd = rcu_dereference(rq->sd);
12387	if (sd) {
12388	/*
12389	* If there's a runnable CFS task and the current CPU has reduced
12390	* capacity, kick the ILB to see if there's a better CPU to run on:
12391	*/
12392	if (rq->cfs.h_nr_runnable >= `1` && check_cpu_capacity(rq, sd)) {
12393	flags = NOHZ_STATS_KICK \| NOHZ_BALANCE_KICK;
12394	goto unlock;
12395	}
12396	}
12397
12398	sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
12399	if (sd) {
12400	/*
12401	* When ASYM_PACKING; see if there's a more preferred CPU
12402	* currently idle; in which case, kick the ILB to move tasks
12403	* around.
12404	*
12405	* When balancing between cores, all the SMT siblings of the
12406	* preferred CPU must be idle.
12407	*/
12408	for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
12409	if (sched_asym(sd, dst_cpu: i, src_cpu: cpu)) {
12410	flags = NOHZ_STATS_KICK \| NOHZ_BALANCE_KICK;
12411	goto unlock;
12412	}
12413	}
12414	}
12415
12416	sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
12417	if (sd) {
12418	/*
12419	* When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
12420	* to run the misfit task on.
12421	*/
12422	if (check_misfit_status(rq)) {
12423	flags = NOHZ_STATS_KICK \| NOHZ_BALANCE_KICK;
12424	goto unlock;
12425	}
12426
12427	/*
12428	* For asymmetric systems, we do not want to nicely balance
12429	* cache use, instead we want to embrace asymmetry and only
12430	* ensure tasks have enough CPU capacity.
12431	*
12432	* Skip the LLC logic because it's not relevant in that case.
12433	*/
12434	goto unlock;
12435	}
12436
12437	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
12438	if (sds) {
12439	/*
12440	* If there is an imbalance between LLC domains (IOW we could
12441	* increase the overall cache utilization), we need a less-loaded LLC
12442	* domain to pull some load from. Likewise, we may need to spread
12443	* load within the current LLC domain (e.g. packed SMT cores but
12444	* other CPUs are idle). We can't really know from here how busy
12445	* the others are - so just get a NOHZ balance going if it looks
12446	* like this LLC domain has tasks we could move.
12447	*/
12448	nr_busy = atomic_read(v: &sds->nr_busy_cpus);
12449	if (nr_busy > `1`) {
12450	flags = NOHZ_STATS_KICK \| NOHZ_BALANCE_KICK;
12451	goto unlock;
12452	}
12453	}
12454	unlock:
12455	rcu_read_unlock();
12456	out:
12457	if (READ_ONCE(nohz.needs_update))
12458	flags \|= NOHZ_NEXT_KICK;
12459
12460	if (flags)
12461	kick_ilb(flags);
12462	}
12463
12464	static void set_cpu_sd_state_busy(int cpu)
12465	{
12466	struct sched_domain *sd;
12467
12468	rcu_read_lock();
12469	sd = rcu_dereference(per_cpu(sd_llc, cpu));
12470
12471	if (!sd \|\| !sd->nohz_idle)
12472	goto unlock;
12473	sd->nohz_idle = `0`;
12474
12475	atomic_inc(v: &sd->shared->nr_busy_cpus);
12476	unlock:
12477	rcu_read_unlock();
12478	}
12479
12480	void nohz_balance_exit_idle(struct rq *rq)
12481	{
12482	WARN_ON_ONCE(rq != this_rq());
12483
12484	if (likely(!rq->nohz_tick_stopped))
12485	return;
12486
12487	rq->nohz_tick_stopped = `0`;
12488	cpumask_clear_cpu(cpu: rq->cpu, dstp: nohz.idle_cpus_mask);
12489	atomic_dec(v: &nohz.nr_cpus);
12490
12491	set_cpu_sd_state_busy(rq->cpu);
12492	}
12493
12494	static void set_cpu_sd_state_idle(int cpu)
12495	{
12496	struct sched_domain *sd;
12497
12498	rcu_read_lock();
12499	sd = rcu_dereference(per_cpu(sd_llc, cpu));
12500
12501	if (!sd \|\| sd->nohz_idle)
12502	goto unlock;
12503	sd->nohz_idle = `1`;
12504
12505	atomic_dec(v: &sd->shared->nr_busy_cpus);
12506	unlock:
12507	rcu_read_unlock();
12508	}
12509
12510	/*
12511	* This routine will record that the CPU is going idle with tick stopped.
12512	* This info will be used in performing idle load balancing in the future.
12513	*/
12514	void nohz_balance_enter_idle(int cpu)
12515	{
12516	struct rq *rq = cpu_rq(cpu);
12517
12518	WARN_ON_ONCE(cpu != smp_processor_id());
12519
12520	/ If this CPU is going down, then nothing needs to be done: /
12521	if (!cpu_active(cpu))
12522	return;
12523
12524	/*
12525	* Can be set safely without rq->lock held
12526	* If a clear happens, it will have evaluated last additions because
12527	* rq->lock is held during the check and the clear
12528	*/
12529	rq->has_blocked_load = `1`;
12530
12531	/*
12532	* The tick is still stopped but load could have been added in the
12533	* meantime. We set the nohz.has_blocked flag to trig a check of the
12534	* *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
12535	* of nohz.has_blocked can only happen after checking the new load
12536	*/
12537	if (rq->nohz_tick_stopped)
12538	goto out;
12539
12540	/ If we're a completely isolated CPU, we don't play: /
12541	if (on_null_domain(rq))
12542	return;
12543
12544	rq->nohz_tick_stopped = `1`;
12545
12546	cpumask_set_cpu(cpu, dstp: nohz.idle_cpus_mask);
12547	atomic_inc(v: &nohz.nr_cpus);
12548
12549	/*
12550	* Ensures that if nohz_idle_balance() fails to observe our
12551	* @idle_cpus_mask store, it must observe the @has_blocked
12552	* and @needs_update stores.
12553	*/
12554	smp_mb__after_atomic();
12555
12556	set_cpu_sd_state_idle(cpu);
12557
12558	WRITE_ONCE(nohz.needs_update, `1`);
12559	out:
12560	/*
12561	* Each time a cpu enter idle, we assume that it has blocked load and
12562	* enable the periodic update of the load of idle CPUs
12563	*/
12564	WRITE_ONCE(nohz.has_blocked, `1`);
12565	}
12566
12567	static bool update_nohz_stats(struct rq *rq)
12568	{
12569	unsigned int cpu = rq->cpu;
12570
12571	if (!rq->has_blocked_load)
12572	return false;
12573
12574	if (!cpumask_test_cpu(cpu, cpumask: nohz.idle_cpus_mask))
12575	return false;
12576
12577	if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
12578	return true;
12579
12580	sched_balance_update_blocked_averages(cpu);
12581
12582	return rq->has_blocked_load;
12583	}
12584
12585	/*
12586	* Internal function that runs load balance for all idle CPUs. The load balance
12587	* can be a simple update of blocked load or a complete load balance with
12588	* tasks movement depending of flags.
12589	*/
12590	static void _nohz_idle_balance(struct rq this_rq, unsigned* int flags)
12591	{
12592	/ Earliest time when we have to do rebalance again /
12593	unsigned long now = jiffies;
12594	unsigned long next_balance = now + `60`*HZ;
12595	bool has_blocked_load = false;
12596	int update_next_balance = `0`;
12597	int this_cpu = this_rq->cpu;
12598	int balance_cpu;
12599	struct rq *rq;
12600
12601	WARN_ON_ONCE((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
12602
12603	/*
12604	* We assume there will be no idle load after this update and clear
12605	* the has_blocked flag. If a cpu enters idle in the mean time, it will
12606	* set the has_blocked flag and trigger another update of idle load.
12607	* Because a cpu that becomes idle, is added to idle_cpus_mask before
12608	* setting the flag, we are sure to not clear the state and not
12609	* check the load of an idle cpu.
12610	*
12611	* Same applies to idle_cpus_mask vs needs_update.
12612	*/
12613	if (flags & NOHZ_STATS_KICK)
12614	WRITE_ONCE(nohz.has_blocked, `0`);
12615	if (flags & NOHZ_NEXT_KICK)
12616	WRITE_ONCE(nohz.needs_update, `0`);
12617
12618	/*
12619	* Ensures that if we miss the CPU, we must see the has_blocked
12620	* store from nohz_balance_enter_idle().
12621	*/
12622	smp_mb();
12623
12624	/*
12625	* Start with the next CPU after this_cpu so we will end with this_cpu and let a
12626	* chance for other idle cpu to pull load.
12627	*/
12628	for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+`1`) {
12629	if (!idle_cpu(cpu: balance_cpu))
12630	continue;
12631
12632	/*
12633	* If this CPU gets work to do, stop the load balancing
12634	* work being done for other CPUs. Next load
12635	* balancing owner will pick it up.
12636	*/
12637	if (!idle_cpu(cpu: this_cpu) && need_resched()) {
12638	if (flags & NOHZ_STATS_KICK)
12639	has_blocked_load = true;
12640	if (flags & NOHZ_NEXT_KICK)
12641	WRITE_ONCE(nohz.needs_update, `1`);
12642	goto abort;
12643	}
12644
12645	rq = cpu_rq(balance_cpu);
12646
12647	if (flags & NOHZ_STATS_KICK)
12648	has_blocked_load \|= update_nohz_stats(rq);
12649
12650	/*
12651	* If time for next balance is due,
12652	* do the balance.
12653	*/
12654	if (time_after_eq(jiffies, rq->next_balance)) {
12655	struct rq_flags rf;
12656
12657	rq_lock_irqsave(rq, rf: &rf);
12658	update_rq_clock(rq);
12659	rq_unlock_irqrestore(rq, rf: &rf);
12660
12661	if (flags & NOHZ_BALANCE_KICK)
12662	sched_balance_domains(rq, idle: CPU_IDLE);
12663	}
12664
12665	if (time_after(next_balance, rq->next_balance)) {
12666	next_balance = rq->next_balance;
12667	update_next_balance = `1`;
12668	}
12669	}
12670
12671	/*
12672	* next_balance will be updated only when there is a need.
12673	* When the CPU is attached to null domain for ex, it will not be
12674	* updated.
12675	*/
12676	if (likely(update_next_balance))
12677	nohz.next_balance = next_balance;
12678
12679	if (flags & NOHZ_STATS_KICK)
12680	WRITE_ONCE(nohz.next_blocked,
12681	now + msecs_to_jiffies(LOAD_AVG_PERIOD));
12682
12683	abort:
12684	/ There is still blocked load, enable periodic update /
12685	if (has_blocked_load)
12686	WRITE_ONCE(nohz.has_blocked, `1`);
12687	}
12688
12689	/*
12690	* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
12691	* rebalancing for all the CPUs for whom scheduler ticks are stopped.
12692	*/
12693	static bool nohz_idle_balance(struct rq this_rq, enum* cpu_idle_type idle)
12694	{
12695	unsigned int flags = this_rq->nohz_idle_balance;
12696
12697	if (!flags)
12698	return false;
12699
12700	this_rq->nohz_idle_balance = `0`;
12701
12702	if (idle != CPU_IDLE)
12703	return false;
12704
12705	_nohz_idle_balance(this_rq, flags);
12706
12707	return true;
12708	}
12709
12710	/*
12711	* Check if we need to directly run the ILB for updating blocked load before
12712	* entering idle state. Here we run ILB directly without issuing IPIs.
12713	*
12714	* Note that when this function is called, the tick may not yet be stopped on
12715	* this CPU yet. nohz.idle_cpus_mask is updated only when tick is stopped and
12716	* cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates
12717	* don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle
12718	* entry/exit rate (usec). So it is possible that _nohz_idle_balance() is
12719	* called from this function on (this) CPU that's not yet in the mask. That's
12720	* OK because the goal of nohz_run_idle_balance() is to run ILB only for
12721	* updating the blocked load of already idle CPUs without waking up one of
12722	* those idle CPUs and outside the preempt disable / IRQ off phase of the local
12723	* cpu about to enter idle, because it can take a long time.
12724	*/
12725	void nohz_run_idle_balance(int cpu)
12726	{
12727	unsigned int flags;
12728
12729	flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
12730
12731	/*
12732	* Update the blocked load only if no SCHED_SOFTIRQ is about to happen
12733	* (i.e. NOHZ_STATS_KICK set) and will do the same.
12734	*/
12735	if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
12736	_nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK);
12737	}
12738
12739	static void nohz_newidle_balance(struct rq *this_rq)
12740	{
12741	int this_cpu = this_rq->cpu;
12742
12743	/ Will wake up very soon. No time for doing anything else/
12744	if (this_rq->avg_idle < sysctl_sched_migration_cost)
12745	return;
12746
12747	/ Don't need to update blocked load of idle CPUs/
12748	if (!READ_ONCE(nohz.has_blocked) \|\|
12749	time_before(jiffies, READ_ONCE(nohz.next_blocked)))
12750	return;
12751
12752	/*
12753	* Set the need to trigger ILB in order to update blocked load
12754	* before entering idle state.
12755	*/
12756	atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
12757	}
12758
12759	#else /* !CONFIG_NO_HZ_COMMON: */
/* Without CONFIG_NO_HZ_COMMON there is no idle load balancer to kick. */
static inline void nohz_balancer_kick(struct rq *rq) { }
12761
12762	static inline bool nohz_idle_balance(struct rq this_rq, enum* cpu_idle_type idle)
12763	{
12764	return false;
12765	}
12766
/* Without CONFIG_NO_HZ_COMMON the newly-idle path requests nothing. */
static inline void nohz_newidle_balance(struct rq *this_rq) { }
12768	#endif /* !CONFIG_NO_HZ_COMMON */
12769
12770	/*
12771	* sched_balance_newidle is called by schedule() if this_cpu is about to become
12772	* idle. Attempts to pull tasks from other CPUs.
12773	*
12774	* Returns:
12775	* < 0 - we released the lock and there are !fair tasks present
12776	* 0 - failed, no new tasks
12777	* > 0 - success, new (fair) tasks present
12778	*/
12779	static int sched_balance_newidle(struct rq this_rq, struct* rq_flags *rf)
12780	{
12781	unsigned long next_balance = jiffies + HZ;
12782	int this_cpu = this_rq->cpu;
12783	int continue_balancing = `1`;
12784	u64 t0, t1, curr_cost = `0`;
12785	struct sched_domain *sd;
12786	int pulled_task = `0`;
12787
12788	update_misfit_status(NULL, rq: this_rq);
12789
12790	/*
12791	* There is a task waiting to run. No need to search for one.
12792	* Return 0; the task will be enqueued when switching to idle.
12793	*/
12794	if (this_rq->ttwu_pending)
12795	return `0`;
12796
12797	/*
12798	* We must set idle_stamp _before_ calling sched_balance_rq()
12799	* for CPU_NEWLY_IDLE, such that we measure the this duration
12800	* as idle time.
12801	*/
12802	this_rq->idle_stamp = rq_clock(rq: this_rq);
12803
12804	/*
12805	* Do not pull tasks towards !active CPUs...
12806	*/
12807	if (!cpu_active(cpu: this_cpu))
12808	return `0`;
12809
12810	/*
12811	* This is OK, because current is on_cpu, which avoids it being picked
12812	* for load-balance and preemption/IRQs are still disabled avoiding
12813	* further scheduler activity on it and we're being very careful to
12814	* re-start the picking loop.
12815	*/
12816	rq_unpin_lock(rq: this_rq, rf);
12817
12818	rcu_read_lock();
12819	sd = rcu_dereference_check_sched_domain(this_rq->sd);
12820
12821	if (!get_rd_overloaded(rd: this_rq->rd) \|\|
12822	(sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
12823
12824	if (sd)
12825	update_next_balance(sd, next_balance: &next_balance);
12826	rcu_read_unlock();
12827
12828	goto out;
12829	}
12830	rcu_read_unlock();
12831
12832	raw_spin_rq_unlock(rq: this_rq);
12833
12834	t0 = sched_clock_cpu(cpu: this_cpu);
12835	sched_balance_update_blocked_averages(cpu: this_cpu);
12836
12837	rcu_read_lock();
12838	for_each_domain(this_cpu, sd) {
12839	u64 domain_cost;
12840
12841	update_next_balance(sd, next_balance: &next_balance);
12842
12843	if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
12844	break;
12845
12846	if (sd->flags & SD_BALANCE_NEWIDLE) {
12847
12848	pulled_task = sched_balance_rq(this_cpu, this_rq,
12849	sd, idle: CPU_NEWLY_IDLE,
12850	continue_balancing: &continue_balancing);
12851
12852	t1 = sched_clock_cpu(cpu: this_cpu);
12853	domain_cost = t1 - t0;
12854	curr_cost += domain_cost;
12855	t0 = t1;
12856
12857	/*
12858	* Failing newidle means it is not effective;
12859	* bump the cost so we end up doing less of it.
12860	*/
12861	if (!pulled_task)
12862	domain_cost = (`3` * sd->max_newidle_lb_cost) / `2`;
12863
12864	update_newidle_cost(sd, cost: domain_cost);
12865	}
12866
12867	/*
12868	* Stop searching for tasks to pull if there are
12869	* now runnable tasks on this rq.
12870	*/
12871	if (pulled_task \|\| !continue_balancing)
12872	break;
12873	}
12874	rcu_read_unlock();
12875
12876	raw_spin_rq_lock(rq: this_rq);
12877
12878	if (curr_cost > this_rq->max_idle_balance_cost)
12879	this_rq->max_idle_balance_cost = curr_cost;
12880
12881	/*
12882	* While browsing the domains, we released the rq lock, a task could
12883	* have been enqueued in the meantime. Since we're not going idle,
12884	* pretend we pulled a task.
12885	*/
12886	if (this_rq->cfs.h_nr_queued && !pulled_task)
12887	pulled_task = `1`;
12888
12889	/ Is there a task of a high priority class? /
12890	if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
12891	pulled_task = -`1`;
12892
12893	out:
12894	/ Move the next balance forward /
12895	if (time_after(this_rq->next_balance, next_balance))
12896	this_rq->next_balance = next_balance;
12897
12898	if (pulled_task)
12899	this_rq->idle_stamp = `0`;
12900	else
12901	nohz_newidle_balance(this_rq);
12902
12903	rq_repin_lock(rq: this_rq, rf);
12904
12905	return pulled_task;
12906	}
12907
12908	/*
12909	* This softirq handler is triggered via SCHED_SOFTIRQ from two places:
12910	*
12911	* - directly from the local sched_tick() for periodic load balancing
12912	*
12913	* - indirectly from a remote sched_tick() for NOHZ idle balancing
12914	* through the SMP cross-call nohz_csd_func()
12915	*/
12916	static __latent_entropy void sched_balance_softirq(void)
12917	{
12918	struct rq *this_rq = this_rq();
12919	enum cpu_idle_type idle = this_rq->idle_balance;
12920	/*
12921	* If this CPU has a pending NOHZ_BALANCE_KICK, then do the
12922	* balancing on behalf of the other idle CPUs whose ticks are
12923	* stopped. Do nohz_idle_balance before sched_balance_domains to
12924	* give the idle CPUs a chance to load balance. Else we may
12925	* load balance only within the local sched_domain hierarchy
12926	* and abort nohz_idle_balance altogether if we pull some load.
12927	*/
12928	if (nohz_idle_balance(this_rq, idle))
12929	return;
12930
12931	/ normal load balance /
12932	sched_balance_update_blocked_averages(cpu: this_rq->cpu);
12933	sched_balance_domains(rq: this_rq, idle);
12934	}
12935
12936	/*
12937	* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
12938	*/
12939	void sched_balance_trigger(struct rq *rq)
12940	{
12941	/*
12942	* Don't need to rebalance while attached to NULL domain or
12943	* runqueue CPU is not active
12944	*/
12945	if (unlikely(on_null_domain(rq) \|\| !cpu_active(cpu_of(rq))))
12946	return;
12947
12948	if (time_after_eq(jiffies, rq->next_balance))
12949	raise_softirq(nr: SCHED_SOFTIRQ);
12950
12951	nohz_balancer_kick(rq);
12952	}
12953
/*
 * Fair-class hook for a runqueue coming online: re-evaluates the sysctl
 * tunables via update_sysctl() and then calls update_runtime_enabled()
 * for @rq.
 */
static void rq_online_fair(struct rq *rq)
{
	update_sysctl();

	update_runtime_enabled(rq);
}
12960
/*
 * Fair-class hook for a runqueue going offline: re-evaluates the sysctl
 * tunables, then unthrottles and clears the offline cfs_rqs of @rq.
 */
static void rq_offline_fair(struct rq *rq)
{
	update_sysctl();

	/* Ensure any throttled groups are reachable by pick_next_task */
	unthrottle_offline_cfs_rqs(rq);

	/* Ensure that we remove rq contribution to group share: */
	clear_tg_offline_cfs_rqs(rq);
}
12971
12972	#ifdef CONFIG_SCHED_CORE
12973	static inline bool
12974	__entity_slice_used(struct sched_entity se, int* min_nr_tasks)
12975	{
12976	u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
12977	u64 slice = se->slice;
12978
12979	return (rtime * min_nr_tasks > slice);
12980	}
12981
12982	#define MIN_NR_TASKS_DURING_FORCEIDLE 2
12983	static inline void task_tick_core(struct rq rq, struct* task_struct *curr)
12984	{
12985	if (!sched_core_enabled(rq))
12986	return;
12987
12988	/*
12989	* If runqueue has only one task which used up its slice and
12990	* if the sibling is forced idle, then trigger schedule to
12991	* give forced idle task a chance.
12992	*
12993	* sched_slice() considers only this active rq and it gets the
12994	* whole slice. But during force idle, we have siblings acting
12995	* like a single runqueue and hence we need to consider runnable
12996	* tasks on this CPU and the forced idle CPU. Ideally, we should
12997	* go through the forced idle rq, but that would be a perf hit.
12998	* We can assume that the forced idle CPU has at least
12999	* MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
13000	* if we need to give up the CPU.
13001	*/
13002	if (rq->core->core_forceidle_count && rq->cfs.nr_queued == `1` &&
13003	__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
13004	resched_curr(rq);
13005	}
13006
13007	/*
13008	* se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
13009	*/
13010	static void se_fi_update(const struct sched_entity se, unsigned* int fi_seq,
13011	bool forceidle)
13012	{
13013	for_each_sched_entity(se) {
13014	struct cfs_rq *cfs_rq = cfs_rq_of(se);
13015
13016	if (forceidle) {
13017	if (cfs_rq->forceidle_seq == fi_seq)
13018	break;
13019	cfs_rq->forceidle_seq = fi_seq;
13020	}
13021
13022	cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
13023	}
13024	}
13025
13026	void task_vruntime_update(struct rq rq, struct* task_struct *p, bool in_fi)
13027	{
13028	struct sched_entity *se = &p->se;
13029
13030	if (p->sched_class != &fair_sched_class)
13031	return;
13032
13033	se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
13034	}
13035
13036	bool cfs_prio_less(const struct task_struct a, const* struct task_struct *b,
13037	bool in_fi)
13038	{
13039	struct rq *rq = task_rq(a);
13040	const struct sched_entity *sea = &a->se;
13041	const struct sched_entity *seb = &b->se;
13042	struct cfs_rq *cfs_rqa;
13043	struct cfs_rq *cfs_rqb;
13044	s64 delta;
13045
13046	WARN_ON_ONCE(task_rq(b)->core != rq->core);
13047
13048	#ifdef CONFIG_FAIR_GROUP_SCHED
13049	/*
13050	* Find an se in the hierarchy for tasks a and b, such that the se's
13051	* are immediate siblings.
13052	*/
13053	while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
13054	int sea_depth = sea->depth;
13055	int seb_depth = seb->depth;
13056
13057	if (sea_depth >= seb_depth)
13058	sea = parent_entity(sea);
13059	if (sea_depth <= seb_depth)
13060	seb = parent_entity(seb);
13061	}
13062
13063	se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
13064	se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
13065
13066	cfs_rqa = sea->cfs_rq;
13067	cfs_rqb = seb->cfs_rq;
13068	#else /* !CONFIG_FAIR_GROUP_SCHED: */
13069	cfs_rqa = &task_rq(a)->cfs;
13070	cfs_rqb = &task_rq(b)->cfs;
13071	#endif /* !CONFIG_FAIR_GROUP_SCHED */
13072
13073	/*
13074	* Find delta after normalizing se's vruntime with its cfs_rq's
13075	* min_vruntime_fi, which would have been updated in prior calls
13076	* to se_fi_update().
13077	*/
13078	delta = (s64)(sea->vruntime - seb->vruntime) +
13079	(s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
13080
13081	return delta > `0`;
13082	}
13083
13084	static int task_is_throttled_fair(struct task_struct p, int* cpu)
13085	{
13086	struct cfs_rq *cfs_rq;
13087
13088	#ifdef CONFIG_FAIR_GROUP_SCHED
13089	cfs_rq = task_group(p)->cfs_rq[cpu];
13090	#else
13091	cfs_rq = &cpu_rq(cpu)->cfs;
13092	#endif
13093	return throttled_hierarchy(cfs_rq);
13094	}
13095	#else /* !CONFIG_SCHED_CORE: */
/* Without CONFIG_SCHED_CORE the tick has no core-scheduling work to do. */
static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
13097	#endif /* !CONFIG_SCHED_CORE */
13098
13099	/*
13100	* scheduler tick hitting a task of our scheduling class.
13101	*
13102	* NOTE: This function can be called remotely by the tick offload that
13103	* goes along full dynticks. Therefore no local assumption can be made
13104	* and everything must be accessed through the @rq and @curr passed in
13105	* parameters.
13106	*/
13107	static void task_tick_fair(struct rq rq, struct* task_struct curr, int* queued)
13108	{
13109	struct cfs_rq *cfs_rq;
13110	struct sched_entity *se = &curr->se;
13111
13112	for_each_sched_entity(se) {
13113	cfs_rq = cfs_rq_of(se);
13114	entity_tick(cfs_rq, curr: se, queued);
13115	}
13116
13117	if (static_branch_unlikely(&sched_numa_balancing))
13118	task_tick_numa(rq, curr);
13119
13120	update_misfit_status(p: curr, rq);
13121	check_update_overutilized_status(task_rq(curr));
13122
13123	task_tick_core(rq, curr);
13124	}
13125
/*
 * called on fork with the child task as argument from the parent's context
 *  - child not yet on the tasklist
 *  - preemption disabled
 *
 * The only work done here is set_task_max_allowed_capacity() for @p.
 */
static void task_fork_fair(struct task_struct *p)
{
	set_task_max_allowed_capacity(p);
}
13135
13136	/*
13137	* Priority of the task has changed. Check to see if we preempt
13138	* the current task.
13139	*/
13140	static void
13141	prio_changed_fair(struct rq rq, struct* task_struct p, int* oldprio)
13142	{
13143	if (!task_on_rq_queued(p))
13144	return;
13145
13146	if (rq->cfs.nr_queued == `1`)
13147	return;
13148
13149	/*
13150	* Reschedule if we are currently running on this runqueue and
13151	* our priority decreased, or if we are not currently running on
13152	* this runqueue and our priority is higher than the current's
13153	*/
13154	if (task_current_donor(rq, p)) {
13155	if (p->prio > oldprio)
13156	resched_curr(rq);
13157	} else
13158	wakeup_preempt(rq, p, flags: `0`);
13159	}
13160
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Propagate the changes of the sched_entity across the tg tree to make it
 * visible to the root
 */
static void propagate_entity_cfs_rq(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	/*
	 * If a task gets attached to this cfs_rq and before being queued,
	 * it gets migrated to another CPU due to reasons like affinity
	 * change, make sure this cfs_rq stays on leaf cfs_rq list to have
	 * that removed load decayed or it can cause fairness problem.
	 */
	if (!cfs_rq_pelt_clock_throttled(cfs_rq))
		list_add_leaf_cfs_rq(cfs_rq);

	/* Start to propagate at parent */
	se = se->parent;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);

		update_load_avg(cfs_rq, se, UPDATE_TG);

		if (!cfs_rq_pelt_clock_throttled(cfs_rq))
			list_add_leaf_cfs_rq(cfs_rq);
	}
}
#else /* !CONFIG_FAIR_GROUP_SCHED: */
/* Without group scheduling there is no hierarchy to propagate through. */
static void propagate_entity_cfs_rq(struct sched_entity *se) { }
#endif /* !CONFIG_FAIR_GROUP_SCHED */
13194
13195	static void detach_entity_cfs_rq(struct sched_entity *se)
13196	{
13197	struct cfs_rq *cfs_rq = cfs_rq_of(se);
13198
13199	/*
13200	* In case the task sched_avg hasn't been attached:
13201	* - A forked task which hasn't been woken up by wake_up_new_task().
13202	* - A task which has been woken up by try_to_wake_up() but is
13203	* waiting for actually being woken up by sched_ttwu_pending().
13204	*/
13205	if (!se->avg.last_update_time)
13206	return;
13207
13208	/ Catch up with the cfs_rq and remove our load when we leave /
13209	update_load_avg(cfs_rq, se, flags: `0`);
13210	detach_entity_load_avg(cfs_rq, se);
13211	update_tg_load_avg(cfs_rq);
13212	propagate_entity_cfs_rq(se);
13213	}
13214
13215	static void attach_entity_cfs_rq(struct sched_entity *se)
13216	{
13217	struct cfs_rq *cfs_rq = cfs_rq_of(se);
13218
13219	/ Synchronize entity with its cfs_rq /
13220	update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? `0` : SKIP_AGE_LOAD);
13221	attach_entity_load_avg(cfs_rq, se);
13222	update_tg_load_avg(cfs_rq);
13223	propagate_entity_cfs_rq(se);
13224	}
13225
13226	static void detach_task_cfs_rq(struct task_struct *p)
13227	{
13228	struct sched_entity *se = &p->se;
13229
13230	detach_entity_cfs_rq(se);
13231	}
13232
13233	static void attach_task_cfs_rq(struct task_struct *p)
13234	{
13235	struct sched_entity *se = &p->se;
13236
13237	attach_entity_cfs_rq(se);
13238	}
13239
/*
 * sched_class::switched_from hook for the fair class (see the
 * DEFINE_SCHED_CLASS(fair) table): detach @p from its cfs_rq.
 * @rq is unused here.
 */
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
	detach_task_cfs_rq(p);
}
13244
13245	static void switched_to_fair(struct rq rq, struct* task_struct *p)
13246	{
13247	WARN_ON_ONCE(p->se.sched_delayed);
13248
13249	attach_task_cfs_rq(p);
13250
13251	set_task_max_allowed_capacity(p);
13252
13253	if (task_on_rq_queued(p)) {
13254	/*
13255	* We were most likely switched from sched_rt, so
13256	* kick off the schedule if running, otherwise just see
13257	* if we can still preempt the current task.
13258	*/
13259	if (task_current_donor(rq, p))
13260	resched_curr(rq);
13261	else
13262	wakeup_preempt(rq, p, flags: `0`);
13263	}
13264	}
13265
13266	static void __set_next_task_fair(struct rq rq, struct* task_struct *p, bool first)
13267	{
13268	struct sched_entity *se = &p->se;
13269
13270	if (task_on_rq_queued(p)) {
13271	/*
13272	* Move the next running task to the front of the list, so our
13273	* cfs_tasks list becomes MRU one.
13274	*/
13275	list_move(list: &se->group_node, head: &rq->cfs_tasks);
13276	}
13277	if (!first)
13278	return;
13279
13280	WARN_ON_ONCE(se->sched_delayed);
13281
13282	if (hrtick_enabled_fair(rq))
13283	hrtick_start_fair(rq, p);
13284
13285	update_misfit_status(p, rq);
13286	sched_fair_update_stop_tick(rq, p);
13287	}
13288
13289	/*
13290	* Account for a task changing its policy or group.
13291	*
13292	* This routine is mostly called to set cfs_rq->curr field when a task
13293	* migrates between groups/classes.
13294	*/
13295	static void set_next_task_fair(struct rq rq, struct* task_struct *p, bool first)
13296	{
13297	struct sched_entity *se = &p->se;
13298
13299	for_each_sched_entity(se) {
13300	struct cfs_rq *cfs_rq = cfs_rq_of(se);
13301
13302	set_next_entity(cfs_rq, se);
13303	/ ensure bandwidth has been allocated on our new cfs_rq /
13304	account_cfs_rq_runtime(cfs_rq, delta_exec: `0`);
13305	}
13306
13307	__set_next_task_fair(rq, p, first);
13308	}
13309
13310	void init_cfs_rq(struct cfs_rq *cfs_rq)
13311	{
13312	cfs_rq->tasks_timeline = RB_ROOT_CACHED;
13313	cfs_rq->min_vruntime = (u64)(-(`1LL` << `20`));
13314	raw_spin_lock_init(&cfs_rq->removed.lock);
13315	}
13316
13317	#ifdef CONFIG_FAIR_GROUP_SCHED
13318	static void task_change_group_fair(struct task_struct *p)
13319	{
13320	/*
13321	* We couldn't detach or attach a forked task which
13322	* hasn't been woken up by wake_up_new_task().
13323	*/
13324	if (READ_ONCE(p->__state) == TASK_NEW)
13325	return;
13326
13327	detach_task_cfs_rq(p);
13328
13329	/ Tell se's cfs_rq has been changed -- migrated /
13330	p->se.avg.last_update_time = `0`;
13331	set_task_rq(p, cpu: task_cpu(p));
13332	attach_task_cfs_rq(p);
13333	}
13334
13335	void free_fair_sched_group(struct task_group *tg)
13336	{
13337	int i;
13338
13339	for_each_possible_cpu(i) {
13340	if (tg->cfs_rq)
13341	kfree(objp: tg->cfs_rq[i]);
13342	if (tg->se)
13343	kfree(objp: tg->se[i]);
13344	}
13345
13346	kfree(objp: tg->cfs_rq);
13347	kfree(objp: tg->se);
13348	}
13349
13350	int alloc_fair_sched_group(struct task_group tg, struct* task_group *parent)
13351	{
13352	struct sched_entity *se;
13353	struct cfs_rq *cfs_rq;
13354	int i;
13355
13356	tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
13357	if (!tg->cfs_rq)
13358	goto err;
13359	tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
13360	if (!tg->se)
13361	goto err;
13362
13363	tg->shares = NICE_0_LOAD;
13364
13365	init_cfs_bandwidth(cfs_b: tg_cfs_bandwidth(tg), parent: tg_cfs_bandwidth(tg: parent));
13366
13367	for_each_possible_cpu(i) {
13368	cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
13369	GFP_KERNEL, cpu_to_node(i));
13370	if (!cfs_rq)
13371	goto err;
13372
13373	se = kzalloc_node(sizeof(struct sched_entity_stats),
13374	GFP_KERNEL, cpu_to_node(i));
13375	if (!se)
13376	goto err_free_rq;
13377
13378	init_cfs_rq(cfs_rq);
13379	init_tg_cfs_entry(tg, cfs_rq, se, cpu: i, parent: parent->se[i]);
13380	init_entity_runnable_average(se);
13381	}
13382
13383	return `1`;
13384
13385	err_free_rq:
13386	kfree(objp: cfs_rq);
13387	err:
13388	return `0`;
13389	}
13390
13391	void online_fair_sched_group(struct task_group *tg)
13392	{
13393	struct sched_entity *se;
13394	struct rq_flags rf;
13395	struct rq *rq;
13396	int i;
13397
13398	for_each_possible_cpu(i) {
13399	rq = cpu_rq(i);
13400	se = tg->se[i];
13401	rq_lock_irq(rq, rf: &rf);
13402	update_rq_clock(rq);
13403	attach_entity_cfs_rq(se);
13404	sync_throttle(tg, cpu: i);
13405	rq_unlock_irq(rq, rf: &rf);
13406	}
13407	}
13408
13409	void unregister_fair_sched_group(struct task_group *tg)
13410	{
13411	int cpu;
13412
13413	destroy_cfs_bandwidth(cfs_b: tg_cfs_bandwidth(tg));
13414
13415	for_each_possible_cpu(cpu) {
13416	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
13417	struct sched_entity *se = tg->se[cpu];
13418	struct rq *rq = cpu_rq(cpu);
13419
13420	if (se) {
13421	if (se->sched_delayed) {
13422	guard(rq_lock_irqsave)(l: rq);
13423	if (se->sched_delayed) {
13424	update_rq_clock(rq);
13425	dequeue_entities(rq, se, DEQUEUE_SLEEP \| DEQUEUE_DELAYED);
13426	}
13427	list_del_leaf_cfs_rq(cfs_rq);
13428	}
13429	remove_entity_load_avg(se);
13430	}
13431
13432	/*
13433	* Only empty task groups can be destroyed; so we can speculatively
13434	* check on_list without danger of it being re-added.
13435	*/
13436	if (cfs_rq->on_list) {
13437	guard(rq_lock_irqsave)(l: rq);
13438	list_del_leaf_cfs_rq(cfs_rq);
13439	}
13440	}
13441	}
13442
13443	void init_tg_cfs_entry(struct task_group tg, struct* cfs_rq *cfs_rq,
13444	struct sched_entity se, int* cpu,
13445	struct sched_entity *parent)
13446	{
13447	struct rq *rq = cpu_rq(cpu);
13448
13449	cfs_rq->tg = tg;
13450	cfs_rq->rq = rq;
13451	init_cfs_rq_runtime(cfs_rq);
13452
13453	tg->cfs_rq[cpu] = cfs_rq;
13454	tg->se[cpu] = se;
13455
13456	/ se could be NULL for root_task_group /
13457	if (!se)
13458	return;
13459
13460	if (!parent) {
13461	se->cfs_rq = &rq->cfs;
13462	se->depth = `0`;
13463	} else {
13464	se->cfs_rq = parent->my_q;
13465	se->depth = parent->depth + `1`;
13466	}
13467
13468	se->my_q = cfs_rq;
13469	/ guarantee group entities always have weight /
13470	update_load_set(lw: &se->load, NICE_0_LOAD);
13471	se->parent = parent;
13472	}
13473
13474	static DEFINE_MUTEX(shares_mutex);
13475
13476	static int __sched_group_set_shares(struct task_group tg, unsigned* long shares)
13477	{
13478	int i;
13479
13480	lockdep_assert_held(&shares_mutex);
13481
13482	/*
13483	* We can't change the weight of the root cgroup.
13484	*/
13485	if (!tg->se[`0`])
13486	return -EINVAL;
13487
13488	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
13489
13490	if (tg->shares == shares)
13491	return `0`;
13492
13493	tg->shares = shares;
13494	for_each_possible_cpu(i) {
13495	struct rq *rq = cpu_rq(i);
13496	struct sched_entity *se = tg->se[i];
13497	struct rq_flags rf;
13498
13499	/ Propagate contribution to hierarchy /
13500	rq_lock_irqsave(rq, rf: &rf);
13501	update_rq_clock(rq);
13502	for_each_sched_entity(se) {
13503	update_load_avg(cfs_rq: cfs_rq_of(se), se, UPDATE_TG);
13504	update_cfs_group(se);
13505	}
13506	rq_unlock_irqrestore(rq, rf: &rf);
13507	}
13508
13509	return `0`;
13510	}
13511
13512	int sched_group_set_shares(struct task_group tg, unsigned* long shares)
13513	{
13514	int ret;
13515
13516	mutex_lock(lock: &shares_mutex);
13517	if (tg_is_idle(tg))
13518	ret = -EINVAL;
13519	else
13520	ret = __sched_group_set_shares(tg, shares);
13521	mutex_unlock(lock: &shares_mutex);
13522
13523	return ret;
13524	}
13525
13526	int sched_group_set_idle(struct task_group tg, long* idle)
13527	{
13528	int i;
13529
13530	if (tg == &root_task_group)
13531	return -EINVAL;
13532
13533	if (idle < `0` \|\| idle > `1`)
13534	return -EINVAL;
13535
13536	mutex_lock(lock: &shares_mutex);
13537
13538	if (tg->idle == idle) {
13539	mutex_unlock(lock: &shares_mutex);
13540	return `0`;
13541	}
13542
13543	tg->idle = idle;
13544
13545	for_each_possible_cpu(i) {
13546	struct rq *rq = cpu_rq(i);
13547	struct sched_entity *se = tg->se[i];
13548	struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
13549	bool was_idle = cfs_rq_is_idle(cfs_rq: grp_cfs_rq);
13550	long idle_task_delta;
13551	struct rq_flags rf;
13552
13553	rq_lock_irqsave(rq, rf: &rf);
13554
13555	grp_cfs_rq->idle = idle;
13556	if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
13557	goto next_cpu;
13558
13559	idle_task_delta = grp_cfs_rq->h_nr_queued -
13560	grp_cfs_rq->h_nr_idle;
13561	if (!cfs_rq_is_idle(cfs_rq: grp_cfs_rq))
13562	idle_task_delta *= -`1`;
13563
13564	for_each_sched_entity(se) {
13565	struct cfs_rq *cfs_rq = cfs_rq_of(se);
13566
13567	if (!se->on_rq)
13568	break;
13569
13570	cfs_rq->h_nr_idle += idle_task_delta;
13571
13572	/ Already accounted at parent level and above. /
13573	if (cfs_rq_is_idle(cfs_rq))
13574	break;
13575	}
13576
13577	next_cpu:
13578	rq_unlock_irqrestore(rq, rf: &rf);
13579	}
13580
13581	/ Idle groups have minimum weight. /
13582	if (tg_is_idle(tg))
13583	__sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
13584	else
13585	__sched_group_set_shares(tg, NICE_0_LOAD);
13586
13587	mutex_unlock(lock: &shares_mutex);
13588	return `0`;
13589	}
13590
13591	#endif /* CONFIG_FAIR_GROUP_SCHED */
13592
13593
13594	static unsigned int get_rr_interval_fair(struct rq rq, struct* task_struct *task)
13595	{
13596	struct sched_entity *se = &task->se;
13597	unsigned int rr_interval = `0`;
13598
13599	/*
13600	* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
13601	* idle runqueue:
13602	*/
13603	if (rq->cfs.load.weight)
13604	rr_interval = NS_TO_JIFFIES(se->slice);
13605
13606	return rr_interval;
13607	}
13608
13609	/*
13610	* All the scheduling class methods:
13611	*/
13612	DEFINE_SCHED_CLASS(fair) = {
13613
13614	.enqueue_task = enqueue_task_fair,
13615	.dequeue_task = dequeue_task_fair,
13616	.yield_task = yield_task_fair,
13617	.yield_to_task = yield_to_task_fair,
13618
13619	.wakeup_preempt = check_preempt_wakeup_fair,
13620
13621	.pick_task = pick_task_fair,
13622	.pick_next_task = __pick_next_task_fair,
13623	.put_prev_task = put_prev_task_fair,
13624	.set_next_task = set_next_task_fair,
13625
13626	.balance = balance_fair,
13627	.select_task_rq = select_task_rq_fair,
13628	.migrate_task_rq = migrate_task_rq_fair,
13629
13630	.rq_online = rq_online_fair,
13631	.rq_offline = rq_offline_fair,
13632
13633	.task_dead = task_dead_fair,
13634	.set_cpus_allowed = set_cpus_allowed_fair,
13635
13636	.task_tick = task_tick_fair,
13637	.task_fork = task_fork_fair,
13638
13639	.reweight_task = reweight_task_fair,
13640	.prio_changed = prio_changed_fair,
13641	.switched_from = switched_from_fair,
13642	.switched_to = switched_to_fair,
13643
13644	.get_rr_interval = get_rr_interval_fair,
13645
13646	.update_curr = update_curr_fair,
13647
13648	#ifdef CONFIG_FAIR_GROUP_SCHED
13649	.task_change_group = task_change_group_fair,
13650	#endif
13651
13652	#ifdef CONFIG_SCHED_CORE
13653	.task_is_throttled = task_is_throttled_fair,
13654	#endif
13655
13656	#ifdef CONFIG_UCLAMP_TASK
13657	.uclamp_enabled = `1`,
13658	#endif
13659	};
13660
13661	void print_cfs_stats(struct seq_file m, int* cpu)
13662	{
13663	struct cfs_rq cfs_rq, pos;
13664
13665	rcu_read_lock();
13666	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
13667	print_cfs_rq(m, cpu, cfs_rq);
13668	rcu_read_unlock();
13669	}
13670
13671	#ifdef CONFIG_NUMA_BALANCING
13672	void show_numa_stats(struct task_struct p, struct* seq_file *m)
13673	{
13674	int node;
13675	unsigned long tsf = `0`, tpf = `0`, gsf = `0`, gpf = `0`;
13676	struct numa_group *ng;
13677
13678	rcu_read_lock();
13679	ng = rcu_dereference(p->numa_group);
13680	for_each_online_node(node) {
13681	if (p->numa_faults) {
13682	tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, `0`)];
13683	tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, `1`)];
13684	}
13685	if (ng) {
13686	gsf = ng->faults[task_faults_idx(NUMA_MEM, node, `0`)],
13687	gpf = ng->faults[task_faults_idx(NUMA_MEM, node, `1`)];
13688	}
13689	print_numa_stats(m, node, tsf, tpf, gsf, gpf);
13690	}
13691	rcu_read_unlock();
13692	}
13693	#endif /* CONFIG_NUMA_BALANCING */
13694
13695	__init void init_sched_fair_class(void)
13696	{
13697	int i;
13698
13699	for_each_possible_cpu(i) {
13700	zalloc_cpumask_var_node(mask: &per_cpu(load_balance_mask, i), GFP_KERNEL, node: cpu_to_node(cpu: i));
13701	zalloc_cpumask_var_node(mask: &per_cpu(select_rq_mask, i), GFP_KERNEL, node: cpu_to_node(cpu: i));
13702	zalloc_cpumask_var_node(mask: &per_cpu(should_we_balance_tmpmask, i),
13703	GFP_KERNEL, node: cpu_to_node(cpu: i));
13704
13705	#ifdef CONFIG_CFS_BANDWIDTH
13706	INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
13707	INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
13708	#endif
13709	}
13710
13711	open_softirq(nr: SCHED_SOFTIRQ, action: sched_balance_softirq);
13712
13713	#ifdef CONFIG_NO_HZ_COMMON
13714	nohz.next_balance = jiffies;
13715	nohz.next_blocked = jiffies;
13716	zalloc_cpumask_var(mask: &nohz.idle_cpus_mask, GFP_NOWAIT);
13717	#endif
13718	}
13719

Browse the source code of Linux/kernel/sched/fair.c