cpupri.c source code [Linux/kernel/sched/cpupri.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* kernel/sched/cpupri.c
4	*
5	* CPU priority management
6	*
7	* Copyright (C) 2007-2008 Novell
8	*
9	* Author: Gregory Haskins <ghaskins@novell.com>
10	*
11	* This code tracks the priority of each CPU so that global migration
12	* decisions are easy to calculate. Each CPU can be in a state as follows:
13	*
14	* (INVALID), NORMAL, RT1, ... RT99, HIGHER
15	*
16	* going from the lowest priority to the highest. CPUs in the INVALID state
17	* are not eligible for routing. The system maintains this state with
18	* a 2 dimensional bitmap (the first for priority class, the second for CPUs
19	* in that class). Therefore a typical application without affinity
20	* restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
21	* searches). For tasks with affinity restrictions, the algorithm has a
22	* worst case complexity of O(min(101, nr_domcpus)), though the scenario that
23	* yields the worst case search is fairly contrived.
24	*/
25	#include "sched.h"
26
27	/*
28	* p->rt_priority p->prio newpri cpupri
29	*
30	* -1 -1 (CPUPRI_INVALID)
31	*
32	* 99 0 (CPUPRI_NORMAL)
33	*
34	* 1 98 98 1
35	* ...
36	* 49 50 50 49
37	* 50 49 49 50
38	* ...
39	* 99 0 0 99
40	*
41	* 100 100 (CPUPRI_HIGHER)
42	*/
43	static int convert_prio(int prio)
44	{
45	int cpupri;
46
47	switch (prio) {
48	case CPUPRI_INVALID:
49	cpupri = CPUPRI_INVALID; / -1 /
50	break;
51
52	case `0` ... `98`:
53	cpupri = MAX_RT_PRIO-`1` - prio; / 1 ... 99 /
54	break;
55
56	case MAX_RT_PRIO-`1`:
57	cpupri = CPUPRI_NORMAL; / 0 /
58	break;
59
60	case MAX_RT_PRIO:
61	cpupri = CPUPRI_HIGHER; / 100 /
62	break;
63	}
64
65	return cpupri;
66	}
67
68	static inline int __cpupri_find(struct cpupri cp, struct* task_struct *p,
69	struct cpumask lowest_mask, int* idx)
70	{
71	struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
72	int skip = `0`;
73
74	if (!atomic_read(v: &(vec)->count))
75	skip = `1`;
76	/*
77	* When looking at the vector, we need to read the counter,
78	* do a memory barrier, then read the mask.
79	*
80	* Note: This is still all racy, but we can deal with it.
81	* Ideally, we only want to look at masks that are set.
82	*
83	* If a mask is not set, then the only thing wrong is that we
84	* did a little more work than necessary.
85	*
86	* If we read a zero count but the mask is set, because of the
87	* memory barriers, that can only happen when the highest prio
88	* task for a run queue has left the run queue, in which case,
89	* it will be followed by a pull. If the task we are processing
90	* fails to find a proper place to go, that pull request will
91	* pull this task if the run queue is running at a lower
92	* priority.
93	*/
94	smp_rmb();
95
96	/ Need to do the rmb for every iteration /
97	if (skip)
98	return `0`;
99
100	if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
101	return `0`;
102
103	if (lowest_mask) {
104	cpumask_and(dstp: lowest_mask, src1p: &p->cpus_mask, src2p: vec->mask);
105	cpumask_and(dstp: lowest_mask, src1p: lowest_mask, cpu_active_mask);
106
107	/*
108	* We have to ensure that we have at least one bit
109	* still set in the array, since the map could have
110	* been concurrently emptied between the first and
111	* second reads of vec->mask. If we hit this
112	* condition, simply act as though we never hit this
113	* priority level and continue on.
114	*/
115	if (cpumask_empty(srcp: lowest_mask))
116	return `0`;
117	}
118
119	return `1`;
120	}
121
/**
 * cpupri_find - find the best (lowest-pri) CPU in the system
 * @cp: The cpupri context
 * @p: The task
 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
 *
 * Thin wrapper around cpupri_find_fitness() with no fitness callback.
 *
 * Return: (int)bool - CPUs were found
 */
int cpupri_find(struct cpupri *cp, struct task_struct *p,
		struct cpumask *lowest_mask)
{
	return cpupri_find_fitness(cp, p, lowest_mask, NULL);
}
127
128	/**
129	* cpupri_find_fitness - find the best (lowest-pri) CPU in the system
130	* @cp: The cpupri context
131	* @p: The task
132	* @lowest_mask: A mask to fill in with selected CPUs (or NULL)
133	* @fitness_fn: A pointer to a function to do custom checks whether the CPU
134	* fits a specific criteria so that we only return those CPUs.
135	*
136	* Note: This function returns the recommended CPUs as calculated during the
137	* current invocation. By the time the call returns, the CPUs may have in
138	* fact changed priorities any number of times. While not ideal, it is not
139	* an issue of correctness since the normal rebalancer logic will correct
140	* any discrepancies created by racing against the uncertainty of the current
141	* priority configuration.
142	*
143	* Return: (int)bool - CPUs were found
144	*/
145	int cpupri_find_fitness(struct cpupri cp, struct* task_struct *p,
146	struct cpumask *lowest_mask,
147	bool (fitness_fn)(struct* task_struct p, int* cpu))
148	{
149	int task_pri = convert_prio(prio: p->prio);
150	int idx, cpu;
151
152	WARN_ON_ONCE(task_pri >= CPUPRI_NR_PRIORITIES);
153
154	for (idx = `0`; idx < task_pri; idx++) {
155
156	if (!__cpupri_find(cp, p, lowest_mask, idx))
157	continue;
158
159	if (!lowest_mask \|\| !fitness_fn)
160	return `1`;
161
162	/ Ensure the capacity of the CPUs fit the task /
163	for_each_cpu(cpu, lowest_mask) {
164	if (!fitness_fn(p, cpu))
165	cpumask_clear_cpu(cpu, dstp: lowest_mask);
166	}
167
168	/*
169	* If no CPU at the current priority can fit the task
170	* continue looking
171	*/
172	if (cpumask_empty(srcp: lowest_mask))
173	continue;
174
175	return `1`;
176	}
177
178	/*
179	* If we failed to find a fitting lowest_mask, kick off a new search
180	* but without taking into account any fitness criteria this time.
181	*
182	* This rule favours honouring priority over fitting the task in the
183	* correct CPU (Capacity Awareness being the only user now).
184	* The idea is that if a higher priority task can run, then it should
185	* run even if this ends up being on unfitting CPU.
186	*
187	* The cost of this trade-off is not entirely clear and will probably
188	* be good for some workloads and bad for others.
189	*
190	* The main idea here is that if some CPUs were over-committed, we try
191	* to spread which is what the scheduler traditionally did. Sys admins
192	* must do proper RT planning to avoid overloading the system if they
193	* really care.
194	*/
195	if (fitness_fn)
196	return cpupri_find(cp, p, lowest_mask);
197
198	return `0`;
199	}
200
201	/**
202	* cpupri_set - update the CPU priority setting
203	* @cp: The cpupri context
204	* @cpu: The target CPU
205	* @newpri: The priority (INVALID,NORMAL,RT1-RT99,HIGHER) to assign to this CPU
206	*
207	* Note: Assumes cpu_rq(cpu)->lock is locked
208	*
209	* Returns: (void)
210	*/
211	void cpupri_set(struct cpupri cp, int* cpu, int newpri)
212	{
213	int *currpri = &cp->cpu_to_pri[cpu];
214	int oldpri = *currpri;
215	int do_mb = `0`;
216
217	newpri = convert_prio(prio: newpri);
218
219	BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
220
221	if (newpri == oldpri)
222	return;
223
224	/*
225	* If the CPU was currently mapped to a different value, we
226	* need to map it to the new value then remove the old value.
227	* Note, we must add the new value first, otherwise we risk the
228	* cpu being missed by the priority loop in cpupri_find.
229	*/
230	if (likely(newpri != CPUPRI_INVALID)) {
231	struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
232
233	cpumask_set_cpu(cpu, dstp: vec->mask);
234	/*
235	* When adding a new vector, we update the mask first,
236	* do a write memory barrier, and then update the count, to
237	* make sure the vector is visible when count is set.
238	*/
239	smp_mb__before_atomic();
240	atomic_inc(v: &(vec)->count);
241	do_mb = `1`;
242	}
243	if (likely(oldpri != CPUPRI_INVALID)) {
244	struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
245
246	/*
247	* Because the order of modification of the vec->count
248	* is important, we must make sure that the update
249	* of the new prio is seen before we decrement the
250	* old prio. This makes sure that the loop sees
251	* one or the other when we raise the priority of
252	* the run queue. We don't care about when we lower the
253	* priority, as that will trigger an rt pull anyway.
254	*
255	* We only need to do a memory barrier if we updated
256	* the new priority vec.
257	*/
258	if (do_mb)
259	smp_mb__after_atomic();
260
261	/*
262	* When removing from the vector, we decrement the counter first
263	* do a memory barrier and then clear the mask.
264	*/
265	atomic_dec(v: &(vec)->count);
266	smp_mb__after_atomic();
267	cpumask_clear_cpu(cpu, dstp: vec->mask);
268	}
269
270	*currpri = newpri;
271	}
272
273	/**
274	* cpupri_init - initialize the cpupri structure
275	* @cp: The cpupri context
276	*
277	* Return: -ENOMEM on memory allocation failure.
278	*/
279	int cpupri_init(struct cpupri *cp)
280	{
281	int i;
282
283	for (i = `0`; i < CPUPRI_NR_PRIORITIES; i++) {
284	struct cpupri_vec *vec = &cp->pri_to_cpu[i];
285
286	atomic_set(v: &vec->count, i: `0`);
287	if (!zalloc_cpumask_var(mask: &vec->mask, GFP_KERNEL))
288	goto cleanup;
289	}
290
291	cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
292	if (!cp->cpu_to_pri)
293	goto cleanup;
294
295	for_each_possible_cpu(i)
296	cp->cpu_to_pri[i] = CPUPRI_INVALID;
297
298	return `0`;
299
300	cleanup:
301	for (i--; i >= `0`; i--)
302	free_cpumask_var(mask: cp->pri_to_cpu[i].mask);
303	return -ENOMEM;
304	}
305
306	/**
307	* cpupri_cleanup - clean up the cpupri structure
308	* @cp: The cpupri context
309	*/
310	void cpupri_cleanup(struct cpupri *cp)
311	{
312	int i;
313
314	kfree(objp: cp->cpu_to_pri);
315	for (i = `0`; i < CPUPRI_NR_PRIORITIES; i++)
316	free_cpumask_var(mask: cp->pri_to_cpu[i].mask);
317	}
318

Browse the source code of Linux/kernel/sched/cpupri.c