loadavg.c source code [Linux/kernel/sched/loadavg.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* kernel/sched/loadavg.c
4	*
5	* This file contains the magic bits required to compute the global loadavg
6	* figure. It's a silly number but people think it's important. We go through
7	* great pains to make it work on big machines and tickless kernels.
8	*/
9	#include <linux/sched/nohz.h>
10	#include "sched.h"
11
12	/*
13	* Global load-average calculations
14	*
15	* We take a distributed and async approach to calculating the global load-avg
16	* in order to minimize overhead.
17	*
18	* The global load average is an exponentially decaying average of nr_running +
19	* nr_uninterruptible.
20	*
21	* Once every LOAD_FREQ:
22	*
23	* nr_active = 0;
24	* for_each_possible_cpu(cpu)
25	* nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
26	*
27	* avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
28	*
29	* Due to a number of reasons the above turns into the mess below:
30	*
31	* - for_each_possible_cpu() is prohibitively expensive on machines with
32	* serious number of CPUs, therefore we need to take a distributed approach
33	* to calculating nr_active.
34	*
35	*      \Sum_i x_i(t) = \Sum_i { x_i(t) - x_i(t_0) }    | x_i(t_0) := 0
36	*                    = \Sum_i { \Sum_{j=1} x_i(t_j) - x_i(t_{j-1}) }
37	*
38	* So assuming nr_active := 0 when we start out -- true per definition, we
39	* can simply take per-CPU deltas and fold those into a global accumulate
40	* to obtain the same result. See calc_load_fold_active().
41	*
42	* Furthermore, in order to avoid synchronizing all per-CPU delta folding
43	* across the machine, we assume 10 ticks is sufficient time for every
44	* CPU to have completed this task.
45	*
46	* This places an upper-bound on the IRQ-off latency of the machine. Then
47	* again, being late doesn't lose the delta, just wrecks the sample.
48	*
49	* - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
50	* this would add another cross-CPU cache-line miss and atomic operation
51	* to the wakeup path. Instead we increment on whatever CPU the task ran
52	* when it went into uninterruptible state and decrement on whatever CPU
53	* did the wakeup. This means that only the sum of nr_uninterruptible over
54	* all CPUs yields the correct result.
55	*
56	* This covers the NO_HZ=n code, for extra head-aches, see the comment below.
57	*/
58
59	/ Variables and functions for calc_load /
60	atomic_long_t calc_load_tasks;
61	unsigned long calc_load_update;
62	unsigned long avenrun[`3`];
63	EXPORT_SYMBOL(avenrun); / should be removed /
64
65	/**
66	* get_avenrun - get the load average array
67	* @loads: pointer to destination load array
68	* @offset: offset to add
69	* @shift: shift count to shift the result left
70	*
71	* These values are estimates at best, so no need for locking.
72	*/
73	void get_avenrun(unsigned long loads, unsigned* long offset, int shift)
74	{
75	loads[`0`] = (avenrun[`0`] + offset) << shift;
76	loads[`1`] = (avenrun[`1`] + offset) << shift;
77	loads[`2`] = (avenrun[`2`] + offset) << shift;
78	}
79
80	long calc_load_fold_active(struct rq this_rq, long* adjust)
81	{
82	long nr_active, delta = `0`;
83
84	nr_active = this_rq->nr_running - adjust;
85	nr_active += (long)this_rq->nr_uninterruptible;
86
87	if (nr_active != this_rq->calc_load_active) {
88	delta = nr_active - this_rq->calc_load_active;
89	this_rq->calc_load_active = nr_active;
90	}
91
92	return delta;
93	}
94
/**
 * fixed_power_int - compute: x^n, in O(log n) time
 *
 * @x:         base of the power
 * @frac_bits: fractional bits of @x
 * @n:         power to raise @x to.
 *
 * By exploiting the relation between the definition of the natural power
 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
 * (where: n_i \elem {0, 1}, the binary vector representing n),
 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
 * of course trivially computable in O(log_2 n), the length of our binary
 * vector.
 *
 * Every intermediate product is rounded to nearest by adding half of the
 * least significant fractional unit before shifting back down.
 *
 * Returns x^n in the same fixed-point format as @x; for @n == 0 that is
 * 1.0, i.e. 1 << @frac_bits.
 */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	/*
	 * Rounding term. With no fractional bits there is nothing to round,
	 * and "frac_bits - 1" would wrap to an out-of-range shift count.
	 */
	const unsigned long half = frac_bits ? 1UL << (frac_bits - 1) : 0;
	unsigned long result = 1UL << frac_bits;

	if (n) {
		for (;;) {
			/* Multiply in the current square when this bit of n is set. */
			if (n & 1) {
				result *= x;
				result += half;
				result >>= frac_bits;
			}
			n >>= 1;
			if (!n)
				break;
			/* Square the base for the next bit of n. */
			x *= x;
			x += half;
			x >>= frac_bits;
		}
	}

	return result;
}
133
134	/*
135	* a1 = a0 * e + a * (1 - e)
136	*
137	* a2 = a1 * e + a * (1 - e)
138	* = (a0 * e + a * (1 - e)) * e + a * (1 - e)
139	* = a0 * e^2 + a * (1 - e) * (1 + e)
140	*
141	* a3 = a2 * e + a * (1 - e)
142	* = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
143	* = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
144	*
145	* ...
146	*
147	* an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
148	* = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
149	* = a0 * e^n + a * (1 - e^n)
150	*
151	* [1] application of the geometric series:
152	*
153	* n 1 - x^(n+1)
154	* S_n := \Sum x^i = -------------
155	* i=0 1 - x
156	*/
157	unsigned long
158	calc_load_n(unsigned long load, unsigned long exp,
159	unsigned long active, unsigned int n)
160	{
161	return calc_load(load, exp: fixed_power_int(x: exp, FSHIFT, n), active);
162	}
163
164	#ifdef CONFIG_NO_HZ_COMMON
165	/*
166	* Handle NO_HZ for the global load-average.
167	*
168	* Since the above described distributed algorithm to compute the global
169	* load-average relies on per-CPU sampling from the tick, it is affected by
170	* NO_HZ.
171	*
172	* The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
173	* entering NO_HZ state such that we can include this as an 'extra' CPU delta
174	* when we read the global state.
175	*
176	* Obviously reality has to ruin such a delightfully simple scheme:
177	*
178	* - When we go NO_HZ idle during the window, we can negate our sample
179	* contribution, causing under-accounting.
180	*
181	* We avoid this by keeping two NO_HZ-delta counters and flipping them
182	* when the window starts, thus separating old and new NO_HZ load.
183	*
184	* The only trick is the slight shift in index flip for read vs write.
185	*
186	*    0s            5s            10s           15s
187	*          +10           +10           +10           +10
188	*        |-|-----------|-|-----------|-|-----------|-|
189	*    r:0 0 1           1 0           0 1           1 0
190	*    w:0 1 1           0 0           1 1           0 0
191	*
192	* This ensures we'll fold the old NO_HZ contribution in this window while
193	* accumulating the new one.
194	*
195	* - When we wake up from NO_HZ during the window, we push up our
196	* contribution, since we effectively move our sample point to a known
197	* busy state.
198	*
199	* This is solved by pushing the window forward, and thus skipping the
200	* sample, for this CPU (effectively using the NO_HZ-delta for this CPU which
201	* was in effect at the time the window opened). This also solves the issue
202	* of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ
203	* intervals.
204	*
205	* When making the ILB scale, we should try to pull this in as well.
206	*/
207	static atomic_long_t calc_load_nohz[`2`];
208	static int calc_load_idx;
209
210	static inline int calc_load_write_idx(void)
211	{
212	int idx = calc_load_idx;
213
214	/*
215	* See calc_global_nohz(), if we observe the new index, we also
216	* need to observe the new update time.
217	*/
218	smp_rmb();
219
220	/*
221	* If the folding window started, make sure we start writing in the
222	* next NO_HZ-delta.
223	*/
224	if (!time_before(jiffies, READ_ONCE(calc_load_update)))
225	idx++;
226
227	return idx & `1`;
228	}
229
230	static inline int calc_load_read_idx(void)
231	{
232	return calc_load_idx & `1`;
233	}
234
235	static void calc_load_nohz_fold(struct rq *rq)
236	{
237	long delta;
238
239	delta = calc_load_fold_active(this_rq: rq, adjust: `0`);
240	if (delta) {
241	int idx = calc_load_write_idx();
242
243	atomic_long_add(i: delta, v: &calc_load_nohz[idx]);
244	}
245	}
246
/* Fold the current CPU's runqueue delta into the NO_HZ accumulator. */
void calc_load_nohz_start(void)
{
	/*
	 * We're going into NO_HZ mode, if there's any pending delta, fold it
	 * into the pending NO_HZ delta.
	 */
	calc_load_nohz_fold(this_rq());
}
255
/*
 * Keep track of the load for NOHZ_FULL, must be called between
 * calc_load_nohz_{start,stop}().
 *
 * Folds the pending delta of the given runqueue @rq into the NO_HZ
 * accumulator.
 */
void calc_load_nohz_remote(struct rq *rq)
{
	calc_load_nohz_fold(rq);
}
264
265	void calc_load_nohz_stop(void)
266	{
267	struct rq *this_rq = this_rq();
268
269	/*
270	* If we're still before the pending sample window, we're done.
271	*/
272	this_rq->calc_load_update = READ_ONCE(calc_load_update);
273	if (time_before(jiffies, this_rq->calc_load_update))
274	return;
275
276	/*
277	* We woke inside or after the sample window, this means we're already
278	* accounted through the nohz accounting, so skip the entire deal and
279	* sync up for the next window.
280	*/
281	if (time_before(jiffies, this_rq->calc_load_update + `10`))
282	this_rq->calc_load_update += LOAD_FREQ;
283	}
284
285	static long calc_load_nohz_read(void)
286	{
287	int idx = calc_load_read_idx();
288	long delta = `0`;
289
290	if (atomic_long_read(v: &calc_load_nohz[idx]))
291	delta = atomic_long_xchg(v: &calc_load_nohz[idx], new: `0`);
292
293	return delta;
294	}
295
296	/*
297	* NO_HZ can leave us missing all per-CPU ticks calling
298	* calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
299	* calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
300	* in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
301	*
302	* Once we've updated the global active value, we need to apply the exponential
303	* weights adjusted to the number of cycles missed.
304	*/
305	static void calc_global_nohz(void)
306	{
307	unsigned long sample_window;
308	long delta, active, n;
309
310	sample_window = READ_ONCE(calc_load_update);
311	if (!time_before(jiffies, sample_window + `10`)) {
312	/*
313	* Catch-up, fold however many we are behind still
314	*/
315	delta = jiffies - sample_window - `10`;
316	n = `1` + (delta / LOAD_FREQ);
317
318	active = atomic_long_read(v: &calc_load_tasks);
319	active = active > `0` ? active * FIXED_1 : `0`;
320
321	avenrun[`0`] = calc_load_n(load: avenrun[`0`], EXP_1, active, n);
322	avenrun[`1`] = calc_load_n(load: avenrun[`1`], EXP_5, active, n);
323	avenrun[`2`] = calc_load_n(load: avenrun[`2`], EXP_15, active, n);
324
325	WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ);
326	}
327
328	/*
329	* Flip the NO_HZ index...
330	*
331	* Make sure we first write the new time then flip the index, so that
332	* calc_load_write_idx() will see the new time when it reads the new
333	* index, this avoids a double flip messing things up.
334	*/
335	smp_wmb();
336	calc_load_idx++;
337	}
338	#else /* !CONFIG_NO_HZ_COMMON: */
339
/* Without NO_HZ there is never a pending NO_HZ delta and nothing to catch up. */
static inline long calc_load_nohz_read(void) { return 0; }
static inline void calc_global_nohz(void) { }
342
343	#endif /* !CONFIG_NO_HZ_COMMON */
344
345	/*
346	* calc_load - update the avenrun load estimates 10 ticks after the
347	* CPUs have updated calc_load_tasks.
348	*
349	* Called from the global timer code.
350	*/
351	void calc_global_load(void)
352	{
353	unsigned long sample_window;
354	long active, delta;
355
356	sample_window = READ_ONCE(calc_load_update);
357	if (time_before(jiffies, sample_window + `10`))
358	return;
359
360	/*
361	* Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
362	*/
363	delta = calc_load_nohz_read();
364	if (delta)
365	atomic_long_add(i: delta, v: &calc_load_tasks);
366
367	active = atomic_long_read(v: &calc_load_tasks);
368	active = active > `0` ? active * FIXED_1 : `0`;
369
370	avenrun[`0`] = calc_load(load: avenrun[`0`], EXP_1, active);
371	avenrun[`1`] = calc_load(load: avenrun[`1`], EXP_5, active);
372	avenrun[`2`] = calc_load(load: avenrun[`2`], EXP_15, active);
373
374	WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);
375
376	/*
377	* In case we went to NO_HZ for multiple LOAD_FREQ intervals
378	* catch up in bulk.
379	*/
380	calc_global_nohz();
381	}
382
383	/*
384	* Called from sched_tick() to periodically update this CPU's
385	* active count.
386	*/
387	void calc_global_load_tick(struct rq *this_rq)
388	{
389	long delta;
390
391	if (time_before(jiffies, this_rq->calc_load_update))
392	return;
393
394	delta = calc_load_fold_active(this_rq, adjust: `0`);
395	if (delta)
396	atomic_long_add(i: delta, v: &calc_load_tasks);
397
398	this_rq->calc_load_update += LOAD_FREQ;
399	}
400

Browse the source code of Linux/kernel/sched/loadavg.c