// SPDX-License-Identifier: GPL-2.0
/*
 * Lockless hierarchical page accounting & limiting
 *
 * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
 */

#include <linux/page_counter.h>
#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/bug.h>
#include <asm/page.h>

static bool track_protection(struct page_counter *c)
{
	return c->protection_support;
}

static void propagate_protected_usage(struct page_counter *c,
				      unsigned long usage)
{
	unsigned long protected, old_protected;
	long delta;

	if (!c->parent)
		return;

	protected = min(usage, READ_ONCE(c->min));
	old_protected = atomic_long_read(&c->min_usage);
	if (protected != old_protected) {
		old_protected = atomic_long_xchg(&c->min_usage, protected);
		delta = protected - old_protected;
		if (delta)
			atomic_long_add(delta, &c->parent->children_min_usage);
	}

	protected = min(usage, READ_ONCE(c->low));
	old_protected = atomic_long_read(&c->low_usage);
	if (protected != old_protected) {
		old_protected = atomic_long_xchg(&c->low_usage, protected);
		delta = protected - old_protected;
		if (delta)
			atomic_long_add(delta, &c->parent->children_low_usage);
	}
}
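
/*
 * Worked example (editor's illustration, not part of the original
 * file): a child with min = 100 pages whose usage grows from 80 to
 * 120 moves its protected value from min(80, 100) = 80 to
 * min(120, 100) = 100, so the parent's children_min_usage is bumped
 * by the delta of +20.
 */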

/**
 * page_counter_cancel - take pages out of the local counter
 * @counter: counter
 * @nr_pages: number of pages to cancel
 */
void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
{
	long new;

	new = atomic_long_sub_return(nr_pages, &counter->usage);
	/* More uncharges than charges? */
	if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n",
		      new, nr_pages)) {
		new = 0;
		atomic_long_set(&counter->usage, new);
	}
	if (track_protection(counter))
		propagate_protected_usage(counter, new);
}

/**
 * page_counter_charge - hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 *
 * NOTE: This does not consider any configured counter limits.
 */
void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;
	bool protection = track_protection(counter);

	for (c = counter; c; c = c->parent) {
		long new;

		new = atomic_long_add_return(nr_pages, &c->usage);
		if (protection)
			propagate_protected_usage(c, new);
		/*
		 * This is indeed racy, but we can live with some
		 * inaccuracy in the watermark.
		 *
		 * Notably, we have two watermarks to allow for both a globally
		 * visible peak and one that can be reset at a smaller scope.
		 *
		 * Since we reset both watermarks when the global reset occurs,
		 * we can guarantee that watermark >= local_watermark, so we
		 * don't need to do both comparisons every time.
		 *
		 * On systems with branch predictors, the inner condition should
		 * be almost free.
		 */
		if (new > READ_ONCE(c->local_watermark)) {
			WRITE_ONCE(c->local_watermark, new);
			if (new > READ_ONCE(c->watermark))
				WRITE_ONCE(c->watermark, new);
		}
	}
}
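
/*
 * Illustrative use (editor's sketch, not part of the original file):
 * force-charging past any limit and reading back the peaks afterwards.
 *
 *	page_counter_charge(counter, nr_pages);		// unconditional
 *	peak = READ_ONCE(counter->watermark);		// global peak
 *	local = READ_ONCE(counter->local_watermark);	// resettable peak
 */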

/**
 * page_counter_try_charge - try to hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 * @fail: points the first counter to hit its limit, if any
 *
 * Returns %true on success, or %false and @fail if the counter or one
 * of its ancestors has hit its configured limit.
 */
bool page_counter_try_charge(struct page_counter *counter,
			     unsigned long nr_pages,
			     struct page_counter **fail)
{
	struct page_counter *c;
	bool protection = track_protection(counter);
	bool track_failcnt = counter->track_failcnt;

	for (c = counter; c; c = c->parent) {
		long new;
		/*
		 * Charge speculatively to avoid an expensive CAS. If
		 * a bigger charge fails, it might falsely lock out a
		 * racing smaller charge and send it into reclaim
		 * early, but the error is limited to the difference
		 * between the two sizes, which is less than 2M/4M in
		 * case of a THP locking out a regular page charge.
		 *
		 * The atomic_long_add_return() implies a full memory
		 * barrier between incrementing the count and reading
		 * the limit. When racing with page_counter_set_max(),
		 * we either see the new limit or the setter sees the
		 * counter has changed and retries.
		 */
		new = atomic_long_add_return(nr_pages, &c->usage);
		if (new > c->max) {
			atomic_long_sub(nr_pages, &c->usage);
			/*
			 * This is racy, but we can live with some
			 * inaccuracy in the failcnt which is only used
			 * to report stats.
			 */
			if (track_failcnt)
				data_race(c->failcnt++);
			*fail = c;
			goto failed;
		}
		if (protection)
			propagate_protected_usage(c, new);

		/* see comment on page_counter_charge */
		if (new > READ_ONCE(c->local_watermark)) {
			WRITE_ONCE(c->local_watermark, new);
			if (new > READ_ONCE(c->watermark))
				WRITE_ONCE(c->watermark, new);
		}
	}
	return true;

failed:
	for (c = counter; c != *fail; c = c->parent)
		page_counter_cancel(c, nr_pages);

	return false;
}
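
/*
 * Illustrative caller (editor's sketch, not part of the original
 * file). my_reclaim_some() is hypothetical; memcg implements this
 * retry-after-reclaim loop in its own try_charge path.
 *
 *	struct page_counter *fail;
 *
 *	while (!page_counter_try_charge(counter, nr_pages, &fail)) {
 *		if (!my_reclaim_some(fail, nr_pages))
 *			return -ENOMEM;	// @fail is the limiting ancestor
 *	}
 */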

/**
 * page_counter_uncharge - hierarchically uncharge pages
 * @counter: counter
 * @nr_pages: number of pages to uncharge
 */
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	for (c = counter; c; c = c->parent)
		page_counter_cancel(c, nr_pages);
}

/**
 * page_counter_set_max - set the maximum number of pages allowed
 * @counter: counter
 * @nr_pages: limit to set
 *
 * Returns 0 on success, -EBUSY if the current number of pages on the
 * counter already exceeds the specified limit.
 *
 * The caller must serialize invocations on the same counter.
 */
int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
{
	for (;;) {
		unsigned long old;
		long usage;

		/*
		 * Update the limit while making sure that it's not
		 * below the concurrently-changing counter value.
		 *
		 * The xchg implies two full memory barriers before
		 * and after, so the read-swap-read is ordered and
		 * ensures coherency with page_counter_try_charge():
		 * that function modifies the count before checking
		 * the limit, so if it sees the old limit, we see the
		 * modified counter and retry.
		 */
		usage = page_counter_read(counter);

		if (usage > nr_pages)
			return -EBUSY;

		old = xchg(&counter->max, nr_pages);

		if (page_counter_read(counter) <= usage || nr_pages >= old)
			return 0;

		counter->max = old;
		cond_resched();
	}
}
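
/*
 * Illustrative use (editor's sketch, not part of the original file):
 * shrinking a limit below current usage returns -EBUSY, so callers
 * typically reclaim first and retry. my_reclaim_down_to() is
 * hypothetical; memcg's memory.max write handler follows this shape.
 *
 *	while (page_counter_set_max(counter, new_limit) == -EBUSY) {
 *		if (!my_reclaim_down_to(counter, new_limit))
 *			break;	// give up; usage still above new_limit
 *	}
 */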

/**
 * page_counter_set_min - set the amount of protected memory
 * @counter: counter
 * @nr_pages: value to set
 *
 * The caller must serialize invocations on the same counter.
 */
void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	WRITE_ONCE(counter->min, nr_pages);

	for (c = counter; c; c = c->parent)
		propagate_protected_usage(c, atomic_long_read(&c->usage));
}

/**
 * page_counter_set_low - set the amount of protected memory
 * @counter: counter
 * @nr_pages: value to set
 *
 * The caller must serialize invocations on the same counter.
 */
void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	WRITE_ONCE(counter->low, nr_pages);

	for (c = counter; c; c = c->parent)
		propagate_protected_usage(c, atomic_long_read(&c->usage));
}

/**
 * page_counter_memparse - memparse() for page counter limits
 * @buf: string to parse
 * @max: string meaning maximum possible value
 * @nr_pages: returns the result in number of pages
 *
 * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be
 * limited to %PAGE_COUNTER_MAX.
 */
int page_counter_memparse(const char *buf, const char *max,
			  unsigned long *nr_pages)
{
	char *end;
	u64 bytes;

	if (!strcmp(buf, max)) {
		*nr_pages = PAGE_COUNTER_MAX;
		return 0;
	}

	bytes = memparse(buf, &end);
	if (*end != '\0')
		return -EINVAL;

	*nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);

	return 0;
}
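
/*
 * Illustrative use (editor's sketch, not part of the original file):
 * parsing a cgroupfs-style limit string into a page count.
 *
 *	unsigned long nr_pages;
 *
 *	if (page_counter_memparse("512M", "max", &nr_pages))
 *		return -EINVAL;
 *	// nr_pages is now 512M / PAGE_SIZE; the literal string "max"
 *	// would have yielded PAGE_COUNTER_MAX instead
 */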

#if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM)
/*
 * This function calculates an individual page counter's effective
 * protection which is derived from its own memory.min/low, its
 * parent's and siblings' settings, as well as the actual memory
 * distribution in the tree.
 *
 * The following rules apply to the effective protection values:
 *
 * 1. At the first level of reclaim, effective protection is equal to
 *    the declared protection in memory.min and memory.low.
 *
 * 2. To enable safe delegation of the protection configuration, at
 *    subsequent levels the effective protection is capped to the
 *    parent's effective protection.
 *
 * 3. To make complex and dynamic subtrees easier to configure, the
 *    user is allowed to overcommit the declared protection at a given
 *    level. If that is the case, the parent's effective protection is
 *    distributed to the children in proportion to how much protection
 *    they have declared and how much of it they are utilizing.
 *
 *    This makes distribution proportional, but also work-conserving:
 *    if one counter claims much more protection than it uses memory,
 *    the unused remainder is available to its siblings.
 *
 * 4. Conversely, when the declared protection is undercommitted at a
 *    given level, the distribution of the larger parental protection
 *    budget is NOT proportional. A counter's protection from a sibling
 *    is capped to its own memory.min/low setting.
 *
 * 5. However, to allow protecting recursive subtrees from each other
 *    without having to declare each individual counter's fixed share
 *    of the ancestor's claim to protection, any unutilized -
 *    "floating" - protection from up the tree is distributed in
 *    proportion to each counter's *usage*. This makes the protection
 *    neutral wrt sibling cgroups and lets them compete freely over
 *    the shared parental protection budget, but it protects the
 *    subtree as a whole from neighboring subtrees.
 *
 * Note that 4. and 5. are not in conflict: 4. is about protecting
 * against immediate siblings whereas 5. is about protecting against
 * neighboring subtrees.
 */
static unsigned long effective_protection(unsigned long usage,
					  unsigned long parent_usage,
					  unsigned long setting,
					  unsigned long parent_effective,
					  unsigned long siblings_protected,
					  bool recursive_protection)
{
	unsigned long protected;
	unsigned long ep;

	protected = min(usage, setting);
	/*
	 * If all cgroups at this level combined claim and use more
	 * protection than what the parent affords them, distribute
	 * shares in proportion to utilization.
	 *
	 * We are using actual utilization rather than the statically
	 * claimed protection in order to be work-conserving: claimed
	 * but unused protection is available to siblings that would
	 * otherwise get a smaller chunk than what they claimed.
	 */
	if (siblings_protected > parent_effective)
		return protected * parent_effective / siblings_protected;

	/*
	 * Ok, utilized protection of all children is within what the
	 * parent affords them, so we know whatever this child claims
	 * and utilizes is effectively protected.
	 *
	 * If there is unprotected usage beyond this value, reclaim
	 * will apply pressure in proportion to that amount.
	 *
	 * If there is unutilized protection, the cgroup will be fully
	 * shielded from reclaim, but we do return a smaller value for
	 * protection than what the group could enjoy in theory. This
	 * is okay. With the overcommit distribution above, effective
	 * protection is always dependent on how memory is actually
	 * consumed among the siblings anyway.
	 */
	ep = protected;

	/*
	 * If the children aren't claiming (all of) the protection
	 * afforded to them by the parent, distribute the remainder in
	 * proportion to the (unprotected) memory of each cgroup. That
	 * way, cgroups that aren't explicitly prioritized wrt each
	 * other compete freely over the allowance, but they are
	 * collectively protected from neighboring trees.
	 *
	 * We're using unprotected memory for the weight so that if
	 * some cgroups DO claim explicit protection, we don't protect
	 * the same bytes twice.
	 *
	 * Check both usage and parent_usage against the respective
	 * protected values. One should imply the other, but they
	 * aren't read atomically - make sure the division is sane.
	 */
	if (!recursive_protection)
		return ep;

	if (parent_effective > siblings_protected &&
	    parent_usage > siblings_protected &&
	    usage > protected) {
		unsigned long unclaimed;

		unclaimed = parent_effective - siblings_protected;
		unclaimed *= usage - protected;
		unclaimed /= parent_usage - siblings_protected;

		ep += unclaimed;
	}

	return ep;
}
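
/*
 * Worked example (editor's illustration, not part of the original
 * file): with parent_effective = 100 pages and two siblings that each
 * declare min = 100 and use 100 pages, siblings_protected = 200
 * exceeds parent_effective, so each child receives
 * 100 * 100 / 200 = 50 pages of effective protection (rule 3 above).
 */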

/**
 * page_counter_calculate_protection - check if memory consumption is in the normal range
 * @root: the top ancestor of the sub-tree being checked
 * @counter: the page_counter to update
 * @recursive_protection: whether to use memory_recursiveprot behavior
 *
 * Calculates elow/emin thresholds for the given page_counter.
 *
 * WARNING: This function is not stateless! It can only be used as part
 * of a top-down tree iteration, not for isolated queries.
 */
void page_counter_calculate_protection(struct page_counter *root,
				       struct page_counter *counter,
				       bool recursive_protection)
{
	unsigned long usage, parent_usage;
	struct page_counter *parent = counter->parent;

	/*
	 * Effective values of the reclaim targets are ignored so they
	 * can be stale. Have a look at mem_cgroup_protection for more
	 * details.
	 * TODO: calculation should be more robust so that we do not need
	 * that special casing.
	 */
	if (root == counter)
		return;

	usage = page_counter_read(counter);
	if (!usage)
		return;

	if (parent == root) {
		counter->emin = READ_ONCE(counter->min);
		counter->elow = READ_ONCE(counter->low);
		return;
	}

	parent_usage = page_counter_read(parent);

	WRITE_ONCE(counter->emin, effective_protection(usage, parent_usage,
			READ_ONCE(counter->min),
			READ_ONCE(parent->emin),
			atomic_long_read(&parent->children_min_usage),
			recursive_protection));

	WRITE_ONCE(counter->elow, effective_protection(usage, parent_usage,
			READ_ONCE(counter->low),
			READ_ONCE(parent->elow),
			atomic_long_read(&parent->children_low_usage),
			recursive_protection));
}
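
/*
 * Illustrative use (editor's sketch, not part of the original file):
 * effective protection is only meaningful when computed top-down over
 * the tree, e.g. as part of a reclaim walk. The iterator below is
 * hypothetical; memcg drives this from its own cgroup tree walk.
 *
 *	for_each_counter_pre(c, root)	// hypothetical pre-order walk
 *		page_counter_calculate_protection(root, c, true);
 */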
#endif /* CONFIG_MEMCG || CONFIG_CGROUP_DMEM */