hugetlb_cgroup.c source code [Linux/mm/hugetlb_cgroup.c]

1	/*
2	*
3	* Copyright IBM Corporation, 2012
4	* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
5	*
6	* Cgroup v2
7	* Copyright (C) 2019 Red Hat, Inc.
8	* Author: Giuseppe Scrivano <gscrivan@redhat.com>
9	*
10	* This program is free software; you can redistribute it and/or modify it
11	* under the terms of version 2.1 of the GNU Lesser General Public License
12	* as published by the Free Software Foundation.
13	*
14	* This program is distributed in the hope that it would be useful, but
15	* WITHOUT ANY WARRANTY; without even the implied warranty of
16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
17	*
18	*/
19
20	#include <linux/cgroup.h>
21	#include <linux/page_counter.h>
22	#include <linux/slab.h>
23	#include <linux/hugetlb.h>
24	#include <linux/hugetlb_cgroup.h>
25
/*
 * A cftype's ->private value packs two 16-bit fields: the hstate index in
 * the upper half and the RES_* attribute selector in the lower half.
 */
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
29
/*
 * Use t->m[0] to encode the offset: the byte offset of element 0 goes in
 * the upper 16 bits and the size of one element in the lower 16 bits, so
 * the offset of element N can be rebuilt as OFFSET0 + FIELD_SIZE * N.
 */
#define MEMFILE_OFFSET(t, m0)	(((offsetof(t, m0) << 16) | sizeof_field(t, m0)))
#define MEMFILE_OFFSET0(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_FIELD_SIZE(val)	((val) & 0xffff)
34
/* Number of entries in the default-hierarchy and legacy cftype templates. */
#define DFL_TMPL_SIZE ARRAY_SIZE(hugetlb_dfl_tmpl)
#define LEGACY_TMPL_SIZE ARRAY_SIZE(hugetlb_legacy_tmpl)

/* Set in hugetlb_cgroup_css_alloc() when a css is created without a parent. */
static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
/* Per-hstate cftype arrays, allocated by __hugetlb_cgroup_file_pre_init(). */
static struct cftype *dfl_files;
static struct cftype *legacy_files;
41
42	static inline struct page_counter *
43	__hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup h_cg, int* idx,
44	bool rsvd)
45	{
46	if (rsvd)
47	return &h_cg->rsvd_hugepage[idx];
48	return &h_cg->hugepage[idx];
49	}
50
/* Return the regular (non-reservation) counter of @h_cg for hstate @idx. */
static inline struct page_counter *
hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx)
{
	return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false);
}
56
/* Return the reservation counter of @h_cg for hstate @idx. */
static inline struct page_counter *
hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx)
{
	return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true);
}
62
63	static inline
64	struct hugetlb_cgroup hugetlb_cgroup_from_css(struct* cgroup_subsys_state *s)
65	{
66	return s ? container_of(s, struct hugetlb_cgroup, css) : NULL;
67	}
68
69	static inline
70	struct hugetlb_cgroup hugetlb_cgroup_from_task(struct* task_struct *task)
71	{
72	return hugetlb_cgroup_from_css(s: task_css(task, subsys_id: hugetlb_cgrp_id));
73	}
74
75	static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
76	{
77	return (h_cg == root_h_cgroup);
78	}
79
80	static inline struct hugetlb_cgroup *
81	parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
82	{
83	return hugetlb_cgroup_from_css(s: h_cg->css.parent);
84	}
85
86	static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
87	{
88	struct hstate *h;
89
90	for_each_hstate(h) {
91	if (page_counter_read(
92	counter: hugetlb_cgroup_counter_from_cgroup(h_cg, idx: hstate_index(h))))
93	return true;
94	}
95	return false;
96	}
97
98	static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
99	struct hugetlb_cgroup *parent_h_cgroup)
100	{
101	int idx;
102
103	for (idx = `0`; idx < HUGE_MAX_HSTATE; idx++) {
104	struct page_counter fault, fault_parent = NULL;
105	struct page_counter rsvd, rsvd_parent = NULL;
106	unsigned long limit;
107
108	if (parent_h_cgroup) {
109	fault_parent = hugetlb_cgroup_counter_from_cgroup(
110	h_cg: parent_h_cgroup, idx);
111	rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd(
112	h_cg: parent_h_cgroup, idx);
113	}
114	fault = hugetlb_cgroup_counter_from_cgroup(h_cg: h_cgroup, idx);
115	rsvd = hugetlb_cgroup_counter_from_cgroup_rsvd(h_cg: h_cgroup, idx);
116
117	page_counter_init(counter: fault, parent: fault_parent, protection_support: false);
118	page_counter_init(counter: rsvd, parent: rsvd_parent, protection_support: false);
119
120	if (!cgroup_subsys_on_dfl(hugetlb_cgrp_subsys)) {
121	fault->track_failcnt = true;
122	rsvd->track_failcnt = true;
123	}
124
125	limit = round_down(PAGE_COUNTER_MAX,
126	pages_per_huge_page(&hstates[idx]));
127
128	VM_BUG_ON(page_counter_set_max(fault, limit));
129	VM_BUG_ON(page_counter_set_max(rsvd, limit));
130	}
131	}
132
133	static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup)
134	{
135	int node;
136
137	for_each_node(node)
138	kfree(objp: h_cgroup->nodeinfo[node]);
139	kfree(objp: h_cgroup);
140	}
141
142	static struct cgroup_subsys_state *
143	hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
144	{
145	struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(s: parent_css);
146	struct hugetlb_cgroup *h_cgroup;
147	int node;
148
149	h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids),
150	GFP_KERNEL);
151
152	if (!h_cgroup)
153	return ERR_PTR(error: -ENOMEM);
154
155	if (!parent_h_cgroup)
156	root_h_cgroup = h_cgroup;
157
158	/*
159	* TODO: this routine can waste much memory for nodes which will
160	* never be onlined. It's better to use memory hotplug callback
161	* function.
162	*/
163	for_each_node(node) {
164	/ Set node_to_alloc to NUMA_NO_NODE for offline nodes. /
165	int node_to_alloc =
166	node_state(node, state: N_NORMAL_MEMORY) ? node : NUMA_NO_NODE;
167	h_cgroup->nodeinfo[node] =
168	kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
169	GFP_KERNEL, node_to_alloc);
170	if (!h_cgroup->nodeinfo[node])
171	goto fail_alloc_nodeinfo;
172	}
173
174	hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
175	return &h_cgroup->css;
176
177	fail_alloc_nodeinfo:
178	hugetlb_cgroup_free(h_cgroup);
179	return ERR_PTR(error: -ENOMEM);
180	}
181
/* css_free callback: release the hugetlb_cgroup that embeds @css. */
static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
	hugetlb_cgroup_free(hugetlb_cgroup_from_css(css));
}
186
187	/*
188	* Should be called with hugetlb_lock held.
189	* Since we are holding hugetlb_lock, pages cannot get moved from
190	* active list or uncharged from the cgroup, So no need to get
191	* page reference and test for page active here. This function
192	* cannot fail.
193	*/
194	static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
195	struct folio *folio)
196	{
197	unsigned int nr_pages;
198	struct page_counter *counter;
199	struct hugetlb_cgroup *hcg;
200	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
201
202	hcg = hugetlb_cgroup_from_folio(folio);
203	/*
204	* We can have pages in active list without any cgroup
205	* ie, hugepage with less than 3 pages. We can safely
206	* ignore those pages.
207	*/
208	if (!hcg \|\| hcg != h_cg)
209	goto out;
210
211	nr_pages = folio_nr_pages(folio);
212	if (!parent) {
213	parent = root_h_cgroup;
214	/ root has no limit /
215	page_counter_charge(counter: &parent->hugepage[idx], nr_pages);
216	}
217	counter = &h_cg->hugepage[idx];
218	/ Take the pages off the local counter /
219	page_counter_cancel(counter, nr_pages);
220
221	set_hugetlb_cgroup(folio, h_cg: parent);
222	out:
223	return;
224	}
225
226	/*
227	* Force the hugetlb cgroup to empty the hugetlb resources by moving them to
228	* the parent cgroup.
229	*/
230	static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
231	{
232	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(s: css);
233	struct hstate *h;
234	struct folio *folio;
235
236	do {
237	for_each_hstate(h) {
238	spin_lock_irq(lock: &hugetlb_lock);
239	list_for_each_entry(folio, &h->hugepage_activelist, lru)
240	hugetlb_cgroup_move_parent(idx: hstate_index(h), h_cg, folio);
241
242	spin_unlock_irq(lock: &hugetlb_lock);
243	}
244	cond_resched();
245	} while (hugetlb_cgroup_have_usage(h_cg));
246	}
247
248	static inline void hugetlb_event(struct hugetlb_cgroup hugetlb, int* idx,
249	enum hugetlb_memory_event event)
250	{
251	atomic_long_inc(v: &hugetlb->events_local[idx][event]);
252	cgroup_file_notify(cfile: &hugetlb->events_local_file[idx]);
253
254	do {
255	atomic_long_inc(v: &hugetlb->events[idx][event]);
256	cgroup_file_notify(cfile: &hugetlb->events_file[idx]);
257	} while ((hugetlb = parent_hugetlb_cgroup(h_cg: hugetlb)) &&
258	!hugetlb_cgroup_is_root(h_cg: hugetlb));
259	}
260
261	static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
262	struct hugetlb_cgroup **ptr,
263	bool rsvd)
264	{
265	int ret = `0`;
266	struct page_counter *counter;
267	struct hugetlb_cgroup *h_cg = NULL;
268
269	if (hugetlb_cgroup_disabled())
270	goto done;
271	again:
272	rcu_read_lock();
273	h_cg = hugetlb_cgroup_from_task(current);
274	if (!css_tryget(css: &h_cg->css)) {
275	rcu_read_unlock();
276	goto again;
277	}
278	rcu_read_unlock();
279
280	if (!page_counter_try_charge(
281	counter: __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
282	nr_pages, fail: &counter)) {
283	ret = -ENOMEM;
284	hugetlb_event(hugetlb: h_cg, idx, event: HUGETLB_MAX);
285	css_put(css: &h_cg->css);
286	goto done;
287	}
288	/ Reservations take a reference to the css because they do not get*
289	* reparented.
290	*/
291	if (!rsvd)
292	css_put(css: &h_cg->css);
293	done:
294	*ptr = h_cg;
295	return ret;
296	}
297
/* Charge @nr_pages against the regular counter; see the __ variant. */
int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
				 struct hugetlb_cgroup **ptr)
{
	return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false);
}
303
/* Charge @nr_pages against the reservation counter; see the __ variant. */
int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
				      struct hugetlb_cgroup **ptr)
{
	return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true);
}
309
310	/ Should be called with hugetlb_lock held /
311	static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
312	struct hugetlb_cgroup *h_cg,
313	struct folio *folio, bool rsvd)
314	{
315	if (hugetlb_cgroup_disabled() \|\| !h_cg)
316	return;
317	lockdep_assert_held(&hugetlb_lock);
318	__set_hugetlb_cgroup(folio, h_cg, rsvd);
319	if (!rsvd) {
320	unsigned long usage =
321	h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
322	/*
323	* This write is not atomic due to fetching usage and writing
324	* to it, but that's fine because we call this with
325	* hugetlb_lock held anyway.
326	*/
327	WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
328	usage + nr_pages);
329	}
330	}
331
/* Commit a regular (non-reservation) charge of @folio to @h_cg. */
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
				  struct hugetlb_cgroup *h_cg,
				  struct folio *folio)
{
	__hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false);
}
338
/* Commit a reservation charge of @folio to @h_cg. */
void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
				       struct hugetlb_cgroup *h_cg,
				       struct folio *folio)
{
	__hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true);
}
345
346	/*
347	* Should be called with hugetlb_lock held
348	*/
349	static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
350	struct folio *folio, bool rsvd)
351	{
352	struct hugetlb_cgroup *h_cg;
353
354	if (hugetlb_cgroup_disabled())
355	return;
356	lockdep_assert_held(&hugetlb_lock);
357	h_cg = __hugetlb_cgroup_from_folio(folio, rsvd);
358	if (unlikely(!h_cg))
359	return;
360	__set_hugetlb_cgroup(folio, NULL, rsvd);
361
362	page_counter_uncharge(counter: __hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
363	rsvd),
364	nr_pages);
365
366	if (rsvd)
367	css_put(css: &h_cg->css);
368	else {
369	unsigned long usage =
370	h_cg->nodeinfo[folio_nid(folio)]->usage[idx];
371	/*
372	* This write is not atomic due to fetching usage and writing
373	* to it, but that's fine because we call this with
374	* hugetlb_lock held anyway.
375	*/
376	WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx],
377	usage - nr_pages);
378	}
379	}
380
/* Undo a regular (non-reservation) charge held by @folio. */
void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
				   struct folio *folio)
{
	__hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false);
}
386
/* Undo a reservation charge held by @folio. */
void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages,
					struct folio *folio)
{
	__hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true);
}
392
393	static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
394	struct hugetlb_cgroup *h_cg,
395	bool rsvd)
396	{
397	if (hugetlb_cgroup_disabled() \|\| !h_cg)
398	return;
399
400	page_counter_uncharge(counter: __hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
401	rsvd),
402	nr_pages);
403
404	if (rsvd)
405	css_put(css: &h_cg->css);
406	}
407
/* Uncharge @nr_pages from @h_cg's regular counter for hstate @idx. */
void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
				    struct hugetlb_cgroup *h_cg)
{
	__hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false);
}
413
/* Uncharge @nr_pages from @h_cg's reservation counter for hstate @idx. */
void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages,
					 struct hugetlb_cgroup *h_cg)
{
	__hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true);
}
419
420	void hugetlb_cgroup_uncharge_counter(struct resv_map resv, unsigned* long start,
421	unsigned long end)
422	{
423	if (hugetlb_cgroup_disabled() \|\| !resv \|\| !resv->reservation_counter \|\|
424	!resv->css)
425	return;
426
427	page_counter_uncharge(counter: resv->reservation_counter,
428	nr_pages: (end - start) * resv->pages_per_hpage);
429	css_put(css: resv->css);
430	}
431
432	void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
433	struct file_region *rg,
434	unsigned long nr_pages,
435	bool region_del)
436	{
437	if (hugetlb_cgroup_disabled() \|\| !resv \|\| !rg \|\| !nr_pages)
438	return;
439
440	if (rg->reservation_counter && resv->pages_per_hpage &&
441	!resv->reservation_counter) {
442	page_counter_uncharge(counter: rg->reservation_counter,
443	nr_pages: nr_pages * resv->pages_per_hpage);
444	/*
445	* Only do css_put(rg->css) when we delete the entire region
446	* because one file_region must hold exactly one css reference.
447	*/
448	if (region_del)
449	css_put(css: rg->css);
450	}
451	}
452
/*
 * Attribute selectors stored in the low 16 bits of cftype->private
 * (see MEMFILE_ATTR). Each regular attribute is followed by its
 * reservation ("rsvd") counterpart.
 */
enum {
	RES_USAGE = 0,
	RES_RSVD_USAGE = 1,
	RES_LIMIT = 2,
	RES_RSVD_LIMIT = 3,
	RES_MAX_USAGE = 4,
	RES_RSVD_MAX_USAGE = 5,
	RES_FAILCNT = 6,
	RES_RSVD_FAILCNT = 7,
};
463
464	static int hugetlb_cgroup_read_numa_stat(struct seq_file seq, void* *dummy)
465	{
466	int nid;
467	struct cftype *cft = seq_cft(seq);
468	int idx = MEMFILE_IDX(cft->private);
469	bool legacy = !cgroup_subsys_on_dfl(hugetlb_cgrp_subsys);
470	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(s: seq_css(seq));
471	struct cgroup_subsys_state *css;
472	unsigned long usage;
473
474	if (legacy) {
475	/ Add up usage across all nodes for the non-hierarchical total. /
476	usage = `0`;
477	for_each_node_state(nid, N_MEMORY)
478	usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);
479	seq_printf(m: seq, fmt: "total=%lu", usage * PAGE_SIZE);
480
481	/ Simply print the per-node usage for the non-hierarchical total. /
482	for_each_node_state(nid, N_MEMORY)
483	seq_printf(m: seq, fmt: " N%d=%lu", nid,
484	READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) *
485	PAGE_SIZE);
486	seq_putc(m: seq, c: `'\n'`);
487	}
488
489	/*
490	* The hierarchical total is pretty much the value recorded by the
491	* counter, so use that.
492	*/
493	seq_printf(m: seq, fmt: "%stotal=%lu", legacy ? "hierarchical_" : "",
494	page_counter_read(counter: &h_cg->hugepage[idx]) * PAGE_SIZE);
495
496	/*
497	* For each node, transverse the css tree to obtain the hierarchical
498	* node usage.
499	*/
500	for_each_node_state(nid, N_MEMORY) {
501	usage = `0`;
502	rcu_read_lock();
503	css_for_each_descendant_pre(css, &h_cg->css) {
504	usage += READ_ONCE(hugetlb_cgroup_from_css(css)
505	->nodeinfo[nid]
506	->usage[idx]);
507	}
508	rcu_read_unlock();
509	seq_printf(m: seq, fmt: " N%d=%lu", nid, usage * PAGE_SIZE);
510	}
511
512	seq_putc(m: seq, c: `'\n'`);
513
514	return `0`;
515	}
516
517	static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
518	struct cftype *cft)
519	{
520	struct page_counter *counter;
521	struct page_counter *rsvd_counter;
522	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(s: css);
523
524	counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
525	rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)];
526
527	switch (MEMFILE_ATTR(cft->private)) {
528	case RES_USAGE:
529	return (u64)page_counter_read(counter) * PAGE_SIZE;
530	case RES_RSVD_USAGE:
531	return (u64)page_counter_read(counter: rsvd_counter) * PAGE_SIZE;
532	case RES_LIMIT:
533	return (u64)counter->max * PAGE_SIZE;
534	case RES_RSVD_LIMIT:
535	return (u64)rsvd_counter->max * PAGE_SIZE;
536	case RES_MAX_USAGE:
537	return (u64)counter->watermark * PAGE_SIZE;
538	case RES_RSVD_MAX_USAGE:
539	return (u64)rsvd_counter->watermark * PAGE_SIZE;
540	case RES_FAILCNT:
541	return counter->failcnt;
542	case RES_RSVD_FAILCNT:
543	return rsvd_counter->failcnt;
544	default:
545	BUG();
546	}
547	}
548
549	static int hugetlb_cgroup_read_u64_max(struct seq_file seq, void* *v)
550	{
551	int idx;
552	u64 val;
553	struct cftype *cft = seq_cft(seq);
554	unsigned long limit;
555	struct page_counter *counter;
556	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(s: seq_css(seq));
557
558	idx = MEMFILE_IDX(cft->private);
559	counter = &h_cg->hugepage[idx];
560
561	limit = round_down(PAGE_COUNTER_MAX,
562	pages_per_huge_page(&hstates[idx]));
563
564	switch (MEMFILE_ATTR(cft->private)) {
565	case RES_RSVD_USAGE:
566	counter = &h_cg->rsvd_hugepage[idx];
567	fallthrough;
568	case RES_USAGE:
569	val = (u64)page_counter_read(counter);
570	seq_printf(m: seq, fmt: "%llu\n", val * PAGE_SIZE);
571	break;
572	case RES_RSVD_LIMIT:
573	counter = &h_cg->rsvd_hugepage[idx];
574	fallthrough;
575	case RES_LIMIT:
576	val = (u64)counter->max;
577	if (val == limit)
578	seq_puts(m: seq, s: "max\n");
579	else
580	seq_printf(m: seq, fmt: "%llu\n", val * PAGE_SIZE);
581	break;
582	default:
583	BUG();
584	}
585
586	return `0`;
587	}
588
/* Held around page_counter_set_max() in hugetlb_cgroup_write(). */
static DEFINE_MUTEX(hugetlb_limit_mutex);
590
591	static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
592	char *buf, size_t nbytes, loff_t off,
593	const char *max)
594	{
595	int ret, idx;
596	unsigned long nr_pages;
597	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(s: of_css(of));
598	bool rsvd = false;
599
600	if (hugetlb_cgroup_is_root(h_cg)) / Can't set limit on root /
601	return -EINVAL;
602
603	buf = strstrip(str: buf);
604	ret = page_counter_memparse(buf, max, nr_pages: &nr_pages);
605	if (ret)
606	return ret;
607
608	idx = MEMFILE_IDX(of_cft(of)->private);
609	nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx]));
610
611	switch (MEMFILE_ATTR(of_cft(of)->private)) {
612	case RES_RSVD_LIMIT:
613	rsvd = true;
614	fallthrough;
615	case RES_LIMIT:
616	mutex_lock(lock: &hugetlb_limit_mutex);
617	ret = page_counter_set_max(
618	counter: __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
619	nr_pages);
620	mutex_unlock(lock: &hugetlb_limit_mutex);
621	break;
622	default:
623	ret = -EINVAL;
624	break;
625	}
626	return ret ?: nbytes;
627	}
628
629	static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of,
630	char *buf, size_t nbytes, loff_t off)
631	{
632	return hugetlb_cgroup_write(of, buf, nbytes, off, max: "-1");
633	}
634
635	static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of,
636	char *buf, size_t nbytes, loff_t off)
637	{
638	return hugetlb_cgroup_write(of, buf, nbytes, off, max: "max");
639	}
640
641	static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
642	char *buf, size_t nbytes, loff_t off)
643	{
644	int ret = `0`;
645	struct page_counter counter, rsvd_counter;
646	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(s: of_css(of));
647
648	counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
649	rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)];
650
651	switch (MEMFILE_ATTR(of_cft(of)->private)) {
652	case RES_MAX_USAGE:
653	page_counter_reset_watermark(counter);
654	break;
655	case RES_RSVD_MAX_USAGE:
656	page_counter_reset_watermark(counter: rsvd_counter);
657	break;
658	case RES_FAILCNT:
659	counter->failcnt = `0`;
660	break;
661	case RES_RSVD_FAILCNT:
662	rsvd_counter->failcnt = `0`;
663	break;
664	default:
665	ret = -EINVAL;
666	break;
667	}
668	return ret ?: nbytes;
669	}
670
671	static char mem_fmt(char* buf, int* size, unsigned long hsize)
672	{
673	if (hsize >= SZ_1G)
674	snprintf(buf, size, fmt: "%luGB", hsize / SZ_1G);
675	else if (hsize >= SZ_1M)
676	snprintf(buf, size, fmt: "%luMB", hsize / SZ_1M);
677	else
678	snprintf(buf, size, fmt: "%luKB", hsize / SZ_1K);
679	return buf;
680	}
681
682	static int __hugetlb_events_show(struct seq_file *seq, bool local)
683	{
684	int idx;
685	long max;
686	struct cftype *cft = seq_cft(seq);
687	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(s: seq_css(seq));
688
689	idx = MEMFILE_IDX(cft->private);
690
691	if (local)
692	max = atomic_long_read(v: &h_cg->events_local[idx][HUGETLB_MAX]);
693	else
694	max = atomic_long_read(v: &h_cg->events[idx][HUGETLB_MAX]);
695
696	seq_printf(m: seq, fmt: "max %lu\n", max);
697
698	return `0`;
699	}
700
/* seq_show handler for "events": hierarchical event counts. */
static int hugetlb_events_show(struct seq_file *seq, void *v)
{
	return __hugetlb_events_show(seq, false);
}
705
/* seq_show handler for "events.local": cgroup-local event counts. */
static int hugetlb_events_local_show(struct seq_file *seq, void *v)
{
	return __hugetlb_events_show(seq, true);
}
710
711	static struct cftype hugetlb_dfl_tmpl[] = {
712	{
713	.name = "max",
714	.private = RES_LIMIT,
715	.seq_show = hugetlb_cgroup_read_u64_max,
716	.write = hugetlb_cgroup_write_dfl,
717	.flags = CFTYPE_NOT_ON_ROOT,
718	},
719	{
720	.name = "rsvd.max",
721	.private = RES_RSVD_LIMIT,
722	.seq_show = hugetlb_cgroup_read_u64_max,
723	.write = hugetlb_cgroup_write_dfl,
724	.flags = CFTYPE_NOT_ON_ROOT,
725	},
726	{
727	.name = "current",
728	.private = RES_USAGE,
729	.seq_show = hugetlb_cgroup_read_u64_max,
730	.flags = CFTYPE_NOT_ON_ROOT,
731	},
732	{
733	.name = "rsvd.current",
734	.private = RES_RSVD_USAGE,
735	.seq_show = hugetlb_cgroup_read_u64_max,
736	.flags = CFTYPE_NOT_ON_ROOT,
737	},
738	{
739	.name = "events",
740	.seq_show = hugetlb_events_show,
741	.file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_file[`0`]),
742	.flags = CFTYPE_NOT_ON_ROOT,
743	},
744	{
745	.name = "events.local",
746	.seq_show = hugetlb_events_local_show,
747	.file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_local_file[`0`]),
748	.flags = CFTYPE_NOT_ON_ROOT,
749	},
750	{
751	.name = "numa_stat",
752	.seq_show = hugetlb_cgroup_read_numa_stat,
753	.flags = CFTYPE_NOT_ON_ROOT,
754	},
755	/ don't need terminator here /
756	};
757
758	static struct cftype hugetlb_legacy_tmpl[] = {
759	{
760	.name = "limit_in_bytes",
761	.private = RES_LIMIT,
762	.read_u64 = hugetlb_cgroup_read_u64,
763	.write = hugetlb_cgroup_write_legacy,
764	},
765	{
766	.name = "rsvd.limit_in_bytes",
767	.private = RES_RSVD_LIMIT,
768	.read_u64 = hugetlb_cgroup_read_u64,
769	.write = hugetlb_cgroup_write_legacy,
770	},
771	{
772	.name = "usage_in_bytes",
773	.private = RES_USAGE,
774	.read_u64 = hugetlb_cgroup_read_u64,
775	},
776	{
777	.name = "rsvd.usage_in_bytes",
778	.private = RES_RSVD_USAGE,
779	.read_u64 = hugetlb_cgroup_read_u64,
780	},
781	{
782	.name = "max_usage_in_bytes",
783	.private = RES_MAX_USAGE,
784	.write = hugetlb_cgroup_reset,
785	.read_u64 = hugetlb_cgroup_read_u64,
786	},
787	{
788	.name = "rsvd.max_usage_in_bytes",
789	.private = RES_RSVD_MAX_USAGE,
790	.write = hugetlb_cgroup_reset,
791	.read_u64 = hugetlb_cgroup_read_u64,
792	},
793	{
794	.name = "failcnt",
795	.private = RES_FAILCNT,
796	.write = hugetlb_cgroup_reset,
797	.read_u64 = hugetlb_cgroup_read_u64,
798	},
799	{
800	.name = "rsvd.failcnt",
801	.private = RES_RSVD_FAILCNT,
802	.write = hugetlb_cgroup_reset,
803	.read_u64 = hugetlb_cgroup_read_u64,
804	},
805	{
806	.name = "numa_stat",
807	.seq_show = hugetlb_cgroup_read_numa_stat,
808	},
809	/ don't need terminator here /
810	};
811
812	static void __init
813	hugetlb_cgroup_cfttypes_init(struct hstate h, struct* cftype *cft,
814	struct cftype tmpl, int* tmpl_size)
815	{
816	char buf[`32`];
817	int i, idx = hstate_index(h);
818
819	/ format the size /
820	mem_fmt(buf, size: sizeof(buf), hsize: huge_page_size(h));
821
822	for (i = `0`; i < tmpl_size; cft++, tmpl++, i++) {
823	cft = tmpl;
824	/ rebuild the name /
825	snprintf(buf: cft->name, MAX_CFTYPE_NAME, fmt: "%s.%s", buf, tmpl->name);
826	/ rebuild the private /
827	cft->private = MEMFILE_PRIVATE(idx, tmpl->private);
828	/ rebuild the file_offset /
829	if (tmpl->file_offset) {
830	unsigned int offset = tmpl->file_offset;
831
832	cft->file_offset = MEMFILE_OFFSET0(offset) +
833	MEMFILE_FIELD_SIZE(offset) * idx;
834	}
835
836	lockdep_register_key(key: &cft->lockdep_key);
837	}
838	}
839
840	static void __init __hugetlb_cgroup_file_dfl_init(struct hstate *h)
841	{
842	int idx = hstate_index(h);
843
844	hugetlb_cgroup_cfttypes_init(h, cft: dfl_files + idx * DFL_TMPL_SIZE,
845	tmpl: hugetlb_dfl_tmpl, DFL_TMPL_SIZE);
846	}
847
848	static void __init __hugetlb_cgroup_file_legacy_init(struct hstate *h)
849	{
850	int idx = hstate_index(h);
851
852	hugetlb_cgroup_cfttypes_init(h, cft: legacy_files + idx * LEGACY_TMPL_SIZE,
853	tmpl: hugetlb_legacy_tmpl, LEGACY_TMPL_SIZE);
854	}
855
/* Build both the default-hierarchy and the legacy file entries for @h. */
static void __init __hugetlb_cgroup_file_init(struct hstate *h)
{
__hugetlb_cgroup_file_dfl_init(h);
__hugetlb_cgroup_file_legacy_init(h);
}
861
862	static void __init __hugetlb_cgroup_file_pre_init(void)
863	{
864	int cft_count;
865
866	cft_count = hugetlb_max_hstate * DFL_TMPL_SIZE + `1`; / add terminator /
867	dfl_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL);
868	BUG_ON(!dfl_files);
869	cft_count = hugetlb_max_hstate * LEGACY_TMPL_SIZE + `1`; / add terminator /
870	legacy_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL);
871	BUG_ON(!legacy_files);
872	}
873
874	static void __init __hugetlb_cgroup_file_post_init(void)
875	{
876	WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
877	dfl_files));
878	WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
879	legacy_files));
880	}
881
882	void __init hugetlb_cgroup_file_init(void)
883	{
884	struct hstate *h;
885
886	__hugetlb_cgroup_file_pre_init();
887	for_each_hstate(h)
888	__hugetlb_cgroup_file_init(h);
889	__hugetlb_cgroup_file_post_init();
890	}
891
892	/*
893	* hugetlb_lock will make sure a parallel cgroup rmdir won't happen
894	* when we migrate hugepages
895	*/
896	void hugetlb_cgroup_migrate(struct folio old_folio, struct* folio *new_folio)
897	{
898	struct hugetlb_cgroup *h_cg;
899	struct hugetlb_cgroup *h_cg_rsvd;
900	struct hstate *h = folio_hstate(folio: old_folio);
901
902	if (hugetlb_cgroup_disabled())
903	return;
904
905	spin_lock_irq(lock: &hugetlb_lock);
906	h_cg = hugetlb_cgroup_from_folio(folio: old_folio);
907	h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(folio: old_folio);
908	set_hugetlb_cgroup(folio: old_folio, NULL);
909	set_hugetlb_cgroup_rsvd(folio: old_folio, NULL);
910
911	/ move the h_cg details to new cgroup /
912	set_hugetlb_cgroup(folio: new_folio, h_cg);
913	set_hugetlb_cgroup_rsvd(folio: new_folio, h_cg: h_cg_rsvd);
914	list_move(list: &new_folio->lru, head: &h->hugepage_activelist);
915	spin_unlock_irq(lock: &hugetlb_lock);
916	}
917
918	static struct cftype hugetlb_files[] = {
919	{} / terminate /
920	};
921
/*
 * hugetlb cgroup subsystem definition: css lifecycle callbacks plus the
 * (empty) static file lists for both hierarchies.
 */
struct cgroup_subsys hugetlb_cgrp_subsys = {
.css_alloc = hugetlb_cgroup_css_alloc,
.css_offline = hugetlb_cgroup_css_offline,
.css_free = hugetlb_cgroup_css_free,
.dfl_cftypes = hugetlb_files,
.legacy_cftypes = hugetlb_files,
};
929

Browse the source code of Linux/mm/hugetlb_cgroup.c