vmstat.c source code [Linux/mm/vmstat.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/mm/vmstat.c
4	*
5	* Manages VM statistics
6	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
7	*
8	* zoned VM statistics
9	* Copyright (C) 2006 Silicon Graphics, Inc.,
10	* Christoph Lameter <cl@gentwo.org>
11	* Copyright (C) 2008-2014 Christoph Lameter
12	*/
13	#include <linux/fs.h>
14	#include <linux/mm.h>
15	#include <linux/err.h>
16	#include <linux/module.h>
17	#include <linux/slab.h>
18	#include <linux/cpu.h>
19	#include <linux/cpumask.h>
20	#include <linux/vmstat.h>
21	#include <linux/proc_fs.h>
22	#include <linux/seq_file.h>
23	#include <linux/debugfs.h>
24	#include <linux/sched.h>
25	#include <linux/math64.h>
26	#include <linux/writeback.h>
27	#include <linux/compaction.h>
28	#include <linux/mm_inline.h>
29	#include <linux/page_owner.h>
30	#include <linux/sched/isolation.h>
31
32	#include "internal.h"
33
34	#ifdef CONFIG_PROC_FS
35	#ifdef CONFIG_NUMA
36	#define ENABLE_NUMA_STAT 1
37	static int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
38
39	/ zero numa counters within a zone /
40	static void zero_zone_numa_counters(struct zone *zone)
41	{
42	int item, cpu;
43
44	for (item = `0`; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
45	atomic_long_set(v: &zone->vm_numa_event[item], i: `0`);
46	for_each_online_cpu(cpu) {
47	per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
48	= `0`;
49	}
50	}
51	}
52
53	/ zero numa counters of all the populated zones /
54	static void zero_zones_numa_counters(void)
55	{
56	struct zone *zone;
57
58	for_each_populated_zone(zone)
59	zero_zone_numa_counters(zone);
60	}
61
62	/ zero global numa counters /
63	static void zero_global_numa_counters(void)
64	{
65	int item;
66
67	for (item = `0`; item < NR_VM_NUMA_EVENT_ITEMS; item++)
68	atomic_long_set(v: &vm_numa_event[item], i: `0`);
69	}
70
/*
 * Discard all collected NUMA statistics: first the per-zone counters
 * (including their per-cpu parts), then the global totals.
 */
static void invalid_numa_statistics(void)
{
	/* Per-zone state of every populated zone. */
	zero_zones_numa_counters();

	/* Global vm_numa_event[] totals. */
	zero_global_numa_counters();
}
76
77	static DEFINE_MUTEX(vm_numa_stat_lock);
78
79	static int sysctl_vm_numa_stat_handler(const struct ctl_table table, int* write,
80	void buffer, size_t length, loff_t *ppos)
81	{
82	int ret, oldval;
83
84	mutex_lock(lock: &vm_numa_stat_lock);
85	if (write)
86	oldval = sysctl_vm_numa_stat;
87	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
88	if (ret \|\| !write)
89	goto out;
90
91	if (oldval == sysctl_vm_numa_stat)
92	goto out;
93	else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
94	static_branch_enable(&vm_numa_stat_key);
95	pr_info("enable numa statistics\n");
96	} else {
97	static_branch_disable(&vm_numa_stat_key);
98	invalid_numa_statistics();
99	pr_info("disable numa statistics, and clear numa counters\n");
100	}
101
102	out:
103	mutex_unlock(lock: &vm_numa_stat_lock);
104	return ret;
105	}
106	#endif
107	#endif /* CONFIG_PROC_FS */
108
109	#ifdef CONFIG_VM_EVENT_COUNTERS
110	DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{`0`}};
111	EXPORT_PER_CPU_SYMBOL(vm_event_states);
112
113	static void sum_vm_events(unsigned long *ret)
114	{
115	int cpu;
116	int i;
117
118	memset(s: ret, c: `0`, n: NR_VM_EVENT_ITEMS * sizeof(unsigned long));
119
120	for_each_online_cpu(cpu) {
121	struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
122
123	for (i = `0`; i < NR_VM_EVENT_ITEMS; i++)
124	ret[i] += this->event[i];
125	}
126	}
127
/*
 * Accumulate the vm event counters across all CPUs into @ret.
 *
 * The summation is bracketed by cpus_read_lock()/cpus_read_unlock().
 * The result is still unavoidably approximate - the counters can
 * change during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
	cpus_read_lock();

	sum_vm_events(ret);

	cpus_read_unlock();
}
139	EXPORT_SYMBOL_GPL(all_vm_events);
140
141	/*
142	* Fold the foreign cpu events into our own.
143	*
144	* This is adding to the events on one processor
145	* but keeps the global counts constant.
146	*/
147	void vm_events_fold_cpu(int cpu)
148	{
149	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
150	int i;
151
152	for (i = `0`; i < NR_VM_EVENT_ITEMS; i++) {
153	count_vm_events(item: i, delta: fold_state->event[i]);
154	fold_state->event[i] = `0`;
155	}
156	}
157
158	#endif /* CONFIG_VM_EVENT_COUNTERS */
159
160	/*
161	* Manage combined zone based / global counters
162	*
163	* vm_stat contains the global counters
164	*/
165	atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
166	atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
167	atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
168	EXPORT_SYMBOL(vm_zone_stat);
169	EXPORT_SYMBOL(vm_node_stat);
170
171	#ifdef CONFIG_NUMA
172	static void fold_vm_zone_numa_events(struct zone *zone)
173	{
174	unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { `0`, };
175	int cpu;
176	enum numa_stat_item item;
177
178	for_each_online_cpu(cpu) {
179	struct per_cpu_zonestat *pzstats;
180
181	pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
182	for (item = `0`; item < NR_VM_NUMA_EVENT_ITEMS; item++)
183	zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], `0`);
184	}
185
186	for (item = `0`; item < NR_VM_NUMA_EVENT_ITEMS; item++)
187	zone_numa_event_add(x: zone_numa_events[item], zone, item);
188	}
189
190	void fold_vm_numa_events(void)
191	{
192	struct zone *zone;
193
194	for_each_populated_zone(zone)
195	fold_vm_zone_numa_events(zone);
196	}
197	#endif
198
199	#ifdef CONFIG_SMP
200
/*
 * Compute the reduced per-cpu stat threshold for @zone: the distance
 * between the low and min watermarks divided by the number of online
 * CPUs, clamped to the range [1, 125].
 */
int calculate_pressure_threshold(struct zone *zone)
{
	int threshold;
	int watermark_distance;

	/*
	 * As vmstats are not up to date, there is drift between the estimated
	 * and real values. For high thresholds and a high number of CPUs, it
	 * is possible for the min watermark to be breached while the estimated
	 * value looks fine. The pressure threshold is a reduced value such
	 * that even the maximum amount of drift will not accidentally breach
	 * the min watermark
	 */
	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
	threshold = max(1, (int)(watermark_distance / num_online_cpus()));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
224
225	int calculate_normal_threshold(struct zone *zone)
226	{
227	int threshold;
228	int mem; / memory in 128 MB units /
229
230	/*
231	* The threshold scales with the number of processors and the amount
232	* of memory per zone. More memory means that we can defer updates for
233	* longer, more processors could lead to more contention.
234	* fls() is used to have a cheap way of logarithmic scaling.
235	*
236	* Some sample thresholds:
237	*
238	* Threshold Processors (fls) Zonesize fls(mem)+1
239	* ------------------------------------------------------------------
240	* 8 1 1 0.9-1 GB 4
241	* 16 2 2 0.9-1 GB 4
242	* 20 2 2 1-2 GB 5
243	* 24 2 2 2-4 GB 6
244	* 28 2 2 4-8 GB 7
245	* 32 2 2 8-16 GB 8
246	* 4 2 2 <128M 1
247	* 30 4 3 2-4 GB 5
248	* 48 4 3 8-16 GB 8
249	* 32 8 4 1-2 GB 4
250	* 32 8 4 0.9-1GB 4
251	* 10 16 5 <128M 1
252	* 40 16 5 900M 4
253	* 70 64 7 2-4 GB 5
254	* 84 64 7 4-8 GB 6
255	* 108 512 9 4-8 GB 6
256	* 125 1024 10 8-16 GB 8
257	* 125 1024 10 16-32 GB 9
258	*/
259
260	mem = zone_managed_pages(zone) >> (`27` - PAGE_SHIFT);
261
262	threshold = `2` * fls(x: num_online_cpus()) * (`1` + fls(x: mem));
263
264	/*
265	* Maximum threshold is 125
266	*/
267	threshold = min(`125`, threshold);
268
269	return threshold;
270	}
271
272	/*
273	* Refresh the thresholds for each zone.
274	*/
275	void refresh_zone_stat_thresholds(void)
276	{
277	struct pglist_data *pgdat;
278	struct zone *zone;
279	int cpu;
280	int threshold;
281
282	/ Zero current pgdat thresholds /
283	for_each_online_pgdat(pgdat) {
284	for_each_online_cpu(cpu) {
285	per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = `0`;
286	}
287	}
288
289	for_each_populated_zone(zone) {
290	struct pglist_data *pgdat = zone->zone_pgdat;
291	unsigned long max_drift, tolerate_drift;
292
293	threshold = calculate_normal_threshold(zone);
294
295	for_each_online_cpu(cpu) {
296	int pgdat_threshold;
297
298	per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
299	= threshold;
300
301	/ Base nodestat threshold on the largest populated zone. /
302	pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
303	per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
304	= max(threshold, pgdat_threshold);
305	}
306
307	/*
308	* Only set percpu_drift_mark if there is a danger that
309	* NR_FREE_PAGES reports the low watermark is ok when in fact
310	* the min watermark could be breached by an allocation
311	*/
312	tolerate_drift = low_wmark_pages(z: zone) - min_wmark_pages(z: zone);
313	max_drift = num_online_cpus() * threshold;
314	if (max_drift > tolerate_drift)
315	zone->percpu_drift_mark = high_wmark_pages(z: zone) +
316	max_drift;
317	}
318	}
319
320	void set_pgdat_percpu_threshold(pg_data_t *pgdat,
321	int (calculate_pressure)(struct* zone *))
322	{
323	struct zone *zone;
324	int cpu;
325	int threshold;
326	int i;
327
328	for (i = `0`; i < pgdat->nr_zones; i++) {
329	zone = &pgdat->node_zones[i];
330	if (!zone->percpu_drift_mark)
331	continue;
332
333	threshold = (*calculate_pressure)(zone);
334	for_each_online_cpu(cpu)
335	per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
336	= threshold;
337	}
338	}
339
340	/*
341	* For use when we know that interrupts are disabled,
342	* or when we know that preemption is disabled and that
343	* particular counter cannot be updated from interrupt context.
344	*/
345	void __mod_zone_page_state(struct zone zone, enum* zone_stat_item item,
346	long delta)
347	{
348	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
349	s8 __percpu *p = pcp->vm_stat_diff + item;
350	long x;
351	long t;
352
353	/*
354	* Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
355	* atomicity is provided by IRQs being disabled -- either explicitly
356	* or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
357	* CPU migrations and preemption potentially corrupts a counter so
358	* disable preemption.
359	*/
360	preempt_disable_nested();
361
362	x = delta + __this_cpu_read(*p);
363
364	t = __this_cpu_read(pcp->stat_threshold);
365
366	if (unlikely(abs(x) > t)) {
367	zone_page_state_add(x, zone, item);
368	x = `0`;
369	}
370	__this_cpu_write(*p, x);
371
372	preempt_enable_nested();
373	}
374	EXPORT_SYMBOL(__mod_zone_page_state);
375
376	void __mod_node_page_state(struct pglist_data pgdat, enum* node_stat_item item,
377	long delta)
378	{
379	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
380	s8 __percpu *p = pcp->vm_node_stat_diff + item;
381	long x;
382	long t;
383
384	if (vmstat_item_in_bytes(idx: item)) {
385	/*
386	* Only cgroups use subpage accounting right now; at
387	* the global level, these items still change in
388	* multiples of whole pages. Store them as pages
389	* internally to keep the per-cpu counters compact.
390	*/
391	VM_WARN_ON_ONCE(delta & (PAGE_SIZE - `1`));
392	delta >>= PAGE_SHIFT;
393	}
394
395	/ See __mod_node_page_state /
396	preempt_disable_nested();
397
398	x = delta + __this_cpu_read(*p);
399
400	t = __this_cpu_read(pcp->stat_threshold);
401
402	if (unlikely(abs(x) > t)) {
403	node_page_state_add(x, pgdat, item);
404	x = `0`;
405	}
406	__this_cpu_write(*p, x);
407
408	preempt_enable_nested();
409	}
410	EXPORT_SYMBOL(__mod_node_page_state);
411
412	/*
413	* Optimized increment and decrement functions.
414	*
415	* These are only for a single page and therefore can take a struct page *
416	* argument instead of struct zone *. This allows the inclusion of the code
417	* generated for page_zone(page) into the optimized functions.
418	*
419	* No overflow check is necessary and therefore the differential can be
420	* incremented or decremented in place which may allow the compilers to
421	* generate better code.
422	* The increment or decrement is known and therefore one boundary check can
423	* be omitted.
424	*
425	* NOTE: These functions are very performance sensitive. Change only
426	* with care.
427	*
428	* Some processors have inc/dec instructions that are atomic vs an interrupt.
429	* However, the code must first determine the differential location in a zone
430	* based on the processor number and then inc/dec the counter. There is no
431	* guarantee without disabling preemption that the processor will not change
432	* in between and therefore the atomicity vs. interrupt cannot be exploited
433	* in a useful way here.
434	*/
435	void __inc_zone_state(struct zone zone, enum* zone_stat_item item)
436	{
437	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
438	s8 __percpu *p = pcp->vm_stat_diff + item;
439	s8 v, t;
440
441	/ See __mod_node_page_state /
442	preempt_disable_nested();
443
444	v = __this_cpu_inc_return(*p);
445	t = __this_cpu_read(pcp->stat_threshold);
446	if (unlikely(v > t)) {
447	s8 overstep = t >> `1`;
448
449	zone_page_state_add(x: v + overstep, zone, item);
450	__this_cpu_write(*p, -overstep);
451	}
452
453	preempt_enable_nested();
454	}
455
456	void __inc_node_state(struct pglist_data pgdat, enum* node_stat_item item)
457	{
458	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
459	s8 __percpu *p = pcp->vm_node_stat_diff + item;
460	s8 v, t;
461
462	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
463
464	/ See __mod_node_page_state /
465	preempt_disable_nested();
466
467	v = __this_cpu_inc_return(*p);
468	t = __this_cpu_read(pcp->stat_threshold);
469	if (unlikely(v > t)) {
470	s8 overstep = t >> `1`;
471
472	node_page_state_add(x: v + overstep, pgdat, item);
473	__this_cpu_write(*p, -overstep);
474	}
475
476	preempt_enable_nested();
477	}
478
479	void __inc_zone_page_state(struct page page, enum* zone_stat_item item)
480	{
481	__inc_zone_state(zone: page_zone(page), item);
482	}
483	EXPORT_SYMBOL(__inc_zone_page_state);
484
485	void __inc_node_page_state(struct page page, enum* node_stat_item item)
486	{
487	__inc_node_state(pgdat: page_pgdat(page), item);
488	}
489	EXPORT_SYMBOL(__inc_node_page_state);
490
491	void __dec_zone_state(struct zone zone, enum* zone_stat_item item)
492	{
493	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
494	s8 __percpu *p = pcp->vm_stat_diff + item;
495	s8 v, t;
496
497	/ See __mod_node_page_state /
498	preempt_disable_nested();
499
500	v = __this_cpu_dec_return(*p);
501	t = __this_cpu_read(pcp->stat_threshold);
502	if (unlikely(v < - t)) {
503	s8 overstep = t >> `1`;
504
505	zone_page_state_add(x: v - overstep, zone, item);
506	__this_cpu_write(*p, overstep);
507	}
508
509	preempt_enable_nested();
510	}
511
512	void __dec_node_state(struct pglist_data pgdat, enum* node_stat_item item)
513	{
514	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
515	s8 __percpu *p = pcp->vm_node_stat_diff + item;
516	s8 v, t;
517
518	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
519
520	/ See __mod_node_page_state /
521	preempt_disable_nested();
522
523	v = __this_cpu_dec_return(*p);
524	t = __this_cpu_read(pcp->stat_threshold);
525	if (unlikely(v < - t)) {
526	s8 overstep = t >> `1`;
527
528	node_page_state_add(x: v - overstep, pgdat, item);
529	__this_cpu_write(*p, overstep);
530	}
531
532	preempt_enable_nested();
533	}
534
535	void __dec_zone_page_state(struct page page, enum* zone_stat_item item)
536	{
537	__dec_zone_state(zone: page_zone(page), item);
538	}
539	EXPORT_SYMBOL(__dec_zone_page_state);
540
541	void __dec_node_page_state(struct page page, enum* node_stat_item item)
542	{
543	__dec_node_state(pgdat: page_pgdat(page), item);
544	}
545	EXPORT_SYMBOL(__dec_node_page_state);
546
547	#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
548	/*
549	* If we have cmpxchg_local support then we do not need to incur the overhead
550	* that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
551	*
552	* mod_state() modifies the zone counter state through atomic per cpu
553	* operations.
554	*
555	* Overstep mode specifies how overstep should handled:
556	* 0 No overstepping
557	* 1 Overstepping half of threshold
558	* -1 Overstepping minus half of threshold
559	*/
560	static inline void mod_zone_state(struct zone *zone,
561	enum zone_stat_item item, long delta, int overstep_mode)
562	{
563	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
564	s8 __percpu *p = pcp->vm_stat_diff + item;
565	long n, t, z;
566	s8 o;
567
568	o = this_cpu_read(*p);
569	do {
570	z = `0`; / overflow to zone counters /
571
572	/*
573	* The fetching of the stat_threshold is racy. We may apply
574	* a counter threshold to the wrong the cpu if we get
575	* rescheduled while executing here. However, the next
576	* counter update will apply the threshold again and
577	* therefore bring the counter under the threshold again.
578	*
579	* Most of the time the thresholds are the same anyways
580	* for all cpus in a zone.
581	*/
582	t = this_cpu_read(pcp->stat_threshold);
583
584	n = delta + (long)o;
585
586	if (abs(n) > t) {
587	int os = overstep_mode * (t >> `1`) ;
588
589	/ Overflow must be added to zone counters /
590	z = n + os;
591	n = -os;
592	}
593	} while (!this_cpu_try_cmpxchg(*p, &o, n));
594
595	if (z)
596	zone_page_state_add(x: z, zone, item);
597	}
598
599	void mod_zone_page_state(struct zone zone, enum* zone_stat_item item,
600	long delta)
601	{
602	mod_zone_state(zone, item, delta, overstep_mode: `0`);
603	}
604	EXPORT_SYMBOL(mod_zone_page_state);
605
606	void inc_zone_page_state(struct page page, enum* zone_stat_item item)
607	{
608	mod_zone_state(zone: page_zone(page), item, delta: `1`, overstep_mode: `1`);
609	}
610	EXPORT_SYMBOL(inc_zone_page_state);
611
612	void dec_zone_page_state(struct page page, enum* zone_stat_item item)
613	{
614	mod_zone_state(zone: page_zone(page), item, delta: -`1`, overstep_mode: -`1`);
615	}
616	EXPORT_SYMBOL(dec_zone_page_state);
617
618	static inline void mod_node_state(struct pglist_data *pgdat,
619	enum node_stat_item item, int delta, int overstep_mode)
620	{
621	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
622	s8 __percpu *p = pcp->vm_node_stat_diff + item;
623	long n, t, z;
624	s8 o;
625
626	if (vmstat_item_in_bytes(idx: item)) {
627	/*
628	* Only cgroups use subpage accounting right now; at
629	* the global level, these items still change in
630	* multiples of whole pages. Store them as pages
631	* internally to keep the per-cpu counters compact.
632	*/
633	VM_WARN_ON_ONCE(delta & (PAGE_SIZE - `1`));
634	delta >>= PAGE_SHIFT;
635	}
636
637	o = this_cpu_read(*p);
638	do {
639	z = `0`; / overflow to node counters /
640
641	/*
642	* The fetching of the stat_threshold is racy. We may apply
643	* a counter threshold to the wrong the cpu if we get
644	* rescheduled while executing here. However, the next
645	* counter update will apply the threshold again and
646	* therefore bring the counter under the threshold again.
647	*
648	* Most of the time the thresholds are the same anyways
649	* for all cpus in a node.
650	*/
651	t = this_cpu_read(pcp->stat_threshold);
652
653	n = delta + (long)o;
654
655	if (abs(n) > t) {
656	int os = overstep_mode * (t >> `1`) ;
657
658	/ Overflow must be added to node counters /
659	z = n + os;
660	n = -os;
661	}
662	} while (!this_cpu_try_cmpxchg(*p, &o, n));
663
664	if (z)
665	node_page_state_add(x: z, pgdat, item);
666	}
667
668	void mod_node_page_state(struct pglist_data pgdat, enum* node_stat_item item,
669	long delta)
670	{
671	mod_node_state(pgdat, item, delta, overstep_mode: `0`);
672	}
673	EXPORT_SYMBOL(mod_node_page_state);
674
675	void inc_node_state(struct pglist_data pgdat, enum* node_stat_item item)
676	{
677	mod_node_state(pgdat, item, delta: `1`, overstep_mode: `1`);
678	}
679
680	void inc_node_page_state(struct page page, enum* node_stat_item item)
681	{
682	mod_node_state(pgdat: page_pgdat(page), item, delta: `1`, overstep_mode: `1`);
683	}
684	EXPORT_SYMBOL(inc_node_page_state);
685
686	void dec_node_page_state(struct page page, enum* node_stat_item item)
687	{
688	mod_node_state(pgdat: page_pgdat(page), item, delta: -`1`, overstep_mode: -`1`);
689	}
690	EXPORT_SYMBOL(dec_node_page_state);
691	#else
692	/*
693	* Use interrupt disable to serialize counter updates
694	*/
695	void mod_zone_page_state(struct zone zone, enum* zone_stat_item item,
696	long delta)
697	{
698	unsigned long flags;
699
700	local_irq_save(flags);
701	__mod_zone_page_state(zone, item, delta);
702	local_irq_restore(flags);
703	}
704	EXPORT_SYMBOL(mod_zone_page_state);
705
706	void inc_zone_page_state(struct page page, enum* zone_stat_item item)
707	{
708	unsigned long flags;
709	struct zone *zone;
710
711	zone = page_zone(page);
712	local_irq_save(flags);
713	__inc_zone_state(zone, item);
714	local_irq_restore(flags);
715	}
716	EXPORT_SYMBOL(inc_zone_page_state);
717
718	void dec_zone_page_state(struct page page, enum* zone_stat_item item)
719	{
720	unsigned long flags;
721
722	local_irq_save(flags);
723	__dec_zone_page_state(page, item);
724	local_irq_restore(flags);
725	}
726	EXPORT_SYMBOL(dec_zone_page_state);
727
728	void inc_node_state(struct pglist_data pgdat, enum* node_stat_item item)
729	{
730	unsigned long flags;
731
732	local_irq_save(flags);
733	__inc_node_state(pgdat, item);
734	local_irq_restore(flags);
735	}
736	EXPORT_SYMBOL(inc_node_state);
737
738	void mod_node_page_state(struct pglist_data pgdat, enum* node_stat_item item,
739	long delta)
740	{
741	unsigned long flags;
742
743	local_irq_save(flags);
744	__mod_node_page_state(pgdat, item, delta);
745	local_irq_restore(flags);
746	}
747	EXPORT_SYMBOL(mod_node_page_state);
748
749	void inc_node_page_state(struct page page, enum* node_stat_item item)
750	{
751	unsigned long flags;
752	struct pglist_data *pgdat;
753
754	pgdat = page_pgdat(page);
755	local_irq_save(flags);
756	__inc_node_state(pgdat, item);
757	local_irq_restore(flags);
758	}
759	EXPORT_SYMBOL(inc_node_page_state);
760
761	void dec_node_page_state(struct page page, enum* node_stat_item item)
762	{
763	unsigned long flags;
764
765	local_irq_save(flags);
766	__dec_node_page_state(page, item);
767	local_irq_restore(flags);
768	}
769	EXPORT_SYMBOL(dec_node_page_state);
770	#endif
771
772	/*
773	* Fold a differential into the global counters.
774	* Returns the number of counters updated.
775	*/
776	static int fold_diff(int zone_diff, int* *node_diff)
777	{
778	int i;
779	int changes = `0`;
780
781	for (i = `0`; i < NR_VM_ZONE_STAT_ITEMS; i++)
782	if (zone_diff[i]) {
783	atomic_long_add(i: zone_diff[i], v: &vm_zone_stat[i]);
784	changes++;
785	}
786
787	for (i = `0`; i < NR_VM_NODE_STAT_ITEMS; i++)
788	if (node_diff[i]) {
789	atomic_long_add(i: node_diff[i], v: &vm_node_stat[i]);
790	changes++;
791	}
792	return changes;
793	}
794
795	/*
796	* Update the zone counters for the current cpu.
797	*
798	* Note that refresh_cpu_vm_stats strives to only access
799	* node local memory. The per cpu pagesets on remote zones are placed
800	* in the memory local to the processor using that pageset. So the
801	* loop over all zones will access a series of cachelines local to
802	* the processor.
803	*
804	* The call to zone_page_state_add updates the cachelines with the
805	* statistics in the remote zone struct as well as the global cachelines
806	* with the global counters. These could cause remote node cache line
807	* bouncing and will have to be only done when necessary.
808	*
809	* The function returns the number of global counters updated.
810	*/
811	static int refresh_cpu_vm_stats(bool do_pagesets)
812	{
813	struct pglist_data *pgdat;
814	struct zone *zone;
815	int i;
816	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { `0`, };
817	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { `0`, };
818	int changes = `0`;
819
820	for_each_populated_zone(zone) {
821	struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
822	struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
823
824	for (i = `0`; i < NR_VM_ZONE_STAT_ITEMS; i++) {
825	int v;
826
827	v = this_cpu_xchg(pzstats->vm_stat_diff[i], `0`);
828	if (v) {
829
830	atomic_long_add(i: v, v: &zone->vm_stat[i]);
831	global_zone_diff[i] += v;
832	#ifdef CONFIG_NUMA
833	/ 3 seconds idle till flush /
834	__this_cpu_write(pcp->expire, `3`);
835	#endif
836	}
837	}
838
839	if (do_pagesets) {
840	cond_resched();
841
842	changes += decay_pcp_high(zone, this_cpu_ptr(pcp));
843	#ifdef CONFIG_NUMA
844	/*
845	* Deal with draining the remote pageset of this
846	* processor
847	*
848	* Check if there are pages remaining in this pageset
849	* if not then there is nothing to expire.
850	*/
851	if (!__this_cpu_read(pcp->expire) \|\|
852	!__this_cpu_read(pcp->count))
853	continue;
854
855	/*
856	* We never drain zones local to this processor.
857	*/
858	if (zone_to_nid(zone) == numa_node_id()) {
859	__this_cpu_write(pcp->expire, `0`);
860	continue;
861	}
862
863	if (__this_cpu_dec_return(pcp->expire)) {
864	changes++;
865	continue;
866	}
867
868	if (__this_cpu_read(pcp->count)) {
869	drain_zone_pages(zone, this_cpu_ptr(pcp));
870	changes++;
871	}
872	#endif
873	}
874	}
875
876	for_each_online_pgdat(pgdat) {
877	struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
878
879	for (i = `0`; i < NR_VM_NODE_STAT_ITEMS; i++) {
880	int v;
881
882	v = this_cpu_xchg(p->vm_node_stat_diff[i], `0`);
883	if (v) {
884	atomic_long_add(i: v, v: &pgdat->vm_stat[i]);
885	global_node_diff[i] += v;
886	}
887	}
888	}
889
890	changes += fold_diff(zone_diff: global_zone_diff, node_diff: global_node_diff);
891	return changes;
892	}
893
894	/*
895	* Fold the data for an offline cpu into the global array.
896	* There cannot be any access by the offline cpu and therefore
897	* synchronization is simplified.
898	*/
899	void cpu_vm_stats_fold(int cpu)
900	{
901	struct pglist_data *pgdat;
902	struct zone *zone;
903	int i;
904	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { `0`, };
905	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { `0`, };
906
907	for_each_populated_zone(zone) {
908	struct per_cpu_zonestat *pzstats;
909
910	pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
911
912	for (i = `0`; i < NR_VM_ZONE_STAT_ITEMS; i++) {
913	if (pzstats->vm_stat_diff[i]) {
914	int v;
915
916	v = pzstats->vm_stat_diff[i];
917	pzstats->vm_stat_diff[i] = `0`;
918	atomic_long_add(i: v, v: &zone->vm_stat[i]);
919	global_zone_diff[i] += v;
920	}
921	}
922	#ifdef CONFIG_NUMA
923	for (i = `0`; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
924	if (pzstats->vm_numa_event[i]) {
925	unsigned long v;
926
927	v = pzstats->vm_numa_event[i];
928	pzstats->vm_numa_event[i] = `0`;
929	zone_numa_event_add(x: v, zone, item: i);
930	}
931	}
932	#endif
933	}
934
935	for_each_online_pgdat(pgdat) {
936	struct per_cpu_nodestat *p;
937
938	p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
939
940	for (i = `0`; i < NR_VM_NODE_STAT_ITEMS; i++)
941	if (p->vm_node_stat_diff[i]) {
942	int v;
943
944	v = p->vm_node_stat_diff[i];
945	p->vm_node_stat_diff[i] = `0`;
946	atomic_long_add(i: v, v: &pgdat->vm_stat[i]);
947	global_node_diff[i] += v;
948	}
949	}
950
951	fold_diff(zone_diff: global_zone_diff, node_diff: global_node_diff);
952	}
953
954	/*
955	* this is only called if !populated_zone(zone), which implies no other users of
956	* pset->vm_stat_diff[] exist.
957	*/
958	void drain_zonestat(struct zone zone, struct* per_cpu_zonestat *pzstats)
959	{
960	unsigned long v;
961	int i;
962
963	for (i = `0`; i < NR_VM_ZONE_STAT_ITEMS; i++) {
964	if (pzstats->vm_stat_diff[i]) {
965	v = pzstats->vm_stat_diff[i];
966	pzstats->vm_stat_diff[i] = `0`;
967	zone_page_state_add(x: v, zone, item: i);
968	}
969	}
970
971	#ifdef CONFIG_NUMA
972	for (i = `0`; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
973	if (pzstats->vm_numa_event[i]) {
974	v = pzstats->vm_numa_event[i];
975	pzstats->vm_numa_event[i] = `0`;
976	zone_numa_event_add(x: v, zone, item: i);
977	}
978	}
979	#endif
980	}
981	#endif
982
983	#ifdef CONFIG_NUMA
984	/*
985	* Determine the per node value of a stat item. This function
986	* is called frequently in a NUMA machine, so try to be as
987	* frugal as possible.
988	*/
989	unsigned long sum_zone_node_page_state(int node,
990	enum zone_stat_item item)
991	{
992	struct zone *zones = NODE_DATA(node)->node_zones;
993	int i;
994	unsigned long count = `0`;
995
996	for (i = `0`; i < MAX_NR_ZONES; i++)
997	count += zone_page_state(zone: zones + i, item);
998
999	return count;
1000	}
1001
1002	/ Determine the per node value of a numa stat item. /
1003	unsigned long sum_zone_numa_event_state(int node,
1004	enum numa_stat_item item)
1005	{
1006	struct zone *zones = NODE_DATA(node)->node_zones;
1007	unsigned long count = `0`;
1008	int i;
1009
1010	for (i = `0`; i < MAX_NR_ZONES; i++)
1011	count += zone_numa_event_state(zone: zones + i, item);
1012
1013	return count;
1014	}
1015
1016	/*
1017	* Determine the per node value of a stat item.
1018	*/
1019	unsigned long node_page_state_pages(struct pglist_data *pgdat,
1020	enum node_stat_item item)
1021	{
1022	long x = atomic_long_read(v: &pgdat->vm_stat[item]);
1023	#ifdef CONFIG_SMP
1024	if (x < `0`)
1025	x = `0`;
1026	#endif
1027	return x;
1028	}
1029
1030	unsigned long node_page_state(struct pglist_data *pgdat,
1031	enum node_stat_item item)
1032	{
1033	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
1034
1035	return node_page_state_pages(pgdat, item);
1036	}
1037	#endif
1038
1039	/*
1040	* Count number of pages "struct page" and "struct page_ext" consume.
1041	* nr_memmap_boot_pages: # of pages allocated by boot allocator
1042	* nr_memmap_pages: # of pages that were allocated by buddy allocator
1043	*/
1044	static atomic_long_t nr_memmap_boot_pages = ATOMIC_LONG_INIT(`0`);
1045	static atomic_long_t nr_memmap_pages = ATOMIC_LONG_INIT(`0`);
1046
1047	void memmap_boot_pages_add(long delta)
1048	{
1049	atomic_long_add(i: delta, v: &nr_memmap_boot_pages);
1050	}
1051
1052	void memmap_pages_add(long delta)
1053	{
1054	atomic_long_add(i: delta, v: &nr_memmap_pages);
1055	}
1056
1057	#ifdef CONFIG_COMPACTION
1058
/* Free-memory summary of one zone, filled by fill_contig_page_info(). */
struct contig_page_info {
	/* Free base pages, summed over all orders. */
	unsigned long free_pages;
	/* Free blocks of any order. */
	unsigned long free_blocks_total;
	/* Free blocks of the target order, counting splits of larger ones. */
	unsigned long free_blocks_suitable;
};
1064
1065	/*
1066	* Calculate the number of free pages in a zone, how many contiguous
1067	* pages are free and how many are large enough to satisfy an allocation of
1068	* the target size. Note that this function makes no attempt to estimate
1069	* how many suitable free blocks there might be if MOVABLE pages were
1070	* migrated. Calculating that is possible, but expensive and can be
1071	* figured out from userspace
1072	*/
1073	static void fill_contig_page_info(struct zone *zone,
1074	unsigned int suitable_order,
1075	struct contig_page_info *info)
1076	{
1077	unsigned int order;
1078
1079	info->free_pages = `0`;
1080	info->free_blocks_total = `0`;
1081	info->free_blocks_suitable = `0`;
1082
1083	for (order = `0`; order < NR_PAGE_ORDERS; order++) {
1084	unsigned long blocks;
1085
1086	/*
1087	* Count number of free blocks.
1088	*
1089	* Access to nr_free is lockless as nr_free is used only for
1090	* diagnostic purposes. Use data_race to avoid KCSAN warning.
1091	*/
1092	blocks = data_race(zone->free_area[order].nr_free);
1093	info->free_blocks_total += blocks;
1094
1095	/ Count free base pages /
1096	info->free_pages += blocks << order;
1097
1098	/ Count the suitable free blocks /
1099	if (order >= suitable_order)
1100	info->free_blocks_suitable += blocks <<
1101	(order - suitable_order);
1102	}
1103	}
1104
1105	/*
1106	* A fragmentation index only makes sense if an allocation of a requested
1107	* size would fail. If that is true, the fragmentation index indicates
1108	* whether external fragmentation or a lack of memory was the problem.
1109	* The value can be used to determine if page reclaim or compaction
1110	* should be used
1111	*/
1112	static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
1113	{
1114	unsigned long requested = `1UL` << order;
1115
1116	if (WARN_ON_ONCE(order > MAX_PAGE_ORDER))
1117	return `0`;
1118
1119	if (!info->free_blocks_total)
1120	return `0`;
1121
1122	/ Fragmentation index only makes sense when a request would fail /
1123	if (info->free_blocks_suitable)
1124	return -`1000`;
1125
1126	/*
1127	* Index is between 0 and 1 so return within 3 decimal places
1128	*
1129	* 0 => allocation would fail due to lack of memory
1130	* 1 => allocation would fail due to fragmentation
1131	*/
1132	return `1000` - div_u64( dividend: (`1000`+(div_u64(dividend: info->free_pages * `1000ULL`, divisor: requested))), divisor: info->free_blocks_total);
1133	}
1134
1135	/*
1136	* Calculates external fragmentation within a zone wrt the given order.
1137	* It is defined as the percentage of pages found in blocks of size
1138	* less than 1 << order. It returns values in range [0, 100].
1139	*/
1140	unsigned int extfrag_for_order(struct zone zone, unsigned* int order)
1141	{
1142	struct contig_page_info info;
1143
1144	fill_contig_page_info(zone, suitable_order: order, info: &info);
1145	if (info.free_pages == `0`)
1146	return `0`;
1147
1148	return div_u64(dividend: (info.free_pages -
1149	(info.free_blocks_suitable << order)) * `100`,
1150	divisor: info.free_pages);
1151	}
1152
1153	/ Same as __fragmentation index but allocs contig_page_info on stack /
1154	int fragmentation_index(struct zone zone, unsigned* int order)
1155	{
1156	struct contig_page_info info;
1157
1158	fill_contig_page_info(zone, suitable_order: order, info: &info);
1159	return __fragmentation_index(order, info: &info);
1160	}
1161	#endif
1162
1163	#if defined(CONFIG_PROC_FS) \|\| defined(CONFIG_SYSFS) \|\| \
1164	defined(CONFIG_NUMA) \|\| defined(CONFIG_MEMCG)
1165	#ifdef CONFIG_ZONE_DMA
1166	#define TEXT_FOR_DMA(xx, yy) [xx##_DMA] = yy "_dma",
1167	#else
1168	#define TEXT_FOR_DMA(xx, yy)
1169	#endif
1170
1171	#ifdef CONFIG_ZONE_DMA32
1172	#define TEXT_FOR_DMA32(xx, yy) [xx##_DMA32] = yy "_dma32",
1173	#else
1174	#define TEXT_FOR_DMA32(xx, yy)
1175	#endif
1176
1177	#ifdef CONFIG_HIGHMEM
1178	#define TEXT_FOR_HIGHMEM(xx, yy) [xx##_HIGH] = yy "_high",
1179	#else
1180	#define TEXT_FOR_HIGHMEM(xx, yy)
1181	#endif
1182
1183	#ifdef CONFIG_ZONE_DEVICE
1184	#define TEXT_FOR_DEVICE(xx, yy) [xx##_DEVICE] = yy "_device",
1185	#else
1186	#define TEXT_FOR_DEVICE(xx, yy)
1187	#endif
1188
1189	#define TEXTS_FOR_ZONES(xx, yy) \
1190	TEXT_FOR_DMA(xx, yy) \
1191	TEXT_FOR_DMA32(xx, yy) \
1192	[xx##_NORMAL] = yy "_normal", \
1193	TEXT_FOR_HIGHMEM(xx, yy) \
1194	[xx##_MOVABLE] = yy "_movable", \
1195	TEXT_FOR_DEVICE(xx, yy)
1196
1197	const char * const vmstat_text[] = {
1198	/ enum zone_stat_item counters /
1199	#define I(x) (x)
1200	[I(NR_FREE_PAGES)] = "nr_free_pages",
1201	[I(NR_FREE_PAGES_BLOCKS)] = "nr_free_pages_blocks",
1202	[I(NR_ZONE_INACTIVE_ANON)] = "nr_zone_inactive_anon",
1203	[I(NR_ZONE_ACTIVE_ANON)] = "nr_zone_active_anon",
1204	[I(NR_ZONE_INACTIVE_FILE)] = "nr_zone_inactive_file",
1205	[I(NR_ZONE_ACTIVE_FILE)] = "nr_zone_active_file",
1206	[I(NR_ZONE_UNEVICTABLE)] = "nr_zone_unevictable",
1207	[I(NR_ZONE_WRITE_PENDING)] = "nr_zone_write_pending",
1208	[I(NR_MLOCK)] = "nr_mlock",
1209	#if IS_ENABLED(CONFIG_ZSMALLOC)
1210	[I(NR_ZSPAGES)] = "nr_zspages",
1211	#endif
1212	[I(NR_FREE_CMA_PAGES)] = "nr_free_cma",
1213	#ifdef CONFIG_UNACCEPTED_MEMORY
1214	[I(NR_UNACCEPTED)] = "nr_unaccepted",
1215	#endif
1216	#undef I
1217
1218	/ enum numa_stat_item counters /
1219	#define I(x) (NR_VM_ZONE_STAT_ITEMS + x)
1220	#ifdef CONFIG_NUMA
1221	[I(NUMA_HIT)] = "numa_hit",
1222	[I(NUMA_MISS)] = "numa_miss",
1223	[I(NUMA_FOREIGN)] = "numa_foreign",
1224	[I(NUMA_INTERLEAVE_HIT)] = "numa_interleave",
1225	[I(NUMA_LOCAL)] = "numa_local",
1226	[I(NUMA_OTHER)] = "numa_other",
1227	#endif
1228	#undef I
1229
1230	/ enum node_stat_item counters /
1231	#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + x)
1232	[I(NR_INACTIVE_ANON)] = "nr_inactive_anon",
1233	[I(NR_ACTIVE_ANON)] = "nr_active_anon",
1234	[I(NR_INACTIVE_FILE)] = "nr_inactive_file",
1235	[I(NR_ACTIVE_FILE)] = "nr_active_file",
1236	[I(NR_UNEVICTABLE)] = "nr_unevictable",
1237	[I(NR_SLAB_RECLAIMABLE_B)] = "nr_slab_reclaimable",
1238	[I(NR_SLAB_UNRECLAIMABLE_B)] = "nr_slab_unreclaimable",
1239	[I(NR_ISOLATED_ANON)] = "nr_isolated_anon",
1240	[I(NR_ISOLATED_FILE)] = "nr_isolated_file",
1241	[I(WORKINGSET_NODES)] = "workingset_nodes",
1242	[I(WORKINGSET_REFAULT_ANON)] = "workingset_refault_anon",
1243	[I(WORKINGSET_REFAULT_FILE)] = "workingset_refault_file",
1244	[I(WORKINGSET_ACTIVATE_ANON)] = "workingset_activate_anon",
1245	[I(WORKINGSET_ACTIVATE_FILE)] = "workingset_activate_file",
1246	[I(WORKINGSET_RESTORE_ANON)] = "workingset_restore_anon",
1247	[I(WORKINGSET_RESTORE_FILE)] = "workingset_restore_file",
1248	[I(WORKINGSET_NODERECLAIM)] = "workingset_nodereclaim",
1249	[I(NR_ANON_MAPPED)] = "nr_anon_pages",
1250	[I(NR_FILE_MAPPED)] = "nr_mapped",
1251	[I(NR_FILE_PAGES)] = "nr_file_pages",
1252	[I(NR_FILE_DIRTY)] = "nr_dirty",
1253	[I(NR_WRITEBACK)] = "nr_writeback",
1254	[I(NR_SHMEM)] = "nr_shmem",
1255	[I(NR_SHMEM_THPS)] = "nr_shmem_hugepages",
1256	[I(NR_SHMEM_PMDMAPPED)] = "nr_shmem_pmdmapped",
1257	[I(NR_FILE_THPS)] = "nr_file_hugepages",
1258	[I(NR_FILE_PMDMAPPED)] = "nr_file_pmdmapped",
1259	[I(NR_ANON_THPS)] = "nr_anon_transparent_hugepages",
1260	[I(NR_VMSCAN_WRITE)] = "nr_vmscan_write",
1261	[I(NR_VMSCAN_IMMEDIATE)] = "nr_vmscan_immediate_reclaim",
1262	[I(NR_DIRTIED)] = "nr_dirtied",
1263	[I(NR_WRITTEN)] = "nr_written",
1264	[I(NR_THROTTLED_WRITTEN)] = "nr_throttled_written",
1265	[I(NR_KERNEL_MISC_RECLAIMABLE)] = "nr_kernel_misc_reclaimable",
1266	[I(NR_FOLL_PIN_ACQUIRED)] = "nr_foll_pin_acquired",
1267	[I(NR_FOLL_PIN_RELEASED)] = "nr_foll_pin_released",
1268	[I(NR_KERNEL_STACK_KB)] = "nr_kernel_stack",
1269	#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
1270	[I(NR_KERNEL_SCS_KB)] = "nr_shadow_call_stack",
1271	#endif
1272	[I(NR_PAGETABLE)] = "nr_page_table_pages",
1273	[I(NR_SECONDARY_PAGETABLE)] = "nr_sec_page_table_pages",
1274	#ifdef CONFIG_IOMMU_SUPPORT
1275	[I(NR_IOMMU_PAGES)] = "nr_iommu_pages",
1276	#endif
1277	#ifdef CONFIG_SWAP
1278	[I(NR_SWAPCACHE)] = "nr_swapcached",
1279	#endif
1280	#ifdef CONFIG_NUMA_BALANCING
1281	[I(PGPROMOTE_SUCCESS)] = "pgpromote_success",
1282	[I(PGPROMOTE_CANDIDATE)] = "pgpromote_candidate",
1283	[I(PGPROMOTE_CANDIDATE_NRL)] = "pgpromote_candidate_nrl",
1284	#endif
1285	[I(PGDEMOTE_KSWAPD)] = "pgdemote_kswapd",
1286	[I(PGDEMOTE_DIRECT)] = "pgdemote_direct",
1287	[I(PGDEMOTE_KHUGEPAGED)] = "pgdemote_khugepaged",
1288	[I(PGDEMOTE_PROACTIVE)] = "pgdemote_proactive",
1289	#ifdef CONFIG_HUGETLB_PAGE
1290	[I(NR_HUGETLB)] = "nr_hugetlb",
1291	#endif
1292	[I(NR_BALLOON_PAGES)] = "nr_balloon_pages",
1293	[I(NR_KERNEL_FILE_PAGES)] = "nr_kernel_file_pages",
1294	#undef I
1295
1296	/ system-wide enum vm_stat_item counters /
1297	#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \
1298	NR_VM_NODE_STAT_ITEMS + x)
1299	[I(NR_DIRTY_THRESHOLD)] = "nr_dirty_threshold",
1300	[I(NR_DIRTY_BG_THRESHOLD)] = "nr_dirty_background_threshold",
1301	[I(NR_MEMMAP_PAGES)] = "nr_memmap_pages",
1302	[I(NR_MEMMAP_BOOT_PAGES)] = "nr_memmap_boot_pages",
1303	#undef I
1304
1305	#if defined(CONFIG_VM_EVENT_COUNTERS)
1306	/ enum vm_event_item counters /
1307	#define I(x) (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \
1308	NR_VM_NODE_STAT_ITEMS + NR_VM_STAT_ITEMS + x)
1309
1310	[I(PGPGIN)] = "pgpgin",
1311	[I(PGPGOUT)] = "pgpgout",
1312	[I(PSWPIN)] = "pswpin",
1313	[I(PSWPOUT)] = "pswpout",
1314
1315	#define OFF (NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS + \
1316	NR_VM_NODE_STAT_ITEMS + NR_VM_STAT_ITEMS)
1317	TEXTS_FOR_ZONES(OFF+PGALLOC, "pgalloc")
1318	TEXTS_FOR_ZONES(OFF+ALLOCSTALL, "allocstall")
1319	TEXTS_FOR_ZONES(OFF+PGSCAN_SKIP, "pgskip")
1320	#undef OFF
1321
1322	[I(PGFREE)] = "pgfree",
1323	[I(PGACTIVATE)] = "pgactivate",
1324	[I(PGDEACTIVATE)] = "pgdeactivate",
1325	[I(PGLAZYFREE)] = "pglazyfree",
1326
1327	[I(PGFAULT)] = "pgfault",
1328	[I(PGMAJFAULT)] = "pgmajfault",
1329	[I(PGLAZYFREED)] = "pglazyfreed",
1330
1331	[I(PGREFILL)] = "pgrefill",
1332	[I(PGREUSE)] = "pgreuse",
1333	[I(PGSTEAL_KSWAPD)] = "pgsteal_kswapd",
1334	[I(PGSTEAL_DIRECT)] = "pgsteal_direct",
1335	[I(PGSTEAL_KHUGEPAGED)] = "pgsteal_khugepaged",
1336	[I(PGSTEAL_PROACTIVE)] = "pgsteal_proactive",
1337	[I(PGSCAN_KSWAPD)] = "pgscan_kswapd",
1338	[I(PGSCAN_DIRECT)] = "pgscan_direct",
1339	[I(PGSCAN_KHUGEPAGED)] = "pgscan_khugepaged",
1340	[I(PGSCAN_PROACTIVE)] = "pgscan_proactive",
1341	[I(PGSCAN_DIRECT_THROTTLE)] = "pgscan_direct_throttle",
1342	[I(PGSCAN_ANON)] = "pgscan_anon",
1343	[I(PGSCAN_FILE)] = "pgscan_file",
1344	[I(PGSTEAL_ANON)] = "pgsteal_anon",
1345	[I(PGSTEAL_FILE)] = "pgsteal_file",
1346
1347	#ifdef CONFIG_NUMA
1348	[I(PGSCAN_ZONE_RECLAIM_SUCCESS)] = "zone_reclaim_success",
1349	[I(PGSCAN_ZONE_RECLAIM_FAILED)] = "zone_reclaim_failed",
1350	#endif
1351	[I(PGINODESTEAL)] = "pginodesteal",
1352	[I(SLABS_SCANNED)] = "slabs_scanned",
1353	[I(KSWAPD_INODESTEAL)] = "kswapd_inodesteal",
1354	[I(KSWAPD_LOW_WMARK_HIT_QUICKLY)] = "kswapd_low_wmark_hit_quickly",
1355	[I(KSWAPD_HIGH_WMARK_HIT_QUICKLY)] = "kswapd_high_wmark_hit_quickly",
1356	[I(PAGEOUTRUN)] = "pageoutrun",
1357
1358	[I(PGROTATED)] = "pgrotated",
1359
1360	[I(DROP_PAGECACHE)] = "drop_pagecache",
1361	[I(DROP_SLAB)] = "drop_slab",
1362	[I(OOM_KILL)] = "oom_kill",
1363
1364	#ifdef CONFIG_NUMA_BALANCING
1365	[I(NUMA_PTE_UPDATES)] = "numa_pte_updates",
1366	[I(NUMA_HUGE_PTE_UPDATES)] = "numa_huge_pte_updates",
1367	[I(NUMA_HINT_FAULTS)] = "numa_hint_faults",
1368	[I(NUMA_HINT_FAULTS_LOCAL)] = "numa_hint_faults_local",
1369	[I(NUMA_PAGE_MIGRATE)] = "numa_pages_migrated",
1370	#endif
1371	#ifdef CONFIG_MIGRATION
1372	[I(PGMIGRATE_SUCCESS)] = "pgmigrate_success",
1373	[I(PGMIGRATE_FAIL)] = "pgmigrate_fail",
1374	[I(THP_MIGRATION_SUCCESS)] = "thp_migration_success",
1375	[I(THP_MIGRATION_FAIL)] = "thp_migration_fail",
1376	[I(THP_MIGRATION_SPLIT)] = "thp_migration_split",
1377	#endif
1378	#ifdef CONFIG_COMPACTION
1379	[I(COMPACTMIGRATE_SCANNED)] = "compact_migrate_scanned",
1380	[I(COMPACTFREE_SCANNED)] = "compact_free_scanned",
1381	[I(COMPACTISOLATED)] = "compact_isolated",
1382	[I(COMPACTSTALL)] = "compact_stall",
1383	[I(COMPACTFAIL)] = "compact_fail",
1384	[I(COMPACTSUCCESS)] = "compact_success",
1385	[I(KCOMPACTD_WAKE)] = "compact_daemon_wake",
1386	[I(KCOMPACTD_MIGRATE_SCANNED)] = "compact_daemon_migrate_scanned",
1387	[I(KCOMPACTD_FREE_SCANNED)] = "compact_daemon_free_scanned",
1388	#endif
1389
1390	#ifdef CONFIG_HUGETLB_PAGE
1391	[I(HTLB_BUDDY_PGALLOC)] = "htlb_buddy_alloc_success",
1392	[I(HTLB_BUDDY_PGALLOC_FAIL)] = "htlb_buddy_alloc_fail",
1393	#endif
1394	#ifdef CONFIG_CMA
1395	[I(CMA_ALLOC_SUCCESS)] = "cma_alloc_success",
1396	[I(CMA_ALLOC_FAIL)] = "cma_alloc_fail",
1397	#endif
1398	[I(UNEVICTABLE_PGCULLED)] = "unevictable_pgs_culled",
1399	[I(UNEVICTABLE_PGSCANNED)] = "unevictable_pgs_scanned",
1400	[I(UNEVICTABLE_PGRESCUED)] = "unevictable_pgs_rescued",
1401	[I(UNEVICTABLE_PGMLOCKED)] = "unevictable_pgs_mlocked",
1402	[I(UNEVICTABLE_PGMUNLOCKED)] = "unevictable_pgs_munlocked",
1403	[I(UNEVICTABLE_PGCLEARED)] = "unevictable_pgs_cleared",
1404	[I(UNEVICTABLE_PGSTRANDED)] = "unevictable_pgs_stranded",
1405
1406	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1407	[I(THP_FAULT_ALLOC)] = "thp_fault_alloc",
1408	[I(THP_FAULT_FALLBACK)] = "thp_fault_fallback",
1409	[I(THP_FAULT_FALLBACK_CHARGE)] = "thp_fault_fallback_charge",
1410	[I(THP_COLLAPSE_ALLOC)] = "thp_collapse_alloc",
1411	[I(THP_COLLAPSE_ALLOC_FAILED)] = "thp_collapse_alloc_failed",
1412	[I(THP_FILE_ALLOC)] = "thp_file_alloc",
1413	[I(THP_FILE_FALLBACK)] = "thp_file_fallback",
1414	[I(THP_FILE_FALLBACK_CHARGE)] = "thp_file_fallback_charge",
1415	[I(THP_FILE_MAPPED)] = "thp_file_mapped",
1416	[I(THP_SPLIT_PAGE)] = "thp_split_page",
1417	[I(THP_SPLIT_PAGE_FAILED)] = "thp_split_page_failed",
1418	[I(THP_DEFERRED_SPLIT_PAGE)] = "thp_deferred_split_page",
1419	[I(THP_UNDERUSED_SPLIT_PAGE)] = "thp_underused_split_page",
1420	[I(THP_SPLIT_PMD)] = "thp_split_pmd",
1421	[I(THP_SCAN_EXCEED_NONE_PTE)] = "thp_scan_exceed_none_pte",
1422	[I(THP_SCAN_EXCEED_SWAP_PTE)] = "thp_scan_exceed_swap_pte",
1423	[I(THP_SCAN_EXCEED_SHARED_PTE)] = "thp_scan_exceed_share_pte",
1424	#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1425	[I(THP_SPLIT_PUD)] = "thp_split_pud",
1426	#endif
1427	[I(THP_ZERO_PAGE_ALLOC)] = "thp_zero_page_alloc",
1428	[I(THP_ZERO_PAGE_ALLOC_FAILED)] = "thp_zero_page_alloc_failed",
1429	[I(THP_SWPOUT)] = "thp_swpout",
1430	[I(THP_SWPOUT_FALLBACK)] = "thp_swpout_fallback",
1431	#endif
1432	#ifdef CONFIG_MEMORY_BALLOON
1433	[I(BALLOON_INFLATE)] = "balloon_inflate",
1434	[I(BALLOON_DEFLATE)] = "balloon_deflate",
1435	#ifdef CONFIG_BALLOON_COMPACTION
1436	[I(BALLOON_MIGRATE)] = "balloon_migrate",
1437	#endif
1438	#endif /* CONFIG_MEMORY_BALLOON */
1439	#ifdef CONFIG_DEBUG_TLBFLUSH
1440	[I(NR_TLB_REMOTE_FLUSH)] = "nr_tlb_remote_flush",
1441	[I(NR_TLB_REMOTE_FLUSH_RECEIVED)] = "nr_tlb_remote_flush_received",
1442	[I(NR_TLB_LOCAL_FLUSH_ALL)] = "nr_tlb_local_flush_all",
1443	[I(NR_TLB_LOCAL_FLUSH_ONE)] = "nr_tlb_local_flush_one",
1444	#endif /* CONFIG_DEBUG_TLBFLUSH */
1445
1446	#ifdef CONFIG_SWAP
1447	[I(SWAP_RA)] = "swap_ra",
1448	[I(SWAP_RA_HIT)] = "swap_ra_hit",
1449	[I(SWPIN_ZERO)] = "swpin_zero",
1450	[I(SWPOUT_ZERO)] = "swpout_zero",
1451	#ifdef CONFIG_KSM
1452	[I(KSM_SWPIN_COPY)] = "ksm_swpin_copy",
1453	#endif
1454	#endif
1455	#ifdef CONFIG_KSM
1456	[I(COW_KSM)] = "cow_ksm",
1457	#endif
1458	#ifdef CONFIG_ZSWAP
1459	[I(ZSWPIN)] = "zswpin",
1460	[I(ZSWPOUT)] = "zswpout",
1461	[I(ZSWPWB)] = "zswpwb",
1462	#endif
1463	#ifdef CONFIG_X86
1464	[I(DIRECT_MAP_LEVEL2_SPLIT)] = "direct_map_level2_splits",
1465	[I(DIRECT_MAP_LEVEL3_SPLIT)] = "direct_map_level3_splits",
1466	[I(DIRECT_MAP_LEVEL2_COLLAPSE)] = "direct_map_level2_collapses",
1467	[I(DIRECT_MAP_LEVEL3_COLLAPSE)] = "direct_map_level3_collapses",
1468	#endif
1469	#ifdef CONFIG_PER_VMA_LOCK_STATS
1470	[I(VMA_LOCK_SUCCESS)] = "vma_lock_success",
1471	[I(VMA_LOCK_ABORT)] = "vma_lock_abort",
1472	[I(VMA_LOCK_RETRY)] = "vma_lock_retry",
1473	[I(VMA_LOCK_MISS)] = "vma_lock_miss",
1474	#endif
1475	#ifdef CONFIG_DEBUG_STACK_USAGE
1476	[I(KSTACK_1K)] = "kstack_1k",
1477	#if THREAD_SIZE > 1024
1478	[I(KSTACK_2K)] = "kstack_2k",
1479	#endif
1480	#if THREAD_SIZE > 2048
1481	[I(KSTACK_4K)] = "kstack_4k",
1482	#endif
1483	#if THREAD_SIZE > 4096
1484	[I(KSTACK_8K)] = "kstack_8k",
1485	#endif
1486	#if THREAD_SIZE > 8192
1487	[I(KSTACK_16K)] = "kstack_16k",
1488	#endif
1489	#if THREAD_SIZE > 16384
1490	[I(KSTACK_32K)] = "kstack_32k",
1491	#endif
1492	#if THREAD_SIZE > 32768
1493	[I(KSTACK_64K)] = "kstack_64k",
1494	#endif
1495	#if THREAD_SIZE > 65536
1496	[I(KSTACK_REST)] = "kstack_rest",
1497	#endif
1498	#endif
1499	#undef I
1500	#endif /* CONFIG_VM_EVENT_COUNTERS */
1501	};
1502	#endif /* CONFIG_PROC_FS \|\| CONFIG_SYSFS \|\| CONFIG_NUMA \|\| CONFIG_MEMCG */
1503
1504	#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) \|\| \
1505	defined(CONFIG_PROC_FS)
1506	static void frag_start(struct* seq_file m, loff_t pos)
1507	{
1508	pg_data_t *pgdat;
1509	loff_t node = *pos;
1510
1511	for (pgdat = first_online_pgdat();
1512	pgdat && node;
1513	pgdat = next_online_pgdat(pgdat))
1514	--node;
1515
1516	return pgdat;
1517	}
1518
1519	static void frag_next(struct* seq_file m, void* arg, loff_t pos)
1520	{
1521	pg_data_t pgdat = (pg_data_t )arg;
1522
1523	(*pos)++;
1524	return next_online_pgdat(pgdat);
1525	}
1526
/* seq_file ->stop: nothing to release, frag_start() acquires no resources. */
static void frag_stop(struct seq_file *m, void *arg)
{
}
1530
1531	/*
1532	* Walk zones in a node and print using a callback.
1533	* If @assert_populated is true, only use callback for zones that are populated.
1534	*/
1535	static void walk_zones_in_node(struct seq_file m, pg_data_t pgdat,
1536	bool assert_populated, bool nolock,
1537	void (print)(struct* seq_file m, pg_data_t , struct zone *))
1538	{
1539	struct zone *zone;
1540	struct zone *node_zones = pgdat->node_zones;
1541	unsigned long flags;
1542
1543	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
1544	if (assert_populated && !populated_zone(zone))
1545	continue;
1546
1547	if (!nolock)
1548	spin_lock_irqsave(&zone->lock, flags);
1549	print(m, pgdat, zone);
1550	if (!nolock)
1551	spin_unlock_irqrestore(lock: &zone->lock, flags);
1552	}
1553	}
1554	#endif
1555
1556	#ifdef CONFIG_PROC_FS
1557	static void frag_show_print(struct seq_file m, pg_data_t pgdat,
1558	struct zone *zone)
1559	{
1560	int order;
1561
1562	seq_printf(m, fmt: "Node %d, zone %8s ", pgdat->node_id, zone->name);
1563	for (order = `0`; order < NR_PAGE_ORDERS; ++order)
1564	/*
1565	* Access to nr_free is lockless as nr_free is used only for
1566	* printing purposes. Use data_race to avoid KCSAN warning.
1567	*/
1568	seq_printf(m, fmt: "%6lu ", data_race(zone->free_area[order].nr_free));
1569	seq_putc(m, c: `'\n'`);
1570	}
1571
1572	/*
1573	* This walks the free areas for each zone.
1574	*/
1575	static int frag_show(struct seq_file m, void* *arg)
1576	{
1577	pg_data_t pgdat = (pg_data_t )arg;
1578	walk_zones_in_node(m, pgdat, assert_populated: true, nolock: false, print: frag_show_print);
1579	return `0`;
1580	}
1581
1582	static void pagetypeinfo_showfree_print(struct seq_file *m,
1583	pg_data_t pgdat, struct* zone *zone)
1584	{
1585	int order, mtype;
1586
1587	for (mtype = `0`; mtype < MIGRATE_TYPES; mtype++) {
1588	seq_printf(m, fmt: "Node %4d, zone %8s, type %12s ",
1589	pgdat->node_id,
1590	zone->name,
1591	migratetype_names[mtype]);
1592	for (order = `0`; order < NR_PAGE_ORDERS; ++order) {
1593	unsigned long freecount = `0`;
1594	struct free_area *area;
1595	struct list_head *curr;
1596	bool overflow = false;
1597
1598	area = &(zone->free_area[order]);
1599
1600	list_for_each(curr, &area->free_list[mtype]) {
1601	/*
1602	* Cap the free_list iteration because it might
1603	* be really large and we are under a spinlock
1604	* so a long time spent here could trigger a
1605	* hard lockup detector. Anyway this is a
1606	* debugging tool so knowing there is a handful
1607	* of pages of this order should be more than
1608	* sufficient.
1609	*/
1610	if (++freecount >= `100000`) {
1611	overflow = true;
1612	break;
1613	}
1614	}
1615	seq_printf(m, fmt: "%s%6lu ", overflow ? ">" : "", freecount);
1616	spin_unlock_irq(lock: &zone->lock);
1617	cond_resched();
1618	spin_lock_irq(lock: &zone->lock);
1619	}
1620	seq_putc(m, c: `'\n'`);
1621	}
1622	}
1623
1624	/ Print out the free pages at each order for each migatetype /
1625	static void pagetypeinfo_showfree(struct seq_file m, void* *arg)
1626	{
1627	int order;
1628	pg_data_t pgdat = (pg_data_t )arg;
1629
1630	/ Print header /
1631	seq_printf(m, fmt: "%-43s ", "Free pages count per migrate type at order");
1632	for (order = `0`; order < NR_PAGE_ORDERS; ++order)
1633	seq_printf(m, fmt: "%6d ", order);
1634	seq_putc(m, c: `'\n'`);
1635
1636	walk_zones_in_node(m, pgdat, assert_populated: true, nolock: false, print: pagetypeinfo_showfree_print);
1637	}
1638
1639	static void pagetypeinfo_showblockcount_print(struct seq_file *m,
1640	pg_data_t pgdat, struct* zone *zone)
1641	{
1642	int mtype;
1643	unsigned long pfn;
1644	unsigned long start_pfn = zone->zone_start_pfn;
1645	unsigned long end_pfn = zone_end_pfn(zone);
1646	unsigned long count[MIGRATE_TYPES] = { `0`, };
1647
1648	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
1649	struct page *page;
1650
1651	page = pfn_to_online_page(pfn);
1652	if (!page)
1653	continue;
1654
1655	if (page_zone(page) != zone)
1656	continue;
1657
1658	mtype = get_pageblock_migratetype(page);
1659
1660	if (mtype < MIGRATE_TYPES)
1661	count[mtype]++;
1662	}
1663
1664	/ Print counts /
1665	seq_printf(m, fmt: "Node %d, zone %8s ", pgdat->node_id, zone->name);
1666	for (mtype = `0`; mtype < MIGRATE_TYPES; mtype++)
1667	seq_printf(m, fmt: "%12lu ", count[mtype]);
1668	seq_putc(m, c: `'\n'`);
1669	}
1670
1671	/ Print out the number of pageblocks for each migratetype /
1672	static void pagetypeinfo_showblockcount(struct seq_file m, void* *arg)
1673	{
1674	int mtype;
1675	pg_data_t pgdat = (pg_data_t )arg;
1676
1677	seq_printf(m, fmt: "\n%-23s", "Number of blocks type ");
1678	for (mtype = `0`; mtype < MIGRATE_TYPES; mtype++)
1679	seq_printf(m, fmt: "%12s ", migratetype_names[mtype]);
1680	seq_putc(m, c: `'\n'`);
1681	walk_zones_in_node(m, pgdat, assert_populated: true, nolock: false,
1682	print: pagetypeinfo_showblockcount_print);
1683	}
1684
1685	/*
1686	* Print out the number of pageblocks for each migratetype that contain pages
1687	* of other types. This gives an indication of how well fallbacks are being
1688	* contained by rmqueue_fallback(). It requires information from PAGE_OWNER
1689	* to determine what is going on
1690	*/
1691	static void pagetypeinfo_showmixedcount(struct seq_file m, pg_data_t pgdat)
1692	{
1693	#ifdef CONFIG_PAGE_OWNER
1694	int mtype;
1695
1696	if (!static_branch_unlikely(&page_owner_inited))
1697	return;
1698
1699	drain_all_pages(NULL);
1700
1701	seq_printf(m, "\n%-23s", "Number of mixed blocks ");
1702	for (mtype = `0`; mtype < MIGRATE_TYPES; mtype++)
1703	seq_printf(m, "%12s ", migratetype_names[mtype]);
1704	seq_putc(m, `'\n'`);
1705
1706	walk_zones_in_node(m, pgdat, true, true,
1707	pagetypeinfo_showmixedcount_print);
1708	#endif /* CONFIG_PAGE_OWNER */
1709	}
1710
1711	/*
1712	* This prints out statistics in relation to grouping pages by mobility.
1713	* It is expensive to collect so do not constantly read the file.
1714	*/
1715	static int pagetypeinfo_show(struct seq_file m, void* *arg)
1716	{
1717	pg_data_t pgdat = (pg_data_t )arg;
1718
1719	/ check memoryless node /
1720	if (!node_state(node: pgdat->node_id, state: N_MEMORY))
1721	return `0`;
1722
1723	seq_printf(m, fmt: "Page block order: %d\n", pageblock_order);
1724	seq_printf(m, fmt: "Pages per block: %lu\n", pageblock_nr_pages);
1725	seq_putc(m, c: `'\n'`);
1726	pagetypeinfo_showfree(m, arg: pgdat);
1727	pagetypeinfo_showblockcount(m, arg: pgdat);
1728	pagetypeinfo_showmixedcount(m, pgdat);
1729
1730	return `0`;
1731	}
1732
1733	static const struct seq_operations fragmentation_op = {
1734	.start = frag_start,
1735	.next = frag_next,
1736	.stop = frag_stop,
1737	.show = frag_show,
1738	};
1739
1740	static const struct seq_operations pagetypeinfo_op = {
1741	.start = frag_start,
1742	.next = frag_next,
1743	.stop = frag_stop,
1744	.show = pagetypeinfo_show,
1745	};
1746
1747	static bool is_zone_first_populated(pg_data_t pgdat, struct* zone *zone)
1748	{
1749	int zid;
1750
1751	for (zid = `0`; zid < MAX_NR_ZONES; zid++) {
1752	struct zone *compare = &pgdat->node_zones[zid];
1753
1754	if (populated_zone(zone: compare))
1755	return zone == compare;
1756	}
1757
1758	return false;
1759	}
1760
1761	static void zoneinfo_show_print(struct seq_file m, pg_data_t pgdat,
1762	struct zone *zone)
1763	{
1764	int i;
1765	seq_printf(m, fmt: "Node %d, zone %8s", pgdat->node_id, zone->name);
1766	if (is_zone_first_populated(pgdat, zone)) {
1767	seq_printf(m, fmt: "\n per-node stats");
1768	for (i = `0`; i < NR_VM_NODE_STAT_ITEMS; i++) {
1769	unsigned long pages = node_page_state_pages(pgdat, item: i);
1770
1771	if (vmstat_item_print_in_thp(item: i))
1772	pages /= HPAGE_PMD_NR;
1773	seq_printf(m, fmt: "\n %-12s %lu", node_stat_name(item: i),
1774	pages);
1775	}
1776	}
1777	seq_printf(m,
1778	fmt: "\n pages free %lu"
1779	"\n boost %lu"
1780	"\n min %lu"
1781	"\n low %lu"
1782	"\n high %lu"
1783	"\n promo %lu"
1784	"\n spanned %lu"
1785	"\n present %lu"
1786	"\n managed %lu"
1787	"\n cma %lu",
1788	zone_page_state(zone, item: NR_FREE_PAGES),
1789	zone->watermark_boost,
1790	min_wmark_pages(z: zone),
1791	low_wmark_pages(z: zone),
1792	high_wmark_pages(z: zone),
1793	promo_wmark_pages(z: zone),
1794	zone->spanned_pages,
1795	zone->present_pages,
1796	zone_managed_pages(zone),
1797	zone_cma_pages(zone));
1798
1799	seq_printf(m,
1800	fmt: "\n protection: (%ld",
1801	zone->lowmem_reserve[`0`]);
1802	for (i = `1`; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
1803	seq_printf(m, fmt: ", %ld", zone->lowmem_reserve[i]);
1804	seq_putc(m, c: `')'`);
1805
1806	/ If unpopulated, no other information is useful /
1807	if (!populated_zone(zone)) {
1808	seq_putc(m, c: `'\n'`);
1809	return;
1810	}
1811
1812	for (i = `0`; i < NR_VM_ZONE_STAT_ITEMS; i++)
1813	seq_printf(m, fmt: "\n %-12s %lu", zone_stat_name(item: i),
1814	zone_page_state(zone, item: i));
1815
1816	#ifdef CONFIG_NUMA
1817	fold_vm_zone_numa_events(zone);
1818	for (i = `0`; i < NR_VM_NUMA_EVENT_ITEMS; i++)
1819	seq_printf(m, fmt: "\n %-12s %lu", numa_stat_name(item: i),
1820	zone_numa_event_state(zone, item: i));
1821	#endif
1822
1823	seq_printf(m, fmt: "\n pagesets");
1824	for_each_online_cpu(i) {
1825	struct per_cpu_pages *pcp;
1826	struct per_cpu_zonestat __maybe_unused *pzstats;
1827
1828	pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
1829	seq_printf(m,
1830	fmt: "\n cpu: %i"
1831	"\n count: %i"
1832	"\n high: %i"
1833	"\n batch: %i"
1834	"\n high_min: %i"
1835	"\n high_max: %i",
1836	i,
1837	pcp->count,
1838	pcp->high,
1839	pcp->batch,
1840	pcp->high_min,
1841	pcp->high_max);
1842	#ifdef CONFIG_SMP
1843	pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
1844	seq_printf(m, fmt: "\n vm stats threshold: %d",
1845	pzstats->stat_threshold);
1846	#endif
1847	}
1848	seq_printf(m,
1849	fmt: "\n node_unreclaimable: %u"
1850	"\n start_pfn: %lu",
1851	atomic_read(v: &pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES,
1852	zone->zone_start_pfn);
1853	seq_putc(m, c: `'\n'`);
1854	}
1855
1856	/*
1857	* Output information about zones in @pgdat. All zones are printed regardless
1858	* of whether they are populated or not: lowmem_reserve_ratio operates on the
1859	* set of all zones and userspace would not be aware of such zones if they are
1860	* suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
1861	*/
1862	static int zoneinfo_show(struct seq_file m, void* *arg)
1863	{
1864	pg_data_t pgdat = (pg_data_t )arg;
1865	walk_zones_in_node(m, pgdat, assert_populated: false, nolock: false, print: zoneinfo_show_print);
1866	return `0`;
1867	}
1868
1869	static const struct seq_operations zoneinfo_op = {
1870	.start = frag_start, / iterate over all zones. The same as in*
1871	* fragmentation. */
1872	.next = frag_next,
1873	.stop = frag_stop,
1874	.show = zoneinfo_show,
1875	};
1876
/*
 * Total number of counters exported through the vmstat seq_file: the sum of
 * all counter sections, with the vm event section counted only when
 * CONFIG_VM_EVENT_COUNTERS is enabled.
 */
#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
			 NR_VM_NUMA_EVENT_ITEMS + \
			 NR_VM_NODE_STAT_ITEMS + \
			 NR_VM_STAT_ITEMS + \
			 (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
			  NR_VM_EVENT_ITEMS : 0))
1883
1884	static void vmstat_start(struct* seq_file m, loff_t pos)
1885	{
1886	unsigned long *v;
1887	int i;
1888
1889	if (*pos >= NR_VMSTAT_ITEMS)
1890	return NULL;
1891
1892	BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) != NR_VMSTAT_ITEMS);
1893	fold_vm_numa_events();
1894	v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
1895	m->private = v;
1896	if (!v)
1897	return ERR_PTR(error: -ENOMEM);
1898	for (i = `0`; i < NR_VM_ZONE_STAT_ITEMS; i++)
1899	v[i] = global_zone_page_state(item: i);
1900	v += NR_VM_ZONE_STAT_ITEMS;
1901
1902	#ifdef CONFIG_NUMA
1903	for (i = `0`; i < NR_VM_NUMA_EVENT_ITEMS; i++)
1904	v[i] = global_numa_event_state(item: i);
1905	v += NR_VM_NUMA_EVENT_ITEMS;
1906	#endif
1907
1908	for (i = `0`; i < NR_VM_NODE_STAT_ITEMS; i++) {
1909	v[i] = global_node_page_state_pages(item: i);
1910	if (vmstat_item_print_in_thp(item: i))
1911	v[i] /= HPAGE_PMD_NR;
1912	}
1913	v += NR_VM_NODE_STAT_ITEMS;
1914
1915	global_dirty_limits(pbackground: v + NR_DIRTY_BG_THRESHOLD,
1916	pdirty: v + NR_DIRTY_THRESHOLD);
1917	v[NR_MEMMAP_PAGES] = atomic_long_read(v: &nr_memmap_pages);
1918	v[NR_MEMMAP_BOOT_PAGES] = atomic_long_read(v: &nr_memmap_boot_pages);
1919	v += NR_VM_STAT_ITEMS;
1920
1921	#ifdef CONFIG_VM_EVENT_COUNTERS
1922	all_vm_events(v);
1923	v[PGPGIN] /= `2`; / sectors -> kbytes /
1924	v[PGPGOUT] /= `2`;
1925	#endif
1926	return (unsigned long )m->private + pos;
1927	}
1928
1929	static void vmstat_next(struct* seq_file m, void* arg, loff_t pos)
1930	{
1931	(*pos)++;
1932	if (*pos >= NR_VMSTAT_ITEMS)
1933	return NULL;
1934	return (unsigned long )m->private + pos;
1935	}
1936
1937	static int vmstat_show(struct seq_file m, void* *arg)
1938	{
1939	unsigned long *l = arg;
1940	unsigned long off = l - (unsigned long *)m->private;
1941
1942	seq_puts(m, s: vmstat_text[off]);
1943	seq_put_decimal_ull(m, delimiter: " ", num: *l);
1944	seq_putc(m, c: `'\n'`);
1945
1946	if (off == NR_VMSTAT_ITEMS - `1`) {
1947	/*
1948	* We've come to the end - add any deprecated counters to avoid
1949	* breaking userspace which might depend on them being present.
1950	*/
1951	seq_puts(m, s: "nr_unstable 0\n");
1952	}
1953	return `0`;
1954	}
1955
1956	static void vmstat_stop(struct seq_file m, void* *arg)
1957	{
1958	kfree(objp: m->private);
1959	m->private = NULL;
1960	}
1961
1962	static const struct seq_operations vmstat_op = {
1963	.start = vmstat_start,
1964	.next = vmstat_next,
1965	.stop = vmstat_stop,
1966	.show = vmstat_show,
1967	};
1968	#endif /* CONFIG_PROC_FS */
1969
1970	#ifdef CONFIG_SMP
1971	static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
1972	static int sysctl_stat_interval __read_mostly = HZ;
1973	static int vmstat_late_init_done;
1974
1975	#ifdef CONFIG_PROC_FS
1976	static void refresh_vm_stats(struct work_struct *work)
1977	{
1978	refresh_cpu_vm_stats(do_pagesets: true);
1979	}
1980
1981	static int vmstat_refresh(const struct ctl_table table, int* write,
1982	void buffer, size_t lenp, loff_t *ppos)
1983	{
1984	long val;
1985	int err;
1986	int i;
1987
1988	/*
1989	* The regular update, every sysctl_stat_interval, may come later
1990	* than expected: leaving a significant amount in per_cpu buckets.
1991	* This is particularly misleading when checking a quantity of HUGE
1992	* pages, immediately after running a test. /proc/sys/vm/stat_refresh,
1993	* which can equally be echo'ed to or cat'ted from (by root),
1994	* can be used to update the stats just before reading them.
1995	*
1996	* Oh, and since global_zone_page_state() etc. are so careful to hide
1997	* transiently negative values, report an error here if any of
1998	* the stats is negative, so we know to go looking for imbalance.
1999	*/
2000	err = schedule_on_each_cpu(func: refresh_vm_stats);
2001	if (err)
2002	return err;
2003	for (i = `0`; i < NR_VM_ZONE_STAT_ITEMS; i++) {
2004	/*
2005	* Skip checking stats known to go negative occasionally.
2006	*/
2007	switch (i) {
2008	case NR_ZONE_WRITE_PENDING:
2009	case NR_FREE_CMA_PAGES:
2010	continue;
2011	}
2012	val = atomic_long_read(v: &vm_zone_stat[i]);
2013	if (val < `0`) {
2014	pr_warn("%s: %s %ld\n",
2015	__func__, zone_stat_name(i), val);
2016	}
2017	}
2018	for (i = `0`; i < NR_VM_NODE_STAT_ITEMS; i++) {
2019	/*
2020	* Skip checking stats known to go negative occasionally.
2021	*/
2022	switch (i) {
2023	case NR_WRITEBACK:
2024	continue;
2025	}
2026	val = atomic_long_read(v: &vm_node_stat[i]);
2027	if (val < `0`) {
2028	pr_warn("%s: %s %ld\n",
2029	__func__, node_stat_name(i), val);
2030	}
2031	}
2032	if (write)
2033	ppos += lenp;
2034	else
2035	*lenp = `0`;
2036	return `0`;
2037	}
2038	#endif /* CONFIG_PROC_FS */
2039
2040	static void vmstat_update(struct work_struct *w)
2041	{
2042	if (refresh_cpu_vm_stats(do_pagesets: true)) {
2043	/*
2044	* Counters were updated so we expect more updates
2045	* to occur in the future. Keep on running the
2046	* update worker thread.
2047	*/
2048	queue_delayed_work_on(smp_processor_id(), wq: mm_percpu_wq,
2049	this_cpu_ptr(&vmstat_work),
2050	delay: round_jiffies_relative(j: sysctl_stat_interval));
2051	}
2052	}
2053
2054	/*
2055	* Check if the diffs for a certain cpu indicate that
2056	* an update is needed.
2057	*/
2058	static bool need_update(int cpu)
2059	{
2060	pg_data_t *last_pgdat = NULL;
2061	struct zone *zone;
2062
2063	for_each_populated_zone(zone) {
2064	struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
2065	struct per_cpu_nodestat *n;
2066
2067	/*
2068	* The fast way of checking if there are any vmstat diffs.
2069	*/
2070	if (memchr_inv(s: pzstats->vm_stat_diff, c: `0`, n: sizeof(pzstats->vm_stat_diff)))
2071	return true;
2072
2073	if (last_pgdat == zone->zone_pgdat)
2074	continue;
2075	last_pgdat = zone->zone_pgdat;
2076	n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
2077	if (memchr_inv(s: n->vm_node_stat_diff, c: `0`, n: sizeof(n->vm_node_stat_diff)))
2078	return true;
2079	}
2080	return false;
2081	}
2082
2083	/*
2084	* Switch off vmstat processing and then fold all the remaining differentials
2085	* until the diffs stay at zero. The function is used by NOHZ and can only be
2086	* invoked when tick processing is not active.
2087	*/
2088	void quiet_vmstat(void)
2089	{
2090	if (system_state != SYSTEM_RUNNING)
2091	return;
2092
2093	if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
2094	return;
2095
2096	if (!need_update(smp_processor_id()))
2097	return;
2098
2099	/*
2100	* Just refresh counters and do not care about the pending delayed
2101	* vmstat_update. It doesn't fire that often to matter and canceling
2102	* it would be too expensive from this path.
2103	* vmstat_shepherd will take care about that for us.
2104	*/
2105	refresh_cpu_vm_stats(do_pagesets: false);
2106	}
2107
2108	/*
2109	* Shepherd worker thread that checks the
2110	* differentials of processors that have their worker
2111	* threads for vm statistics updates disabled because of
2112	* inactivity.
2113	*/
2114	static void vmstat_shepherd(struct work_struct *w);
2115
2116	static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
2117
2118	static void vmstat_shepherd(struct work_struct *w)
2119	{
2120	int cpu;
2121
2122	cpus_read_lock();
2123	/ Check processors whose vmstat worker threads have been disabled /
2124	for_each_online_cpu(cpu) {
2125	struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
2126
2127	/*
2128	* In kernel users of vmstat counters either require the precise value and
2129	* they are using zone_page_state_snapshot interface or they can live with
2130	* an imprecision as the regular flushing can happen at arbitrary time and
2131	* cumulative error can grow (see calculate_normal_threshold).
2132	*
2133	* From that POV the regular flushing can be postponed for CPUs that have
2134	* been isolated from the kernel interference without critical
2135	* infrastructure ever noticing. Skip regular flushing from vmstat_shepherd
2136	* for all isolated CPUs to avoid interference with the isolated workload.
2137	*/
2138	if (cpu_is_isolated(cpu))
2139	continue;
2140
2141	if (!delayed_work_pending(dw) && need_update(cpu))
2142	queue_delayed_work_on(cpu, wq: mm_percpu_wq, work: dw, delay: `0`);
2143
2144	cond_resched();
2145	}
2146	cpus_read_unlock();
2147
2148	schedule_delayed_work(dwork: &shepherd,
2149	delay: round_jiffies_relative(j: sysctl_stat_interval));
2150	}
2151
2152	static void __init start_shepherd_timer(void)
2153	{
2154	int cpu;
2155
2156	for_each_possible_cpu(cpu) {
2157	INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
2158	vmstat_update);
2159
2160	/*
2161	* For secondary CPUs during CPU hotplug scenarios,
2162	* vmstat_cpu_online() will enable the work.
2163	* mm/vmstat:online enables and disables vmstat_work
2164	* symmetrically during CPU hotplug events.
2165	*/
2166	if (!cpu_online(cpu))
2167	disable_delayed_work_sync(dwork: &per_cpu(vmstat_work, cpu));
2168	}
2169
2170	schedule_delayed_work(dwork: &shepherd,
2171	delay: round_jiffies_relative(j: sysctl_stat_interval));
2172	}
2173
2174	static void __init init_cpu_node_state(void)
2175	{
2176	int node;
2177
2178	for_each_online_node(node) {
2179	if (!cpumask_empty(srcp: cpumask_of_node(node)))
2180	node_set_state(node, state: N_CPU);
2181	}
2182	}
2183
2184	static int vmstat_cpu_online(unsigned int cpu)
2185	{
2186	if (vmstat_late_init_done)
2187	refresh_zone_stat_thresholds();
2188
2189	if (!node_state(node: cpu_to_node(cpu), state: N_CPU)) {
2190	node_set_state(node: cpu_to_node(cpu), state: N_CPU);
2191	}
2192	enable_delayed_work(dwork: &per_cpu(vmstat_work, cpu));
2193
2194	return `0`;
2195	}
2196
2197	static int vmstat_cpu_down_prep(unsigned int cpu)
2198	{
2199	disable_delayed_work_sync(dwork: &per_cpu(vmstat_work, cpu));
2200	return `0`;
2201	}
2202
2203	static int vmstat_cpu_dead(unsigned int cpu)
2204	{
2205	const struct cpumask *node_cpus;
2206	int node;
2207
2208	node = cpu_to_node(cpu);
2209
2210	refresh_zone_stat_thresholds();
2211	node_cpus = cpumask_of_node(node);
2212	if (!cpumask_empty(srcp: node_cpus))
2213	return `0`;
2214
2215	node_clear_state(node, state: N_CPU);
2216
2217	return `0`;
2218	}
2219
2220	static int __init vmstat_late_init(void)
2221	{
2222	refresh_zone_stat_thresholds();
2223	vmstat_late_init_done = `1`;
2224
2225	return `0`;
2226	}
2227	late_initcall(vmstat_late_init);
2228	#endif
2229
2230	#ifdef CONFIG_PROC_FS
2231	static const struct ctl_table vmstat_table[] = {
2232	#ifdef CONFIG_SMP
2233	{
2234	.procname = "stat_interval",
2235	.data = &sysctl_stat_interval,
2236	.maxlen = sizeof(sysctl_stat_interval),
2237	.mode = `0644`,
2238	.proc_handler = proc_dointvec_jiffies,
2239	},
2240	{
2241	.procname = "stat_refresh",
2242	.data = NULL,
2243	.maxlen = `0`,
2244	.mode = `0600`,
2245	.proc_handler = vmstat_refresh,
2246	},
2247	#endif
2248	#ifdef CONFIG_NUMA
2249	{
2250	.procname = "numa_stat",
2251	.data = &sysctl_vm_numa_stat,
2252	.maxlen = sizeof(int),
2253	.mode = `0644`,
2254	.proc_handler = sysctl_vm_numa_stat_handler,
2255	.extra1 = SYSCTL_ZERO,
2256	.extra2 = SYSCTL_ONE,
2257	},
2258	#endif
2259	};
2260	#endif
2261
2262	struct workqueue_struct *mm_percpu_wq;
2263
2264	void __init init_mm_internals(void)
2265	{
2266	int ret __maybe_unused;
2267
2268	mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, `0`);
2269
2270	#ifdef CONFIG_SMP
2271	ret = cpuhp_setup_state_nocalls(state: CPUHP_MM_VMSTAT_DEAD, name: "mm/vmstat:dead",
2272	NULL, teardown: vmstat_cpu_dead);
2273	if (ret < `0`)
2274	pr_err("vmstat: failed to register 'dead' hotplug state\n");
2275
2276	ret = cpuhp_setup_state_nocalls(state: CPUHP_AP_ONLINE_DYN, name: "mm/vmstat:online",
2277	startup: vmstat_cpu_online,
2278	teardown: vmstat_cpu_down_prep);
2279	if (ret < `0`)
2280	pr_err("vmstat: failed to register 'online' hotplug state\n");
2281
2282	cpus_read_lock();
2283	init_cpu_node_state();
2284	cpus_read_unlock();
2285
2286	start_shepherd_timer();
2287	#endif
2288	#ifdef CONFIG_PROC_FS
2289	proc_create_seq("buddyinfo", `0444`, NULL, &fragmentation_op);
2290	proc_create_seq("pagetypeinfo", `0400`, NULL, &pagetypeinfo_op);
2291	proc_create_seq("vmstat", `0444`, NULL, &vmstat_op);
2292	proc_create_seq("zoneinfo", `0444`, NULL, &zoneinfo_op);
2293	register_sysctl_init("vm", vmstat_table);
2294	#endif
2295	}
2296
2297	#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
2298
2299	/*
2300	* Return an index indicating how much of the available free memory is
2301	* unusable for an allocation of the requested size.
2302	*/
2303	static int unusable_free_index(unsigned int order,
2304	struct contig_page_info *info)
2305	{
2306	/ No free memory is interpreted as all free memory is unusable /
2307	if (info->free_pages == `0`)
2308	return `1000`;
2309
2310	/*
2311	* Index should be a value between 0 and 1. Return a value to 3
2312	* decimal places.
2313	*
2314	* 0 => no fragmentation
2315	* 1 => high fragmentation
2316	*/
2317	return div_u64(dividend: (info->free_pages - (info->free_blocks_suitable << order)) * `1000ULL`, divisor: info->free_pages);
2318
2319	}
2320
2321	static void unusable_show_print(struct seq_file *m,
2322	pg_data_t pgdat, struct* zone *zone)
2323	{
2324	unsigned int order;
2325	int index;
2326	struct contig_page_info info;
2327
2328	seq_printf(m, fmt: "Node %d, zone %8s ",
2329	pgdat->node_id,
2330	zone->name);
2331	for (order = `0`; order < NR_PAGE_ORDERS; ++order) {
2332	fill_contig_page_info(zone, suitable_order: order, info: &info);
2333	index = unusable_free_index(order, info: &info);
2334	seq_printf(m, fmt: "%d.%03d ", index / `1000`, index % `1000`);
2335	}
2336
2337	seq_putc(m, c: `'\n'`);
2338	}
2339
2340	/*
2341	* Display unusable free space index
2342	*
2343	* The unusable free space index measures how much of the available free
2344	* memory cannot be used to satisfy an allocation of a given size and is a
2345	* value between 0 and 1. The higher the value, the more of free memory is
2346	* unusable and by implication, the worse the external fragmentation is. This
2347	* can be expressed as a percentage by multiplying by 100.
2348	*/
2349	static int unusable_show(struct seq_file m, void* *arg)
2350	{
2351	pg_data_t pgdat = (pg_data_t )arg;
2352
2353	/ check memoryless node /
2354	if (!node_state(node: pgdat->node_id, state: N_MEMORY))
2355	return `0`;
2356
2357	walk_zones_in_node(m, pgdat, assert_populated: true, nolock: false, print: unusable_show_print);
2358
2359	return `0`;
2360	}
2361
2362	static const struct seq_operations unusable_sops = {
2363	.start = frag_start,
2364	.next = frag_next,
2365	.stop = frag_stop,
2366	.show = unusable_show,
2367	};
2368
2369	DEFINE_SEQ_ATTRIBUTE(unusable);
2370
2371	static void extfrag_show_print(struct seq_file *m,
2372	pg_data_t pgdat, struct* zone *zone)
2373	{
2374	unsigned int order;
2375	int index;
2376
2377	/ Alloc on stack as interrupts are disabled for zone walk /
2378	struct contig_page_info info;
2379
2380	seq_printf(m, fmt: "Node %d, zone %8s ",
2381	pgdat->node_id,
2382	zone->name);
2383	for (order = `0`; order < NR_PAGE_ORDERS; ++order) {
2384	fill_contig_page_info(zone, suitable_order: order, info: &info);
2385	index = __fragmentation_index(order, info: &info);
2386	seq_printf(m, fmt: "%2d.%03d ", index / `1000`, index % `1000`);
2387	}
2388
2389	seq_putc(m, c: `'\n'`);
2390	}
2391
2392	/*
2393	* Display fragmentation index for orders that allocations would fail for
2394	*/
2395	static int extfrag_show(struct seq_file m, void* *arg)
2396	{
2397	pg_data_t pgdat = (pg_data_t )arg;
2398
2399	walk_zones_in_node(m, pgdat, assert_populated: true, nolock: false, print: extfrag_show_print);
2400
2401	return `0`;
2402	}
2403
2404	static const struct seq_operations extfrag_sops = {
2405	.start = frag_start,
2406	.next = frag_next,
2407	.stop = frag_stop,
2408	.show = extfrag_show,
2409	};
2410
2411	DEFINE_SEQ_ATTRIBUTE(extfrag);
2412
2413	static int __init extfrag_debug_init(void)
2414	{
2415	struct dentry *extfrag_debug_root;
2416
2417	extfrag_debug_root = debugfs_create_dir(name: "extfrag", NULL);
2418
2419	debugfs_create_file("unusable_index", `0444`, extfrag_debug_root, NULL,
2420	&unusable_fops);
2421
2422	debugfs_create_file("extfrag_index", `0444`, extfrag_debug_root, NULL,
2423	&extfrag_fops);
2424
2425	return `0`;
2426	}
2427
2428	module_init(extfrag_debug_init);
2429
2430	#endif
2431

Browse the source code of Linux/mm/vmstat.c