vmscan.c source code [Linux/mm/vmscan.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
4	*
5	* Swap reorganised 29.12.95, Stephen Tweedie.
6	* kswapd added: 7.1.96 sct
7	* Removed kswapd_ctl limits, and swap out as many pages as needed
8	* to bring the system back to freepages.high: 2.4.97, Rik van Riel.
9	* Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
10	* Multiqueue VM started 5.8.00, Rik van Riel.
11	*/
12
13	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
15	#include <linux/mm.h>
16	#include <linux/sched/mm.h>
17	#include <linux/module.h>
18	#include <linux/gfp.h>
19	#include <linux/kernel_stat.h>
20	#include <linux/swap.h>
21	#include <linux/pagemap.h>
22	#include <linux/init.h>
23	#include <linux/highmem.h>
24	#include <linux/vmpressure.h>
25	#include <linux/vmstat.h>
26	#include <linux/file.h>
27	#include <linux/writeback.h>
28	#include <linux/blkdev.h>
29	#include <linux/buffer_head.h> /* for buffer_heads_over_limit */
30	#include <linux/mm_inline.h>
31	#include <linux/backing-dev.h>
32	#include <linux/rmap.h>
33	#include <linux/topology.h>
34	#include <linux/cpu.h>
35	#include <linux/cpuset.h>
36	#include <linux/compaction.h>
37	#include <linux/notifier.h>
38	#include <linux/delay.h>
39	#include <linux/kthread.h>
40	#include <linux/freezer.h>
41	#include <linux/memcontrol.h>
42	#include <linux/migrate.h>
43	#include <linux/delayacct.h>
44	#include <linux/sysctl.h>
45	#include <linux/memory-tiers.h>
46	#include <linux/oom.h>
47	#include <linux/pagevec.h>
48	#include <linux/prefetch.h>
49	#include <linux/printk.h>
50	#include <linux/dax.h>
51	#include <linux/psi.h>
52	#include <linux/pagewalk.h>
53	#include <linux/shmem_fs.h>
54	#include <linux/ctype.h>
55	#include <linux/debugfs.h>
56	#include <linux/khugepaged.h>
57	#include <linux/rculist_nulls.h>
58	#include <linux/random.h>
59	#include <linux/mmu_notifier.h>
60	#include <linux/parser.h>
61
62	#include <asm/tlbflush.h>
63	#include <asm/div64.h>
64
65	#include <linux/swapops.h>
66	#include <linux/balloon_compaction.h>
67	#include <linux/sched/sysctl.h>
68
69	#include "internal.h"
70	#include "swap.h"
71
72	#define CREATE_TRACE_POINTS
73	#include <trace/events/vmscan.h>
74
75	struct scan_control {
76	/ How many pages shrink_list() should reclaim /
77	unsigned long nr_to_reclaim;
78
79	/*
80	* Nodemask of nodes allowed by the caller. If NULL, all nodes
81	* are scanned.
82	*/
83	nodemask_t *nodemask;
84
85	/*
86	* The memory cgroup that hit its limit and as a result is the
87	* primary target of this reclaim invocation.
88	*/
89	struct mem_cgroup *target_mem_cgroup;
90
91	/*
92	* Scan pressure balancing between anon and file LRUs
93	*/
94	unsigned long anon_cost;
95	unsigned long file_cost;
96
97	/ Swappiness value for proactive reclaim. Always use sc_swappiness()! /
98	int *proactive_swappiness;
99
100	/ Can active folios be deactivated as part of reclaim? /
101	#define DEACTIVATE_ANON 1
102	#define DEACTIVATE_FILE 2
103	unsigned int may_deactivate:`2`;
104	unsigned int force_deactivate:`1`;
105	unsigned int skipped_deactivate:`1`;
106
107	/ Writepage batching in laptop mode; RECLAIM_WRITE /
108	unsigned int may_writepage:`1`;
109
110	/ Can mapped folios be reclaimed? /
111	unsigned int may_unmap:`1`;
112
113	/ Can folios be swapped as part of reclaim? /
114	unsigned int may_swap:`1`;
115
116	/ Not allow cache_trim_mode to be turned on as part of reclaim? /
117	unsigned int no_cache_trim_mode:`1`;
118
119	/ Has cache_trim_mode failed at least once? /
120	unsigned int cache_trim_mode_failed:`1`;
121
122	/ Proactive reclaim invoked by userspace /
123	unsigned int proactive:`1`;
124
125	/*
126	* Cgroup memory below memory.low is protected as long as we
127	* don't threaten to OOM. If any cgroup is reclaimed at
128	* reduced force or passed over entirely due to its memory.low
129	* setting (memcg_low_skipped), and nothing is reclaimed as a
130	* result, then go back for one more cycle that reclaims the protected
131	* memory (memcg_low_reclaim) to avert OOM.
132	*/
133	unsigned int memcg_low_reclaim:`1`;
134	unsigned int memcg_low_skipped:`1`;
135
136	/ Shared cgroup tree walk failed, rescan the whole tree /
137	unsigned int memcg_full_walk:`1`;
138
139	unsigned int hibernation_mode:`1`;
140
141	/ One of the zones is ready for compaction /
142	unsigned int compaction_ready:`1`;
143
144	/ There is easily reclaimable cold cache in the current node /
145	unsigned int cache_trim_mode:`1`;
146
147	/ The file folios on the current node are dangerously low /
148	unsigned int file_is_tiny:`1`;
149
150	/ Always discard instead of demoting to lower tier memory /
151	unsigned int no_demotion:`1`;
152
153	/ Allocation order /
154	s8 order;
155
156	/ Scan (total_size >> priority) pages at once /
157	s8 priority;
158
159	/ The highest zone to isolate folios for reclaim from /
160	s8 reclaim_idx;
161
162	/ This context's GFP mask /
163	gfp_t gfp_mask;
164
165	/ Incremented by the number of inactive pages that were scanned /
166	unsigned long nr_scanned;
167
168	/ Number of pages freed so far during a call to shrink_zones() /
169	unsigned long nr_reclaimed;
170
171	struct {
172	unsigned int dirty;
173	unsigned int unqueued_dirty;
174	unsigned int congested;
175	unsigned int writeback;
176	unsigned int immediate;
177	unsigned int file_taken;
178	unsigned int taken;
179	} nr;
180
181	/ for recording the reclaimed slab by now /
182	struct reclaim_state reclaim_state;
183	};
184
/*
 * prefetchw_prev_lru_folio - prefetch-for-write a field of the neighbouring
 * folio on an LRU list.
 *
 * When @_folio's lru.prev is not @_base, the folio returned by
 * lru_to_folio(&@_folio->lru) is looked up and @_field of it is passed to
 * prefetchw(). Expands to nothing when ARCH_HAS_PREFETCHW is not defined.
 */
#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_folio(_folio, _base, _field)			\
	do {								\
		if ((_folio)->lru.prev != (_base)) {			\
			struct folio *prev;				\
									\
			prev = lru_to_folio(&(_folio)->lru);		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0)
#endif
198
/*
 * From 0 .. MAX_SWAPPINESS.  Higher means more swappy.
 */
int vm_swappiness = 60;
203
204	#ifdef CONFIG_MEMCG
205
206	/ Returns true for reclaim through cgroup limits or cgroup interfaces. /
207	static bool cgroup_reclaim(struct scan_control *sc)
208	{
209	return sc->target_mem_cgroup;
210	}
211
212	/*
213	* Returns true for reclaim on the root cgroup. This is true for direct
214	* allocator reclaim and reclaim through cgroup interfaces on the root cgroup.
215	*/
216	static bool root_reclaim(struct scan_control *sc)
217	{
218	return !sc->target_mem_cgroup \|\| mem_cgroup_is_root(sc->target_mem_cgroup);
219	}
220
221	/**
222	* writeback_throttling_sane - is the usual dirty throttling mechanism available?
223	* @sc: scan_control in question
224	*
225	* The normal page dirty throttling mechanism in balance_dirty_pages() is
226	* completely broken with the legacy memcg and direct stalling in
227	* shrink_folio_list() is used for throttling instead, which lacks all the
228	* niceties such as fairness, adaptive pausing, bandwidth proportional
229	* allocation and configurability.
230	*
231	* This function tests whether the vmscan currently in progress can assume
232	* that the normal dirty throttling mechanism is operational.
233	*/
234	static bool writeback_throttling_sane(struct scan_control *sc)
235	{
236	if (!cgroup_reclaim(sc))
237	return true;
238	#ifdef CONFIG_CGROUP_WRITEBACK
239	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
240	return true;
241	#endif
242	return false;
243	}
244
245	static int sc_swappiness(struct scan_control sc, struct* mem_cgroup *memcg)
246	{
247	if (sc->proactive && sc->proactive_swappiness)
248	return *sc->proactive_swappiness;
249	return mem_cgroup_swappiness(memcg);
250	}
251	#else
252	static bool cgroup_reclaim(struct scan_control *sc)
253	{
254	return false;
255	}
256
257	static bool root_reclaim(struct scan_control *sc)
258	{
259	return true;
260	}
261
262	static bool writeback_throttling_sane(struct scan_control *sc)
263	{
264	return true;
265	}
266
267	static int sc_swappiness(struct scan_control sc, struct* mem_cgroup *memcg)
268	{
269	return READ_ONCE(vm_swappiness);
270	}
271	#endif
272
/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to
 * and including the specified highidx
 * @zone: The current zone in the iterator
 * @pgdat: The pgdat which node_zones are being iterated
 * @idx: The index variable
 * @highidx: The index of the highest zone to return
 *
 * This macro iterates through all managed zones up to and including the specified highidx.
 * The zone iterator enters an invalid state after macro call and must be reinitialized
 * before it can be used again.
 */
#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx)	\
	for ((idx) = 0, (zone) = (pgdat)->node_zones;		\
	    (idx) <= (highidx);					\
	    (idx)++, (zone)++)					\
		if (!managed_zone(zone))			\
			continue;				\
		else
291
292	static void set_task_reclaim_state(struct task_struct *task,
293	struct reclaim_state *rs)
294	{
295	/ Check for an overwrite /
296	WARN_ON_ONCE(rs && task->reclaim_state);
297
298	/ Check for the nulling of an already-nulled member /
299	WARN_ON_ONCE(!rs && !task->reclaim_state);
300
301	task->reclaim_state = rs;
302	}
303
304	/*
305	* flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
306	* scan_control->nr_reclaimed.
307	*/
308	static void flush_reclaim_state(struct scan_control *sc)
309	{
310	/*
311	* Currently, reclaim_state->reclaimed includes three types of pages
312	* freed outside of vmscan:
313	* (1) Slab pages.
314	* (2) Clean file pages from pruned inodes (on highmem systems).
315	* (3) XFS freed buffer pages.
316	*
317	* For all of these cases, we cannot universally link the pages to a
318	* single memcg. For example, a memcg-aware shrinker can free one object
319	* charged to the target memcg, causing an entire page to be freed.
320	* If we count the entire page as reclaimed from the memcg, we end up
321	* overestimating the reclaimed amount (potentially under-reclaiming).
322	*
323	* Only count such pages for global reclaim to prevent under-reclaiming
324	* from the target memcg; preventing unnecessary retries during memcg
325	* charging and false positives from proactive reclaim.
326	*
327	* For uncommon cases where the freed pages were actually mostly
328	* charged to the target memcg, we end up underestimating the reclaimed
329	* amount. This should be fine. The freed pages will be uncharged
330	* anyway, even if they are not counted here properly, and we will be
331	* able to make forward progress in charging (which is usually in a
332	* retry loop).
333	*
334	* We can go one step further, and report the uncharged objcg pages in
335	* memcg reclaim, to make reporting more accurate and reduce
336	* underestimation, but it's probably not worth the complexity for now.
337	*/
338	if (current->reclaim_state && root_reclaim(sc)) {
339	sc->nr_reclaimed += current->reclaim_state->reclaimed;
340	current->reclaim_state->reclaimed = `0`;
341	}
342	}
343
344	static bool can_demote(int nid, struct scan_control *sc,
345	struct mem_cgroup *memcg)
346	{
347	int demotion_nid;
348
349	if (!numa_demotion_enabled)
350	return false;
351	if (sc && sc->no_demotion)
352	return false;
353
354	demotion_nid = next_demotion_node(node: nid);
355	if (demotion_nid == NUMA_NO_NODE)
356	return false;
357
358	/ If demotion node isn't in the cgroup's mems_allowed, fall back /
359	return mem_cgroup_node_allowed(memcg, nid: demotion_nid);
360	}
361
362	static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
363	int nid,
364	struct scan_control *sc)
365	{
366	if (memcg == NULL) {
367	/*
368	* For non-memcg reclaim, is there
369	* space in any swap device?
370	*/
371	if (get_nr_swap_pages() > `0`)
372	return true;
373	} else {
374	/ Is the memcg below its swap limit? /
375	if (mem_cgroup_get_nr_swap_pages(memcg) > `0`)
376	return true;
377	}
378
379	/*
380	* The page can not be swapped.
381	*
382	* Can it be reclaimed from this node via demotion?
383	*/
384	return can_demote(nid, sc, memcg);
385	}
386
387	/*
388	* This misses isolated folios which are not accounted for to save counters.
389	* As the data only determines if reclaim or compaction continues, it is
390	* not expected that isolated folios will be a dominating factor.
391	*/
392	unsigned long zone_reclaimable_pages(struct zone *zone)
393	{
394	unsigned long nr;
395
396	nr = zone_page_state_snapshot(zone, item: NR_ZONE_INACTIVE_FILE) +
397	zone_page_state_snapshot(zone, item: NR_ZONE_ACTIVE_FILE);
398	if (can_reclaim_anon_pages(NULL, nid: zone_to_nid(zone), NULL))
399	nr += zone_page_state_snapshot(zone, item: NR_ZONE_INACTIVE_ANON) +
400	zone_page_state_snapshot(zone, item: NR_ZONE_ACTIVE_ANON);
401
402	return nr;
403	}
404
405	/**
406	* lruvec_lru_size - Returns the number of pages on the given LRU list.
407	* @lruvec: lru vector
408	* @lru: lru to use
409	* @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list)
410	*/
411	static unsigned long lruvec_lru_size(struct lruvec lruvec, enum* lru_list lru,
412	int zone_idx)
413	{
414	unsigned long size = `0`;
415	int zid;
416	struct zone *zone;
417
418	for_each_managed_zone_pgdat(zone, lruvec_pgdat(lruvec), zid, zone_idx) {
419	if (!mem_cgroup_disabled())
420	size += mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx: zid);
421	else
422	size += zone_page_state(zone, item: NR_ZONE_LRU_BASE + lru);
423	}
424	return size;
425	}
426
427	static unsigned long drop_slab_node(int nid)
428	{
429	unsigned long freed = `0`;
430	struct mem_cgroup *memcg = NULL;
431
432	memcg = mem_cgroup_iter(NULL, NULL, NULL);
433	do {
434	freed += shrink_slab(GFP_KERNEL, nid, memcg, priority: `0`);
435	} while ((memcg = mem_cgroup_iter(NULL, prev: memcg, NULL)) != NULL);
436
437	return freed;
438	}
439
440	void drop_slab(void)
441	{
442	int nid;
443	int shift = `0`;
444	unsigned long freed;
445
446	do {
447	freed = `0`;
448	for_each_online_node(nid) {
449	if (fatal_signal_pending(current))
450	return;
451
452	freed += drop_slab_node(nid);
453	}
454	} while ((freed >> shift++) > `1`);
455	}
456
/*
 * Build-time check that, for reclaimer @type, the PGSTEAL_, PGDEMOTE_ and
 * PGSCAN_ counters all sit at the same offset from their _KSWAPD entry.
 */
#define CHECK_RECLAIMER_OFFSET(type)					\
	do {								\
		BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD !=		\
			     PGDEMOTE_##type - PGDEMOTE_KSWAPD);	\
		BUILD_BUG_ON(PGSTEAL_##type - PGSTEAL_KSWAPD !=		\
			     PGSCAN_##type - PGSCAN_KSWAPD);		\
	} while (0)
464
465	static int reclaimer_offset(struct scan_control *sc)
466	{
467	CHECK_RECLAIMER_OFFSET(DIRECT);
468	CHECK_RECLAIMER_OFFSET(KHUGEPAGED);
469	CHECK_RECLAIMER_OFFSET(PROACTIVE);
470
471	if (current_is_kswapd())
472	return `0`;
473	if (current_is_khugepaged())
474	return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
475	if (sc->proactive)
476	return PGSTEAL_PROACTIVE - PGSTEAL_KSWAPD;
477	return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
478	}
479
/*
 * Non-zero when @folio's reference count, less one if it has private data,
 * equals exactly 1 + folio_nr_pages(folio).
 */
static inline int is_page_cache_freeable(struct folio *folio)
{
	/*
	 * A freeable page cache folio is referenced only by the caller
	 * that isolated the folio, the page cache and optional filesystem
	 * private data at folio->private.
	 */
	return folio_ref_count(folio) - folio_test_private(folio) ==
		1 + folio_nr_pages(folio);
}
490
/*
 * We detected a synchronous write error writing a folio out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the folio and once
 * that folio is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping folio_lock() here because we know the caller has
 * __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
				struct folio *folio, int error)
{
	folio_lock(folio);
	/* Only record the error if the folio still belongs to @mapping. */
	if (folio_mapping(folio) == mapping)
		mapping_set_error(mapping, error);
	folio_unlock(folio);
}
511
512	static bool skip_throttle_noprogress(pg_data_t *pgdat)
513	{
514	int reclaimable = `0`, write_pending = `0`;
515	int i;
516	struct zone *zone;
517	/*
518	* If kswapd is disabled, reschedule if necessary but do not
519	* throttle as the system is likely near OOM.
520	*/
521	if (atomic_read(v: &pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
522	return true;
523
524	/*
525	* If there are a lot of dirty/writeback folios then do not
526	* throttle as throttling will occur when the folios cycle
527	* towards the end of the LRU if still under writeback.
528	*/
529	for_each_managed_zone_pgdat(zone, pgdat, i, MAX_NR_ZONES - `1`) {
530	reclaimable += zone_reclaimable_pages(zone);
531	write_pending += zone_page_state_snapshot(zone,
532	item: NR_ZONE_WRITE_PENDING);
533	}
534	if (`2` * write_pending <= reclaimable)
535	return true;
536
537	return false;
538	}
539
540	void reclaim_throttle(pg_data_t pgdat, enum* vmscan_throttle_state reason)
541	{
542	wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
543	long timeout, ret;
544	DEFINE_WAIT(wait);
545
546	/*
547	* Do not throttle user workers, kthreads other than kswapd or
548	* workqueues. They may be required for reclaim to make
549	* forward progress (e.g. journalling workqueues or kthreads).
550	*/
551	if (!current_is_kswapd() &&
552	current->flags & (PF_USER_WORKER\|PF_KTHREAD)) {
553	cond_resched();
554	return;
555	}
556
557	/*
558	* These figures are pulled out of thin air.
559	* VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many
560	* parallel reclaimers which is a short-lived event so the timeout is
561	* short. Failing to make progress or waiting on writeback are
562	* potentially long-lived events so use a longer timeout. This is shaky
563	* logic as a failure to make progress could be due to anything from
564	* writeback to a slow device to excessive referenced folios at the tail
565	* of the inactive LRU.
566	*/
567	switch(reason) {
568	case VMSCAN_THROTTLE_WRITEBACK:
569	timeout = HZ/`10`;
570
571	if (atomic_inc_return(v: &pgdat->nr_writeback_throttled) == `1`) {
572	WRITE_ONCE(pgdat->nr_reclaim_start,
573	node_page_state(pgdat, NR_THROTTLED_WRITTEN));
574	}
575
576	break;
577	case VMSCAN_THROTTLE_CONGESTED:
578	fallthrough;
579	case VMSCAN_THROTTLE_NOPROGRESS:
580	if (skip_throttle_noprogress(pgdat)) {
581	cond_resched();
582	return;
583	}
584
585	timeout = `1`;
586
587	break;
588	case VMSCAN_THROTTLE_ISOLATED:
589	timeout = HZ/`50`;
590	break;
591	default:
592	WARN_ON_ONCE(`1`);
593	timeout = HZ;
594	break;
595	}
596
597	prepare_to_wait(wq_head: wqh, wq_entry: &wait, TASK_UNINTERRUPTIBLE);
598	ret = schedule_timeout(timeout);
599	finish_wait(wq_head: wqh, wq_entry: &wait);
600
601	if (reason == VMSCAN_THROTTLE_WRITEBACK)
602	atomic_dec(v: &pgdat->nr_writeback_throttled);
603
604	trace_mm_vmscan_throttled(nid: pgdat->node_id, usec_timeout: jiffies_to_usecs(j: timeout),
605	usec_delayed: jiffies_to_usecs(j: timeout - ret),
606	reason);
607	}
608
609	/*
610	* Account for folios written if tasks are throttled waiting on dirty
611	* folios to clean. If enough folios have been cleaned since throttling
612	* started then wakeup the throttled tasks.
613	*/
614	void __acct_reclaim_writeback(pg_data_t pgdat, struct* folio *folio,
615	int nr_throttled)
616	{
617	unsigned long nr_written;
618
619	node_stat_add_folio(folio, item: NR_THROTTLED_WRITTEN);
620
621	/*
622	* This is an inaccurate read as the per-cpu deltas may not
623	* be synchronised. However, given that the system is
624	* writeback throttled, it is not worth taking the penalty
625	* of getting an accurate count. At worst, the throttle
626	* timeout guarantees forward progress.
627	*/
628	nr_written = node_page_state(pgdat, item: NR_THROTTLED_WRITTEN) -
629	READ_ONCE(pgdat->nr_reclaim_start);
630
631	if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
632	wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
633	}
634
/* possible outcome of pageout() */
typedef enum {
	/* failed to write folio out, folio is locked */
	PAGE_KEEP,
	/* move folio to the active list, folio is locked */
	PAGE_ACTIVATE,
	/* folio has been sent to the disk successfully, folio is unlocked */
	PAGE_SUCCESS,
	/* folio is clean and locked */
	PAGE_CLEAN,
} pageout_t;
646
647	static pageout_t writeout(struct folio folio, struct* address_space *mapping,
648	struct swap_iocb plug, struct** list_head *folio_list)
649	{
650	int res;
651
652	folio_set_reclaim(folio);
653
654	/*
655	* The large shmem folio can be split if CONFIG_THP_SWAP is not enabled
656	* or we failed to allocate contiguous swap entries, in which case
657	* the split out folios get added back to folio_list.
658	*/
659	if (shmem_mapping(mapping))
660	res = shmem_writeout(folio, plug, folio_list);
661	else
662	res = swap_writeout(folio, swap_plug: plug);
663
664	if (res < `0`)
665	handle_write_error(mapping, folio, error: res);
666	if (res == AOP_WRITEPAGE_ACTIVATE) {
667	folio_clear_reclaim(folio);
668	return PAGE_ACTIVATE;
669	}
670
671	/ synchronous write? /
672	if (!folio_test_writeback(folio))
673	folio_clear_reclaim(folio);
674
675	trace_mm_vmscan_write_folio(folio);
676	node_stat_add_folio(folio, item: NR_VMSCAN_WRITE);
677	return PAGE_SUCCESS;
678	}
679
680	/*
681	* pageout is called by shrink_folio_list() for each dirty folio.
682	*/
683	static pageout_t pageout(struct folio folio, struct* address_space *mapping,
684	struct swap_iocb plug, struct** list_head *folio_list)
685	{
686	/*
687	* We no longer attempt to writeback filesystem folios here, other
688	* than tmpfs/shmem. That's taken care of in page-writeback.
689	* If we find a dirty filesystem folio at the end of the LRU list,
690	* typically that means the filesystem is saturating the storage
691	* with contiguous writes and telling it to write a folio here
692	* would only make the situation worse by injecting an element
693	* of random access.
694	*
695	* If the folio is swapcache, write it back even if that would
696	* block, for some throttling. This happens by accident, because
697	* swap_backing_dev_info is bust: it doesn't reflect the
698	* congestion state of the swapdevs. Easy to fix, if needed.
699	*/
700	if (!is_page_cache_freeable(folio))
701	return PAGE_KEEP;
702	if (!mapping) {
703	/*
704	* Some data journaling orphaned folios can have
705	* folio->mapping == NULL while being dirty with clean buffers.
706	*/
707	if (folio_test_private(folio)) {
708	if (try_to_free_buffers(folio)) {
709	folio_clear_dirty(folio);
710	pr_info("%s: orphaned folio\n", __func__);
711	return PAGE_CLEAN;
712	}
713	}
714	return PAGE_KEEP;
715	}
716
717	if (!shmem_mapping(mapping) && !folio_test_anon(folio))
718	return PAGE_ACTIVATE;
719	if (!folio_clear_dirty_for_io(folio))
720	return PAGE_CLEAN;
721	return writeout(folio, mapping, plug, folio_list);
722	}
723
724	/*
725	* Same as remove_mapping, but if the folio is removed from the mapping, it
726	* gets returned with a refcount of 0.
727	*/
728	static int __remove_mapping(struct address_space mapping, struct* folio *folio,
729	bool reclaimed, struct mem_cgroup *target_memcg)
730	{
731	int refcount;
732	void *shadow = NULL;
733	struct swap_cluster_info *ci;
734
735	BUG_ON(!folio_test_locked(folio));
736	BUG_ON(mapping != folio_mapping(folio));
737
738	if (folio_test_swapcache(folio)) {
739	ci = swap_cluster_get_and_lock_irq(folio);
740	} else {
741	spin_lock(lock: &mapping->host->i_lock);
742	xa_lock_irq(&mapping->i_pages);
743	}
744
745	/*
746	* The non racy check for a busy folio.
747	*
748	* Must be careful with the order of the tests. When someone has
749	* a ref to the folio, it may be possible that they dirty it then
750	* drop the reference. So if the dirty flag is tested before the
751	* refcount here, then the following race may occur:
752	*
753	* get_user_pages(&page);
754	* [user mapping goes away]
755	* write_to(page);
756	* !folio_test_dirty(folio) [good]
757	* folio_set_dirty(folio);
758	* folio_put(folio);
759	* !refcount(folio) [good, discard it]
760	*
761	* [oops, our write_to data is lost]
762	*
763	* Reversing the order of the tests ensures such a situation cannot
764	* escape unnoticed. The smp_rmb is needed to ensure the folio->flags
765	* load is not satisfied before that of folio->_refcount.
766	*
767	* Note that if the dirty flag is always set via folio_mark_dirty,
768	* and thus under the i_pages lock, then this ordering is not required.
769	*/
770	refcount = `1` + folio_nr_pages(folio);
771	if (!folio_ref_freeze(folio, count: refcount))
772	goto cannot_free;
773	/ note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb /
774	if (unlikely(folio_test_dirty(folio))) {
775	folio_ref_unfreeze(folio, count: refcount);
776	goto cannot_free;
777	}
778
779	if (folio_test_swapcache(folio)) {
780	swp_entry_t swap = folio->swap;
781
782	if (reclaimed && !mapping_exiting(mapping))
783	shadow = workingset_eviction(folio, target_memcg);
784	__swap_cache_del_folio(ci, folio, entry: swap, shadow);
785	memcg1_swapout(folio, entry: swap);
786	swap_cluster_unlock_irq(ci);
787	put_swap_folio(folio, entry: swap);
788	} else {
789	void (free_folio)(struct* folio *);
790
791	free_folio = mapping->a_ops->free_folio;
792	/*
793	* Remember a shadow entry for reclaimed file cache in
794	* order to detect refaults, thus thrashing, later on.
795	*
796	* But don't store shadows in an address space that is
797	* already exiting. This is not just an optimization,
798	* inode reclaim needs to empty out the radix tree or
799	* the nodes are lost. Don't plant shadows behind its
800	* back.
801	*
802	* We also don't store shadows for DAX mappings because the
803	* only page cache folios found in these are zero pages
804	* covering holes, and because we don't want to mix DAX
805	* exceptional entries and shadow exceptional entries in the
806	* same address_space.
807	*/
808	if (reclaimed && folio_is_file_lru(folio) &&
809	!mapping_exiting(mapping) && !dax_mapping(mapping))
810	shadow = workingset_eviction(folio, target_memcg);
811	__filemap_remove_folio(folio, shadow);
812	xa_unlock_irq(&mapping->i_pages);
813	if (mapping_shrinkable(mapping))
814	inode_add_lru(inode: mapping->host);
815	spin_unlock(lock: &mapping->host->i_lock);
816
817	if (free_folio)
818	free_folio(folio);
819	}
820
821	return `1`;
822
823	cannot_free:
824	if (folio_test_swapcache(folio)) {
825	swap_cluster_unlock_irq(ci);
826	} else {
827	xa_unlock_irq(&mapping->i_pages);
828	spin_unlock(lock: &mapping->host->i_lock);
829	}
830	return `0`;
831	}
832
833	/**
834	* remove_mapping() - Attempt to remove a folio from its mapping.
835	* @mapping: The address space.
836	* @folio: The folio to remove.
837	*
838	* If the folio is dirty, under writeback or if someone else has a ref
839	* on it, removal will fail.
840	* Return: The number of pages removed from the mapping. 0 if the folio
841	* could not be removed.
842	* Context: The caller should have a single refcount on the folio and
843	* hold its lock.
844	*/
845	long remove_mapping(struct address_space mapping, struct* folio *folio)
846	{
847	if (__remove_mapping(mapping, folio, reclaimed: false, NULL)) {
848	/*
849	* Unfreezing the refcount with 1 effectively
850	* drops the pagecache ref for us without requiring another
851	* atomic operation.
852	*/
853	folio_ref_unfreeze(folio, count: `1`);
854	return folio_nr_pages(folio);
855	}
856	return `0`;
857	}
858
/**
 * folio_putback_lru - Put previously isolated folio onto appropriate LRU list.
 * @folio: Folio to be returned to an LRU list.
 *
 * Add previously isolated @folio to appropriate LRU list.
 * The folio may still be unevictable for other reasons.
 *
 * Context: lru_lock must not be held, interrupts must be enabled.
 */
void folio_putback_lru(struct folio *folio)
{
	folio_add_lru(folio);
	folio_put(folio);		/* drop ref from isolate */
}
873
/* Verdicts returned by folio_check_references(). */
enum folio_references {
	FOLIOREF_RECLAIM,
	FOLIOREF_RECLAIM_CLEAN,
	FOLIOREF_KEEP,
	FOLIOREF_ACTIVATE,
};
880
881	#ifdef CONFIG_LRU_GEN
882	/*
883	* Only used on a mapped folio in the eviction (rmap walk) path, where promotion
884	* needs to be done by taking the folio off the LRU list and then adding it back
885	* with PG_active set. In contrast, the aging (page table walk) path uses
886	* folio_update_gen().
887	*/
888	static bool lru_gen_set_refs(struct folio *folio)
889	{
890	/ see the comment on LRU_REFS_FLAGS /
891	if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
892	set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced));
893	return false;
894	}
895
896	set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset));
897	return true;
898	}
899	#else
900	static bool lru_gen_set_refs(struct folio *folio)
901	{
902	return false;
903	}
904	#endif /* CONFIG_LRU_GEN */
905
906	static enum folio_references folio_check_references(struct folio *folio,
907	struct scan_control *sc)
908	{
909	int referenced_ptes, referenced_folio;
910	vm_flags_t vm_flags;
911
912	referenced_ptes = folio_referenced(folio, is_locked: `1`, memcg: sc->target_mem_cgroup,
913	vm_flags: &vm_flags);
914
915	/*
916	* The supposedly reclaimable folio was found to be in a VM_LOCKED vma.
917	* Let the folio, now marked Mlocked, be moved to the unevictable list.
918	*/
919	if (vm_flags & VM_LOCKED)
920	return FOLIOREF_ACTIVATE;
921
922	/*
923	* There are two cases to consider.
924	* 1) Rmap lock contention: rotate.
925	* 2) Skip the non-shared swapbacked folio mapped solely by
926	* the exiting or OOM-reaped process.
927	*/
928	if (referenced_ptes == -`1`)
929	return FOLIOREF_KEEP;
930
931	if (lru_gen_enabled()) {
932	if (!referenced_ptes)
933	return FOLIOREF_RECLAIM;
934
935	return lru_gen_set_refs(folio) ? FOLIOREF_ACTIVATE : FOLIOREF_KEEP;
936	}
937
938	referenced_folio = folio_test_clear_referenced(folio);
939
940	if (referenced_ptes) {
941	/*
942	* All mapped folios start out with page table
943	* references from the instantiating fault, so we need
944	* to look twice if a mapped file/anon folio is used more
945	* than once.
946	*
947	* Mark it and spare it for another trip around the
948	* inactive list. Another page table reference will
949	* lead to its activation.
950	*
951	* Note: the mark is set for activated folios as well
952	* so that recently deactivated but used folios are
953	* quickly recovered.
954	*/
955	folio_set_referenced(folio);
956
957	if (referenced_folio \|\| referenced_ptes > `1`)
958	return FOLIOREF_ACTIVATE;
959
960	/*
961	* Activate file-backed executable folios after first usage.
962	*/
963	if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio))
964	return FOLIOREF_ACTIVATE;
965
966	return FOLIOREF_KEEP;
967	}
968
969	/ Reclaim if clean, defer dirty folios to writeback /
970	if (referenced_folio && folio_is_file_lru(folio))
971	return FOLIOREF_RECLAIM_CLEAN;
972
973	return FOLIOREF_RECLAIM;
974	}
975
976	/ Check if a folio is dirty or under writeback /
977	static void folio_check_dirty_writeback(struct folio *folio,
978	bool dirty, bool writeback)
979	{
980	struct address_space *mapping;
981
982	/*
983	* Anonymous folios are not handled by flushers and must be written
984	* from reclaim context. Do not stall reclaim based on them.
985	* MADV_FREE anonymous folios are put into inactive file list too.
986	* They could be mistakenly treated as file lru. So further anon
987	* test is needed.
988	*/
989	if (!folio_is_file_lru(folio) \|\|
990	(folio_test_anon(folio) && !folio_test_swapbacked(folio))) {
991	*dirty = false;
992	*writeback = false;
993	return;
994	}
995
996	/ By default assume that the folio flags are accurate /
997	*dirty = folio_test_dirty(folio);
998	*writeback = folio_test_writeback(folio);
999
1000	/ Verify dirty/writeback state if the filesystem supports it /
1001	if (!folio_test_private(folio))
1002	return;
1003
1004	mapping = folio_mapping(folio);
1005	if (mapping && mapping->a_ops->is_dirty_writeback)
1006	mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
1007	}
1008
1009	static struct folio alloc_demote_folio(struct* folio *src,
1010	unsigned long private)
1011	{
1012	struct folio *dst;
1013	nodemask_t *allowed_mask;
1014	struct migration_target_control *mtc;
1015
1016	mtc = (struct migration_target_control *)private;
1017
1018	allowed_mask = mtc->nmask;
1019	/*
1020	* make sure we allocate from the target node first also trying to
1021	* demote or reclaim pages from the target node via kswapd if we are
1022	* low on free memory on target node. If we don't do this and if
1023	* we have free memory on the slower(lower) memtier, we would start
1024	* allocating pages from slower(lower) memory tiers without even forcing
1025	* a demotion of cold pages from the target memtier. This can result
1026	* in the kernel placing hot pages in slower(lower) memory tiers.
1027	*/
1028	mtc->nmask = NULL;
1029	mtc->gfp_mask \|= __GFP_THISNODE;
1030	dst = alloc_migration_target(src, private: (unsigned long)mtc);
1031	if (dst)
1032	return dst;
1033
1034	mtc->gfp_mask &= ~__GFP_THISNODE;
1035	mtc->nmask = allowed_mask;
1036
1037	return alloc_migration_target(src, private: (unsigned long)mtc);
1038	}
1039
1040	/*
1041	* Take folios on @demote_folios and attempt to demote them to another node.
1042	* Folios which are not demoted are left on @demote_folios.
1043	*/
1044	static unsigned int demote_folio_list(struct list_head *demote_folios,
1045	struct pglist_data *pgdat)
1046	{
1047	int target_nid = next_demotion_node(node: pgdat->node_id);
1048	unsigned int nr_succeeded;
1049	nodemask_t allowed_mask;
1050
1051	struct migration_target_control mtc = {
1052	/*
1053	* Allocate from 'node', or fail quickly and quietly.
1054	* When this happens, 'page' will likely just be discarded
1055	* instead of migrated.
1056	*/
1057	.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) \| __GFP_NOWARN \|
1058	__GFP_NOMEMALLOC \| GFP_NOWAIT,
1059	.nid = target_nid,
1060	.nmask = &allowed_mask,
1061	.reason = MR_DEMOTION,
1062	};
1063
1064	if (list_empty(head: demote_folios))
1065	return `0`;
1066
1067	if (target_nid == NUMA_NO_NODE)
1068	return `0`;
1069
1070	node_get_allowed_targets(pgdat, targets: &allowed_mask);
1071
1072	/ Demotion ignores all cpuset and mempolicy settings /
1073	migrate_pages(l: demote_folios, new: alloc_demote_folio, NULL,
1074	private: (unsigned long)&mtc, mode: MIGRATE_ASYNC, reason: MR_DEMOTION,
1075	ret_succeeded: &nr_succeeded);
1076
1077	return nr_succeeded;
1078	}
1079
1080	static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
1081	{
1082	if (gfp_mask & __GFP_FS)
1083	return true;
1084	if (!folio_test_swapcache(folio) \|\| !(gfp_mask & __GFP_IO))
1085	return false;
1086	/*
1087	* We can "enter_fs" for swap-cache with only __GFP_IO
1088	* providing this isn't SWP_FS_OPS.
1089	* ->flags can be updated non-atomicially (scan_swap_map_slots),
1090	* but that will never affect SWP_FS_OPS, so the data_race
1091	* is safe.
1092	*/
1093	return !data_race(folio_swap_flags(folio) & SWP_FS_OPS);
1094	}
1095
1096	/*
1097	* shrink_folio_list() returns the number of reclaimed pages
1098	*/
1099	static unsigned int shrink_folio_list(struct list_head *folio_list,
1100	struct pglist_data pgdat, struct* scan_control *sc,
1101	struct reclaim_stat *stat, bool ignore_references,
1102	struct mem_cgroup *memcg)
1103	{
1104	struct folio_batch free_folios;
1105	LIST_HEAD(ret_folios);
1106	LIST_HEAD(demote_folios);
1107	unsigned int nr_reclaimed = `0`, nr_demoted = `0`;
1108	unsigned int pgactivate = `0`;
1109	bool do_demote_pass;
1110	struct swap_iocb *plug = NULL;
1111
1112	folio_batch_init(fbatch: &free_folios);
1113	memset(s: stat, c: `0`, n: sizeof(*stat));
1114	cond_resched();
1115	do_demote_pass = can_demote(nid: pgdat->node_id, sc, memcg);
1116
1117	retry:
1118	while (!list_empty(head: folio_list)) {
1119	struct address_space *mapping;
1120	struct folio *folio;
1121	enum folio_references references = FOLIOREF_RECLAIM;
1122	bool dirty, writeback;
1123	unsigned int nr_pages;
1124
1125	cond_resched();
1126
1127	folio = lru_to_folio(head: folio_list);
1128	list_del(entry: &folio->lru);
1129
1130	if (!folio_trylock(folio))
1131	goto keep;
1132
1133	if (folio_contain_hwpoisoned_page(folio)) {
1134	/*
1135	* unmap_poisoned_folio() can't handle large
1136	* folio, just skip it. memory_failure() will
1137	* handle it if the UCE is triggered again.
1138	*/
1139	if (folio_test_large(folio))
1140	goto keep_locked;
1141
1142	unmap_poisoned_folio(folio, pfn: folio_pfn(folio), must_kill: false);
1143	folio_unlock(folio);
1144	folio_put(folio);
1145	continue;
1146	}
1147
1148	VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
1149
1150	nr_pages = folio_nr_pages(folio);
1151
1152	/ Account the number of base pages /
1153	sc->nr_scanned += nr_pages;
1154
1155	if (unlikely(!folio_evictable(folio)))
1156	goto activate_locked;
1157
1158	if (!sc->may_unmap && folio_mapped(folio))
1159	goto keep_locked;
1160
1161	/*
1162	* The number of dirty pages determines if a node is marked
1163	* reclaim_congested. kswapd will stall and start writing
1164	* folios if the tail of the LRU is all dirty unqueued folios.
1165	*/
1166	folio_check_dirty_writeback(folio, dirty: &dirty, writeback: &writeback);
1167	if (dirty \|\| writeback)
1168	stat->nr_dirty += nr_pages;
1169
1170	if (dirty && !writeback)
1171	stat->nr_unqueued_dirty += nr_pages;
1172
1173	/*
1174	* Treat this folio as congested if folios are cycling
1175	* through the LRU so quickly that the folios marked
1176	* for immediate reclaim are making it to the end of
1177	* the LRU a second time.
1178	*/
1179	if (writeback && folio_test_reclaim(folio))
1180	stat->nr_congested += nr_pages;
1181
1182	/*
1183	* If a folio at the tail of the LRU is under writeback, there
1184	* are three cases to consider.
1185	*
1186	* 1) If reclaim is encountering an excessive number
1187	* of folios under writeback and this folio has both
1188	* the writeback and reclaim flags set, then it
1189	* indicates that folios are being queued for I/O but
1190	* are being recycled through the LRU before the I/O
1191	* can complete. Waiting on the folio itself risks an
1192	* indefinite stall if it is impossible to writeback
1193	* the folio due to I/O error or disconnected storage
1194	* so instead note that the LRU is being scanned too
1195	* quickly and the caller can stall after the folio
1196	* list has been processed.
1197	*
1198	* 2) Global or new memcg reclaim encounters a folio that is
1199	* not marked for immediate reclaim, or the caller does not
1200	* have __GFP_FS (or __GFP_IO if it's simply going to swap,
1201	* not to fs), or the folio belongs to a mapping where
1202	* waiting on writeback during reclaim may lead to a deadlock.
1203	* In this case mark the folio for immediate reclaim and
1204	* continue scanning.
1205	*
1206	* Require may_enter_fs() because we would wait on fs, which
1207	* may not have submitted I/O yet. And the loop driver might
1208	* enter reclaim, and deadlock if it waits on a folio for
1209	* which it is needed to do the write (loop masks off
1210	* __GFP_IO\|__GFP_FS for this reason); but more thought
1211	* would probably show more reasons.
1212	*
1213	* 3) Legacy memcg encounters a folio that already has the
1214	* reclaim flag set. memcg does not have any dirty folio
1215	* throttling so we could easily OOM just because too many
1216	* folios are in writeback and there is nothing else to
1217	* reclaim. Wait for the writeback to complete.
1218	*
1219	* In cases 1) and 2) we activate the folios to get them out of
1220	* the way while we continue scanning for clean folios on the
1221	* inactive list and refilling from the active list. The
1222	* observation here is that waiting for disk writes is more
1223	* expensive than potentially causing reloads down the line.
1224	* Since they're marked for immediate reclaim, they won't put
1225	* memory pressure on the cache working set any longer than it
1226	* takes to write them to disk.
1227	*/
1228	if (folio_test_writeback(folio)) {
1229	mapping = folio_mapping(folio);
1230
1231	/ Case 1 above /
1232	if (current_is_kswapd() &&
1233	folio_test_reclaim(folio) &&
1234	test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1235	stat->nr_immediate += nr_pages;
1236	goto activate_locked;
1237
1238	/ Case 2 above /
1239	} else if (writeback_throttling_sane(sc) \|\|
1240	!folio_test_reclaim(folio) \|\|
1241	!may_enter_fs(folio, gfp_mask: sc->gfp_mask) \|\|
1242	(mapping &&
1243	mapping_writeback_may_deadlock_on_reclaim(mapping))) {
1244	/*
1245	* This is slightly racy -
1246	* folio_end_writeback() might have
1247	* just cleared the reclaim flag, then
1248	* setting the reclaim flag here ends up
1249	* interpreted as the readahead flag - but
1250	* that does not matter enough to care.
1251	* What we do want is for this folio to
1252	* have the reclaim flag set next time
1253	* memcg reclaim reaches the tests above,
1254	* so it will then wait for writeback to
1255	* avoid OOM; and it's also appropriate
1256	* in global reclaim.
1257	*/
1258	folio_set_reclaim(folio);
1259	stat->nr_writeback += nr_pages;
1260	goto activate_locked;
1261
1262	/ Case 3 above /
1263	} else {
1264	folio_unlock(folio);
1265	folio_wait_writeback(folio);
1266	/ then go back and try same folio again /
1267	list_add_tail(new: &folio->lru, head: folio_list);
1268	continue;
1269	}
1270	}
1271
1272	if (!ignore_references)
1273	references = folio_check_references(folio, sc);
1274
1275	switch (references) {
1276	case FOLIOREF_ACTIVATE:
1277	goto activate_locked;
1278	case FOLIOREF_KEEP:
1279	stat->nr_ref_keep += nr_pages;
1280	goto keep_locked;
1281	case FOLIOREF_RECLAIM:
1282	case FOLIOREF_RECLAIM_CLEAN:
1283	; / try to reclaim the folio below /
1284	}
1285
1286	/*
1287	* Before reclaiming the folio, try to relocate
1288	* its contents to another node.
1289	*/
1290	if (do_demote_pass &&
1291	(thp_migration_supported() \|\| !folio_test_large(folio))) {
1292	list_add(new: &folio->lru, head: &demote_folios);
1293	folio_unlock(folio);
1294	continue;
1295	}
1296
1297	/*
1298	* Anonymous process memory has backing store?
1299	* Try to allocate it some swap space here.
1300	* Lazyfree folio could be freed directly
1301	*/
1302	if (folio_test_anon(folio) && folio_test_swapbacked(folio)) {
1303	if (!folio_test_swapcache(folio)) {
1304	if (!(sc->gfp_mask & __GFP_IO))
1305	goto keep_locked;
1306	if (folio_maybe_dma_pinned(folio))
1307	goto keep_locked;
1308	if (folio_test_large(folio)) {
1309	/ cannot split folio, skip it /
1310	if (!can_split_folio(folio, caller_pins: `1`, NULL))
1311	goto activate_locked;
1312	/*
1313	* Split partially mapped folios right away.
1314	* We can free the unmapped pages without IO.
1315	*/
1316	if (data_race(!list_empty(&folio->_deferred_list) &&
1317	folio_test_partially_mapped(folio)) &&
1318	split_folio_to_list(folio, list: folio_list))
1319	goto activate_locked;
1320	}
1321	if (folio_alloc_swap(folio, __GFP_HIGH \| __GFP_NOWARN)) {
1322	int __maybe_unused order = folio_order(folio);
1323
1324	if (!folio_test_large(folio))
1325	goto activate_locked_split;
1326	/ Fallback to swap normal pages /
1327	if (split_folio_to_list(folio, list: folio_list))
1328	goto activate_locked;
1329	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1330	if (nr_pages >= HPAGE_PMD_NR) {
1331	count_memcg_folio_events(folio,
1332	THP_SWPOUT_FALLBACK, `1`);
1333	count_vm_event(THP_SWPOUT_FALLBACK);
1334	}
1335	#endif
1336	count_mthp_stat(order, item: MTHP_STAT_SWPOUT_FALLBACK);
1337	if (folio_alloc_swap(folio, __GFP_HIGH \| __GFP_NOWARN))
1338	goto activate_locked_split;
1339	}
1340	/*
1341	* Normally the folio will be dirtied in unmap because its
1342	* pte should be dirty. A special case is MADV_FREE page. The
1343	* page's pte could have dirty bit cleared but the folio's
1344	* SwapBacked flag is still set because clearing the dirty bit
1345	* and SwapBacked flag has no lock protected. For such folio,
1346	* unmap will not set dirty bit for it, so folio reclaim will
1347	* not write the folio out. This can cause data corruption when
1348	* the folio is swapped in later. Always setting the dirty flag
1349	* for the folio solves the problem.
1350	*/
1351	folio_mark_dirty(folio);
1352	}
1353	}
1354
1355	/*
1356	* If the folio was split above, the tail pages will make
1357	* their own pass through this function and be accounted
1358	* then.
1359	*/
1360	if ((nr_pages > `1`) && !folio_test_large(folio)) {
1361	sc->nr_scanned -= (nr_pages - `1`);
1362	nr_pages = `1`;
1363	}
1364
1365	/*
1366	* The folio is mapped into the page tables of one or more
1367	* processes. Try to unmap it here.
1368	*/
1369	if (folio_mapped(folio)) {
1370	enum ttu_flags flags = TTU_BATCH_FLUSH;
1371	bool was_swapbacked = folio_test_swapbacked(folio);
1372
1373	if (folio_test_pmd_mappable(folio))
1374	flags \|= TTU_SPLIT_HUGE_PMD;
1375	/*
1376	* Without TTU_SYNC, try_to_unmap will only begin to
1377	* hold PTL from the first present PTE within a large
1378	* folio. Some initial PTEs might be skipped due to
1379	* races with parallel PTE writes in which PTEs can be
1380	* cleared temporarily before being written new present
1381	* values. This will lead to a large folio is still
1382	* mapped while some subpages have been partially
1383	* unmapped after try_to_unmap; TTU_SYNC helps
1384	* try_to_unmap acquire PTL from the first PTE,
1385	* eliminating the influence of temporary PTE values.
1386	*/
1387	if (folio_test_large(folio))
1388	flags \|= TTU_SYNC;
1389
1390	try_to_unmap(folio, flags);
1391	if (folio_mapped(folio)) {
1392	stat->nr_unmap_fail += nr_pages;
1393	if (!was_swapbacked &&
1394	folio_test_swapbacked(folio))
1395	stat->nr_lazyfree_fail += nr_pages;
1396	goto activate_locked;
1397	}
1398	}
1399
1400	/*
1401	* Folio is unmapped now so it cannot be newly pinned anymore.
1402	* No point in trying to reclaim folio if it is pinned.
1403	* Furthermore we don't want to reclaim underlying fs metadata
1404	* if the folio is pinned and thus potentially modified by the
1405	* pinning process as that may upset the filesystem.
1406	*/
1407	if (folio_maybe_dma_pinned(folio))
1408	goto activate_locked;
1409
1410	mapping = folio_mapping(folio);
1411	if (folio_test_dirty(folio)) {
1412	/*
1413	* Only kswapd can writeback filesystem folios
1414	* to avoid risk of stack overflow. But avoid
1415	* injecting inefficient single-folio I/O into
1416	* flusher writeback as much as possible: only
1417	* write folios when we've encountered many
1418	* dirty folios, and when we've already scanned
1419	* the rest of the LRU for clean folios and see
1420	* the same dirty folios again (with the reclaim
1421	* flag set).
1422	*/
1423	if (folio_is_file_lru(folio) &&
1424	(!current_is_kswapd() \|\|
1425	!folio_test_reclaim(folio) \|\|
1426	!test_bit(PGDAT_DIRTY, &pgdat->flags))) {
1427	/*
1428	* Immediately reclaim when written back.
1429	* Similar in principle to folio_deactivate()
1430	* except we already have the folio isolated
1431	* and know it's dirty
1432	*/
1433	node_stat_mod_folio(folio, item: NR_VMSCAN_IMMEDIATE,
1434	nr: nr_pages);
1435	folio_set_reclaim(folio);
1436
1437	goto activate_locked;
1438	}
1439
1440	if (references == FOLIOREF_RECLAIM_CLEAN)
1441	goto keep_locked;
1442	if (!may_enter_fs(folio, gfp_mask: sc->gfp_mask))
1443	goto keep_locked;
1444	if (!sc->may_writepage)
1445	goto keep_locked;
1446
1447	/*
1448	* Folio is dirty. Flush the TLB if a writable entry
1449	* potentially exists to avoid CPU writes after I/O
1450	* starts and then write it out here.
1451	*/
1452	try_to_unmap_flush_dirty();
1453	switch (pageout(folio, mapping, plug: &plug, folio_list)) {
1454	case PAGE_KEEP:
1455	goto keep_locked;
1456	case PAGE_ACTIVATE:
1457	/*
1458	* If shmem folio is split when writeback to swap,
1459	* the tail pages will make their own pass through
1460	* this function and be accounted then.
1461	*/
1462	if (nr_pages > `1` && !folio_test_large(folio)) {
1463	sc->nr_scanned -= (nr_pages - `1`);
1464	nr_pages = `1`;
1465	}
1466	goto activate_locked;
1467	case PAGE_SUCCESS:
1468	if (nr_pages > `1` && !folio_test_large(folio)) {
1469	sc->nr_scanned -= (nr_pages - `1`);
1470	nr_pages = `1`;
1471	}
1472	stat->nr_pageout += nr_pages;
1473
1474	if (folio_test_writeback(folio))
1475	goto keep;
1476	if (folio_test_dirty(folio))
1477	goto keep;
1478
1479	/*
1480	* A synchronous write - probably a ramdisk. Go
1481	* ahead and try to reclaim the folio.
1482	*/
1483	if (!folio_trylock(folio))
1484	goto keep;
1485	if (folio_test_dirty(folio) \|\|
1486	folio_test_writeback(folio))
1487	goto keep_locked;
1488	mapping = folio_mapping(folio);
1489	fallthrough;
1490	case PAGE_CLEAN:
1491	; / try to free the folio below /
1492	}
1493	}
1494
1495	/*
1496	* If the folio has buffers, try to free the buffer
1497	* mappings associated with this folio. If we succeed
1498	* we try to free the folio as well.
1499	*
1500	* We do this even if the folio is dirty.
1501	* filemap_release_folio() does not perform I/O, but it
1502	* is possible for a folio to have the dirty flag set,
1503	* but it is actually clean (all its buffers are clean).
1504	* This happens if the buffers were written out directly,
1505	* with submit_bh(). ext3 will do this, as well as
1506	* the blockdev mapping. filemap_release_folio() will
1507	* discover that cleanness and will drop the buffers
1508	* and mark the folio clean - it can be freed.
1509	*
1510	* Rarely, folios can have buffers and no ->mapping.
1511	* These are the folios which were not successfully
1512	* invalidated in truncate_cleanup_folio(). We try to
1513	* drop those buffers here and if that worked, and the
1514	* folio is no longer mapped into process address space
1515	* (refcount == 1) it can be freed. Otherwise, leave
1516	* the folio on the LRU so it is swappable.
1517	*/
1518	if (folio_needs_release(folio)) {
1519	if (!filemap_release_folio(folio, gfp: sc->gfp_mask))
1520	goto activate_locked;
1521	if (!mapping && folio_ref_count(folio) == `1`) {
1522	folio_unlock(folio);
1523	if (folio_put_testzero(folio))
1524	goto free_it;
1525	else {
1526	/*
1527	* rare race with speculative reference.
1528	* the speculative reference will free
1529	* this folio shortly, so we may
1530	* increment nr_reclaimed here (and
1531	* leave it off the LRU).
1532	*/
1533	nr_reclaimed += nr_pages;
1534	continue;
1535	}
1536	}
1537	}
1538
1539	if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
1540	/ follow __remove_mapping for reference /
1541	if (!folio_ref_freeze(folio, count: `1`))
1542	goto keep_locked;
1543	/*
1544	* The folio has only one reference left, which is
1545	* from the isolation. After the caller puts the
1546	* folio back on the lru and drops the reference, the
1547	* folio will be freed anyway. It doesn't matter
1548	* which lru it goes on. So we don't bother checking
1549	* the dirty flag here.
1550	*/
1551	count_vm_events(item: PGLAZYFREED, delta: nr_pages);
1552	count_memcg_folio_events(folio, idx: PGLAZYFREED, nr: nr_pages);
1553	} else if (!mapping \|\| !__remove_mapping(mapping, folio, reclaimed: true,
1554	target_memcg: sc->target_mem_cgroup))
1555	goto keep_locked;
1556
1557	folio_unlock(folio);
1558	free_it:
1559	/*
1560	* Folio may get swapped out as a whole, need to account
1561	* all pages in it.
1562	*/
1563	nr_reclaimed += nr_pages;
1564
1565	folio_unqueue_deferred_split(folio);
1566	if (folio_batch_add(fbatch: &free_folios, folio) == `0`) {
1567	mem_cgroup_uncharge_folios(folios: &free_folios);
1568	try_to_unmap_flush();
1569	free_unref_folios(fbatch: &free_folios);
1570	}
1571	continue;
1572
1573	activate_locked_split:
1574	/*
1575	* The tail pages that are failed to add into swap cache
1576	* reach here. Fixup nr_scanned and nr_pages.
1577	*/
1578	if (nr_pages > `1`) {
1579	sc->nr_scanned -= (nr_pages - `1`);
1580	nr_pages = `1`;
1581	}
1582	activate_locked:
1583	/ Not a candidate for swapping, so reclaim swap space. /
1584	if (folio_test_swapcache(folio) &&
1585	(mem_cgroup_swap_full(folio) \|\| folio_test_mlocked(folio)))
1586	folio_free_swap(folio);
1587	VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
1588	if (!folio_test_mlocked(folio)) {
1589	int type = folio_is_file_lru(folio);
1590	folio_set_active(folio);
1591	stat->nr_activate[type] += nr_pages;
1592	count_memcg_folio_events(folio, idx: PGACTIVATE, nr: nr_pages);
1593	}
1594	keep_locked:
1595	folio_unlock(folio);
1596	keep:
1597	list_add(new: &folio->lru, head: &ret_folios);
1598	VM_BUG_ON_FOLIO(folio_test_lru(folio) \|\|
1599	folio_test_unevictable(folio), folio);
1600	}
1601	/ 'folio_list' is always empty here /
1602
1603	/ Migrate folios selected for demotion /
1604	nr_demoted = demote_folio_list(demote_folios: &demote_folios, pgdat);
1605	nr_reclaimed += nr_demoted;
1606	stat->nr_demoted += nr_demoted;
1607	/ Folios that could not be demoted are still in @demote_folios /
1608	if (!list_empty(head: &demote_folios)) {
1609	/ Folios which weren't demoted go back on @folio_list /
1610	list_splice_init(list: &demote_folios, head: folio_list);
1611
1612	/*
1613	* goto retry to reclaim the undemoted folios in folio_list if
1614	* desired.
1615	*
1616	* Reclaiming directly from top tier nodes is not often desired
1617	* due to it breaking the LRU ordering: in general memory
1618	* should be reclaimed from lower tier nodes and demoted from
1619	* top tier nodes.
1620	*
1621	* However, disabling reclaim from top tier nodes entirely
1622	* would cause ooms in edge scenarios where lower tier memory
1623	* is unreclaimable for whatever reason, eg memory being
1624	* mlocked or too hot to reclaim. We can disable reclaim
1625	* from top tier nodes in proactive reclaim though as that is
1626	* not real memory pressure.
1627	*/
1628	if (!sc->proactive) {
1629	do_demote_pass = false;
1630	goto retry;
1631	}
1632	}
1633
1634	pgactivate = stat->nr_activate[`0`] + stat->nr_activate[`1`];
1635
1636	mem_cgroup_uncharge_folios(folios: &free_folios);
1637	try_to_unmap_flush();
1638	free_unref_folios(fbatch: &free_folios);
1639
1640	list_splice(list: &ret_folios, head: folio_list);
1641	count_vm_events(item: PGACTIVATE, delta: pgactivate);
1642
1643	if (plug)
1644	swap_write_unplug(sio: plug);
1645	return nr_reclaimed;
1646	}
1647
1648	unsigned int reclaim_clean_pages_from_list(struct zone *zone,
1649	struct list_head *folio_list)
1650	{
1651	struct scan_control sc = {
1652	.gfp_mask = GFP_KERNEL,
1653	.may_unmap = `1`,
1654	};
1655	struct reclaim_stat stat;
1656	unsigned int nr_reclaimed;
1657	struct folio folio, next;
1658	LIST_HEAD(clean_folios);
1659	unsigned int noreclaim_flag;
1660
1661	list_for_each_entry_safe(folio, next, folio_list, lru) {
1662	/ TODO: these pages should not even appear in this list. /
1663	if (page_has_movable_ops(page: &folio->page))
1664	continue;
1665	if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
1666	!folio_test_dirty(folio) && !folio_test_unevictable(folio)) {
1667	folio_clear_active(folio);
1668	list_move(list: &folio->lru, head: &clean_folios);
1669	}
1670	}
1671
1672	/*
1673	* We should be safe here since we are only dealing with file pages and
1674	* we are not kswapd and therefore cannot write dirty file pages. But
1675	* call memalloc_noreclaim_save() anyway, just in case these conditions
1676	* change in the future.
1677	*/
1678	noreclaim_flag = memalloc_noreclaim_save();
1679	nr_reclaimed = shrink_folio_list(folio_list: &clean_folios, pgdat: zone->zone_pgdat, sc: &sc,
1680	stat: &stat, ignore_references: true, NULL);
1681	memalloc_noreclaim_restore(flags: noreclaim_flag);
1682
1683	list_splice(list: &clean_folios, head: folio_list);
1684	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
1685	-(long)nr_reclaimed);
1686	/*
1687	* Since lazyfree pages are isolated from file LRU from the beginning,
1688	* they will rotate back to anonymous LRU in the end if it failed to
1689	* discard so isolated count will be mismatched.
1690	* Compensate the isolated count for both LRU lists.
1691	*/
1692	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
1693	stat.nr_lazyfree_fail);
1694	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
1695	-(long)stat.nr_lazyfree_fail);
1696	return nr_reclaimed;
1697	}
1698
1699	/*
1700	* Update LRU sizes after isolating pages. The LRU size updates must
1701	* be complete before mem_cgroup_update_lru_size due to a sanity check.
1702	*/
1703	static __always_inline void update_lru_sizes(struct lruvec *lruvec,
1704	enum lru_list lru, unsigned long *nr_zone_taken)
1705	{
1706	int zid;
1707
1708	for (zid = `0`; zid < MAX_NR_ZONES; zid++) {
1709	if (!nr_zone_taken[zid])
1710	continue;
1711
1712	update_lru_size(lruvec, lru, zid, nr_pages: -nr_zone_taken[zid]);
1713	}
1714
1715	}
1716
1717	/*
1718	* Isolating page from the lruvec to fill in @dst list by nr_to_scan times.
1719	*
1720	* lruvec->lru_lock is heavily contended. Some of the functions that
1721	* shrink the lists perform better by taking out a batch of pages
1722	* and working on them outside the LRU lock.
1723	*
1724	* For pagecache intensive workloads, this function is the hottest
1725	* spot in the kernel (apart from copy_*_user functions).
1726	*
1727	* Lru_lock must be held before calling this function.
1728	*
1729	* @nr_to_scan: The number of eligible pages to look through on the list.
1730	* @lruvec: The LRU vector to pull pages from.
1731	* @dst: The temp list to put pages on to.
1732	* @nr_scanned: The number of pages that were scanned.
1733	* @sc: The scan_control struct for this reclaim session
1734	* @lru: LRU list id for isolating
1735	*
1736	* returns how many pages were moved onto *@dst.
1737	*/
1738	static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
1739	struct lruvec lruvec, struct* list_head *dst,
1740	unsigned long nr_scanned, struct* scan_control *sc,
1741	enum lru_list lru)
1742	{
1743	struct list_head *src = &lruvec->lists[lru];
1744	unsigned long nr_taken = `0`;
1745	unsigned long nr_zone_taken[MAX_NR_ZONES] = { `0` };
1746	unsigned long nr_skipped[MAX_NR_ZONES] = { `0`, };
1747	unsigned long skipped = `0`, total_scan = `0`, scan = `0`;
1748	unsigned long nr_pages;
1749	unsigned long max_nr_skipped = `0`;
1750	LIST_HEAD(folios_skipped);
1751
1752	while (scan < nr_to_scan && !list_empty(head: src)) {
1753	struct list_head *move_to = src;
1754	struct folio *folio;
1755
1756	folio = lru_to_folio(head: src);
1757	prefetchw_prev_lru_folio(folio, src, flags);
1758
1759	nr_pages = folio_nr_pages(folio);
1760	total_scan += nr_pages;
1761
1762	/ Using max_nr_skipped to prevent hard LOCKUP/
1763	if (max_nr_skipped < SWAP_CLUSTER_MAX_SKIPPED &&
1764	(folio_zonenum(folio) > sc->reclaim_idx)) {
1765	nr_skipped[folio_zonenum(folio)] += nr_pages;
1766	move_to = &folios_skipped;
1767	max_nr_skipped++;
1768	goto move;
1769	}
1770
1771	/*
1772	* Do not count skipped folios because that makes the function
1773	* return with no isolated folios if the LRU mostly contains
1774	* ineligible folios. This causes the VM to not reclaim any
1775	* folios, triggering a premature OOM.
1776	* Account all pages in a folio.
1777	*/
1778	scan += nr_pages;
1779
1780	if (!folio_test_lru(folio))
1781	goto move;
1782	if (!sc->may_unmap && folio_mapped(folio))
1783	goto move;
1784
1785	/*
1786	* Be careful not to clear the lru flag until after we're
1787	* sure the folio is not being freed elsewhere -- the
1788	* folio release code relies on it.
1789	*/
1790	if (unlikely(!folio_try_get(folio)))
1791	goto move;
1792
1793	if (!folio_test_clear_lru(folio)) {
1794	/ Another thread is already isolating this folio /
1795	folio_put(folio);
1796	goto move;
1797	}
1798
1799	nr_taken += nr_pages;
1800	nr_zone_taken[folio_zonenum(folio)] += nr_pages;
1801	move_to = dst;
1802	move:
1803	list_move(list: &folio->lru, head: move_to);
1804	}
1805
1806	/*
1807	* Splice any skipped folios to the start of the LRU list. Note that
1808	* this disrupts the LRU order when reclaiming for lower zones but
1809	* we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
1810	* scanning would soon rescan the same folios to skip and waste lots
1811	* of cpu cycles.
1812	*/
1813	if (!list_empty(head: &folios_skipped)) {
1814	int zid;
1815
1816	list_splice(list: &folios_skipped, head: src);
1817	for (zid = `0`; zid < MAX_NR_ZONES; zid++) {
1818	if (!nr_skipped[zid])
1819	continue;
1820
1821	__count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
1822	skipped += nr_skipped[zid];
1823	}
1824	}
1825	*nr_scanned = total_scan;
1826	trace_mm_vmscan_lru_isolate(highest_zoneidx: sc->reclaim_idx, order: sc->order, nr_requested: nr_to_scan,
1827	nr_scanned: total_scan, nr_skipped: skipped, nr_taken, lru);
1828	update_lru_sizes(lruvec, lru, nr_zone_taken);
1829	return nr_taken;
1830	}
1831
/**
 * folio_isolate_lru() - Try to isolate a folio from its LRU list.
 * @folio: Folio to isolate from its LRU list.
 *
 * Take @folio off the LRU list it is on and adjust the vmstat statistic
 * that corresponds to that list.
 *
 * On success the folio's LRU flag has been cleared and an extra reference
 * has been taken on it.  If the folio came from the active list its Active
 * flag is still set, and if it came from the unevictable list its
 * Unevictable flag is still set.  The caller may need to clear these flags
 * before letting the page go.
 *
 * Context:
 *
 * (1) Must be called with an elevated refcount on the folio. This is a
 *     fundamental difference from isolate_lru_folios() (which is called
 *     without a stable reference).
 * (2) The lru_lock must not be held.
 * (3) Interrupts must be enabled.
 *
 * Return: true if the folio was removed from an LRU list.
 * false if the folio was not on an LRU list.
 */
bool folio_isolate_lru(struct folio *folio)
{
	struct lruvec *lruvec;

	VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio);

	/* Nothing to do unless we are the one who cleared the LRU flag. */
	if (!folio_test_clear_lru(folio))
		return false;

	folio_get(folio);
	lruvec = folio_lruvec_lock_irq(folio);
	lruvec_del_folio(lruvec, folio);
	unlock_page_lruvec_irq(lruvec);

	return true;
}
1873
1874	/*
1875	* A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
1876	* then get rescheduled. When there are massive number of tasks doing page
1877	* allocation, such sleeping direct reclaimers may keep piling up on each CPU,
1878	* the LRU list will go small and be scanned faster than necessary, leading to
1879	* unnecessary swapping, thrashing and OOM.
1880	*/
1881	static bool too_many_isolated(struct pglist_data pgdat, int* file,
1882	struct scan_control *sc)
1883	{
1884	unsigned long inactive, isolated;
1885	bool too_many;
1886
1887	if (current_is_kswapd())
1888	return false;
1889
1890	if (!writeback_throttling_sane(sc))
1891	return false;
1892
1893	if (file) {
1894	inactive = node_page_state(pgdat, item: NR_INACTIVE_FILE);
1895	isolated = node_page_state(pgdat, item: NR_ISOLATED_FILE);
1896	} else {
1897	inactive = node_page_state(pgdat, item: NR_INACTIVE_ANON);
1898	isolated = node_page_state(pgdat, item: NR_ISOLATED_ANON);
1899	}
1900
1901	/*
1902	* GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
1903	* won't get blocked by normal direct-reclaimers, forming a circular
1904	* deadlock.
1905	*/
1906	if (gfp_has_io_fs(gfp: sc->gfp_mask))
1907	inactive >>= `3`;
1908
1909	too_many = isolated > inactive;
1910
1911	/ Wake up tasks throttled due to too_many_isolated. /
1912	if (!too_many)
1913	wake_throttle_isolated(pgdat);
1914
1915	return too_many;
1916	}
1917
1918	/*
1919	* move_folios_to_lru() moves folios from private @list to appropriate LRU list.
1920	*
1921	* Returns the number of pages moved to the given lruvec.
1922	*/
1923	static unsigned int move_folios_to_lru(struct lruvec *lruvec,
1924	struct list_head *list)
1925	{
1926	int nr_pages, nr_moved = `0`;
1927	struct folio_batch free_folios;
1928
1929	folio_batch_init(fbatch: &free_folios);
1930	while (!list_empty(head: list)) {
1931	struct folio *folio = lru_to_folio(head: list);
1932
1933	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
1934	list_del(entry: &folio->lru);
1935	if (unlikely(!folio_evictable(folio))) {
1936	spin_unlock_irq(lock: &lruvec->lru_lock);
1937	folio_putback_lru(folio);
1938	spin_lock_irq(lock: &lruvec->lru_lock);
1939	continue;
1940	}
1941
1942	/*
1943	* The folio_set_lru needs to be kept here for list integrity.
1944	* Otherwise:
1945	* #0 move_folios_to_lru #1 release_pages
1946	* if (!folio_put_testzero())
1947	* if (folio_put_testzero())
1948	* !lru //skip lru_lock
1949	* folio_set_lru()
1950	* list_add(&folio->lru,)
1951	* list_add(&folio->lru,)
1952	*/
1953	folio_set_lru(folio);
1954
1955	if (unlikely(folio_put_testzero(folio))) {
1956	__folio_clear_lru_flags(folio);
1957
1958	folio_unqueue_deferred_split(folio);
1959	if (folio_batch_add(fbatch: &free_folios, folio) == `0`) {
1960	spin_unlock_irq(lock: &lruvec->lru_lock);
1961	mem_cgroup_uncharge_folios(folios: &free_folios);
1962	free_unref_folios(fbatch: &free_folios);
1963	spin_lock_irq(lock: &lruvec->lru_lock);
1964	}
1965
1966	continue;
1967	}
1968
1969	/*
1970	* All pages were isolated from the same lruvec (and isolation
1971	* inhibits memcg migration).
1972	*/
1973	VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
1974	lruvec_add_folio(lruvec, folio);
1975	nr_pages = folio_nr_pages(folio);
1976	nr_moved += nr_pages;
1977	if (folio_test_active(folio))
1978	workingset_age_nonresident(lruvec, nr_pages);
1979	}
1980
1981	if (free_folios.nr) {
1982	spin_unlock_irq(lock: &lruvec->lru_lock);
1983	mem_cgroup_uncharge_folios(folios: &free_folios);
1984	free_unref_folios(fbatch: &free_folios);
1985	spin_lock_irq(lock: &lruvec->lru_lock);
1986	}
1987
1988	return nr_moved;
1989	}
1990
1991	/*
1992	* If a kernel thread (such as nfsd for loop-back mounts) services a backing
1993	* device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case
1994	* we should not throttle. Otherwise it is safe to do so.
1995	*/
1996	static int current_may_throttle(void)
1997	{
1998	return !(current->flags & PF_LOCAL_THROTTLE);
1999	}
2000
2001	/*
2002	* shrink_inactive_list() is a helper for shrink_node(). It returns the number
2003	* of reclaimed pages
2004	*/
2005	static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
2006	struct lruvec lruvec, struct* scan_control *sc,
2007	enum lru_list lru)
2008	{
2009	LIST_HEAD(folio_list);
2010	unsigned long nr_scanned;
2011	unsigned int nr_reclaimed = `0`;
2012	unsigned long nr_taken;
2013	struct reclaim_stat stat;
2014	bool file = is_file_lru(lru);
2015	enum vm_event_item item;
2016	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2017	bool stalled = false;
2018
2019	while (unlikely(too_many_isolated(pgdat, file, sc))) {
2020	if (stalled)
2021	return `0`;
2022
2023	/ wait a bit for the reclaimer. /
2024	stalled = true;
2025	reclaim_throttle(pgdat, reason: VMSCAN_THROTTLE_ISOLATED);
2026
2027	/ We are about to die and free our memory. Return now. /
2028	if (fatal_signal_pending(current))
2029	return SWAP_CLUSTER_MAX;
2030	}
2031
2032	lru_add_drain();
2033
2034	spin_lock_irq(lock: &lruvec->lru_lock);
2035
2036	nr_taken = isolate_lru_folios(nr_to_scan, lruvec, dst: &folio_list,
2037	nr_scanned: &nr_scanned, sc, lru);
2038
2039	__mod_node_page_state(pgdat, item: NR_ISOLATED_ANON + file, nr_taken);
2040	item = PGSCAN_KSWAPD + reclaimer_offset(sc);
2041	if (!cgroup_reclaim(sc))
2042	__count_vm_events(item, delta: nr_scanned);
2043	count_memcg_events(memcg: lruvec_memcg(lruvec), idx: item, count: nr_scanned);
2044	__count_vm_events(item: PGSCAN_ANON + file, delta: nr_scanned);
2045
2046	spin_unlock_irq(lock: &lruvec->lru_lock);
2047
2048	if (nr_taken == `0`)
2049	return `0`;
2050
2051	nr_reclaimed = shrink_folio_list(folio_list: &folio_list, pgdat, sc, stat: &stat, ignore_references: false,
2052	memcg: lruvec_memcg(lruvec));
2053
2054	spin_lock_irq(lock: &lruvec->lru_lock);
2055	move_folios_to_lru(lruvec, list: &folio_list);
2056
2057	__mod_lruvec_state(lruvec, idx: PGDEMOTE_KSWAPD + reclaimer_offset(sc),
2058	val: stat.nr_demoted);
2059	__mod_node_page_state(pgdat, item: NR_ISOLATED_ANON + file, -nr_taken);
2060	item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
2061	if (!cgroup_reclaim(sc))
2062	__count_vm_events(item, delta: nr_reclaimed);
2063	count_memcg_events(memcg: lruvec_memcg(lruvec), idx: item, count: nr_reclaimed);
2064	__count_vm_events(item: PGSTEAL_ANON + file, delta: nr_reclaimed);
2065
2066	lru_note_cost_unlock_irq(lruvec, file, nr_io: stat.nr_pageout,
2067	nr_rotated: nr_scanned - nr_reclaimed);
2068
2069	/*
2070	* If dirty folios are scanned that are not queued for IO, it
2071	* implies that flushers are not doing their job. This can
2072	* happen when memory pressure pushes dirty folios to the end of
2073	* the LRU before the dirty limits are breached and the dirty
2074	* data has expired. It can also happen when the proportion of
2075	* dirty folios grows not through writes but through memory
2076	* pressure reclaiming all the clean cache. And in some cases,
2077	* the flushers simply cannot keep up with the allocation
2078	* rate. Nudge the flusher threads in case they are asleep.
2079	*/
2080	if (stat.nr_unqueued_dirty == nr_taken) {
2081	wakeup_flusher_threads(reason: WB_REASON_VMSCAN);
2082	/*
2083	* For cgroupv1 dirty throttling is achieved by waking up
2084	* the kernel flusher here and later waiting on folios
2085	* which are in writeback to finish (see shrink_folio_list()).
2086	*
2087	* Flusher may not be able to issue writeback quickly
2088	* enough for cgroupv1 writeback throttling to work
2089	* on a large system.
2090	*/
2091	if (!writeback_throttling_sane(sc))
2092	reclaim_throttle(pgdat, reason: VMSCAN_THROTTLE_WRITEBACK);
2093	}
2094
2095	sc->nr.dirty += stat.nr_dirty;
2096	sc->nr.congested += stat.nr_congested;
2097	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
2098	sc->nr.writeback += stat.nr_writeback;
2099	sc->nr.immediate += stat.nr_immediate;
2100	sc->nr.taken += nr_taken;
2101	if (file)
2102	sc->nr.file_taken += nr_taken;
2103
2104	trace_mm_vmscan_lru_shrink_inactive(nid: pgdat->node_id,
2105	nr_scanned, nr_reclaimed, stat: &stat, priority: sc->priority, file);
2106	return nr_reclaimed;
2107	}
2108
2109	/*
2110	* shrink_active_list() moves folios from the active LRU to the inactive LRU.
2111	*
2112	* We move them the other way if the folio is referenced by one or more
2113	* processes.
2114	*
2115	* If the folios are mostly unmapped, the processing is fast and it is
2116	* appropriate to hold lru_lock across the whole operation. But if
2117	* the folios are mapped, the processing is slow (folio_referenced()), so
2118	* we should drop lru_lock around each folio. It's impossible to balance
2119	* this, so instead we remove the folios from the LRU while processing them.
2120	* It is safe to rely on the active flag against the non-LRU folios in here
2121	* because nobody will play with that bit on a non-LRU folio.
2122	*
2123	* The downside is that we have to touch folio->_refcount against each folio.
2124	* But we had to alter folio->flags anyway.
2125	*/
2126	static void shrink_active_list(unsigned long nr_to_scan,
2127	struct lruvec *lruvec,
2128	struct scan_control *sc,
2129	enum lru_list lru)
2130	{
2131	unsigned long nr_taken;
2132	unsigned long nr_scanned;
2133	vm_flags_t vm_flags;
2134	LIST_HEAD(l_hold); / The folios which were snipped off /
2135	LIST_HEAD(l_active);
2136	LIST_HEAD(l_inactive);
2137	unsigned nr_deactivate, nr_activate;
2138	unsigned nr_rotated = `0`;
2139	bool file = is_file_lru(lru);
2140	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2141
2142	lru_add_drain();
2143
2144	spin_lock_irq(lock: &lruvec->lru_lock);
2145
2146	nr_taken = isolate_lru_folios(nr_to_scan, lruvec, dst: &l_hold,
2147	nr_scanned: &nr_scanned, sc, lru);
2148
2149	__mod_node_page_state(pgdat, item: NR_ISOLATED_ANON + file, nr_taken);
2150
2151	if (!cgroup_reclaim(sc))
2152	__count_vm_events(item: PGREFILL, delta: nr_scanned);
2153	count_memcg_events(memcg: lruvec_memcg(lruvec), idx: PGREFILL, count: nr_scanned);
2154
2155	spin_unlock_irq(lock: &lruvec->lru_lock);
2156
2157	while (!list_empty(head: &l_hold)) {
2158	struct folio *folio;
2159
2160	cond_resched();
2161	folio = lru_to_folio(head: &l_hold);
2162	list_del(entry: &folio->lru);
2163
2164	if (unlikely(!folio_evictable(folio))) {
2165	folio_putback_lru(folio);
2166	continue;
2167	}
2168
2169	if (unlikely(buffer_heads_over_limit)) {
2170	if (folio_needs_release(folio) &&
2171	folio_trylock(folio)) {
2172	filemap_release_folio(folio, gfp: `0`);
2173	folio_unlock(folio);
2174	}
2175	}
2176
2177	/ Referenced or rmap lock contention: rotate /
2178	if (folio_referenced(folio, is_locked: `0`, memcg: sc->target_mem_cgroup,
2179	vm_flags: &vm_flags) != `0`) {
2180	/*
2181	* Identify referenced, file-backed active folios and
2182	* give them one more trip around the active list. So
2183	* that executable code get better chances to stay in
2184	* memory under moderate memory pressure. Anon folios
2185	* are not likely to be evicted by use-once streaming
2186	* IO, plus JVM can create lots of anon VM_EXEC folios,
2187	* so we ignore them here.
2188	*/
2189	if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) {
2190	nr_rotated += folio_nr_pages(folio);
2191	list_add(new: &folio->lru, head: &l_active);
2192	continue;
2193	}
2194	}
2195
2196	folio_clear_active(folio); / we are de-activating /
2197	folio_set_workingset(folio);
2198	list_add(new: &folio->lru, head: &l_inactive);
2199	}
2200
2201	/*
2202	* Move folios back to the lru list.
2203	*/
2204	spin_lock_irq(lock: &lruvec->lru_lock);
2205
2206	nr_activate = move_folios_to_lru(lruvec, list: &l_active);
2207	nr_deactivate = move_folios_to_lru(lruvec, list: &l_inactive);
2208
2209	__count_vm_events(item: PGDEACTIVATE, delta: nr_deactivate);
2210	count_memcg_events(memcg: lruvec_memcg(lruvec), idx: PGDEACTIVATE, count: nr_deactivate);
2211
2212	__mod_node_page_state(pgdat, item: NR_ISOLATED_ANON + file, -nr_taken);
2213
2214	lru_note_cost_unlock_irq(lruvec, file, nr_io: `0`, nr_rotated);
2215	trace_mm_vmscan_lru_shrink_active(nid: pgdat->node_id, nr_taken, nr_active: nr_activate,
2216	nr_deactivated: nr_deactivate, nr_referenced: nr_rotated, priority: sc->priority, file);
2217	}
2218
2219	static unsigned int reclaim_folio_list(struct list_head *folio_list,
2220	struct pglist_data *pgdat)
2221	{
2222	struct reclaim_stat stat;
2223	unsigned int nr_reclaimed;
2224	struct folio *folio;
2225	struct scan_control sc = {
2226	.gfp_mask = GFP_KERNEL,
2227	.may_writepage = `1`,
2228	.may_unmap = `1`,
2229	.may_swap = `1`,
2230	.no_demotion = `1`,
2231	};
2232
2233	nr_reclaimed = shrink_folio_list(folio_list, pgdat, sc: &sc, stat: &stat, ignore_references: true, NULL);
2234	while (!list_empty(head: folio_list)) {
2235	folio = lru_to_folio(head: folio_list);
2236	list_del(entry: &folio->lru);
2237	folio_putback_lru(folio);
2238	}
2239	trace_mm_vmscan_reclaim_pages(nid: pgdat->node_id, nr_scanned: sc.nr_scanned, nr_reclaimed, stat: &stat);
2240
2241	return nr_reclaimed;
2242	}
2243
2244	unsigned long reclaim_pages(struct list_head *folio_list)
2245	{
2246	int nid;
2247	unsigned int nr_reclaimed = `0`;
2248	LIST_HEAD(node_folio_list);
2249	unsigned int noreclaim_flag;
2250
2251	if (list_empty(head: folio_list))
2252	return nr_reclaimed;
2253
2254	noreclaim_flag = memalloc_noreclaim_save();
2255
2256	nid = folio_nid(folio: lru_to_folio(head: folio_list));
2257	do {
2258	struct folio *folio = lru_to_folio(head: folio_list);
2259
2260	if (nid == folio_nid(folio)) {
2261	folio_clear_active(folio);
2262	list_move(list: &folio->lru, head: &node_folio_list);
2263	continue;
2264	}
2265
2266	nr_reclaimed += reclaim_folio_list(folio_list: &node_folio_list, NODE_DATA(nid));
2267	nid = folio_nid(folio: lru_to_folio(head: folio_list));
2268	} while (!list_empty(head: folio_list));
2269
2270	nr_reclaimed += reclaim_folio_list(folio_list: &node_folio_list, NODE_DATA(nid));
2271
2272	memalloc_noreclaim_restore(flags: noreclaim_flag);
2273
2274	return nr_reclaimed;
2275	}
2276
2277	static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
2278	struct lruvec lruvec, struct* scan_control *sc)
2279	{
2280	if (is_active_lru(lru)) {
2281	if (sc->may_deactivate & (`1` << is_file_lru(lru)))
2282	shrink_active_list(nr_to_scan, lruvec, sc, lru);
2283	else
2284	sc->skipped_deactivate = `1`;
2285	return `0`;
2286	}
2287
2288	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
2289	}
2290
2291	/*
2292	* The inactive anon list should be small enough that the VM never has
2293	* to do too much work.
2294	*
2295	* The inactive file list should be small enough to leave most memory
2296	* to the established workingset on the scan-resistant active list,
2297	* but large enough to avoid thrashing the aggregate readahead window.
2298	*
2299	* Both inactive lists should also be large enough that each inactive
2300	* folio has a chance to be referenced again before it is reclaimed.
2301	*
2302	* If that fails and refaulting is observed, the inactive list grows.
2303	*
2304	* The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios
2305	* on this LRU, maintained by the pageout code. An inactive_ratio
2306	* of 3 means 3:1 or 25% of the folios are kept on the inactive list.
2307	*
2308	* total target max
2309	* memory ratio inactive
2310	* -------------------------------------
2311	* 10MB 1 5MB
2312	* 100MB 1 50MB
2313	* 1GB 3 250MB
2314	* 10GB 10 0.9GB
2315	* 100GB 31 3GB
2316	* 1TB 101 10GB
2317	* 10TB 320 32GB
2318	*/
2319	static bool inactive_is_low(struct lruvec lruvec, enum* lru_list inactive_lru)
2320	{
2321	enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
2322	unsigned long inactive, active;
2323	unsigned long inactive_ratio;
2324	unsigned long gb;
2325
2326	inactive = lruvec_page_state(lruvec, idx: NR_LRU_BASE + inactive_lru);
2327	active = lruvec_page_state(lruvec, idx: NR_LRU_BASE + active_lru);
2328
2329	gb = (inactive + active) >> (`30` - PAGE_SHIFT);
2330	if (gb)
2331	inactive_ratio = int_sqrt(`10` * gb);
2332	else
2333	inactive_ratio = `1`;
2334
2335	return inactive * inactive_ratio < active;
2336	}
2337
/* How get_scan_count() splits scan pressure between the anon and file LRUs. */
enum scan_balance {
	SCAN_EQUAL,	/* scan lists relative to their size */
	SCAN_FRACT,	/* scale each type by its computed fraction */
	SCAN_ANON,	/* scan anon lists only */
	SCAN_FILE,	/* scan file lists only */
};
2344
2345	static void prepare_scan_control(pg_data_t pgdat, struct* scan_control *sc)
2346	{
2347	unsigned long file;
2348	struct lruvec *target_lruvec;
2349
2350	if (lru_gen_enabled())
2351	return;
2352
2353	target_lruvec = mem_cgroup_lruvec(memcg: sc->target_mem_cgroup, pgdat);
2354
2355	/*
2356	* Flush the memory cgroup stats in rate-limited way as we don't need
2357	* most accurate stats here. We may switch to regular stats flushing
2358	* in the future once it is cheap enough.
2359	*/
2360	mem_cgroup_flush_stats_ratelimited(memcg: sc->target_mem_cgroup);
2361
2362	/*
2363	* Determine the scan balance between anon and file LRUs.
2364	*/
2365	spin_lock_irq(lock: &target_lruvec->lru_lock);
2366	sc->anon_cost = target_lruvec->anon_cost;
2367	sc->file_cost = target_lruvec->file_cost;
2368	spin_unlock_irq(lock: &target_lruvec->lru_lock);
2369
2370	/*
2371	* Target desirable inactive:active list ratios for the anon
2372	* and file LRU lists.
2373	*/
2374	if (!sc->force_deactivate) {
2375	unsigned long refaults;
2376
2377	/*
2378	* When refaults are being observed, it means a new
2379	* workingset is being established. Deactivate to get
2380	* rid of any stale active pages quickly.
2381	*/
2382	refaults = lruvec_page_state(lruvec: target_lruvec,
2383	idx: WORKINGSET_ACTIVATE_ANON);
2384	if (refaults != target_lruvec->refaults[WORKINGSET_ANON] \|\|
2385	inactive_is_low(lruvec: target_lruvec, inactive_lru: LRU_INACTIVE_ANON))
2386	sc->may_deactivate \|= DEACTIVATE_ANON;
2387	else
2388	sc->may_deactivate &= ~DEACTIVATE_ANON;
2389
2390	refaults = lruvec_page_state(lruvec: target_lruvec,
2391	idx: WORKINGSET_ACTIVATE_FILE);
2392	if (refaults != target_lruvec->refaults[WORKINGSET_FILE] \|\|
2393	inactive_is_low(lruvec: target_lruvec, inactive_lru: LRU_INACTIVE_FILE))
2394	sc->may_deactivate \|= DEACTIVATE_FILE;
2395	else
2396	sc->may_deactivate &= ~DEACTIVATE_FILE;
2397	} else
2398	sc->may_deactivate = DEACTIVATE_ANON \| DEACTIVATE_FILE;
2399
2400	/*
2401	* If we have plenty of inactive file pages that aren't
2402	* thrashing, try to reclaim those first before touching
2403	* anonymous pages.
2404	*/
2405	file = lruvec_page_state(lruvec: target_lruvec, idx: NR_INACTIVE_FILE);
2406	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE) &&
2407	!sc->no_cache_trim_mode)
2408	sc->cache_trim_mode = `1`;
2409	else
2410	sc->cache_trim_mode = `0`;
2411
2412	/*
2413	* Prevent the reclaimer from falling into the cache trap: as
2414	* cache pages start out inactive, every cache fault will tip
2415	* the scan balance towards the file LRU. And as the file LRU
2416	* shrinks, so does the window for rotation from references.
2417	* This means we have a runaway feedback loop where a tiny
2418	* thrashing file LRU becomes infinitely more attractive than
2419	* anon pages. Try to detect this based on file LRU size.
2420	*/
2421	if (!cgroup_reclaim(sc)) {
2422	unsigned long total_high_wmark = `0`;
2423	unsigned long free, anon;
2424	int z;
2425	struct zone *zone;
2426
2427	free = sum_zone_node_page_state(node: pgdat->node_id, item: NR_FREE_PAGES);
2428	file = node_page_state(pgdat, item: NR_ACTIVE_FILE) +
2429	node_page_state(pgdat, item: NR_INACTIVE_FILE);
2430
2431	for_each_managed_zone_pgdat(zone, pgdat, z, MAX_NR_ZONES - `1`) {
2432	total_high_wmark += high_wmark_pages(z: zone);
2433	}
2434
2435	/*
2436	* Consider anon: if that's low too, this isn't a
2437	* runaway file reclaim problem, but rather just
2438	* extreme pressure. Reclaim as per usual then.
2439	*/
2440	anon = node_page_state(pgdat, item: NR_INACTIVE_ANON);
2441
2442	sc->file_is_tiny =
2443	file + free <= total_high_wmark &&
2444	!(sc->may_deactivate & DEACTIVATE_ANON) &&
2445	anon >> sc->priority;
2446	}
2447	}
2448
2449	static inline void calculate_pressure_balance(struct scan_control *sc,
2450	int swappiness, u64 fraction, u64 denominator)
2451	{
2452	unsigned long anon_cost, file_cost, total_cost;
2453	unsigned long ap, fp;
2454
2455	/*
2456	* Calculate the pressure balance between anon and file pages.
2457	*
2458	* The amount of pressure we put on each LRU is inversely
2459	* proportional to the cost of reclaiming each list, as
2460	* determined by the share of pages that are refaulting, times
2461	* the relative IO cost of bringing back a swapped out
2462	* anonymous page vs reloading a filesystem page (swappiness).
2463	*
2464	* Although we limit that influence to ensure no list gets
2465	* left behind completely: at least a third of the pressure is
2466	* applied, before swappiness.
2467	*
2468	* With swappiness at 100, anon and file have equal IO cost.
2469	*/
2470	total_cost = sc->anon_cost + sc->file_cost;
2471	anon_cost = total_cost + sc->anon_cost;
2472	file_cost = total_cost + sc->file_cost;
2473	total_cost = anon_cost + file_cost;
2474
2475	ap = swappiness * (total_cost + `1`);
2476	ap /= anon_cost + `1`;
2477
2478	fp = (MAX_SWAPPINESS - swappiness) * (total_cost + `1`);
2479	fp /= file_cost + `1`;
2480
2481	fraction[WORKINGSET_ANON] = ap;
2482	fraction[WORKINGSET_FILE] = fp;
2483	*denominator = ap + fp;
2484	}
2485
2486	static unsigned long apply_proportional_protection(struct mem_cgroup *memcg,
2487	struct scan_control sc, unsigned* long scan)
2488	{
2489	unsigned long min, low;
2490
2491	mem_cgroup_protection(root: sc->target_mem_cgroup, memcg, min: &min, low: &low);
2492
2493	if (min \|\| low) {
2494	/*
2495	* Scale a cgroup's reclaim pressure by proportioning
2496	* its current usage to its memory.low or memory.min
2497	* setting.
2498	*
2499	* This is important, as otherwise scanning aggression
2500	* becomes extremely binary -- from nothing as we
2501	* approach the memory protection threshold, to totally
2502	* nominal as we exceed it. This results in requiring
2503	* setting extremely liberal protection thresholds. It
2504	* also means we simply get no protection at all if we
2505	* set it too low, which is not ideal.
2506	*
2507	* If there is any protection in place, we reduce scan
2508	* pressure by how much of the total memory used is
2509	* within protection thresholds.
2510	*
2511	* There is one special case: in the first reclaim pass,
2512	* we skip over all groups that are within their low
2513	* protection. If that fails to reclaim enough pages to
2514	* satisfy the reclaim goal, we come back and override
2515	* the best-effort low protection. However, we still
2516	* ideally want to honor how well-behaved groups are in
2517	* that case instead of simply punishing them all
2518	* equally. As such, we reclaim them based on how much
2519	* memory they are using, reducing the scan pressure
2520	* again by how much of the total memory used is under
2521	* hard protection.
2522	*/
2523	unsigned long cgroup_size = mem_cgroup_size(memcg);
2524	unsigned long protection;
2525
2526	/ memory.low scaling, make sure we retry before OOM /
2527	if (!sc->memcg_low_reclaim && low > min) {
2528	protection = low;
2529	sc->memcg_low_skipped = `1`;
2530	} else {
2531	protection = min;
2532	}
2533
2534	/ Avoid TOCTOU with earlier protection check /
2535	cgroup_size = max(cgroup_size, protection);
2536
2537	scan -= scan * protection / (cgroup_size + `1`);
2538
2539	/*
2540	* Minimally target SWAP_CLUSTER_MAX pages to keep
2541	* reclaim moving forwards, avoiding decrementing
2542	* sc->priority further than desirable.
2543	*/
2544	scan = max(scan, SWAP_CLUSTER_MAX);
2545	}
2546	return scan;
2547	}
2548
2549	/*
2550	* Determine how aggressively the anon and file LRU lists should be
2551	* scanned.
2552	*
2553	* nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan
2554	* nr[2] = file inactive folios to scan; nr[3] = file active folios to scan
2555	*/
2556	static void get_scan_count(struct lruvec lruvec, struct* scan_control *sc,
2557	unsigned long *nr)
2558	{
2559	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2560	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
2561	int swappiness = sc_swappiness(sc, memcg);
2562	u64 fraction[ANON_AND_FILE];
2563	u64 denominator = `0`; / gcc /
2564	enum scan_balance scan_balance;
2565	enum lru_list lru;
2566
2567	/ If we have no swap space, do not bother scanning anon folios. /
2568	if (!sc->may_swap \|\| !can_reclaim_anon_pages(memcg, nid: pgdat->node_id, sc)) {
2569	scan_balance = SCAN_FILE;
2570	goto out;
2571	}
2572
2573	/*
2574	* Global reclaim will swap to prevent OOM even with no
2575	* swappiness, but memcg users want to use this knob to
2576	* disable swapping for individual groups completely when
2577	* using the memory controller's swap limit feature would be
2578	* too expensive.
2579	*/
2580	if (cgroup_reclaim(sc) && !swappiness) {
2581	scan_balance = SCAN_FILE;
2582	goto out;
2583	}
2584
2585	/ Proactive reclaim initiated by userspace for anonymous memory only /
2586	if (swappiness == SWAPPINESS_ANON_ONLY) {
2587	WARN_ON_ONCE(!sc->proactive);
2588	scan_balance = SCAN_ANON;
2589	goto out;
2590	}
2591
2592	/*
2593	* Do not apply any pressure balancing cleverness when the
2594	* system is close to OOM, scan both anon and file equally
2595	* (unless the swappiness setting disagrees with swapping).
2596	*/
2597	if (!sc->priority && swappiness) {
2598	scan_balance = SCAN_EQUAL;
2599	goto out;
2600	}
2601
2602	/*
2603	* If the system is almost out of file pages, force-scan anon.
2604	*/
2605	if (sc->file_is_tiny) {
2606	scan_balance = SCAN_ANON;
2607	goto out;
2608	}
2609
2610	/*
2611	* If there is enough inactive page cache, we do not reclaim
2612	* anything from the anonymous working right now to make sure
2613	* a streaming file access pattern doesn't cause swapping.
2614	*/
2615	if (sc->cache_trim_mode) {
2616	scan_balance = SCAN_FILE;
2617	goto out;
2618	}
2619
2620	scan_balance = SCAN_FRACT;
2621	calculate_pressure_balance(sc, swappiness, fraction, denominator: &denominator);
2622
2623	out:
2624	for_each_evictable_lru(lru) {
2625	bool file = is_file_lru(lru);
2626	unsigned long lruvec_size;
2627	unsigned long scan;
2628
2629	lruvec_size = lruvec_lru_size(lruvec, lru, zone_idx: sc->reclaim_idx);
2630	scan = apply_proportional_protection(memcg, sc, scan: lruvec_size);
2631	scan >>= sc->priority;
2632
2633	/*
2634	* If the cgroup's already been deleted, make sure to
2635	* scrape out the remaining cache.
2636	*/
2637	if (!scan && !mem_cgroup_online(memcg))
2638	scan = min(lruvec_size, SWAP_CLUSTER_MAX);
2639
2640	switch (scan_balance) {
2641	case SCAN_EQUAL:
2642	/ Scan lists relative to size /
2643	break;
2644	case SCAN_FRACT:
2645	/*
2646	* Scan types proportional to swappiness and
2647	* their relative recent reclaim efficiency.
2648	* Make sure we don't miss the last page on
2649	* the offlined memory cgroups because of a
2650	* round-off error.
2651	*/
2652	scan = mem_cgroup_online(memcg) ?
2653	div64_u64(dividend: scan * fraction[file], divisor: denominator) :
2654	DIV64_U64_ROUND_UP(scan * fraction[file],
2655	denominator);
2656	break;
2657	case SCAN_FILE:
2658	case SCAN_ANON:
2659	/ Scan one type exclusively /
2660	if ((scan_balance == SCAN_FILE) != file)
2661	scan = `0`;
2662	break;
2663	default:
2664	/ Look ma, no brain /
2665	BUG();
2666	}
2667
2668	nr[lru] = scan;
2669	}
2670	}
2671
2672	/*
2673	* Anonymous LRU management is a waste if there is
2674	* ultimately no way to reclaim the memory.
2675	*/
2676	static bool can_age_anon_pages(struct lruvec *lruvec,
2677	struct scan_control *sc)
2678	{
2679	/ Aging the anon LRU is valuable if swap is present: /
2680	if (total_swap_pages > `0`)
2681	return true;
2682
2683	/ Also valuable if anon pages can be demoted: /
2684	return can_demote(nid: lruvec_pgdat(lruvec)->node_id, sc,
2685	memcg: lruvec_memcg(lruvec));
2686	}
2687
2688	#ifdef CONFIG_LRU_GEN
2689
/*
 * lru_gen_caps holds one static key per capability; get_cap() tests one.
 * CONFIG_LRU_GEN_ENABLED selects both the keys' initial value and the
 * branch hint used by get_cap().
 */
#ifdef CONFIG_LRU_GEN_ENABLED
DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
#define get_cap(cap)	static_branch_likely(&lru_gen_caps[cap])
#else
DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
#define get_cap(cap)	static_branch_unlikely(&lru_gen_caps[cap])
#endif
2697
2698	static bool should_walk_mmu(void)
2699	{
2700	return arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK);
2701	}
2702
2703	static bool should_clear_pmd_young(void)
2704	{
2705	return arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG);
2706	}
2707
/******************************************************************************
 *                          shorthand helpers
 ******************************************************************************/

/* Declare a local max_seq snapshot read once from @lruvec. */
#define DEFINE_MAX_SEQ(lruvec)						\
	unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)

/* Declare a local min_seq[] snapshot (anon, file) read once from @lruvec. */
#define DEFINE_MIN_SEQ(lruvec)						\
	unsigned long min_seq[ANON_AND_FILE] = {			\
		READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]),	\
		READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]),	\
	}

/* Get the min/max evictable type based on swappiness */
#define min_type(swappiness) (!(swappiness))
#define max_type(swappiness) ((swappiness) < SWAPPINESS_ANON_ONLY)

/* Smaller of the min_seq values of the min and max evictable types. */
#define evictable_min_seq(min_seq, swappiness)				\
	min((min_seq)[min_type(swappiness)], (min_seq)[max_type(swappiness)])

/* Iterate over every (generation, type, zone) combination. */
#define for_each_gen_type_zone(gen, type, zone)				\
	for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++)			\
		for ((type) = 0; (type) < ANON_AND_FILE; (type)++)	\
			for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)

/* Iterate over the types that are evictable for @swappiness. */
#define for_each_evictable_type(type, swappiness)			\
	for ((type) = min_type(swappiness); (type) <= max_type(swappiness); (type)++)

/* Reduce a sequence number / bin number to an array index. */
#define get_memcg_gen(seq)	((seq) % MEMCG_NR_GENS)
#define get_memcg_bin(bin)	((bin) % MEMCG_NR_BINS)
2738
2739	static struct lruvec get_lruvec(struct* mem_cgroup memcg, int* nid)
2740	{
2741	struct pglist_data *pgdat = NODE_DATA(nid);
2742
2743	#ifdef CONFIG_MEMCG
2744	if (memcg) {
2745	struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
2746
2747	/ see the comment in mem_cgroup_lruvec() /
2748	if (!lruvec->pgdat)
2749	lruvec->pgdat = pgdat;
2750
2751	return lruvec;
2752	}
2753	#endif
2754	VM_WARN_ON_ONCE(!mem_cgroup_disabled());
2755
2756	return &pgdat->__lruvec;
2757	}
2758
2759	static int get_swappiness(struct lruvec lruvec, struct* scan_control *sc)
2760	{
2761	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
2762	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2763
2764	if (!sc->may_swap)
2765	return `0`;
2766
2767	if (!can_demote(pgdat->node_id, sc, memcg) &&
2768	mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
2769	return `0`;
2770
2771	return sc_swappiness(sc, memcg);
2772	}
2773
2774	static int get_nr_gens(struct lruvec lruvec, int* type)
2775	{
2776	return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + `1`;
2777	}
2778
2779	static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
2780	{
2781	int type;
2782
2783	for (type = `0`; type < ANON_AND_FILE; type++) {
2784	int n = get_nr_gens(lruvec, type);
2785
2786	if (n < MIN_NR_GENS \|\| n > MAX_NR_GENS)
2787	return false;
2788	}
2789
2790	return true;
2791	}
2792
/******************************************************************************
 *                          Bloom filters
 ******************************************************************************/

/*
 * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
 * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
 * bits in a bitmap, k is the number of hash functions and n is the number of
 * inserted items.
 *
 * Page table walkers use one of the two filters to reduce their search space.
 * To get rid of non-leaf entries that no longer have enough leaf entries, the
 * aging uses the double-buffering technique to flip to the other filter each
 * time it produces a new generation. For non-leaf entries that have enough
 * leaf entries, the aging carries them over to the next generation in
 * walk_pmd_range(); the eviction also report them when walking the rmap
 * in lru_gen_look_around().
 *
 * For future optimizations:
 * 1. It's not necessary to keep both filters all the time. The spare one can be
 *    freed after the RCU grace period and reallocated if needed again.
 * 2. And when reallocating, it's worth scaling its size according to the number
 *    of inserted entries in the other filter, to reduce the memory overhead on
 *    small systems and false positives on large systems.
 * 3. Jenkins' hash function is an alternative to Knuth's.
 */
#define BLOOM_FILTER_SHIFT	15
2820
2821	static inline int filter_gen_from_seq(unsigned long seq)
2822	{
2823	return seq % NR_BLOOM_FILTERS;
2824	}
2825
2826	static void get_item_key(void item, int* *key)
2827	{
2828	u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * `2`);
2829
2830	BUILD_BUG_ON(BLOOM_FILTER_SHIFT * `2` > BITS_PER_TYPE(u32));
2831
2832	key[`0`] = hash & (BIT(BLOOM_FILTER_SHIFT) - `1`);
2833	key[`1`] = hash >> BLOOM_FILTER_SHIFT;
2834	}
2835
2836	static bool test_bloom_filter(struct lru_gen_mm_state mm_state, unsigned* long seq,
2837	void *item)
2838	{
2839	int key[`2`];
2840	unsigned long *filter;
2841	int gen = filter_gen_from_seq(seq);
2842
2843	filter = READ_ONCE(mm_state->filters[gen]);
2844	if (!filter)
2845	return true;
2846
2847	get_item_key(item, key);
2848
2849	return test_bit(key[`0`], filter) && test_bit(key[`1`], filter);
2850	}
2851
2852	static void update_bloom_filter(struct lru_gen_mm_state mm_state, unsigned* long seq,
2853	void *item)
2854	{
2855	int key[`2`];
2856	unsigned long *filter;
2857	int gen = filter_gen_from_seq(seq);
2858
2859	filter = READ_ONCE(mm_state->filters[gen]);
2860	if (!filter)
2861	return;
2862
2863	get_item_key(item, key);
2864
2865	if (!test_bit(key[`0`], filter))
2866	set_bit(key[`0`], filter);
2867	if (!test_bit(key[`1`], filter))
2868	set_bit(key[`1`], filter);
2869	}
2870
2871	static void reset_bloom_filter(struct lru_gen_mm_state mm_state, unsigned* long seq)
2872	{
2873	unsigned long *filter;
2874	int gen = filter_gen_from_seq(seq);
2875
2876	filter = mm_state->filters[gen];
2877	if (filter) {
2878	bitmap_clear(filter, `0`, BIT(BLOOM_FILTER_SHIFT));
2879	return;
2880	}
2881
2882	filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
2883	__GFP_HIGH \| __GFP_NOMEMALLOC \| __GFP_NOWARN);
2884	WRITE_ONCE(mm_state->filters[gen], filter);
2885	}
2886
/******************************************************************************
 *                          mm_struct list
 ******************************************************************************/
2890
2891	#ifdef CONFIG_LRU_GEN_WALKS_MMU
2892
2893	static struct lru_gen_mm_list get_mm_list(struct* mem_cgroup *memcg)
2894	{
2895	static struct lru_gen_mm_list mm_list = {
2896	.fifo = LIST_HEAD_INIT(mm_list.fifo),
2897	.lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
2898	};
2899
2900	#ifdef CONFIG_MEMCG
2901	if (memcg)
2902	return &memcg->mm_list;
2903	#endif
2904	VM_WARN_ON_ONCE(!mem_cgroup_disabled());
2905
2906	return &mm_list;
2907	}
2908
2909	static struct lru_gen_mm_state get_mm_state(struct* lruvec *lruvec)
2910	{
2911	return &lruvec->mm_state;
2912	}
2913
2914	static struct mm_struct get_next_mm(struct* lru_gen_mm_walk *walk)
2915	{
2916	int key;
2917	struct mm_struct *mm;
2918	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
2919	struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec);
2920
2921	mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
2922	key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
2923
2924	if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
2925	return NULL;
2926
2927	clear_bit(key, &mm->lru_gen.bitmap);
2928
2929	return mmget_not_zero(mm) ? mm : NULL;
2930	}
2931
2932	void lru_gen_add_mm(struct mm_struct *mm)
2933	{
2934	int nid;
2935	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
2936	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
2937
2938	VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list));
2939	#ifdef CONFIG_MEMCG
2940	VM_WARN_ON_ONCE(mm->lru_gen.memcg);
2941	mm->lru_gen.memcg = memcg;
2942	#endif
2943	spin_lock(&mm_list->lock);
2944
2945	for_each_node_state(nid, N_MEMORY) {
2946	struct lruvec *lruvec = get_lruvec(memcg, nid);
2947	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
2948
2949	/ the first addition since the last iteration /
2950	if (mm_state->tail == &mm_list->fifo)
2951	mm_state->tail = &mm->lru_gen.list;
2952	}
2953
2954	list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
2955
2956	spin_unlock(&mm_list->lock);
2957	}
2958
2959	void lru_gen_del_mm(struct mm_struct *mm)
2960	{
2961	int nid;
2962	struct lru_gen_mm_list *mm_list;
2963	struct mem_cgroup *memcg = NULL;
2964
2965	if (list_empty(&mm->lru_gen.list))
2966	return;
2967
2968	#ifdef CONFIG_MEMCG
2969	memcg = mm->lru_gen.memcg;
2970	#endif
2971	mm_list = get_mm_list(memcg);
2972
2973	spin_lock(&mm_list->lock);
2974
2975	for_each_node(nid) {
2976	struct lruvec *lruvec = get_lruvec(memcg, nid);
2977	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
2978
2979	/ where the current iteration continues after /
2980	if (mm_state->head == &mm->lru_gen.list)
2981	mm_state->head = mm_state->head->prev;
2982
2983	/ where the last iteration ended before /
2984	if (mm_state->tail == &mm->lru_gen.list)
2985	mm_state->tail = mm_state->tail->next;
2986	}
2987
2988	list_del_init(&mm->lru_gen.list);
2989
2990	spin_unlock(&mm_list->lock);
2991
2992	#ifdef CONFIG_MEMCG
2993	mem_cgroup_put(mm->lru_gen.memcg);
2994	mm->lru_gen.memcg = NULL;
2995	#endif
2996	}
2997
#ifdef CONFIG_MEMCG
/*
 * Move @mm to the mm_list of the memcg its owner task now belongs to.
 *
 * The owner's alloc_lock must be held (asserted via lockdep). Nothing is done
 * if memory cgroups are disabled, if @mm has not been added to a list yet, or
 * if the owner's memcg is unchanged. Otherwise @mm is removed and re-added,
 * which re-resolves its memcg.
 */
void lru_gen_migrate_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;
	struct task_struct *task = rcu_dereference_protected(mm->owner, true);

	VM_WARN_ON_ONCE(task->mm != mm);
	lockdep_assert_held(&task->alloc_lock);

	/* for mm_update_next_owner() */
	if (mem_cgroup_disabled())
		return;

	/* migration can happen before addition */
	if (!mm->lru_gen.memcg)
		return;

	rcu_read_lock();
	memcg = mem_cgroup_from_task(task);
	rcu_read_unlock();
	/* the pointer is only compared below, never dereferenced */
	if (memcg == mm->lru_gen.memcg)
		return;

	VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));

	lru_gen_del_mm(mm);
	lru_gen_add_mm(mm);
}
#endif
3027
3028	#else /* !CONFIG_LRU_GEN_WALKS_MMU */
3029
3030	static struct lru_gen_mm_list get_mm_list(struct* mem_cgroup *memcg)
3031	{
3032	return NULL;
3033	}
3034
3035	static struct lru_gen_mm_state get_mm_state(struct* lruvec *lruvec)
3036	{
3037	return NULL;
3038	}
3039
3040	static struct mm_struct get_next_mm(struct* lru_gen_mm_walk *walk)
3041	{
3042	return NULL;
3043	}
3044
3045	#endif
3046
3047	static void reset_mm_stats(struct lru_gen_mm_walk *walk, bool last)
3048	{
3049	int i;
3050	int hist;
3051	struct lruvec *lruvec = walk->lruvec;
3052	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
3053
3054	lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
3055
3056	hist = lru_hist_from_seq(walk->seq);
3057
3058	for (i = `0`; i < NR_MM_STATS; i++) {
3059	WRITE_ONCE(mm_state->stats[hist][i],
3060	mm_state->stats[hist][i] + walk->mm_stats[i]);
3061	walk->mm_stats[i] = `0`;
3062	}
3063
3064	if (NR_HIST_GENS > `1` && last) {
3065	hist = lru_hist_from_seq(walk->seq + `1`);
3066
3067	for (i = `0`; i < NR_MM_STATS; i++)
3068	WRITE_ONCE(mm_state->stats[hist][i], `0`);
3069	}
3070	}
3071
3072	static bool iterate_mm_list(struct lru_gen_mm_walk walk, struct* mm_struct **iter)
3073	{
3074	bool first = false;
3075	bool last = false;
3076	struct mm_struct *mm = NULL;
3077	struct lruvec *lruvec = walk->lruvec;
3078	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
3079	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
3080	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
3081
3082	/*
3083	* mm_state->seq is incremented after each iteration of mm_list. There
3084	* are three interesting cases for this page table walker:
3085	* 1. It tries to start a new iteration with a stale max_seq: there is
3086	* nothing left to do.
3087	* 2. It started the next iteration: it needs to reset the Bloom filter
3088	* so that a fresh set of PTE tables can be recorded.
3089	* 3. It ended the current iteration: it needs to reset the mm stats
3090	* counters and tell its caller to increment max_seq.
3091	*/
3092	spin_lock(&mm_list->lock);
3093
3094	VM_WARN_ON_ONCE(mm_state->seq + `1` < walk->seq);
3095
3096	if (walk->seq <= mm_state->seq)
3097	goto done;
3098
3099	if (!mm_state->head)
3100	mm_state->head = &mm_list->fifo;
3101
3102	if (mm_state->head == &mm_list->fifo)
3103	first = true;
3104
3105	do {
3106	mm_state->head = mm_state->head->next;
3107	if (mm_state->head == &mm_list->fifo) {
3108	WRITE_ONCE(mm_state->seq, mm_state->seq + `1`);
3109	last = true;
3110	break;
3111	}
3112
3113	/ force scan for those added after the last iteration /
3114	if (!mm_state->tail \|\| mm_state->tail == mm_state->head) {
3115	mm_state->tail = mm_state->head->next;
3116	walk->force_scan = true;
3117	}
3118	} while (!(mm = get_next_mm(walk)));
3119	done:
3120	if (*iter \|\| last)
3121	reset_mm_stats(walk, last);
3122
3123	spin_unlock(&mm_list->lock);
3124
3125	if (mm && first)
3126	reset_bloom_filter(mm_state, walk->seq + `1`);
3127
3128	if (*iter)
3129	mmput_async(*iter);
3130
3131	*iter = mm;
3132
3133	return last;
3134	}
3135
3136	static bool iterate_mm_list_nowalk(struct lruvec lruvec, unsigned* long seq)
3137	{
3138	bool success = false;
3139	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
3140	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
3141	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
3142
3143	spin_lock(&mm_list->lock);
3144
3145	VM_WARN_ON_ONCE(mm_state->seq + `1` < seq);
3146
3147	if (seq > mm_state->seq) {
3148	mm_state->head = NULL;
3149	mm_state->tail = NULL;
3150	WRITE_ONCE(mm_state->seq, mm_state->seq + `1`);
3151	success = true;
3152	}
3153
3154	spin_unlock(&mm_list->lock);
3155
3156	return success;
3157	}
3158
/******************************************************************************
 *                          PID controller
 ******************************************************************************/
3162
/*
 * A feedback loop based on Proportional-Integral-Derivative (PID) controller.
 *
 * The P term is refaulted/(evicted+protected) from a tier in the generation
 * currently being evicted; the I term is the exponential moving average of the
 * P term over the generations previously evicted, using the smoothing factor
 * 1/2; the D term isn't supported.
 *
 * The setpoint (SP) is always the first tier of one type; the process variable
 * (PV) is either any tier of the other type or any other tier of the same
 * type.
 *
 * The error is the difference between the SP and the PV; the correction is to
 * turn off protection when SP>PV or turn on protection when SP<PV.
 *
 * For future optimizations:
 * 1. The D term may discount the other two terms over time so that long-lived
 *    generations can resist stale information.
 */
struct ctrl_pos {
	/* accumulated refault count */
	unsigned long refaulted;
	/* accumulated evicted plus protected count */
	unsigned long total;
	/* weight applied when two positions are compared */
	int gain;
};
3187
3188	static void read_ctrl_pos(struct lruvec lruvec, int* type, int tier, int gain,
3189	struct ctrl_pos *pos)
3190	{
3191	int i;
3192	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3193	int hist = lru_hist_from_seq(lrugen->min_seq[type]);
3194
3195	pos->gain = gain;
3196	pos->refaulted = pos->total = `0`;
3197
3198	for (i = tier % MAX_NR_TIERS; i <= min(tier, MAX_NR_TIERS - `1`); i++) {
3199	pos->refaulted += lrugen->avg_refaulted[type][i] +
3200	atomic_long_read(&lrugen->refaulted[hist][type][i]);
3201	pos->total += lrugen->avg_total[type][i] +
3202	lrugen->protected[hist][type][i] +
3203	atomic_long_read(&lrugen->evicted[hist][type][i]);
3204	}
3205	}
3206
3207	static void reset_ctrl_pos(struct lruvec lruvec, int* type, bool carryover)
3208	{
3209	int hist, tier;
3210	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3211	bool clear = carryover ? NR_HIST_GENS == `1` : NR_HIST_GENS > `1`;
3212	unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + `1`;
3213
3214	lockdep_assert_held(&lruvec->lru_lock);
3215
3216	if (!carryover && !clear)
3217	return;
3218
3219	hist = lru_hist_from_seq(seq);
3220
3221	for (tier = `0`; tier < MAX_NR_TIERS; tier++) {
3222	if (carryover) {
3223	unsigned long sum;
3224
3225	sum = lrugen->avg_refaulted[type][tier] +
3226	atomic_long_read(&lrugen->refaulted[hist][type][tier]);
3227	WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / `2`);
3228
3229	sum = lrugen->avg_total[type][tier] +
3230	lrugen->protected[hist][type][tier] +
3231	atomic_long_read(&lrugen->evicted[hist][type][tier]);
3232	WRITE_ONCE(lrugen->avg_total[type][tier], sum / `2`);
3233	}
3234
3235	if (clear) {
3236	atomic_long_set(&lrugen->refaulted[hist][type][tier], `0`);
3237	atomic_long_set(&lrugen->evicted[hist][type][tier], `0`);
3238	WRITE_ONCE(lrugen->protected[hist][type][tier], `0`);
3239	}
3240	}
3241	}
3242
3243	static bool positive_ctrl_err(struct ctrl_pos sp, struct* ctrl_pos *pv)
3244	{
3245	/*
3246	* Return true if the PV has a limited number of refaults or a lower
3247	* refaulted/total than the SP.
3248	*/
3249	return pv->refaulted < MIN_LRU_BATCH \|\|
3250	pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
3251	(sp->refaulted + `1`) * pv->total * pv->gain;
3252	}
3253
/******************************************************************************
 *                          the aging
 ******************************************************************************/
3257
3258	/ promote pages accessed through page tables /
3259	static int folio_update_gen(struct folio folio, int* gen)
3260	{
3261	unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f);
3262
3263	VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
3264
3265	/ see the comment on LRU_REFS_FLAGS /
3266	if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) {
3267	set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced));
3268	return -`1`;
3269	}
3270
3271	do {
3272	/ lru_gen_del_folio() has isolated this page? /
3273	if (!(old_flags & LRU_GEN_MASK))
3274	return -`1`;
3275
3276	new_flags = old_flags & ~(LRU_GEN_MASK \| LRU_REFS_FLAGS);
3277	new_flags \|= ((gen + `1UL`) << LRU_GEN_PGOFF) \| BIT(PG_workingset);
3278	} while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags));
3279
3280	return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - `1`;
3281	}
3282
3283	/ protect pages accessed multiple times through file descriptors /
3284	static int folio_inc_gen(struct lruvec lruvec, struct* folio *folio, bool reclaiming)
3285	{
3286	int type = folio_is_file_lru(folio);
3287	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3288	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
3289	unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f);
3290
3291	VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
3292
3293	do {
3294	new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - `1`;
3295	/ folio_update_gen() has promoted this page? /
3296	if (new_gen >= `0` && new_gen != old_gen)
3297	return new_gen;
3298
3299	new_gen = (old_gen + `1`) % MAX_NR_GENS;
3300
3301	new_flags = old_flags & ~(LRU_GEN_MASK \| LRU_REFS_FLAGS);
3302	new_flags \|= (new_gen + `1UL`) << LRU_GEN_PGOFF;
3303	/ for folio_end_writeback() /
3304	if (reclaiming)
3305	new_flags \|= BIT(PG_reclaim);
3306	} while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags));
3307
3308	lru_gen_update_size(lruvec, folio, old_gen, new_gen);
3309
3310	return new_gen;
3311	}
3312
3313	static void update_batch_size(struct lru_gen_mm_walk walk, struct* folio *folio,
3314	int old_gen, int new_gen)
3315	{
3316	int type = folio_is_file_lru(folio);
3317	int zone = folio_zonenum(folio);
3318	int delta = folio_nr_pages(folio);
3319
3320	VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS);
3321	VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS);
3322
3323	walk->batched++;
3324
3325	walk->nr_pages[old_gen][type][zone] -= delta;
3326	walk->nr_pages[new_gen][type][zone] += delta;
3327	}
3328
3329	static void reset_batch_size(struct lru_gen_mm_walk *walk)
3330	{
3331	int gen, type, zone;
3332	struct lruvec *lruvec = walk->lruvec;
3333	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3334
3335	walk->batched = `0`;
3336
3337	for_each_gen_type_zone(gen, type, zone) {
3338	enum lru_list lru = type * LRU_INACTIVE_FILE;
3339	int delta = walk->nr_pages[gen][type][zone];
3340
3341	if (!delta)
3342	continue;
3343
3344	walk->nr_pages[gen][type][zone] = `0`;
3345	WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
3346	lrugen->nr_pages[gen][type][zone] + delta);
3347
3348	if (lru_gen_is_active(lruvec, gen))
3349	lru += LRU_ACTIVE;
3350	__update_lru_size(lruvec, lru, zone, delta);
3351	}
3352	}
3353
3354	static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
3355	{
3356	struct address_space *mapping;
3357	struct vm_area_struct *vma = args->vma;
3358	struct lru_gen_mm_walk *walk = args->private;
3359
3360	if (!vma_is_accessible(vma))
3361	return true;
3362
3363	if (is_vm_hugetlb_page(vma))
3364	return true;
3365
3366	if (!vma_has_recency(vma))
3367	return true;
3368
3369	if (vma->vm_flags & (VM_LOCKED \| VM_SPECIAL))
3370	return true;
3371
3372	if (vma == get_gate_vma(vma->vm_mm))
3373	return true;
3374
3375	if (vma_is_anonymous(vma))
3376	return !walk->swappiness;
3377
3378	if (WARN_ON_ONCE(!vma->vm_file \|\| !vma->vm_file->f_mapping))
3379	return true;
3380
3381	mapping = vma->vm_file->f_mapping;
3382	if (mapping_unevictable(mapping))
3383	return true;
3384
3385	if (shmem_mapping(mapping))
3386	return !walk->swappiness;
3387
3388	if (walk->swappiness > MAX_SWAPPINESS)
3389	return true;
3390
3391	/ to exclude special mappings like dax, etc. /
3392	return !mapping->a_ops->read_folio;
3393	}
3394
3395	/*
3396	* Some userspace memory allocators map many single-page VMAs. Instead of
3397	* returning back to the PGD table for each of such VMAs, finish an entire PMD
3398	* table to reduce zigzags and improve cache performance.
3399	*/
3400	static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args,
3401	unsigned long vm_start, unsigned* long *vm_end)
3402	{
3403	unsigned long start = round_up(*vm_end, size);
3404	unsigned long end = (start \| ~mask) + `1`;
3405	VMA_ITERATOR(vmi, args->mm, start);
3406
3407	VM_WARN_ON_ONCE(mask & size);
3408	VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask));
3409
3410	for_each_vma(vmi, args->vma) {
3411	if (end && end <= args->vma->vm_start)
3412	return false;
3413
3414	if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args))
3415	continue;
3416
3417	*vm_start = max(start, args->vma->vm_start);
3418	*vm_end = min(end - `1`, args->vma->vm_end - `1`) + `1`;
3419
3420	return true;
3421	}
3422
3423	return false;
3424	}
3425
3426	static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct vma, unsigned* long addr,
3427	struct pglist_data *pgdat)
3428	{
3429	unsigned long pfn = pte_pfn(pte);
3430
3431	VM_WARN_ON_ONCE(addr < vma->vm_start \|\| addr >= vma->vm_end);
3432
3433	if (!pte_present(pte) \|\| is_zero_pfn(pfn))
3434	return -`1`;
3435
3436	if (WARN_ON_ONCE(pte_special(pte)))
3437	return -`1`;
3438
3439	if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm))
3440	return -`1`;
3441
3442	if (WARN_ON_ONCE(!pfn_valid(pfn)))
3443	return -`1`;
3444
3445	if (pfn < pgdat->node_start_pfn \|\| pfn >= pgdat_end_pfn(pgdat))
3446	return -`1`;
3447
3448	return pfn;
3449	}
3450
3451	static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct vma, unsigned* long addr,
3452	struct pglist_data *pgdat)
3453	{
3454	unsigned long pfn = pmd_pfn(pmd);
3455
3456	VM_WARN_ON_ONCE(addr < vma->vm_start \|\| addr >= vma->vm_end);
3457
3458	if (!pmd_present(pmd) \|\| is_huge_zero_pmd(pmd))
3459	return -`1`;
3460
3461	if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm))
3462	return -`1`;
3463
3464	if (WARN_ON_ONCE(!pfn_valid(pfn)))
3465	return -`1`;
3466
3467	if (pfn < pgdat->node_start_pfn \|\| pfn >= pgdat_end_pfn(pgdat))
3468	return -`1`;
3469
3470	return pfn;
3471	}
3472
3473	static struct folio get_pfn_folio(unsigned* long pfn, struct mem_cgroup *memcg,
3474	struct pglist_data *pgdat)
3475	{
3476	struct folio *folio = pfn_folio(pfn);
3477
3478	if (folio_lru_gen(folio) < `0`)
3479	return NULL;
3480
3481	if (folio_nid(folio) != pgdat->node_id)
3482	return NULL;
3483
3484	if (folio_memcg(folio) != memcg)
3485	return NULL;
3486
3487	return folio;
3488	}
3489
3490	static bool suitable_to_scan(int total, int young)
3491	{
3492	int n = clamp_t(int, cache_line_size() / sizeof(pte_t), `2`, `8`);
3493
3494	/ suitable if the average number of young PTEs per cacheline is >=1 /
3495	return young * n >= total;
3496	}
3497
3498	static void walk_update_folio(struct lru_gen_mm_walk walk, struct* folio *folio,
3499	int new_gen, bool dirty)
3500	{
3501	int old_gen;
3502
3503	if (!folio)
3504	return;
3505
3506	if (dirty && !folio_test_dirty(folio) &&
3507	!(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
3508	!folio_test_swapcache(folio)))
3509	folio_mark_dirty(folio);
3510
3511	if (walk) {
3512	old_gen = folio_update_gen(folio, new_gen);
3513	if (old_gen >= `0` && old_gen != new_gen)
3514	update_batch_size(walk, folio, old_gen, new_gen);
3515	} else if (lru_gen_set_refs(folio)) {
3516	old_gen = folio_lru_gen(folio);
3517	if (old_gen >= `0` && old_gen != new_gen)
3518	folio_activate(folio);
3519	}
3520	}
3521
3522	static bool walk_pte_range(pmd_t pmd, unsigned* long start, unsigned long end,
3523	struct mm_walk *args)
3524	{
3525	int i;
3526	bool dirty;
3527	pte_t *pte;
3528	spinlock_t *ptl;
3529	unsigned long addr;
3530	int total = `0`;
3531	int young = `0`;
3532	struct folio *last = NULL;
3533	struct lru_gen_mm_walk *walk = args->private;
3534	struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
3535	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
3536	DEFINE_MAX_SEQ(walk->lruvec);
3537	int gen = lru_gen_from_seq(max_seq);
3538	pmd_t pmdval;
3539
3540	pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl);
3541	if (!pte)
3542	return false;
3543
3544	if (!spin_trylock(ptl)) {
3545	pte_unmap(pte);
3546	return true;
3547	}
3548
3549	if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
3550	pte_unmap_unlock(pte, ptl);
3551	return false;
3552	}
3553
3554	arch_enter_lazy_mmu_mode();
3555	restart:
3556	for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
3557	unsigned long pfn;
3558	struct folio *folio;
3559	pte_t ptent = ptep_get(pte + i);
3560
3561	total++;
3562	walk->mm_stats[MM_LEAF_TOTAL]++;
3563
3564	pfn = get_pte_pfn(ptent, args->vma, addr, pgdat);
3565	if (pfn == -`1`)
3566	continue;
3567
3568	folio = get_pfn_folio(pfn, memcg, pgdat);
3569	if (!folio)
3570	continue;
3571
3572	if (!ptep_clear_young_notify(args->vma, addr, pte + i))
3573	continue;
3574
3575	if (last != folio) {
3576	walk_update_folio(walk, last, gen, dirty);
3577
3578	last = folio;
3579	dirty = false;
3580	}
3581
3582	if (pte_dirty(ptent))
3583	dirty = true;
3584
3585	young++;
3586	walk->mm_stats[MM_LEAF_YOUNG]++;
3587	}
3588
3589	walk_update_folio(walk, last, gen, dirty);
3590	last = NULL;
3591
3592	if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
3593	goto restart;
3594
3595	arch_leave_lazy_mmu_mode();
3596	pte_unmap_unlock(pte, ptl);
3597
3598	return suitable_to_scan(total, young);
3599	}
3600
3601	static void walk_pmd_range_locked(pud_t pud, unsigned* long addr, struct vm_area_struct *vma,
3602	struct mm_walk args, unsigned* long bitmap, unsigned* long *first)
3603	{
3604	int i;
3605	bool dirty;
3606	pmd_t *pmd;
3607	spinlock_t *ptl;
3608	struct folio *last = NULL;
3609	struct lru_gen_mm_walk *walk = args->private;
3610	struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
3611	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
3612	DEFINE_MAX_SEQ(walk->lruvec);
3613	int gen = lru_gen_from_seq(max_seq);
3614
3615	VM_WARN_ON_ONCE(pud_leaf(*pud));
3616
3617	/ try to batch at most 1+MIN_LRU_BATCH+1 entries /
3618	if (*first == -`1`) {
3619	*first = addr;
3620	bitmap_zero(bitmap, MIN_LRU_BATCH);
3621	return;
3622	}
3623
3624	i = addr == -`1` ? `0` : pmd_index(addr) - pmd_index(*first);
3625	if (i && i <= MIN_LRU_BATCH) {
3626	__set_bit(i - `1`, bitmap);
3627	return;
3628	}
3629
3630	pmd = pmd_offset(pud, *first);
3631
3632	ptl = pmd_lockptr(args->mm, pmd);
3633	if (!spin_trylock(ptl))
3634	goto done;
3635
3636	arch_enter_lazy_mmu_mode();
3637
3638	do {
3639	unsigned long pfn;
3640	struct folio *folio;
3641
3642	/ don't round down the first address /
3643	addr = i ? (first & PMD_MASK) + i PMD_SIZE : *first;
3644
3645	if (!pmd_present(pmd[i]))
3646	goto next;
3647
3648	if (!pmd_trans_huge(pmd[i])) {
3649	if (!walk->force_scan && should_clear_pmd_young() &&
3650	!mm_has_notifiers(args->mm))
3651	pmdp_test_and_clear_young(vma, addr, pmd + i);
3652	goto next;
3653	}
3654
3655	pfn = get_pmd_pfn(pmd[i], vma, addr, pgdat);
3656	if (pfn == -`1`)
3657	goto next;
3658
3659	folio = get_pfn_folio(pfn, memcg, pgdat);
3660	if (!folio)
3661	goto next;
3662
3663	if (!pmdp_clear_young_notify(vma, addr, pmd + i))
3664	goto next;
3665
3666	if (last != folio) {
3667	walk_update_folio(walk, last, gen, dirty);
3668
3669	last = folio;
3670	dirty = false;
3671	}
3672
3673	if (pmd_dirty(pmd[i]))
3674	dirty = true;
3675
3676	walk->mm_stats[MM_LEAF_YOUNG]++;
3677	next:
3678	i = i > MIN_LRU_BATCH ? `0` : find_next_bit(bitmap, MIN_LRU_BATCH, i) + `1`;
3679	} while (i <= MIN_LRU_BATCH);
3680
3681	walk_update_folio(walk, last, gen, dirty);
3682
3683	arch_leave_lazy_mmu_mode();
3684	spin_unlock(ptl);
3685	done:
3686	*first = -`1`;
3687	}
3688
3689	static void walk_pmd_range(pud_t pud, unsigned* long start, unsigned long end,
3690	struct mm_walk *args)
3691	{
3692	int i;
3693	pmd_t *pmd;
3694	unsigned long next;
3695	unsigned long addr;
3696	struct vm_area_struct *vma;
3697	DECLARE_BITMAP(bitmap, MIN_LRU_BATCH);
3698	unsigned long first = -`1`;
3699	struct lru_gen_mm_walk *walk = args->private;
3700	struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec);
3701
3702	VM_WARN_ON_ONCE(pud_leaf(*pud));
3703
3704	/*
3705	* Finish an entire PMD in two passes: the first only reaches to PTE
3706	* tables to avoid taking the PMD lock; the second, if necessary, takes
3707	* the PMD lock to clear the accessed bit in PMD entries.
3708	*/
3709	pmd = pmd_offset(pud, start & PUD_MASK);
3710	restart:
3711	/ walk_pte_range() may call get_next_vma() /
3712	vma = args->vma;
3713	for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
3714	pmd_t val = pmdp_get_lockless(pmd + i);
3715
3716	next = pmd_addr_end(addr, end);
3717
3718	if (!pmd_present(val) \|\| is_huge_zero_pmd(val)) {
3719	walk->mm_stats[MM_LEAF_TOTAL]++;
3720	continue;
3721	}
3722
3723	if (pmd_trans_huge(val)) {
3724	struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
3725	unsigned long pfn = get_pmd_pfn(val, vma, addr, pgdat);
3726
3727	walk->mm_stats[MM_LEAF_TOTAL]++;
3728
3729	if (pfn != -`1`)
3730	walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
3731	continue;
3732	}
3733
3734	if (!walk->force_scan && should_clear_pmd_young() &&
3735	!mm_has_notifiers(args->mm)) {
3736	if (!pmd_young(val))
3737	continue;
3738
3739	walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
3740	}
3741
3742	if (!walk->force_scan && !test_bloom_filter(mm_state, walk->seq, pmd + i))
3743	continue;
3744
3745	walk->mm_stats[MM_NONLEAF_FOUND]++;
3746
3747	if (!walk_pte_range(&val, addr, next, args))
3748	continue;
3749
3750	walk->mm_stats[MM_NONLEAF_ADDED]++;
3751
3752	/ carry over to the next generation /
3753	update_bloom_filter(mm_state, walk->seq + `1`, pmd + i);
3754	}
3755
3756	walk_pmd_range_locked(pud, -`1`, vma, args, bitmap, &first);
3757
3758	if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))
3759	goto restart;
3760	}
3761
3762	static int walk_pud_range(p4d_t p4d, unsigned* long start, unsigned long end,
3763	struct mm_walk *args)
3764	{
3765	int i;
3766	pud_t *pud;
3767	unsigned long addr;
3768	unsigned long next;
3769	struct lru_gen_mm_walk *walk = args->private;
3770
3771	VM_WARN_ON_ONCE(p4d_leaf(*p4d));
3772
3773	pud = pud_offset(p4d, start & P4D_MASK);
3774	restart:
3775	for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
3776	pud_t val = READ_ONCE(pud[i]);
3777
3778	next = pud_addr_end(addr, end);
3779
3780	if (!pud_present(val) \|\| WARN_ON_ONCE(pud_leaf(val)))
3781	continue;
3782
3783	walk_pmd_range(&val, addr, next, args);
3784
3785	if (need_resched() \|\| walk->batched >= MAX_LRU_BATCH) {
3786	end = (addr \| ~PUD_MASK) + `1`;
3787	goto done;
3788	}
3789	}
3790
3791	if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end))
3792	goto restart;
3793
3794	end = round_up(end, P4D_SIZE);
3795	done:
3796	if (!end \|\| !args->vma)
3797	return `1`;
3798
3799	walk->next_addr = max(end, args->vma->vm_start);
3800
3801	return -EAGAIN;
3802	}
3803
3804	static void walk_mm(struct mm_struct mm, struct* lru_gen_mm_walk *walk)
3805	{
3806	static const struct mm_walk_ops mm_walk_ops = {
3807	.test_walk = should_skip_vma,
3808	.p4d_entry = walk_pud_range,
3809	.walk_lock = PGWALK_RDLOCK,
3810	};
3811	int err;
3812	struct lruvec *lruvec = walk->lruvec;
3813
3814	walk->next_addr = FIRST_USER_ADDRESS;
3815
3816	do {
3817	DEFINE_MAX_SEQ(lruvec);
3818
3819	err = -EBUSY;
3820
3821	/ another thread might have called inc_max_seq() /
3822	if (walk->seq != max_seq)
3823	break;
3824
3825	/ the caller might be holding the lock for write /
3826	if (mmap_read_trylock(mm)) {
3827	err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
3828
3829	mmap_read_unlock(mm);
3830	}
3831
3832	if (walk->batched) {
3833	spin_lock_irq(&lruvec->lru_lock);
3834	reset_batch_size(walk);
3835	spin_unlock_irq(&lruvec->lru_lock);
3836	}
3837
3838	cond_resched();
3839	} while (err == -EAGAIN);
3840	}
3841
3842	static struct lru_gen_mm_walk set_mm_walk(struct* pglist_data *pgdat, bool force_alloc)
3843	{
3844	struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
3845
3846	if (pgdat && current_is_kswapd()) {
3847	VM_WARN_ON_ONCE(walk);
3848
3849	walk = &pgdat->mm_walk;
3850	} else if (!walk && force_alloc) {
3851	VM_WARN_ON_ONCE(current_is_kswapd());
3852
3853	walk = kzalloc(sizeof(*walk), __GFP_HIGH \| __GFP_NOMEMALLOC \| __GFP_NOWARN);
3854	}
3855
3856	current->reclaim_state->mm_walk = walk;
3857
3858	return walk;
3859	}
3860
3861	static void clear_mm_walk(void)
3862	{
3863	struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
3864
3865	VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, `0`, sizeof(walk->nr_pages)));
3866	VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, `0`, sizeof(walk->mm_stats)));
3867
3868	current->reclaim_state->mm_walk = NULL;
3869
3870	if (!current_is_kswapd())
3871	kfree(walk);
3872	}
3873
3874	static bool inc_min_seq(struct lruvec lruvec, int* type, int swappiness)
3875	{
3876	int zone;
3877	int remaining = MAX_LRU_BATCH;
3878	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3879	int hist = lru_hist_from_seq(lrugen->min_seq[type]);
3880	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
3881
3882	/ For file type, skip the check if swappiness is anon only /
3883	if (type && (swappiness == SWAPPINESS_ANON_ONLY))
3884	goto done;
3885
3886	/ For anon type, skip the check if swappiness is zero (file only) /
3887	if (!type && !swappiness)
3888	goto done;
3889
3890	/ prevent cold/hot inversion if the type is evictable /
3891	for (zone = `0`; zone < MAX_NR_ZONES; zone++) {
3892	struct list_head *head = &lrugen->folios[old_gen][type][zone];
3893
3894	while (!list_empty(head)) {
3895	struct folio *folio = lru_to_folio(head);
3896	int refs = folio_lru_refs(folio);
3897	bool workingset = folio_test_workingset(folio);
3898
3899	VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
3900	VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
3901	VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
3902	VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
3903
3904	new_gen = folio_inc_gen(lruvec, folio, false);
3905	list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
3906
3907	/ don't count the workingset being lazily promoted /
3908	if (refs + workingset != BIT(LRU_REFS_WIDTH) + `1`) {
3909	int tier = lru_tier_from_refs(refs, workingset);
3910	int delta = folio_nr_pages(folio);
3911
3912	WRITE_ONCE(lrugen->protected[hist][type][tier],
3913	lrugen->protected[hist][type][tier] + delta);
3914	}
3915
3916	if (!--remaining)
3917	return false;
3918	}
3919	}
3920	done:
3921	reset_ctrl_pos(lruvec, type, true);
3922	WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + `1`);
3923
3924	return true;
3925	}
3926
3927	static bool try_to_inc_min_seq(struct lruvec lruvec, int* swappiness)
3928	{
3929	int gen, type, zone;
3930	bool success = false;
3931	bool seq_inc_flag = false;
3932	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3933	DEFINE_MIN_SEQ(lruvec);
3934
3935	VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
3936
3937	/ find the oldest populated generation /
3938	for_each_evictable_type(type, swappiness) {
3939	while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
3940	gen = lru_gen_from_seq(min_seq[type]);
3941
3942	for (zone = `0`; zone < MAX_NR_ZONES; zone++) {
3943	if (!list_empty(&lrugen->folios[gen][type][zone]))
3944	goto next;
3945	}
3946
3947	min_seq[type]++;
3948	seq_inc_flag = true;
3949	}
3950	next:
3951	;
3952	}
3953
3954	/*
3955	* If min_seq[type] of both anonymous and file is not increased,
3956	* we can directly return false to avoid unnecessary checking
3957	* overhead later.
3958	*/
3959	if (!seq_inc_flag)
3960	return success;
3961
3962	/ see the comment on lru_gen_folio /
3963	if (swappiness && swappiness <= MAX_SWAPPINESS) {
3964	unsigned long seq = lrugen->max_seq - MIN_NR_GENS;
3965
3966	if (min_seq[LRU_GEN_ANON] > seq && min_seq[LRU_GEN_FILE] < seq)
3967	min_seq[LRU_GEN_ANON] = seq;
3968	else if (min_seq[LRU_GEN_FILE] > seq && min_seq[LRU_GEN_ANON] < seq)
3969	min_seq[LRU_GEN_FILE] = seq;
3970	}
3971
3972	for_each_evictable_type(type, swappiness) {
3973	if (min_seq[type] <= lrugen->min_seq[type])
3974	continue;
3975
3976	reset_ctrl_pos(lruvec, type, true);
3977	WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
3978	success = true;
3979	}
3980
3981	return success;
3982	}
3983
3984	static bool inc_max_seq(struct lruvec lruvec, unsigned* long seq, int swappiness)
3985	{
3986	bool success;
3987	int prev, next;
3988	int type, zone;
3989	struct lru_gen_folio *lrugen = &lruvec->lrugen;
3990	restart:
3991	if (seq < READ_ONCE(lrugen->max_seq))
3992	return false;
3993
3994	spin_lock_irq(&lruvec->lru_lock);
3995
3996	VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
3997
3998	success = seq == lrugen->max_seq;
3999	if (!success)
4000	goto unlock;
4001
4002	for (type = `0`; type < ANON_AND_FILE; type++) {
4003	if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
4004	continue;
4005
4006	if (inc_min_seq(lruvec, type, swappiness))
4007	continue;
4008
4009	spin_unlock_irq(&lruvec->lru_lock);
4010	cond_resched();
4011	goto restart;
4012	}
4013
4014	/*
4015	* Update the active/inactive LRU sizes for compatibility. Both sides of
4016	* the current max_seq need to be covered, since max_seq+1 can overlap
4017	* with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do
4018	* overlap, cold/hot inversion happens.
4019	*/
4020	prev = lru_gen_from_seq(lrugen->max_seq - `1`);
4021	next = lru_gen_from_seq(lrugen->max_seq + `1`);
4022
4023	for (type = `0`; type < ANON_AND_FILE; type++) {
4024	for (zone = `0`; zone < MAX_NR_ZONES; zone++) {
4025	enum lru_list lru = type * LRU_INACTIVE_FILE;
4026	long delta = lrugen->nr_pages[prev][type][zone] -
4027	lrugen->nr_pages[next][type][zone];
4028
4029	if (!delta)
4030	continue;
4031
4032	__update_lru_size(lruvec, lru, zone, delta);
4033	__update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
4034	}
4035	}
4036
4037	for (type = `0`; type < ANON_AND_FILE; type++)
4038	reset_ctrl_pos(lruvec, type, false);
4039
4040	WRITE_ONCE(lrugen->timestamps[next], jiffies);
4041	/ make sure preceding modifications appear /
4042	smp_store_release(&lrugen->max_seq, lrugen->max_seq + `1`);
4043	unlock:
4044	spin_unlock_irq(&lruvec->lru_lock);
4045
4046	return success;
4047	}
4048
4049	static bool try_to_inc_max_seq(struct lruvec lruvec, unsigned* long seq,
4050	int swappiness, bool force_scan)
4051	{
4052	bool success;
4053	struct lru_gen_mm_walk *walk;
4054	struct mm_struct *mm = NULL;
4055	struct lru_gen_folio *lrugen = &lruvec->lrugen;
4056	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
4057
4058	VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq));
4059
4060	if (!mm_state)
4061	return inc_max_seq(lruvec, seq, swappiness);
4062
4063	/ see the comment in iterate_mm_list() /
4064	if (seq <= READ_ONCE(mm_state->seq))
4065	return false;
4066
4067	/*
4068	* If the hardware doesn't automatically set the accessed bit, fallback
4069	* to lru_gen_look_around(), which only clears the accessed bit in a
4070	* handful of PTEs. Spreading the work out over a period of time usually
4071	* is less efficient, but it avoids bursty page faults.
4072	*/
4073	if (!should_walk_mmu()) {
4074	success = iterate_mm_list_nowalk(lruvec, seq);
4075	goto done;
4076	}
4077
4078	walk = set_mm_walk(NULL, true);
4079	if (!walk) {
4080	success = iterate_mm_list_nowalk(lruvec, seq);
4081	goto done;
4082	}
4083
4084	walk->lruvec = lruvec;
4085	walk->seq = seq;
4086	walk->swappiness = swappiness;
4087	walk->force_scan = force_scan;
4088
4089	do {
4090	success = iterate_mm_list(walk, &mm);
4091	if (mm)
4092	walk_mm(mm, walk);
4093	} while (mm);
4094	done:
4095	if (success) {
4096	success = inc_max_seq(lruvec, seq, swappiness);
4097	WARN_ON_ONCE(!success);
4098	}
4099
4100	return success;
4101	}
4102
4103	/******************************************************************************
4104	* working set protection
4105	******************************************************************************/
4106
4107	static void set_initial_priority(struct pglist_data pgdat, struct* scan_control *sc)
4108	{
4109	int priority;
4110	unsigned long reclaimable;
4111
4112	if (sc->priority != DEF_PRIORITY \|\| sc->nr_to_reclaim < MIN_LRU_BATCH)
4113	return;
4114	/*
4115	* Determine the initial priority based on
4116	* (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
4117	* where reclaimed_to_scanned_ratio = inactive / total.
4118	*/
4119	reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
4120	if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
4121	reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
4122
4123	/ round down reclaimable and round up sc->nr_to_reclaim /
4124	priority = fls_long(reclaimable) - `1` - fls_long(sc->nr_to_reclaim - `1`);
4125
4126	/*
4127	* The estimation is based on LRU pages only, so cap it to prevent
4128	* overshoots of shrinker objects by large margins.
4129	*/
4130	sc->priority = clamp(priority, DEF_PRIORITY / `2`, DEF_PRIORITY);
4131	}
4132
4133	static bool lruvec_is_sizable(struct lruvec lruvec, struct* scan_control *sc)
4134	{
4135	int gen, type, zone;
4136	unsigned long total = `0`;
4137	int swappiness = get_swappiness(lruvec, sc);
4138	struct lru_gen_folio *lrugen = &lruvec->lrugen;
4139	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4140	DEFINE_MAX_SEQ(lruvec);
4141	DEFINE_MIN_SEQ(lruvec);
4142
4143	for_each_evictable_type(type, swappiness) {
4144	unsigned long seq;
4145
4146	for (seq = min_seq[type]; seq <= max_seq; seq++) {
4147	gen = lru_gen_from_seq(seq);
4148
4149	for (zone = `0`; zone < MAX_NR_ZONES; zone++)
4150	total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), `0L`);
4151	}
4152	}
4153
4154	/ whether the size is big enough to be helpful /
4155	return mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
4156	}
4157
4158	static bool lruvec_is_reclaimable(struct lruvec lruvec, struct* scan_control *sc,
4159	unsigned long min_ttl)
4160	{
4161	int gen;
4162	unsigned long birth;
4163	int swappiness = get_swappiness(lruvec, sc);
4164	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4165	DEFINE_MIN_SEQ(lruvec);
4166
4167	if (mem_cgroup_below_min(NULL, memcg))
4168	return false;
4169
4170	if (!lruvec_is_sizable(lruvec, sc))
4171	return false;
4172
4173	gen = lru_gen_from_seq(evictable_min_seq(min_seq, swappiness));
4174	birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
4175
4176	return time_is_before_jiffies(birth + min_ttl);
4177	}
4178
/*
 * to protect the working set of the last N jiffies
 *
 * Read with READ_ONCE() by lru_gen_age_node(); a value of zero makes that
 * function skip its OOM-kill path.
 */
static unsigned long lru_gen_min_ttl __read_mostly;
4181
4182	static void lru_gen_age_node(struct pglist_data pgdat, struct* scan_control *sc)
4183	{
4184	struct mem_cgroup *memcg;
4185	unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
4186	bool reclaimable = !min_ttl;
4187
4188	VM_WARN_ON_ONCE(!current_is_kswapd());
4189
4190	set_initial_priority(pgdat, sc);
4191
4192	memcg = mem_cgroup_iter(NULL, NULL, NULL);
4193	do {
4194	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
4195
4196	mem_cgroup_calculate_protection(NULL, memcg);
4197
4198	if (!reclaimable)
4199	reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
4200	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
4201
4202	/*
4203	* The main goal is to OOM kill if every generation from all memcgs is
4204	* younger than min_ttl. However, another possibility is all memcgs are
4205	* either too small or below min.
4206	*/
4207	if (!reclaimable && mutex_trylock(&oom_lock)) {
4208	struct oom_control oc = {
4209	.gfp_mask = sc->gfp_mask,
4210	};
4211
4212	out_of_memory(&oc);
4213
4214	mutex_unlock(&oom_lock);
4215	}
4216	}
4217
4218	/******************************************************************************
4219	* rmap/PT walk feedback
4220	******************************************************************************/
4221
4222	/*
4223	* This function exploits spatial locality when shrink_folio_list() walks the
4224	* rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
4225	* the scan was done cacheline efficiently, it adds the PMD entry pointing to
4226	* the PTE table to the Bloom filter. This forms a feedback loop between the
4227	* eviction and the aging.
4228	*/
4229	bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
4230	{
4231	int i;
4232	bool dirty;
4233	unsigned long start;
4234	unsigned long end;
4235	struct lru_gen_mm_walk *walk;
4236	struct folio *last = NULL;
4237	int young = `1`;
4238	pte_t *pte = pvmw->pte;
4239	unsigned long addr = pvmw->address;
4240	struct vm_area_struct *vma = pvmw->vma;
4241	struct folio *folio = pfn_folio(pvmw->pfn);
4242	struct mem_cgroup *memcg = folio_memcg(folio);
4243	struct pglist_data *pgdat = folio_pgdat(folio);
4244	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
4245	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
4246	DEFINE_MAX_SEQ(lruvec);
4247	int gen = lru_gen_from_seq(max_seq);
4248
4249	lockdep_assert_held(pvmw->ptl);
4250	VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
4251
4252	if (!ptep_clear_young_notify(vma, addr, pte))
4253	return false;
4254
4255	if (spin_is_contended(pvmw->ptl))
4256	return true;
4257
4258	/ exclude special VMAs containing anon pages from COW /
4259	if (vma->vm_flags & VM_SPECIAL)
4260	return true;
4261
4262	/ avoid taking the LRU lock under the PTL when possible /
4263	walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
4264
4265	start = max(addr & PMD_MASK, vma->vm_start);
4266	end = min(addr \| ~PMD_MASK, vma->vm_end - `1`) + `1`;
4267
4268	if (end - start == PAGE_SIZE)
4269	return true;
4270
4271	if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
4272	if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / `2`)
4273	end = start + MIN_LRU_BATCH * PAGE_SIZE;
4274	else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / `2`)
4275	start = end - MIN_LRU_BATCH * PAGE_SIZE;
4276	else {
4277	start = addr - MIN_LRU_BATCH * PAGE_SIZE / `2`;
4278	end = addr + MIN_LRU_BATCH * PAGE_SIZE / `2`;
4279	}
4280	}
4281
4282	arch_enter_lazy_mmu_mode();
4283
4284	pte -= (addr - start) / PAGE_SIZE;
4285
4286	for (i = `0`, addr = start; addr != end; i++, addr += PAGE_SIZE) {
4287	unsigned long pfn;
4288	pte_t ptent = ptep_get(pte + i);
4289
4290	pfn = get_pte_pfn(ptent, vma, addr, pgdat);
4291	if (pfn == -`1`)
4292	continue;
4293
4294	folio = get_pfn_folio(pfn, memcg, pgdat);
4295	if (!folio)
4296	continue;
4297
4298	if (!ptep_clear_young_notify(vma, addr, pte + i))
4299	continue;
4300
4301	if (last != folio) {
4302	walk_update_folio(walk, last, gen, dirty);
4303
4304	last = folio;
4305	dirty = false;
4306	}
4307
4308	if (pte_dirty(ptent))
4309	dirty = true;
4310
4311	young++;
4312	}
4313
4314	walk_update_folio(walk, last, gen, dirty);
4315
4316	arch_leave_lazy_mmu_mode();
4317
4318	/ feedback from rmap walkers to page table walkers /
4319	if (mm_state && suitable_to_scan(i, young))
4320	update_bloom_filter(mm_state, max_seq, pvmw->pmd);
4321
4322	return true;
4323	}
4324
4325	/******************************************************************************
4326	* memcg LRU
4327	******************************************************************************/
4328
/*
 * see the comment on MEMCG_NR_GENS
 *
 * Operations accepted by lru_gen_rotate_memcg(); shrink_one() returns one of
 * these to tell shrink_many() how to rotate the memcg it just processed.
 */
enum {
	/* zero: callers test "if (op)" and skip the rotation */
	MEMCG_LRU_NOP,
	/* keep the generation, set seg to HEAD, re-add at the head of a bin */
	MEMCG_LRU_HEAD,
	/* keep the generation, set seg to TAIL, re-add at the tail of a bin */
	MEMCG_LRU_TAIL,
	/* move to the generation of memcg_lru.seq, add at the head of a bin */
	MEMCG_LRU_OLD,
	/* move to the generation of memcg_lru.seq + 1, add at the tail of a bin */
	MEMCG_LRU_YOUNG,
};
4337
4338	static void lru_gen_rotate_memcg(struct lruvec lruvec, int* op)
4339	{
4340	int seg;
4341	int old, new;
4342	unsigned long flags;
4343	int bin = get_random_u32_below(MEMCG_NR_BINS);
4344	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
4345
4346	spin_lock_irqsave(&pgdat->memcg_lru.lock, flags);
4347
4348	VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
4349
4350	seg = `0`;
4351	new = old = lruvec->lrugen.gen;
4352
4353	/ see the comment on MEMCG_NR_GENS /
4354	if (op == MEMCG_LRU_HEAD)
4355	seg = MEMCG_LRU_HEAD;
4356	else if (op == MEMCG_LRU_TAIL)
4357	seg = MEMCG_LRU_TAIL;
4358	else if (op == MEMCG_LRU_OLD)
4359	new = get_memcg_gen(pgdat->memcg_lru.seq);
4360	else if (op == MEMCG_LRU_YOUNG)
4361	new = get_memcg_gen(pgdat->memcg_lru.seq + `1`);
4362	else
4363	VM_WARN_ON_ONCE(true);
4364
4365	WRITE_ONCE(lruvec->lrugen.seg, seg);
4366	WRITE_ONCE(lruvec->lrugen.gen, new);
4367
4368	hlist_nulls_del_rcu(&lruvec->lrugen.list);
4369
4370	if (op == MEMCG_LRU_HEAD \|\| op == MEMCG_LRU_OLD)
4371	hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
4372	else
4373	hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
4374
4375	pgdat->memcg_lru.nr_memcgs[old]--;
4376	pgdat->memcg_lru.nr_memcgs[new]++;
4377
4378	if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
4379	WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + `1`);
4380
4381	spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags);
4382	}
4383
4384	#ifdef CONFIG_MEMCG
4385
4386	void lru_gen_online_memcg(struct mem_cgroup *memcg)
4387	{
4388	int gen;
4389	int nid;
4390	int bin = get_random_u32_below(MEMCG_NR_BINS);
4391
4392	for_each_node(nid) {
4393	struct pglist_data *pgdat = NODE_DATA(nid);
4394	struct lruvec *lruvec = get_lruvec(memcg, nid);
4395
4396	spin_lock_irq(&pgdat->memcg_lru.lock);
4397
4398	VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
4399
4400	gen = get_memcg_gen(pgdat->memcg_lru.seq);
4401
4402	lruvec->lrugen.gen = gen;
4403
4404	hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
4405	pgdat->memcg_lru.nr_memcgs[gen]++;
4406
4407	spin_unlock_irq(&pgdat->memcg_lru.lock);
4408	}
4409	}
4410
4411	void lru_gen_offline_memcg(struct mem_cgroup *memcg)
4412	{
4413	int nid;
4414
4415	for_each_node(nid) {
4416	struct lruvec *lruvec = get_lruvec(memcg, nid);
4417
4418	lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
4419	}
4420	}
4421
4422	void lru_gen_release_memcg(struct mem_cgroup *memcg)
4423	{
4424	int gen;
4425	int nid;
4426
4427	for_each_node(nid) {
4428	struct pglist_data *pgdat = NODE_DATA(nid);
4429	struct lruvec *lruvec = get_lruvec(memcg, nid);
4430
4431	spin_lock_irq(&pgdat->memcg_lru.lock);
4432
4433	if (hlist_nulls_unhashed(&lruvec->lrugen.list))
4434	goto unlock;
4435
4436	gen = lruvec->lrugen.gen;
4437
4438	hlist_nulls_del_init_rcu(&lruvec->lrugen.list);
4439	pgdat->memcg_lru.nr_memcgs[gen]--;
4440
4441	if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
4442	WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + `1`);
4443	unlock:
4444	spin_unlock_irq(&pgdat->memcg_lru.lock);
4445	}
4446	}
4447
4448	void lru_gen_soft_reclaim(struct mem_cgroup memcg, int* nid)
4449	{
4450	struct lruvec *lruvec = get_lruvec(memcg, nid);
4451
4452	/ see the comment on MEMCG_NR_GENS /
4453	if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_HEAD)
4454	lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
4455	}
4456
4457	#endif /* CONFIG_MEMCG */
4458
4459	/******************************************************************************
4460	* the eviction
4461	******************************************************************************/
4462
4463	static bool sort_folio(struct lruvec lruvec, struct* folio folio, struct* scan_control *sc,
4464	int tier_idx)
4465	{
4466	bool success;
4467	bool dirty, writeback;
4468	int gen = folio_lru_gen(folio);
4469	int type = folio_is_file_lru(folio);
4470	int zone = folio_zonenum(folio);
4471	int delta = folio_nr_pages(folio);
4472	int refs = folio_lru_refs(folio);
4473	bool workingset = folio_test_workingset(folio);
4474	int tier = lru_tier_from_refs(refs, workingset);
4475	struct lru_gen_folio *lrugen = &lruvec->lrugen;
4476
4477	VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
4478
4479	/ unevictable /
4480	if (!folio_evictable(folio)) {
4481	success = lru_gen_del_folio(lruvec, folio, true);
4482	VM_WARN_ON_ONCE_FOLIO(!success, folio);
4483	folio_set_unevictable(folio);
4484	lruvec_add_folio(lruvec, folio);
4485	__count_vm_events(UNEVICTABLE_PGCULLED, delta);
4486	return true;
4487	}
4488
4489	/ promoted /
4490	if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
4491	list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
4492	return true;
4493	}
4494
4495	/ protected /
4496	if (tier > tier_idx \|\| refs + workingset == BIT(LRU_REFS_WIDTH) + `1`) {
4497	gen = folio_inc_gen(lruvec, folio, false);
4498	list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
4499
4500	/ don't count the workingset being lazily promoted /
4501	if (refs + workingset != BIT(LRU_REFS_WIDTH) + `1`) {
4502	int hist = lru_hist_from_seq(lrugen->min_seq[type]);
4503
4504	WRITE_ONCE(lrugen->protected[hist][type][tier],
4505	lrugen->protected[hist][type][tier] + delta);
4506	}
4507	return true;
4508	}
4509
4510	/ ineligible /
4511	if (zone > sc->reclaim_idx) {
4512	gen = folio_inc_gen(lruvec, folio, false);
4513	list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
4514	return true;
4515	}
4516
4517	dirty = folio_test_dirty(folio);
4518	writeback = folio_test_writeback(folio);
4519	if (type == LRU_GEN_FILE && dirty) {
4520	sc->nr.file_taken += delta;
4521	if (!writeback)
4522	sc->nr.unqueued_dirty += delta;
4523	}
4524
4525	/ waiting for writeback /
4526	if (writeback \|\| (type == LRU_GEN_FILE && dirty)) {
4527	gen = folio_inc_gen(lruvec, folio, true);
4528	list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
4529	return true;
4530	}
4531
4532	return false;
4533	}
4534
4535	static bool isolate_folio(struct lruvec lruvec, struct* folio folio, struct* scan_control *sc)
4536	{
4537	bool success;
4538
4539	/ swap constrained /
4540	if (!(sc->gfp_mask & __GFP_IO) &&
4541	(folio_test_dirty(folio) \|\|
4542	(folio_test_anon(folio) && !folio_test_swapcache(folio))))
4543	return false;
4544
4545	/ raced with release_pages() /
4546	if (!folio_try_get(folio))
4547	return false;
4548
4549	/ raced with another isolation /
4550	if (!folio_test_clear_lru(folio)) {
4551	folio_put(folio);
4552	return false;
4553	}
4554
4555	/ see the comment on LRU_REFS_FLAGS /
4556	if (!folio_test_referenced(folio))
4557	set_mask_bits(&folio->flags.f, LRU_REFS_MASK, `0`);
4558
4559	/ for shrink_folio_list() /
4560	folio_clear_reclaim(folio);
4561
4562	success = lru_gen_del_folio(lruvec, folio, true);
4563	VM_WARN_ON_ONCE_FOLIO(!success, folio);
4564
4565	return true;
4566	}
4567
4568	static int scan_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
4569	struct scan_control sc, int* type, int tier,
4570	struct list_head *list)
4571	{
4572	int i;
4573	int gen;
4574	enum vm_event_item item;
4575	int sorted = `0`;
4576	int scanned = `0`;
4577	int isolated = `0`;
4578	int skipped = `0`;
4579	int remaining = min(nr_to_scan, MAX_LRU_BATCH);
4580	struct lru_gen_folio *lrugen = &lruvec->lrugen;
4581	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4582
4583	VM_WARN_ON_ONCE(!list_empty(list));
4584
4585	if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
4586	return `0`;
4587
4588	gen = lru_gen_from_seq(lrugen->min_seq[type]);
4589
4590	for (i = MAX_NR_ZONES; i > `0`; i--) {
4591	LIST_HEAD(moved);
4592	int skipped_zone = `0`;
4593	int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
4594	struct list_head *head = &lrugen->folios[gen][type][zone];
4595
4596	while (!list_empty(head)) {
4597	struct folio *folio = lru_to_folio(head);
4598	int delta = folio_nr_pages(folio);
4599
4600	VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
4601	VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
4602	VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
4603	VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
4604
4605	scanned += delta;
4606
4607	if (sort_folio(lruvec, folio, sc, tier))
4608	sorted += delta;
4609	else if (isolate_folio(lruvec, folio, sc)) {
4610	list_add(&folio->lru, list);
4611	isolated += delta;
4612	} else {
4613	list_move(&folio->lru, &moved);
4614	skipped_zone += delta;
4615	}
4616
4617	if (!--remaining \|\| max(isolated, skipped_zone) >= MIN_LRU_BATCH)
4618	break;
4619	}
4620
4621	if (skipped_zone) {
4622	list_splice(&moved, head);
4623	__count_zid_vm_events(PGSCAN_SKIP, zone, skipped_zone);
4624	skipped += skipped_zone;
4625	}
4626
4627	if (!remaining \|\| isolated >= MIN_LRU_BATCH)
4628	break;
4629	}
4630
4631	item = PGSCAN_KSWAPD + reclaimer_offset(sc);
4632	if (!cgroup_reclaim(sc)) {
4633	__count_vm_events(item, isolated);
4634	__count_vm_events(PGREFILL, sorted);
4635	}
4636	count_memcg_events(memcg, item, isolated);
4637	count_memcg_events(memcg, PGREFILL, sorted);
4638	__count_vm_events(PGSCAN_ANON + type, isolated);
4639	trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH,
4640	scanned, skipped, isolated,
4641	type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
4642	if (type == LRU_GEN_FILE)
4643	sc->nr.file_taken += isolated;
4644	/*
4645	* There might not be eligible folios due to reclaim_idx. Check the
4646	* remaining to prevent livelock if it's not making progress.
4647	*/
4648	return isolated \|\| !remaining ? scanned : `0`;
4649	}
4650
4651	static int get_tier_idx(struct lruvec lruvec, int* type)
4652	{
4653	int tier;
4654	struct ctrl_pos sp, pv;
4655
4656	/*
4657	* To leave a margin for fluctuations, use a larger gain factor (2:3).
4658	* This value is chosen because any other tier would have at least twice
4659	* as many refaults as the first tier.
4660	*/
4661	read_ctrl_pos(lruvec, type, `0`, `2`, &sp);
4662	for (tier = `1`; tier < MAX_NR_TIERS; tier++) {
4663	read_ctrl_pos(lruvec, type, tier, `3`, &pv);
4664	if (!positive_ctrl_err(&sp, &pv))
4665	break;
4666	}
4667
4668	return tier - `1`;
4669	}
4670
4671	static int get_type_to_scan(struct lruvec lruvec, int* swappiness)
4672	{
4673	struct ctrl_pos sp, pv;
4674
4675	if (swappiness <= MIN_SWAPPINESS + `1`)
4676	return LRU_GEN_FILE;
4677
4678	if (swappiness >= MAX_SWAPPINESS)
4679	return LRU_GEN_ANON;
4680	/*
4681	* Compare the sum of all tiers of anon with that of file to determine
4682	* which type to scan.
4683	*/
4684	read_ctrl_pos(lruvec, LRU_GEN_ANON, MAX_NR_TIERS, swappiness, &sp);
4685	read_ctrl_pos(lruvec, LRU_GEN_FILE, MAX_NR_TIERS, MAX_SWAPPINESS - swappiness, &pv);
4686
4687	return positive_ctrl_err(&sp, &pv);
4688	}
4689
4690	static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
4691	struct scan_control sc, int* swappiness,
4692	int type_scanned, struct* list_head *list)
4693	{
4694	int i;
4695	int type = get_type_to_scan(lruvec, swappiness);
4696
4697	for_each_evictable_type(i, swappiness) {
4698	int scanned;
4699	int tier = get_tier_idx(lruvec, type);
4700
4701	*type_scanned = type;
4702
4703	scanned = scan_folios(nr_to_scan, lruvec, sc, type, tier, list);
4704	if (scanned)
4705	return scanned;
4706
4707	type = !type;
4708	}
4709
4710	return `0`;
4711	}
4712
4713	static int evict_folios(unsigned long nr_to_scan, struct lruvec *lruvec,
4714	struct scan_control sc, int* swappiness)
4715	{
4716	int type;
4717	int scanned;
4718	int reclaimed;
4719	LIST_HEAD(list);
4720	LIST_HEAD(clean);
4721	struct folio *folio;
4722	struct folio *next;
4723	enum vm_event_item item;
4724	struct reclaim_stat stat;
4725	struct lru_gen_mm_walk *walk;
4726	bool skip_retry = false;
4727	struct lru_gen_folio *lrugen = &lruvec->lrugen;
4728	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4729	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
4730
4731	spin_lock_irq(&lruvec->lru_lock);
4732
4733	scanned = isolate_folios(nr_to_scan, lruvec, sc, swappiness, &type, &list);
4734
4735	scanned += try_to_inc_min_seq(lruvec, swappiness);
4736
4737	if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq)
4738	scanned = `0`;
4739
4740	spin_unlock_irq(&lruvec->lru_lock);
4741
4742	if (list_empty(&list))
4743	return scanned;
4744	retry:
4745	reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false, memcg);
4746	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
4747	sc->nr_reclaimed += reclaimed;
4748	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
4749	scanned, reclaimed, &stat, sc->priority,
4750	type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON);
4751
4752	list_for_each_entry_safe_reverse(folio, next, &list, lru) {
4753	DEFINE_MIN_SEQ(lruvec);
4754
4755	if (!folio_evictable(folio)) {
4756	list_del(&folio->lru);
4757	folio_putback_lru(folio);
4758	continue;
4759	}
4760
4761	/ retry folios that may have missed folio_rotate_reclaimable() /
4762	if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) &&
4763	!folio_test_dirty(folio) && !folio_test_writeback(folio)) {
4764	list_move(&folio->lru, &clean);
4765	continue;
4766	}
4767
4768	/ don't add rejected folios to the oldest generation /
4769	if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type])
4770	set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active));
4771	}
4772
4773	spin_lock_irq(&lruvec->lru_lock);
4774
4775	move_folios_to_lru(lruvec, &list);
4776
4777	walk = current->reclaim_state->mm_walk;
4778	if (walk && walk->batched) {
4779	walk->lruvec = lruvec;
4780	reset_batch_size(walk);
4781	}
4782
4783	__mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
4784	stat.nr_demoted);
4785
4786	item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
4787	if (!cgroup_reclaim(sc))
4788	__count_vm_events(item, reclaimed);
4789	count_memcg_events(memcg, item, reclaimed);
4790	__count_vm_events(PGSTEAL_ANON + type, reclaimed);
4791
4792	spin_unlock_irq(&lruvec->lru_lock);
4793
4794	list_splice_init(&clean, &list);
4795
4796	if (!list_empty(&list)) {
4797	skip_retry = true;
4798	goto retry;
4799	}
4800
4801	return scanned;
4802	}
4803
4804	static bool should_run_aging(struct lruvec lruvec, unsigned* long max_seq,
4805	int swappiness, unsigned long *nr_to_scan)
4806	{
4807	int gen, type, zone;
4808	unsigned long size = `0`;
4809	struct lru_gen_folio *lrugen = &lruvec->lrugen;
4810	DEFINE_MIN_SEQ(lruvec);
4811
4812	*nr_to_scan = `0`;
4813	/ have to run aging, since eviction is not possible anymore /
4814	if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq)
4815	return true;
4816
4817	for_each_evictable_type(type, swappiness) {
4818	unsigned long seq;
4819
4820	for (seq = min_seq[type]; seq <= max_seq; seq++) {
4821	gen = lru_gen_from_seq(seq);
4822
4823	for (zone = `0`; zone < MAX_NR_ZONES; zone++)
4824	size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), `0L`);
4825	}
4826	}
4827
4828	*nr_to_scan = size;
4829	/ better to run aging even though eviction is still possible /
4830	return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq;
4831	}
4832
4833	/*
4834	* For future optimizations:
4835	* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
4836	* reclaim.
4837	*/
4838	static long get_nr_to_scan(struct lruvec lruvec, struct* scan_control sc, int* swappiness)
4839	{
4840	bool success;
4841	unsigned long nr_to_scan;
4842	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4843	DEFINE_MAX_SEQ(lruvec);
4844
4845	if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
4846	return -`1`;
4847
4848	success = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan);
4849
4850	/ try to scrape all its memory if this memcg was deleted /
4851	if (nr_to_scan && !mem_cgroup_online(memcg))
4852	return nr_to_scan;
4853
4854	nr_to_scan = apply_proportional_protection(memcg, sc, nr_to_scan);
4855
4856	/ try to get away with not aging at the default priority /
4857	if (!success \|\| sc->priority == DEF_PRIORITY)
4858	return nr_to_scan >> sc->priority;
4859
4860	/ stop scanning this lruvec as it's low on cold folios /
4861	return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? -`1` : `0`;
4862	}
4863
4864	static bool should_abort_scan(struct lruvec lruvec, struct* scan_control *sc)
4865	{
4866	int i;
4867	enum zone_watermarks mark;
4868
4869	/ don't abort memcg reclaim to ensure fairness /
4870	if (!root_reclaim(sc))
4871	return false;
4872
4873	if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order)))
4874	return true;
4875
4876	/ check the order to exclude compaction-induced reclaim /
4877	if (!current_is_kswapd() \|\| sc->order)
4878	return false;
4879
4880	mark = sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ?
4881	WMARK_PROMO : WMARK_HIGH;
4882
4883	for (i = `0`; i <= sc->reclaim_idx; i++) {
4884	struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
4885	unsigned long size = wmark_pages(zone, mark) + MIN_LRU_BATCH;
4886
4887	if (managed_zone(zone) && !zone_watermark_ok(zone, `0`, size, sc->reclaim_idx, `0`))
4888	return false;
4889	}
4890
4891	/ kswapd should abort if all eligible zones are safe /
4892	return true;
4893	}
4894
4895	static bool try_to_shrink_lruvec(struct lruvec lruvec, struct* scan_control *sc)
4896	{
4897	long nr_to_scan;
4898	unsigned long scanned = `0`;
4899	int swappiness = get_swappiness(lruvec, sc);
4900
4901	while (true) {
4902	int delta;
4903
4904	nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
4905	if (nr_to_scan <= `0`)
4906	break;
4907
4908	delta = evict_folios(nr_to_scan, lruvec, sc, swappiness);
4909	if (!delta)
4910	break;
4911
4912	scanned += delta;
4913	if (scanned >= nr_to_scan)
4914	break;
4915
4916	if (should_abort_scan(lruvec, sc))
4917	break;
4918
4919	cond_resched();
4920	}
4921
4922	/*
4923	* If too many file cache in the coldest generation can't be evicted
4924	* due to being dirty, wake up the flusher.
4925	*/
4926	if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken)
4927	wakeup_flusher_threads(WB_REASON_VMSCAN);
4928
4929	/ whether this lruvec should be rotated /
4930	return nr_to_scan < `0`;
4931	}
4932
4933	static int shrink_one(struct lruvec lruvec, struct* scan_control *sc)
4934	{
4935	bool success;
4936	unsigned long scanned = sc->nr_scanned;
4937	unsigned long reclaimed = sc->nr_reclaimed;
4938	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
4939	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
4940
4941	/ lru_gen_age_node() called mem_cgroup_calculate_protection() /
4942	if (mem_cgroup_below_min(NULL, memcg))
4943	return MEMCG_LRU_YOUNG;
4944
4945	if (mem_cgroup_below_low(NULL, memcg)) {
4946	/ see the comment on MEMCG_NR_GENS /
4947	if (READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL)
4948	return MEMCG_LRU_TAIL;
4949
4950	memcg_memory_event(memcg, MEMCG_LOW);
4951	}
4952
4953	success = try_to_shrink_lruvec(lruvec, sc);
4954
4955	shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
4956
4957	if (!sc->proactive)
4958	vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
4959	sc->nr_reclaimed - reclaimed);
4960
4961	flush_reclaim_state(sc);
4962
4963	if (success && mem_cgroup_online(memcg))
4964	return MEMCG_LRU_YOUNG;
4965
4966	if (!success && lruvec_is_sizable(lruvec, sc))
4967	return `0`;
4968
4969	/ one retry if offlined or too small /
4970	return READ_ONCE(lruvec->lrugen.seg) != MEMCG_LRU_TAIL ?
4971	MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
4972	}
4973
4974	static void shrink_many(struct pglist_data pgdat, struct* scan_control *sc)
4975	{
4976	int op;
4977	int gen;
4978	int bin;
4979	int first_bin;
4980	struct lruvec *lruvec;
4981	struct lru_gen_folio *lrugen;
4982	struct mem_cgroup *memcg;
4983	struct hlist_nulls_node *pos;
4984
4985	gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
4986	bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
4987	restart:
4988	op = `0`;
4989	memcg = NULL;
4990
4991	rcu_read_lock();
4992
4993	hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
4994	if (op) {
4995	lru_gen_rotate_memcg(lruvec, op);
4996	op = `0`;
4997	}
4998
4999	mem_cgroup_put(memcg);
5000	memcg = NULL;
5001
5002	if (gen != READ_ONCE(lrugen->gen))
5003	continue;
5004
5005	lruvec = container_of(lrugen, struct lruvec, lrugen);
5006	memcg = lruvec_memcg(lruvec);
5007
5008	if (!mem_cgroup_tryget(memcg)) {
5009	lru_gen_release_memcg(memcg);
5010	memcg = NULL;
5011	continue;
5012	}
5013
5014	rcu_read_unlock();
5015
5016	op = shrink_one(lruvec, sc);
5017
5018	rcu_read_lock();
5019
5020	if (should_abort_scan(lruvec, sc))
5021	break;
5022	}
5023
5024	rcu_read_unlock();
5025
5026	if (op)
5027	lru_gen_rotate_memcg(lruvec, op);
5028
5029	mem_cgroup_put(memcg);
5030
5031	if (!is_a_nulls(pos))
5032	return;
5033
5034	/ restart if raced with lru_gen_rotate_memcg() /
5035	if (gen != get_nulls_value(pos))
5036	goto restart;
5037
5038	/ try the rest of the bins of the current generation /
5039	bin = get_memcg_bin(bin + `1`);
5040	if (bin != first_bin)
5041	goto restart;
5042	}
5043
5044	static void lru_gen_shrink_lruvec(struct lruvec lruvec, struct* scan_control *sc)
5045	{
5046	struct blk_plug plug;
5047
5048	VM_WARN_ON_ONCE(root_reclaim(sc));
5049	VM_WARN_ON_ONCE(!sc->may_writepage \|\| !sc->may_unmap);
5050
5051	lru_add_drain();
5052
5053	blk_start_plug(&plug);
5054
5055	set_mm_walk(NULL, sc->proactive);
5056
5057	if (try_to_shrink_lruvec(lruvec, sc))
5058	lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
5059
5060	clear_mm_walk();
5061
5062	blk_finish_plug(&plug);
5063	}
5064
5065	static void lru_gen_shrink_node(struct pglist_data pgdat, struct* scan_control *sc)
5066	{
5067	struct blk_plug plug;
5068	unsigned long reclaimed = sc->nr_reclaimed;
5069
5070	VM_WARN_ON_ONCE(!root_reclaim(sc));
5071
5072	/*
5073	* Unmapped clean folios are already prioritized. Scanning for more of
5074	* them is likely futile and can cause high reclaim latency when there
5075	* is a large number of memcgs.
5076	*/
5077	if (!sc->may_writepage \|\| !sc->may_unmap)
5078	goto done;
5079
5080	lru_add_drain();
5081
5082	blk_start_plug(&plug);
5083
5084	set_mm_walk(pgdat, sc->proactive);
5085
5086	set_initial_priority(pgdat, sc);
5087
5088	if (current_is_kswapd())
5089	sc->nr_reclaimed = `0`;
5090
5091	if (mem_cgroup_disabled())
5092	shrink_one(&pgdat->__lruvec, sc);
5093	else
5094	shrink_many(pgdat, sc);
5095
5096	if (current_is_kswapd())
5097	sc->nr_reclaimed += reclaimed;
5098
5099	clear_mm_walk();
5100
5101	blk_finish_plug(&plug);
5102	done:
5103	if (sc->nr_reclaimed > reclaimed)
5104	atomic_set(&pgdat->kswapd_failures, `0`);
5105	}
5106
5107	/******************************************************************************
5108	* state change
5109	******************************************************************************/
5110
5111	static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
5112	{
5113	struct lru_gen_folio *lrugen = &lruvec->lrugen;
5114
5115	if (lrugen->enabled) {
5116	enum lru_list lru;
5117
5118	for_each_evictable_lru(lru) {
5119	if (!list_empty(&lruvec->lists[lru]))
5120	return false;
5121	}
5122	} else {
5123	int gen, type, zone;
5124
5125	for_each_gen_type_zone(gen, type, zone) {
5126	if (!list_empty(&lrugen->folios[gen][type][zone]))
5127	return false;
5128	}
5129	}
5130
5131	return true;
5132	}
5133
5134	static bool fill_evictable(struct lruvec *lruvec)
5135	{
5136	enum lru_list lru;
5137	int remaining = MAX_LRU_BATCH;
5138
5139	for_each_evictable_lru(lru) {
5140	int type = is_file_lru(lru);
5141	bool active = is_active_lru(lru);
5142	struct list_head *head = &lruvec->lists[lru];
5143
5144	while (!list_empty(head)) {
5145	bool success;
5146	struct folio *folio = lru_to_folio(head);
5147
5148	VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
5149	VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio);
5150	VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
5151	VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -`1`, folio);
5152
5153	lruvec_del_folio(lruvec, folio);
5154	success = lru_gen_add_folio(lruvec, folio, false);
5155	VM_WARN_ON_ONCE(!success);
5156
5157	if (!--remaining)
5158	return false;
5159	}
5160	}
5161
5162	return true;
5163	}
5164
5165	static bool drain_evictable(struct lruvec *lruvec)
5166	{
5167	int gen, type, zone;
5168	int remaining = MAX_LRU_BATCH;
5169
5170	for_each_gen_type_zone(gen, type, zone) {
5171	struct list_head *head = &lruvec->lrugen.folios[gen][type][zone];
5172
5173	while (!list_empty(head)) {
5174	bool success;
5175	struct folio *folio = lru_to_folio(head);
5176
5177	VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
5178	VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
5179	VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
5180	VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
5181
5182	success = lru_gen_del_folio(lruvec, folio, false);
5183	VM_WARN_ON_ONCE(!success);
5184	lruvec_add_folio(lruvec, folio);
5185
5186	if (!--remaining)
5187	return false;
5188	}
5189	}
5190
5191	return true;
5192	}
5193
5194	static void lru_gen_change_state(bool enabled)
5195	{
5196	static DEFINE_MUTEX(state_mutex);
5197
5198	struct mem_cgroup *memcg;
5199
5200	cgroup_lock();
5201	cpus_read_lock();
5202	get_online_mems();
5203	mutex_lock(&state_mutex);
5204
5205	if (enabled == lru_gen_enabled())
5206	goto unlock;
5207
5208	if (enabled)
5209	static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
5210	else
5211	static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
5212
5213	memcg = mem_cgroup_iter(NULL, NULL, NULL);
5214	do {
5215	int nid;
5216
5217	for_each_node(nid) {
5218	struct lruvec *lruvec = get_lruvec(memcg, nid);
5219
5220	spin_lock_irq(&lruvec->lru_lock);
5221
5222	VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
5223	VM_WARN_ON_ONCE(!state_is_valid(lruvec));
5224
5225	lruvec->lrugen.enabled = enabled;
5226
5227	while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
5228	spin_unlock_irq(&lruvec->lru_lock);
5229	cond_resched();
5230	spin_lock_irq(&lruvec->lru_lock);
5231	}
5232
5233	spin_unlock_irq(&lruvec->lru_lock);
5234	}
5235
5236	cond_resched();
5237	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
5238	unlock:
5239	mutex_unlock(&state_mutex);
5240	put_online_mems();
5241	cpus_read_unlock();
5242	cgroup_unlock();
5243	}
5244
5245	/******************************************************************************
5246	* sysfs interface
5247	******************************************************************************/
5248
5249	static ssize_t min_ttl_ms_show(struct kobject kobj, struct* kobj_attribute attr, char* *buf)
5250	{
5251	return sysfs_emit(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
5252	}
5253
5254	/ see Documentation/admin-guide/mm/multigen_lru.rst for details /
5255	static ssize_t min_ttl_ms_store(struct kobject kobj, struct* kobj_attribute *attr,
5256	const char *buf, size_t len)
5257	{
5258	unsigned int msecs;
5259
5260	if (kstrtouint(buf, `0`, &msecs))
5261	return -EINVAL;
5262
5263	WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
5264
5265	return len;
5266	}
5267
5268	static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR_RW(min_ttl_ms);
5269
5270	static ssize_t enabled_show(struct kobject kobj, struct* kobj_attribute attr, char* *buf)
5271	{
5272	unsigned int caps = `0`;
5273
5274	if (get_cap(LRU_GEN_CORE))
5275	caps \|= BIT(LRU_GEN_CORE);
5276
5277	if (should_walk_mmu())
5278	caps \|= BIT(LRU_GEN_MM_WALK);
5279
5280	if (should_clear_pmd_young())
5281	caps \|= BIT(LRU_GEN_NONLEAF_YOUNG);
5282
5283	return sysfs_emit(buf, "0x%04x\n", caps);
5284	}
5285
5286	/ see Documentation/admin-guide/mm/multigen_lru.rst for details /
5287	static ssize_t enabled_store(struct kobject kobj, struct* kobj_attribute *attr,
5288	const char *buf, size_t len)
5289	{
5290	int i;
5291	unsigned int caps;
5292
5293	if (tolower(*buf) == `'n'`)
5294	caps = `0`;
5295	else if (tolower(*buf) == `'y'`)
5296	caps = -`1`;
5297	else if (kstrtouint(buf, `0`, &caps))
5298	return -EINVAL;
5299
5300	for (i = `0`; i < NR_LRU_GEN_CAPS; i++) {
5301	bool enabled = caps & BIT(i);
5302
5303	if (i == LRU_GEN_CORE)
5304	lru_gen_change_state(enabled);
5305	else if (enabled)
5306	static_branch_enable(&lru_gen_caps[i]);
5307	else
5308	static_branch_disable(&lru_gen_caps[i]);
5309	}
5310
5311	return len;
5312	}
5313
5314	static struct kobj_attribute lru_gen_enabled_attr = __ATTR_RW(enabled);
5315
5316	static struct attribute *lru_gen_attrs[] = {
5317	&lru_gen_min_ttl_attr.attr,
5318	&lru_gen_enabled_attr.attr,
5319	NULL
5320	};
5321
5322	static const struct attribute_group lru_gen_attr_group = {
5323	.name = "lru_gen",
5324	.attrs = lru_gen_attrs,
5325	};
5326
5327	/******************************************************************************
5328	* debugfs interface
5329	******************************************************************************/
5330
5331	static void lru_gen_seq_start(struct* seq_file m, loff_t pos)
5332	{
5333	struct mem_cgroup *memcg;
5334	loff_t nr_to_skip = *pos;
5335
5336	m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
5337	if (!m->private)
5338	return ERR_PTR(-ENOMEM);
5339
5340	memcg = mem_cgroup_iter(NULL, NULL, NULL);
5341	do {
5342	int nid;
5343
5344	for_each_node_state(nid, N_MEMORY) {
5345	if (!nr_to_skip--)
5346	return get_lruvec(memcg, nid);
5347	}
5348	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
5349
5350	return NULL;
5351	}
5352
5353	static void lru_gen_seq_stop(struct seq_file m, void* *v)
5354	{
5355	if (!IS_ERR_OR_NULL(v))
5356	mem_cgroup_iter_break(NULL, lruvec_memcg(v));
5357
5358	kvfree(m->private);
5359	m->private = NULL;
5360	}
5361
5362	static void lru_gen_seq_next(struct* seq_file m, void* v, loff_t pos)
5363	{
5364	int nid = lruvec_pgdat(v)->node_id;
5365	struct mem_cgroup *memcg = lruvec_memcg(v);
5366
5367	++*pos;
5368
5369	nid = next_memory_node(nid);
5370	if (nid == MAX_NUMNODES) {
5371	memcg = mem_cgroup_iter(NULL, memcg, NULL);
5372	if (!memcg)
5373	return NULL;
5374
5375	nid = first_memory_node;
5376	}
5377
5378	return get_lruvec(memcg, nid);
5379	}
5380
5381	static void lru_gen_seq_show_full(struct seq_file m, struct* lruvec *lruvec,
5382	unsigned long max_seq, unsigned long *min_seq,
5383	unsigned long seq)
5384	{
5385	int i;
5386	int type, tier;
5387	int hist = lru_hist_from_seq(seq);
5388	struct lru_gen_folio *lrugen = &lruvec->lrugen;
5389	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
5390
5391	for (tier = `0`; tier < MAX_NR_TIERS; tier++) {
5392	seq_printf(m, " %10d", tier);
5393	for (type = `0`; type < ANON_AND_FILE; type++) {
5394	const char *s = "xxx";
5395	unsigned long n[`3`] = {};
5396
5397	if (seq == max_seq) {
5398	s = "RTx";
5399	n[`0`] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
5400	n[`1`] = READ_ONCE(lrugen->avg_total[type][tier]);
5401	} else if (seq == min_seq[type] \|\| NR_HIST_GENS > `1`) {
5402	s = "rep";
5403	n[`0`] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
5404	n[`1`] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
5405	n[`2`] = READ_ONCE(lrugen->protected[hist][type][tier]);
5406	}
5407
5408	for (i = `0`; i < `3`; i++)
5409	seq_printf(m, " %10lu%c", n[i], s[i]);
5410	}
5411	seq_putc(m, `'\n'`);
5412	}
5413
5414	if (!mm_state)
5415	return;
5416
5417	seq_puts(m, " ");
5418	for (i = `0`; i < NR_MM_STATS; i++) {
5419	const char *s = "xxxx";
5420	unsigned long n = `0`;
5421
5422	if (seq == max_seq && NR_HIST_GENS == `1`) {
5423	s = "TYFA";
5424	n = READ_ONCE(mm_state->stats[hist][i]);
5425	} else if (seq != max_seq && NR_HIST_GENS > `1`) {
5426	s = "tyfa";
5427	n = READ_ONCE(mm_state->stats[hist][i]);
5428	}
5429
5430	seq_printf(m, " %10lu%c", n, s[i]);
5431	}
5432	seq_putc(m, `'\n'`);
5433	}
5434
5435	/ see Documentation/admin-guide/mm/multigen_lru.rst for details /
5436	static int lru_gen_seq_show(struct seq_file m, void* *v)
5437	{
5438	unsigned long seq;
5439	bool full = debugfs_get_aux_num(m->file);
5440	struct lruvec *lruvec = v;
5441	struct lru_gen_folio *lrugen = &lruvec->lrugen;
5442	int nid = lruvec_pgdat(lruvec)->node_id;
5443	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
5444	DEFINE_MAX_SEQ(lruvec);
5445	DEFINE_MIN_SEQ(lruvec);
5446
5447	if (nid == first_memory_node) {
5448	const char *path = memcg ? m->private : "";
5449
5450	#ifdef CONFIG_MEMCG
5451	if (memcg)
5452	cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
5453	#endif
5454	seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
5455	}
5456
5457	seq_printf(m, " node %5d\n", nid);
5458
5459	if (!full)
5460	seq = evictable_min_seq(min_seq, MAX_SWAPPINESS / `2`);
5461	else if (max_seq >= MAX_NR_GENS)
5462	seq = max_seq - MAX_NR_GENS + `1`;
5463	else
5464	seq = `0`;
5465
5466	for (; seq <= max_seq; seq++) {
5467	int type, zone;
5468	int gen = lru_gen_from_seq(seq);
5469	unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
5470
5471	seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth));
5472
5473	for (type = `0`; type < ANON_AND_FILE; type++) {
5474	unsigned long size = `0`;
5475	char mark = full && seq < min_seq[type] ? `'x'` : `' '`;
5476
5477	for (zone = `0`; zone < MAX_NR_ZONES; zone++)
5478	size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), `0L`);
5479
5480	seq_printf(m, " %10lu%c", size, mark);
5481	}
5482
5483	seq_putc(m, `'\n'`);
5484
5485	if (full)
5486	lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
5487	}
5488
5489	return `0`;
5490	}
5491
5492	static const struct seq_operations lru_gen_seq_ops = {
5493	.start = lru_gen_seq_start,
5494	.stop = lru_gen_seq_stop,
5495	.next = lru_gen_seq_next,
5496	.show = lru_gen_seq_show,
5497	};
5498
5499	static int run_aging(struct lruvec lruvec, unsigned* long seq,
5500	int swappiness, bool force_scan)
5501	{
5502	DEFINE_MAX_SEQ(lruvec);
5503
5504	if (seq > max_seq)
5505	return -EINVAL;
5506
5507	return try_to_inc_max_seq(lruvec, max_seq, swappiness, force_scan) ? `0` : -EEXIST;
5508	}
5509
5510	static int run_eviction(struct lruvec lruvec, unsigned* long seq, struct scan_control *sc,
5511	int swappiness, unsigned long nr_to_reclaim)
5512	{
5513	DEFINE_MAX_SEQ(lruvec);
5514
5515	if (seq + MIN_NR_GENS > max_seq)
5516	return -EINVAL;
5517
5518	sc->nr_reclaimed = `0`;
5519
5520	while (!signal_pending(current)) {
5521	DEFINE_MIN_SEQ(lruvec);
5522
5523	if (seq < evictable_min_seq(min_seq, swappiness))
5524	return `0`;
5525
5526	if (sc->nr_reclaimed >= nr_to_reclaim)
5527	return `0`;
5528
5529	if (!evict_folios(nr_to_reclaim - sc->nr_reclaimed, lruvec, sc,
5530	swappiness))
5531	return `0`;
5532
5533	cond_resched();
5534	}
5535
5536	return -EINTR;
5537	}
5538
5539	static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
5540	struct scan_control sc, int* swappiness, unsigned long opt)
5541	{
5542	struct lruvec *lruvec;
5543	int err = -EINVAL;
5544	struct mem_cgroup *memcg = NULL;
5545
5546	if (nid < `0` \|\| nid >= MAX_NUMNODES \|\| !node_state(nid, N_MEMORY))
5547	return -EINVAL;
5548
5549	if (!mem_cgroup_disabled()) {
5550	rcu_read_lock();
5551
5552	memcg = mem_cgroup_from_id(memcg_id);
5553	if (!mem_cgroup_tryget(memcg))
5554	memcg = NULL;
5555
5556	rcu_read_unlock();
5557
5558	if (!memcg)
5559	return -EINVAL;
5560	}
5561
5562	if (memcg_id != mem_cgroup_id(memcg))
5563	goto done;
5564
5565	sc->target_mem_cgroup = memcg;
5566	lruvec = get_lruvec(memcg, nid);
5567
5568	if (swappiness < MIN_SWAPPINESS)
5569	swappiness = get_swappiness(lruvec, sc);
5570	else if (swappiness > SWAPPINESS_ANON_ONLY)
5571	goto done;
5572
5573	switch (cmd) {
5574	case `'+'`:
5575	err = run_aging(lruvec, seq, swappiness, opt);
5576	break;
5577	case `'-'`:
5578	err = run_eviction(lruvec, seq, sc, swappiness, opt);
5579	break;
5580	}
5581	done:
5582	mem_cgroup_put(memcg);
5583
5584	return err;
5585	}
5586
5587	/ see Documentation/admin-guide/mm/multigen_lru.rst for details /
5588	static ssize_t lru_gen_seq_write(struct file file, const* char __user *src,
5589	size_t len, loff_t *pos)
5590	{
5591	void *buf;
5592	char cur, next;
5593	unsigned int flags;
5594	struct blk_plug plug;
5595	int err = -EINVAL;
5596	struct scan_control sc = {
5597	.may_writepage = true,
5598	.may_unmap = true,
5599	.may_swap = true,
5600	.reclaim_idx = MAX_NR_ZONES - `1`,
5601	.gfp_mask = GFP_KERNEL,
5602	.proactive = true,
5603	};
5604
5605	buf = kvmalloc(len + `1`, GFP_KERNEL);
5606	if (!buf)
5607	return -ENOMEM;
5608
5609	if (copy_from_user(buf, src, len)) {
5610	kvfree(buf);
5611	return -EFAULT;
5612	}
5613
5614	set_task_reclaim_state(current, &sc.reclaim_state);
5615	flags = memalloc_noreclaim_save();
5616	blk_start_plug(&plug);
5617	if (!set_mm_walk(NULL, true)) {
5618	err = -ENOMEM;
5619	goto done;
5620	}
5621
5622	next = buf;
5623	next[len] = `'\0'`;
5624
5625	while ((cur = strsep(&next, ",;\n"))) {
5626	int n;
5627	int end;
5628	char cmd, swap_string[`5`];
5629	unsigned int memcg_id;
5630	unsigned int nid;
5631	unsigned long seq;
5632	unsigned int swappiness;
5633	unsigned long opt = -`1`;
5634
5635	cur = skip_spaces(cur);
5636	if (!*cur)
5637	continue;
5638
5639	n = sscanf(cur, "%c %u %u %lu %n %4s %n %lu %n", &cmd, &memcg_id, &nid,
5640	&seq, &end, swap_string, &end, &opt, &end);
5641	if (n < `4` \|\| cur[end]) {
5642	err = -EINVAL;
5643	break;
5644	}
5645
5646	if (n == `4`) {
5647	swappiness = -`1`;
5648	} else if (!strcmp("max", swap_string)) {
5649	/ set by userspace for anonymous memory only /
5650	swappiness = SWAPPINESS_ANON_ONLY;
5651	} else {
5652	err = kstrtouint(swap_string, `0`, &swappiness);
5653	if (err)
5654	break;
5655	}
5656
5657	err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
5658	if (err)
5659	break;
5660	}
5661	done:
5662	clear_mm_walk();
5663	blk_finish_plug(&plug);
5664	memalloc_noreclaim_restore(flags);
5665	set_task_reclaim_state(current, NULL);
5666
5667	kvfree(buf);
5668
5669	return err ? : len;
5670	}
5671
5672	static int lru_gen_seq_open(struct inode inode, struct* file *file)
5673	{
5674	return seq_open(file, &lru_gen_seq_ops);
5675	}
5676
5677	static const struct file_operations lru_gen_rw_fops = {
5678	.open = lru_gen_seq_open,
5679	.read = seq_read,
5680	.write = lru_gen_seq_write,
5681	.llseek = seq_lseek,
5682	.release = seq_release,
5683	};
5684
5685	static const struct file_operations lru_gen_ro_fops = {
5686	.open = lru_gen_seq_open,
5687	.read = seq_read,
5688	.llseek = seq_lseek,
5689	.release = seq_release,
5690	};
5691
5692	/******************************************************************************
5693	* initialization
5694	******************************************************************************/
5695
5696	void lru_gen_init_pgdat(struct pglist_data *pgdat)
5697	{
5698	int i, j;
5699
5700	spin_lock_init(&pgdat->memcg_lru.lock);
5701
5702	for (i = `0`; i < MEMCG_NR_GENS; i++) {
5703	for (j = `0`; j < MEMCG_NR_BINS; j++)
5704	INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
5705	}
5706	}
5707
5708	void lru_gen_init_lruvec(struct lruvec *lruvec)
5709	{
5710	int i;
5711	int gen, type, zone;
5712	struct lru_gen_folio *lrugen = &lruvec->lrugen;
5713	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
5714
5715	lrugen->max_seq = MIN_NR_GENS + `1`;
5716	lrugen->enabled = lru_gen_enabled();
5717
5718	for (i = `0`; i <= MIN_NR_GENS + `1`; i++)
5719	lrugen->timestamps[i] = jiffies;
5720
5721	for_each_gen_type_zone(gen, type, zone)
5722	INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);
5723
5724	if (mm_state)
5725	mm_state->seq = MIN_NR_GENS;
5726	}
5727
5728	#ifdef CONFIG_MEMCG
5729
5730	void lru_gen_init_memcg(struct mem_cgroup *memcg)
5731	{
5732	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
5733
5734	if (!mm_list)
5735	return;
5736
5737	INIT_LIST_HEAD(&mm_list->fifo);
5738	spin_lock_init(&mm_list->lock);
5739	}
5740
5741	void lru_gen_exit_memcg(struct mem_cgroup *memcg)
5742	{
5743	int i;
5744	int nid;
5745	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
5746
5747	VM_WARN_ON_ONCE(mm_list && !list_empty(&mm_list->fifo));
5748
5749	for_each_node(nid) {
5750	struct lruvec *lruvec = get_lruvec(memcg, nid);
5751	struct lru_gen_mm_state *mm_state = get_mm_state(lruvec);
5752
5753	VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, `0`,
5754	sizeof(lruvec->lrugen.nr_pages)));
5755
5756	lruvec->lrugen.list.next = LIST_POISON1;
5757
5758	if (!mm_state)
5759	continue;
5760
5761	for (i = `0`; i < NR_BLOOM_FILTERS; i++) {
5762	bitmap_free(mm_state->filters[i]);
5763	mm_state->filters[i] = NULL;
5764	}
5765	}
5766	}
5767
5768	#endif /* CONFIG_MEMCG */
5769
5770	static int __init init_lru_gen(void)
5771	{
5772	BUILD_BUG_ON(MIN_NR_GENS + `1` >= MAX_NR_GENS);
5773	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
5774
5775	if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
5776	pr_err("lru_gen: failed to create sysfs group\n");
5777
5778	debugfs_create_file_aux_num("lru_gen", `0644`, NULL, NULL, false,
5779	&lru_gen_rw_fops);
5780	debugfs_create_file_aux_num("lru_gen_full", `0444`, NULL, NULL, true,
5781	&lru_gen_ro_fops);
5782
5783	return `0`;
5784	};
5785	late_initcall(init_lru_gen);
5786
5787	#else /* !CONFIG_LRU_GEN */
5788
/* !CONFIG_LRU_GEN stub: any call that survives to code generation breaks the build */
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
	BUILD_BUG();
}
5793
/* !CONFIG_LRU_GEN stub: any call that survives to code generation breaks the build */
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
	BUILD_BUG();
}
5798
/* !CONFIG_LRU_GEN stub: any call that survives to code generation breaks the build */
static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
{
	BUILD_BUG();
}
5803
5804	#endif /* CONFIG_LRU_GEN */
5805
5806	static void shrink_lruvec(struct lruvec lruvec, struct* scan_control *sc)
5807	{
5808	unsigned long nr[NR_LRU_LISTS];
5809	unsigned long targets[NR_LRU_LISTS];
5810	unsigned long nr_to_scan;
5811	enum lru_list lru;
5812	unsigned long nr_reclaimed = `0`;
5813	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
5814	bool proportional_reclaim;
5815	struct blk_plug plug;
5816
5817	if (lru_gen_enabled() && !root_reclaim(sc)) {
5818	lru_gen_shrink_lruvec(lruvec, sc);
5819	return;
5820	}
5821
5822	get_scan_count(lruvec, sc, nr);
5823
5824	/ Record the original scan target for proportional adjustments later /
5825	memcpy(to: targets, from: nr, len: sizeof(nr));
5826
5827	/*
5828	* Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
5829	* event that can occur when there is little memory pressure e.g.
5830	* multiple streaming readers/writers. Hence, we do not abort scanning
5831	* when the requested number of pages are reclaimed when scanning at
5832	* DEF_PRIORITY on the assumption that the fact we are direct
5833	* reclaiming implies that kswapd is not keeping up and it is best to
5834	* do a batch of work at once. For memcg reclaim one check is made to
5835	* abort proportional reclaim if either the file or anon lru has already
5836	* dropped to zero at the first pass.
5837	*/
5838	proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
5839	sc->priority == DEF_PRIORITY);
5840
5841	blk_start_plug(&plug);
5842	while (nr[LRU_INACTIVE_ANON] \|\| nr[LRU_ACTIVE_FILE] \|\|
5843	nr[LRU_INACTIVE_FILE]) {
5844	unsigned long nr_anon, nr_file, percentage;
5845	unsigned long nr_scanned;
5846
5847	for_each_evictable_lru(lru) {
5848	if (nr[lru]) {
5849	nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
5850	nr[lru] -= nr_to_scan;
5851
5852	nr_reclaimed += shrink_list(lru, nr_to_scan,
5853	lruvec, sc);
5854	}
5855	}
5856
5857	cond_resched();
5858
5859	if (nr_reclaimed < nr_to_reclaim \|\| proportional_reclaim)
5860	continue;
5861
5862	/*
5863	* For kswapd and memcg, reclaim at least the number of pages
5864	* requested. Ensure that the anon and file LRUs are scanned
5865	* proportionally what was requested by get_scan_count(). We
5866	* stop reclaiming one LRU and reduce the amount scanning
5867	* proportional to the original scan target.
5868	*/
5869	nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
5870	nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
5871
5872	/*
5873	* It's just vindictive to attack the larger once the smaller
5874	* has gone to zero. And given the way we stop scanning the
5875	* smaller below, this makes sure that we only make one nudge
5876	* towards proportionality once we've got nr_to_reclaim.
5877	*/
5878	if (!nr_file \|\| !nr_anon)
5879	break;
5880
5881	if (nr_file > nr_anon) {
5882	unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
5883	targets[LRU_ACTIVE_ANON] + `1`;
5884	lru = LRU_BASE;
5885	percentage = nr_anon * `100` / scan_target;
5886	} else {
5887	unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
5888	targets[LRU_ACTIVE_FILE] + `1`;
5889	lru = LRU_FILE;
5890	percentage = nr_file * `100` / scan_target;
5891	}
5892
5893	/ Stop scanning the smaller of the LRU /
5894	nr[lru] = `0`;
5895	nr[lru + LRU_ACTIVE] = `0`;
5896
5897	/*
5898	* Recalculate the other LRU scan count based on its original
5899	* scan target and the percentage scanning already complete
5900	*/
5901	lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
5902	nr_scanned = targets[lru] - nr[lru];
5903	nr[lru] = targets[lru] * (`100` - percentage) / `100`;
5904	nr[lru] -= min(nr[lru], nr_scanned);
5905
5906	lru += LRU_ACTIVE;
5907	nr_scanned = targets[lru] - nr[lru];
5908	nr[lru] = targets[lru] * (`100` - percentage) / `100`;
5909	nr[lru] -= min(nr[lru], nr_scanned);
5910	}
5911	blk_finish_plug(&plug);
5912	sc->nr_reclaimed += nr_reclaimed;
5913
5914	/*
5915	* Even if we did not try to evict anon pages at all, we want to
5916	* rebalance the anon lru active/inactive ratio.
5917	*/
5918	if (can_age_anon_pages(lruvec, sc) &&
5919	inactive_is_low(lruvec, inactive_lru: LRU_INACTIVE_ANON))
5920	shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
5921	sc, lru: LRU_ACTIVE_ANON);
5922	}
5923
5924	/ Use reclaim/compaction for costly allocs or under memory pressure /
5925	static bool in_reclaim_compaction(struct scan_control *sc)
5926	{
5927	if (gfp_compaction_allowed(gfp_mask: sc->gfp_mask) && sc->order &&
5928	(sc->order > PAGE_ALLOC_COSTLY_ORDER \|\|
5929	sc->priority < DEF_PRIORITY - `2`))
5930	return true;
5931
5932	return false;
5933	}
5934
5935	/*
5936	* Reclaim/compaction is used for high-order allocation requests. It reclaims
5937	* order-0 pages before compacting the zone. should_continue_reclaim() returns
5938	* true if more pages should be reclaimed such that when the page allocator
5939	* calls try_to_compact_pages() that it will have enough free pages to succeed.
5940	* It will give up earlier than that if there is difficulty reclaiming pages.
5941	*/
5942	static inline bool should_continue_reclaim(struct pglist_data *pgdat,
5943	unsigned long nr_reclaimed,
5944	struct scan_control *sc)
5945	{
5946	unsigned long pages_for_compaction;
5947	unsigned long inactive_lru_pages;
5948	int z;
5949	struct zone *zone;
5950
5951	/ If not in reclaim/compaction mode, stop /
5952	if (!in_reclaim_compaction(sc))
5953	return false;
5954
5955	/*
5956	* Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
5957	* number of pages that were scanned. This will return to the caller
5958	* with the risk reclaim/compaction and the resulting allocation attempt
5959	* fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL
5960	* allocations through requiring that the full LRU list has been scanned
5961	* first, by assuming that zero delta of sc->nr_scanned means full LRU
5962	* scan, but that approximation was wrong, and there were corner cases
5963	* where always a non-zero amount of pages were scanned.
5964	*/
5965	if (!nr_reclaimed)
5966	return false;
5967
5968	/ If compaction would go ahead or the allocation would succeed, stop /
5969	for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
5970	unsigned long watermark = min_wmark_pages(z: zone);
5971
5972	/ Allocation can already succeed, nothing to do /
5973	if (zone_watermark_ok(z: zone, order: sc->order, mark: watermark,
5974	highest_zoneidx: sc->reclaim_idx, alloc_flags: `0`))
5975	return false;
5976
5977	if (compaction_suitable(zone, order: sc->order, watermark,
5978	highest_zoneidx: sc->reclaim_idx))
5979	return false;
5980	}
5981
5982	/*
5983	* If we have not reclaimed enough pages for compaction and the
5984	* inactive lists are large enough, continue reclaiming
5985	*/
5986	pages_for_compaction = compact_gap(order: sc->order);
5987	inactive_lru_pages = node_page_state(pgdat, item: NR_INACTIVE_FILE);
5988	if (can_reclaim_anon_pages(NULL, nid: pgdat->node_id, sc))
5989	inactive_lru_pages += node_page_state(pgdat, item: NR_INACTIVE_ANON);
5990
5991	return inactive_lru_pages > pages_for_compaction;
5992	}
5993
5994	static void shrink_node_memcgs(pg_data_t pgdat, struct* scan_control *sc)
5995	{
5996	struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
5997	struct mem_cgroup_reclaim_cookie reclaim = {
5998	.pgdat = pgdat,
5999	};
6000	struct mem_cgroup_reclaim_cookie *partial = &reclaim;
6001	struct mem_cgroup *memcg;
6002
6003	/*
6004	* In most cases, direct reclaimers can do partial walks
6005	* through the cgroup tree, using an iterator state that
6006	* persists across invocations. This strikes a balance between
6007	* fairness and allocation latency.
6008	*
6009	* For kswapd, reliable forward progress is more important
6010	* than a quick return to idle. Always do full walks.
6011	*/
6012	if (current_is_kswapd() \|\| sc->memcg_full_walk)
6013	partial = NULL;
6014
6015	memcg = mem_cgroup_iter(root: target_memcg, NULL, reclaim: partial);
6016	do {
6017	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
6018	unsigned long reclaimed;
6019	unsigned long scanned;
6020
6021	/*
6022	* This loop can become CPU-bound when target memcgs
6023	* aren't eligible for reclaim - either because they
6024	* don't have any reclaimable pages, or because their
6025	* memory is explicitly protected. Avoid soft lockups.
6026	*/
6027	cond_resched();
6028
6029	mem_cgroup_calculate_protection(root: target_memcg, memcg);
6030
6031	if (mem_cgroup_below_min(target: target_memcg, memcg)) {
6032	/*
6033	* Hard protection.
6034	* If there is no reclaimable memory, OOM.
6035	*/
6036	continue;
6037	} else if (mem_cgroup_below_low(target: target_memcg, memcg)) {
6038	/*
6039	* Soft protection.
6040	* Respect the protection only as long as
6041	* there is an unprotected supply
6042	* of reclaimable memory from other cgroups.
6043	*/
6044	if (!sc->memcg_low_reclaim) {
6045	sc->memcg_low_skipped = `1`;
6046	continue;
6047	}
6048	memcg_memory_event(memcg, event: MEMCG_LOW);
6049	}
6050
6051	reclaimed = sc->nr_reclaimed;
6052	scanned = sc->nr_scanned;
6053
6054	shrink_lruvec(lruvec, sc);
6055
6056	shrink_slab(gfp_mask: sc->gfp_mask, nid: pgdat->node_id, memcg,
6057	priority: sc->priority);
6058
6059	/ Record the group's reclaim efficiency /
6060	if (!sc->proactive)
6061	vmpressure(gfp: sc->gfp_mask, memcg, tree: false,
6062	scanned: sc->nr_scanned - scanned,
6063	reclaimed: sc->nr_reclaimed - reclaimed);
6064
6065	/ If partial walks are allowed, bail once goal is reached /
6066	if (partial && sc->nr_reclaimed >= sc->nr_to_reclaim) {
6067	mem_cgroup_iter_break(root: target_memcg, prev: memcg);
6068	break;
6069	}
6070	} while ((memcg = mem_cgroup_iter(root: target_memcg, prev: memcg, reclaim: partial)));
6071	}
6072
6073	static void shrink_node(pg_data_t pgdat, struct* scan_control *sc)
6074	{
6075	unsigned long nr_reclaimed, nr_scanned, nr_node_reclaimed;
6076	struct lruvec *target_lruvec;
6077	bool reclaimable = false;
6078
6079	if (lru_gen_enabled() && root_reclaim(sc)) {
6080	memset(s: &sc->nr, c: `0`, n: sizeof(sc->nr));
6081	lru_gen_shrink_node(pgdat, sc);
6082	return;
6083	}
6084
6085	target_lruvec = mem_cgroup_lruvec(memcg: sc->target_mem_cgroup, pgdat);
6086
6087	again:
6088	memset(s: &sc->nr, c: `0`, n: sizeof(sc->nr));
6089
6090	nr_reclaimed = sc->nr_reclaimed;
6091	nr_scanned = sc->nr_scanned;
6092
6093	prepare_scan_control(pgdat, sc);
6094
6095	shrink_node_memcgs(pgdat, sc);
6096
6097	flush_reclaim_state(sc);
6098
6099	nr_node_reclaimed = sc->nr_reclaimed - nr_reclaimed;
6100
6101	/ Record the subtree's reclaim efficiency /
6102	if (!sc->proactive)
6103	vmpressure(gfp: sc->gfp_mask, memcg: sc->target_mem_cgroup, tree: true,
6104	scanned: sc->nr_scanned - nr_scanned, reclaimed: nr_node_reclaimed);
6105
6106	if (nr_node_reclaimed)
6107	reclaimable = true;
6108
6109	if (current_is_kswapd()) {
6110	/*
6111	* If reclaim is isolating dirty pages under writeback,
6112	* it implies that the long-lived page allocation rate
6113	* is exceeding the page laundering rate. Either the
6114	* global limits are not being effective at throttling
6115	* processes due to the page distribution throughout
6116	* zones or there is heavy usage of a slow backing
6117	* device. The only option is to throttle from reclaim
6118	* context which is not ideal as there is no guarantee
6119	* the dirtying process is throttled in the same way
6120	* balance_dirty_pages() manages.
6121	*
6122	* Once a node is flagged PGDAT_WRITEBACK, kswapd will
6123	* count the number of pages under pages flagged for
6124	* immediate reclaim and stall if any are encountered
6125	* in the nr_immediate check below.
6126	*/
6127	if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
6128	set_bit(nr: PGDAT_WRITEBACK, addr: &pgdat->flags);
6129
6130	/ Allow kswapd to start writing pages during reclaim./
6131	if (sc->nr.unqueued_dirty &&
6132	sc->nr.unqueued_dirty == sc->nr.file_taken)
6133	set_bit(nr: PGDAT_DIRTY, addr: &pgdat->flags);
6134
6135	/*
6136	* If kswapd scans pages marked for immediate
6137	* reclaim and under writeback (nr_immediate), it
6138	* implies that pages are cycling through the LRU
6139	* faster than they are written so forcibly stall
6140	* until some pages complete writeback.
6141	*/
6142	if (sc->nr.immediate)
6143	reclaim_throttle(pgdat, reason: VMSCAN_THROTTLE_WRITEBACK);
6144	}
6145
6146	/*
6147	* Tag a node/memcg as congested if all the dirty pages were marked
6148	* for writeback and immediate reclaim (counted in nr.congested).
6149	*
6150	* Legacy memcg will stall in page writeback so avoid forcibly
6151	* stalling in reclaim_throttle().
6152	*/
6153	if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) {
6154	if (cgroup_reclaim(sc) && writeback_throttling_sane(sc))
6155	set_bit(nr: LRUVEC_CGROUP_CONGESTED, addr: &target_lruvec->flags);
6156
6157	if (current_is_kswapd())
6158	set_bit(nr: LRUVEC_NODE_CONGESTED, addr: &target_lruvec->flags);
6159	}
6160
6161	/*
6162	* Stall direct reclaim for IO completions if the lruvec is
6163	* node is congested. Allow kswapd to continue until it
6164	* starts encountering unqueued dirty pages or cycling through
6165	* the LRU too quickly.
6166	*/
6167	if (!current_is_kswapd() && current_may_throttle() &&
6168	!sc->hibernation_mode &&
6169	(test_bit(LRUVEC_CGROUP_CONGESTED, &target_lruvec->flags) \|\|
6170	test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags)))
6171	reclaim_throttle(pgdat, reason: VMSCAN_THROTTLE_CONGESTED);
6172
6173	if (should_continue_reclaim(pgdat, nr_reclaimed: nr_node_reclaimed, sc))
6174	goto again;
6175
6176	/*
6177	* Kswapd gives up on balancing particular nodes after too
6178	* many failures to reclaim anything from them and goes to
6179	* sleep. On reclaim progress, reset the failure counter. A
6180	* successful direct reclaim run will revive a dormant kswapd.
6181	*/
6182	if (reclaimable)
6183	atomic_set(v: &pgdat->kswapd_failures, i: `0`);
6184	else if (sc->cache_trim_mode)
6185	sc->cache_trim_mode_failed = `1`;
6186	}
6187
6188	/*
6189	* Returns true if compaction should go ahead for a costly-order request, or
6190	* the allocation would already succeed without compaction. Return false if we
6191	* should reclaim first.
6192	*/
6193	static inline bool compaction_ready(struct zone zone, struct* scan_control *sc)
6194	{
6195	unsigned long watermark;
6196
6197	if (!gfp_compaction_allowed(gfp_mask: sc->gfp_mask))
6198	return false;
6199
6200	/ Allocation can already succeed, nothing to do /
6201	if (zone_watermark_ok(z: zone, order: sc->order, mark: min_wmark_pages(z: zone),
6202	highest_zoneidx: sc->reclaim_idx, alloc_flags: `0`))
6203	return true;
6204
6205	/*
6206	* Direct reclaim usually targets the min watermark, but compaction
6207	* takes time to run and there are potentially other callers using the
6208	* pages just freed. So target a higher buffer to give compaction a
6209	* reasonable chance of completing and allocating the pages.
6210	*
6211	* Note that we won't actually reclaim the whole buffer in one attempt
6212	* as the target watermark in should_continue_reclaim() is lower. But if
6213	* we are already above the high+gap watermark, don't reclaim at all.
6214	*/
6215	watermark = high_wmark_pages(z: zone);
6216	if (compaction_suitable(zone, order: sc->order, watermark, highest_zoneidx: sc->reclaim_idx))
6217	return true;
6218
6219	return false;
6220	}
6221
6222	static void consider_reclaim_throttle(pg_data_t pgdat, struct* scan_control *sc)
6223	{
6224	/*
6225	* If reclaim is making progress greater than 12% efficiency then
6226	* wake all the NOPROGRESS throttled tasks.
6227	*/
6228	if (sc->nr_reclaimed > (sc->nr_scanned >> `3`)) {
6229	wait_queue_head_t *wqh;
6230
6231	wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
6232	if (waitqueue_active(wq_head: wqh))
6233	wake_up(wqh);
6234
6235	return;
6236	}
6237
6238	/*
6239	* Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will
6240	* throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages
6241	* under writeback and marked for immediate reclaim at the tail of the
6242	* LRU.
6243	*/
6244	if (current_is_kswapd() \|\| cgroup_reclaim(sc))
6245	return;
6246
6247	/ Throttle if making no progress at high prioities. /
6248	if (sc->priority == `1` && !sc->nr_reclaimed)
6249	reclaim_throttle(pgdat, reason: VMSCAN_THROTTLE_NOPROGRESS);
6250	}
6251
6252	/*
6253	* This is the direct reclaim path, for page-allocating processes. We only
6254	* try to reclaim pages from zones which will satisfy the caller's allocation
6255	* request.
6256	*
6257	* If a zone is deemed to be full of pinned pages then just give it a light
6258	* scan then give up on it.
6259	*/
6260	static void shrink_zones(struct zonelist zonelist, struct* scan_control *sc)
6261	{
6262	struct zoneref *z;
6263	struct zone *zone;
6264	unsigned long nr_soft_reclaimed;
6265	unsigned long nr_soft_scanned;
6266	gfp_t orig_mask;
6267	pg_data_t *last_pgdat = NULL;
6268	pg_data_t *first_pgdat = NULL;
6269
6270	/*
6271	* If the number of buffer_heads in the machine exceeds the maximum
6272	* allowed level, force direct reclaim to scan the highmem zone as
6273	* highmem pages could be pinning lowmem pages storing buffer_heads
6274	*/
6275	orig_mask = sc->gfp_mask;
6276	if (buffer_heads_over_limit) {
6277	sc->gfp_mask \|= __GFP_HIGHMEM;
6278	sc->reclaim_idx = gfp_zone(flags: sc->gfp_mask);
6279	}
6280
6281	for_each_zone_zonelist_nodemask(zone, z, zonelist,
6282	sc->reclaim_idx, sc->nodemask) {
6283	/*
6284	* Take care memory controller reclaiming has small influence
6285	* to global LRU.
6286	*/
6287	if (!cgroup_reclaim(sc)) {
6288	if (!cpuset_zone_allowed(z: zone,
6289	GFP_KERNEL \| __GFP_HARDWALL))
6290	continue;
6291
6292	/*
6293	* If we already have plenty of memory free for
6294	* compaction in this zone, don't free any more.
6295	* Even though compaction is invoked for any
6296	* non-zero order, only frequent costly order
6297	* reclamation is disruptive enough to become a
6298	* noticeable problem, like transparent huge
6299	* page allocations.
6300	*/
6301	if (IS_ENABLED(CONFIG_COMPACTION) &&
6302	sc->order > PAGE_ALLOC_COSTLY_ORDER &&
6303	compaction_ready(zone, sc)) {
6304	sc->compaction_ready = true;
6305	continue;
6306	}
6307
6308	/*
6309	* Shrink each node in the zonelist once. If the
6310	* zonelist is ordered by zone (not the default) then a
6311	* node may be shrunk multiple times but in that case
6312	* the user prefers lower zones being preserved.
6313	*/
6314	if (zone->zone_pgdat == last_pgdat)
6315	continue;
6316
6317	/*
6318	* This steals pages from memory cgroups over softlimit
6319	* and returns the number of reclaimed pages and
6320	* scanned pages. This works for global memory pressure
6321	* and balancing, not for a memcg's limit.
6322	*/
6323	nr_soft_scanned = `0`;
6324	nr_soft_reclaimed = memcg1_soft_limit_reclaim(pgdat: zone->zone_pgdat,
6325	order: sc->order, gfp_mask: sc->gfp_mask,
6326	total_scanned: &nr_soft_scanned);
6327	sc->nr_reclaimed += nr_soft_reclaimed;
6328	sc->nr_scanned += nr_soft_scanned;
6329	/ need some check for avoid more shrink_zone() /
6330	}
6331
6332	if (!first_pgdat)
6333	first_pgdat = zone->zone_pgdat;
6334
6335	/ See comment about same check for global reclaim above /
6336	if (zone->zone_pgdat == last_pgdat)
6337	continue;
6338	last_pgdat = zone->zone_pgdat;
6339	shrink_node(pgdat: zone->zone_pgdat, sc);
6340	}
6341
6342	if (first_pgdat)
6343	consider_reclaim_throttle(pgdat: first_pgdat, sc);
6344
6345	/*
6346	* Restore to original mask to avoid the impact on the caller if we
6347	* promoted it to __GFP_HIGHMEM.
6348	*/
6349	sc->gfp_mask = orig_mask;
6350	}
6351
6352	static void snapshot_refaults(struct mem_cgroup target_memcg, pg_data_t pgdat)
6353	{
6354	struct lruvec *target_lruvec;
6355	unsigned long refaults;
6356
6357	if (lru_gen_enabled())
6358	return;
6359
6360	target_lruvec = mem_cgroup_lruvec(memcg: target_memcg, pgdat);
6361	refaults = lruvec_page_state(lruvec: target_lruvec, idx: WORKINGSET_ACTIVATE_ANON);
6362	target_lruvec->refaults[WORKINGSET_ANON] = refaults;
6363	refaults = lruvec_page_state(lruvec: target_lruvec, idx: WORKINGSET_ACTIVATE_FILE);
6364	target_lruvec->refaults[WORKINGSET_FILE] = refaults;
6365	}
6366
6367	/*
6368	* This is the main entry point to direct page reclaim.
6369	*
6370	* If a full scan of the inactive list fails to free enough memory then we
6371	* are "out of memory" and something needs to be killed.
6372	*
6373	* If the caller is !__GFP_FS then the probability of a failure is reasonably
6374	* high - the zone may be full of dirty or under-writeback pages, which this
6375	* caller can't do much about. We kick the writeback threads and take explicit
6376	* naps in the hope that some of these pages can be written. But if the
6377	* allocating task holds filesystem locks which prevent writeout this might not
6378	* work, and the allocation attempt will fail.
6379	*
6380	* returns: 0, if no pages reclaimed
6381	* else, the number of pages reclaimed
6382	*/
6383	static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
6384	struct scan_control *sc)
6385	{
6386	int initial_priority = sc->priority;
6387	pg_data_t *last_pgdat;
6388	struct zoneref *z;
6389	struct zone *zone;
6390	retry:
6391	delayacct_freepages_start();
6392
6393	if (!cgroup_reclaim(sc))
6394	__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, `1`);
6395
6396	do {
6397	if (!sc->proactive)
6398	vmpressure_prio(gfp: sc->gfp_mask, memcg: sc->target_mem_cgroup,
6399	prio: sc->priority);
6400	sc->nr_scanned = `0`;
6401	shrink_zones(zonelist, sc);
6402
6403	if (sc->nr_reclaimed >= sc->nr_to_reclaim)
6404	break;
6405
6406	if (sc->compaction_ready)
6407	break;
6408
6409	/*
6410	* If we're getting trouble reclaiming, start doing
6411	* writepage even in laptop mode.
6412	*/
6413	if (sc->priority < DEF_PRIORITY - `2`)
6414	sc->may_writepage = `1`;
6415	} while (--sc->priority >= `0`);
6416
6417	last_pgdat = NULL;
6418	for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
6419	sc->nodemask) {
6420	if (zone->zone_pgdat == last_pgdat)
6421	continue;
6422	last_pgdat = zone->zone_pgdat;
6423
6424	snapshot_refaults(target_memcg: sc->target_mem_cgroup, pgdat: zone->zone_pgdat);
6425
6426	if (cgroup_reclaim(sc)) {
6427	struct lruvec *lruvec;
6428
6429	lruvec = mem_cgroup_lruvec(memcg: sc->target_mem_cgroup,
6430	pgdat: zone->zone_pgdat);
6431	clear_bit(nr: LRUVEC_CGROUP_CONGESTED, addr: &lruvec->flags);
6432	}
6433	}
6434
6435	delayacct_freepages_end();
6436
6437	if (sc->nr_reclaimed)
6438	return sc->nr_reclaimed;
6439
6440	/ Aborted reclaim to try compaction? don't OOM, then /
6441	if (sc->compaction_ready)
6442	return `1`;
6443
6444	/*
6445	* In most cases, direct reclaimers can do partial walks
6446	* through the cgroup tree to meet the reclaim goal while
6447	* keeping latency low. Since the iterator state is shared
6448	* among all direct reclaim invocations (to retain fairness
6449	* among cgroups), though, high concurrency can result in
6450	* individual threads not seeing enough cgroups to make
6451	* meaningful forward progress. Avoid false OOMs in this case.
6452	*/
6453	if (!sc->memcg_full_walk) {
6454	sc->priority = initial_priority;
6455	sc->memcg_full_walk = `1`;
6456	goto retry;
6457	}
6458
6459	/*
6460	* We make inactive:active ratio decisions based on the node's
6461	* composition of memory, but a restrictive reclaim_idx or a
6462	* memory.low cgroup setting can exempt large amounts of
6463	* memory from reclaim. Neither of which are very common, so
6464	* instead of doing costly eligibility calculations of the
6465	* entire cgroup subtree up front, we assume the estimates are
6466	* good, and retry with forcible deactivation if that fails.
6467	*/
6468	if (sc->skipped_deactivate) {
6469	sc->priority = initial_priority;
6470	sc->force_deactivate = `1`;
6471	sc->skipped_deactivate = `0`;
6472	goto retry;
6473	}
6474
6475	/ Untapped cgroup reserves? Don't OOM, retry. /
6476	if (sc->memcg_low_skipped) {
6477	sc->priority = initial_priority;
6478	sc->force_deactivate = `0`;
6479	sc->memcg_low_reclaim = `1`;
6480	sc->memcg_low_skipped = `0`;
6481	goto retry;
6482	}
6483
6484	return `0`;
6485	}
6486
6487	static bool allow_direct_reclaim(pg_data_t *pgdat)
6488	{
6489	struct zone *zone;
6490	unsigned long pfmemalloc_reserve = `0`;
6491	unsigned long free_pages = `0`;
6492	int i;
6493	bool wmark_ok;
6494
6495	if (atomic_read(v: &pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
6496	return true;
6497
6498	for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
6499	if (!zone_reclaimable_pages(zone) && zone_page_state_snapshot(zone, item: NR_FREE_PAGES))
6500	continue;
6501
6502	pfmemalloc_reserve += min_wmark_pages(z: zone);
6503	free_pages += zone_page_state_snapshot(zone, item: NR_FREE_PAGES);
6504	}
6505
6506	/ If there are no reserves (unexpected config) then do not throttle /
6507	if (!pfmemalloc_reserve)
6508	return true;
6509
6510	wmark_ok = free_pages > pfmemalloc_reserve / `2`;
6511
6512	/ kswapd must be awake if processes are being throttled /
6513	if (!wmark_ok && waitqueue_active(wq_head: &pgdat->kswapd_wait)) {
6514	if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
6515	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);
6516
6517	wake_up_interruptible(&pgdat->kswapd_wait);
6518	}
6519
6520	return wmark_ok;
6521	}
6522
6523	/*
6524	* Throttle direct reclaimers if backing storage is backed by the network
6525	* and the PFMEMALLOC reserve for the preferred node is getting dangerously
6526	* depleted. kswapd will continue to make progress and wake the processes
6527	* when the low watermark is reached.
6528	*
6529	* Returns true if a fatal signal was delivered during throttling. If this
6530	* happens, the page allocator should not consider triggering the OOM killer.
6531	*/
6532	static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
6533	nodemask_t *nodemask)
6534	{
6535	struct zoneref *z;
6536	struct zone *zone;
6537	pg_data_t *pgdat = NULL;
6538
6539	/*
6540	* Kernel threads should not be throttled as they may be indirectly
6541	* responsible for cleaning pages necessary for reclaim to make forward
6542	* progress. kjournald for example may enter direct reclaim while
6543	* committing a transaction where throttling it could forcing other
6544	* processes to block on log_wait_commit().
6545	*/
6546	if (current->flags & PF_KTHREAD)
6547	goto out;
6548
6549	/*
6550	* If a fatal signal is pending, this process should not throttle.
6551	* It should return quickly so it can exit and free its memory
6552	*/
6553	if (fatal_signal_pending(current))
6554	goto out;
6555
6556	/*
6557	* Check if the pfmemalloc reserves are ok by finding the first node
6558	* with a usable ZONE_NORMAL or lower zone. The expectation is that
6559	* GFP_KERNEL will be required for allocating network buffers when
6560	* swapping over the network so ZONE_HIGHMEM is unusable.
6561	*
6562	* Throttling is based on the first usable node and throttled processes
6563	* wait on a queue until kswapd makes progress and wakes them. There
6564	* is an affinity then between processes waking up and where reclaim
6565	* progress has been made assuming the process wakes on the same node.
6566	* More importantly, processes running on remote nodes will not compete
6567	* for remote pfmemalloc reserves and processes on different nodes
6568	* should make reasonable progress.
6569	*/
6570	for_each_zone_zonelist_nodemask(zone, z, zonelist,
6571	gfp_zone(gfp_mask), nodemask) {
6572	if (zone_idx(zone) > ZONE_NORMAL)
6573	continue;
6574
6575	/ Throttle based on the first usable node /
6576	pgdat = zone->zone_pgdat;
6577	if (allow_direct_reclaim(pgdat))
6578	goto out;
6579	break;
6580	}
6581
6582	/ If no zone was usable by the allocation flags then do not throttle /
6583	if (!pgdat)
6584	goto out;
6585
6586	/ Account for the throttling /
6587	count_vm_event(item: PGSCAN_DIRECT_THROTTLE);
6588
6589	/*
6590	* If the caller cannot enter the filesystem, it's possible that it
6591	* is due to the caller holding an FS lock or performing a journal
6592	* transaction in the case of a filesystem like ext[3\|4]. In this case,
6593	* it is not safe to block on pfmemalloc_wait as kswapd could be
6594	* blocked waiting on the same lock. Instead, throttle for up to a
6595	* second before continuing.
6596	*/
6597	if (!(gfp_mask & __GFP_FS))
6598	wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
6599	allow_direct_reclaim(pgdat), HZ);
6600	else
6601	/ Throttle until kswapd wakes the process /
6602	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
6603	allow_direct_reclaim(pgdat));
6604
6605	if (fatal_signal_pending(current))
6606	return true;
6607
6608	out:
6609	return false;
6610	}
6611
6612	unsigned long try_to_free_pages(struct zonelist zonelist, int* order,
6613	gfp_t gfp_mask, nodemask_t *nodemask)
6614	{
6615	unsigned long nr_reclaimed;
6616	struct scan_control sc = {
6617	.nr_to_reclaim = SWAP_CLUSTER_MAX,
6618	.gfp_mask = current_gfp_context(flags: gfp_mask),
6619	.reclaim_idx = gfp_zone(flags: gfp_mask),
6620	.order = order,
6621	.nodemask = nodemask,
6622	.priority = DEF_PRIORITY,
6623	.may_writepage = !laptop_mode,
6624	.may_unmap = `1`,
6625	.may_swap = `1`,
6626	};
6627
6628	/*
6629	* scan_control uses s8 fields for order, priority, and reclaim_idx.
6630	* Confirm they are large enough for max values.
6631	*/
6632	BUILD_BUG_ON(MAX_PAGE_ORDER >= S8_MAX);
6633	BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
6634	BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
6635
6636	/*
6637	* Do not enter reclaim if fatal signal was delivered while throttled.
6638	* 1 is returned so that the page allocator does not OOM kill at this
6639	* point.
6640	*/
6641	if (throttle_direct_reclaim(gfp_mask: sc.gfp_mask, zonelist, nodemask))
6642	return `1`;
6643
6644	set_task_reclaim_state(current, rs: &sc.reclaim_state);
6645	trace_mm_vmscan_direct_reclaim_begin(order, gfp_flags: sc.gfp_mask);
6646
6647	nr_reclaimed = do_try_to_free_pages(zonelist, sc: &sc);
6648
6649	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
6650	set_task_reclaim_state(current, NULL);
6651
6652	return nr_reclaimed;
6653	}
6654
6655	#ifdef CONFIG_MEMCG
6656
6657	/ Only used by soft limit reclaim. Do not reuse for anything else. /
6658	unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
6659	gfp_t gfp_mask, bool noswap,
6660	pg_data_t *pgdat,
6661	unsigned long *nr_scanned)
6662	{
6663	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
6664	struct scan_control sc = {
6665	.nr_to_reclaim = SWAP_CLUSTER_MAX,
6666	.target_mem_cgroup = memcg,
6667	.may_writepage = !laptop_mode,
6668	.may_unmap = `1`,
6669	.reclaim_idx = MAX_NR_ZONES - `1`,
6670	.may_swap = !noswap,
6671	};
6672
6673	WARN_ON_ONCE(!current->reclaim_state);
6674
6675	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) \|
6676	(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
6677
6678	trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
6679	sc.gfp_mask);
6680
6681	/*
6682	* NOTE: Although we can get the priority field, using it
6683	* here is not a good idea, since it limits the pages we can scan.
6684	* if we don't reclaim here, the shrink_node from balance_pgdat
6685	* will pick up pages from other mem cgroup's as well. We hack
6686	* the priority and make it zero.
6687	*/
6688	shrink_lruvec(lruvec, &sc);
6689
6690	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
6691
6692	*nr_scanned = sc.nr_scanned;
6693
6694	return sc.nr_reclaimed;
6695	}
6696
6697	unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
6698	unsigned long nr_pages,
6699	gfp_t gfp_mask,
6700	unsigned int reclaim_options,
6701	int *swappiness)
6702	{
6703	unsigned long nr_reclaimed;
6704	unsigned int noreclaim_flag;
6705	struct scan_control sc = {
6706	.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
6707	.proactive_swappiness = swappiness,
6708	.gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) \|
6709	(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
6710	.reclaim_idx = MAX_NR_ZONES - `1`,
6711	.target_mem_cgroup = memcg,
6712	.priority = DEF_PRIORITY,
6713	.may_writepage = !laptop_mode,
6714	.may_unmap = `1`,
6715	.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
6716	.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
6717	};
6718	/*
6719	* Traverse the ZONELIST_FALLBACK zonelist of the current node to put
6720	* equal pressure on all the nodes. This is based on the assumption that
6721	* the reclaim does not bail out early.
6722	*/
6723	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
6724
6725	set_task_reclaim_state(current, &sc.reclaim_state);
6726	trace_mm_vmscan_memcg_reclaim_begin(`0`, sc.gfp_mask);
6727	noreclaim_flag = memalloc_noreclaim_save();
6728
6729	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
6730
6731	memalloc_noreclaim_restore(noreclaim_flag);
6732	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
6733	set_task_reclaim_state(current, NULL);
6734
6735	return nr_reclaimed;
6736	}
6737	#else
6738	unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
6739	unsigned long nr_pages,
6740	gfp_t gfp_mask,
6741	unsigned int reclaim_options,
6742	int *swappiness)
6743	{
6744	return `0`;
6745	}
6746	#endif
6747
6748	static void kswapd_age_node(struct pglist_data pgdat, struct* scan_control *sc)
6749	{
6750	struct mem_cgroup *memcg;
6751	struct lruvec *lruvec;
6752
6753	if (lru_gen_enabled()) {
6754	lru_gen_age_node(pgdat, sc);
6755	return;
6756	}
6757
6758	lruvec = mem_cgroup_lruvec(NULL, pgdat);
6759	if (!can_age_anon_pages(lruvec, sc))
6760	return;
6761
6762	if (!inactive_is_low(lruvec, inactive_lru: LRU_INACTIVE_ANON))
6763	return;
6764
6765	memcg = mem_cgroup_iter(NULL, NULL, NULL);
6766	do {
6767	lruvec = mem_cgroup_lruvec(memcg, pgdat);
6768	shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
6769	sc, lru: LRU_ACTIVE_ANON);
6770	memcg = mem_cgroup_iter(NULL, prev: memcg, NULL);
6771	} while (memcg);
6772	}
6773
6774	static bool pgdat_watermark_boosted(pg_data_t pgdat, int* highest_zoneidx)
6775	{
6776	int i;
6777	struct zone *zone;
6778
6779	/*
6780	* Check for watermark boosts top-down as the higher zones
6781	* are more likely to be boosted. Both watermarks and boosts
6782	* should not be checked at the same time as reclaim would
6783	* start prematurely when there is no boosting and a lower
6784	* zone is balanced.
6785	*/
6786	for (i = highest_zoneidx; i >= `0`; i--) {
6787	zone = pgdat->node_zones + i;
6788	if (!managed_zone(zone))
6789	continue;
6790
6791	if (zone->watermark_boost)
6792	return true;
6793	}
6794
6795	return false;
6796	}
6797
6798	/*
6799	* Returns true if there is an eligible zone balanced for the request order
6800	* and highest_zoneidx
6801	*/
6802	static bool pgdat_balanced(pg_data_t pgdat, int* order, int highest_zoneidx)
6803	{
6804	int i;
6805	unsigned long mark = -`1`;
6806	struct zone *zone;
6807
6808	/*
6809	* Check watermarks bottom-up as lower zones are more likely to
6810	* meet watermarks.
6811	*/
6812	for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
6813	enum zone_stat_item item;
6814	unsigned long free_pages;
6815
6816	if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
6817	mark = promo_wmark_pages(z: zone);
6818	else
6819	mark = high_wmark_pages(z: zone);
6820
6821	/*
6822	* In defrag_mode, watermarks must be met in whole
6823	* blocks to avoid polluting allocator fallbacks.
6824	*
6825	* However, kswapd usually cannot accomplish this on
6826	* its own and needs kcompactd support. Once it's
6827	* reclaimed a compaction gap, and kswapd_shrink_node
6828	* has dropped order, simply ensure there are enough
6829	* base pages for compaction, wake kcompactd & sleep.
6830	*/
6831	if (defrag_mode && order)
6832	item = NR_FREE_PAGES_BLOCKS;
6833	else
6834	item = NR_FREE_PAGES;
6835
6836	/*
6837	* When there is a high number of CPUs in the system,
6838	* the cumulative error from the vmstat per-cpu cache
6839	* can blur the line between the watermarks. In that
6840	* case, be safe and get an accurate snapshot.
6841	*
6842	* TODO: NR_FREE_PAGES_BLOCKS moves in steps of
6843	* pageblock_nr_pages, while the vmstat pcp threshold
6844	* is limited to 125. On many configurations that
6845	* counter won't actually be per-cpu cached. But keep
6846	* things simple for now; revisit when somebody cares.
6847	*/
6848	free_pages = zone_page_state(zone, item);
6849	if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark)
6850	free_pages = zone_page_state_snapshot(zone, item);
6851
6852	if (__zone_watermark_ok(z: zone, order, mark, highest_zoneidx,
6853	alloc_flags: `0`, free_pages))
6854	return true;
6855	}
6856
6857	/*
6858	* If a node has no managed zone within highest_zoneidx, it does not
6859	* need balancing by definition. This can happen if a zone-restricted
6860	* allocation tries to wake a remote kswapd.
6861	*/
6862	if (mark == -`1`)
6863	return true;
6864
6865	return false;
6866	}
6867
6868	/ Clear pgdat state for congested, dirty or under writeback. /
6869	static void clear_pgdat_congested(pg_data_t *pgdat)
6870	{
6871	struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
6872
6873	clear_bit(nr: LRUVEC_NODE_CONGESTED, addr: &lruvec->flags);
6874	clear_bit(nr: LRUVEC_CGROUP_CONGESTED, addr: &lruvec->flags);
6875	clear_bit(nr: PGDAT_DIRTY, addr: &pgdat->flags);
6876	clear_bit(nr: PGDAT_WRITEBACK, addr: &pgdat->flags);
6877	}
6878
6879	/*
6880	* Prepare kswapd for sleeping. This verifies that there are no processes
6881	* waiting in throttle_direct_reclaim() and that watermarks have been met.
6882	*
6883	* Returns true if kswapd is ready to sleep
6884	*/
6885	static bool prepare_kswapd_sleep(pg_data_t pgdat, int* order,
6886	int highest_zoneidx)
6887	{
6888	/*
6889	* The throttled processes are normally woken up in balance_pgdat() as
6890	* soon as allow_direct_reclaim() is true. But there is a potential
6891	* race between when kswapd checks the watermarks and a process gets
6892	* throttled. There is also a potential race if processes get
6893	* throttled, kswapd wakes, a large process exits thereby balancing the
6894	* zones, which causes kswapd to exit balance_pgdat() before reaching
6895	* the wake up checks. If kswapd is going to sleep, no process should
6896	* be sleeping on pfmemalloc_wait, so wake them now if necessary. If
6897	* the wake up is premature, processes will wake kswapd and get
6898	* throttled again. The difference from wake ups in balance_pgdat() is
6899	* that here we are under prepare_to_wait().
6900	*/
6901	if (waitqueue_active(wq_head: &pgdat->pfmemalloc_wait))
6902	wake_up_all(&pgdat->pfmemalloc_wait);
6903
6904	/ Hopeless node, leave it to direct reclaim /
6905	if (atomic_read(v: &pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
6906	return true;
6907
6908	if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
6909	clear_pgdat_congested(pgdat);
6910	return true;
6911	}
6912
6913	return false;
6914	}
6915
6916	/*
6917	* kswapd shrinks a node of pages that are at or below the highest usable
6918	* zone that is currently unbalanced.
6919	*
6920	* Returns true if kswapd scanned at least the requested number of pages to
6921	* reclaim or if the lack of progress was due to pages under writeback.
6922	* This is used to determine if the scanning priority needs to be raised.
6923	*/
6924	static bool kswapd_shrink_node(pg_data_t *pgdat,
6925	struct scan_control *sc)
6926	{
6927	struct zone *zone;
6928	int z;
6929	unsigned long nr_reclaimed = sc->nr_reclaimed;
6930
6931	/ Reclaim a number of pages proportional to the number of zones /
6932	sc->nr_to_reclaim = `0`;
6933	for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) {
6934	sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
6935	}
6936
6937	/*
6938	* Historically care was taken to put equal pressure on all zones but
6939	* now pressure is applied based on node LRU order.
6940	*/
6941	shrink_node(pgdat, sc);
6942
6943	/*
6944	* Fragmentation may mean that the system cannot be rebalanced for
6945	* high-order allocations. If twice the allocation size has been
6946	* reclaimed then recheck watermarks only at order-0 to prevent
6947	* excessive reclaim. Assume that a process requested a high-order
6948	* can direct reclaim/compact.
6949	*/
6950	if (sc->order && sc->nr_reclaimed >= compact_gap(order: sc->order))
6951	sc->order = `0`;
6952
6953	/ account for progress from mm_account_reclaimed_pages() /
6954	return max(sc->nr_scanned, sc->nr_reclaimed - nr_reclaimed) >= sc->nr_to_reclaim;
6955	}
6956
6957	/ Page allocator PCP high watermark is lowered if reclaim is active. /
6958	static inline void
6959	update_reclaim_active(pg_data_t pgdat, int* highest_zoneidx, bool active)
6960	{
6961	int i;
6962	struct zone *zone;
6963
6964	for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
6965	if (active)
6966	set_bit(nr: ZONE_RECLAIM_ACTIVE, addr: &zone->flags);
6967	else
6968	clear_bit(nr: ZONE_RECLAIM_ACTIVE, addr: &zone->flags);
6969	}
6970	}
6971
6972	static inline void
6973	set_reclaim_active(pg_data_t pgdat, int* highest_zoneidx)
6974	{
6975	update_reclaim_active(pgdat, highest_zoneidx, active: true);
6976	}
6977
6978	static inline void
6979	clear_reclaim_active(pg_data_t pgdat, int* highest_zoneidx)
6980	{
6981	update_reclaim_active(pgdat, highest_zoneidx, active: false);
6982	}
6983
6984	/*
6985	* For kswapd, balance_pgdat() will reclaim pages across a node from zones
6986	* that are eligible for use by the caller until at least one zone is
6987	* balanced.
6988	*
6989	* Returns the order kswapd finished reclaiming at.
6990	*
6991	* kswapd scans the zones in the highmem->normal->dma direction. It skips
6992	* zones which have free_pages > high_wmark_pages(zone), but once a zone is
6993	* found to have free_pages <= high_wmark_pages(zone), any page in that zone
6994	* or lower is eligible for reclaim until at least one usable zone is
6995	* balanced.
6996	*/
6997	static int balance_pgdat(pg_data_t pgdat, int* order, int highest_zoneidx)
6998	{
6999	int i;
7000	unsigned long nr_soft_reclaimed;
7001	unsigned long nr_soft_scanned;
7002	unsigned long pflags;
7003	unsigned long nr_boost_reclaim;
7004	unsigned long zone_boosts[MAX_NR_ZONES] = { `0`, };
7005	bool boosted;
7006	struct zone *zone;
7007	struct scan_control sc = {
7008	.gfp_mask = GFP_KERNEL,
7009	.order = order,
7010	.may_unmap = `1`,
7011	};
7012
7013	set_task_reclaim_state(current, rs: &sc.reclaim_state);
7014	psi_memstall_enter(flags: &pflags);
7015	__fs_reclaim_acquire(_THIS_IP_);
7016
7017	count_vm_event(item: PAGEOUTRUN);
7018
7019	/*
7020	* Account for the reclaim boost. Note that the zone boost is left in
7021	* place so that parallel allocations that are near the watermark will
7022	* stall or direct reclaim until kswapd is finished.
7023	*/
7024	nr_boost_reclaim = `0`;
7025	for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
7026	nr_boost_reclaim += zone->watermark_boost;
7027	zone_boosts[i] = zone->watermark_boost;
7028	}
7029	boosted = nr_boost_reclaim;
7030
7031	restart:
7032	set_reclaim_active(pgdat, highest_zoneidx);
7033	sc.priority = DEF_PRIORITY;
7034	do {
7035	unsigned long nr_reclaimed = sc.nr_reclaimed;
7036	bool raise_priority = true;
7037	bool balanced;
7038	bool ret;
7039	bool was_frozen;
7040
7041	sc.reclaim_idx = highest_zoneidx;
7042
7043	/*
7044	* If the number of buffer_heads exceeds the maximum allowed
7045	* then consider reclaiming from all zones. This has a dual
7046	* purpose -- on 64-bit systems it is expected that
7047	* buffer_heads are stripped during active rotation. On 32-bit
7048	* systems, highmem pages can pin lowmem memory and shrinking
7049	* buffers can relieve lowmem pressure. Reclaim may still not
7050	* go ahead if all eligible zones for the original allocation
7051	* request are balanced to avoid excessive reclaim from kswapd.
7052	*/
7053	if (buffer_heads_over_limit) {
7054	for (i = MAX_NR_ZONES - `1`; i >= `0`; i--) {
7055	zone = pgdat->node_zones + i;
7056	if (!managed_zone(zone))
7057	continue;
7058
7059	sc.reclaim_idx = i;
7060	break;
7061	}
7062	}
7063
7064	/*
7065	* If the pgdat is imbalanced then ignore boosting and preserve
7066	* the watermarks for a later time and restart. Note that the
7067	* zone watermarks will be still reset at the end of balancing
7068	* on the grounds that the normal reclaim should be enough to
7069	* re-evaluate if boosting is required when kswapd next wakes.
7070	*/
7071	balanced = pgdat_balanced(pgdat, order: sc.order, highest_zoneidx);
7072	if (!balanced && nr_boost_reclaim) {
7073	nr_boost_reclaim = `0`;
7074	goto restart;
7075	}
7076
7077	/*
7078	* If boosting is not active then only reclaim if there are no
7079	* eligible zones. Note that sc.reclaim_idx is not used as
7080	* buffer_heads_over_limit may have adjusted it.
7081	*/
7082	if (!nr_boost_reclaim && balanced)
7083	goto out;
7084
7085	/ Limit the priority of boosting to avoid reclaim writeback /
7086	if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - `2`)
7087	raise_priority = false;
7088
7089	/*
7090	* Do not writeback or swap pages for boosted reclaim. The
7091	* intent is to relieve pressure not issue sub-optimal IO
7092	* from reclaim context. If no pages are reclaimed, the
7093	* reclaim will be aborted.
7094	*/
7095	sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
7096	sc.may_swap = !nr_boost_reclaim;
7097
7098	/*
7099	* Do some background aging, to give pages a chance to be
7100	* referenced before reclaiming. All pages are rotated
7101	* regardless of classzone as this is about consistent aging.
7102	*/
7103	kswapd_age_node(pgdat, sc: &sc);
7104
7105	/*
7106	* If we're getting trouble reclaiming, start doing writepage
7107	* even in laptop mode.
7108	*/
7109	if (sc.priority < DEF_PRIORITY - `2`)
7110	sc.may_writepage = `1`;
7111
7112	/ Call soft limit reclaim before calling shrink_node. /
7113	sc.nr_scanned = `0`;
7114	nr_soft_scanned = `0`;
7115	nr_soft_reclaimed = memcg1_soft_limit_reclaim(pgdat, order: sc.order,
7116	gfp_mask: sc.gfp_mask, total_scanned: &nr_soft_scanned);
7117	sc.nr_reclaimed += nr_soft_reclaimed;
7118
7119	/*
7120	* There should be no need to raise the scanning priority if
7121	* enough pages are already being scanned that that high
7122	* watermark would be met at 100% efficiency.
7123	*/
7124	if (kswapd_shrink_node(pgdat, sc: &sc))
7125	raise_priority = false;
7126
7127	/*
7128	* If the low watermark is met there is no need for processes
7129	* to be throttled on pfmemalloc_wait as they should not be
7130	* able to safely make forward progress. Wake them
7131	*/
7132	if (waitqueue_active(wq_head: &pgdat->pfmemalloc_wait) &&
7133	allow_direct_reclaim(pgdat))
7134	wake_up_all(&pgdat->pfmemalloc_wait);
7135
7136	/ Check if kswapd should be suspending /
7137	__fs_reclaim_release(_THIS_IP_);
7138	ret = kthread_freezable_should_stop(was_frozen: &was_frozen);
7139	__fs_reclaim_acquire(_THIS_IP_);
7140	if (was_frozen \|\| ret)
7141	break;
7142
7143	/*
7144	* Raise priority if scanning rate is too low or there was no
7145	* progress in reclaiming pages
7146	*/
7147	nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
7148	nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
7149
7150	/*
7151	* If reclaim made no progress for a boost, stop reclaim as
7152	* IO cannot be queued and it could be an infinite loop in
7153	* extreme circumstances.
7154	*/
7155	if (nr_boost_reclaim && !nr_reclaimed)
7156	break;
7157
7158	if (raise_priority \|\| !nr_reclaimed)
7159	sc.priority--;
7160	} while (sc.priority >= `1`);
7161
7162	/*
7163	* Restart only if it went through the priority loop all the way,
7164	* but cache_trim_mode didn't work.
7165	*/
7166	if (!sc.nr_reclaimed && sc.priority < `1` &&
7167	!sc.no_cache_trim_mode && sc.cache_trim_mode_failed) {
7168	sc.no_cache_trim_mode = `1`;
7169	goto restart;
7170	}
7171
7172	if (!sc.nr_reclaimed)
7173	atomic_inc(v: &pgdat->kswapd_failures);
7174
7175	out:
7176	clear_reclaim_active(pgdat, highest_zoneidx);
7177
7178	/ If reclaim was boosted, account for the reclaim done in this pass /
7179	if (boosted) {
7180	unsigned long flags;
7181
7182	for (i = `0`; i <= highest_zoneidx; i++) {
7183	if (!zone_boosts[i])
7184	continue;
7185
7186	/ Increments are under the zone lock /
7187	zone = pgdat->node_zones + i;
7188	spin_lock_irqsave(&zone->lock, flags);
7189	zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
7190	spin_unlock_irqrestore(lock: &zone->lock, flags);
7191	}
7192
7193	/*
7194	* As there is now likely space, wakeup kcompact to defragment
7195	* pageblocks.
7196	*/
7197	wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
7198	}
7199
7200	snapshot_refaults(NULL, pgdat);
7201	__fs_reclaim_release(_THIS_IP_);
7202	psi_memstall_leave(flags: &pflags);
7203	set_task_reclaim_state(current, NULL);
7204
7205	/*
7206	* Return the order kswapd stopped reclaiming at as
7207	* prepare_kswapd_sleep() takes it into account. If another caller
7208	* entered the allocator slow path while kswapd was awake, order will
7209	* remain at the higher level.
7210	*/
7211	return sc.order;
7212	}
7213
7214	/*
7215	* The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
7216	* be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is
7217	* not a valid index then either kswapd runs for first time or kswapd couldn't
7218	* sleep after previous reclaim attempt (node is still unbalanced). In that
7219	* case return the zone index of the previous kswapd reclaim cycle.
7220	*/
7221	static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
7222	enum zone_type prev_highest_zoneidx)
7223	{
7224	enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
7225
7226	return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
7227	}
7228
7229	static void kswapd_try_to_sleep(pg_data_t pgdat, int* alloc_order, int reclaim_order,
7230	unsigned int highest_zoneidx)
7231	{
7232	long remaining = `0`;
7233	DEFINE_WAIT(wait);
7234
7235	if (freezing(current) \|\| kthread_should_stop())
7236	return;
7237
7238	prepare_to_wait(wq_head: &pgdat->kswapd_wait, wq_entry: &wait, TASK_INTERRUPTIBLE);
7239
7240	/*
7241	* Try to sleep for a short interval. Note that kcompactd will only be
7242	* woken if it is possible to sleep for a short interval. This is
7243	* deliberate on the assumption that if reclaim cannot keep an
7244	* eligible zone balanced that it's also unlikely that compaction will
7245	* succeed.
7246	*/
7247	if (prepare_kswapd_sleep(pgdat, order: reclaim_order, highest_zoneidx)) {
7248	/*
7249	* Compaction records what page blocks it recently failed to
7250	* isolate pages from and skips them in the future scanning.
7251	* When kswapd is going to sleep, it is reasonable to assume
7252	* that pages and compaction may succeed so reset the cache.
7253	*/
7254	reset_isolation_suitable(pgdat);
7255
7256	/*
7257	* We have freed the memory, now we should compact it to make
7258	* allocation of the requested order possible.
7259	*/
7260	wakeup_kcompactd(pgdat, order: alloc_order, highest_zoneidx);
7261
7262	remaining = schedule_timeout(HZ/`10`);
7263
7264	/*
7265	* If woken prematurely then reset kswapd_highest_zoneidx and
7266	* order. The values will either be from a wakeup request or
7267	* the previous request that slept prematurely.
7268	*/
7269	if (remaining) {
7270	WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
7271	kswapd_highest_zoneidx(pgdat,
7272	highest_zoneidx));
7273
7274	if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
7275	WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
7276	}
7277
7278	finish_wait(wq_head: &pgdat->kswapd_wait, wq_entry: &wait);
7279	prepare_to_wait(wq_head: &pgdat->kswapd_wait, wq_entry: &wait, TASK_INTERRUPTIBLE);
7280	}
7281
7282	/*
7283	* After a short sleep, check if it was a premature sleep. If not, then
7284	* go fully to sleep until explicitly woken up.
7285	*/
7286	if (!remaining &&
7287	prepare_kswapd_sleep(pgdat, order: reclaim_order, highest_zoneidx)) {
7288	trace_mm_vmscan_kswapd_sleep(nid: pgdat->node_id);
7289
7290	/*
7291	* vmstat counters are not perfectly accurate and the estimated
7292	* value for counters such as NR_FREE_PAGES can deviate from the
7293	* true value by nr_online_cpus * threshold. To avoid the zone
7294	* watermarks being breached while under pressure, we reduce the
7295	* per-cpu vmstat threshold while kswapd is awake and restore
7296	* them before going back to sleep.
7297	*/
7298	set_pgdat_percpu_threshold(pgdat, calculate_pressure: calculate_normal_threshold);
7299
7300	if (!kthread_should_stop())
7301	schedule();
7302
7303	set_pgdat_percpu_threshold(pgdat, calculate_pressure: calculate_pressure_threshold);
7304	} else {
7305	if (remaining)
7306	count_vm_event(item: KSWAPD_LOW_WMARK_HIT_QUICKLY);
7307	else
7308	count_vm_event(item: KSWAPD_HIGH_WMARK_HIT_QUICKLY);
7309	}
7310	finish_wait(wq_head: &pgdat->kswapd_wait, wq_entry: &wait);
7311	}
7312
7313	/*
7314	* The background pageout daemon, started as a kernel thread
7315	* from the init process.
7316	*
7317	* This basically trickles out pages so that we have _some_
7318	* free memory available even if there is no other activity
7319	* that frees anything up. This is needed for things like routing
7320	* etc, where we otherwise might have all activity going on in
7321	* asynchronous contexts that cannot page things out.
7322	*
7323	* If there are applications that are active memory-allocators
7324	* (most normal use), this basically shouldn't matter.
7325	*/
7326	static int kswapd(void *p)
7327	{
7328	unsigned int alloc_order, reclaim_order;
7329	unsigned int highest_zoneidx = MAX_NR_ZONES - `1`;
7330	pg_data_t pgdat = (pg_data_t )p;
7331	struct task_struct *tsk = current;
7332
7333	/*
7334	* Tell the memory management that we're a "memory allocator",
7335	* and that if we need more memory we should get access to it
7336	* regardless (see "__alloc_pages()"). "kswapd" should
7337	* never get caught in the normal page freeing logic.
7338	*
7339	* (Kswapd normally doesn't need memory anyway, but sometimes
7340	* you need a small amount of memory in order to be able to
7341	* page out something else, and this flag essentially protects
7342	* us from recursively trying to free more memory as we're
7343	* trying to free the first piece of memory in the first place).
7344	*/
7345	tsk->flags \|= PF_MEMALLOC \| PF_KSWAPD;
7346	set_freezable();
7347
7348	WRITE_ONCE(pgdat->kswapd_order, `0`);
7349	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
7350	atomic_set(v: &pgdat->nr_writeback_throttled, i: `0`);
7351	for ( ; ; ) {
7352	bool was_frozen;
7353
7354	alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
7355	highest_zoneidx = kswapd_highest_zoneidx(pgdat,
7356	prev_highest_zoneidx: highest_zoneidx);
7357
7358	kswapd_try_sleep:
7359	kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
7360	highest_zoneidx);
7361
7362	/ Read the new order and highest_zoneidx /
7363	alloc_order = READ_ONCE(pgdat->kswapd_order);
7364	highest_zoneidx = kswapd_highest_zoneidx(pgdat,
7365	prev_highest_zoneidx: highest_zoneidx);
7366	WRITE_ONCE(pgdat->kswapd_order, `0`);
7367	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
7368
7369	if (kthread_freezable_should_stop(was_frozen: &was_frozen))
7370	break;
7371
7372	/*
7373	* We can speed up thawing tasks if we don't call balance_pgdat
7374	* after returning from the refrigerator
7375	*/
7376	if (was_frozen)
7377	continue;
7378
7379	/*
7380	* Reclaim begins at the requested order but if a high-order
7381	* reclaim fails then kswapd falls back to reclaiming for
7382	* order-0. If that happens, kswapd will consider sleeping
7383	* for the order it finished reclaiming at (reclaim_order)
7384	* but kcompactd is woken to compact for the original
7385	* request (alloc_order).
7386	*/
7387	trace_mm_vmscan_kswapd_wake(nid: pgdat->node_id, zid: highest_zoneidx,
7388	order: alloc_order);
7389	reclaim_order = balance_pgdat(pgdat, order: alloc_order,
7390	highest_zoneidx);
7391	if (reclaim_order < alloc_order)
7392	goto kswapd_try_sleep;
7393	}
7394
7395	tsk->flags &= ~(PF_MEMALLOC \| PF_KSWAPD);
7396
7397	return `0`;
7398	}
7399
7400	/*
7401	* A zone is low on free memory or too fragmented for high-order memory. If
7402	* kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
7403	* pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim
7404	* has failed or is not needed, still wake up kcompactd if only compaction is
7405	* needed.
7406	*/
7407	void wakeup_kswapd(struct zone zone, gfp_t gfp_flags, int* order,
7408	enum zone_type highest_zoneidx)
7409	{
7410	pg_data_t *pgdat;
7411	enum zone_type curr_idx;
7412
7413	if (!managed_zone(zone))
7414	return;
7415
7416	if (!cpuset_zone_allowed(z: zone, gfp_mask: gfp_flags))
7417	return;
7418
7419	pgdat = zone->zone_pgdat;
7420	curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
7421
7422	if (curr_idx == MAX_NR_ZONES \|\| curr_idx < highest_zoneidx)
7423	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
7424
7425	if (READ_ONCE(pgdat->kswapd_order) < order)
7426	WRITE_ONCE(pgdat->kswapd_order, order);
7427
7428	if (!waitqueue_active(wq_head: &pgdat->kswapd_wait))
7429	return;
7430
7431	/ Hopeless node, leave it to direct reclaim if possible /
7432	if (atomic_read(v: &pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES \|\|
7433	(pgdat_balanced(pgdat, order, highest_zoneidx) &&
7434	!pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
7435	/*
7436	* There may be plenty of free memory available, but it's too
7437	* fragmented for high-order allocations. Wake up kcompactd
7438	* and rely on compaction_suitable() to determine if it's
7439	* needed. If it fails, it will defer subsequent attempts to
7440	* ratelimit its work.
7441	*/
7442	if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
7443	wakeup_kcompactd(pgdat, order, highest_zoneidx);
7444	return;
7445	}
7446
7447	trace_mm_vmscan_wakeup_kswapd(nid: pgdat->node_id, zid: highest_zoneidx, order,
7448	gfp_flags);
7449	wake_up_interruptible(&pgdat->kswapd_wait);
7450	}
7451
7452	#ifdef CONFIG_HIBERNATION
7453	/*
7454	* Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
7455	* freed pages.
7456	*
7457	* Rather than trying to age LRUs the aim is to preserve the overall
7458	* LRU order by reclaiming preferentially
7459	* inactive > active > active referenced > active mapped
7460	*/
7461	unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
7462	{
7463	struct scan_control sc = {
7464	.nr_to_reclaim = nr_to_reclaim,
7465	.gfp_mask = GFP_HIGHUSER_MOVABLE,
7466	.reclaim_idx = MAX_NR_ZONES - `1`,
7467	.priority = DEF_PRIORITY,
7468	.may_writepage = `1`,
7469	.may_unmap = `1`,
7470	.may_swap = `1`,
7471	.hibernation_mode = `1`,
7472	};
7473	struct zonelist *zonelist = node_zonelist(nid: numa_node_id(), flags: sc.gfp_mask);
7474	unsigned long nr_reclaimed;
7475	unsigned int noreclaim_flag;
7476
7477	fs_reclaim_acquire(gfp_mask: sc.gfp_mask);
7478	noreclaim_flag = memalloc_noreclaim_save();
7479	set_task_reclaim_state(current, rs: &sc.reclaim_state);
7480
7481	nr_reclaimed = do_try_to_free_pages(zonelist, sc: &sc);
7482
7483	set_task_reclaim_state(current, NULL);
7484	memalloc_noreclaim_restore(flags: noreclaim_flag);
7485	fs_reclaim_release(gfp_mask: sc.gfp_mask);
7486
7487	return nr_reclaimed;
7488	}
7489	#endif /* CONFIG_HIBERNATION */
7490
7491	/*
7492	* This kswapd start function will be called by init and node-hot-add.
7493	*/
7494	void __meminit kswapd_run(int nid)
7495	{
7496	pg_data_t *pgdat = NODE_DATA(nid);
7497
7498	pgdat_kswapd_lock(pgdat);
7499	if (!pgdat->kswapd) {
7500	pgdat->kswapd = kthread_create_on_node(threadfn: kswapd, data: pgdat, node: nid, namefmt: "kswapd%d", nid);
7501	if (IS_ERR(ptr: pgdat->kswapd)) {
7502	/ failure at boot is fatal /
7503	pr_err("Failed to start kswapd on node %d，ret=%ld\n",
7504	nid, PTR_ERR(pgdat->kswapd));
7505	BUG_ON(system_state < SYSTEM_RUNNING);
7506	pgdat->kswapd = NULL;
7507	} else {
7508	wake_up_process(tsk: pgdat->kswapd);
7509	}
7510	}
7511	pgdat_kswapd_unlock(pgdat);
7512	}
7513
7514	/*
7515	* Called by memory hotplug when all memory in a node is offlined. Caller must
7516	* be holding mem_hotplug_begin/done().
7517	*/
7518	void __meminit kswapd_stop(int nid)
7519	{
7520	pg_data_t *pgdat = NODE_DATA(nid);
7521	struct task_struct *kswapd;
7522
7523	pgdat_kswapd_lock(pgdat);
7524	kswapd = pgdat->kswapd;
7525	if (kswapd) {
7526	kthread_stop(k: kswapd);
7527	pgdat->kswapd = NULL;
7528	}
7529	pgdat_kswapd_unlock(pgdat);
7530	}
7531
7532	static const struct ctl_table vmscan_sysctl_table[] = {
7533	{
7534	.procname = "swappiness",
7535	.data = &vm_swappiness,
7536	.maxlen = sizeof(vm_swappiness),
7537	.mode = `0644`,
7538	.proc_handler = proc_dointvec_minmax,
7539	.extra1 = SYSCTL_ZERO,
7540	.extra2 = SYSCTL_TWO_HUNDRED,
7541	},
7542	#ifdef CONFIG_NUMA
7543	{
7544	.procname = "zone_reclaim_mode",
7545	.data = &node_reclaim_mode,
7546	.maxlen = sizeof(node_reclaim_mode),
7547	.mode = `0644`,
7548	.proc_handler = proc_dointvec_minmax,
7549	.extra1 = SYSCTL_ZERO,
7550	}
7551	#endif
7552	};
7553
7554	static int __init kswapd_init(void)
7555	{
7556	int nid;
7557
7558	swap_setup();
7559	for_each_node_state(nid, N_MEMORY)
7560	kswapd_run(nid);
7561	register_sysctl_init("vm", vmscan_sysctl_table);
7562	return `0`;
7563	}
7564
7565	module_init(kswapd_init)
7566
7567	#ifdef CONFIG_NUMA
7568	/*
7569	* Node reclaim mode
7570	*
7571	* If non-zero call node_reclaim when the number of free pages falls below
7572	* the watermarks.
7573	*/
7574	int node_reclaim_mode __read_mostly;
7575
7576	/*
7577	* Priority for NODE_RECLAIM. This determines the fraction of pages
7578	* of a node considered for each zone_reclaim. 4 scans 1/16th of
7579	* a zone.
7580	*/
7581	#define NODE_RECLAIM_PRIORITY 4
7582
7583	/*
7584	* Percentage of pages in a zone that must be unmapped for node_reclaim to
7585	* occur.
7586	*/
7587	int sysctl_min_unmapped_ratio = `1`;
7588
7589	/*
7590	* If the number of slab pages in a zone grows beyond this percentage then
7591	* slab reclaim needs to occur.
7592	*/
7593	int sysctl_min_slab_ratio = `5`;
7594
7595	static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
7596	{
7597	unsigned long file_mapped = node_page_state(pgdat, item: NR_FILE_MAPPED);
7598	unsigned long file_lru = node_page_state(pgdat, item: NR_INACTIVE_FILE) +
7599	node_page_state(pgdat, item: NR_ACTIVE_FILE);
7600
7601	/*
7602	* It's possible for there to be more file mapped pages than
7603	* accounted for by the pages on the file LRU lists because
7604	* tmpfs pages accounted for as ANON can also be FILE_MAPPED
7605	*/
7606	return (file_lru > file_mapped) ? (file_lru - file_mapped) : `0`;
7607	}
7608
7609	/ Work out how many page cache pages we can reclaim in this reclaim_mode /
7610	static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
7611	{
7612	unsigned long nr_pagecache_reclaimable;
7613	unsigned long delta = `0`;
7614
7615	/*
7616	* If RECLAIM_UNMAP is set, then all file pages are considered
7617	* potentially reclaimable. Otherwise, we have to worry about
7618	* pages like swapcache and node_unmapped_file_pages() provides
7619	* a better estimate
7620	*/
7621	if (node_reclaim_mode & RECLAIM_UNMAP)
7622	nr_pagecache_reclaimable = node_page_state(pgdat, item: NR_FILE_PAGES);
7623	else
7624	nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
7625
7626	/ If we can't clean pages, remove dirty pages from consideration /
7627	if (!(node_reclaim_mode & RECLAIM_WRITE))
7628	delta += node_page_state(pgdat, item: NR_FILE_DIRTY);
7629
7630	/ Watch for any possible underflows due to delta /
7631	if (unlikely(delta > nr_pagecache_reclaimable))
7632	delta = nr_pagecache_reclaimable;
7633
7634	return nr_pagecache_reclaimable - delta;
7635	}
7636
7637	/*
7638	* Try to free up some pages from this node through reclaim.
7639	*/
7640	static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask,
7641	unsigned long nr_pages,
7642	struct scan_control *sc)
7643	{
7644	struct task_struct *p = current;
7645	unsigned int noreclaim_flag;
7646	unsigned long pflags;
7647
7648	trace_mm_vmscan_node_reclaim_begin(nid: pgdat->node_id, order: sc->order,
7649	gfp_flags: sc->gfp_mask);
7650
7651	cond_resched();
7652	psi_memstall_enter(flags: &pflags);
7653	delayacct_freepages_start();
7654	fs_reclaim_acquire(gfp_mask: sc->gfp_mask);
7655	/*
7656	* We need to be able to allocate from the reserves for RECLAIM_UNMAP
7657	*/
7658	noreclaim_flag = memalloc_noreclaim_save();
7659	set_task_reclaim_state(task: p, rs: &sc->reclaim_state);
7660
7661	if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages \|\|
7662	node_page_state_pages(pgdat, item: NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) {
7663	/*
7664	* Free memory by calling shrink node with increasing
7665	* priorities until we have enough memory freed.
7666	*/
7667	do {
7668	shrink_node(pgdat, sc);
7669	} while (sc->nr_reclaimed < nr_pages && --sc->priority >= `0`);
7670	}
7671
7672	set_task_reclaim_state(task: p, NULL);
7673	memalloc_noreclaim_restore(flags: noreclaim_flag);
7674	fs_reclaim_release(gfp_mask: sc->gfp_mask);
7675	delayacct_freepages_end();
7676	psi_memstall_leave(flags: &pflags);
7677
7678	trace_mm_vmscan_node_reclaim_end(nr_reclaimed: sc->nr_reclaimed);
7679
7680	return sc->nr_reclaimed;
7681	}
7682
7683	int node_reclaim(struct pglist_data pgdat, gfp_t gfp_mask, unsigned* int order)
7684	{
7685	int ret;
7686	/ Minimum pages needed in order to stay on node /
7687	const unsigned long nr_pages = `1` << order;
7688	struct scan_control sc = {
7689	.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
7690	.gfp_mask = current_gfp_context(flags: gfp_mask),
7691	.order = order,
7692	.priority = NODE_RECLAIM_PRIORITY,
7693	.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
7694	.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
7695	.may_swap = `1`,
7696	.reclaim_idx = gfp_zone(flags: gfp_mask),
7697	};
7698
7699	/*
7700	* Node reclaim reclaims unmapped file backed pages and
7701	* slab pages if we are over the defined limits.
7702	*
7703	* A small portion of unmapped file backed pages is needed for
7704	* file I/O otherwise pages read by file I/O will be immediately
7705	* thrown out if the node is overallocated. So we do not reclaim
7706	* if less than a specified percentage of the node is used by
7707	* unmapped file backed pages.
7708	*/
7709	if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
7710	node_page_state_pages(pgdat, item: NR_SLAB_RECLAIMABLE_B) <=
7711	pgdat->min_slab_pages)
7712	return NODE_RECLAIM_FULL;
7713
7714	/*
7715	* Do not scan if the allocation should not be delayed.
7716	*/
7717	if (!gfpflags_allow_blocking(gfp_flags: gfp_mask) \|\| (current->flags & PF_MEMALLOC))
7718	return NODE_RECLAIM_NOSCAN;
7719
7720	/*
7721	* Only run node reclaim on the local node or on nodes that do not
7722	* have associated processors. This will favor the local processor
7723	* over remote processors and spread off node memory allocations
7724	* as wide as possible.
7725	*/
7726	if (node_state(node: pgdat->node_id, state: N_CPU) && pgdat->node_id != numa_node_id())
7727	return NODE_RECLAIM_NOSCAN;
7728
7729	if (test_and_set_bit_lock(nr: PGDAT_RECLAIM_LOCKED, addr: &pgdat->flags))
7730	return NODE_RECLAIM_NOSCAN;
7731
7732	ret = __node_reclaim(pgdat, gfp_mask, nr_pages, sc: &sc) >= nr_pages;
7733	clear_bit_unlock(nr: PGDAT_RECLAIM_LOCKED, addr: &pgdat->flags);
7734
7735	if (ret)
7736	count_vm_event(item: PGSCAN_ZONE_RECLAIM_SUCCESS);
7737	else
7738	count_vm_event(item: PGSCAN_ZONE_RECLAIM_FAILED);
7739
7740	return ret;
7741	}
7742
7743	enum {
7744	MEMORY_RECLAIM_SWAPPINESS = `0`,
7745	MEMORY_RECLAIM_SWAPPINESS_MAX,
7746	MEMORY_RECLAIM_NULL,
7747	};
7748	static const match_table_t tokens = {
7749	{ MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"},
7750	{ .token: MEMORY_RECLAIM_SWAPPINESS_MAX, .pattern: "swappiness=max"},
7751	{ .token: MEMORY_RECLAIM_NULL, NULL },
7752	};
7753
7754	int user_proactive_reclaim(char *buf,
7755	struct mem_cgroup memcg, pg_data_t pgdat)
7756	{
7757	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
7758	unsigned long nr_to_reclaim, nr_reclaimed = `0`;
7759	int swappiness = -`1`;
7760	char old_buf, start;
7761	substring_t args[MAX_OPT_ARGS];
7762	gfp_t gfp_mask = GFP_KERNEL;
7763
7764	if (!buf \|\| (!memcg && !pgdat) \|\| (memcg && pgdat))
7765	return -EINVAL;
7766
7767	buf = strstrip(str: buf);
7768
7769	old_buf = buf;
7770	nr_to_reclaim = memparse(ptr: buf, retptr: &buf) / PAGE_SIZE;
7771	if (buf == old_buf)
7772	return -EINVAL;
7773
7774	buf = strstrip(str: buf);
7775
7776	while ((start = strsep(&buf, " ")) != NULL) {
7777	if (!strlen(start))
7778	continue;
7779	switch (match_token(start, table: tokens, args)) {
7780	case MEMORY_RECLAIM_SWAPPINESS:
7781	if (match_int(&args[`0`], result: &swappiness))
7782	return -EINVAL;
7783	if (swappiness < MIN_SWAPPINESS \|\|
7784	swappiness > MAX_SWAPPINESS)
7785	return -EINVAL;
7786	break;
7787	case MEMORY_RECLAIM_SWAPPINESS_MAX:
7788	swappiness = SWAPPINESS_ANON_ONLY;
7789	break;
7790	default:
7791	return -EINVAL;
7792	}
7793	}
7794
7795	while (nr_reclaimed < nr_to_reclaim) {
7796	/ Will converge on zero, but reclaim enforces a minimum /
7797	unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / `4`;
7798	unsigned long reclaimed;
7799
7800	if (signal_pending(current))
7801	return -EINTR;
7802
7803	/*
7804	* This is the final attempt, drain percpu lru caches in the
7805	* hope of introducing more evictable pages.
7806	*/
7807	if (!nr_retries)
7808	lru_add_drain_all();
7809
7810	if (memcg) {
7811	unsigned int reclaim_options;
7812
7813	reclaim_options = MEMCG_RECLAIM_MAY_SWAP \|
7814	MEMCG_RECLAIM_PROACTIVE;
7815	reclaimed = try_to_free_mem_cgroup_pages(memcg,
7816	nr_pages: batch_size, gfp_mask,
7817	reclaim_options,
7818	swappiness: swappiness == -`1` ? NULL : &swappiness);
7819	} else {
7820	struct scan_control sc = {
7821	.gfp_mask = current_gfp_context(flags: gfp_mask),
7822	.reclaim_idx = gfp_zone(flags: gfp_mask),
7823	.proactive_swappiness = swappiness == -`1` ? NULL : &swappiness,
7824	.priority = DEF_PRIORITY,
7825	.may_writepage = !laptop_mode,
7826	.nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX),
7827	.may_unmap = `1`,
7828	.may_swap = `1`,
7829	.proactive = `1`,
7830	};
7831
7832	if (test_and_set_bit_lock(nr: PGDAT_RECLAIM_LOCKED,
7833	addr: &pgdat->flags))
7834	return -EBUSY;
7835
7836	reclaimed = __node_reclaim(pgdat, gfp_mask,
7837	nr_pages: batch_size, sc: &sc);
7838	clear_bit_unlock(nr: PGDAT_RECLAIM_LOCKED, addr: &pgdat->flags);
7839	}
7840
7841	if (!reclaimed && !nr_retries--)
7842	return -EAGAIN;
7843
7844	nr_reclaimed += reclaimed;
7845	}
7846
7847	return `0`;
7848	}
7849
7850	#endif
7851
7852	/**
7853	* check_move_unevictable_folios - Move evictable folios to appropriate zone
7854	* lru list
7855	* @fbatch: Batch of lru folios to check.
7856	*
7857	* Checks folios for evictability, if an evictable folio is in the unevictable
7858	* lru list, moves it to the appropriate evictable lru list. This function
7859	* should be only used for lru folios.
7860	*/
7861	void check_move_unevictable_folios(struct folio_batch *fbatch)
7862	{
7863	struct lruvec *lruvec = NULL;
7864	int pgscanned = `0`;
7865	int pgrescued = `0`;
7866	int i;
7867
7868	for (i = `0`; i < fbatch->nr; i++) {
7869	struct folio *folio = fbatch->folios[i];
7870	int nr_pages = folio_nr_pages(folio);
7871
7872	pgscanned += nr_pages;
7873
7874	/ block memcg migration while the folio moves between lrus /
7875	if (!folio_test_clear_lru(folio))
7876	continue;
7877
7878	lruvec = folio_lruvec_relock_irq(folio, locked_lruvec: lruvec);
7879	if (folio_evictable(folio) && folio_test_unevictable(folio)) {
7880	lruvec_del_folio(lruvec, folio);
7881	folio_clear_unevictable(folio);
7882	lruvec_add_folio(lruvec, folio);
7883	pgrescued += nr_pages;
7884	}
7885	folio_set_lru(folio);
7886	}
7887
7888	if (lruvec) {
7889	__count_vm_events(item: UNEVICTABLE_PGRESCUED, delta: pgrescued);
7890	__count_vm_events(item: UNEVICTABLE_PGSCANNED, delta: pgscanned);
7891	unlock_page_lruvec_irq(lruvec);
7892	} else if (pgscanned) {
7893	count_vm_events(item: UNEVICTABLE_PGSCANNED, delta: pgscanned);
7894	}
7895	}
7896	EXPORT_SYMBOL_GPL(check_move_unevictable_folios);
7897
7898	#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
7899	static ssize_t reclaim_store(struct device *dev,
7900	struct device_attribute *attr,
7901	const char *buf, size_t count)
7902	{
7903	int ret, nid = dev->id;
7904
7905	ret = user_proactive_reclaim(buf: (char *)buf, NULL, NODE_DATA(nid));
7906	return ret ? -EAGAIN : count;
7907	}
7908
7909	static DEVICE_ATTR_WO(reclaim);
7910	int reclaim_register_node(struct node *node)
7911	{
7912	return device_create_file(device: &node->dev, entry: &dev_attr_reclaim);
7913	}
7914
7915	void reclaim_unregister_node(struct node *node)
7916	{
7917	return device_remove_file(dev: &node->dev, attr: &dev_attr_reclaim);
7918	}
7919	#endif
7920

Browse the source code of Linux/mm/vmscan.c