// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/interrupt.h>
#include <linux/jiffies.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
#include <linux/kmsan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/pagevec.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmstat.h>
#include <linux/fault-inject.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <trace/events/oom.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
#include <linux/page_table_check.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/lockdep.h>
#include <linux/psi.h>
#include <linux/khugepaged.h>
#include <linux/delayacct.h>
#include <linux/cacheinfo.h>
#include <linux/pgalloc_tag.h>
#include <asm/div64.h>
#include "internal.h"
#include "shuffle.h"
#include "page_reporting.h"

/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;

/* No special request */
#define FPI_NONE		((__force fpi_t)0)

/*
 * Skip free page reporting notification for the (possibly merged) page.
 * This does not hinder free page reporting from grabbing the page,
 * reporting it and marking it "reported" - it only skips notifying
 * the free page reporting infrastructure about a newly freed page. For
 * example, used when temporarily pulling a page from a freelist and
 * putting it back unmodified.
 */
#define FPI_SKIP_REPORT_NOTIFY	((__force fpi_t)BIT(0))

/*
 * Place the (possibly merged) page to the tail of the freelist. Will ignore
 * page shuffling (relevant code - e.g., memory onlining - is expected to
 * shuffle the whole zone).
 *
 * Note: No code should rely on this flag for correctness - it's purely
 *       to allow for optimizations when handing back either fresh pages
 *       (memory onlining) or untouched pages (page isolation, free page
 *       reporting).
 */
#define FPI_TO_TAIL		((__force fpi_t)BIT(1))

/* Free the page without taking locks. Rely on trylock only. */
#define FPI_TRYLOCK		((__force fpi_t)BIT(2))
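
/*
 * The flags combine. For example, a caller returning an untouched page to
 * the freelist could pass FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL so the page
 * is neither re-reported nor preferred for the next allocation (an
 * illustrative combination, not a statement about any specific caller).
 */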

/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)

#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
/*
 * On SMP, spin_trylock is sufficient protection.
 * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
 */
#define pcp_trylock_prepare(flags)	do { } while (0)
#define pcp_trylock_finish(flag)	do { } while (0)
#else

/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
#define pcp_trylock_prepare(flags)	local_irq_save(flags)
#define pcp_trylock_finish(flags)	local_irq_restore(flags)
#endif
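
/*
 * Sketch of the UP hazard the #else branch closes: on UP without
 * CONFIG_PREEMPT_RT, spinlocks compile away and spin_trylock() always
 * "succeeds", so an IRQ arriving inside a pcp critical section could
 * re-enter it and corrupt the lists. Disabling IRQs around the section
 * rules that out; on SMP/RT the real trylock already provides this.
 */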

/*
 * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
 * a migration causing the wrong PCP to be locked and remote memory being
 * potentially allocated, pin the task to the CPU for the lookup+lock.
 * preempt_disable is used on !RT because it is faster than migrate_disable.
 * migrate_disable is used on RT because otherwise RT spinlock usage is
 * interfered with and a high priority task cannot preempt the allocator.
 */
#ifndef CONFIG_PREEMPT_RT
#define pcpu_task_pin()		preempt_disable()
#define pcpu_task_unpin()	preempt_enable()
#else
#define pcpu_task_pin()		migrate_disable()
#define pcpu_task_unpin()	migrate_enable()
#endif

/*
 * Generic helper to look up and lock a per-cpu variable with an embedded
 * spinlock. The return value should be used with the equivalent unlock
 * helper.
 */
#define pcpu_spin_lock(type, member, ptr)				\
({									\
	type *_ret;							\
	pcpu_task_pin();						\
	_ret = this_cpu_ptr(ptr);					\
	spin_lock(&_ret->member);					\
	_ret;								\
})

#define pcpu_spin_trylock(type, member, ptr)				\
({									\
	type *_ret;							\
	pcpu_task_pin();						\
	_ret = this_cpu_ptr(ptr);					\
	if (!spin_trylock(&_ret->member)) {				\
		pcpu_task_unpin();					\
		_ret = NULL;						\
	}								\
	_ret;								\
})

#define pcpu_spin_unlock(member, ptr)					\
({									\
	spin_unlock(&ptr->member);					\
	pcpu_task_unpin();						\
})

/* struct per_cpu_pages specific helpers. */
#define pcp_spin_lock(ptr)						\
	pcpu_spin_lock(struct per_cpu_pages, lock, ptr)

#define pcp_spin_trylock(ptr)						\
	pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)

#define pcp_spin_unlock(ptr)						\
	pcpu_spin_unlock(lock, ptr)
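
/*
 * Sketch of the canonical pairing (illustrative; the guards nest around the
 * trylock so the UP case stays IRQ-safe):
 *
 *	unsigned long UP_flags;
 *	struct per_cpu_pages *pcp;
 *
 *	pcp_trylock_prepare(UP_flags);
 *	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
 *	if (!pcp) {
 *		pcp_trylock_finish(UP_flags);
 *		return NULL;	// fall back to the buddy path
 *	}
 *	// ... operate on pcp->lists ...
 *	pcp_spin_unlock(pcp);
 *	pcp_trylock_finish(UP_flags);
 */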

#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif

DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
 * defined in <linux/topology.h>.
 */
DEFINE_PER_CPU(int, _numa_mem_);	/* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif

static DEFINE_MUTEX(pcpu_drain_mutex);

#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
EXPORT_SYMBOL(latent_entropy);
#endif

/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
	[N_MEMORY] = { { [0] = 1UL } },
	[N_CPU] = { { [0] = 1UL } },
#endif	/* NUMA */
};
EXPORT_SYMBOL(node_states);

gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;

#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
unsigned int pageblock_order __read_mostly;
#endif

static void __free_pages_ok(struct page *page, unsigned int order,
			    fpi_t fpi_flags);

/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 *
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 * don't need any ZONE_NORMAL reservation
 */
static int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	[ZONE_DMA] = 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	[ZONE_DMA32] = 256,
#endif
	[ZONE_NORMAL] = 32,
#ifdef CONFIG_HIGHMEM
	[ZONE_HIGHMEM] = 0,
#endif
	[ZONE_MOVABLE] = 0,
};
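
/*
 * Worked example of the ratio above: with 784M of ZONE_NORMAL and
 * sysctl_lowmem_reserve_ratio[ZONE_DMA] == 256, an allocation that could
 * have been satisfied from ZONE_NORMAL must leave roughly
 * 784M / 256 ~= 3M of ZONE_DMA free, so GFP_DMA allocations are not
 * starved by ordinary ones (numbers illustrative, taken from the comment
 * above).
 */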

char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	"DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	"DMA32",
#endif
	"Normal",
#ifdef CONFIG_HIGHMEM
	"HighMem",
#endif
	"Movable",
#ifdef CONFIG_ZONE_DEVICE
	"Device",
#endif
};

const char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Movable",
	"Reclaimable",
	"HighAtomic",
#ifdef CONFIG_CMA
	"CMA",
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	"Isolate",
#endif
};

int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
static int watermark_boost_factor __read_mostly = 15000;
static int watermark_scale_factor = 10;
int defrag_mode;

/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);

#if MAX_NUMNODES > 1
unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
unsigned int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif

static bool page_contains_unaccepted(struct page *page, unsigned int order);
static bool cond_accept_memory(struct zone *zone, unsigned int order,
			       int alloc_flags);
static bool __free_unaccepted(struct page *page);

int page_group_by_mobility_disabled __read_mostly;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
 * During boot we initialize deferred pages on-demand, as needed, but once
 * page_alloc_init_late() has finished, the deferred pages are all initialized,
 * and we can permanently disable that path.
 */
DEFINE_STATIC_KEY_TRUE(deferred_pages);

static inline bool deferred_pages_enabled(void)
{
	return static_branch_unlikely(&deferred_pages);
}

/*
 * deferred_grow_zone() is __init, but it is called from
 * get_page_from_freelist() during early boot until deferred_pages permanently
 * disables this call. This is why we have the __ref wrapper: to avoid the
 * section-mismatch warning and to allow the function body to be unloaded.
 */
static bool __ref
_deferred_grow_zone(struct zone *zone, unsigned int order)
{
	return deferred_grow_zone(zone, order);
}
#else
static inline bool deferred_pages_enabled(void)
{
	return false;
}

static inline bool _deferred_grow_zone(struct zone *zone, unsigned int order)
{
	return false;
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

/* Return a pointer to the bitmap storing bits affecting a block of pages */
static inline unsigned long *get_pageblock_bitmap(const struct page *page,
						  unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	return section_to_usemap(__pfn_to_section(pfn));
#else
	return page_zone(page)->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
}

static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	pfn &= (PAGES_PER_SECTION-1);
#else
	pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn);
#endif /* CONFIG_SPARSEMEM */
	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}

static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit)
{
	return pb_bit >= PB_compact_skip && pb_bit < __NR_PAGEBLOCK_BITS;
}

static __always_inline void
get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn,
			   unsigned long **bitmap_word, unsigned long *bitidx)
{
	unsigned long *bitmap;
	unsigned long word_bitidx;

#ifdef CONFIG_MEMORY_ISOLATION
	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 8);
#else
	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
#endif
	BUILD_BUG_ON(__MIGRATE_TYPE_END > MIGRATETYPE_MASK);
	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);

	bitmap = get_pageblock_bitmap(page, pfn);
	*bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = *bitidx / BITS_PER_LONG;
	*bitidx &= (BITS_PER_LONG - 1);
	*bitmap_word = &bitmap[word_bitidx];
}

/**
 * __get_pfnblock_flags_mask - Return the requested group of flags for
 * a pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @mask: mask of bits that the caller is interested in
 *
 * Return: pageblock_bits flags
 */
static unsigned long __get_pfnblock_flags_mask(const struct page *page,
					       unsigned long pfn,
					       unsigned long mask)
{
	unsigned long *bitmap_word;
	unsigned long bitidx;
	unsigned long word;

	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
	/*
	 * This races, without locks, with set_pfnblock_migratetype(). Ensure
	 * a consistent read of the memory array, so that results, even though
	 * racy, are not corrupted.
	 */
	word = READ_ONCE(*bitmap_word);
	return (word >> bitidx) & mask;
}

/**
 * get_pfnblock_bit - Check if a standalone bit of a pageblock is set
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @pb_bit: pageblock bit to check
 *
 * Return: true if the bit is set, otherwise false
 */
bool get_pfnblock_bit(const struct page *page, unsigned long pfn,
		      enum pageblock_bits pb_bit)
{
	unsigned long *bitmap_word;
	unsigned long bitidx;

	if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
		return false;

	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);

	return test_bit(bitidx + pb_bit, bitmap_word);
}

/**
 * get_pfnblock_migratetype - Return the migratetype of a pageblock
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 *
 * Return: The migratetype of the pageblock
 *
 * Use get_pfnblock_migratetype() if caller already has both @page and @pfn
 * to save a call to page_to_pfn().
 */
__always_inline enum migratetype
get_pfnblock_migratetype(const struct page *page, unsigned long pfn)
{
	unsigned long mask = MIGRATETYPE_AND_ISO_MASK;
	unsigned long flags;

	flags = __get_pfnblock_flags_mask(page, pfn, mask);

#ifdef CONFIG_MEMORY_ISOLATION
	if (flags & BIT(PB_migrate_isolate))
		return MIGRATE_ISOLATE;
#endif
	return flags & MIGRATETYPE_MASK;
}

/**
 * __set_pfnblock_flags_mask - Set the requested group of flags for
 * a pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @flags: The flags to set
 * @mask: mask of bits that the caller is interested in
 */
static void __set_pfnblock_flags_mask(struct page *page, unsigned long pfn,
				      unsigned long flags, unsigned long mask)
{
	unsigned long *bitmap_word;
	unsigned long bitidx;
	unsigned long word;

	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);

	mask <<= bitidx;
	flags <<= bitidx;

	word = READ_ONCE(*bitmap_word);
	do {
	} while (!try_cmpxchg(bitmap_word, &word, (word & ~mask) | flags));
}
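
/*
 * The empty-bodied try_cmpxchg() loop above is a lock-free
 * read-modify-write: a plain "*bitmap_word = (word & ~mask) | flags" could
 * lose a concurrent set_pfnblock_bit()/clear_pfnblock_bit() update to a
 * standalone bit (or to another pageblock sharing the same word).
 * try_cmpxchg() reloads "word" with the current value on failure, so the
 * loop simply retries with fresh data until the swap lands.
 */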

/**
 * set_pfnblock_bit - Set a standalone bit of a pageblock
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @pb_bit: pageblock bit to set
 */
void set_pfnblock_bit(const struct page *page, unsigned long pfn,
		      enum pageblock_bits pb_bit)
{
	unsigned long *bitmap_word;
	unsigned long bitidx;

	if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
		return;

	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);

	set_bit(bitidx + pb_bit, bitmap_word);
}

/**
 * clear_pfnblock_bit - Clear a standalone bit of a pageblock
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @pb_bit: pageblock bit to clear
 */
void clear_pfnblock_bit(const struct page *page, unsigned long pfn,
			enum pageblock_bits pb_bit)
{
	unsigned long *bitmap_word;
	unsigned long bitidx;

	if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
		return;

	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);

	clear_bit(bitidx + pb_bit, bitmap_word);
}

/**
 * set_pageblock_migratetype - Set the migratetype of a pageblock
 * @page: The page within the block of interest
 * @migratetype: migratetype to set
 */
static void set_pageblock_migratetype(struct page *page,
				      enum migratetype migratetype)
{
	if (unlikely(page_group_by_mobility_disabled &&
		     migratetype < MIGRATE_PCPTYPES))
		migratetype = MIGRATE_UNMOVABLE;

#ifdef CONFIG_MEMORY_ISOLATION
	if (migratetype == MIGRATE_ISOLATE) {
		VM_WARN_ONCE(1,
			     "Use set_pageblock_isolate() for pageblock isolation");
		return;
	}
	VM_WARN_ONCE(get_pageblock_isolate(page),
		     "Use clear_pageblock_isolate() to unisolate pageblock");
	/* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */
#endif
	__set_pfnblock_flags_mask(page, page_to_pfn(page),
				  (unsigned long)migratetype,
				  MIGRATETYPE_AND_ISO_MASK);
}

void __meminit init_pageblock_migratetype(struct page *page,
					  enum migratetype migratetype,
					  bool isolate)
{
	unsigned long flags;

	if (unlikely(page_group_by_mobility_disabled &&
		     migratetype < MIGRATE_PCPTYPES))
		migratetype = MIGRATE_UNMOVABLE;

	flags = migratetype;

#ifdef CONFIG_MEMORY_ISOLATION
	if (migratetype == MIGRATE_ISOLATE) {
		VM_WARN_ONCE(1,
			     "Set isolate=true to isolate pageblock with a migratetype");
		return;
	}
	if (isolate)
		flags |= BIT(PB_migrate_isolate);
#endif
	__set_pfnblock_flags_mask(page, page_to_pfn(page), flags,
				  MIGRATETYPE_AND_ISO_MASK);
}

#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
	int ret;
	unsigned seq;
	unsigned long pfn = page_to_pfn(page);
	unsigned long sp, start_pfn;

	do {
		seq = zone_span_seqbegin(zone);
		start_pfn = zone->zone_start_pfn;
		sp = zone->spanned_pages;
		ret = !zone_spans_pfn(zone, pfn);
	} while (zone_span_seqretry(zone, seq));

	if (ret)
		pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
			pfn, zone_to_nid(zone), zone->name,
			start_pfn, start_pfn + sp);

	return ret;
}

/*
 * Temporary debugging check for pages not lying within a given zone.
 */
static bool __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	if (page_outside_zone_boundaries(zone, page))
		return true;
	if (zone != page_zone(page))
		return true;

	return false;
}
#else
static inline bool __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	return false;
}
#endif

static void bad_page(struct page *page, const char *reason)
{
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/*
	 * Allow a burst of 60 reports, then keep quiet for that minute;
	 * or allow a steady drip of one report per second.
	 */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;
			goto out;
		}
		if (nr_unshown) {
			pr_alert("BUG: Bad page state: %lu messages suppressed\n",
				 nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;

	pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
		 current->comm, page_to_pfn(page));
	dump_page(page, reason);

	print_modules();
	dump_stack();
out:
	/* Leave bad fields for debug, except PageBuddy could make trouble */
	if (PageBuddy(page))
		__ClearPageBuddy(page);
	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

static inline unsigned int order_to_pindex(int migratetype, int order)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	bool movable;

	if (order > PAGE_ALLOC_COSTLY_ORDER) {
		VM_BUG_ON(order != HPAGE_PMD_ORDER);

		movable = migratetype == MIGRATE_MOVABLE;

		return NR_LOWORDER_PCP_LISTS + movable;
	}
#else
	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif

	return (MIGRATE_PCPTYPES * order) + migratetype;
}

static inline int pindex_to_order(unsigned int pindex)
{
	int order = pindex / MIGRATE_PCPTYPES;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pindex >= NR_LOWORDER_PCP_LISTS)
		order = HPAGE_PMD_ORDER;
#else
	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif

	return order;
}
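
/*
 * Example round trip, assuming MIGRATE_PCPTYPES == 3: an order-2
 * MIGRATE_MOVABLE page maps to pindex 3 * 2 + 1 == 7, and
 * pindex_to_order(7) == 7 / 3 == 2. THP-sized pages do not follow this
 * formula; they use the dedicated NR_LOWORDER_PCP_LISTS slots instead.
 */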

static inline bool pcp_allowed_order(unsigned int order)
{
	if (order <= PAGE_ALLOC_COSTLY_ORDER)
		return true;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (order == HPAGE_PMD_ORDER)
		return true;
#endif
	return false;
}

/*
 * Higher-order pages are called "compound pages". They are structured thusly:
 *
 * The first PAGE_SIZE page is called the "head page" and has PG_head set.
 *
 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
 * in bit 0 of page->compound_head. The rest of the bits are a pointer to the
 * head page.
 *
 * The first tail page's ->compound_order holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */
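
/*
 * For example, an order-2 compound page is the head page (PG_head set)
 * followed by three tail pages whose compound_head fields each hold the
 * head page's address with bit 0 set.
 */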

void prep_compound_page(struct page *page, unsigned int order)
{
	int i;
	int nr_pages = 1 << order;

	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++)
		prep_compound_tail(page, i);

	prep_compound_head(page, order);
}

static inline void set_buddy_order(struct page *page, unsigned int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}

#ifdef CONFIG_COMPACTION
static inline struct capture_control *task_capc(struct zone *zone)
{
	struct capture_control *capc = current->capture_control;

	return unlikely(capc) &&
		!(current->flags & PF_KTHREAD) &&
		!capc->page &&
		capc->cc->zone == zone ? capc : NULL;
}

static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
		   int order, int migratetype)
{
	if (!capc || order != capc->cc->order)
		return false;

	/* Do not accidentally pollute CMA or isolated regions */
	if (is_migrate_cma(migratetype) ||
	    is_migrate_isolate(migratetype))
		return false;

	/*
	 * Do not let lower order allocations pollute a movable pageblock
	 * unless compaction is also requesting movable pages.
	 * This might let an unmovable request use a reclaimable pageblock
	 * and vice-versa but no more than normal fallback logic which can
	 * have trouble finding a high-order free page.
	 */
	if (order < pageblock_order && migratetype == MIGRATE_MOVABLE &&
	    capc->cc->migratetype != MIGRATE_MOVABLE)
		return false;

	if (migratetype != capc->cc->migratetype)
		trace_mm_page_alloc_extfrag(page, capc->cc->order, order,
					    capc->cc->migratetype, migratetype);

	capc->page = page;
	return true;
}

#else
static inline struct capture_control *task_capc(struct zone *zone)
{
	return NULL;
}

static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
		   int order, int migratetype)
{
	return false;
}
#endif /* CONFIG_COMPACTION */

static inline void account_freepages(struct zone *zone, int nr_pages,
				     int migratetype)
{
	lockdep_assert_held(&zone->lock);

	if (is_migrate_isolate(migratetype))
		return;

	__mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);

	if (is_migrate_cma(migratetype))
		__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
	else if (migratetype == MIGRATE_HIGHATOMIC)
		WRITE_ONCE(zone->nr_free_highatomic,
			   zone->nr_free_highatomic + nr_pages);
}

/* Used for pages not on another list */
static inline void __add_to_free_list(struct page *page, struct zone *zone,
				      unsigned int order, int migratetype,
				      bool tail)
{
	struct free_area *area = &zone->free_area[order];
	int nr_pages = 1 << order;

	VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
		     "page type is %d, passed migratetype is %d (nr=%d)\n",
		     get_pageblock_migratetype(page), migratetype, nr_pages);

	if (tail)
		list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
	else
		list_add(&page->buddy_list, &area->free_list[migratetype]);
	area->nr_free++;

	if (order >= pageblock_order && !is_migrate_isolate(migratetype))
		__mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
}

/*
 * Used for pages which are on another list. Move the pages to the tail
 * of the list - so the moved pages won't immediately be considered for
 * allocation again (e.g., optimization for memory onlining).
 */
static inline void move_to_free_list(struct page *page, struct zone *zone,
				     unsigned int order, int old_mt, int new_mt)
{
	struct free_area *area = &zone->free_area[order];
	int nr_pages = 1 << order;

	/* Free page moving can fail, so it happens before the type update */
	VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt,
		     "page type is %d, passed migratetype is %d (nr=%d)\n",
		     get_pageblock_migratetype(page), old_mt, nr_pages);

	list_move_tail(&page->buddy_list, &area->free_list[new_mt]);

	account_freepages(zone, -nr_pages, old_mt);
	account_freepages(zone, nr_pages, new_mt);

	if (order >= pageblock_order &&
	    is_migrate_isolate(old_mt) != is_migrate_isolate(new_mt)) {
		if (!is_migrate_isolate(old_mt))
			nr_pages = -nr_pages;
		__mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, nr_pages);
	}
}

static inline void __del_page_from_free_list(struct page *page, struct zone *zone,
					     unsigned int order, int migratetype)
{
	int nr_pages = 1 << order;

	VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
		     "page type is %d, passed migratetype is %d (nr=%d)\n",
		     get_pageblock_migratetype(page), migratetype, nr_pages);

	/* clear reported state and update reported page count */
	if (page_reported(page))
		__ClearPageReported(page);

	list_del(&page->buddy_list);
	__ClearPageBuddy(page);
	set_page_private(page, 0);
	zone->free_area[order].nr_free--;

	if (order >= pageblock_order && !is_migrate_isolate(migratetype))
		__mod_zone_page_state(zone, NR_FREE_PAGES_BLOCKS, -nr_pages);
}

static inline void del_page_from_free_list(struct page *page, struct zone *zone,
					   unsigned int order, int migratetype)
{
	__del_page_from_free_list(page, zone, order, migratetype);
	account_freepages(zone, -(1 << order), migratetype);
}

static inline struct page *get_page_from_free_area(struct free_area *area,
						   int migratetype)
{
	return list_first_entry_or_null(&area->free_list[migratetype],
					struct page, buddy_list);
}

/*
 * If this is less than the 2nd largest possible page, check if the buddy
 * of the next-higher order is free. If it is, it's possible
 * that pages are being freed that will coalesce soon. In case that is
 * happening, add the free page to the tail of the list
 * so it's less likely to be used soon and more likely to be merged
 * as a 2-level higher order page
 */
static inline bool
buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
		   struct page *page, unsigned int order)
{
	unsigned long higher_page_pfn;
	struct page *higher_page;

	if (order >= MAX_PAGE_ORDER - 1)
		return false;

	higher_page_pfn = buddy_pfn & pfn;
	higher_page = page + (higher_page_pfn - pfn);

	return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1,
				   NULL) != NULL;
}

/*
 * Freeing function for a buddy system allocator.
 *
 * The concept of a buddy system is to maintain direct-mapped table
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of continuous
 * free pages of length of (1 << order) and marked with PageBuddy.
 * Page's order is recorded in page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other. That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- nyc
 */
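
/*
 * Worked example: freeing an order-3 page at pfn 8 finds its buddy at
 * pfn 8 ^ (1 << 3) == 0 (the XOR rule used by find_buddy_page_pfn()).
 * If that buddy is free, the two merge into an order-4 block starting at
 * the combined pfn 8 & 0 == 0, and the search repeats one order higher.
 */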

static inline void __free_one_page(struct page *page,
		unsigned long pfn,
		struct zone *zone, unsigned int order,
		int migratetype, fpi_t fpi_flags)
{
	struct capture_control *capc = task_capc(zone);
	unsigned long buddy_pfn = 0;
	unsigned long combined_pfn;
	struct page *buddy;
	bool to_tail;

	VM_BUG_ON(!zone_is_initialized(zone));
	VM_BUG_ON_PAGE(page->flags.f & PAGE_FLAGS_CHECK_AT_PREP, page);

	VM_BUG_ON(migratetype == -1);
	VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
	VM_BUG_ON_PAGE(bad_range(zone, page), page);

	account_freepages(zone, 1 << order, migratetype);

	while (order < MAX_PAGE_ORDER) {
		int buddy_mt = migratetype;

		if (compaction_capture(capc, page, order, migratetype)) {
			account_freepages(zone, -(1 << order), migratetype);
			return;
		}

		buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
		if (!buddy)
			goto done_merging;

		if (unlikely(order >= pageblock_order)) {
			/*
			 * We want to prevent merge between freepages on pageblock
			 * without fallbacks and normal pageblock. Without this,
			 * pageblock isolation could cause incorrect freepage or CMA
			 * accounting or HIGHATOMIC accounting.
			 */
			buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn);

			if (migratetype != buddy_mt &&
			    (!migratetype_is_mergeable(migratetype) ||
			     !migratetype_is_mergeable(buddy_mt)))
				goto done_merging;
		}

		/*
		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
		 * merge with it and move up one order.
		 */
		if (page_is_guard(buddy))
			clear_page_guard(zone, buddy, order);
		else
			__del_page_from_free_list(buddy, zone, order, buddy_mt);

		if (unlikely(buddy_mt != migratetype)) {
			/*
			 * Match buddy type. This ensures that an
			 * expand() down the line puts the sub-blocks
			 * on the right freelists.
			 */
			set_pageblock_migratetype(buddy, migratetype);
		}

		combined_pfn = buddy_pfn & pfn;
		page = page + (combined_pfn - pfn);
		pfn = combined_pfn;
		order++;
	}

done_merging:
	set_buddy_order(page, order);

	if (fpi_flags & FPI_TO_TAIL)
		to_tail = true;
	else if (is_shuffle_order(order))
		to_tail = shuffle_pick_tail();
	else
		to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);

	__add_to_free_list(page, zone, order, migratetype, to_tail);

	/* Notify page reporting subsystem of freed page */
	if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
		page_reporting_notify_free(order);
}

/*
 * A bad page could be due to a number of fields. Instead of multiple branches,
 * try and check multiple fields with one check. The caller must do a detailed
 * check if necessary.
 */
static inline bool page_expected_state(struct page *page,
				       unsigned long check_flags)
{
	if (unlikely(atomic_read(&page->_mapcount) != -1))
		return false;

	if (unlikely((unsigned long)page->mapping |
			page_ref_count(page) |
#ifdef CONFIG_MEMCG
			page->memcg_data |
#endif
			page_pool_page_is_pp(page) |
			(page->flags.f & check_flags)))
		return false;

	return true;
}

static const char *page_bad_reason(struct page *page, unsigned long flags)
{
	const char *bad_reason = NULL;

	if (unlikely(atomic_read(&page->_mapcount) != -1))
		bad_reason = "nonzero mapcount";
	if (unlikely(page->mapping != NULL))
		bad_reason = "non-NULL mapping";
	if (unlikely(page_ref_count(page) != 0))
		bad_reason = "nonzero _refcount";
	if (unlikely(page->flags.f & flags)) {
		if (flags == PAGE_FLAGS_CHECK_AT_PREP)
			bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
		else
			bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
	}
#ifdef CONFIG_MEMCG
	if (unlikely(page->memcg_data))
		bad_reason = "page still charged to cgroup";
#endif
	if (unlikely(page_pool_page_is_pp(page)))
		bad_reason = "page_pool leak";
	return bad_reason;
}

static inline bool free_page_is_bad(struct page *page)
{
	if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
		return false;

	/* Something has gone sideways, find it */
	bad_page(page, page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
	return true;
}

static inline bool is_check_pages_enabled(void)
{
	return static_branch_unlikely(&check_pages_enabled);
}

static int free_tail_page_prepare(struct page *head_page, struct page *page)
{
	struct folio *folio = (struct folio *)head_page;
	int ret = 1;

	/*
	 * We rely on page->lru.next never having bit 0 set, unless the page
	 * is PageTail(). Let's make sure that's true even for poisoned ->lru.
	 */
	BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);

	if (!is_check_pages_enabled()) {
		ret = 0;
		goto out;
	}
	switch (page - head_page) {
	case 1:
		/* the first tail page: these may be in place of ->mapping */
		if (unlikely(folio_large_mapcount(folio))) {
			bad_page(page, "nonzero large_mapcount");
			goto out;
		}
		if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT) &&
		    unlikely(atomic_read(&folio->_nr_pages_mapped))) {
			bad_page(page, "nonzero nr_pages_mapped");
			goto out;
		}
		if (IS_ENABLED(CONFIG_MM_ID)) {
			if (unlikely(folio->_mm_id_mapcount[0] != -1)) {
				bad_page(page, "nonzero mm mapcount 0");
				goto out;
			}
			if (unlikely(folio->_mm_id_mapcount[1] != -1)) {
				bad_page(page, "nonzero mm mapcount 1");
				goto out;
			}
		}
		if (IS_ENABLED(CONFIG_64BIT)) {
			if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
				bad_page(page, "nonzero entire_mapcount");
				goto out;
			}
			if (unlikely(atomic_read(&folio->_pincount))) {
				bad_page(page, "nonzero pincount");
				goto out;
			}
		}
		break;
	case 2:
		/* the second tail page: deferred_list overlaps ->mapping */
		if (unlikely(!list_empty(&folio->_deferred_list))) {
			bad_page(page, "on deferred list");
			goto out;
		}
		if (!IS_ENABLED(CONFIG_64BIT)) {
			if (unlikely(atomic_read(&folio->_entire_mapcount) + 1)) {
				bad_page(page, "nonzero entire_mapcount");
				goto out;
			}
			if (unlikely(atomic_read(&folio->_pincount))) {
				bad_page(page, "nonzero pincount");
				goto out;
			}
		}
		break;
	case 3:
		/* the third tail page: hugetlb specifics overlap ->mappings */
		if (IS_ENABLED(CONFIG_HUGETLB_PAGE))
			break;
		fallthrough;
	default:
		if (page->mapping != TAIL_MAPPING) {
			bad_page(page, "corrupted mapping in tail page");
			goto out;
		}
		break;
	}
	if (unlikely(!PageTail(page))) {
		bad_page(page, "PageTail not set");
		goto out;
	}
	if (unlikely(compound_head(page) != head_page)) {
		bad_page(page, "compound_head not consistent");
		goto out;
	}
	ret = 0;
out:
	page->mapping = NULL;
	clear_compound_head(page);
	return ret;
}

/*
 * Skip KASAN memory poisoning when either:
 *
 * 1. For generic KASAN: deferred memory initialization has not yet completed.
 *    Tag-based KASAN modes skip pages freed via deferred memory initialization
 *    using page tags instead (see below).
 * 2. For tag-based KASAN modes: the page has a match-all KASAN tag, indicating
 *    that error detection is disabled for accesses via the page address.
 *
 * Pages will have match-all tags in the following circumstances:
 *
 * 1. Pages are being initialized for the first time, including during deferred
 *    memory init; see the call to page_kasan_tag_reset in __init_single_page.
 * 2. The allocation was not unpoisoned due to __GFP_SKIP_KASAN, with the
 *    exception of pages unpoisoned by kasan_unpoison_vmalloc.
 * 3. The allocation was excluded from being checked due to sampling,
 *    see the call to kasan_unpoison_pages.
 *
 * Poisoning pages during deferred memory init will greatly lengthen the
 * process and cause problems in large memory systems as the deferred pages
 * initialization is done with interrupts disabled.
 *
 * Assuming that there will be no reference to those newly initialized
 * pages before they are ever allocated, this should have no effect on
 * KASAN memory tracking as the poison will be properly inserted at page
 * allocation time. The only corner case is when pages are allocated by
 * on-demand allocation and then freed again before the deferred pages
 * initialization is done, but this is not likely to happen.
 */
static inline bool should_skip_kasan_poison(struct page *page)
{
	if (IS_ENABLED(CONFIG_KASAN_GENERIC))
		return deferred_pages_enabled();

	return page_kasan_tag(page) == KASAN_TAG_KERNEL;
}

static void kernel_init_pages(struct page *page, int numpages)
{
	int i;

	/* s390's use of memset() could override KASAN redzones. */
	kasan_disable_current();
	for (i = 0; i < numpages; i++)
		clear_highpage_kasan_tagged(page + i);
	kasan_enable_current();
}

#ifdef CONFIG_MEM_ALLOC_PROFILING

/* Should be called only if mem_alloc_profiling_enabled() */
void __clear_page_tag_ref(struct page *page)
{
	union pgtag_ref_handle handle;
	union codetag_ref ref;

	if (get_page_tag_ref(page, &ref, &handle)) {
		set_codetag_empty(&ref);
		update_page_tag_ref(handle, &ref);
		put_page_tag_ref(handle);
	}
}

/* Should be called only if mem_alloc_profiling_enabled() */
static noinline
void __pgalloc_tag_add(struct page *page, struct task_struct *task,
		       unsigned int nr)
{
	union pgtag_ref_handle handle;
	union codetag_ref ref;

	if (get_page_tag_ref(page, &ref, &handle)) {
		alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr);
		update_page_tag_ref(handle, &ref);
		put_page_tag_ref(handle);
	}
}

static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
				   unsigned int nr)
{
	if (mem_alloc_profiling_enabled())
		__pgalloc_tag_add(page, task, nr);
}

/* Should be called only if mem_alloc_profiling_enabled() */
static noinline
void __pgalloc_tag_sub(struct page *page, unsigned int nr)
{
	union pgtag_ref_handle handle;
	union codetag_ref ref;

	if (get_page_tag_ref(page, &ref, &handle)) {
		alloc_tag_sub(&ref, PAGE_SIZE * nr);
		update_page_tag_ref(handle, &ref);
		put_page_tag_ref(handle);
	}
}

static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
{
	if (mem_alloc_profiling_enabled())
		__pgalloc_tag_sub(page, nr);
}

/* When tag is not NULL, assuming mem_alloc_profiling_enabled */
static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
{
	if (tag)
		this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
}

#else /* CONFIG_MEM_ALLOC_PROFILING */

static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
				   unsigned int nr) {}
static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}

#endif /* CONFIG_MEM_ALLOC_PROFILING */

__always_inline bool free_pages_prepare(struct page *page,
					unsigned int order)
{
	int bad = 0;
	bool skip_kasan_poison = should_skip_kasan_poison(page);
	bool init = want_init_on_free();
	bool compound = PageCompound(page);
	struct folio *folio = page_folio(page);

	VM_BUG_ON_PAGE(PageTail(page), page);

	trace_mm_page_free(page, order);
	kmsan_free_page(page, order);

	if (memcg_kmem_online() && PageMemcgKmem(page))
		__memcg_kmem_uncharge_page(page, order);

	/*
	 * In rare cases, when truncation or holepunching raced with
	 * munlock after VM_LOCKED was cleared, Mlocked may still be
	 * found set here. This does not indicate a problem, unless
	 * "unevictable_pgs_cleared" appears worryingly large.
	 */
	if (unlikely(folio_test_mlocked(folio))) {
		long nr_pages = folio_nr_pages(folio);

		__folio_clear_mlocked(folio);
		zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
		count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
	}

	if (unlikely(PageHWPoison(page)) && !order) {
		/* Do not let hwpoison pages hit pcplists/buddy */
		reset_page_owner(page, order);
		page_table_check_free(page, order);
		pgalloc_tag_sub(page, 1 << order);

		/*
		 * The page is isolated and accounted for.
		 * Mark the codetag as empty to avoid accounting error
		 * when the page is freed by unpoison_memory().
		 */
		clear_page_tag_ref(page);
		return false;
	}

	VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);

	/*
	 * Check tail pages before head page information is cleared to
	 * avoid checking PageCompound for order-0 pages.
	 */
	if (unlikely(order)) {
		int i;

		if (compound) {
			page[1].flags.f &= ~PAGE_FLAGS_SECOND;
#ifdef NR_PAGES_IN_LARGE_FOLIO
			folio->_nr_pages = 0;
#endif
		}
		for (i = 1; i < (1 << order); i++) {
			if (compound)
				bad += free_tail_page_prepare(page, page + i);
			if (is_check_pages_enabled()) {
				if (free_page_is_bad(page + i)) {
					bad++;
					continue;
				}
			}
			(page + i)->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
		}
	}
	if (folio_test_anon(folio)) {
		mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
		folio->mapping = NULL;
	}
	if (unlikely(page_has_type(page)))
		/* Reset the page_type (which overlays _mapcount) */
		page->page_type = UINT_MAX;

	if (is_check_pages_enabled()) {
		if (free_page_is_bad(page))
			bad++;
		if (bad)
			return false;
	}

	page_cpupid_reset_last(page);
	page->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
	reset_page_owner(page, order);
	page_table_check_free(page, order);
	pgalloc_tag_sub(page, 1 << order);

	if (!PageHighMem(page)) {
		debug_check_no_locks_freed(page_address(page),
					   PAGE_SIZE << order);
		debug_check_no_obj_freed(page_address(page),
					 PAGE_SIZE << order);
	}

	kernel_poison_pages(page, 1 << order);

	/*
	 * As memory initialization might be integrated into KASAN,
	 * KASAN poisoning and memory initialization code must be
	 * kept together to avoid discrepancies in behavior.
	 *
	 * With hardware tag-based KASAN, memory tags must be set before the
	 * page becomes unavailable via debug_pagealloc or arch_free_page.
	 */
	if (!skip_kasan_poison) {
		kasan_poison_pages(page, order, init);

		/* Memory is already initialized if KASAN did it internally. */
		if (kasan_has_integrated_init())
			init = false;
	}
	if (init)
		kernel_init_pages(page, 1 << order);

	/*
	 * arch_free_page() can make the page's contents inaccessible. s390
	 * does this. So nothing which can access the page's contents should
	 * happen after this.
	 */
	arch_free_page(page, order);

	debug_pagealloc_unmap_pages(page, 1 << order);

	return true;
}

/*
 * Frees a number of pages from the PCP lists.
 * Assumes all pages on the list are in the same zone.
 * count is the number of pages to free.
 */
static void free_pcppages_bulk(struct zone *zone, int count,
			       struct per_cpu_pages *pcp,
			       int pindex)
{
	unsigned long flags;
	unsigned int order;
	struct page *page;

	/*
	 * Ensure a proper count is passed, which otherwise would get stuck
	 * in the below while (list_empty(list)) loop.
	 */
	count = min(pcp->count, count);

	/* Ensure requested pindex is drained first. */
	pindex = pindex - 1;

	spin_lock_irqsave(&zone->lock, flags);

	while (count > 0) {
		struct list_head *list;
		int nr_pages;

		/* Remove pages from lists in a round-robin fashion. */
		do {
			if (++pindex > NR_PCP_LISTS - 1)
				pindex = 0;
			list = &pcp->lists[pindex];
		} while (list_empty(list));

		order = pindex_to_order(pindex);
		nr_pages = 1 << order;
		do {
			unsigned long pfn;
			int mt;

			page = list_last_entry(list, struct page, pcp_list);
			pfn = page_to_pfn(page);
			mt = get_pfnblock_migratetype(page, pfn);

			/* must delete to avoid corrupting pcp list */
			list_del(&page->pcp_list);
			count -= nr_pages;
			pcp->count -= nr_pages;

			__free_one_page(page, pfn, zone, order, mt, FPI_NONE);
			trace_mm_page_pcpu_drain(page, order, mt);
		} while (count > 0 && !list_empty(list));
	}

	spin_unlock_irqrestore(&zone->lock, flags);
}
| 1494 | |
| 1495 | /* Split a multi-block free page into its individual pageblocks. */ |
| 1496 | static void split_large_buddy(struct zone *zone, struct page *page, |
| 1497 | unsigned long pfn, int order, fpi_t fpi) |
| 1498 | { |
| 1499 | unsigned long end = pfn + (1 << order); |
| 1500 | |
| 1501 | VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order)); |
| 1502 | /* Caller removed page from freelist, buddy info cleared! */ |
| 1503 | VM_WARN_ON_ONCE(PageBuddy(page)); |
| 1504 | |
| 1505 | if (order > pageblock_order) |
| 1506 | order = pageblock_order; |
| 1507 | |
| 1508 | do { |
| 1509 | int mt = get_pfnblock_migratetype(page, pfn); |
| 1510 | |
__free_one_page(page, pfn, zone, order, mt, fpi);
| 1512 | pfn += 1 << order; |
| 1513 | if (pfn == end) |
| 1514 | break; |
| 1515 | page = pfn_to_page(pfn); |
| 1516 | } while (1); |
| 1517 | } |
| 1518 | |
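/*
 * Stash a page that could not be freed under zone->lock (FPI_TRYLOCK
 * callers, e.g. in NMI or hard IRQ context) on the zone's lockless
 * list. The order is remembered in page->private; the list is drained
 * by the next free_one_page() caller that does take zone->lock.
 */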
| 1519 | static void add_page_to_zone_llist(struct zone *zone, struct page *page, |
| 1520 | unsigned int order) |
| 1521 | { |
| 1522 | /* Remember the order */ |
| 1523 | page->private = order; |
/* Add the page to the zone's deferred-free llist */
llist_add(&page->pcp_llist, &zone->trylock_free_pages);
| 1526 | } |
| 1527 | |
| 1528 | static void free_one_page(struct zone *zone, struct page *page, |
| 1529 | unsigned long pfn, unsigned int order, |
| 1530 | fpi_t fpi_flags) |
| 1531 | { |
| 1532 | struct llist_head *llhead; |
| 1533 | unsigned long flags; |
| 1534 | |
| 1535 | if (unlikely(fpi_flags & FPI_TRYLOCK)) { |
| 1536 | if (!spin_trylock_irqsave(&zone->lock, flags)) { |
| 1537 | add_page_to_zone_llist(zone, page, order); |
| 1538 | return; |
| 1539 | } |
| 1540 | } else { |
| 1541 | spin_lock_irqsave(&zone->lock, flags); |
| 1542 | } |
| 1543 | |
| 1544 | /* The lock succeeded. Process deferred pages. */ |
| 1545 | llhead = &zone->trylock_free_pages; |
| 1546 | if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) { |
| 1547 | struct llist_node *llnode; |
| 1548 | struct page *p, *tmp; |
| 1549 | |
llnode = llist_del_all(llhead);
| 1551 | llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) { |
| 1552 | unsigned int p_order = p->private; |
| 1553 | |
split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
__count_vm_events(PGFREE, 1 << p_order);
| 1556 | } |
| 1557 | } |
split_large_buddy(zone, page, pfn, order, fpi_flags);
spin_unlock_irqrestore(&zone->lock, flags);
| 1560 | |
__count_vm_events(PGFREE, 1 << order);
| 1562 | } |
| 1563 | |
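/* Free a page directly to the buddy allocator, bypassing the PCP lists. */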
| 1564 | static void __free_pages_ok(struct page *page, unsigned int order, |
| 1565 | fpi_t fpi_flags) |
| 1566 | { |
| 1567 | unsigned long pfn = page_to_pfn(page); |
| 1568 | struct zone *zone = page_zone(page); |
| 1569 | |
| 1570 | if (free_pages_prepare(page, order)) |
| 1571 | free_one_page(zone, page, pfn, order, fpi_flags); |
| 1572 | } |
| 1573 | |
| 1574 | void __meminit __free_pages_core(struct page *page, unsigned int order, |
| 1575 | enum meminit_context context) |
| 1576 | { |
| 1577 | unsigned int nr_pages = 1 << order; |
| 1578 | struct page *p = page; |
| 1579 | unsigned int loop; |
| 1580 | |
| 1581 | /* |
| 1582 | * When initializing the memmap, __init_single_page() sets the refcount |
| 1583 | * of all pages to 1 ("allocated"/"not free"). We have to set the |
| 1584 | * refcount of all involved pages to 0. |
| 1585 | * |
| 1586 | * Note that hotplugged memory pages are initialized to PageOffline(). |
| 1587 | * Pages freed from memblock might be marked as reserved. |
| 1588 | */ |
| 1589 | if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && |
| 1590 | unlikely(context == MEMINIT_HOTPLUG)) { |
| 1591 | for (loop = 0; loop < nr_pages; loop++, p++) { |
| 1592 | VM_WARN_ON_ONCE(PageReserved(p)); |
__ClearPageOffline(p);
set_page_count(p, 0);
| 1595 | } |
| 1596 | |
adjust_managed_page_count(page, nr_pages);
| 1598 | } else { |
| 1599 | for (loop = 0; loop < nr_pages; loop++, p++) { |
__ClearPageReserved(p);
set_page_count(p, 0);
| 1602 | } |
| 1603 | |
| 1604 | /* memblock adjusts totalram_pages() manually. */ |
atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
| 1606 | } |
| 1607 | |
| 1608 | if (page_contains_unaccepted(page, order)) { |
| 1609 | if (order == MAX_PAGE_ORDER && __free_unaccepted(page)) |
| 1610 | return; |
| 1611 | |
| 1612 | accept_memory(page_to_phys(page), PAGE_SIZE << order); |
| 1613 | } |
| 1614 | |
| 1615 | /* |
| 1616 | * Bypass PCP and place fresh pages right to the tail, primarily |
| 1617 | * relevant for memory onlining. |
| 1618 | */ |
| 1619 | __free_pages_ok(page, order, FPI_TO_TAIL); |
| 1620 | } |
| 1621 | |
| 1622 | /* |
| 1623 | * Check that the whole (or subset of) a pageblock given by the interval of |
| 1624 | * [start_pfn, end_pfn) is valid and within the same zone, before scanning it |
* with compaction's migration or free scanner.
| 1626 | * |
| 1627 | * Return struct page pointer of start_pfn, or NULL if checks were not passed. |
| 1628 | * |
| 1629 | * It's possible on some configurations to have a setup like node0 node1 node0 |
* i.e. it's possible that not all pages within a zone's pfn range belong
* to a single zone. We assume that a border between node0 and node1
| 1632 | * can occur within a single pageblock, but not a node0 node1 node0 |
| 1633 | * interleaving within a single pageblock. It is therefore sufficient to check |
| 1634 | * the first and last page of a pageblock and avoid checking each individual |
| 1635 | * page in a pageblock. |
| 1636 | * |
| 1637 | * Note: the function may return non-NULL struct page even for a page block |
| 1638 | * which contains a memory hole (i.e. there is no physical memory for a subset |
* of the pfn range). For example, if the pageblock order is MAX_PAGE_ORDER, the
* block will span two sub-sections, and the end pfn of the pageblock may be in
* a hole even though the start pfn is online and valid. This should be safe most of
| 1642 | * the time because struct pages are still initialized via init_unavailable_range() |
| 1643 | * and pfn walkers shouldn't touch any physical memory range for which they do |
| 1644 | * not recognize any specific metadata in struct pages. |
| 1645 | */ |
| 1646 | struct page *__pageblock_pfn_to_page(unsigned long start_pfn, |
| 1647 | unsigned long end_pfn, struct zone *zone) |
| 1648 | { |
| 1649 | struct page *start_page; |
| 1650 | struct page *end_page; |
| 1651 | |
| 1652 | /* end_pfn is one past the range we are checking */ |
| 1653 | end_pfn--; |
| 1654 | |
if (!pfn_valid(end_pfn))
| 1656 | return NULL; |
| 1657 | |
| 1658 | start_page = pfn_to_online_page(start_pfn); |
| 1659 | if (!start_page) |
| 1660 | return NULL; |
| 1661 | |
if (page_zone(start_page) != zone)
| 1663 | return NULL; |
| 1664 | |
| 1665 | end_page = pfn_to_page(end_pfn); |
| 1666 | |
| 1667 | /* This gives a shorter code than deriving page_zone(end_page) */ |
if (page_zone_id(start_page) != page_zone_id(end_page))
| 1669 | return NULL; |
| 1670 | |
| 1671 | return start_page; |
| 1672 | } |
| 1673 | |
| 1674 | /* |
| 1675 | * The order of subdivision here is critical for the IO subsystem. |
| 1676 | * Please do not alter this order without good reasons and regression |
| 1677 | * testing. Specifically, as large blocks of memory are subdivided, |
| 1678 | * the order in which smaller blocks are delivered depends on the order |
| 1679 | * they're subdivided in this function. This is the primary factor |
| 1680 | * influencing the order in which pages are delivered to the IO |
| 1681 | * subsystem according to empirical testing, and this is also justified |
| 1682 | * by considering the behavior of a buddy system containing a single |
| 1683 | * large block of memory acted on by a series of small allocations. |
| 1684 | * This behavior is a critical factor in sglist merging's success. |
| 1685 | * |
| 1686 | * -- nyc |
| 1687 | */ |
| 1688 | static inline unsigned int expand(struct zone *zone, struct page *page, int low, |
| 1689 | int high, int migratetype) |
| 1690 | { |
| 1691 | unsigned int size = 1 << high; |
| 1692 | unsigned int nr_added = 0; |
| 1693 | |
| 1694 | while (high > low) { |
| 1695 | high--; |
| 1696 | size >>= 1; |
| 1697 | VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); |
| 1698 | |
| 1699 | /* |
| 1700 | * Mark as guard pages (or page), that will allow to |
| 1701 | * merge back to allocator when buddy will be freed. |
| 1702 | * Corresponding page table entries will not be touched, |
| 1703 | * pages will stay not present in virtual address space |
| 1704 | */ |
if (set_page_guard(zone, &page[size], high))
continue;

__add_to_free_list(&page[size], zone, high, migratetype, false);
set_buddy_order(&page[size], high);
| 1710 | nr_added += size; |
| 1711 | } |
| 1712 | |
| 1713 | return nr_added; |
| 1714 | } |
| 1715 | |
| 1716 | static __always_inline void page_del_and_expand(struct zone *zone, |
| 1717 | struct page *page, int low, |
| 1718 | int high, int migratetype) |
| 1719 | { |
| 1720 | int nr_pages = 1 << high; |
| 1721 | |
__del_page_from_free_list(page, zone, high, migratetype);
nr_pages -= expand(zone, page, low, high, migratetype);
account_freepages(zone, -nr_pages, migratetype);
| 1725 | } |
| 1726 | |
| 1727 | static void check_new_page_bad(struct page *page) |
| 1728 | { |
| 1729 | if (unlikely(PageHWPoison(page))) { |
| 1730 | /* Don't complain about hwpoisoned pages */ |
| 1731 | if (PageBuddy(page)) |
| 1732 | __ClearPageBuddy(page); |
| 1733 | return; |
| 1734 | } |
| 1735 | |
bad_page(page,
page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
| 1738 | } |
| 1739 | |
| 1740 | /* |
| 1741 | * This page is about to be returned from the page allocator |
| 1742 | */ |
| 1743 | static bool check_new_page(struct page *page) |
| 1744 | { |
| 1745 | if (likely(page_expected_state(page, |
| 1746 | PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) |
| 1747 | return false; |
| 1748 | |
| 1749 | check_new_page_bad(page); |
| 1750 | return true; |
| 1751 | } |
| 1752 | |
| 1753 | static inline bool check_new_pages(struct page *page, unsigned int order) |
| 1754 | { |
| 1755 | if (is_check_pages_enabled()) { |
| 1756 | for (int i = 0; i < (1 << order); i++) { |
| 1757 | struct page *p = page + i; |
| 1758 | |
if (check_new_page(p))
| 1760 | return true; |
| 1761 | } |
| 1762 | } |
| 1763 | |
| 1764 | return false; |
| 1765 | } |
| 1766 | |
| 1767 | static inline bool should_skip_kasan_unpoison(gfp_t flags) |
| 1768 | { |
| 1769 | /* Don't skip if a software KASAN mode is enabled. */ |
| 1770 | if (IS_ENABLED(CONFIG_KASAN_GENERIC) || |
| 1771 | IS_ENABLED(CONFIG_KASAN_SW_TAGS)) |
| 1772 | return false; |
| 1773 | |
| 1774 | /* Skip, if hardware tag-based KASAN is not enabled. */ |
| 1775 | if (!kasan_hw_tags_enabled()) |
| 1776 | return true; |
| 1777 | |
| 1778 | /* |
| 1779 | * With hardware tag-based KASAN enabled, skip if this has been |
| 1780 | * requested via __GFP_SKIP_KASAN. |
| 1781 | */ |
| 1782 | return flags & __GFP_SKIP_KASAN; |
| 1783 | } |
| 1784 | |
| 1785 | static inline bool should_skip_init(gfp_t flags) |
| 1786 | { |
| 1787 | /* Don't skip, if hardware tag-based KASAN is not enabled. */ |
| 1788 | if (!kasan_hw_tags_enabled()) |
| 1789 | return false; |
| 1790 | |
| 1791 | /* For hardware tag-based KASAN, skip if requested. */ |
| 1792 | return (flags & __GFP_SKIP_ZERO); |
| 1793 | } |
| 1794 | |
| 1795 | inline void post_alloc_hook(struct page *page, unsigned int order, |
| 1796 | gfp_t gfp_flags) |
| 1797 | { |
bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
!should_skip_init(gfp_flags);
| 1800 | bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS); |
| 1801 | int i; |
| 1802 | |
set_page_private(page, 0);
| 1804 | |
| 1805 | arch_alloc_page(page, order); |
debug_pagealloc_map_pages(page, 1 << order);
| 1807 | |
| 1808 | /* |
| 1809 | * Page unpoisoning must happen before memory initialization. |
| 1810 | * Otherwise, the poison pattern will be overwritten for __GFP_ZERO |
| 1811 | * allocations and the page unpoisoning code will complain. |
| 1812 | */ |
kernel_unpoison_pages(page, 1 << order);
| 1814 | |
| 1815 | /* |
| 1816 | * As memory initialization might be integrated into KASAN, |
* KASAN unpoisoning and memory initialization code must be
| 1818 | * kept together to avoid discrepancies in behavior. |
| 1819 | */ |
| 1820 | |
| 1821 | /* |
| 1822 | * If memory tags should be zeroed |
| 1823 | * (which happens only when memory should be initialized as well). |
| 1824 | */ |
| 1825 | if (zero_tags) { |
| 1826 | /* Initialize both memory and memory tags. */ |
| 1827 | for (i = 0; i != 1 << order; ++i) |
tag_clear_highpage(page + i);
| 1829 | |
| 1830 | /* Take note that memory was initialized by the loop above. */ |
| 1831 | init = false; |
| 1832 | } |
if (!should_skip_kasan_unpoison(gfp_flags) &&
kasan_unpoison_pages(page, order, init)) {
| 1835 | /* Take note that memory was initialized by KASAN. */ |
| 1836 | if (kasan_has_integrated_init()) |
| 1837 | init = false; |
| 1838 | } else { |
| 1839 | /* |
| 1840 | * If memory tags have not been set by KASAN, reset the page |
| 1841 | * tags to ensure page_address() dereferencing does not fault. |
| 1842 | */ |
| 1843 | for (i = 0; i != 1 << order; ++i) |
page_kasan_tag_reset(page + i);
| 1845 | } |
| 1846 | /* If memory is still not initialized, initialize it now. */ |
| 1847 | if (init) |
kernel_init_pages(page, 1 << order);
| 1849 | |
set_page_owner(page, order, gfp_flags);
page_table_check_alloc(page, order);
pgalloc_tag_add(page, current, 1 << order);
| 1853 | } |
| 1854 | |
| 1855 | static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, |
| 1856 | unsigned int alloc_flags) |
| 1857 | { |
| 1858 | post_alloc_hook(page, order, gfp_flags); |
| 1859 | |
| 1860 | if (order && (gfp_flags & __GFP_COMP)) |
| 1861 | prep_compound_page(page, order); |
| 1862 | |
| 1863 | /* |
| 1864 | * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to |
| 1865 | * allocate the page. The expectation is that the caller is taking |
| 1866 | * steps that will free more memory. The caller should avoid the page |
| 1867 | * being used for !PFMEMALLOC purposes. |
| 1868 | */ |
| 1869 | if (alloc_flags & ALLOC_NO_WATERMARKS) |
| 1870 | set_page_pfmemalloc(page); |
| 1871 | else |
| 1872 | clear_page_pfmemalloc(page); |
| 1873 | } |
| 1874 | |
| 1875 | /* |
| 1876 | * Go through the free lists for the given migratetype and remove |
| 1877 | * the smallest available page from the freelists |
| 1878 | */ |
| 1879 | static __always_inline |
| 1880 | struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, |
| 1881 | int migratetype) |
| 1882 | { |
| 1883 | unsigned int current_order; |
| 1884 | struct free_area *area; |
| 1885 | struct page *page; |
| 1886 | |
| 1887 | /* Find a page of the appropriate size in the preferred list */ |
| 1888 | for (current_order = order; current_order < NR_PAGE_ORDERS; ++current_order) { |
| 1889 | area = &(zone->free_area[current_order]); |
| 1890 | page = get_page_from_free_area(area, migratetype); |
| 1891 | if (!page) |
| 1892 | continue; |
| 1893 | |
page_del_and_expand(zone, page, order, current_order,
migratetype);
trace_mm_page_alloc_zone_locked(page, order, migratetype,
pcp_allowed_order(order) &&
migratetype < MIGRATE_PCPTYPES);
| 1899 | return page; |
| 1900 | } |
| 1901 | |
| 1902 | return NULL; |
| 1903 | } |
| 1904 | |
| 1906 | /* |
* This array describes the order in which free lists are fallen back to
* when the free lists for the desired migratetype are depleted.
| 1909 | * |
| 1910 | * The other migratetypes do not have fallbacks. |
| 1911 | */ |
| 1912 | static int fallbacks[MIGRATE_PCPTYPES][MIGRATE_PCPTYPES - 1] = { |
| 1913 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE }, |
| 1914 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE }, |
| 1915 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE }, |
| 1916 | }; |
| 1917 | |
| 1918 | #ifdef CONFIG_CMA |
| 1919 | static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone, |
| 1920 | unsigned int order) |
| 1921 | { |
| 1922 | return __rmqueue_smallest(zone, order, MIGRATE_CMA); |
| 1923 | } |
| 1924 | #else |
| 1925 | static inline struct page *__rmqueue_cma_fallback(struct zone *zone, |
| 1926 | unsigned int order) { return NULL; } |
| 1927 | #endif |
| 1928 | |
| 1929 | /* |
| 1930 | * Move all free pages of a block to new type's freelist. Caller needs to |
| 1931 | * change the block type. |
| 1932 | */ |
| 1933 | static int __move_freepages_block(struct zone *zone, unsigned long start_pfn, |
| 1934 | int old_mt, int new_mt) |
| 1935 | { |
| 1936 | struct page *page; |
| 1937 | unsigned long pfn, end_pfn; |
| 1938 | unsigned int order; |
| 1939 | int pages_moved = 0; |
| 1940 | |
| 1941 | VM_WARN_ON(start_pfn & (pageblock_nr_pages - 1)); |
| 1942 | end_pfn = pageblock_end_pfn(start_pfn); |
| 1943 | |
| 1944 | for (pfn = start_pfn; pfn < end_pfn;) { |
| 1945 | page = pfn_to_page(pfn); |
| 1946 | if (!PageBuddy(page)) { |
| 1947 | pfn++; |
| 1948 | continue; |
| 1949 | } |
| 1950 | |
| 1951 | /* Make sure we are not inadvertently changing nodes */ |
| 1952 | VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); |
| 1953 | VM_BUG_ON_PAGE(page_zone(page) != zone, page); |
| 1954 | |
| 1955 | order = buddy_order(page); |
| 1956 | |
| 1957 | move_to_free_list(page, zone, order, old_mt, new_mt); |
| 1958 | |
| 1959 | pfn += 1 << order; |
| 1960 | pages_moved += 1 << order; |
| 1961 | } |
| 1962 | |
| 1963 | return pages_moved; |
| 1964 | } |
| 1965 | |
| 1966 | static bool prep_move_freepages_block(struct zone *zone, struct page *page, |
| 1967 | unsigned long *start_pfn, |
| 1968 | int *num_free, int *num_movable) |
| 1969 | { |
| 1970 | unsigned long pfn, start, end; |
| 1971 | |
| 1972 | pfn = page_to_pfn(page); |
| 1973 | start = pageblock_start_pfn(pfn); |
| 1974 | end = pageblock_end_pfn(pfn); |
| 1975 | |
| 1976 | /* |
| 1977 | * The caller only has the lock for @zone, don't touch ranges |
| 1978 | * that straddle into other zones. While we could move part of |
| 1979 | * the range that's inside the zone, this call is usually |
| 1980 | * accompanied by other operations such as migratetype updates |
| 1981 | * which also should be locked. |
| 1982 | */ |
if (!zone_spans_pfn(zone, start))
return false;
if (!zone_spans_pfn(zone, end - 1))
| 1986 | return false; |
| 1987 | |
| 1988 | *start_pfn = start; |
| 1989 | |
| 1990 | if (num_free) { |
| 1991 | *num_free = 0; |
| 1992 | *num_movable = 0; |
| 1993 | for (pfn = start; pfn < end;) { |
| 1994 | page = pfn_to_page(pfn); |
| 1995 | if (PageBuddy(page)) { |
| 1996 | int nr = 1 << buddy_order(page); |
| 1997 | |
| 1998 | *num_free += nr; |
| 1999 | pfn += nr; |
| 2000 | continue; |
| 2001 | } |
| 2002 | /* |
| 2003 | * We assume that pages that could be isolated for |
| 2004 | * migration are movable. But we don't actually try |
| 2005 | * isolating, as that would be expensive. |
| 2006 | */ |
| 2007 | if (PageLRU(page) || page_has_movable_ops(page)) |
| 2008 | (*num_movable)++; |
| 2009 | pfn++; |
| 2010 | } |
| 2011 | } |
| 2012 | |
| 2013 | return true; |
| 2014 | } |
| 2015 | |
| 2016 | static int move_freepages_block(struct zone *zone, struct page *page, |
| 2017 | int old_mt, int new_mt) |
| 2018 | { |
| 2019 | unsigned long start_pfn; |
| 2020 | int res; |
| 2021 | |
if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL))
| 2023 | return -1; |
| 2024 | |
| 2025 | res = __move_freepages_block(zone, start_pfn, old_mt, new_mt); |
set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt);

return res;
}
| 2031 | |
| 2032 | #ifdef CONFIG_MEMORY_ISOLATION |
| 2033 | /* Look for a buddy that straddles start_pfn */ |
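/*
 * Worked example (illustrative pfns): for start_pfn == 0x1100,
 * __ffs() gives order 8. If pfn 0x1100 is not PageBuddy, the search
 * moves to pfn 0x1000 at order 9; an order-9 buddy there covers pfns
 * 0x1000-0x11ff and so straddles start_pfn, making 0x1000 the result.
 * If nothing is found up to MAX_PAGE_ORDER, start_pfn itself is
 * returned.
 */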
| 2034 | static unsigned long find_large_buddy(unsigned long start_pfn) |
| 2035 | { |
| 2036 | /* |
| 2037 | * If start_pfn is not an order-0 PageBuddy, next PageBuddy containing |
| 2038 | * start_pfn has minimal order of __ffs(start_pfn) + 1. Start checking |
| 2039 | * the order with __ffs(start_pfn). If start_pfn is order-0 PageBuddy, |
| 2040 | * the starting order does not matter. |
| 2041 | */ |
| 2042 | int order = start_pfn ? __ffs(start_pfn) : MAX_PAGE_ORDER; |
| 2043 | struct page *page; |
| 2044 | unsigned long pfn = start_pfn; |
| 2045 | |
| 2046 | while (!PageBuddy(page = pfn_to_page(pfn))) { |
| 2047 | /* Nothing found */ |
| 2048 | if (++order > MAX_PAGE_ORDER) |
| 2049 | return start_pfn; |
| 2050 | pfn &= ~0UL << order; |
| 2051 | } |
| 2052 | |
| 2053 | /* |
| 2054 | * Found a preceding buddy, but does it straddle? |
| 2055 | */ |
| 2056 | if (pfn + (1 << buddy_order(page)) > start_pfn) |
| 2057 | return pfn; |
| 2058 | |
| 2059 | /* Nothing found */ |
| 2060 | return start_pfn; |
| 2061 | } |
| 2062 | |
| 2063 | static inline void toggle_pageblock_isolate(struct page *page, bool isolate) |
| 2064 | { |
| 2065 | if (isolate) |
| 2066 | set_pageblock_isolate(page); |
| 2067 | else |
| 2068 | clear_pageblock_isolate(page); |
| 2069 | } |
| 2070 | |
| 2071 | /** |
| 2072 | * __move_freepages_block_isolate - move free pages in block for page isolation |
| 2073 | * @zone: the zone |
| 2074 | * @page: the pageblock page |
| 2075 | * @isolate: to isolate the given pageblock or unisolate it |
| 2076 | * |
| 2077 | * This is similar to move_freepages_block(), but handles the special |
| 2078 | * case encountered in page isolation, where the block of interest |
| 2079 | * might be part of a larger buddy spanning multiple pageblocks. |
| 2080 | * |
| 2081 | * Unlike the regular page allocator path, which moves pages while |
| 2082 | * stealing buddies off the freelist, page isolation is interested in |
| 2083 | * arbitrary pfn ranges that may have overlapping buddies on both ends. |
| 2084 | * |
| 2085 | * This function handles that. Straddling buddies are split into |
| 2086 | * individual pageblocks. Only the block of interest is moved. |
| 2087 | * |
| 2088 | * Returns %true if pages could be moved, %false otherwise. |
| 2089 | */ |
| 2090 | static bool __move_freepages_block_isolate(struct zone *zone, |
| 2091 | struct page *page, bool isolate) |
| 2092 | { |
| 2093 | unsigned long start_pfn, buddy_pfn; |
| 2094 | int from_mt; |
| 2095 | int to_mt; |
| 2096 | struct page *buddy; |
| 2097 | |
| 2098 | if (isolate == get_pageblock_isolate(page)) { |
VM_WARN_ONCE(1, "%s a pageblock that is already in that state",
isolate ? "Isolate" : "Unisolate");
| 2101 | return false; |
| 2102 | } |
| 2103 | |
| 2104 | if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL)) |
| 2105 | return false; |
| 2106 | |
| 2107 | /* No splits needed if buddies can't span multiple blocks */ |
| 2108 | if (pageblock_order == MAX_PAGE_ORDER) |
| 2109 | goto move; |
| 2110 | |
| 2111 | buddy_pfn = find_large_buddy(start_pfn); |
| 2112 | buddy = pfn_to_page(buddy_pfn); |
| 2113 | /* We're a part of a larger buddy */ |
| 2114 | if (PageBuddy(buddy) && buddy_order(buddy) > pageblock_order) { |
| 2115 | int order = buddy_order(buddy); |
| 2116 | |
| 2117 | del_page_from_free_list(buddy, zone, order, |
| 2118 | get_pfnblock_migratetype(buddy, buddy_pfn)); |
| 2119 | toggle_pageblock_isolate(page, isolate); |
| 2120 | split_large_buddy(zone, buddy, buddy_pfn, order, FPI_NONE); |
| 2121 | return true; |
| 2122 | } |
| 2123 | |
| 2124 | move: |
| 2125 | /* Use MIGRATETYPE_MASK to get non-isolate migratetype */ |
| 2126 | if (isolate) { |
| 2127 | from_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page), |
| 2128 | MIGRATETYPE_MASK); |
| 2129 | to_mt = MIGRATE_ISOLATE; |
| 2130 | } else { |
| 2131 | from_mt = MIGRATE_ISOLATE; |
| 2132 | to_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page), |
| 2133 | MIGRATETYPE_MASK); |
| 2134 | } |
| 2135 | |
| 2136 | __move_freepages_block(zone, start_pfn, from_mt, to_mt); |
| 2137 | toggle_pageblock_isolate(pfn_to_page(start_pfn), isolate); |
| 2138 | |
| 2139 | return true; |
| 2140 | } |
| 2141 | |
| 2142 | bool pageblock_isolate_and_move_free_pages(struct zone *zone, struct page *page) |
| 2143 | { |
| 2144 | return __move_freepages_block_isolate(zone, page, true); |
| 2145 | } |
| 2146 | |
| 2147 | bool pageblock_unisolate_and_move_free_pages(struct zone *zone, struct page *page) |
| 2148 | { |
| 2149 | return __move_freepages_block_isolate(zone, page, false); |
| 2150 | } |
| 2151 | |
| 2152 | #endif /* CONFIG_MEMORY_ISOLATION */ |
| 2153 | |
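/*
 * Set the migratetype of every pageblock covered by a page of
 * start_order >= pageblock_order.
 */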
| 2154 | static void change_pageblock_range(struct page *pageblock_page, |
| 2155 | int start_order, int migratetype) |
| 2156 | { |
| 2157 | int nr_pageblocks = 1 << (start_order - pageblock_order); |
| 2158 | |
| 2159 | while (nr_pageblocks--) { |
set_pageblock_migratetype(pageblock_page, migratetype);
| 2161 | pageblock_page += pageblock_nr_pages; |
| 2162 | } |
| 2163 | } |
| 2164 | |
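/*
 * Temporarily raise the zone's watermarks in response to an external
 * fragmentation event, increasing reclaim pressure. E.g. with
 * watermark_boost_factor == 15000, the boost is capped at 150% of the
 * high watermark; each call adds pageblock_nr_pages up to that cap.
 */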
| 2165 | static inline bool boost_watermark(struct zone *zone) |
| 2166 | { |
| 2167 | unsigned long max_boost; |
| 2168 | |
| 2169 | if (!watermark_boost_factor) |
| 2170 | return false; |
| 2171 | /* |
| 2172 | * Don't bother in zones that are unlikely to produce results. |
| 2173 | * On small machines, including kdump capture kernels running |
| 2174 | * in a small area, boosting the watermark can cause an out of |
| 2175 | * memory situation immediately. |
| 2176 | */ |
| 2177 | if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) |
| 2178 | return false; |
| 2179 | |
| 2180 | max_boost = mult_frac(zone->_watermark[WMARK_HIGH], |
| 2181 | watermark_boost_factor, 10000); |
| 2182 | |
| 2183 | /* |
| 2184 | * high watermark may be uninitialised if fragmentation occurs |
| 2185 | * very early in boot so do not boost. We do not fall |
| 2186 | * through and boost by pageblock_nr_pages as failing |
| 2187 | * allocations that early means that reclaim is not going |
| 2188 | * to help and it may even be impossible to reclaim the |
| 2189 | * boosted watermark resulting in a hang. |
| 2190 | */ |
| 2191 | if (!max_boost) |
| 2192 | return false; |
| 2193 | |
| 2194 | max_boost = max(pageblock_nr_pages, max_boost); |
| 2195 | |
| 2196 | zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, |
| 2197 | max_boost); |
| 2198 | |
| 2199 | return true; |
| 2200 | } |
| 2201 | |
| 2202 | /* |
| 2203 | * When we are falling back to another migratetype during allocation, should we |
| 2204 | * try to claim an entire block to satisfy further allocations, instead of |
| 2205 | * polluting multiple pageblocks? |
| 2206 | */ |
| 2207 | static bool should_try_claim_block(unsigned int order, int start_mt) |
| 2208 | { |
| 2209 | /* |
| 2210 | * Leaving this order check is intended, although there is |
| 2211 | * relaxed order check in next check. The reason is that |
| 2212 | * we can actually claim the whole pageblock if this condition met, |
| 2213 | * but, below check doesn't guarantee it and that is just heuristic |
| 2214 | * so could be changed anytime. |
| 2215 | */ |
| 2216 | if (order >= pageblock_order) |
| 2217 | return true; |
| 2218 | |
| 2219 | /* |
| 2220 | * Above a certain threshold, always try to claim, as it's likely there |
| 2221 | * will be more free pages in the pageblock. |
| 2222 | */ |
| 2223 | if (order >= pageblock_order / 2) |
| 2224 | return true; |
| 2225 | |
| 2226 | /* |
| 2227 | * Unmovable/reclaimable allocations would cause permanent |
| 2228 | * fragmentations if they fell back to allocating from a movable block |
| 2229 | * (polluting it), so we try to claim the whole block regardless of the |
| 2230 | * allocation size. Later movable allocations can always steal from this |
| 2231 | * block, which is less problematic. |
| 2232 | */ |
| 2233 | if (start_mt == MIGRATE_RECLAIMABLE || start_mt == MIGRATE_UNMOVABLE) |
| 2234 | return true; |
| 2235 | |
| 2236 | if (page_group_by_mobility_disabled) |
| 2237 | return true; |
| 2238 | |
| 2239 | /* |
* Movable pages won't cause permanent fragmentation, so for small
* allocations we just need to temporarily steal unmovable or
* reclaimable pages that are closest to the request size. After a
* while, memory compaction may occur to form large contiguous pages,
* and the next movable allocation may not need to steal.
| 2245 | */ |
| 2246 | return false; |
| 2247 | } |
| 2248 | |
| 2249 | /* |
| 2250 | * Check whether there is a suitable fallback freepage with requested order. |
| 2251 | * If claimable is true, this function returns fallback_mt only if |
| 2252 | * we would do this whole-block claiming. This would help to reduce |
| 2253 | * fragmentation due to mixed migratetype pages in one pageblock. |
| 2254 | */ |
| 2255 | int find_suitable_fallback(struct free_area *area, unsigned int order, |
| 2256 | int migratetype, bool claimable) |
| 2257 | { |
| 2258 | int i; |
| 2259 | |
if (claimable && !should_try_claim_block(order, migratetype))
| 2261 | return -2; |
| 2262 | |
| 2263 | if (area->nr_free == 0) |
| 2264 | return -1; |
| 2265 | |
for (i = 0; i < MIGRATE_PCPTYPES - 1; i++) {
int fallback_mt = fallbacks[migratetype][i];

if (!free_area_empty(area, fallback_mt))
| 2270 | return fallback_mt; |
| 2271 | } |
| 2272 | |
| 2273 | return -1; |
| 2274 | } |
| 2275 | |
| 2276 | /* |
| 2277 | * This function implements actual block claiming behaviour. If order is large |
| 2278 | * enough, we can claim the whole pageblock for the requested migratetype. If |
| 2279 | * not, we check the pageblock for constituent pages; if at least half of the |
| 2280 | * pages are free or compatible, we can still claim the whole block, so pages |
| 2281 | * freed in the future will be put on the correct free list. |
| 2282 | */ |
| 2283 | static struct page * |
| 2284 | try_to_claim_block(struct zone *zone, struct page *page, |
| 2285 | int current_order, int order, int start_type, |
| 2286 | int block_type, unsigned int alloc_flags) |
| 2287 | { |
| 2288 | int free_pages, movable_pages, alike_pages; |
| 2289 | unsigned long start_pfn; |
| 2290 | |
| 2291 | /* Take ownership for orders >= pageblock_order */ |
| 2292 | if (current_order >= pageblock_order) { |
| 2293 | unsigned int nr_added; |
| 2294 | |
del_page_from_free_list(page, zone, current_order, block_type);
change_pageblock_range(page, current_order, start_type);
nr_added = expand(zone, page, order, current_order, start_type);
account_freepages(zone, nr_added, start_type);
| 2299 | return page; |
| 2300 | } |
| 2301 | |
| 2302 | /* |
| 2303 | * Boost watermarks to increase reclaim pressure to reduce the |
| 2304 | * likelihood of future fallbacks. Wake kswapd now as the node |
| 2305 | * may be balanced overall and kswapd will not wake naturally. |
| 2306 | */ |
| 2307 | if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD)) |
set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
| 2309 | |
| 2310 | /* moving whole block can fail due to zone boundary conditions */ |
if (!prep_move_freepages_block(zone, page, &start_pfn, &free_pages,
&movable_pages))
| 2313 | return NULL; |
| 2314 | |
| 2315 | /* |
| 2316 | * Determine how many pages are compatible with our allocation. |
| 2317 | * For movable allocation, it's the number of movable pages which |
| 2318 | * we just obtained. For other types it's a bit more tricky. |
| 2319 | */ |
| 2320 | if (start_type == MIGRATE_MOVABLE) { |
| 2321 | alike_pages = movable_pages; |
| 2322 | } else { |
| 2323 | /* |
| 2324 | * If we are falling back a RECLAIMABLE or UNMOVABLE allocation |
| 2325 | * to MOVABLE pageblock, consider all non-movable pages as |
| 2326 | * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or |
| 2327 | * vice versa, be conservative since we can't distinguish the |
| 2328 | * exact migratetype of non-movable pages. |
| 2329 | */ |
| 2330 | if (block_type == MIGRATE_MOVABLE) |
| 2331 | alike_pages = pageblock_nr_pages |
| 2332 | - (free_pages + movable_pages); |
| 2333 | else |
| 2334 | alike_pages = 0; |
| 2335 | } |
| 2336 | /* |
| 2337 | * If a sufficient number of pages in the block are either free or of |
| 2338 | * compatible migratability as our allocation, claim the whole block. |
| 2339 | */ |
| 2340 | if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || |
| 2341 | page_group_by_mobility_disabled) { |
__move_freepages_block(zone, start_pfn, block_type, start_type);
set_pageblock_migratetype(pfn_to_page(start_pfn), start_type);
return __rmqueue_smallest(zone, order, start_type);
| 2345 | } |
| 2346 | |
| 2347 | return NULL; |
| 2348 | } |
| 2349 | |
| 2350 | /* |
| 2351 | * Try to allocate from some fallback migratetype by claiming the entire block, |
| 2352 | * i.e. converting it to the allocation's start migratetype. |
| 2353 | * |
| 2354 | * The use of signed ints for order and current_order is a deliberate |
| 2355 | * deviation from the rest of this file, to make the for loop |
| 2356 | * condition simpler. |
| 2357 | */ |
| 2358 | static __always_inline struct page * |
| 2359 | __rmqueue_claim(struct zone *zone, int order, int start_migratetype, |
| 2360 | unsigned int alloc_flags) |
| 2361 | { |
| 2362 | struct free_area *area; |
| 2363 | int current_order; |
| 2364 | int min_order = order; |
| 2365 | struct page *page; |
| 2366 | int fallback_mt; |
| 2367 | |
| 2368 | /* |
| 2369 | * Do not steal pages from freelists belonging to other pageblocks |
| 2370 | * i.e. orders < pageblock_order. If there are no local zones free, |
| 2371 | * the zonelists will be reiterated without ALLOC_NOFRAGMENT. |
| 2372 | */ |
| 2373 | if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT) |
| 2374 | min_order = pageblock_order; |
| 2375 | |
| 2376 | /* |
| 2377 | * Find the largest available free page in the other list. This roughly |
| 2378 | * approximates finding the pageblock with the most free pages, which |
| 2379 | * would be too costly to do exactly. |
| 2380 | */ |
| 2381 | for (current_order = MAX_PAGE_ORDER; current_order >= min_order; |
| 2382 | --current_order) { |
| 2383 | area = &(zone->free_area[current_order]); |
fallback_mt = find_suitable_fallback(area, current_order,
start_migratetype, true);
| 2386 | |
| 2387 | /* No block in that order */ |
| 2388 | if (fallback_mt == -1) |
| 2389 | continue; |
| 2390 | |
| 2391 | /* Advanced into orders too low to claim, abort */ |
| 2392 | if (fallback_mt == -2) |
| 2393 | break; |
| 2394 | |
page = get_page_from_free_area(area, fallback_mt);
page = try_to_claim_block(zone, page, current_order, order,
start_migratetype, fallback_mt,
alloc_flags);
| 2399 | if (page) { |
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
| 2402 | return page; |
| 2403 | } |
| 2404 | } |
| 2405 | |
| 2406 | return NULL; |
| 2407 | } |
| 2408 | |
| 2409 | /* |
| 2410 | * Try to steal a single page from some fallback migratetype. Leave the rest of |
| 2411 | * the block as its current migratetype, potentially causing fragmentation. |
| 2412 | */ |
| 2413 | static __always_inline struct page * |
| 2414 | __rmqueue_steal(struct zone *zone, int order, int start_migratetype) |
| 2415 | { |
| 2416 | struct free_area *area; |
| 2417 | int current_order; |
| 2418 | struct page *page; |
| 2419 | int fallback_mt; |
| 2420 | |
| 2421 | for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { |
| 2422 | area = &(zone->free_area[current_order]); |
fallback_mt = find_suitable_fallback(area, current_order,
start_migratetype, false);
| 2425 | if (fallback_mt == -1) |
| 2426 | continue; |
| 2427 | |
page = get_page_from_free_area(area, fallback_mt);
page_del_and_expand(zone, page, order, current_order, fallback_mt);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
| 2432 | return page; |
| 2433 | } |
| 2434 | |
| 2435 | return NULL; |
| 2436 | } |
| 2437 | |
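/*
 * Allocation fallback modes, in order of increasing fragmentation
 * risk: the preferred freelists first, then CMA, then claiming a whole
 * foreign pageblock, and finally stealing single pages. __rmqueue()
 * escalates through these and remembers in *mode where it last
 * succeeded.
 */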
| 2438 | enum rmqueue_mode { |
| 2439 | RMQUEUE_NORMAL, |
| 2440 | RMQUEUE_CMA, |
| 2441 | RMQUEUE_CLAIM, |
| 2442 | RMQUEUE_STEAL, |
| 2443 | }; |
| 2444 | |
| 2445 | /* |
| 2446 | * Do the hard work of removing an element from the buddy allocator. |
| 2447 | * Call me with the zone->lock already held. |
| 2448 | */ |
| 2449 | static __always_inline struct page * |
| 2450 | __rmqueue(struct zone *zone, unsigned int order, int migratetype, |
| 2451 | unsigned int alloc_flags, enum rmqueue_mode *mode) |
| 2452 | { |
| 2453 | struct page *page; |
| 2454 | |
| 2455 | if (IS_ENABLED(CONFIG_CMA)) { |
| 2456 | /* |
| 2457 | * Balance movable allocations between regular and CMA areas by |
| 2458 | * allocating from CMA when over half of the zone's free memory |
| 2459 | * is in the CMA area. |
| 2460 | */ |
| 2461 | if (alloc_flags & ALLOC_CMA && |
zone_page_state(zone, NR_FREE_CMA_PAGES) >
zone_page_state(zone, NR_FREE_PAGES) / 2) {
| 2464 | page = __rmqueue_cma_fallback(zone, order); |
| 2465 | if (page) |
| 2466 | return page; |
| 2467 | } |
| 2468 | } |
| 2469 | |
| 2470 | /* |
| 2471 | * First try the freelists of the requested migratetype, then try |
| 2472 | * fallbacks modes with increasing levels of fragmentation risk. |
| 2473 | * |
* The fallback logic is expensive and rmqueue_bulk() calls this
* function in a loop with the zone->lock held, meaning the freelists are
| 2476 | * not subject to any outside changes. Remember in *mode where |
| 2477 | * we found pay dirt, to save us the search on the next call. |
| 2478 | */ |
| 2479 | switch (*mode) { |
| 2480 | case RMQUEUE_NORMAL: |
| 2481 | page = __rmqueue_smallest(zone, order, migratetype); |
| 2482 | if (page) |
| 2483 | return page; |
| 2484 | fallthrough; |
| 2485 | case RMQUEUE_CMA: |
| 2486 | if (alloc_flags & ALLOC_CMA) { |
| 2487 | page = __rmqueue_cma_fallback(zone, order); |
| 2488 | if (page) { |
| 2489 | *mode = RMQUEUE_CMA; |
| 2490 | return page; |
| 2491 | } |
| 2492 | } |
| 2493 | fallthrough; |
| 2494 | case RMQUEUE_CLAIM: |
page = __rmqueue_claim(zone, order, migratetype, alloc_flags);
| 2496 | if (page) { |
| 2497 | /* Replenished preferred freelist, back to normal mode. */ |
| 2498 | *mode = RMQUEUE_NORMAL; |
| 2499 | return page; |
| 2500 | } |
| 2501 | fallthrough; |
| 2502 | case RMQUEUE_STEAL: |
| 2503 | if (!(alloc_flags & ALLOC_NOFRAGMENT)) { |
page = __rmqueue_steal(zone, order, migratetype);
| 2505 | if (page) { |
| 2506 | *mode = RMQUEUE_STEAL; |
| 2507 | return page; |
| 2508 | } |
| 2509 | } |
| 2510 | } |
| 2511 | return NULL; |
| 2512 | } |
| 2513 | |
| 2514 | /* |
| 2515 | * Obtain a specified number of elements from the buddy allocator, all under |
| 2516 | * a single hold of the lock, for efficiency. Add them to the supplied list. |
| 2517 | * Returns the number of new pages which were placed at *list. |
| 2518 | */ |
| 2519 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
| 2520 | unsigned long count, struct list_head *list, |
| 2521 | int migratetype, unsigned int alloc_flags) |
| 2522 | { |
| 2523 | enum rmqueue_mode rmqm = RMQUEUE_NORMAL; |
| 2524 | unsigned long flags; |
| 2525 | int i; |
| 2526 | |
| 2527 | if (unlikely(alloc_flags & ALLOC_TRYLOCK)) { |
| 2528 | if (!spin_trylock_irqsave(&zone->lock, flags)) |
| 2529 | return 0; |
| 2530 | } else { |
| 2531 | spin_lock_irqsave(&zone->lock, flags); |
| 2532 | } |
| 2533 | for (i = 0; i < count; ++i) { |
struct page *page = __rmqueue(zone, order, migratetype,
alloc_flags, &rmqm);
| 2536 | if (unlikely(page == NULL)) |
| 2537 | break; |
| 2538 | |
| 2539 | /* |
| 2540 | * Split buddy pages returned by expand() are received here in |
| 2541 | * physical page order. The page is added to the tail of |
| 2542 | * caller's list. From the callers perspective, the linked list |
| 2543 | * is ordered by page number under some conditions. This is |
| 2544 | * useful for IO devices that can forward direction from the |
| 2545 | * head, thus also in the physical page order. This is useful |
| 2546 | * for IO devices that can merge IO requests if the physical |
| 2547 | * pages are ordered properly. |
| 2548 | */ |
| 2549 | list_add_tail(new: &page->pcp_list, head: list); |
| 2550 | } |
spin_unlock_irqrestore(&zone->lock, flags);
| 2552 | |
| 2553 | return i; |
| 2554 | } |
| 2555 | |
| 2556 | /* |
| 2557 | * Called from the vmstat counter updater to decay the PCP high. |
| 2558 | * Return whether there are addition works to do. |
| 2559 | */ |
| 2560 | int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) |
| 2561 | { |
| 2562 | int high_min, to_drain, batch; |
| 2563 | int todo = 0; |
| 2564 | |
| 2565 | high_min = READ_ONCE(pcp->high_min); |
| 2566 | batch = READ_ONCE(pcp->batch); |
| 2567 | /* |
| 2568 | * Decrease pcp->high periodically to try to free possible |
| 2569 | * idle PCP pages. And, avoid to free too many pages to |
| 2570 | * control latency. This caps pcp->high decrement too. |
| 2571 | */ |
| 2572 | if (pcp->high > high_min) { |
| 2573 | pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX), |
| 2574 | pcp->high - (pcp->high >> 3), high_min); |
| 2575 | if (pcp->high > high_min) |
| 2576 | todo++; |
| 2577 | } |
| 2578 | |
| 2579 | to_drain = pcp->count - pcp->high; |
| 2580 | if (to_drain > 0) { |
spin_lock(&pcp->lock);
free_pcppages_bulk(zone, to_drain, pcp, 0);
spin_unlock(&pcp->lock);
| 2584 | todo++; |
| 2585 | } |
| 2586 | |
| 2587 | return todo; |
| 2588 | } |
| 2589 | |
| 2590 | #ifdef CONFIG_NUMA |
| 2591 | /* |
| 2592 | * Called from the vmstat counter updater to drain pagesets of this |
| 2593 | * currently executing processor on remote nodes after they have |
| 2594 | * expired. |
| 2595 | */ |
| 2596 | void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) |
| 2597 | { |
| 2598 | int to_drain, batch; |
| 2599 | |
| 2600 | batch = READ_ONCE(pcp->batch); |
| 2601 | to_drain = min(pcp->count, batch); |
| 2602 | if (to_drain > 0) { |
spin_lock(&pcp->lock);
free_pcppages_bulk(zone, to_drain, pcp, 0);
spin_unlock(&pcp->lock);
| 2606 | } |
| 2607 | } |
| 2608 | #endif |
| 2609 | |
| 2610 | /* |
| 2611 | * Drain pcplists of the indicated processor and zone. |
| 2612 | */ |
| 2613 | static void drain_pages_zone(unsigned int cpu, struct zone *zone) |
| 2614 | { |
| 2615 | struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); |
| 2616 | int count; |
| 2617 | |
| 2618 | do { |
spin_lock(&pcp->lock);
count = pcp->count;
if (count) {
int to_drain = min(count,
pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX);

free_pcppages_bulk(zone, to_drain, pcp, 0);
count -= to_drain;
}
spin_unlock(&pcp->lock);
| 2629 | } while (count); |
| 2630 | } |
| 2631 | |
| 2632 | /* |
| 2633 | * Drain pcplists of all zones on the indicated processor. |
| 2634 | */ |
| 2635 | static void drain_pages(unsigned int cpu) |
| 2636 | { |
| 2637 | struct zone *zone; |
| 2638 | |
| 2639 | for_each_populated_zone(zone) { |
| 2640 | drain_pages_zone(cpu, zone); |
| 2641 | } |
| 2642 | } |
| 2643 | |
| 2644 | /* |
| 2645 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. |
| 2646 | */ |
| 2647 | void drain_local_pages(struct zone *zone) |
| 2648 | { |
| 2649 | int cpu = smp_processor_id(); |
| 2650 | |
| 2651 | if (zone) |
| 2652 | drain_pages_zone(cpu, zone); |
| 2653 | else |
| 2654 | drain_pages(cpu); |
| 2655 | } |
| 2656 | |
| 2657 | /* |
| 2658 | * The implementation of drain_all_pages(), exposing an extra parameter to |
| 2659 | * drain on all cpus. |
| 2660 | * |
| 2661 | * drain_all_pages() is optimized to only execute on cpus where pcplists are |
| 2662 | * not empty. The check for non-emptiness can however race with a free to |
| 2663 | * pcplist that has not yet increased the pcp->count from 0 to 1. Callers |
| 2664 | * that need the guarantee that every CPU has drained can disable the |
| 2665 | * optimizing racy check. |
| 2666 | */ |
| 2667 | static void __drain_all_pages(struct zone *zone, bool force_all_cpus) |
| 2668 | { |
| 2669 | int cpu; |
| 2670 | |
| 2671 | /* |
| 2672 | * Allocate in the BSS so we won't require allocation in |
| 2673 | * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y |
| 2674 | */ |
| 2675 | static cpumask_t cpus_with_pcps; |
| 2676 | |
| 2677 | /* |
| 2678 | * Do not drain if one is already in progress unless it's specific to |
| 2679 | * a zone. Such callers are primarily CMA and memory hotplug and need |
| 2680 | * the drain to be complete when the call returns. |
| 2681 | */ |
| 2682 | if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) { |
| 2683 | if (!zone) |
| 2684 | return; |
mutex_lock(&pcpu_drain_mutex);
| 2686 | } |
| 2687 | |
| 2688 | /* |
| 2689 | * We don't care about racing with CPU hotplug event |
| 2690 | * as offline notification will cause the notified |
| 2691 | * cpu to drain that CPU pcps and on_each_cpu_mask |
| 2692 | * disables preemption as part of its processing |
| 2693 | */ |
| 2694 | for_each_online_cpu(cpu) { |
| 2695 | struct per_cpu_pages *pcp; |
| 2696 | struct zone *z; |
| 2697 | bool has_pcps = false; |
| 2698 | |
| 2699 | if (force_all_cpus) { |
| 2700 | /* |
| 2701 | * The pcp.count check is racy, some callers need a |
| 2702 | * guarantee that no cpu is missed. |
| 2703 | */ |
| 2704 | has_pcps = true; |
| 2705 | } else if (zone) { |
| 2706 | pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); |
| 2707 | if (pcp->count) |
| 2708 | has_pcps = true; |
| 2709 | } else { |
| 2710 | for_each_populated_zone(z) { |
| 2711 | pcp = per_cpu_ptr(z->per_cpu_pageset, cpu); |
| 2712 | if (pcp->count) { |
| 2713 | has_pcps = true; |
| 2714 | break; |
| 2715 | } |
| 2716 | } |
| 2717 | } |
| 2718 | |
| 2719 | if (has_pcps) |
cpumask_set_cpu(cpu, &cpus_with_pcps);
else
cpumask_clear_cpu(cpu, &cpus_with_pcps);
| 2723 | } |
| 2724 | |
| 2725 | for_each_cpu(cpu, &cpus_with_pcps) { |
| 2726 | if (zone) |
| 2727 | drain_pages_zone(cpu, zone); |
| 2728 | else |
| 2729 | drain_pages(cpu); |
| 2730 | } |
| 2731 | |
mutex_unlock(&pcpu_drain_mutex);
| 2733 | } |
| 2734 | |
| 2735 | /* |
| 2736 | * Spill all the per-cpu pages from all CPUs back into the buddy allocator. |
| 2737 | * |
| 2738 | * When zone parameter is non-NULL, spill just the single zone's pages. |
| 2739 | */ |
| 2740 | void drain_all_pages(struct zone *zone) |
| 2741 | { |
__drain_all_pages(zone, false);
| 2743 | } |
| 2744 | |
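/*
 * Decide how many pages to free from a PCP list in one go.
 * Illustrative numbers (not from the source): with batch == 63 and
 * high == 500, between 63 and 437 pages are freed, scaled by the
 * recent pcp->free_count to reduce zone lock contention; batch
 * freeing of high-order pages drains as much as possible at once.
 */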
| 2745 | static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high) |
| 2746 | { |
| 2747 | int min_nr_free, max_nr_free; |
| 2748 | |
| 2749 | /* Free as much as possible if batch freeing high-order pages. */ |
| 2750 | if (unlikely(free_high)) |
| 2751 | return min(pcp->count, batch << CONFIG_PCP_BATCH_SCALE_MAX); |
| 2752 | |
| 2753 | /* Check for PCP disabled or boot pageset */ |
| 2754 | if (unlikely(high < batch)) |
| 2755 | return 1; |
| 2756 | |
| 2757 | /* Leave at least pcp->batch pages on the list */ |
| 2758 | min_nr_free = batch; |
| 2759 | max_nr_free = high - batch; |
| 2760 | |
| 2761 | /* |
| 2762 | * Increase the batch number to the number of the consecutive |
| 2763 | * freed pages to reduce zone lock contention. |
| 2764 | */ |
| 2765 | batch = clamp_t(int, pcp->free_count, min_nr_free, max_nr_free); |
| 2766 | |
| 2767 | return batch; |
| 2768 | } |
| 2769 | |
| 2770 | static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, |
| 2771 | int batch, bool free_high) |
| 2772 | { |
| 2773 | int high, high_min, high_max; |
| 2774 | |
| 2775 | high_min = READ_ONCE(pcp->high_min); |
| 2776 | high_max = READ_ONCE(pcp->high_max); |
| 2777 | high = pcp->high = clamp(pcp->high, high_min, high_max); |
| 2778 | |
| 2779 | if (unlikely(!high)) |
| 2780 | return 0; |
| 2781 | |
| 2782 | if (unlikely(free_high)) { |
| 2783 | pcp->high = max(high - (batch << CONFIG_PCP_BATCH_SCALE_MAX), |
| 2784 | high_min); |
| 2785 | return 0; |
| 2786 | } |
| 2787 | |
| 2788 | /* |
| 2789 | * If reclaim is active, limit the number of pages that can be |
| 2790 | * stored on pcp lists |
| 2791 | */ |
| 2792 | if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) { |
| 2793 | int free_count = max_t(int, pcp->free_count, batch); |
| 2794 | |
| 2795 | pcp->high = max(high - free_count, high_min); |
| 2796 | return min(batch << 2, pcp->high); |
| 2797 | } |
| 2798 | |
| 2799 | if (high_min == high_max) |
| 2800 | return high; |
| 2801 | |
| 2802 | if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) { |
| 2803 | int free_count = max_t(int, pcp->free_count, batch); |
| 2804 | |
| 2805 | pcp->high = max(high - free_count, high_min); |
| 2806 | high = max(pcp->count, high_min); |
| 2807 | } else if (pcp->count >= high) { |
| 2808 | int need_high = pcp->free_count + batch; |
| 2809 | |
| 2810 | /* pcp->high should be large enough to hold batch freed pages */ |
| 2811 | if (pcp->high < need_high) |
| 2812 | pcp->high = clamp(need_high, high_min, high_max); |
| 2813 | } |
| 2814 | |
| 2815 | return high; |
| 2816 | } |
| 2817 | |
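/*
 * Put a just-freed page on its PCP list and, unless FPI_TRYLOCK
 * forbids taking zone->lock, trim the list back below the
 * dynamically adjusted high mark.
 */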
| 2818 | static void free_frozen_page_commit(struct zone *zone, |
| 2819 | struct per_cpu_pages *pcp, struct page *page, int migratetype, |
| 2820 | unsigned int order, fpi_t fpi_flags) |
| 2821 | { |
| 2822 | int high, batch; |
| 2823 | int pindex; |
| 2824 | bool free_high = false; |
| 2825 | |
| 2826 | /* |
| 2827 | * On freeing, reduce the number of pages that are batch allocated. |
| 2828 | * See nr_pcp_alloc() where alloc_factor is increased for subsequent |
| 2829 | * allocations. |
| 2830 | */ |
| 2831 | pcp->alloc_factor >>= 1; |
__count_vm_events(PGFREE, 1 << order);
pindex = order_to_pindex(migratetype, order);
list_add(&page->pcp_list, &pcp->lists[pindex]);
| 2835 | pcp->count += 1 << order; |
| 2836 | |
| 2837 | batch = READ_ONCE(pcp->batch); |
| 2838 | /* |
| 2839 | * As high-order pages other than THP's stored on PCP can contribute |
| 2840 | * to fragmentation, limit the number stored when PCP is heavily |
| 2841 | * freeing without allocation. The remainder after bulk freeing |
| 2842 | * stops will be drained from vmstat refresh context. |
| 2843 | */ |
| 2844 | if (order && order <= PAGE_ALLOC_COSTLY_ORDER) { |
| 2845 | free_high = (pcp->free_count >= (batch + pcp->high_min / 2) && |
| 2846 | (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) && |
| 2847 | (!(pcp->flags & PCPF_FREE_HIGH_BATCH) || |
| 2848 | pcp->count >= batch)); |
| 2849 | pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER; |
| 2850 | } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) { |
| 2851 | pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER; |
| 2852 | } |
| 2853 | if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX)) |
| 2854 | pcp->free_count += (1 << order); |
| 2855 | |
| 2856 | if (unlikely(fpi_flags & FPI_TRYLOCK)) { |
| 2857 | /* |
| 2858 | * Do not attempt to take a zone lock. Let pcp->count get |
| 2859 | * over high mark temporarily. |
| 2860 | */ |
| 2861 | return; |
| 2862 | } |
| 2863 | |
| 2864 | high = nr_pcp_high(pcp, zone, batch, free_high); |
| 2865 | if (pcp->count < high) |
| 2866 | return; |
| 2867 | |
free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
pcp, pindex);
if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
zone_watermark_ok(zone, 0, high_wmark_pages(zone),
ZONE_MOVABLE, 0)) {
struct pglist_data *pgdat = zone->zone_pgdat;

clear_bit(ZONE_BELOW_HIGH, &zone->flags);
| 2875 | |
| 2876 | /* |
| 2877 | * Assume that memory pressure on this node is gone and may be |
| 2878 | * in a reclaimable state. If a memory fallback node exists, |
| 2879 | * direct reclaim may not have been triggered, causing a |
| 2880 | * 'hopeless node' to stay in that state for a while. Let |
| 2881 | * kswapd work again by resetting kswapd_failures. |
| 2882 | */ |
if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES &&
next_memory_node(pgdat->node_id) < MAX_NUMNODES)
atomic_set(&pgdat->kswapd_failures, 0);
| 2886 | } |
| 2887 | } |
| 2888 | |
| 2889 | /* |
| 2890 | * Free a pcp page |
| 2891 | */ |
| 2892 | static void __free_frozen_pages(struct page *page, unsigned int order, |
| 2893 | fpi_t fpi_flags) |
| 2894 | { |
| 2895 | unsigned long __maybe_unused UP_flags; |
| 2896 | struct per_cpu_pages *pcp; |
| 2897 | struct zone *zone; |
| 2898 | unsigned long pfn = page_to_pfn(page); |
| 2899 | int migratetype; |
| 2900 | |
| 2901 | if (!pcp_allowed_order(order)) { |
| 2902 | __free_pages_ok(page, order, fpi_flags); |
| 2903 | return; |
| 2904 | } |
| 2905 | |
| 2906 | if (!free_pages_prepare(page, order)) |
| 2907 | return; |
| 2908 | |
| 2909 | /* |
| 2910 | * We only track unmovable, reclaimable and movable on pcp lists. |
| 2911 | * Place ISOLATE pages on the isolated list because they are being |
| 2912 | * offlined but treat HIGHATOMIC and CMA as movable pages so we can |
| 2913 | * get those areas back if necessary. Otherwise, we may have to free |
| 2914 | * excessively into the page allocator |
| 2915 | */ |
| 2916 | zone = page_zone(page); |
| 2917 | migratetype = get_pfnblock_migratetype(page, pfn); |
| 2918 | if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { |
| 2919 | if (unlikely(is_migrate_isolate(migratetype))) { |
| 2920 | free_one_page(zone, page, pfn, order, fpi_flags); |
| 2921 | return; |
| 2922 | } |
| 2923 | migratetype = MIGRATE_MOVABLE; |
| 2924 | } |
| 2925 | |
| 2926 | if (unlikely((fpi_flags & FPI_TRYLOCK) && IS_ENABLED(CONFIG_PREEMPT_RT) |
| 2927 | && (in_nmi() || in_hardirq()))) { |
| 2928 | add_page_to_zone_llist(zone, page, order); |
| 2929 | return; |
| 2930 | } |
| 2931 | pcp_trylock_prepare(UP_flags); |
| 2932 | pcp = pcp_spin_trylock(zone->per_cpu_pageset); |
| 2933 | if (pcp) { |
| 2934 | free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags); |
| 2935 | pcp_spin_unlock(pcp); |
| 2936 | } else { |
| 2937 | free_one_page(zone, page, pfn, order, fpi_flags); |
| 2938 | } |
| 2939 | pcp_trylock_finish(UP_flags); |
| 2940 | } |
| 2941 | |
| 2942 | void free_frozen_pages(struct page *page, unsigned int order) |
| 2943 | { |
| 2944 | __free_frozen_pages(page, order, FPI_NONE); |
| 2945 | } |
| 2946 | |
| 2947 | /* |
| 2948 | * Free a batch of folios |
| 2949 | */ |
| 2950 | void free_unref_folios(struct folio_batch *folios) |
| 2951 | { |
| 2952 | unsigned long __maybe_unused UP_flags; |
| 2953 | struct per_cpu_pages *pcp = NULL; |
| 2954 | struct zone *locked_zone = NULL; |
| 2955 | int i, j; |
| 2956 | |
| 2957 | /* Prepare folios for freeing */ |
| 2958 | for (i = 0, j = 0; i < folios->nr; i++) { |
| 2959 | struct folio *folio = folios->folios[i]; |
| 2960 | unsigned long pfn = folio_pfn(folio); |
| 2961 | unsigned int order = folio_order(folio); |
| 2962 | |
if (!free_pages_prepare(&folio->page, order))
| 2964 | continue; |
| 2965 | /* |
| 2966 | * Free orders not handled on the PCP directly to the |
| 2967 | * allocator. |
| 2968 | */ |
| 2969 | if (!pcp_allowed_order(order)) { |
free_one_page(folio_zone(folio), &folio->page,
pfn, order, FPI_NONE);
| 2972 | continue; |
| 2973 | } |
| 2974 | folio->private = (void *)(unsigned long)order; |
| 2975 | if (j != i) |
| 2976 | folios->folios[j] = folio; |
| 2977 | j++; |
| 2978 | } |
| 2979 | folios->nr = j; |
| 2980 | |
| 2981 | for (i = 0; i < folios->nr; i++) { |
| 2982 | struct folio *folio = folios->folios[i]; |
| 2983 | struct zone *zone = folio_zone(folio); |
| 2984 | unsigned long pfn = folio_pfn(folio); |
| 2985 | unsigned int order = (unsigned long)folio->private; |
| 2986 | int migratetype; |
| 2987 | |
| 2988 | folio->private = NULL; |
| 2989 | migratetype = get_pfnblock_migratetype(&folio->page, pfn);
| 2990 | |
| 2991 | /* Different zone requires a different pcp lock */ |
| 2992 | if (zone != locked_zone || |
| 2993 | is_migrate_isolate(migratetype)) { |
| 2994 | if (pcp) { |
| 2995 | pcp_spin_unlock(pcp); |
| 2996 | pcp_trylock_finish(UP_flags); |
| 2997 | locked_zone = NULL; |
| 2998 | pcp = NULL; |
| 2999 | } |
| 3000 | |
| 3001 | /* |
| 3002 | * Free isolated pages directly to the |
| 3003 | * allocator, see comment in free_frozen_pages. |
| 3004 | */ |
| 3005 | if (is_migrate_isolate(migratetype)) { |
| 3006 | free_one_page(zone, &folio->page, pfn,
| 3007 | order, FPI_NONE); |
| 3008 | continue; |
| 3009 | } |
| 3010 | |
| 3011 | /* |
| 3012 | * trylock is necessary as folios may be getting freed |
| 3013 | * from IRQ or SoftIRQ context after an IO completion. |
| 3014 | */ |
| 3015 | pcp_trylock_prepare(UP_flags); |
| 3016 | pcp = pcp_spin_trylock(zone->per_cpu_pageset); |
| 3017 | if (unlikely(!pcp)) { |
| 3018 | pcp_trylock_finish(UP_flags); |
| 3019 | free_one_page(zone, &folio->page, pfn,
| 3020 | order, FPI_NONE); |
| 3021 | continue; |
| 3022 | } |
| 3023 | locked_zone = zone; |
| 3024 | } |
| 3025 | |
| 3026 | /* |
| 3027 | * Non-isolated types over MIGRATE_PCPTYPES get added |
| 3028 | * to the MIGRATE_MOVABLE pcp list. |
| 3029 | */ |
| 3030 | if (unlikely(migratetype >= MIGRATE_PCPTYPES)) |
| 3031 | migratetype = MIGRATE_MOVABLE; |
| 3032 | |
| 3033 | trace_mm_page_free_batched(&folio->page);
| 3034 | free_frozen_page_commit(zone, pcp, &folio->page, migratetype,
| 3035 | order, FPI_NONE); |
| 3036 | } |
| 3037 | |
| 3038 | if (pcp) { |
| 3039 | pcp_spin_unlock(pcp); |
| 3040 | pcp_trylock_finish(UP_flags); |
| 3041 | } |
| 3042 | folio_batch_reinit(folios);
| 3043 | } |
| 3044 | |
| 3045 | /* |
| 3046 | * split_page takes a non-compound higher-order page, and splits it into
| 3047 | * n (1<<order) sub-pages: page[0..n-1].
| 3048 | * Each sub-page must be freed individually.
| 3049 | * |
| 3050 | * Note: this is probably too low level an operation for use in drivers. |
| 3051 | * Please consult with lkml before using this in your driver. |
| 3052 | */ |
| 3053 | void split_page(struct page *page, unsigned int order) |
| 3054 | { |
| 3055 | int i; |
| 3056 | |
| 3057 | VM_BUG_ON_PAGE(PageCompound(page), page); |
| 3058 | VM_BUG_ON_PAGE(!page_count(page), page); |
| 3059 | |
| 3060 | for (i = 1; i < (1 << order); i++) |
| 3061 | set_page_refcounted(page + i); |
| 3062 | split_page_owner(page, order, 0);
| 3063 | pgalloc_tag_split(page_folio(page), order, 0);
| 3064 | split_page_memcg(page, order);
| 3065 | } |
| 3066 | EXPORT_SYMBOL_GPL(split_page); |
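| |
| | /*
| |  * Usage example (illustrative only): a sleepable caller that wants four
| |  * independently freeable order-0 pages backed by one contiguous chunk:
| |  *
| |  *	struct page *page = alloc_pages(GFP_KERNEL, 2);
| |  *
| |  *	if (page) {
| |  *		split_page(page, 2);
| |  *		...
| |  *		__free_page(&page[1]);
| |  *	}
| |  *
| |  * after which page[0..3] are independent order-0 pages, each of which
| |  * must be freed on its own.
| |  */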
| 3067 | |
| 3068 | int __isolate_free_page(struct page *page, unsigned int order) |
| 3069 | { |
| 3070 | struct zone *zone = page_zone(page); |
| 3071 | int mt = get_pageblock_migratetype(page); |
| 3072 | |
| 3073 | if (!is_migrate_isolate(mt)) {
| 3074 | unsigned long watermark; |
| 3075 | /* |
| 3076 | * Obey watermarks as if the page was being allocated. We can |
| 3077 | * emulate a high-order watermark check with a raised order-0 |
| 3078 | * watermark, because we already know our high-order page |
| 3079 | * exists. |
| 3080 | */ |
| 3081 | watermark = zone->_watermark[WMARK_MIN] + (1UL << order); |
| 3082 | if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
| 3083 | return 0; |
| 3084 | } |
| 3085 | |
| 3086 | del_page_from_free_list(page, zone, order, mt);
| 3087 | |
| 3088 | /* |
| 3089 | * Set the pageblock if the isolated page is at least half of a |
| 3090 | * pageblock |
| 3091 | */ |
| 3092 | if (order >= pageblock_order - 1) { |
| 3093 | struct page *endpage = page + (1 << order) - 1; |
| 3094 | for (; page < endpage; page += pageblock_nr_pages) { |
| 3095 | int mt = get_pageblock_migratetype(page); |
| 3096 | /* |
| 3097 | * Only change normal pageblocks (i.e., they can merge |
| 3098 | * with others) |
| 3099 | */ |
| 3100 | if (migratetype_is_mergeable(mt)) |
| 3101 | move_freepages_block(zone, page, mt,
| 3102 | MIGRATE_MOVABLE);
| 3103 | } |
| 3104 | } |
| 3105 | |
| 3106 | return 1UL << order; |
| 3107 | } |
| 3108 | |
| 3109 | /** |
| 3110 | * __putback_isolated_page - Return a now-isolated page back where we got it |
| 3111 | * @page: Page that was isolated |
| 3112 | * @order: Order of the isolated page |
| 3113 | * @mt: The page's pageblock's migratetype |
| 3114 | * |
| 3115 | * This function is meant to return a page pulled from the free lists via
| 3116 | * __isolate_free_page() back to the free list it was pulled from.
| 3117 | */ |
| 3118 | void __putback_isolated_page(struct page *page, unsigned int order, int mt) |
| 3119 | { |
| 3120 | struct zone *zone = page_zone(page); |
| 3121 | |
| 3122 | /* zone lock should be held when this function is called */ |
| 3123 | lockdep_assert_held(&zone->lock); |
| 3124 | |
| 3125 | /* Return isolated page to tail of freelist. */ |
| 3126 | __free_one_page(page, page_to_pfn(page), zone, order, mt,
| 3127 | FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL); |
| 3128 | } |
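| |
| | /*
| |  * Sketch of the expected pairing (hypothetical caller; zone->lock held
| |  * throughout, as asserted above):
| |  *
| |  *	spin_lock_irqsave(&zone->lock, flags);
| |  *	if (__isolate_free_page(page, order)) {
| |  *		...
| |  *		__putback_isolated_page(page, order, mt);
| |  *	}
| |  *	spin_unlock_irqrestore(&zone->lock, flags);
| |  */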
| 3129 | |
| 3130 | /* |
| 3131 | * Update NUMA hit/miss statistics |
| 3132 | */ |
| 3133 | static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, |
| 3134 | long nr_account) |
| 3135 | { |
| 3136 | #ifdef CONFIG_NUMA |
| 3137 | enum numa_stat_item local_stat = NUMA_LOCAL; |
| 3138 | |
| 3139 | /* skip numa counters update if numa stats is disabled */ |
| 3140 | if (!static_branch_likely(&vm_numa_stat_key)) |
| 3141 | return; |
| 3142 | |
| 3143 | if (zone_to_nid(z) != numa_node_id())
| 3144 | local_stat = NUMA_OTHER;
| 3145 |
| 3146 | if (zone_to_nid(z) == zone_to_nid(preferred_zone))
| 3147 | __count_numa_events(z, NUMA_HIT, nr_account);
| 3148 | else {
| 3149 | __count_numa_events(z, NUMA_MISS, nr_account);
| 3150 | __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account);
| 3151 | }
| 3152 | __count_numa_events(z, local_stat, nr_account);
| 3153 | #endif |
| 3154 | } |
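| |
| | /*
| |  * Worked example (assuming CONFIG_NUMA and NUMA stats enabled): a task on
| |  * node 0 prefers a node-0 zone but is served from node 1. Node 1 records
| |  * NUMA_MISS and NUMA_OTHER, node 0 records NUMA_FOREIGN. Had the page come
| |  * from node 0 instead, node 0 would record NUMA_HIT and NUMA_LOCAL.
| |  */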
| 3155 | |
| 3156 | static __always_inline |
| 3157 | struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, |
| 3158 | unsigned int order, unsigned int alloc_flags, |
| 3159 | int migratetype) |
| 3160 | { |
| 3161 | struct page *page; |
| 3162 | unsigned long flags; |
| 3163 | |
| 3164 | do { |
| 3165 | page = NULL; |
| 3166 | if (unlikely(alloc_flags & ALLOC_TRYLOCK)) { |
| 3167 | if (!spin_trylock_irqsave(&zone->lock, flags)) |
| 3168 | return NULL; |
| 3169 | } else { |
| 3170 | spin_lock_irqsave(&zone->lock, flags); |
| 3171 | } |
| 3172 | if (alloc_flags & ALLOC_HIGHATOMIC) |
| 3173 | page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
| 3174 | if (!page) { |
| 3175 | enum rmqueue_mode rmqm = RMQUEUE_NORMAL; |
| 3176 | |
| 3177 | page = __rmqueue(zone, order, migratetype, alloc_flags, &rmqm);
| 3178 | |
| 3179 | /* |
| 3180 | * If the allocation fails, allow OOM handling and |
| 3181 | * order-0 (atomic) allocs access to HIGHATOMIC |
| 3182 | * reserves as failing now is worse than failing a |
| 3183 | * high-order atomic allocation in the future. |
| 3184 | */ |
| 3185 | if (!page && (alloc_flags & (ALLOC_OOM|ALLOC_NON_BLOCK))) |
| 3186 | page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
| 3187 |
| 3188 | if (!page) {
| 3189 | spin_unlock_irqrestore(&zone->lock, flags);
| 3190 | return NULL;
| 3191 | }
| 3192 | }
| 3193 | spin_unlock_irqrestore(&zone->lock, flags);
| 3194 | } while (check_new_pages(page, order)); |
| 3195 | |
| 3196 | __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); |
| 3197 | zone_statistics(preferred_zone, zone, 1);
| 3198 | |
| 3199 | return page; |
| 3200 | } |
| 3201 | |
| 3202 | static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order) |
| 3203 | { |
| 3204 | int high, base_batch, batch, max_nr_alloc; |
| 3205 | int high_max, high_min; |
| 3206 | |
| 3207 | base_batch = READ_ONCE(pcp->batch); |
| 3208 | high_min = READ_ONCE(pcp->high_min); |
| 3209 | high_max = READ_ONCE(pcp->high_max); |
| 3210 | high = pcp->high = clamp(pcp->high, high_min, high_max); |
| 3211 | |
| 3212 | /* Check for PCP disabled or boot pageset */ |
| 3213 | if (unlikely(high < base_batch)) |
| 3214 | return 1; |
| 3215 | |
| 3216 | if (order) |
| 3217 | batch = base_batch; |
| 3218 | else |
| 3219 | batch = (base_batch << pcp->alloc_factor); |
| 3220 | |
| 3221 | /* |
| 3222 | * If pcp->high were larger, we could avoid allocating from the
| 3223 | * zone.
| 3224 | */ |
| 3225 | if (high_min != high_max && !test_bit(ZONE_BELOW_HIGH, &zone->flags)) |
| 3226 | high = pcp->high = min(high + batch, high_max); |
| 3227 | |
| 3228 | if (!order) { |
| 3229 | max_nr_alloc = max(high - pcp->count - base_batch, base_batch); |
| 3230 | /* |
| 3231 | * Double the number of pages allocated each time there is |
| 3232 | * a subsequent allocation of order-0 pages without any freeing.
| 3233 | */ |
| 3234 | if (batch <= max_nr_alloc && |
| 3235 | pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX) |
| 3236 | pcp->alloc_factor++; |
| 3237 | batch = min(batch, max_nr_alloc); |
| 3238 | } |
| 3239 | |
| 3240 | /* |
| 3241 | * Scale batch relative to order if batch implies free pages |
| 3242 | * can be stored on the PCP. Batch can be 1 for small zones or |
| 3243 | * for boot pagesets which should never store free pages as |
| 3244 | * the pages may belong to arbitrary zones. |
| 3245 | */ |
| 3246 | if (batch > 1) |
| 3247 | batch = max(batch >> order, 2); |
| 3248 | |
| 3249 | return batch; |
| 3250 | } |
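| |
| | /*
| |  * Worked example (illustrative numbers only): with base_batch = 63 and
| |  * alloc_factor = 2, an order-0 refill requests 63 << 2 = 252 pages,
| |  * clamped by max_nr_alloc; an order-3 refill starts from batch = 63 and
| |  * the final scaling gives max(63 >> 3, 2) = 7 buddy allocations.
| |  */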
| 3251 | |
| 3252 | /* Remove page from the per-cpu list, caller must protect the list */ |
| 3253 | static inline |
| 3254 | struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, |
| 3255 | int migratetype, |
| 3256 | unsigned int alloc_flags, |
| 3257 | struct per_cpu_pages *pcp, |
| 3258 | struct list_head *list) |
| 3259 | { |
| 3260 | struct page *page; |
| 3261 | |
| 3262 | do { |
| 3263 | if (list_empty(list)) {
| 3264 | int batch = nr_pcp_alloc(pcp, zone, order); |
| 3265 | int alloced; |
| 3266 | |
| 3267 | alloced = rmqueue_bulk(zone, order, |
| 3268 | batch, list,
| 3269 | migratetype, alloc_flags); |
| 3270 | |
| 3271 | pcp->count += alloced << order; |
| 3272 | if (unlikely(list_empty(list))) |
| 3273 | return NULL; |
| 3274 | } |
| 3275 | |
| 3276 | page = list_first_entry(list, struct page, pcp_list); |
| 3277 | list_del(&page->pcp_list);
| 3278 | pcp->count -= 1 << order; |
| 3279 | } while (check_new_pages(page, order)); |
| 3280 | |
| 3281 | return page; |
| 3282 | } |
| 3283 | |
| 3284 | /* Lock and remove page from the per-cpu list */ |
| 3285 | static struct page *rmqueue_pcplist(struct zone *preferred_zone, |
| 3286 | struct zone *zone, unsigned int order, |
| 3287 | int migratetype, unsigned int alloc_flags) |
| 3288 | { |
| 3289 | struct per_cpu_pages *pcp; |
| 3290 | struct list_head *list; |
| 3291 | struct page *page; |
| 3292 | unsigned long __maybe_unused UP_flags; |
| 3293 | |
| 3294 | /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ |
| 3295 | pcp_trylock_prepare(UP_flags); |
| 3296 | pcp = pcp_spin_trylock(zone->per_cpu_pageset); |
| 3297 | if (!pcp) { |
| 3298 | pcp_trylock_finish(UP_flags); |
| 3299 | return NULL; |
| 3300 | } |
| 3301 | |
| 3302 | /* |
| 3303 | * On allocation, reduce the number of pages that are batch freed. |
| 3304 | * See nr_pcp_free() where free_factor is increased for subsequent |
| 3305 | * frees. |
| 3306 | */ |
| 3307 | pcp->free_count >>= 1; |
| 3308 | list = &pcp->lists[order_to_pindex(migratetype, order)]; |
| 3309 | page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); |
| 3310 | pcp_spin_unlock(pcp); |
| 3311 | pcp_trylock_finish(UP_flags); |
| 3312 | if (page) { |
| 3313 | __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); |
| 3314 | zone_statistics(preferred_zone, zone, 1);
| 3315 | } |
| 3316 | return page; |
| 3317 | } |
| 3318 | |
| 3319 | /* |
| 3320 | * Allocate a page from the given zone. |
| 3321 | * Use pcplists for THP or "cheap" high-order allocations. |
| 3322 | */ |
| 3323 | |
| 3324 | /* |
| 3325 | * Do not instrument rmqueue() with KMSAN. This function may call |
| 3326 | * __msan_poison_alloca() through a call to set_pfnblock_migratetype(). |
| 3327 | * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it |
| 3328 | * may call rmqueue() again, which will result in a deadlock. |
| 3329 | */ |
| 3330 | __no_sanitize_memory |
| 3331 | static inline |
| 3332 | struct page *rmqueue(struct zone *preferred_zone, |
| 3333 | struct zone *zone, unsigned int order, |
| 3334 | gfp_t gfp_flags, unsigned int alloc_flags, |
| 3335 | int migratetype) |
| 3336 | { |
| 3337 | struct page *page; |
| 3338 | |
| 3339 | if (likely(pcp_allowed_order(order))) { |
| 3340 | page = rmqueue_pcplist(preferred_zone, zone, order, |
| 3341 | migratetype, alloc_flags); |
| 3342 | if (likely(page)) |
| 3343 | goto out; |
| 3344 | } |
| 3345 | |
| 3346 | page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags, |
| 3347 | migratetype); |
| 3348 | |
| 3349 | out: |
| 3350 | /* Separate test+clear to avoid unnecessary atomics */ |
| 3351 | if ((alloc_flags & ALLOC_KSWAPD) && |
| 3352 | unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) { |
| 3353 | clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
| 3354 | wakeup_kswapd(zone, 0, 0, zone_idx(zone));
| 3355 | } |
| 3356 | |
| 3357 | VM_BUG_ON_PAGE(page && bad_range(zone, page), page); |
| 3358 | return page; |
| 3359 | } |
| 3360 | |
| 3361 | /* |
| 3362 | * Reserve the pageblock(s) surrounding an allocation request for |
| 3363 | * exclusive use of high-order atomic allocations if there are no |
| 3364 | * empty page blocks that contain a page with a suitable order |
| 3365 | */ |
| 3366 | static void reserve_highatomic_pageblock(struct page *page, int order, |
| 3367 | struct zone *zone) |
| 3368 | { |
| 3369 | int mt; |
| 3370 | unsigned long max_managed, flags; |
| 3371 | |
| 3372 | /* |
| 3373 | * The amount reserved: the minimum is 1 pageblock, the maximum is
| 3374 | * roughly 1% of a zone. If 1% of the zone is smaller than a
| 3375 | * pageblock, don't reserve any pageblocks.
| 3376 | * The check is race-prone but harmless.
| 3377 | */ |
| 3378 | if ((zone_managed_pages(zone) / 100) < pageblock_nr_pages) |
| 3379 | return; |
| 3380 | max_managed = ALIGN((zone_managed_pages(zone) / 100), pageblock_nr_pages); |
| 3381 | if (zone->nr_reserved_highatomic >= max_managed) |
| 3382 | return; |
| 3383 | |
| 3384 | spin_lock_irqsave(&zone->lock, flags); |
| 3385 | |
| 3386 | /* Recheck the nr_reserved_highatomic limit under the lock */ |
| 3387 | if (zone->nr_reserved_highatomic >= max_managed) |
| 3388 | goto out_unlock; |
| 3389 | |
| 3390 | /* Yoink! */ |
| 3391 | mt = get_pageblock_migratetype(page); |
| 3392 | /* Only reserve normal pageblocks (i.e., they can merge with others) */ |
| 3393 | if (!migratetype_is_mergeable(mt)) |
| 3394 | goto out_unlock; |
| 3395 | |
| 3396 | if (order < pageblock_order) { |
| 3397 | if (move_freepages_block(zone, page, mt, MIGRATE_HIGHATOMIC) == -1)
| 3398 | goto out_unlock; |
| 3399 | zone->nr_reserved_highatomic += pageblock_nr_pages; |
| 3400 | } else { |
| 3401 | change_pageblock_range(page, order, MIGRATE_HIGHATOMIC);
| 3402 | zone->nr_reserved_highatomic += 1 << order; |
| 3403 | } |
| 3404 | |
| 3405 | out_unlock: |
| 3406 | spin_unlock_irqrestore(&zone->lock, flags);
| 3407 | } |
| 3408 | |
| 3409 | /* |
| 3410 | * Used when an allocation is about to fail under memory pressure. This |
| 3411 | * potentially hurts the reliability of high-order allocations when under |
| 3412 | * intense memory pressure but failed atomic allocations should be easier |
| 3413 | * to recover from than an OOM. |
| 3414 | * |
| 3415 | * If @force is true, try to unreserve pageblocks even if that leaves
| 3416 | * the highatomic reserve completely exhausted.
| 3417 | */ |
| 3418 | static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, |
| 3419 | bool force) |
| 3420 | { |
| 3421 | struct zonelist *zonelist = ac->zonelist; |
| 3422 | unsigned long flags; |
| 3423 | struct zoneref *z; |
| 3424 | struct zone *zone; |
| 3425 | struct page *page; |
| 3426 | int order; |
| 3427 | int ret; |
| 3428 | |
| 3429 | for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, |
| 3430 | ac->nodemask) { |
| 3431 | /* |
| 3432 | * Preserve at least one pageblock unless memory pressure |
| 3433 | * is really high. |
| 3434 | */ |
| 3435 | if (!force && zone->nr_reserved_highatomic <= |
| 3436 | pageblock_nr_pages) |
| 3437 | continue; |
| 3438 | |
| 3439 | spin_lock_irqsave(&zone->lock, flags); |
| 3440 | for (order = 0; order < NR_PAGE_ORDERS; order++) { |
| 3441 | struct free_area *area = &(zone->free_area[order]); |
| 3442 | unsigned long size; |
| 3443 | |
| 3444 | page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
| 3445 | if (!page) |
| 3446 | continue; |
| 3447 | |
| 3448 | size = max(pageblock_nr_pages, 1UL << order); |
| 3449 | /* |
| 3450 | * It should never happen but changes to |
| 3451 | * locking could inadvertently allow a per-cpu |
| 3452 | * drain to add pages to MIGRATE_HIGHATOMIC |
| 3453 | * while unreserving so be safe and watch for |
| 3454 | * underflows. |
| 3455 | */ |
| 3456 | if (WARN_ON_ONCE(size > zone->nr_reserved_highatomic)) |
| 3457 | size = zone->nr_reserved_highatomic; |
| 3458 | zone->nr_reserved_highatomic -= size; |
| 3459 | |
| 3460 | /* |
| 3461 | * Convert to ac->migratetype and avoid the normal |
| 3462 | * pageblock stealing heuristics. Minimally, the caller |
| 3463 | * is doing the work and needs the pages. More |
| 3464 | * importantly, if the block was always converted to |
| 3465 | * MIGRATE_UNMOVABLE or another type then the number |
| 3466 | * of pageblocks that cannot be completely freed |
| 3467 | * may increase. |
| 3468 | */ |
| 3469 | if (order < pageblock_order) |
| 3470 | ret = move_freepages_block(zone, page,
| 3471 | MIGRATE_HIGHATOMIC,
| 3472 | ac->migratetype);
| 3473 | else {
| 3474 | move_to_free_list(page, zone, order,
| 3475 | MIGRATE_HIGHATOMIC,
| 3476 | ac->migratetype);
| 3477 | change_pageblock_range(page, order,
| 3478 | ac->migratetype);
| 3479 | ret = 1; |
| 3480 | } |
| 3481 | /* |
| 3482 | * Reserving the block(s) already succeeded, |
| 3483 | * so this should not fail on zone boundaries. |
| 3484 | */ |
| 3485 | WARN_ON_ONCE(ret == -1); |
| 3486 | if (ret > 0) { |
| 3487 | spin_unlock_irqrestore(&zone->lock, flags);
| 3488 | return ret; |
| 3489 | } |
| 3490 | } |
| 3491 | spin_unlock_irqrestore(&zone->lock, flags);
| 3492 | } |
| 3493 | |
| 3494 | return false; |
| 3495 | } |
| 3496 | |
| 3497 | static inline long __zone_watermark_unusable_free(struct zone *z, |
| 3498 | unsigned int order, unsigned int alloc_flags) |
| 3499 | { |
| 3500 | long unusable_free = (1 << order) - 1; |
| 3501 | |
| 3502 | /* |
| 3503 | * If the caller does not have rights to reserves below the min |
| 3504 | * watermark then subtract the free pages reserved for highatomic. |
| 3505 | */ |
| 3506 | if (likely(!(alloc_flags & ALLOC_RESERVES))) |
| 3507 | unusable_free += READ_ONCE(z->nr_free_highatomic); |
| 3508 | |
| 3509 | #ifdef CONFIG_CMA |
| 3510 | /* If allocation can't use CMA areas don't use free CMA pages */ |
| 3511 | if (!(alloc_flags & ALLOC_CMA)) |
| 3512 | unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES); |
| 3513 | #endif |
| 3514 | |
| 3515 | return unusable_free; |
| 3516 | } |
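| |
| | /*
| |  * Worked example (illustrative): for an order-2 request with neither
| |  * ALLOC_RESERVES nor ALLOC_CMA, unusable_free starts at (1 << 2) - 1 = 3,
| |  * then grows by nr_free_highatomic and by the free CMA pages, none of
| |  * which this allocation is allowed to consume.
| |  */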
| 3517 | |
| 3518 | /* |
| 3519 | * Return true if free base pages are above 'mark'. For high-order checks it |
| 3520 | * will return true if the order-0 watermark is reached and there is at least
| 3521 | * one free page of a suitable size. Checking now avoids taking the zone lock |
| 3522 | * to check in the allocation paths if no pages are free. |
| 3523 | */ |
| 3524 | bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, |
| 3525 | int highest_zoneidx, unsigned int alloc_flags, |
| 3526 | long free_pages) |
| 3527 | { |
| 3528 | long min = mark; |
| 3529 | int o; |
| 3530 | |
| 3531 | /* free_pages may go negative - that's OK */ |
| 3532 | free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); |
| 3533 | |
| 3534 | if (unlikely(alloc_flags & ALLOC_RESERVES)) { |
| 3535 | /* |
| 3536 | * __GFP_HIGH allows access to 50% of the min reserve as well |
| 3537 | * as OOM. |
| 3538 | */ |
| 3539 | if (alloc_flags & ALLOC_MIN_RESERVE) { |
| 3540 | min -= min / 2; |
| 3541 | |
| 3542 | /* |
| 3543 | * Non-blocking allocations (e.g. GFP_ATOMIC) can |
| 3544 | * access more reserves than just __GFP_HIGH. Other |
| 3545 | * non-blocking allocations requests such as GFP_NOWAIT |
| 3546 | * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get |
| 3547 | * access to the min reserve. |
| 3548 | */ |
| 3549 | if (alloc_flags & ALLOC_NON_BLOCK) |
| 3550 | min -= min / 4; |
| 3551 | } |
| 3552 | |
| 3553 | /* |
| 3554 | * OOM victims can try even harder than the normal reserve |
| 3555 | * users on the grounds that it's definitely going to be in |
| 3556 | * the exit path shortly and free memory. Any allocation it |
| 3557 | * makes during the free path will be small and short-lived. |
| 3558 | */ |
| 3559 | if (alloc_flags & ALLOC_OOM) |
| 3560 | min -= min / 2; |
| 3561 | } |
| 3562 | |
| 3563 | /* |
| 3564 | * Check watermarks for an order-0 allocation request. If these |
| 3565 | * are not met, then a high-order request also cannot go ahead |
| 3566 | * even if a suitable page happened to be free. |
| 3567 | */ |
| 3568 | if (free_pages <= min + z->lowmem_reserve[highest_zoneidx]) |
| 3569 | return false; |
| 3570 | |
| 3571 | /* If this is an order-0 request then the watermark is fine */ |
| 3572 | if (!order) |
| 3573 | return true; |
| 3574 | |
| 3575 | /* For a high-order request, check at least one suitable page is free */ |
| 3576 | for (o = order; o < NR_PAGE_ORDERS; o++) { |
| 3577 | struct free_area *area = &z->free_area[o]; |
| 3578 | int mt; |
| 3579 | |
| 3580 | if (!area->nr_free) |
| 3581 | continue; |
| 3582 | |
| 3583 | for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { |
| 3584 | if (!free_area_empty(area, mt))
| 3585 | return true; |
| 3586 | } |
| 3587 | |
| 3588 | #ifdef CONFIG_CMA |
| 3589 | if ((alloc_flags & ALLOC_CMA) && |
| 3590 | !free_area_empty(area, MIGRATE_CMA)) { |
| 3591 | return true; |
| 3592 | } |
| 3593 | #endif |
| 3594 | if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) && |
| 3595 | !free_area_empty(area, MIGRATE_HIGHATOMIC)) {
| 3596 | return true; |
| 3597 | } |
| 3598 | } |
| 3599 | return false; |
| 3600 | } |
| 3601 | |
| 3602 | bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, |
| 3603 | int highest_zoneidx, unsigned int alloc_flags) |
| 3604 | { |
| 3605 | return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, |
| 3606 | zone_page_state(z, NR_FREE_PAGES));
| 3607 | } |
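| |
| | /*
| |  * Typical use (sketch; real callers usually pass the allocation context's
| |  * highest_zoneidx rather than the zone's own index):
| |  *
| |  *	if (!zone_watermark_ok(zone, order, min_wmark_pages(zone),
| |  *			       zone_idx(zone), 0))
| |  *		return false;
| |  */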
| 3608 | |
| 3609 | static inline bool zone_watermark_fast(struct zone *z, unsigned int order, |
| 3610 | unsigned long mark, int highest_zoneidx, |
| 3611 | unsigned int alloc_flags, gfp_t gfp_mask) |
| 3612 | { |
| 3613 | long free_pages; |
| 3614 | |
| 3615 | free_pages = zone_page_state(z, NR_FREE_PAGES);
| 3616 | |
| 3617 | /* |
| 3618 | * Fast check for order-0 only. If this fails then the reserves |
| 3619 | * need to be calculated. |
| 3620 | */ |
| 3621 | if (!order) { |
| 3622 | long usable_free; |
| 3623 | long reserved; |
| 3624 | |
| 3625 | usable_free = free_pages; |
| 3626 | reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
| 3627 | |
| 3628 | /* reserved may overestimate high-atomic reserves. */
| 3629 | usable_free -= min(usable_free, reserved); |
| 3630 | if (usable_free > mark + z->lowmem_reserve[highest_zoneidx]) |
| 3631 | return true; |
| 3632 | } |
| 3633 | |
| 3634 | if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, |
| 3635 | free_pages)) |
| 3636 | return true; |
| 3637 | |
| 3638 | /* |
| 3639 | * Ignore watermark boosting for __GFP_HIGH order-0 allocations |
| 3640 | * when checking the min watermark. The min watermark is the |
| 3641 | * point where boosting is ignored so that kswapd is woken up |
| 3642 | * when below the low watermark. |
| 3643 | */ |
| 3644 | if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost |
| 3645 | && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { |
| 3646 | mark = z->_watermark[WMARK_MIN]; |
| 3647 | return __zone_watermark_ok(z, order, mark, highest_zoneidx, |
| 3648 | alloc_flags, free_pages); |
| 3649 | } |
| 3650 | |
| 3651 | return false; |
| 3652 | } |
| 3653 | |
| 3654 | #ifdef CONFIG_NUMA |
| 3655 | int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; |
| 3656 | |
| 3657 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) |
| 3658 | { |
| 3659 | return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= |
| 3660 | node_reclaim_distance; |
| 3661 | } |
| 3662 | #else /* CONFIG_NUMA */ |
| 3663 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) |
| 3664 | { |
| 3665 | return true; |
| 3666 | } |
| 3667 | #endif /* CONFIG_NUMA */ |
| 3668 | |
| 3669 | /* |
| 3670 | * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid |
| 3671 | * fragmentation is subtle. If the preferred zone was HIGHMEM then |
| 3672 | * premature use of a lower zone may cause lowmem pressure problems that |
| 3673 | * are worse than fragmentation. If the next zone is ZONE_DMA then it is |
| 3674 | * probably too small. It only makes sense to spread allocations to avoid |
| 3675 | * fragmentation between the Normal and DMA32 zones. |
| 3676 | */ |
| 3677 | static inline unsigned int |
| 3678 | alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) |
| 3679 | { |
| 3680 | unsigned int alloc_flags; |
| 3681 | |
| 3682 | /* |
| 3683 | * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD |
| 3684 | * to save a branch. |
| 3685 | */ |
| 3686 | alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM); |
| 3687 | |
| 3688 | if (defrag_mode) { |
| 3689 | alloc_flags |= ALLOC_NOFRAGMENT; |
| 3690 | return alloc_flags; |
| 3691 | } |
| 3692 | |
| 3693 | #ifdef CONFIG_ZONE_DMA32 |
| 3694 | if (!zone) |
| 3695 | return alloc_flags; |
| 3696 | |
| 3697 | if (zone_idx(zone) != ZONE_NORMAL) |
| 3698 | return alloc_flags; |
| 3699 | |
| 3700 | /* |
| 3701 | * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and |
| 3702 | * the pointer is within zone->zone_pgdat->node_zones[]. Also assume |
| 3703 | * on UMA that if Normal is populated then so is DMA32. |
| 3704 | */ |
| 3705 | BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1); |
| 3706 | if (nr_online_nodes > 1 && !populated_zone(--zone))
| 3707 | return alloc_flags; |
| 3708 | |
| 3709 | alloc_flags |= ALLOC_NOFRAGMENT; |
| 3710 | #endif /* CONFIG_ZONE_DMA32 */ |
| 3711 | return alloc_flags; |
| 3712 | } |
| 3713 | |
| 3714 | /* Must be called after current_gfp_context() which can change gfp_mask */ |
| 3715 | static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask, |
| 3716 | unsigned int alloc_flags) |
| 3717 | { |
| 3718 | #ifdef CONFIG_CMA |
| 3719 | if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) |
| 3720 | alloc_flags |= ALLOC_CMA; |
| 3721 | #endif |
| 3722 | return alloc_flags; |
| 3723 | } |
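| |
| | /*
| |  * Example (assuming CONFIG_CMA): GFP_HIGHUSER_MOVABLE maps to
| |  * MIGRATE_MOVABLE, so such requests get ALLOC_CMA and may be served from
| |  * CMA areas; GFP_KERNEL maps to MIGRATE_UNMOVABLE and does not.
| |  */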
| 3724 | |
| 3725 | /* |
| 3726 | * get_page_from_freelist goes through the zonelist trying to allocate |
| 3727 | * a page. |
| 3728 | */ |
| 3729 | static struct page * |
| 3730 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, |
| 3731 | const struct alloc_context *ac) |
| 3732 | { |
| 3733 | struct zoneref *z; |
| 3734 | struct zone *zone; |
| 3735 | struct pglist_data *last_pgdat = NULL; |
| 3736 | bool last_pgdat_dirty_ok = false; |
| 3737 | bool no_fallback; |
| 3738 | bool skip_kswapd_nodes = nr_online_nodes > 1; |
| 3739 | bool skipped_kswapd_nodes = false; |
| 3740 | |
| 3741 | retry: |
| 3742 | /* |
| 3743 | * Scan zonelist, looking for a zone with enough free. |
| 3744 | * See also cpuset_current_node_allowed() comment in kernel/cgroup/cpuset.c. |
| 3745 | */ |
| 3746 | no_fallback = alloc_flags & ALLOC_NOFRAGMENT; |
| 3747 | z = ac->preferred_zoneref; |
| 3748 | for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx, |
| 3749 | ac->nodemask) { |
| 3750 | struct page *page; |
| 3751 | unsigned long mark; |
| 3752 | |
| 3753 | if (cpusets_enabled() && |
| 3754 | (alloc_flags & ALLOC_CPUSET) && |
| 3755 | !__cpuset_zone_allowed(zone, gfp_mask))
| 3756 | continue; |
| 3757 | /* |
| 3758 | * When allocating a page cache page for writing, we |
| 3759 | * want to get it from a node that is within its dirty |
| 3760 | * limit, such that no single node holds more than its |
| 3761 | * proportional share of globally allowed dirty pages. |
| 3762 | * The dirty limits take into account the node's |
| 3763 | * lowmem reserves and high watermark so that kswapd |
| 3764 | * should be able to balance it without having to |
| 3765 | * write pages from its LRU list. |
| 3766 | * |
| 3767 | * XXX: For now, allow allocations to potentially |
| 3768 | * exceed the per-node dirty limit in the slowpath |
| 3769 | * (spread_dirty_pages unset) before going into reclaim, |
| 3770 | * which is important when on a NUMA setup the allowed |
| 3771 | * nodes are together not big enough to reach the |
| 3772 | * global limit. The proper fix for these situations |
| 3773 | * will require awareness of nodes in the |
| 3774 | * dirty-throttling and the flusher threads. |
| 3775 | */ |
| 3776 | if (ac->spread_dirty_pages) { |
| 3777 | if (last_pgdat != zone->zone_pgdat) { |
| 3778 | last_pgdat = zone->zone_pgdat; |
| 3779 | last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat);
| 3780 | } |
| 3781 | |
| 3782 | if (!last_pgdat_dirty_ok) |
| 3783 | continue; |
| 3784 | } |
| 3785 | |
| 3786 | if (no_fallback && !defrag_mode && nr_online_nodes > 1 && |
| 3787 | zone != zonelist_zone(ac->preferred_zoneref)) {
| 3788 | int local_nid; |
| 3789 | |
| 3790 | /* |
| 3791 | * If moving to a remote node, retry but allow |
| 3792 | * fragmenting fallbacks. Locality is more important |
| 3793 | * than fragmentation avoidance. |
| 3794 | */ |
| 3795 | local_nid = zonelist_node_idx(ac->preferred_zoneref);
| 3796 | if (zone_to_nid(zone) != local_nid) { |
| 3797 | alloc_flags &= ~ALLOC_NOFRAGMENT; |
| 3798 | goto retry; |
| 3799 | } |
| 3800 | } |
| 3801 | |
| 3802 | /* |
| 3803 | * If kswapd is already active on a node, keep looking |
| 3804 | * for other nodes that might be idle. This can happen |
| 3805 | * if another process has NUMA bindings and is causing |
| 3806 | * kswapd wakeups on only some nodes. Avoid accidental |
| 3807 | * "node_reclaim_mode"-like behavior in this case. |
| 3808 | */ |
| 3809 | if (skip_kswapd_nodes && |
| 3810 | !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) {
| 3811 | skipped_kswapd_nodes = true; |
| 3812 | continue; |
| 3813 | } |
| 3814 | |
| 3815 | cond_accept_memory(zone, order, alloc_flags); |
| 3816 | |
| 3817 | /* |
| 3818 | * Detect whether the number of free pages is below high |
| 3819 | * watermark. If so, we will decrease pcp->high and free |
| 3820 | * PCP pages in the free path to reduce the possibility of
| 3821 | * premature page reclaim. Detection is done here to
| 3822 | * avoid doing that in the hotter free path.
| 3823 | */ |
| 3824 | if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) |
| 3825 | goto check_alloc_wmark; |
| 3826 | |
| 3827 | mark = high_wmark_pages(zone);
| 3828 | if (zone_watermark_fast(zone, order, mark,
| 3829 | ac->highest_zoneidx, alloc_flags,
| 3830 | gfp_mask))
| 3831 | goto try_this_zone;
| 3832 | else
| 3833 | set_bit(ZONE_BELOW_HIGH, &zone->flags);
| 3834 | |
| 3835 | check_alloc_wmark: |
| 3836 | mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
| 3837 | if (!zone_watermark_fast(zone, order, mark,
| 3838 | ac->highest_zoneidx, alloc_flags,
| 3839 | gfp_mask)) {
| 3840 | int ret; |
| 3841 | |
| 3842 | if (cond_accept_memory(zone, order, alloc_flags)) |
| 3843 | goto try_this_zone; |
| 3844 | |
| 3845 | /* |
| 3846 | * Watermark failed for this zone, but see if we can |
| 3847 | * grow this zone if it contains deferred pages. |
| 3848 | */ |
| 3849 | if (deferred_pages_enabled()) { |
| 3850 | if (_deferred_grow_zone(zone, order)) |
| 3851 | goto try_this_zone; |
| 3852 | } |
| 3853 | /* Checked here to keep the fast path fast */ |
| 3854 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); |
| 3855 | if (alloc_flags & ALLOC_NO_WATERMARKS) |
| 3856 | goto try_this_zone; |
| 3857 | |
| 3858 | if (!node_reclaim_enabled() || |
| 3859 | !zone_allows_reclaim(zonelist_zone(ac->preferred_zoneref), zone))
| 3860 | continue; |
| 3861 | |
| 3862 | ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); |
| 3863 | switch (ret) { |
| 3864 | case NODE_RECLAIM_NOSCAN: |
| 3865 | /* did not scan */ |
| 3866 | continue; |
| 3867 | case NODE_RECLAIM_FULL: |
| 3868 | /* scanned but unreclaimable */ |
| 3869 | continue; |
| 3870 | default: |
| 3871 | /* did we reclaim enough */ |
| 3872 | if (zone_watermark_ok(zone, order, mark,
| 3873 | ac->highest_zoneidx, alloc_flags))
| 3874 | goto try_this_zone; |
| 3875 | |
| 3876 | continue; |
| 3877 | } |
| 3878 | } |
| 3879 | |
| 3880 | try_this_zone: |
| 3881 | page = rmqueue(zonelist_zone(ac->preferred_zoneref), zone, order,
| 3882 | gfp_mask, alloc_flags, ac->migratetype);
| 3883 | if (page) {
| 3884 | prep_new_page(page, order, gfp_mask, alloc_flags);
| 3885 | |
| 3886 | /* |
| 3887 | * If this is a high-order atomic allocation then check |
| 3888 | * if the pageblock should be reserved for the future |
| 3889 | */ |
| 3890 | if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) |
| 3891 | reserve_highatomic_pageblock(page, order, zone); |
| 3892 | |
| 3893 | return page; |
| 3894 | } else { |
| 3895 | if (cond_accept_memory(zone, order, alloc_flags)) |
| 3896 | goto try_this_zone; |
| 3897 | |
| 3898 | /* Try again if zone has deferred pages */ |
| 3899 | if (deferred_pages_enabled()) { |
| 3900 | if (_deferred_grow_zone(zone, order)) |
| 3901 | goto try_this_zone; |
| 3902 | } |
| 3903 | } |
| 3904 | } |
| 3905 | |
| 3906 | /* |
| 3907 | * If we skipped over nodes with active kswapds and found no |
| 3908 | * idle nodes, retry and place anywhere the watermarks permit. |
| 3909 | */ |
| 3910 | if (skip_kswapd_nodes && skipped_kswapd_nodes) { |
| 3911 | skip_kswapd_nodes = false; |
| 3912 | goto retry; |
| 3913 | } |
| 3914 | |
| 3915 | /* |
| 3916 | * It's possible on a UMA machine to get through all zones that are |
| 3917 | * fragmented. If avoiding fragmentation, reset and try again. |
| 3918 | */ |
| 3919 | if (no_fallback && !defrag_mode) { |
| 3920 | alloc_flags &= ~ALLOC_NOFRAGMENT; |
| 3921 | goto retry; |
| 3922 | } |
| 3923 | |
| 3924 | return NULL; |
| 3925 | } |
| 3926 | |
| 3927 | static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) |
| 3928 | { |
| 3929 | unsigned int filter = SHOW_MEM_FILTER_NODES; |
| 3930 | |
| 3931 | /* |
| 3932 | * This documents exceptions given to allocations in certain |
| 3933 | * contexts that are allowed to allocate outside current's set |
| 3934 | * of allowed nodes. |
| 3935 | */ |
| 3936 | if (!(gfp_mask & __GFP_NOMEMALLOC)) |
| 3937 | if (tsk_is_oom_victim(current) || |
| 3938 | (current->flags & (PF_MEMALLOC | PF_EXITING))) |
| 3939 | filter &= ~SHOW_MEM_FILTER_NODES; |
| 3940 | if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) |
| 3941 | filter &= ~SHOW_MEM_FILTER_NODES; |
| 3942 | |
| 3943 | __show_mem(filter, nodemask, gfp_zone(gfp_mask));
| 3944 | } |
| 3945 | |
| 3946 | void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) |
| 3947 | { |
| 3948 | struct va_format vaf; |
| 3949 | va_list args; |
| 3950 | static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1); |
| 3951 | |
| 3952 | if ((gfp_mask & __GFP_NOWARN) || |
| 3953 | !__ratelimit(&nopage_rs) || |
| 3954 | ((gfp_mask & __GFP_DMA) && !has_managed_dma())) |
| 3955 | return; |
| 3956 | |
| 3957 | va_start(args, fmt); |
| 3958 | vaf.fmt = fmt; |
| 3959 | vaf.va = &args; |
| 3960 | pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
| 3961 | current->comm, &vaf, gfp_mask, &gfp_mask, |
| 3962 | nodemask_pr_args(nodemask)); |
| 3963 | va_end(args); |
| 3964 | |
| 3965 | cpuset_print_current_mems_allowed(); |
| 3966 | pr_cont("\n");
| 3967 | dump_stack(); |
| 3968 | warn_alloc_show_mem(gfp_mask, nodemask); |
| 3969 | } |
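| |
| | /*
| |  * Usage sketch (this mirrors the slow path's failure report; the format
| |  * string is illustrative):
| |  *
| |  *	warn_alloc(gfp_mask, ac->nodemask,
| |  *		   "page allocation failure: order:%u", order);
| |  */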
| 3970 | |
| 3971 | static inline struct page * |
| 3972 | __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, |
| 3973 | unsigned int alloc_flags, |
| 3974 | const struct alloc_context *ac) |
| 3975 | { |
| 3976 | struct page *page; |
| 3977 | |
| 3978 | page = get_page_from_freelist(gfp_mask, order, |
| 3979 | alloc_flags|ALLOC_CPUSET, ac);
| 3980 | /* |
| 3981 | * fallback to ignore cpuset restriction if our nodes |
| 3982 | * are depleted |
| 3983 | */ |
| 3984 | if (!page) |
| 3985 | page = get_page_from_freelist(gfp_mask, order, |
| 3986 | alloc_flags, ac); |
| 3987 | return page; |
| 3988 | } |
| 3989 | |
| 3990 | static inline struct page * |
| 3991 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, |
| 3992 | const struct alloc_context *ac, unsigned long *did_some_progress) |
| 3993 | { |
| 3994 | struct oom_control oc = { |
| 3995 | .zonelist = ac->zonelist, |
| 3996 | .nodemask = ac->nodemask, |
| 3997 | .memcg = NULL, |
| 3998 | .gfp_mask = gfp_mask, |
| 3999 | .order = order, |
| 4000 | }; |
| 4001 | struct page *page; |
| 4002 | |
| 4003 | *did_some_progress = 0; |
| 4004 | |
| 4005 | /* |
| 4006 | * Acquire the oom lock. If that fails, somebody else is |
| 4007 | * making progress for us. |
| 4008 | */ |
| 4009 | if (!mutex_trylock(&oom_lock)) {
| 4010 | *did_some_progress = 1; |
| 4011 | schedule_timeout_uninterruptible(1);
| 4012 | return NULL; |
| 4013 | } |
| 4014 | |
| 4015 | /* |
| 4016 | * Go through the zonelist yet one more time, keeping a very high
| 4017 | * watermark here. This only catches a parallel OOM kill; we must fail
| 4018 | * if we're still under heavy pressure. Also make sure this reclaim
| 4019 | * attempt does not depend on a __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
| 4020 | * allocation, which would never fail with oom_lock already held.
| 4021 | */ |
| 4022 | page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
| 4023 | ~__GFP_DIRECT_RECLAIM, order, |
| 4024 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); |
| 4025 | if (page) |
| 4026 | goto out; |
| 4027 | |
| 4028 | /* Coredumps can quickly deplete all memory reserves */ |
| 4029 | if (current->flags & PF_DUMPCORE) |
| 4030 | goto out; |
| 4031 | /* The OOM killer will not help higher order allocs */ |
| 4032 | if (order > PAGE_ALLOC_COSTLY_ORDER) |
| 4033 | goto out; |
| 4034 | /* |
| 4035 | * We have already exhausted all our reclaim opportunities without any |
| 4036 | * success so it is time to admit defeat. We will skip the OOM killer |
| 4037 | * because it is very likely that the caller has a more reasonable |
| 4038 | * fallback than shooting a random task. |
| 4039 | * |
| 4040 | * The OOM killer may not free memory on a specific node. |
| 4041 | */ |
| 4042 | if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE)) |
| 4043 | goto out; |
| 4044 | /* The OOM killer does not needlessly kill tasks for lowmem */ |
| 4045 | if (ac->highest_zoneidx < ZONE_NORMAL) |
| 4046 | goto out; |
| 4047 | if (pm_suspended_storage()) |
| 4048 | goto out; |
| 4049 | /* |
| 4050 | * XXX: GFP_NOFS allocations should rather fail than rely on |
| 4051 | * other request to make a forward progress. |
| 4052 | * We are in an unfortunate situation where out_of_memory cannot |
| 4053 | * do much for this context but let's try it to at least get |
| 4054 | * access to memory reserved if the current task is killed (see |
| 4055 | * out_of_memory). Once filesystems are ready to handle allocation |
| 4056 | * failures more gracefully we should just bail out here. |
| 4057 | */ |
| 4058 | |
| 4059 | /* Exhausted what can be done so it's blame time */ |
| 4060 | if (out_of_memory(&oc) ||
| 4061 | WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) { |
| 4062 | *did_some_progress = 1; |
| 4063 | |
| 4064 | /* |
| 4065 | * Help non-failing allocations by giving them access to memory |
| 4066 | * reserves |
| 4067 | */ |
| 4068 | if (gfp_mask & __GFP_NOFAIL) |
| 4069 | page = __alloc_pages_cpuset_fallback(gfp_mask, order, |
| 4070 | ALLOC_NO_WATERMARKS, ac); |
| 4071 | } |
| 4072 | out: |
| 4073 | mutex_unlock(&oom_lock);
| 4074 | return page; |
| 4075 | } |
| 4076 | |
| 4077 | /* |
| 4078 | * Maximum number of compaction retries with progress before the OOM
| 4079 | * killer is considered the only way to move forward.
| 4080 | */ |
| 4081 | #define MAX_COMPACT_RETRIES 16 |
| 4082 | |
| 4083 | #ifdef CONFIG_COMPACTION |
| 4084 | /* Try memory compaction for high-order allocations before reclaim */ |
| 4085 | static struct page * |
| 4086 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
| 4087 | unsigned int alloc_flags, const struct alloc_context *ac, |
| 4088 | enum compact_priority prio, enum compact_result *compact_result) |
| 4089 | { |
| 4090 | struct page *page = NULL; |
| 4091 | unsigned long pflags; |
| 4092 | unsigned int noreclaim_flag; |
| 4093 | |
| 4094 | if (!order) |
| 4095 | return NULL; |
| 4096 | |
| 4097 | psi_memstall_enter(&pflags);
| 4098 | delayacct_compact_start(); |
| 4099 | noreclaim_flag = memalloc_noreclaim_save(); |
| 4100 | |
| 4101 | *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, |
| 4102 | prio, &page);
| 4103 | |
| 4104 | memalloc_noreclaim_restore(noreclaim_flag);
| 4105 | psi_memstall_leave(&pflags);
| 4106 | delayacct_compact_end(); |
| 4107 | |
| 4108 | if (*compact_result == COMPACT_SKIPPED) |
| 4109 | return NULL; |
| 4110 | /* |
| 4111 | * In at least one zone, compaction wasn't deferred or skipped, so let's
| 4112 | * count a compaction stall.
| 4113 | */ |
| 4114 | count_vm_event(COMPACTSTALL);
| 4115 | |
| 4116 | /* Prep a captured page if available */ |
| 4117 | if (page) |
| 4118 | prep_new_page(page, order, gfp_mask, alloc_flags);
| 4119 | |
| 4120 | /* Try get a page from the freelist if available */ |
| 4121 | if (!page) |
| 4122 | page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); |
| 4123 | |
| 4124 | if (page) { |
| 4125 | struct zone *zone = page_zone(page); |
| 4126 | |
| 4127 | zone->compact_blockskip_flush = false; |
| 4128 | compaction_defer_reset(zone, order, true);
| 4129 | count_vm_event(COMPACTSUCCESS);
| 4130 | return page; |
| 4131 | } |
| 4132 | |
| 4133 | /* |
| 4134 | * It's bad if a compaction run occurs and fails. The most likely reason
| 4135 | * is that pages exist, but not enough to satisfy watermarks. |
| 4136 | */ |
| 4137 | count_vm_event(COMPACTFAIL);
| 4138 | |
| 4139 | cond_resched(); |
| 4140 | |
| 4141 | return NULL; |
| 4142 | } |
| 4143 | |
| 4144 | static inline bool |
| 4145 | should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, |
| 4146 | enum compact_result compact_result, |
| 4147 | enum compact_priority *compact_priority, |
| 4148 | int *compaction_retries) |
| 4149 | { |
| 4150 | int max_retries = MAX_COMPACT_RETRIES; |
| 4151 | int min_priority; |
| 4152 | bool ret = false; |
| 4153 | int retries = *compaction_retries; |
| 4154 | enum compact_priority priority = *compact_priority; |
| 4155 | |
| 4156 | if (!order) |
| 4157 | return false; |
| 4158 | |
| 4159 | if (fatal_signal_pending(current)) |
| 4160 | return false; |
| 4161 | |
| 4162 | /* |
| 4163 | * Compaction was skipped due to a lack of free order-0 |
| 4164 | * migration targets. Continue if reclaim can help. |
| 4165 | */ |
| 4166 | if (compact_result == COMPACT_SKIPPED) { |
| 4167 | ret = compaction_zonelist_suitable(ac, order, alloc_flags); |
| 4168 | goto out; |
| 4169 | } |
| 4170 | |
| 4171 | /* |
| 4172 | * Compaction managed to coalesce some page blocks, but the |
| 4173 | * allocation failed presumably due to a race. Retry some. |
| 4174 | */ |
| 4175 | if (compact_result == COMPACT_SUCCESS) { |
| 4176 | /* |
| 4177 | * !costly requests are much more important than |
| 4178 | * __GFP_RETRY_MAYFAIL costly ones because they are de |
| 4179 | * facto nofail and invoke OOM killer to move on while |
| 4180 | * costly can fail and users are ready to cope with |
| 4181 | * that. 1/4 retries is rather arbitrary but we would |
| 4182 | * need much more detailed feedback from compaction to |
| 4183 | * make a better decision. |
| 4184 | */ |
| 4185 | if (order > PAGE_ALLOC_COSTLY_ORDER) |
| 4186 | max_retries /= 4; |
| 4187 | |
| 4188 | if (++(*compaction_retries) <= max_retries) { |
| 4189 | ret = true; |
| 4190 | goto out; |
| 4191 | } |
| 4192 | } |
| 4193 | |
| 4194 | /* |
| 4195 | * Compaction failed. Retry with increasing priority. |
| 4196 | */ |
| 4197 | min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? |
| 4198 | MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; |
| 4199 | |
| 4200 | if (*compact_priority > min_priority) { |
| 4201 | (*compact_priority)--; |
| 4202 | *compaction_retries = 0; |
| 4203 | ret = true; |
| 4204 | } |
| 4205 | out: |
| 4206 | trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
| 4207 | return ret; |
| 4208 | } |
| 4209 | #else |
| 4210 | static inline struct page * |
| 4211 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
| 4212 | unsigned int alloc_flags, const struct alloc_context *ac, |
| 4213 | enum compact_priority prio, enum compact_result *compact_result) |
| 4214 | { |
| 4215 | *compact_result = COMPACT_SKIPPED; |
| 4216 | return NULL; |
| 4217 | } |
| 4218 | |
| 4219 | static inline bool |
| 4220 | should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, |
| 4221 | enum compact_result compact_result, |
| 4222 | enum compact_priority *compact_priority, |
| 4223 | int *compaction_retries) |
| 4224 | { |
| 4225 | struct zone *zone; |
| 4226 | struct zoneref *z; |
| 4227 | |
| 4228 | if (!order || order > PAGE_ALLOC_COSTLY_ORDER) |
| 4229 | return false; |
| 4230 | |
| 4231 | /* |
| 4232 | * There are setups with compaction disabled which would prefer to loop |
| 4233 | * inside the allocator rather than hit the oom killer prematurely. |
| 4234 | * Let's give them a good hope and keep retrying while the order-0 |
| 4235 | * watermarks are OK. |
| 4236 | */ |
| 4237 | for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, |
| 4238 | ac->highest_zoneidx, ac->nodemask) { |
| 4239 | if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), |
| 4240 | ac->highest_zoneidx, alloc_flags)) |
| 4241 | return true; |
| 4242 | } |
| 4243 | return false; |
| 4244 | } |
| 4245 | #endif /* CONFIG_COMPACTION */ |
| 4246 | |
| 4247 | #ifdef CONFIG_LOCKDEP |
| 4248 | static struct lockdep_map __fs_reclaim_map = |
| 4249 | STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
| 4250 | |
| 4251 | static bool __need_reclaim(gfp_t gfp_mask) |
| 4252 | { |
| 4253 | /* no reclaim without waiting on it */ |
| 4254 | if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) |
| 4255 | return false; |
| 4256 | |
| 4257 | /* this guy won't enter reclaim */ |
| 4258 | if (current->flags & PF_MEMALLOC) |
| 4259 | return false; |
| 4260 | |
| 4261 | if (gfp_mask & __GFP_NOLOCKDEP) |
| 4262 | return false; |
| 4263 | |
| 4264 | return true; |
| 4265 | } |
| 4266 | |
| 4267 | void __fs_reclaim_acquire(unsigned long ip) |
| 4268 | { |
| 4269 | lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip); |
| 4270 | } |
| 4271 | |
| 4272 | void __fs_reclaim_release(unsigned long ip) |
| 4273 | { |
| 4274 | lock_release(&__fs_reclaim_map, ip); |
| 4275 | } |
| 4276 | |
| 4277 | void fs_reclaim_acquire(gfp_t gfp_mask) |
| 4278 | { |
| 4279 | gfp_mask = current_gfp_context(gfp_mask); |
| 4280 | |
| 4281 | if (__need_reclaim(gfp_mask)) { |
| 4282 | if (gfp_mask & __GFP_FS) |
| 4283 | __fs_reclaim_acquire(_RET_IP_); |
| 4284 | |
| 4285 | #ifdef CONFIG_MMU_NOTIFIER |
| 4286 | lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); |
| 4287 | lock_map_release(&__mmu_notifier_invalidate_range_start_map); |
| 4288 | #endif |
| 4289 | |
| 4290 | } |
| 4291 | } |
| 4292 | EXPORT_SYMBOL_GPL(fs_reclaim_acquire); |
| 4293 | |
| 4294 | void fs_reclaim_release(gfp_t gfp_mask) |
| 4295 | { |
| 4296 | gfp_mask = current_gfp_context(gfp_mask); |
| 4297 | |
| 4298 | if (__need_reclaim(gfp_mask)) { |
| 4299 | if (gfp_mask & __GFP_FS) |
| 4300 | __fs_reclaim_release(_RET_IP_); |
| 4301 | } |
| 4302 | } |
| 4303 | EXPORT_SYMBOL_GPL(fs_reclaim_release); |
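| |
| | /*
| |  * Annotation sketch (essentially what might_alloc() does): a function
| |  * that may allocate under GFP_KERNEL can assert the reclaim dependency up
| |  * front, so lockdep complains even on paths where reclaim never runs:
| |  *
| |  *	fs_reclaim_acquire(GFP_KERNEL);
| |  *	fs_reclaim_release(GFP_KERNEL);
| |  */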
| 4304 | #endif |
| 4305 | |
| 4306 | /* |
| 4307 | * Zonelists may change due to hotplug during allocation. Detect when zonelists |
| 4308 | * have been rebuilt so the allocation can be retried. Reader side does not lock and
| 4309 | * retries the allocation if zonelist changes. Writer side is protected by the |
| 4310 | * embedded spin_lock. |
| 4311 | */ |
| 4312 | static DEFINE_SEQLOCK(zonelist_update_seq); |
| 4313 | |
| 4314 | static unsigned int zonelist_iter_begin(void) |
| 4315 | { |
| 4316 | if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) |
| 4317 | return read_seqbegin(&zonelist_update_seq);
| 4318 | |
| 4319 | return 0; |
| 4320 | } |
| 4321 | |
| 4322 | static unsigned int check_retry_zonelist(unsigned int seq) |
| 4323 | { |
| 4324 | if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) |
| 4325 | return read_seqretry(&zonelist_update_seq, seq);
| 4326 | |
| 4327 | return seq; |
| 4328 | } |
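| |
| | /*
| |  * Retry pattern used by the allocator slow path (condensed sketch):
| |  *
| |  *	cookie = zonelist_iter_begin();
| |  *	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
| |  *	if (!page && check_retry_zonelist(cookie))
| |  *		goto retry;
| |  */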
| 4329 | |
| 4330 | /* Perform direct synchronous page reclaim */ |
| 4331 | static unsigned long |
| 4332 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, |
| 4333 | const struct alloc_context *ac) |
| 4334 | { |
| 4335 | unsigned int noreclaim_flag; |
| 4336 | unsigned long progress; |
| 4337 | |
| 4338 | cond_resched(); |
| 4339 | |
| 4340 | /* We now go into synchronous reclaim */ |
| 4341 | cpuset_memory_pressure_bump(); |
| 4342 | fs_reclaim_acquire(gfp_mask); |
| 4343 | noreclaim_flag = memalloc_noreclaim_save(); |
| 4344 | |
| 4345 | progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
| 4346 | ac->nodemask);
| 4347 | |
| 4348 | memalloc_noreclaim_restore(noreclaim_flag);
| 4349 | fs_reclaim_release(gfp_mask); |
| 4350 | |
| 4351 | cond_resched(); |
| 4352 | |
| 4353 | return progress; |
| 4354 | } |
| 4355 | |
| 4356 | /* The really slow allocator path where we enter direct reclaim */ |
| 4357 | static inline struct page * |
| 4358 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, |
| 4359 | unsigned int alloc_flags, const struct alloc_context *ac, |
| 4360 | unsigned long *did_some_progress) |
| 4361 | { |
| 4362 | struct page *page = NULL; |
| 4363 | unsigned long pflags; |
| 4364 | bool drained = false; |
| 4365 | |
| 4366 | psi_memstall_enter(&pflags);
| 4367 | *did_some_progress = __perform_reclaim(gfp_mask, order, ac); |
| 4368 | if (unlikely(!(*did_some_progress))) |
| 4369 | goto out; |
| 4370 | |
| 4371 | retry: |
| 4372 | page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); |
| 4373 | |
| 4374 | /* |
| 4375 | * If an allocation failed after direct reclaim, it could be because |
| 4376 | * pages are pinned on the per-cpu lists or in high alloc reserves. |
| 4377 | * Shrink them and try again |
| 4378 | */ |
| 4379 | if (!page && !drained) { |
| 4380 | unreserve_highatomic_pageblock(ac, false);
| 4381 | drain_all_pages(NULL); |
| 4382 | drained = true; |
| 4383 | goto retry; |
| 4384 | } |
| 4385 | out: |
| 4386 | psi_memstall_leave(&pflags);
| 4387 | |
| 4388 | return page; |
| 4389 | } |
| 4390 | |
| 4391 | static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, |
| 4392 | const struct alloc_context *ac) |
| 4393 | { |
| 4394 | struct zoneref *z; |
| 4395 | struct zone *zone; |
| 4396 | pg_data_t *last_pgdat = NULL; |
| 4397 | enum zone_type highest_zoneidx = ac->highest_zoneidx; |
| 4398 | unsigned int reclaim_order; |
| 4399 | |
| 4400 | if (defrag_mode) |
| 4401 | reclaim_order = max(order, pageblock_order); |
| 4402 | else |
| 4403 | reclaim_order = order; |
| 4404 | |
| 4405 | for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, |
| 4406 | ac->nodemask) { |
| 4407 | if (!managed_zone(zone)) |
| 4408 | continue; |
| 4409 | if (last_pgdat == zone->zone_pgdat) |
| 4410 | continue; |
| 4411 | wakeup_kswapd(zone, gfp_mask, reclaim_order, highest_zoneidx);
| 4412 | last_pgdat = zone->zone_pgdat; |
| 4413 | } |
| 4414 | } |
| 4415 | |
| 4416 | static inline unsigned int |
| 4417 | gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) |
| 4418 | { |
| 4419 | unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; |
| 4420 | |
| 4421 | /* |
| 4422 | * __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE |
| 4423 | * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD |
| 4424 | * to save two branches. |
| 4425 | */ |
| 4426 | BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE); |
| 4427 | BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD); |
| 4428 | |
| 4429 | /* |
| 4430 | * The caller may dip into page reserves a bit more if the caller |
| 4431 | * cannot run direct reclaim, or if the caller has realtime scheduling |
| 4432 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will |
| 4433 | * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH). |
| 4434 | */ |
| 4435 | alloc_flags |= (__force int) |
| 4436 | (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); |
| 4437 | |
| 4438 | if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { |
| 4439 | /* |
| 4440 | * Not worth trying to allocate harder for __GFP_NOMEMALLOC even |
| 4441 | * if it can't schedule. |
| 4442 | */ |
| 4443 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { |
| 4444 | alloc_flags |= ALLOC_NON_BLOCK; |
| 4445 | |
| 4446 | if (order > 0 && (alloc_flags & ALLOC_MIN_RESERVE)) |
| 4447 | alloc_flags |= ALLOC_HIGHATOMIC; |
| 4448 | } |
| 4449 | |
| 4450 | /* |
| 4451 | * Ignore cpuset mems for non-blocking __GFP_HIGH (probably |
| 4452 | * GFP_ATOMIC) rather than fail, see the comment for |
| 4453 | * cpuset_current_node_allowed(). |
| 4454 | */ |
| 4455 | if (alloc_flags & ALLOC_MIN_RESERVE) |
| 4456 | alloc_flags &= ~ALLOC_CPUSET; |
| 4457 | } else if (unlikely(rt_or_dl_task(current)) && in_task()) |
| 4458 | alloc_flags |= ALLOC_MIN_RESERVE; |
| 4459 | |
| 4460 | alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); |
| 4461 | |
| 4462 | if (defrag_mode) |
| 4463 | alloc_flags |= ALLOC_NOFRAGMENT; |
| 4464 | |
| 4465 | return alloc_flags; |
| 4466 | } |
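| |
| | /*
| |  * Worked example: GFP_ATOMIC is __GFP_HIGH | __GFP_KSWAPD_RECLAIM, so an
| |  * order-1 GFP_ATOMIC request picks up ALLOC_MIN_RESERVE, ALLOC_KSWAPD,
| |  * ALLOC_NON_BLOCK and ALLOC_HIGHATOMIC, and drops ALLOC_CPUSET again in
| |  * the ALLOC_MIN_RESERVE branch (plus ALLOC_CMA/ALLOC_NOFRAGMENT where
| |  * applicable).
| |  */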
| 4467 | |
| 4468 | static bool oom_reserves_allowed(struct task_struct *tsk) |
| 4469 | { |
| 4470 | if (!tsk_is_oom_victim(tsk)) |
| 4471 | return false; |
| 4472 | |
| 4473 | /* |
| 4474 | * !MMU doesn't have oom reaper so give access to memory reserves |
| 4475 | * only to the thread with TIF_MEMDIE set |
| 4476 | */ |
| 4477 | if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE)) |
| 4478 | return false; |
| 4479 | |
| 4480 | return true; |
| 4481 | } |
| 4482 | |
| 4483 | /* |
| 4484 | * Distinguish requests which really need access to full memory |
| 4485 | * reserves from oom victims which can live with a portion of it |
| 4486 | */ |
| 4487 | static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask) |
| 4488 | { |
| 4489 | if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) |
| 4490 | return 0; |
| 4491 | if (gfp_mask & __GFP_MEMALLOC) |
| 4492 | return ALLOC_NO_WATERMARKS; |
| 4493 | if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) |
| 4494 | return ALLOC_NO_WATERMARKS; |
| 4495 | if (!in_interrupt()) { |
| 4496 | if (current->flags & PF_MEMALLOC) |
| 4497 | return ALLOC_NO_WATERMARKS; |
| 4498 | else if (oom_reserves_allowed(current)) |
| 4499 | return ALLOC_OOM; |
| 4500 | } |
| 4501 | |
| 4502 | return 0; |
| 4503 | } |
| 4504 | |
| 4505 | bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) |
| 4506 | { |
| 4507 | return !!__gfp_pfmemalloc_flags(gfp_mask); |
| 4508 | } |
| 4509 | |
| 4510 | /* |
| 4511 | * Checks whether it makes sense to retry the reclaim to make a forward progress |
| 4512 | * for the given allocation request. |
| 4513 | * |
| 4514 | * We give up when we either have tried MAX_RECLAIM_RETRIES in a row |
| 4515 | * without success, or when we couldn't even meet the watermark if we |
| 4516 | * reclaimed all remaining pages on the LRU lists. |
| 4517 | * |
| 4518 | * Returns true if a retry is viable or false to enter the oom path. |
| 4519 | */ |
| 4520 | static inline bool |
| 4521 | should_reclaim_retry(gfp_t gfp_mask, unsigned order, |
| 4522 | struct alloc_context *ac, int alloc_flags, |
| 4523 | bool did_some_progress, int *no_progress_loops) |
| 4524 | { |
| 4525 | struct zone *zone; |
| 4526 | struct zoneref *z; |
| 4527 | bool ret = false; |
| 4528 | |
| 4529 | /* |
* Costly allocations might have made some progress, but this doesn't
* mean their order will become available due to high fragmentation, so
* always increment the no-progress counter for them
| 4533 | */ |
| 4534 | if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) |
| 4535 | *no_progress_loops = 0; |
| 4536 | else |
| 4537 | (*no_progress_loops)++; |
| 4538 | |
| 4539 | if (*no_progress_loops > MAX_RECLAIM_RETRIES) |
goto out;

| 4543 | /* |
| 4544 | * Keep reclaiming pages while there is a chance this will lead |
| 4545 | * somewhere. If none of the target zones can satisfy our allocation |
| 4546 | * request even if all reclaimable pages are considered then we are |
| 4547 | * screwed and have to go OOM. |
| 4548 | */ |
| 4549 | for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, |
| 4550 | ac->highest_zoneidx, ac->nodemask) { |
| 4551 | unsigned long available; |
| 4552 | unsigned long reclaimable; |
unsigned long min_wmark = min_wmark_pages(zone);
| 4554 | bool wmark; |
| 4555 | |
| 4556 | if (cpusets_enabled() && |
| 4557 | (alloc_flags & ALLOC_CPUSET) && |
!__cpuset_zone_allowed(zone, gfp_mask))
| 4559 | continue; |
| 4560 | |
| 4561 | available = reclaimable = zone_reclaimable_pages(zone); |
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
| 4563 | |
| 4564 | /* |
| 4565 | * Would the allocation succeed if we reclaimed all |
| 4566 | * reclaimable pages? |
| 4567 | */ |
wmark = __zone_watermark_ok(zone, order, min_wmark,
ac->highest_zoneidx, alloc_flags, available);
trace_reclaim_retry_zone(z, order, reclaimable,
available, min_wmark, *no_progress_loops, wmark);
| 4572 | if (wmark) { |
| 4573 | ret = true; |
| 4574 | break; |
| 4575 | } |
| 4576 | } |
| 4577 | |
| 4578 | /* |
| 4579 | * Memory allocation/reclaim might be called from a WQ context and the |
| 4580 | * current implementation of the WQ concurrency control doesn't |
| 4581 | * recognize that a particular WQ is congested if the worker thread is |
| 4582 | * looping without ever sleeping. Therefore we have to do a short sleep |
| 4583 | * here rather than calling cond_resched(). |
| 4584 | */ |
| 4585 | if (current->flags & PF_WQ_WORKER) |
schedule_timeout_uninterruptible(1);
| 4587 | else |
| 4588 | cond_resched(); |
| 4589 | out: |
| 4590 | /* Before OOM, exhaust highatomic_reserve */ |
| 4591 | if (!ret) |
return unreserve_highatomic_pageblock(ac, true);
| 4593 | |
| 4594 | return ret; |
| 4595 | } |
| 4596 | |
| 4597 | static inline bool |
| 4598 | check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) |
| 4599 | { |
| 4600 | /* |
| 4601 | * It's possible that cpuset's mems_allowed and the nodemask from |
| 4602 | * mempolicy don't intersect. This should be normally dealt with by |
| 4603 | * policy_nodemask(), but it's possible to race with cpuset update in |
| 4604 | * such a way the check therein was true, and then it became false |
| 4605 | * before we got our cpuset_mems_cookie here. |
| 4606 | * This assumes that for all allocations, ac->nodemask can come only |
| 4607 | * from MPOL_BIND mempolicy (whose documented semantics is to be ignored |
| 4608 | * when it does not intersect with the cpuset restrictions) or the |
| 4609 | * caller can deal with a violated nodemask. |
| 4610 | */ |
| 4611 | if (cpusets_enabled() && ac->nodemask && |
!cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
| 4613 | ac->nodemask = NULL; |
| 4614 | return true; |
| 4615 | } |
| 4616 | |
| 4617 | /* |
| 4618 | * When updating a task's mems_allowed or mempolicy nodemask, it is |
| 4619 | * possible to race with parallel threads in such a way that our |
| 4620 | * allocation can fail while the mask is being updated. If we are about |
| 4621 | * to fail, check if the cpuset changed during allocation and if so, |
| 4622 | * retry. |
| 4623 | */ |
if (read_mems_allowed_retry(cpuset_mems_cookie))
| 4625 | return true; |
| 4626 | |
| 4627 | return false; |
| 4628 | } |
| 4629 | |
| 4630 | static inline struct page * |
| 4631 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, |
| 4632 | struct alloc_context *ac) |
| 4633 | { |
| 4634 | bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; |
| 4635 | bool can_compact = gfp_compaction_allowed(gfp_mask); |
| 4636 | bool nofail = gfp_mask & __GFP_NOFAIL; |
| 4637 | const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; |
| 4638 | struct page *page = NULL; |
| 4639 | unsigned int alloc_flags; |
| 4640 | unsigned long did_some_progress; |
| 4641 | enum compact_priority compact_priority; |
| 4642 | enum compact_result compact_result; |
| 4643 | int compaction_retries; |
| 4644 | int no_progress_loops; |
| 4645 | unsigned int cpuset_mems_cookie; |
| 4646 | unsigned int zonelist_iter_cookie; |
| 4647 | int reserve_flags; |
| 4648 | |
| 4649 | if (unlikely(nofail)) { |
| 4650 | /* |
| 4651 | * We most definitely don't want callers attempting to |
| 4652 | * allocate greater than order-1 page units with __GFP_NOFAIL. |
| 4653 | */ |
| 4654 | WARN_ON_ONCE(order > 1); |
| 4655 | /* |
* Also we don't support __GFP_NOFAIL without __GFP_DIRECT_RECLAIM;
* otherwise we may end up in a lockup.
| 4658 | */ |
| 4659 | WARN_ON_ONCE(!can_direct_reclaim); |
| 4660 | /* |
| 4661 | * PF_MEMALLOC request from this context is rather bizarre |
* because we cannot reclaim anything and can only loop waiting
* for somebody to do the work for us.
| 4664 | */ |
| 4665 | WARN_ON_ONCE(current->flags & PF_MEMALLOC); |
| 4666 | } |
| 4667 | |
| 4668 | restart: |
| 4669 | compaction_retries = 0; |
| 4670 | no_progress_loops = 0; |
| 4671 | compact_result = COMPACT_SKIPPED; |
| 4672 | compact_priority = DEF_COMPACT_PRIORITY; |
| 4673 | cpuset_mems_cookie = read_mems_allowed_begin(); |
| 4674 | zonelist_iter_cookie = zonelist_iter_begin(); |
| 4675 | |
| 4676 | /* |
| 4677 | * The fast path uses conservative alloc_flags to succeed only until |
| 4678 | * kswapd needs to be woken up, and to avoid the cost of setting up |
| 4679 | * alloc_flags precisely. So we do that now. |
| 4680 | */ |
| 4681 | alloc_flags = gfp_to_alloc_flags(gfp_mask, order); |
| 4682 | |
| 4683 | /* |
| 4684 | * We need to recalculate the starting point for the zonelist iterator |
| 4685 | * because we might have used different nodemask in the fast path, or |
| 4686 | * there was a cpuset modification and we are retrying - otherwise we |
| 4687 | * could end up iterating over non-eligible zones endlessly. |
| 4688 | */ |
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);
if (!zonelist_zone(ac->preferred_zoneref))
| 4692 | goto nopage; |
| 4693 | |
| 4694 | /* |
| 4695 | * Check for insane configurations where the cpuset doesn't contain |
| 4696 | * any suitable zone to satisfy the request - e.g. non-movable |
| 4697 | * GFP_HIGHUSER allocations from MOVABLE nodes only. |
| 4698 | */ |
| 4699 | if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) { |
struct zoneref *z = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx,
&cpuset_current_mems_allowed);
if (!zonelist_zone(z))
| 4704 | goto nopage; |
| 4705 | } |
| 4706 | |
| 4707 | if (alloc_flags & ALLOC_KSWAPD) |
| 4708 | wake_all_kswapds(order, gfp_mask, ac); |
| 4709 | |
| 4710 | /* |
| 4711 | * The adjusted alloc_flags might result in immediate success, so try |
| 4712 | * that first |
| 4713 | */ |
| 4714 | page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); |
| 4715 | if (page) |
| 4716 | goto got_pg; |
| 4717 | |
| 4718 | /* |
| 4719 | * For costly allocations, try direct compaction first, as it's likely |
| 4720 | * that we have enough base pages and don't need to reclaim. For non- |
| 4721 | * movable high-order allocations, do that as well, as compaction will |
* try to prevent permanent fragmentation by migrating from blocks of the
| 4723 | * same migratetype. |
| 4724 | * Don't try this for allocations that are allowed to ignore |
| 4725 | * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. |
| 4726 | */ |
| 4727 | if (can_direct_reclaim && can_compact && |
| 4728 | (costly_order || |
| 4729 | (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) |
| 4730 | && !gfp_pfmemalloc_allowed(gfp_mask)) { |
| 4731 | page = __alloc_pages_direct_compact(gfp_mask, order, |
| 4732 | alloc_flags, ac, |
INIT_COMPACT_PRIORITY,
&compact_result);
| 4735 | if (page) |
| 4736 | goto got_pg; |
| 4737 | |
| 4738 | /* |
| 4739 | * Checks for costly allocations with __GFP_NORETRY, which |
| 4740 | * includes some THP page fault allocations |
| 4741 | */ |
| 4742 | if (costly_order && (gfp_mask & __GFP_NORETRY)) { |
| 4743 | /* |
* If allocating entire pageblock(s) and compaction
* failed because all zones are below low watermarks
* or compaction is prohibited because it recently
* failed at this order, fail immediately unless the
* allocator has requested compaction and reclaim retry.
| 4749 | * |
| 4750 | * Reclaim is |
| 4751 | * - potentially very expensive because zones are far |
| 4752 | * below their low watermarks or this is part of very |
| 4753 | * bursty high order allocations, |
| 4754 | * - not guaranteed to help because isolate_freepages() |
| 4755 | * may not iterate over freed pages as part of its |
| 4756 | * linear scan, and |
| 4757 | * - unlikely to make entire pageblocks free on its |
| 4758 | * own. |
| 4759 | */ |
| 4760 | if (compact_result == COMPACT_SKIPPED || |
| 4761 | compact_result == COMPACT_DEFERRED) |
| 4762 | goto nopage; |
| 4763 | |
| 4764 | /* |
| 4765 | * Looks like reclaim/compaction is worth trying, but |
| 4766 | * sync compaction could be very expensive, so keep |
| 4767 | * using async compaction. |
| 4768 | */ |
| 4769 | compact_priority = INIT_COMPACT_PRIORITY; |
| 4770 | } |
| 4771 | } |
| 4772 | |
| 4773 | retry: |
| 4774 | /* |
| 4775 | * Deal with possible cpuset update races or zonelist updates to avoid |
| 4776 | * infinite retries. |
| 4777 | */ |
| 4778 | if (check_retry_cpuset(cpuset_mems_cookie, ac) || |
check_retry_zonelist(zonelist_iter_cookie))
| 4780 | goto restart; |
| 4781 | |
| 4782 | /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ |
| 4783 | if (alloc_flags & ALLOC_KSWAPD) |
| 4784 | wake_all_kswapds(order, gfp_mask, ac); |
| 4785 | |
| 4786 | reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); |
| 4787 | if (reserve_flags) |
alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) |
| 4789 | (alloc_flags & ALLOC_KSWAPD); |
| 4790 | |
| 4791 | /* |
| 4792 | * Reset the nodemask and zonelist iterators if memory policies can be |
| 4793 | * ignored. These allocations are high priority and system rather than |
| 4794 | * user oriented. |
| 4795 | */ |
| 4796 | if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { |
| 4797 | ac->nodemask = NULL; |
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);
| 4800 | } |
| 4801 | |
| 4802 | /* Attempt with potentially adjusted zonelist and alloc_flags */ |
| 4803 | page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); |
| 4804 | if (page) |
| 4805 | goto got_pg; |
| 4806 | |
| 4807 | /* Caller is not willing to reclaim, we can't balance anything */ |
| 4808 | if (!can_direct_reclaim) |
| 4809 | goto nopage; |
| 4810 | |
| 4811 | /* Avoid recursion of direct reclaim */ |
| 4812 | if (current->flags & PF_MEMALLOC) |
| 4813 | goto nopage; |
| 4814 | |
| 4815 | /* Try direct reclaim and then allocating */ |
| 4816 | page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, |
&did_some_progress);
| 4818 | if (page) |
| 4819 | goto got_pg; |
| 4820 | |
| 4821 | /* Try direct compaction and then allocating */ |
| 4822 | page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, |
compact_priority, &compact_result);
| 4824 | if (page) |
| 4825 | goto got_pg; |
| 4826 | |
| 4827 | /* Do not loop if specifically requested */ |
| 4828 | if (gfp_mask & __GFP_NORETRY) |
| 4829 | goto nopage; |
| 4830 | |
| 4831 | /* |
| 4832 | * Do not retry costly high order allocations unless they are |
| 4833 | * __GFP_RETRY_MAYFAIL and we can compact |
| 4834 | */ |
| 4835 | if (costly_order && (!can_compact || |
| 4836 | !(gfp_mask & __GFP_RETRY_MAYFAIL))) |
| 4837 | goto nopage; |
| 4838 | |
| 4839 | if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, |
did_some_progress > 0, &no_progress_loops))
| 4841 | goto retry; |
| 4842 | |
| 4843 | /* |
* It doesn't make any sense to retry compaction if order-0 reclaim is
* not able to make any progress, because the current implementation of
* compaction depends on a sufficient amount of free memory (see
* __compaction_suitable)
| 4848 | */ |
| 4849 | if (did_some_progress > 0 && can_compact && |
| 4850 | should_compact_retry(ac, order, alloc_flags, |
compact_result, &compact_priority,
&compaction_retries))
| 4853 | goto retry; |
| 4854 | |
| 4855 | /* Reclaim/compaction failed to prevent the fallback */ |
| 4856 | if (defrag_mode && (alloc_flags & ALLOC_NOFRAGMENT)) { |
| 4857 | alloc_flags &= ~ALLOC_NOFRAGMENT; |
| 4858 | goto retry; |
| 4859 | } |
| 4860 | |
| 4861 | /* |
| 4862 | * Deal with possible cpuset update races or zonelist updates to avoid |
* an unnecessary OOM kill.
| 4864 | */ |
| 4865 | if (check_retry_cpuset(cpuset_mems_cookie, ac) || |
check_retry_zonelist(zonelist_iter_cookie))
| 4867 | goto restart; |
| 4868 | |
| 4869 | /* Reclaim has failed us, start killing things */ |
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
| 4871 | if (page) |
| 4872 | goto got_pg; |
| 4873 | |
| 4874 | /* Avoid allocations with no watermarks from looping endlessly */ |
| 4875 | if (tsk_is_oom_victim(current) && |
| 4876 | (alloc_flags & ALLOC_OOM || |
| 4877 | (gfp_mask & __GFP_NOMEMALLOC))) |
| 4878 | goto nopage; |
| 4879 | |
| 4880 | /* Retry as long as the OOM killer is making progress */ |
| 4881 | if (did_some_progress) { |
| 4882 | no_progress_loops = 0; |
| 4883 | goto retry; |
| 4884 | } |
| 4885 | |
| 4886 | nopage: |
| 4887 | /* |
| 4888 | * Deal with possible cpuset update races or zonelist updates to avoid |
* an unnecessary OOM kill.
| 4890 | */ |
| 4891 | if (check_retry_cpuset(cpuset_mems_cookie, ac) || |
check_retry_zonelist(zonelist_iter_cookie))
| 4893 | goto restart; |
| 4894 | |
| 4895 | /* |
* Make sure that a __GFP_NOFAIL request doesn't leak out and that we
* always retry
| 4898 | */ |
| 4899 | if (unlikely(nofail)) { |
| 4900 | /* |
* Lacking __GFP_DIRECT_RECLAIM we can't do anything to reclaim memory;
| 4902 | * we disregard these unreasonable nofail requests and still |
| 4903 | * return NULL |
| 4904 | */ |
| 4905 | if (!can_direct_reclaim) |
| 4906 | goto fail; |
| 4907 | |
| 4908 | /* |
| 4909 | * Help non-failing allocations by giving some access to memory |
| 4910 | * reserves normally used for high priority non-blocking |
| 4911 | * allocations but do not use ALLOC_NO_WATERMARKS because this |
| 4912 | * could deplete whole memory reserves which would just make |
| 4913 | * the situation worse. |
| 4914 | */ |
| 4915 | page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac); |
| 4916 | if (page) |
| 4917 | goto got_pg; |
| 4918 | |
| 4919 | cond_resched(); |
| 4920 | goto retry; |
| 4921 | } |
| 4922 | fail: |
warn_alloc(gfp_mask, ac->nodemask,
"page allocation failure: order:%u", order);
| 4925 | got_pg: |
| 4926 | return page; |
| 4927 | } |
| 4928 | |
| 4929 | static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, |
| 4930 | int preferred_nid, nodemask_t *nodemask, |
| 4931 | struct alloc_context *ac, gfp_t *alloc_gfp, |
| 4932 | unsigned int *alloc_flags) |
| 4933 | { |
ac->highest_zoneidx = gfp_zone(gfp_mask);
ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
| 4936 | ac->nodemask = nodemask; |
ac->migratetype = gfp_migratetype(gfp_mask);
| 4938 | |
| 4939 | if (cpusets_enabled()) { |
| 4940 | *alloc_gfp |= __GFP_HARDWALL; |
| 4941 | /* |
* In interrupt context the cpuset of the current task is
* irrelevant, so any node is OK.
| 4944 | */ |
| 4945 | if (in_task() && !ac->nodemask) |
| 4946 | ac->nodemask = &cpuset_current_mems_allowed; |
| 4947 | else |
| 4948 | *alloc_flags |= ALLOC_CPUSET; |
| 4949 | } |
| 4950 | |
| 4951 | might_alloc(gfp_mask); |
| 4952 | |
| 4953 | /* |
| 4954 | * Don't invoke should_fail logic, since it may call |
| 4955 | * get_random_u32() and printk() which need to spin_lock. |
| 4956 | */ |
| 4957 | if (!(*alloc_flags & ALLOC_TRYLOCK) && |
| 4958 | should_fail_alloc_page(gfp_mask, order)) |
| 4959 | return false; |
| 4960 | |
*alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
| 4962 | |
| 4963 | /* Dirty zone balancing only done in the fast path */ |
| 4964 | ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); |
| 4965 | |
| 4966 | /* |
| 4967 | * The preferred zone is used for statistics but crucially it is |
| 4968 | * also used as the starting point for the zonelist iterator. It |
| 4969 | * may get reset for allocations that ignore memory policies. |
| 4970 | */ |
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);
| 4973 | |
| 4974 | return true; |
| 4975 | } |
| 4976 | |
| 4977 | /* |
| 4978 | * __alloc_pages_bulk - Allocate a number of order-0 pages to an array |
| 4979 | * @gfp: GFP flags for the allocation |
| 4980 | * @preferred_nid: The preferred NUMA node ID to allocate from |
| 4981 | * @nodemask: Set of nodes to allocate from, may be NULL |
| 4982 | * @nr_pages: The number of pages desired in the array |
| 4983 | * @page_array: Array to store the pages |
| 4984 | * |
| 4985 | * This is a batched version of the page allocator that attempts to |
| 4986 | * allocate nr_pages quickly. Pages are added to the page_array. |
| 4987 | * |
| 4988 | * Note that only NULL elements are populated with pages and nr_pages |
| 4989 | * is the maximum number of pages that will be stored in the array. |
| 4990 | * |
| 4991 | * Returns the number of pages in the array. |
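*
* A minimal usage sketch (illustrative; error handling omitted and
* callers normally go through the non-_noprof wrappers):
*
*	struct page *pages[16] = { NULL };
*	unsigned long nr;
*
*	nr = alloc_pages_bulk_noprof(GFP_KERNEL, numa_mem_id(), NULL,
*				     16, pages);
*
* nr may be anywhere from 0 to 16 and only NULL slots are populated.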
| 4992 | */ |
| 4993 | unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, |
| 4994 | nodemask_t *nodemask, int nr_pages, |
| 4995 | struct page **page_array) |
| 4996 | { |
| 4997 | struct page *page; |
| 4998 | unsigned long __maybe_unused UP_flags; |
| 4999 | struct zone *zone; |
| 5000 | struct zoneref *z; |
| 5001 | struct per_cpu_pages *pcp; |
| 5002 | struct list_head *pcp_list; |
| 5003 | struct alloc_context ac; |
| 5004 | gfp_t alloc_gfp; |
| 5005 | unsigned int alloc_flags = ALLOC_WMARK_LOW; |
| 5006 | int nr_populated = 0, nr_account = 0; |
| 5007 | |
| 5008 | /* |
| 5009 | * Skip populated array elements to determine if any pages need |
| 5010 | * to be allocated before disabling IRQs. |
| 5011 | */ |
| 5012 | while (nr_populated < nr_pages && page_array[nr_populated]) |
| 5013 | nr_populated++; |
| 5014 | |
| 5015 | /* No pages requested? */ |
| 5016 | if (unlikely(nr_pages <= 0)) |
| 5017 | goto out; |
| 5018 | |
| 5019 | /* Already populated array? */ |
| 5020 | if (unlikely(nr_pages - nr_populated == 0)) |
| 5021 | goto out; |
| 5022 | |
| 5023 | /* Bulk allocator does not support memcg accounting. */ |
| 5024 | if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT)) |
| 5025 | goto failed; |
| 5026 | |
| 5027 | /* Use the single page allocator for one page. */ |
| 5028 | if (nr_pages - nr_populated == 1) |
| 5029 | goto failed; |
| 5030 | |
| 5031 | #ifdef CONFIG_PAGE_OWNER |
| 5032 | /* |
| 5033 | * PAGE_OWNER may recurse into the allocator to allocate space to |
| 5034 | * save the stack with pagesets.lock held. Releasing/reacquiring |
| 5035 | * removes much of the performance benefit of bulk allocation so |
* force the caller to allocate one page at a time, which performs
* similarly to adding this complexity to the bulk allocator.
| 5038 | */ |
| 5039 | if (static_branch_unlikely(&page_owner_inited)) |
| 5040 | goto failed; |
| 5041 | #endif |
| 5042 | |
| 5043 | /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */ |
| 5044 | gfp &= gfp_allowed_mask; |
| 5045 | alloc_gfp = gfp; |
if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
| 5047 | goto out; |
| 5048 | gfp = alloc_gfp; |
| 5049 | |
| 5050 | /* Find an allowed local zone that meets the low watermark. */ |
| 5051 | z = ac.preferred_zoneref; |
| 5052 | for_next_zone_zonelist_nodemask(zone, z, ac.highest_zoneidx, ac.nodemask) { |
| 5053 | unsigned long mark; |
| 5054 | |
| 5055 | if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && |
!__cpuset_zone_allowed(zone, gfp)) {
| 5057 | continue; |
| 5058 | } |
| 5059 | |
if (nr_online_nodes > 1 && zone != zonelist_zone(ac.preferred_zoneref) &&
zone_to_nid(zone) != zonelist_node_idx(ac.preferred_zoneref)) {
| 5062 | goto failed; |
| 5063 | } |
| 5064 | |
cond_accept_memory(zone, 0, alloc_flags);
| 5066 | retry_this_zone: |
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
if (zone_watermark_fast(zone, 0, mark,
zonelist_zone_idx(ac.preferred_zoneref),
alloc_flags, gfp)) {
| 5071 | break; |
| 5072 | } |
| 5073 | |
if (cond_accept_memory(zone, 0, alloc_flags))
| 5075 | goto retry_this_zone; |
| 5076 | |
| 5077 | /* Try again if zone has deferred pages */ |
| 5078 | if (deferred_pages_enabled()) { |
if (_deferred_grow_zone(zone, 0))
| 5080 | goto retry_this_zone; |
| 5081 | } |
| 5082 | } |
| 5083 | |
| 5084 | /* |
* If there are no allowed local zones that meet the watermark then
| 5086 | * try to allocate a single page and reclaim if necessary. |
| 5087 | */ |
| 5088 | if (unlikely(!zone)) |
| 5089 | goto failed; |
| 5090 | |
| 5091 | /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ |
| 5092 | pcp_trylock_prepare(UP_flags); |
| 5093 | pcp = pcp_spin_trylock(zone->per_cpu_pageset); |
| 5094 | if (!pcp) |
| 5095 | goto failed_irq; |
| 5096 | |
| 5097 | /* Attempt the batch allocation */ |
pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
| 5099 | while (nr_populated < nr_pages) { |
| 5100 | |
| 5101 | /* Skip existing pages */ |
| 5102 | if (page_array[nr_populated]) { |
| 5103 | nr_populated++; |
| 5104 | continue; |
| 5105 | } |
| 5106 | |
page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
pcp, pcp_list);
| 5109 | if (unlikely(!page)) { |
| 5110 | /* Try and allocate at least one page */ |
| 5111 | if (!nr_account) { |
| 5112 | pcp_spin_unlock(pcp); |
| 5113 | goto failed_irq; |
| 5114 | } |
| 5115 | break; |
| 5116 | } |
| 5117 | nr_account++; |
| 5118 | |
prep_new_page(page, 0, gfp, 0);
| 5120 | set_page_refcounted(page); |
| 5121 | page_array[nr_populated++] = page; |
| 5122 | } |
| 5123 | |
| 5124 | pcp_spin_unlock(pcp); |
| 5125 | pcp_trylock_finish(UP_flags); |
| 5126 | |
| 5127 | __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); |
zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account);
| 5129 | |
| 5130 | out: |
| 5131 | return nr_populated; |
| 5132 | |
| 5133 | failed_irq: |
| 5134 | pcp_trylock_finish(UP_flags); |
| 5135 | |
| 5136 | failed: |
page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask);
| 5138 | if (page) |
| 5139 | page_array[nr_populated++] = page; |
| 5140 | goto out; |
| 5141 | } |
| 5142 | EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); |
| 5143 | |
| 5144 | /* |
| 5145 | * This is the 'heart' of the zoned buddy allocator. |
| 5146 | */ |
| 5147 | struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order, |
| 5148 | int preferred_nid, nodemask_t *nodemask) |
| 5149 | { |
| 5150 | struct page *page; |
| 5151 | unsigned int alloc_flags = ALLOC_WMARK_LOW; |
| 5152 | gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */ |
| 5153 | struct alloc_context ac = { }; |
| 5154 | |
| 5155 | /* |
| 5156 | * There are several places where we assume that the order value is sane |
* so bail out early if the request is out of bounds.
| 5158 | */ |
| 5159 | if (WARN_ON_ONCE_GFP(order > MAX_PAGE_ORDER, gfp)) |
| 5160 | return NULL; |
| 5161 | |
| 5162 | gfp &= gfp_allowed_mask; |
| 5163 | /* |
| 5164 | * Apply scoped allocation constraints. This is mainly about GFP_NOFS |
| 5165 | * resp. GFP_NOIO which has to be inherited for all allocation requests |
| 5166 | * from a particular context which has been marked by |
| 5167 | * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures |
| 5168 | * movable zones are not used during allocation. |
| 5169 | */ |
gfp = current_gfp_context(gfp);
| 5171 | alloc_gfp = gfp; |
if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac,
&alloc_gfp, &alloc_flags))
| 5174 | return NULL; |
| 5175 | |
| 5176 | /* |
| 5177 | * Forbid the first pass from falling back to types that fragment |
| 5178 | * memory until all local zones are considered. |
| 5179 | */ |
alloc_flags |= alloc_flags_nofragment(zonelist_zone(ac.preferred_zoneref), gfp);
| 5181 | |
| 5182 | /* First allocation attempt */ |
page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
| 5184 | if (likely(page)) |
| 5185 | goto out; |
| 5186 | |
| 5187 | alloc_gfp = gfp; |
| 5188 | ac.spread_dirty_pages = false; |
| 5189 | |
| 5190 | /* |
| 5191 | * Restore the original nodemask if it was potentially replaced with |
| 5192 | * &cpuset_current_mems_allowed to optimize the fast-path attempt. |
| 5193 | */ |
| 5194 | ac.nodemask = nodemask; |
| 5195 | |
page = __alloc_pages_slowpath(alloc_gfp, order, &ac);
| 5197 | |
| 5198 | out: |
| 5199 | if (memcg_kmem_online() && (gfp & __GFP_ACCOUNT) && page && |
| 5200 | unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) { |
| 5201 | free_frozen_pages(page, order); |
| 5202 | page = NULL; |
| 5203 | } |
| 5204 | |
trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
kmsan_alloc_page(page, order, alloc_gfp);
| 5207 | |
| 5208 | return page; |
| 5209 | } |
| 5210 | EXPORT_SYMBOL(__alloc_frozen_pages_noprof); |
| 5211 | |
| 5212 | struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, |
| 5213 | int preferred_nid, nodemask_t *nodemask) |
| 5214 | { |
| 5215 | struct page *page; |
| 5216 | |
| 5217 | page = __alloc_frozen_pages_noprof(gfp, order, preferred_nid, nodemask); |
| 5218 | if (page) |
| 5219 | set_page_refcounted(page); |
| 5220 | return page; |
| 5221 | } |
| 5222 | EXPORT_SYMBOL(__alloc_pages_noprof); |
| 5223 | |
| 5224 | struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, |
| 5225 | nodemask_t *nodemask) |
| 5226 | { |
| 5227 | struct page *page = __alloc_pages_noprof(gfp | __GFP_COMP, order, |
| 5228 | preferred_nid, nodemask); |
| 5229 | return page_rmappable_folio(page); |
| 5230 | } |
| 5231 | EXPORT_SYMBOL(__folio_alloc_noprof); |
| 5232 | |
| 5233 | /* |
| 5234 | * Common helper functions. Never use with __GFP_HIGHMEM because the returned |
| 5235 | * address cannot represent highmem pages. Use alloc_pages and then kmap if |
| 5236 | * you need to access high mem. |
| 5237 | */ |
| 5238 | unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order) |
| 5239 | { |
| 5240 | struct page *page; |
| 5241 | |
page = alloc_pages_noprof(gfp_mask & ~__GFP_HIGHMEM, order);
| 5243 | if (!page) |
| 5244 | return 0; |
| 5245 | return (unsigned long) page_address(page); |
| 5246 | } |
| 5247 | EXPORT_SYMBOL(get_free_pages_noprof); |
| 5248 | |
| 5249 | unsigned long get_zeroed_page_noprof(gfp_t gfp_mask) |
| 5250 | { |
| 5251 | return get_free_pages_noprof(gfp_mask | __GFP_ZERO, 0); |
| 5252 | } |
| 5253 | EXPORT_SYMBOL(get_zeroed_page_noprof); |
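
/*
* Typical use of the helpers above (illustrative sketch; callers
* normally go through the get_zeroed_page()/free_page() wrappers):
*
*	unsigned long addr = get_zeroed_page(GFP_KERNEL);
*
*	if (!addr)
*		return -ENOMEM;
*	...
*	free_page(addr);
*/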
| 5254 | |
| 5255 | static void ___free_pages(struct page *page, unsigned int order, |
| 5256 | fpi_t fpi_flags) |
| 5257 | { |
| 5258 | /* get PageHead before we drop reference */ |
| 5259 | int head = PageHead(page); |
| 5260 | /* get alloc tag in case the page is released by others */ |
| 5261 | struct alloc_tag *tag = pgalloc_tag_get(page); |
| 5262 | |
| 5263 | if (put_page_testzero(page)) |
| 5264 | __free_frozen_pages(page, order, fpi_flags); |
| 5265 | else if (!head) { |
pgalloc_tag_sub_pages(tag, (1 << order) - 1);
| 5267 | while (order-- > 0) { |
| 5268 | /* |
| 5269 | * The "tail" pages of this non-compound high-order |
| 5270 | * page will have no code tags, so to avoid warnings |
| 5271 | * mark them as empty. |
| 5272 | */ |
clear_page_tag_ref(page + (1 << order));
__free_frozen_pages(page + (1 << order), order,
fpi_flags);
| 5276 | } |
| 5277 | } |
| 5278 | } |
| 5279 | |
| 5280 | /** |
| 5281 | * __free_pages - Free pages allocated with alloc_pages(). |
| 5282 | * @page: The page pointer returned from alloc_pages(). |
| 5283 | * @order: The order of the allocation. |
| 5284 | * |
| 5285 | * This function can free multi-page allocations that are not compound |
| 5286 | * pages. It does not check that the @order passed in matches that of |
| 5287 | * the allocation, so it is easy to leak memory. Freeing more memory |
| 5288 | * than was allocated will probably emit a warning. |
| 5289 | * |
| 5290 | * If the last reference to this page is speculative, it will be released |
| 5291 | * by put_page() which only frees the first page of a non-compound |
| 5292 | * allocation. To prevent the remaining pages from being leaked, we free |
| 5293 | * the subsequent pages here. If you want to use the page's reference |
| 5294 | * count to decide when to free the allocation, you should allocate a |
| 5295 | * compound page, and use put_page() instead of __free_pages(). |
| 5296 | * |
| 5297 | * Context: May be called in interrupt context or while holding a normal |
| 5298 | * spinlock, but not in NMI context or while holding a raw spinlock. |
| 5299 | */ |
| 5300 | void __free_pages(struct page *page, unsigned int order) |
| 5301 | { |
| 5302 | ___free_pages(page, order, FPI_NONE); |
| 5303 | } |
| 5304 | EXPORT_SYMBOL(__free_pages); |
| 5305 | |
| 5306 | /* |
* Can be called while holding a raw_spin_lock or from IRQ and NMI context,
* for any page type (not only pages that came from alloc_pages_nolock())
| 5309 | */ |
| 5310 | void free_pages_nolock(struct page *page, unsigned int order) |
| 5311 | { |
| 5312 | ___free_pages(page, order, FPI_TRYLOCK); |
| 5313 | } |
| 5314 | |
| 5315 | /** |
| 5316 | * free_pages - Free pages allocated with __get_free_pages(). |
| 5317 | * @addr: The virtual address tied to a page returned from __get_free_pages(). |
| 5318 | * @order: The order of the allocation. |
| 5319 | * |
| 5320 | * This function behaves the same as __free_pages(). Use this function |
| 5321 | * to free pages when you only have a valid virtual address. If you have |
| 5322 | * the page, call __free_pages() instead. |
| 5323 | */ |
| 5324 | void free_pages(unsigned long addr, unsigned int order) |
| 5325 | { |
| 5326 | if (addr != 0) { |
| 5327 | VM_BUG_ON(!virt_addr_valid((void *)addr)); |
| 5328 | __free_pages(virt_to_page((void *)addr), order); |
| 5329 | } |
| 5330 | } |
| 5331 | |
| 5332 | EXPORT_SYMBOL(free_pages); |
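
/*
* Helper for alloc_pages_exact*(): split the high-order allocation at
* @addr into individual pages, take a reference on each page covered by
* the rounded-up @size and hand the unused tail pages straight back to
* the buddy allocator.
*/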
| 5333 | |
| 5334 | static void *make_alloc_exact(unsigned long addr, unsigned int order, |
| 5335 | size_t size) |
| 5336 | { |
| 5337 | if (addr) { |
| 5338 | unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE); |
| 5339 | struct page *page = virt_to_page((void *)addr); |
| 5340 | struct page *last = page + nr; |
| 5341 | |
split_page_owner(page, order, 0);
pgalloc_tag_split(page_folio(page), order, 0);
split_page_memcg(page, order);
| 5345 | while (page < --last) |
| 5346 | set_page_refcounted(last); |
| 5347 | |
| 5348 | last = page + (1UL << order); |
| 5349 | for (page += nr; page < last; page++) |
__free_pages_ok(page, 0, FPI_TO_TAIL);
| 5351 | } |
| 5352 | return (void *)addr; |
| 5353 | } |
| 5354 | |
| 5355 | /** |
* alloc_pages_exact - allocate an exact number of physically-contiguous pages.
| 5357 | * @size: the number of bytes to allocate |
| 5358 | * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP |
| 5359 | * |
| 5360 | * This function is similar to alloc_pages(), except that it allocates the |
| 5361 | * minimum number of pages to satisfy the request. alloc_pages() can only |
| 5362 | * allocate memory in power-of-two pages. |
| 5363 | * |
| 5364 | * This function is also limited by MAX_PAGE_ORDER. |
| 5365 | * |
| 5366 | * Memory allocated by this function must be released by free_pages_exact(). |
| 5367 | * |
| 5368 | * Return: pointer to the allocated area or %NULL in case of error. |
| 5369 | */ |
| 5370 | void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) |
| 5371 | { |
| 5372 | unsigned int order = get_order(size); |
| 5373 | unsigned long addr; |
| 5374 | |
| 5375 | if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) |
| 5376 | gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); |
| 5377 | |
| 5378 | addr = get_free_pages_noprof(gfp_mask, order); |
| 5379 | return make_alloc_exact(addr, order, size); |
| 5380 | } |
| 5381 | EXPORT_SYMBOL(alloc_pages_exact_noprof); |
| 5382 | |
| 5383 | /** |
| 5384 | * alloc_pages_exact_nid - allocate an exact number of physically-contiguous |
| 5385 | * pages on a node. |
| 5386 | * @nid: the preferred node ID where memory should be allocated |
| 5387 | * @size: the number of bytes to allocate |
| 5388 | * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP |
| 5389 | * |
| 5390 | * Like alloc_pages_exact(), but try to allocate on node nid first before falling |
| 5391 | * back. |
| 5392 | * |
| 5393 | * Return: pointer to the allocated area or %NULL in case of error. |
| 5394 | */ |
| 5395 | void * __meminit alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) |
| 5396 | { |
| 5397 | unsigned int order = get_order(size); |
| 5398 | struct page *p; |
| 5399 | |
| 5400 | if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) |
| 5401 | gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); |
| 5402 | |
| 5403 | p = alloc_pages_node_noprof(nid, gfp_mask, order); |
| 5404 | if (!p) |
| 5405 | return NULL; |
return make_alloc_exact((unsigned long)page_address(p), order, size);
| 5407 | } |
| 5408 | |
| 5409 | /** |
| 5410 | * free_pages_exact - release memory allocated via alloc_pages_exact() |
| 5411 | * @virt: the value returned by alloc_pages_exact. |
| 5412 | * @size: size of allocation, same value as passed to alloc_pages_exact(). |
| 5413 | * |
| 5414 | * Release the memory allocated by a previous call to alloc_pages_exact. |
| 5415 | */ |
| 5416 | void free_pages_exact(void *virt, size_t size) |
| 5417 | { |
| 5418 | unsigned long addr = (unsigned long)virt; |
| 5419 | unsigned long end = addr + PAGE_ALIGN(size); |
| 5420 | |
| 5421 | while (addr < end) { |
| 5422 | free_page(addr); |
| 5423 | addr += PAGE_SIZE; |
| 5424 | } |
| 5425 | } |
| 5426 | EXPORT_SYMBOL(free_pages_exact); |
| 5427 | |
| 5428 | /** |
| 5429 | * nr_free_zone_pages - count number of pages beyond high watermark |
| 5430 | * @offset: The zone index of the highest zone |
| 5431 | * |
| 5432 | * nr_free_zone_pages() counts the number of pages which are beyond the |
| 5433 | * high watermark within all zones at or below a given zone index. For each |
| 5434 | * zone, the number of pages is calculated as: |
| 5435 | * |
| 5436 | * nr_free_zone_pages = managed_pages - high_pages |
| 5437 | * |
| 5438 | * Return: number of pages beyond high watermark. |
| 5439 | */ |
| 5440 | static unsigned long nr_free_zone_pages(int offset) |
| 5441 | { |
| 5442 | struct zoneref *z; |
| 5443 | struct zone *zone; |
| 5444 | |
| 5445 | /* Just pick one node, since fallback list is circular */ |
| 5446 | unsigned long sum = 0; |
| 5447 | |
struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
| 5449 | |
| 5450 | for_each_zone_zonelist(zone, z, zonelist, offset) { |
| 5451 | unsigned long size = zone_managed_pages(zone); |
unsigned long high = high_wmark_pages(zone);
| 5453 | if (size > high) |
| 5454 | sum += size - high; |
| 5455 | } |
| 5456 | |
| 5457 | return sum; |
| 5458 | } |
| 5459 | |
| 5460 | /** |
| 5461 | * nr_free_buffer_pages - count number of pages beyond high watermark |
| 5462 | * |
| 5463 | * nr_free_buffer_pages() counts the number of pages which are beyond the high |
| 5464 | * watermark within ZONE_DMA and ZONE_NORMAL. |
| 5465 | * |
| 5466 | * Return: number of pages beyond high watermark within ZONE_DMA and |
| 5467 | * ZONE_NORMAL. |
| 5468 | */ |
| 5469 | unsigned long nr_free_buffer_pages(void) |
| 5470 | { |
return nr_free_zone_pages(gfp_zone(GFP_USER));
| 5472 | } |
| 5473 | EXPORT_SYMBOL_GPL(nr_free_buffer_pages); |
| 5474 | |
| 5475 | static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) |
| 5476 | { |
| 5477 | zoneref->zone = zone; |
| 5478 | zoneref->zone_idx = zone_idx(zone); |
| 5479 | } |
| 5480 | |
| 5481 | /* |
| 5482 | * Builds allocation fallback zone lists. |
| 5483 | * |
| 5484 | * Add all populated zones of a node to the zonelist. |
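*
* Zones are added from highest to lowest, so e.g. a node with populated
* ZONE_NORMAL and ZONE_DMA32 contributes them in that order.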
| 5485 | */ |
| 5486 | static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) |
| 5487 | { |
| 5488 | struct zone *zone; |
| 5489 | enum zone_type zone_type = MAX_NR_ZONES; |
| 5490 | int nr_zones = 0; |
| 5491 | |
| 5492 | do { |
| 5493 | zone_type--; |
| 5494 | zone = pgdat->node_zones + zone_type; |
| 5495 | if (populated_zone(zone)) { |
zoneref_set_zone(zone, &zonerefs[nr_zones++]);
check_highest_zone(zone_type);
| 5498 | } |
| 5499 | } while (zone_type); |
| 5500 | |
| 5501 | return nr_zones; |
| 5502 | } |
| 5503 | |
| 5504 | #ifdef CONFIG_NUMA |
| 5505 | |
| 5506 | static int __parse_numa_zonelist_order(char *s) |
| 5507 | { |
| 5508 | /* |
* We used to support different zonelist modes but they turned
* out to be just not useful. Let's keep the warning in place
* if somebody still uses the cmd line parameter so that we do
* not fail it silently
| 5513 | */ |
| 5514 | if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) { |
pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
| 5516 | return -EINVAL; |
| 5517 | } |
| 5518 | return 0; |
| 5519 | } |
| 5520 | |
static char numa_zonelist_order[] = "Node";
| 5522 | #define NUMA_ZONELIST_ORDER_LEN 16 |
| 5523 | /* |
| 5524 | * sysctl handler for numa_zonelist_order |
| 5525 | */ |
| 5526 | static int numa_zonelist_order_handler(const struct ctl_table *table, int write, |
| 5527 | void *buffer, size_t *length, loff_t *ppos) |
| 5528 | { |
| 5529 | if (write) |
return __parse_numa_zonelist_order(buffer);
| 5531 | return proc_dostring(table, write, buffer, length, ppos); |
| 5532 | } |
| 5533 | |
| 5534 | static int node_load[MAX_NUMNODES]; |
| 5535 | |
| 5536 | /** |
| 5537 | * find_next_best_node - find the next node that should appear in a given node's fallback list |
| 5538 | * @node: node whose fallback list we're appending |
| 5539 | * @used_node_mask: nodemask_t of already used nodes |
| 5540 | * |
| 5541 | * We use a number of factors to determine which is the next node that should |
| 5542 | * appear on a given node's fallback list. The node should not have appeared |
| 5543 | * already in @node's fallback list, and it should be the next closest node |
| 5544 | * according to the distance array (which contains arbitrary distance values |
| 5545 | * from each node to each node in the system), and should also prefer nodes |
| 5546 | * with no CPUs, since presumably they'll have very little allocation pressure |
| 5547 | * on them otherwise. |
| 5548 | * |
| 5549 | * Return: node id of the found node or %NUMA_NO_NODE if no node is found. |
| 5550 | */ |
| 5551 | int find_next_best_node(int node, nodemask_t *used_node_mask) |
| 5552 | { |
| 5553 | int n, val; |
| 5554 | int min_val = INT_MAX; |
| 5555 | int best_node = NUMA_NO_NODE; |
| 5556 | |
| 5557 | /* |
| 5558 | * Use the local node if we haven't already, but for memoryless local |
| 5559 | * node, we should skip it and fall back to other nodes. |
| 5560 | */ |
if (!node_isset(node, *used_node_mask) && node_state(node, N_MEMORY)) {
| 5562 | node_set(node, *used_node_mask); |
| 5563 | return node; |
| 5564 | } |
| 5565 | |
| 5566 | for_each_node_state(n, N_MEMORY) { |
| 5567 | |
| 5568 | /* Don't want a node to appear more than once */ |
| 5569 | if (node_isset(n, *used_node_mask)) |
| 5570 | continue; |
| 5571 | |
| 5572 | /* Use the distance array to find the distance */ |
| 5573 | val = node_distance(node, n); |
| 5574 | |
| 5575 | /* Penalize nodes under us ("prefer the next node") */ |
| 5576 | val += (n < node); |
| 5577 | |
| 5578 | /* Give preference to headless and unused nodes */ |
if (!cpumask_empty(cpumask_of_node(n)))
| 5580 | val += PENALTY_FOR_NODE_WITH_CPUS; |
| 5581 | |
| 5582 | /* Slight preference for less loaded node */ |
| 5583 | val *= MAX_NUMNODES; |
| 5584 | val += node_load[n]; |
| 5585 | |
| 5586 | if (val < min_val) { |
| 5587 | min_val = val; |
| 5588 | best_node = n; |
| 5589 | } |
| 5590 | } |
| 5591 | |
| 5592 | if (best_node >= 0) |
| 5593 | node_set(best_node, *used_node_mask); |
| 5594 | |
| 5595 | return best_node; |
| 5596 | } |
| 5597 | |
| 5598 | |
| 5599 | /* |
| 5600 | * Build zonelists ordered by node and zones within node. |
| 5601 | * This results in maximum locality--normal zone overflows into local |
| 5602 | * DMA zone, if any--but risks exhausting DMA zone. |
| 5603 | */ |
| 5604 | static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order, |
| 5605 | unsigned nr_nodes) |
| 5606 | { |
| 5607 | struct zoneref *zonerefs; |
| 5608 | int i; |
| 5609 | |
| 5610 | zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; |
| 5611 | |
| 5612 | for (i = 0; i < nr_nodes; i++) { |
| 5613 | int nr_zones; |
| 5614 | |
| 5615 | pg_data_t *node = NODE_DATA(node_order[i]); |
| 5616 | |
nr_zones = build_zonerefs_node(node, zonerefs);
| 5618 | zonerefs += nr_zones; |
| 5619 | } |
| 5620 | zonerefs->zone = NULL; |
| 5621 | zonerefs->zone_idx = 0; |
| 5622 | } |
| 5623 | |
| 5624 | /* |
| 5625 | * Build __GFP_THISNODE zonelists |
| 5626 | */ |
| 5627 | static void build_thisnode_zonelists(pg_data_t *pgdat) |
| 5628 | { |
| 5629 | struct zoneref *zonerefs; |
| 5630 | int nr_zones; |
| 5631 | |
| 5632 | zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs; |
| 5633 | nr_zones = build_zonerefs_node(pgdat, zonerefs); |
| 5634 | zonerefs += nr_zones; |
| 5635 | zonerefs->zone = NULL; |
| 5636 | zonerefs->zone_idx = 0; |
| 5637 | } |
| 5638 | |
| 5639 | static void build_zonelists(pg_data_t *pgdat) |
| 5640 | { |
| 5641 | static int node_order[MAX_NUMNODES]; |
| 5642 | int node, nr_nodes = 0; |
| 5643 | nodemask_t used_mask = NODE_MASK_NONE; |
| 5644 | int local_node, prev_node; |
| 5645 | |
| 5646 | /* NUMA-aware ordering of nodes */ |
| 5647 | local_node = pgdat->node_id; |
| 5648 | prev_node = local_node; |
| 5649 | |
memset(node_order, 0, sizeof(node_order));
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
| 5652 | /* |
* We don't want to pressure a particular node.
* So add a penalty to the first node in the same
* distance group to make the order round-robin.
| 5656 | */ |
| 5657 | if (node_distance(local_node, node) != |
| 5658 | node_distance(local_node, prev_node)) |
| 5659 | node_load[node] += 1; |
| 5660 | |
| 5661 | node_order[nr_nodes++] = node; |
| 5662 | prev_node = node; |
| 5663 | } |
| 5664 | |
| 5665 | build_zonelists_in_node_order(pgdat, node_order, nr_nodes); |
| 5666 | build_thisnode_zonelists(pgdat); |
pr_info("Fallback order for Node %d: ", local_node);
for (node = 0; node < nr_nodes; node++)
pr_cont("%d ", node_order[node]);
pr_cont("\n");
| 5671 | } |
| 5672 | |
| 5673 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES |
| 5674 | /* |
| 5675 | * Return node id of node used for "local" allocations. |
| 5676 | * I.e., first node id of first zone in arg node's generic zonelist. |
| 5677 | * Used for initializing percpu 'numa_mem', which is used primarily |
| 5678 | * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. |
| 5679 | */ |
| 5680 | int local_memory_node(int node) |
| 5681 | { |
| 5682 | struct zoneref *z; |
| 5683 | |
| 5684 | z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), |
| 5685 | gfp_zone(GFP_KERNEL), |
| 5686 | NULL); |
| 5687 | return zonelist_node_idx(z); |
| 5688 | } |
| 5689 | #endif |
| 5690 | |
| 5691 | static void setup_min_unmapped_ratio(void); |
| 5692 | static void setup_min_slab_ratio(void); |
| 5693 | #else /* CONFIG_NUMA */ |
| 5694 | |
| 5695 | static void build_zonelists(pg_data_t *pgdat) |
| 5696 | { |
| 5697 | struct zoneref *zonerefs; |
| 5698 | int nr_zones; |
| 5699 | |
| 5700 | zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; |
| 5701 | nr_zones = build_zonerefs_node(pgdat, zonerefs); |
| 5702 | zonerefs += nr_zones; |
| 5703 | |
| 5704 | zonerefs->zone = NULL; |
| 5705 | zonerefs->zone_idx = 0; |
| 5706 | } |
| 5707 | |
| 5708 | #endif /* CONFIG_NUMA */ |
| 5709 | |
| 5710 | /* |
| 5711 | * Boot pageset table. One per cpu which is going to be used for all |
| 5712 | * zones and all nodes. The parameters will be set in such a way |
| 5713 | * that an item put on a list will immediately be handed over to |
| 5714 | * the buddy list. This is safe since pageset manipulation is done |
| 5715 | * with interrupts disabled. |
| 5716 | * |
| 5717 | * The boot_pagesets must be kept even after bootup is complete for |
| 5718 | * unused processors and/or zones. They do play a role for bootstrapping |
| 5719 | * hotplugged processors. |
| 5720 | * |
| 5721 | * zoneinfo_show() and maybe other functions do |
| 5722 | * not check if the processor is online before following the pageset pointer. |
| 5723 | * Other parts of the kernel may not check if the zone is available. |
| 5724 | */ |
| 5725 | static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats); |
| 5726 | /* These effectively disable the pcplists in the boot pageset completely */ |
| 5727 | #define BOOT_PAGESET_HIGH 0 |
| 5728 | #define BOOT_PAGESET_BATCH 1 |
| 5729 | static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset); |
| 5730 | static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats); |
| 5731 | |
| 5732 | static void __build_all_zonelists(void *data) |
| 5733 | { |
| 5734 | int nid; |
| 5735 | int __maybe_unused cpu; |
| 5736 | pg_data_t *self = data; |
| 5737 | unsigned long flags; |
| 5738 | |
| 5739 | /* |
| 5740 | * The zonelist_update_seq must be acquired with irqsave because the |
| 5741 | * reader can be invoked from IRQ with GFP_ATOMIC. |
| 5742 | */ |
| 5743 | write_seqlock_irqsave(&zonelist_update_seq, flags); |
| 5744 | /* |
| 5745 | * Also disable synchronous printk() to prevent any printk() from |
* trying to hold port->lock, because
* tty_insert_flip_string_and_push_buffer() on another CPU might be
* calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
| 5749 | */ |
| 5750 | printk_deferred_enter(); |
| 5751 | |
| 5752 | #ifdef CONFIG_NUMA |
memset(node_load, 0, sizeof(node_load));
| 5754 | #endif |
| 5755 | |
| 5756 | /* |
| 5757 | * This node is hotadded and no memory is yet present. So just |
| 5758 | * building zonelists is fine - no need to touch other nodes. |
| 5759 | */ |
| 5760 | if (self && !node_online(self->node_id)) { |
build_zonelists(self);
| 5762 | } else { |
| 5763 | /* |
| 5764 | * All possible nodes have pgdat preallocated |
| 5765 | * in free_area_init |
| 5766 | */ |
| 5767 | for_each_node(nid) { |
| 5768 | pg_data_t *pgdat = NODE_DATA(nid); |
| 5769 | |
| 5770 | build_zonelists(pgdat); |
| 5771 | } |
| 5772 | |
| 5773 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES |
| 5774 | /* |
| 5775 | * We now know the "local memory node" for each node-- |
| 5776 | * i.e., the node of the first zone in the generic zonelist. |
| 5777 | * Set up numa_mem percpu variable for on-line cpus. During |
| 5778 | * boot, only the boot cpu should be on-line; we'll init the |
| 5779 | * secondary cpus' numa_mem as they come on-line. During |
| 5780 | * node/memory hotplug, we'll fixup all on-line cpus. |
| 5781 | */ |
| 5782 | for_each_online_cpu(cpu) |
| 5783 | set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); |
| 5784 | #endif |
| 5785 | } |
| 5786 | |
| 5787 | printk_deferred_exit(); |
write_sequnlock_irqrestore(&zonelist_update_seq, flags);
| 5789 | } |
| 5790 | |
| 5791 | static noinline void __init |
| 5792 | build_all_zonelists_init(void) |
| 5793 | { |
| 5794 | int cpu; |
| 5795 | |
| 5796 | __build_all_zonelists(NULL); |
| 5797 | |
| 5798 | /* |
| 5799 | * Initialize the boot_pagesets that are going to be used |
| 5800 | * for bootstrapping processors. The real pagesets for |
| 5801 | * each zone will be allocated later when the per cpu |
| 5802 | * allocator is available. |
| 5803 | * |
| 5804 | * boot_pagesets are used also for bootstrapping offline |
| 5805 | * cpus if the system is already booted because the pagesets |
| 5806 | * are needed to initialize allocators on a specific cpu too. |
| 5807 | * F.e. the percpu allocator needs the page allocator which |
| 5808 | * needs the percpu allocator in order to allocate its pagesets |
| 5809 | * (a chicken-egg dilemma). |
| 5810 | */ |
| 5811 | for_each_possible_cpu(cpu) |
per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu));
| 5813 | |
| 5814 | mminit_verify_zonelist(); |
| 5815 | cpuset_init_current_mems_allowed(); |
| 5816 | } |
| 5817 | |
| 5818 | /* |
* The __init helper build_all_zonelists_init() must not be called
* unless system_state == SYSTEM_BOOTING.
| 5820 | * |
| 5821 | * __ref due to call of __init annotated helper build_all_zonelists_init |
| 5822 | * [protected by SYSTEM_BOOTING]. |
| 5823 | */ |
| 5824 | void __ref build_all_zonelists(pg_data_t *pgdat) |
| 5825 | { |
| 5826 | unsigned long vm_total_pages; |
| 5827 | |
| 5828 | if (system_state == SYSTEM_BOOTING) { |
| 5829 | build_all_zonelists_init(); |
| 5830 | } else { |
__build_all_zonelists(pgdat);
| 5832 | /* cpuset refresh routine should be here */ |
| 5833 | } |
| 5834 | /* Get the number of free pages beyond high watermark in all zones. */ |
vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
| 5836 | /* |
| 5837 | * Disable grouping by mobility if the number of pages in the |
| 5838 | * system is too low to allow the mechanism to work. It would be |
| 5839 | * more accurate, but expensive to check per-zone. This check is |
| 5840 | * made on memory-hotadd so a system can start with mobility |
| 5841 | * disabled and enable it later |
| 5842 | */ |
| 5843 | if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) |
| 5844 | page_group_by_mobility_disabled = 1; |
| 5845 | else |
| 5846 | page_group_by_mobility_disabled = 0; |
| 5847 | |
pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
| 5849 | nr_online_nodes, |
| 5850 | str_off_on(page_group_by_mobility_disabled), |
| 5851 | vm_total_pages); |
| 5852 | #ifdef CONFIG_NUMA |
pr_info("Policy zone: %s\n", zone_names[policy_zone]);
| 5854 | #endif |
| 5855 | } |
| 5856 | |
| 5857 | static int zone_batchsize(struct zone *zone) |
| 5858 | { |
| 5859 | #ifdef CONFIG_MMU |
| 5860 | int batch; |
| 5861 | |
| 5862 | /* |
| 5863 | * The number of pages to batch allocate is either ~0.1% |
| 5864 | * of the zone or 1MB, whichever is smaller. The batch |
| 5865 | * size is striking a balance between allocation latency |
| 5866 | * and zone lock contention. |
| 5867 | */ |
| 5868 | batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE); |
| 5869 | batch /= 4; /* We effectively *= 4 below */ |
| 5870 | if (batch < 1) |
| 5871 | batch = 1; |
| 5872 | |
| 5873 | /* |
| 5874 | * Clamp the batch to a 2^n - 1 value. Having a power |
| 5875 | * of 2 value was found to be more likely to have |
| 5876 | * suboptimal cache aliasing properties in some cases. |
| 5877 | * |
| 5878 | * For example if 2 tasks are alternately allocating |
| 5879 | * batches of pages, one task can end up with a lot |
| 5880 | * of pages of one half of the possible page colors |
| 5881 | * and the other with pages of the other colors. |
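*
* Worked example (illustrative): a 4GiB zone with 4KiB pages has
* 1048576 managed pages, so batch = min(1024, 256) / 4 = 64, and
* rounddown_pow_of_two(64 + 32) - 1 gives a final batch of 63.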
| 5882 | */ |
| 5883 | batch = rounddown_pow_of_two(batch + batch/2) - 1; |
| 5884 | |
| 5885 | return batch; |
| 5886 | |
| 5887 | #else |
| 5888 | /* The deferral and batching of frees should be suppressed under NOMMU |
| 5889 | * conditions. |
| 5890 | * |
| 5891 | * The problem is that NOMMU needs to be able to allocate large chunks |
| 5892 | * of contiguous memory as there's no hardware page translation to |
| 5893 | * assemble apparent contiguous memory from discontiguous pages. |
| 5894 | * |
| 5895 | * Queueing large contiguous runs of pages for batching, however, |
| 5896 | * causes the pages to actually be freed in smaller chunks. As there |
| 5897 | * can be a significant delay between the individual batches being |
| 5898 | * recycled, this leads to the once large chunks of space being |
| 5899 | * fragmented and becoming unavailable for high-order allocations. |
| 5900 | */ |
| 5901 | return 0; |
| 5902 | #endif |
| 5903 | } |
| 5904 | |
| 5905 | static int percpu_pagelist_high_fraction; |
| 5906 | static int zone_highsize(struct zone *zone, int batch, int cpu_online, |
| 5907 | int high_fraction) |
| 5908 | { |
| 5909 | #ifdef CONFIG_MMU |
| 5910 | int high; |
| 5911 | int nr_split_cpus; |
| 5912 | unsigned long total_pages; |
| 5913 | |
| 5914 | if (!high_fraction) { |
| 5915 | /* |
| 5916 | * By default, the high value of the pcp is based on the zone |
| 5917 | * low watermark so that if they are full then background |
| 5918 | * reclaim will not be started prematurely. |
| 5919 | */ |
| 5920 | total_pages = low_wmark_pages(zone);
| 5921 | } else { |
| 5922 | /* |
| 5923 | * If percpu_pagelist_high_fraction is configured, the high |
| 5924 | * value is based on a fraction of the managed pages in the |
| 5925 | * zone. |
| 5926 | */ |
| 5927 | total_pages = zone_managed_pages(zone) / high_fraction; |
| 5928 | } |
| 5929 | |
| 5930 | /* |
| 5931 | * Split the high value across all online CPUs local to the zone. Note |
| 5932 | * that early in boot CPUs may not be online yet, and that during
| 5933 | * CPU hotplug the cpumask is not yet updated when a CPU is being
| 5934 | * onlined. For memory nodes that have no CPUs, split the high value |
| 5935 | * across all online CPUs to mitigate the risk that reclaim is triggered |
| 5936 | * prematurely due to pages stored on pcp lists. |
| 5937 | */ |
| 5938 | nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online;
| 5939 | if (!nr_split_cpus) |
| 5940 | nr_split_cpus = num_online_cpus(); |
| 5941 | high = total_pages / nr_split_cpus; |
| 5942 | |
| 5943 | /* |
| 5944 | * Ensure high is at least batch*4. The multiple is based on the |
| 5945 | * historical relationship between high and batch. |
| 5946 | */ |
| 5947 | high = max(high, batch << 2); |
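|      | /*
|      |  * Illustration (made-up numbers): with a low watermark of 16384
|      |  * pages and 8 local CPUs, high = 16384 / 8 = 2048. For a tiny zone
|      |  * where total_pages / nr_split_cpus falls below batch * 4 (e.g. 252
|      |  * for a batch of 63), the floor above takes over.
|      |  */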
| 5948 | |
| 5949 | return high; |
| 5950 | #else |
| 5951 | return 0; |
| 5952 | #endif |
| 5953 | } |
| 5954 | |
| 5955 | /* |
| 5956 | * pcp->high and pcp->batch values are related and generally batch is lower |
| 5957 | * than high. They are also related to pcp->count such that count is lower |
| 5958 | * than high, and as soon as it reaches high, the pcplist is flushed. |
| 5959 | * |
| 5960 | * However, guaranteeing these relations at all times would require e.g. write |
| 5961 | * barriers here but also careful usage of read barriers at the read side, and |
| 5962 | * thus be prone to error and bad for performance. Thus the update only prevents |
| 5963 | * store tearing. Any new users of pcp->batch, pcp->high_min and pcp->high_max |
| 5964 | * should ensure they can cope with those fields changing asynchronously, and |
| 5965 | * fully trust only the pcp->count field on the local CPU with interrupts |
| 5966 | * disabled. |
| 5967 | * |
| 5968 | * mutex_is_locked(&pcp_batch_high_lock) required when calling this function |
| 5969 | * outside of boot time (or some other assurance that no concurrent updaters |
| 5970 | * exist). |
| 5971 | */ |
| 5972 | static void pageset_update(struct per_cpu_pages *pcp, unsigned long high_min, |
| 5973 | unsigned long high_max, unsigned long batch) |
| 5974 | { |
| 5975 | WRITE_ONCE(pcp->batch, batch); |
| 5976 | WRITE_ONCE(pcp->high_min, high_min); |
| 5977 | WRITE_ONCE(pcp->high_max, high_max); |
| 5978 | } |
| 5979 | |
| 5980 | static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats) |
| 5981 | { |
| 5982 | int pindex; |
| 5983 | |
| 5984 | memset(pcp, 0, sizeof(*pcp));
| 5985 | memset(pzstats, 0, sizeof(*pzstats));
| 5986 | |
| 5987 | spin_lock_init(&pcp->lock); |
| 5988 | for (pindex = 0; pindex < NR_PCP_LISTS; pindex++) |
| 5989 | INIT_LIST_HEAD(&pcp->lists[pindex]);
| 5990 | |
| 5991 | /* |
| 5992 | * Set batch and high values safe for a boot pageset. A true percpu |
| 5993 | * pageset's initialization will update them subsequently. Here we don't |
| 5994 | * need to be as careful as pageset_update() as nobody can access the |
| 5995 | * pageset yet. |
| 5996 | */ |
| 5997 | pcp->high_min = BOOT_PAGESET_HIGH; |
| 5998 | pcp->high_max = BOOT_PAGESET_HIGH; |
| 5999 | pcp->batch = BOOT_PAGESET_BATCH; |
| 6000 | } |
| 6001 | |
| 6002 | static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min, |
| 6003 | unsigned long high_max, unsigned long batch) |
| 6004 | { |
| 6005 | struct per_cpu_pages *pcp; |
| 6006 | int cpu; |
| 6007 | |
| 6008 | for_each_possible_cpu(cpu) { |
| 6009 | pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); |
| 6010 | pageset_update(pcp, high_min, high_max, batch); |
| 6011 | } |
| 6012 | } |
| 6013 | |
| 6014 | /* |
| 6015 | * Calculate and set new high and batch values for all per-cpu pagesets of a |
| 6016 | * zone based on the zone's size. |
| 6017 | */ |
| 6018 | static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online) |
| 6019 | { |
| 6020 | int new_high_min, new_high_max, new_batch; |
| 6021 | |
| 6022 | new_batch = max(1, zone_batchsize(zone)); |
| 6023 | if (percpu_pagelist_high_fraction) { |
| 6024 | new_high_min = zone_highsize(zone, new_batch, cpu_online,
| 6025 | percpu_pagelist_high_fraction);
| 6026 | /* |
| 6027 | * PCP high is tuned manually, disable auto-tuning via |
| 6028 | * setting high_min and high_max to the manual value. |
| 6029 | */ |
| 6030 | new_high_max = new_high_min; |
| 6031 | } else { |
| 6032 | new_high_min = zone_highsize(zone, new_batch, cpu_online, 0);
| 6033 | new_high_max = zone_highsize(zone, new_batch, cpu_online,
| 6034 | MIN_PERCPU_PAGELIST_HIGH_FRACTION);
| 6035 | } |
| 6036 | |
| 6037 | if (zone->pageset_high_min == new_high_min && |
| 6038 | zone->pageset_high_max == new_high_max && |
| 6039 | zone->pageset_batch == new_batch) |
| 6040 | return; |
| 6041 | |
| 6042 | zone->pageset_high_min = new_high_min; |
| 6043 | zone->pageset_high_max = new_high_max; |
| 6044 | zone->pageset_batch = new_batch; |
| 6045 | |
| 6046 | __zone_set_pageset_high_and_batch(zone, new_high_min, new_high_max,
| 6047 | new_batch);
| 6048 | } |
| 6049 | |
| 6050 | void __meminit setup_zone_pageset(struct zone *zone) |
| 6051 | { |
| 6052 | int cpu; |
| 6053 | |
| 6054 | /* Size may be 0 on !SMP && !NUMA */ |
| 6055 | if (sizeof(struct per_cpu_zonestat) > 0) |
| 6056 | zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat); |
| 6057 | |
| 6058 | zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages); |
| 6059 | for_each_possible_cpu(cpu) { |
| 6060 | struct per_cpu_pages *pcp; |
| 6061 | struct per_cpu_zonestat *pzstats; |
| 6062 | |
| 6063 | pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); |
| 6064 | pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); |
| 6065 | per_cpu_pages_init(pcp, pzstats); |
| 6066 | } |
| 6067 | |
| 6068 | zone_set_pageset_high_and_batch(zone, 0);
| 6069 | } |
| 6070 | |
| 6071 | /* |
| 6072 | * The zone indicated has a new number of managed_pages; batch sizes and percpu |
| 6073 | * page high values need to be recalculated. |
| 6074 | */ |
| 6075 | static void zone_pcp_update(struct zone *zone, int cpu_online) |
| 6076 | { |
| 6077 | mutex_lock(&pcp_batch_high_lock);
| 6078 | zone_set_pageset_high_and_batch(zone, cpu_online);
| 6079 | mutex_unlock(&pcp_batch_high_lock);
| 6080 | } |
| 6081 | |
| 6082 | static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu) |
| 6083 | { |
| 6084 | struct per_cpu_pages *pcp; |
| 6085 | struct cpu_cacheinfo *cci; |
| 6086 | |
| 6087 | pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); |
| 6088 | cci = get_cpu_cacheinfo(cpu); |
| 6089 | /* |
| 6090 | * If data cache slice of CPU is large enough, "pcp->batch" |
| 6091 | * pages can be preserved in PCP before draining PCP for |
| 6092 | * consecutive high-order pages freeing without allocation. |
| 6093 | * This can reduce zone lock contention without hurting |
| 6094 | * cache-hot pages sharing. |
| 6095 | */ |
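|      | /*
|      |  * Illustration (assumed cache geometry): with a 2 MiB per-CPU share
|      |  * of the last level cache and 4 KiB pages, the slice is 512 pages,
|      |  * so the flag below is set whenever 3 * pcp->batch is under 512.
|      |  */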
| 6096 | spin_lock(&pcp->lock);
| 6097 | if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) |
| 6098 | pcp->flags |= PCPF_FREE_HIGH_BATCH; |
| 6099 | else |
| 6100 | pcp->flags &= ~PCPF_FREE_HIGH_BATCH; |
| 6101 | spin_unlock(&pcp->lock);
| 6102 | } |
| 6103 | |
| 6104 | void setup_pcp_cacheinfo(unsigned int cpu) |
| 6105 | { |
| 6106 | struct zone *zone; |
| 6107 | |
| 6108 | for_each_populated_zone(zone) |
| 6109 | zone_pcp_update_cacheinfo(zone, cpu); |
| 6110 | } |
| 6111 | |
| 6112 | /* |
| 6113 | * Allocate per cpu pagesets and initialize them. |
| 6114 | * Before this call only boot pagesets were available. |
| 6115 | */ |
| 6116 | void __init setup_per_cpu_pageset(void) |
| 6117 | { |
| 6118 | struct pglist_data *pgdat; |
| 6119 | struct zone *zone; |
| 6120 | int __maybe_unused cpu; |
| 6121 | |
| 6122 | for_each_populated_zone(zone) |
| 6123 | setup_zone_pageset(zone); |
| 6124 | |
| 6125 | #ifdef CONFIG_NUMA |
| 6126 | /* |
| 6127 | * Unpopulated zones continue using the boot pagesets. |
| 6128 | * The numa stats for these pagesets need to be reset. |
| 6129 | * Otherwise, they will end up skewing the stats of |
| 6130 | * the nodes these zones are associated with. |
| 6131 | */ |
| 6132 | for_each_possible_cpu(cpu) { |
| 6133 | struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu); |
| 6134 | memset(pzstats->vm_numa_event, 0,
| 6135 | sizeof(pzstats->vm_numa_event));
| 6136 | } |
| 6137 | #endif |
| 6138 | |
| 6139 | for_each_online_pgdat(pgdat) |
| 6140 | pgdat->per_cpu_nodestats = |
| 6141 | alloc_percpu(struct per_cpu_nodestat); |
| 6142 | } |
| 6143 | |
| 6144 | __meminit void zone_pcp_init(struct zone *zone) |
| 6145 | { |
| 6146 | /* |
| 6147 | * per cpu subsystem is not up at this point. The following code |
| 6148 | * relies on the ability of the linker to provide the |
| 6149 | * offset of a (static) per cpu variable into the per cpu area. |
| 6150 | */ |
| 6151 | zone->per_cpu_pageset = &boot_pageset; |
| 6152 | zone->per_cpu_zonestats = &boot_zonestats; |
| 6153 | zone->pageset_high_min = BOOT_PAGESET_HIGH; |
| 6154 | zone->pageset_high_max = BOOT_PAGESET_HIGH; |
| 6155 | zone->pageset_batch = BOOT_PAGESET_BATCH; |
| 6156 | |
| 6157 | if (populated_zone(zone)) |
| 6158 | pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name,
| 6159 | zone->present_pages, zone_batchsize(zone)); |
| 6160 | } |
| 6161 | |
| 6162 | static void setup_per_zone_lowmem_reserve(void); |
| 6163 | |
| 6164 | void adjust_managed_page_count(struct page *page, long count) |
| 6165 | { |
| 6166 | atomic_long_add(count, &page_zone(page)->managed_pages);
| 6167 | totalram_pages_add(count); |
| 6168 | setup_per_zone_lowmem_reserve(); |
| 6169 | } |
| 6170 | EXPORT_SYMBOL(adjust_managed_page_count); |
| 6171 | |
| 6172 | unsigned long free_reserved_area(void *start, void *end, int poison, const char *s) |
| 6173 | { |
| 6174 | void *pos; |
| 6175 | unsigned long pages = 0; |
| 6176 | |
| 6177 | start = (void *)PAGE_ALIGN((unsigned long)start); |
| 6178 | end = (void *)((unsigned long)end & PAGE_MASK); |
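|      | /*
|      |  * Illustration (hypothetical addresses, 4 KiB pages): start 0x1234
|      |  * rounds up to 0x2000 and end 0x5678 rounds down to 0x5000, so only
|      |  * the three fully-contained pages at 0x2000, 0x3000 and 0x4000 are
|      |  * freed.
|      |  */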
| 6179 | for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { |
| 6180 | struct page *page = virt_to_page(pos); |
| 6181 | void *direct_map_addr; |
| 6182 | |
| 6183 | /* |
| 6184 | * 'direct_map_addr' might be different from 'pos' |
| 6185 | * because some architectures' virt_to_page() |
| 6186 | * work with aliases. Getting the direct map |
| 6187 | * address ensures that we get a _writeable_ |
| 6188 | * alias for the memset(). |
| 6189 | */ |
| 6190 | direct_map_addr = page_address(page); |
| 6191 | /* |
| 6192 | * Perform a kasan-unchecked memset() since this memory |
| 6193 | * has not been initialized. |
| 6194 | */ |
| 6195 | direct_map_addr = kasan_reset_tag(direct_map_addr);
| 6196 | if ((unsigned int)poison <= 0xFF)
| 6197 | memset(direct_map_addr, poison, PAGE_SIZE);
| 6198 | |
| 6199 | free_reserved_page(page); |
| 6200 | } |
| 6201 | |
| 6202 | if (pages && s) |
| 6203 | pr_info("Freeing %s memory: %ldK\n", s, K(pages));
| 6204 | |
| 6205 | return pages; |
| 6206 | } |
| 6207 | |
| 6208 | void free_reserved_page(struct page *page) |
| 6209 | { |
| 6210 | clear_page_tag_ref(page); |
| 6211 | ClearPageReserved(page); |
| 6212 | init_page_count(page); |
| 6213 | __free_page(page); |
| 6214 | adjust_managed_page_count(page, 1); |
| 6215 | } |
| 6216 | EXPORT_SYMBOL(free_reserved_page); |
| 6217 | |
| 6218 | static int page_alloc_cpu_dead(unsigned int cpu) |
| 6219 | { |
| 6220 | struct zone *zone; |
| 6221 | |
| 6222 | lru_add_drain_cpu(cpu); |
| 6223 | mlock_drain_remote(cpu); |
| 6224 | drain_pages(cpu); |
| 6225 | |
| 6226 | /* |
| 6227 | * Spill the event counters of the dead processor |
| 6228 | * into the current processor's event counters.
| 6229 | * This artificially elevates the count of the current |
| 6230 | * processor. |
| 6231 | */ |
| 6232 | vm_events_fold_cpu(cpu); |
| 6233 | |
| 6234 | /* |
| 6235 | * Zero the differential counters of the dead processor |
| 6236 | * so that the vm statistics are consistent. |
| 6237 | * |
| 6238 | * This is only okay since the processor is dead and cannot |
| 6239 | * race with what we are doing. |
| 6240 | */ |
| 6241 | cpu_vm_stats_fold(cpu); |
| 6242 | |
| 6243 | for_each_populated_zone(zone) |
| 6244 | zone_pcp_update(zone, 0);
| 6245 | |
| 6246 | return 0; |
| 6247 | } |
| 6248 | |
| 6249 | static int page_alloc_cpu_online(unsigned int cpu) |
| 6250 | { |
| 6251 | struct zone *zone; |
| 6252 | |
| 6253 | for_each_populated_zone(zone) |
| 6254 | zone_pcp_update(zone, 1);
| 6255 | return 0; |
| 6256 | } |
| 6257 | |
| 6258 | void __init page_alloc_init_cpuhp(void) |
| 6259 | { |
| 6260 | int ret; |
| 6261 | |
| 6262 | ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC,
| 6263 | "mm/page_alloc:pcp",
| 6264 | page_alloc_cpu_online,
| 6265 | page_alloc_cpu_dead);
| 6266 | WARN_ON(ret < 0); |
| 6267 | } |
| 6268 | |
| 6269 | /* |
| 6270 | * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio |
| 6271 | * or min_free_kbytes changes. |
| 6272 | */ |
| 6273 | static void calculate_totalreserve_pages(void) |
| 6274 | { |
| 6275 | struct pglist_data *pgdat; |
| 6276 | unsigned long reserve_pages = 0; |
| 6277 | enum zone_type i, j; |
| 6278 | |
| 6279 | for_each_online_pgdat(pgdat) { |
| 6280 | |
| 6281 | pgdat->totalreserve_pages = 0; |
| 6282 | |
| 6283 | for (i = 0; i < MAX_NR_ZONES; i++) { |
| 6284 | struct zone *zone = pgdat->node_zones + i; |
| 6285 | long max = 0; |
| 6286 | unsigned long managed_pages = zone_managed_pages(zone); |
| 6287 | |
| 6288 | /* Find valid and maximum lowmem_reserve in the zone */ |
| 6289 | for (j = i; j < MAX_NR_ZONES; j++) |
| 6290 | max = max(max, zone->lowmem_reserve[j]); |
| 6291 | |
| 6292 | /* we treat the high watermark as reserved pages. */ |
| 6293 | max += high_wmark_pages(zone);
| 6294 | |
| 6295 | max = min_t(unsigned long, max, managed_pages); |
| 6296 | |
| 6297 | pgdat->totalreserve_pages += max; |
| 6298 | |
| 6299 | reserve_pages += max; |
| 6300 | } |
| 6301 | } |
| 6302 | totalreserve_pages = reserve_pages; |
| 6303 | trace_mm_calculate_totalreserve_pages(totalreserve_pages); |
| 6304 | } |
| 6305 | |
| 6306 | /* |
| 6307 | * setup_per_zone_lowmem_reserve - called whenever |
| 6308 | * sysctl_lowmem_reserve_ratio changes. Ensures that each zone |
| 6309 | * has a correct pages reserved value, so an adequate number of |
| 6310 | * pages are left in the zone after a successful __alloc_pages(). |
| 6311 | */ |
| 6312 | static void setup_per_zone_lowmem_reserve(void) |
| 6313 | { |
| 6314 | struct pglist_data *pgdat; |
| 6315 | enum zone_type i, j; |
| 6316 | |
| 6317 | for_each_online_pgdat(pgdat) { |
| 6318 | for (i = 0; i < MAX_NR_ZONES - 1; i++) { |
| 6319 | struct zone *zone = &pgdat->node_zones[i]; |
| 6320 | int ratio = sysctl_lowmem_reserve_ratio[i]; |
| 6321 | bool clear = !ratio || !zone_managed_pages(zone); |
| 6322 | unsigned long managed_pages = 0; |
| 6323 | |
| 6324 | for (j = i + 1; j < MAX_NR_ZONES; j++) { |
| 6325 | struct zone *upper_zone = &pgdat->node_zones[j]; |
| 6326 | |
| 6327 | managed_pages += zone_managed_pages(upper_zone);
| 6328 | |
| 6329 | if (clear) |
| 6330 | zone->lowmem_reserve[j] = 0; |
| 6331 | else |
| 6332 | zone->lowmem_reserve[j] = managed_pages / ratio; |
| 6333 | trace_mm_setup_per_zone_lowmem_reserve(zone, upper_zone,
| 6334 | zone->lowmem_reserve[j]);
| 6335 | } |
| 6336 | } |
| 6337 | } |
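|      | /*
|      |  * Illustration (default ratios, made-up sizes): with
|      |  * sysctl_lowmem_reserve_ratio[ZONE_DMA32] == 256 and a 4 GiB
|      |  * ZONE_NORMAL above it (1048576 pages at 4 KiB), ZONE_DMA32
|      |  * reserves 1048576 / 256 = 4096 pages (16 MiB) against
|      |  * NORMAL-capable allocations falling back into it.
|      |  */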
| 6338 | |
| 6339 | /* update totalreserve_pages */ |
| 6340 | calculate_totalreserve_pages(); |
| 6341 | } |
| 6342 | |
| 6343 | static void __setup_per_zone_wmarks(void) |
| 6344 | { |
| 6345 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); |
| 6346 | unsigned long lowmem_pages = 0; |
| 6347 | struct zone *zone; |
| 6348 | unsigned long flags; |
| 6349 | |
| 6350 | /* Calculate total number of !ZONE_HIGHMEM and !ZONE_MOVABLE pages */ |
| 6351 | for_each_zone(zone) { |
| 6352 | if (!is_highmem(zone) && zone_idx(zone) != ZONE_MOVABLE) |
| 6353 | lowmem_pages += zone_managed_pages(zone); |
| 6354 | } |
| 6355 | |
| 6356 | for_each_zone(zone) { |
| 6357 | u64 tmp; |
| 6358 | |
| 6359 | spin_lock_irqsave(&zone->lock, flags); |
| 6360 | tmp = (u64)pages_min * zone_managed_pages(zone); |
| 6361 | tmp = div64_ul(tmp, lowmem_pages); |
| 6362 | if (is_highmem(zone) || zone_idx(zone) == ZONE_MOVABLE) { |
| 6363 | /* |
| 6364 | * __GFP_HIGH and PF_MEMALLOC allocations usually don't |
| 6365 | * need highmem and movable zones pages, so cap pages_min |
| 6366 | * to a small value here. |
| 6367 | * |
| 6368 | * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) |
| 6369 | * deltas control async page reclaim, and so should |
| 6370 | * not be capped for highmem and movable zones. |
| 6371 | */ |
| 6372 | unsigned long min_pages; |
| 6373 | |
| 6374 | min_pages = zone_managed_pages(zone) / 1024; |
| 6375 | min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); |
| 6376 | zone->_watermark[WMARK_MIN] = min_pages; |
| 6377 | } else { |
| 6378 | /* |
| 6379 | * If it's a lowmem zone, reserve a number of pages |
| 6380 | * proportionate to the zone's size. |
| 6381 | */ |
| 6382 | zone->_watermark[WMARK_MIN] = tmp; |
| 6383 | } |
| 6384 | |
| 6385 | /* |
| 6386 | * Set the kswapd watermarks distance according to the |
| 6387 | * scale factor in proportion to available memory, but |
| 6388 | * ensure a minimum size on small systems. |
| 6389 | */ |
| 6390 | tmp = max_t(u64, tmp >> 2, |
| 6391 | mult_frac(zone_managed_pages(zone), |
| 6392 | watermark_scale_factor, 10000)); |
| 6393 | |
| 6394 | zone->watermark_boost = 0; |
| 6395 | zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
| 6396 | zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
| 6397 | zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
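|      | /*
|      |  * Worked example (made-up sizes, 4 KiB pages): min_free_kbytes =
|      |  * 4096 gives pages_min = 1024. A lowmem zone holding half of 2 GiB
|      |  * of lowmem (262144 of 524288 pages) gets WMARK_MIN = 512; with
|      |  * watermark_scale_factor = 10, tmp = max(512 >> 2, 262) = 262, so
|      |  * WMARK_LOW = 774, WMARK_HIGH = 1036 and WMARK_PROMO = 1298.
|      |  */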
| 6398 | trace_mm_setup_per_zone_wmarks(zone); |
| 6399 | |
| 6400 | spin_unlock_irqrestore(&zone->lock, flags);
| 6401 | } |
| 6402 | |
| 6403 | /* update totalreserve_pages */ |
| 6404 | calculate_totalreserve_pages(); |
| 6405 | } |
| 6406 | |
| 6407 | /** |
| 6408 | * setup_per_zone_wmarks - called when min_free_kbytes changes |
| 6409 | * or when memory is hot-{added|removed} |
| 6410 | * |
| 6411 | * Ensures that the watermark[min,low,high] values for each zone are set |
| 6412 | * correctly with respect to min_free_kbytes. |
| 6413 | */ |
| 6414 | void setup_per_zone_wmarks(void) |
| 6415 | { |
| 6416 | struct zone *zone; |
| 6417 | static DEFINE_SPINLOCK(lock); |
| 6418 | |
| 6419 | spin_lock(&lock);
| 6420 | __setup_per_zone_wmarks();
| 6421 | spin_unlock(&lock);
| 6422 | |
| 6423 | /* |
| 6424 | * The watermark levels have changed, so update the pcpu batch
| 6425 | * and high limits or the limits may be inappropriate.
| 6426 | */ |
| 6427 | for_each_zone(zone) |
| 6428 | zone_pcp_update(zone, 0);
| 6429 | } |
| 6430 | |
| 6431 | /* |
| 6432 | * Initialise min_free_kbytes. |
| 6433 | * |
| 6434 | * For small machines we want it small (128k min). For large machines |
| 6435 | * we want it large (256MB max). But it is not linear, because network |
| 6436 | * bandwidth does not increase linearly with machine size. We use |
| 6437 | * |
| 6438 | * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: |
| 6439 | * min_free_kbytes = sqrt(lowmem_kbytes * 16) |
| 6440 | * |
| 6441 | * which yields |
| 6442 | * |
| 6443 | * 16MB: 512k |
| 6444 | * 32MB: 724k |
| 6445 | * 64MB: 1024k |
| 6446 | * 128MB: 1448k |
| 6447 | * 256MB: 2048k |
| 6448 | * 512MB: 2896k |
| 6449 | * 1024MB: 4096k |
| 6450 | * 2048MB: 5792k |
| 6451 | * 4096MB: 8192k |
| 6452 | * 8192MB: 11584k |
| 6453 | * 16384MB: 16384k |
| 6454 | */ |
| 6455 | void calculate_min_free_kbytes(void) |
| 6456 | { |
| 6457 | unsigned long lowmem_kbytes; |
| 6458 | int new_min_free_kbytes; |
| 6459 | |
| 6460 | lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); |
| 6461 | new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); |
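|      | /*
|      |  * Sanity check against the table above (illustrative): 4 GiB of
|      |  * lowmem gives lowmem_kbytes = 4194304 and
|      |  * int_sqrt(4194304 * 16) = 8192, matching the 4096MB row.
|      |  */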
| 6462 | |
| 6463 | if (new_min_free_kbytes > user_min_free_kbytes) |
| 6464 | min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144); |
| 6465 | else |
| 6466 | pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
| 6467 | new_min_free_kbytes, user_min_free_kbytes); |
| 6468 | |
| 6469 | } |
| 6470 | |
| 6471 | int __meminit init_per_zone_wmark_min(void) |
| 6472 | { |
| 6473 | calculate_min_free_kbytes(); |
| 6474 | setup_per_zone_wmarks(); |
| 6475 | refresh_zone_stat_thresholds(); |
| 6476 | setup_per_zone_lowmem_reserve(); |
| 6477 | |
| 6478 | #ifdef CONFIG_NUMA |
| 6479 | setup_min_unmapped_ratio(); |
| 6480 | setup_min_slab_ratio(); |
| 6481 | #endif |
| 6482 | |
| 6483 | khugepaged_min_free_kbytes_update(); |
| 6484 | |
| 6485 | return 0; |
| 6486 | } |
| 6487 | postcore_initcall(init_per_zone_wmark_min) |
| 6488 | |
| 6489 | /* |
| 6490 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so |
| 6491 | * that we can call two helper functions whenever min_free_kbytes |
| 6492 | * changes. |
| 6493 | */ |
| 6494 | static int min_free_kbytes_sysctl_handler(const struct ctl_table *table, int write, |
| 6495 | void *buffer, size_t *length, loff_t *ppos) |
| 6496 | { |
| 6497 | int rc; |
| 6498 | |
| 6499 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); |
| 6500 | if (rc) |
| 6501 | return rc; |
| 6502 | |
| 6503 | if (write) { |
| 6504 | user_min_free_kbytes = min_free_kbytes; |
| 6505 | setup_per_zone_wmarks(); |
| 6506 | } |
| 6507 | return 0; |
| 6508 | } |
| 6509 | |
| 6510 | static int watermark_scale_factor_sysctl_handler(const struct ctl_table *table, int write, |
| 6511 | void *buffer, size_t *length, loff_t *ppos) |
| 6512 | { |
| 6513 | int rc; |
| 6514 | |
| 6515 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); |
| 6516 | if (rc) |
| 6517 | return rc; |
| 6518 | |
| 6519 | if (write) |
| 6520 | setup_per_zone_wmarks(); |
| 6521 | |
| 6522 | return 0; |
| 6523 | } |
| 6524 | |
| 6525 | #ifdef CONFIG_NUMA |
| 6526 | static void setup_min_unmapped_ratio(void) |
| 6527 | { |
| 6528 | pg_data_t *pgdat; |
| 6529 | struct zone *zone; |
| 6530 | |
| 6531 | for_each_online_pgdat(pgdat) |
| 6532 | pgdat->min_unmapped_pages = 0; |
| 6533 | |
| 6534 | for_each_zone(zone) |
| 6535 | zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) * |
| 6536 | sysctl_min_unmapped_ratio) / 100; |
| 6537 | } |
| 6538 | |
| 6539 | |
| 6540 | static int sysctl_min_unmapped_ratio_sysctl_handler(const struct ctl_table *table, int write, |
| 6541 | void *buffer, size_t *length, loff_t *ppos) |
| 6542 | { |
| 6543 | int rc; |
| 6544 | |
| 6545 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); |
| 6546 | if (rc) |
| 6547 | return rc; |
| 6548 | |
| 6549 | setup_min_unmapped_ratio(); |
| 6550 | |
| 6551 | return 0; |
| 6552 | } |
| 6553 | |
| 6554 | static void setup_min_slab_ratio(void) |
| 6555 | { |
| 6556 | pg_data_t *pgdat; |
| 6557 | struct zone *zone; |
| 6558 | |
| 6559 | for_each_online_pgdat(pgdat) |
| 6560 | pgdat->min_slab_pages = 0; |
| 6561 | |
| 6562 | for_each_zone(zone) |
| 6563 | zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) * |
| 6564 | sysctl_min_slab_ratio) / 100; |
| 6565 | } |
| 6566 | |
| 6567 | static int sysctl_min_slab_ratio_sysctl_handler(const struct ctl_table *table, int write, |
| 6568 | void *buffer, size_t *length, loff_t *ppos) |
| 6569 | { |
| 6570 | int rc; |
| 6571 | |
| 6572 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); |
| 6573 | if (rc) |
| 6574 | return rc; |
| 6575 | |
| 6576 | setup_min_slab_ratio(); |
| 6577 | |
| 6578 | return 0; |
| 6579 | } |
| 6580 | #endif |
| 6581 | |
| 6582 | /* |
| 6583 | * lowmem_reserve_ratio_sysctl_handler - just a wrapper around |
| 6584 | * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() |
| 6585 | * whenever sysctl_lowmem_reserve_ratio changes. |
| 6586 | * |
| 6587 | * The reserve ratio has no relation to the minimum watermarks. The
| 6588 | * lowmem reserve ratio only makes sense as a function of the
| 6589 | * boot-time zone sizes.
| 6590 | */ |
| 6591 | static int lowmem_reserve_ratio_sysctl_handler(const struct ctl_table *table, |
| 6592 | int write, void *buffer, size_t *length, loff_t *ppos) |
| 6593 | { |
| 6594 | int i; |
| 6595 | |
| 6596 | proc_dointvec_minmax(table, write, buffer, length, ppos); |
| 6597 | |
| 6598 | for (i = 0; i < MAX_NR_ZONES; i++) { |
| 6599 | if (sysctl_lowmem_reserve_ratio[i] < 1) |
| 6600 | sysctl_lowmem_reserve_ratio[i] = 0; |
| 6601 | } |
| 6602 | |
| 6603 | setup_per_zone_lowmem_reserve(); |
| 6604 | return 0; |
| 6605 | } |
| 6606 | |
| 6607 | /* |
| 6608 | * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each |
| 6609 | * cpu. It is the fraction of total pages in each zone that a hot per cpu |
| 6610 | * pagelist can have before it gets flushed back to the buddy allocator.
| 6611 | */ |
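|      | /*
|      |  * Example (illustrative): "echo 8 > /proc/sys/vm/percpu_pagelist_high_fraction"
|      |  * caps each zone's pcp high at managed_pages / 8, split across the
|      |  * zone's local CPUs by zone_highsize(); writing 0 restores the
|      |  * default auto-tuning based on the low watermark.
|      |  */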
| 6612 | static int percpu_pagelist_high_fraction_sysctl_handler(const struct ctl_table *table, |
| 6613 | int write, void *buffer, size_t *length, loff_t *ppos) |
| 6614 | { |
| 6615 | struct zone *zone; |
| 6616 | int old_percpu_pagelist_high_fraction; |
| 6617 | int ret; |
| 6618 | |
| 6619 | mutex_lock(&pcp_batch_high_lock);
| 6620 | old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction; |
| 6621 | |
| 6622 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); |
| 6623 | if (!write || ret < 0) |
| 6624 | goto out; |
| 6625 | |
| 6626 | /* Sanity checking to avoid pcp imbalance */ |
| 6627 | if (percpu_pagelist_high_fraction && |
| 6628 | percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) { |
| 6629 | percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction; |
| 6630 | ret = -EINVAL; |
| 6631 | goto out; |
| 6632 | } |
| 6633 | |
| 6634 | /* No change? */ |
| 6635 | if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction) |
| 6636 | goto out; |
| 6637 | |
| 6638 | for_each_populated_zone(zone) |
| 6639 | zone_set_pageset_high_and_batch(zone, 0);
| 6640 | out:
| 6641 | mutex_unlock(&pcp_batch_high_lock);
| 6642 | return ret; |
| 6643 | } |
| 6644 | |
| 6645 | static const struct ctl_table page_alloc_sysctl_table[] = { |
| 6646 | { |
| 6647 | .procname = "min_free_kbytes",
| 6648 | .data = &min_free_kbytes, |
| 6649 | .maxlen = sizeof(min_free_kbytes), |
| 6650 | .mode = 0644, |
| 6651 | .proc_handler = min_free_kbytes_sysctl_handler, |
| 6652 | .extra1 = SYSCTL_ZERO, |
| 6653 | }, |
| 6654 | { |
| 6655 | .procname = "watermark_boost_factor",
| 6656 | .data = &watermark_boost_factor, |
| 6657 | .maxlen = sizeof(watermark_boost_factor), |
| 6658 | .mode = 0644, |
| 6659 | .proc_handler = proc_dointvec_minmax, |
| 6660 | .extra1 = SYSCTL_ZERO, |
| 6661 | }, |
| 6662 | { |
| 6663 | .procname = "watermark_scale_factor",
| 6664 | .data = &watermark_scale_factor, |
| 6665 | .maxlen = sizeof(watermark_scale_factor), |
| 6666 | .mode = 0644, |
| 6667 | .proc_handler = watermark_scale_factor_sysctl_handler, |
| 6668 | .extra1 = SYSCTL_ONE, |
| 6669 | .extra2 = SYSCTL_THREE_THOUSAND, |
| 6670 | }, |
| 6671 | { |
| 6672 | .procname = "defrag_mode",
| 6673 | .data = &defrag_mode, |
| 6674 | .maxlen = sizeof(defrag_mode), |
| 6675 | .mode = 0644, |
| 6676 | .proc_handler = proc_dointvec_minmax, |
| 6677 | .extra1 = SYSCTL_ZERO, |
| 6678 | .extra2 = SYSCTL_ONE, |
| 6679 | }, |
| 6680 | { |
| 6681 | .procname = "percpu_pagelist_high_fraction",
| 6682 | .data = &percpu_pagelist_high_fraction, |
| 6683 | .maxlen = sizeof(percpu_pagelist_high_fraction), |
| 6684 | .mode = 0644, |
| 6685 | .proc_handler = percpu_pagelist_high_fraction_sysctl_handler, |
| 6686 | .extra1 = SYSCTL_ZERO, |
| 6687 | }, |
| 6688 | { |
| 6689 | .procname = "lowmem_reserve_ratio",
| 6690 | .data = &sysctl_lowmem_reserve_ratio, |
| 6691 | .maxlen = sizeof(sysctl_lowmem_reserve_ratio), |
| 6692 | .mode = 0644, |
| 6693 | .proc_handler = lowmem_reserve_ratio_sysctl_handler, |
| 6694 | }, |
| 6695 | #ifdef CONFIG_NUMA |
| 6696 | { |
| 6697 | .procname = "numa_zonelist_order",
| 6698 | .data = &numa_zonelist_order, |
| 6699 | .maxlen = NUMA_ZONELIST_ORDER_LEN, |
| 6700 | .mode = 0644, |
| 6701 | .proc_handler = numa_zonelist_order_handler, |
| 6702 | }, |
| 6703 | { |
| 6704 | .procname = "min_unmapped_ratio",
| 6705 | .data = &sysctl_min_unmapped_ratio, |
| 6706 | .maxlen = sizeof(sysctl_min_unmapped_ratio), |
| 6707 | .mode = 0644, |
| 6708 | .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, |
| 6709 | .extra1 = SYSCTL_ZERO, |
| 6710 | .extra2 = SYSCTL_ONE_HUNDRED, |
| 6711 | }, |
| 6712 | { |
| 6713 | .procname = "min_slab_ratio",
| 6714 | .data = &sysctl_min_slab_ratio, |
| 6715 | .maxlen = sizeof(sysctl_min_slab_ratio), |
| 6716 | .mode = 0644, |
| 6717 | .proc_handler = sysctl_min_slab_ratio_sysctl_handler, |
| 6718 | .extra1 = SYSCTL_ZERO, |
| 6719 | .extra2 = SYSCTL_ONE_HUNDRED, |
| 6720 | }, |
| 6721 | #endif |
| 6722 | }; |
| 6723 | |
| 6724 | void __init page_alloc_sysctl_init(void) |
| 6725 | { |
| 6726 | register_sysctl_init("vm", page_alloc_sysctl_table);
| 6727 | } |
| 6728 | |
| 6729 | #ifdef CONFIG_CONTIG_ALLOC |
| 6730 | /* Usage: See admin-guide/dynamic-debug-howto.rst */ |
| 6731 | static void alloc_contig_dump_pages(struct list_head *page_list) |
| 6732 | { |
| 6733 | DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
| 6734 | |
| 6735 | if (DYNAMIC_DEBUG_BRANCH(descriptor)) { |
| 6736 | struct page *page; |
| 6737 | |
| 6738 | dump_stack(); |
| 6739 | list_for_each_entry(page, page_list, lru) |
| 6740 | dump_page(page, "migration failure");
| 6741 | } |
| 6742 | } |
| 6743 | |
| 6744 | /* [start, end) must belong to a single zone. */ |
| 6745 | static int __alloc_contig_migrate_range(struct compact_control *cc, |
| 6746 | unsigned long start, unsigned long end) |
| 6747 | { |
| 6748 | /* This function is based on compact_zone() from compaction.c. */ |
| 6749 | unsigned int nr_reclaimed; |
| 6750 | unsigned long pfn = start; |
| 6751 | unsigned int tries = 0; |
| 6752 | int ret = 0; |
| 6753 | struct migration_target_control mtc = { |
| 6754 | .nid = zone_to_nid(cc->zone), |
| 6755 | .gfp_mask = cc->gfp_mask, |
| 6756 | .reason = MR_CONTIG_RANGE, |
| 6757 | }; |
| 6758 | |
| 6759 | lru_cache_disable(); |
| 6760 | |
| 6761 | while (pfn < end || !list_empty(&cc->migratepages)) { |
| 6762 | if (fatal_signal_pending(current)) { |
| 6763 | ret = -EINTR; |
| 6764 | break; |
| 6765 | } |
| 6766 | |
| 6767 | if (list_empty(&cc->migratepages)) { |
| 6768 | cc->nr_migratepages = 0; |
| 6769 | ret = isolate_migratepages_range(cc, pfn, end); |
| 6770 | if (ret && ret != -EAGAIN) |
| 6771 | break; |
| 6772 | pfn = cc->migrate_pfn; |
| 6773 | tries = 0; |
| 6774 | } else if (++tries == 5) { |
| 6775 | ret = -EBUSY; |
| 6776 | break; |
| 6777 | } |
| 6778 | |
| 6779 | nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, |
| 6780 | &cc->migratepages); |
| 6781 | cc->nr_migratepages -= nr_reclaimed; |
| 6782 | |
| 6783 | ret = migrate_pages(&cc->migratepages, alloc_migration_target, |
| 6784 | NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL); |
| 6785 | |
| 6786 | /* |
| 6787 | * On -ENOMEM, migrate_pages() bails out right away. It is pointless
| 6788 | * to retry over this error, so bail out here as well.
| 6789 | */ |
| 6790 | if (ret == -ENOMEM) |
| 6791 | break; |
| 6792 | } |
| 6793 | |
| 6794 | lru_cache_enable(); |
| 6795 | if (ret < 0) { |
| 6796 | if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY) |
| 6797 | alloc_contig_dump_pages(&cc->migratepages); |
| 6798 | putback_movable_pages(&cc->migratepages); |
| 6799 | } |
| 6800 | |
| 6801 | return (ret < 0) ? ret : 0; |
| 6802 | } |
| 6803 | |
| 6804 | static void split_free_pages(struct list_head *list, gfp_t gfp_mask) |
| 6805 | { |
| 6806 | int order; |
| 6807 | |
| 6808 | for (order = 0; order < NR_PAGE_ORDERS; order++) { |
| 6809 | struct page *page, *next; |
| 6810 | int nr_pages = 1 << order; |
| 6811 | |
| 6812 | list_for_each_entry_safe(page, next, &list[order], lru) { |
| 6813 | int i; |
| 6814 | |
| 6815 | post_alloc_hook(page, order, gfp_mask); |
| 6816 | set_page_refcounted(page); |
| 6817 | if (!order) |
| 6818 | continue; |
| 6819 | |
| 6820 | split_page(page, order); |
| 6821 | |
| 6822 | /* Add all subpages to the order-0 head, in sequence. */ |
| 6823 | list_del(&page->lru); |
| 6824 | for (i = 0; i < nr_pages; i++) |
| 6825 | list_add_tail(&page[i].lru, &list[0]); |
| 6826 | } |
| 6827 | } |
| 6828 | } |
| 6829 | |
| 6830 | static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) |
| 6831 | { |
| 6832 | const gfp_t reclaim_mask = __GFP_IO | __GFP_FS | __GFP_RECLAIM; |
| 6833 | const gfp_t action_mask = __GFP_COMP | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | |
| 6834 | __GFP_ZERO | __GFP_ZEROTAGS | __GFP_SKIP_ZERO; |
| 6835 | const gfp_t cc_action_mask = __GFP_RETRY_MAYFAIL | __GFP_NOWARN; |
| 6836 | |
| 6837 | /* |
| 6838 | * We are given the range to allocate; node, mobility and placement |
| 6839 | * hints are irrelevant at this point. We'll simply ignore them. |
| 6840 | */ |
| 6841 | gfp_mask &= ~(GFP_ZONEMASK | __GFP_RECLAIMABLE | __GFP_WRITE | |
| 6842 | __GFP_HARDWALL | __GFP_THISNODE | __GFP_MOVABLE); |
| 6843 | |
| 6844 | /* |
| 6845 | * We only support most reclaim flags (but not NOFAIL/NORETRY), and |
| 6846 | * selected action flags. |
| 6847 | */ |
| 6848 | if (gfp_mask & ~(reclaim_mask | action_mask)) |
| 6849 | return -EINVAL; |
| 6850 | |
| 6851 | /* |
| 6852 | * Flags to control page compaction/migration/reclaim, to free up our |
| 6853 | * page range. Migratable pages are movable, __GFP_MOVABLE is implied |
| 6854 | * for them. |
| 6855 | * |
| 6856 | * Traditionally we always had __GFP_RETRY_MAYFAIL set, keep doing that |
| 6857 | * to not degrade callers. |
| 6858 | */ |
| 6859 | *gfp_cc_mask = (gfp_mask & (reclaim_mask | cc_action_mask)) | |
| 6860 | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL; |
| 6861 | return 0; |
| 6862 | } |
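|      | /*
|      |  * Example outcomes (illustrative): GFP_KERNEL passes the check above
|      |  * since all of its bits fall within reclaim_mask, whereas GFP_KERNEL |
|      |  * __GFP_NOFAIL or GFP_ATOMIC (which carries __GFP_HIGH) would be
|      |  * rejected with -EINVAL.
|      |  */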
| 6863 | |
| 6864 | /** |
| 6865 | * alloc_contig_range() -- tries to allocate given range of pages |
| 6866 | * @start: start PFN to allocate |
| 6867 | * @end: one-past-the-last PFN to allocate |
| 6868 | * @alloc_flags: allocation information |
| 6869 | * @gfp_mask: GFP mask. Node/zone/placement hints are ignored; only some |
| 6870 | * action and reclaim modifiers are supported. Reclaim modifiers |
| 6871 | * control allocation behavior during compaction/migration/reclaim. |
| 6872 | * |
| 6873 | * The PFN range does not have to be pageblock aligned. The PFN range must |
| 6874 | * belong to a single zone. |
| 6875 | * |
| 6876 | * The first thing this routine does is attempt to MIGRATE_ISOLATE all |
| 6877 | * pageblocks in the range. Once isolated, the pageblocks should not |
| 6878 | * be modified by others. |
| 6879 | * |
| 6880 | * Return: zero on success or negative error code. On success all |
| 6881 | * pages whose PFN is in [start, end) are allocated for the caller and
| 6882 | * need to be freed with free_contig_range(). |
| 6883 | */ |
| 6884 | int alloc_contig_range_noprof(unsigned long start, unsigned long end, |
| 6885 | acr_flags_t alloc_flags, gfp_t gfp_mask) |
| 6886 | { |
| 6887 | const unsigned int order = ilog2(end - start); |
| 6888 | unsigned long outer_start, outer_end; |
| 6889 | int ret = 0; |
| 6890 | |
| 6891 | struct compact_control cc = { |
| 6892 | .nr_migratepages = 0, |
| 6893 | .order = -1, |
| 6894 | .zone = page_zone(pfn_to_page(start)), |
| 6895 | .mode = MIGRATE_SYNC, |
| 6896 | .ignore_skip_hint = true, |
| 6897 | .no_set_skip_hint = true, |
| 6898 | .alloc_contig = true, |
| 6899 | }; |
| 6900 | INIT_LIST_HEAD(&cc.migratepages); |
| 6901 | enum pb_isolate_mode mode = (alloc_flags & ACR_FLAGS_CMA) ? |
| 6902 | PB_ISOLATE_MODE_CMA_ALLOC : |
| 6903 | PB_ISOLATE_MODE_OTHER; |
| 6904 | |
| 6905 | /* |
| 6906 | * In contrast to the buddy, we allow for orders here that exceed |
| 6907 | * MAX_PAGE_ORDER, so we must manually make sure that we are not |
| 6908 | * exceeding the maximum folio order. |
| 6909 | */ |
| 6910 | if (WARN_ON_ONCE((gfp_mask & __GFP_COMP) && order > MAX_FOLIO_ORDER)) |
| 6911 | return -EINVAL; |
| 6912 | |
| 6913 | gfp_mask = current_gfp_context(gfp_mask); |
| 6914 | if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask)) |
| 6915 | return -EINVAL; |
| 6916 | |
| 6917 | /* |
| 6918 | * What we do here is we mark all pageblocks in range as |
| 6919 | * MIGRATE_ISOLATE. Because pageblock and max order pages may |
| 6920 | * have different sizes, and due to the way the page allocator
| 6921 | * works, start_isolate_page_range() has special handling for this.
| 6922 | * |
| 6923 | * Once the pageblocks are marked as MIGRATE_ISOLATE, we |
| 6924 | * migrate the pages from an unaligned range (ie. pages that |
| 6925 | * we are interested in). This will put all the pages in |
| 6926 | * range back to page allocator as MIGRATE_ISOLATE. |
| 6927 | * |
| 6928 | * When this is done, we take the pages in range from page |
| 6929 | * allocator removing them from the buddy system. This way |
| 6930 | * page allocator will never consider using them. |
| 6931 | * |
| 6932 | * This lets us mark the pageblocks back as |
| 6933 | * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the |
| 6934 | * aligned range but not in the unaligned, original range are |
| 6935 | * put back to page allocator so that buddy can use them. |
| 6936 | */ |
| 6937 | |
| 6938 | ret = start_isolate_page_range(start, end, mode); |
| 6939 | if (ret) |
| 6940 | goto done; |
| 6941 | |
| 6942 | drain_all_pages(cc.zone); |
| 6943 | |
| 6944 | /* |
| 6945 | * In case of -EBUSY, we'd like to know which page causes the problem.
| 6946 | * So, just fall through. test_pages_isolated() has a tracepoint |
| 6947 | * which will report the busy page. |
| 6948 | * |
| 6949 | * It is possible that busy pages could become available before |
| 6950 | * the call to test_pages_isolated, and the range will actually be |
| 6951 | * allocated. So, if we fall through be sure to clear ret so that |
| 6952 | * -EBUSY is not accidentally used or returned to caller. |
| 6953 | */ |
| 6954 | ret = __alloc_contig_migrate_range(&cc, start, end); |
| 6955 | if (ret && ret != -EBUSY) |
| 6956 | goto done; |
| 6957 | |
| 6958 | /* |
| 6959 | * When in-use hugetlb pages are migrated, they may simply be released |
| 6960 | * back into the free hugepage pool instead of being returned to the |
| 6961 | * buddy system. After the migration of in-use huge pages is completed, |
| 6962 | * we will invoke replace_free_hugepage_folios() to ensure that these |
| 6963 | * hugepages are properly released to the buddy system. |
| 6964 | */ |
| 6965 | ret = replace_free_hugepage_folios(start, end); |
| 6966 | if (ret) |
| 6967 | goto done; |
| 6968 | |
| 6969 | /* |
| 6970 | * Pages from [start, end) are within a pageblock_nr_pages |
| 6971 | * aligned blocks that are marked as MIGRATE_ISOLATE. What's |
| 6972 | * more, all pages in [start, end) are free in page allocator. |
| 6973 | * What we are going to do is to allocate all pages from |
| 6974 | * [start, end) (that is remove them from page allocator). |
| 6975 | * |
| 6976 | * The only problem is that pages at the beginning and at the |
| 6977 | * end of interesting range may be not aligned with pages that |
| 6978 | * page allocator holds, ie. they can be part of higher order |
| 6979 | * pages. Because of this, we reserve the bigger range and |
| 6980 | * once this is done free the pages we are not interested in. |
| 6981 | * |
| 6982 | * We don't have to hold zone->lock here because the pages are |
| 6983 | * isolated thus they won't get removed from buddy. |
| 6984 | */ |
| 6985 | outer_start = find_large_buddy(start); |
| 6986 | |
| 6987 | /* Make sure the range is really isolated. */ |
| 6988 | if (test_pages_isolated(outer_start, end, mode)) { |
| 6989 | ret = -EBUSY; |
| 6990 | goto done; |
| 6991 | } |
| 6992 | |
| 6993 | /* Grab isolated pages from freelists. */ |
| 6994 | outer_end = isolate_freepages_range(&cc, outer_start, end); |
| 6995 | if (!outer_end) { |
| 6996 | ret = -EBUSY; |
| 6997 | goto done; |
| 6998 | } |
| 6999 | |
| 7000 | if (!(gfp_mask & __GFP_COMP)) { |
| 7001 | split_free_pages(cc.freepages, gfp_mask); |
| 7002 | |
| 7003 | /* Free head and tail (if any) */ |
| 7004 | if (start != outer_start) |
| 7005 | free_contig_range(outer_start, start - outer_start); |
| 7006 | if (end != outer_end) |
| 7007 | free_contig_range(end, outer_end - end); |
| 7008 | } else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) { |
| 7009 | struct page *head = pfn_to_page(start); |
| 7010 | |
| 7011 | check_new_pages(head, order); |
| 7012 | prep_new_page(head, order, gfp_mask, 0); |
| 7013 | set_page_refcounted(head); |
| 7014 | } else { |
| 7015 | ret = -EINVAL; |
| 7016 | WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n",
| 7017 | start, end, outer_start, outer_end); |
| 7018 | } |
| 7019 | done: |
| 7020 | undo_isolate_page_range(start, end); |
| 7021 | return ret; |
| 7022 | } |
| 7023 | EXPORT_SYMBOL(alloc_contig_range_noprof); |
| 7024 | |
| 7025 | static int __alloc_contig_pages(unsigned long start_pfn, |
| 7026 | unsigned long nr_pages, gfp_t gfp_mask) |
| 7027 | { |
| 7028 | unsigned long end_pfn = start_pfn + nr_pages; |
| 7029 | |
| 7030 | return alloc_contig_range_noprof(start_pfn, end_pfn, ACR_FLAGS_NONE, |
| 7031 | gfp_mask); |
| 7032 | } |
| 7033 | |
| 7034 | static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, |
| 7035 | unsigned long nr_pages) |
| 7036 | { |
| 7037 | unsigned long i, end_pfn = start_pfn + nr_pages; |
| 7038 | struct page *page; |
| 7039 | |
| 7040 | for (i = start_pfn; i < end_pfn; i++) { |
| 7041 | page = pfn_to_online_page(i); |
| 7042 | if (!page) |
| 7043 | return false; |
| 7044 | |
| 7045 | if (page_zone(page) != z) |
| 7046 | return false; |
| 7047 | |
| 7048 | if (PageReserved(page)) |
| 7049 | return false; |
| 7050 | |
| 7051 | if (PageHuge(page)) |
| 7052 | return false; |
| 7053 | } |
| 7054 | return true; |
| 7055 | } |
| 7056 | |
| 7057 | static bool zone_spans_last_pfn(const struct zone *zone, |
| 7058 | unsigned long start_pfn, unsigned long nr_pages) |
| 7059 | { |
| 7060 | unsigned long last_pfn = start_pfn + nr_pages - 1; |
| 7061 | |
| 7062 | return zone_spans_pfn(zone, last_pfn); |
| 7063 | } |
| 7064 | |
| 7065 | /** |
| 7066 | * alloc_contig_pages() -- tries to find and allocate contiguous range of pages |
| 7067 | * @nr_pages: Number of contiguous pages to allocate |
| 7068 | * @gfp_mask: GFP mask. Node/zone/placement hints limit the search; only some |
| 7069 | * action and reclaim modifiers are supported. Reclaim modifiers |
| 7070 | * control allocation behavior during compaction/migration/reclaim. |
| 7071 | * @nid: Target node |
| 7072 | * @nodemask: Mask for other possible nodes |
| 7073 | * |
| 7074 | * This routine is a wrapper around alloc_contig_range(). It scans over zones |
| 7075 | * on an applicable zonelist to find a contiguous pfn range which can then be |
| 7076 | * tried for allocation with alloc_contig_range(). This routine is intended |
| 7077 | * for allocation requests which cannot be fulfilled with the buddy allocator.
| 7078 | * |
| 7079 | * The allocated memory is always aligned to a page boundary. If nr_pages is a |
| 7080 | * power of two, then the allocated range is also guaranteed to be aligned
| 7081 | * to the same nr_pages (e.g. a 1GB request would be aligned to 1GB).
| 7082 | * |
| 7083 | * Allocated pages can be freed with free_contig_range() or by manually calling |
| 7084 | * __free_page() on each allocated page. |
| 7085 | * |
| 7086 | * Return: pointer to contiguous pages on success, or NULL if not successful. |
| 7087 | */ |
| 7088 | struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, |
| 7089 | int nid, nodemask_t *nodemask) |
| 7090 | { |
| 7091 | unsigned long ret, pfn, flags; |
| 7092 | struct zonelist *zonelist; |
| 7093 | struct zone *zone; |
| 7094 | struct zoneref *z; |
| 7095 | |
| 7096 | zonelist = node_zonelist(nid, gfp_mask); |
| 7097 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
| 7098 | gfp_zone(gfp_mask), nodemask) { |
| 7099 | spin_lock_irqsave(&zone->lock, flags); |
| 7100 | |
| 7101 | pfn = ALIGN(zone->zone_start_pfn, nr_pages); |
| 7102 | while (zone_spans_last_pfn(zone, pfn, nr_pages)) { |
| 7103 | if (pfn_range_valid_contig(zone, pfn, nr_pages)) { |
| 7104 | /* |
| 7105 | * We release the zone lock here because |
| 7106 | * alloc_contig_range() will also lock the zone |
| 7107 | * at some point. If there's an allocation |
| 7108 | * spinning on this lock, it may win the race |
| 7109 | * and cause alloc_contig_range() to fail... |
| 7110 | */ |
| 7111 | spin_unlock_irqrestore(&zone->lock, flags); |
| 7112 | ret = __alloc_contig_pages(pfn, nr_pages, |
| 7113 | gfp_mask); |
| 7114 | if (!ret) |
| 7115 | return pfn_to_page(pfn); |
| 7116 | spin_lock_irqsave(&zone->lock, flags); |
| 7117 | } |
| 7118 | pfn += nr_pages; |
| 7119 | } |
| 7120 | spin_unlock_irqrestore(&zone->lock, flags); |
| 7121 | } |
| 7122 | return NULL; |
| 7123 | } |
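|      | /*
|      |  * Illustrative use (hypothetical caller): a 1 GiB area on node 0 with
|      |  * 4 KiB pages would be requested as alloc_contig_pages(SZ_1G /
|      |  * PAGE_SIZE, GFP_KERNEL | __GFP_NOWARN, 0, NULL); since nr_pages is a
|      |  * power of two the result is 1 GiB aligned, and it is returned with
|      |  * free_contig_range(page_to_pfn(page), SZ_1G / PAGE_SIZE).
|      |  */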
| 7124 | #endif /* CONFIG_CONTIG_ALLOC */ |
| 7125 | |
| 7126 | void free_contig_range(unsigned long pfn, unsigned long nr_pages) |
| 7127 | { |
| 7128 | unsigned long count = 0; |
| 7129 | struct folio *folio = pfn_folio(pfn); |
| 7130 | |
| 7131 | if (folio_test_large(folio)) { |
| 7132 | int expected = folio_nr_pages(folio); |
| 7133 | |
| 7134 | if (nr_pages == expected) |
| 7135 | folio_put(folio); |
| 7136 | else |
| 7137 | WARN(true, "PFN %lu: nr_pages %lu != expected %d\n",
| 7138 | pfn, nr_pages, expected); |
| 7139 | return; |
| 7140 | } |
| 7141 | |
| 7142 | for (; nr_pages--; pfn++) { |
| 7143 | struct page *page = pfn_to_page(pfn); |
| 7144 | |
| 7145 | count += page_count(page) != 1; |
| 7146 | __free_page(page); |
| 7147 | } |
| 7148 | WARN(count != 0, "%lu pages are still in use!\n", count);
| 7149 | } |
| 7150 | EXPORT_SYMBOL(free_contig_range); |
| 7151 | |
| 7152 | /* |
| 7153 | * Effectively disable pcplists for the zone by setting the high limit to 0 |
| 7154 | * and draining all cpus. A concurrent page freeing on another CPU that's about |
| 7155 | * to put the page on the pcplist will either finish before the drain and the page
| 7156 | * will be drained, or observe the new high limit and skip the pcplist. |
| 7157 | * |
| 7158 | * Must be paired with a call to zone_pcp_enable(). |
| 7159 | */ |
| 7160 | void zone_pcp_disable(struct zone *zone) |
| 7161 | { |
| 7162 | mutex_lock(&pcp_batch_high_lock);
| 7163 | __zone_set_pageset_high_and_batch(zone, 0, 0, 1);
| 7164 | __drain_all_pages(zone, true);
| 7165 | } |
| 7166 | |
| 7167 | void zone_pcp_enable(struct zone *zone) |
| 7168 | { |
| 7169 | __zone_set_pageset_high_and_batch(zone, zone->pageset_high_min,
| 7170 | zone->pageset_high_max, zone->pageset_batch);
| 7171 | mutex_unlock(&pcp_batch_high_lock);
| 7172 | } |
| 7173 | |
| 7174 | void zone_pcp_reset(struct zone *zone) |
| 7175 | { |
| 7176 | int cpu; |
| 7177 | struct per_cpu_zonestat *pzstats; |
| 7178 | |
| 7179 | if (zone->per_cpu_pageset != &boot_pageset) { |
| 7180 | for_each_online_cpu(cpu) { |
| 7181 | pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); |
| 7182 | drain_zonestat(zone, pzstats); |
| 7183 | } |
| 7184 | free_percpu(zone->per_cpu_pageset);
| 7185 | zone->per_cpu_pageset = &boot_pageset; |
| 7186 | if (zone->per_cpu_zonestats != &boot_zonestats) { |
| 7187 | free_percpu(zone->per_cpu_zonestats);
| 7188 | zone->per_cpu_zonestats = &boot_zonestats; |
| 7189 | } |
| 7190 | } |
| 7191 | } |
| 7192 | |
| 7193 | #ifdef CONFIG_MEMORY_HOTREMOVE |
| 7194 | /* |
| 7195 | * All pages in the range must be in a single zone, must not contain holes, |
| 7196 | * must span full sections, and must be isolated before calling this function. |
| 7197 | * |
| 7198 | * Returns the number of managed (non-PageOffline()) pages in the range: the |
| 7199 | * number of pages for which memory offlining code must adjust managed page |
| 7200 | * counters using adjust_managed_page_count(). |
| 7201 | */ |
| 7202 | unsigned long __offline_isolated_pages(unsigned long start_pfn, |
| 7203 | unsigned long end_pfn) |
| 7204 | { |
| 7205 | unsigned long already_offline = 0, flags; |
| 7206 | unsigned long pfn = start_pfn; |
| 7207 | struct page *page; |
| 7208 | struct zone *zone; |
| 7209 | unsigned int order; |
| 7210 | |
| 7211 | offline_mem_sections(pfn, end_pfn); |
| 7212 | zone = page_zone(pfn_to_page(pfn)); |
| 7213 | spin_lock_irqsave(&zone->lock, flags); |
| 7214 | while (pfn < end_pfn) { |
| 7215 | page = pfn_to_page(pfn); |
| 7216 | /* |
| 7217 | * The HWPoisoned page may not be in the buddy system, and its
| 7218 | * page_count() is not 0.
| 7219 | */ |
| 7220 | if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { |
| 7221 | pfn++; |
| 7222 | continue; |
| 7223 | } |
| 7224 | /* |
| 7225 | * At this point all remaining PageOffline() pages have a |
| 7226 | * reference count of 0 and can simply be skipped. |
| 7227 | */ |
| 7228 | if (PageOffline(page)) { |
| 7229 | BUG_ON(page_count(page)); |
| 7230 | BUG_ON(PageBuddy(page)); |
| 7231 | already_offline++; |
| 7232 | pfn++; |
| 7233 | continue; |
| 7234 | } |
| 7235 | |
| 7236 | BUG_ON(page_count(page)); |
| 7237 | BUG_ON(!PageBuddy(page)); |
| 7238 | VM_WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE); |
| 7239 | order = buddy_order(page); |
| 7240 | del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE); |
| 7241 | pfn += (1 << order); |
| 7242 | } |
| 7243 | spin_unlock_irqrestore(&zone->lock, flags); |
| 7244 | |
| 7245 | return end_pfn - start_pfn - already_offline; |
| 7246 | } |
| 7247 | #endif |
| 7248 | |
| 7249 | /* |
| 7250 | * This function returns a stable result only if called under zone lock. |
| 7251 | */ |
| 7252 | bool is_free_buddy_page(const struct page *page) |
| 7253 | { |
| 7254 | unsigned long pfn = page_to_pfn(page); |
| 7255 | unsigned int order; |
| 7256 | |
| 7257 | for (order = 0; order < NR_PAGE_ORDERS; order++) { |
| 7258 | const struct page *head = page - (pfn & ((1 << order) - 1)); |
| 7259 | |
| 7260 | if (PageBuddy(head) &&
| 7261 | buddy_order_unsafe(head) >= order) |
| 7262 | break; |
| 7263 | } |
| 7264 | |
| 7265 | return order <= MAX_PAGE_ORDER; |
| 7266 | } |
| 7267 | EXPORT_SYMBOL(is_free_buddy_page); |
| 7268 | |
| 7269 | #ifdef CONFIG_MEMORY_FAILURE |
| 7270 | static inline void add_to_free_list(struct page *page, struct zone *zone, |
| 7271 | unsigned int order, int migratetype, |
| 7272 | bool tail) |
| 7273 | { |
| 7274 | __add_to_free_list(page, zone, order, migratetype, tail); |
| 7275 | account_freepages(zone, 1 << order, migratetype); |
| 7276 | } |
| 7277 | |
| 7278 | /* |
| 7279 | * Break down a higher-order page into sub-pages, and keep our target out
| 7280 | * of the buddy allocator.
| 7281 | */ |
| 7282 | static void break_down_buddy_pages(struct zone *zone, struct page *page, |
| 7283 | struct page *target, int low, int high, |
| 7284 | int migratetype) |
| 7285 | { |
| 7286 | unsigned long size = 1 << high; |
| 7287 | struct page *current_buddy; |
| 7288 | |
| 7289 | while (high > low) { |
| 7290 | high--; |
| 7291 | size >>= 1; |
| 7292 | |
| 7293 | if (target >= &page[size]) { |
| 7294 | current_buddy = page; |
| 7295 | page = page + size; |
| 7296 | } else { |
| 7297 | current_buddy = page + size; |
| 7298 | } |
| 7299 | |
| 7300 | if (set_page_guard(zone, current_buddy, high)) |
| 7301 | continue; |
| 7302 | |
| 7303 | add_to_free_list(current_buddy, zone, high, migratetype, false); |
| 7304 | set_buddy_order(current_buddy, high); |
| 7305 | } |
| 7306 | } |
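|      | /*
|      |  * Illustration: carving one target page out of an order-3 block frees
|      |  * the opposite order-2, order-1 and order-0 halves at each step, so 7
|      |  * of the 8 pages go back on the freelists while the target stays
|      |  * withheld.
|      |  */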
| 7307 | |
| 7308 | /* |
| 7309 | * Take a page that will be marked as poisoned off the buddy allocator. |
| 7310 | */ |
| 7311 | bool take_page_off_buddy(struct page *page) |
| 7312 | { |
| 7313 | struct zone *zone = page_zone(page); |
| 7314 | unsigned long pfn = page_to_pfn(page); |
| 7315 | unsigned long flags; |
| 7316 | unsigned int order; |
| 7317 | bool ret = false; |
| 7318 | |
| 7319 | spin_lock_irqsave(&zone->lock, flags); |
| 7320 | for (order = 0; order < NR_PAGE_ORDERS; order++) { |
| 7321 | struct page *page_head = page - (pfn & ((1 << order) - 1)); |
| 7322 | int page_order = buddy_order(page_head); |
| 7323 | |
| 7324 | if (PageBuddy(page_head) && page_order >= order) { |
| 7325 | unsigned long pfn_head = page_to_pfn(page_head); |
| 7326 | int migratetype = get_pfnblock_migratetype(page_head, |
| 7327 | pfn_head); |
| 7328 | |
| 7329 | del_page_from_free_list(page_head, zone, page_order, |
| 7330 | migratetype); |
| 7331 | break_down_buddy_pages(zone, page_head, page, 0, |
| 7332 | page_order, migratetype); |
| 7333 | SetPageHWPoisonTakenOff(page); |
| 7334 | ret = true; |
| 7335 | break; |
| 7336 | } |
| 7337 | if (page_count(page_head) > 0) |
| 7338 | break; |
| 7339 | } |
| 7340 | spin_unlock_irqrestore(&zone->lock, flags); |
| 7341 | return ret; |
| 7342 | } |
| 7343 | |
| 7344 | /* |
| 7345 | * Cancel takeoff done by take_page_off_buddy(). |
| 7346 | */ |
| 7347 | bool put_page_back_buddy(struct page *page) |
| 7348 | { |
| 7349 | struct zone *zone = page_zone(page); |
| 7350 | unsigned long flags; |
| 7351 | bool ret = false; |
| 7352 | |
| 7353 | spin_lock_irqsave(&zone->lock, flags); |
| 7354 | if (put_page_testzero(page)) { |
| 7355 | unsigned long pfn = page_to_pfn(page); |
| 7356 | int migratetype = get_pfnblock_migratetype(page, pfn); |
| 7357 | |
| 7358 | ClearPageHWPoisonTakenOff(page); |
| 7359 | __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); |
| 7360 | if (TestClearPageHWPoison(page))
| 7361 | ret = true;
| 7363 | } |
| 7364 | spin_unlock_irqrestore(&zone->lock, flags); |
| 7365 | |
| 7366 | return ret; |
| 7367 | } |
| 7368 | #endif |
| 7369 | |
| 7370 | #ifdef CONFIG_ZONE_DMA |
| 7371 | bool has_managed_dma(void) |
| 7372 | { |
| 7373 | struct pglist_data *pgdat; |
| 7374 | |
| 7375 | for_each_online_pgdat(pgdat) { |
| 7376 | struct zone *zone = &pgdat->node_zones[ZONE_DMA]; |
| 7377 | |
| 7378 | if (managed_zone(zone)) |
| 7379 | return true; |
| 7380 | } |
| 7381 | return false; |
| 7382 | } |
| 7383 | #endif /* CONFIG_ZONE_DMA */ |
| 7384 | |
| 7385 | #ifdef CONFIG_UNACCEPTED_MEMORY |
| 7386 | |
| 7387 | static bool lazy_accept = true; |
| 7388 | |
| 7389 | static int __init accept_memory_parse(char *p) |
| 7390 | { |
| 7391 | if (!strcmp(p, "lazy")) {
| 7392 | lazy_accept = true; |
| 7393 | return 0; |
| 7394 | } else if (!strcmp(p, "eager")) {
| 7395 | lazy_accept = false; |
| 7396 | return 0; |
| 7397 | } else { |
| 7398 | return -EINVAL; |
| 7399 | } |
| 7400 | } |
| 7401 | early_param("accept_memory", accept_memory_parse);
| 7402 | |
| 7403 | static bool page_contains_unaccepted(struct page *page, unsigned int order) |
| 7404 | { |
| 7405 | phys_addr_t start = page_to_phys(page); |
| 7406 | |
| 7407 | return range_contains_unaccepted_memory(start, PAGE_SIZE << order); |
| 7408 | } |
| 7409 | |
| 7410 | static void __accept_page(struct zone *zone, unsigned long *flags, |
| 7411 | struct page *page) |
| 7412 | { |
| 7413 | list_del(&page->lru); |
| 7414 | account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); |
| 7415 | __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); |
| 7416 | __ClearPageUnaccepted(page); |
| 7417 | spin_unlock_irqrestore(&zone->lock, *flags); |
| 7418 | |
| 7419 | accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER); |
| 7420 | |
| 7421 | __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); |
| 7422 | } |
| 7423 | |
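/*
 * Accept @page if it is still queued as unaccepted; otherwise another CPU
 * has already accepted it and there is nothing to do.
 */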
| 7424 | void accept_page(struct page *page) |
| 7425 | { |
| 7426 | struct zone *zone = page_zone(page); |
| 7427 | unsigned long flags; |
| 7428 | |
| 7429 | spin_lock_irqsave(&zone->lock, flags); |
| 7430 | if (!PageUnaccepted(page)) { |
| 7431 | spin_unlock_irqrestore(&zone->lock, flags); |
| 7432 | return; |
| 7433 | } |
| 7434 | |
| 7435 | /* Unlocks zone->lock */ |
| 7436 | __accept_page(zone, &flags, page); |
| 7437 | } |
| 7438 | |
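/*
 * Accept the first page on the zone's unaccepted list, if any.
 * Returns true if a page was accepted.
 */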
| 7439 | static bool try_to_accept_memory_one(struct zone *zone) |
| 7440 | { |
| 7441 | unsigned long flags; |
| 7442 | struct page *page; |
| 7443 | |
| 7444 | spin_lock_irqsave(&zone->lock, flags); |
| 7445 | page = list_first_entry_or_null(&zone->unaccepted_pages, |
| 7446 | struct page, lru); |
| 7447 | if (!page) { |
| 7448 | spin_unlock_irqrestore(&zone->lock, flags); |
| 7449 | return false; |
| 7450 | } |
| 7451 | |
| 7452 | /* Unlocks zone->lock */ |
| 7453 | __accept_page(zone, &flags, page); |
| 7454 | |
| 7455 | return true; |
| 7456 | } |
| 7457 | |
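/*
 * Accept enough memory, one MAX_ORDER page at a time, to bring the zone
 * back above its promo watermark. Returns true if at least one page was
 * accepted.
 */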
| 7458 | static bool cond_accept_memory(struct zone *zone, unsigned int order, |
| 7459 | int alloc_flags) |
| 7460 | { |
| 7461 | long to_accept, wmark; |
| 7462 | bool ret = false; |
| 7463 | |
| 7464 | if (list_empty(&zone->unaccepted_pages)) |
| 7465 | return false; |
| 7466 | |
	/* Bail out, since try_to_accept_memory_one() needs to take a lock */
| 7468 | if (alloc_flags & ALLOC_TRYLOCK) |
| 7469 | return false; |
| 7470 | |
| 7471 | wmark = promo_wmark_pages(zone); |
| 7472 | |
| 7473 | /* |
| 7474 | * Watermarks have not been initialized yet. |
| 7475 | * |
	 * Accept one MAX_ORDER page to ensure progress.
| 7477 | */ |
| 7478 | if (!wmark) |
| 7479 | return try_to_accept_memory_one(zone); |
| 7480 | |
| 7481 | /* How much to accept to get to promo watermark? */ |
| 7482 | to_accept = wmark - |
| 7483 | (zone_page_state(zone, NR_FREE_PAGES) - |
| 7484 | __zone_watermark_unusable_free(zone, order, 0) - |
| 7485 | zone_page_state(zone, NR_UNACCEPTED)); |
| 7486 | |
| 7487 | while (to_accept > 0) { |
| 7488 | if (!try_to_accept_memory_one(zone)) |
| 7489 | break; |
| 7490 | ret = true; |
| 7491 | to_accept -= MAX_ORDER_NR_PAGES; |
| 7492 | } |
| 7493 | |
| 7494 | return ret; |
| 7495 | } |
| 7496 | |
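/*
 * Queue a freshly freed, still unaccepted MAX_ORDER page on the zone's
 * unaccepted list instead of releasing it to the buddy allocator.
 * Returns false in eager mode, in which case the caller is expected to
 * accept the memory up front instead.
 */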
| 7497 | static bool __free_unaccepted(struct page *page) |
| 7498 | { |
| 7499 | struct zone *zone = page_zone(page); |
| 7500 | unsigned long flags; |
| 7501 | |
| 7502 | if (!lazy_accept) |
| 7503 | return false; |
| 7504 | |
| 7505 | spin_lock_irqsave(&zone->lock, flags); |
| 7506 | list_add_tail(&page->lru, &zone->unaccepted_pages); |
| 7507 | account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); |
| 7508 | __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); |
| 7509 | __SetPageUnaccepted(page); |
| 7510 | spin_unlock_irqrestore(&zone->lock, flags); |
| 7511 | |
| 7512 | return true; |
| 7513 | } |
| 7514 | |
| 7515 | #else |
| 7516 | |
| 7517 | static bool page_contains_unaccepted(struct page *page, unsigned int order) |
| 7518 | { |
| 7519 | return false; |
| 7520 | } |
| 7521 | |
| 7522 | static bool cond_accept_memory(struct zone *zone, unsigned int order, |
| 7523 | int alloc_flags) |
| 7524 | { |
| 7525 | return false; |
| 7526 | } |
| 7527 | |
| 7528 | static bool __free_unaccepted(struct page *page) |
| 7529 | { |
| 7530 | BUILD_BUG(); |
| 7531 | return false; |
| 7532 | } |
| 7533 | |
| 7534 | #endif /* CONFIG_UNACCEPTED_MEMORY */ |
| 7535 | |
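/*
 * Lockless, best-effort allocation core shared with
 * alloc_pages_nolock_noprof() below. Returns a frozen page (refcount not
 * yet raised) or NULL.
 */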
| 7536 | struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order) |
| 7537 | { |
| 7538 | /* |
	 * Do not specify __GFP_DIRECT_RECLAIM, since direct reclaim is not allowed.
| 7540 | * Do not specify __GFP_KSWAPD_RECLAIM either, since wake up of kswapd |
| 7541 | * is not safe in arbitrary context. |
| 7542 | * |
| 7543 | * These two are the conditions for gfpflags_allow_spinning() being true. |
| 7544 | * |
	 * Specify __GFP_NOWARN since a failing alloc_pages_nolock() is no reason
	 * to warn. Warning would also trigger printk(), which is unsafe from
	 * various contexts. We cannot use printk_deferred_enter() to mitigate,
	 * since the running context is unknown.
| 7549 | * |
	 * Specify __GFP_ZERO to make sure that the call to kmsan_alloc_page()
	 * below is safe in any context. Zeroing the page is also mandatory for
	 * BPF use cases.
| 7553 | * |
| 7554 | * Though __GFP_NOMEMALLOC is not checked in the code path below, |
| 7555 | * specify it here to highlight that alloc_pages_nolock() |
| 7556 | * doesn't want to deplete reserves. |
| 7557 | */ |
| 7558 | gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC | __GFP_COMP |
| 7559 | | gfp_flags; |
| 7560 | unsigned int alloc_flags = ALLOC_TRYLOCK; |
| 7561 | struct alloc_context ac = { }; |
| 7562 | struct page *page; |
| 7563 | |
| 7564 | VM_WARN_ON_ONCE(gfp_flags & ~__GFP_ACCOUNT); |
| 7565 | /* |
| 7566 | * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is |
| 7567 | * unsafe in NMI. If spin_trylock() is called from hard IRQ the current |
| 7568 | * task may be waiting for one rt_spin_lock, but rt_spin_trylock() will |
| 7569 | * mark the task as the owner of another rt_spin_lock which will |
	 * confuse PI logic, so return immediately if called from hard IRQ or
| 7571 | * NMI. |
| 7572 | * |
| 7573 | * Note, irqs_disabled() case is ok. This function can be called |
| 7574 | * from raw_spin_lock_irqsave region. |
| 7575 | */ |
| 7576 | if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) |
| 7577 | return NULL; |
| 7578 | if (!pcp_allowed_order(order)) |
| 7579 | return NULL; |
| 7580 | |
	/* Bail out, since _deferred_grow_zone() needs to take a lock */
| 7582 | if (deferred_pages_enabled()) |
| 7583 | return NULL; |
| 7584 | |
| 7585 | if (nid == NUMA_NO_NODE) |
| 7586 | nid = numa_node_id(); |
| 7587 | |
	prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac,
			    &alloc_gfp, &alloc_flags);
| 7590 | |
| 7591 | /* |
| 7592 | * Best effort allocation from percpu free list. |
| 7593 | * If it's empty attempt to spin_trylock zone->lock. |
| 7594 | */ |
	page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
| 7596 | |
| 7597 | /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */ |
| 7598 | |
| 7599 | if (memcg_kmem_online() && page && (gfp_flags & __GFP_ACCOUNT) && |
| 7600 | unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) { |
| 7601 | __free_frozen_pages(page, order, FPI_TRYLOCK); |
| 7602 | page = NULL; |
| 7603 | } |
	trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
	kmsan_alloc_page(page, order, alloc_gfp);
| 7606 | return page; |
| 7607 | } |
| 7608 | /** |
| 7609 | * alloc_pages_nolock - opportunistic reentrant allocation from any context |
| 7610 | * @gfp_flags: GFP flags. Only __GFP_ACCOUNT allowed. |
| 7611 | * @nid: node to allocate from |
| 7612 | * @order: allocation order size |
| 7613 | * |
| 7614 | * Allocates pages of a given order from the given node. This is safe to |
| 7615 | * call from any context (from atomic, NMI, and also reentrant |
| 7616 | * allocator -> tracepoint -> alloc_pages_nolock_noprof). |
 * Allocation is best effort and expected to fail easily, so nobody should
 * rely on its success. Failures are not reported via warn_alloc().
 * See the always-fail conditions in alloc_frozen_pages_nolock_noprof() above.
| 7620 | * |
| 7621 | * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN. |
| 7622 | * It means ENOMEM. There is no reason to call it again and expect !NULL. |
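 *
 * A minimal usage sketch (hypothetical caller; assumes the
 * free_pages_nolock() counterpart for freeing):
 *
 *	struct page *page = alloc_pages_nolock(0, NUMA_NO_NODE, 0);
 *
 *	if (page)
 *		free_pages_nolock(page, 0);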
| 7623 | */ |
| 7624 | struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order) |
| 7625 | { |
| 7626 | struct page *page; |
| 7627 | |
| 7628 | page = alloc_frozen_pages_nolock_noprof(gfp_flags, nid, order); |
| 7629 | if (page) |
| 7630 | set_page_refcounted(page); |
| 7631 | return page; |
| 7632 | } |
| 7633 | EXPORT_SYMBOL_GPL(alloc_pages_nolock_noprof); |
| 7634 | |