mm_init.c source code [Linux/mm/mm_init.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* mm_init.c - Memory initialisation verification and debugging
4	*
5	* Copyright 2008 IBM Corporation, 2008
6	* Author Mel Gorman <mel@csn.ul.ie>
7	*
8	*/
9	#include <linux/kernel.h>
10	#include <linux/init.h>
11	#include <linux/kobject.h>
12	#include <linux/export.h>
13	#include <linux/memory.h>
14	#include <linux/notifier.h>
15	#include <linux/sched.h>
16	#include <linux/mman.h>
17	#include <linux/memblock.h>
18	#include <linux/page-isolation.h>
19	#include <linux/padata.h>
20	#include <linux/nmi.h>
21	#include <linux/buffer_head.h>
22	#include <linux/kmemleak.h>
23	#include <linux/kfence.h>
24	#include <linux/page_ext.h>
25	#include <linux/pti.h>
26	#include <linux/pgtable.h>
27	#include <linux/stackdepot.h>
28	#include <linux/swap.h>
29	#include <linux/cma.h>
30	#include <linux/crash_dump.h>
31	#include <linux/execmem.h>
32	#include <linux/vmstat.h>
33	#include <linux/kexec_handover.h>
34	#include <linux/hugetlb.h>
35	#include "internal.h"
36	#include "slab.h"
37	#include "shuffle.h"
38
39	#include <asm/setup.h>
40
#ifndef CONFIG_NUMA
/* Only defined in this file when CONFIG_NUMA is disabled; both are exported. */
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);

struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif

/*
 * high_memory defines the upper bound on direct map memory, the end
 * of ZONE_NORMAL.
 */
void *high_memory;
EXPORT_SYMBOL(high_memory);
55
56	#ifdef CONFIG_DEBUG_MEMORY_INIT
57	int __meminitdata mminit_loglevel;
58
59	/ The zonelists are simply reported, validation is manual. /
60	void __init mminit_verify_zonelist(void)
61	{
62	int nid;
63
64	if (mminit_loglevel < MMINIT_VERIFY)
65	return;
66
67	for_each_online_node(nid) {
68	pg_data_t *pgdat = NODE_DATA(nid);
69	struct zone *zone;
70	struct zoneref *z;
71	struct zonelist *zonelist;
72	int i, listid, zoneid;
73
74	for (i = `0`; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
75
76	/ Identify the zone and nodelist /
77	zoneid = i % MAX_NR_ZONES;
78	listid = i / MAX_NR_ZONES;
79	zonelist = &pgdat->node_zonelists[listid];
80	zone = &pgdat->node_zones[zoneid];
81	if (!populated_zone(zone))
82	continue;
83
84	/ Print information about the zonelist /
85	printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
86	listid > `0` ? "thisnode" : "general", nid,
87	zone->name);
88
89	/ Iterate the zonelist /
90	for_each_zone_zonelist(zone, z, zonelist, zoneid)
91	pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
92	pr_cont("\n");
93	}
94	}
95	}
96
97	void __init mminit_verify_pageflags_layout(void)
98	{
99	int shift, width;
100	unsigned long or_mask, add_mask;
101
102	shift = BITS_PER_LONG;
103	width = shift - NR_NON_PAGEFLAG_BITS;
104	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
105	"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
106	SECTIONS_WIDTH,
107	NODES_WIDTH,
108	ZONES_WIDTH,
109	LAST_CPUPID_WIDTH,
110	KASAN_TAG_WIDTH,
111	LRU_GEN_WIDTH,
112	LRU_REFS_WIDTH,
113	NR_PAGEFLAGS);
114	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
115	"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
116	SECTIONS_SHIFT,
117	NODES_SHIFT,
118	ZONES_SHIFT,
119	LAST_CPUPID_SHIFT,
120	KASAN_TAG_WIDTH);
121	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
122	"Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n",
123	(unsigned long)SECTIONS_PGSHIFT,
124	(unsigned long)NODES_PGSHIFT,
125	(unsigned long)ZONES_PGSHIFT,
126	(unsigned long)LAST_CPUPID_PGSHIFT,
127	(unsigned long)KASAN_TAG_PGSHIFT);
128	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
129	"Node/Zone ID: %lu -> %lu\n",
130	(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
131	(unsigned long)ZONEID_PGOFF);
132	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
133	"location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n",
134	shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, `0`);
135	#ifdef NODE_NOT_IN_PAGE_FLAGS
136	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
137	"Node not in page flags");
138	#endif
139	#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
140	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
141	"Last cpupid not in page flags");
142	#endif
143
144	if (SECTIONS_WIDTH) {
145	shift -= SECTIONS_WIDTH;
146	BUG_ON(shift != SECTIONS_PGSHIFT);
147	}
148	if (NODES_WIDTH) {
149	shift -= NODES_WIDTH;
150	BUG_ON(shift != NODES_PGSHIFT);
151	}
152	if (ZONES_WIDTH) {
153	shift -= ZONES_WIDTH;
154	BUG_ON(shift != ZONES_PGSHIFT);
155	}
156
157	/ Check for bitmask overlaps /
158	or_mask = (ZONES_MASK << ZONES_PGSHIFT) \|
159	(NODES_MASK << NODES_PGSHIFT) \|
160	(SECTIONS_MASK << SECTIONS_PGSHIFT);
161	add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
162	(NODES_MASK << NODES_PGSHIFT) +
163	(SECTIONS_MASK << SECTIONS_PGSHIFT);
164	BUG_ON(or_mask != add_mask);
165	}
166
167	static __init int set_mminit_loglevel(char *str)
168	{
169	get_option(str: &str, pint: &mminit_loglevel);
170	return `0`;
171	}
172	early_param("mminit_loglevel", set_mminit_loglevel);
173	#endif /* CONFIG_DEBUG_MEMORY_INIT */
174
/* The "mm" kobject; created under kernel_kobj by mm_sysfs_init(). */
struct kobject *mm_kobj;
176
177	#ifdef CONFIG_SMP
178	s32 vm_committed_as_batch = `32`;
179
180	void mm_compute_batch(int overcommit_policy)
181	{
182	u64 memsized_batch;
183	s32 nr = num_present_cpus();
184	s32 batch = max_t(s32, nr*`2`, `32`);
185	unsigned long ram_pages = totalram_pages();
186
187	/*
188	* For policy OVERCOMMIT_NEVER, set batch size to 0.4% of
189	* (total memory/#cpus), and lift it to 25% for other policies
190	* to easy the possible lock contention for percpu_counter
191	* vm_committed_as, while the max limit is INT_MAX
192	*/
193	if (overcommit_policy == OVERCOMMIT_NEVER)
194	memsized_batch = min_t(u64, ram_pages/nr/`256`, INT_MAX);
195	else
196	memsized_batch = min_t(u64, ram_pages/nr/`4`, INT_MAX);
197
198	vm_committed_as_batch = max_t(s32, memsized_batch, batch);
199	}
200
201	static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
202	unsigned long action, void *arg)
203	{
204	switch (action) {
205	case MEM_ONLINE:
206	case MEM_OFFLINE:
207	mm_compute_batch(overcommit_policy: sysctl_overcommit_memory);
208	break;
209	default:
210	break;
211	}
212	return NOTIFY_OK;
213	}
214
215	static int __init mm_compute_batch_init(void)
216	{
217	mm_compute_batch(overcommit_policy: sysctl_overcommit_memory);
218	hotplug_memory_notifier(fn: mm_compute_batch_notifier, MM_COMPUTE_BATCH_PRI);
219	return `0`;
220	}
221
222	__initcall(mm_compute_batch_init);
223
224	#endif
225
226	static int __init mm_sysfs_init(void)
227	{
228	mm_kobj = kobject_create_and_add(name: "mm", parent: kernel_kobj);
229	if (!mm_kobj)
230	return -ENOMEM;
231
232	return `0`;
233	}
234	postcore_initcall(mm_sysfs_init);
235
236	static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
237	static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
238	static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
239
240	static unsigned long required_kernelcore __initdata;
241	static unsigned long required_kernelcore_percent __initdata;
242	static unsigned long required_movablecore __initdata;
243	static unsigned long required_movablecore_percent __initdata;
244
245	static unsigned long nr_kernel_pages __initdata;
246	static unsigned long nr_all_pages __initdata;
247
248	static bool deferred_struct_pages __meminitdata;
249
250	static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
251
252	static int __init cmdline_parse_core(char p, unsigned* long *core,
253	unsigned long *percent)
254	{
255	unsigned long long coremem;
256	char *endptr;
257
258	if (!p)
259	return -EINVAL;
260
261	/ Value may be a percentage of total memory, otherwise bytes /
262	coremem = simple_strtoull(p, &endptr, `0`);
263	if (*endptr == `'%'`) {
264	/ Paranoid check for percent values greater than 100 /
265	WARN_ON(coremem > `100`);
266
267	*percent = coremem;
268	} else {
269	coremem = memparse(ptr: p, retptr: &p);
270	/ Paranoid check that UL is enough for the coremem value /
271	WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
272
273	*core = coremem >> PAGE_SHIFT;
274	*percent = `0UL`;
275	}
276	return `0`;
277	}
278
279	bool mirrored_kernelcore __initdata_memblock;
280
281	/*
282	* kernelcore=size sets the amount of memory for use for allocations that
283	* cannot be reclaimed or migrated.
284	*/
285	static int __init cmdline_parse_kernelcore(char *p)
286	{
287	/ parse kernelcore=mirror /
288	if (parse_option_str(str: p, option: "mirror")) {
289	mirrored_kernelcore = true;
290	return `0`;
291	}
292
293	return cmdline_parse_core(p, core: &required_kernelcore,
294	percent: &required_kernelcore_percent);
295	}
296	early_param("kernelcore", cmdline_parse_kernelcore);
297
298	/*
299	* movablecore=size sets the amount of memory for use for allocations that
300	* can be reclaimed or migrated.
301	*/
302	static int __init cmdline_parse_movablecore(char *p)
303	{
304	return cmdline_parse_core(p, core: &required_movablecore,
305	percent: &required_movablecore_percent);
306	}
307	early_param("movablecore", cmdline_parse_movablecore);
308
309	/*
310	* early_calculate_totalpages()
311	* Sum pages in active regions for movable zone.
312	* Populate N_MEMORY for calculating usable_nodes.
313	*/
314	static unsigned long __init early_calculate_totalpages(void)
315	{
316	unsigned long totalpages = `0`;
317	unsigned long start_pfn, end_pfn;
318	int i, nid;
319
320	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
321	unsigned long pages = end_pfn - start_pfn;
322
323	totalpages += pages;
324	if (pages)
325	node_set_state(node: nid, state: N_MEMORY);
326	}
327	return totalpages;
328	}
329
330	/*
331	* This finds a zone that can be used for ZONE_MOVABLE pages. The
332	* assumption is made that zones within a node are ordered in monotonic
333	* increasing memory addresses so that the "highest" populated zone is used
334	*/
335	static void __init find_usable_zone_for_movable(void)
336	{
337	int zone_index;
338	for (zone_index = MAX_NR_ZONES - `1`; zone_index >= `0`; zone_index--) {
339	if (zone_index == ZONE_MOVABLE)
340	continue;
341
342	if (arch_zone_highest_possible_pfn[zone_index] >
343	arch_zone_lowest_possible_pfn[zone_index])
344	break;
345	}
346
347	VM_BUG_ON(zone_index == -`1`);
348	movable_zone = zone_index;
349	}
350
351	/*
352	* Find the PFN the Movable zone begins in each node. Kernel memory
353	* is spread evenly between nodes as long as the nodes have enough
354	* memory. When they don't, some nodes will have more kernelcore than
355	* others
356	*/
357	static void __init find_zone_movable_pfns_for_nodes(void)
358	{
359	int i, nid;
360	unsigned long usable_startpfn;
361	unsigned long kernelcore_node, kernelcore_remaining;
362	/ save the state before borrow the nodemask /
363	nodemask_t saved_node_state = node_states[N_MEMORY];
364	unsigned long totalpages = early_calculate_totalpages();
365	int usable_nodes = nodes_weight(node_states[N_MEMORY]);
366	struct memblock_region *r;
367
368	/ Need to find movable_zone earlier when movable_node is specified. /
369	find_usable_zone_for_movable();
370
371	/*
372	* If movable_node is specified, ignore kernelcore and movablecore
373	* options.
374	*/
375	if (movable_node_is_enabled()) {
376	for_each_mem_region(r) {
377	if (!memblock_is_hotpluggable(m: r))
378	continue;
379
380	nid = memblock_get_region_node(r);
381
382	usable_startpfn = memblock_region_memory_base_pfn(reg: r);
383	zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
384	min(usable_startpfn, zone_movable_pfn[nid]) :
385	usable_startpfn;
386	}
387
388	goto out2;
389	}
390
391	/*
392	* If kernelcore=mirror is specified, ignore movablecore option
393	*/
394	if (mirrored_kernelcore) {
395	bool mem_below_4gb_not_mirrored = false;
396
397	if (!memblock_has_mirror()) {
398	pr_warn("The system has no mirror memory, ignore kernelcore=mirror.\n");
399	goto out;
400	}
401
402	if (is_kdump_kernel()) {
403	pr_warn("The system is under kdump, ignore kernelcore=mirror.\n");
404	goto out;
405	}
406
407	for_each_mem_region(r) {
408	if (memblock_is_mirror(m: r))
409	continue;
410
411	nid = memblock_get_region_node(r);
412
413	usable_startpfn = memblock_region_memory_base_pfn(reg: r);
414
415	if (usable_startpfn < PHYS_PFN(SZ_4G)) {
416	mem_below_4gb_not_mirrored = true;
417	continue;
418	}
419
420	zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
421	min(usable_startpfn, zone_movable_pfn[nid]) :
422	usable_startpfn;
423	}
424
425	if (mem_below_4gb_not_mirrored)
426	pr_warn("This configuration results in unmirrored kernel memory.\n");
427
428	goto out2;
429	}
430
431	/*
432	* If kernelcore=nn% or movablecore=nn% was specified, calculate the
433	* amount of necessary memory.
434	*/
435	if (required_kernelcore_percent)
436	required_kernelcore = (totalpages * `100` * required_kernelcore_percent) /
437	`10000UL`;
438	if (required_movablecore_percent)
439	required_movablecore = (totalpages * `100` * required_movablecore_percent) /
440	`10000UL`;
441
442	/*
443	* If movablecore= was specified, calculate what size of
444	* kernelcore that corresponds so that memory usable for
445	* any allocation type is evenly spread. If both kernelcore
446	* and movablecore are specified, then the value of kernelcore
447	* will be used for required_kernelcore if it's greater than
448	* what movablecore would have allowed.
449	*/
450	if (required_movablecore) {
451	unsigned long corepages;
452
453	/*
454	* Round-up so that ZONE_MOVABLE is at least as large as what
455	* was requested by the user
456	*/
457	required_movablecore =
458	round_up(required_movablecore, MAX_ORDER_NR_PAGES);
459	required_movablecore = min(totalpages, required_movablecore);
460	corepages = totalpages - required_movablecore;
461
462	required_kernelcore = max(required_kernelcore, corepages);
463	}
464
465	/*
466	* If kernelcore was not specified or kernelcore size is larger
467	* than totalpages, there is no ZONE_MOVABLE.
468	*/
469	if (!required_kernelcore \|\| required_kernelcore >= totalpages)
470	goto out;
471
472	/ usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at /
473	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
474
475	restart:
476	/ Spread kernelcore memory as evenly as possible throughout nodes /
477	kernelcore_node = required_kernelcore / usable_nodes;
478	for_each_node_state(nid, N_MEMORY) {
479	unsigned long start_pfn, end_pfn;
480
481	/*
482	* Recalculate kernelcore_node if the division per node
483	* now exceeds what is necessary to satisfy the requested
484	* amount of memory for the kernel
485	*/
486	if (required_kernelcore < kernelcore_node)
487	kernelcore_node = required_kernelcore / usable_nodes;
488
489	/*
490	* As the map is walked, we track how much memory is usable
491	* by the kernel using kernelcore_remaining. When it is
492	* 0, the rest of the node is usable by ZONE_MOVABLE
493	*/
494	kernelcore_remaining = kernelcore_node;
495
496	/ Go through each range of PFNs within this node /
497	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
498	unsigned long size_pages;
499
500	start_pfn = max(start_pfn, zone_movable_pfn[nid]);
501	if (start_pfn >= end_pfn)
502	continue;
503
504	/ Account for what is only usable for kernelcore /
505	if (start_pfn < usable_startpfn) {
506	unsigned long kernel_pages;
507	kernel_pages = min(end_pfn, usable_startpfn)
508	- start_pfn;
509
510	kernelcore_remaining -= min(kernel_pages,
511	kernelcore_remaining);
512	required_kernelcore -= min(kernel_pages,
513	required_kernelcore);
514
515	/ Continue if range is now fully accounted /
516	if (end_pfn <= usable_startpfn) {
517
518	/*
519	* Push zone_movable_pfn to the end so
520	* that if we have to rebalance
521	* kernelcore across nodes, we will
522	* not double account here
523	*/
524	zone_movable_pfn[nid] = end_pfn;
525	continue;
526	}
527	start_pfn = usable_startpfn;
528	}
529
530	/*
531	* The usable PFN range for ZONE_MOVABLE is from
532	* start_pfn->end_pfn. Calculate size_pages as the
533	* number of pages used as kernelcore
534	*/
535	size_pages = end_pfn - start_pfn;
536	if (size_pages > kernelcore_remaining)
537	size_pages = kernelcore_remaining;
538	zone_movable_pfn[nid] = start_pfn + size_pages;
539
540	/*
541	* Some kernelcore has been met, update counts and
542	* break if the kernelcore for this node has been
543	* satisfied
544	*/
545	required_kernelcore -= min(required_kernelcore,
546	size_pages);
547	kernelcore_remaining -= size_pages;
548	if (!kernelcore_remaining)
549	break;
550	}
551	}
552
553	/*
554	* If there is still required_kernelcore, we do another pass with one
555	* less node in the count. This will push zone_movable_pfn[nid] further
556	* along on the nodes that still have memory until kernelcore is
557	* satisfied
558	*/
559	usable_nodes--;
560	if (usable_nodes && required_kernelcore > usable_nodes)
561	goto restart;
562
563	out2:
564	/ Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES /
565	for_each_node_state(nid, N_MEMORY) {
566	unsigned long start_pfn, end_pfn;
567
568	zone_movable_pfn[nid] =
569	round_up(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
570
571	get_pfn_range_for_nid(nid, start_pfn: &start_pfn, end_pfn: &end_pfn);
572	if (zone_movable_pfn[nid] >= end_pfn)
573	zone_movable_pfn[nid] = `0`;
574	}
575
576	out:
577	/ restore the node_state /
578	node_states[N_MEMORY] = saved_node_state;
579	}
580
581	void __meminit __init_single_page(struct page page, unsigned* long pfn,
582	unsigned long zone, int nid)
583	{
584	mm_zero_struct_page(page);
585	set_page_links(page, zone, node: nid, pfn);
586	init_page_count(page);
587	atomic_set(v: &page->_mapcount, i: -`1`);
588	page_cpupid_reset_last(page);
589	page_kasan_tag_reset(page);
590
591	INIT_LIST_HEAD(list: &page->lru);
592	#ifdef WANT_PAGE_VIRTUAL
593	/ The shift won't overflow because ZONE_NORMAL is below 4G. /
594	if (!is_highmem_idx(zone))
595	set_page_address(page, __va(pfn << PAGE_SHIFT));
596	#endif
597	}
598
599	#ifdef CONFIG_NUMA
600	/*
601	* During memory init memblocks map pfns to nids. The search is expensive and
602	* this caches recent lookups. The implementation of __early_pfn_to_nid
603	* treats start/end as pfns.
604	*/
605	struct mminit_pfnnid_cache {
606	unsigned long last_start;
607	unsigned long last_end;
608	int last_nid;
609	};
610
611	static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
612
613	/*
614	* Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
615	*/
616	static int __meminit __early_pfn_to_nid(unsigned long pfn,
617	struct mminit_pfnnid_cache *state)
618	{
619	unsigned long start_pfn, end_pfn;
620	int nid;
621
622	if (state->last_start <= pfn && pfn < state->last_end)
623	return state->last_nid;
624
625	nid = memblock_search_pfn_nid(pfn, start_pfn: &start_pfn, end_pfn: &end_pfn);
626	if (nid != NUMA_NO_NODE) {
627	state->last_start = start_pfn;
628	state->last_end = end_pfn;
629	state->last_nid = nid;
630	}
631
632	return nid;
633	}
634
635	int __meminit early_pfn_to_nid(unsigned long pfn)
636	{
637	static DEFINE_SPINLOCK(early_pfn_lock);
638	int nid;
639
640	spin_lock(lock: &early_pfn_lock);
641	nid = __early_pfn_to_nid(pfn, state: &early_pfnnid_cache);
642	if (nid < `0`)
643	nid = first_online_node;
644	spin_unlock(lock: &early_pfn_lock);
645
646	return nid;
647	}
648
649	int hashdist = HASHDIST_DEFAULT;
650
651	static int __init set_hashdist(char *str)
652	{
653	if (!str)
654	return `0`;
655	hashdist = simple_strtoul(str, &str, `0`);
656	return `1`;
657	}
658	__setup("hashdist=", set_hashdist);
659
660	static inline void fixup_hashdist(void)
661	{
662	if (num_node_state(state: N_MEMORY) == `1`)
663	hashdist = `0`;
664	}
665	#else
/* !CONFIG_NUMA: nothing to fix up. */
static inline void fixup_hashdist(void) {}
667	#endif /* CONFIG_NUMA */
668
669	/*
670	* Initialize a reserved page unconditionally, finding its zone first.
671	*/
672	void __meminit __init_page_from_nid(unsigned long pfn, int nid)
673	{
674	pg_data_t *pgdat;
675	int zid;
676
677	pgdat = NODE_DATA(nid);
678
679	for (zid = `0`; zid < MAX_NR_ZONES; zid++) {
680	struct zone *zone = &pgdat->node_zones[zid];
681
682	if (zone_spans_pfn(zone, pfn))
683	break;
684	}
685	__init_single_page(pfn_to_page(pfn), pfn, zone: zid, nid);
686
687	if (pageblock_aligned(pfn))
688	init_pageblock_migratetype(pfn_to_page(pfn), migratetype: MIGRATE_MOVABLE,
689	isolate: false);
690	}
691
692	#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
693	static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
694	{
695	pgdat->first_deferred_pfn = ULONG_MAX;
696	}
697
698	/ Returns true if the struct page for the pfn is initialised /
699	static inline bool __meminit early_page_initialised(unsigned long pfn, int nid)
700	{
701	if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
702	return false;
703
704	return true;
705	}
706
707	/*
708	* Returns true when the remaining initialisation should be deferred until
709	* later in the boot cycle when it can be parallelised.
710	*/
711	static bool __meminit
712	defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
713	{
714	static unsigned long prev_end_pfn, nr_initialised;
715
716	if (early_page_ext_enabled())
717	return false;
718
719	/ Always populate low zones for address-constrained allocations /
720	if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
721	return false;
722
723	if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
724	return true;
725
726	/*
727	* prev_end_pfn static that contains the end of previous zone
728	* No need to protect because called very early in boot before smp_init.
729	*/
730	if (prev_end_pfn != end_pfn) {
731	prev_end_pfn = end_pfn;
732	nr_initialised = `0`;
733	}
734
735	/*
736	* We start only with one section of pages, more pages are added as
737	* needed until the rest of deferred pages are initialized.
738	*/
739	nr_initialised++;
740	if ((nr_initialised > PAGES_PER_SECTION) &&
741	(pfn & (PAGES_PER_SECTION - `1`)) == `0`) {
742	NODE_DATA(nid)->first_deferred_pfn = pfn;
743	return true;
744	}
745	return false;
746	}
747
748	static void __meminit __init_deferred_page(unsigned long pfn, int nid)
749	{
750	if (early_page_initialised(pfn, nid))
751	return;
752
753	__init_page_from_nid(pfn, nid);
754	}
755	#else
756	static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
757
/* !CONFIG_DEFERRED_STRUCT_PAGE_INIT: every page is reported as initialised. */
static inline bool early_page_initialised(unsigned long pfn, int nid)
{
	return true;
}
762
/* !CONFIG_DEFERRED_STRUCT_PAGE_INIT: initialisation is never deferred. */
static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
	return false;
}
767
/* !CONFIG_DEFERRED_STRUCT_PAGE_INIT: no page is deferred, so nothing to do. */
static inline void __init_deferred_page(unsigned long pfn, int nid)
{
}
771	#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
772
773	void __meminit init_deferred_page(unsigned long pfn, int nid)
774	{
775	__init_deferred_page(pfn, nid);
776	}
777
778	/*
779	* Initialised pages do not have PageReserved set. This function is
780	* called for each range allocated by the bootmem allocator and
781	* marks the pages PageReserved. The remaining valid pages are later
782	* sent to the buddy page allocator.
783	*/
784	void __meminit reserve_bootmem_region(phys_addr_t start,
785	phys_addr_t end, int nid)
786	{
787	unsigned long pfn;
788
789	for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) {
790	struct page *page = pfn_to_page(pfn);
791
792	__init_deferred_page(pfn, nid);
793
794	/*
795	* no need for atomic set_bit because the struct
796	* page is not visible yet so nobody should
797	* access it yet.
798	*/
799	__SetPageReserved(page);
800	}
801	}
802
803	/ If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init /
804	static bool __meminit
805	overlap_memmap_init(unsigned long zone, unsigned long *pfn)
806	{
807	static struct memblock_region *r;
808
809	if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
810	if (!r \|\| *pfn >= memblock_region_memory_end_pfn(reg: r)) {
811	for_each_mem_region(r) {
812	if (*pfn < memblock_region_memory_end_pfn(reg: r))
813	break;
814	}
815	}
816	if (*pfn >= memblock_region_memory_base_pfn(reg: r) &&
817	memblock_is_mirror(m: r)) {
818	*pfn = memblock_region_memory_end_pfn(reg: r);
819	return true;
820	}
821	}
822	return false;
823	}
824
825	/*
826	* Only struct pages that correspond to ranges defined by memblock.memory
827	* are zeroed and initialized by going through __init_single_page() during
828	* memmap_init_zone_range().
829	*
830	* But, there could be struct pages that correspond to holes in
831	* memblock.memory. This can happen because of the following reasons:
832	* - physical memory bank size is not necessarily the exact multiple of the
833	* arbitrary section size
834	* - early reserved memory may not be listed in memblock.memory
835	* - non-memory regions covered by the contiguous flatmem mapping
836	* - memory layouts defined with memmap= kernel parameter may not align
837	* nicely with memmap sections
838	*
839	* Explicitly initialize those struct pages so that:
840	* - PG_Reserved is set
841	* - zone and node links point to zone and node that span the page if the
842	* hole is in the middle of a zone
843	* - zone and node links point to adjacent zone/node if the hole falls on
844	* the zone boundary; the pages in such holes will be prepended to the
845	* zone/node above the hole except for the trailing pages in the last
846	* section that will be appended to the zone/node below.
847	*/
848	static void __init init_unavailable_range(unsigned long spfn,
849	unsigned long epfn,
850	int zone, int node)
851	{
852	unsigned long pfn;
853	u64 pgcnt = `0`;
854
855	for_each_valid_pfn(pfn, spfn, epfn) {
856	__init_single_page(pfn_to_page(pfn), pfn, zone, nid: node);
857	__SetPageReserved(pfn_to_page(pfn));
858	pgcnt++;
859	}
860
861	if (pgcnt)
862	pr_info("On node %d, zone %s: %lld pages in unavailable ranges\n",
863	node, zone_names[zone], pgcnt);
864	}
865
866	/*
867	* Initially all pages are reserved - free ones are freed
868	* up by memblock_free_all() once the early boot process is
869	* done. Non-atomic initialization, single-pass.
870	*
871	* All aligned pageblocks are initialized to the specified migratetype
872	* (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
873	* zone stats (e.g., nr_isolate_pageblock) are touched.
874	*/
875	void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
876	unsigned long start_pfn, unsigned long zone_end_pfn,
877	enum meminit_context context,
878	struct vmem_altmap altmap, int* migratetype,
879	bool isolate_pageblock)
880	{
881	unsigned long pfn, end_pfn = start_pfn + size;
882	struct page *page;
883
884	if (highest_memmap_pfn < end_pfn - `1`)
885	highest_memmap_pfn = end_pfn - `1`;
886
887	#ifdef CONFIG_ZONE_DEVICE
888	/*
889	* Honor reservation requested by the driver for this ZONE_DEVICE
890	* memory. We limit the total number of pages to initialize to just
891	* those that might contain the memory mapping. We will defer the
892	* ZONE_DEVICE page initialization until after we have released
893	* the hotplug lock.
894	*/
895	if (zone == ZONE_DEVICE) {
896	if (!altmap)
897	return;
898
899	if (start_pfn == altmap->base_pfn)
900	start_pfn += altmap->reserve;
901	end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
902	}
903	#endif
904
905	for (pfn = start_pfn; pfn < end_pfn; ) {
906	/*
907	* There can be holes in boot-time mem_map[]s handed to this
908	* function. They do not exist on hotplugged memory.
909	*/
910	if (context == MEMINIT_EARLY) {
911	if (overlap_memmap_init(zone, pfn: &pfn))
912	continue;
913	if (defer_init(nid, pfn, end_pfn: zone_end_pfn)) {
914	deferred_struct_pages = true;
915	break;
916	}
917	}
918
919	page = pfn_to_page(pfn);
920	__init_single_page(page, pfn, zone, nid);
921	if (context == MEMINIT_HOTPLUG) {
922	#ifdef CONFIG_ZONE_DEVICE
923	if (zone == ZONE_DEVICE)
924	__SetPageReserved(page);
925	else
926	#endif
927	__SetPageOffline(page);
928	}
929
930	/*
931	* Usually, we want to mark the pageblock MIGRATE_MOVABLE,
932	* such that unmovable allocations won't be scattered all
933	* over the place during system boot.
934	*/
935	if (pageblock_aligned(pfn)) {
936	init_pageblock_migratetype(page, migratetype,
937	isolate: isolate_pageblock);
938	cond_resched();
939	}
940	pfn++;
941	}
942	}
943
944	static void __init memmap_init_zone_range(struct zone *zone,
945	unsigned long start_pfn,
946	unsigned long end_pfn,
947	unsigned long *hole_pfn)
948	{
949	unsigned long zone_start_pfn = zone->zone_start_pfn;
950	unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
951	int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
952
953	start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
954	end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
955
956	if (start_pfn >= end_pfn)
957	return;
958
959	memmap_init_range(size: end_pfn - start_pfn, nid, zone: zone_id, start_pfn,
960	zone_end_pfn, context: MEMINIT_EARLY, NULL, migratetype: MIGRATE_MOVABLE,
961	isolate_pageblock: false);
962
963	if (*hole_pfn < start_pfn)
964	init_unavailable_range(spfn: *hole_pfn, epfn: start_pfn, zone: zone_id, node: nid);
965
966	*hole_pfn = end_pfn;
967	}
968
969	static void __init memmap_init(void)
970	{
971	unsigned long start_pfn, end_pfn;
972	unsigned long hole_pfn = `0`;
973	int i, j, zone_id = `0`, nid;
974
975	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
976	struct pglist_data *node = NODE_DATA(nid);
977
978	for (j = `0`; j < MAX_NR_ZONES; j++) {
979	struct zone *zone = node->node_zones + j;
980
981	if (!populated_zone(zone))
982	continue;
983
984	memmap_init_zone_range(zone, start_pfn, end_pfn,
985	hole_pfn: &hole_pfn);
986	zone_id = j;
987	}
988	}
989
990	/*
991	* Initialize the memory map for hole in the range [memory_end,
992	* section_end] for SPARSEMEM and in the range [memory_end, memmap_end]
993	* for FLATMEM.
994	* Append the pages in this hole to the highest zone in the last
995	* node.
996	*/
997	#ifdef CONFIG_SPARSEMEM
998	end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
999	#else
1000	end_pfn = round_up(end_pfn, MAX_ORDER_NR_PAGES);
1001	#endif
1002	if (hole_pfn < end_pfn)
1003	init_unavailable_range(spfn: hole_pfn, epfn: end_pfn, zone: zone_id, node: nid);
1004	}
1005
1006	#ifdef CONFIG_ZONE_DEVICE
1007	static void __ref __init_zone_device_page(struct page page, unsigned* long pfn,
1008	unsigned long zone_idx, int nid,
1009	struct dev_pagemap *pgmap)
1010	{
1011
1012	__init_single_page(page, pfn, zone_idx, nid);
1013
1014	/*
1015	* Mark page reserved as it will need to wait for onlining
1016	* phase for it to be fully associated with a zone.
1017	*
1018	* We can use the non-atomic __set_bit operation for setting
1019	* the flag as we are still initializing the pages.
1020	*/
1021	__SetPageReserved(page);
1022
1023	/*
1024	* ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
1025	* and zone_device_data. It is a bug if a ZONE_DEVICE page is
1026	* ever freed or placed on a driver-private list.
1027	*/
1028	page_folio(page)->pgmap = pgmap;
1029	page->zone_device_data = NULL;
1030
1031	/*
1032	* Mark the block movable so that blocks are reserved for
1033	* movable at startup. This will force kernel allocations
1034	* to reserve their blocks rather than leaking throughout
1035	* the address space during boot when many long-lived
1036	* kernel allocations are made.
1037	*
1038	* Please note that MEMINIT_HOTPLUG path doesn't clear memmap
1039	* because this is done early in section_activate()
1040	*/
1041	if (pageblock_aligned(pfn)) {
1042	init_pageblock_migratetype(page, MIGRATE_MOVABLE, false);
1043	cond_resched();
1044	}
1045
1046	/*
1047	* ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC are released
1048	* directly to the driver page allocator which will set the page count
1049	* to 1 when allocating the page.
1050	*
1051	* MEMORY_TYPE_GENERIC and MEMORY_TYPE_FS_DAX pages automatically have
1052	* their refcount reset to one whenever they are freed (ie. after
1053	* their refcount drops to 0).
1054	*/
1055	switch (pgmap->type) {
1056	case MEMORY_DEVICE_FS_DAX:
1057	case MEMORY_DEVICE_PRIVATE:
1058	case MEMORY_DEVICE_COHERENT:
1059	case MEMORY_DEVICE_PCI_P2PDMA:
1060	set_page_count(page, `0`);
1061	break;
1062
1063	case MEMORY_DEVICE_GENERIC:
1064	break;
1065	}
1066	}
1067
1068	/*
1069	* With compound page geometry and when struct pages are stored in ram most
1070	* tail pages are reused. Consequently, the amount of unique struct pages to
1071	* initialize is a lot smaller that the total amount of struct pages being
1072	* mapped. This is a paired / mild layering violation with explicit knowledge
1073	* of how the sparse_vmemmap internals handle compound pages in the lack
1074	* of an altmap. See vmemmap_populate_compound_pages().
1075	*/
1076	static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
1077	struct dev_pagemap *pgmap)
1078	{
1079	if (!vmemmap_can_optimize(altmap, pgmap))
1080	return pgmap_vmemmap_nr(pgmap);
1081
1082	return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page));
1083	}
1084
1085	static void __ref memmap_init_compound(struct page *head,
1086	unsigned long head_pfn,
1087	unsigned long zone_idx, int nid,
1088	struct dev_pagemap *pgmap,
1089	unsigned long nr_pages)
1090	{
1091	unsigned long pfn, end_pfn = head_pfn + nr_pages;
1092	unsigned int order = pgmap->vmemmap_shift;
1093
1094	/*
1095	* We have to initialize the pages, including setting up page links.
1096	* prep_compound_page() does not take care of that, so instead we
1097	* open-code prep_compound_page() so we can take care of initializing
1098	* the pages in the same go.
1099	*/
1100	__SetPageHead(head);
1101	for (pfn = head_pfn + `1`; pfn < end_pfn; pfn++) {
1102	struct page *page = pfn_to_page(pfn);
1103
1104	__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
1105	prep_compound_tail(head, pfn - head_pfn);
1106	set_page_count(page, `0`);
1107	}
1108	prep_compound_head(head, order);
1109	}
1110
1111	void __ref memmap_init_zone_device(struct zone *zone,
1112	unsigned long start_pfn,
1113	unsigned long nr_pages,
1114	struct dev_pagemap *pgmap)
1115	{
1116	unsigned long pfn, end_pfn = start_pfn + nr_pages;
1117	struct pglist_data *pgdat = zone->zone_pgdat;
1118	struct vmem_altmap *altmap = pgmap_altmap(pgmap);
1119	unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap);
1120	unsigned long zone_idx = zone_idx(zone);
1121	unsigned long start = jiffies;
1122	int nid = pgdat->node_id;
1123
1124	if (WARN_ON_ONCE(!pgmap \|\| zone_idx != ZONE_DEVICE))
1125	return;
1126
1127	/*
1128	* The call to memmap_init should have already taken care
1129	* of the pages reserved for the memmap, so we can just jump to
1130	* the end of that region and start processing the device pages.
1131	*/
1132	if (altmap) {
1133	start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
1134	nr_pages = end_pfn - start_pfn;
1135	}
1136
1137	for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
1138	struct page *page = pfn_to_page(pfn);
1139
1140	__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
1141
1142	if (pfns_per_compound == `1`)
1143	continue;
1144
1145	memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
1146	compound_nr_pages(altmap, pgmap));
1147	}
1148
1149	pr_debug("%s initialised %lu pages in %ums\n", __func__,
1150	nr_pages, jiffies_to_msecs(jiffies - start));
1151	}
1152	#endif
1153
1154	/*
1155	* The zone ranges provided by the architecture do not include ZONE_MOVABLE
1156	* because it is sized independent of architecture. Unlike the other zones,
1157	* the starting point for ZONE_MOVABLE is not fixed. It may be different
1158	* in each node depending on the size of each node and how evenly kernelcore
1159	* is distributed. This helper function adjusts the zone ranges
1160	* provided by the architecture for a given node by using the end of the
1161	* highest usable zone for ZONE_MOVABLE. This preserves the assumption that
1162	* zones within a node are in order of monotonic increases memory addresses
1163	*/
1164	static void __init adjust_zone_range_for_zone_movable(int nid,
1165	unsigned long zone_type,
1166	unsigned long node_end_pfn,
1167	unsigned long *zone_start_pfn,
1168	unsigned long *zone_end_pfn)
1169	{
1170	/ Only adjust if ZONE_MOVABLE is on this node /
1171	if (zone_movable_pfn[nid]) {
1172	/ Size ZONE_MOVABLE /
1173	if (zone_type == ZONE_MOVABLE) {
1174	*zone_start_pfn = zone_movable_pfn[nid];
1175	*zone_end_pfn = min(node_end_pfn,
1176	arch_zone_highest_possible_pfn[movable_zone]);
1177
1178	/ Adjust for ZONE_MOVABLE starting within this range /
1179	} else if (!mirrored_kernelcore &&
1180	*zone_start_pfn < zone_movable_pfn[nid] &&
1181	*zone_end_pfn > zone_movable_pfn[nid]) {
1182	*zone_end_pfn = zone_movable_pfn[nid];
1183
1184	/ Check if this whole range is within ZONE_MOVABLE /
1185	} else if (*zone_start_pfn >= zone_movable_pfn[nid])
1186	zone_start_pfn = zone_end_pfn;
1187	}
1188	}
1189
1190	/*
1191	* Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
1192	* then all holes in the requested range will be accounted for.
1193	*/
1194	static unsigned long __init __absent_pages_in_range(int nid,
1195	unsigned long range_start_pfn,
1196	unsigned long range_end_pfn)
1197	{
1198	unsigned long nr_absent = range_end_pfn - range_start_pfn;
1199	unsigned long start_pfn, end_pfn;
1200	int i;
1201
1202	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
1203	start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
1204	end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
1205	nr_absent -= end_pfn - start_pfn;
1206	}
1207	return nr_absent;
1208	}
1209
1210	/**
1211	* absent_pages_in_range - Return number of page frames in holes within a range
1212	* @start_pfn: The start PFN to start searching for holes
1213	* @end_pfn: The end PFN to stop searching for holes
1214	*
1215	* Return: the number of pages frames in memory holes within a range.
1216	*/
1217	unsigned long __init absent_pages_in_range(unsigned long start_pfn,
1218	unsigned long end_pfn)
1219	{
1220	return __absent_pages_in_range(MAX_NUMNODES, range_start_pfn: start_pfn, range_end_pfn: end_pfn);
1221	}
1222
1223	/ Return the number of page frames in holes in a zone on a node /
1224	static unsigned long __init zone_absent_pages_in_node(int nid,
1225	unsigned long zone_type,
1226	unsigned long zone_start_pfn,
1227	unsigned long zone_end_pfn)
1228	{
1229	unsigned long nr_absent;
1230
1231	/ zone is empty, we don't have any absent pages /
1232	if (zone_start_pfn == zone_end_pfn)
1233	return `0`;
1234
1235	nr_absent = __absent_pages_in_range(nid, range_start_pfn: zone_start_pfn, range_end_pfn: zone_end_pfn);
1236
1237	/*
1238	* ZONE_MOVABLE handling.
1239	* Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
1240	* and vice versa.
1241	*/
1242	if (mirrored_kernelcore && zone_movable_pfn[nid]) {
1243	unsigned long start_pfn, end_pfn;
1244	struct memblock_region *r;
1245
1246	for_each_mem_region(r) {
1247	start_pfn = clamp(memblock_region_memory_base_pfn(r),
1248	zone_start_pfn, zone_end_pfn);
1249	end_pfn = clamp(memblock_region_memory_end_pfn(r),
1250	zone_start_pfn, zone_end_pfn);
1251
1252	if (zone_type == ZONE_MOVABLE &&
1253	memblock_is_mirror(m: r))
1254	nr_absent += end_pfn - start_pfn;
1255
1256	if (zone_type == ZONE_NORMAL &&
1257	!memblock_is_mirror(m: r))
1258	nr_absent += end_pfn - start_pfn;
1259	}
1260	}
1261
1262	return nr_absent;
1263	}
1264
1265	/*
1266	* Return the number of pages a zone spans in a node, including holes
1267	* present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
1268	*/
1269	static unsigned long __init zone_spanned_pages_in_node(int nid,
1270	unsigned long zone_type,
1271	unsigned long node_start_pfn,
1272	unsigned long node_end_pfn,
1273	unsigned long *zone_start_pfn,
1274	unsigned long *zone_end_pfn)
1275	{
1276	unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
1277	unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
1278
1279	/ Get the start and end of the zone /
1280	*zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
1281	*zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
1282	adjust_zone_range_for_zone_movable(nid, zone_type, node_end_pfn,
1283	zone_start_pfn, zone_end_pfn);
1284
1285	/ Check that this node has pages within the zone's required range /
1286	if (zone_end_pfn < node_start_pfn \|\| zone_start_pfn > node_end_pfn)
1287	return `0`;
1288
1289	/ Move the zone boundaries inside the node if necessary /
1290	zone_end_pfn = min(zone_end_pfn, node_end_pfn);
1291	zone_start_pfn = max(zone_start_pfn, node_start_pfn);
1292
1293	/ Return the spanned pages /
1294	return zone_end_pfn - zone_start_pfn;
1295	}
1296
1297	static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat)
1298	{
1299	struct zone *z;
1300
1301	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) {
1302	z->zone_start_pfn = `0`;
1303	z->spanned_pages = `0`;
1304	z->present_pages = `0`;
1305	#if defined(CONFIG_MEMORY_HOTPLUG)
1306	z->present_early_pages = `0`;
1307	#endif
1308	}
1309
1310	pgdat->node_spanned_pages = `0`;
1311	pgdat->node_present_pages = `0`;
1312	pr_debug("On node %d totalpages: 0\n", pgdat->node_id);
1313	}
1314
1315	static void __init calc_nr_kernel_pages(void)
1316	{
1317	unsigned long start_pfn, end_pfn;
1318	phys_addr_t start_addr, end_addr;
1319	u64 u;
1320	#ifdef CONFIG_HIGHMEM
1321	unsigned long high_zone_low = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];
1322	#endif
1323
1324	for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {
1325	start_pfn = PFN_UP(start_addr);
1326	end_pfn = PFN_DOWN(end_addr);
1327
1328	if (start_pfn < end_pfn) {
1329	nr_all_pages += end_pfn - start_pfn;
1330	#ifdef CONFIG_HIGHMEM
1331	start_pfn = clamp(start_pfn, `0`, high_zone_low);
1332	end_pfn = clamp(end_pfn, `0`, high_zone_low);
1333	#endif
1334	nr_kernel_pages += end_pfn - start_pfn;
1335	}
1336	}
1337	}
1338
1339	static void __init calculate_node_totalpages(struct pglist_data *pgdat,
1340	unsigned long node_start_pfn,
1341	unsigned long node_end_pfn)
1342	{
1343	unsigned long realtotalpages = `0`, totalpages = `0`;
1344	enum zone_type i;
1345
1346	for (i = `0`; i < MAX_NR_ZONES; i++) {
1347	struct zone *zone = pgdat->node_zones + i;
1348	unsigned long zone_start_pfn, zone_end_pfn;
1349	unsigned long spanned, absent;
1350	unsigned long real_size;
1351
1352	spanned = zone_spanned_pages_in_node(nid: pgdat->node_id, zone_type: i,
1353	node_start_pfn,
1354	node_end_pfn,
1355	zone_start_pfn: &zone_start_pfn,
1356	zone_end_pfn: &zone_end_pfn);
1357	absent = zone_absent_pages_in_node(nid: pgdat->node_id, zone_type: i,
1358	zone_start_pfn,
1359	zone_end_pfn);
1360
1361	real_size = spanned - absent;
1362
1363	if (spanned)
1364	zone->zone_start_pfn = zone_start_pfn;
1365	else
1366	zone->zone_start_pfn = `0`;
1367	zone->spanned_pages = spanned;
1368	zone->present_pages = real_size;
1369	#if defined(CONFIG_MEMORY_HOTPLUG)
1370	zone->present_early_pages = real_size;
1371	#endif
1372
1373	totalpages += spanned;
1374	realtotalpages += real_size;
1375	}
1376
1377	pgdat->node_spanned_pages = totalpages;
1378	pgdat->node_present_pages = realtotalpages;
1379	pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
1380	}
1381
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Set up the lock, list head and length of @pgdat's deferred split queue. */
static void pgdat_init_split_queue(struct pglist_data *pgdat)
{
	struct deferred_split *ds_queue = &pgdat->deferred_split_queue;

	spin_lock_init(&ds_queue->split_queue_lock);
	INIT_LIST_HEAD(&ds_queue->split_queue);
	ds_queue->split_queue_len = 0;
}
#else
/* No deferred split queue without CONFIG_TRANSPARENT_HUGEPAGE. */
static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
#endif
1394
#ifdef CONFIG_COMPACTION
/* Initialise the kcompactd wait queue head embedded in @pgdat. */
static void pgdat_init_kcompactd(struct pglist_data *pgdat)
{
	init_waitqueue_head(&pgdat->kcompactd_wait);
}
#else
/* Nothing to set up when CONFIG_COMPACTION is disabled. */
static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
#endif
1403
1404	static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
1405	{
1406	int i;
1407
1408	pgdat_resize_init(pgdat);
1409	pgdat_kswapd_lock_init(pgdat);
1410
1411	pgdat_init_split_queue(pgdat);
1412	pgdat_init_kcompactd(pgdat);
1413
1414	init_waitqueue_head(&pgdat->kswapd_wait);
1415	init_waitqueue_head(&pgdat->pfmemalloc_wait);
1416
1417	for (i = `0`; i < NR_VMSCAN_THROTTLE; i++)
1418	init_waitqueue_head(&pgdat->reclaim_wait[i]);
1419
1420	pgdat_page_ext_init(pgdat);
1421	lruvec_init(lruvec: &pgdat->__lruvec);
1422	}
1423
1424	static void __meminit zone_init_internals(struct zone zone, enum* zone_type idx, int nid,
1425	unsigned long remaining_pages)
1426	{
1427	atomic_long_set(v: &zone->managed_pages, i: remaining_pages);
1428	zone_set_nid(zone, nid);
1429	zone->name = zone_names[idx];
1430	zone->zone_pgdat = NODE_DATA(nid);
1431	spin_lock_init(&zone->lock);
1432	zone_seqlock_init(zone);
1433	zone_pcp_init(zone);
1434	}
1435
1436	static void __meminit zone_init_free_lists(struct zone *zone)
1437	{
1438	unsigned int order, t;
1439	for_each_migratetype_order(order, t) {
1440	INIT_LIST_HEAD(list: &zone->free_area[order].free_list[t]);
1441	zone->free_area[order].nr_free = `0`;
1442	}
1443
1444	#ifdef CONFIG_UNACCEPTED_MEMORY
1445	INIT_LIST_HEAD(&zone->unaccepted_pages);
1446	#endif
1447	}
1448
1449	void __meminit init_currently_empty_zone(struct zone *zone,
1450	unsigned long zone_start_pfn,
1451	unsigned long size)
1452	{
1453	struct pglist_data *pgdat = zone->zone_pgdat;
1454	int zone_idx = zone_idx(zone) + `1`;
1455
1456	if (zone_idx > pgdat->nr_zones)
1457	pgdat->nr_zones = zone_idx;
1458
1459	zone->zone_start_pfn = zone_start_pfn;
1460
1461	mminit_dprintk(MMINIT_TRACE, "memmap_init",
1462	"Initialising map node %d zone %lu pfns %lu -> %lu\n",
1463	pgdat->node_id,
1464	(unsigned long)zone_idx(zone),
1465	zone_start_pfn, (zone_start_pfn + size));
1466
1467	zone_init_free_lists(zone);
1468	zone->initialized = `1`;
1469	}
1470
1471	#ifndef CONFIG_SPARSEMEM
1472	/*
1473	* Calculate the size of the zone->pageblock_flags rounded to an unsigned long
1474	* Start by making sure zonesize is a multiple of pageblock_order by rounding
1475	* up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
1476	* round what is now in bits to nearest long in bits, then return it in
1477	* bytes.
1478	*/
1479	static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
1480	{
1481	unsigned long usemapsize;
1482
1483	zonesize += zone_start_pfn & (pageblock_nr_pages-`1`);
1484	usemapsize = round_up(zonesize, pageblock_nr_pages);
1485	usemapsize = usemapsize >> pageblock_order;
1486	usemapsize *= NR_PAGEBLOCK_BITS;
1487	usemapsize = round_up(usemapsize, BITS_PER_LONG);
1488
1489	return usemapsize / BITS_PER_BYTE;
1490	}
1491
1492	static void __ref setup_usemap(struct zone *zone)
1493	{
1494	unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
1495	zone->spanned_pages);
1496	zone->pageblock_flags = NULL;
1497	if (usemapsize) {
1498	zone->pageblock_flags =
1499	memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
1500	zone_to_nid(zone));
1501	if (!zone->pageblock_flags)
1502	panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
1503	usemapsize, zone->name, zone_to_nid(zone));
1504	}
1505	}
1506	#else
1507	static inline void setup_usemap(struct zone *zone) {}
1508	#endif /* CONFIG_SPARSEMEM */
1509
1510	#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
1511
1512	/ Initialise the number of pages represented by NR_PAGEBLOCK_BITS /
1513	void __init set_pageblock_order(void)
1514	{
1515	unsigned int order = PAGE_BLOCK_MAX_ORDER;
1516
1517	/ Check that pageblock_nr_pages has not already been setup /
1518	if (pageblock_order)
1519	return;
1520
1521	/ Don't let pageblocks exceed the maximum allocation granularity. /
1522	if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
1523	order = HUGETLB_PAGE_ORDER;
1524
1525	/*
1526	* Assume the largest contiguous order of interest is a huge page.
1527	* This value may be variable depending on boot parameters on powerpc.
1528	*/
1529	pageblock_order = order;
1530	}
1531	#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
1532
1533	/*
1534	* When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
1535	* is unused as pageblock_order is set at compile-time. See
1536	* include/linux/pageblock-flags.h for the values of pageblock_order based on
1537	* the kernel config
1538	*/
1539	void __init set_pageblock_order(void)
1540	{
1541	}
1542
1543	#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
1544
/*
 * Set up the zone data structures
 * - init pgdat internals
 * - init all zones belonging to this node
 *
 * NOTE: this function is only called during memory hotplug
 */
#ifdef CONFIG_MEMORY_HOTPLUG
void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
{
	int nid = pgdat->node_id;
	enum zone_type z;
	int cpu;

	pgdat_init_internals(pgdat);

	/* Replace the shared boot-time stats with a per-cpu allocation. */
	if (pgdat->per_cpu_nodestats == &boot_nodestats)
		pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);

	/*
	 * Reset the nr_zones, order and highest_zoneidx before reuse.
	 * Note that kswapd will init kswapd_highest_zoneidx properly
	 * when it starts in the near future.
	 */
	pgdat->nr_zones = 0;
	pgdat->kswapd_order = 0;
	pgdat->kswapd_highest_zoneidx = 0;
	pgdat->node_start_pfn = 0;
	pgdat->node_present_pages = 0;

	for_each_online_cpu(cpu) {
		struct per_cpu_nodestat *p;

		p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
		memset(p, 0, sizeof(*p));
	}

	/*
	 * When memory is hot-added, all the memory is in offline state. So
	 * clear all zones' present_pages and managed_pages because they will
	 * be updated in online_pages() and offline_pages().
	 */
	for (z = 0; z < MAX_NR_ZONES; z++) {
		struct zone *zone = pgdat->node_zones + z;

		zone->present_pages = 0;
		zone_init_internals(zone, z, nid, 0);
	}
}
#endif
1595
1596	static void __init free_area_init_core(struct pglist_data *pgdat)
1597	{
1598	enum zone_type j;
1599	int nid = pgdat->node_id;
1600
1601	pgdat_init_internals(pgdat);
1602	pgdat->per_cpu_nodestats = &boot_nodestats;
1603
1604	for (j = `0`; j < MAX_NR_ZONES; j++) {
1605	struct zone *zone = pgdat->node_zones + j;
1606	unsigned long size = zone->spanned_pages;
1607
1608	/*
1609	* Initialize zone->managed_pages as 0 , it will be reset
1610	* when memblock allocator frees pages into buddy system.
1611	*/
1612	zone_init_internals(zone, idx: j, nid, remaining_pages: zone->present_pages);
1613
1614	if (!size)
1615	continue;
1616
1617	setup_usemap(zone);
1618	init_currently_empty_zone(zone, zone_start_pfn: zone->zone_start_pfn, size);
1619	}
1620	}
1621
1622	void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
1623	phys_addr_t min_addr, int nid, bool exact_nid)
1624	{
1625	void *ptr;
1626
1627	/*
1628	* Kmemleak will explicitly scan mem_map by traversing all valid
1629	* `struct *page`,so memblock does not need to be added to the scan list.
1630	*/
1631	if (exact_nid)
1632	ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
1633	MEMBLOCK_ALLOC_NOLEAKTRACE,
1634	nid);
1635	else
1636	ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
1637	MEMBLOCK_ALLOC_NOLEAKTRACE,
1638	nid);
1639
1640	if (ptr && size > `0`)
1641	page_init_poison(page: ptr, size);
1642
1643	return ptr;
1644	}
1645
#ifdef CONFIG_FLATMEM
/*
 * Allocate the memory map for @pgdat and publish it through
 * pgdat->node_mem_map, the global mem_map and max_mapnr. Does nothing for a
 * node that spans no pages; panics if the allocation fails.
 */
static void __init alloc_node_mem_map(struct pglist_data *pgdat)
{
	unsigned long start, offset, size, end;
	struct page *map;

	/* Skip empty nodes */
	if (!pgdat->node_spanned_pages)
		return;

	/* Round the start down to a MAX_ORDER_NR_PAGES boundary. */
	start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
	offset = pgdat->node_start_pfn - start;
	/*
	 * The zone's endpoints aren't required to be MAX_PAGE_ORDER
	 * aligned but the node_mem_map endpoints must be in order
	 * for the buddy allocator to function correctly.
	 */
	end = ALIGN(pgdat_end_pfn(pgdat), MAX_ORDER_NR_PAGES);
	size = (end - start) * sizeof(struct page);
	map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
			   pgdat->node_id, false);
	if (!map)
		/* size is unsigned long, so print it with %lu. */
		panic("Failed to allocate %lu bytes for node %d memory map\n",
		      size, pgdat->node_id);
	pgdat->node_mem_map = map + offset;
	memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE));
	pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
		 __func__, pgdat->node_id, (unsigned long)pgdat,
		 (unsigned long)pgdat->node_mem_map);

	/* the global mem_map is just set as node 0's */
	WARN_ON(pgdat != NODE_DATA(0));

	mem_map = pgdat->node_mem_map;
	if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
		mem_map -= offset;

	max_mapnr = end - start;
}
#else
/* Without CONFIG_FLATMEM there is no node memory map to allocate here. */
static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
#endif /* CONFIG_FLATMEM */
1688
1689	/**
1690	* get_pfn_range_for_nid - Return the start and end page frames for a node
1691	* @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
1692	* @start_pfn: Passed by reference. On return, it will have the node start_pfn.
1693	* @end_pfn: Passed by reference. On return, it will have the node end_pfn.
1694	*
1695	* It returns the start and end page frame of a node based on information
1696	* provided by memblock_set_node(). If called for a node
1697	* with no available memory, the start and end PFNs will be 0.
1698	*/
1699	void __init get_pfn_range_for_nid(unsigned int nid,
1700	unsigned long start_pfn, unsigned* long *end_pfn)
1701	{
1702	unsigned long this_start_pfn, this_end_pfn;
1703	int i;
1704
1705	*start_pfn = -`1UL`;
1706	*end_pfn = `0`;
1707
1708	for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
1709	start_pfn = min(start_pfn, this_start_pfn);
1710	end_pfn = max(end_pfn, this_end_pfn);
1711	}
1712
1713	if (*start_pfn == -`1UL`)
1714	*start_pfn = `0`;
1715	}
1716
1717	static void __init free_area_init_node(int nid)
1718	{
1719	pg_data_t *pgdat = NODE_DATA(nid);
1720	unsigned long start_pfn = `0`;
1721	unsigned long end_pfn = `0`;
1722
1723	/ pg_data_t should be reset to zero when it's allocated /
1724	WARN_ON(pgdat->nr_zones \|\| pgdat->kswapd_highest_zoneidx);
1725
1726	get_pfn_range_for_nid(nid, start_pfn: &start_pfn, end_pfn: &end_pfn);
1727
1728	pgdat->node_id = nid;
1729	pgdat->node_start_pfn = start_pfn;
1730	pgdat->per_cpu_nodestats = NULL;
1731
1732	if (start_pfn != end_pfn) {
1733	pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
1734	(u64)start_pfn << PAGE_SHIFT,
1735	end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - `1` : `0`);
1736
1737	calculate_node_totalpages(pgdat, node_start_pfn: start_pfn, node_end_pfn: end_pfn);
1738	} else {
1739	pr_info("Initmem setup node %d as memoryless\n", nid);
1740
1741	reset_memoryless_node_totalpages(pgdat);
1742	}
1743
1744	alloc_node_mem_map(pgdat);
1745	pgdat_set_deferred_range(pgdat);
1746
1747	free_area_init_core(pgdat);
1748	lru_gen_init_pgdat(pgdat);
1749	}
1750
1751	/ Any regular or high memory on that node ? /
1752	static void __init check_for_memory(pg_data_t *pgdat)
1753	{
1754	enum zone_type zone_type;
1755
1756	for (zone_type = `0`; zone_type <= ZONE_MOVABLE - `1`; zone_type++) {
1757	struct zone *zone = &pgdat->node_zones[zone_type];
1758	if (populated_zone(zone)) {
1759	if (IS_ENABLED(CONFIG_HIGHMEM))
1760	node_set_state(node: pgdat->node_id, state: N_HIGH_MEMORY);
1761	if (zone_type <= ZONE_NORMAL)
1762	node_set_state(node: pgdat->node_id, state: N_NORMAL_MEMORY);
1763	break;
1764	}
1765	}
1766	}
1767
#if MAX_NUMNODES > 1
/*
 * Figure out the number of possible node ids: one more than the highest bit
 * set in node_possible_map.
 */
void __init setup_nr_node_ids(void)
{
	unsigned int highest;

	highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
	nr_node_ids = highest + 1;
}
#endif
1780
1781	/*
1782	* Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
1783	* such cases we allow max_zone_pfn sorted in the descending order
1784	*/
1785	static bool arch_has_descending_max_zone_pfns(void)
1786	{
1787	return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40);
1788	}
1789
1790	static void __init set_high_memory(void)
1791	{
1792	phys_addr_t highmem = memblock_end_of_DRAM();
1793
1794	/*
1795	* Some architectures (e.g. ARM) set high_memory very early and
1796	* use it in arch setup code.
1797	* If an architecture already set high_memory don't overwrite it
1798	*/
1799	if (high_memory)
1800	return;
1801
1802	#ifdef CONFIG_HIGHMEM
1803	if (arch_has_descending_max_zone_pfns() \|\|
1804	highmem > PFN_PHYS(arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]))
1805	highmem = PFN_PHYS(arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]);
1806	#endif
1807
1808	high_memory = phys_to_virt(address: highmem - `1`) + `1`;
1809	}
1810
1811	/**
1812	* free_area_init - Initialise all pg_data_t and zone data
1813	* @max_zone_pfn: an array of max PFNs for each zone
1814	*
1815	* This will call free_area_init_node() for each active node in the system.
1816	* Using the page ranges provided by memblock_set_node(), the size of each
1817	* zone in each node and their holes is calculated. If the maximum PFN
1818	* between two adjacent zones match, it is assumed that the zone is empty.
1819	* For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
1820	* that arch_max_dma32_pfn has no pages. It is also assumed that a zone
1821	* starts where the previous one ended. For example, ZONE_DMA32 starts
1822	* at arch_max_dma_pfn.
1823	*/
1824	void __init free_area_init(unsigned long *max_zone_pfn)
1825	{
1826	unsigned long start_pfn, end_pfn;
1827	int i, nid, zone;
1828	bool descending;
1829
1830	/ Record where the zone boundaries are /
1831	memset(s: arch_zone_lowest_possible_pfn, c: `0`,
1832	n: sizeof(arch_zone_lowest_possible_pfn));
1833	memset(s: arch_zone_highest_possible_pfn, c: `0`,
1834	n: sizeof(arch_zone_highest_possible_pfn));
1835
1836	start_pfn = PHYS_PFN(memblock_start_of_DRAM());
1837	descending = arch_has_descending_max_zone_pfns();
1838
1839	for (i = `0`; i < MAX_NR_ZONES; i++) {
1840	if (descending)
1841	zone = MAX_NR_ZONES - i - `1`;
1842	else
1843	zone = i;
1844
1845	if (zone == ZONE_MOVABLE)
1846	continue;
1847
1848	end_pfn = max(max_zone_pfn[zone], start_pfn);
1849	arch_zone_lowest_possible_pfn[zone] = start_pfn;
1850	arch_zone_highest_possible_pfn[zone] = end_pfn;
1851
1852	start_pfn = end_pfn;
1853	}
1854
1855	/ Find the PFNs that ZONE_MOVABLE begins at in each node /
1856	memset(s: zone_movable_pfn, c: `0`, n: sizeof(zone_movable_pfn));
1857	find_zone_movable_pfns_for_nodes();
1858
1859	/ Print out the zone ranges /
1860	pr_info("Zone ranges:\n");
1861	for (i = `0`; i < MAX_NR_ZONES; i++) {
1862	if (i == ZONE_MOVABLE)
1863	continue;
1864	pr_info(" %-8s ", zone_names[i]);
1865	if (arch_zone_lowest_possible_pfn[i] ==
1866	arch_zone_highest_possible_pfn[i])
1867	pr_cont("empty\n");
1868	else
1869	pr_cont("[mem %#018Lx-%#018Lx]\n",
1870	(u64)arch_zone_lowest_possible_pfn[i]
1871	<< PAGE_SHIFT,
1872	((u64)arch_zone_highest_possible_pfn[i]
1873	<< PAGE_SHIFT) - `1`);
1874	}
1875
1876	/ Print out the PFNs ZONE_MOVABLE begins at in each node /
1877	pr_info("Movable zone start for each node\n");
1878	for (i = `0`; i < MAX_NUMNODES; i++) {
1879	if (zone_movable_pfn[i])
1880	pr_info(" Node %d: %#018Lx\n", i,
1881	(u64)zone_movable_pfn[i] << PAGE_SHIFT);
1882	}
1883
1884	/*
1885	* Print out the early node map, and initialize the
1886	* subsection-map relative to active online memory ranges to
1887	* enable future "sub-section" extensions of the memory map.
1888	*/
1889	pr_info("Early memory node ranges\n");
1890	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
1891	pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
1892	(u64)start_pfn << PAGE_SHIFT,
1893	((u64)end_pfn << PAGE_SHIFT) - `1`);
1894	subsection_map_init(pfn: start_pfn, nr_pages: end_pfn - start_pfn);
1895	}
1896
1897	/ Initialise every node /
1898	mminit_verify_pageflags_layout();
1899	setup_nr_node_ids();
1900	set_pageblock_order();
1901
1902	for_each_node(nid) {
1903	pg_data_t *pgdat;
1904
1905	if (!node_online(nid))
1906	alloc_offline_node_data(nid);
1907
1908	pgdat = NODE_DATA(nid);
1909	free_area_init_node(nid);
1910
1911	/*
1912	* No sysfs hierarchy will be created via register_one_node()
1913	*for memory-less node because here it's not marked as N_MEMORY
1914	*and won't be set online later. The benefit is userspace
1915	*program won't be confused by sysfs files/directories of
1916	*memory-less node. The pgdat will get fully initialized by
1917	*hotadd_init_pgdat() when memory is hotplugged into this node.
1918	*/
1919	if (pgdat->node_present_pages) {
1920	node_set_state(node: nid, state: N_MEMORY);
1921	check_for_memory(pgdat);
1922	}
1923	}
1924
1925	for_each_node_state(nid, N_MEMORY)
1926	sparse_vmemmap_init_nid_late(nid);
1927
1928	calc_nr_kernel_pages();
1929	memmap_init();
1930
1931	/ disable hash distribution for systems with a single node /
1932	fixup_hashdist();
1933
1934	set_high_memory();
1935	}
1936
1937	/**
1938	* node_map_pfn_alignment - determine the maximum internode alignment
1939	*
1940	* This function should be called after node map is populated and sorted.
1941	* It calculates the maximum power of two alignment which can distinguish
1942	* all the nodes.
1943	*
1944	* For example, if all nodes are 1GiB and aligned to 1GiB, the return value
1945	* would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
1946	* nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
1947	* shifted, 1GiB is enough and this function will indicate so.
1948	*
1949	* This is used to test whether pfn -> nid mapping of the chosen memory
1950	* model has fine enough granularity to avoid incorrect mapping for the
1951	* populated node map.
1952	*
1953	* Return: the determined alignment in pfn's. 0 if there is no alignment
1954	* requirement (single node).
1955	*/
1956	unsigned long __init node_map_pfn_alignment(void)
1957	{
1958	unsigned long accl_mask = `0`, last_end = `0`;
1959	unsigned long start, end, mask;
1960	int last_nid = NUMA_NO_NODE;
1961	int i, nid;
1962
1963	for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
1964	if (!start \|\| last_nid < `0` \|\| last_nid == nid) {
1965	last_nid = nid;
1966	last_end = end;
1967	continue;
1968	}
1969
1970	/*
1971	* Start with a mask granular enough to pin-point to the
1972	* start pfn and tick off bits one-by-one until it becomes
1973	* too coarse to separate the current node from the last.
1974	*/
1975	mask = ~((`1` << __ffs(start)) - `1`);
1976	while (mask && last_end <= (start & (mask << `1`)))
1977	mask <<= `1`;
1978
1979	/ accumulate all internode masks /
1980	accl_mask \|= mask;
1981	}
1982
1983	/ convert mask to number of pages /
1984	return ~accl_mask + `1`;
1985	}
1986
1987	#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1988	static void __init deferred_free_pages(unsigned long pfn,
1989	unsigned long nr_pages)
1990	{
1991	struct page *page;
1992	unsigned long i;
1993
1994	if (!nr_pages)
1995	return;
1996
1997	page = pfn_to_page(pfn);
1998
1999	/ Free a large naturally-aligned chunk if possible /
2000	if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
2001	for (i = `0`; i < nr_pages; i += pageblock_nr_pages)
2002	init_pageblock_migratetype(page + i, MIGRATE_MOVABLE,
2003	false);
2004	__free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY);
2005	return;
2006	}
2007
2008	/ Accept chunks smaller than MAX_PAGE_ORDER upfront /
2009	accept_memory(PFN_PHYS(pfn), nr_pages * PAGE_SIZE);
2010
2011	for (i = `0`; i < nr_pages; i++, page++, pfn++) {
2012	if (pageblock_aligned(pfn))
2013	init_pageblock_migratetype(page, MIGRATE_MOVABLE,
2014	false);
2015	__free_pages_core(page, `0`, MEMINIT_EARLY);
2016	}
2017	}
2018
/*
 * Completion tracking for deferred_init_memmap() threads: a count of
 * outstanding work and the completion signalled when it reaches zero.
 */
static atomic_t pgdat_init_n_undone __initdata;
static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
2022
2023	static inline void __init pgdat_init_report_one_done(void)
2024	{
2025	if (atomic_dec_and_test(&pgdat_init_n_undone))
2026	complete(&pgdat_init_all_done_comp);
2027	}
2028
2029	/*
2030	* Initialize struct pages. We minimize pfn page lookups and scheduler checks
2031	* by performing it only once every MAX_ORDER_NR_PAGES.
2032	* Return number of pages initialized.
2033	*/
2034	static unsigned long __init deferred_init_pages(struct zone *zone,
2035	unsigned long pfn, unsigned long end_pfn)
2036	{
2037	int nid = zone_to_nid(zone);
2038	unsigned long nr_pages = end_pfn - pfn;
2039	int zid = zone_idx(zone);
2040	struct page *page = pfn_to_page(pfn);
2041
2042	for (; pfn < end_pfn; pfn++, page++)
2043	__init_single_page(page, pfn, zid, nid);
2044	return nr_pages;
2045	}
2046
2047	/*
2048	* Initialize and free pages.
2049	*
2050	* At this point reserved pages and struct pages that correspond to holes in
2051	* memblock.memory are already intialized so every free range has a valid
2052	* memory map around it.
2053	* This ensures that access of pages that are ahead of the range being
2054	* initialized (computing buddy page in __free_one_page()) always reads a valid
2055	* struct page.
2056	*
2057	* In order to try and improve CPU cache locality we have the loop broken along
2058	* max page order boundaries.
2059	*/
2060	static unsigned long __init
2061	deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
2062	struct zone *zone)
2063	{
2064	int nid = zone_to_nid(zone);
2065	unsigned long nr_pages = `0`;
2066	phys_addr_t start, end;
2067	u64 i = `0`;
2068
2069	for_each_free_mem_range(i, nid, `0`, &start, &end, NULL) {
2070	unsigned long spfn = PFN_UP(start);
2071	unsigned long epfn = PFN_DOWN(end);
2072
2073	if (spfn >= end_pfn)
2074	break;
2075
2076	spfn = max(spfn, start_pfn);
2077	epfn = min(epfn, end_pfn);
2078
2079	while (spfn < epfn) {
2080	unsigned long mo_pfn = ALIGN(spfn + `1`, MAX_ORDER_NR_PAGES);
2081	unsigned long chunk_end = min(mo_pfn, epfn);
2082
2083	nr_pages += deferred_init_pages(zone, spfn, chunk_end);
2084	deferred_free_pages(spfn, chunk_end - spfn);
2085
2086	spfn = chunk_end;
2087
2088	if (irqs_disabled())
2089	touch_nmi_watchdog();
2090	else
2091	cond_resched();
2092	}
2093	}
2094
2095	return nr_pages;
2096	}
2097
2098	static void __init
2099	deferred_init_memmap_job(unsigned long start_pfn, unsigned long end_pfn,
2100	void *arg)
2101	{
2102	struct zone *zone = arg;
2103
2104	deferred_init_memmap_chunk(start_pfn, end_pfn, zone);
2105	}
2106
2107	static unsigned int __init
2108	deferred_page_init_max_threads(const struct cpumask *node_cpumask)
2109	{
2110	return max(cpumask_weight(node_cpumask), `1U`);
2111	}
2112
2113	/ Initialise remaining memory on a node /
2114	static int __init deferred_init_memmap(void *data)
2115	{
2116	pg_data_t *pgdat = data;
2117	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
2118	int max_threads = deferred_page_init_max_threads(cpumask);
2119	unsigned long first_init_pfn, last_pfn, flags;
2120	unsigned long start = jiffies;
2121	struct zone *zone;
2122
2123	/ Bind memory initialisation thread to a local node if possible /
2124	if (!cpumask_empty(cpumask))
2125	set_cpus_allowed_ptr(current, cpumask);
2126
2127	pgdat_resize_lock(pgdat, &flags);
2128	first_init_pfn = pgdat->first_deferred_pfn;
2129	if (first_init_pfn == ULONG_MAX) {
2130	pgdat_resize_unlock(pgdat, &flags);
2131	pgdat_init_report_one_done();
2132	return `0`;
2133	}
2134
2135	/ Sanity check boundaries /
2136	BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
2137	BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
2138	pgdat->first_deferred_pfn = ULONG_MAX;
2139
2140	/*
2141	* Once we unlock here, the zone cannot be grown anymore, thus if an
2142	* interrupt thread must allocate this early in boot, zone must be
2143	* pre-grown prior to start of deferred page initialization.
2144	*/
2145	pgdat_resize_unlock(pgdat, &flags);
2146
2147	/ Only the highest zone is deferred /
2148	zone = pgdat->node_zones + pgdat->nr_zones - `1`;
2149	last_pfn = SECTION_ALIGN_UP(zone_end_pfn(zone));
2150
2151	struct padata_mt_job job = {
2152	.thread_fn = deferred_init_memmap_job,
2153	.fn_arg = zone,
2154	.start = first_init_pfn,
2155	.size = last_pfn - first_init_pfn,
2156	.align = PAGES_PER_SECTION,
2157	.min_chunk = PAGES_PER_SECTION,
2158	.max_threads = max_threads,
2159	.numa_aware = false,
2160	};
2161
2162	padata_do_multithreaded(&job);
2163
2164	/ Sanity check that the next zone really is unpopulated /
2165	WARN_ON(pgdat->nr_zones < MAX_NR_ZONES && populated_zone(++zone));
2166
2167	pr_info("node %d deferred pages initialised in %ums\n",
2168	pgdat->node_id, jiffies_to_msecs(jiffies - start));
2169
2170	pgdat_init_report_one_done();
2171	return `0`;
2172	}
2173
2174	/*
2175	* If this zone has deferred pages, try to grow it by initializing enough
2176	* deferred pages to satisfy the allocation specified by order, rounded up to
2177	* the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
2178	* of SECTION_SIZE bytes by initializing struct pages in increments of
2179	* PAGES_PER_SECTION * sizeof(struct page) bytes.
2180	*
2181	* Return true when zone was grown, otherwise return false. We return true even
2182	* when we grow less than requested, to let the caller decide if there are
2183	* enough pages to satisfy the allocation.
2184	*/
2185	bool __init deferred_grow_zone(struct zone zone, unsigned* int order)
2186	{
2187	unsigned long nr_pages_needed = SECTION_ALIGN_UP(`1` << order);
2188	pg_data_t *pgdat = zone->zone_pgdat;
2189	unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
2190	unsigned long spfn, epfn, flags;
2191	unsigned long nr_pages = `0`;
2192
2193	/ Only the last zone may have deferred pages /
2194	if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
2195	return false;
2196
2197	pgdat_resize_lock(pgdat, &flags);
2198
2199	/*
2200	* If someone grew this zone while we were waiting for spinlock, return
2201	* true, as there might be enough pages already.
2202	*/
2203	if (first_deferred_pfn != pgdat->first_deferred_pfn) {
2204	pgdat_resize_unlock(pgdat, &flags);
2205	return true;
2206	}
2207
2208	/*
2209	* Initialize at least nr_pages_needed in section chunks.
2210	* If a section has less free memory than nr_pages_needed, the next
2211	* section will be also initialized.
2212	* Note, that it still does not guarantee that allocation of order can
2213	* be satisfied if the sections are fragmented because of memblock
2214	* allocations.
2215	*/
2216	for (spfn = first_deferred_pfn, epfn = SECTION_ALIGN_UP(spfn + `1`);
2217	nr_pages < nr_pages_needed && spfn < zone_end_pfn(zone);
2218	spfn = epfn, epfn += PAGES_PER_SECTION) {
2219	nr_pages += deferred_init_memmap_chunk(spfn, epfn, zone);
2220	}
2221
2222	/*
2223	* There were no pages to initialize and free which means the zone's
2224	* memory map is completely initialized.
2225	*/
2226	pgdat->first_deferred_pfn = nr_pages ? spfn : ULONG_MAX;
2227
2228	pgdat_resize_unlock(pgdat, &flags);
2229
2230	return nr_pages > `0`;
2231	}
2232
2233	#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
2234
2235	#ifdef CONFIG_CMA
2236	void __init init_cma_reserved_pageblock(struct page *page)
2237	{
2238	unsigned i = pageblock_nr_pages;
2239	struct page *p = page;
2240
2241	do {
2242	__ClearPageReserved(p);
2243	set_page_count(p, `0`);
2244	} while (++p, --i);
2245
2246	init_pageblock_migratetype(page, MIGRATE_CMA, false);
2247	set_page_refcounted(page);
2248	/ pages were reserved and not allocated /
2249	clear_page_tag_ref(page);
2250	__free_pages(page, pageblock_order);
2251
2252	adjust_managed_page_count(page, pageblock_nr_pages);
2253	page_zone(page)->cma_pages += pageblock_nr_pages;
2254	}
2255	/*
2256	* Similar to above, but only set the migrate type and stats.
2257	*/
2258	void __init init_cma_pageblock(struct page *page)
2259	{
2260	init_pageblock_migratetype(page, MIGRATE_CMA, false);
2261	adjust_managed_page_count(page, pageblock_nr_pages);
2262	page_zone(page)->cma_pages += pageblock_nr_pages;
2263	}
2264	#endif
2265
2266	void set_zone_contiguous(struct zone *zone)
2267	{
2268	unsigned long block_start_pfn = zone->zone_start_pfn;
2269	unsigned long block_end_pfn;
2270
2271	block_end_pfn = pageblock_end_pfn(block_start_pfn);
2272	for (; block_start_pfn < zone_end_pfn(zone);
2273	block_start_pfn = block_end_pfn,
2274	block_end_pfn += pageblock_nr_pages) {
2275
2276	block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
2277
2278	if (!__pageblock_pfn_to_page(start_pfn: block_start_pfn,
2279	end_pfn: block_end_pfn, zone))
2280	return;
2281	cond_resched();
2282	}
2283
2284	/ We confirm that there is no hole /
2285	zone->contiguous = true;
2286	}
2287
2288	/*
2289	* Check if a PFN range intersects multiple zones on one or more
2290	* NUMA nodes. Specify the @nid argument if it is known that this
2291	* PFN range is on one node, NUMA_NO_NODE otherwise.
2292	*/
2293	bool pfn_range_intersects_zones(int nid, unsigned long start_pfn,
2294	unsigned long nr_pages)
2295	{
2296	struct zone zone, izone = NULL;
2297
2298	for_each_zone(zone) {
2299	if (nid != NUMA_NO_NODE && zone_to_nid(zone) != nid)
2300	continue;
2301
2302	if (zone_intersects(zone, start_pfn, nr_pages)) {
2303	if (izone != NULL)
2304	return true;
2305	izone = zone;
2306	}
2307
2308	}
2309
2310	return false;
2311	}
2312
2313	static void __init mem_init_print_info(void);
2314	void __init page_alloc_init_late(void)
2315	{
2316	struct zone *zone;
2317	int nid;
2318
2319	#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
2320
2321	/ There will be num_node_state(N_MEMORY) threads /
2322	atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
2323	for_each_node_state(nid, N_MEMORY) {
2324	kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
2325	}
2326
2327	/ Block until all are initialised /
2328	wait_for_completion(&pgdat_init_all_done_comp);
2329
2330	/*
2331	* We initialized the rest of the deferred pages. Permanently disable
2332	* on-demand struct page initialization.
2333	*/
2334	static_branch_disable(&deferred_pages);
2335
2336	/ Reinit limits that are based on free pages after the kernel is up /
2337	files_maxfiles_init();
2338	#endif
2339
2340	/ Accounting of total+free memory is stable at this point. /
2341	mem_init_print_info();
2342	buffer_init();
2343
2344	/ Discard memblock private memory /
2345	memblock_discard();
2346
2347	for_each_node_state(nid, N_MEMORY)
2348	shuffle_free_memory(NODE_DATA(nid));
2349
2350	for_each_populated_zone(zone)
2351	set_zone_contiguous(zone);
2352
2353	/ Initialize page ext after all struct pages are initialized. /
2354	if (deferred_struct_pages)
2355	page_ext_init();
2356
2357	page_alloc_sysctl_init();
2358	}
2359
2360	/*
2361	* Adaptive scale is meant to reduce sizes of hash tables on large memory
2362	* machines. As memory size is increased the scale is also increased but at
2363	* slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
2364	* quadruples the scale is increased by one, which means the size of hash table
2365	* only doubles, instead of quadrupling as well.
2366	* Because 32-bit systems cannot have large physical memory, where this scaling
2367	* makes sense, it is disabled on such platforms.
2368	*/
2369	#if __BITS_PER_LONG > 32
2370	#define ADAPT_SCALE_BASE (64ul << 30)
2371	#define ADAPT_SCALE_SHIFT 2
2372	#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
2373	#endif
2374
2375	/*
2376	* allocate a large system hash table from bootmem
2377	* - it is assumed that the hash table must contain an exact power-of-2
2378	* quantity of entries
2379	* - limit is the number of hash buckets, not the total allocation size
2380	*/
2381	void __init alloc_large_system_hash(const* char *tablename,
2382	unsigned long bucketsize,
2383	unsigned long numentries,
2384	int scale,
2385	int flags,
2386	unsigned int *_hash_shift,
2387	unsigned int *_hash_mask,
2388	unsigned long low_limit,
2389	unsigned long high_limit)
2390	{
2391	unsigned long long max = high_limit;
2392	unsigned long log2qty, size;
2393	void *table;
2394	gfp_t gfp_flags;
2395	bool virt;
2396	bool huge;
2397
2398	/ allow the kernel cmdline to have a say /
2399	if (!numentries) {
2400	/ round applicable memory size up to nearest megabyte /
2401	numentries = nr_kernel_pages;
2402
2403	/ It isn't necessary when PAGE_SIZE >= 1MB /
2404	if (PAGE_SIZE < SZ_1M)
2405	numentries = round_up(numentries, SZ_1M / PAGE_SIZE);
2406
2407	#if __BITS_PER_LONG > 32
2408	if (!high_limit) {
2409	unsigned long adapt;
2410
2411	for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
2412	adapt <<= ADAPT_SCALE_SHIFT)
2413	scale++;
2414	}
2415	#endif
2416
2417	/ limit to 1 bucket per 2^scale bytes of low memory /
2418	if (scale > PAGE_SHIFT)
2419	numentries >>= (scale - PAGE_SHIFT);
2420	else
2421	numentries <<= (PAGE_SHIFT - scale);
2422
2423	if (unlikely((numentries * bucketsize) < PAGE_SIZE))
2424	numentries = PAGE_SIZE / bucketsize;
2425	}
2426	numentries = roundup_pow_of_two(numentries);
2427
2428	/ limit allocation size to 1/16 total memory by default /
2429	if (max == `0`) {
2430	max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> `4`;
2431	do_div(max, bucketsize);
2432	}
2433	max = min(max, `0x80000000ULL`);
2434
2435	if (numentries < low_limit)
2436	numentries = low_limit;
2437	if (numentries > max)
2438	numentries = max;
2439
2440	log2qty = ilog2(numentries);
2441
2442	gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC \| __GFP_ZERO : GFP_ATOMIC;
2443	do {
2444	virt = false;
2445	size = bucketsize << log2qty;
2446	if (flags & HASH_EARLY) {
2447	if (flags & HASH_ZERO)
2448	table = memblock_alloc(size, SMP_CACHE_BYTES);
2449	else
2450	table = memblock_alloc_raw(size,
2451	SMP_CACHE_BYTES);
2452	} else if (get_order(size) > MAX_PAGE_ORDER \|\| hashdist) {
2453	table = vmalloc_huge(size, gfp_mask: gfp_flags);
2454	virt = true;
2455	if (table)
2456	huge = is_vm_area_hugepages(addr: table);
2457	} else {
2458	/*
2459	* If bucketsize is not a power-of-two, we may free
2460	* some pages at the end of hash table which
2461	* alloc_pages_exact() automatically does
2462	*/
2463	table = alloc_pages_exact(size, gfp_flags);
2464	kmemleak_alloc(ptr: table, size, min_count: `1`, gfp: gfp_flags);
2465	}
2466	} while (!table && size > PAGE_SIZE && --log2qty);
2467
2468	if (!table)
2469	panic(fmt: "Failed to allocate %s hash table\n", tablename);
2470
2471	pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
2472	tablename, `1UL` << log2qty, ilog2(size) - PAGE_SHIFT, size,
2473	virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
2474
2475	if (_hash_shift)
2476	*_hash_shift = log2qty;
2477	if (_hash_mask)
2478	*_hash_mask = (`1` << log2qty) - `1`;
2479
2480	return table;
2481	}
2482
2483	void __init memblock_free_pages(struct page page, unsigned* long pfn,
2484	unsigned int order)
2485	{
2486	if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) {
2487	int nid = early_pfn_to_nid(pfn);
2488
2489	if (!early_page_initialised(pfn, nid))
2490	return;
2491	}
2492
2493	if (!kmsan_memblock_free_pages(page, order)) {
2494	/ KMSAN will take care of these pages. /
2495	return;
2496	}
2497
2498	/ pages were reserved and not allocated /
2499	clear_page_tag_ref(page);
2500	__free_pages_core(page, order, context: MEMINIT_EARLY);
2501	}
2502
2503	DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
2504	EXPORT_SYMBOL(init_on_alloc);
2505
2506	DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
2507	EXPORT_SYMBOL(init_on_free);
2508
2509	static bool _init_on_alloc_enabled_early __read_mostly
2510	= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
2511	static int __init early_init_on_alloc(char *buf)
2512	{
2513
2514	return kstrtobool(s: buf, res: &_init_on_alloc_enabled_early);
2515	}
2516	early_param("init_on_alloc", early_init_on_alloc);
2517
2518	static bool _init_on_free_enabled_early __read_mostly
2519	= IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
2520	static int __init early_init_on_free(char *buf)
2521	{
2522	return kstrtobool(s: buf, res: &_init_on_free_enabled_early);
2523	}
2524	early_param("init_on_free", early_init_on_free);
2525
2526	DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
2527
2528	/*
2529	* Enable static keys related to various memory debugging and hardening options.
2530	* Some override others, and depend on early params that are evaluated in the
2531	* order of appearance. So we need to first gather the full picture of what was
2532	* enabled, and then make decisions.
2533	*/
2534	static void __init mem_debugging_and_hardening_init(void)
2535	{
2536	bool page_poisoning_requested = false;
2537	bool want_check_pages = false;
2538
2539	#ifdef CONFIG_PAGE_POISONING
2540	/*
2541	* Page poisoning is debug page alloc for some arches. If
2542	* either of those options are enabled, enable poisoning.
2543	*/
2544	if (page_poisoning_enabled() \|\|
2545	(!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
2546	debug_pagealloc_enabled())) {
2547	static_branch_enable(&_page_poisoning_enabled);
2548	page_poisoning_requested = true;
2549	want_check_pages = true;
2550	}
2551	#endif
2552
2553	if ((_init_on_alloc_enabled_early \|\| _init_on_free_enabled_early) &&
2554	page_poisoning_requested) {
2555	pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
2556	"will take precedence over init_on_alloc and init_on_free\n");
2557	_init_on_alloc_enabled_early = false;
2558	_init_on_free_enabled_early = false;
2559	}
2560
2561	if (_init_on_alloc_enabled_early) {
2562	want_check_pages = true;
2563	static_branch_enable(&init_on_alloc);
2564	} else {
2565	static_branch_disable(&init_on_alloc);
2566	}
2567
2568	if (_init_on_free_enabled_early) {
2569	want_check_pages = true;
2570	static_branch_enable(&init_on_free);
2571	} else {
2572	static_branch_disable(&init_on_free);
2573	}
2574
2575	if (IS_ENABLED(CONFIG_KMSAN) &&
2576	(_init_on_alloc_enabled_early \|\| _init_on_free_enabled_early))
2577	pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
2578
2579	#ifdef CONFIG_DEBUG_PAGEALLOC
2580	if (debug_pagealloc_enabled()) {
2581	want_check_pages = true;
2582	static_branch_enable(&_debug_pagealloc_enabled);
2583
2584	if (debug_guardpage_minorder())
2585	static_branch_enable(&_debug_guardpage_enabled);
2586	}
2587	#endif
2588
2589	/*
2590	* Any page debugging or hardening option also enables sanity checking
2591	* of struct pages being allocated or freed. With CONFIG_DEBUG_VM it's
2592	* enabled already.
2593	*/
2594	if (!IS_ENABLED(CONFIG_DEBUG_VM) && want_check_pages)
2595	static_branch_enable(&check_pages_enabled);
2596	}
2597
2598	/ Report memory auto-initialization states for this boot. /
2599	static void __init report_meminit(void)
2600	{
2601	const char *stack;
2602
2603	if (IS_ENABLED(CONFIG_INIT_STACK_ALL_PATTERN))
2604	stack = "all(pattern)";
2605	else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
2606	stack = "all(zero)";
2607	else
2608	stack = "off";
2609
2610	pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n",
2611	stack, str_on_off(want_init_on_alloc(GFP_KERNEL)),
2612	str_on_off(want_init_on_free()));
2613	if (want_init_on_free())
2614	pr_info("mem auto-init: clearing system memory may take some time...\n");
2615	}
2616
2617	static void __init mem_init_print_info(void)
2618	{
2619	unsigned long physpages, codesize, datasize, rosize, bss_size;
2620	unsigned long init_code_size, init_data_size;
2621
2622	physpages = get_num_physpages();
2623	codesize = _etext - _stext;
2624	datasize = _edata - _sdata;
2625	rosize = __end_rodata - __start_rodata;
2626	bss_size = __bss_stop - __bss_start;
2627	init_data_size = __init_end - __init_begin;
2628	init_code_size = _einittext - _sinittext;
2629
2630	/*
2631	* Detect special cases and adjust section sizes accordingly:
2632	* 1) .init.* may be embedded into .data sections
2633	* 2) .init.text.* may be out of [__init_begin, __init_end],
2634	* please refer to arch/tile/kernel/vmlinux.lds.S.
2635	* 3) .rodata.* may be embedded into .text or .data sections.
2636	*/
2637	#define adj_init_size(start, end, size, pos, adj) \
2638	do { \
2639	if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
2640	size -= adj; \
2641	} while (0)
2642
2643	adj_init_size(__init_begin, __init_end, init_data_size,
2644	_sinittext, init_code_size);
2645	adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
2646	adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
2647	adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
2648	adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
2649
2650	#undef adj_init_size
2651
2652	pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
2653	#ifdef CONFIG_HIGHMEM
2654	", %luK highmem"
2655	#endif
2656	")\n",
2657	K(nr_free_pages()), K(physpages),
2658	codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K,
2659	(init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K,
2660	K(physpages - totalram_pages() - totalcma_pages),
2661	K(totalcma_pages)
2662	#ifdef CONFIG_HIGHMEM
2663	, K(totalhigh_pages())
2664	#endif
2665	);
2666	}
2667
2668	void __init __weak arch_mm_preinit(void)
2669	{
2670	}
2671
2672	void __init __weak mem_init(void)
2673	{
2674	}
2675
2676	/*
2677	* Set up kernel memory allocators
2678	*/
2679	void __init mm_core_init(void)
2680	{
2681	arch_mm_preinit();
2682	hugetlb_bootmem_alloc();
2683
2684	/ Initializations relying on SMP setup /
2685	BUILD_BUG_ON(MAX_ZONELISTS > `2`);
2686	build_all_zonelists(NULL);
2687	page_alloc_init_cpuhp();
2688	alloc_tag_sec_init();
2689	/*
2690	* page_ext requires contiguous pages,
2691	* bigger than MAX_PAGE_ORDER unless SPARSEMEM.
2692	*/
2693	page_ext_init_flatmem();
2694	mem_debugging_and_hardening_init();
2695	kfence_alloc_pool_and_metadata();
2696	report_meminit();
2697	kmsan_init_shadow();
2698	stack_depot_early_init();
2699
2700	/*
2701	* KHO memory setup must happen while memblock is still active, but
2702	* as close as possible to buddy initialization
2703	*/
2704	kho_memory_init();
2705
2706	memblock_free_all();
2707	mem_init();
2708	kmem_cache_init();
2709	/*
2710	* page_owner must be initialized after buddy is ready, and also after
2711	* slab is ready so that stack_depot_init() works properly
2712	*/
2713	page_ext_init_flatmem_late();
2714	kmemleak_init();
2715	ptlock_cache_init();
2716	pgtable_cache_init();
2717	debug_objects_mem_init();
2718	vmalloc_init();
2719	/ If no deferred init page_ext now, as vmap is fully initialized /
2720	if (!deferred_struct_pages)
2721	page_ext_init();
2722	/ Should be run before the first non-init thread is created /
2723	init_espfix_bsp();
2724	/ Should be run after espfix64 is set up. /
2725	pti_init();
2726	kmsan_init_runtime();
2727	mm_cache_init();
2728	execmem_init();
2729	}
2730

/* Browse the source code of Linux/mm/mm_init.c */