// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 *     Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt) "HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of PTEs walked.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			or are mapped from.
 * @flags:		used to modify behavior in vmemmap page table walking
 *			operations.
 */
struct vmemmap_remap_walk {
        void (*remap_pte)(pte_t *pte, unsigned long addr,
                          struct vmemmap_remap_walk *walk);
        unsigned long nr_walked;
        struct page *reuse_page;
        unsigned long reuse_addr;
        struct list_head *vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)
/* synchronize_rcu() to avoid writes from page_ref_add_unless() */
#define VMEMMAP_SYNCHRONIZE_RCU		BIT(2)
        unsigned long flags;
};

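/*
 * Split a vmemmap PMD leaf mapping into a PTE table: allocate a new page
 * table page, fill it with PTEs covering the same range (backed by @head),
 * and install it under init_mm's page_table_lock. The TLB flush is skipped
 * when VMEMMAP_SPLIT_NO_TLB_FLUSH is set so that callers can batch it.
 */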
static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
                             struct vmemmap_remap_walk *walk)
{
        pmd_t __pmd;
        int i;
        unsigned long addr = start;
        pte_t *pgtable;

        pgtable = pte_alloc_one_kernel(&init_mm);
        if (!pgtable)
                return -ENOMEM;

        pmd_populate_kernel(&init_mm, &__pmd, pgtable);

        for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
                pte_t entry, *pte;
                pgprot_t pgprot = PAGE_KERNEL;

                entry = mk_pte(head + i, pgprot);
                pte = pte_offset_kernel(&__pmd, addr);
                set_pte_at(&init_mm, addr, pte, entry);
        }

        spin_lock(&init_mm.page_table_lock);
        if (likely(pmd_leaf(*pmd))) {
                /*
                 * Higher order allocations from buddy allocator must be able to
                 * be treated as independent small pages (as they can be freed
                 * individually).
                 */
                if (!PageReserved(head))
                        split_page(head, get_order(PMD_SIZE));

                /* Make pte visible before pmd. See comment in pmd_install(). */
                smp_wmb();
                pmd_populate_kernel(&init_mm, pmd, pgtable);
                if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
                        flush_tlb_kernel_range(start, start + PMD_SIZE);
        } else {
                pte_free_kernel(&init_mm, pgtable);
        }
        spin_unlock(&init_mm.page_table_lock);

        return 0;
}

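/*
 * Per-PMD callback for the walk: bail out with -ENOTSUPP if the first
 * vmemmap page is self-hosted (memmap_on_memory), otherwise split a leaf
 * PMD so that the remap stage can operate on individual PTEs.
 */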
static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
                             unsigned long next, struct mm_walk *walk)
{
        int ret = 0;
        struct page *head;
        struct vmemmap_remap_walk *vmemmap_walk = walk->private;

        /* Only splitting, not remapping the vmemmap pages. */
        if (!vmemmap_walk->remap_pte)
                walk->action = ACTION_CONTINUE;

        spin_lock(&init_mm.page_table_lock);
        head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
        /*
         * Due to HugeTLB alignment requirements and the vmemmap pages being
         * at the start of the hotplugged memory region in the
         * memory_hotplug.memmap_on_memory case, checking whether the vmemmap
         * page associated with the first vmemmap page is self-hosted is
         * sufficient.
         *
         * [       hotplugged memory        ]
         * [ section ][...][    section     ]
         * [ vmemmap ][    usable memory    ]
         *   ^   |     ^                    |
         *   +---+     |                    |
         *             +--------------------+
         */
        if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
                struct page *page = head ? head + pte_index(addr) :
                                   pte_page(ptep_get(pte_offset_kernel(pmd, addr)));

                if (PageVmemmapSelfHosted(page))
                        ret = -ENOTSUPP;
        }
        spin_unlock(&init_mm.page_table_lock);
        if (!head || ret)
                return ret;

        return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
}

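/*
 * Per-PTE callback for the walk: the first PTE visited provides the reuse
 * page; every subsequent PTE is handed to the walk's remap_pte() callback.
 */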
static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
                             unsigned long next, struct mm_walk *walk)
{
        struct vmemmap_remap_walk *vmemmap_walk = walk->private;

        /*
         * The reuse_page is found 'first' in page table walking before
         * starting remapping.
         */
        if (!vmemmap_walk->reuse_page)
                vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
        else
                vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
        vmemmap_walk->nr_walked++;

        return 0;
}

static const struct mm_walk_ops vmemmap_remap_ops = {
        .pmd_entry = vmemmap_pmd_entry,
        .pte_entry = vmemmap_pte_entry,
};

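/*
 * Walk the kernel page tables for [@start, @end) under init_mm's mmap lock
 * and, when PTEs were remapped and VMEMMAP_REMAP_NO_TLB_FLUSH is not set,
 * flush the TLB for the range afterwards.
 */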
static int vmemmap_remap_range(unsigned long start, unsigned long end,
                               struct vmemmap_remap_walk *walk)
{
        int ret;

        VM_BUG_ON(!PAGE_ALIGNED(start | end));

        mmap_read_lock(&init_mm);
        ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops,
                                           NULL, walk);
        mmap_read_unlock(&init_mm);
        if (ret)
                return ret;

        if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
                flush_tlb_kernel_range(start, end);

        return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or buddy allocator. If the PG_reserved flag is set, it means
 * that it was allocated from the memblock allocator; free it via
 * free_bootmem_page(). Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
        if (PageReserved(page)) {
                memmap_boot_pages_add(-1);
                free_bootmem_page(page);
        } else {
                memmap_pages_add(-1);
                __free_page(page);
        }
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
        struct page *page, *next;

        list_for_each_entry_safe(page, next, list, lru)
                free_vmemmap_page(page);
}

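/*
 * remap_pte callback used when freeing vmemmap: point the PTE at the shared
 * reuse page (read-only for tail pages, read-write at the reuse address) and
 * move the page that previously backed the PTE onto walk->vmemmap_pages so
 * the caller can free it.
 */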
static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
                              struct vmemmap_remap_walk *walk)
{
        /*
         * Remap the tail pages as read-only to catch illegal write operations
         * to the tail pages.
         */
        pgprot_t pgprot = PAGE_KERNEL_RO;
        struct page *page = pte_page(ptep_get(pte));
        pte_t entry;

        /* Remapping the head page requires r/w */
        if (unlikely(addr == walk->reuse_addr)) {
                pgprot = PAGE_KERNEL;
                list_del(&walk->reuse_page->lru);

                /*
                 * Makes sure that preceding stores to the page contents from
                 * vmemmap_remap_free() become visible before the set_pte_at()
                 * write.
                 */
                smp_wmb();
        }

        entry = mk_pte(walk->reuse_page, pgprot);
        list_add(&page->lru, walk->vmemmap_pages);
        set_pte_at(&init_mm, addr, pte, entry);
}

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid values will
 * be caught by free_tail_page_prepare(). In order to avoid the message of
 * "corrupted mapping in tail page", we need to reset at least 4 struct page
 * structs (one head struct page and three tail struct pages).
 */
#define NR_RESET_STRUCT_PAGE	4

static inline void reset_struct_pages(struct page *start)
{
        struct page *from = start + NR_RESET_STRUCT_PAGE;

        BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
        memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}

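/*
 * remap_pte callback used when restoring vmemmap: take a freshly allocated
 * page from walk->vmemmap_pages, copy the shared reuse page's contents into
 * it, reset the first struct pages, and point the PTE back at the new page.
 */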
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
                                struct vmemmap_remap_walk *walk)
{
        pgprot_t pgprot = PAGE_KERNEL;
        struct page *page;
        void *to;

        BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

        page = list_first_entry(walk->vmemmap_pages, struct page, lru);
        list_del(&page->lru);
        to = page_to_virt(page);
        copy_page(to, (void *)walk->reuse_addr);
        reset_struct_pages(to);

        /*
         * Makes sure that preceding stores to the page contents become visible
         * before the set_pte_at() write.
         */
        smp_wmb();
        set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
 *                       backing PMDs of the directmap into PTEs
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_split(unsigned long start, unsigned long end,
                               unsigned long reuse)
{
        struct vmemmap_remap_walk walk = {
                .remap_pte = NULL,
                .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH,
        };

        /* See the comment in the vmemmap_remap_free(). */
        BUG_ON(start - reuse != PAGE_SIZE);

        return vmemmap_remap_range(reuse, end, &walk);
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *			to the page which @reuse is mapped to, then free the
 *			vmemmap pages which the range was mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @vmemmap_pages:	list to deposit vmemmap pages to be freed. It is the
 *			caller's responsibility to free the pages.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
                              unsigned long reuse,
                              struct list_head *vmemmap_pages,
                              unsigned long flags)
{
        int ret;
        struct vmemmap_remap_walk walk = {
                .remap_pte = vmemmap_remap_pte,
                .reuse_addr = reuse,
                .vmemmap_pages = vmemmap_pages,
                .flags = flags,
        };
        int nid = page_to_nid((struct page *)reuse);
        gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;

        /*
         * Allocate a new head vmemmap page to avoid breaking a contiguous
         * block of struct page memory when freeing it back to the page
         * allocator in free_vmemmap_page_list(). This will allow the likely
         * contiguous struct page backing memory to be kept contiguous and
         * allowing for more allocations of hugepages. Fall back to the
         * currently mapped head page should the allocation fail.
         */
        walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
        if (walk.reuse_page) {
                copy_page(page_to_virt(walk.reuse_page),
                          (void *)walk.reuse_addr);
                list_add(&walk.reuse_page->lru, vmemmap_pages);
                memmap_pages_add(1);
        }

        /*
         * In order to make remapping routine most efficient for the huge pages,
         * the routine of vmemmap page table walking has the following rules
         * (see more details from the vmemmap_pte_range()):
         *
         * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
         *   should be continuous.
         * - The @reuse address is part of the range [@reuse, @end) that we are
         *   walking which is passed to vmemmap_remap_range().
         * - The @reuse address is the first in the complete range.
         *
         * So we need to make sure that @start and @reuse meet the above rules.
         */
        BUG_ON(start - reuse != PAGE_SIZE);

        ret = vmemmap_remap_range(reuse, end, &walk);
        if (ret && walk.nr_walked) {
                end = reuse + walk.nr_walked * PAGE_SIZE;
                /*
                 * vmemmap_pages contains pages from the previous
                 * vmemmap_remap_range call which failed. These
                 * are pages which were removed from the vmemmap.
                 * They will be restored in the following call.
                 */
                walk = (struct vmemmap_remap_walk) {
                        .remap_pte = vmemmap_restore_pte,
                        .reuse_addr = reuse,
                        .vmemmap_pages = vmemmap_pages,
                        .flags = 0,
                };

                vmemmap_remap_range(reuse, end, &walk);
        }

        return ret;
}

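/*
 * Allocate one page per vmemmap page in [@start, @end) from the node that
 * backs @start and collect them on @list; on failure, free everything on
 * @list and return -ENOMEM.
 */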
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
                                   struct list_head *list)
{
        gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
        unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
        int nid = page_to_nid((struct page *)start);
        struct page *page, *next;
        int i;

        for (i = 0; i < nr_pages; i++) {
                page = alloc_pages_node(nid, gfp_mask, 0);
                if (!page)
                        goto out;
                list_add(&page->lru, list);
        }
        memmap_pages_add(nr_pages);

        return 0;
out:
        list_for_each_entry_safe(page, next, list, lru)
                __free_page(page);
        return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *			 to the pages which are taken from the @vmemmap_pages
 *			 list, respectively.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
                               unsigned long reuse, unsigned long flags)
{
        LIST_HEAD(vmemmap_pages);
        struct vmemmap_remap_walk walk = {
                .remap_pte = vmemmap_restore_pte,
                .reuse_addr = reuse,
                .vmemmap_pages = &vmemmap_pages,
                .flags = flags,
        };

        /* See the comment in the vmemmap_remap_free(). */
        BUG_ON(start - reuse != PAGE_SIZE);

        if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
                return -ENOMEM;

        return vmemmap_remap_range(reuse, end, &walk);
}

DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
static int __init hugetlb_vmemmap_optimize_param(char *buf)
{
        return kstrtobool(buf, &vmemmap_optimize_enabled);
}
early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_optimize_param);

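/*
 * Reallocate and remap the vmemmap pages of an HVO-optimized folio. On
 * success the folio's vmemmap-optimized flag is cleared and the static key
 * is decremented; returns 0 if the folio was not optimized to begin with.
 */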
static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
                                           struct folio *folio, unsigned long flags)
{
        int ret;
        unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
        unsigned long vmemmap_reuse;

        VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

        if (!folio_test_hugetlb_vmemmap_optimized(folio))
                return 0;

        if (flags & VMEMMAP_SYNCHRONIZE_RCU)
                synchronize_rcu();

        vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
        vmemmap_reuse = vmemmap_start;
        vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

        /*
         * The pages which the vmemmap virtual address range [@vmemmap_start,
         * @vmemmap_end) are mapped to are freed to the buddy allocator, and
         * the range is mapped to the page which @vmemmap_reuse is mapped to.
         * When a HugeTLB page is freed to the buddy allocator, previously
         * discarded vmemmap pages must be allocated and remapped.
         */
        ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
        if (!ret) {
                folio_clear_hugetlb_vmemmap_optimized(folio);
                static_branch_dec(&hugetlb_optimize_vmemmap_key);
        }

        return ret;
}

/**
 * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 *				   hugetlb_vmemmap_optimize_folio()) vmemmap pages which
 *				   will be reallocated and remapped.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be restored.
 *
 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
        return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU);
}

/**
 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 * @h:			hstate.
 * @folio_list:		list of folios.
 * @non_hvo_folios:	Output list of folios for which vmemmap exists.
 *
 * Return: number of folios for which vmemmap was restored, or an error code
 *		if an error was encountered restoring vmemmap for a folio.
 *		Folios that have vmemmap are moved to the non_hvo_folios
 *		list. Processing of entries stops when the first error is
 *		encountered. The folio that experienced the error and all
 *		non-processed folios will remain on folio_list.
 */
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
                                    struct list_head *folio_list,
                                    struct list_head *non_hvo_folios)
{
        struct folio *folio, *t_folio;
        long restored = 0;
        long ret = 0;
        unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;

        list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
                if (folio_test_hugetlb_vmemmap_optimized(folio)) {
                        ret = __hugetlb_vmemmap_restore_folio(h, folio, flags);
                        /* only need to synchronize_rcu() once for each batch */
                        flags &= ~VMEMMAP_SYNCHRONIZE_RCU;

                        if (ret)
                                break;
                        restored++;
                }

                /* Add non-optimized folios to output list */
                list_move(&folio->lru, non_hvo_folios);
        }

        if (restored)
                flush_tlb_all();
        if (!ret)
                ret = restored;
        return ret;
}

/* Return true iff a HugeTLB folio's vmemmap should and can be optimized. */
static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
{
        if (folio_test_hugetlb_vmemmap_optimized(folio))
                return false;

        if (!READ_ONCE(vmemmap_optimize_enabled))
                return false;

        if (!hugetlb_vmemmap_optimizable(h))
                return false;

        return true;
}

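/*
 * Remap @folio's tail vmemmap pages to the shared head page and collect the
 * pages that previously backed them on @vmemmap_pages. The optimized flag is
 * set before remapping (see the "Very Subtle" comment below) and rolled back,
 * together with the static key, if the remap fails.
 */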
static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
                                            struct folio *folio,
                                            struct list_head *vmemmap_pages,
                                            unsigned long flags)
{
        int ret = 0;
        unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
        unsigned long vmemmap_reuse;

        VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

        if (!vmemmap_should_optimize_folio(h, folio))
                return ret;

        static_branch_inc(&hugetlb_optimize_vmemmap_key);

        if (flags & VMEMMAP_SYNCHRONIZE_RCU)
                synchronize_rcu();
        /*
         * Very Subtle
         * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
         * immediately after remapping. As a result, subsequent accesses
         * and modifications to struct pages associated with the hugetlb
         * page could be to the OLD struct pages. Set the vmemmap optimized
         * flag here so that it is copied to the new head page. This keeps
         * the old and new struct pages in sync.
         * If there is an error during optimization, we will immediately FLUSH
         * the TLB and clear the flag below.
         */
        folio_set_hugetlb_vmemmap_optimized(folio);

        vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
        vmemmap_reuse = vmemmap_start;
        vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

        /*
         * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
         * to the page which @vmemmap_reuse is mapped to. Add pages previously
         * mapping the range to vmemmap_pages list so that they can be freed by
         * the caller.
         */
        ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
                                 vmemmap_pages, flags);
        if (ret) {
                static_branch_dec(&hugetlb_optimize_vmemmap_key);
                folio_clear_hugetlb_vmemmap_optimized(folio);
        }

        return ret;
}

/**
 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @folio's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 * vmemmap pages have been optimized.
 */
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
        LIST_HEAD(vmemmap_pages);

        __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU);
        free_vmemmap_page_list(&vmemmap_pages);
}

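/*
 * Pre-split the vmemmap PMDs covering @folio so that the later remap stage
 * only has to touch PTEs; returns 0 when the folio should not be optimized.
 */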
static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
{
        unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
        unsigned long vmemmap_reuse;

        if (!vmemmap_should_optimize_folio(h, folio))
                return 0;

        vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
        vmemmap_reuse = vmemmap_start;
        vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

        /*
         * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
         * @vmemmap_end]
         */
        return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}

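/*
 * Optimize every folio on @folio_list with batched TLB flushing: split the
 * vmemmap PMDs first, flush once, then remap the folios while accumulating
 * the freed vmemmap pages. @boot selects the bootmem (pre-HVO) path, where
 * already-optimized folios only need their mirrored tail page structs
 * write-protected and registered.
 */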
static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
                                              struct list_head *folio_list,
                                              bool boot)
{
        struct folio *folio;
        int nr_to_optimize;
        LIST_HEAD(vmemmap_pages);
        unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;

        nr_to_optimize = 0;
        list_for_each_entry(folio, folio_list, lru) {
                int ret;
                unsigned long spfn, epfn;

                if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) {
                        /*
                         * Already optimized by pre-HVO, just map the
                         * mirrored tail page structs RO.
                         */
                        spfn = (unsigned long)&folio->page;
                        epfn = spfn + pages_per_huge_page(h);
                        vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio),
                                              HUGETLB_VMEMMAP_RESERVE_SIZE);
                        register_page_bootmem_memmap(pfn_to_section_nr(spfn),
                                                     &folio->page,
                                                     HUGETLB_VMEMMAP_RESERVE_SIZE);
                        static_branch_inc(&hugetlb_optimize_vmemmap_key);
                        continue;
                }

                nr_to_optimize++;

                ret = hugetlb_vmemmap_split_folio(h, folio);

                /*
                 * Splitting the PMD requires allocating a page, so let's fail
                 * early once we encounter the first OOM. There is no point in
                 * retrying, as it can be dynamically done on remap with the
                 * memory we get back from the vmemmap deduplication.
                 */
                if (ret == -ENOMEM)
                        break;
        }

        if (!nr_to_optimize)
                /*
                 * All pre-HVO folios, nothing left to do. It's ok if
                 * there is a mix of pre-HVO and not yet HVO-ed folios
                 * here, as __hugetlb_vmemmap_optimize_folio() will
                 * skip any folios that already have the optimized flag
                 * set, see vmemmap_should_optimize_folio().
                 */
                goto out;

        flush_tlb_all();

        list_for_each_entry(folio, folio_list, lru) {
                int ret;

                ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
                /* only need to synchronize_rcu() once for each batch */
                flags &= ~VMEMMAP_SYNCHRONIZE_RCU;

                /*
                 * Pages to be freed may have been accumulated. If we
                 * encounter an ENOMEM, free what we have and try again.
                 * This can occur in the case that both splitting fails
                 * halfway and head page allocation also failed. In this
                 * case __hugetlb_vmemmap_optimize_folio() would free memory
                 * allowing more vmemmap remaps to occur.
                 */
                if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
                        flush_tlb_all();
                        free_vmemmap_page_list(&vmemmap_pages);
                        INIT_LIST_HEAD(&vmemmap_pages);
                        __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
                }
        }

out:
        flush_tlb_all();
        free_vmemmap_page_list(&vmemmap_pages);
}

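/* Optimize vmemmap for a list of runtime-allocated folios. */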
void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
        __hugetlb_vmemmap_optimize_folios(h, folio_list, false);
}

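/* Optimize vmemmap for a list of bootmem-allocated (possibly pre-HVO) folios. */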
void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list)
{
        __hugetlb_vmemmap_optimize_folios(h, folio_list, true);
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT

/* Return true if a bootmem allocated HugeTLB page should be pre-HVO-ed */
static bool vmemmap_should_optimize_bootmem_page(struct huge_bootmem_page *m)
{
        unsigned long section_size, psize, pmd_vmemmap_size;
        phys_addr_t paddr;

        if (!READ_ONCE(vmemmap_optimize_enabled))
                return false;

        if (!hugetlb_vmemmap_optimizable(m->hstate))
                return false;

        psize = huge_page_size(m->hstate);
        paddr = virt_to_phys(m);

        /*
         * Pre-HVO only works if the bootmem huge page
         * is aligned to the section size.
         */
        section_size = (1UL << PA_SECTION_SHIFT);
        if (!IS_ALIGNED(paddr, section_size) ||
            !IS_ALIGNED(psize, section_size))
                return false;

        /*
         * The pre-HVO code does not deal with splitting PMDs,
         * so the bootmem page must be aligned to the number
         * of base pages that can be mapped with one vmemmap PMD.
         */
        pmd_vmemmap_size = (PMD_SIZE / (sizeof(struct page))) << PAGE_SHIFT;
        if (!IS_ALIGNED(paddr, pmd_vmemmap_size) ||
            !IS_ALIGNED(psize, pmd_vmemmap_size))
                return false;

        return true;
}

/*
 * Initialize memmap section for a gigantic page, HVO-style.
 */
void __init hugetlb_vmemmap_init_early(int nid)
{
        unsigned long psize, paddr, section_size;
        unsigned long ns, i, pnum, pfn, nr_pages;
        unsigned long start, end;
        struct huge_bootmem_page *m = NULL;
        void *map;

        /*
         * Nothing to do if bootmem pages were not allocated
         * early in boot, or if HVO wasn't enabled in the
         * first place.
         */
        if (!hugetlb_bootmem_allocated())
                return;

        if (!READ_ONCE(vmemmap_optimize_enabled))
                return;

        section_size = (1UL << PA_SECTION_SHIFT);

        list_for_each_entry(m, &huge_boot_pages[nid], list) {
                if (!vmemmap_should_optimize_bootmem_page(m))
                        continue;

                nr_pages = pages_per_huge_page(m->hstate);
                psize = nr_pages << PAGE_SHIFT;
                paddr = virt_to_phys(m);
                pfn = PHYS_PFN(paddr);
                map = pfn_to_page(pfn);
                start = (unsigned long)map;
                end = start + nr_pages * sizeof(struct page);

                if (vmemmap_populate_hvo(start, end, nid,
                                         HUGETLB_VMEMMAP_RESERVE_SIZE) < 0)
                        continue;

                memmap_boot_pages_add(HUGETLB_VMEMMAP_RESERVE_SIZE / PAGE_SIZE);

                pnum = pfn_to_section_nr(pfn);
                ns = psize / section_size;

                for (i = 0; i < ns; i++) {
                        sparse_init_early_section(nid, map, pnum,
                                                  SECTION_IS_VMEMMAP_PREINIT);
                        map += section_map_size();
                        pnum++;
                }

                m->flags |= HUGE_BOOTMEM_HVO;
        }
}

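/*
 * Late check of pre-HVO-ed bootmem pages: any page whose zones turn out to
 * be invalid is removed from the list, has its HVO undone, and is returned
 * to memblock; valid pages are marked HUGE_BOOTMEM_ZONES_VALID.
 */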
void __init hugetlb_vmemmap_init_late(int nid)
{
        struct huge_bootmem_page *m, *tm;
        unsigned long phys, nr_pages, start, end;
        unsigned long pfn, nr_mmap;
        struct hstate *h;
        void *map;

        if (!hugetlb_bootmem_allocated())
                return;

        if (!READ_ONCE(vmemmap_optimize_enabled))
                return;

        list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) {
                if (!(m->flags & HUGE_BOOTMEM_HVO))
                        continue;

                phys = virt_to_phys(m);
                h = m->hstate;
                pfn = PHYS_PFN(phys);
                nr_pages = pages_per_huge_page(h);

                if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
                        /*
                         * Oops, the hugetlb page spans multiple zones.
                         * Remove it from the list, and undo HVO.
                         */
                        list_del(&m->list);

                        map = pfn_to_page(pfn);

                        start = (unsigned long)map;
                        end = start + nr_pages * sizeof(struct page);

                        vmemmap_undo_hvo(start, end, nid,
                                         HUGETLB_VMEMMAP_RESERVE_SIZE);
                        nr_mmap = end - start - HUGETLB_VMEMMAP_RESERVE_SIZE;
                        memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE));

                        memblock_phys_free(phys, huge_page_size(h));
                        continue;
                } else
                        m->flags |= HUGE_BOOTMEM_ZONES_VALID;
        }
}
#endif

static const struct ctl_table hugetlb_vmemmap_sysctls[] = {
        {
                .procname	= "hugetlb_optimize_vmemmap",
                .data		= &vmemmap_optimize_enabled,
                .maxlen		= sizeof(vmemmap_optimize_enabled),
                .mode		= 0644,
                .proc_handler	= proc_dobool,
        },
};

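/*
 * Register the "vm.hugetlb_optimize_vmemmap" sysctl if at least one hstate
 * can be optimized.
 */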
static int __init hugetlb_vmemmap_init(void)
{
        const struct hstate *h;

        /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
        BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);

        for_each_hstate(h) {
                if (hugetlb_vmemmap_optimizable(h)) {
                        register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
                        break;
                }
        }
        return 0;
}
late_initcall(hugetlb_vmemmap_init);
| 915 | |