memory.c source code [Linux/mm/memory.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/mm/memory.c
4	*
5	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
6	*/
7
8	/*
9	* demand-loading started 01.12.91 - seems it is high on the list of
10	* things wanted, and it should be easy to implement. - Linus
11	*/
12
13	/*
14	* Ok, demand-loading was easy, shared pages a little bit tricker. Shared
15	* pages started 02.12.91, seems to work. - Linus.
16	*
17	* Tested sharing by executing about 30 /bin/sh: under the old kernel it
18	* would have taken more than the 6M I have free, but it worked well as
19	* far as I could see.
20	*
21	* Also corrected some "invalidate()"s - I wasn't doing enough of them.
22	*/
23
24	/*
25	* Real VM (paging to/from disk) started 18.12.91. Much more work and
26	* thought has to go into this. Oh, well..
27	* 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
28	* Found it. Everything seems to work now.
29	* 20.12.91 - Ok, making the swap-device changeable like the root.
30	*/
31
32	/*
33	* 05.04.94 - Multi-page memory management added for v1.1.
34	* Idea by Alex Bligh (alex@cconcepts.co.uk)
35	*
36	* 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
37	* (Gerhard.Wichert@pdb.siemens.de)
38	*
39	* Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
40	*/
41
42	#include <linux/kernel_stat.h>
43	#include <linux/mm.h>
44	#include <linux/mm_inline.h>
45	#include <linux/sched/mm.h>
46	#include <linux/sched/numa_balancing.h>
47	#include <linux/sched/task.h>
48	#include <linux/hugetlb.h>
49	#include <linux/mman.h>
50	#include <linux/swap.h>
51	#include <linux/highmem.h>
52	#include <linux/pagemap.h>
53	#include <linux/memremap.h>
54	#include <linux/kmsan.h>
55	#include <linux/ksm.h>
56	#include <linux/rmap.h>
57	#include <linux/export.h>
58	#include <linux/delayacct.h>
59	#include <linux/init.h>
60	#include <linux/writeback.h>
61	#include <linux/memcontrol.h>
62	#include <linux/mmu_notifier.h>
63	#include <linux/swapops.h>
64	#include <linux/elf.h>
65	#include <linux/gfp.h>
66	#include <linux/migrate.h>
67	#include <linux/string.h>
68	#include <linux/memory-tiers.h>
69	#include <linux/debugfs.h>
70	#include <linux/userfaultfd_k.h>
71	#include <linux/dax.h>
72	#include <linux/oom.h>
73	#include <linux/numa.h>
74	#include <linux/perf_event.h>
75	#include <linux/ptrace.h>
76	#include <linux/vmalloc.h>
77	#include <linux/sched/sysctl.h>
78
79	#include <trace/events/kmem.h>
80
81	#include <asm/io.h>
82	#include <asm/mmu_context.h>
83	#include <asm/pgalloc.h>
84	#include <linux/uaccess.h>
85	#include <asm/tlb.h>
86	#include <asm/tlbflush.h>
87
88	#include "pgalloc-track.h"
89	#include "internal.h"
90	#include "swap.h"
91
92	#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
93	#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
94	#endif
95
96	static vm_fault_t do_fault(struct vm_fault *vmf);
97	static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
98	static bool vmf_pte_changed(struct vm_fault *vmf);
99
100	/*
101	* Return true if the original pte was a uffd-wp pte marker (so the pte was
102	* wr-protected).
103	*/
104	static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
105	{
106	if (!userfaultfd_wp(vma: vmf->vma))
107	return false;
108	if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
109	return false;
110
111	return pte_marker_uffd_wp(pte: vmf->orig_pte);
112	}
113
114	/*
115	* Randomize the address space (stacks, mmaps, brk, etc.).
116	*
117	* ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
118	* as ancient (libc5 based) binaries can segfault. )
119	*/
120	int randomize_va_space __read_mostly =
121	#ifdef CONFIG_COMPAT_BRK
122	`1`;
123	#else
124	`2`;
125	#endif
126
127	static const struct ctl_table mmu_sysctl_table[] = {
128	{
129	.procname = "randomize_va_space",
130	.data = &randomize_va_space,
131	.maxlen = sizeof(int),
132	.mode = `0644`,
133	.proc_handler = proc_dointvec,
134	},
135	};
136
137	static int __init init_mm_sysctl(void)
138	{
139	register_sysctl_init("kernel", mmu_sysctl_table);
140	return `0`;
141	}
142
143	subsys_initcall(init_mm_sysctl);
144
145	#ifndef arch_wants_old_prefaulted_pte
146	static inline bool arch_wants_old_prefaulted_pte(void)
147	{
148	/*
149	* Transitioning a PTE from 'old' to 'young' can be expensive on
150	* some architectures, even if it's performed in hardware. By
151	* default, "false" means prefaulted entries will be 'young'.
152	*/
153	return false;
154	}
155	#endif
156
157	static int __init disable_randmaps(char *s)
158	{
159	randomize_va_space = `0`;
160	return `1`;
161	}
162	__setup("norandmaps", disable_randmaps);
163
164	unsigned long zero_pfn __read_mostly;
165	EXPORT_SYMBOL(zero_pfn);
166
167	unsigned long highest_memmap_pfn __read_mostly;
168
169	/*
170	* CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
171	*/
172	static int __init init_zero_pfn(void)
173	{
174	zero_pfn = page_to_pfn(ZERO_PAGE(`0`));
175	return `0`;
176	}
177	early_initcall(init_zero_pfn);
178
/*
 * Out-of-line wrapper that emits the rss_stat tracepoint for counter
 * @member of @mm.
 */
void mm_trace_rss_stat(struct mm_struct *mm, int member)
{
	trace_rss_stat(mm, member);
}
183
184	/*
185	* Note: this doesn't free the actual pages themselves. That
186	* has been handled earlier when unmapping all the memory regions.
187	*/
188	static void free_pte_range(struct mmu_gather tlb, pmd_t pmd,
189	unsigned long addr)
190	{
191	pgtable_t token = pmd_pgtable(*pmd);
192	pmd_clear(pmd);
193	pte_free_tlb(tlb, token, addr);
194	mm_dec_nr_ptes(mm: tlb->mm);
195	}
196
197	static inline void free_pmd_range(struct mmu_gather tlb, pud_t pud,
198	unsigned long addr, unsigned long end,
199	unsigned long floor, unsigned long ceiling)
200	{
201	pmd_t *pmd;
202	unsigned long next;
203	unsigned long start;
204
205	start = addr;
206	pmd = pmd_offset(pud, address: addr);
207	do {
208	next = pmd_addr_end(addr, end);
209	if (pmd_none_or_clear_bad(pmd))
210	continue;
211	free_pte_range(tlb, pmd, addr);
212	} while (pmd++, addr = next, addr != end);
213
214	start &= PUD_MASK;
215	if (start < floor)
216	return;
217	if (ceiling) {
218	ceiling &= PUD_MASK;
219	if (!ceiling)
220	return;
221	}
222	if (end - `1` > ceiling - `1`)
223	return;
224
225	pmd = pmd_offset(pud, address: start);
226	pud_clear(pud);
227	pmd_free_tlb(tlb, pmd, start);
228	mm_dec_nr_pmds(mm: tlb->mm);
229	}
230
231	static inline void free_pud_range(struct mmu_gather tlb, p4d_t p4d,
232	unsigned long addr, unsigned long end,
233	unsigned long floor, unsigned long ceiling)
234	{
235	pud_t *pud;
236	unsigned long next;
237	unsigned long start;
238
239	start = addr;
240	pud = pud_offset(p4d, address: addr);
241	do {
242	next = pud_addr_end(addr, end);
243	if (pud_none_or_clear_bad(pud))
244	continue;
245	free_pmd_range(tlb, pud, addr, end: next, floor, ceiling);
246	} while (pud++, addr = next, addr != end);
247
248	start &= P4D_MASK;
249	if (start < floor)
250	return;
251	if (ceiling) {
252	ceiling &= P4D_MASK;
253	if (!ceiling)
254	return;
255	}
256	if (end - `1` > ceiling - `1`)
257	return;
258
259	pud = pud_offset(p4d, address: start);
260	p4d_clear(p4d);
261	pud_free_tlb(tlb, pud, start);
262	mm_dec_nr_puds(mm: tlb->mm);
263	}
264
265	static inline void free_p4d_range(struct mmu_gather tlb, pgd_t pgd,
266	unsigned long addr, unsigned long end,
267	unsigned long floor, unsigned long ceiling)
268	{
269	p4d_t *p4d;
270	unsigned long next;
271	unsigned long start;
272
273	start = addr;
274	p4d = p4d_offset(pgd, address: addr);
275	do {
276	next = p4d_addr_end(addr, end);
277	if (p4d_none_or_clear_bad(p4d))
278	continue;
279	free_pud_range(tlb, p4d, addr, end: next, floor, ceiling);
280	} while (p4d++, addr = next, addr != end);
281
282	start &= PGDIR_MASK;
283	if (start < floor)
284	return;
285	if (ceiling) {
286	ceiling &= PGDIR_MASK;
287	if (!ceiling)
288	return;
289	}
290	if (end - `1` > ceiling - `1`)
291	return;
292
293	p4d = p4d_offset(pgd, address: start);
294	pgd_clear(pgd);
295	p4d_free_tlb(tlb, p4d, start);
296	}
297
298	/**
299	* free_pgd_range - Unmap and free page tables in the range
300	* @tlb: the mmu_gather containing pending TLB flush info
301	* @addr: virtual address start
302	* @end: virtual address end
303	* @floor: lowest address boundary
304	* @ceiling: highest address boundary
305	*
306	* This function tears down all user-level page tables in the
307	* specified virtual address range [@addr..@end). It is part of
308	* the memory unmap flow.
309	*/
310	void free_pgd_range(struct mmu_gather *tlb,
311	unsigned long addr, unsigned long end,
312	unsigned long floor, unsigned long ceiling)
313	{
314	pgd_t *pgd;
315	unsigned long next;
316
317	/*
318	* The next few lines have given us lots of grief...
319	*
320	* Why are we testing PMD* at this top level? Because often
321	* there will be no work to do at all, and we'd prefer not to
322	* go all the way down to the bottom just to discover that.
323	*
324	* Why all these "- 1"s? Because 0 represents both the bottom
325	* of the address space and the top of it (using -1 for the
326	* top wouldn't help much: the masks would do the wrong thing).
327	* The rule is that addr 0 and floor 0 refer to the bottom of
328	* the address space, but end 0 and ceiling 0 refer to the top
329	* Comparisons need to use "end - 1" and "ceiling - 1" (though
330	* that end 0 case should be mythical).
331	*
332	* Wherever addr is brought up or ceiling brought down, we must
333	* be careful to reject "the opposite 0" before it confuses the
334	* subsequent tests. But what about where end is brought down
335	* by PMD_SIZE below? no, end can't go down to 0 there.
336	*
337	* Whereas we round start (addr) and ceiling down, by different
338	* masks at different levels, in order to test whether a table
339	* now has no other vmas using it, so can be freed, we don't
340	* bother to round floor or end up - the tests don't need that.
341	*/
342
343	addr &= PMD_MASK;
344	if (addr < floor) {
345	addr += PMD_SIZE;
346	if (!addr)
347	return;
348	}
349	if (ceiling) {
350	ceiling &= PMD_MASK;
351	if (!ceiling)
352	return;
353	}
354	if (end - `1` > ceiling - `1`)
355	end -= PMD_SIZE;
356	if (addr > end - `1`)
357	return;
358	/*
359	* We add page table cache pages with PAGE_SIZE,
360	* (see pte_free_tlb()), flush the tlb if we need
361	*/
362	tlb_change_page_size(tlb, PAGE_SIZE);
363	pgd = pgd_offset(tlb->mm, addr);
364	do {
365	next = pgd_addr_end(addr, end);
366	if (pgd_none_or_clear_bad(pgd))
367	continue;
368	free_p4d_range(tlb, pgd, addr, end: next, floor, ceiling);
369	} while (pgd++, addr = next, addr != end);
370	}
371
372	void free_pgtables(struct mmu_gather tlb, struct* ma_state *mas,
373	struct vm_area_struct vma, unsigned* long floor,
374	unsigned long ceiling, bool mm_wr_locked)
375	{
376	struct unlink_vma_file_batch vb;
377
378	tlb_free_vmas(tlb);
379
380	do {
381	unsigned long addr = vma->vm_start;
382	struct vm_area_struct *next;
383
384	/*
385	* Note: USER_PGTABLES_CEILING may be passed as ceiling and may
386	* be 0. This will underflow and is okay.
387	*/
388	next = mas_find(mas, max: ceiling - `1`);
389	if (unlikely(xa_is_zero(next)))
390	next = NULL;
391
392	/*
393	* Hide vma from rmap and truncate_pagecache before freeing
394	* pgtables
395	*/
396	if (mm_wr_locked)
397	vma_start_write(vma);
398	unlink_anon_vmas(vma);
399
400	unlink_file_vma_batch_init(vb: &vb);
401	unlink_file_vma_batch_add(vb: &vb, vma);
402
403	/*
404	* Optimization: gather nearby vmas into one call down
405	*/
406	while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
407	vma = next;
408	next = mas_find(mas, max: ceiling - `1`);
409	if (unlikely(xa_is_zero(next)))
410	next = NULL;
411	if (mm_wr_locked)
412	vma_start_write(vma);
413	unlink_anon_vmas(vma);
414	unlink_file_vma_batch_add(vb: &vb, vma);
415	}
416	unlink_file_vma_batch_final(vb: &vb);
417
418	free_pgd_range(tlb, addr, end: vma->vm_end,
419	floor, ceiling: next ? next->vm_start : ceiling);
420	vma = next;
421	} while (vma);
422	}
423
424	void pmd_install(struct mm_struct mm, pmd_t pmd, pgtable_t *pte)
425	{
426	spinlock_t *ptl = pmd_lock(mm, pmd);
427
428	if (likely(pmd_none(pmd))) { /* Has another populated it ? /
429	mm_inc_nr_ptes(mm);
430	/*
431	* Ensure all pte setup (eg. pte page lock and page clearing) are
432	* visible before the pte is made visible to other CPUs by being
433	* put into page tables.
434	*
435	* The other side of the story is the pointer chasing in the page
436	* table walking code (when walking the page table without locking;
437	* ie. most of the time). Fortunately, these data accesses consist
438	* of a chain of data-dependent loads, meaning most CPUs (alpha
439	* being the notable exception) will already guarantee loads are
440	* seen in-order. See the alpha page table accessors for the
441	* smp_rmb() barriers in page table walking code.
442	*/
443	smp_wmb(); / Could be smp_wmb__xxx(before\|after)_spin_lock /
444	pmd_populate(mm, pmd, pte: *pte);
445	*pte = NULL;
446	}
447	spin_unlock(lock: ptl);
448	}
449
450	int __pte_alloc(struct mm_struct mm, pmd_t pmd)
451	{
452	pgtable_t new = pte_alloc_one(mm);
453	if (!new)
454	return -ENOMEM;
455
456	pmd_install(mm, pmd, pte: &new);
457	if (new)
458	pte_free(mm, pte_page: new);
459	return `0`;
460	}
461
462	int __pte_alloc_kernel(pmd_t *pmd)
463	{
464	pte_t *new = pte_alloc_one_kernel(&init_mm);
465	if (!new)
466	return -ENOMEM;
467
468	spin_lock(lock: &init_mm.page_table_lock);
469	if (likely(pmd_none(pmd))) { /* Has another populated it ? /
470	smp_wmb(); / See comment in pmd_install() /
471	pmd_populate_kernel(mm: &init_mm, pmd, pte: new);
472	new = NULL;
473	}
474	spin_unlock(lock: &init_mm.page_table_lock);
475	if (new)
476	pte_free_kernel(mm: &init_mm, pte: new);
477	return `0`;
478	}
479
480	static inline void init_rss_vec(int *rss)
481	{
482	memset(s: rss, c: `0`, n: sizeof(int) * NR_MM_COUNTERS);
483	}
484
485	static inline void add_mm_rss_vec(struct mm_struct mm, int* *rss)
486	{
487	int i;
488
489	for (i = `0`; i < NR_MM_COUNTERS; i++)
490	if (rss[i])
491	add_mm_counter(mm, member: i, value: rss[i]);
492	}
493
494	static bool is_bad_page_map_ratelimited(void)
495	{
496	static unsigned long resume;
497	static unsigned long nr_shown;
498	static unsigned long nr_unshown;
499
500	/*
501	* Allow a burst of 60 reports, then keep quiet for that minute;
502	* or allow a steady drip of one report per second.
503	*/
504	if (nr_shown == `60`) {
505	if (time_before(jiffies, resume)) {
506	nr_unshown++;
507	return true;
508	}
509	if (nr_unshown) {
510	pr_alert("BUG: Bad page map: %lu messages suppressed\n",
511	nr_unshown);
512	nr_unshown = `0`;
513	}
514	nr_shown = `0`;
515	}
516	if (nr_shown++ == `0`)
517	resume = jiffies + `60` * HZ;
518	return false;
519	}
520
521	static void __print_bad_page_map_pgtable(struct mm_struct mm, unsigned* long addr)
522	{
523	unsigned long long pgdv, p4dv, pudv, pmdv;
524	p4d_t p4d, *p4dp;
525	pud_t pud, *pudp;
526	pmd_t pmd, *pmdp;
527	pgd_t *pgdp;
528
529	/*
530	* Although this looks like a fully lockless pgtable walk, it is not:
531	* see locking requirements for print_bad_page_map().
532	*/
533	pgdp = pgd_offset(mm, addr);
534	pgdv = pgd_val(*pgdp);
535
536	if (!pgd_present(pgd: pgdp) \|\| pgd_leaf(pgdp)) {
537	pr_alert("pgd:%08llx\n", pgdv);
538	return;
539	}
540
541	p4dp = p4d_offset(pgd: pgdp, address: addr);
542	p4d = p4dp_get(p4dp);
543	p4dv = p4d_val(p4d);
544
545	if (!p4d_present(p4d) \|\| p4d_leaf(p4d)) {
546	pr_alert("pgd:%08llx p4d:%08llx\n", pgdv, p4dv);
547	return;
548	}
549
550	pudp = pud_offset(p4d: p4dp, address: addr);
551	pud = pudp_get(pudp);
552	pudv = pud_val(pud);
553
554	if (!pud_present(pud) \|\| pud_leaf(pud)) {
555	pr_alert("pgd:%08llx p4d:%08llx pud:%08llx\n", pgdv, p4dv, pudv);
556	return;
557	}
558
559	pmdp = pmd_offset(pud: pudp, address: addr);
560	pmd = pmdp_get(pmdp);
561	pmdv = pmd_val(pmd);
562
563	/*
564	* Dumping the PTE would be nice, but it's tricky with CONFIG_HIGHPTE,
565	* because the table should already be mapped by the caller and
566	* doing another map would be bad. print_bad_page_map() should
567	* already take care of printing the PTE.
568	*/
569	pr_alert("pgd:%08llx p4d:%08llx pud:%08llx pmd:%08llx\n", pgdv,
570	p4dv, pudv, pmdv);
571	}
572
573	/*
574	* This function is called to print an error when a bad page table entry (e.g.,
575	* corrupted page table entry) is found. For example, we might have a
576	* PFN-mapped pte in a region that doesn't allow it.
577	*
578	* The calling function must still handle the error.
579	*
580	* This function must be called during a proper page table walk, as it will
581	* re-walk the page table to dump information: the caller MUST prevent page
582	* table teardown (by holding mmap, vma or rmap lock) and MUST hold the leaf
583	* page table lock.
584	*/
585	static void print_bad_page_map(struct vm_area_struct *vma,
586	unsigned long addr, unsigned long long entry, struct page *page,
587	enum pgtable_level level)
588	{
589	struct address_space *mapping;
590	pgoff_t index;
591
592	if (is_bad_page_map_ratelimited())
593	return;
594
595	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
596	index = linear_page_index(vma, address: addr);
597
598	pr_alert("BUG: Bad page map in process %s %s:%08llx", current->comm,
599	pgtable_level_to_str(level), entry);
600	__print_bad_page_map_pgtable(mm: vma->vm_mm, addr);
601	if (page)
602	dump_page(page, reason: "bad page map");
603	pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
604	(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
605	pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n",
606	vma->vm_file,
607	vma->vm_ops ? vma->vm_ops->fault : NULL,
608	vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
609	vma->vm_file ? vma->vm_file->f_op->mmap_prepare : NULL,
610	mapping ? mapping->a_ops->read_folio : NULL);
611	dump_stack();
612	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
613	}
614	#define print_bad_pte(vma, addr, pte, page) \
615	print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE)
616
617	/**
618	* __vm_normal_page() - Get the "struct page" associated with a page table entry.
619	* @vma: The VMA mapping the page table entry.
620	* @addr: The address where the page table entry is mapped.
621	* @pfn: The PFN stored in the page table entry.
622	* @special: Whether the page table entry is marked "special".
623	* @level: The page table level for error reporting purposes only.
624	* @entry: The page table entry value for error reporting purposes only.
625	*
626	* "Special" mappings do not wish to be associated with a "struct page" (either
627	* it doesn't exist, or it exists but they don't want to touch it). In this
628	* case, NULL is returned here. "Normal" mappings do have a struct page and
629	* are ordinarily refcounted.
630	*
631	* Page mappings of the shared zero folios are always considered "special", as
632	* they are not ordinarily refcounted: neither the refcount nor the mapcount
633	* of these folios is adjusted when mapping them into user page tables.
634	* Selected page table walkers (such as GUP) can still identify mappings of the
635	* shared zero folios and work with the underlying "struct page".
636	*
637	* There are 2 broad cases. Firstly, an architecture may define a "special"
638	* page table entry bit, such as pte_special(), in which case this function is
639	* trivial. Secondly, an architecture may not have a spare page table
640	* entry bit, which requires a more complicated scheme, described below.
641	*
642	* With CONFIG_FIND_NORMAL_PAGE, we might have the "special" bit set on
643	* page table entries that actually map "normal" pages: however, that page
644	* cannot be looked up through the PFN stored in the page table entry, but
645	* instead will be looked up through vm_ops->find_normal_page(). So far, this
646	* only applies to PTEs.
647	*
648	* A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
649	* special mapping (even if there are underlying and valid "struct pages").
650	* COWed pages of a VM_PFNMAP are always normal.
651	*
652	* The way we recognize COWed pages within VM_PFNMAP mappings is through the
653	* rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
654	* set, and the vm_pgoff will point to the first PFN mapped: thus every special
655	* mapping will always honor the rule
656	*
657	* pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
658	*
659	* And for normal mappings this is false.
660	*
661	* This restricts such mappings to be a linear translation from virtual address
662	* to pfn. To get around this restriction, we allow arbitrary mappings so long
663	* as the vma is not a COW mapping; in that case, we know that all ptes are
664	* special (because none can have been COWed).
665	*
666	*
667	* In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
668	*
669	* VM_MIXEDMAP mappings can likewise contain memory with or without "struct
670	* page" backing, however the difference is that _all_ pages with a struct
671	* page (that is, those where pfn_valid is true, except the shared zero
672	* folios) are refcounted and considered normal pages by the VM.
673	*
674	* The disadvantage is that pages are refcounted (which can be slower and
675	* simply not an option for some PFNMAP users). The advantage is that we
676	* don't have to follow the strict linearity rule of PFNMAP mappings in
677	* order to support COWable mappings.
678	*
679	* Return: Returns the "struct page" if this is a "normal" mapping. Returns
680	* NULL if this is a "special" mapping.
681	*/
682	static inline struct page __vm_normal_page(struct* vm_area_struct *vma,
683	unsigned long addr, unsigned long pfn, bool special,
684	unsigned long long entry, enum pgtable_level level)
685	{
686	if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
687	if (unlikely(special)) {
688	#ifdef CONFIG_FIND_NORMAL_PAGE
689	if (vma->vm_ops && vma->vm_ops->find_normal_page)
690	return vma->vm_ops->find_normal_page(vma, addr);
691	#endif /* CONFIG_FIND_NORMAL_PAGE */
692	if (vma->vm_flags & (VM_PFNMAP \| VM_MIXEDMAP))
693	return NULL;
694	if (is_zero_pfn(pfn) \|\| is_huge_zero_pfn(pfn))
695	return NULL;
696
697	print_bad_page_map(vma, addr, entry, NULL, level);
698	return NULL;
699	}
700	/*
701	* With CONFIG_ARCH_HAS_PTE_SPECIAL, any special page table
702	* mappings (incl. shared zero folios) are marked accordingly.
703	*/
704	} else {
705	if (unlikely(vma->vm_flags & (VM_PFNMAP \| VM_MIXEDMAP))) {
706	if (vma->vm_flags & VM_MIXEDMAP) {
707	/ If it has a "struct page", it's "normal". /
708	if (!pfn_valid(pfn))
709	return NULL;
710	} else {
711	unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
712
713	/ Only CoW'ed anon folios are "normal". /
714	if (pfn == vma->vm_pgoff + off)
715	return NULL;
716	if (!is_cow_mapping(flags: vma->vm_flags))
717	return NULL;
718	}
719	}
720
721	if (is_zero_pfn(pfn) \|\| is_huge_zero_pfn(pfn))
722	return NULL;
723	}
724
725	if (unlikely(pfn > highest_memmap_pfn)) {
726	/ Corrupted page table entry. /
727	print_bad_page_map(vma, addr, entry, NULL, level);
728	return NULL;
729	}
730	/*
731	* NOTE! We still have PageReserved() pages in the page tables.
732	* For example, VDSO mappings can cause them to exist.
733	*/
734	VM_WARN_ON_ONCE(is_zero_pfn(pfn) \|\| is_huge_zero_pfn(pfn));
735	return pfn_to_page(pfn);
736	}
737
738	/**
739	* vm_normal_page() - Get the "struct page" associated with a PTE
740	* @vma: The VMA mapping the @pte.
741	* @addr: The address where the @pte is mapped.
742	* @pte: The PTE.
743	*
744	* Get the "struct page" associated with a PTE. See __vm_normal_page()
745	* for details on "normal" and "special" mappings.
746	*
747	* Return: Returns the "struct page" if this is a "normal" mapping. Returns
748	* NULL if this is a "special" mapping.
749	*/
750	struct page vm_normal_page(struct* vm_area_struct vma, unsigned* long addr,
751	pte_t pte)
752	{
753	return __vm_normal_page(vma, addr, pfn: pte_pfn(pte), special: pte_special(pte),
754	pte_val(pte), level: PGTABLE_LEVEL_PTE);
755	}
756
757	/**
758	* vm_normal_folio() - Get the "struct folio" associated with a PTE
759	* @vma: The VMA mapping the @pte.
760	* @addr: The address where the @pte is mapped.
761	* @pte: The PTE.
762	*
763	* Get the "struct folio" associated with a PTE. See __vm_normal_page()
764	* for details on "normal" and "special" mappings.
765	*
766	* Return: Returns the "struct folio" if this is a "normal" mapping. Returns
767	* NULL if this is a "special" mapping.
768	*/
769	struct folio vm_normal_folio(struct* vm_area_struct vma, unsigned* long addr,
770	pte_t pte)
771	{
772	struct page *page = vm_normal_page(vma, addr, pte);
773
774	if (page)
775	return page_folio(page);
776	return NULL;
777	}
778
779	#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
780	/**
781	* vm_normal_page_pmd() - Get the "struct page" associated with a PMD
782	* @vma: The VMA mapping the @pmd.
783	* @addr: The address where the @pmd is mapped.
784	* @pmd: The PMD.
785	*
786	* Get the "struct page" associated with a PTE. See __vm_normal_page()
787	* for details on "normal" and "special" mappings.
788	*
789	* Return: Returns the "struct page" if this is a "normal" mapping. Returns
790	* NULL if this is a "special" mapping.
791	*/
792	struct page vm_normal_page_pmd(struct* vm_area_struct vma, unsigned* long addr,
793	pmd_t pmd)
794	{
795	return __vm_normal_page(vma, addr, pfn: pmd_pfn(pmd), special: pmd_special(pmd),
796	pmd_val(pmd), level: PGTABLE_LEVEL_PMD);
797	}
798
799	/**
800	* vm_normal_folio_pmd() - Get the "struct folio" associated with a PMD
801	* @vma: The VMA mapping the @pmd.
802	* @addr: The address where the @pmd is mapped.
803	* @pmd: The PMD.
804	*
805	* Get the "struct folio" associated with a PTE. See __vm_normal_page()
806	* for details on "normal" and "special" mappings.
807	*
808	* Return: Returns the "struct folio" if this is a "normal" mapping. Returns
809	* NULL if this is a "special" mapping.
810	*/
811	struct folio vm_normal_folio_pmd(struct* vm_area_struct *vma,
812	unsigned long addr, pmd_t pmd)
813	{
814	struct page *page = vm_normal_page_pmd(vma, addr, pmd);
815
816	if (page)
817	return page_folio(page);
818	return NULL;
819	}
820
821	/**
822	* vm_normal_page_pud() - Get the "struct page" associated with a PUD
823	* @vma: The VMA mapping the @pud.
824	* @addr: The address where the @pud is mapped.
825	* @pud: The PUD.
826	*
827	* Get the "struct page" associated with a PUD. See __vm_normal_page()
828	* for details on "normal" and "special" mappings.
829	*
830	* Return: Returns the "struct page" if this is a "normal" mapping. Returns
831	* NULL if this is a "special" mapping.
832	*/
833	struct page vm_normal_page_pud(struct* vm_area_struct *vma,
834	unsigned long addr, pud_t pud)
835	{
836	return __vm_normal_page(vma, addr, pud_pfn(pud), special: pud_special(pud),
837	pud_val(pud), level: PGTABLE_LEVEL_PUD);
838	}
839	#endif
840
841	/**
842	* restore_exclusive_pte - Restore a device-exclusive entry
843	* @vma: VMA covering @address
844	* @folio: the mapped folio
845	* @page: the mapped folio page
846	* @address: the virtual address
847	* @ptep: pte pointer into the locked page table mapping the folio page
848	* @orig_pte: pte value at @ptep
849	*
850	* Restore a device-exclusive non-swap entry to an ordinary present pte.
851	*
852	* The folio and the page table must be locked, and MMU notifiers must have
853	* been called to invalidate any (exclusive) device mappings.
854	*
855	* Locking the folio makes sure that anybody who just converted the pte to
856	* a device-exclusive entry can map it into the device to make forward
857	* progress without others converting it back until the folio was unlocked.
858	*
859	* If the folio lock ever becomes an issue, we can stop relying on the folio
860	* lock; it might make some scenarios with heavy thrashing less likely to
861	* make forward progress, but these scenarios might not be valid use cases.
862	*
863	* Note that the folio lock does not protect against all cases of concurrent
864	* page table modifications (e.g., MADV_DONTNEED, mprotect), so device drivers
865	* must use MMU notifiers to sync against any concurrent changes.
866	*/
867	static void restore_exclusive_pte(struct vm_area_struct *vma,
868	struct folio folio, struct* page page, unsigned* long address,
869	pte_t *ptep, pte_t orig_pte)
870	{
871	pte_t pte;
872
873	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
874
875	pte = pte_mkold(pte: mk_pte(page, READ_ONCE(vma->vm_page_prot)));
876	if (pte_swp_soft_dirty(pte: orig_pte))
877	pte = pte_mksoft_dirty(pte);
878
879	if (pte_swp_uffd_wp(pte: orig_pte))
880	pte = pte_mkuffd_wp(pte);
881
882	if ((vma->vm_flags & VM_WRITE) &&
883	can_change_pte_writable(vma, addr: address, pte)) {
884	if (folio_test_dirty(folio))
885	pte = pte_mkdirty(pte);
886	pte = pte_mkwrite(pte, vma);
887	}
888	set_pte_at(vma->vm_mm, address, ptep, pte);
889
890	/*
891	* No need to invalidate - it was non-present before. However
892	* secondary CPUs may have mappings that need invalidating.
893	*/
894	update_mmu_cache(vma, addr: address, ptep);
895	}
896
897	/*
898	* Tries to restore an exclusive pte if the page lock can be acquired without
899	* sleeping.
900	*/
901	static int try_restore_exclusive_pte(struct vm_area_struct *vma,
902	unsigned long addr, pte_t *ptep, pte_t orig_pte)
903	{
904	struct page *page = pfn_swap_entry_to_page(entry: pte_to_swp_entry(pte: orig_pte));
905	struct folio *folio = page_folio(page);
906
907	if (folio_trylock(folio)) {
908	restore_exclusive_pte(vma, folio, page, address: addr, ptep, orig_pte);
909	folio_unlock(folio);
910	return `0`;
911	}
912
913	return -EBUSY;
914	}
915
916	/*
917	* copy one vm_area from one task to the other. Assumes the page tables
918	* already present in the new task to be cleared in the whole range
919	* covered by this vma.
920	*/
921
922	static unsigned long
923	copy_nonpresent_pte(struct mm_struct dst_mm, struct* mm_struct *src_mm,
924	pte_t dst_pte, pte_t src_pte, struct vm_area_struct *dst_vma,
925	struct vm_area_struct src_vma, unsigned* long addr, int *rss)
926	{
927	vm_flags_t vm_flags = dst_vma->vm_flags;
928	pte_t orig_pte = ptep_get(ptep: src_pte);
929	pte_t pte = orig_pte;
930	struct folio *folio;
931	struct page *page;
932	swp_entry_t entry = pte_to_swp_entry(pte: orig_pte);
933
934	if (likely(!non_swap_entry(entry))) {
935	if (swap_duplicate(entry) < `0`)
936	return -EIO;
937
938	/ make sure dst_mm is on swapoff's mmlist. /
939	if (unlikely(list_empty(&dst_mm->mmlist))) {
940	spin_lock(lock: &mmlist_lock);
941	if (list_empty(head: &dst_mm->mmlist))
942	list_add(new: &dst_mm->mmlist,
943	head: &src_mm->mmlist);
944	spin_unlock(lock: &mmlist_lock);
945	}
946	/ Mark the swap entry as shared. /
947	if (pte_swp_exclusive(pte: orig_pte)) {
948	pte = pte_swp_clear_exclusive(pte: orig_pte);
949	set_pte_at(src_mm, addr, src_pte, pte);
950	}
951	rss[MM_SWAPENTS]++;
952	} else if (is_migration_entry(entry)) {
953	folio = pfn_swap_entry_folio(entry);
954
955	rss[mm_counter(folio)]++;
956
957	if (!is_readable_migration_entry(entry) &&
958	is_cow_mapping(flags: vm_flags)) {
959	/*
960	* COW mappings require pages in both parent and child
961	* to be set to read. A previously exclusive entry is
962	* now shared.
963	*/
964	entry = make_readable_migration_entry(
965	offset: swp_offset(entry));
966	pte = swp_entry_to_pte(entry);
967	if (pte_swp_soft_dirty(pte: orig_pte))
968	pte = pte_swp_mksoft_dirty(pte);
969	if (pte_swp_uffd_wp(pte: orig_pte))
970	pte = pte_swp_mkuffd_wp(pte);
971	set_pte_at(src_mm, addr, src_pte, pte);
972	}
973	} else if (is_device_private_entry(entry)) {
974	page = pfn_swap_entry_to_page(entry);
975	folio = page_folio(page);
976
977	/*
978	* Update rss count even for unaddressable pages, as
979	* they should treated just like normal pages in this
980	* respect.
981	*
982	* We will likely want to have some new rss counters
983	* for unaddressable pages, at some point. But for now
984	* keep things as they are.
985	*/
986	folio_get(folio);
987	rss[mm_counter(folio)]++;
988	/ Cannot fail as these pages cannot get pinned. /
989	folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma);
990
991	/*
992	* We do not preserve soft-dirty information, because so
993	* far, checkpoint/restore is the only feature that
994	* requires that. And checkpoint/restore does not work
995	* when a device driver is involved (you cannot easily
996	* save and restore device driver state).
997	*/
998	if (is_writable_device_private_entry(entry) &&
999	is_cow_mapping(flags: vm_flags)) {
1000	entry = make_readable_device_private_entry(
1001	offset: swp_offset(entry));
1002	pte = swp_entry_to_pte(entry);
1003	if (pte_swp_uffd_wp(pte: orig_pte))
1004	pte = pte_swp_mkuffd_wp(pte);
1005	set_pte_at(src_mm, addr, src_pte, pte);
1006	}
1007	} else if (is_device_exclusive_entry(entry)) {
1008	/*
1009	* Make device exclusive entries present by restoring the
1010	* original entry then copying as for a present pte. Device
1011	* exclusive entries currently only support private writable
1012	* (ie. COW) mappings.
1013	*/
1014	VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
1015	if (try_restore_exclusive_pte(vma: src_vma, addr, ptep: src_pte, orig_pte))
1016	return -EBUSY;
1017	return -ENOENT;
1018	} else if (is_pte_marker_entry(entry)) {
1019	pte_marker marker = copy_pte_marker(entry, dst_vma);
1020
1021	if (marker)
1022	set_pte_at(dst_mm, addr, dst_pte,
1023	make_pte_marker(marker));
1024	return `0`;
1025	}
1026	if (!userfaultfd_wp(vma: dst_vma))
1027	pte = pte_swp_clear_uffd_wp(pte);
1028	set_pte_at(dst_mm, addr, dst_pte, pte);
1029	return `0`;
1030	}
1031
1032	/*
1033	* Copy a present and normal page.
1034	*
1035	* NOTE! The usual case is that this isn't required;
1036	* instead, the caller can just increase the page refcount
1037	* and re-use the pte the traditional way.
1038	*
1039	* And if we need a pre-allocated page but don't yet have
1040	* one, return a negative error to let the preallocation
1041	* code know so that it can do so outside the page table
1042	* lock.
1043	*/
1044	static inline int
1045	copy_present_page(struct vm_area_struct dst_vma, struct* vm_area_struct *src_vma,
1046	pte_t dst_pte, pte_t src_pte, unsigned long addr, int *rss,
1047	struct folio prealloc, struct** page *page)
1048	{
1049	struct folio *new_folio;
1050	pte_t pte;
1051
1052	new_folio = *prealloc;
1053	if (!new_folio)
1054	return -EAGAIN;
1055
1056	/*
1057	* We have a prealloc page, all good! Take it
1058	* over and copy the page & arm it.
1059	*/
1060
1061	if (copy_mc_user_highpage(to: &new_folio->page, from: page, vaddr: addr, vma: src_vma))
1062	return -EHWPOISON;
1063
1064	*prealloc = NULL;
1065	__folio_mark_uptodate(folio: new_folio);
1066	folio_add_new_anon_rmap(new_folio, dst_vma, address: addr, RMAP_EXCLUSIVE);
1067	folio_add_lru_vma(new_folio, dst_vma);
1068	rss[MM_ANONPAGES]++;
1069
1070	/ All done, just insert the new page copy in the child /
1071	pte = folio_mk_pte(folio: new_folio, pgprot: dst_vma->vm_page_prot);
1072	pte = maybe_mkwrite(pte: pte_mkdirty(pte), vma: dst_vma);
1073	if (userfaultfd_pte_wp(vma: dst_vma, pte: ptep_get(ptep: src_pte)))
1074	/ Uffd-wp needs to be delivered to dest pte as well /
1075	pte = pte_mkuffd_wp(pte);
1076	set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
1077	return `0`;
1078	}
1079
1080	static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma,
1081	struct vm_area_struct src_vma, pte_t dst_pte, pte_t *src_pte,
1082	pte_t pte, unsigned long addr, int nr)
1083	{
1084	struct mm_struct *src_mm = src_vma->vm_mm;
1085
1086	/ If it's a COW mapping, write protect it both processes. /
1087	if (is_cow_mapping(flags: src_vma->vm_flags) && pte_write(pte)) {
1088	wrprotect_ptes(mm: src_mm, addr, ptep: src_pte, nr);
1089	pte = pte_wrprotect(pte);
1090	}
1091
1092	/ If it's a shared mapping, mark it clean in the child. /
1093	if (src_vma->vm_flags & VM_SHARED)
1094	pte = pte_mkclean(pte);
1095	pte = pte_mkold(pte);
1096
1097	if (!userfaultfd_wp(vma: dst_vma))
1098	pte = pte_clear_uffd_wp(pte);
1099
1100	set_ptes(mm: dst_vma->vm_mm, addr, ptep: dst_pte, pte, nr);
1101	}
1102
1103	/*
1104	* Copy one present PTE, trying to batch-process subsequent PTEs that map
1105	* consecutive pages of the same folio by copying them as well.
1106	*
1107	* Returns -EAGAIN if one preallocated page is required to copy the next PTE.
1108	* Otherwise, returns the number of copied PTEs (at least 1).
1109	*/
1110	static inline int
1111	copy_present_ptes(struct vm_area_struct dst_vma, struct* vm_area_struct *src_vma,
1112	pte_t dst_pte, pte_t src_pte, pte_t pte, unsigned long addr,
1113	int max_nr, int rss, struct* folio **prealloc)
1114	{
1115	fpb_t flags = FPB_MERGE_WRITE;
1116	struct page *page;
1117	struct folio *folio;
1118	int err, nr;
1119
1120	page = vm_normal_page(vma: src_vma, addr, pte);
1121	if (unlikely(!page))
1122	goto copy_pte;
1123
1124	folio = page_folio(page);
1125
1126	/*
1127	* If we likely have to copy, just don't bother with batching. Make
1128	* sure that the common "small folio" case is as fast as possible
1129	* by keeping the batching logic separate.
1130	*/
1131	if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != `1`)) {
1132	if (!(src_vma->vm_flags & VM_SHARED))
1133	flags \|= FPB_RESPECT_DIRTY;
1134	if (vma_soft_dirty_enabled(vma: src_vma))
1135	flags \|= FPB_RESPECT_SOFT_DIRTY;
1136
1137	nr = folio_pte_batch_flags(folio, vma: src_vma, ptep: src_pte, ptentp: &pte, max_nr, flags);
1138	folio_ref_add(folio, nr);
1139	if (folio_test_anon(folio)) {
1140	if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
1141	nr, dst_vma, src_vma))) {
1142	folio_ref_sub(folio, nr);
1143	return -EAGAIN;
1144	}
1145	rss[MM_ANONPAGES] += nr;
1146	VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
1147	} else {
1148	folio_dup_file_rmap_ptes(folio, page, nr_pages: nr, dst_vma);
1149	rss[mm_counter_file(folio)] += nr;
1150	}
1151	__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
1152	addr, nr);
1153	return nr;
1154	}
1155
1156	folio_get(folio);
1157	if (folio_test_anon(folio)) {
1158	/*
1159	* If this page may have been pinned by the parent process,
1160	* copy the page immediately for the child so that we'll always
1161	* guarantee the pinned page won't be randomly replaced in the
1162	* future.
1163	*/
1164	if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma))) {
1165	/ Page may be pinned, we have to copy. /
1166	folio_put(folio);
1167	err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
1168	addr, rss, prealloc, page);
1169	return err ? err : `1`;
1170	}
1171	rss[MM_ANONPAGES]++;
1172	VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
1173	} else {
1174	folio_dup_file_rmap_pte(folio, page, dst_vma);
1175	rss[mm_counter_file(folio)]++;
1176	}
1177
1178	copy_pte:
1179	__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, nr: `1`);
1180	return `1`;
1181	}
1182
1183	static inline struct folio folio_prealloc(struct* mm_struct *src_mm,
1184	struct vm_area_struct vma, unsigned* long addr, bool need_zero)
1185	{
1186	struct folio *new_folio;
1187
1188	if (need_zero)
1189	new_folio = vma_alloc_zeroed_movable_folio(vma, addr);
1190	else
1191	new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, `0`, vma, addr);
1192
1193	if (!new_folio)
1194	return NULL;
1195
1196	if (mem_cgroup_charge(folio: new_folio, mm: src_mm, GFP_KERNEL)) {
1197	folio_put(folio: new_folio);
1198	return NULL;
1199	}
1200	folio_throttle_swaprate(folio: new_folio, GFP_KERNEL);
1201
1202	return new_folio;
1203	}
1204
1205	static int
1206	copy_pte_range(struct vm_area_struct dst_vma, struct* vm_area_struct *src_vma,
1207	pmd_t dst_pmd, pmd_t src_pmd, unsigned long addr,
1208	unsigned long end)
1209	{
1210	struct mm_struct *dst_mm = dst_vma->vm_mm;
1211	struct mm_struct *src_mm = src_vma->vm_mm;
1212	pte_t orig_src_pte, orig_dst_pte;
1213	pte_t src_pte, dst_pte;
1214	pmd_t dummy_pmdval;
1215	pte_t ptent;
1216	spinlock_t src_ptl, dst_ptl;
1217	int progress, max_nr, ret = `0`;
1218	int rss[NR_MM_COUNTERS];
1219	swp_entry_t entry = (swp_entry_t){`0`};
1220	struct folio *prealloc = NULL;
1221	int nr;
1222
1223	again:
1224	progress = `0`;
1225	init_rss_vec(rss);
1226
1227	/*
1228	* copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the
1229	* error handling here, assume that exclusive mmap_lock on dst and src
1230	* protects anon from unexpected THP transitions; with shmem and file
1231	* protected by mmap_lock-less collapse skipping areas with anon_vma
1232	* (whereas vma_needs_copy() skips areas without anon_vma). A rework
1233	* can remove such assumptions later, but this is good enough for now.
1234	*/
1235	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
1236	if (!dst_pte) {
1237	ret = -ENOMEM;
1238	goto out;
1239	}
1240
1241	/*
1242	* We already hold the exclusive mmap_lock, the copy_pte_range() and
1243	* retract_page_tables() are using vma->anon_vma to be exclusive, so
1244	* the PTE page is stable, and there is no need to get pmdval and do
1245	* pmd_same() check.
1246	*/
1247	src_pte = pte_offset_map_rw_nolock(mm: src_mm, pmd: src_pmd, addr, pmdvalp: &dummy_pmdval,
1248	ptlp: &src_ptl);
1249	if (!src_pte) {
1250	pte_unmap_unlock(dst_pte, dst_ptl);
1251	/ ret == 0 /
1252	goto out;
1253	}
1254	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1255	orig_src_pte = src_pte;
1256	orig_dst_pte = dst_pte;
1257	arch_enter_lazy_mmu_mode();
1258
1259	do {
1260	nr = `1`;
1261
1262	/*
1263	* We are holding two locks at this point - either of them
1264	* could generate latencies in another task on another CPU.
1265	*/
1266	if (progress >= `32`) {
1267	progress = `0`;
1268	if (need_resched() \|\|
1269	spin_needbreak(lock: src_ptl) \|\| spin_needbreak(lock: dst_ptl))
1270	break;
1271	}
1272	ptent = ptep_get(ptep: src_pte);
1273	if (pte_none(pte: ptent)) {
1274	progress++;
1275	continue;
1276	}
1277	if (unlikely(!pte_present(ptent))) {
1278	ret = copy_nonpresent_pte(dst_mm, src_mm,
1279	dst_pte, src_pte,
1280	dst_vma, src_vma,
1281	addr, rss);
1282	if (ret == -EIO) {
1283	entry = pte_to_swp_entry(pte: ptep_get(ptep: src_pte));
1284	break;
1285	} else if (ret == -EBUSY) {
1286	break;
1287	} else if (!ret) {
1288	progress += `8`;
1289	continue;
1290	}
1291	ptent = ptep_get(ptep: src_pte);
1292	VM_WARN_ON_ONCE(!pte_present(ptent));
1293
1294	/*
1295	* Device exclusive entry restored, continue by copying
1296	* the now present pte.
1297	*/
1298	WARN_ON_ONCE(ret != -ENOENT);
1299	}
1300	/ copy_present_ptes() will clear `prealloc' if consumed /*
1301	max_nr = (end - addr) / PAGE_SIZE;
1302	ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
1303	pte: ptent, addr, max_nr, rss, prealloc: &prealloc);
1304	/*
1305	* If we need a pre-allocated page for this pte, drop the
1306	* locks, allocate, and try again.
1307	* If copy failed due to hwpoison in source page, break out.
1308	*/
1309	if (unlikely(ret == -EAGAIN \|\| ret == -EHWPOISON))
1310	break;
1311	if (unlikely(prealloc)) {
1312	/*
1313	* pre-alloc page cannot be reused by next time so as
1314	* to strictly follow mempolicy (e.g., alloc_page_vma()
1315	* will allocate page according to address). This
1316	* could only happen if one pinned pte changed.
1317	*/
1318	folio_put(folio: prealloc);
1319	prealloc = NULL;
1320	}
1321	nr = ret;
1322	progress += `8` * nr;
1323	} while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
1324	addr != end);
1325
1326	arch_leave_lazy_mmu_mode();
1327	pte_unmap_unlock(orig_src_pte, src_ptl);
1328	add_mm_rss_vec(mm: dst_mm, rss);
1329	pte_unmap_unlock(orig_dst_pte, dst_ptl);
1330	cond_resched();
1331
1332	if (ret == -EIO) {
1333	VM_WARN_ON_ONCE(!entry.val);
1334	if (add_swap_count_continuation(entry, GFP_KERNEL) < `0`) {
1335	ret = -ENOMEM;
1336	goto out;
1337	}
1338	entry.val = `0`;
1339	} else if (ret == -EBUSY \|\| unlikely(ret == -EHWPOISON)) {
1340	goto out;
1341	} else if (ret == -EAGAIN) {
1342	prealloc = folio_prealloc(src_mm, vma: src_vma, addr, need_zero: false);
1343	if (!prealloc)
1344	return -ENOMEM;
1345	} else if (ret < `0`) {
1346	VM_WARN_ON_ONCE(`1`);
1347	}
1348
1349	/ We've captured and resolved the error. Reset, try again. /
1350	ret = `0`;
1351
1352	if (addr != end)
1353	goto again;
1354	out:
1355	if (unlikely(prealloc))
1356	folio_put(folio: prealloc);
1357	return ret;
1358	}
1359
1360	static inline int
1361	copy_pmd_range(struct vm_area_struct dst_vma, struct* vm_area_struct *src_vma,
1362	pud_t dst_pud, pud_t src_pud, unsigned long addr,
1363	unsigned long end)
1364	{
1365	struct mm_struct *dst_mm = dst_vma->vm_mm;
1366	struct mm_struct *src_mm = src_vma->vm_mm;
1367	pmd_t src_pmd, dst_pmd;
1368	unsigned long next;
1369
1370	dst_pmd = pmd_alloc(mm: dst_mm, pud: dst_pud, address: addr);
1371	if (!dst_pmd)
1372	return -ENOMEM;
1373	src_pmd = pmd_offset(pud: src_pud, address: addr);
1374	do {
1375	next = pmd_addr_end(addr, end);
1376	if (is_swap_pmd(pmd: src_pmd) \|\| pmd_trans_huge(pmd: src_pmd)) {
1377	int err;
1378	VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
1379	err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
1380	addr, dst_vma, src_vma);
1381	if (err == -ENOMEM)
1382	return -ENOMEM;
1383	if (!err)
1384	continue;
1385	/ fall through /
1386	}
1387	if (pmd_none_or_clear_bad(pmd: src_pmd))
1388	continue;
1389	if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
1390	addr, end: next))
1391	return -ENOMEM;
1392	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
1393	return `0`;
1394	}
1395
1396	static inline int
1397	copy_pud_range(struct vm_area_struct dst_vma, struct* vm_area_struct *src_vma,
1398	p4d_t dst_p4d, p4d_t src_p4d, unsigned long addr,
1399	unsigned long end)
1400	{
1401	struct mm_struct *dst_mm = dst_vma->vm_mm;
1402	struct mm_struct *src_mm = src_vma->vm_mm;
1403	pud_t src_pud, dst_pud;
1404	unsigned long next;
1405
1406	dst_pud = pud_alloc(mm: dst_mm, p4d: dst_p4d, address: addr);
1407	if (!dst_pud)
1408	return -ENOMEM;
1409	src_pud = pud_offset(p4d: src_p4d, address: addr);
1410	do {
1411	next = pud_addr_end(addr, end);
1412	if (pud_trans_huge(pud: *src_pud)) {
1413	int err;
1414
1415	VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
1416	err = copy_huge_pud(dst_mm, src_mm,
1417	dst_pud, src_pud, addr, vma: src_vma);
1418	if (err == -ENOMEM)
1419	return -ENOMEM;
1420	if (!err)
1421	continue;
1422	/ fall through /
1423	}
1424	if (pud_none_or_clear_bad(pud: src_pud))
1425	continue;
1426	if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
1427	addr, end: next))
1428	return -ENOMEM;
1429	} while (dst_pud++, src_pud++, addr = next, addr != end);
1430	return `0`;
1431	}
1432
1433	static inline int
1434	copy_p4d_range(struct vm_area_struct dst_vma, struct* vm_area_struct *src_vma,
1435	pgd_t dst_pgd, pgd_t src_pgd, unsigned long addr,
1436	unsigned long end)
1437	{
1438	struct mm_struct *dst_mm = dst_vma->vm_mm;
1439	p4d_t src_p4d, dst_p4d;
1440	unsigned long next;
1441
1442	dst_p4d = p4d_alloc(mm: dst_mm, pgd: dst_pgd, address: addr);
1443	if (!dst_p4d)
1444	return -ENOMEM;
1445	src_p4d = p4d_offset(pgd: src_pgd, address: addr);
1446	do {
1447	next = p4d_addr_end(addr, end);
1448	if (p4d_none_or_clear_bad(p4d: src_p4d))
1449	continue;
1450	if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
1451	addr, end: next))
1452	return -ENOMEM;
1453	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
1454	return `0`;
1455	}
1456
1457	/*
1458	* Return true if the vma needs to copy the pgtable during this fork(). Return
1459	* false when we can speed up fork() by allowing lazy page faults later until
1460	* when the child accesses the memory range.
1461	*/
1462	static bool
1463	vma_needs_copy(struct vm_area_struct dst_vma, struct* vm_area_struct *src_vma)
1464	{
1465	/*
1466	* Always copy pgtables when dst_vma has uffd-wp enabled even if it's
1467	* file-backed (e.g. shmem). Because when uffd-wp is enabled, pgtable
1468	* contains uffd-wp protection information, that's something we can't
1469	* retrieve from page cache, and skip copying will lose those info.
1470	*/
1471	if (userfaultfd_wp(vma: dst_vma))
1472	return true;
1473
1474	if (src_vma->vm_flags & (VM_PFNMAP \| VM_MIXEDMAP))
1475	return true;
1476
1477	if (src_vma->anon_vma)
1478	return true;
1479
1480	/*
1481	* Don't copy ptes where a page fault will fill them correctly. Fork
1482	* becomes much lighter when there are big shared or private readonly
1483	* mappings. The tradeoff is that copy_page_range is more efficient
1484	* than faulting.
1485	*/
1486	return false;
1487	}
1488
1489	int
1490	copy_page_range(struct vm_area_struct dst_vma, struct* vm_area_struct *src_vma)
1491	{
1492	pgd_t src_pgd, dst_pgd;
1493	unsigned long addr = src_vma->vm_start;
1494	unsigned long end = src_vma->vm_end;
1495	struct mm_struct *dst_mm = dst_vma->vm_mm;
1496	struct mm_struct *src_mm = src_vma->vm_mm;
1497	struct mmu_notifier_range range;
1498	unsigned long next;
1499	bool is_cow;
1500	int ret;
1501
1502	if (!vma_needs_copy(dst_vma, src_vma))
1503	return `0`;
1504
1505	if (is_vm_hugetlb_page(vma: src_vma))
1506	return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
1507
1508	/*
1509	* We need to invalidate the secondary MMU mappings only when
1510	* there could be a permission downgrade on the ptes of the
1511	* parent mm. And a permission downgrade will only happen if
1512	* is_cow_mapping() returns true.
1513	*/
1514	is_cow = is_cow_mapping(flags: src_vma->vm_flags);
1515
1516	if (is_cow) {
1517	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_PROTECTION_PAGE,
1518	flags: `0`, mm: src_mm, start: addr, end);
1519	mmu_notifier_invalidate_range_start(range: &range);
1520	/*
1521	* Disabling preemption is not needed for the write side, as
1522	* the read side doesn't spin, but goes to the mmap_lock.
1523	*
1524	* Use the raw variant of the seqcount_t write API to avoid
1525	* lockdep complaining about preemptibility.
1526	*/
1527	vma_assert_write_locked(vma: src_vma);
1528	raw_write_seqcount_begin(&src_mm->write_protect_seq);
1529	}
1530
1531	ret = `0`;
1532	dst_pgd = pgd_offset(dst_mm, addr);
1533	src_pgd = pgd_offset(src_mm, addr);
1534	do {
1535	next = pgd_addr_end(addr, end);
1536	if (pgd_none_or_clear_bad(pgd: src_pgd))
1537	continue;
1538	if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
1539	addr, next))) {
1540	ret = -ENOMEM;
1541	break;
1542	}
1543	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
1544
1545	if (is_cow) {
1546	raw_write_seqcount_end(&src_mm->write_protect_seq);
1547	mmu_notifier_invalidate_range_end(range: &range);
1548	}
1549	return ret;
1550	}
1551
1552	/ Whether we should zap all COWed (private) pages too /
1553	static inline bool should_zap_cows(struct zap_details *details)
1554	{
1555	/ By default, zap all pages /
1556	if (!details \|\| details->reclaim_pt)
1557	return true;
1558
1559	/ Or, we zap COWed pages only if the caller wants to /
1560	return details->even_cows;
1561	}
1562
1563	/ Decides whether we should zap this folio with the folio pointer specified /
1564	static inline bool should_zap_folio(struct zap_details *details,
1565	struct folio *folio)
1566	{
1567	/ If we can make a decision without folio.. /*
1568	if (should_zap_cows(details))
1569	return true;
1570
1571	/ Otherwise we should only zap non-anon folios /
1572	return !folio_test_anon(folio);
1573	}
1574
1575	static inline bool zap_drop_markers(struct zap_details *details)
1576	{
1577	if (!details)
1578	return false;
1579
1580	return details->zap_flags & ZAP_FLAG_DROP_MARKER;
1581	}
1582
1583	/*
1584	* This function makes sure that we'll replace the none pte with an uffd-wp
1585	* swap special pte marker when necessary. Must be with the pgtable lock held.
1586	*
1587	* Returns true if uffd-wp ptes was installed, false otherwise.
1588	*/
1589	static inline bool
1590	zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
1591	unsigned long addr, pte_t pte, int* nr,
1592	struct zap_details *details, pte_t pteval)
1593	{
1594	bool was_installed = false;
1595
1596	#ifdef CONFIG_PTE_MARKER_UFFD_WP
1597	/ Zap on anonymous always means dropping everything /
1598	if (vma_is_anonymous(vma))
1599	return false;
1600
1601	if (zap_drop_markers(details))
1602	return false;
1603
1604	for (;;) {
1605	/ the PFN in the PTE is irrelevant. /
1606	if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval))
1607	was_installed = true;
1608	if (--nr == `0`)
1609	break;
1610	pte++;
1611	addr += PAGE_SIZE;
1612	}
1613	#endif
1614	return was_installed;
1615	}
1616
1617	static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
1618	struct vm_area_struct vma, struct* folio *folio,
1619	struct page page, pte_t pte, pte_t ptent, unsigned int nr,
1620	unsigned long addr, struct zap_details details, int* *rss,
1621	bool force_flush, bool force_break, bool *any_skipped)
1622	{
1623	struct mm_struct *mm = tlb->mm;
1624	bool delay_rmap = false;
1625
1626	if (!folio_test_anon(folio)) {
1627	ptent = get_and_clear_full_ptes(mm, addr, ptep: pte, nr, full: tlb->fullmm);
1628	if (pte_dirty(pte: ptent)) {
1629	folio_mark_dirty(folio);
1630	if (tlb_delay_rmap(tlb)) {
1631	delay_rmap = true;
1632	*force_flush = true;
1633	}
1634	}
1635	if (pte_young(pte: ptent) && likely(vma_has_recency(vma)))
1636	folio_mark_accessed(folio);
1637	rss[mm_counter(folio)] -= nr;
1638	} else {
1639	/ We don't need up-to-date accessed/dirty bits. /
1640	clear_full_ptes(mm, addr, ptep: pte, nr, full: tlb->fullmm);
1641	rss[MM_ANONPAGES] -= nr;
1642	}
1643	/ Checking a single PTE in a batch is sufficient. /
1644	arch_check_zapped_pte(vma, pte: ptent);
1645	tlb_remove_tlb_entries(tlb, ptep: pte, nr, address: addr);
1646	if (unlikely(userfaultfd_pte_wp(vma, ptent)))
1647	*any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte,
1648	nr, details, pteval: ptent);
1649
1650	if (!delay_rmap) {
1651	folio_remove_rmap_ptes(folio, page, nr_pages: nr, vma);
1652
1653	if (unlikely(folio_mapcount(folio) < `0`))
1654	print_bad_pte(vma, addr, ptent, page);
1655	}
1656	if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) {
1657	*force_flush = true;
1658	*force_break = true;
1659	}
1660	}
1661
1662	/*
1663	* Zap or skip at least one present PTE, trying to batch-process subsequent
1664	* PTEs that map consecutive pages of the same folio.
1665	*
1666	* Returns the number of processed (skipped or zapped) PTEs (at least 1).
1667	*/
1668	static inline int zap_present_ptes(struct mmu_gather *tlb,
1669	struct vm_area_struct vma, pte_t pte, pte_t ptent,
1670	unsigned int max_nr, unsigned long addr,
1671	struct zap_details details, int* rss, bool force_flush,
1672	bool force_break, bool any_skipped)
1673	{
1674	struct mm_struct *mm = tlb->mm;
1675	struct folio *folio;
1676	struct page *page;
1677	int nr;
1678
1679	page = vm_normal_page(vma, addr, pte: ptent);
1680	if (!page) {
1681	/ We don't need up-to-date accessed/dirty bits. /
1682	ptep_get_and_clear_full(mm, addr, ptep: pte, full: tlb->fullmm);
1683	arch_check_zapped_pte(vma, pte: ptent);
1684	tlb_remove_tlb_entry(tlb, pte, addr);
1685	if (userfaultfd_pte_wp(vma, pte: ptent))
1686	*any_skipped = zap_install_uffd_wp_if_needed(vma, addr,
1687	pte, nr: `1`, details, pteval: ptent);
1688	ksm_might_unmap_zero_page(mm, pte: ptent);
1689	return `1`;
1690	}
1691
1692	folio = page_folio(page);
1693	if (unlikely(!should_zap_folio(details, folio))) {
1694	*any_skipped = true;
1695	return `1`;
1696	}
1697
1698	/*
1699	* Make sure that the common "small folio" case is as fast as possible
1700	* by keeping the batching logic separate.
1701	*/
1702	if (unlikely(folio_test_large(folio) && max_nr != `1`)) {
1703	nr = folio_pte_batch(folio, ptep: pte, pte: ptent, max_nr);
1704	zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
1705	addr, details, rss, force_flush,
1706	force_break, any_skipped);
1707	return nr;
1708	}
1709	zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr: `1`, addr,
1710	details, rss, force_flush, force_break, any_skipped);
1711	return `1`;
1712	}
1713
1714	static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
1715	struct vm_area_struct vma, pte_t pte, pte_t ptent,
1716	unsigned int max_nr, unsigned long addr,
1717	struct zap_details details, int* rss, bool any_skipped)
1718	{
1719	swp_entry_t entry;
1720	int nr = `1`;
1721
1722	*any_skipped = true;
1723	entry = pte_to_swp_entry(pte: ptent);
1724	if (is_device_private_entry(entry) \|\|
1725	is_device_exclusive_entry(entry)) {
1726	struct page *page = pfn_swap_entry_to_page(entry);
1727	struct folio *folio = page_folio(page);
1728
1729	if (unlikely(!should_zap_folio(details, folio)))
1730	return `1`;
1731	/*
1732	* Both device private/exclusive mappings should only
1733	* work with anonymous page so far, so we don't need to
1734	* consider uffd-wp bit when zap. For more information,
1735	* see zap_install_uffd_wp_if_needed().
1736	*/
1737	WARN_ON_ONCE(!vma_is_anonymous(vma));
1738	rss[mm_counter(folio)]--;
1739	folio_remove_rmap_pte(folio, page, vma);
1740	folio_put(folio);
1741	} else if (!non_swap_entry(entry)) {
1742	/ Genuine swap entries, hence a private anon pages /
1743	if (!should_zap_cows(details))
1744	return `1`;
1745
1746	nr = swap_pte_batch(start_ptep: pte, max_nr, pte: ptent);
1747	rss[MM_SWAPENTS] -= nr;
1748	free_swap_and_cache_nr(entry, nr);
1749	} else if (is_migration_entry(entry)) {
1750	struct folio *folio = pfn_swap_entry_folio(entry);
1751
1752	if (!should_zap_folio(details, folio))
1753	return `1`;
1754	rss[mm_counter(folio)]--;
1755	} else if (pte_marker_entry_uffd_wp(entry)) {
1756	/*
1757	* For anon: always drop the marker; for file: only
1758	* drop the marker if explicitly requested.
1759	*/
1760	if (!vma_is_anonymous(vma) && !zap_drop_markers(details))
1761	return `1`;
1762	} else if (is_guard_swp_entry(entry)) {
1763	/*
1764	* Ordinary zapping should not remove guard PTE
1765	* markers. Only do so if we should remove PTE markers
1766	* in general.
1767	*/
1768	if (!zap_drop_markers(details))
1769	return `1`;
1770	} else if (is_hwpoison_entry(swp: entry) \|\| is_poisoned_swp_entry(entry)) {
1771	if (!should_zap_cows(details))
1772	return `1`;
1773	} else {
1774	/ We should have covered all the swap entry types /
1775	pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
1776	WARN_ON_ONCE(`1`);
1777	}
1778	clear_not_present_full_ptes(mm: vma->vm_mm, addr, ptep: pte, nr, full: tlb->fullmm);
1779	*any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, pteval: ptent);
1780
1781	return nr;
1782	}
1783
1784	static inline int do_zap_pte_range(struct mmu_gather *tlb,
1785	struct vm_area_struct vma, pte_t pte,
1786	unsigned long addr, unsigned long end,
1787	struct zap_details details, int* *rss,
1788	bool force_flush, bool force_break,
1789	bool *any_skipped)
1790	{
1791	pte_t ptent = ptep_get(ptep: pte);
1792	int max_nr = (end - addr) / PAGE_SIZE;
1793	int nr = `0`;
1794
1795	/ Skip all consecutive none ptes /
1796	if (pte_none(pte: ptent)) {
1797	for (nr = `1`; nr < max_nr; nr++) {
1798	ptent = ptep_get(ptep: pte + nr);
1799	if (!pte_none(pte: ptent))
1800	break;
1801	}
1802	max_nr -= nr;
1803	if (!max_nr)
1804	return nr;
1805	pte += nr;
1806	addr += nr * PAGE_SIZE;
1807	}
1808
1809	if (pte_present(a: ptent))
1810	nr += zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr,
1811	details, rss, force_flush, force_break,
1812	any_skipped);
1813	else
1814	nr += zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr,
1815	details, rss, any_skipped);
1816
1817	return nr;
1818	}
1819
1820	static unsigned long zap_pte_range(struct mmu_gather *tlb,
1821	struct vm_area_struct vma, pmd_t pmd,
1822	unsigned long addr, unsigned long end,
1823	struct zap_details *details)
1824	{
1825	bool force_flush = false, force_break = false;
1826	struct mm_struct *mm = tlb->mm;
1827	int rss[NR_MM_COUNTERS];
1828	spinlock_t *ptl;
1829	pte_t *start_pte;
1830	pte_t *pte;
1831	pmd_t pmdval;
1832	unsigned long start = addr;
1833	bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details);
1834	bool direct_reclaim = true;
1835	int nr;
1836
1837	retry:
1838	tlb_change_page_size(tlb, PAGE_SIZE);
1839	init_rss_vec(rss);
1840	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, ptlp: &ptl);
1841	if (!pte)
1842	return addr;
1843
1844	flush_tlb_batched_pending(mm);
1845	arch_enter_lazy_mmu_mode();
1846	do {
1847	bool any_skipped = false;
1848
1849	if (need_resched()) {
1850	direct_reclaim = false;
1851	break;
1852	}
1853
1854	nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss,
1855	force_flush: &force_flush, force_break: &force_break, any_skipped: &any_skipped);
1856	if (any_skipped)
1857	can_reclaim_pt = false;
1858	if (unlikely(force_break)) {
1859	addr += nr * PAGE_SIZE;
1860	direct_reclaim = false;
1861	break;
1862	}
1863	} while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
1864
1865	/*
1866	* Fast path: try to hold the pmd lock and unmap the PTE page.
1867	*
1868	* If the pte lock was released midway (retry case), or if the attempt
1869	* to hold the pmd lock failed, then we need to recheck all pte entries
1870	* to ensure they are still none, thereby preventing the pte entries
1871	* from being repopulated by another thread.
1872	*/
1873	if (can_reclaim_pt && direct_reclaim && addr == end)
1874	direct_reclaim = try_get_and_clear_pmd(mm, pmd, pmdval: &pmdval);
1875
1876	add_mm_rss_vec(mm, rss);
1877	arch_leave_lazy_mmu_mode();
1878
1879	/ Do the actual TLB flush before dropping ptl /
1880	if (force_flush) {
1881	tlb_flush_mmu_tlbonly(tlb);
1882	tlb_flush_rmaps(tlb, vma);
1883	}
1884	pte_unmap_unlock(start_pte, ptl);
1885
1886	/*
1887	* If we forced a TLB flush (either due to running out of
1888	* batch buffers or because we needed to flush dirty TLB
1889	* entries before releasing the ptl), free the batched
1890	* memory too. Come back again if we didn't do everything.
1891	*/
1892	if (force_flush)
1893	tlb_flush_mmu(tlb);
1894
1895	if (addr != end) {
1896	cond_resched();
1897	force_flush = false;
1898	force_break = false;
1899	goto retry;
1900	}
1901
1902	if (can_reclaim_pt) {
1903	if (direct_reclaim)
1904	free_pte(mm, addr: start, tlb, pmdval);
1905	else
1906	try_to_free_pte(mm, pmd, addr: start, tlb);
1907	}
1908
1909	return addr;
1910	}
1911
1912	static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1913	struct vm_area_struct vma, pud_t pud,
1914	unsigned long addr, unsigned long end,
1915	struct zap_details *details)
1916	{
1917	pmd_t *pmd;
1918	unsigned long next;
1919
1920	pmd = pmd_offset(pud, address: addr);
1921	do {
1922	next = pmd_addr_end(addr, end);
1923	if (is_swap_pmd(pmd: pmd) \|\| pmd_trans_huge(pmd: pmd)) {
1924	if (next - addr != HPAGE_PMD_SIZE)
1925	__split_huge_pmd(vma, pmd, address: addr, freeze: false);
1926	else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
1927	addr = next;
1928	continue;
1929	}
1930	/ fall through /
1931	} else if (details && details->single_folio &&
1932	folio_test_pmd_mappable(folio: details->single_folio) &&
1933	next - addr == HPAGE_PMD_SIZE && pmd_none(pmd: *pmd)) {
1934	spinlock_t *ptl = pmd_lock(mm: tlb->mm, pmd);
1935	/*
1936	* Take and drop THP pmd lock so that we cannot return
1937	* prematurely, while zap_huge_pmd() has cleared *pmd,
1938	* but not yet decremented compound_mapcount().
1939	*/
1940	spin_unlock(lock: ptl);
1941	}
1942	if (pmd_none(pmd: *pmd)) {
1943	addr = next;
1944	continue;
1945	}
1946	addr = zap_pte_range(tlb, vma, pmd, addr, end: next, details);
1947	if (addr != next)
1948	pmd--;
1949	} while (pmd++, cond_resched(), addr != end);
1950
1951	return addr;
1952	}
1953
1954	static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1955	struct vm_area_struct vma, p4d_t p4d,
1956	unsigned long addr, unsigned long end,
1957	struct zap_details *details)
1958	{
1959	pud_t *pud;
1960	unsigned long next;
1961
1962	pud = pud_offset(p4d, address: addr);
1963	do {
1964	next = pud_addr_end(addr, end);
1965	if (pud_trans_huge(pud: *pud)) {
1966	if (next - addr != HPAGE_PUD_SIZE) {
1967	mmap_assert_locked(mm: tlb->mm);
1968	split_huge_pud(vma, pud, addr);
1969	} else if (zap_huge_pud(tlb, vma, pud, addr))
1970	goto next;
1971	/ fall through /
1972	}
1973	if (pud_none_or_clear_bad(pud))
1974	continue;
1975	next = zap_pmd_range(tlb, vma, pud, addr, end: next, details);
1976	next:
1977	cond_resched();
1978	} while (pud++, addr = next, addr != end);
1979
1980	return addr;
1981	}
1982
1983	static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
1984	struct vm_area_struct vma, pgd_t pgd,
1985	unsigned long addr, unsigned long end,
1986	struct zap_details *details)
1987	{
1988	p4d_t *p4d;
1989	unsigned long next;
1990
1991	p4d = p4d_offset(pgd, address: addr);
1992	do {
1993	next = p4d_addr_end(addr, end);
1994	if (p4d_none_or_clear_bad(p4d))
1995	continue;
1996	next = zap_pud_range(tlb, vma, p4d, addr, end: next, details);
1997	} while (p4d++, addr = next, addr != end);
1998
1999	return addr;
2000	}
2001
2002	void unmap_page_range(struct mmu_gather *tlb,
2003	struct vm_area_struct *vma,
2004	unsigned long addr, unsigned long end,
2005	struct zap_details *details)
2006	{
2007	pgd_t *pgd;
2008	unsigned long next;
2009
2010	BUG_ON(addr >= end);
2011	tlb_start_vma(tlb, vma);
2012	pgd = pgd_offset(vma->vm_mm, addr);
2013	do {
2014	next = pgd_addr_end(addr, end);
2015	if (pgd_none_or_clear_bad(pgd))
2016	continue;
2017	next = zap_p4d_range(tlb, vma, pgd, addr, end: next, details);
2018	} while (pgd++, addr = next, addr != end);
2019	tlb_end_vma(tlb, vma);
2020	}
2021
2022
2023	static void unmap_single_vma(struct mmu_gather *tlb,
2024	struct vm_area_struct vma, unsigned* long start_addr,
2025	unsigned long end_addr,
2026	struct zap_details *details, bool mm_wr_locked)
2027	{
2028	unsigned long start = max(vma->vm_start, start_addr);
2029	unsigned long end;
2030
2031	if (start >= vma->vm_end)
2032	return;
2033	end = min(vma->vm_end, end_addr);
2034	if (end <= vma->vm_start)
2035	return;
2036
2037	if (vma->vm_file)
2038	uprobe_munmap(vma, start, end);
2039
2040	if (start != end) {
2041	if (unlikely(is_vm_hugetlb_page(vma))) {
2042	/*
2043	* It is undesirable to test vma->vm_file as it
2044	* should be non-null for valid hugetlb area.
2045	* However, vm_file will be NULL in the error
2046	* cleanup path of mmap_region. When
2047	* hugetlbfs ->mmap method fails,
2048	* mmap_region() nullifies vma->vm_file
2049	* before calling this function to clean up.
2050	* Since no pte has actually been setup, it is
2051	* safe to do nothing in this case.
2052	*/
2053	if (vma->vm_file) {
2054	zap_flags_t zap_flags = details ?
2055	details->zap_flags : `0`;
2056	__unmap_hugepage_range(tlb, vma, start, end,
2057	NULL, zap_flags);
2058	}
2059	} else
2060	unmap_page_range(tlb, vma, addr: start, end, details);
2061	}
2062	}
2063
2064	/**
2065	* unmap_vmas - unmap a range of memory covered by a list of vma's
2066	* @tlb: address of the caller's struct mmu_gather
2067	* @mas: the maple state
2068	* @vma: the starting vma
2069	* @start_addr: virtual address at which to start unmapping
2070	* @end_addr: virtual address at which to end unmapping
2071	* @tree_end: The maximum index to check
2072	* @mm_wr_locked: lock flag
2073	*
2074	* Unmap all pages in the vma list.
2075	*
2076	* Only addresses between `start' and `end' will be unmapped.
2077	*
2078	* The VMA list must be sorted in ascending virtual address order.
2079	*
2080	* unmap_vmas() assumes that the caller will flush the whole unmapped address
2081	* range after unmap_vmas() returns. So the only responsibility here is to
2082	* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
2083	* drops the lock and schedules.
2084	*/
2085	void unmap_vmas(struct mmu_gather tlb, struct* ma_state *mas,
2086	struct vm_area_struct vma, unsigned* long start_addr,
2087	unsigned long end_addr, unsigned long tree_end,
2088	bool mm_wr_locked)
2089	{
2090	struct mmu_notifier_range range;
2091	struct zap_details details = {
2092	.zap_flags = ZAP_FLAG_DROP_MARKER \| ZAP_FLAG_UNMAP,
2093	/ Careful - we need to zap private pages too! /
2094	.even_cows = true,
2095	};
2096
2097	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_UNMAP, flags: `0`, mm: vma->vm_mm,
2098	start: start_addr, end: end_addr);
2099	mmu_notifier_invalidate_range_start(range: &range);
2100	do {
2101	unsigned long start = start_addr;
2102	unsigned long end = end_addr;
2103	hugetlb_zap_begin(vma, start: &start, end: &end);
2104	unmap_single_vma(tlb, vma, start_addr: start, end_addr: end, details: &details,
2105	mm_wr_locked);
2106	hugetlb_zap_end(vma, details: &details);
2107	vma = mas_find(mas, max: tree_end - `1`);
2108	} while (vma && likely(!xa_is_zero(vma)));
2109	mmu_notifier_invalidate_range_end(range: &range);
2110	}
2111
2112	/**
2113	* zap_page_range_single_batched - remove user pages in a given range
2114	* @tlb: pointer to the caller's struct mmu_gather
2115	* @vma: vm_area_struct holding the applicable pages
2116	* @address: starting address of pages to remove
2117	* @size: number of bytes to remove
2118	* @details: details of shared cache invalidation
2119	*
2120	* @tlb shouldn't be NULL. The range must fit into one VMA. If @vma is for
2121	* hugetlb, @tlb is flushed and re-initialized by this function.
2122	*/
2123	void zap_page_range_single_batched(struct mmu_gather *tlb,
2124	struct vm_area_struct vma, unsigned* long address,
2125	unsigned long size, struct zap_details *details)
2126	{
2127	const unsigned long end = address + size;
2128	struct mmu_notifier_range range;
2129
2130	VM_WARN_ON_ONCE(!tlb \|\| tlb->mm != vma->vm_mm);
2131
2132	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm: vma->vm_mm,
2133	start: address, end);
2134	hugetlb_zap_begin(vma, start: &range.start, end: &range.end);
2135	update_hiwater_rss(mm: vma->vm_mm);
2136	mmu_notifier_invalidate_range_start(range: &range);
2137	/*
2138	* unmap 'address-end' not 'range.start-range.end' as range
2139	* could have been expanded for hugetlb pmd sharing.
2140	*/
2141	unmap_single_vma(tlb, vma, start_addr: address, end_addr: end, details, mm_wr_locked: false);
2142	mmu_notifier_invalidate_range_end(range: &range);
2143	if (is_vm_hugetlb_page(vma)) {
2144	/*
2145	* flush tlb and free resources before hugetlb_zap_end(), to
2146	* avoid concurrent page faults' allocation failure.
2147	*/
2148	tlb_finish_mmu(tlb);
2149	hugetlb_zap_end(vma, details);
2150	tlb_gather_mmu(tlb, mm: vma->vm_mm);
2151	}
2152	}
2153
2154	/**
2155	* zap_page_range_single - remove user pages in a given range
2156	* @vma: vm_area_struct holding the applicable pages
2157	* @address: starting address of pages to zap
2158	* @size: number of bytes to zap
2159	* @details: details of shared cache invalidation
2160	*
2161	* The range must fit into one VMA.
2162	*/
2163	void zap_page_range_single(struct vm_area_struct vma, unsigned* long address,
2164	unsigned long size, struct zap_details *details)
2165	{
2166	struct mmu_gather tlb;
2167
2168	tlb_gather_mmu(tlb: &tlb, mm: vma->vm_mm);
2169	zap_page_range_single_batched(tlb: &tlb, vma, address, size, details);
2170	tlb_finish_mmu(tlb: &tlb);
2171	}
2172
2173	/**
2174	* zap_vma_ptes - remove ptes mapping the vma
2175	* @vma: vm_area_struct holding ptes to be zapped
2176	* @address: starting address of pages to zap
2177	* @size: number of bytes to zap
2178	*
2179	* This function only unmaps ptes assigned to VM_PFNMAP vmas.
2180	*
2181	* The entire address range must be fully contained within the vma.
2182	*
2183	*/
2184	void zap_vma_ptes(struct vm_area_struct vma, unsigned* long address,
2185	unsigned long size)
2186	{
2187	if (!range_in_vma(vma, start: address, end: address + size) \|\|
2188	!(vma->vm_flags & VM_PFNMAP))
2189	return;
2190
2191	zap_page_range_single(vma, address, size, NULL);
2192	}
2193	EXPORT_SYMBOL_GPL(zap_vma_ptes);
2194
2195	static pmd_t walk_to_pmd(struct* mm_struct mm, unsigned* long addr)
2196	{
2197	pgd_t *pgd;
2198	p4d_t *p4d;
2199	pud_t *pud;
2200	pmd_t *pmd;
2201
2202	pgd = pgd_offset(mm, addr);
2203	p4d = p4d_alloc(mm, pgd, address: addr);
2204	if (!p4d)
2205	return NULL;
2206	pud = pud_alloc(mm, p4d, address: addr);
2207	if (!pud)
2208	return NULL;
2209	pmd = pmd_alloc(mm, pud, address: addr);
2210	if (!pmd)
2211	return NULL;
2212
2213	VM_BUG_ON(pmd_trans_huge(*pmd));
2214	return pmd;
2215	}
2216
2217	pte_t __get_locked_pte(struct* mm_struct mm, unsigned* long addr,
2218	spinlock_t **ptl)
2219	{
2220	pmd_t *pmd = walk_to_pmd(mm, addr);
2221
2222	if (!pmd)
2223	return NULL;
2224	return pte_alloc_map_lock(mm, pmd, addr, ptl);
2225	}
2226
2227	static bool vm_mixed_zeropage_allowed(struct vm_area_struct *vma)
2228	{
2229	VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP);
2230	/*
2231	* Whoever wants to forbid the zeropage after some zeropages
2232	* might already have been mapped has to scan the page tables and
2233	* bail out on any zeropages. Zeropages in COW mappings can
2234	* be unshared using FAULT_FLAG_UNSHARE faults.
2235	*/
2236	if (mm_forbids_zeropage(vma->vm_mm))
2237	return false;
2238	/ zeropages in COW mappings are common and unproblematic. /
2239	if (is_cow_mapping(flags: vma->vm_flags))
2240	return true;
2241	/ Mappings that do not allow for writable PTEs are unproblematic. /
2242	if (!(vma->vm_flags & (VM_WRITE \| VM_MAYWRITE)))
2243	return true;
2244	/*
2245	* Why not allow any VMA that has vm_ops->pfn_mkwrite? GUP could
2246	* find the shared zeropage and longterm-pin it, which would
2247	* be problematic as soon as the zeropage gets replaced by a different
2248	* page due to vma->vm_ops->pfn_mkwrite, because what's mapped would
2249	* now differ to what GUP looked up. FSDAX is incompatible to
2250	* FOLL_LONGTERM and VM_IO is incompatible to GUP completely (see
2251	* check_vma_flags).
2252	*/
2253	return vma->vm_ops && vma->vm_ops->pfn_mkwrite &&
2254	(vma_is_fsdax(vma) \|\| vma->vm_flags & VM_IO);
2255	}
2256
2257	static int validate_page_before_insert(struct vm_area_struct *vma,
2258	struct page *page)
2259	{
2260	struct folio *folio = page_folio(page);
2261
2262	if (!folio_ref_count(folio))
2263	return -EINVAL;
2264	if (unlikely(is_zero_folio(folio))) {
2265	if (!vm_mixed_zeropage_allowed(vma))
2266	return -EINVAL;
2267	return `0`;
2268	}
2269	if (folio_test_anon(folio) \|\| page_has_type(page))
2270	return -EINVAL;
2271	flush_dcache_folio(folio);
2272	return `0`;
2273	}
2274
2275	static int insert_page_into_pte_locked(struct vm_area_struct vma, pte_t pte,
2276	unsigned long addr, struct page *page,
2277	pgprot_t prot, bool mkwrite)
2278	{
2279	struct folio *folio = page_folio(page);
2280	pte_t pteval = ptep_get(ptep: pte);
2281
2282	if (!pte_none(pte: pteval)) {
2283	if (!mkwrite)
2284	return -EBUSY;
2285
2286	/ see insert_pfn(). /
2287	if (pte_pfn(pte: pteval) != page_to_pfn(page)) {
2288	WARN_ON_ONCE(!is_zero_pfn(pte_pfn(pteval)));
2289	return -EFAULT;
2290	}
2291	pteval = maybe_mkwrite(pte: pteval, vma);
2292	pteval = pte_mkyoung(pte: pteval);
2293	if (ptep_set_access_flags(vma, address: addr, ptep: pte, entry: pteval, dirty: `1`))
2294	update_mmu_cache(vma, addr, ptep: pte);
2295	return `0`;
2296	}
2297
2298	/ Ok, finally just insert the thing.. /
2299	pteval = mk_pte(page, pgprot: prot);
2300	if (unlikely(is_zero_folio(folio))) {
2301	pteval = pte_mkspecial(pte: pteval);
2302	} else {
2303	folio_get(folio);
2304	pteval = mk_pte(page, pgprot: prot);
2305	if (mkwrite) {
2306	pteval = pte_mkyoung(pte: pteval);
2307	pteval = maybe_mkwrite(pte: pte_mkdirty(pte: pteval), vma);
2308	}
2309	inc_mm_counter(mm: vma->vm_mm, member: mm_counter_file(folio));
2310	folio_add_file_rmap_pte(folio, page, vma);
2311	}
2312	set_pte_at(vma->vm_mm, addr, pte, pteval);
2313	return `0`;
2314	}
2315
2316	static int insert_page(struct vm_area_struct vma, unsigned* long addr,
2317	struct page *page, pgprot_t prot, bool mkwrite)
2318	{
2319	int retval;
2320	pte_t *pte;
2321	spinlock_t *ptl;
2322
2323	retval = validate_page_before_insert(vma, page);
2324	if (retval)
2325	goto out;
2326	retval = -ENOMEM;
2327	pte = get_locked_pte(mm: vma->vm_mm, addr, ptl: &ptl);
2328	if (!pte)
2329	goto out;
2330	retval = insert_page_into_pte_locked(vma, pte, addr, page, prot,
2331	mkwrite);
2332	pte_unmap_unlock(pte, ptl);
2333	out:
2334	return retval;
2335	}
2336
2337	static int insert_page_in_batch_locked(struct vm_area_struct vma, pte_t pte,
2338	unsigned long addr, struct page *page, pgprot_t prot)
2339	{
2340	int err;
2341
2342	err = validate_page_before_insert(vma, page);
2343	if (err)
2344	return err;
2345	return insert_page_into_pte_locked(vma, pte, addr, page, prot, mkwrite: false);
2346	}
2347
2348	/ insert_pages() amortizes the cost of spinlock operations*
2349	* when inserting pages in a loop.
2350	*/
2351	static int insert_pages(struct vm_area_struct vma, unsigned* long addr,
2352	struct page *pages, unsigned* long *num, pgprot_t prot)
2353	{
2354	pmd_t *pmd = NULL;
2355	pte_t start_pte, pte;
2356	spinlock_t *pte_lock;
2357	struct mm_struct *const mm = vma->vm_mm;
2358	unsigned long curr_page_idx = `0`;
2359	unsigned long remaining_pages_total = *num;
2360	unsigned long pages_to_write_in_pmd;
2361	int ret;
2362	more:
2363	ret = -EFAULT;
2364	pmd = walk_to_pmd(mm, addr);
2365	if (!pmd)
2366	goto out;
2367
2368	pages_to_write_in_pmd = min_t(unsigned long,
2369	remaining_pages_total, PTRS_PER_PTE - pte_index(addr));
2370
2371	/ Allocate the PTE if necessary; takes PMD lock once only. /
2372	ret = -ENOMEM;
2373	if (pte_alloc(mm, pmd))
2374	goto out;
2375
2376	while (pages_to_write_in_pmd) {
2377	int pte_idx = `0`;
2378	const int batch_size = min_t(int, pages_to_write_in_pmd, `8`);
2379
2380	start_pte = pte_offset_map_lock(mm, pmd, addr, ptlp: &pte_lock);
2381	if (!start_pte) {
2382	ret = -EFAULT;
2383	goto out;
2384	}
2385	for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
2386	int err = insert_page_in_batch_locked(vma, pte,
2387	addr, page: pages[curr_page_idx], prot);
2388	if (unlikely(err)) {
2389	pte_unmap_unlock(start_pte, pte_lock);
2390	ret = err;
2391	remaining_pages_total -= pte_idx;
2392	goto out;
2393	}
2394	addr += PAGE_SIZE;
2395	++curr_page_idx;
2396	}
2397	pte_unmap_unlock(start_pte, pte_lock);
2398	pages_to_write_in_pmd -= batch_size;
2399	remaining_pages_total -= batch_size;
2400	}
2401	if (remaining_pages_total)
2402	goto more;
2403	ret = `0`;
2404	out:
2405	*num = remaining_pages_total;
2406	return ret;
2407	}
2408
2409	/**
2410	* vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
2411	* @vma: user vma to map to
2412	* @addr: target start user address of these pages
2413	* @pages: source kernel pages
2414	* @num: in: number of pages to map. out: number of pages that were not
2415	* mapped. (0 means all pages were successfully mapped).
2416	*
2417	* Preferred over vm_insert_page() when inserting multiple pages.
2418	*
2419	* In case of error, we may have mapped a subset of the provided
2420	* pages. It is the caller's responsibility to account for this case.
2421	*
2422	* The same restrictions apply as in vm_insert_page().
2423	*/
2424	int vm_insert_pages(struct vm_area_struct vma, unsigned* long addr,
2425	struct page *pages, unsigned* long *num)
2426	{
2427	const unsigned long end_addr = addr + (num PAGE_SIZE) - `1`;
2428
2429	if (addr < vma->vm_start \|\| end_addr >= vma->vm_end)
2430	return -EFAULT;
2431	if (!(vma->vm_flags & VM_MIXEDMAP)) {
2432	BUG_ON(mmap_read_trylock(vma->vm_mm));
2433	BUG_ON(vma->vm_flags & VM_PFNMAP);
2434	vm_flags_set(vma, VM_MIXEDMAP);
2435	}
2436	/ Defer page refcount checking till we're about to map that page. /
2437	return insert_pages(vma, addr, pages, num, prot: vma->vm_page_prot);
2438	}
2439	EXPORT_SYMBOL(vm_insert_pages);
2440
2441	/**
2442	* vm_insert_page - insert single page into user vma
2443	* @vma: user vma to map to
2444	* @addr: target user address of this page
2445	* @page: source kernel page
2446	*
2447	* This allows drivers to insert individual pages they've allocated
2448	* into a user vma. The zeropage is supported in some VMAs,
2449	* see vm_mixed_zeropage_allowed().
2450	*
2451	* The page has to be a nice clean _individual_ kernel allocation.
2452	* If you allocate a compound page, you need to have marked it as
2453	* such (__GFP_COMP), or manually just split the page up yourself
2454	* (see split_page()).
2455	*
2456	* NOTE! Traditionally this was done with "remap_pfn_range()" which
2457	* took an arbitrary page protection parameter. This doesn't allow
2458	* that. Your vma protection will have to be set up correctly, which
2459	* means that if you want a shared writable mapping, you'd better
2460	* ask for a shared writable mapping!
2461	*
2462	* The page does not need to be reserved.
2463	*
2464	* Usually this function is called from f_op->mmap() handler
2465	* under mm->mmap_lock write-lock, so it can change vma->vm_flags.
2466	* Caller must set VM_MIXEDMAP on vma if it wants to call this
2467	* function from other places, for example from page-fault handler.
2468	*
2469	* Return: %0 on success, negative error code otherwise.
2470	*/
2471	int vm_insert_page(struct vm_area_struct vma, unsigned* long addr,
2472	struct page *page)
2473	{
2474	if (addr < vma->vm_start \|\| addr >= vma->vm_end)
2475	return -EFAULT;
2476	if (!(vma->vm_flags & VM_MIXEDMAP)) {
2477	BUG_ON(mmap_read_trylock(vma->vm_mm));
2478	BUG_ON(vma->vm_flags & VM_PFNMAP);
2479	vm_flags_set(vma, VM_MIXEDMAP);
2480	}
2481	return insert_page(vma, addr, page, prot: vma->vm_page_prot, mkwrite: false);
2482	}
2483	EXPORT_SYMBOL(vm_insert_page);
2484
2485	/*
2486	* __vm_map_pages - maps range of kernel pages into user vma
2487	* @vma: user vma to map to
2488	* @pages: pointer to array of source kernel pages
2489	* @num: number of pages in page array
2490	* @offset: user's requested vm_pgoff
2491	*
2492	* This allows drivers to map range of kernel pages into a user vma.
2493	* The zeropage is supported in some VMAs, see
2494	* vm_mixed_zeropage_allowed().
2495	*
2496	* Return: 0 on success and error code otherwise.
2497	*/
2498	static int __vm_map_pages(struct vm_area_struct vma, struct* page **pages,
2499	unsigned long num, unsigned long offset)
2500	{
2501	unsigned long count = vma_pages(vma);
2502	unsigned long uaddr = vma->vm_start;
2503	int ret, i;
2504
2505	/ Fail if the user requested offset is beyond the end of the object /
2506	if (offset >= num)
2507	return -ENXIO;
2508
2509	/ Fail if the user requested size exceeds available object size /
2510	if (count > num - offset)
2511	return -ENXIO;
2512
2513	for (i = `0`; i < count; i++) {
2514	ret = vm_insert_page(vma, uaddr, pages[offset + i]);
2515	if (ret < `0`)
2516	return ret;
2517	uaddr += PAGE_SIZE;
2518	}
2519
2520	return `0`;
2521	}
2522
2523	/**
2524	* vm_map_pages - maps range of kernel pages starts with non zero offset
2525	* @vma: user vma to map to
2526	* @pages: pointer to array of source kernel pages
2527	* @num: number of pages in page array
2528	*
2529	* Maps an object consisting of @num pages, catering for the user's
2530	* requested vm_pgoff
2531	*
2532	* If we fail to insert any page into the vma, the function will return
2533	* immediately leaving any previously inserted pages present. Callers
2534	* from the mmap handler may immediately return the error as their caller
2535	* will destroy the vma, removing any successfully inserted pages. Other
2536	* callers should make their own arrangements for calling unmap_region().
2537	*
2538	* Context: Process context. Called by mmap handlers.
2539	* Return: 0 on success and error code otherwise.
2540	*/
2541	int vm_map_pages(struct vm_area_struct vma, struct* page **pages,
2542	unsigned long num)
2543	{
2544	return __vm_map_pages(vma, pages, num, offset: vma->vm_pgoff);
2545	}
2546	EXPORT_SYMBOL(vm_map_pages);
2547
2548	/**
2549	* vm_map_pages_zero - map range of kernel pages starts with zero offset
2550	* @vma: user vma to map to
2551	* @pages: pointer to array of source kernel pages
2552	* @num: number of pages in page array
2553	*
2554	* Similar to vm_map_pages(), except that it explicitly sets the offset
2555	* to 0. This function is intended for the drivers that did not consider
2556	* vm_pgoff.
2557	*
2558	* Context: Process context. Called by mmap handlers.
2559	* Return: 0 on success and error code otherwise.
2560	*/
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
				unsigned long num)
{
	/* Ignore vma->vm_pgoff: always start at the first page. */
	return __vm_map_pages(vma, pages, num, 0);
}
EXPORT_SYMBOL(vm_map_pages_zero);
2567
2568	static vm_fault_t insert_pfn(struct vm_area_struct vma, unsigned* long addr,
2569	unsigned long pfn, pgprot_t prot, bool mkwrite)
2570	{
2571	struct mm_struct *mm = vma->vm_mm;
2572	pte_t *pte, entry;
2573	spinlock_t *ptl;
2574
2575	pte = get_locked_pte(mm, addr, ptl: &ptl);
2576	if (!pte)
2577	return VM_FAULT_OOM;
2578	entry = ptep_get(ptep: pte);
2579	if (!pte_none(pte: entry)) {
2580	if (mkwrite) {
2581	/*
2582	* For read faults on private mappings the PFN passed
2583	* in may not match the PFN we have mapped if the
2584	* mapped PFN is a writeable COW page. In the mkwrite
2585	* case we are creating a writable PTE for a shared
2586	* mapping and we expect the PFNs to match. If they
2587	* don't match, we are likely racing with block
2588	* allocation and mapping invalidation so just skip the
2589	* update.
2590	*/
2591	if (pte_pfn(pte: entry) != pfn) {
2592	WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
2593	goto out_unlock;
2594	}
2595	entry = pte_mkyoung(pte: entry);
2596	entry = maybe_mkwrite(pte: pte_mkdirty(pte: entry), vma);
2597	if (ptep_set_access_flags(vma, address: addr, ptep: pte, entry, dirty: `1`))
2598	update_mmu_cache(vma, addr, ptep: pte);
2599	}
2600	goto out_unlock;
2601	}
2602
2603	/ Ok, finally just insert the thing.. /
2604	entry = pte_mkspecial(pte: pfn_pte(page_nr: pfn, pgprot: prot));
2605
2606	if (mkwrite) {
2607	entry = pte_mkyoung(pte: entry);
2608	entry = maybe_mkwrite(pte: pte_mkdirty(pte: entry), vma);
2609	}
2610
2611	set_pte_at(mm, addr, pte, entry);
2612	update_mmu_cache(vma, addr, ptep: pte); / XXX: why not for insert_page? /
2613
2614	out_unlock:
2615	pte_unmap_unlock(pte, ptl);
2616	return VM_FAULT_NOPAGE;
2617	}
2618
2619	/**
2620	* vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
2621	* @vma: user vma to map to
2622	* @addr: target user address of this page
2623	* @pfn: source kernel pfn
2624	* @pgprot: pgprot flags for the inserted page
2625	*
2626	* This is exactly like vmf_insert_pfn(), except that it allows drivers
2627	* to override pgprot on a per-page basis.
2628	*
2629	* This only makes sense for IO mappings, and it makes no sense for
2630	* COW mappings. In general, using multiple vmas is preferable;
2631	* vmf_insert_pfn_prot should only be used if using multiple VMAs is
2632	* impractical.
2633	*
2634	* pgprot typically only differs from @vma->vm_page_prot when drivers set
2635	* caching- and encryption bits different than those of @vma->vm_page_prot,
2636	* because the caching- or encryption mode may not be known at mmap() time.
2637	*
2638	* This is ok as long as @vma->vm_page_prot is not used by the core vm
2639	* to set caching and encryption bits for those vmas (except for COW pages).
2640	* This is ensured by core vm only modifying these page table entries using
2641	* functions that don't touch caching- or encryption bits, using pte_modify()
2642	* if needed. (See for example mprotect()).
2643	*
2644	* Also when new page-table entries are created, this is only done using the
2645	* fault() callback, and never using the value of vma->vm_page_prot,
2646	* except for page-table entries that point to anonymous pages as the result
2647	* of COW.
2648	*
2649	* Context: Process context. May allocate using %GFP_KERNEL.
2650	* Return: vm_fault_t value.
2651	*/
2652	vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct vma, unsigned* long addr,
2653	unsigned long pfn, pgprot_t pgprot)
2654	{
2655	/*
2656	* Technically, architectures with pte_special can avoid all these
2657	* restrictions (same for remap_pfn_range). However we would like
2658	* consistency in testing and feature parity among all, so we should
2659	* try to keep these invariants in place for everybody.
2660	*/
2661	BUG_ON(!(vma->vm_flags & (VM_PFNMAP\|VM_MIXEDMAP)));
2662	BUG_ON((vma->vm_flags & (VM_PFNMAP\|VM_MIXEDMAP)) ==
2663	(VM_PFNMAP\|VM_MIXEDMAP));
2664	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2665	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2666
2667	if (addr < vma->vm_start \|\| addr >= vma->vm_end)
2668	return VM_FAULT_SIGBUS;
2669
2670	if (!pfn_modify_allowed(pfn, prot: pgprot))
2671	return VM_FAULT_SIGBUS;
2672
2673	pfnmap_setup_cachemode_pfn(pfn, prot: &pgprot);
2674
2675	return insert_pfn(vma, addr, pfn, prot: pgprot, mkwrite: false);
2676	}
2677	EXPORT_SYMBOL(vmf_insert_pfn_prot);
2678
2679	/**
2680	* vmf_insert_pfn - insert single pfn into user vma
2681	* @vma: user vma to map to
2682	* @addr: target user address of this page
2683	* @pfn: source kernel pfn
2684	*
2685	* Similar to vm_insert_page, this allows drivers to insert individual pages
2686	* they've allocated into a user vma. Same comments apply.
2687	*
2688	* This function should only be called from a vm_ops->fault handler, and
2689	* in that case the handler should return the result of this function.
2690	*
2691	* vma cannot be a COW mapping.
2692	*
2693	* As this is called only for pages that do not currently exist, we
2694	* do not need to flush old virtual caches or the TLB.
2695	*
2696	* Context: Process context. May allocate using %GFP_KERNEL.
2697	* Return: vm_fault_t value.
2698	*/
2699	vm_fault_t vmf_insert_pfn(struct vm_area_struct vma, unsigned* long addr,
2700	unsigned long pfn)
2701	{
2702	return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
2703	}
2704	EXPORT_SYMBOL(vmf_insert_pfn);
2705
2706	static bool vm_mixed_ok(struct vm_area_struct vma, unsigned* long pfn,
2707	bool mkwrite)
2708	{
2709	if (unlikely(is_zero_pfn(pfn)) &&
2710	(mkwrite \|\| !vm_mixed_zeropage_allowed(vma)))
2711	return false;
2712	/ these checks mirror the abort conditions in vm_normal_page /
2713	if (vma->vm_flags & VM_MIXEDMAP)
2714	return true;
2715	if (is_zero_pfn(pfn))
2716	return true;
2717	return false;
2718	}
2719
2720	static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
2721	unsigned long addr, unsigned long pfn, bool mkwrite)
2722	{
2723	pgprot_t pgprot = vma->vm_page_prot;
2724	int err;
2725
2726	if (!vm_mixed_ok(vma, pfn, mkwrite))
2727	return VM_FAULT_SIGBUS;
2728
2729	if (addr < vma->vm_start \|\| addr >= vma->vm_end)
2730	return VM_FAULT_SIGBUS;
2731
2732	pfnmap_setup_cachemode_pfn(pfn, prot: &pgprot);
2733
2734	if (!pfn_modify_allowed(pfn, prot: pgprot))
2735	return VM_FAULT_SIGBUS;
2736
2737	/*
2738	* If we don't have pte special, then we have to use the pfn_valid()
2739	* based VM_MIXEDMAP scheme (see vm_normal_page), and thus we must
2740	* refcount the page if pfn_valid is true (hence insert_page rather
2741	* than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
2742	* without pte special, it would there be refcounted as a normal page.
2743	*/
2744	if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pfn_valid(pfn)) {
2745	struct page *page;
2746
2747	/*
2748	* At this point we are committed to insert_page()
2749	* regardless of whether the caller specified flags that
2750	* result in pfn_t_has_page() == false.
2751	*/
2752	page = pfn_to_page(pfn);
2753	err = insert_page(vma, addr, page, prot: pgprot, mkwrite);
2754	} else {
2755	return insert_pfn(vma, addr, pfn, prot: pgprot, mkwrite);
2756	}
2757
2758	if (err == -ENOMEM)
2759	return VM_FAULT_OOM;
2760	if (err < `0` && err != -EBUSY)
2761	return VM_FAULT_SIGBUS;
2762
2763	return VM_FAULT_NOPAGE;
2764	}
2765
2766	vm_fault_t vmf_insert_page_mkwrite(struct vm_fault vmf, struct* page *page,
2767	bool write)
2768	{
2769	pgprot_t pgprot = vmf->vma->vm_page_prot;
2770	unsigned long addr = vmf->address;
2771	int err;
2772
2773	if (addr < vmf->vma->vm_start \|\| addr >= vmf->vma->vm_end)
2774	return VM_FAULT_SIGBUS;
2775
2776	err = insert_page(vma: vmf->vma, addr, page, prot: pgprot, mkwrite: write);
2777	if (err == -ENOMEM)
2778	return VM_FAULT_OOM;
2779	if (err < `0` && err != -EBUSY)
2780	return VM_FAULT_SIGBUS;
2781
2782	return VM_FAULT_NOPAGE;
2783	}
2784	EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite);
2785
2786	vm_fault_t vmf_insert_mixed(struct vm_area_struct vma, unsigned* long addr,
2787	unsigned long pfn)
2788	{
2789	return __vm_insert_mixed(vma, addr, pfn, mkwrite: false);
2790	}
2791	EXPORT_SYMBOL(vmf_insert_mixed);
2792
2793	/*
2794	* If the insertion of PTE failed because someone else already added a
2795	* different entry in the mean time, we treat that as success as we assume
2796	* the same entry was actually inserted.
2797	*/
2798	vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
2799	unsigned long addr, unsigned long pfn)
2800	{
2801	return __vm_insert_mixed(vma, addr, pfn, mkwrite: true);
2802	}
2803
2804	/*
2805	* maps a range of physical memory into the requested pages. the old
2806	* mappings are removed. any references to nonexistent pages results
2807	* in null mappings (currently treated as "copy-on-access")
2808	*/
2809	static int remap_pte_range(struct mm_struct mm, pmd_t pmd,
2810	unsigned long addr, unsigned long end,
2811	unsigned long pfn, pgprot_t prot)
2812	{
2813	pte_t pte, mapped_pte;
2814	spinlock_t *ptl;
2815	int err = `0`;
2816
2817	mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
2818	if (!pte)
2819	return -ENOMEM;
2820	arch_enter_lazy_mmu_mode();
2821	do {
2822	BUG_ON(!pte_none(ptep_get(pte)));
2823	if (!pfn_modify_allowed(pfn, prot)) {
2824	err = -EACCES;
2825	break;
2826	}
2827	set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
2828	pfn++;
2829	} while (pte++, addr += PAGE_SIZE, addr != end);
2830	arch_leave_lazy_mmu_mode();
2831	pte_unmap_unlock(mapped_pte, ptl);
2832	return err;
2833	}
2834
2835	static inline int remap_pmd_range(struct mm_struct mm, pud_t pud,
2836	unsigned long addr, unsigned long end,
2837	unsigned long pfn, pgprot_t prot)
2838	{
2839	pmd_t *pmd;
2840	unsigned long next;
2841	int err;
2842
2843	pfn -= addr >> PAGE_SHIFT;
2844	pmd = pmd_alloc(mm, pud, address: addr);
2845	if (!pmd)
2846	return -ENOMEM;
2847	VM_BUG_ON(pmd_trans_huge(*pmd));
2848	do {
2849	next = pmd_addr_end(addr, end);
2850	err = remap_pte_range(mm, pmd, addr, end: next,
2851	pfn: pfn + (addr >> PAGE_SHIFT), prot);
2852	if (err)
2853	return err;
2854	} while (pmd++, addr = next, addr != end);
2855	return `0`;
2856	}
2857
2858	static inline int remap_pud_range(struct mm_struct mm, p4d_t p4d,
2859	unsigned long addr, unsigned long end,
2860	unsigned long pfn, pgprot_t prot)
2861	{
2862	pud_t *pud;
2863	unsigned long next;
2864	int err;
2865
2866	pfn -= addr >> PAGE_SHIFT;
2867	pud = pud_alloc(mm, p4d, address: addr);
2868	if (!pud)
2869	return -ENOMEM;
2870	do {
2871	next = pud_addr_end(addr, end);
2872	err = remap_pmd_range(mm, pud, addr, end: next,
2873	pfn: pfn + (addr >> PAGE_SHIFT), prot);
2874	if (err)
2875	return err;
2876	} while (pud++, addr = next, addr != end);
2877	return `0`;
2878	}
2879
2880	static inline int remap_p4d_range(struct mm_struct mm, pgd_t pgd,
2881	unsigned long addr, unsigned long end,
2882	unsigned long pfn, pgprot_t prot)
2883	{
2884	p4d_t *p4d;
2885	unsigned long next;
2886	int err;
2887
2888	pfn -= addr >> PAGE_SHIFT;
2889	p4d = p4d_alloc(mm, pgd, address: addr);
2890	if (!p4d)
2891	return -ENOMEM;
2892	do {
2893	next = p4d_addr_end(addr, end);
2894	err = remap_pud_range(mm, p4d, addr, end: next,
2895	pfn: pfn + (addr >> PAGE_SHIFT), prot);
2896	if (err)
2897	return err;
2898	} while (p4d++, addr = next, addr != end);
2899	return `0`;
2900	}
2901
2902	static int remap_pfn_range_internal(struct vm_area_struct vma, unsigned* long addr,
2903	unsigned long pfn, unsigned long size, pgprot_t prot)
2904	{
2905	pgd_t *pgd;
2906	unsigned long next;
2907	unsigned long end = addr + PAGE_ALIGN(size);
2908	struct mm_struct *mm = vma->vm_mm;
2909	int err;
2910
2911	if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
2912	return -EINVAL;
2913
2914	/*
2915	* Physically remapped pages are special. Tell the
2916	* rest of the world about it:
2917	* VM_IO tells people not to look at these pages
2918	* (accesses can have side effects).
2919	* VM_PFNMAP tells the core MM that the base pages are just
2920	* raw PFN mappings, and do not have a "struct page" associated
2921	* with them.
2922	* VM_DONTEXPAND
2923	* Disable vma merging and expanding with mremap().
2924	* VM_DONTDUMP
2925	* Omit vma from core dump, even when VM_IO turned off.
2926	*
2927	* There's a horrible special case to handle copy-on-write
2928	* behaviour that some programs depend on. We mark the "original"
2929	* un-COW'ed pages by matching them up with "vma->vm_pgoff".
2930	* See vm_normal_page() for details.
2931	*/
2932	if (is_cow_mapping(flags: vma->vm_flags)) {
2933	if (addr != vma->vm_start \|\| end != vma->vm_end)
2934	return -EINVAL;
2935	vma->vm_pgoff = pfn;
2936	}
2937
2938	vm_flags_set(vma, VM_IO \| VM_PFNMAP \| VM_DONTEXPAND \| VM_DONTDUMP);
2939
2940	BUG_ON(addr >= end);
2941	pfn -= addr >> PAGE_SHIFT;
2942	pgd = pgd_offset(mm, addr);
2943	flush_cache_range(vma, start: addr, end);
2944	do {
2945	next = pgd_addr_end(addr, end);
2946	err = remap_p4d_range(mm, pgd, addr, end: next,
2947	pfn: pfn + (addr >> PAGE_SHIFT), prot);
2948	if (err)
2949	return err;
2950	} while (pgd++, addr = next, addr != end);
2951
2952	return `0`;
2953	}
2954
2955	/*
2956	* Variant of remap_pfn_range that does not call track_pfn_remap. The caller
2957	* must have pre-validated the caching bits of the pgprot_t.
2958	*/
2959	int remap_pfn_range_notrack(struct vm_area_struct vma, unsigned* long addr,
2960	unsigned long pfn, unsigned long size, pgprot_t prot)
2961	{
2962	int error = remap_pfn_range_internal(vma, addr, pfn, size, prot);
2963
2964	if (!error)
2965	return `0`;
2966
2967	/*
2968	* A partial pfn range mapping is dangerous: it does not
2969	* maintain page reference counts, and callers may free
2970	* pages due to the error. So zap it early.
2971	*/
2972	zap_page_range_single(vma, address: addr, size, NULL);
2973	return error;
2974	}
2975
2976	#ifdef __HAVE_PFNMAP_TRACKING
2977	static inline struct pfnmap_track_ctx pfnmap_track_ctx_alloc(unsigned* long pfn,
2978	unsigned long size, pgprot_t *prot)
2979	{
2980	struct pfnmap_track_ctx *ctx;
2981
2982	if (pfnmap_track(pfn, size, prot))
2983	return ERR_PTR(error: -EINVAL);
2984
2985	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
2986	if (unlikely(!ctx)) {
2987	pfnmap_untrack(pfn, size);
2988	return ERR_PTR(error: -ENOMEM);
2989	}
2990
2991	ctx->pfn = pfn;
2992	ctx->size = size;
2993	kref_init(kref: &ctx->kref);
2994	return ctx;
2995	}
2996
2997	void pfnmap_track_ctx_release(struct kref *ref)
2998	{
2999	struct pfnmap_track_ctx ctx = container_of(ref, struct* pfnmap_track_ctx, kref);
3000
3001	pfnmap_untrack(pfn: ctx->pfn, size: ctx->size);
3002	kfree(objp: ctx);
3003	}
3004	#endif /* __HAVE_PFNMAP_TRACKING */
3005
3006	/**
3007	* remap_pfn_range - remap kernel memory to userspace
3008	* @vma: user vma to map to
3009	* @addr: target page aligned user address to start at
3010	* @pfn: page frame number of kernel physical memory address
3011	* @size: size of mapping area
3012	* @prot: page protection flags for this mapping
3013	*
3014	* Note: this is only safe if the mm semaphore is held when called.
3015	*
3016	* Return: %0 on success, negative error code otherwise.
3017	*/
3018	#ifdef __HAVE_PFNMAP_TRACKING
3019	int remap_pfn_range(struct vm_area_struct vma, unsigned* long addr,
3020	unsigned long pfn, unsigned long size, pgprot_t prot)
3021	{
3022	struct pfnmap_track_ctx *ctx = NULL;
3023	int err;
3024
3025	size = PAGE_ALIGN(size);
3026
3027	/*
3028	* If we cover the full VMA, we'll perform actual tracking, and
3029	* remember to untrack when the last reference to our tracking
3030	* context from a VMA goes away. We'll keep tracking the whole pfn
3031	* range even during VMA splits and partial unmapping.
3032	*
3033	* If we only cover parts of the VMA, we'll only setup the cachemode
3034	* in the pgprot for the pfn range.
3035	*/
3036	if (addr == vma->vm_start && addr + size == vma->vm_end) {
3037	if (vma->pfnmap_track_ctx)
3038	return -EINVAL;
3039	ctx = pfnmap_track_ctx_alloc(pfn, size, prot: &prot);
3040	if (IS_ERR(ptr: ctx))
3041	return PTR_ERR(ptr: ctx);
3042	} else if (pfnmap_setup_cachemode(pfn, size, prot: &prot)) {
3043	return -EINVAL;
3044	}
3045
3046	err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
3047	if (ctx) {
3048	if (err)
3049	kref_put(kref: &ctx->kref, release: pfnmap_track_ctx_release);
3050	else
3051	vma->pfnmap_track_ctx = ctx;
3052	}
3053	return err;
3054	}
3055
3056	#else
3057	int remap_pfn_range(struct vm_area_struct vma, unsigned* long addr,
3058	unsigned long pfn, unsigned long size, pgprot_t prot)
3059	{
3060	return remap_pfn_range_notrack(vma, addr, pfn, size, prot);
3061	}
3062	#endif
3063	EXPORT_SYMBOL(remap_pfn_range);
3064
3065	/**
3066	* vm_iomap_memory - remap memory to userspace
3067	* @vma: user vma to map to
3068	* @start: start of the physical memory to be mapped
3069	* @len: size of area
3070	*
3071	* This is a simplified io_remap_pfn_range() for common driver use. The
3072	* driver just needs to give us the physical memory range to be mapped,
3073	* we'll figure out the rest from the vma information.
3074	*
3075	* NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
3076	* whatever write-combining details or similar.
3077	*
3078	* Return: %0 on success, negative error code otherwise.
3079	*/
3080	int vm_iomap_memory(struct vm_area_struct vma, phys_addr_t start, unsigned* long len)
3081	{
3082	unsigned long vm_len, pfn, pages;
3083
3084	/ Check that the physical memory area passed in looks valid /
3085	if (start + len < start)
3086	return -EINVAL;
3087	/*
3088	* You really shouldn't map things that aren't page-aligned,
3089	* but we've historically allowed it because IO memory might
3090	* just have smaller alignment.
3091	*/
3092	len += start & ~PAGE_MASK;
3093	pfn = start >> PAGE_SHIFT;
3094	pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
3095	if (pfn + pages < pfn)
3096	return -EINVAL;
3097
3098	/ We start the mapping 'vm_pgoff' pages into the area /
3099	if (vma->vm_pgoff > pages)
3100	return -EINVAL;
3101	pfn += vma->vm_pgoff;
3102	pages -= vma->vm_pgoff;
3103
3104	/ Can we fit all of the mapping? /
3105	vm_len = vma->vm_end - vma->vm_start;
3106	if (vm_len >> PAGE_SHIFT > pages)
3107	return -EINVAL;
3108
3109	/ Ok, let it rip /
3110	return io_remap_pfn_range(vma, addr: vma->vm_start, pfn, size: vm_len, prot: vma->vm_page_prot);
3111	}
3112	EXPORT_SYMBOL(vm_iomap_memory);
3113
3114	static int apply_to_pte_range(struct mm_struct mm, pmd_t pmd,
3115	unsigned long addr, unsigned long end,
3116	pte_fn_t fn, void *data, bool create,
3117	pgtbl_mod_mask *mask)
3118	{
3119	pte_t pte, mapped_pte;
3120	int err = `0`;
3121	spinlock_t *ptl;
3122
3123	if (create) {
3124	mapped_pte = pte = (mm == &init_mm) ?
3125	pte_alloc_kernel_track(pmd, addr, mask) :
3126	pte_alloc_map_lock(mm, pmd, addr, &ptl);
3127	if (!pte)
3128	return -ENOMEM;
3129	} else {
3130	mapped_pte = pte = (mm == &init_mm) ?
3131	pte_offset_kernel(pmd, address: addr) :
3132	pte_offset_map_lock(mm, pmd, addr, ptlp: &ptl);
3133	if (!pte)
3134	return -EINVAL;
3135	}
3136
3137	arch_enter_lazy_mmu_mode();
3138
3139	if (fn) {
3140	do {
3141	if (create \|\| !pte_none(pte: ptep_get(ptep: pte))) {
3142	err = fn(pte, addr, data);
3143	if (err)
3144	break;
3145	}
3146	} while (pte++, addr += PAGE_SIZE, addr != end);
3147	}
3148	*mask \|= PGTBL_PTE_MODIFIED;
3149
3150	arch_leave_lazy_mmu_mode();
3151
3152	if (mm != &init_mm)
3153	pte_unmap_unlock(mapped_pte, ptl);
3154	return err;
3155	}
3156
3157	static int apply_to_pmd_range(struct mm_struct mm, pud_t pud,
3158	unsigned long addr, unsigned long end,
3159	pte_fn_t fn, void *data, bool create,
3160	pgtbl_mod_mask *mask)
3161	{
3162	pmd_t *pmd;
3163	unsigned long next;
3164	int err = `0`;
3165
3166	BUG_ON(pud_leaf(*pud));
3167
3168	if (create) {
3169	pmd = pmd_alloc_track(mm, pud, address: addr, mod_mask: mask);
3170	if (!pmd)
3171	return -ENOMEM;
3172	} else {
3173	pmd = pmd_offset(pud, address: addr);
3174	}
3175	do {
3176	next = pmd_addr_end(addr, end);
3177	if (pmd_none(pmd: *pmd) && !create)
3178	continue;
3179	if (WARN_ON_ONCE(pmd_leaf(*pmd)))
3180	return -EINVAL;
3181	if (!pmd_none(pmd: pmd) && WARN_ON_ONCE(pmd_bad(pmd))) {
3182	if (!create)
3183	continue;
3184	pmd_clear_bad(pmd);
3185	}
3186	err = apply_to_pte_range(mm, pmd, addr, end: next,
3187	fn, data, create, mask);
3188	if (err)
3189	break;
3190	} while (pmd++, addr = next, addr != end);
3191
3192	return err;
3193	}
3194
3195	static int apply_to_pud_range(struct mm_struct mm, p4d_t p4d,
3196	unsigned long addr, unsigned long end,
3197	pte_fn_t fn, void *data, bool create,
3198	pgtbl_mod_mask *mask)
3199	{
3200	pud_t *pud;
3201	unsigned long next;
3202	int err = `0`;
3203
3204	if (create) {
3205	pud = pud_alloc_track(mm, p4d, address: addr, mod_mask: mask);
3206	if (!pud)
3207	return -ENOMEM;
3208	} else {
3209	pud = pud_offset(p4d, address: addr);
3210	}
3211	do {
3212	next = pud_addr_end(addr, end);
3213	if (pud_none(pud: *pud) && !create)
3214	continue;
3215	if (WARN_ON_ONCE(pud_leaf(*pud)))
3216	return -EINVAL;
3217	if (!pud_none(pud: pud) && WARN_ON_ONCE(pud_bad(pud))) {
3218	if (!create)
3219	continue;
3220	pud_clear_bad(pud);
3221	}
3222	err = apply_to_pmd_range(mm, pud, addr, end: next,
3223	fn, data, create, mask);
3224	if (err)
3225	break;
3226	} while (pud++, addr = next, addr != end);
3227
3228	return err;
3229	}
3230
3231	static int apply_to_p4d_range(struct mm_struct mm, pgd_t pgd,
3232	unsigned long addr, unsigned long end,
3233	pte_fn_t fn, void *data, bool create,
3234	pgtbl_mod_mask *mask)
3235	{
3236	p4d_t *p4d;
3237	unsigned long next;
3238	int err = `0`;
3239
3240	if (create) {
3241	p4d = p4d_alloc_track(mm, pgd, address: addr, mod_mask: mask);
3242	if (!p4d)
3243	return -ENOMEM;
3244	} else {
3245	p4d = p4d_offset(pgd, address: addr);
3246	}
3247	do {
3248	next = p4d_addr_end(addr, end);
3249	if (p4d_none(p4d: *p4d) && !create)
3250	continue;
3251	if (WARN_ON_ONCE(p4d_leaf(*p4d)))
3252	return -EINVAL;
3253	if (!p4d_none(p4d: p4d) && WARN_ON_ONCE(p4d_bad(p4d))) {
3254	if (!create)
3255	continue;
3256	p4d_clear_bad(p4d);
3257	}
3258	err = apply_to_pud_range(mm, p4d, addr, end: next,
3259	fn, data, create, mask);
3260	if (err)
3261	break;
3262	} while (p4d++, addr = next, addr != end);
3263
3264	return err;
3265	}
3266
3267	static int __apply_to_page_range(struct mm_struct mm, unsigned* long addr,
3268	unsigned long size, pte_fn_t fn,
3269	void *data, bool create)
3270	{
3271	pgd_t *pgd;
3272	unsigned long start = addr, next;
3273	unsigned long end = addr + size;
3274	pgtbl_mod_mask mask = `0`;
3275	int err = `0`;
3276
3277	if (WARN_ON(addr >= end))
3278	return -EINVAL;
3279
3280	pgd = pgd_offset(mm, addr);
3281	do {
3282	next = pgd_addr_end(addr, end);
3283	if (pgd_none(pgd: *pgd) && !create)
3284	continue;
3285	if (WARN_ON_ONCE(pgd_leaf(*pgd))) {
3286	err = -EINVAL;
3287	break;
3288	}
3289	if (!pgd_none(pgd: pgd) && WARN_ON_ONCE(pgd_bad(pgd))) {
3290	if (!create)
3291	continue;
3292	pgd_clear_bad(pgd);
3293	}
3294	err = apply_to_p4d_range(mm, pgd, addr, end: next,
3295	fn, data, create, mask: &mask);
3296	if (err)
3297	break;
3298	} while (pgd++, addr = next, addr != end);
3299
3300	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
3301	arch_sync_kernel_mappings(start, end: start + size);
3302
3303	return err;
3304	}
3305
3306	/*
3307	* Scan a region of virtual memory, filling in page tables as necessary
3308	* and calling a provided function on each leaf page table.
3309	*/
3310	int apply_to_page_range(struct mm_struct mm, unsigned* long addr,
3311	unsigned long size, pte_fn_t fn, void *data)
3312	{
3313	return __apply_to_page_range(mm, addr, size, fn, data, create: true);
3314	}
3315	EXPORT_SYMBOL_GPL(apply_to_page_range);
3316
3317	/*
3318	* Scan a region of virtual memory, calling a provided function on
3319	* each leaf page table where it exists.
3320	*
3321	* Unlike apply_to_page_range, this does _not_ fill in page tables
3322	* where they are absent.
3323	*/
3324	int apply_to_existing_page_range(struct mm_struct mm, unsigned* long addr,
3325	unsigned long size, pte_fn_t fn, void *data)
3326	{
3327	return __apply_to_page_range(mm, addr, size, fn, data, create: false);
3328	}
3329
3330	/*
3331	* handle_pte_fault chooses page fault handler according to an entry which was
3332	* read non-atomically. Before making any commitment, on those architectures
3333	* or configurations (e.g. i386 with PAE) which might give a mix of unmatched
3334	* parts, do_swap_page must check under lock before unmapping the pte and
3335	* proceeding (but do_wp_page is only called after already making such a check;
3336	* and do_anonymous_page can safely check later on).
3337	*/
3338	static inline int pte_unmap_same(struct vm_fault *vmf)
3339	{
3340	int same = `1`;
3341	#if defined(CONFIG_SMP) \|\| defined(CONFIG_PREEMPTION)
3342	if (sizeof(pte_t) > sizeof(unsigned long)) {
3343	spin_lock(lock: vmf->ptl);
3344	same = pte_same(a: ptep_get(ptep: vmf->pte), b: vmf->orig_pte);
3345	spin_unlock(lock: vmf->ptl);
3346	}
3347	#endif
3348	pte_unmap(pte: vmf->pte);
3349	vmf->pte = NULL;
3350	return same;
3351	}
3352
3353	/*
3354	* Return:
3355	* 0: copied succeeded
3356	* -EHWPOISON: copy failed due to hwpoison in source page
3357	* -EAGAIN: copied failed (some other reason)
3358	*/
3359	static inline int __wp_page_copy_user(struct page dst, struct* page *src,
3360	struct vm_fault *vmf)
3361	{
3362	int ret;
3363	void *kaddr;
3364	void __user *uaddr;
3365	struct vm_area_struct *vma = vmf->vma;
3366	struct mm_struct *mm = vma->vm_mm;
3367	unsigned long addr = vmf->address;
3368
3369	if (likely(src)) {
3370	if (copy_mc_user_highpage(to: dst, from: src, vaddr: addr, vma))
3371	return -EHWPOISON;
3372	return `0`;
3373	}
3374
3375	/*
3376	* If the source page was a PFN mapping, we don't have
3377	* a "struct page" for it. We do a best-effort copy by
3378	* just copying from the original user address. If that
3379	* fails, we just zero-fill it. Live with it.
3380	*/
3381	kaddr = kmap_local_page(page: dst);
3382	pagefault_disable();
3383	uaddr = (void __user *)(addr & PAGE_MASK);
3384
3385	/*
3386	* On architectures with software "accessed" bits, we would
3387	* take a double page fault, so mark it accessed here.
3388	*/
3389	vmf->pte = NULL;
3390	if (!arch_has_hw_pte_young() && !pte_young(pte: vmf->orig_pte)) {
3391	pte_t entry;
3392
3393	vmf->pte = pte_offset_map_lock(mm, pmd: vmf->pmd, addr, ptlp: &vmf->ptl);
3394	if (unlikely(!vmf->pte \|\| !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
3395	/*
3396	* Other thread has already handled the fault
3397	* and update local tlb only
3398	*/
3399	if (vmf->pte)
3400	update_mmu_tlb(vma, address: addr, ptep: vmf->pte);
3401	ret = -EAGAIN;
3402	goto pte_unlock;
3403	}
3404
3405	entry = pte_mkyoung(pte: vmf->orig_pte);
3406	if (ptep_set_access_flags(vma, address: addr, ptep: vmf->pte, entry, dirty: `0`))
3407	update_mmu_cache_range(vmf, vma, addr, ptep: vmf->pte, nr: `1`);
3408	}
3409
3410	/*
3411	* This really shouldn't fail, because the page is there
3412	* in the page tables. But it might just be unreadable,
3413	* in which case we just give up and fill the result with
3414	* zeroes.
3415	*/
3416	if (__copy_from_user_inatomic(to: kaddr, from: uaddr, PAGE_SIZE)) {
3417	if (vmf->pte)
3418	goto warn;
3419
3420	/ Re-validate under PTL if the page is still mapped /
3421	vmf->pte = pte_offset_map_lock(mm, pmd: vmf->pmd, addr, ptlp: &vmf->ptl);
3422	if (unlikely(!vmf->pte \|\| !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
3423	/ The PTE changed under us, update local tlb /
3424	if (vmf->pte)
3425	update_mmu_tlb(vma, address: addr, ptep: vmf->pte);
3426	ret = -EAGAIN;
3427	goto pte_unlock;
3428	}
3429
3430	/*
3431	* The same page can be mapped back since last copy attempt.
3432	* Try to copy again under PTL.
3433	*/
3434	if (__copy_from_user_inatomic(to: kaddr, from: uaddr, PAGE_SIZE)) {
3435	/*
3436	* Give a warn in case there can be some obscure
3437	* use-case
3438	*/
3439	warn:
3440	WARN_ON_ONCE(`1`);
3441	clear_page(page: kaddr);
3442	}
3443	}
3444
3445	ret = `0`;
3446
3447	pte_unlock:
3448	if (vmf->pte)
3449	pte_unmap_unlock(vmf->pte, vmf->ptl);
3450	pagefault_enable();
3451	kunmap_local(kaddr);
3452	flush_dcache_page(page: dst);
3453
3454	return ret;
3455	}
3456
3457	static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
3458	{
3459	struct file *vm_file = vma->vm_file;
3460
3461	if (vm_file)
3462	return mapping_gfp_mask(mapping: vm_file->f_mapping) \| __GFP_FS \| __GFP_IO;
3463
3464	/*
3465	* Special mappings (e.g. VDSO) do not have any file so fake
3466	* a default GFP_KERNEL for them.
3467	*/
3468	return GFP_KERNEL;
3469	}
3470
3471	/*
3472	* Notify the address space that the page is about to become writable so that
3473	* it can prohibit this or wait for the page to get into an appropriate state.
3474	*
3475	* We do this without the lock held, so that it can sleep if it needs to.
3476	*/
3477	static vm_fault_t do_page_mkwrite(struct vm_fault vmf, struct* folio *folio)
3478	{
3479	vm_fault_t ret;
3480	unsigned int old_flags = vmf->flags;
3481
3482	vmf->flags = FAULT_FLAG_WRITE\|FAULT_FLAG_MKWRITE;
3483
3484	if (vmf->vma->vm_file &&
3485	IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
3486	return VM_FAULT_SIGBUS;
3487
3488	ret = vmf->vma->vm_ops->page_mkwrite(vmf);
3489	/ Restore original flags so that caller is not surprised /
3490	vmf->flags = old_flags;
3491	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE)))
3492	return ret;
3493	if (unlikely(!(ret & VM_FAULT_LOCKED))) {
3494	folio_lock(folio);
3495	if (!folio->mapping) {
3496	folio_unlock(folio);
3497	return `0`; / retry /
3498	}
3499	ret \|= VM_FAULT_LOCKED;
3500	} else
3501	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
3502	return ret;
3503	}
3504
3505	/*
3506	* Handle dirtying of a page in shared file mapping on a write fault.
3507	*
3508	* The function expects the page to be locked and unlocks it.
3509	*/
3510	static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
3511	{
3512	struct vm_area_struct *vma = vmf->vma;
3513	struct address_space *mapping;
3514	struct folio *folio = page_folio(vmf->page);
3515	bool dirtied;
3516	bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
3517
3518	dirtied = folio_mark_dirty(folio);
3519	VM_BUG_ON_FOLIO(folio_test_anon(folio), folio);
3520	/*
3521	* Take a local copy of the address_space - folio.mapping may be zeroed
3522	* by truncate after folio_unlock(). The address_space itself remains
3523	* pinned by vma->vm_file's reference. We rely on folio_unlock()'s
3524	* release semantics to prevent the compiler from undoing this copying.
3525	*/
3526	mapping = folio_raw_mapping(folio);
3527	folio_unlock(folio);
3528
3529	if (!page_mkwrite)
3530	file_update_time(file: vma->vm_file);
3531
3532	/*
3533	* Throttle page dirtying rate down to writeback speed.
3534	*
3535	* mapping may be NULL here because some device drivers do not
3536	* set page.mapping but still dirty their pages
3537	*
3538	* Drop the mmap_lock before waiting on IO, if we can. The file
3539	* is pinning the mapping, as per above.
3540	*/
3541	if ((dirtied \|\| page_mkwrite) && mapping) {
3542	struct file *fpin;
3543
3544	fpin = maybe_unlock_mmap_for_io(vmf, NULL);
3545	balance_dirty_pages_ratelimited(mapping);
3546	if (fpin) {
3547	fput(fpin);
3548	return VM_FAULT_COMPLETED;
3549	}
3550	}
3551
3552	return `0`;
3553	}
3554
3555	/*
3556	* Handle write page faults for pages that can be reused in the current vma
3557	*
3558	* This can happen either due to the mapping being with the VM_SHARED flag,
3559	* or due to us being the last reference standing to the page. In either
3560	* case, all we need to do here is to mark the page as writable and update
3561	* any related book-keeping.
3562	*/
3563	static inline void wp_page_reuse(struct vm_fault vmf, struct* folio *folio)
3564	__releases(vmf->ptl)
3565	{
3566	struct vm_area_struct *vma = vmf->vma;
3567	pte_t entry;
3568
3569	VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
3570	VM_WARN_ON(is_zero_pfn(pte_pfn(vmf->orig_pte)));
3571
3572	if (folio) {
3573	VM_BUG_ON(folio_test_anon(folio) &&
3574	!PageAnonExclusive(vmf->page));
3575	/*
3576	* Clear the folio's cpupid information as the existing
3577	* information potentially belongs to a now completely
3578	* unrelated process.
3579	*/
3580	folio_xchg_last_cpupid(folio, cpupid: (`1` << LAST_CPUPID_SHIFT) - `1`);
3581	}
3582
3583	flush_cache_page(vma, vmaddr: vmf->address, pfn: pte_pfn(pte: vmf->orig_pte));
3584	entry = pte_mkyoung(pte: vmf->orig_pte);
3585	entry = maybe_mkwrite(pte: pte_mkdirty(pte: entry), vma);
3586	if (ptep_set_access_flags(vma, address: vmf->address, ptep: vmf->pte, entry, dirty: `1`))
3587	update_mmu_cache_range(vmf, vma, addr: vmf->address, ptep: vmf->pte, nr: `1`);
3588	pte_unmap_unlock(vmf->pte, vmf->ptl);
3589	count_vm_event(item: PGREUSE);
3590	}
3591
3592	/*
3593	* We could add a bitflag somewhere, but for now, we know that all
3594	* vm_ops that have a ->map_pages have been audited and don't need
3595	* the mmap_lock to be held.
3596	*/
3597	static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
3598	{
3599	struct vm_area_struct *vma = vmf->vma;
3600
3601	if (vma->vm_ops->map_pages \|\| !(vmf->flags & FAULT_FLAG_VMA_LOCK))
3602	return `0`;
3603	vma_end_read(vma);
3604	return VM_FAULT_RETRY;
3605	}
3606
3607	/**
3608	* __vmf_anon_prepare - Prepare to handle an anonymous fault.
3609	* @vmf: The vm_fault descriptor passed from the fault handler.
3610	*
3611	* When preparing to insert an anonymous page into a VMA from a
3612	* fault handler, call this function rather than anon_vma_prepare().
3613	* If this vma does not already have an associated anon_vma and we are
3614	* only protected by the per-VMA lock, the caller must retry with the
3615	* mmap_lock held. __anon_vma_prepare() will look at adjacent VMAs to
3616	* determine if this VMA can share its anon_vma, and that's not safe to
3617	* do with only the per-VMA lock held for this VMA.
3618	*
3619	* Return: 0 if fault handling can proceed. Any other value should be
3620	* returned to the caller.
3621	*/
3622	vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf)
3623	{
3624	struct vm_area_struct *vma = vmf->vma;
3625	vm_fault_t ret = `0`;
3626
3627	if (likely(vma->anon_vma))
3628	return `0`;
3629	if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
3630	if (!mmap_read_trylock(mm: vma->vm_mm))
3631	return VM_FAULT_RETRY;
3632	}
3633	if (__anon_vma_prepare(vma))
3634	ret = VM_FAULT_OOM;
3635	if (vmf->flags & FAULT_FLAG_VMA_LOCK)
3636	mmap_read_unlock(mm: vma->vm_mm);
3637	return ret;
3638	}
3639
3640	/*
3641	* Handle the case of a page which we actually need to copy to a new page,
3642	* either due to COW or unsharing.
3643	*
3644	* Called with mmap_lock locked and the old page referenced, but
3645	* without the ptl held.
3646	*
3647	* High level logic flow:
3648	*
3649	* - Allocate a page, copy the content of the old page to the new one.
3650	* - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
3651	* - Take the PTL. If the pte changed, bail out and release the allocated page
3652	* - If the pte is still the way we remember it, update the page table and all
3653	* relevant references. This includes dropping the reference the page-table
3654	* held to the old page, as well as updating the rmap.
3655	* - In any case, unlock the PTL and drop the reference we took to the old page.
3656	*/
3657	static vm_fault_t wp_page_copy(struct vm_fault *vmf)
3658	{
3659	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
3660	struct vm_area_struct *vma = vmf->vma;
3661	struct mm_struct *mm = vma->vm_mm;
3662	struct folio *old_folio = NULL;
3663	struct folio *new_folio = NULL;
3664	pte_t entry;
3665	int page_copied = `0`;
3666	struct mmu_notifier_range range;
3667	vm_fault_t ret;
3668	bool pfn_is_zero;
3669
3670	delayacct_wpcopy_start();
3671
3672	if (vmf->page)
3673	old_folio = page_folio(vmf->page);
3674	ret = vmf_anon_prepare(vmf);
3675	if (unlikely(ret))
3676	goto out;
3677
3678	pfn_is_zero = is_zero_pfn(pfn: pte_pfn(pte: vmf->orig_pte));
3679	new_folio = folio_prealloc(src_mm: mm, vma, addr: vmf->address, need_zero: pfn_is_zero);
3680	if (!new_folio)
3681	goto oom;
3682
3683	if (!pfn_is_zero) {
3684	int err;
3685
3686	err = __wp_page_copy_user(dst: &new_folio->page, src: vmf->page, vmf);
3687	if (err) {
3688	/*
3689	* COW failed, if the fault was solved by other,
3690	* it's fine. If not, userspace would re-fault on
3691	* the same address and we will handle the fault
3692	* from the second attempt.
3693	* The -EHWPOISON case will not be retried.
3694	*/
3695	folio_put(folio: new_folio);
3696	if (old_folio)
3697	folio_put(folio: old_folio);
3698
3699	delayacct_wpcopy_end();
3700	return err == -EHWPOISON ? VM_FAULT_HWPOISON : `0`;
3701	}
3702	kmsan_copy_page_meta(dst: &new_folio->page, src: vmf->page);
3703	}
3704
3705	__folio_mark_uptodate(folio: new_folio);
3706
3707	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm,
3708	start: vmf->address & PAGE_MASK,
3709	end: (vmf->address & PAGE_MASK) + PAGE_SIZE);
3710	mmu_notifier_invalidate_range_start(range: &range);
3711
3712	/*
3713	* Re-check the pte - we dropped the lock
3714	*/
3715	vmf->pte = pte_offset_map_lock(mm, pmd: vmf->pmd, addr: vmf->address, ptlp: &vmf->ptl);
3716	if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
3717	if (old_folio) {
3718	if (!folio_test_anon(folio: old_folio)) {
3719	dec_mm_counter(mm, member: mm_counter_file(folio: old_folio));
3720	inc_mm_counter(mm, member: MM_ANONPAGES);
3721	}
3722	} else {
3723	ksm_might_unmap_zero_page(mm, pte: vmf->orig_pte);
3724	inc_mm_counter(mm, member: MM_ANONPAGES);
3725	}
3726	flush_cache_page(vma, vmaddr: vmf->address, pfn: pte_pfn(pte: vmf->orig_pte));
3727	entry = folio_mk_pte(folio: new_folio, pgprot: vma->vm_page_prot);
3728	entry = pte_sw_mkyoung(pte: entry);
3729	if (unlikely(unshare)) {
3730	if (pte_soft_dirty(pte: vmf->orig_pte))
3731	entry = pte_mksoft_dirty(pte: entry);
3732	if (pte_uffd_wp(pte: vmf->orig_pte))
3733	entry = pte_mkuffd_wp(pte: entry);
3734	} else {
3735	entry = maybe_mkwrite(pte: pte_mkdirty(pte: entry), vma);
3736	}
3737
3738	/*
3739	* Clear the pte entry and flush it first, before updating the
3740	* pte with the new entry, to keep TLBs on different CPUs in
3741	* sync. This code used to set the new PTE then flush TLBs, but
3742	* that left a window where the new PTE could be loaded into
3743	* some TLBs while the old PTE remains in others.
3744	*/
3745	ptep_clear_flush(vma, address: vmf->address, ptep: vmf->pte);
3746	folio_add_new_anon_rmap(new_folio, vma, address: vmf->address, RMAP_EXCLUSIVE);
3747	folio_add_lru_vma(new_folio, vma);
3748	BUG_ON(unshare && pte_write(entry));
3749	set_pte_at(mm, vmf->address, vmf->pte, entry);
3750	update_mmu_cache_range(vmf, vma, addr: vmf->address, ptep: vmf->pte, nr: `1`);
3751	if (old_folio) {
3752	/*
3753	* Only after switching the pte to the new page may
3754	* we remove the mapcount here. Otherwise another
3755	* process may come and find the rmap count decremented
3756	* before the pte is switched to the new page, and
3757	* "reuse" the old page writing into it while our pte
3758	* here still points into it and can be read by other
3759	* threads.
3760	*
3761	* The critical issue is to order this
3762	* folio_remove_rmap_pte() with the ptp_clear_flush
3763	* above. Those stores are ordered by (if nothing else,)
3764	* the barrier present in the atomic_add_negative
3765	* in folio_remove_rmap_pte();
3766	*
3767	* Then the TLB flush in ptep_clear_flush ensures that
3768	* no process can access the old page before the
3769	* decremented mapcount is visible. And the old page
3770	* cannot be reused until after the decremented
3771	* mapcount is visible. So transitively, TLBs to
3772	* old page will be flushed before it can be reused.
3773	*/
3774	folio_remove_rmap_pte(old_folio, vmf->page, vma);
3775	}
3776
3777	/ Free the old page.. /
3778	new_folio = old_folio;
3779	page_copied = `1`;
3780	pte_unmap_unlock(vmf->pte, vmf->ptl);
3781	} else if (vmf->pte) {
3782	update_mmu_tlb(vma, address: vmf->address, ptep: vmf->pte);
3783	pte_unmap_unlock(vmf->pte, vmf->ptl);
3784	}
3785
3786	mmu_notifier_invalidate_range_end(range: &range);
3787
3788	if (new_folio)
3789	folio_put(folio: new_folio);
3790	if (old_folio) {
3791	if (page_copied)
3792	free_swap_cache(folio: old_folio);
3793	folio_put(folio: old_folio);
3794	}
3795
3796	delayacct_wpcopy_end();
3797	return `0`;
3798	oom:
3799	ret = VM_FAULT_OOM;
3800	out:
3801	if (old_folio)
3802	folio_put(folio: old_folio);
3803
3804	delayacct_wpcopy_end();
3805	return ret;
3806	}
3807
3808	/**
3809	* finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
3810	* writeable once the page is prepared
3811	*
3812	* @vmf: structure describing the fault
3813	* @folio: the folio of vmf->page
3814	*
3815	* This function handles all that is needed to finish a write page fault in a
3816	* shared mapping due to PTE being read-only once the mapped page is prepared.
3817	* It handles locking of PTE and modifying it.
3818	*
3819	* The function expects the page to be locked or other protection against
3820	* concurrent faults / writeback (such as DAX radix tree locks).
3821	*
3822	* Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
3823	* we acquired PTE lock.
3824	*/
3825	static vm_fault_t finish_mkwrite_fault(struct vm_fault vmf, struct* folio *folio)
3826	{
3827	WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
3828	vmf->pte = pte_offset_map_lock(mm: vmf->vma->vm_mm, pmd: vmf->pmd, addr: vmf->address,
3829	ptlp: &vmf->ptl);
3830	if (!vmf->pte)
3831	return VM_FAULT_NOPAGE;
3832	/*
3833	* We might have raced with another page fault while we released the
3834	* pte_offset_map_lock.
3835	*/
3836	if (!pte_same(a: ptep_get(ptep: vmf->pte), b: vmf->orig_pte)) {
3837	update_mmu_tlb(vma: vmf->vma, address: vmf->address, ptep: vmf->pte);
3838	pte_unmap_unlock(vmf->pte, vmf->ptl);
3839	return VM_FAULT_NOPAGE;
3840	}
3841	wp_page_reuse(vmf, folio);
3842	return `0`;
3843	}
3844
3845	/*
3846	* Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
3847	* mapping
3848	*/
3849	static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
3850	{
3851	struct vm_area_struct *vma = vmf->vma;
3852
3853	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
3854	vm_fault_t ret;
3855
3856	pte_unmap_unlock(vmf->pte, vmf->ptl);
3857	ret = vmf_can_call_fault(vmf);
3858	if (ret)
3859	return ret;
3860
3861	vmf->flags \|= FAULT_FLAG_MKWRITE;
3862	ret = vma->vm_ops->pfn_mkwrite(vmf);
3863	if (ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE))
3864	return ret;
3865	return finish_mkwrite_fault(vmf, NULL);
3866	}
3867	wp_page_reuse(vmf, NULL);
3868	return `0`;
3869	}
3870
3871	static vm_fault_t wp_page_shared(struct vm_fault vmf, struct* folio *folio)
3872	__releases(vmf->ptl)
3873	{
3874	struct vm_area_struct *vma = vmf->vma;
3875	vm_fault_t ret = `0`;
3876
3877	folio_get(folio);
3878
3879	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
3880	vm_fault_t tmp;
3881
3882	pte_unmap_unlock(vmf->pte, vmf->ptl);
3883	tmp = vmf_can_call_fault(vmf);
3884	if (tmp) {
3885	folio_put(folio);
3886	return tmp;
3887	}
3888
3889	tmp = do_page_mkwrite(vmf, folio);
3890	if (unlikely(!tmp \|\| (tmp &
3891	(VM_FAULT_ERROR \| VM_FAULT_NOPAGE)))) {
3892	folio_put(folio);
3893	return tmp;
3894	}
3895	tmp = finish_mkwrite_fault(vmf, folio);
3896	if (unlikely(tmp & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE))) {
3897	folio_unlock(folio);
3898	folio_put(folio);
3899	return tmp;
3900	}
3901	} else {
3902	wp_page_reuse(vmf, folio);
3903	folio_lock(folio);
3904	}
3905	ret \|= fault_dirty_shared_page(vmf);
3906	folio_put(folio);
3907
3908	return ret;
3909	}
3910
3911	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
3912	static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
3913	struct vm_area_struct *vma)
3914	{
3915	bool exclusive = false;
3916
3917	/ Let's just free up a large folio if only a single page is mapped. /
3918	if (folio_large_mapcount(folio) <= `1`)
3919	return false;
3920
3921	/*
3922	* The assumption for anonymous folios is that each page can only get
3923	* mapped once into each MM. The only exception are KSM folios, which
3924	* are always small.
3925	*
3926	* Each taken mapcount must be paired with exactly one taken reference,
3927	* whereby the refcount must be incremented before the mapcount when
3928	* mapping a page, and the refcount must be decremented after the
3929	* mapcount when unmapping a page.
3930	*
3931	* If all folio references are from mappings, and all mappings are in
3932	* the page tables of this MM, then this folio is exclusive to this MM.
3933	*/
3934	if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids))
3935	return false;
3936
3937	VM_WARN_ON_ONCE(folio_test_ksm(folio));
3938
3939	if (unlikely(folio_test_swapcache(folio))) {
3940	/*
3941	* Note: freeing up the swapcache will fail if some PTEs are
3942	* still swap entries.
3943	*/
3944	if (!folio_trylock(folio))
3945	return false;
3946	folio_free_swap(folio);
3947	folio_unlock(folio);
3948	}
3949
3950	if (folio_large_mapcount(folio) != folio_ref_count(folio))
3951	return false;
3952
3953	/ Stabilize the mapcount vs. refcount and recheck. /
3954	folio_lock_large_mapcount(folio);
3955	VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio);
3956
3957	if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids))
3958	goto unlock;
3959	if (folio_large_mapcount(folio) != folio_ref_count(folio))
3960	goto unlock;
3961
3962	VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio);
3963	VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio);
3964	VM_WARN_ON_ONCE(folio_mm_id(folio, `0`) != vma->vm_mm->mm_id &&
3965	folio_mm_id(folio, `1`) != vma->vm_mm->mm_id);
3966
3967	/*
3968	* Do we need the folio lock? Likely not. If there would have been
3969	* references from page migration/swapout, we would have detected
3970	* an additional folio reference and never ended up here.
3971	*/
3972	exclusive = true;
3973	unlock:
3974	folio_unlock_large_mapcount(folio);
3975	return exclusive;
3976	}
3977	#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
3978	static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
3979	struct vm_area_struct *vma)
3980	{
3981	BUILD_BUG();
3982	}
3983	#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3984
3985	static bool wp_can_reuse_anon_folio(struct folio *folio,
3986	struct vm_area_struct *vma)
3987	{
3988	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && folio_test_large(folio))
3989	return __wp_can_reuse_large_anon_folio(folio, vma);
3990
3991	/*
3992	* We have to verify under folio lock: these early checks are
3993	* just an optimization to avoid locking the folio and freeing
3994	* the swapcache if there is little hope that we can reuse.
3995	*
3996	* KSM doesn't necessarily raise the folio refcount.
3997	*/
3998	if (folio_test_ksm(folio) \|\| folio_ref_count(folio) > `3`)
3999	return false;
4000	if (!folio_test_lru(folio))
4001	/*
4002	* We cannot easily detect+handle references from
4003	* remote LRU caches or references to LRU folios.
4004	*/
4005	lru_add_drain();
4006	if (folio_ref_count(folio) > `1` + folio_test_swapcache(folio))
4007	return false;
4008	if (!folio_trylock(folio))
4009	return false;
4010	if (folio_test_swapcache(folio))
4011	folio_free_swap(folio);
4012	if (folio_test_ksm(folio) \|\| folio_ref_count(folio) != `1`) {
4013	folio_unlock(folio);
4014	return false;
4015	}
4016	/*
4017	* Ok, we've got the only folio reference from our mapping
4018	* and the folio is locked, it's dark out, and we're wearing
4019	* sunglasses. Hit it.
4020	*/
4021	folio_move_anon_rmap(folio, vma);
4022	folio_unlock(folio);
4023	return true;
4024	}
4025
4026	/*
4027	* This routine handles present pages, when
4028	* * users try to write to a shared page (FAULT_FLAG_WRITE)
4029	* * GUP wants to take a R/O pin on a possibly shared anonymous page
4030	* (FAULT_FLAG_UNSHARE)
4031	*
4032	* It is done by copying the page to a new address and decrementing the
4033	* shared-page counter for the old page.
4034	*
4035	* Note that this routine assumes that the protection checks have been
4036	* done by the caller (the low-level page fault routine in most cases).
4037	* Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've
4038	* done any necessary COW.
4039	*
4040	* In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even
4041	* though the page will change only once the write actually happens. This
4042	* avoids a few races, and potentially makes it more efficient.
4043	*
4044	* We enter with non-exclusive mmap_lock (to exclude vma changes,
4045	* but allow concurrent faults), with pte both mapped and locked.
4046	* We return with mmap_lock still held, but pte unmapped and unlocked.
4047	*/
4048	static vm_fault_t do_wp_page(struct vm_fault *vmf)
4049	__releases(vmf->ptl)
4050	{
4051	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
4052	struct vm_area_struct *vma = vmf->vma;
4053	struct folio *folio = NULL;
4054	pte_t pte;
4055
4056	if (likely(!unshare)) {
4057	if (userfaultfd_pte_wp(vma, pte: ptep_get(ptep: vmf->pte))) {
4058	if (!userfaultfd_wp_async(vma)) {
4059	pte_unmap_unlock(vmf->pte, vmf->ptl);
4060	return handle_userfault(vmf, VM_UFFD_WP);
4061	}
4062
4063	/*
4064	* Nothing needed (cache flush, TLB invalidations,
4065	* etc.) because we're only removing the uffd-wp bit,
4066	* which is completely invisible to the user.
4067	*/
4068	pte = pte_clear_uffd_wp(pte: ptep_get(ptep: vmf->pte));
4069
4070	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
4071	/*
4072	* Update this to be prepared for following up CoW
4073	* handling
4074	*/
4075	vmf->orig_pte = pte;
4076	}
4077
4078	/*
4079	* Userfaultfd write-protect can defer flushes. Ensure the TLB
4080	* is flushed in this case before copying.
4081	*/
4082	if (unlikely(userfaultfd_wp(vmf->vma) &&
4083	mm_tlb_flush_pending(vmf->vma->vm_mm)))
4084	flush_tlb_page(vma: vmf->vma, a: vmf->address);
4085	}
4086
4087	vmf->page = vm_normal_page(vma, addr: vmf->address, pte: vmf->orig_pte);
4088
4089	if (vmf->page)
4090	folio = page_folio(vmf->page);
4091
4092	/*
4093	* Shared mapping: we are guaranteed to have VM_WRITE and
4094	* FAULT_FLAG_WRITE set at this point.
4095	*/
4096	if (vma->vm_flags & (VM_SHARED \| VM_MAYSHARE)) {
4097	/*
4098	* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
4099	* VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called.
4100	*
4101	* We should not cow pages in a shared writeable mapping.
4102	* Just mark the pages writable and/or call ops->pfn_mkwrite.
4103	*/
4104	if (!vmf->page \|\| is_fsdax_page(page: vmf->page)) {
4105	vmf->page = NULL;
4106	return wp_pfn_shared(vmf);
4107	}
4108	return wp_page_shared(vmf, folio);
4109	}
4110
4111	/*
4112	* Private mapping: create an exclusive anonymous page copy if reuse
4113	* is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
4114	*
4115	* If we encounter a page that is marked exclusive, we must reuse
4116	* the page without further checks.
4117	*/
4118	if (folio && folio_test_anon(folio) &&
4119	(PageAnonExclusive(page: vmf->page) \|\| wp_can_reuse_anon_folio(folio, vma))) {
4120	if (!PageAnonExclusive(page: vmf->page))
4121	SetPageAnonExclusive(vmf->page);
4122	if (unlikely(unshare)) {
4123	pte_unmap_unlock(vmf->pte, vmf->ptl);
4124	return `0`;
4125	}
4126	wp_page_reuse(vmf, folio);
4127	return `0`;
4128	}
4129	/*
4130	* Ok, we need to copy. Oh, well..
4131	*/
4132	if (folio)
4133	folio_get(folio);
4134
4135	pte_unmap_unlock(vmf->pte, vmf->ptl);
4136	#ifdef CONFIG_KSM
4137	if (folio && folio_test_ksm(folio))
4138	count_vm_event(COW_KSM);
4139	#endif
4140	return wp_page_copy(vmf);
4141	}
4142
/*
 * Zap the address range [@start_addr, @end_addr) of @vma by forwarding it,
 * as a start address plus length, to zap_page_range_single().
 */
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}
4149
4150	static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
4151	pgoff_t first_index,
4152	pgoff_t last_index,
4153	struct zap_details *details)
4154	{
4155	struct vm_area_struct *vma;
4156	pgoff_t vba, vea, zba, zea;
4157
4158	vma_interval_tree_foreach(vma, root, first_index, last_index) {
4159	vba = vma->vm_pgoff;
4160	vea = vba + vma_pages(vma) - `1`;
4161	zba = max(first_index, vba);
4162	zea = min(last_index, vea);
4163
4164	unmap_mapping_range_vma(vma,
4165	start_addr: ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
4166	end_addr: ((zea - vba + `1`) << PAGE_SHIFT) + vma->vm_start,
4167	details);
4168	}
4169	}
4170
4171	/**
4172	* unmap_mapping_folio() - Unmap single folio from processes.
4173	* @folio: The locked folio to be unmapped.
4174	*
4175	* Unmap this folio from any userspace process which still has it mmaped.
4176	* Typically, for efficiency, the range of nearby pages has already been
4177	* unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once
4178	* truncation or invalidation holds the lock on a folio, it may find that
4179	* the page has been remapped again: and then uses unmap_mapping_folio()
4180	* to unmap it finally.
4181	*/
4182	void unmap_mapping_folio(struct folio *folio)
4183	{
4184	struct address_space *mapping = folio->mapping;
4185	struct zap_details details = { };
4186	pgoff_t first_index;
4187	pgoff_t last_index;
4188
4189	VM_BUG_ON(!folio_test_locked(folio));
4190
4191	first_index = folio->index;
4192	last_index = folio_next_index(folio) - `1`;
4193
4194	details.even_cows = false;
4195	details.single_folio = folio;
4196	details.zap_flags = ZAP_FLAG_DROP_MARKER;
4197
4198	i_mmap_lock_read(mapping);
4199	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
4200	unmap_mapping_range_tree(root: &mapping->i_mmap, first_index,
4201	last_index, details: &details);
4202	i_mmap_unlock_read(mapping);
4203	}
4204
4205	/**
4206	* unmap_mapping_pages() - Unmap pages from processes.
4207	* @mapping: The address space containing pages to be unmapped.
4208	* @start: Index of first page to be unmapped.
4209	* @nr: Number of pages to be unmapped. 0 to unmap to end of file.
4210	* @even_cows: Whether to unmap even private COWed pages.
4211	*
4212	* Unmap the pages in this address space from any userspace process which
4213	* has them mmaped. Generally, you want to remove COWed pages as well when
4214	* a file is being truncated, but not when invalidating pages from the page
4215	* cache.
4216	*/
4217	void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
4218	pgoff_t nr, bool even_cows)
4219	{
4220	struct zap_details details = { };
4221	pgoff_t first_index = start;
4222	pgoff_t last_index = start + nr - `1`;
4223
4224	details.even_cows = even_cows;
4225	if (last_index < first_index)
4226	last_index = ULONG_MAX;
4227
4228	i_mmap_lock_read(mapping);
4229	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
4230	unmap_mapping_range_tree(root: &mapping->i_mmap, first_index,
4231	last_index, details: &details);
4232	i_mmap_unlock_read(mapping);
4233	}
4234	EXPORT_SYMBOL_GPL(unmap_mapping_pages);
4235
4236	/**
4237	* unmap_mapping_range - unmap the portion of all mmaps in the specified
4238	* address_space corresponding to the specified byte range in the underlying
4239	* file.
4240	*
4241	* @mapping: the address space containing mmaps to be unmapped.
4242	* @holebegin: byte in first page to unmap, relative to the start of
4243	* the underlying file. This will be rounded down to a PAGE_SIZE
4244	* boundary. Note that this is different from truncate_pagecache(), which
4245	* must keep the partial page. In contrast, we must get rid of
4246	* partial pages.
4247	* @holelen: size of prospective hole in bytes. This will be rounded
4248	* up to a PAGE_SIZE boundary. A holelen of zero truncates to the
4249	* end of the file.
4250	* @even_cows: 1 when truncating a file, unmap even private COWed pages;
4251	* but 0 when invalidating pagecache, don't throw away private data.
4252	*/
4253	void unmap_mapping_range(struct address_space *mapping,
4254	loff_t const holebegin, loff_t const holelen, int even_cows)
4255	{
4256	pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT;
4257	pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - `1`) >> PAGE_SHIFT;
4258
4259	/ Check for overflow. /
4260	if (sizeof(holelen) > sizeof(hlen)) {
4261	long long holeend =
4262	(holebegin + holelen + PAGE_SIZE - `1`) >> PAGE_SHIFT;
4263	if (holeend & ~(long long)ULONG_MAX)
4264	hlen = ULONG_MAX - hba + `1`;
4265	}
4266
4267	unmap_mapping_pages(mapping, hba, hlen, even_cows);
4268	}
4269	EXPORT_SYMBOL(unmap_mapping_range);
4270
4271	/*
4272	* Restore a potential device exclusive pte to a working pte entry
4273	*/
4274	static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
4275	{
4276	struct folio *folio = page_folio(vmf->page);
4277	struct vm_area_struct *vma = vmf->vma;
4278	struct mmu_notifier_range range;
4279	vm_fault_t ret;
4280
4281	/*
4282	* We need a reference to lock the folio because we don't hold
4283	* the PTL so a racing thread can remove the device-exclusive
4284	* entry and unmap it. If the folio is free the entry must
4285	* have been removed already. If it happens to have already
4286	* been re-allocated after being freed all we do is lock and
4287	* unlock it.
4288	*/
4289	if (!folio_try_get(folio))
4290	return `0`;
4291
4292	ret = folio_lock_or_retry(folio, vmf);
4293	if (ret) {
4294	folio_put(folio);
4295	return ret;
4296	}
4297	mmu_notifier_range_init_owner(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`,
4298	mm: vma->vm_mm, start: vmf->address & PAGE_MASK,
4299	end: (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
4300	mmu_notifier_invalidate_range_start(range: &range);
4301
4302	vmf->pte = pte_offset_map_lock(mm: vma->vm_mm, pmd: vmf->pmd, addr: vmf->address,
4303	ptlp: &vmf->ptl);
4304	if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
4305	restore_exclusive_pte(vma, folio, page: vmf->page, address: vmf->address,
4306	ptep: vmf->pte, orig_pte: vmf->orig_pte);
4307
4308	if (vmf->pte)
4309	pte_unmap_unlock(vmf->pte, vmf->ptl);
4310	folio_unlock(folio);
4311	folio_put(folio);
4312
4313	mmu_notifier_invalidate_range_end(range: &range);
4314	return `0`;
4315	}
4316
4317	static inline bool should_try_to_free_swap(struct folio *folio,
4318	struct vm_area_struct *vma,
4319	unsigned int fault_flags)
4320	{
4321	if (!folio_test_swapcache(folio))
4322	return false;
4323	if (mem_cgroup_swap_full(folio) \|\| (vma->vm_flags & VM_LOCKED) \|\|
4324	folio_test_mlocked(folio))
4325	return true;
4326	/*
4327	* If we want to map a page that's in the swapcache writable, we
4328	* have to detect via the refcount if we're really the exclusive
4329	* user. Try freeing the swapcache to get rid of the swapcache
4330	* reference only in case it's likely that we'll be the exlusive user.
4331	*/
4332	return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
4333	folio_ref_count(folio) == (`1` + folio_nr_pages(folio));
4334	}
4335
4336	static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
4337	{
4338	vmf->pte = pte_offset_map_lock(mm: vmf->vma->vm_mm, pmd: vmf->pmd,
4339	addr: vmf->address, ptlp: &vmf->ptl);
4340	if (!vmf->pte)
4341	return `0`;
4342	/*
4343	* Be careful so that we will only recover a special uffd-wp pte into a
4344	* none pte. Otherwise it means the pte could have changed, so retry.
4345	*
4346	* This should also cover the case where e.g. the pte changed
4347	* quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED.
4348	* So is_pte_marker() check is not enough to safely drop the pte.
4349	*/
4350	if (pte_same(a: vmf->orig_pte, b: ptep_get(ptep: vmf->pte)))
4351	pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
4352	pte_unmap_unlock(vmf->pte, vmf->ptl);
4353	return `0`;
4354	}
4355
4356	static vm_fault_t do_pte_missing(struct vm_fault *vmf)
4357	{
4358	if (vma_is_anonymous(vma: vmf->vma))
4359	return do_anonymous_page(vmf);
4360	else
4361	return do_fault(vmf);
4362	}
4363
4364	/*
4365	* This is actually a page-missing access, but with uffd-wp special pte
4366	* installed. It means this pte was wr-protected before being unmapped.
4367	*/
4368	static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
4369	{
4370	/*
4371	* Just in case there're leftover special ptes even after the region
4372	* got unregistered - we can simply clear them.
4373	*/
4374	if (unlikely(!userfaultfd_wp(vmf->vma)))
4375	return pte_marker_clear(vmf);
4376
4377	return do_pte_missing(vmf);
4378	}
4379
4380	static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
4381	{
4382	swp_entry_t entry = pte_to_swp_entry(pte: vmf->orig_pte);
4383	unsigned long marker = pte_marker_get(entry);
4384
4385	/*
4386	* PTE markers should never be empty. If anything weird happened,
4387	* the best thing to do is to kill the process along with its mm.
4388	*/
4389	if (WARN_ON_ONCE(!marker))
4390	return VM_FAULT_SIGBUS;
4391
4392	/ Higher priority than uffd-wp when data corrupted /
4393	if (marker & PTE_MARKER_POISONED)
4394	return VM_FAULT_HWPOISON;
4395
4396	/ Hitting a guard page is always a fatal condition. /
4397	if (marker & PTE_MARKER_GUARD)
4398	return VM_FAULT_SIGSEGV;
4399
4400	if (pte_marker_entry_uffd_wp(entry))
4401	return pte_marker_handle_uffd_wp(vmf);
4402
4403	/ This is an unknown pte marker /
4404	return VM_FAULT_SIGBUS;
4405	}
4406
4407	static struct folio __alloc_swap_folio(struct* vm_fault *vmf)
4408	{
4409	struct vm_area_struct *vma = vmf->vma;
4410	struct folio *folio;
4411	swp_entry_t entry;
4412
4413	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, `0`, vma, vmf->address);
4414	if (!folio)
4415	return NULL;
4416
4417	entry = pte_to_swp_entry(pte: vmf->orig_pte);
4418	if (mem_cgroup_swapin_charge_folio(folio, mm: vma->vm_mm,
4419	GFP_KERNEL, entry)) {
4420	folio_put(folio);
4421	return NULL;
4422	}
4423
4424	return folio;
4425	}
4426
4427	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4428	/*
4429	* Check if the PTEs within a range are contiguous swap entries
4430	* and have consistent swapcache, zeromap.
4431	*/
4432	static bool can_swapin_thp(struct vm_fault vmf, pte_t ptep, int nr_pages)
4433	{
4434	unsigned long addr;
4435	swp_entry_t entry;
4436	int idx;
4437	pte_t pte;
4438
4439	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
4440	idx = (vmf->address - addr) / PAGE_SIZE;
4441	pte = ptep_get(ptep);
4442
4443	if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx)))
4444	return false;
4445	entry = pte_to_swp_entry(pte);
4446	if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
4447	return false;
4448
4449	/*
4450	* swap_read_folio() can't handle the case a large folio is hybridly
4451	* from different backends. And they are likely corner cases. Similar
4452	* things might be added once zswap support large folios.
4453	*/
4454	if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
4455	return false;
4456	if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
4457	return false;
4458
4459	return true;
4460	}
4461
4462	static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset,
4463	unsigned long addr,
4464	unsigned long orders)
4465	{
4466	int order, nr;
4467
4468	order = highest_order(orders);
4469
4470	/*
4471	* To swap in a THP with nr pages, we require that its first swap_offset
4472	* is aligned with that number, as it was when the THP was swapped out.
4473	* This helps filter out most invalid entries.
4474	*/
4475	while (orders) {
4476	nr = `1` << order;
4477	if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr)
4478	break;
4479	order = next_order(&orders, order);
4480	}
4481
4482	return orders;
4483	}
4484
4485	static struct folio alloc_swap_folio(struct* vm_fault *vmf)
4486	{
4487	struct vm_area_struct *vma = vmf->vma;
4488	unsigned long orders;
4489	struct folio *folio;
4490	unsigned long addr;
4491	swp_entry_t entry;
4492	spinlock_t *ptl;
4493	pte_t *pte;
4494	gfp_t gfp;
4495	int order;
4496
4497	/*
4498	* If uffd is active for the vma we need per-page fault fidelity to
4499	* maintain the uffd semantics.
4500	*/
4501	if (unlikely(userfaultfd_armed(vma)))
4502	goto fallback;
4503
4504	/*
4505	* A large swapped out folio could be partially or fully in zswap. We
4506	* lack handling for such cases, so fallback to swapping in order-0
4507	* folio.
4508	*/
4509	if (!zswap_never_enabled())
4510	goto fallback;
4511
4512	entry = pte_to_swp_entry(vmf->orig_pte);
4513	/*
4514	* Get a list of all the (large) orders below PMD_ORDER that are enabled
4515	* and suitable for swapping THP.
4516	*/
4517	orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
4518	BIT(PMD_ORDER) - `1`);
4519	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
4520	orders = thp_swap_suitable_orders(swp_offset(entry),
4521	vmf->address, orders);
4522
4523	if (!orders)
4524	goto fallback;
4525
4526	pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
4527	vmf->address & PMD_MASK, &ptl);
4528	if (unlikely(!pte))
4529	goto fallback;
4530
4531	/*
4532	* For do_swap_page, find the highest order where the aligned range is
4533	* completely swap entries with contiguous swap offsets.
4534	*/
4535	order = highest_order(orders);
4536	while (orders) {
4537	addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
4538	if (can_swapin_thp(vmf, pte + pte_index(addr), `1` << order))
4539	break;
4540	order = next_order(&orders, order);
4541	}
4542
4543	pte_unmap_unlock(pte, ptl);
4544
4545	/ Try allocating the highest of the remaining orders. /
4546	gfp = vma_thp_gfp_mask(vma);
4547	while (orders) {
4548	addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
4549	folio = vma_alloc_folio(gfp, order, vma, addr);
4550	if (folio) {
4551	if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
4552	gfp, entry))
4553	return folio;
4554	count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
4555	folio_put(folio);
4556	}
4557	count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
4558	order = next_order(&orders, order);
4559	}
4560
4561	fallback:
4562	return __alloc_swap_folio(vmf);
4563	}
4564	#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
/* Without CONFIG_TRANSPARENT_HUGEPAGE only order-0 folios are allocated. */
static struct folio *alloc_swap_folio(struct vm_fault *vmf)
{
	return __alloc_swap_folio(vmf);
}
4569	#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
4570
/*
 * Wait queue used by do_swap_page(): a fault that fails swapcache_prepare()
 * on the swapcache-bypass path queues itself here and sleeps with a timeout
 * of 1 before backing out.
 * NOTE(review): the wake-up side is not visible in this part of the file -
 * confirm where waiters are woken.
 */
static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
4572
4573	/*
4574	* We enter with non-exclusive mmap_lock (to exclude vma changes,
4575	* but allow concurrent faults), and pte mapped but not yet locked.
4576	* We return with pte unmapped and unlocked.
4577	*
4578	* We return with the mmap_lock locked or unlocked in the same cases
4579	* as does filemap_fault().
4580	*/
4581	vm_fault_t do_swap_page(struct vm_fault *vmf)
4582	{
4583	struct vm_area_struct *vma = vmf->vma;
4584	struct folio swapcache, folio = NULL;
4585	DECLARE_WAITQUEUE(wait, current);
4586	struct page *page;
4587	struct swap_info_struct *si = NULL;
4588	rmap_t rmap_flags = RMAP_NONE;
4589	bool need_clear_cache = false;
4590	bool exclusive = false;
4591	swp_entry_t entry;
4592	pte_t pte;
4593	vm_fault_t ret = `0`;
4594	void *shadow = NULL;
4595	int nr_pages;
4596	unsigned long page_idx;
4597	unsigned long address;
4598	pte_t *ptep;
4599
4600	if (!pte_unmap_same(vmf))
4601	goto out;
4602
4603	entry = pte_to_swp_entry(pte: vmf->orig_pte);
4604	if (unlikely(non_swap_entry(entry))) {
4605	if (is_migration_entry(entry)) {
4606	migration_entry_wait(mm: vma->vm_mm, pmd: vmf->pmd,
4607	address: vmf->address);
4608	} else if (is_device_exclusive_entry(entry)) {
4609	vmf->page = pfn_swap_entry_to_page(entry);
4610	ret = remove_device_exclusive_entry(vmf);
4611	} else if (is_device_private_entry(entry)) {
4612	if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
4613	/*
4614	* migrate_to_ram is not yet ready to operate
4615	* under VMA lock.
4616	*/
4617	vma_end_read(vma);
4618	ret = VM_FAULT_RETRY;
4619	goto out;
4620	}
4621
4622	vmf->page = pfn_swap_entry_to_page(entry);
4623	vmf->pte = pte_offset_map_lock(mm: vma->vm_mm, pmd: vmf->pmd,
4624	addr: vmf->address, ptlp: &vmf->ptl);
4625	if (unlikely(!vmf->pte \|\|
4626	!pte_same(ptep_get(vmf->pte),
4627	vmf->orig_pte)))
4628	goto unlock;
4629
4630	/*
4631	* Get a page reference while we know the page can't be
4632	* freed.
4633	*/
4634	if (trylock_page(page: vmf->page)) {
4635	struct dev_pagemap *pgmap;
4636
4637	get_page(page: vmf->page);
4638	pte_unmap_unlock(vmf->pte, vmf->ptl);
4639	pgmap = page_pgmap(page: vmf->page);
4640	ret = pgmap->ops->migrate_to_ram(vmf);
4641	unlock_page(page: vmf->page);
4642	put_page(page: vmf->page);
4643	} else {
4644	pte_unmap_unlock(vmf->pte, vmf->ptl);
4645	}
4646	} else if (is_hwpoison_entry(swp: entry)) {
4647	ret = VM_FAULT_HWPOISON;
4648	} else if (is_pte_marker_entry(entry)) {
4649	ret = handle_pte_marker(vmf);
4650	} else {
4651	print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
4652	ret = VM_FAULT_SIGBUS;
4653	}
4654	goto out;
4655	}
4656
4657	/ Prevent swapoff from happening to us. /
4658	si = get_swap_device(entry);
4659	if (unlikely(!si))
4660	goto out;
4661
4662	folio = swap_cache_get_folio(entry);
4663	if (folio)
4664	swap_update_readahead(folio, vma, addr: vmf->address);
4665	swapcache = folio;
4666
4667	if (!folio) {
4668	if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
4669	__swap_count(entry) == `1`) {
4670	/ skip swapcache /
4671	folio = alloc_swap_folio(vmf);
4672	if (folio) {
4673	__folio_set_locked(folio);
4674	__folio_set_swapbacked(folio);
4675
4676	nr_pages = folio_nr_pages(folio);
4677	if (folio_test_large(folio))
4678	entry.val = ALIGN_DOWN(entry.val, nr_pages);
4679	/*
4680	* Prevent parallel swapin from proceeding with
4681	* the cache flag. Otherwise, another thread
4682	* may finish swapin first, free the entry, and
4683	* swapout reusing the same entry. It's
4684	* undetectable as pte_same() returns true due
4685	* to entry reuse.
4686	*/
4687	if (swapcache_prepare(entry, nr: nr_pages)) {
4688	/*
4689	* Relax a bit to prevent rapid
4690	* repeated page faults.
4691	*/
4692	add_wait_queue(wq_head: &swapcache_wq, wq_entry: &wait);
4693	schedule_timeout_uninterruptible(timeout: `1`);
4694	remove_wait_queue(wq_head: &swapcache_wq, wq_entry: &wait);
4695	goto out_page;
4696	}
4697	need_clear_cache = true;
4698
4699	memcg1_swapin(entry, nr_pages);
4700
4701	shadow = swap_cache_get_shadow(entry);
4702	if (shadow)
4703	workingset_refault(folio, shadow);
4704
4705	folio_add_lru(folio);
4706
4707	/ To provide entry to swap_read_folio() /
4708	folio->swap = entry;
4709	swap_read_folio(folio, NULL);
4710	folio->private = NULL;
4711	}
4712	} else {
4713	folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
4714	vmf);
4715	swapcache = folio;
4716	}
4717
4718	if (!folio) {
4719	/*
4720	* Back out if somebody else faulted in this pte
4721	* while we released the pte lock.
4722	*/
4723	vmf->pte = pte_offset_map_lock(mm: vma->vm_mm, pmd: vmf->pmd,
4724	addr: vmf->address, ptlp: &vmf->ptl);
4725	if (likely(vmf->pte &&
4726	pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
4727	ret = VM_FAULT_OOM;
4728	goto unlock;
4729	}
4730
4731	/ Had to read the page from swap area: Major fault /
4732	ret = VM_FAULT_MAJOR;
4733	count_vm_event(item: PGMAJFAULT);
4734	count_memcg_event_mm(mm: vma->vm_mm, idx: PGMAJFAULT);
4735	}
4736
4737	ret \|= folio_lock_or_retry(folio, vmf);
4738	if (ret & VM_FAULT_RETRY)
4739	goto out_release;
4740
4741	page = folio_file_page(folio, index: swp_offset(entry));
4742	if (swapcache) {
4743	/*
4744	* Make sure folio_free_swap() or swapoff did not release the
4745	* swapcache from under us. The page pin, and pte_same test
4746	* below, are not enough to exclude that. Even if it is still
4747	* swapcache, we need to check that the page's swap has not
4748	* changed.
4749	*/
4750	if (unlikely(!folio_matches_swap_entry(folio, entry)))
4751	goto out_page;
4752
4753	if (unlikely(PageHWPoison(page))) {
4754	/*
4755	* hwpoisoned dirty swapcache pages are kept for killing
4756	* owner processes (which may be unknown at hwpoison time)
4757	*/
4758	ret = VM_FAULT_HWPOISON;
4759	goto out_page;
4760	}
4761
4762	/*
4763	* KSM sometimes has to copy on read faults, for example, if
4764	* folio->index of non-ksm folios would be nonlinear inside the
4765	* anon VMA -- the ksm flag is lost on actual swapout.
4766	*/
4767	folio = ksm_might_need_to_copy(folio, vma, addr: vmf->address);
4768	if (unlikely(!folio)) {
4769	ret = VM_FAULT_OOM;
4770	folio = swapcache;
4771	goto out_page;
4772	} else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
4773	ret = VM_FAULT_HWPOISON;
4774	folio = swapcache;
4775	goto out_page;
4776	}
4777	if (folio != swapcache)
4778	page = folio_page(folio, `0`);
4779
4780	/*
4781	* If we want to map a page that's in the swapcache writable, we
4782	* have to detect via the refcount if we're really the exclusive
4783	* owner. Try removing the extra reference from the local LRU
4784	* caches if required.
4785	*/
4786	if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
4787	!folio_test_ksm(folio) && !folio_test_lru(folio))
4788	lru_add_drain();
4789	}
4790
4791	folio_throttle_swaprate(folio, GFP_KERNEL);
4792
4793	/*
4794	* Back out if somebody else already faulted in this pte.
4795	*/
4796	vmf->pte = pte_offset_map_lock(mm: vma->vm_mm, pmd: vmf->pmd, addr: vmf->address,
4797	ptlp: &vmf->ptl);
4798	if (unlikely(!vmf->pte \|\| !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
4799	goto out_nomap;
4800
4801	if (unlikely(!folio_test_uptodate(folio))) {
4802	ret = VM_FAULT_SIGBUS;
4803	goto out_nomap;
4804	}
4805
4806	/ allocated large folios for SWP_SYNCHRONOUS_IO /
4807	if (folio_test_large(folio) && !folio_test_swapcache(folio)) {
4808	unsigned long nr = folio_nr_pages(folio);
4809	unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE);
4810	unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE;
4811	pte_t *folio_ptep = vmf->pte - idx;
4812	pte_t folio_pte = ptep_get(ptep: folio_ptep);
4813
4814	if (!pte_same(a: folio_pte, b: pte_move_swp_offset(pte: vmf->orig_pte, delta: -idx)) \|\|
4815	swap_pte_batch(start_ptep: folio_ptep, max_nr: nr, pte: folio_pte) != nr)
4816	goto out_nomap;
4817
4818	page_idx = idx;
4819	address = folio_start;
4820	ptep = folio_ptep;
4821	goto check_folio;
4822	}
4823
4824	nr_pages = `1`;
4825	page_idx = `0`;
4826	address = vmf->address;
4827	ptep = vmf->pte;
4828	if (folio_test_large(folio) && folio_test_swapcache(folio)) {
4829	int nr = folio_nr_pages(folio);
4830	unsigned long idx = folio_page_idx(folio, page);
4831	unsigned long folio_start = address - idx * PAGE_SIZE;
4832	unsigned long folio_end = folio_start + nr * PAGE_SIZE;
4833	pte_t *folio_ptep;
4834	pte_t folio_pte;
4835
4836	if (unlikely(folio_start < max(address & PMD_MASK, vma->vm_start)))
4837	goto check_folio;
4838	if (unlikely(folio_end > pmd_addr_end(address, vma->vm_end)))
4839	goto check_folio;
4840
4841	folio_ptep = vmf->pte - idx;
4842	folio_pte = ptep_get(ptep: folio_ptep);
4843	if (!pte_same(a: folio_pte, b: pte_move_swp_offset(pte: vmf->orig_pte, delta: -idx)) \|\|
4844	swap_pte_batch(start_ptep: folio_ptep, max_nr: nr, pte: folio_pte) != nr)
4845	goto check_folio;
4846
4847	page_idx = idx;
4848	address = folio_start;
4849	ptep = folio_ptep;
4850	nr_pages = nr;
4851	entry = folio->swap;
4852	page = &folio->page;
4853	}
4854
4855	check_folio:
4856	/*
4857	* PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte
4858	* must never point at an anonymous page in the swapcache that is
4859	* PG_anon_exclusive. Sanity check that this holds and especially, that
4860	* no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity
4861	* check after taking the PT lock and making sure that nobody
4862	* concurrently faulted in this page and set PG_anon_exclusive.
4863	*/
4864	BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
4865	BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));
4866
4867	/*
4868	* Check under PT lock (to protect against concurrent fork() sharing
4869	* the swap entry concurrently) for certainly exclusive pages.
4870	*/
4871	if (!folio_test_ksm(folio)) {
4872	exclusive = pte_swp_exclusive(pte: vmf->orig_pte);
4873	if (folio != swapcache) {
4874	/*
4875	* We have a fresh page that is not exposed to the
4876	* swapcache -> certainly exclusive.
4877	*/
4878	exclusive = true;
4879	} else if (exclusive && folio_test_writeback(folio) &&
4880	data_race(si->flags & SWP_STABLE_WRITES)) {
4881	/*
4882	* This is tricky: not all swap backends support
4883	* concurrent page modifications while under writeback.
4884	*
4885	* So if we stumble over such a page in the swapcache
4886	* we must not set the page exclusive, otherwise we can
4887	* map it writable without further checks and modify it
4888	* while still under writeback.
4889	*
4890	* For these problematic swap backends, simply drop the
4891	* exclusive marker: this is perfectly fine as we start
4892	* writeback only if we fully unmapped the page and
4893	* there are no unexpected references on the page after
4894	* unmapping succeeded. After fully unmapped, no
4895	* further GUP references (FOLL_GET and FOLL_PIN) can
4896	* appear, so dropping the exclusive marker and mapping
4897	* it only R/O is fine.
4898	*/
4899	exclusive = false;
4900	}
4901	}
4902
4903	/*
4904	* Some architectures may have to restore extra metadata to the page
4905	* when reading from swap. This metadata may be indexed by swap entry
4906	* so this must be called before swap_free().
4907	*/
4908	arch_swap_restore(entry: folio_swap(entry, folio), folio);
4909
4910	/*
4911	* Remove the swap entry and conditionally try to free up the swapcache.
4912	* We're already holding a reference on the page but haven't mapped it
4913	* yet.
4914	*/
4915	swap_free_nr(entry, nr_pages);
4916	if (should_try_to_free_swap(folio, vma, fault_flags: vmf->flags))
4917	folio_free_swap(folio);
4918
4919	add_mm_counter(mm: vma->vm_mm, member: MM_ANONPAGES, value: nr_pages);
4920	add_mm_counter(mm: vma->vm_mm, member: MM_SWAPENTS, value: -nr_pages);
4921	pte = mk_pte(page, pgprot: vma->vm_page_prot);
4922	if (pte_swp_soft_dirty(pte: vmf->orig_pte))
4923	pte = pte_mksoft_dirty(pte);
4924	if (pte_swp_uffd_wp(pte: vmf->orig_pte))
4925	pte = pte_mkuffd_wp(pte);
4926
4927	/*
4928	* Same logic as in do_wp_page(); however, optimize for pages that are
4929	* certainly not shared either because we just allocated them without
4930	* exposing them to the swapcache or because the swap entry indicates
4931	* exclusivity.
4932	*/
4933	if (!folio_test_ksm(folio) &&
4934	(exclusive \|\| folio_ref_count(folio) == `1`)) {
4935	if ((vma->vm_flags & VM_WRITE) && !userfaultfd_pte_wp(vma, pte) &&
4936	!pte_needs_soft_dirty_wp(vma, pte)) {
4937	pte = pte_mkwrite(pte, vma);
4938	if (vmf->flags & FAULT_FLAG_WRITE) {
4939	pte = pte_mkdirty(pte);
4940	vmf->flags &= ~FAULT_FLAG_WRITE;
4941	}
4942	}
4943	rmap_flags \|= RMAP_EXCLUSIVE;
4944	}
4945	folio_ref_add(folio, nr: nr_pages - `1`);
4946	flush_icache_pages(vma, page, nr: nr_pages);
4947	vmf->orig_pte = pte_advance_pfn(pte, nr: page_idx);
4948
4949	/ ksm created a completely new copy /
4950	if (unlikely(folio != swapcache && swapcache)) {
4951	folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
4952	folio_add_lru_vma(folio, vma);
4953	} else if (!folio_test_anon(folio)) {
4954	/*
4955	* We currently only expect small !anon folios which are either
4956	* fully exclusive or fully shared, or new allocated large
4957	* folios which are fully exclusive. If we ever get large
4958	* folios within swapcache here, we have to be careful.
4959	*/
4960	VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio));
4961	VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
4962	folio_add_new_anon_rmap(folio, vma, address, flags: rmap_flags);
4963	} else {
4964	folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address,
4965	flags: rmap_flags);
4966	}
4967
4968	VM_BUG_ON(!folio_test_anon(folio) \|\|
4969	(pte_write(pte) && !PageAnonExclusive(page)));
4970	set_ptes(mm: vma->vm_mm, addr: address, ptep, pte, nr: nr_pages);
4971	arch_do_swap_page_nr(mm: vma->vm_mm, vma, addr: address,
4972	pte, oldpte: pte, nr: nr_pages);
4973
4974	folio_unlock(folio);
4975	if (folio != swapcache && swapcache) {
4976	/*
4977	* Hold the lock to avoid the swap entry to be reused
4978	* until we take the PT lock for the pte_same() check
4979	* (to avoid false positives from pte_same). For
4980	* further safety release the lock after the swap_free
4981	* so that the swap count won't change under a
4982	* parallel locked swapcache.
4983	*/
4984	folio_unlock(folio: swapcache);
4985	folio_put(folio: swapcache);
4986	}
4987
4988	if (vmf->flags & FAULT_FLAG_WRITE) {
4989	ret \|= do_wp_page(vmf);
4990	if (ret & VM_FAULT_ERROR)
4991	ret &= VM_FAULT_ERROR;
4992	goto out;
4993	}
4994
4995	/ No need to invalidate - it was non-present before /
4996	update_mmu_cache_range(vmf, vma, addr: address, ptep, nr: nr_pages);
4997	unlock:
4998	if (vmf->pte)
4999	pte_unmap_unlock(vmf->pte, vmf->ptl);
5000	out:
5001	/ Clear the swap cache pin for direct swapin after PTL unlock /
5002	if (need_clear_cache) {
5003	swapcache_clear(si, entry, nr: nr_pages);
5004	if (waitqueue_active(wq_head: &swapcache_wq))
5005	wake_up(&swapcache_wq);
5006	}
5007	if (si)
5008	put_swap_device(si);
5009	return ret;
5010	out_nomap:
5011	if (vmf->pte)
5012	pte_unmap_unlock(vmf->pte, vmf->ptl);
5013	out_page:
5014	folio_unlock(folio);
5015	out_release:
5016	folio_put(folio);
5017	if (folio != swapcache && swapcache) {
5018	folio_unlock(folio: swapcache);
5019	folio_put(folio: swapcache);
5020	}
5021	if (need_clear_cache) {
5022	swapcache_clear(si, entry, nr: nr_pages);
5023	if (waitqueue_active(wq_head: &swapcache_wq))
5024	wake_up(&swapcache_wq);
5025	}
5026	if (si)
5027	put_swap_device(si);
5028	return ret;
5029	}
5030
5031	static bool pte_range_none(pte_t pte, int* nr_pages)
5032	{
5033	int i;
5034
5035	for (i = `0`; i < nr_pages; i++) {
5036	if (!pte_none(pte: ptep_get_lockless(ptep: pte + i)))
5037	return false;
5038	}
5039
5040	return true;
5041	}
5042
5043	static struct folio alloc_anon_folio(struct* vm_fault *vmf)
5044	{
5045	struct vm_area_struct *vma = vmf->vma;
5046	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5047	unsigned long orders;
5048	struct folio *folio;
5049	unsigned long addr;
5050	pte_t *pte;
5051	gfp_t gfp;
5052	int order;
5053
5054	/*
5055	* If uffd is active for the vma we need per-page fault fidelity to
5056	* maintain the uffd semantics.
5057	*/
5058	if (unlikely(userfaultfd_armed(vma)))
5059	goto fallback;
5060
5061	/*
5062	* Get a list of all the (large) orders below PMD_ORDER that are enabled
5063	* for this vma. Then filter out the orders that can't be allocated over
5064	* the faulting address and still be fully contained in the vma.
5065	*/
5066	orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT,
5067	BIT(PMD_ORDER) - `1`);
5068	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
5069
5070	if (!orders)
5071	goto fallback;
5072
5073	pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
5074	if (!pte)
5075	return ERR_PTR(-EAGAIN);
5076
5077	/*
5078	* Find the highest order where the aligned range is completely
5079	* pte_none(). Note that all remaining orders will be completely
5080	* pte_none().
5081	*/
5082	order = highest_order(orders);
5083	while (orders) {
5084	addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
5085	if (pte_range_none(pte + pte_index(addr), `1` << order))
5086	break;
5087	order = next_order(&orders, order);
5088	}
5089
5090	pte_unmap(pte);
5091
5092	if (!orders)
5093	goto fallback;
5094
5095	/ Try allocating the highest of the remaining orders. /
5096	gfp = vma_thp_gfp_mask(vma);
5097	while (orders) {
5098	addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
5099	folio = vma_alloc_folio(gfp, order, vma, addr);
5100	if (folio) {
5101	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
5102	count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
5103	folio_put(folio);
5104	goto next;
5105	}
5106	folio_throttle_swaprate(folio, gfp);
5107	/*
5108	* When a folio is not zeroed during allocation
5109	* (__GFP_ZERO not used) or user folios require special
5110	* handling, folio_zero_user() is used to make sure
5111	* that the page corresponding to the faulting address
5112	* will be hot in the cache after zeroing.
5113	*/
5114	if (user_alloc_needs_zeroing())
5115	folio_zero_user(folio, vmf->address);
5116	return folio;
5117	}
5118	next:
5119	count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
5120	order = next_order(&orders, order);
5121	}
5122
5123	fallback:
5124	#endif
5125	return folio_prealloc(src_mm: vma->vm_mm, vma, addr: vmf->address, need_zero: true);
5126	}
5127
5128	/*
5129	* We enter with non-exclusive mmap_lock (to exclude vma changes,
5130	* but allow concurrent faults), and pte mapped but not yet locked.
5131	* We return with mmap_lock still held, but pte unmapped and unlocked.
5132	*/
5133	static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
5134	{
5135	struct vm_area_struct *vma = vmf->vma;
5136	unsigned long addr = vmf->address;
5137	struct folio *folio;
5138	vm_fault_t ret = `0`;
5139	int nr_pages = `1`;
5140	pte_t entry;
5141
5142	/ File mapping without ->vm_ops ? /
5143	if (vma->vm_flags & VM_SHARED)
5144	return VM_FAULT_SIGBUS;
5145
5146	/*
5147	* Use pte_alloc() instead of pte_alloc_map(), so that OOM can
5148	* be distinguished from a transient failure of pte_offset_map().
5149	*/
5150	if (pte_alloc(vma->vm_mm, vmf->pmd))
5151	return VM_FAULT_OOM;
5152
5153	/ Use the zero-page for reads /
5154	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
5155	!mm_forbids_zeropage(vma->vm_mm)) {
5156	entry = pte_mkspecial(pte: pfn_pte(page_nr: my_zero_pfn(addr: vmf->address),
5157	pgprot: vma->vm_page_prot));
5158	vmf->pte = pte_offset_map_lock(mm: vma->vm_mm, pmd: vmf->pmd,
5159	addr: vmf->address, ptlp: &vmf->ptl);
5160	if (!vmf->pte)
5161	goto unlock;
5162	if (vmf_pte_changed(vmf)) {
5163	update_mmu_tlb(vma, address: vmf->address, ptep: vmf->pte);
5164	goto unlock;
5165	}
5166	ret = check_stable_address_space(mm: vma->vm_mm);
5167	if (ret)
5168	goto unlock;
5169	/ Deliver the page fault to userland, check inside PT lock /
5170	if (userfaultfd_missing(vma)) {
5171	pte_unmap_unlock(vmf->pte, vmf->ptl);
5172	return handle_userfault(vmf, VM_UFFD_MISSING);
5173	}
5174	goto setpte;
5175	}
5176
5177	/ Allocate our own private page. /
5178	ret = vmf_anon_prepare(vmf);
5179	if (ret)
5180	return ret;
5181	/ Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault /
5182	folio = alloc_anon_folio(vmf);
5183	if (IS_ERR(ptr: folio))
5184	return `0`;
5185	if (!folio)
5186	goto oom;
5187
5188	nr_pages = folio_nr_pages(folio);
5189	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
5190
5191	/*
5192	* The memory barrier inside __folio_mark_uptodate makes sure that
5193	* preceding stores to the page contents become visible before
5194	* the set_pte_at() write.
5195	*/
5196	__folio_mark_uptodate(folio);
5197
5198	entry = folio_mk_pte(folio, pgprot: vma->vm_page_prot);
5199	entry = pte_sw_mkyoung(pte: entry);
5200	if (vma->vm_flags & VM_WRITE)
5201	entry = pte_mkwrite(pte: pte_mkdirty(pte: entry), vma);
5202
5203	vmf->pte = pte_offset_map_lock(mm: vma->vm_mm, pmd: vmf->pmd, addr, ptlp: &vmf->ptl);
5204	if (!vmf->pte)
5205	goto release;
5206	if (nr_pages == `1` && vmf_pte_changed(vmf)) {
5207	update_mmu_tlb(vma, address: addr, ptep: vmf->pte);
5208	goto release;
5209	} else if (nr_pages > `1` && !pte_range_none(pte: vmf->pte, nr_pages)) {
5210	update_mmu_tlb_range(vma, address: addr, ptep: vmf->pte, nr: nr_pages);
5211	goto release;
5212	}
5213
5214	ret = check_stable_address_space(mm: vma->vm_mm);
5215	if (ret)
5216	goto release;
5217
5218	/ Deliver the page fault to userland, check inside PT lock /
5219	if (userfaultfd_missing(vma)) {
5220	pte_unmap_unlock(vmf->pte, vmf->ptl);
5221	folio_put(folio);
5222	return handle_userfault(vmf, VM_UFFD_MISSING);
5223	}
5224
5225	folio_ref_add(folio, nr: nr_pages - `1`);
5226	add_mm_counter(mm: vma->vm_mm, member: MM_ANONPAGES, value: nr_pages);
5227	count_mthp_stat(order: folio_order(folio), item: MTHP_STAT_ANON_FAULT_ALLOC);
5228	folio_add_new_anon_rmap(folio, vma, address: addr, RMAP_EXCLUSIVE);
5229	folio_add_lru_vma(folio, vma);
5230	setpte:
5231	if (vmf_orig_pte_uffd_wp(vmf))
5232	entry = pte_mkuffd_wp(pte: entry);
5233	set_ptes(mm: vma->vm_mm, addr, ptep: vmf->pte, pte: entry, nr: nr_pages);
5234
5235	/ No need to invalidate - it was non-present before /
5236	update_mmu_cache_range(vmf, vma, addr, ptep: vmf->pte, nr: nr_pages);
5237	unlock:
5238	if (vmf->pte)
5239	pte_unmap_unlock(vmf->pte, vmf->ptl);
5240	return ret;
5241	release:
5242	folio_put(folio);
5243	goto unlock;
5244	oom:
5245	return VM_FAULT_OOM;
5246	}
5247
5248	/*
5249	* The mmap_lock must have been held on entry, and may have been
5250	* released depending on flags and vma->vm_ops->fault() return value.
5251	* See filemap_fault() and __lock_page_retry().
5252	*/
5253	static vm_fault_t __do_fault(struct vm_fault *vmf)
5254	{
5255	struct vm_area_struct *vma = vmf->vma;
5256	struct folio *folio;
5257	vm_fault_t ret;
5258
5259	/*
5260	* Preallocate pte before we take page_lock because this might lead to
5261	* deadlocks for memcg reclaim which waits for pages under writeback:
5262	* lock_page(A)
5263	* SetPageWriteback(A)
5264	* unlock_page(A)
5265	* lock_page(B)
5266	* lock_page(B)
5267	* pte_alloc_one
5268	* shrink_folio_list
5269	* wait_on_page_writeback(A)
5270	* SetPageWriteback(B)
5271	* unlock_page(B)
5272	* # flush A, B to clear the writeback
5273	*/
5274	if (pmd_none(pmd: *vmf->pmd) && !vmf->prealloc_pte) {
5275	vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
5276	if (!vmf->prealloc_pte)
5277	return VM_FAULT_OOM;
5278	}
5279
5280	ret = vma->vm_ops->fault(vmf);
5281	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \| VM_FAULT_RETRY \|
5282	VM_FAULT_DONE_COW)))
5283	return ret;
5284
5285	folio = page_folio(vmf->page);
5286	if (unlikely(PageHWPoison(vmf->page))) {
5287	vm_fault_t poisonret = VM_FAULT_HWPOISON;
5288	if (ret & VM_FAULT_LOCKED) {
5289	if (page_mapped(page: vmf->page))
5290	unmap_mapping_folio(folio);
5291	/ Retry if a clean folio was removed from the cache. /
5292	if (mapping_evict_folio(mapping: folio->mapping, folio))
5293	poisonret = VM_FAULT_NOPAGE;
5294	folio_unlock(folio);
5295	}
5296	folio_put(folio);
5297	vmf->page = NULL;
5298	return poisonret;
5299	}
5300
5301	if (unlikely(!(ret & VM_FAULT_LOCKED)))
5302	folio_lock(folio);
5303	else
5304	VM_BUG_ON_PAGE(!folio_test_locked(folio), vmf->page);
5305
5306	return ret;
5307	}
5308
5309	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5310	static void deposit_prealloc_pte(struct vm_fault *vmf)
5311	{
5312	struct vm_area_struct *vma = vmf->vma;
5313
5314	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
5315	/*
5316	* We are going to consume the prealloc table,
5317	* count that as nr_ptes.
5318	*/
5319	mm_inc_nr_ptes(vma->vm_mm);
5320	vmf->prealloc_pte = NULL;
5321	}
5322
5323	vm_fault_t do_set_pmd(struct vm_fault vmf, struct* folio folio, struct* page *page)
5324	{
5325	struct vm_area_struct *vma = vmf->vma;
5326	bool write = vmf->flags & FAULT_FLAG_WRITE;
5327	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
5328	pmd_t entry;
5329	vm_fault_t ret = VM_FAULT_FALLBACK;
5330
5331	/*
5332	* It is too late to allocate a small folio, we already have a large
5333	* folio in the pagecache: especially s390 KVM cannot tolerate any
5334	* PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any
5335	* PMD mappings if THPs are disabled. As we already have a THP,
5336	* behave as if we are forcing a collapse.
5337	*/
5338	if (thp_disabled_by_hw() \|\| vma_thp_disabled(vma, vma->vm_flags,
5339	/ forced_collapse=/ true))
5340	return ret;
5341
5342	if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
5343	return ret;
5344
5345	if (folio_order(folio) != HPAGE_PMD_ORDER)
5346	return ret;
5347	page = &folio->page;
5348
5349	/*
5350	* Just backoff if any subpage of a THP is corrupted otherwise
5351	* the corrupted page may mapped by PMD silently to escape the
5352	* check. This kind of THP just can be PTE mapped. Access to
5353	* the corrupted subpage should trigger SIGBUS as expected.
5354	*/
5355	if (unlikely(folio_test_has_hwpoisoned(folio)))
5356	return ret;
5357
5358	/*
5359	* Archs like ppc64 need additional space to store information
5360	* related to pte entry. Use the preallocated table for that.
5361	*/
5362	if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
5363	vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
5364	if (!vmf->prealloc_pte)
5365	return VM_FAULT_OOM;
5366	}
5367
5368	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
5369	if (unlikely(!pmd_none(*vmf->pmd)))
5370	goto out;
5371
5372	flush_icache_pages(vma, page, HPAGE_PMD_NR);
5373
5374	entry = folio_mk_pmd(folio, vma->vm_page_prot);
5375	if (write)
5376	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
5377
5378	add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR);
5379	folio_add_file_rmap_pmd(folio, page, vma);
5380
5381	/*
5382	* deposit and withdraw with pmd lock held
5383	*/
5384	if (arch_needs_pgtable_deposit())
5385	deposit_prealloc_pte(vmf);
5386
5387	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
5388
5389	update_mmu_cache_pmd(vma, haddr, vmf->pmd);
5390
5391	/ fault is handled /
5392	ret = `0`;
5393	count_vm_event(THP_FILE_MAPPED);
5394	out:
5395	spin_unlock(vmf->ptl);
5396	return ret;
5397	}
5398	#else
5399	vm_fault_t do_set_pmd(struct vm_fault vmf, struct* folio folio, struct* page *page)
5400	{
5401	return VM_FAULT_FALLBACK;
5402	}
5403	#endif
5404
5405	/**
5406	* set_pte_range - Set a range of PTEs to point to pages in a folio.
5407	* @vmf: Fault decription.
5408	* @folio: The folio that contains @page.
5409	* @page: The first page to create a PTE for.
5410	* @nr: The number of PTEs to create.
5411	* @addr: The first address to create a PTE for.
5412	*/
5413	void set_pte_range(struct vm_fault vmf, struct* folio *folio,
5414	struct page page, unsigned* int nr, unsigned long addr)
5415	{
5416	struct vm_area_struct *vma = vmf->vma;
5417	bool write = vmf->flags & FAULT_FLAG_WRITE;
5418	bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE);
5419	pte_t entry;
5420
5421	flush_icache_pages(vma, page, nr);
5422	entry = mk_pte(page, pgprot: vma->vm_page_prot);
5423
5424	if (prefault && arch_wants_old_prefaulted_pte())
5425	entry = pte_mkold(pte: entry);
5426	else
5427	entry = pte_sw_mkyoung(pte: entry);
5428
5429	if (write)
5430	entry = maybe_mkwrite(pte: pte_mkdirty(pte: entry), vma);
5431	else if (pte_write(pte: entry) && folio_test_dirty(folio))
5432	entry = pte_mkdirty(pte: entry);
5433	if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
5434	entry = pte_mkuffd_wp(pte: entry);
5435	/ copy-on-write page /
5436	if (write && !(vma->vm_flags & VM_SHARED)) {
5437	VM_BUG_ON_FOLIO(nr != `1`, folio);
5438	folio_add_new_anon_rmap(folio, vma, address: addr, RMAP_EXCLUSIVE);
5439	folio_add_lru_vma(folio, vma);
5440	} else {
5441	folio_add_file_rmap_ptes(folio, page, nr_pages: nr, vma);
5442	}
5443	set_ptes(mm: vma->vm_mm, addr, ptep: vmf->pte, pte: entry, nr);
5444
5445	/ no need to invalidate: a not-present page won't be cached /
5446	update_mmu_cache_range(vmf, vma, addr, ptep: vmf->pte, nr);
5447	}
5448
5449	static bool vmf_pte_changed(struct vm_fault *vmf)
5450	{
5451	if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
5452	return !pte_same(a: ptep_get(ptep: vmf->pte), b: vmf->orig_pte);
5453
5454	return !pte_none(pte: ptep_get(ptep: vmf->pte));
5455	}
5456
5457	/**
5458	* finish_fault - finish page fault once we have prepared the page to fault
5459	*
5460	* @vmf: structure describing the fault
5461	*
5462	* This function handles all that is needed to finish a page fault once the
5463	* page to fault in is prepared. It handles locking of PTEs, inserts PTE for
5464	* given page, adds reverse page mapping, handles memcg charges and LRU
5465	* addition.
5466	*
5467	* The function expects the page to be locked and on success it consumes a
5468	* reference of a page being mapped (for the PTE which maps it).
5469	*
5470	* Return: %0 on success, %VM_FAULT_ code in case of error.
5471	*/
5472	vm_fault_t finish_fault(struct vm_fault *vmf)
5473	{
5474	struct vm_area_struct *vma = vmf->vma;
5475	struct page *page;
5476	struct folio *folio;
5477	vm_fault_t ret;
5478	bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
5479	!(vma->vm_flags & VM_SHARED);
5480	int type, nr_pages;
5481	unsigned long addr;
5482	bool needs_fallback = false;
5483
5484	fallback:
5485	addr = vmf->address;
5486
5487	/ Did we COW the page? /
5488	if (is_cow)
5489	page = vmf->cow_page;
5490	else
5491	page = vmf->page;
5492
5493	folio = page_folio(page);
5494	/*
5495	* check even for read faults because we might have lost our CoWed
5496	* page
5497	*/
5498	if (!(vma->vm_flags & VM_SHARED)) {
5499	ret = check_stable_address_space(mm: vma->vm_mm);
5500	if (ret)
5501	return ret;
5502	}
5503
5504	if (pmd_none(pmd: *vmf->pmd)) {
5505	if (folio_test_pmd_mappable(folio)) {
5506	ret = do_set_pmd(vmf, folio, page);
5507	if (ret != VM_FAULT_FALLBACK)
5508	return ret;
5509	}
5510
5511	if (vmf->prealloc_pte)
5512	pmd_install(mm: vma->vm_mm, pmd: vmf->pmd, pte: &vmf->prealloc_pte);
5513	else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
5514	return VM_FAULT_OOM;
5515	}
5516
5517	nr_pages = folio_nr_pages(folio);
5518
5519	/ Using per-page fault to maintain the uffd semantics /
5520	if (unlikely(userfaultfd_armed(vma)) \|\| unlikely(needs_fallback)) {
5521	nr_pages = `1`;
5522	} else if (nr_pages > `1`) {
5523	pgoff_t idx = folio_page_idx(folio, page);
5524	/ The page offset of vmf->address within the VMA. /
5525	pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
5526	/ The index of the entry in the pagetable for fault page. /
5527	pgoff_t pte_off = pte_index(address: vmf->address);
5528
5529	/*
5530	* Fallback to per-page fault in case the folio size in page
5531	* cache beyond the VMA limits and PMD pagetable limits.
5532	*/
5533	if (unlikely(vma_off < idx \|\|
5534	vma_off + (nr_pages - idx) > vma_pages(vma) \|\|
5535	pte_off < idx \|\|
5536	pte_off + (nr_pages - idx) > PTRS_PER_PTE)) {
5537	nr_pages = `1`;
5538	} else {
5539	/ Now we can set mappings for the whole large folio. /
5540	addr = vmf->address - idx * PAGE_SIZE;
5541	page = &folio->page;
5542	}
5543	}
5544
5545	vmf->pte = pte_offset_map_lock(mm: vma->vm_mm, pmd: vmf->pmd,
5546	addr, ptlp: &vmf->ptl);
5547	if (!vmf->pte)
5548	return VM_FAULT_NOPAGE;
5549
5550	/ Re-check under ptl /
5551	if (nr_pages == `1` && unlikely(vmf_pte_changed(vmf))) {
5552	update_mmu_tlb(vma, address: addr, ptep: vmf->pte);
5553	ret = VM_FAULT_NOPAGE;
5554	goto unlock;
5555	} else if (nr_pages > `1` && !pte_range_none(pte: vmf->pte, nr_pages)) {
5556	needs_fallback = true;
5557	pte_unmap_unlock(vmf->pte, vmf->ptl);
5558	goto fallback;
5559	}
5560
5561	folio_ref_add(folio, nr: nr_pages - `1`);
5562	set_pte_range(vmf, folio, page, nr: nr_pages, addr);
5563	type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
5564	add_mm_counter(mm: vma->vm_mm, member: type, value: nr_pages);
5565	ret = `0`;
5566
5567	unlock:
5568	pte_unmap_unlock(vmf->pte, vmf->ptl);
5569	return ret;
5570	}
5571
5572	static unsigned long fault_around_pages __read_mostly =
5573	`65536` >> PAGE_SHIFT;
5574
5575	#ifdef CONFIG_DEBUG_FS
5576	static int fault_around_bytes_get(void data, u64 val)
5577	{
5578	*val = fault_around_pages << PAGE_SHIFT;
5579	return `0`;
5580	}
5581
5582	/*
5583	* fault_around_bytes must be rounded down to the nearest page order as it's
5584	* what do_fault_around() expects to see.
5585	*/
5586	static int fault_around_bytes_set(void *data, u64 val)
5587	{
5588	if (val / PAGE_SIZE > PTRS_PER_PTE)
5589	return -EINVAL;
5590
5591	/*
5592	* The minimum value is 1 page, however this results in no fault-around
5593	* at all. See should_fault_around().
5594	*/
5595	val = max(val, PAGE_SIZE);
5596	fault_around_pages = rounddown_pow_of_two(val) >> PAGE_SHIFT;
5597
5598	return `0`;
5599	}
5600	DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
5601	fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
5602
5603	static int __init fault_around_debugfs(void)
5604	{
5605	debugfs_create_file_unsafe(name: "fault_around_bytes", mode: `0644`, NULL, NULL,
5606	fops: &fault_around_bytes_fops);
5607	return `0`;
5608	}
5609	late_initcall(fault_around_debugfs);
5610	#endif
5611
5612	/*
5613	* do_fault_around() tries to map few pages around the fault address. The hope
5614	* is that the pages will be needed soon and this will lower the number of
5615	* faults to handle.
5616	*
5617	* It uses vm_ops->map_pages() to map the pages, which skips the page if it's
5618	* not ready to be mapped: not up-to-date, locked, etc.
5619	*
5620	* This function doesn't cross VMA or page table boundaries, in order to call
5621	* map_pages() and acquire a PTE lock only once.
5622	*
5623	* fault_around_pages defines how many pages we'll try to map.
5624	* do_fault_around() expects it to be set to a power of two less than or equal
5625	* to PTRS_PER_PTE.
5626	*
5627	* The virtual address of the area that we map is naturally aligned to
5628	* fault_around_pages * PAGE_SIZE rounded down to the machine page size
5629	* (and therefore to page order). This way it's easier to guarantee
5630	* that we don't cross page table boundaries.
5631	*/
5632	static vm_fault_t do_fault_around(struct vm_fault *vmf)
5633	{
5634	pgoff_t nr_pages = READ_ONCE(fault_around_pages);
5635	pgoff_t pte_off = pte_index(address: vmf->address);
5636	/ The page offset of vmf->address within the VMA. /
5637	pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
5638	pgoff_t from_pte, to_pte;
5639	vm_fault_t ret;
5640
5641	/ The PTE offset of the start address, clamped to the VMA. /
5642	from_pte = max(ALIGN_DOWN(pte_off, nr_pages),
5643	pte_off - min(pte_off, vma_off));
5644
5645	/ The PTE offset of the end address, clamped to the VMA and PTE. /
5646	to_pte = min3(from_pte + nr_pages, (pgoff_t)PTRS_PER_PTE,
5647	pte_off + vma_pages(vmf->vma) - vma_off) - `1`;
5648
5649	if (pmd_none(pmd: *vmf->pmd)) {
5650	vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
5651	if (!vmf->prealloc_pte)
5652	return VM_FAULT_OOM;
5653	}
5654
5655	rcu_read_lock();
5656	ret = vmf->vma->vm_ops->map_pages(vmf,
5657	vmf->pgoff + from_pte - pte_off,
5658	vmf->pgoff + to_pte - pte_off);
5659	rcu_read_unlock();
5660
5661	return ret;
5662	}
5663
5664	/ Return true if we should do read fault-around, false otherwise /
5665	static inline bool should_fault_around(struct vm_fault *vmf)
5666	{
5667	/ No ->map_pages? No way to fault around... /
5668	if (!vmf->vma->vm_ops->map_pages)
5669	return false;
5670
5671	if (uffd_disable_fault_around(vma: vmf->vma))
5672	return false;
5673
5674	/ A single page implies no faulting 'around' at all. /
5675	return fault_around_pages > `1`;
5676	}
5677
5678	static vm_fault_t do_read_fault(struct vm_fault *vmf)
5679	{
5680	vm_fault_t ret = `0`;
5681	struct folio *folio;
5682
5683	/*
5684	* Let's call ->map_pages() first and use ->fault() as fallback
5685	* if page by the offset is not ready to be mapped (cold cache or
5686	* something).
5687	*/
5688	if (should_fault_around(vmf)) {
5689	ret = do_fault_around(vmf);
5690	if (ret)
5691	return ret;
5692	}
5693
5694	ret = vmf_can_call_fault(vmf);
5695	if (ret)
5696	return ret;
5697
5698	ret = __do_fault(vmf);
5699	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \| VM_FAULT_RETRY)))
5700	return ret;
5701
5702	ret \|= finish_fault(vmf);
5703	folio = page_folio(vmf->page);
5704	folio_unlock(folio);
5705	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \| VM_FAULT_RETRY)))
5706	folio_put(folio);
5707	return ret;
5708	}
5709
5710	static vm_fault_t do_cow_fault(struct vm_fault *vmf)
5711	{
5712	struct vm_area_struct *vma = vmf->vma;
5713	struct folio *folio;
5714	vm_fault_t ret;
5715
5716	ret = vmf_can_call_fault(vmf);
5717	if (!ret)
5718	ret = vmf_anon_prepare(vmf);
5719	if (ret)
5720	return ret;
5721
5722	folio = folio_prealloc(src_mm: vma->vm_mm, vma, addr: vmf->address, need_zero: false);
5723	if (!folio)
5724	return VM_FAULT_OOM;
5725
5726	vmf->cow_page = &folio->page;
5727
5728	ret = __do_fault(vmf);
5729	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \| VM_FAULT_RETRY)))
5730	goto uncharge_out;
5731	if (ret & VM_FAULT_DONE_COW)
5732	return ret;
5733
5734	if (copy_mc_user_highpage(to: vmf->cow_page, from: vmf->page, vaddr: vmf->address, vma)) {
5735	ret = VM_FAULT_HWPOISON;
5736	goto unlock;
5737	}
5738	__folio_mark_uptodate(folio);
5739
5740	ret \|= finish_fault(vmf);
5741	unlock:
5742	unlock_page(page: vmf->page);
5743	put_page(page: vmf->page);
5744	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \| VM_FAULT_RETRY)))
5745	goto uncharge_out;
5746	return ret;
5747	uncharge_out:
5748	folio_put(folio);
5749	return ret;
5750	}
5751
5752	static vm_fault_t do_shared_fault(struct vm_fault *vmf)
5753	{
5754	struct vm_area_struct *vma = vmf->vma;
5755	vm_fault_t ret, tmp;
5756	struct folio *folio;
5757
5758	ret = vmf_can_call_fault(vmf);
5759	if (ret)
5760	return ret;
5761
5762	ret = __do_fault(vmf);
5763	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \| VM_FAULT_RETRY)))
5764	return ret;
5765
5766	folio = page_folio(vmf->page);
5767
5768	/*
5769	* Check if the backing address space wants to know that the page is
5770	* about to become writable
5771	*/
5772	if (vma->vm_ops->page_mkwrite) {
5773	folio_unlock(folio);
5774	tmp = do_page_mkwrite(vmf, folio);
5775	if (unlikely(!tmp \|\|
5776	(tmp & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE)))) {
5777	folio_put(folio);
5778	return tmp;
5779	}
5780	}
5781
5782	ret \|= finish_fault(vmf);
5783	if (unlikely(ret & (VM_FAULT_ERROR \| VM_FAULT_NOPAGE \|
5784	VM_FAULT_RETRY))) {
5785	folio_unlock(folio);
5786	folio_put(folio);
5787	return ret;
5788	}
5789
5790	ret \|= fault_dirty_shared_page(vmf);
5791	return ret;
5792	}
5793
5794	/*
5795	* We enter with non-exclusive mmap_lock (to exclude vma changes,
5796	* but allow concurrent faults).
5797	* The mmap_lock may have been released depending on flags and our
5798	* return value. See filemap_fault() and __folio_lock_or_retry().
5799	* If mmap_lock is released, vma may become invalid (for example
5800	* by other thread calling munmap()).
5801	*/
5802	static vm_fault_t do_fault(struct vm_fault *vmf)
5803	{
5804	struct vm_area_struct *vma = vmf->vma;
5805	struct mm_struct *vm_mm = vma->vm_mm;
5806	vm_fault_t ret;
5807
5808	/*
5809	* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
5810	*/
5811	if (!vma->vm_ops->fault) {
5812	vmf->pte = pte_offset_map_lock(mm: vmf->vma->vm_mm, pmd: vmf->pmd,
5813	addr: vmf->address, ptlp: &vmf->ptl);
5814	if (unlikely(!vmf->pte))
5815	ret = VM_FAULT_SIGBUS;
5816	else {
5817	/*
5818	* Make sure this is not a temporary clearing of pte
5819	* by holding ptl and checking again. A R/M/W update
5820	* of pte involves: take ptl, clearing the pte so that
5821	* we don't have concurrent modification by hardware
5822	* followed by an update.
5823	*/
5824	if (unlikely(pte_none(ptep_get(vmf->pte))))
5825	ret = VM_FAULT_SIGBUS;
5826	else
5827	ret = VM_FAULT_NOPAGE;
5828
5829	pte_unmap_unlock(vmf->pte, vmf->ptl);
5830	}
5831	} else if (!(vmf->flags & FAULT_FLAG_WRITE))
5832	ret = do_read_fault(vmf);
5833	else if (!(vma->vm_flags & VM_SHARED))
5834	ret = do_cow_fault(vmf);
5835	else
5836	ret = do_shared_fault(vmf);
5837
5838	/ preallocated pagetable is unused: free it /
5839	if (vmf->prealloc_pte) {
5840	pte_free(mm: vm_mm, pte_page: vmf->prealloc_pte);
5841	vmf->prealloc_pte = NULL;
5842	}
5843	return ret;
5844	}
5845
5846	int numa_migrate_check(struct folio folio, struct* vm_fault *vmf,
5847	unsigned long addr, int *flags,
5848	bool writable, int *last_cpupid)
5849	{
5850	struct vm_area_struct *vma = vmf->vma;
5851
5852	/*
5853	* Avoid grouping on RO pages in general. RO pages shouldn't hurt as
5854	* much anyway since they can be in shared cache state. This misses
5855	* the case where a mapping is writable but the process never writes
5856	* to it but pte_write gets cleared during protection updates and
5857	* pte_dirty has unpredictable behaviour between PTE scan updates,
5858	* background writeback, dirty balancing and application behaviour.
5859	*/
5860	if (!writable)
5861	*flags \|= TNF_NO_GROUP;
5862
5863	/*
5864	* Flag if the folio is shared between multiple address spaces. This
5865	* is later used when determining whether to group tasks together
5866	*/
5867	if (folio_maybe_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
5868	*flags \|= TNF_SHARED;
5869	/*
5870	* For memory tiering mode, cpupid of slow memory page is used
5871	* to record page access time. So use default value.
5872	*/
5873	if (folio_use_access_time(folio))
5874	*last_cpupid = (-`1` & LAST_CPUPID_MASK);
5875	else
5876	*last_cpupid = folio_last_cpupid(folio);
5877
5878	/ Record the current PID acceesing VMA /
5879	vma_set_access_pid_bit(vma);
5880
5881	count_vm_numa_event(NUMA_HINT_FAULTS);
5882	#ifdef CONFIG_NUMA_BALANCING
5883	count_memcg_folio_events(folio, NUMA_HINT_FAULTS, `1`);
5884	#endif
5885	if (folio_nid(folio) == numa_node_id()) {
5886	count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
5887	*flags \|= TNF_FAULT_LOCAL;
5888	}
5889
5890	return mpol_misplaced(folio, vmf, addr);
5891	}
5892
5893	static void numa_rebuild_single_mapping(struct vm_fault vmf, struct* vm_area_struct *vma,
5894	unsigned long fault_addr, pte_t *fault_pte,
5895	bool writable)
5896	{
5897	pte_t pte, old_pte;
5898
5899	old_pte = ptep_modify_prot_start(vma, addr: fault_addr, ptep: fault_pte);
5900	pte = pte_modify(pte: old_pte, newprot: vma->vm_page_prot);
5901	pte = pte_mkyoung(pte);
5902	if (writable)
5903	pte = pte_mkwrite(pte, vma);
5904	ptep_modify_prot_commit(vma, addr: fault_addr, ptep: fault_pte, old_pte, pte);
5905	update_mmu_cache_range(vmf, vma, addr: fault_addr, ptep: fault_pte, nr: `1`);
5906	}
5907
5908	static void numa_rebuild_large_mapping(struct vm_fault vmf, struct* vm_area_struct *vma,
5909	struct folio *folio, pte_t fault_pte,
5910	bool ignore_writable, bool pte_write_upgrade)
5911	{
5912	int nr = pte_pfn(pte: fault_pte) - folio_pfn(folio);
5913	unsigned long start, end, addr = vmf->address;
5914	unsigned long addr_start = addr - (nr << PAGE_SHIFT);
5915	unsigned long pt_start = ALIGN_DOWN(addr, PMD_SIZE);
5916	pte_t *start_ptep;
5917
5918	/ Stay within the VMA and within the page table. /
5919	start = max3(addr_start, pt_start, vma->vm_start);
5920	end = min3(addr_start + folio_size(folio), pt_start + PMD_SIZE,
5921	vma->vm_end);
5922	start_ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT);
5923
5924	/ Restore all PTEs' mapping of the large folio /
5925	for (addr = start; addr != end; start_ptep++, addr += PAGE_SIZE) {
5926	pte_t ptent = ptep_get(ptep: start_ptep);
5927	bool writable = false;
5928
5929	if (!pte_present(a: ptent) \|\| !pte_protnone(pte: ptent))
5930	continue;
5931
5932	if (pfn_folio(pfn: pte_pfn(pte: ptent)) != folio)
5933	continue;
5934
5935	if (!ignore_writable) {
5936	ptent = pte_modify(pte: ptent, newprot: vma->vm_page_prot);
5937	writable = pte_write(pte: ptent);
5938	if (!writable && pte_write_upgrade &&
5939	can_change_pte_writable(vma, addr, pte: ptent))
5940	writable = true;
5941	}
5942
5943	numa_rebuild_single_mapping(vmf, vma, fault_addr: addr, fault_pte: start_ptep, writable);
5944	}
5945	}
5946
5947	static vm_fault_t do_numa_page(struct vm_fault *vmf)
5948	{
5949	struct vm_area_struct *vma = vmf->vma;
5950	struct folio *folio = NULL;
5951	int nid = NUMA_NO_NODE;
5952	bool writable = false, ignore_writable = false;
5953	bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma);
5954	int last_cpupid;
5955	int target_nid;
5956	pte_t pte, old_pte;
5957	int flags = `0`, nr_pages;
5958
5959	/*
5960	* The pte cannot be used safely until we verify, while holding the page
5961	* table lock, that its contents have not changed during fault handling.
5962	*/
5963	spin_lock(lock: vmf->ptl);
5964	/ Read the live PTE from the page tables: /
5965	old_pte = ptep_get(ptep: vmf->pte);
5966
5967	if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
5968	pte_unmap_unlock(vmf->pte, vmf->ptl);
5969	return `0`;
5970	}
5971
5972	pte = pte_modify(pte: old_pte, newprot: vma->vm_page_prot);
5973
5974	/*
5975	* Detect now whether the PTE could be writable; this information
5976	* is only valid while holding the PT lock.
5977	*/
5978	writable = pte_write(pte);
5979	if (!writable && pte_write_upgrade &&
5980	can_change_pte_writable(vma, addr: vmf->address, pte))
5981	writable = true;
5982
5983	folio = vm_normal_folio(vma, addr: vmf->address, pte);
5984	if (!folio \|\| folio_is_zone_device(folio))
5985	goto out_map;
5986
5987	nid = folio_nid(folio);
5988	nr_pages = folio_nr_pages(folio);
5989
5990	target_nid = numa_migrate_check(folio, vmf, addr: vmf->address, flags: &flags,
5991	writable, last_cpupid: &last_cpupid);
5992	if (target_nid == NUMA_NO_NODE)
5993	goto out_map;
5994	if (migrate_misplaced_folio_prepare(folio, vma, node: target_nid)) {
5995	flags \|= TNF_MIGRATE_FAIL;
5996	goto out_map;
5997	}
5998	/ The folio is isolated and isolation code holds a folio reference. /
5999	pte_unmap_unlock(vmf->pte, vmf->ptl);
6000	writable = false;
6001	ignore_writable = true;
6002
6003	/ Migrate to the requested node /
6004	if (!migrate_misplaced_folio(folio, node: target_nid)) {
6005	nid = target_nid;
6006	flags \|= TNF_MIGRATED;
6007	task_numa_fault(last_node: last_cpupid, node: nid, pages: nr_pages, flags);
6008	return `0`;
6009	}
6010
6011	flags \|= TNF_MIGRATE_FAIL;
6012	vmf->pte = pte_offset_map_lock(mm: vma->vm_mm, pmd: vmf->pmd,
6013	addr: vmf->address, ptlp: &vmf->ptl);
6014	if (unlikely(!vmf->pte))
6015	return `0`;
6016	if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
6017	pte_unmap_unlock(vmf->pte, vmf->ptl);
6018	return `0`;
6019	}
6020	out_map:
6021	/*
6022	* Make it present again, depending on how arch implements
6023	* non-accessible ptes, some can allow access by kernel mode.
6024	*/
6025	if (folio && folio_test_large(folio))
6026	numa_rebuild_large_mapping(vmf, vma, folio, fault_pte: pte, ignore_writable,
6027	pte_write_upgrade);
6028	else
6029	numa_rebuild_single_mapping(vmf, vma, fault_addr: vmf->address, fault_pte: vmf->pte,
6030	writable);
6031	pte_unmap_unlock(vmf->pte, vmf->ptl);
6032
6033	if (nid != NUMA_NO_NODE)
6034	task_numa_fault(last_node: last_cpupid, node: nid, pages: nr_pages, flags);
6035	return `0`;
6036	}
6037
6038	static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
6039	{
6040	struct vm_area_struct *vma = vmf->vma;
6041	if (vma_is_anonymous(vma))
6042	return do_huge_pmd_anonymous_page(vmf);
6043	if (vma->vm_ops->huge_fault)
6044	return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
6045	return VM_FAULT_FALLBACK;
6046	}
6047
6048	/ `inline' is required to avoid gcc 4.1.2 build error /
6049	static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
6050	{
6051	struct vm_area_struct *vma = vmf->vma;
6052	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
6053	vm_fault_t ret;
6054
6055	if (vma_is_anonymous(vma)) {
6056	if (likely(!unshare) &&
6057	userfaultfd_huge_pmd_wp(vma, pmd: vmf->orig_pmd)) {
6058	if (userfaultfd_wp_async(vma: vmf->vma))
6059	goto split;
6060	return handle_userfault(vmf, VM_UFFD_WP);
6061	}
6062	return do_huge_pmd_wp_page(vmf);
6063	}
6064
6065	if (vma->vm_flags & (VM_SHARED \| VM_MAYSHARE)) {
6066	if (vma->vm_ops->huge_fault) {
6067	ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
6068	if (!(ret & VM_FAULT_FALLBACK))
6069	return ret;
6070	}
6071	}
6072
6073	split:
6074	/ COW or write-notify handled on pte level: split pmd. /
6075	__split_huge_pmd(vma, pmd: vmf->pmd, address: vmf->address, freeze: false);
6076
6077	return VM_FAULT_FALLBACK;
6078	}
6079
6080	static vm_fault_t create_huge_pud(struct vm_fault *vmf)
6081	{
6082	#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
6083	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
6084	struct vm_area_struct *vma = vmf->vma;
6085	/ No support for anonymous transparent PUD pages yet /
6086	if (vma_is_anonymous(vma))
6087	return VM_FAULT_FALLBACK;
6088	if (vma->vm_ops->huge_fault)
6089	return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
6090	#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
6091	return VM_FAULT_FALLBACK;
6092	}
6093
6094	static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
6095	{
6096	#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
6097	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
6098	struct vm_area_struct *vma = vmf->vma;
6099	vm_fault_t ret;
6100
6101	/ No support for anonymous transparent PUD pages yet /
6102	if (vma_is_anonymous(vma))
6103	goto split;
6104	if (vma->vm_flags & (VM_SHARED \| VM_MAYSHARE)) {
6105	if (vma->vm_ops->huge_fault) {
6106	ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
6107	if (!(ret & VM_FAULT_FALLBACK))
6108	return ret;
6109	}
6110	}
6111	split:
6112	/ COW or write-notify not handled on PUD level: split pud./
6113	__split_huge_pud(vma, vmf->pud, vmf->address);
6114	#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
6115	return VM_FAULT_FALLBACK;
6116	}
6117
6118	/*
6119	* These routines also need to handle stuff like marking pages dirty
6120	* and/or accessed for architectures that don't do it in hardware (most
6121	* RISC architectures). The early dirtying is also good on the i386.
6122	*
6123	* There is also a hook called "update_mmu_cache()" that architectures
6124	* with external mmu caches can use to update those (ie the Sparc or
6125	* PowerPC hashed page tables that act as extended TLBs).
6126	*
6127	* We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
6128	* concurrent faults).
6129	*
6130	* The mmap_lock may have been released depending on flags and our return value.
6131	* See filemap_fault() and __folio_lock_or_retry().
6132	*/
6133	static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
6134	{
6135	pte_t entry;
6136
6137	if (unlikely(pmd_none(*vmf->pmd))) {
6138	/*
6139	* Leave __pte_alloc() until later: because vm_ops->fault may
6140	* want to allocate huge page, and if we expose page table
6141	* for an instant, it will be difficult to retract from
6142	* concurrent faults and from rmap lookups.
6143	*/
6144	vmf->pte = NULL;
6145	vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
6146	} else {
6147	pmd_t dummy_pmdval;
6148
6149	/*
6150	* A regular pmd is established and it can't morph into a huge
6151	* pmd by anon khugepaged, since that takes mmap_lock in write
6152	* mode; but shmem or file collapse to THP could still morph
6153	* it into a huge pmd: just retry later if so.
6154	*
6155	* Use the maywrite version to indicate that vmf->pte may be
6156	* modified, but since we will use pte_same() to detect the
6157	* change of the !pte_none() entry, there is no need to recheck
6158	* the pmdval. Here we chooes to pass a dummy variable instead
6159	* of NULL, which helps new user think about why this place is
6160	* special.
6161	*/
6162	vmf->pte = pte_offset_map_rw_nolock(mm: vmf->vma->vm_mm, pmd: vmf->pmd,
6163	addr: vmf->address, pmdvalp: &dummy_pmdval,
6164	ptlp: &vmf->ptl);
6165	if (unlikely(!vmf->pte))
6166	return `0`;
6167	vmf->orig_pte = ptep_get_lockless(ptep: vmf->pte);
6168	vmf->flags \|= FAULT_FLAG_ORIG_PTE_VALID;
6169
6170	if (pte_none(pte: vmf->orig_pte)) {
6171	pte_unmap(pte: vmf->pte);
6172	vmf->pte = NULL;
6173	}
6174	}
6175
6176	if (!vmf->pte)
6177	return do_pte_missing(vmf);
6178
6179	if (!pte_present(a: vmf->orig_pte))
6180	return do_swap_page(vmf);
6181
6182	if (pte_protnone(pte: vmf->orig_pte) && vma_is_accessible(vma: vmf->vma))
6183	return do_numa_page(vmf);
6184
6185	spin_lock(lock: vmf->ptl);
6186	entry = vmf->orig_pte;
6187	if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
6188	update_mmu_tlb(vma: vmf->vma, address: vmf->address, ptep: vmf->pte);
6189	goto unlock;
6190	}
6191	if (vmf->flags & (FAULT_FLAG_WRITE\|FAULT_FLAG_UNSHARE)) {
6192	if (!pte_write(pte: entry))
6193	return do_wp_page(vmf);
6194	else if (likely(vmf->flags & FAULT_FLAG_WRITE))
6195	entry = pte_mkdirty(pte: entry);
6196	}
6197	entry = pte_mkyoung(pte: entry);
6198	if (ptep_set_access_flags(vma: vmf->vma, address: vmf->address, ptep: vmf->pte, entry,
6199	dirty: vmf->flags & FAULT_FLAG_WRITE)) {
6200	update_mmu_cache_range(vmf, vma: vmf->vma, addr: vmf->address,
6201	ptep: vmf->pte, nr: `1`);
6202	} else {
6203	/ Skip spurious TLB flush for retried page fault /
6204	if (vmf->flags & FAULT_FLAG_TRIED)
6205	goto unlock;
6206	/*
6207	* This is needed only for protection faults but the arch code
6208	* is not yet telling us if this is a protection fault or not.
6209	* This still avoids useless tlb flushes for .text page faults
6210	* with threads.
6211	*/
6212	if (vmf->flags & FAULT_FLAG_WRITE)
6213	flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
6214	vmf->pte);
6215	}
6216	unlock:
6217	pte_unmap_unlock(vmf->pte, vmf->ptl);
6218	return `0`;
6219	}
6220
6221	/*
6222	* On entry, we hold either the VMA lock or the mmap_lock
6223	* (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in
6224	* the result, the mmap_lock is not held on exit. See filemap_fault()
6225	* and __folio_lock_or_retry().
6226	*/
6227	static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
6228	unsigned long address, unsigned int flags)
6229	{
6230	struct vm_fault vmf = {
6231	.vma = vma,
6232	.address = address & PAGE_MASK,
6233	.real_address = address,
6234	.flags = flags,
6235	.pgoff = linear_page_index(vma, address),
6236	.gfp_mask = __get_fault_gfp_mask(vma),
6237	};
6238	struct mm_struct *mm = vma->vm_mm;
6239	vm_flags_t vm_flags = vma->vm_flags;
6240	pgd_t *pgd;
6241	p4d_t *p4d;
6242	vm_fault_t ret;
6243
6244	pgd = pgd_offset(mm, address);
6245	p4d = p4d_alloc(mm, pgd, address);
6246	if (!p4d)
6247	return VM_FAULT_OOM;
6248
6249	vmf.pud = pud_alloc(mm, p4d, address);
6250	if (!vmf.pud)
6251	return VM_FAULT_OOM;
6252	retry_pud:
6253	if (pud_none(pud: *vmf.pud) &&
6254	thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) {
6255	ret = create_huge_pud(vmf: &vmf);
6256	if (!(ret & VM_FAULT_FALLBACK))
6257	return ret;
6258	} else {
6259	pud_t orig_pud = *vmf.pud;
6260
6261	barrier();
6262	if (pud_trans_huge(pud: orig_pud)) {
6263
6264	/*
6265	* TODO once we support anonymous PUDs: NUMA case and
6266	* FAULT_FLAG_UNSHARE handling.
6267	*/
6268	if ((flags & FAULT_FLAG_WRITE) && !pud_write(pud: orig_pud)) {
6269	ret = wp_huge_pud(vmf: &vmf, orig_pud);
6270	if (!(ret & VM_FAULT_FALLBACK))
6271	return ret;
6272	} else {
6273	huge_pud_set_accessed(vmf: &vmf, orig_pud);
6274	return `0`;
6275	}
6276	}
6277	}
6278
6279	vmf.pmd = pmd_alloc(mm, pud: vmf.pud, address);
6280	if (!vmf.pmd)
6281	return VM_FAULT_OOM;
6282
6283	/ Huge pud page fault raced with pmd_alloc? /
6284	if (pud_trans_unstable(pud: vmf.pud))
6285	goto retry_pud;
6286
6287	if (pmd_none(pmd: *vmf.pmd) &&
6288	thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) {
6289	ret = create_huge_pmd(vmf: &vmf);
6290	if (!(ret & VM_FAULT_FALLBACK))
6291	return ret;
6292	} else {
6293	vmf.orig_pmd = pmdp_get_lockless(pmdp: vmf.pmd);
6294
6295	if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
6296	VM_BUG_ON(thp_migration_supported() &&
6297	!is_pmd_migration_entry(vmf.orig_pmd));
6298	if (is_pmd_migration_entry(pmd: vmf.orig_pmd))
6299	pmd_migration_entry_wait(m: mm, p: vmf.pmd);
6300	return `0`;
6301	}
6302	if (pmd_trans_huge(pmd: vmf.orig_pmd)) {
6303	if (pmd_protnone(pmd: vmf.orig_pmd) && vma_is_accessible(vma))
6304	return do_huge_pmd_numa_page(vmf: &vmf);
6305
6306	if ((flags & (FAULT_FLAG_WRITE\|FAULT_FLAG_UNSHARE)) &&
6307	!pmd_write(pmd: vmf.orig_pmd)) {
6308	ret = wp_huge_pmd(vmf: &vmf);
6309	if (!(ret & VM_FAULT_FALLBACK))
6310	return ret;
6311	} else {
6312	huge_pmd_set_accessed(vmf: &vmf);
6313	return `0`;
6314	}
6315	}
6316	}
6317
6318	return handle_pte_fault(vmf: &vmf);
6319	}
6320
6321	/**
6322	* mm_account_fault - Do page fault accounting
6323	* @mm: mm from which memcg should be extracted. It can be NULL.
6324	* @regs: the pt_regs struct pointer. When set to NULL, will skip accounting
6325	* of perf event counters, but we'll still do the per-task accounting to
6326	* the task who triggered this page fault.
6327	* @address: the faulted address.
6328	* @flags: the fault flags.
6329	* @ret: the fault retcode.
6330	*
6331	* This will take care of most of the page fault accounting. Meanwhile, it
6332	* will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ\|MIN] perf counter
6333	* updates. However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
6334	* still be in per-arch page fault handlers at the entry of page fault.
6335	*/
6336	static inline void mm_account_fault(struct mm_struct mm, struct* pt_regs *regs,
6337	unsigned long address, unsigned int flags,
6338	vm_fault_t ret)
6339	{
6340	bool major;
6341
6342	/ Incomplete faults will be accounted upon completion. /
6343	if (ret & VM_FAULT_RETRY)
6344	return;
6345
6346	/*
6347	* To preserve the behavior of older kernels, PGFAULT counters record
6348	* both successful and failed faults, as opposed to perf counters,
6349	* which ignore failed cases.
6350	*/
6351	count_vm_event(item: PGFAULT);
6352	count_memcg_event_mm(mm, idx: PGFAULT);
6353
6354	/*
6355	* Do not account for unsuccessful faults (e.g. when the address wasn't
6356	* valid). That includes arch_vma_access_permitted() failing before
6357	* reaching here. So this is not a "this many hardware page faults"
6358	* counter. We should use the hw profiling for that.
6359	*/
6360	if (ret & VM_FAULT_ERROR)
6361	return;
6362
6363	/*
6364	* We define the fault as a major fault when the final successful fault
6365	* is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
6366	* handle it immediately previously).
6367	*/
6368	major = (ret & VM_FAULT_MAJOR) \|\| (flags & FAULT_FLAG_TRIED);
6369
6370	if (major)
6371	current->maj_flt++;
6372	else
6373	current->min_flt++;
6374
6375	/*
6376	* If the fault is done for GUP, regs will be NULL. We only do the
6377	* accounting for the per thread fault counters who triggered the
6378	* fault, and we skip the perf event updates.
6379	*/
6380	if (!regs)
6381	return;
6382
6383	if (major)
6384	perf_sw_event(event_id: PERF_COUNT_SW_PAGE_FAULTS_MAJ, nr: `1`, regs, addr: address);
6385	else
6386	perf_sw_event(event_id: PERF_COUNT_SW_PAGE_FAULTS_MIN, nr: `1`, regs, addr: address);
6387	}
6388
/*
 * Bracket fault handling for the multi-gen LRU: current->in_lru_fault is
 * set from vma_has_recency() on entry and cleared on exit. Both helpers
 * are empty stubs when CONFIG_LRU_GEN is not set.
 */
#ifdef CONFIG_LRU_GEN
static void lru_gen_enter_fault(struct vm_area_struct *vma)
{
	/* the LRU algorithm only applies to accesses with recency */
	current->in_lru_fault = vma_has_recency(vma);
}

static void lru_gen_exit_fault(void)
{
	current->in_lru_fault = false;
}
#else
static void lru_gen_enter_fault(struct vm_area_struct *vma)
{
}

static void lru_gen_exit_fault(void)
{
}
#endif /* CONFIG_LRU_GEN */
6409
6410	static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
6411	unsigned int *flags)
6412	{
6413	if (unlikely(*flags & FAULT_FLAG_UNSHARE)) {
6414	if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE))
6415	return VM_FAULT_SIGSEGV;
6416	/*
6417	* FAULT_FLAG_UNSHARE only applies to COW mappings. Let's
6418	* just treat it like an ordinary read-fault otherwise.
6419	*/
6420	if (!is_cow_mapping(flags: vma->vm_flags))
6421	*flags &= ~FAULT_FLAG_UNSHARE;
6422	} else if (*flags & FAULT_FLAG_WRITE) {
6423	/ Write faults on read-only mappings are impossible ... /
6424	if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)))
6425	return VM_FAULT_SIGSEGV;
6426	/ ... and FOLL_FORCE only applies to COW mappings. /
6427	if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) &&
6428	!is_cow_mapping(vma->vm_flags)))
6429	return VM_FAULT_SIGSEGV;
6430	}
6431	#ifdef CONFIG_PER_VMA_LOCK
6432	/*
6433	* Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of
6434	* the assumption that lock is dropped on VM_FAULT_RETRY.
6435	*/
6436	if (WARN_ON_ONCE((*flags &
6437	(FAULT_FLAG_VMA_LOCK \| FAULT_FLAG_RETRY_NOWAIT)) ==
6438	(FAULT_FLAG_VMA_LOCK \| FAULT_FLAG_RETRY_NOWAIT)))
6439	return VM_FAULT_SIGSEGV;
6440	#endif
6441
6442	return `0`;
6443	}
6444
6445	/*
6446	* By the time we get here, we already hold either the VMA lock or the
6447	* mmap_lock (FAULT_FLAG_VMA_LOCK tells you which).
6448	*
6449	* The mmap_lock may have been released depending on flags and our
6450	* return value. See filemap_fault() and __folio_lock_or_retry().
6451	*/
6452	vm_fault_t handle_mm_fault(struct vm_area_struct vma, unsigned* long address,
6453	unsigned int flags, struct pt_regs *regs)
6454	{
6455	/ If the fault handler drops the mmap_lock, vma may be freed /
6456	struct mm_struct *mm = vma->vm_mm;
6457	vm_fault_t ret;
6458	bool is_droppable;
6459
6460	__set_current_state(TASK_RUNNING);
6461
6462	ret = sanitize_fault_flags(vma, flags: &flags);
6463	if (ret)
6464	goto out;
6465
6466	if (!arch_vma_access_permitted(vma, write: flags & FAULT_FLAG_WRITE,
6467	execute: flags & FAULT_FLAG_INSTRUCTION,
6468	foreign: flags & FAULT_FLAG_REMOTE)) {
6469	ret = VM_FAULT_SIGSEGV;
6470	goto out;
6471	}
6472
6473	is_droppable = !!(vma->vm_flags & VM_DROPPABLE);
6474
6475	/*
6476	* Enable the memcg OOM handling for faults triggered in user
6477	* space. Kernel faults are handled more gracefully.
6478	*/
6479	if (flags & FAULT_FLAG_USER)
6480	mem_cgroup_enter_user_fault();
6481
6482	lru_gen_enter_fault(vma);
6483
6484	if (unlikely(is_vm_hugetlb_page(vma)))
6485	ret = hugetlb_fault(mm: vma->vm_mm, vma, address, flags);
6486	else
6487	ret = __handle_mm_fault(vma, address, flags);
6488
6489	/*
6490	* Warning: It is no longer safe to dereference vma-> after this point,
6491	* because mmap_lock might have been dropped by __handle_mm_fault(), so
6492	* vma might be destroyed from underneath us.
6493	*/
6494
6495	lru_gen_exit_fault();
6496
6497	/ If the mapping is droppable, then errors due to OOM aren't fatal. /
6498	if (is_droppable)
6499	ret &= ~VM_FAULT_OOM;
6500
6501	if (flags & FAULT_FLAG_USER) {
6502	mem_cgroup_exit_user_fault();
6503	/*
6504	* The task may have entered a memcg OOM situation but
6505	* if the allocation error was handled gracefully (no
6506	* VM_FAULT_OOM), there is no need to kill anything.
6507	* Just clean up the OOM state peacefully.
6508	*/
6509	if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
6510	mem_cgroup_oom_synchronize(wait: false);
6511	}
6512	out:
6513	mm_account_fault(mm, regs, address, flags, ret);
6514
6515	return ret;
6516	}
6517	EXPORT_SYMBOL_GPL(handle_mm_fault);
6518
6519	#ifndef __PAGETABLE_P4D_FOLDED
6520	/*
6521	* Allocate p4d page table.
6522	* We've already handled the fast-path in-line.
6523	*/
6524	int __p4d_alloc(struct mm_struct mm, pgd_t pgd, unsigned long address)
6525	{
6526	p4d_t *new = p4d_alloc_one(mm, address);
6527	if (!new)
6528	return -ENOMEM;
6529
6530	spin_lock(lock: &mm->page_table_lock);
6531	if (pgd_present(pgd: pgd)) { /* Another has populated it /
6532	p4d_free(mm, p4d: new);
6533	} else {
6534	smp_wmb(); / See comment in pmd_install() /
6535	pgd_populate(mm, pgd, p4d: new);
6536	}
6537	spin_unlock(lock: &mm->page_table_lock);
6538	return `0`;
6539	}
6540	#endif /* __PAGETABLE_P4D_FOLDED */
6541
6542	#ifndef __PAGETABLE_PUD_FOLDED
6543	/*
6544	* Allocate page upper directory.
6545	* We've already handled the fast-path in-line.
6546	*/
6547	int __pud_alloc(struct mm_struct mm, p4d_t p4d, unsigned long address)
6548	{
6549	pud_t *new = pud_alloc_one(mm, address);
6550	if (!new)
6551	return -ENOMEM;
6552
6553	spin_lock(lock: &mm->page_table_lock);
6554	if (!p4d_present(p4d: *p4d)) {
6555	mm_inc_nr_puds(mm);
6556	smp_wmb(); / See comment in pmd_install() /
6557	p4d_populate(mm, p4d, pud: new);
6558	} else / Another has populated it /
6559	pud_free(mm, pud: new);
6560	spin_unlock(lock: &mm->page_table_lock);
6561	return `0`;
6562	}
6563	#endif /* __PAGETABLE_PUD_FOLDED */
6564
6565	#ifndef __PAGETABLE_PMD_FOLDED
6566	/*
6567	* Allocate page middle directory.
6568	* We've already handled the fast-path in-line.
6569	*/
6570	int __pmd_alloc(struct mm_struct mm, pud_t pud, unsigned long address)
6571	{
6572	spinlock_t *ptl;
6573	pmd_t *new = pmd_alloc_one(mm, address);
6574	if (!new)
6575	return -ENOMEM;
6576
6577	ptl = pud_lock(mm, pud);
6578	if (!pud_present(pud: *pud)) {
6579	mm_inc_nr_pmds(mm);
6580	smp_wmb(); / See comment in pmd_install() /
6581	pud_populate(mm, pud, pmd: new);
6582	} else { / Another has populated it /
6583	pmd_free(mm, pmd: new);
6584	}
6585	spin_unlock(lock: ptl);
6586	return `0`;
6587	}
6588	#endif /* __PAGETABLE_PMD_FOLDED */
6589
6590	static inline void pfnmap_args_setup(struct follow_pfnmap_args *args,
6591	spinlock_t lock, pte_t ptep,
6592	pgprot_t pgprot, unsigned long pfn_base,
6593	unsigned long addr_mask, bool writable,
6594	bool special)
6595	{
6596	args->lock = lock;
6597	args->ptep = ptep;
6598	args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT);
6599	args->addr_mask = addr_mask;
6600	args->pgprot = pgprot;
6601	args->writable = writable;
6602	args->special = special;
6603	}
6604
/*
 * Lockdep check for pfnmap lookups: the mmap_lock must be held, or, for
 * file-backed VMAs, alternatively the mapping's i_mmap_rwsem. Compiles to
 * nothing without CONFIG_LOCKDEP.
 */
static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma)
{
#ifdef CONFIG_LOCKDEP
	struct file *file = vma->vm_file;
	struct address_space *mapping = file ? file->f_mapping : NULL;

	if (mapping)
		lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) ||
			       lockdep_is_held(&vma->vm_mm->mmap_lock));
	else
		lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock));
#endif
}
6618
6619	/**
6620	* follow_pfnmap_start() - Look up a pfn mapping at a user virtual address
6621	* @args: Pointer to struct @follow_pfnmap_args
6622	*
6623	* The caller needs to setup args->vma and args->address to point to the
6624	* virtual address as the target of such lookup. On a successful return,
6625	* the results will be put into other output fields.
6626	*
6627	* After the caller finished using the fields, the caller must invoke
6628	* another follow_pfnmap_end() to proper releases the locks and resources
6629	* of such look up request.
6630	*
6631	* During the start() and end() calls, the results in @args will be valid
6632	* as proper locks will be held. After the end() is called, all the fields
6633	* in @follow_pfnmap_args will be invalid to be further accessed. Further
6634	* use of such information after end() may require proper synchronizations
6635	* by the caller with page table updates, otherwise it can create a
6636	* security bug.
6637	*
6638	* If the PTE maps a refcounted page, callers are responsible to protect
6639	* against invalidation with MMU notifiers; otherwise access to the PFN at
6640	* a later point in time can trigger use-after-free.
6641	*
6642	* Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
6643	* should be taken for read, and the mmap semaphore cannot be released
6644	* before the end() is invoked.
6645	*
6646	* This function must not be used to modify PTE content.
6647	*
6648	* Return: zero on success, negative otherwise.
6649	*/
6650	int follow_pfnmap_start(struct follow_pfnmap_args *args)
6651	{
6652	struct vm_area_struct *vma = args->vma;
6653	unsigned long address = args->address;
6654	struct mm_struct *mm = vma->vm_mm;
6655	spinlock_t *lock;
6656	pgd_t *pgdp;
6657	p4d_t *p4dp, p4d;
6658	pud_t *pudp, pud;
6659	pmd_t *pmdp, pmd;
6660	pte_t *ptep, pte;
6661
6662	pfnmap_lockdep_assert(vma);
6663
6664	if (unlikely(address < vma->vm_start \|\| address >= vma->vm_end))
6665	goto out;
6666
6667	if (!(vma->vm_flags & (VM_IO \| VM_PFNMAP)))
6668	goto out;
6669	retry:
6670	pgdp = pgd_offset(mm, address);
6671	if (pgd_none(pgd: pgdp) \|\| unlikely(pgd_bad(pgdp)))
6672	goto out;
6673
6674	p4dp = p4d_offset(pgd: pgdp, address);
6675	p4d = READ_ONCE(*p4dp);
6676	if (p4d_none(p4d) \|\| unlikely(p4d_bad(p4d)))
6677	goto out;
6678
6679	pudp = pud_offset(p4d: p4dp, address);
6680	pud = READ_ONCE(*pudp);
6681	if (pud_none(pud))
6682	goto out;
6683	if (pud_leaf(pud)) {
6684	lock = pud_lock(mm, pud: pudp);
6685	if (!unlikely(pud_leaf(pud))) {
6686	spin_unlock(lock);
6687	goto retry;
6688	}
6689	pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud),
6690	pud_pfn(pud), PUD_MASK, pud_write(pud),
6691	special: pud_special(pud));
6692	return `0`;
6693	}
6694
6695	pmdp = pmd_offset(pud: pudp, address);
6696	pmd = pmdp_get_lockless(pmdp);
6697	if (pmd_leaf(pte: pmd)) {
6698	lock = pmd_lock(mm, pmd: pmdp);
6699	if (!unlikely(pmd_leaf(pmd))) {
6700	spin_unlock(lock);
6701	goto retry;
6702	}
6703	pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd),
6704	pfn_base: pmd_pfn(pmd), PMD_MASK, pmd_write(pmd),
6705	special: pmd_special(pmd));
6706	return `0`;
6707	}
6708
6709	ptep = pte_offset_map_lock(mm, pmd: pmdp, addr: address, ptlp: &lock);
6710	if (!ptep)
6711	goto out;
6712	pte = ptep_get(ptep);
6713	if (!pte_present(a: pte))
6714	goto unlock;
6715	pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte),
6716	pfn_base: pte_pfn(pte), PAGE_MASK, writable: pte_write(pte),
6717	special: pte_special(pte));
6718	return `0`;
6719	unlock:
6720	pte_unmap_unlock(ptep, lock);
6721	out:
6722	return -EINVAL;
6723	}
6724	EXPORT_SYMBOL_GPL(follow_pfnmap_start);
6725
6726	/**
6727	* follow_pfnmap_end(): End a follow_pfnmap_start() process
6728	* @args: Pointer to struct @follow_pfnmap_args
6729	*
6730	* Must be used in pair of follow_pfnmap_start(). See the start() function
6731	* above for more information.
6732	*/
6733	void follow_pfnmap_end(struct follow_pfnmap_args *args)
6734	{
6735	if (args->lock)
6736	spin_unlock(lock: args->lock);
6737	if (args->ptep)
6738	pte_unmap(pte: args->ptep);
6739	}
6740	EXPORT_SYMBOL_GPL(follow_pfnmap_end);
6741
#ifdef CONFIG_HAVE_IOREMAP_PROT
/**
 * generic_access_phys - generic implementation for iomem mmap access
 * @vma: the vma to access
 * @addr: userspace address, not relative offset within @vma
 * @buf: buffer to read/write
 * @len: length of transfer
 * @write: set to FOLL_WRITE when writing, otherwise reading
 *
 * This is a generic implementation for &vm_operations_struct.access for an
 * iomem mapping. This callback is used by access_process_vm() when the @vma is
 * not page based.
 *
 * The mapping is looked up twice: once to learn the physical address and
 * protection for ioremap_prot(), and once more around the copy to make
 * sure it did not change in between; on a mismatch the whole sequence is
 * retried.
 *
 * Return: @len on success, -EINVAL if the lookup fails or a write is
 * requested on a non-writable mapping, -ENOMEM if ioremap_prot() fails.
 */
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
			void *buf, int len, int write)
{
	resource_size_t phys_addr;
	pgprot_t prot = __pgprot(0);
	void __iomem *maddr;
	int offset = offset_in_page(addr);
	int ret = -EINVAL;
	bool writable;
	struct follow_pfnmap_args args = { .vma = vma, .address = addr };

retry:
	if (follow_pfnmap_start(&args))
		return -EINVAL;
	prot = args.pgprot;
	phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT;
	writable = args.writable;
	follow_pfnmap_end(&args);

	if ((write & FOLL_WRITE) && !writable)
		return -EINVAL;

	maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
	if (!maddr)
		return -ENOMEM;

	if (follow_pfnmap_start(&args))
		goto out_unmap;

	/*
	 * Widen the pfn exactly as when phys_addr was computed, so the two
	 * values stay comparable where resource_size_t is wider than
	 * unsigned long.
	 */
	if ((pgprot_val(prot) != pgprot_val(args.pgprot)) ||
	    (phys_addr != ((resource_size_t)args.pfn << PAGE_SHIFT)) ||
	    (writable != args.writable)) {
		follow_pfnmap_end(&args);
		iounmap(maddr);
		goto retry;
	}

	if (write)
		memcpy_toio(maddr + offset, buf, len);
	else
		memcpy_fromio(buf, maddr + offset, len);
	ret = len;
	follow_pfnmap_end(&args);
out_unmap:
	iounmap(maddr);

	return ret;
}
EXPORT_SYMBOL_GPL(generic_access_phys);
#endif
6805
6806	/*
6807	* Access another process' address space as given in mm.
6808	*/
6809	static int __access_remote_vm(struct mm_struct mm, unsigned* long addr,
6810	void buf, int* len, unsigned int gup_flags)
6811	{
6812	void *old_buf = buf;
6813	int write = gup_flags & FOLL_WRITE;
6814
6815	if (mmap_read_lock_killable(mm))
6816	return `0`;
6817
6818	/ Untag the address before looking up the VMA /
6819	addr = untagged_addr_remote(mm, addr);
6820
6821	/ Avoid triggering the temporary warning in __get_user_pages /
6822	if (!vma_lookup(mm, addr) && !expand_stack(mm, addr))
6823	return `0`;
6824
6825	/ ignore errors, just check how much was successfully transferred /
6826	while (len) {
6827	int bytes, offset;
6828	void *maddr;
6829	struct folio *folio;
6830	struct vm_area_struct *vma = NULL;
6831	struct page *page = get_user_page_vma_remote(mm, addr,
6832	gup_flags, vmap: &vma);
6833
6834	if (IS_ERR(ptr: page)) {
6835	/ We might need to expand the stack to access it /
6836	vma = vma_lookup(mm, addr);
6837	if (!vma) {
6838	vma = expand_stack(mm, addr);
6839
6840	/ mmap_lock was dropped on failure /
6841	if (!vma)
6842	return buf - old_buf;
6843
6844	/ Try again if stack expansion worked /
6845	continue;
6846	}
6847
6848	/*
6849	* Check if this is a VM_IO \| VM_PFNMAP VMA, which
6850	* we can access using slightly different code.
6851	*/
6852	bytes = `0`;
6853	#ifdef CONFIG_HAVE_IOREMAP_PROT
6854	if (vma->vm_ops && vma->vm_ops->access)
6855	bytes = vma->vm_ops->access(vma, addr, buf,
6856	len, write);
6857	#endif
6858	if (bytes <= `0`)
6859	break;
6860	} else {
6861	folio = page_folio(page);
6862	bytes = len;
6863	offset = addr & (PAGE_SIZE-`1`);
6864	if (bytes > PAGE_SIZE-offset)
6865	bytes = PAGE_SIZE-offset;
6866
6867	maddr = kmap_local_folio(folio, offset: folio_page_idx(folio, page) * PAGE_SIZE);
6868	if (write) {
6869	copy_to_user_page(vma, page, addr,
6870	maddr + offset, buf, bytes);
6871	folio_mark_dirty_lock(folio);
6872	} else {
6873	copy_from_user_page(vma, page, addr,
6874	buf, maddr + offset, bytes);
6875	}
6876	folio_release_kmap(folio, addr: maddr);
6877	}
6878	len -= bytes;
6879	buf += bytes;
6880	addr += bytes;
6881	}
6882	mmap_read_unlock(mm);
6883
6884	return buf - old_buf;
6885	}
6886
/**
 * access_remote_vm - access another process' address space
 * @mm:		the mm_struct of the target address space
 * @addr:	start address to access
 * @buf:	source or destination buffer
 * @len:	number of bytes to transfer
 * @gup_flags:	flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 *
 * Return: number of bytes copied from source to destination.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
		     void *buf, int len, unsigned int gup_flags)
{
	return __access_remote_vm(mm, addr, buf, len, gup_flags);
}
6904
/*
 * Access another process' address space.
 * Source/target buffer must be kernel space.
 * Do not walk the page table directly, use get_user_pages.
 *
 * The mm of @tsk is pinned with get_task_mm() for the duration of the
 * access and released with mmput() afterwards.  Returns 0 if @tsk has no
 * mm, otherwise whatever __access_remote_vm() returns.
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
		      void *buf, int len, unsigned int gup_flags)
{
	struct mm_struct *mm;
	int ret;

	mm = get_task_mm(tsk);
	if (!mm)
		return 0;

	ret = __access_remote_vm(mm, addr, buf, len, gup_flags);

	mmput(mm);

	return ret;
}
EXPORT_SYMBOL_GPL(access_process_vm);
6927
6928	#ifdef CONFIG_BPF_SYSCALL
6929	/*
6930	* Copy a string from another process's address space as given in mm.
6931	* If there is any error return -EFAULT.
6932	*/
6933	static int __copy_remote_vm_str(struct mm_struct mm, unsigned* long addr,
6934	void buf, int* len, unsigned int gup_flags)
6935	{
6936	void *old_buf = buf;
6937	int err = `0`;
6938
6939	(char* *)buf = `'\0'`;
6940
6941	if (mmap_read_lock_killable(mm))
6942	return -EFAULT;
6943
6944	addr = untagged_addr_remote(mm, addr);
6945
6946	/ Avoid triggering the temporary warning in __get_user_pages /
6947	if (!vma_lookup(mm, addr)) {
6948	err = -EFAULT;
6949	goto out;
6950	}
6951
6952	while (len) {
6953	int bytes, offset, retval;
6954	void *maddr;
6955	struct folio *folio;
6956	struct page *page;
6957	struct vm_area_struct *vma = NULL;
6958
6959	page = get_user_page_vma_remote(mm, addr, gup_flags, &vma);
6960	if (IS_ERR(page)) {
6961	/*
6962	* Treat as a total failure for now until we decide how
6963	* to handle the CONFIG_HAVE_IOREMAP_PROT case and
6964	* stack expansion.
6965	*/
6966	(char* *)buf = `'\0'`;
6967	err = -EFAULT;
6968	goto out;
6969	}
6970
6971	folio = page_folio(page);
6972	bytes = len;
6973	offset = addr & (PAGE_SIZE - `1`);
6974	if (bytes > PAGE_SIZE - offset)
6975	bytes = PAGE_SIZE - offset;
6976
6977	maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE);
6978	retval = strscpy(buf, maddr + offset, bytes);
6979	if (retval >= `0`) {
6980	/ Found the end of the string /
6981	buf += retval;
6982	folio_release_kmap(folio, maddr);
6983	break;
6984	}
6985
6986	buf += bytes - `1`;
6987	/*
6988	* Because strscpy always NUL terminates we need to
6989	* copy the last byte in the page if we are going to
6990	* load more pages
6991	*/
6992	if (bytes != len) {
6993	addr += bytes - `1`;
6994	copy_from_user_page(vma, page, addr, buf, maddr + (PAGE_SIZE - `1`), `1`);
6995	buf += `1`;
6996	addr += `1`;
6997	}
6998	len -= bytes;
6999
7000	folio_release_kmap(folio, maddr);
7001	}
7002
7003	out:
7004	mmap_read_unlock(mm);
7005	if (err)
7006	return err;
7007	return buf - old_buf;
7008	}
7009
7010	/**
7011	* copy_remote_vm_str - copy a string from another process's address space.
7012	* @tsk: the task of the target address space
7013	* @addr: start address to read from
7014	* @buf: destination buffer
7015	* @len: number of bytes to copy
7016	* @gup_flags: flags modifying lookup behaviour
7017	*
7018	* The caller must hold a reference on @mm.
7019	*
7020	* Return: number of bytes copied from @addr (source) to @buf (destination);
7021	* not including the trailing NUL. Always guaranteed to leave NUL-terminated
7022	* buffer. On any error, return -EFAULT.
7023	*/
7024	int copy_remote_vm_str(struct task_struct tsk, unsigned* long addr,
7025	void buf, int* len, unsigned int gup_flags)
7026	{
7027	struct mm_struct *mm;
7028	int ret;
7029
7030	if (unlikely(len == `0`))
7031	return `0`;
7032
7033	mm = get_task_mm(tsk);
7034	if (!mm) {
7035	(char* *)buf = `'\0'`;
7036	return -EFAULT;
7037	}
7038
7039	ret = __copy_remote_vm_str(mm, addr, buf, len, gup_flags);
7040
7041	mmput(mm);
7042
7043	return ret;
7044	}
7045	EXPORT_SYMBOL_GPL(copy_remote_vm_str);
7046	#endif /* CONFIG_BPF_SYSCALL */
7047
7048	/*
7049	* Print the name of a VMA.
7050	*/
7051	void print_vma_addr(char prefix, unsigned* long ip)
7052	{
7053	struct mm_struct *mm = current->mm;
7054	struct vm_area_struct *vma;
7055
7056	/*
7057	* we might be running from an atomic context so we cannot sleep
7058	*/
7059	if (!mmap_read_trylock(mm))
7060	return;
7061
7062	vma = vma_lookup(mm, addr: ip);
7063	if (vma && vma->vm_file) {
7064	struct file *f = vma->vm_file;
7065	ip -= vma->vm_start;
7066	ip += vma->vm_pgoff << PAGE_SHIFT;
7067	printk("%s%pD[%lx,%lx+%lx]", prefix, f, ip,
7068	vma->vm_start,
7069	vma->vm_end - vma->vm_start);
7070	}
7071	mmap_read_unlock(mm);
7072	}
7073
#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
/*
 * Debug annotation for code that may take a page fault.
 *
 * Does nothing while page faults are disabled.  Otherwise it forwards
 * @file and @line to __might_sleep() and, when the current task has an mm,
 * annotates a possible read acquisition of that mm's mmap_lock.
 */
void __might_fault(const char *file, int line)
{
	if (pagefault_disabled())
		return;
	__might_sleep(file, line);
	if (current->mm)
		might_lock_read(&current->mm->mmap_lock);
}
EXPORT_SYMBOL(__might_fault);
#endif
7085
7086	#if defined(CONFIG_TRANSPARENT_HUGEPAGE) \|\| defined(CONFIG_HUGETLBFS)
7087	/*
7088	* Process all subpages of the specified huge page with the specified
7089	* operation. The target subpage will be processed last to keep its
7090	* cache lines hot.
7091	*/
7092	static inline int process_huge_page(
7093	unsigned long addr_hint, unsigned int nr_pages,
7094	int (process_subpage)(unsigned* long addr, int idx, void *arg),
7095	void *arg)
7096	{
7097	int i, n, base, l, ret;
7098	unsigned long addr = addr_hint &
7099	~(((unsigned long)nr_pages << PAGE_SHIFT) - `1`);
7100
7101	/ Process target subpage last to keep its cache lines hot /
7102	might_sleep();
7103	n = (addr_hint - addr) / PAGE_SIZE;
7104	if (`2` * n <= nr_pages) {
7105	/ If target subpage in first half of huge page /
7106	base = `0`;
7107	l = n;
7108	/ Process subpages at the end of huge page /
7109	for (i = nr_pages - `1`; i >= `2` * n; i--) {
7110	cond_resched();
7111	ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
7112	if (ret)
7113	return ret;
7114	}
7115	} else {
7116	/ If target subpage in second half of huge page /
7117	base = nr_pages - `2` * (nr_pages - n);
7118	l = nr_pages - n;
7119	/ Process subpages at the begin of huge page /
7120	for (i = `0`; i < base; i++) {
7121	cond_resched();
7122	ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
7123	if (ret)
7124	return ret;
7125	}
7126	}
7127	/*
7128	* Process remaining subpages in left-right-left-right pattern
7129	* towards the target subpage
7130	*/
7131	for (i = `0`; i < l; i++) {
7132	int left_idx = base + i;
7133	int right_idx = base + `2` * l - `1` - i;
7134
7135	cond_resched();
7136	ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
7137	if (ret)
7138	return ret;
7139	cond_resched();
7140	ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
7141	if (ret)
7142	return ret;
7143	}
7144	return `0`;
7145	}
7146
7147	static void clear_gigantic_page(struct folio folio, unsigned* long addr_hint,
7148	unsigned int nr_pages)
7149	{
7150	unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio));
7151	int i;
7152
7153	might_sleep();
7154	for (i = `0`; i < nr_pages; i++) {
7155	cond_resched();
7156	clear_user_highpage(folio_page(folio, i), vaddr: addr + i * PAGE_SIZE);
7157	}
7158	}
7159
/*
 * process_huge_page() callback: clear subpage @idx of the folio passed in
 * @arg, using @addr as its user address.  Always returns 0.
 */
static int clear_subpage(unsigned long addr, int idx, void *arg)
{
	struct folio *folio = arg;

	clear_user_highpage(folio_page(folio, idx), addr);
	return 0;
}
7167
7168	/**
7169	* folio_zero_user - Zero a folio which will be mapped to userspace.
7170	* @folio: The folio to zero.
7171	* @addr_hint: The address will be accessed or the base address if uncelar.
7172	*/
7173	void folio_zero_user(struct folio folio, unsigned* long addr_hint)
7174	{
7175	unsigned int nr_pages = folio_nr_pages(folio);
7176
7177	if (unlikely(nr_pages > MAX_ORDER_NR_PAGES))
7178	clear_gigantic_page(folio, addr_hint, nr_pages);
7179	else
7180	process_huge_page(addr_hint, nr_pages, process_subpage: clear_subpage, arg: folio);
7181	}
7182
7183	static int copy_user_gigantic_page(struct folio dst, struct* folio *src,
7184	unsigned long addr_hint,
7185	struct vm_area_struct *vma,
7186	unsigned int nr_pages)
7187	{
7188	unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(dst));
7189	struct page *dst_page;
7190	struct page *src_page;
7191	int i;
7192
7193	for (i = `0`; i < nr_pages; i++) {
7194	dst_page = folio_page(dst, i);
7195	src_page = folio_page(src, i);
7196
7197	cond_resched();
7198	if (copy_mc_user_highpage(to: dst_page, from: src_page,
7199	vaddr: addr + i*PAGE_SIZE, vma))
7200	return -EHWPOISON;
7201	}
7202	return `0`;
7203	}
7204
/*
 * Context handed to copy_subpage() through the opaque argument of
 * process_huge_page().
 */
struct copy_subpage_arg {
	struct folio *dst;		/* folio being written */
	struct folio *src;		/* folio being read */
	struct vm_area_struct *vma;	/* passed on to copy_mc_user_highpage() */
};
7210
7211	static int copy_subpage(unsigned long addr, int idx, void *arg)
7212	{
7213	struct copy_subpage_arg *copy_arg = arg;
7214	struct page *dst = folio_page(copy_arg->dst, idx);
7215	struct page *src = folio_page(copy_arg->src, idx);
7216
7217	if (copy_mc_user_highpage(to: dst, from: src, vaddr: addr, vma: copy_arg->vma))
7218	return -EHWPOISON;
7219	return `0`;
7220	}
7221
7222	int copy_user_large_folio(struct folio dst, struct* folio *src,
7223	unsigned long addr_hint, struct vm_area_struct *vma)
7224	{
7225	unsigned int nr_pages = folio_nr_pages(folio: dst);
7226	struct copy_subpage_arg arg = {
7227	.dst = dst,
7228	.src = src,
7229	.vma = vma,
7230	};
7231
7232	if (unlikely(nr_pages > MAX_ORDER_NR_PAGES))
7233	return copy_user_gigantic_page(dst, src, addr_hint, vma, nr_pages);
7234
7235	return process_huge_page(addr_hint, nr_pages, process_subpage: copy_subpage, arg: &arg);
7236	}
7237
7238	long copy_folio_from_user(struct folio *dst_folio,
7239	const void __user *usr_src,
7240	bool allow_pagefault)
7241	{
7242	void *kaddr;
7243	unsigned long i, rc = `0`;
7244	unsigned int nr_pages = folio_nr_pages(folio: dst_folio);
7245	unsigned long ret_val = nr_pages * PAGE_SIZE;
7246	struct page *subpage;
7247
7248	for (i = `0`; i < nr_pages; i++) {
7249	subpage = folio_page(dst_folio, i);
7250	kaddr = kmap_local_page(page: subpage);
7251	if (!allow_pagefault)
7252	pagefault_disable();
7253	rc = copy_from_user(to: kaddr, from: usr_src + i * PAGE_SIZE, PAGE_SIZE);
7254	if (!allow_pagefault)
7255	pagefault_enable();
7256	kunmap_local(kaddr);
7257
7258	ret_val -= (PAGE_SIZE - rc);
7259	if (rc)
7260	break;
7261
7262	flush_dcache_page(page: subpage);
7263
7264	cond_resched();
7265	}
7266	return ret_val;
7267	}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
7269
7270	#if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLOC_SPLIT_PTLOCKS
7271
7272	static struct kmem_cache *page_ptl_cachep;
7273
7274	void __init ptlock_cache_init(void)
7275	{
7276	page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), `0`,
7277	SLAB_PANIC, NULL);
7278	}
7279
7280	bool ptlock_alloc(struct ptdesc *ptdesc)
7281	{
7282	spinlock_t *ptl;
7283
7284	ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
7285	if (!ptl)
7286	return false;
7287	ptdesc->ptl = ptl;
7288	return true;
7289	}
7290
7291	void ptlock_free(struct ptdesc *ptdesc)
7292	{
7293	if (ptdesc->ptl)
7294	kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
7295	}
7296	#endif
7297
/*
 * Called before walking the page tables of @vma: takes the hugetlb VMA
 * lock for reading when @vma is a hugetlb mapping, otherwise does nothing.
 */
void vma_pgtable_walk_begin(struct vm_area_struct *vma)
{
	if (!is_vm_hugetlb_page(vma))
		return;

	hugetlb_vma_lock_read(vma);
}
7303
/*
 * Called after walking the page tables of @vma: drops the hugetlb VMA read
 * lock when @vma is a hugetlb mapping, otherwise does nothing.
 */
void vma_pgtable_walk_end(struct vm_area_struct *vma)
{
	if (!is_vm_hugetlb_page(vma))
		return;

	hugetlb_vma_unlock_read(vma);
}
7309

Browse the source code of Linux/mm/memory.c