rmap.c source code [Linux/mm/rmap.c]

1	/*
2	* mm/rmap.c - physical to virtual reverse mappings
3	*
4	* Copyright 2001, Rik van Riel <riel@conectiva.com.br>
5	* Released under the General Public License (GPL).
6	*
7	* Simple, low overhead reverse mapping scheme.
8	* Please try to keep this thing as modular as possible.
9	*
10	* Provides methods for unmapping each kind of mapped page:
11	* the anon methods track anonymous pages, and
12	* the file methods track pages belonging to an inode.
13	*
14	* Original design by Rik van Riel <riel@conectiva.com.br> 2001
15	* File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
16	* Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
17	* Contributions by Hugh Dickins 2003, 2004
18	*/
19
20	/*
21	* Lock ordering in mm:
22	*
23	* inode->i_rwsem (while writing or truncating, not reading or faulting)
24	* mm->mmap_lock
25	* mapping->invalidate_lock (in filemap_fault)
26	* folio_lock
27	* hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
28	* vma_start_write
29	* mapping->i_mmap_rwsem
30	* anon_vma->rwsem
31	* mm->page_table_lock or pte_lock
32	* swap_lock (in swap_duplicate, swap_info_get)
33	* mmlist_lock (in mmput, drain_mmlist and others)
34	* mapping->private_lock (in block_dirty_folio)
35	* i_pages lock (widely used)
36	* lruvec->lru_lock (in folio_lruvec_lock_irq)
37	* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
38	* bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
39	* sb_lock (within inode_lock in fs/fs-writeback.c)
40	* i_pages lock (widely used, in set_page_dirty,
41	* in arch-dependent flush_dcache_mmap_lock,
42	* within bdi.wb->list_lock in __sync_single_inode)
43	*
44	* anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon)
45	* ->tasklist_lock
46	* pte map lock
47	*
48	* hugetlbfs PageHuge() take locks in this order:
49	* hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
50	* vma_lock (hugetlb specific lock for pmd_sharing)
51	* mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
52	* folio_lock
53	*/
54
55	#include <linux/mm.h>
56	#include <linux/sched/mm.h>
57	#include <linux/sched/task.h>
58	#include <linux/pagemap.h>
59	#include <linux/swap.h>
60	#include <linux/swapops.h>
61	#include <linux/slab.h>
62	#include <linux/init.h>
63	#include <linux/ksm.h>
64	#include <linux/rmap.h>
65	#include <linux/rcupdate.h>
66	#include <linux/export.h>
67	#include <linux/memcontrol.h>
68	#include <linux/mmu_notifier.h>
69	#include <linux/migrate.h>
70	#include <linux/hugetlb.h>
71	#include <linux/huge_mm.h>
72	#include <linux/backing-dev.h>
73	#include <linux/page_idle.h>
74	#include <linux/memremap.h>
75	#include <linux/userfaultfd_k.h>
76	#include <linux/mm_inline.h>
77	#include <linux/oom.h>
78
79	#include <asm/tlbflush.h>
80
81	#define CREATE_TRACE_POINTS
82	#include <trace/events/migrate.h>
83
84	#include "internal.h"
85
/*
 * Slab caches for struct anon_vma and struct anon_vma_chain objects.
 * Both are created in anon_vma_init().
 */
static struct kmem_cache *anon_vma_cachep;
static struct kmem_cache *anon_vma_chain_cachep;
88
89	static inline struct anon_vma anon_vma_alloc(void*)
90	{
91	struct anon_vma *anon_vma;
92
93	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
94	if (anon_vma) {
95	atomic_set(v: &anon_vma->refcount, i: `1`);
96	anon_vma->num_children = `0`;
97	anon_vma->num_active_vmas = `0`;
98	anon_vma->parent = anon_vma;
99	/*
100	* Initialise the anon_vma root to point to itself. If called
101	* from fork, the root will be reset to the parents anon_vma.
102	*/
103	anon_vma->root = anon_vma;
104	}
105
106	return anon_vma;
107	}
108
109	static inline void anon_vma_free(struct anon_vma *anon_vma)
110	{
111	VM_BUG_ON(atomic_read(&anon_vma->refcount));
112
113	/*
114	* Synchronize against folio_lock_anon_vma_read() such that
115	* we can safely hold the lock without the anon_vma getting
116	* freed.
117	*
118	* Relies on the full mb implied by the atomic_dec_and_test() from
119	* put_anon_vma() against the acquire barrier implied by
120	* down_read_trylock() from folio_lock_anon_vma_read(). This orders:
121	*
122	* folio_lock_anon_vma_read() VS put_anon_vma()
123	* down_read_trylock() atomic_dec_and_test()
124	* LOCK MB
125	* atomic_read() rwsem_is_locked()
126	*
127	* LOCK should suffice since the actual taking of the lock must
128	* happen _before_ what follows.
129	*/
130	might_sleep();
131	if (rwsem_is_locked(sem: &anon_vma->root->rwsem)) {
132	anon_vma_lock_write(anon_vma);
133	anon_vma_unlock_write(anon_vma);
134	}
135
136	kmem_cache_free(s: anon_vma_cachep, objp: anon_vma);
137	}
138
139	static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
140	{
141	return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
142	}
143
144	static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
145	{
146	kmem_cache_free(s: anon_vma_chain_cachep, objp: anon_vma_chain);
147	}
148
149	static void anon_vma_chain_link(struct vm_area_struct *vma,
150	struct anon_vma_chain *avc,
151	struct anon_vma *anon_vma)
152	{
153	avc->vma = vma;
154	avc->anon_vma = anon_vma;
155	list_add(new: &avc->same_vma, head: &vma->anon_vma_chain);
156	anon_vma_interval_tree_insert(node: avc, root: &anon_vma->rb_root);
157	}
158
159	/**
160	* __anon_vma_prepare - attach an anon_vma to a memory region
161	* @vma: the memory region in question
162	*
163	* This makes sure the memory mapping described by 'vma' has
164	* an 'anon_vma' attached to it, so that we can associate the
165	* anonymous pages mapped into it with that anon_vma.
166	*
167	* The common case will be that we already have one, which
168	* is handled inline by anon_vma_prepare(). But if
169	* not we either need to find an adjacent mapping that we
170	* can re-use the anon_vma from (very common when the only
171	* reason for splitting a vma has been mprotect()), or we
172	* allocate a new one.
173	*
174	* Anon-vma allocations are very subtle, because we may have
175	* optimistically looked up an anon_vma in folio_lock_anon_vma_read()
176	* and that may actually touch the rwsem even in the newly
177	* allocated vma (it depends on RCU to make sure that the
178	* anon_vma isn't actually destroyed).
179	*
180	* As a result, we need to do proper anon_vma locking even
181	* for the new allocation. At the same time, we do not want
182	* to do any locking for the common case of already having
183	* an anon_vma.
184	*/
185	int __anon_vma_prepare(struct vm_area_struct *vma)
186	{
187	struct mm_struct *mm = vma->vm_mm;
188	struct anon_vma anon_vma, allocated;
189	struct anon_vma_chain *avc;
190
191	mmap_assert_locked(mm);
192	might_sleep();
193
194	avc = anon_vma_chain_alloc(GFP_KERNEL);
195	if (!avc)
196	goto out_enomem;
197
198	anon_vma = find_mergeable_anon_vma(vma);
199	allocated = NULL;
200	if (!anon_vma) {
201	anon_vma = anon_vma_alloc();
202	if (unlikely(!anon_vma))
203	goto out_enomem_free_avc;
204	anon_vma->num_children++; / self-parent link for new root /
205	allocated = anon_vma;
206	}
207
208	anon_vma_lock_write(anon_vma);
209	/ page_table_lock to protect against threads /
210	spin_lock(lock: &mm->page_table_lock);
211	if (likely(!vma->anon_vma)) {
212	vma->anon_vma = anon_vma;
213	anon_vma_chain_link(vma, avc, anon_vma);
214	anon_vma->num_active_vmas++;
215	allocated = NULL;
216	avc = NULL;
217	}
218	spin_unlock(lock: &mm->page_table_lock);
219	anon_vma_unlock_write(anon_vma);
220
221	if (unlikely(allocated))
222	put_anon_vma(anon_vma: allocated);
223	if (unlikely(avc))
224	anon_vma_chain_free(anon_vma_chain: avc);
225
226	return `0`;
227
228	out_enomem_free_avc:
229	anon_vma_chain_free(anon_vma_chain: avc);
230	out_enomem:
231	return -ENOMEM;
232	}
233
234	/*
235	* This is a useful helper function for locking the anon_vma root as
236	* we traverse the vma->anon_vma_chain, looping over anon_vma's that
237	* have the same vma.
238	*
239	* Such anon_vma's should have the same root, so you'd expect to see
240	* just a single mutex_lock for the whole traversal.
241	*/
242	static inline struct anon_vma lock_anon_vma_root(struct* anon_vma root, struct* anon_vma *anon_vma)
243	{
244	struct anon_vma *new_root = anon_vma->root;
245	if (new_root != root) {
246	if (WARN_ON_ONCE(root))
247	up_write(sem: &root->rwsem);
248	root = new_root;
249	down_write(sem: &root->rwsem);
250	}
251	return root;
252	}
253
254	static inline void unlock_anon_vma_root(struct anon_vma *root)
255	{
256	if (root)
257	up_write(sem: &root->rwsem);
258	}
259
260	/*
261	* Attach the anon_vmas from src to dst.
262	* Returns 0 on success, -ENOMEM on failure.
263	*
264	* anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
265	* copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
266	* while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
267	* prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before
268	* call, we can identify this case by checking (!dst->anon_vma &&
269	* src->anon_vma).
270	*
271	* If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
272	* and reuse existing anon_vma which has no vmas and only one child anon_vma.
273	* This prevents degradation of anon_vma hierarchy to endless linear chain in
274	* case of constantly forking task. On the other hand, an anon_vma with more
275	* than one child isn't reused even if there was no alive vma, thus rmap
276	* walker has a good chance of avoiding scanning the whole hierarchy when it
277	* searches where page is mapped.
278	*/
279	int anon_vma_clone(struct vm_area_struct dst, struct* vm_area_struct *src)
280	{
281	struct anon_vma_chain avc, pavc;
282	struct anon_vma *root = NULL;
283
284	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
285	struct anon_vma *anon_vma;
286
287	avc = anon_vma_chain_alloc(GFP_NOWAIT);
288	if (unlikely(!avc)) {
289	unlock_anon_vma_root(root);
290	root = NULL;
291	avc = anon_vma_chain_alloc(GFP_KERNEL);
292	if (!avc)
293	goto enomem_failure;
294	}
295	anon_vma = pavc->anon_vma;
296	root = lock_anon_vma_root(root, anon_vma);
297	anon_vma_chain_link(vma: dst, avc, anon_vma);
298
299	/*
300	* Reuse existing anon_vma if it has no vma and only one
301	* anon_vma child.
302	*
303	* Root anon_vma is never reused:
304	* it has self-parent reference and at least one child.
305	*/
306	if (!dst->anon_vma && src->anon_vma &&
307	anon_vma->num_children < `2` &&
308	anon_vma->num_active_vmas == `0`)
309	dst->anon_vma = anon_vma;
310	}
311	if (dst->anon_vma)
312	dst->anon_vma->num_active_vmas++;
313	unlock_anon_vma_root(root);
314	return `0`;
315
316	enomem_failure:
317	/*
318	* dst->anon_vma is dropped here otherwise its num_active_vmas can
319	* be incorrectly decremented in unlink_anon_vmas().
320	* We can safely do this because callers of anon_vma_clone() don't care
321	* about dst->anon_vma if anon_vma_clone() failed.
322	*/
323	dst->anon_vma = NULL;
324	unlink_anon_vmas(dst);
325	return -ENOMEM;
326	}
327
328	/*
329	* Attach vma to its own anon_vma, as well as to the anon_vmas that
330	* the corresponding VMA in the parent process is attached to.
331	* Returns 0 on success, non-zero on failure.
332	*/
333	int anon_vma_fork(struct vm_area_struct vma, struct* vm_area_struct *pvma)
334	{
335	struct anon_vma_chain *avc;
336	struct anon_vma *anon_vma;
337	int error;
338
339	/ Don't bother if the parent process has no anon_vma here. /
340	if (!pvma->anon_vma)
341	return `0`;
342
343	/ Drop inherited anon_vma, we'll reuse existing or allocate new. /
344	vma->anon_vma = NULL;
345
346	/*
347	* First, attach the new VMA to the parent VMA's anon_vmas,
348	* so rmap can find non-COWed pages in child processes.
349	*/
350	error = anon_vma_clone(dst: vma, src: pvma);
351	if (error)
352	return error;
353
354	/ An existing anon_vma has been reused, all done then. /
355	if (vma->anon_vma)
356	return `0`;
357
358	/ Then add our own anon_vma. /
359	anon_vma = anon_vma_alloc();
360	if (!anon_vma)
361	goto out_error;
362	anon_vma->num_active_vmas++;
363	avc = anon_vma_chain_alloc(GFP_KERNEL);
364	if (!avc)
365	goto out_error_free_anon_vma;
366
367	/*
368	* The root anon_vma's rwsem is the lock actually used when we
369	* lock any of the anon_vmas in this anon_vma tree.
370	*/
371	anon_vma->root = pvma->anon_vma->root;
372	anon_vma->parent = pvma->anon_vma;
373	/*
374	* With refcounts, an anon_vma can stay around longer than the
375	* process it belongs to. The root anon_vma needs to be pinned until
376	* this anon_vma is freed, because the lock lives in the root.
377	*/
378	get_anon_vma(anon_vma: anon_vma->root);
379	/ Mark this anon_vma as the one where our new (COWed) pages go. /
380	vma->anon_vma = anon_vma;
381	anon_vma_lock_write(anon_vma);
382	anon_vma_chain_link(vma, avc, anon_vma);
383	anon_vma->parent->num_children++;
384	anon_vma_unlock_write(anon_vma);
385
386	return `0`;
387
388	out_error_free_anon_vma:
389	put_anon_vma(anon_vma);
390	out_error:
391	unlink_anon_vmas(vma);
392	return -ENOMEM;
393	}
394
395	void unlink_anon_vmas(struct vm_area_struct *vma)
396	{
397	struct anon_vma_chain avc, next;
398	struct anon_vma *root = NULL;
399
400	/*
401	* Unlink each anon_vma chained to the VMA. This list is ordered
402	* from newest to oldest, ensuring the root anon_vma gets freed last.
403	*/
404	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
405	struct anon_vma *anon_vma = avc->anon_vma;
406
407	root = lock_anon_vma_root(root, anon_vma);
408	anon_vma_interval_tree_remove(node: avc, root: &anon_vma->rb_root);
409
410	/*
411	* Leave empty anon_vmas on the list - we'll need
412	* to free them outside the lock.
413	*/
414	if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
415	anon_vma->parent->num_children--;
416	continue;
417	}
418
419	list_del(entry: &avc->same_vma);
420	anon_vma_chain_free(anon_vma_chain: avc);
421	}
422	if (vma->anon_vma) {
423	vma->anon_vma->num_active_vmas--;
424
425	/*
426	* vma would still be needed after unlink, and anon_vma will be prepared
427	* when handle fault.
428	*/
429	vma->anon_vma = NULL;
430	}
431	unlock_anon_vma_root(root);
432
433	/*
434	* Iterate the list once more, it now only contains empty and unlinked
435	* anon_vmas, destroy them. Could not do before due to __put_anon_vma()
436	* needing to write-acquire the anon_vma->root->rwsem.
437	*/
438	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
439	struct anon_vma *anon_vma = avc->anon_vma;
440
441	VM_WARN_ON(anon_vma->num_children);
442	VM_WARN_ON(anon_vma->num_active_vmas);
443	put_anon_vma(anon_vma);
444
445	list_del(entry: &avc->same_vma);
446	anon_vma_chain_free(anon_vma_chain: avc);
447	}
448	}
449
450	static void anon_vma_ctor(void *data)
451	{
452	struct anon_vma *anon_vma = data;
453
454	init_rwsem(&anon_vma->rwsem);
455	atomic_set(v: &anon_vma->refcount, i: `0`);
456	anon_vma->rb_root = RB_ROOT_CACHED;
457	}
458
459	void __init anon_vma_init(void)
460	{
461	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
462	`0`, SLAB_TYPESAFE_BY_RCU\|SLAB_PANIC\|SLAB_ACCOUNT,
463	anon_vma_ctor);
464	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
465	SLAB_PANIC\|SLAB_ACCOUNT);
466	}
467
468	/*
469	* Getting a lock on a stable anon_vma from a page off the LRU is tricky!
470	*
471	* Since there is no serialization what so ever against folio_remove_rmap_*()
472	* the best this function can do is return a refcount increased anon_vma
473	* that might have been relevant to this page.
474	*
475	* The page might have been remapped to a different anon_vma or the anon_vma
476	* returned may already be freed (and even reused).
477	*
478	* In case it was remapped to a different anon_vma, the new anon_vma will be a
479	* child of the old anon_vma, and the anon_vma lifetime rules will therefore
480	* ensure that any anon_vma obtained from the page will still be valid for as
481	* long as we observe page_mapped() [ hence all those page_mapped() tests ].
482	*
483	* All users of this function must be very careful when walking the anon_vma
484	* chain and verify that the page in question is indeed mapped in it
485	* [ something equivalent to page_mapped_in_vma() ].
486	*
487	* Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
488	* folio_remove_rmap_*() that the anon_vma pointer from page->mapping is valid
489	* if there is a mapcount, we can dereference the anon_vma after observing
490	* those.
491	*
492	* NOTE: the caller should normally hold folio lock when calling this. If
493	* not, the caller needs to double check the anon_vma didn't change after
494	* taking the anon_vma lock for either read or write (UFFDIO_MOVE can modify it
495	* concurrently without folio lock protection). See folio_lock_anon_vma_read()
496	* which has already covered that, and comment above remap_pages().
497	*/
498	struct anon_vma folio_get_anon_vma(const* struct folio *folio)
499	{
500	struct anon_vma *anon_vma = NULL;
501	unsigned long anon_mapping;
502
503	rcu_read_lock();
504	anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
505	if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
506	goto out;
507	if (!folio_mapped(folio))
508	goto out;
509
510	anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON);
511	if (!atomic_inc_not_zero(v: &anon_vma->refcount)) {
512	anon_vma = NULL;
513	goto out;
514	}
515
516	/*
517	* If this folio is still mapped, then its anon_vma cannot have been
518	* freed. But if it has been unmapped, we have no security against the
519	* anon_vma structure being freed and reused (for another anon_vma:
520	* SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
521	* above cannot corrupt).
522	*/
523	if (!folio_mapped(folio)) {
524	rcu_read_unlock();
525	put_anon_vma(anon_vma);
526	return NULL;
527	}
528	out:
529	rcu_read_unlock();
530
531	return anon_vma;
532	}
533
534	/*
535	* Similar to folio_get_anon_vma() except it locks the anon_vma.
536	*
537	* Its a little more complex as it tries to keep the fast path to a single
538	* atomic op -- the trylock. If we fail the trylock, we fall back to getting a
539	* reference like with folio_get_anon_vma() and then block on the mutex
540	* on !rwc->try_lock case.
541	*/
542	struct anon_vma folio_lock_anon_vma_read(const* struct folio *folio,
543	struct rmap_walk_control *rwc)
544	{
545	struct anon_vma *anon_vma = NULL;
546	struct anon_vma *root_anon_vma;
547	unsigned long anon_mapping;
548
549	retry:
550	rcu_read_lock();
551	anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
552	if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
553	goto out;
554	if (!folio_mapped(folio))
555	goto out;
556
557	anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON);
558	root_anon_vma = READ_ONCE(anon_vma->root);
559	if (down_read_trylock(sem: &root_anon_vma->rwsem)) {
560	/*
561	* folio_move_anon_rmap() might have changed the anon_vma as we
562	* might not hold the folio lock here.
563	*/
564	if (unlikely((unsigned long)READ_ONCE(folio->mapping) !=
565	anon_mapping)) {
566	up_read(sem: &root_anon_vma->rwsem);
567	rcu_read_unlock();
568	goto retry;
569	}
570
571	/*
572	* If the folio is still mapped, then this anon_vma is still
573	* its anon_vma, and holding the mutex ensures that it will
574	* not go away, see anon_vma_free().
575	*/
576	if (!folio_mapped(folio)) {
577	up_read(sem: &root_anon_vma->rwsem);
578	anon_vma = NULL;
579	}
580	goto out;
581	}
582
583	if (rwc && rwc->try_lock) {
584	anon_vma = NULL;
585	rwc->contended = true;
586	goto out;
587	}
588
589	/ trylock failed, we got to sleep /
590	if (!atomic_inc_not_zero(v: &anon_vma->refcount)) {
591	anon_vma = NULL;
592	goto out;
593	}
594
595	if (!folio_mapped(folio)) {
596	rcu_read_unlock();
597	put_anon_vma(anon_vma);
598	return NULL;
599	}
600
601	/ we pinned the anon_vma, its safe to sleep /
602	rcu_read_unlock();
603	anon_vma_lock_read(anon_vma);
604
605	/*
606	* folio_move_anon_rmap() might have changed the anon_vma as we might
607	* not hold the folio lock here.
608	*/
609	if (unlikely((unsigned long)READ_ONCE(folio->mapping) !=
610	anon_mapping)) {
611	anon_vma_unlock_read(anon_vma);
612	put_anon_vma(anon_vma);
613	anon_vma = NULL;
614	goto retry;
615	}
616
617	if (atomic_dec_and_test(v: &anon_vma->refcount)) {
618	/*
619	* Oops, we held the last refcount, release the lock
620	* and bail -- can't simply use put_anon_vma() because
621	* we'll deadlock on the anon_vma_lock_write() recursion.
622	*/
623	anon_vma_unlock_read(anon_vma);
624	__put_anon_vma(anon_vma);
625	anon_vma = NULL;
626	}
627
628	return anon_vma;
629
630	out:
631	rcu_read_unlock();
632	return anon_vma;
633	}
634
635	#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
636	/*
637	* Flush TLB entries for recently unmapped pages from remote CPUs. It is
638	* important if a PTE was dirty when it was unmapped that it's flushed
639	* before any IO is initiated on the page to prevent lost writes. Similarly,
640	* it must be flushed before freeing to prevent data leakage.
641	*/
642	void try_to_unmap_flush(void)
643	{
644	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
645
646	if (!tlb_ubc->flush_required)
647	return;
648
649	arch_tlbbatch_flush(batch: &tlb_ubc->arch);
650	tlb_ubc->flush_required = false;
651	tlb_ubc->writable = false;
652	}
653
654	/ Flush iff there are potentially writable TLB entries that can race with IO /
655	void try_to_unmap_flush_dirty(void)
656	{
657	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
658
659	if (tlb_ubc->writable)
660	try_to_unmap_flush();
661	}
662
/*
 * Bits 0-14 of mm->tlb_flush_batched record pending generations.
 * Bits 16-30 of mm->tlb_flush_batched record flushed generations.
 *
 * TLB_FLUSH_BATCH_PENDING_LARGE is half of the pending mask; once the
 * pending count exceeds it the counters are reset (see
 * set_tlb_ubc_flush_pending()).
 */
#define TLB_FLUSH_BATCH_FLUSHED_SHIFT	16
#define TLB_FLUSH_BATCH_PENDING_MASK			\
	((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1)
#define TLB_FLUSH_BATCH_PENDING_LARGE			\
	(TLB_FLUSH_BATCH_PENDING_MASK / 2)
672
673	static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
674	unsigned long start, unsigned long end)
675	{
676	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
677	int batch;
678	bool writable = pte_dirty(pte: pteval);
679
680	if (!pte_accessible(mm, a: pteval))
681	return;
682
683	arch_tlbbatch_add_pending(batch: &tlb_ubc->arch, mm, start, end);
684	tlb_ubc->flush_required = true;
685
686	/*
687	* Ensure compiler does not re-order the setting of tlb_flush_batched
688	* before the PTE is cleared.
689	*/
690	barrier();
691	batch = atomic_read(v: &mm->tlb_flush_batched);
692	retry:
693	if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) {
694	/*
695	* Prevent `pending' from catching up with `flushed' because of
696	* overflow. Reset `pending' and `flushed' to be 1 and 0 if
697	* `pending' becomes large.
698	*/
699	if (!atomic_try_cmpxchg(v: &mm->tlb_flush_batched, old: &batch, new: `1`))
700	goto retry;
701	} else {
702	atomic_inc(v: &mm->tlb_flush_batched);
703	}
704
705	/*
706	* If the PTE was dirty then it's best to assume it's writable. The
707	* caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
708	* before the page is queued for IO.
709	*/
710	if (writable)
711	tlb_ubc->writable = true;
712	}
713
714	/*
715	* Returns true if the TLB flush should be deferred to the end of a batch of
716	* unmap operations to reduce IPIs.
717	*/
718	static bool should_defer_flush(struct mm_struct mm, enum* ttu_flags flags)
719	{
720	if (!(flags & TTU_BATCH_FLUSH))
721	return false;
722
723	return arch_tlbbatch_should_defer(mm);
724	}
725
726	/*
727	* Reclaim unmaps pages under the PTL but do not flush the TLB prior to
728	* releasing the PTL if TLB flushes are batched. It's possible for a parallel
729	* operation such as mprotect or munmap to race between reclaim unmapping
730	* the page and flushing the page. If this race occurs, it potentially allows
731	* access to data via a stale TLB entry. Tracking all mm's that have TLB
732	* batching in flight would be expensive during reclaim so instead track
733	* whether TLB batching occurred in the past and if so then do a flush here
734	* if required. This will cost one additional flush per reclaim cycle paid
735	* by the first operation at risk such as mprotect and mumap.
736	*
737	* This must be called under the PTL so that an access to tlb_flush_batched
738	* that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
739	* via the PTL.
740	*/
741	void flush_tlb_batched_pending(struct mm_struct *mm)
742	{
743	int batch = atomic_read(v: &mm->tlb_flush_batched);
744	int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK;
745	int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
746
747	if (pending != flushed) {
748	flush_tlb_mm(mm);
749	/*
750	* If the new TLB flushing is pending during flushing, leave
751	* mm->tlb_flush_batched as is, to avoid losing flushing.
752	*/
753	atomic_cmpxchg(v: &mm->tlb_flush_batched, old: batch,
754	new: pending \| (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT));
755	}
756	}
757	#else
758	static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
759	unsigned long start, unsigned long end)
760	{
761	}
762
763	static bool should_defer_flush(struct mm_struct mm, enum* ttu_flags flags)
764	{
765	return false;
766	}
767	#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
768
769	/**
770	* page_address_in_vma - The virtual address of a page in this VMA.
771	* @folio: The folio containing the page.
772	* @page: The page within the folio.
773	* @vma: The VMA we need to know the address in.
774	*
775	* Calculates the user virtual address of this page in the specified VMA.
776	* It is the caller's responsibility to check the page is actually
777	* within the VMA. There may not currently be a PTE pointing at this
778	* page, but if a page fault occurs at this address, this is the page
779	* which will be accessed.
780	*
781	* Context: Caller should hold a reference to the folio. Caller should
782	* hold a lock (eg the i_mmap_lock or the mmap_lock) which keeps the
783	* VMA from being altered.
784	*
785	* Return: The virtual address corresponding to this page in the VMA.
786	*/
787	unsigned long page_address_in_vma(const struct folio *folio,
788	const struct page page, const* struct vm_area_struct *vma)
789	{
790	if (folio_test_anon(folio)) {
791	struct anon_vma *anon_vma = folio_anon_vma(folio);
792	/*
793	* Note: swapoff's unuse_vma() is more efficient with this
794	* check, and needs it to match anon_vma when KSM is active.
795	*/
796	if (!vma->anon_vma \|\| !anon_vma \|\|
797	vma->anon_vma->root != anon_vma->root)
798	return -EFAULT;
799	} else if (!vma->vm_file) {
800	return -EFAULT;
801	} else if (vma->vm_file->f_mapping != folio->mapping) {
802	return -EFAULT;
803	}
804
805	/ KSM folios don't reach here because of the !anon_vma check /
806	return vma_address(vma, pgoff: page_pgoff(folio, page), nr_pages: `1`);
807	}
808
809	/*
810	* Returns the actual pmd_t* where we expect 'address' to be mapped from, or
811	* NULL if it doesn't exist. No guarantees / checks on what the pmd_t*
812	* represents.
813	*/
814	pmd_t mm_find_pmd(struct* mm_struct mm, unsigned* long address)
815	{
816	pgd_t *pgd;
817	p4d_t *p4d;
818	pud_t *pud;
819	pmd_t *pmd = NULL;
820
821	pgd = pgd_offset(mm, address);
822	if (!pgd_present(pgd: *pgd))
823	goto out;
824
825	p4d = p4d_offset(pgd, address);
826	if (!p4d_present(p4d: *p4d))
827	goto out;
828
829	pud = pud_offset(p4d, address);
830	if (!pud_present(pud: *pud))
831	goto out;
832
833	pmd = pmd_offset(pud, address);
834	out:
835	return pmd;
836	}
837
838	struct folio_referenced_arg {
839	int mapcount;
840	int referenced;
841	vm_flags_t vm_flags;
842	struct mem_cgroup *memcg;
843	};
844
845	/*
846	* arg: folio_referenced_arg will be passed
847	*/
848	static bool folio_referenced_one(struct folio *folio,
849	struct vm_area_struct vma, unsigned* long address, void *arg)
850	{
851	struct folio_referenced_arg *pra = arg;
852	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, `0`);
853	int ptes = `0`, referenced = `0`;
854
855	while (page_vma_mapped_walk(pvmw: &pvmw)) {
856	address = pvmw.address;
857
858	if (vma->vm_flags & VM_LOCKED) {
859	ptes++;
860	pra->mapcount--;
861
862	/ Only mlock fully mapped pages /
863	if (pvmw.pte && ptes != pvmw.nr_pages)
864	continue;
865
866	/*
867	* All PTEs must be protected by page table lock in
868	* order to mlock the page.
869	*
870	* If page table boundary has been cross, current ptl
871	* only protect part of ptes.
872	*/
873	if (pvmw.flags & PVMW_PGTABLE_CROSSED)
874	continue;
875
876	/ Restore the mlock which got missed /
877	mlock_vma_folio(folio, vma);
878	page_vma_mapped_walk_done(pvmw: &pvmw);
879	pra->vm_flags \|= VM_LOCKED;
880	return false; / To break the loop /
881	}
882
883	/*
884	* Skip the non-shared swapbacked folio mapped solely by
885	* the exiting or OOM-reaped process. This avoids redundant
886	* swap-out followed by an immediate unmap.
887	*/
888	if ((!atomic_read(v: &vma->vm_mm->mm_users) \|\|
889	check_stable_address_space(mm: vma->vm_mm)) &&
890	folio_test_anon(folio) && folio_test_swapbacked(folio) &&
891	!folio_maybe_mapped_shared(folio)) {
892	pra->referenced = -`1`;
893	page_vma_mapped_walk_done(pvmw: &pvmw);
894	return false;
895	}
896
897	if (lru_gen_enabled() && pvmw.pte) {
898	if (lru_gen_look_around(pvmw: &pvmw))
899	referenced++;
900	} else if (pvmw.pte) {
901	if (ptep_clear_flush_young_notify(vma, address,
902	pvmw.pte))
903	referenced++;
904	} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
905	if (pmdp_clear_flush_young_notify(vma, address,
906	pvmw.pmd))
907	referenced++;
908	} else {
909	/ unexpected pmd-mapped folio? /
910	WARN_ON_ONCE(`1`);
911	}
912
913	pra->mapcount--;
914	}
915
916	if (referenced)
917	folio_clear_idle(folio);
918	if (folio_test_clear_young(folio))
919	referenced++;
920
921	if (referenced) {
922	pra->referenced++;
923	pra->vm_flags \|= vma->vm_flags & ~VM_LOCKED;
924	}
925
926	if (!pra->mapcount)
927	return false; / To break the loop /
928
929	return true;
930	}
931
932	static bool invalid_folio_referenced_vma(struct vm_area_struct vma, void* *arg)
933	{
934	struct folio_referenced_arg *pra = arg;
935	struct mem_cgroup *memcg = pra->memcg;
936
937	/*
938	* Ignore references from this mapping if it has no recency. If the
939	* folio has been used in another mapping, we will catch it; if this
940	* other mapping is already gone, the unmap path will have set the
941	* referenced flag or activated the folio in zap_pte_range().
942	*/
943	if (!vma_has_recency(vma))
944	return true;
945
946	/*
947	* If we are reclaiming on behalf of a cgroup, skip counting on behalf
948	* of references from different cgroups.
949	*/
950	if (memcg && !mm_match_cgroup(mm: vma->vm_mm, memcg))
951	return true;
952
953	return false;
954	}
955
956	/**
957	* folio_referenced() - Test if the folio was referenced.
958	* @folio: The folio to test.
959	* @is_locked: Caller holds lock on the folio.
960	* @memcg: target memory cgroup
961	* @vm_flags: A combination of all the vma->vm_flags which referenced the folio.
962	*
963	* Quick test_and_clear_referenced for all mappings of a folio,
964	*
965	* Return: The number of mappings which referenced the folio. Return -1 if
966	* the function bailed out due to rmap lock contention.
967	*/
968	int folio_referenced(struct folio folio, int* is_locked,
969	struct mem_cgroup memcg, vm_flags_t vm_flags)
970	{
971	bool we_locked = false;
972	struct folio_referenced_arg pra = {
973	.mapcount = folio_mapcount(folio),
974	.memcg = memcg,
975	};
976	struct rmap_walk_control rwc = {
977	.rmap_one = folio_referenced_one,
978	.arg = (void *)&pra,
979	.anon_lock = folio_lock_anon_vma_read,
980	.try_lock = true,
981	.invalid_vma = invalid_folio_referenced_vma,
982	};
983
984	*vm_flags = `0`;
985	if (!pra.mapcount)
986	return `0`;
987
988	if (!folio_raw_mapping(folio))
989	return `0`;
990
991	if (!is_locked && (!folio_test_anon(folio) \|\| folio_test_ksm(folio))) {
992	we_locked = folio_trylock(folio);
993	if (!we_locked)
994	return `1`;
995	}
996
997	rmap_walk(folio, rwc: &rwc);
998	*vm_flags = pra.vm_flags;
999
1000	if (we_locked)
1001	folio_unlock(folio);
1002
1003	return rwc.contended ? -`1` : pra.referenced;
1004	}
1005
1006	static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
1007	{
1008	int cleaned = `0`;
1009	struct vm_area_struct *vma = pvmw->vma;
1010	struct mmu_notifier_range range;
1011	unsigned long address = pvmw->address;
1012
1013	/*
1014	* We have to assume the worse case ie pmd for invalidation. Note that
1015	* the folio can not be freed from this function.
1016	*/
1017	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_PROTECTION_PAGE, flags: `0`,
1018	mm: vma->vm_mm, start: address, end: vma_address_end(pvmw));
1019	mmu_notifier_invalidate_range_start(range: &range);
1020
1021	while (page_vma_mapped_walk(pvmw)) {
1022	int ret = `0`;
1023
1024	address = pvmw->address;
1025	if (pvmw->pte) {
1026	pte_t *pte = pvmw->pte;
1027	pte_t entry = ptep_get(ptep: pte);
1028
1029	/*
1030	* PFN swap PTEs, such as device-exclusive ones, that
1031	* actually map pages are clean and not writable from a
1032	* CPU perspective. The MMU notifier takes care of any
1033	* device aspects.
1034	*/
1035	if (!pte_present(a: entry))
1036	continue;
1037	if (!pte_dirty(pte: entry) && !pte_write(pte: entry))
1038	continue;
1039
1040	flush_cache_page(vma, vmaddr: address, pfn: pte_pfn(pte: entry));
1041	entry = ptep_clear_flush(vma, address, ptep: pte);
1042	entry = pte_wrprotect(pte: entry);
1043	entry = pte_mkclean(pte: entry);
1044	set_pte_at(vma->vm_mm, address, pte, entry);
1045	ret = `1`;
1046	} else {
1047	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1048	pmd_t *pmd = pvmw->pmd;
1049	pmd_t entry;
1050
1051	if (!pmd_dirty(pmd) && !pmd_write(pmd))
1052	continue;
1053
1054	flush_cache_range(vma, address,
1055	address + HPAGE_PMD_SIZE);
1056	entry = pmdp_invalidate(vma, address, pmd);
1057	entry = pmd_wrprotect(entry);
1058	entry = pmd_mkclean(entry);
1059	set_pmd_at(vma->vm_mm, address, pmd, entry);
1060	ret = `1`;
1061	#else
1062	/ unexpected pmd-mapped folio? /
1063	WARN_ON_ONCE(`1`);
1064	#endif
1065	}
1066
1067	if (ret)
1068	cleaned++;
1069	}
1070
1071	mmu_notifier_invalidate_range_end(range: &range);
1072
1073	return cleaned;
1074	}
1075
1076	static bool page_mkclean_one(struct folio folio, struct* vm_area_struct *vma,
1077	unsigned long address, void *arg)
1078	{
1079	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
1080	int *cleaned = arg;
1081
1082	*cleaned += page_vma_mkclean_one(pvmw: &pvmw);
1083
1084	return true;
1085	}
1086
1087	static bool invalid_mkclean_vma(struct vm_area_struct vma, void* *arg)
1088	{
1089	if (vma->vm_flags & VM_SHARED)
1090	return false;
1091
1092	return true;
1093	}
1094
1095	int folio_mkclean(struct folio *folio)
1096	{
1097	int cleaned = `0`;
1098	struct address_space *mapping;
1099	struct rmap_walk_control rwc = {
1100	.arg = (void *)&cleaned,
1101	.rmap_one = page_mkclean_one,
1102	.invalid_vma = invalid_mkclean_vma,
1103	};
1104
1105	BUG_ON(!folio_test_locked(folio));
1106
1107	if (!folio_mapped(folio))
1108	return `0`;
1109
1110	mapping = folio_mapping(folio);
1111	if (!mapping)
1112	return `0`;
1113
1114	rmap_walk(folio, rwc: &rwc);
1115
1116	return cleaned;
1117	}
1118	EXPORT_SYMBOL_GPL(folio_mkclean);
1119
1120	struct wrprotect_file_state {
1121	int cleaned;
1122	pgoff_t pgoff;
1123	unsigned long pfn;
1124	unsigned long nr_pages;
1125	};
1126
1127	static bool mapping_wrprotect_range_one(struct folio *folio,
1128	struct vm_area_struct vma, unsigned* long address, void *arg)
1129	{
1130	struct wrprotect_file_state state = (struct* wrprotect_file_state *)arg;
1131	struct page_vma_mapped_walk pvmw = {
1132	.pfn = state->pfn,
1133	.nr_pages = state->nr_pages,
1134	.pgoff = state->pgoff,
1135	.vma = vma,
1136	.address = address,
1137	.flags = PVMW_SYNC,
1138	};
1139
1140	state->cleaned += page_vma_mkclean_one(pvmw: &pvmw);
1141
1142	return true;
1143	}
1144
1145	static void __rmap_walk_file(struct folio folio, struct* address_space *mapping,
1146	pgoff_t pgoff_start, unsigned long nr_pages,
1147	struct rmap_walk_control *rwc, bool locked);
1148
1149	/**
1150	* mapping_wrprotect_range() - Write-protect all mappings in a specified range.
1151	*
1152	* @mapping: The mapping whose reverse mapping should be traversed.
1153	* @pgoff: The page offset at which @pfn is mapped within @mapping.
1154	* @pfn: The PFN of the page mapped in @mapping at @pgoff.
1155	* @nr_pages: The number of physically contiguous base pages spanned.
1156	*
1157	* Traverses the reverse mapping, finding all VMAs which contain a shared
1158	* mapping of the pages in the specified range in @mapping, and write-protects
1159	* them (that is, updates the page tables to mark the mappings read-only such
1160	* that a write protection fault arises when the mappings are written to).
1161	*
1162	* The @pfn value need not refer to a folio, but rather can reference a kernel
1163	* allocation which is mapped into userland. We therefore do not require that
1164	* the page maps to a folio with a valid mapping or index field, rather the
1165	* caller specifies these in @mapping and @pgoff.
1166	*
1167	* Return: the number of write-protected PTEs, or an error.
1168	*/
1169	int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
1170	unsigned long pfn, unsigned long nr_pages)
1171	{
1172	struct wrprotect_file_state state = {
1173	.cleaned = `0`,
1174	.pgoff = pgoff,
1175	.pfn = pfn,
1176	.nr_pages = nr_pages,
1177	};
1178	struct rmap_walk_control rwc = {
1179	.arg = (void *)&state,
1180	.rmap_one = mapping_wrprotect_range_one,
1181	.invalid_vma = invalid_mkclean_vma,
1182	};
1183
1184	if (!mapping)
1185	return `0`;
1186
1187	__rmap_walk_file(/ folio = /NULL, mapping, pgoff_start: pgoff, nr_pages, rwc: &rwc,
1188	/ locked = /false);
1189
1190	return state.cleaned;
1191	}
1192	EXPORT_SYMBOL_GPL(mapping_wrprotect_range);
1193
1194	/**
1195	* pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of
1196	* [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff)
1197	* within the @vma of shared mappings. And since clean PTEs
1198	* should also be readonly, write protects them too.
1199	* @pfn: start pfn.
1200	* @nr_pages: number of physically contiguous pages srarting with @pfn.
1201	* @pgoff: page offset that the @pfn mapped with.
1202	* @vma: vma that @pfn mapped within.
1203	*
1204	* Returns the number of cleaned PTEs (including PMDs).
1205	*/
1206	int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
1207	struct vm_area_struct *vma)
1208	{
1209	struct page_vma_mapped_walk pvmw = {
1210	.pfn = pfn,
1211	.nr_pages = nr_pages,
1212	.pgoff = pgoff,
1213	.vma = vma,
1214	.flags = PVMW_SYNC,
1215	};
1216
1217	if (invalid_mkclean_vma(vma, NULL))
1218	return `0`;
1219
1220	pvmw.address = vma_address(vma, pgoff, nr_pages);
1221	VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma);
1222
1223	return page_vma_mkclean_one(pvmw: &pvmw);
1224	}
1225
1226	static void __folio_mod_stat(struct folio folio, int* nr, int nr_pmdmapped)
1227	{
1228	int idx;
1229
1230	if (nr) {
1231	idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
1232	__lruvec_stat_mod_folio(folio, idx, val: nr);
1233	}
1234	if (nr_pmdmapped) {
1235	if (folio_test_anon(folio)) {
1236	idx = NR_ANON_THPS;
1237	__lruvec_stat_mod_folio(folio, idx, val: nr_pmdmapped);
1238	} else {
1239	/ NR__PMDMAPPED are not maintained per-memcg /*
1240	idx = folio_test_swapbacked(folio) ?
1241	NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED;
1242	__mod_node_page_state(folio_pgdat(folio), item: idx,
1243	nr_pmdmapped);
1244	}
1245	}
1246	}
1247
1248	static __always_inline void __folio_add_rmap(struct folio *folio,
1249	struct page page, int* nr_pages, struct vm_area_struct *vma,
1250	enum pgtable_level level)
1251	{
1252	atomic_t *mapped = &folio->_nr_pages_mapped;
1253	const int orig_nr_pages = nr_pages;
1254	int first = `0`, nr = `0`, nr_pmdmapped = `0`;
1255
1256	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
1257
1258	switch (level) {
1259	case PGTABLE_LEVEL_PTE:
1260	if (!folio_test_large(folio)) {
1261	nr = atomic_inc_and_test(v: &folio->_mapcount);
1262	break;
1263	}
1264
1265	if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
1266	nr = folio_add_return_large_mapcount(folio, diff: orig_nr_pages, vma);
1267	if (nr == orig_nr_pages)
1268	/ Was completely unmapped. /
1269	nr = folio_large_nr_pages(folio);
1270	else
1271	nr = `0`;
1272	break;
1273	}
1274
1275	do {
1276	first += atomic_inc_and_test(v: &page->_mapcount);
1277	} while (page++, --nr_pages > `0`);
1278
1279	if (first &&
1280	atomic_add_return_relaxed(i: first, v: mapped) < ENTIRELY_MAPPED)
1281	nr = first;
1282
1283	folio_add_large_mapcount(folio, diff: orig_nr_pages, vma);
1284	break;
1285	case PGTABLE_LEVEL_PMD:
1286	case PGTABLE_LEVEL_PUD:
1287	first = atomic_inc_and_test(v: &folio->_entire_mapcount);
1288	if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
1289	if (level == PGTABLE_LEVEL_PMD && first)
1290	nr_pmdmapped = folio_large_nr_pages(folio);
1291	nr = folio_inc_return_large_mapcount(folio, vma);
1292	if (nr == `1`)
1293	/ Was completely unmapped. /
1294	nr = folio_large_nr_pages(folio);
1295	else
1296	nr = `0`;
1297	break;
1298	}
1299
1300	if (first) {
1301	nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, v: mapped);
1302	if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) {
1303	nr_pages = folio_large_nr_pages(folio);
1304	/*
1305	* We only track PMD mappings of PMD-sized
1306	* folios separately.
1307	*/
1308	if (level == PGTABLE_LEVEL_PMD)
1309	nr_pmdmapped = nr_pages;
1310	nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
1311	/ Raced ahead of a remove and another add? /
1312	if (unlikely(nr < `0`))
1313	nr = `0`;
1314	} else {
1315	/ Raced ahead of a remove of ENTIRELY_MAPPED /
1316	nr = `0`;
1317	}
1318	}
1319	folio_inc_large_mapcount(folio, vma);
1320	break;
1321	default:
1322	BUILD_BUG();
1323	}
1324	__folio_mod_stat(folio, nr, nr_pmdmapped);
1325	}
1326
1327	/**
1328	* folio_move_anon_rmap - move a folio to our anon_vma
1329	* @folio: The folio to move to our anon_vma
1330	* @vma: The vma the folio belongs to
1331	*
1332	* When a folio belongs exclusively to one process after a COW event,
1333	* that folio can be moved into the anon_vma that belongs to just that
1334	* process, so the rmap code will not search the parent or sibling processes.
1335	*/
1336	void folio_move_anon_rmap(struct folio folio, struct* vm_area_struct *vma)
1337	{
1338	void *anon_vma = vma->anon_vma;
1339
1340	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
1341	VM_BUG_ON_VMA(!anon_vma, vma);
1342
1343	anon_vma += FOLIO_MAPPING_ANON;
1344	/*
1345	* Ensure that anon_vma and the FOLIO_MAPPING_ANON bit are written
1346	* simultaneously, so a concurrent reader (eg folio_referenced()'s
1347	* folio_test_anon()) will not see one without the other.
1348	*/
1349	WRITE_ONCE(folio->mapping, anon_vma);
1350	}
1351
1352	/**
1353	* __folio_set_anon - set up a new anonymous rmap for a folio
1354	* @folio: The folio to set up the new anonymous rmap for.
1355	* @vma: VM area to add the folio to.
1356	* @address: User virtual address of the mapping
1357	* @exclusive: Whether the folio is exclusive to the process.
1358	*/
1359	static void __folio_set_anon(struct folio folio, struct* vm_area_struct *vma,
1360	unsigned long address, bool exclusive)
1361	{
1362	struct anon_vma *anon_vma = vma->anon_vma;
1363
1364	BUG_ON(!anon_vma);
1365
1366	/*
1367	* If the folio isn't exclusive to this vma, we must use the _oldest_
1368	* possible anon_vma for the folio mapping!
1369	*/
1370	if (!exclusive)
1371	anon_vma = anon_vma->root;
1372
1373	/*
1374	* page_idle does a lockless/optimistic rmap scan on folio->mapping.
1375	* Make sure the compiler doesn't split the stores of anon_vma and
1376	* the FOLIO_MAPPING_ANON type identifier, otherwise the rmap code
1377	* could mistake the mapping for a struct address_space and crash.
1378	*/
1379	anon_vma = (void *) anon_vma + FOLIO_MAPPING_ANON;
1380	WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma);
1381	folio->index = linear_page_index(vma, address);
1382	}
1383
1384	/**
1385	* __page_check_anon_rmap - sanity check anonymous rmap addition
1386	* @folio: The folio containing @page.
1387	* @page: the page to check the mapping of
1388	* @vma: the vm area in which the mapping is added
1389	* @address: the user virtual address mapped
1390	*/
1391	static void __page_check_anon_rmap(const struct folio *folio,
1392	const struct page page, struct* vm_area_struct *vma,
1393	unsigned long address)
1394	{
1395	/*
1396	* The page's anon-rmap details (mapping and index) are guaranteed to
1397	* be set up correctly at this point.
1398	*
1399	* We have exclusion against folio_add_anon_rmap_*() because the caller
1400	* always holds the page locked.
1401	*
1402	* We have exclusion against folio_add_new_anon_rmap because those pages
1403	* are initially only visible via the pagetables, and the pte is locked
1404	* over the call to folio_add_new_anon_rmap.
1405	*/
1406	VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
1407	folio);
1408	VM_BUG_ON_PAGE(page_pgoff(folio, page) != linear_page_index(vma, address),
1409	page);
1410	}
1411
1412	static __always_inline void __folio_add_anon_rmap(struct folio *folio,
1413	struct page page, int* nr_pages, struct vm_area_struct *vma,
1414	unsigned long address, rmap_t flags, enum pgtable_level level)
1415	{
1416	int i;
1417
1418	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
1419
1420	__folio_add_rmap(folio, page, nr_pages, vma, level);
1421
1422	if (likely(!folio_test_ksm(folio)))
1423	__page_check_anon_rmap(folio, page, vma, address);
1424
1425	if (flags & RMAP_EXCLUSIVE) {
1426	switch (level) {
1427	case PGTABLE_LEVEL_PTE:
1428	for (i = `0`; i < nr_pages; i++)
1429	SetPageAnonExclusive(page + i);
1430	break;
1431	case PGTABLE_LEVEL_PMD:
1432	SetPageAnonExclusive(page);
1433	break;
1434	case PGTABLE_LEVEL_PUD:
1435	/*
1436	* Keep the compiler happy, we don't support anonymous
1437	* PUD mappings.
1438	*/
1439	WARN_ON_ONCE(`1`);
1440	break;
1441	default:
1442	BUILD_BUG();
1443	}
1444	}
1445
1446	VM_WARN_ON_FOLIO(!folio_test_large(folio) && PageAnonExclusive(page) &&
1447	atomic_read(&folio->_mapcount) > `0`, folio);
1448	for (i = `0`; i < nr_pages; i++) {
1449	struct page *cur_page = page + i;
1450
1451	VM_WARN_ON_FOLIO(folio_test_large(folio) &&
1452	folio_entire_mapcount(folio) > `1` &&
1453	PageAnonExclusive(cur_page), folio);
1454	if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT))
1455	continue;
1456
1457	/*
1458	* While PTE-mapping a THP we have a PMD and a PTE
1459	* mapping.
1460	*/
1461	VM_WARN_ON_FOLIO(atomic_read(&cur_page->_mapcount) > `0` &&
1462	PageAnonExclusive(cur_page), folio);
1463	}
1464
1465	/*
1466	* Only mlock it if the folio is fully mapped to the VMA.
1467	*
1468	* Partially mapped folios can be split on reclaim and part outside
1469	* of mlocked VMA can be evicted or freed.
1470	*/
1471	if (folio_nr_pages(folio) == nr_pages)
1472	mlock_vma_folio(folio, vma);
1473	}
1474
1475	/**
1476	* folio_add_anon_rmap_ptes - add PTE mappings to a page range of an anon folio
1477	* @folio: The folio to add the mappings to
1478	* @page: The first page to add
1479	* @nr_pages: The number of pages which will be mapped
1480	* @vma: The vm area in which the mappings are added
1481	* @address: The user virtual address of the first page to map
1482	* @flags: The rmap flags
1483	*
1484	* The page range of folio is defined by [first_page, first_page + nr_pages)
1485	*
1486	* The caller needs to hold the page table lock, and the page must be locked in
1487	* the anon_vma case: to serialize mapping,index checking after setting,
1488	* and to ensure that an anon folio is not being upgraded racily to a KSM folio
1489	* (but KSM folios are never downgraded).
1490	*/
1491	void folio_add_anon_rmap_ptes(struct folio folio, struct* page *page,
1492	int nr_pages, struct vm_area_struct vma, unsigned* long address,
1493	rmap_t flags)
1494	{
1495	__folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags,
1496	level: PGTABLE_LEVEL_PTE);
1497	}
1498
1499	/**
1500	* folio_add_anon_rmap_pmd - add a PMD mapping to a page range of an anon folio
1501	* @folio: The folio to add the mapping to
1502	* @page: The first page to add
1503	* @vma: The vm area in which the mapping is added
1504	* @address: The user virtual address of the first page to map
1505	* @flags: The rmap flags
1506	*
1507	* The page range of folio is defined by [first_page, first_page + HPAGE_PMD_NR)
1508	*
1509	* The caller needs to hold the page table lock, and the page must be locked in
1510	* the anon_vma case: to serialize mapping,index checking after setting.
1511	*/
1512	void folio_add_anon_rmap_pmd(struct folio folio, struct* page *page,
1513	struct vm_area_struct vma, unsigned* long address, rmap_t flags)
1514	{
1515	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1516	__folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags,
1517	PGTABLE_LEVEL_PMD);
1518	#else
1519	WARN_ON_ONCE(true);
1520	#endif
1521	}
1522
1523	/**
1524	* folio_add_new_anon_rmap - Add mapping to a new anonymous folio.
1525	* @folio: The folio to add the mapping to.
1526	* @vma: the vm area in which the mapping is added
1527	* @address: the user virtual address mapped
1528	* @flags: The rmap flags
1529	*
1530	* Like folio_add_anon_rmap_() but must only be called on new* folios.
1531	* This means the inc-and-test can be bypassed.
1532	* The folio doesn't necessarily need to be locked while it's exclusive
1533	* unless two threads map it concurrently. However, the folio must be
1534	* locked if it's shared.
1535	*
1536	* If the folio is pmd-mappable, it is accounted as a THP.
1537	*/
1538	void folio_add_new_anon_rmap(struct folio folio, struct* vm_area_struct *vma,
1539	unsigned long address, rmap_t flags)
1540	{
1541	const bool exclusive = flags & RMAP_EXCLUSIVE;
1542	int nr = `1`, nr_pmdmapped = `0`;
1543
1544	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
1545	VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio);
1546
1547	/*
1548	* VM_DROPPABLE mappings don't swap; instead they're just dropped when
1549	* under memory pressure.
1550	*/
1551	if (!folio_test_swapbacked(folio) && !(vma->vm_flags & VM_DROPPABLE))
1552	__folio_set_swapbacked(folio);
1553	__folio_set_anon(folio, vma, address, exclusive);
1554
1555	if (likely(!folio_test_large(folio))) {
1556	/ increment count (starts at -1) /
1557	atomic_set(v: &folio->_mapcount, i: `0`);
1558	if (exclusive)
1559	SetPageAnonExclusive(&folio->page);
1560	} else if (!folio_test_pmd_mappable(folio)) {
1561	int i;
1562
1563	nr = folio_large_nr_pages(folio);
1564	for (i = `0`; i < nr; i++) {
1565	struct page *page = folio_page(folio, i);
1566
1567	if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
1568	/ increment count (starts at -1) /
1569	atomic_set(v: &page->_mapcount, i: `0`);
1570	if (exclusive)
1571	SetPageAnonExclusive(page);
1572	}
1573
1574	folio_set_large_mapcount(folio, mapcount: nr, vma);
1575	if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
1576	atomic_set(v: &folio->_nr_pages_mapped, i: nr);
1577	} else {
1578	nr = folio_large_nr_pages(folio);
1579	/ increment count (starts at -1) /
1580	atomic_set(v: &folio->_entire_mapcount, i: `0`);
1581	folio_set_large_mapcount(folio, mapcount: `1`, vma);
1582	if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
1583	atomic_set(v: &folio->_nr_pages_mapped, ENTIRELY_MAPPED);
1584	if (exclusive)
1585	SetPageAnonExclusive(&folio->page);
1586	nr_pmdmapped = nr;
1587	}
1588
1589	VM_WARN_ON_ONCE(address < vma->vm_start \|\|
1590	address + (nr << PAGE_SHIFT) > vma->vm_end);
1591
1592	__folio_mod_stat(folio, nr, nr_pmdmapped);
1593	mod_mthp_stat(order: folio_order(folio), item: MTHP_STAT_NR_ANON, delta: `1`);
1594	}
1595
1596	static __always_inline void __folio_add_file_rmap(struct folio *folio,
1597	struct page page, int* nr_pages, struct vm_area_struct *vma,
1598	enum pgtable_level level)
1599	{
1600	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
1601
1602	__folio_add_rmap(folio, page, nr_pages, vma, level);
1603
1604	/*
1605	* Only mlock it if the folio is fully mapped to the VMA.
1606	*
1607	* Partially mapped folios can be split on reclaim and part outside
1608	* of mlocked VMA can be evicted or freed.
1609	*/
1610	if (folio_nr_pages(folio) == nr_pages)
1611	mlock_vma_folio(folio, vma);
1612	}
1613
1614	/**
1615	* folio_add_file_rmap_ptes - add PTE mappings to a page range of a folio
1616	* @folio: The folio to add the mappings to
1617	* @page: The first page to add
1618	* @nr_pages: The number of pages that will be mapped using PTEs
1619	* @vma: The vm area in which the mappings are added
1620	*
1621	* The page range of the folio is defined by [page, page + nr_pages)
1622	*
1623	* The caller needs to hold the page table lock.
1624	*/
1625	void folio_add_file_rmap_ptes(struct folio folio, struct* page *page,
1626	int nr_pages, struct vm_area_struct *vma)
1627	{
1628	__folio_add_file_rmap(folio, page, nr_pages, vma, level: PGTABLE_LEVEL_PTE);
1629	}
1630
1631	/**
1632	* folio_add_file_rmap_pmd - add a PMD mapping to a page range of a folio
1633	* @folio: The folio to add the mapping to
1634	* @page: The first page to add
1635	* @vma: The vm area in which the mapping is added
1636	*
1637	* The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
1638	*
1639	* The caller needs to hold the page table lock.
1640	*/
1641	void folio_add_file_rmap_pmd(struct folio folio, struct* page *page,
1642	struct vm_area_struct *vma)
1643	{
1644	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1645	__folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD);
1646	#else
1647	WARN_ON_ONCE(true);
1648	#endif
1649	}
1650
1651	/**
1652	* folio_add_file_rmap_pud - add a PUD mapping to a page range of a folio
1653	* @folio: The folio to add the mapping to
1654	* @page: The first page to add
1655	* @vma: The vm area in which the mapping is added
1656	*
1657	* The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
1658	*
1659	* The caller needs to hold the page table lock.
1660	*/
1661	void folio_add_file_rmap_pud(struct folio folio, struct* page *page,
1662	struct vm_area_struct *vma)
1663	{
1664	#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
1665	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
1666	__folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD);
1667	#else
1668	WARN_ON_ONCE(true);
1669	#endif
1670	}
1671
1672	static __always_inline void __folio_remove_rmap(struct folio *folio,
1673	struct page page, int* nr_pages, struct vm_area_struct *vma,
1674	enum pgtable_level level)
1675	{
1676	atomic_t *mapped = &folio->_nr_pages_mapped;
1677	int last = `0`, nr = `0`, nr_pmdmapped = `0`;
1678	bool partially_mapped = false;
1679
1680	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
1681
1682	switch (level) {
1683	case PGTABLE_LEVEL_PTE:
1684	if (!folio_test_large(folio)) {
1685	nr = atomic_add_negative(i: -`1`, v: &folio->_mapcount);
1686	break;
1687	}
1688
1689	if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
1690	nr = folio_sub_return_large_mapcount(folio, diff: nr_pages, vma);
1691	if (!nr) {
1692	/ Now completely unmapped. /
1693	nr = folio_large_nr_pages(folio);
1694	} else {
1695	partially_mapped = nr < folio_large_nr_pages(folio) &&
1696	!folio_entire_mapcount(folio);
1697	nr = `0`;
1698	}
1699	break;
1700	}
1701
1702	folio_sub_large_mapcount(folio, diff: nr_pages, vma);
1703	do {
1704	last += atomic_add_negative(i: -`1`, v: &page->_mapcount);
1705	} while (page++, --nr_pages > `0`);
1706
1707	if (last &&
1708	atomic_sub_return_relaxed(i: last, v: mapped) < ENTIRELY_MAPPED)
1709	nr = last;
1710
1711	partially_mapped = nr && atomic_read(v: mapped);
1712	break;
1713	case PGTABLE_LEVEL_PMD:
1714	case PGTABLE_LEVEL_PUD:
1715	if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
1716	last = atomic_add_negative(i: -`1`, v: &folio->_entire_mapcount);
1717	if (level == PGTABLE_LEVEL_PMD && last)
1718	nr_pmdmapped = folio_large_nr_pages(folio);
1719	nr = folio_dec_return_large_mapcount(folio, vma);
1720	if (!nr) {
1721	/ Now completely unmapped. /
1722	nr = folio_large_nr_pages(folio);
1723	} else {
1724	partially_mapped = last &&
1725	nr < folio_large_nr_pages(folio);
1726	nr = `0`;
1727	}
1728	break;
1729	}
1730
1731	folio_dec_large_mapcount(folio, vma);
1732	last = atomic_add_negative(i: -`1`, v: &folio->_entire_mapcount);
1733	if (last) {
1734	nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, v: mapped);
1735	if (likely(nr < ENTIRELY_MAPPED)) {
1736	nr_pages = folio_large_nr_pages(folio);
1737	if (level == PGTABLE_LEVEL_PMD)
1738	nr_pmdmapped = nr_pages;
1739	nr = nr_pages - nr;
1740	/ Raced ahead of another remove and an add? /
1741	if (unlikely(nr < `0`))
1742	nr = `0`;
1743	} else {
1744	/ An add of ENTIRELY_MAPPED raced ahead /
1745	nr = `0`;
1746	}
1747	}
1748
1749	partially_mapped = nr && nr < nr_pmdmapped;
1750	break;
1751	default:
1752	BUILD_BUG();
1753	}
1754
1755	/*
1756	* Queue anon large folio for deferred split if at least one page of
1757	* the folio is unmapped and at least one page is still mapped.
1758	*
1759	* Check partially_mapped first to ensure it is a large folio.
1760	*/
1761	if (partially_mapped && folio_test_anon(folio) &&
1762	!folio_test_partially_mapped(folio))
1763	deferred_split_folio(folio, partially_mapped: true);
1764
1765	__folio_mod_stat(folio, nr: -nr, nr_pmdmapped: -nr_pmdmapped);
1766
1767	/*
1768	* It would be tidy to reset folio_test_anon mapping when fully
1769	* unmapped, but that might overwrite a racing folio_add_anon_rmap_*()
1770	* which increments mapcount after us but sets mapping before us:
1771	* so leave the reset to free_pages_prepare, and remember that
1772	* it's only reliable while mapped.
1773	*/
1774
1775	munlock_vma_folio(folio, vma);
1776	}
1777
1778	/**
1779	* folio_remove_rmap_ptes - remove PTE mappings from a page range of a folio
1780	* @folio: The folio to remove the mappings from
1781	* @page: The first page to remove
1782	* @nr_pages: The number of pages that will be removed from the mapping
1783	* @vma: The vm area from which the mappings are removed
1784	*
1785	* The page range of the folio is defined by [page, page + nr_pages)
1786	*
1787	* The caller needs to hold the page table lock.
1788	*/
1789	void folio_remove_rmap_ptes(struct folio folio, struct* page *page,
1790	int nr_pages, struct vm_area_struct *vma)
1791	{
1792	__folio_remove_rmap(folio, page, nr_pages, vma, level: PGTABLE_LEVEL_PTE);
1793	}
1794
1795	/**
1796	* folio_remove_rmap_pmd - remove a PMD mapping from a page range of a folio
1797	* @folio: The folio to remove the mapping from
1798	* @page: The first page to remove
1799	* @vma: The vm area from which the mapping is removed
1800	*
1801	* The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
1802	*
1803	* The caller needs to hold the page table lock.
1804	*/
1805	void folio_remove_rmap_pmd(struct folio folio, struct* page *page,
1806	struct vm_area_struct *vma)
1807	{
1808	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1809	__folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD);
1810	#else
1811	WARN_ON_ONCE(true);
1812	#endif
1813	}
1814
1815	/**
1816	* folio_remove_rmap_pud - remove a PUD mapping from a page range of a folio
1817	* @folio: The folio to remove the mapping from
1818	* @page: The first page to remove
1819	* @vma: The vm area from which the mapping is removed
1820	*
1821	* The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
1822	*
1823	* The caller needs to hold the page table lock.
1824	*/
1825	void folio_remove_rmap_pud(struct folio folio, struct* page *page,
1826	struct vm_area_struct *vma)
1827	{
1828	#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
1829	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
1830	__folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD);
1831	#else
1832	WARN_ON_ONCE(true);
1833	#endif
1834	}
1835
1836	static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
1837	struct page_vma_mapped_walk *pvmw,
1838	enum ttu_flags flags, pte_t pte)
1839	{
1840	unsigned long end_addr, addr = pvmw->address;
1841	struct vm_area_struct *vma = pvmw->vma;
1842	unsigned int max_nr;
1843
1844	if (flags & TTU_HWPOISON)
1845	return `1`;
1846	if (!folio_test_large(folio))
1847	return `1`;
1848
1849	/ We may only batch within a single VMA and a single page table. /
1850	end_addr = pmd_addr_end(addr, vma->vm_end);
1851	max_nr = (end_addr - addr) >> PAGE_SHIFT;
1852
1853	/ We only support lazyfree batching for now ... /
1854	if (!folio_test_anon(folio) \|\| folio_test_swapbacked(folio))
1855	return `1`;
1856	if (pte_unused(pte))
1857	return `1`;
1858
1859	return folio_pte_batch(folio, ptep: pvmw->pte, pte, max_nr);
1860	}
1861
1862	/*
1863	* @arg: enum ttu_flags will be passed to this argument
1864	*/
1865	static bool try_to_unmap_one(struct folio folio, struct* vm_area_struct *vma,
1866	unsigned long address, void *arg)
1867	{
1868	struct mm_struct *mm = vma->vm_mm;
1869	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, `0`);
1870	bool anon_exclusive, ret = true;
1871	pte_t pteval;
1872	struct page *subpage;
1873	struct mmu_notifier_range range;
1874	enum ttu_flags flags = (enum ttu_flags)(long)arg;
1875	unsigned long nr_pages = `1`, end_addr;
1876	unsigned long pfn;
1877	unsigned long hsz = `0`;
1878	int ptes = `0`;
1879
1880	/*
1881	* When racing against e.g. zap_pte_range() on another cpu,
1882	* in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
1883	* try_to_unmap() may return before page_mapped() has become false,
1884	* if page table locking is skipped: use TTU_SYNC to wait for that.
1885	*/
1886	if (flags & TTU_SYNC)
1887	pvmw.flags = PVMW_SYNC;
1888
1889	/*
1890	* For THP, we have to assume the worse case ie pmd for invalidation.
1891	* For hugetlb, it could be much worse if we need to do pud
1892	* invalidation in the case of pmd sharing.
1893	*
1894	* Note that the folio can not be freed in this function as call of
1895	* try_to_unmap() must hold a reference on the folio.
1896	*/
1897	range.end = vma_address_end(pvmw: &pvmw);
1898	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm: vma->vm_mm,
1899	start: address, end: range.end);
1900	if (folio_test_hugetlb(folio)) {
1901	/*
1902	* If sharing is possible, start and end will be adjusted
1903	* accordingly.
1904	*/
1905	adjust_range_if_pmd_sharing_possible(vma, start: &range.start,
1906	end: &range.end);
1907
1908	/ We need the huge page size for set_huge_pte_at() /
1909	hsz = huge_page_size(h: hstate_vma(vma));
1910	}
1911	mmu_notifier_invalidate_range_start(range: &range);
1912
1913	while (page_vma_mapped_walk(pvmw: &pvmw)) {
1914	/*
1915	* If the folio is in an mlock()d vma, we must not swap it out.
1916	*/
1917	if (!(flags & TTU_IGNORE_MLOCK) &&
1918	(vma->vm_flags & VM_LOCKED)) {
1919	ptes++;
1920
1921	/*
1922	* Set 'ret' to indicate the page cannot be unmapped.
1923	*
1924	* Do not jump to walk_abort immediately as additional
1925	* iteration might be required to detect fully mapped
1926	* folio an mlock it.
1927	*/
1928	ret = false;
1929
1930	/ Only mlock fully mapped pages /
1931	if (pvmw.pte && ptes != pvmw.nr_pages)
1932	continue;
1933
1934	/*
1935	* All PTEs must be protected by page table lock in
1936	* order to mlock the page.
1937	*
1938	* If page table boundary has been cross, current ptl
1939	* only protect part of ptes.
1940	*/
1941	if (pvmw.flags & PVMW_PGTABLE_CROSSED)
1942	goto walk_done;
1943
1944	/ Restore the mlock which got missed /
1945	mlock_vma_folio(folio, vma);
1946	goto walk_done;
1947	}
1948
1949	if (!pvmw.pte) {
1950	if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
1951	if (unmap_huge_pmd_locked(vma, addr: pvmw.address, pmdp: pvmw.pmd, folio))
1952	goto walk_done;
1953	/*
1954	* unmap_huge_pmd_locked has either already marked
1955	* the folio as swap-backed or decided to retain it
1956	* due to GUP or speculative references.
1957	*/
1958	goto walk_abort;
1959	}
1960
1961	if (flags & TTU_SPLIT_HUGE_PMD) {
1962	/*
1963	* We temporarily have to drop the PTL and
1964	* restart so we can process the PTE-mapped THP.
1965	*/
1966	split_huge_pmd_locked(vma, address: pvmw.address,
1967	pmd: pvmw.pmd, freeze: false);
1968	flags &= ~TTU_SPLIT_HUGE_PMD;
1969	page_vma_mapped_walk_restart(pvmw: &pvmw);
1970	continue;
1971	}
1972	}
1973
1974	/ Unexpected PMD-mapped THP? /
1975	VM_BUG_ON_FOLIO(!pvmw.pte, folio);
1976
1977	/*
1978	* Handle PFN swap PTEs, such as device-exclusive ones, that
1979	* actually map pages.
1980	*/
1981	pteval = ptep_get(ptep: pvmw.pte);
1982	if (likely(pte_present(pteval))) {
1983	pfn = pte_pfn(pte: pteval);
1984	} else {
1985	pfn = swp_offset_pfn(entry: pte_to_swp_entry(pte: pteval));
1986	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
1987	}
1988
1989	subpage = folio_page(folio, pfn - folio_pfn(folio));
1990	address = pvmw.address;
1991	anon_exclusive = folio_test_anon(folio) &&
1992	PageAnonExclusive(page: subpage);
1993
1994	if (folio_test_hugetlb(folio)) {
1995	bool anon = folio_test_anon(folio);
1996
1997	/*
1998	* The try_to_unmap() is only passed a hugetlb page
1999	* in the case where the hugetlb page is poisoned.
2000	*/
2001	VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
2002	/*
2003	* huge_pmd_unshare may unmap an entire PMD page.
2004	* There is no way of knowing exactly which PMDs may
2005	* be cached for this mm, so we must flush them all.
2006	* start/end were already adjusted above to cover this
2007	* range.
2008	*/
2009	flush_cache_range(vma, start: range.start, end: range.end);
2010
2011	/*
2012	* To call huge_pmd_unshare, i_mmap_rwsem must be
2013	* held in write mode. Caller needs to explicitly
2014	* do this outside rmap routines.
2015	*
2016	* We also must hold hugetlb vma_lock in write mode.
2017	* Lock order dictates acquiring vma_lock BEFORE
2018	* i_mmap_rwsem. We can only try lock here and fail
2019	* if unsuccessful.
2020	*/
2021	if (!anon) {
2022	VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
2023	if (!hugetlb_vma_trylock_write(vma))
2024	goto walk_abort;
2025	if (huge_pmd_unshare(mm, vma, addr: address, ptep: pvmw.pte)) {
2026	hugetlb_vma_unlock_write(vma);
2027	flush_tlb_range(vma,
2028	range.start, range.end);
2029	/*
2030	* The ref count of the PMD page was
2031	* dropped which is part of the way map
2032	* counting is done for shared PMDs.
2033	* Return 'true' here. When there is
2034	* no other sharing, huge_pmd_unshare
2035	* returns false and we will unmap the
2036	* actual page and drop map count
2037	* to zero.
2038	*/
2039	goto walk_done;
2040	}
2041	hugetlb_vma_unlock_write(vma);
2042	}
2043	pteval = huge_ptep_clear_flush(vma, addr: address, ptep: pvmw.pte);
2044	if (pte_dirty(pte: pteval))
2045	folio_mark_dirty(folio);
2046	} else if (likely(pte_present(pteval))) {
2047	nr_pages = folio_unmap_pte_batch(folio, pvmw: &pvmw, flags, pte: pteval);
2048	end_addr = address + nr_pages * PAGE_SIZE;
2049	flush_cache_range(vma, start: address, end: end_addr);
2050
2051	/ Nuke the page table entry. /
2052	pteval = get_and_clear_ptes(mm, addr: address, ptep: pvmw.pte, nr: nr_pages);
2053	/*
2054	* We clear the PTE but do not flush so potentially
2055	* a remote CPU could still be writing to the folio.
2056	* If the entry was previously clean then the
2057	* architecture must guarantee that a clear->dirty
2058	* transition on a cached TLB entry is written through
2059	* and traps if the PTE is unmapped.
2060	*/
2061	if (should_defer_flush(mm, flags))
2062	set_tlb_ubc_flush_pending(mm, pteval, start: address, end: end_addr);
2063	else
2064	flush_tlb_range(vma, address, end_addr);
2065	if (pte_dirty(pte: pteval))
2066	folio_mark_dirty(folio);
2067	} else {
2068	pte_clear(mm, address, pvmw.pte);
2069	}
2070
2071	/*
2072	* Now the pte is cleared. If this pte was uffd-wp armed,
2073	* we may want to replace a none pte with a marker pte if
2074	* it's file-backed, so we don't lose the tracking info.
2075	*/
2076	pte_install_uffd_wp_if_needed(vma, addr: address, pte: pvmw.pte, pteval);
2077
2078	/ Update high watermark before we lower rss /
2079	update_hiwater_rss(mm);
2080
2081	if (PageHWPoison(page: subpage) && (flags & TTU_HWPOISON)) {
2082	pteval = swp_entry_to_pte(entry: make_hwpoison_entry(page: subpage));
2083	if (folio_test_hugetlb(folio)) {
2084	hugetlb_count_sub(l: folio_nr_pages(folio), mm);
2085	set_huge_pte_at(mm, addr: address, ptep: pvmw.pte, pte: pteval,
2086	sz: hsz);
2087	} else {
2088	dec_mm_counter(mm, member: mm_counter(folio));
2089	set_pte_at(mm, address, pvmw.pte, pteval);
2090	}
2091	} else if (likely(pte_present(pteval)) && pte_unused(pte: pteval) &&
2092	!userfaultfd_armed(vma)) {
2093	/*
2094	* The guest indicated that the page content is of no
2095	* interest anymore. Simply discard the pte, vmscan
2096	* will take care of the rest.
2097	* A future reference will then fault in a new zero
2098	* page. When userfaultfd is active, we must not drop
2099	* this page though, as its main user (postcopy
2100	* migration) will not expect userfaults on already
2101	* copied pages.
2102	*/
2103	dec_mm_counter(mm, member: mm_counter(folio));
2104	} else if (folio_test_anon(folio)) {
2105	swp_entry_t entry = page_swap_entry(page: subpage);
2106	pte_t swp_pte;
2107	/*
2108	* Store the swap location in the pte.
2109	* See handle_pte_fault() ...
2110	*/
2111	if (unlikely(folio_test_swapbacked(folio) !=
2112	folio_test_swapcache(folio))) {
2113	WARN_ON_ONCE(`1`);
2114	goto walk_abort;
2115	}
2116
2117	/ MADV_FREE page check /
2118	if (!folio_test_swapbacked(folio)) {
2119	int ref_count, map_count;
2120
2121	/*
2122	* Synchronize with gup_pte_range():
2123	* - clear PTE; barrier; read refcount
2124	* - inc refcount; barrier; read PTE
2125	*/
2126	smp_mb();
2127
2128	ref_count = folio_ref_count(folio);
2129	map_count = folio_mapcount(folio);
2130
2131	/*
2132	* Order reads for page refcount and dirty flag
2133	* (see comments in __remove_mapping()).
2134	*/
2135	smp_rmb();
2136
2137	if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
2138	/*
2139	* redirtied either using the page table or a previously
2140	* obtained GUP reference.
2141	*/
2142	set_ptes(mm, addr: address, ptep: pvmw.pte, pte: pteval, nr: nr_pages);
2143	folio_set_swapbacked(folio);
2144	goto walk_abort;
2145	} else if (ref_count != `1` + map_count) {
2146	/*
2147	* Additional reference. Could be a GUP reference or any
2148	* speculative reference. GUP users must mark the folio
2149	* dirty if there was a modification. This folio cannot be
2150	* reclaimed right now either way, so act just like nothing
2151	* happened.
2152	* We'll come back here later and detect if the folio was
2153	* dirtied when the additional reference is gone.
2154	*/
2155	set_ptes(mm, addr: address, ptep: pvmw.pte, pte: pteval, nr: nr_pages);
2156	goto walk_abort;
2157	}
2158	add_mm_counter(mm, member: MM_ANONPAGES, value: -nr_pages);
2159	goto discard;
2160	}
2161
2162	if (swap_duplicate(entry) < `0`) {
2163	set_pte_at(mm, address, pvmw.pte, pteval);
2164	goto walk_abort;
2165	}
2166
2167	/*
2168	* arch_unmap_one() is expected to be a NOP on
2169	* architectures where we could have PFN swap PTEs,
2170	* so we'll not check/care.
2171	*/
2172	if (arch_unmap_one(mm, vma, addr: address, orig_pte: pteval) < `0`) {
2173	swap_free(entry);
2174	set_pte_at(mm, address, pvmw.pte, pteval);
2175	goto walk_abort;
2176	}
2177
2178	/ See folio_try_share_anon_rmap(): clear PTE first. /
2179	if (anon_exclusive &&
2180	folio_try_share_anon_rmap_pte(folio, page: subpage)) {
2181	swap_free(entry);
2182	set_pte_at(mm, address, pvmw.pte, pteval);
2183	goto walk_abort;
2184	}
2185	if (list_empty(head: &mm->mmlist)) {
2186	spin_lock(lock: &mmlist_lock);
2187	if (list_empty(head: &mm->mmlist))
2188	list_add(new: &mm->mmlist, head: &init_mm.mmlist);
2189	spin_unlock(lock: &mmlist_lock);
2190	}
2191	dec_mm_counter(mm, member: MM_ANONPAGES);
2192	inc_mm_counter(mm, member: MM_SWAPENTS);
2193	swp_pte = swp_entry_to_pte(entry);
2194	if (anon_exclusive)
2195	swp_pte = pte_swp_mkexclusive(pte: swp_pte);
2196	if (likely(pte_present(pteval))) {
2197	if (pte_soft_dirty(pte: pteval))
2198	swp_pte = pte_swp_mksoft_dirty(pte: swp_pte);
2199	if (pte_uffd_wp(pte: pteval))
2200	swp_pte = pte_swp_mkuffd_wp(pte: swp_pte);
2201	} else {
2202	if (pte_swp_soft_dirty(pte: pteval))
2203	swp_pte = pte_swp_mksoft_dirty(pte: swp_pte);
2204	if (pte_swp_uffd_wp(pte: pteval))
2205	swp_pte = pte_swp_mkuffd_wp(pte: swp_pte);
2206	}
2207	set_pte_at(mm, address, pvmw.pte, swp_pte);
2208	} else {
2209	/*
2210	* This is a locked file-backed folio,
2211	* so it cannot be removed from the page
2212	* cache and replaced by a new folio before
2213	* mmu_notifier_invalidate_range_end, so no
2214	* concurrent thread might update its page table
2215	* to point at a new folio while a device is
2216	* still using this folio.
2217	*
2218	* See Documentation/mm/mmu_notifier.rst
2219	*/
2220	dec_mm_counter(mm, member: mm_counter_file(folio));
2221	}
2222	discard:
2223	if (unlikely(folio_test_hugetlb(folio))) {
2224	hugetlb_remove_rmap(folio);
2225	} else {
2226	folio_remove_rmap_ptes(folio, page: subpage, nr_pages, vma);
2227	}
2228	if (vma->vm_flags & VM_LOCKED)
2229	mlock_drain_local();
2230	folio_put_refs(folio, refs: nr_pages);
2231
2232	/*
2233	* If we are sure that we batched the entire folio and cleared
2234	* all PTEs, we can just optimize and stop right here.
2235	*/
2236	if (nr_pages == folio_nr_pages(folio))
2237	goto walk_done;
2238	continue;
2239	walk_abort:
2240	ret = false;
2241	walk_done:
2242	page_vma_mapped_walk_done(pvmw: &pvmw);
2243	break;
2244	}
2245
2246	mmu_notifier_invalidate_range_end(range: &range);
2247
2248	return ret;
2249	}
2250
/*
 * invalid_vma callback installed by try_to_migrate(): tells the rmap walk to
 * skip @vma when it is a temporary stack VMA. @arg is unused.
 */
static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
{
	return vma_is_temporary_stack(vma);
}
2255
/*
 * ->done callback for the unmap/migrate rmap walks: returns non-zero once
 * @folio is no longer mapped, so the walk can stop early.
 */
static int folio_not_mapped(struct folio *folio)
{
	if (folio_mapped(folio))
		return 0;

	return 1;
}
2260
2261	/**
2262	* try_to_unmap - Try to remove all page table mappings to a folio.
2263	* @folio: The folio to unmap.
2264	* @flags: action and flags
2265	*
2266	* Tries to remove all the page table entries which are mapping this
2267	* folio. It is the caller's responsibility to check if the folio is
2268	* still mapped if needed (use TTU_SYNC to prevent accounting races).
2269	*
2270	* Context: Caller must hold the folio lock.
2271	*/
2272	void try_to_unmap(struct folio folio, enum* ttu_flags flags)
2273	{
2274	struct rmap_walk_control rwc = {
2275	.rmap_one = try_to_unmap_one,
2276	.arg = (void *)flags,
2277	.done = folio_not_mapped,
2278	.anon_lock = folio_lock_anon_vma_read,
2279	};
2280
2281	if (flags & TTU_RMAP_LOCKED)
2282	rmap_walk_locked(folio, rwc: &rwc);
2283	else
2284	rmap_walk(folio, rwc: &rwc);
2285	}
2286
2287	/*
2288	* @arg: enum ttu_flags will be passed to this argument.
2289	*
2290	* If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs
2291	* containing migration entries.
2292	*/
2293	static bool try_to_migrate_one(struct folio folio, struct* vm_area_struct *vma,
2294	unsigned long address, void *arg)
2295	{
2296	struct mm_struct *mm = vma->vm_mm;
2297	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, `0`);
2298	bool anon_exclusive, writable, ret = true;
2299	pte_t pteval;
2300	struct page *subpage;
2301	struct mmu_notifier_range range;
2302	enum ttu_flags flags = (enum ttu_flags)(long)arg;
2303	unsigned long pfn;
2304	unsigned long hsz = `0`;
2305
2306	/*
2307	* When racing against e.g. zap_pte_range() on another cpu,
2308	* in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
2309	* try_to_migrate() may return before page_mapped() has become false,
2310	* if page table locking is skipped: use TTU_SYNC to wait for that.
2311	*/
2312	if (flags & TTU_SYNC)
2313	pvmw.flags = PVMW_SYNC;
2314
2315	/*
2316	* For THP, we have to assume the worse case ie pmd for invalidation.
2317	* For hugetlb, it could be much worse if we need to do pud
2318	* invalidation in the case of pmd sharing.
2319	*
2320	* Note that the page can not be free in this function as call of
2321	* try_to_unmap() must hold a reference on the page.
2322	*/
2323	range.end = vma_address_end(pvmw: &pvmw);
2324	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_CLEAR, flags: `0`, mm: vma->vm_mm,
2325	start: address, end: range.end);
2326	if (folio_test_hugetlb(folio)) {
2327	/*
2328	* If sharing is possible, start and end will be adjusted
2329	* accordingly.
2330	*/
2331	adjust_range_if_pmd_sharing_possible(vma, start: &range.start,
2332	end: &range.end);
2333
2334	/ We need the huge page size for set_huge_pte_at() /
2335	hsz = huge_page_size(h: hstate_vma(vma));
2336	}
2337	mmu_notifier_invalidate_range_start(range: &range);
2338
2339	while (page_vma_mapped_walk(pvmw: &pvmw)) {
2340	/ PMD-mapped THP migration entry /
2341	if (!pvmw.pte) {
2342	if (flags & TTU_SPLIT_HUGE_PMD) {
2343	split_huge_pmd_locked(vma, address: pvmw.address,
2344	pmd: pvmw.pmd, freeze: true);
2345	ret = false;
2346	page_vma_mapped_walk_done(pvmw: &pvmw);
2347	break;
2348	}
2349	#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
2350	subpage = folio_page(folio,
2351	pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
2352	VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) \|\|
2353	!folio_test_pmd_mappable(folio), folio);
2354
2355	if (set_pmd_migration_entry(&pvmw, subpage)) {
2356	ret = false;
2357	page_vma_mapped_walk_done(&pvmw);
2358	break;
2359	}
2360	continue;
2361	#endif
2362	}
2363
2364	/ Unexpected PMD-mapped THP? /
2365	VM_BUG_ON_FOLIO(!pvmw.pte, folio);
2366
2367	/*
2368	* Handle PFN swap PTEs, such as device-exclusive ones, that
2369	* actually map pages.
2370	*/
2371	pteval = ptep_get(ptep: pvmw.pte);
2372	if (likely(pte_present(pteval))) {
2373	pfn = pte_pfn(pte: pteval);
2374	} else {
2375	pfn = swp_offset_pfn(entry: pte_to_swp_entry(pte: pteval));
2376	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
2377	}
2378
2379	subpage = folio_page(folio, pfn - folio_pfn(folio));
2380	address = pvmw.address;
2381	anon_exclusive = folio_test_anon(folio) &&
2382	PageAnonExclusive(page: subpage);
2383
2384	if (folio_test_hugetlb(folio)) {
2385	bool anon = folio_test_anon(folio);
2386
2387	/*
2388	* huge_pmd_unshare may unmap an entire PMD page.
2389	* There is no way of knowing exactly which PMDs may
2390	* be cached for this mm, so we must flush them all.
2391	* start/end were already adjusted above to cover this
2392	* range.
2393	*/
2394	flush_cache_range(vma, start: range.start, end: range.end);
2395
2396	/*
2397	* To call huge_pmd_unshare, i_mmap_rwsem must be
2398	* held in write mode. Caller needs to explicitly
2399	* do this outside rmap routines.
2400	*
2401	* We also must hold hugetlb vma_lock in write mode.
2402	* Lock order dictates acquiring vma_lock BEFORE
2403	* i_mmap_rwsem. We can only try lock here and
2404	* fail if unsuccessful.
2405	*/
2406	if (!anon) {
2407	VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
2408	if (!hugetlb_vma_trylock_write(vma)) {
2409	page_vma_mapped_walk_done(pvmw: &pvmw);
2410	ret = false;
2411	break;
2412	}
2413	if (huge_pmd_unshare(mm, vma, addr: address, ptep: pvmw.pte)) {
2414	hugetlb_vma_unlock_write(vma);
2415	flush_tlb_range(vma,
2416	range.start, range.end);
2417
2418	/*
2419	* The ref count of the PMD page was
2420	* dropped which is part of the way map
2421	* counting is done for shared PMDs.
2422	* Return 'true' here. When there is
2423	* no other sharing, huge_pmd_unshare
2424	* returns false and we will unmap the
2425	* actual page and drop map count
2426	* to zero.
2427	*/
2428	page_vma_mapped_walk_done(pvmw: &pvmw);
2429	break;
2430	}
2431	hugetlb_vma_unlock_write(vma);
2432	}
2433	/ Nuke the hugetlb page table entry /
2434	pteval = huge_ptep_clear_flush(vma, addr: address, ptep: pvmw.pte);
2435	if (pte_dirty(pte: pteval))
2436	folio_mark_dirty(folio);
2437	writable = pte_write(pte: pteval);
2438	} else if (likely(pte_present(pteval))) {
2439	flush_cache_page(vma, vmaddr: address, pfn);
2440	/ Nuke the page table entry. /
2441	if (should_defer_flush(mm, flags)) {
2442	/*
2443	* We clear the PTE but do not flush so potentially
2444	* a remote CPU could still be writing to the folio.
2445	* If the entry was previously clean then the
2446	* architecture must guarantee that a clear->dirty
2447	* transition on a cached TLB entry is written through
2448	* and traps if the PTE is unmapped.
2449	*/
2450	pteval = ptep_get_and_clear(mm, addr: address, ptep: pvmw.pte);
2451
2452	set_tlb_ubc_flush_pending(mm, pteval, start: address, end: address + PAGE_SIZE);
2453	} else {
2454	pteval = ptep_clear_flush(vma, address, ptep: pvmw.pte);
2455	}
2456	if (pte_dirty(pte: pteval))
2457	folio_mark_dirty(folio);
2458	writable = pte_write(pte: pteval);
2459	} else {
2460	pte_clear(mm, address, pvmw.pte);
2461	writable = is_writable_device_private_entry(entry: pte_to_swp_entry(pte: pteval));
2462	}
2463
2464	VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) &&
2465	!anon_exclusive, folio);
2466
2467	/ Update high watermark before we lower rss /
2468	update_hiwater_rss(mm);
2469
2470	if (PageHWPoison(page: subpage)) {
2471	VM_WARN_ON_FOLIO(folio_is_device_private(folio), folio);
2472
2473	pteval = swp_entry_to_pte(entry: make_hwpoison_entry(page: subpage));
2474	if (folio_test_hugetlb(folio)) {
2475	hugetlb_count_sub(l: folio_nr_pages(folio), mm);
2476	set_huge_pte_at(mm, addr: address, ptep: pvmw.pte, pte: pteval,
2477	sz: hsz);
2478	} else {
2479	dec_mm_counter(mm, member: mm_counter(folio));
2480	set_pte_at(mm, address, pvmw.pte, pteval);
2481	}
2482	} else if (likely(pte_present(pteval)) && pte_unused(pte: pteval) &&
2483	!userfaultfd_armed(vma)) {
2484	/*
2485	* The guest indicated that the page content is of no
2486	* interest anymore. Simply discard the pte, vmscan
2487	* will take care of the rest.
2488	* A future reference will then fault in a new zero
2489	* page. When userfaultfd is active, we must not drop
2490	* this page though, as its main user (postcopy
2491	* migration) will not expect userfaults on already
2492	* copied pages.
2493	*/
2494	dec_mm_counter(mm, member: mm_counter(folio));
2495	} else {
2496	swp_entry_t entry;
2497	pte_t swp_pte;
2498
2499	/*
2500	* arch_unmap_one() is expected to be a NOP on
2501	* architectures where we could have PFN swap PTEs,
2502	* so we'll not check/care.
2503	*/
2504	if (arch_unmap_one(mm, vma, addr: address, orig_pte: pteval) < `0`) {
2505	if (folio_test_hugetlb(folio))
2506	set_huge_pte_at(mm, addr: address, ptep: pvmw.pte,
2507	pte: pteval, sz: hsz);
2508	else
2509	set_pte_at(mm, address, pvmw.pte, pteval);
2510	ret = false;
2511	page_vma_mapped_walk_done(pvmw: &pvmw);
2512	break;
2513	}
2514
2515	/ See folio_try_share_anon_rmap_pte(): clear PTE first. /
2516	if (folio_test_hugetlb(folio)) {
2517	if (anon_exclusive &&
2518	hugetlb_try_share_anon_rmap(folio)) {
2519	set_huge_pte_at(mm, addr: address, ptep: pvmw.pte,
2520	pte: pteval, sz: hsz);
2521	ret = false;
2522	page_vma_mapped_walk_done(pvmw: &pvmw);
2523	break;
2524	}
2525	} else if (anon_exclusive &&
2526	folio_try_share_anon_rmap_pte(folio, page: subpage)) {
2527	set_pte_at(mm, address, pvmw.pte, pteval);
2528	ret = false;
2529	page_vma_mapped_walk_done(pvmw: &pvmw);
2530	break;
2531	}
2532
2533	/*
2534	* Store the pfn of the page in a special migration
2535	* pte. do_swap_page() will wait until the migration
2536	* pte is removed and then restart fault handling.
2537	*/
2538	if (writable)
2539	entry = make_writable_migration_entry(
2540	page_to_pfn(subpage));
2541	else if (anon_exclusive)
2542	entry = make_readable_exclusive_migration_entry(
2543	page_to_pfn(subpage));
2544	else
2545	entry = make_readable_migration_entry(
2546	page_to_pfn(subpage));
2547	if (likely(pte_present(pteval))) {
2548	if (pte_young(pte: pteval))
2549	entry = make_migration_entry_young(entry);
2550	if (pte_dirty(pte: pteval))
2551	entry = make_migration_entry_dirty(entry);
2552	swp_pte = swp_entry_to_pte(entry);
2553	if (pte_soft_dirty(pte: pteval))
2554	swp_pte = pte_swp_mksoft_dirty(pte: swp_pte);
2555	if (pte_uffd_wp(pte: pteval))
2556	swp_pte = pte_swp_mkuffd_wp(pte: swp_pte);
2557	} else {
2558	swp_pte = swp_entry_to_pte(entry);
2559	if (pte_swp_soft_dirty(pte: pteval))
2560	swp_pte = pte_swp_mksoft_dirty(pte: swp_pte);
2561	if (pte_swp_uffd_wp(pte: pteval))
2562	swp_pte = pte_swp_mkuffd_wp(pte: swp_pte);
2563	}
2564	if (folio_test_hugetlb(folio))
2565	set_huge_pte_at(mm, addr: address, ptep: pvmw.pte, pte: swp_pte,
2566	sz: hsz);
2567	else
2568	set_pte_at(mm, address, pvmw.pte, swp_pte);
2569	trace_set_migration_pte(addr: address, pte_val(swp_pte),
2570	order: folio_order(folio));
2571	/*
2572	* No need to invalidate here it will synchronize on
2573	* against the special swap migration pte.
2574	*/
2575	}
2576
2577	if (unlikely(folio_test_hugetlb(folio)))
2578	hugetlb_remove_rmap(folio);
2579	else
2580	folio_remove_rmap_pte(folio, subpage, vma);
2581	if (vma->vm_flags & VM_LOCKED)
2582	mlock_drain_local();
2583	folio_put(folio);
2584	}
2585
2586	mmu_notifier_invalidate_range_end(range: &range);
2587
2588	return ret;
2589	}
2590
2591	/**
2592	* try_to_migrate - try to replace all page table mappings with swap entries
2593	* @folio: the folio to replace page table entries for
2594	* @flags: action and flags
2595	*
2596	* Tries to remove all the page table entries which are mapping this folio and
2597	* replace them with special swap entries. Caller must hold the folio lock.
2598	*/
2599	void try_to_migrate(struct folio folio, enum* ttu_flags flags)
2600	{
2601	struct rmap_walk_control rwc = {
2602	.rmap_one = try_to_migrate_one,
2603	.arg = (void *)flags,
2604	.done = folio_not_mapped,
2605	.anon_lock = folio_lock_anon_vma_read,
2606	};
2607
2608	/*
2609	* Migration always ignores mlock and only supports TTU_RMAP_LOCKED and
2610	* TTU_SPLIT_HUGE_PMD, TTU_SYNC, and TTU_BATCH_FLUSH flags.
2611	*/
2612	if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED \| TTU_SPLIT_HUGE_PMD \|
2613	TTU_SYNC \| TTU_BATCH_FLUSH)))
2614	return;
2615
2616	if (folio_is_zone_device(folio) &&
2617	(!folio_is_device_private(folio) && !folio_is_device_coherent(folio)))
2618	return;
2619
2620	/*
2621	* During exec, a temporary VMA is setup and later moved.
2622	* The VMA is moved under the anon_vma lock but not the
2623	* page tables leading to a race where migration cannot
2624	* find the migration ptes. Rather than increasing the
2625	* locking requirements of exec(), migration skips
2626	* temporary VMAs until after exec() completes.
2627	*/
2628	if (!folio_test_ksm(folio) && folio_test_anon(folio))
2629	rwc.invalid_vma = invalid_migration_vma;
2630
2631	if (flags & TTU_RMAP_LOCKED)
2632	rmap_walk_locked(folio, rwc: &rwc);
2633	else
2634	rmap_walk(folio, rwc: &rwc);
2635	}
2636
#ifdef CONFIG_DEVICE_PRIVATE
/**
 * make_device_exclusive() - Mark a page for exclusive use by a device
 * @mm: mm_struct of associated target process
 * @addr: the virtual address to mark for exclusive device access
 * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
 * @foliop: folio pointer will be stored here on success.
 *
 * This function looks up the page mapped at the given address, grabs a
 * folio reference, locks the folio and replaces the PTE with special
 * device-exclusive PFN swap entry, preventing access through the process
 * page tables. The function will return with the folio locked and referenced.
 *
 * On fault, the device-exclusive entries are replaced with the original PTE
 * under folio lock, after calling MMU notifiers.
 *
 * Only anonymous non-hugetlb folios are supported and the VMA must have
 * write permissions such that we can fault in the anonymous page writable
 * in order to mark it exclusive. The caller must hold the mmap_lock in read
 * mode.
 *
 * A driver using this to program access from a device must use a mmu notifier
 * critical section to hold a device specific lock during programming. Once
 * programming is complete it should drop the folio lock and reference after
 * which point CPU access to the page will revoke the exclusive access.
 *
 * Notes:
 *   #. This function always operates on individual PTEs mapping individual
 *      pages. PMD-sized THPs are first remapped to be mapped by PTEs before
 *      the conversion happens on a single PTE corresponding to @addr.
 *   #. While concurrent access through the process page tables is prevented,
 *      concurrent access through other page references (e.g., earlier GUP
 *      invocation) is not handled and not supported.
 *   #. device-exclusive entries are considered "clean" and "old" by core-mm.
 *      Device drivers must update the folio state when informed by MMU
 *      notifiers.
 *
 * Returns: pointer to mapped page on success, otherwise a negative error.
 */
struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
		void *owner, struct folio **foliop)
{
	struct mmu_notifier_range range;
	struct folio *folio, *fw_folio;
	struct vm_area_struct *vma;
	struct folio_walk fw;
	struct page *page;
	swp_entry_t entry;
	pte_t swp_pte;
	int ret;

	mmap_assert_locked(mm);
	addr = PAGE_ALIGN_DOWN(addr);

	/*
	 * Fault in the page writable and try to lock it; note that if the
	 * address would already be marked for exclusive use by a device,
	 * the GUP call would undo that first by triggering a fault.
	 *
	 * If any other device would already map this page exclusively, the
	 * fault will trigger a conversion to an ordinary
	 * (non-device-exclusive) PTE and issue a MMU_NOTIFY_EXCLUSIVE.
	 */
retry:
	page = get_user_page_vma_remote(mm, addr,
					FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
					&vma);
	if (IS_ERR(page))
		return page;
	folio = page_folio(page);

	if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) {
		folio_put(folio);
		return ERR_PTR(-EOPNOTSUPP);
	}

	ret = folio_lock_killable(folio);
	if (ret) {
		folio_put(folio);
		return ERR_PTR(ret);
	}

	/*
	 * Inform secondary MMUs that we are going to convert this PTE to
	 * device-exclusive, such that they unmap it now. Note that the
	 * caller must filter this event out to prevent livelocks.
	 */
	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
				      mm, addr, addr + PAGE_SIZE, owner);
	mmu_notifier_invalidate_range_start(&range);

	/*
	 * Let's do a second walk and make sure we still find the same page
	 * mapped writable. Note that any page of an anonymous folio can
	 * only be mapped writable using exactly one PTE ("exclusive"), so
	 * there cannot be other mappings.
	 */
	fw_folio = folio_walk_start(&fw, vma, addr, 0);
	if (fw_folio != folio || fw.page != page ||
	    fw.level != FW_LEVEL_PTE || !pte_write(fw.pte)) {
		/* Mapping changed under us: undo everything and start over. */
		if (fw_folio)
			folio_walk_end(&fw, vma);
		mmu_notifier_invalidate_range_end(&range);
		folio_unlock(folio);
		folio_put(folio);
		goto retry;
	}

	/* Nuke the page table entry so we get the uptodate dirty bit. */
	flush_cache_page(vma, addr, page_to_pfn(page));
	fw.pte = ptep_clear_flush(vma, addr, fw.ptep);

	/* Set the dirty flag on the folio now the PTE is gone. */
	if (pte_dirty(fw.pte))
		folio_mark_dirty(folio);

	/*
	 * Store the pfn of the page in a special device-exclusive PFN swap PTE.
	 * do_swap_page() will trigger the conversion back while holding the
	 * folio lock.
	 */
	entry = make_device_exclusive_entry(page_to_pfn(page));
	swp_pte = swp_entry_to_pte(entry);
	if (pte_soft_dirty(fw.pte))
		swp_pte = pte_swp_mksoft_dirty(swp_pte);
	/* The pte is writable, uffd-wp does not apply. */
	set_pte_at(mm, addr, fw.ptep, swp_pte);

	folio_walk_end(&fw, vma);
	mmu_notifier_invalidate_range_end(&range);
	*foliop = folio;
	return page;
}
EXPORT_SYMBOL_GPL(make_device_exclusive);
#endif
2772
2773	void __put_anon_vma(struct anon_vma *anon_vma)
2774	{
2775	struct anon_vma *root = anon_vma->root;
2776
2777	anon_vma_free(anon_vma);
2778	if (root != anon_vma && atomic_dec_and_test(v: &root->refcount))
2779	anon_vma_free(anon_vma: root);
2780	}
2781
2782	static struct anon_vma rmap_walk_anon_lock(const* struct folio *folio,
2783	struct rmap_walk_control *rwc)
2784	{
2785	struct anon_vma *anon_vma;
2786
2787	if (rwc->anon_lock)
2788	return rwc->anon_lock(folio, rwc);
2789
2790	/*
2791	* Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read()
2792	* because that depends on page_mapped(); but not all its usages
2793	* are holding mmap_lock. Users without mmap_lock are required to
2794	* take a reference count to prevent the anon_vma disappearing
2795	*/
2796	anon_vma = folio_anon_vma(folio);
2797	if (!anon_vma)
2798	return NULL;
2799
2800	if (anon_vma_trylock_read(anon_vma))
2801	goto out;
2802
2803	if (rwc->try_lock) {
2804	anon_vma = NULL;
2805	rwc->contended = true;
2806	goto out;
2807	}
2808
2809	anon_vma_lock_read(anon_vma);
2810	out:
2811	return anon_vma;
2812	}
2813
2814	/*
2815	* rmap_walk_anon - do something to anonymous page using the object-based
2816	* rmap method
2817	* @folio: the folio to be handled
2818	* @rwc: control variable according to each walk type
2819	* @locked: caller holds relevant rmap lock
2820	*
2821	* Find all the mappings of a folio using the mapping pointer and the vma
2822	* chains contained in the anon_vma struct it points to.
2823	*/
2824	static void rmap_walk_anon(struct folio *folio,
2825	struct rmap_walk_control *rwc, bool locked)
2826	{
2827	struct anon_vma *anon_vma;
2828	pgoff_t pgoff_start, pgoff_end;
2829	struct anon_vma_chain *avc;
2830
2831	if (locked) {
2832	anon_vma = folio_anon_vma(folio);
2833	/ anon_vma disappear under us? /
2834	VM_BUG_ON_FOLIO(!anon_vma, folio);
2835	} else {
2836	anon_vma = rmap_walk_anon_lock(folio, rwc);
2837	}
2838	if (!anon_vma)
2839	return;
2840
2841	pgoff_start = folio_pgoff(folio);
2842	pgoff_end = pgoff_start + folio_nr_pages(folio) - `1`;
2843	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
2844	pgoff_start, pgoff_end) {
2845	struct vm_area_struct *vma = avc->vma;
2846	unsigned long address = vma_address(vma, pgoff: pgoff_start,
2847	nr_pages: folio_nr_pages(folio));
2848
2849	VM_BUG_ON_VMA(address == -EFAULT, vma);
2850	cond_resched();
2851
2852	if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
2853	continue;
2854
2855	if (!rwc->rmap_one(folio, vma, address, rwc->arg))
2856	break;
2857	if (rwc->done && rwc->done(folio))
2858	break;
2859	}
2860
2861	if (!locked)
2862	anon_vma_unlock_read(anon_vma);
2863	}
2864
2865	/**
2866	* __rmap_walk_file() - Traverse the reverse mapping for a file-backed mapping
2867	* of a page mapped within a specified page cache object at a specified offset.
2868	*
2869	* @folio: Either the folio whose mappings to traverse, or if NULL,
2870	* the callbacks specified in @rwc will be configured such
2871	* as to be able to look up mappings correctly.
2872	* @mapping: The page cache object whose mapping VMAs we intend to
2873	* traverse. If @folio is non-NULL, this should be equal to
2874	* folio_mapping(folio).
2875	* @pgoff_start: The offset within @mapping of the page which we are
2876	* looking up. If @folio is non-NULL, this should be equal
2877	* to folio_pgoff(folio).
2878	* @nr_pages: The number of pages mapped by the mapping. If @folio is
2879	* non-NULL, this should be equal to folio_nr_pages(folio).
2880	* @rwc: The reverse mapping walk control object describing how
2881	* the traversal should proceed.
2882	* @locked: Is the @mapping already locked? If not, we acquire the
2883	* lock.
2884	*/
2885	static void __rmap_walk_file(struct folio folio, struct* address_space *mapping,
2886	pgoff_t pgoff_start, unsigned long nr_pages,
2887	struct rmap_walk_control *rwc, bool locked)
2888	{
2889	pgoff_t pgoff_end = pgoff_start + nr_pages - `1`;
2890	struct vm_area_struct *vma;
2891
2892	VM_WARN_ON_FOLIO(folio && mapping != folio_mapping(folio), folio);
2893	VM_WARN_ON_FOLIO(folio && pgoff_start != folio_pgoff(folio), folio);
2894	VM_WARN_ON_FOLIO(folio && nr_pages != folio_nr_pages(folio), folio);
2895
2896	if (!locked) {
2897	if (i_mmap_trylock_read(mapping))
2898	goto lookup;
2899
2900	if (rwc->try_lock) {
2901	rwc->contended = true;
2902	return;
2903	}
2904
2905	i_mmap_lock_read(mapping);
2906	}
2907	lookup:
2908	vma_interval_tree_foreach(vma, &mapping->i_mmap,
2909	pgoff_start, pgoff_end) {
2910	unsigned long address = vma_address(vma, pgoff: pgoff_start, nr_pages);
2911
2912	VM_BUG_ON_VMA(address == -EFAULT, vma);
2913	cond_resched();
2914
2915	if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
2916	continue;
2917
2918	if (!rwc->rmap_one(folio, vma, address, rwc->arg))
2919	goto done;
2920	if (rwc->done && rwc->done(folio))
2921	goto done;
2922	}
2923	done:
2924	if (!locked)
2925	i_mmap_unlock_read(mapping);
2926	}
2927
2928	/*
2929	* rmap_walk_file - do something to file page using the object-based rmap method
2930	* @folio: the folio to be handled
2931	* @rwc: control variable according to each walk type
2932	* @locked: caller holds relevant rmap lock
2933	*
2934	* Find all the mappings of a folio using the mapping pointer and the vma chains
2935	* contained in the address_space struct it points to.
2936	*/
2937	static void rmap_walk_file(struct folio *folio,
2938	struct rmap_walk_control *rwc, bool locked)
2939	{
2940	/*
2941	* The folio lock not only makes sure that folio->mapping cannot
2942	* suddenly be NULLified by truncation, it makes sure that the structure
2943	* at mapping cannot be freed and reused yet, so we can safely take
2944	* mapping->i_mmap_rwsem.
2945	*/
2946	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
2947
2948	if (!folio->mapping)
2949	return;
2950
2951	__rmap_walk_file(folio, mapping: folio->mapping, pgoff_start: folio->index,
2952	nr_pages: folio_nr_pages(folio), rwc, locked);
2953	}
2954
2955	void rmap_walk(struct folio folio, struct* rmap_walk_control *rwc)
2956	{
2957	if (unlikely(folio_test_ksm(folio)))
2958	rmap_walk_ksm(folio, rwc);
2959	else if (folio_test_anon(folio))
2960	rmap_walk_anon(folio, rwc, locked: false);
2961	else
2962	rmap_walk_file(folio, rwc, locked: false);
2963	}
2964
2965	/ Like rmap_walk, but caller holds relevant rmap lock /
2966	void rmap_walk_locked(struct folio folio, struct* rmap_walk_control *rwc)
2967	{
2968	/ no ksm support for now /
2969	VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio);
2970	if (folio_test_anon(folio))
2971	rmap_walk_anon(folio, rwc, locked: true);
2972	else
2973	rmap_walk_file(folio, rwc, locked: true);
2974	}
2975
2976	#ifdef CONFIG_HUGETLB_PAGE
2977	/*
2978	* The following two functions are for anonymous (private mapped) hugepages.
2979	* Unlike common anonymous pages, anonymous hugepages have no accounting code
2980	* and no lru code, because we handle hugepages differently from common pages.
2981	*/
2982	void hugetlb_add_anon_rmap(struct folio folio, struct* vm_area_struct *vma,
2983	unsigned long address, rmap_t flags)
2984	{
2985	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
2986	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
2987
2988	atomic_inc(v: &folio->_entire_mapcount);
2989	atomic_inc(v: &folio->_large_mapcount);
2990	if (flags & RMAP_EXCLUSIVE)
2991	SetPageAnonExclusive(&folio->page);
2992	VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > `1` &&
2993	PageAnonExclusive(&folio->page), folio);
2994	}
2995
2996	void hugetlb_add_new_anon_rmap(struct folio *folio,
2997	struct vm_area_struct vma, unsigned* long address)
2998	{
2999	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
3000
3001	BUG_ON(address < vma->vm_start \|\| address >= vma->vm_end);
3002	/ increment count (starts at -1) /
3003	atomic_set(v: &folio->_entire_mapcount, i: `0`);
3004	atomic_set(v: &folio->_large_mapcount, i: `0`);
3005	folio_clear_hugetlb_restore_reserve(folio);
3006	__folio_set_anon(folio, vma, address, exclusive: true);
3007	SetPageAnonExclusive(&folio->page);
3008	}
3009	#endif /* CONFIG_HUGETLB_PAGE */
3010

Browse the source code of Linux/mm/rmap.c