/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RMAP_H
#define _LINUX_RMAP_H
/*
 * Declarations for Reverse Mapping functions in mm/rmap.c
 */

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/memcontrol.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/bit_spinlock.h>

/*
 * The anon_vma heads a list of private "related" vmas, to scan if
 * an anonymous page pointing to this anon_vma needs to be unmapped:
 * the vmas on the list will be related by forking, or by splitting.
 *
 * Since vmas come and go as they are split and merged (particularly
 * in mprotect), the mapping field of an anonymous page cannot point
 * directly to a vma: instead it points to an anon_vma, on whose list
 * the related vmas can be easily linked or unlinked.
 *
 * After unlinking the last vma on the list, we must garbage collect
 * the anon_vma object itself: we're guaranteed no page can be
 * pointing to this anon_vma once its vma list is empty.
 */
struct anon_vma {
	struct anon_vma *root;		/* Root of this anon_vma tree */
	struct rw_semaphore rwsem;	/* W: modification, R: walking the list */
	/*
	 * The refcount is taken on an anon_vma when there is no
	 * guarantee that the vma of page tables will exist for
	 * the duration of the operation. A caller that takes
	 * the reference is responsible for clearing up the
	 * anon_vma if they are the last user on release.
	 */
	atomic_t refcount;

	/*
	 * Count of child anon_vmas. Equal to the count of all anon_vmas that
	 * have ->parent pointing to this one, including itself.
	 *
	 * This counter is used for making decisions about reusing an anon_vma
	 * instead of forking a new one. See the comments in anon_vma_clone().
	 */
	unsigned long num_children;
	/* Count of VMAs whose ->anon_vma pointer points to this object. */
	unsigned long num_active_vmas;

	struct anon_vma *parent;	/* Parent of this anon_vma */

	/*
	 * NOTE: the LSB of the rb_root.rb_node is set by
	 * mm_take_all_locks() _after_ taking the above lock. So the
	 * rb_root must only be read/written after taking the above lock
	 * to be sure to see a valid next pointer. The LSB bit itself
	 * is serialized by a system wide lock only visible to
	 * mm_take_all_locks() (mm_all_locks_mutex).
	 */

	/* Interval tree of private "related" vmas */
	struct rb_root_cached rb_root;
};

/*
 * The copy-on-write semantics of fork mean that an anon_vma
 * can become associated with multiple processes. Furthermore,
 * each child process will have its own anon_vma, where new
 * pages for that process are instantiated.
 *
 * This structure allows us to find the anon_vmas associated
 * with a VMA, or the VMAs associated with an anon_vma.
 * The "same_vma" list contains the anon_vma_chains linking
 * all the anon_vmas associated with this VMA.
 * The "rb" field indexes on an interval tree the anon_vma_chains
 * which link all the VMAs associated with this anon_vma.
 */
struct anon_vma_chain {
	struct vm_area_struct *vma;
	struct anon_vma *anon_vma;
	struct list_head same_vma;	/* locked by mmap_lock & page_table_lock */
	struct rb_node rb;		/* locked by anon_vma->rwsem */
	unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
	unsigned long cached_vma_start, cached_vma_last;
#endif
};

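/*
 * Illustration only (not part of this header's API): a simplified sketch
 * of how the anon rmap walk in mm/rmap.c (see rmap_walk_anon()) uses the
 * interval tree above to visit every VMA that may map a given anonymous
 * folio. anon_vma_interval_tree_foreach() and vma_address() are declared
 * elsewhere; locking details and bail-out handling are omitted here:
 *
 *	struct anon_vma_chain *avc;
 *	pgoff_t pgoff_start = folio_pgoff(folio);
 *	pgoff_t pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
 *
 *	anon_vma_lock_read(anon_vma);
 *	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
 *				       pgoff_start, pgoff_end) {
 *		struct vm_area_struct *vma = avc->vma;
 *		unsigned long addr = vma_address(vma, pgoff_start,
 *						 folio_nr_pages(folio));
 *
 *		... act on (folio, vma, addr), e.g. unmap or test references ...
 *	}
 *	anon_vma_unlock_read(anon_vma);
 */
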
enum ttu_flags {
	TTU_SPLIT_HUGE_PMD	= 0x4,	/* split huge PMD if any */
	TTU_IGNORE_MLOCK	= 0x8,	/* ignore mlock */
	TTU_SYNC		= 0x10,	/* avoid racy checks with PVMW_SYNC */
	TTU_HWPOISON		= 0x20,	/* do convert pte to hwpoison entry */
	TTU_BATCH_FLUSH		= 0x40,	/* Batch TLB flushes where possible
					 * and caller guarantees they will
					 * do a final flush if necessary */
	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
					 * caller holds it */
};

#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
	atomic_inc(&anon_vma->refcount);
}

void __put_anon_vma(struct anon_vma *anon_vma);

static inline void put_anon_vma(struct anon_vma *anon_vma)
{
	if (atomic_dec_and_test(&anon_vma->refcount))
		__put_anon_vma(anon_vma);
}

static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
	down_write(&anon_vma->root->rwsem);
}

static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
{
	return down_write_trylock(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
{
	up_write(&anon_vma->root->rwsem);
}

static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
{
	down_read(&anon_vma->root->rwsem);
}

static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
{
	return down_read_trylock(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
{
	up_read(&anon_vma->root->rwsem);
}

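/*
 * Illustration only (not a declaration): a minimal sketch of the refcount
 * contract described for struct anon_vma above. A caller that cannot
 * guarantee the VMA stays alive takes a reference and drops it when done;
 * the final put_anon_vma() frees the anon_vma. This loosely follows what
 * the page migration code does with folio_get_anon_vma():
 *
 *	struct anon_vma *anon_vma = folio_get_anon_vma(folio);
 *
 *	if (anon_vma) {
 *		anon_vma_lock_read(anon_vma);
 *		... walk or inspect the rmap of this folio ...
 *		anon_vma_unlock_read(anon_vma);
 *		put_anon_vma(anon_vma);
 *	}
 */
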
/*
 * anon_vma helper functions.
 */
void anon_vma_init(void);	/* create anon_vma_cachep */
int  __anon_vma_prepare(struct vm_area_struct *);
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);

static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
	if (likely(vma->anon_vma))
		return 0;

	return __anon_vma_prepare(vma);
}
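
/*
 * Illustration only: anon_vma_prepare() is typically called on the fault
 * path before the first anonymous page is mapped into a VMA, so that the
 * folio can later be linked into the rmap. A rough sketch of that pattern
 * (error handling, locking and accounting elided; see do_anonymous_page()
 * in mm/memory.c for the real thing):
 *
 *	if (unlikely(anon_vma_prepare(vma)))
 *		return VM_FAULT_OOM;
 *	folio = vma_alloc_zeroed_movable_folio(vma, addr);
 *	...
 *	folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
 *	set_pte_at(vma->vm_mm, addr, ptep, mk_pte(&folio->page, vma->vm_page_prot));
 */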

static inline void anon_vma_merge(struct vm_area_struct *vma,
		struct vm_area_struct *next)
{
	VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
	unlink_anon_vmas(next);
}

struct anon_vma *folio_get_anon_vma(const struct folio *folio);

#ifdef CONFIG_MM_ID
static __always_inline void folio_lock_large_mapcount(struct folio *folio)
{
	bit_spin_lock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
}

static __always_inline void folio_unlock_large_mapcount(struct folio *folio)
{
	__bit_spin_unlock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
}

static inline unsigned int folio_mm_id(const struct folio *folio, int idx)
{
	VM_WARN_ON_ONCE(idx != 0 && idx != 1);
	return folio->_mm_id[idx] & MM_ID_MASK;
}

static inline void folio_set_mm_id(struct folio *folio, int idx, mm_id_t id)
{
	VM_WARN_ON_ONCE(idx != 0 && idx != 1);
	folio->_mm_id[idx] &= ~MM_ID_MASK;
	folio->_mm_id[idx] |= id;
}

static inline void __folio_large_mapcount_sanity_checks(const struct folio *folio,
		int diff, mm_id_t mm_id)
{
	VM_WARN_ON_ONCE(!folio_test_large(folio) || folio_test_hugetlb(folio));
	VM_WARN_ON_ONCE(diff <= 0);
	VM_WARN_ON_ONCE(mm_id < MM_ID_MIN || mm_id > MM_ID_MAX);

	/*
	 * Make sure we can detect at least one complete PTE mapping of the
	 * folio in a single MM as "exclusively mapped". This is primarily
	 * a check on 32bit, where we currently reduce the size of the per-MM
	 * mapcount to a short.
	 */
	VM_WARN_ON_ONCE(diff > folio_large_nr_pages(folio));
	VM_WARN_ON_ONCE(folio_large_nr_pages(folio) - 1 > MM_ID_MAPCOUNT_MAX);

	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) == MM_ID_DUMMY &&
			folio->_mm_id_mapcount[0] != -1);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY &&
			folio->_mm_id_mapcount[0] < 0);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 1) == MM_ID_DUMMY &&
			folio->_mm_id_mapcount[1] != -1);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY &&
			folio->_mm_id_mapcount[1] < 0);
	VM_WARN_ON_ONCE(!folio_mapped(folio) &&
			test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids));
}

static __always_inline void folio_set_large_mapcount(struct folio *folio,
		int mapcount, struct vm_area_struct *vma)
{
	__folio_large_mapcount_sanity_checks(folio, mapcount, vma->vm_mm->mm_id);

	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY);

	/* Note: mapcounts start at -1. */
	atomic_set(&folio->_large_mapcount, mapcount - 1);
	folio->_mm_id_mapcount[0] = mapcount - 1;
	folio_set_mm_id(folio, 0, vma->vm_mm->mm_id);
}

static __always_inline int folio_add_return_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	const mm_id_t mm_id = vma->vm_mm->mm_id;
	int new_mapcount_val;

	folio_lock_large_mapcount(folio);
	__folio_large_mapcount_sanity_checks(folio, diff, mm_id);

	new_mapcount_val = atomic_read(&folio->_large_mapcount) + diff;
	atomic_set(&folio->_large_mapcount, new_mapcount_val);

	/*
	 * If a folio is mapped more than once into an MM on 32bit, we
	 * can in theory overflow the per-MM mapcount (although only for
	 * fairly large folios), turning it negative. In that case, just
	 * free up the slot and mark the folio "mapped shared", otherwise
	 * we might be in trouble when unmapping pages later.
	 */
	if (folio_mm_id(folio, 0) == mm_id) {
		folio->_mm_id_mapcount[0] += diff;
		if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[0] < 0)) {
			folio->_mm_id_mapcount[0] = -1;
			folio_set_mm_id(folio, 0, MM_ID_DUMMY);
			folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
		}
	} else if (folio_mm_id(folio, 1) == mm_id) {
		folio->_mm_id_mapcount[1] += diff;
		if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[1] < 0)) {
			folio->_mm_id_mapcount[1] = -1;
			folio_set_mm_id(folio, 1, MM_ID_DUMMY);
			folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
		}
	} else if (folio_mm_id(folio, 0) == MM_ID_DUMMY) {
		folio_set_mm_id(folio, 0, mm_id);
		folio->_mm_id_mapcount[0] = diff - 1;
		/* We might have other mappings already. */
		if (new_mapcount_val != diff - 1)
			folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
	} else if (folio_mm_id(folio, 1) == MM_ID_DUMMY) {
		folio_set_mm_id(folio, 1, mm_id);
		folio->_mm_id_mapcount[1] = diff - 1;
		/* Slot 0 certainly has mappings as well. */
		folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
	}
	folio_unlock_large_mapcount(folio);
	return new_mapcount_val + 1;
}
#define folio_add_large_mapcount folio_add_return_large_mapcount
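
/*
 * Worked example (illustration only, assuming a 4-page folio fully mapped
 * by PTEs): after folio_set_large_mapcount(folio, 4, vma_of_mm_A), slot 0
 * holds mm A with a per-MM mapcount of 3 (counts start at -1) and the
 * folio is considered exclusively mapped. If mm B then maps all 4 pages,
 * folio_add_return_large_mapcount(folio, 4, vma_of_mm_B) puts mm B into
 * slot 1 and sets FOLIO_MM_IDS_SHARED_BIT because slot 0 still has
 * mappings. Once mm A unmaps everything via
 * folio_sub_return_large_mapcount() below, slot 0 is released back to
 * MM_ID_DUMMY and the folio is again treated as exclusive to mm B.
 */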

static __always_inline int folio_sub_return_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	const mm_id_t mm_id = vma->vm_mm->mm_id;
	int new_mapcount_val;

	folio_lock_large_mapcount(folio);
	__folio_large_mapcount_sanity_checks(folio, diff, mm_id);

	new_mapcount_val = atomic_read(&folio->_large_mapcount) - diff;
	atomic_set(&folio->_large_mapcount, new_mapcount_val);

	/*
	 * There are valid corner cases where we might underflow a per-MM
	 * mapcount (some mappings added when no slot was free, some mappings
	 * added once a slot was free), so we always set it to -1 once we go
	 * negative.
	 */
	if (folio_mm_id(folio, 0) == mm_id) {
		folio->_mm_id_mapcount[0] -= diff;
		if (folio->_mm_id_mapcount[0] >= 0)
			goto out;
		folio->_mm_id_mapcount[0] = -1;
		folio_set_mm_id(folio, 0, MM_ID_DUMMY);
	} else if (folio_mm_id(folio, 1) == mm_id) {
		folio->_mm_id_mapcount[1] -= diff;
		if (folio->_mm_id_mapcount[1] >= 0)
			goto out;
		folio->_mm_id_mapcount[1] = -1;
		folio_set_mm_id(folio, 1, MM_ID_DUMMY);
	}

	/*
	 * If one MM slot owns all mappings, the folio is mapped exclusively.
	 * Note that if the folio is now unmapped (new_mapcount_val == -1), both
	 * slots must be free (mapcount == -1), and we'll also mark it as
	 * exclusive.
	 */
	if (folio->_mm_id_mapcount[0] == new_mapcount_val ||
	    folio->_mm_id_mapcount[1] == new_mapcount_val)
		folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT;
out:
	folio_unlock_large_mapcount(folio);
	return new_mapcount_val + 1;
}
#define folio_sub_large_mapcount folio_sub_return_large_mapcount
#else /* !CONFIG_MM_ID */
/*
 * See __folio_rmap_sanity_checks(), we might map large folios even without
 * CONFIG_TRANSPARENT_HUGEPAGE. We'll keep that working for now.
 */
static inline void folio_set_large_mapcount(struct folio *folio, int mapcount,
		struct vm_area_struct *vma)
{
	/* Note: mapcounts start at -1. */
	atomic_set(&folio->_large_mapcount, mapcount - 1);
}

static inline void folio_add_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	atomic_add(diff, &folio->_large_mapcount);
}

static inline int folio_add_return_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	BUILD_BUG();
}

static inline void folio_sub_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	atomic_sub(diff, &folio->_large_mapcount);
}

static inline int folio_sub_return_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	BUILD_BUG();
}
#endif /* CONFIG_MM_ID */

#define folio_inc_large_mapcount(folio, vma) \
	folio_add_large_mapcount(folio, 1, vma)
#define folio_inc_return_large_mapcount(folio, vma) \
	folio_add_return_large_mapcount(folio, 1, vma)
#define folio_dec_large_mapcount(folio, vma) \
	folio_sub_large_mapcount(folio, 1, vma)
#define folio_dec_return_large_mapcount(folio, vma) \
	folio_sub_return_large_mapcount(folio, 1, vma)

/* RMAP flags, currently only relevant for some anon rmap operations. */
typedef int __bitwise rmap_t;

/*
 * No special request: A mapped anonymous (sub)page is possibly shared between
 * processes.
 */
#define RMAP_NONE		((__force rmap_t)0)

/* The anonymous (sub)page is exclusive to a single process. */
#define RMAP_EXCLUSIVE		((__force rmap_t)BIT(0))

static __always_inline void __folio_rmap_sanity_checks(const struct folio *folio,
		const struct page *page, int nr_pages, enum pgtable_level level)
{
	/* hugetlb folios are handled separately. */
	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);

	/* When (un)mapping zeropages, we should never touch ref+mapcount. */
	VM_WARN_ON_FOLIO(is_zero_folio(folio), folio);

	/*
	 * TODO: we get driver-allocated folios that have nothing to do with
	 * the rmap using vm_insert_page(); therefore, we cannot assume that
	 * folio_test_large_rmappable() holds for large folios. We should
	 * handle any desired mapcount+stats accounting for these folios in
	 * VM_MIXEDMAP VMAs separately, and then sanity-check here that
	 * we really only get rmappable folios.
	 */

	VM_WARN_ON_ONCE(nr_pages <= 0);
	VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
	VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);

	switch (level) {
	case PGTABLE_LEVEL_PTE:
		break;
	case PGTABLE_LEVEL_PMD:
		/*
		 * We don't support folios larger than a single PMD yet. So
		 * when PGTABLE_LEVEL_PMD is set, we assume that we are creating
		 * a single "entire" mapping of the folio.
		 */
		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
		break;
	case PGTABLE_LEVEL_PUD:
		/*
		 * Assume that we are creating a single "entire" mapping of the
		 * folio.
		 */
		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio);
		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio);
		break;
	default:
		BUILD_BUG();
	}

	/*
	 * Anon folios must have an associated live anon_vma as long as they're
	 * mapped into userspace.
	 * Note that the atomic_read() mainly does two things:
	 *
	 * 1. In KASAN builds with CONFIG_SLUB_RCU_DEBUG, it causes KASAN to
	 *    check that the associated anon_vma has not yet been freed (subject
	 *    to KASAN's usual limitations). This check will pass if the
	 *    anon_vma's refcount has already dropped to 0 but an RCU grace
	 *    period hasn't passed since then.
	 * 2. If the anon_vma has not yet been freed, it checks that the
	 *    anon_vma still has a nonzero refcount (as opposed to being in the
	 *    middle of an RCU delay for getting freed).
	 */
	if (folio_test_anon(folio) && !folio_test_ksm(folio)) {
		unsigned long mapping = (unsigned long)folio->mapping;
		struct anon_vma *anon_vma;

		anon_vma = (void *)(mapping - FOLIO_MAPPING_ANON);
		VM_WARN_ON_FOLIO(atomic_read(&anon_vma->refcount) == 0, folio);
	}
}

/*
 * rmap interfaces called when adding or removing pte of page
 */
void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *, unsigned long address, rmap_t flags);
#define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \
	folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags)
void folio_add_anon_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *, unsigned long address, rmap_t flags);
void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address, rmap_t flags);
void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *);
#define folio_add_file_rmap_pte(folio, page, vma) \
	folio_add_file_rmap_ptes(folio, page, 1, vma)
void folio_add_file_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *);
void folio_add_file_rmap_pud(struct folio *, struct page *,
		struct vm_area_struct *);
void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *);
#define folio_remove_rmap_pte(folio, page, vma) \
	folio_remove_rmap_ptes(folio, page, 1, vma)
void folio_remove_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *);
void folio_remove_rmap_pud(struct folio *, struct page *,
		struct vm_area_struct *);

void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address, rmap_t flags);
void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address);

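/*
 * Illustration only: a rough sketch of how a swap-in fault re-establishes
 * the anon rmap for a single page, loosely modeled on do_swap_page() in
 * mm/memory.c (locking, accounting and error handling elided). The
 * "exclusive" condition is an assumption here; the real code derives it
 * from whether the page can be considered exclusive to this process:
 *
 *	rmap_t rmap_flags = exclusive ? RMAP_EXCLUSIVE : RMAP_NONE;
 *
 *	folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags);
 *	set_pte_at(vma->vm_mm, addr, ptep, pte);
 */
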
/* See folio_try_dup_anon_rmap_*() */
static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
		struct vm_area_struct *vma)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

	if (PageAnonExclusive(&folio->page)) {
		if (unlikely(folio_needs_cow_for_dma(vma, folio)))
			return -EBUSY;
		ClearPageAnonExclusive(&folio->page);
	}
	atomic_inc(&folio->_entire_mapcount);
	atomic_inc(&folio->_large_mapcount);
	return 0;
}

/* See folio_try_share_anon_rmap_*() */
static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio);

	/* Paired with the memory barrier in try_grab_folio(). */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb();

	if (unlikely(folio_maybe_dma_pinned(folio)))
		return -EBUSY;
	ClearPageAnonExclusive(&folio->page);

	/*
	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
	 * gup_must_unshare().
	 */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb__after_atomic();
	return 0;
}

static inline void hugetlb_add_file_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);

	atomic_inc(&folio->_entire_mapcount);
	atomic_inc(&folio->_large_mapcount);
}

static inline void hugetlb_remove_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);

	atomic_dec(&folio->_entire_mapcount);
	atomic_dec(&folio->_large_mapcount);
}

static __always_inline void __folio_dup_file_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
		enum pgtable_level level)
{
	const int orig_nr_pages = nr_pages;

	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	switch (level) {
	case PGTABLE_LEVEL_PTE:
		if (!folio_test_large(folio)) {
			atomic_inc(&folio->_mapcount);
			break;
		}

		if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) {
			do {
				atomic_inc(&page->_mapcount);
			} while (page++, --nr_pages > 0);
		}
		folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
		break;
	case PGTABLE_LEVEL_PMD:
	case PGTABLE_LEVEL_PUD:
		atomic_inc(&folio->_entire_mapcount);
		folio_inc_large_mapcount(folio, dst_vma);
		break;
	default:
		BUILD_BUG();
	}
}

/**
 * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio
 * @folio:	The folio to duplicate the mappings of
 * @page:	The first page to duplicate the mappings of
 * @nr_pages:	The number of pages of which the mapping will be duplicated
 * @dst_vma:	The destination vm area
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_ptes(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma)
{
	__folio_dup_file_rmap(folio, page, nr_pages, dst_vma, PGTABLE_LEVEL_PTE);
}

static __always_inline void folio_dup_file_rmap_pte(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma)
{
	__folio_dup_file_rmap(folio, page, 1, dst_vma, PGTABLE_LEVEL_PTE);
}
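
/*
 * Illustration only: during fork(), the page table copy path duplicates
 * the rmap of the pages it is about to map into the child. A rough sketch
 * for a file-backed folio, loosely modeled on copy_present_ptes() in
 * mm/memory.c (batching, accounting and write-protect handling elided):
 *
 *	folio_ref_add(folio, nr_pages);
 *	folio_dup_file_rmap_ptes(folio, page, nr_pages, dst_vma);
 *	set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr_pages);
 */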

/**
 * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
 * @folio:	The folio to duplicate the mapping of
 * @page:	The first page to duplicate the mapping of
 * @dst_vma:	The destination vm area
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_pmd(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	__folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, PGTABLE_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
#endif
}

static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma, enum pgtable_level level)
{
	const int orig_nr_pages = nr_pages;
	bool maybe_pinned;
	int i;

	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	/*
	 * If this folio may have been pinned by the parent process,
	 * don't allow duplicating the mappings; instead require that e.g. the
	 * subpage is copied immediately for the child, so that we'll always
	 * guarantee the pinned folio won't be randomly replaced in the
	 * future on write faults.
	 */
	maybe_pinned = likely(!folio_is_device_private(folio)) &&
		       unlikely(folio_needs_cow_for_dma(src_vma, folio));

	/*
	 * No need to check+clear for already shared PTEs/PMDs of the
	 * folio. But if any page is PageAnonExclusive, we must fall back to
	 * copying if the folio may be pinned.
	 */
	switch (level) {
	case PGTABLE_LEVEL_PTE:
		if (unlikely(maybe_pinned)) {
			for (i = 0; i < nr_pages; i++)
				if (PageAnonExclusive(page + i))
					return -EBUSY;
		}

		if (!folio_test_large(folio)) {
			if (PageAnonExclusive(page))
				ClearPageAnonExclusive(page);
			atomic_inc(&folio->_mapcount);
			break;
		}

		do {
			if (PageAnonExclusive(page))
				ClearPageAnonExclusive(page);
			if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
				atomic_inc(&page->_mapcount);
		} while (page++, --nr_pages > 0);
		folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
		break;
	case PGTABLE_LEVEL_PMD:
	case PGTABLE_LEVEL_PUD:
		if (PageAnonExclusive(page)) {
			if (unlikely(maybe_pinned))
				return -EBUSY;
			ClearPageAnonExclusive(page);
		}
		atomic_inc(&folio->_entire_mapcount);
		folio_inc_large_mapcount(folio, dst_vma);
		break;
	default:
		BUILD_BUG();
	}
	return 0;
}

/**
 * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range
 *				  of a folio
 * @folio:	The folio to duplicate the mappings of
 * @page:	The first page to duplicate the mappings of
 * @nr_pages:	The number of pages of which the mapping will be duplicated
 * @dst_vma:	The destination vm area
 * @src_vma:	The vm area from which the mappings are duplicated
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock and the
 * src_vma->vm_mm->write_protect_seq.
 *
 * Duplicating the mappings can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma)
{
	return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma,
					 src_vma, PGTABLE_LEVEL_PTE);
}

static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma)
{
	return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma,
					 PGTABLE_LEVEL_PTE);
}
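
/*
 * Illustration only: the fork path first tries to duplicate (share) the anon
 * mappings and falls back to copying when that fails. copy_into_child() below
 * is a hypothetical stand-in for that fallback (the real code uses
 * copy_present_page() in mm/memory.c); locking and batching are elided:
 *
 *	if (folio_try_dup_anon_rmap_ptes(folio, page, nr_pages,
 *					 dst_vma, src_vma)) {
 *		// -EBUSY: the folio may be DMA-pinned by the parent, so the
 *		// page range must be copied for the child instead of shared.
 *		return copy_into_child(dst_vma, src_vma, addr, page, nr_pages);
 *	}
 *	// Shared: the PTEs installed for the child must be write-protected.
 */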

/**
 * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range
 *				 of a folio
 * @folio:	The folio to duplicate the mapping of
 * @page:	The first page to duplicate the mapping of
 * @dst_vma:	The destination vm area
 * @src_vma:	The vm area from which the mapping is duplicated
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and the
 * src_vma->vm_mm->write_protect_seq.
 *
 * Duplicating the mapping can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma,
					 src_vma, PGTABLE_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
	return -EBUSY;
#endif
}

static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
		struct page *page, int nr_pages, enum pgtable_level level)
{
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	/* device private folios cannot get pinned via GUP. */
	if (unlikely(folio_is_device_private(folio))) {
		ClearPageAnonExclusive(page);
		return 0;
	}

	/*
	 * We have to make sure that when we clear PageAnonExclusive, that
	 * the page is not pinned and that concurrent GUP-fast won't succeed in
	 * concurrently pinning the page.
	 *
	 * Conceptually, PageAnonExclusive clearing consists of:
	 * (A1) Clear PTE
	 * (A2) Check if the page is pinned; back off if so.
	 * (A3) Clear PageAnonExclusive
	 * (A4) Restore PTE (optional, but certainly not writable)
	 *
	 * When clearing PageAnonExclusive, we cannot possibly map the page
	 * writable again, because anon pages that may be shared must never
	 * be writable. So in any case, if the PTE was writable it cannot
	 * be writable anymore afterwards and there would be a PTE change. Only
	 * if the PTE wasn't writable, there might not be a PTE change.
	 *
	 * Conceptually, GUP-fast pinning of an anon page consists of:
	 * (B1) Read the PTE
	 * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
	 * (B3) Pin the mapped page
	 * (B4) Check if the PTE changed by re-reading it; back off if so.
	 * (B5) If the original PTE is not writable, check if
	 *	PageAnonExclusive is not set; back off if so.
	 *
	 * If the PTE was writable, we only have to make sure that GUP-fast
	 * observes a PTE change and properly backs off.
	 *
	 * If the PTE was not writable, we have to make sure that GUP-fast either
	 * detects a (temporary) PTE change or that PageAnonExclusive is cleared
	 * and properly backs off.
	 *
	 * Consequently, when clearing PageAnonExclusive(), we have to make
	 * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
	 * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
	 * and (B5) happen in the right memory order.
	 *
	 * We assume that there might not be a memory barrier after
	 * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
	 * so we use explicit ones here.
	 */

	/* Paired with the memory barrier in try_grab_folio(). */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb();

	if (unlikely(folio_maybe_dma_pinned(folio)))
		return -EBUSY;
	ClearPageAnonExclusive(page);

	/*
	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
	 * gup_must_unshare().
	 */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb__after_atomic();
	return 0;
}

/**
 * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page
 *				   mapped by a PTE possibly shared to prepare
 *				   for KSM or temporary unmapping
 * @folio:	The folio to share a mapping of
 * @page:	The mapped exclusive page
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during
 * fork() to duplicate mappings, but instead to prepare for KSM or temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte().
 *
 * Marking the mapped page shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function
 * cannot fail for them.
 *
 * Returns 0 if marking the mapped page possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
		struct page *page)
{
	return __folio_try_share_anon_rmap(folio, page, 1, PGTABLE_LEVEL_PTE);
}
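
/*
 * Illustration only: a rough sketch of how the swap-out path in
 * try_to_unmap_one() (mm/rmap.c) uses this helper after clearing the PTE;
 * the swap entry setup and error paths are heavily simplified here:
 *
 *	pteval = ptep_clear_flush(vma, address, pvmw.pte);
 *	...
 *	if (folio_try_share_anon_rmap_pte(folio, subpage)) {
 *		// Folio may be pinned: restore the PTE and give up on
 *		// swapping this page out for now.
 *		set_pte_at(mm, address, pvmw.pte, pteval);
 *		goto abort;
 *	}
 *	set_pte_at(mm, address, pvmw.pte, swp_entry_to_pte(entry));
 *	folio_remove_rmap_pte(folio, subpage, vma);
 */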

/**
 * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page
 *				   range mapped by a PMD possibly shared to
 *				   prepare for temporary unmapping
 * @folio:	The folio to share the mapping of
 * @page:	The first page to share the mapping of
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during
 * fork() to duplicate a mapping, but instead to prepare for temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd().
 *
 * Marking the mapped pages shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function
 * cannot fail for them.
 *
 * Returns 0 if marking the mapped pages possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
		struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
					   PGTABLE_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
	return -EBUSY;
#endif
}

/*
 * Called from mm/vmscan.c to handle paging out
 */
int folio_referenced(struct folio *, int is_locked,
			struct mem_cgroup *memcg, vm_flags_t *vm_flags);

void try_to_migrate(struct folio *folio, enum ttu_flags flags);
void try_to_unmap(struct folio *, enum ttu_flags flags);

struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
		void *owner, struct folio **foliop);

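/*
 * Illustration only: callers combine ttu_flags depending on context. For
 * example, reclaim (see shrink_folio_list() in mm/vmscan.c) batches TLB
 * flushes and may ask for huge PMDs to be split first; the exact flag set
 * below is a simplified assumption, not a copy of the real logic:
 *
 *	enum ttu_flags flags = TTU_BATCH_FLUSH;
 *
 *	if (folio_test_pmd_mappable(folio))
 *		flags |= TTU_SPLIT_HUGE_PMD;
 *	try_to_unmap(folio, flags);
 *	if (!folio_mapped(folio))
 *		// all PTEs are gone; the folio can move on towards pageout
 *		...;
 */
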
/* Avoid racy checks */
#define PVMW_SYNC		(1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION		(1 << 1)

/* Result flags */

/* The page is mapped across page table boundary */
#define PVMW_PGTABLE_CROSSED	(1 << 16)

struct page_vma_mapped_walk {
	unsigned long pfn;
	unsigned long nr_pages;
	pgoff_t pgoff;
	struct vm_area_struct *vma;
	unsigned long address;
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;
	unsigned int flags;
};

#define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags)	\
	struct page_vma_mapped_walk name = {				\
		.pfn = folio_pfn(_folio),				\
		.nr_pages = folio_nr_pages(_folio),			\
		.pgoff = folio_pgoff(_folio),				\
		.vma = _vma,						\
		.address = _address,					\
		.flags = _flags,					\
	}

static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
{
	/* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
	if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma))
		pte_unmap(pvmw->pte);
	if (pvmw->ptl)
		spin_unlock(pvmw->ptl);
}
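
/*
 * Illustration only: the canonical way rmap_one() implementations iterate
 * over every page table entry mapping a folio inside one VMA. A minimal
 * sketch of the pattern used throughout mm/rmap.c; "need_to_stop" is a
 * placeholder for whatever condition ends the walk early, and the per-PTE
 * work is left out:
 *
 *	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
 *
 *	while (page_vma_mapped_walk(&pvmw)) {
 *		// pvmw.pte (or pvmw.pmd) now points at a mapping of the
 *		// folio, with pvmw.ptl held; inspect or modify it here.
 *		if (need_to_stop) {
 *			page_vma_mapped_walk_done(&pvmw);
 *			break;
 *		}
 *	}
 */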

/**
 * page_vma_mapped_walk_restart - Restart the page table walk.
 * @pvmw: Pointer to struct page_vma_mapped_walk.
 *
 * It restarts the page table walk when changes occur in the page
 * table, such as splitting a PMD. Ensures that the PTL held during
 * the previous walk is released and resets the state to allow for
 * a new walk starting at the current address stored in pvmw->address.
 */
static inline void
page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw)
{
	WARN_ON_ONCE(!pvmw->pmd && !pvmw->pte);

	if (likely(pvmw->ptl))
		spin_unlock(pvmw->ptl);
	else
		WARN_ON_ONCE(1);

	pvmw->ptl = NULL;
	pvmw->pmd = NULL;
	pvmw->pte = NULL;
}

bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
unsigned long page_address_in_vma(const struct folio *folio,
		const struct page *, const struct vm_area_struct *);

/*
 * Cleans the PTEs of shared mappings.
 * (and since clean PTEs should also be readonly, write protects them too)
 *
 * returns the number of cleaned PTEs.
 */
int folio_mkclean(struct folio *);

int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
		unsigned long pfn, unsigned long nr_pages);

int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
		      struct vm_area_struct *vma);

enum rmp_flags {
	RMP_LOCKED		= 1 << 0,
	RMP_USE_SHARED_ZEROPAGE	= 1 << 1,
};

void remove_migration_ptes(struct folio *src, struct folio *dst, int flags);

/*
 * rmap_walk_control: To control rmap traversing for specific needs
 *
 * arg: passed to rmap_one() and invalid_vma()
 * try_lock: bail out if the rmap lock is contended
 * contended: indicate the rmap traversal bailed out due to lock contention
 * rmap_one: executed on each vma where page is mapped
 * done: for checking the traversal termination condition
 * anon_lock: for taking the anon_vma lock in an optimized way rather than
 *	      the default
 * invalid_vma: for skipping uninteresting vmas
 */
struct rmap_walk_control {
	void *arg;
	bool try_lock;
	bool contended;
	/*
	 * Return false if page table scanning in rmap_walk should be stopped.
	 * Otherwise, return true.
	 */
	bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
					unsigned long addr, void *arg);
	int (*done)(struct folio *folio);
	struct anon_vma *(*anon_lock)(const struct folio *folio,
				      struct rmap_walk_control *rwc);
	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};

void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
					  struct rmap_walk_control *rwc);
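
/*
 * Illustration only: a minimal sketch of how a caller drives an rmap walk,
 * loosely modeled on folio_referenced() in mm/rmap.c. my_rmap_one() and
 * my_arg are hypothetical; real callers also provide invalid_vma/anon_lock
 * hooks and interpret rwc.contended when try_lock is used:
 *
 *	struct rmap_walk_control rwc = {
 *		.rmap_one = my_rmap_one,
 *		.arg = (void *)&my_arg,
 *		.try_lock = true,
 *	};
 *
 *	rmap_walk(folio, &rwc);
 *	if (rwc.contended)
 *		// the rmap lock was contended and the walk bailed out early
 *		...;
 */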

#else	/* !CONFIG_MMU */

#define anon_vma_init()		do {} while (0)
#define anon_vma_prepare(vma)	(0)

static inline int folio_referenced(struct folio *folio, int is_locked,
				  struct mem_cgroup *memcg,
				  vm_flags_t *vm_flags)
{
	*vm_flags = 0;
	return 0;
}

static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
}

static inline int folio_mkclean(struct folio *folio)
{
	return 0;
}
#endif	/* CONFIG_MMU */

#endif	/* _LINUX_RMAP_H */