internal.h source code [Linux/mm/internal.h]

/* SPDX-License-Identifier: GPL-2.0-or-later */
/* internal.h: mm/ internal definitions
 *
 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */
7	#ifndef __MM_INTERNAL_H
8	#define __MM_INTERNAL_H
9
10	#include <linux/fs.h>
11	#include <linux/khugepaged.h>
12	#include <linux/mm.h>
13	#include <linux/mm_inline.h>
14	#include <linux/pagemap.h>
15	#include <linux/pagewalk.h>
16	#include <linux/rmap.h>
17	#include <linux/swap.h>
18	#include <linux/swapops.h>
19	#include <linux/swap_cgroup.h>
20	#include <linux/tracepoint-defs.h>
21
/* Internal core VMA manipulation functions. */
23	#include "vma.h"
24
25	struct folio_batch;
26
/*
 * Maintains state across a page table move. The operation assumes both source
 * and destination VMAs already exist and are specified by the user.
 *
 * Partial moves are permitted, but the old and new ranges must both reside
 * within a VMA.
 *
 * mmap lock must be held in write and VMA write locks must be held on any VMA
 * that is visible.
 *
 * Use the PAGETABLE_MOVE() macro to initialise this struct.
 *
 * The old_addr and new_addr fields are updated as the page table move is
 * executed.
 *
 * NOTE: The page table move is affected by reading from [old_addr, old_end),
 * and old_addr may be updated for better page table alignment, so len_in
 * represents the length of the range being copied as specified by the user.
 */
struct pagetable_move_control {
	struct vm_area_struct *old; /* Source VMA. */
	struct vm_area_struct *new; /* Destination VMA. */
	unsigned long old_addr; /* Address from which the move begins. */
	unsigned long old_end; /* Exclusive address at which old range ends. */
	unsigned long new_addr; /* Address to move page tables to. */
	unsigned long len_in; /* Bytes to remap specified by user. */

	bool need_rmap_locks; /* Do rmap locks need to be taken? */
	bool for_stack; /* Is this an early temp stack being moved? */
};

/*
 * Declare and initialise a struct pagetable_move_control called @name.
 * old_end is derived as old_addr_ + len_; fields not listed here
 * (need_rmap_locks, for_stack) are zero-initialised.
 */
#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_)	\
	struct pagetable_move_control name = {				\
		.old = old_,						\
		.new = new_,						\
		.old_addr = old_addr_,					\
		.old_end = (old_addr_) + (len_),			\
		.new_addr = new_addr_,					\
		.len_in = len_,						\
	}
67
/*
 * The set of flags that only affect watermark checking and reclaim
 * behaviour. This is used by the MM to obey the caller constraints
 * about IO, FS and watermark checking while ignoring placement
 * hints such as HIGHMEM usage.
 */
#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
			__GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
			__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
			__GFP_NOLOCKDEP)

/* The GFP flags allowed during early boot */
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))

/* Control allocation cpuset and node placement constraints */
#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)

/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
87
/*
 * Different from WARN_ON_ONCE(), no warning will be issued
 * when we specify __GFP_NOWARN.
 *
 * Evaluates to the truth value of @cond (wrapped in unlikely()), whether or
 * not a warning was actually emitted.
 */
#define WARN_ON_ONCE_GFP(cond, gfp)	({				\
	static bool __section(".data..once") __warned;			\
	int __ret_warn_once = !!(cond);					\
									\
	/* (gfp) is parenthesised so expression arguments bind correctly. */ \
	if (unlikely(!((gfp) & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \
		__warned = true;					\
		WARN_ON(1);						\
	}								\
	unlikely(__ret_warn_once);					\
})
102
103	void page_writeback_init(void);
104
105	/*
106	* If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages,
107	* its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit
108	* above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently
109	* leaves nr_pages_mapped at 0, but avoid surprise if it participates later.
110	*/
111	#define ENTIRELY_MAPPED 0x800000
112	#define FOLIO_PAGES_MAPPED (ENTIRELY_MAPPED - 1)
113
114	/*
115	* Flags passed to __show_mem() and show_free_areas() to suppress output in
116	* various contexts.
117	*/
118	#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */
119
120	/*
121	* How many individual pages have an elevated _mapcount. Excludes
122	* the folio's entire_mapcount.
123	*
124	* Don't use this function outside of debugging code.
125	*/
126	static inline int folio_nr_pages_mapped(const struct folio *folio)
127	{
128	if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT))
129	return -`1`;
130	return atomic_read(v: &folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED;
131	}
132
133	/*
134	* Retrieve the first entry of a folio based on a provided entry within the
135	* folio. We cannot rely on folio->swap as there is no guarantee that it has
136	* been initialized. Used for calling arch_swap_restore()
137	*/
138	static inline swp_entry_t folio_swap(swp_entry_t entry,
139	const struct folio *folio)
140	{
141	swp_entry_t swap = {
142	.val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)),
143	};
144
145	return swap;
146	}
147
148	static inline void folio_raw_mapping(const* struct folio *folio)
149	{
150	unsigned long mapping = (unsigned long)folio->mapping;
151
152	return (void *)(mapping & ~FOLIO_MAPPING_FLAGS);
153	}
154
155	/*
156	* This is a file-backed mapping, and is about to be memory mapped - invoke its
157	* mmap hook and safely handle error conditions. On error, VMA hooks will be
158	* mutated.
159	*
160	* @file: File which backs the mapping.
161	* @vma: VMA which we are mapping.
162	*
163	* Returns: 0 if success, error otherwise.
164	*/
165	static inline int mmap_file(struct file file, struct* vm_area_struct *vma)
166	{
167	int err = vfs_mmap(file, vma);
168
169	if (likely(!err))
170	return `0`;
171
172	/*
173	* OK, we tried to call the file hook for mmap(), but an error
174	* arose. The mapping is in an inconsistent state and we most not invoke
175	* any further hooks on it.
176	*/
177	vma->vm_ops = &vma_dummy_vm_ops;
178
179	return err;
180	}
181
182	/*
183	* If the VMA has a close hook then close it, and since closing it might leave
184	* it in an inconsistent state which makes the use of any hooks suspect, clear
185	* them down by installing dummy empty hooks.
186	*/
187	static inline void vma_close(struct vm_area_struct *vma)
188	{
189	if (vma->vm_ops && vma->vm_ops->close) {
190	vma->vm_ops->close(vma);
191
192	/*
193	* The mapping is in an inconsistent state, and no further hooks
194	* may be invoked upon it.
195	*/
196	vma->vm_ops = &vma_dummy_vm_ops;
197	}
198	}
199
200	#ifdef CONFIG_MMU
201
202	/ Flags for folio_pte_batch(). /
203	typedef int __bitwise fpb_t;
204
205	/ Compare PTEs respecting the dirty bit. /
206	#define FPB_RESPECT_DIRTY ((__force fpb_t)BIT(0))
207
208	/ Compare PTEs respecting the soft-dirty bit. /
209	#define FPB_RESPECT_SOFT_DIRTY ((__force fpb_t)BIT(1))
210
211	/ Compare PTEs respecting the writable bit. /
212	#define FPB_RESPECT_WRITE ((__force fpb_t)BIT(2))
213
214	/*
215	* Merge PTE write bits: if any PTE in the batch is writable, modify the
216	* PTE at @ptentp to be writable.
217	*/
218	#define FPB_MERGE_WRITE ((__force fpb_t)BIT(3))
219
220	/*
221	* Merge PTE young and dirty bits: if any PTE in the batch is young or dirty,
222	* modify the PTE at @ptentp to be young or dirty, respectively.
223	*/
224	#define FPB_MERGE_YOUNG_DIRTY ((__force fpb_t)BIT(4))
225
226	static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
227	{
228	if (!(flags & FPB_RESPECT_DIRTY))
229	pte = pte_mkclean(pte);
230	if (likely(!(flags & FPB_RESPECT_SOFT_DIRTY)))
231	pte = pte_clear_soft_dirty(pte);
232	if (likely(!(flags & FPB_RESPECT_WRITE)))
233	pte = pte_wrprotect(pte);
234	return pte_mkold(pte);
235	}
236
237	/**
238	* folio_pte_batch_flags - detect a PTE batch for a large folio
239	* @folio: The large folio to detect a PTE batch for.
240	* @vma: The VMA. Only relevant with FPB_MERGE_WRITE, otherwise can be NULL.
241	* @ptep: Page table pointer for the first entry.
242	* @ptentp: Pointer to a COPY of the first page table entry whose flags this
243	* function updates based on @flags if appropriate.
244	* @max_nr: The maximum number of table entries to consider.
245	* @flags: Flags to modify the PTE batch semantics.
246	*
247	* Detect a PTE batch: consecutive (present) PTEs that map consecutive
248	* pages of the same large folio in a single VMA and a single page table.
249	*
250	* All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
251	* the accessed bit, writable bit, dirty bit (unless FPB_RESPECT_DIRTY is set)
252	* and soft-dirty bit (unless FPB_RESPECT_SOFT_DIRTY is set).
253	*
254	* @ptep must map any page of the folio. max_nr must be at least one and
255	* must be limited by the caller so scanning cannot exceed a single VMA and
256	* a single page table.
257	*
258	* Depending on the FPB_MERGE_* flags, the pte stored at @ptentp will
259	* be updated: it's crucial that a pointer to a COPY of the first
260	* page table entry, obtained through ptep_get(), is provided as @ptentp.
261	*
262	* This function will be inlined to optimize based on the input parameters;
263	* consider using folio_pte_batch() instead if applicable.
264	*
265	* Return: the number of table entries in the batch.
266	*/
267	static inline unsigned int folio_pte_batch_flags(struct folio *folio,
268	struct vm_area_struct vma, pte_t ptep, pte_t *ptentp,
269	unsigned int max_nr, fpb_t flags)
270	{
271	bool any_writable = false, any_young = false, any_dirty = false;
272	pte_t expected_pte, pte = *ptentp;
273	unsigned int nr, cur_nr;
274
275	VM_WARN_ON_FOLIO(!pte_present(pte), folio);
276	VM_WARN_ON_FOLIO(!folio_test_large(folio) \|\| max_nr < `1`, folio);
277	VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
278	/*
279	* Ensure this is a pointer to a copy not a pointer into a page table.
280	* If this is a stack value, it won't be a valid virtual address, but
281	* that's fine because it also cannot be pointing into the page table.
282	*/
283	VM_WARN_ON(virt_addr_valid(ptentp) && PageTable(virt_to_page(ptentp)));
284
285	/ Limit max_nr to the actual remaining PFNs in the folio we could batch. /
286	max_nr = min_t(unsigned long, max_nr,
287	folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte));
288
289	nr = pte_batch_hint(ptep, pte);
290	expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
291	ptep = ptep + nr;
292
293	while (nr < max_nr) {
294	pte = ptep_get(ptep);
295
296	if (!pte_same(a: __pte_batch_clear_ignored(pte, flags), b: expected_pte))
297	break;
298
299	if (flags & FPB_MERGE_WRITE)
300	any_writable \|= pte_write(pte);
301	if (flags & FPB_MERGE_YOUNG_DIRTY) {
302	any_young \|= pte_young(pte);
303	any_dirty \|= pte_dirty(pte);
304	}
305
306	cur_nr = pte_batch_hint(ptep, pte);
307	expected_pte = pte_advance_pfn(pte: expected_pte, nr: cur_nr);
308	ptep += cur_nr;
309	nr += cur_nr;
310	}
311
312	if (any_writable)
313	ptentp = pte_mkwrite(pte: ptentp, vma);
314	if (any_young)
315	ptentp = pte_mkyoung(pte: ptentp);
316	if (any_dirty)
317	ptentp = pte_mkdirty(pte: ptentp);
318
319	return min(nr, max_nr);
320	}
321
322	unsigned int folio_pte_batch(struct folio folio, pte_t ptep, pte_t pte,
323	unsigned int max_nr);
324
325	/**
326	* pte_move_swp_offset - Move the swap entry offset field of a swap pte
327	* forward or backward by delta
328	* @pte: The initial pte state; is_swap_pte(pte) must be true and
329	* non_swap_entry() must be false.
330	* @delta: The direction and the offset we are moving; forward if delta
331	* is positive; backward if delta is negative
332	*
333	* Moves the swap offset, while maintaining all other fields, including
334	* swap type, and any swp pte bits. The resulting pte is returned.
335	*/
336	static inline pte_t pte_move_swp_offset(pte_t pte, long delta)
337	{
338	swp_entry_t entry = pte_to_swp_entry(pte);
339	pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
340	(swp_offset(entry) + delta)));
341
342	if (pte_swp_soft_dirty(pte))
343	new = pte_swp_mksoft_dirty(pte: new);
344	if (pte_swp_exclusive(pte))
345	new = pte_swp_mkexclusive(pte: new);
346	if (pte_swp_uffd_wp(pte))
347	new = pte_swp_mkuffd_wp(pte: new);
348
349	return new;
350	}
351
352
353	/**
354	* pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
355	* @pte: The initial pte state; is_swap_pte(pte) must be true and
356	* non_swap_entry() must be false.
357	*
358	* Increments the swap offset, while maintaining all other fields, including
359	* swap type, and any swp pte bits. The resulting pte is returned.
360	*/
361	static inline pte_t pte_next_swp_offset(pte_t pte)
362	{
363	return pte_move_swp_offset(pte, delta: `1`);
364	}
365
366	/**
367	* swap_pte_batch - detect a PTE batch for a set of contiguous swap entries
368	* @start_ptep: Page table pointer for the first entry.
369	* @max_nr: The maximum number of table entries to consider.
370	* @pte: Page table entry for the first entry.
371	*
372	* Detect a batch of contiguous swap entries: consecutive (non-present) PTEs
373	* containing swap entries all with consecutive offsets and targeting the same
374	* swap type, all with matching swp pte bits.
375	*
376	* max_nr must be at least one and must be limited by the caller so scanning
377	* cannot exceed a single page table.
378	*
379	* Return: the number of table entries in the batch.
380	*/
381	static inline int swap_pte_batch(pte_t start_ptep, int* max_nr, pte_t pte)
382	{
383	pte_t expected_pte = pte_next_swp_offset(pte);
384	const pte_t *end_ptep = start_ptep + max_nr;
385	swp_entry_t entry = pte_to_swp_entry(pte);
386	pte_t *ptep = start_ptep + `1`;
387	unsigned short cgroup_id;
388
389	VM_WARN_ON(max_nr < `1`);
390	VM_WARN_ON(!is_swap_pte(pte));
391	VM_WARN_ON(non_swap_entry(entry));
392
393	cgroup_id = lookup_swap_cgroup_id(ent: entry);
394	while (ptep < end_ptep) {
395	pte = ptep_get(ptep);
396
397	if (!pte_same(a: pte, b: expected_pte))
398	break;
399	if (lookup_swap_cgroup_id(ent: pte_to_swp_entry(pte)) != cgroup_id)
400	break;
401	expected_pte = pte_next_swp_offset(pte: expected_pte);
402	ptep++;
403	}
404
405	return ptep - start_ptep;
406	}
407	#endif /* CONFIG_MMU */
408
409	void __acct_reclaim_writeback(pg_data_t pgdat, struct* folio *folio,
410	int nr_throttled);
411	static inline void acct_reclaim_writeback(struct folio *folio)
412	{
413	pg_data_t *pgdat = folio_pgdat(folio);
414	int nr_throttled = atomic_read(v: &pgdat->nr_writeback_throttled);
415
416	if (nr_throttled)
417	__acct_reclaim_writeback(pgdat, folio, nr_throttled);
418	}
419
420	static inline void wake_throttle_isolated(pg_data_t *pgdat)
421	{
422	wait_queue_head_t *wqh;
423
424	wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED];
425	if (waitqueue_active(wq_head: wqh))
426	wake_up(wqh);
427	}
428
429	vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf);
430	static inline vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
431	{
432	vm_fault_t ret = __vmf_anon_prepare(vmf);
433
434	if (unlikely(ret & VM_FAULT_RETRY))
435	vma_end_read(vma: vmf->vma);
436	return ret;
437	}
438
439	vm_fault_t do_swap_page(struct vm_fault *vmf);
440	void folio_rotate_reclaimable(struct folio *folio);
441	bool __folio_end_writeback(struct folio *folio);
442	void deactivate_file_folio(struct folio *folio);
443	void folio_activate(struct folio *folio);
444
445	void free_pgtables(struct mmu_gather tlb, struct* ma_state *mas,
446	struct vm_area_struct start_vma, unsigned* long floor,
447	unsigned long ceiling, bool mm_wr_locked);
448	void pmd_install(struct mm_struct mm, pmd_t pmd, pgtable_t *pte);
449
450	struct zap_details;
451	void unmap_page_range(struct mmu_gather *tlb,
452	struct vm_area_struct *vma,
453	unsigned long addr, unsigned long end,
454	struct zap_details *details);
455	void zap_page_range_single_batched(struct mmu_gather *tlb,
456	struct vm_area_struct vma, unsigned* long addr,
457	unsigned long size, struct zap_details *details);
458	int folio_unmap_invalidate(struct address_space mapping, struct* folio *folio,
459	gfp_t gfp);
460
461	void page_cache_ra_order(struct readahead_control , struct* file_ra_state *);
462	void force_page_cache_ra(struct readahead_control , unsigned* long nr);
463	static inline void force_page_cache_readahead(struct address_space *mapping,
464	struct file file, pgoff_t index, unsigned* long nr_to_read)
465	{
466	DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index);
467	force_page_cache_ra(&ractl, nr: nr_to_read);
468	}
469
470	unsigned find_lock_entries(struct address_space mapping, pgoff_t start,
471	pgoff_t end, struct folio_batch fbatch, pgoff_t indices);
472	unsigned find_get_entries(struct address_space mapping, pgoff_t start,
473	pgoff_t end, struct folio_batch fbatch, pgoff_t indices);
474	void filemap_free_folio(struct address_space mapping, struct* folio *folio);
475	int truncate_inode_folio(struct address_space mapping, struct* folio *folio);
476	bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
477	loff_t end);
478	long mapping_evict_folio(struct address_space mapping, struct* folio *folio);
479	unsigned long mapping_try_invalidate(struct address_space *mapping,
480	pgoff_t start, pgoff_t end, unsigned long *nr_failed);
481
/**
 * folio_evictable - Test whether a folio is evictable.
 * @folio: The folio to test.
 *
 * Test whether @folio is evictable -- i.e., should be placed on
 * active/inactive lists vs unevictable list.
 *
 * Reasons folio might not be evictable:
 * 1. folio's mapping marked unevictable
 * 2. One of the pages in the folio is part of an mlocked VMA
 *
 * Return: true if the folio is evictable, false otherwise.
 */
static inline bool folio_evictable(struct folio *folio)
{
	bool ret;

	/* Prevent address_space of inode and swap cache from being freed */
	rcu_read_lock();
	ret = !mapping_unevictable(folio_mapping(folio)) &&
			!folio_test_mlocked(folio);
	rcu_read_unlock();
	return ret;
}
504
/*
 * Turn a non-refcounted page (->_refcount == 0) into refcounted with
 * a count of one. The debug checks reject tail pages and pages whose
 * refcount is already non-zero.
 */
static inline void set_page_refcounted(struct page *page)
{
	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(page_ref_count(page), page);
	set_page_count(page, 1);
}
515
/*
 * Return true if a folio needs ->release_folio() calling upon it: either the
 * folio has private data, or it has a mapping for which
 * mapping_release_always() is true.
 */
static inline bool folio_needs_release(struct folio *folio)
{
	struct address_space *mapping = folio_mapping(folio);

	return folio_has_private(folio) ||
		(mapping && mapping_release_always(mapping));
}
526
527	extern unsigned long highest_memmap_pfn;
528
/*
 * Maximum number of reclaim retries without progress before the OOM
 * killer is considered the only way forward.
 */
533	#define MAX_RECLAIM_RETRIES 16
534
535	/*
536	* in mm/vmscan.c:
537	*/
538	bool folio_isolate_lru(struct folio *folio);
539	void folio_putback_lru(struct folio *folio);
540	extern void reclaim_throttle(pg_data_t pgdat, enum* vmscan_throttle_state reason);
541	#ifdef CONFIG_NUMA
542	int user_proactive_reclaim(char *buf,
543	struct mem_cgroup memcg, pg_data_t pgdat);
544	#else
545	static inline int user_proactive_reclaim(char *buf,
546	struct mem_cgroup memcg, pg_data_t pgdat)
547	{
548	return `0`;
549	}
550	#endif
551
552	/*
553	* in mm/rmap.c:
554	*/
555	pmd_t mm_find_pmd(struct* mm_struct mm, unsigned* long address);
556
557	/*
558	* in mm/page_alloc.c
559	*/
560	#define K(x) ((x) << (PAGE_SHIFT-10))
561
562	extern char * const zone_names[MAX_NR_ZONES];
563
/* perform sanity checks on struct pages being allocated or freed */
565	DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
566
567	extern int min_free_kbytes;
568	extern int defrag_mode;
569
570	void setup_per_zone_wmarks(void);
571	void calculate_min_free_kbytes(void);
572	int __meminit init_per_zone_wmark_min(void);
573	void page_alloc_sysctl_init(void);
574
575	/*
576	* Structure for holding the mostly immutable allocation parameters passed
577	* between functions involved in allocations, including the alloc_pages*
578	* family of functions.
579	*
580	* nodemask, migratetype and highest_zoneidx are initialized only once in
581	* __alloc_pages() and then never change.
582	*
583	* zonelist, preferred_zone and highest_zoneidx are set first in
584	* __alloc_pages() for the fast path, and might be later changed
585	* in __alloc_pages_slowpath(). All other functions pass the whole structure
586	* by a const pointer.
587	*/
588	struct alloc_context {
589	struct zonelist *zonelist;
590	nodemask_t *nodemask;
591	struct zoneref *preferred_zoneref;
592	int migratetype;
593
594	/*
595	* highest_zoneidx represents highest usable zone index of
596	* the allocation request. Due to the nature of the zone,
597	* memory on lower zone than the highest_zoneidx will be
598	* protected by lowmem_reserve[highest_zoneidx].
599	*
600	* highest_zoneidx is also used by reclaim/compaction to limit
601	* the target zone since higher zone than this index cannot be
602	* usable for this allocation request.
603	*/
604	enum zone_type highest_zoneidx;
605	bool spread_dirty_pages;
606	};
607
/*
 * This function returns the order of a free page in the buddy system. In
 * general, page_zone(page)->lock must be held by the caller to prevent the
 * page from being allocated in parallel and returning garbage as the order.
 * If a caller does not hold page_zone(page)->lock, it must guarantee that the
 * page cannot be allocated or merged in parallel. Alternatively, it must
 * handle invalid values gracefully, and use buddy_order_unsafe() below.
 */
static inline unsigned int buddy_order(struct page *page)
{
	/* PageBuddy() must be checked by the caller */
	return page_private(page);
}
621
622	/*
623	* Like buddy_order(), but for callers who cannot afford to hold the zone lock.
624	* PageBuddy() should be checked first by the caller to minimize race window,
625	* and invalid values must be handled gracefully.
626	*
627	* READ_ONCE is used so that if the caller assigns the result into a local
628	* variable and e.g. tests it for valid range before using, the compiler cannot
629	* decide to remove the variable and inline the page_private(page) multiple
630	* times, potentially observing different values in the tests and the actual
631	* use of the result.
632	*/
633	#define buddy_order_unsafe(page) READ_ONCE(page_private(page))
634
/*
 * This function checks whether a page is free && is the buddy
 * we can coalesce a page and its buddy if
 * (a) the buddy is not in a hole (check before calling!) &&
 * (b) the buddy is in the buddy system &&
 * (c) a page and its buddy have the same order &&
 * (d) a page and its buddy are in the same zone.
 *
 * For recording whether a page is in the buddy system, we set PageBuddy.
 * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
 *
 * For recording page's order, we use page_private(page).
 */
static inline bool page_is_buddy(struct page *page, struct page *buddy,
				 unsigned int order)
{
	if (!page_is_guard(buddy) && !PageBuddy(buddy))
		return false;

	if (buddy_order(buddy) != order)
		return false;

	/*
	 * zone check is done late to avoid uselessly calculating
	 * zone/node ids for pages that could never merge.
	 */
	if (page_zone_id(page) != page_zone_id(buddy))
		return false;

	VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

	return true;
}
668
/*
 * Locate the struct page for both the matching buddy in our
 * pair (buddy1) and the combined O(n+1) page they form (page).
 *
 * 1) Any buddy B1 will have an order O twin B2 which satisfies
 * the following equation:
 *     B2 = B1 ^ (1 << O)
 * For example, if the starting buddy (buddy2) is #8 its order
 * 1 buddy is #10:
 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 *
 * 2) Any buddy B will have an order O+1 parent P which
 * satisfies the following equation:
 *     P = B & ~(1 << O)
 *
 * Assumption: *_mem_map is contiguous at least up to MAX_PAGE_ORDER
 */
static inline unsigned long
__find_buddy_pfn(unsigned long page_pfn, unsigned int order)
{
	/*
	 * Shift an unsigned long so the mask has the same width as the pfn
	 * and the shift stays defined for any order below BITS_PER_LONG.
	 */
	return page_pfn ^ (1UL << order);
}
691
692	/*
693	* Find the buddy of @page and validate it.
694	* @page: The input page
695	* @pfn: The pfn of the page, it saves a call to page_to_pfn() when the
696	* function is used in the performance-critical __free_one_page().
697	* @order: The order of the page
698	* @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to
699	* page_to_pfn().
700	*
701	* The found buddy can be a non PageBuddy, out of @page's zone, or its order is
702	* not the same as @page. The validation is necessary before use it.
703	*
704	* Return: the found buddy page or NULL if not found.
705	*/
706	static inline struct page find_buddy_page_pfn(struct* page *page,
707	unsigned long pfn, unsigned int order, unsigned long *buddy_pfn)
708	{
709	unsigned long __buddy_pfn = __find_buddy_pfn(page_pfn: pfn, order);
710	struct page *buddy;
711
712	buddy = page + (__buddy_pfn - pfn);
713	if (buddy_pfn)
714	*buddy_pfn = __buddy_pfn;
715
716	if (page_is_buddy(page, buddy, order))
717	return buddy;
718	return NULL;
719	}
720
721	extern struct page __pageblock_pfn_to_page(unsigned* long start_pfn,
722	unsigned long end_pfn, struct zone *zone);
723
724	static inline struct page pageblock_pfn_to_page(unsigned* long start_pfn,
725	unsigned long end_pfn, struct zone *zone)
726	{
727	if (zone->contiguous)
728	return pfn_to_page(start_pfn);
729
730	return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
731	}
732
733	void set_zone_contiguous(struct zone *zone);
734	bool pfn_range_intersects_zones(int nid, unsigned long start_pfn,
735	unsigned long nr_pages);
736
737	static inline void clear_zone_contiguous(struct zone *zone)
738	{
739	zone->contiguous = false;
740	}
741
extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __putback_isolated_page(struct page *page, unsigned int order,
				    int mt);
extern void memblock_free_pages(struct page *page, unsigned long pfn,
					unsigned int order);
extern void __free_pages_core(struct page *page, unsigned int order,
		enum meminit_context context);
749
750	/*
751	* This will have no effect, other than possibly generating a warning, if the
752	* caller passes in a non-large folio.
753	*/
754	static inline void folio_set_order(struct folio folio, unsigned* int order)
755	{
756	if (WARN_ON_ONCE(!order \|\| !folio_test_large(folio)))
757	return;
758	VM_WARN_ON_ONCE(order > MAX_FOLIO_ORDER);
759
760	folio->_flags_1 = (folio->_flags_1 & ~`0xffUL`) \| order;
761	#ifdef NR_PAGES_IN_LARGE_FOLIO
762	folio->_nr_pages = `1U` << order;
763	#endif
764	}
765
766	bool __folio_unqueue_deferred_split(struct folio *folio);
767	static inline bool folio_unqueue_deferred_split(struct folio *folio)
768	{
769	if (folio_order(folio) <= `1` \|\| !folio_test_large_rmappable(folio))
770	return false;
771
772	/*
773	* At this point, there is no one trying to add the folio to
774	* deferred_list. If folio is not in deferred_list, it's safe
775	* to check without acquiring the split_queue_lock.
776	*/
777	if (data_race(list_empty(&folio->_deferred_list)))
778	return false;
779
780	return __folio_unqueue_deferred_split(folio);
781	}
782
/*
 * Reinterpret @page as a folio and, if it is non-NULL and large, flag it as
 * large-rmappable. Returns the folio (NULL when @page is NULL).
 */
static inline struct folio *page_rmappable_folio(struct page *page)
{
	struct folio *folio = (struct folio *)page;

	if (folio && folio_test_large(folio))
		folio_set_large_rmappable(folio);
	return folio;
}
791
792	static inline void prep_compound_head(struct page page, unsigned* int order)
793	{
794	struct folio folio = (struct* folio *)page;
795
796	folio_set_order(folio, order);
797	atomic_set(v: &folio->_large_mapcount, i: -`1`);
798	if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
799	atomic_set(v: &folio->_nr_pages_mapped, i: `0`);
800	if (IS_ENABLED(CONFIG_MM_ID)) {
801	folio->_mm_ids = `0`;
802	folio->_mm_id_mapcount[`0`] = -`1`;
803	folio->_mm_id_mapcount[`1`] = -`1`;
804	}
805	if (IS_ENABLED(CONFIG_64BIT) \|\| order > `1`) {
806	atomic_set(v: &folio->_pincount, i: `0`);
807	atomic_set(v: &folio->_entire_mapcount, i: -`1`);
808	}
809	if (order > `1`)
810	INIT_LIST_HEAD(list: &folio->_deferred_list);
811	}
812
813	static inline void prep_compound_tail(struct page head, int* tail_idx)
814	{
815	struct page *p = head + tail_idx;
816
817	p->mapping = TAIL_MAPPING;
818	set_compound_head(page: p, head);
819	set_page_private(page: p, private: `0`);
820	}
821
822	void post_alloc_hook(struct page page, unsigned* int order, gfp_t gfp_flags);
823	extern bool free_pages_prepare(struct page page, unsigned* int order);
824
825	extern int user_min_free_kbytes;
826
827	struct page __alloc_frozen_pages_noprof(gfp_t, unsigned* int order, int nid,
828	nodemask_t *);
829	#define __alloc_frozen_pages(...) \
830	alloc_hooks(__alloc_frozen_pages_noprof(__VA_ARGS__))
831	void free_frozen_pages(struct page page, unsigned* int order);
832	void free_unref_folios(struct folio_batch *fbatch);
833
834	#ifdef CONFIG_NUMA
835	struct page alloc_frozen_pages_noprof(gfp_t, unsigned* int order);
836	#else
837	static inline struct page alloc_frozen_pages_noprof(gfp_t gfp, unsigned* int order)
838	{
839	return __alloc_frozen_pages_noprof(gfp, order, numa_node_id(), NULL);
840	}
841	#endif
842
843	#define alloc_frozen_pages(...) \
844	alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__))
845
846	struct page alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int* nid, unsigned int order);
847	#define alloc_frozen_pages_nolock(...) \
848	alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__))
849
850	extern void zone_pcp_reset(struct zone *zone);
851	extern void zone_pcp_disable(struct zone *zone);
852	extern void zone_pcp_enable(struct zone *zone);
853	extern void zone_pcp_init(struct zone *zone);
854
855	extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
856	phys_addr_t min_addr,
857	int nid, bool exact_nid);
858
void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
		unsigned long, enum meminit_context, struct vmem_altmap *, int,
		bool);
862
863	#if defined CONFIG_COMPACTION \|\| defined CONFIG_CMA
864
865	/*
866	* in mm/compaction.c
867	*/
868	/*
869	* compact_control is used to track pages being migrated and the free pages
870	* they are being migrated to during memory compaction. The free_pfn starts
871	* at the end of a zone and migrate_pfn begins at the start. Movable pages
872	* are moved to the end of a zone during a compaction run and the run
873	* completes when free_pfn <= migrate_pfn
874	*/
875	struct compact_control {
876	struct list_head freepages[NR_PAGE_ORDERS]; / List of free pages to migrate to /
877	struct list_head migratepages; / List of pages being migrated /
878	unsigned int nr_freepages; / Number of isolated free pages /
879	unsigned int nr_migratepages; / Number of pages to migrate /
880	unsigned long free_pfn; / isolate_freepages search base /
881	/*
882	* Acts as an in/out parameter to page isolation for migration.
883	* isolate_migratepages uses it as a search base.
884	* isolate_migratepages_block will update the value to the next pfn
885	* after the last isolated one.
886	*/
887	unsigned long migrate_pfn;
888	unsigned long fast_start_pfn; / a pfn to start linear scan from /
889	struct zone *zone;
890	unsigned long total_migrate_scanned;
891	unsigned long total_free_scanned;
892	unsigned short fast_search_fail;/ failures to use free list searches /
893	short search_order; / order to start a fast search at /
894	const gfp_t gfp_mask; / gfp mask of a direct compactor /
895	int order; / order a direct compactor needs /
896	int migratetype; / migratetype of direct compactor /
897	const unsigned int alloc_flags; / alloc flags of a direct compactor /
898	const int highest_zoneidx; / zone index of a direct compactor /
899	enum migrate_mode mode; / Async or sync migration mode /
900	bool ignore_skip_hint; / Scan blocks even if marked skip /
901	bool no_set_skip_hint; / Don't mark blocks for skipping /
902	bool ignore_block_suitable; / Scan blocks considered unsuitable /
903	bool direct_compaction; / False from kcompactd or /proc/... /
904	bool proactive_compaction; / kcompactd proactive compaction /
905	bool whole_zone; / Whole zone should/has been scanned /
906	bool contended; / Signal lock contention /
907	bool finish_pageblock; / Scan the remainder of a pageblock. Used*
908	* when there are potentially transient
909	* isolation or migration failures to
910	* ensure forward progress.
911	*/
912	bool alloc_contig; / alloc_contig_range allocation /
913	};
914
/*
 * Direct compaction uses this to grab a page straight off the free path the
 * moment one is created, instead of going through the freelists.
 */
struct capture_control {
	struct compact_control *cc;	/* compaction run doing the capture */
	struct page *page;		/* the captured page, if any */
};
923
/* Page isolation entry points operating on the pfn range [start, end). */
unsigned long isolate_freepages_range(struct compact_control *cc,
				      unsigned long start_pfn,
				      unsigned long end_pfn);
int isolate_migratepages_range(struct compact_control *cc,
			       unsigned long low_pfn,
			       unsigned long end_pfn);
930
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void init_cma_reserved_pageblock(struct page *page);
933
934	#endif /* CONFIG_COMPACTION \|\| CONFIG_CMA */
935
936	struct cma;
937
#ifdef CONFIG_CMA
void *cma_reserve_early(struct cma *cma, unsigned long size);
void init_cma_pageblock(struct page *page);
#else
/* Without CMA there is nothing to reserve from: always report failure. */
static inline void *cma_reserve_early(struct cma *cma, unsigned long size)
{
	return NULL;
}
static inline void init_cma_pageblock(struct page *page)
{
}
#endif
950
951
int find_suitable_fallback(struct free_area *area, unsigned int order,
			   int migratetype, bool claimable);
954
955	static inline bool free_area_empty(struct free_area area, int* migratetype)
956	{
957	return list_empty(head: &area->free_list[migratetype]);
958	}
959
/* mm/util.c */
struct anon_vma *folio_anon_vma(const struct folio *folio);
962
963	#ifdef CONFIG_MMU
964	void unmap_mapping_folio(struct folio *folio);
965	extern long populate_vma_page_range(struct vm_area_struct *vma,
966	unsigned long start, unsigned long end, int *locked);
967	extern long faultin_page_range(struct mm_struct mm, unsigned* long start,
968	unsigned long end, bool write, int *locked);
969	bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags,
970	unsigned long bytes);
971
972	/*
973	* NOTE: This function can't tell whether the folio is "fully mapped" in the
974	* range.
975	* "fully mapped" means all the pages of folio is associated with the page
976	* table of range while this function just check whether the folio range is
977	* within the range [start, end). Function caller needs to do page table
978	* check if it cares about the page table association.
979	*
980	* Typical usage (like mlock or madvise) is:
981	* Caller knows at least 1 page of folio is associated with page table of VMA
982	* and the range [start, end) is intersect with the VMA range. Caller wants
983	* to know whether the folio is fully associated with the range. It calls
984	* this function to check whether the folio is in the range first. Then checks
985	* the page table to know whether the folio is fully mapped to the range.
986	*/
987	static inline bool
988	folio_within_range(struct folio folio, struct* vm_area_struct *vma,
989	unsigned long start, unsigned long end)
990	{
991	pgoff_t pgoff, addr;
992	unsigned long vma_pglen = vma_pages(vma);
993
994	VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio);
995	if (start > end)
996	return false;
997
998	if (start < vma->vm_start)
999	start = vma->vm_start;
1000
1001	if (end > vma->vm_end)
1002	end = vma->vm_end;
1003
1004	pgoff = folio_pgoff(folio);
1005
1006	/ if folio start address is not in vma range /
1007	if (!in_range(pgoff, vma->vm_pgoff, vma_pglen))
1008	return false;
1009
1010	addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
1011
1012	return !(addr < start \|\| end - addr < folio_size(folio));
1013	}
1014
1015	static inline bool
1016	folio_within_vma(struct folio folio, struct* vm_area_struct *vma)
1017	{
1018	return folio_within_range(folio, vma, start: vma->vm_start, end: vma->vm_end);
1019	}
1020
1021	/*
1022	* mlock_vma_folio() and munlock_vma_folio():
1023	* should be called with vma's mmap_lock held for read or write,
1024	* under page table lock for the pte/pmd being added or removed.
1025	*
 * mlock is usually called at the end of folio_add_*_rmap_*(), munlock at
1027	* the end of folio_remove_rmap_*(); but new anon folios are managed by
1028	* folio_add_lru_vma() calling mlock_new_folio().
1029	*/
1030	void mlock_folio(struct folio *folio);
1031	static inline void mlock_vma_folio(struct folio *folio,
1032	struct vm_area_struct *vma)
1033	{
1034	/*
1035	* The VM_SPECIAL check here serves two purposes.
1036	* 1) VM_IO check prevents migration from double-counting during mlock.
1037	* 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED
1038	* is never left set on a VM_SPECIAL vma, there is an interval while
1039	* file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may
1040	* still be set while VM_SPECIAL bits are added: so ignore it then.
1041	*/
1042	if (unlikely((vma->vm_flags & (VM_LOCKED\|VM_SPECIAL)) == VM_LOCKED))
1043	mlock_folio(folio);
1044	}
1045
1046	void munlock_folio(struct folio *folio);
1047	static inline void munlock_vma_folio(struct folio *folio,
1048	struct vm_area_struct *vma)
1049	{
1050	/*
1051	* munlock if the function is called. Ideally, we should only
1052	* do munlock if any page of folio is unmapped from VMA and
1053	* cause folio not fully mapped to VMA.
1054	*
1055	* But it's not easy to confirm that's the situation. So we
1056	* always munlock the folio and page reclaim will correct it
1057	* if it's wrong.
1058	*/
1059	if (unlikely(vma->vm_flags & VM_LOCKED))
1060	munlock_folio(folio);
1061	}
1062
1063	void mlock_new_folio(struct folio *folio);
1064	bool need_mlock_drain(int cpu);
1065	void mlock_drain_local(void);
1066	void mlock_drain_remote(int cpu);
1067
1068	extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
1069
1070	/**
1071	* vma_address - Find the virtual address a page range is mapped at
1072	* @vma: The vma which maps this object.
1073	* @pgoff: The page offset within its object.
1074	* @nr_pages: The number of pages to consider.
1075	*
1076	* If any page in this range is mapped by this VMA, return the first address
1077	* where any of these pages appear. Otherwise, return -EFAULT.
1078	*/
1079	static inline unsigned long vma_address(const struct vm_area_struct *vma,
1080	pgoff_t pgoff, unsigned long nr_pages)
1081	{
1082	unsigned long address;
1083
1084	if (pgoff >= vma->vm_pgoff) {
1085	address = vma->vm_start +
1086	((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
1087	/ Check for address beyond vma (or wrapped through 0?) /
1088	if (address < vma->vm_start \|\| address >= vma->vm_end)
1089	address = -EFAULT;
1090	} else if (pgoff + nr_pages - `1` >= vma->vm_pgoff) {
1091	/ Test above avoids possibility of wrap to 0 on 32-bit /
1092	address = vma->vm_start;
1093	} else {
1094	address = -EFAULT;
1095	}
1096	return address;
1097	}
1098
1099	/*
1100	* Then at what user virtual address will none of the range be found in vma?
1101	* Assumes that vma_address() already returned a good starting address.
1102	*/
1103	static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw)
1104	{
1105	struct vm_area_struct *vma = pvmw->vma;
1106	pgoff_t pgoff;
1107	unsigned long address;
1108
1109	/ Common case, plus ->pgoff is invalid for KSM /
1110	if (pvmw->nr_pages == `1`)
1111	return pvmw->address + PAGE_SIZE;
1112
1113	pgoff = pvmw->pgoff + pvmw->nr_pages;
1114	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
1115	/ Check for address beyond vma (or wrapped through 0?) /
1116	if (address < vma->vm_start \|\| address > vma->vm_end)
1117	address = vma->vm_end;
1118	return address;
1119	}
1120
1121	static inline struct file maybe_unlock_mmap_for_io(struct* vm_fault *vmf,
1122	struct file *fpin)
1123	{
1124	int flags = vmf->flags;
1125
1126	if (fpin)
1127	return fpin;
1128
1129	/*
1130	* FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
1131	* anything, so we only pin the file and drop the mmap_lock if only
1132	* FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt.
1133	*/
1134	if (fault_flag_allow_retry_first(flags) &&
1135	!(flags & FAULT_FLAG_RETRY_NOWAIT)) {
1136	fpin = get_file(f: vmf->vma->vm_file);
1137	release_fault_lock(vmf);
1138	}
1139	return fpin;
1140	}
1141	#else /* !CONFIG_MMU */
/* No-op fallbacks for builds without an MMU. */
static inline void unmap_mapping_folio(struct folio *folio)
{
}

static inline void mlock_new_folio(struct folio *folio)
{
}

static inline bool need_mlock_drain(int cpu)
{
	return false;
}

static inline void mlock_drain_local(void)
{
}

static inline void mlock_drain_remote(int cpu)
{
}

static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
{
}
1150	#endif /* !CONFIG_MMU */
1151
/* Memory initialisation debug and verification */
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
DECLARE_STATIC_KEY_TRUE(deferred_pages);

bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1158
/*
 * Takes a page frame number and a node id; the definition is not visible in
 * this header. NOTE(review): presumably initialises the struct page for @pfn
 * on @nid -- confirm against the definition.
 */
void init_deferred_page(unsigned long pfn, int nid);
1160
/* Levels accepted by mminit_dprintk(), compared against mminit_loglevel. */
enum mminit_level {
	MMINIT_WARNING = 0,
	MMINIT_VERIFY  = 1,
	MMINIT_TRACE   = 2,
};
1166
1167	#ifdef CONFIG_DEBUG_MEMORY_INIT
1168
1169	extern int mminit_loglevel;
1170
1171	#define mminit_dprintk(level, prefix, fmt, arg...) \
1172	do { \
1173	if (level < mminit_loglevel) { \
1174	if (level <= MMINIT_WARNING) \
1175	pr_warn("mminit::" prefix " " fmt, ##arg); \
1176	else \
1177	printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
1178	} \
1179	} while (0)
1180
1181	extern void mminit_verify_pageflags_layout(void);
1182	extern void mminit_verify_zonelist(void);
1183	#else
1184
1185	static inline void mminit_dprintk(enum mminit_level level,
1186	const char prefix, const* char *fmt, ...)
1187	{
1188	}
1189
1190	static inline void mminit_verify_pageflags_layout(void)
1191	{
1192	}
1193
1194	static inline void mminit_verify_zonelist(void)
1195	{
1196	}
1197	#endif /* CONFIG_DEBUG_MEMORY_INIT */
1198
/* Result codes returned by node_reclaim(). */
#define NODE_RECLAIM_NOSCAN	(-2)
#define NODE_RECLAIM_FULL	(-1)
#define NODE_RECLAIM_SOME	(0)
#define NODE_RECLAIM_SUCCESS	(1)
1203
1204	#ifdef CONFIG_NUMA
1205	extern int node_reclaim_mode;
1206
1207	extern int node_reclaim(struct pglist_data , gfp_t, unsigned* int);
1208	extern int find_next_best_node(int node, nodemask_t *used_node_mask);
1209	#else
1210	#define node_reclaim_mode 0
1211
1212	static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
1213	unsigned int order)
1214	{
1215	return NODE_RECLAIM_NOSCAN;
1216	}
1217	static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
1218	{
1219	return NUMA_NO_NODE;
1220	}
1221	#endif
1222
1223	static inline bool node_reclaim_enabled(void)
1224	{
1225	/ Is any node_reclaim_mode bit set? /
1226	return node_reclaim_mode & (RECLAIM_ZONE\|RECLAIM_WRITE\|RECLAIM_UNMAP);
1227	}
1228
1229	/*
1230	* mm/memory-failure.c
1231	*/
#ifdef CONFIG_MEMORY_FAILURE
int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill);
void shake_folio(struct folio *folio);
typedef int hwpoison_filter_func_t(struct page *p);
void hwpoison_filter_register(hwpoison_filter_func_t *filter);
void hwpoison_filter_unregister(void);

#define MAGIC_HWPOISON	0x48575053U /* HWPS */
void SetPageHWPoisonTakenOff(struct page *page);
void ClearPageHWPoisonTakenOff(struct page *page);
bool take_page_off_buddy(struct page *page);
bool put_page_back_buddy(struct page *page);
struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
void add_to_kill_ksm(struct task_struct *tsk, const struct page *p,
		     struct vm_area_struct *vma, struct list_head *to_kill,
		     unsigned long ksm_addr);
unsigned long page_mapped_in_vma(const struct page *page,
		struct vm_area_struct *vma);

#else
/* Memory-failure handling compiled out: unmapping always fails with -EBUSY. */
static inline int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
	return -EBUSY;
}
#endif
1257
1258	extern unsigned long __must_check vm_mmap_pgoff(struct file , unsigned* long,
1259	unsigned long, unsigned long,
1260	unsigned long, unsigned long);
1261
void set_pageblock_order(void);

/* Reclaim helpers operating on a caller-supplied list of folios. */
unsigned long reclaim_pages(struct list_head *folio_list);
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
					   struct list_head *folio_list);
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
#define ALLOC_WMARK_MIN		WMARK_MIN
#define ALLOC_WMARK_LOW		WMARK_LOW
#define ALLOC_WMARK_HIGH	WMARK_HIGH
#define ALLOC_NO_WATERMARKS	0x04 /* don't check watermarks at all */

/* Mask to get the watermark bits */
#define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS-1)

/*
 * Only MMU archs have async oom victim reclaim - aka oom_reaper so we
 * cannot assume a reduced access to memory reserves is sufficient for
 * !MMU
 */
#ifdef CONFIG_MMU
#define ALLOC_OOM		0x08
#else
#define ALLOC_OOM		ALLOC_NO_WATERMARKS
#endif

#define ALLOC_NON_BLOCK		 0x10 /* Caller cannot block. Allow access
				       * to 25% of the min watermark or
				       * 62.5% if __GFP_HIGH is set.
				       */
#define ALLOC_MIN_RESERVE	 0x20 /* __GFP_HIGH set. Allow access to 50%
				       * of the min watermark.
				       */
#define ALLOC_CPUSET		 0x40 /* check for correct cpuset */
#define ALLOC_CMA		 0x80 /* allow allocations from CMA areas */
#ifdef CONFIG_ZONE_DMA32
#define ALLOC_NOFRAGMENT	0x100 /* avoid mixing pageblock types */
#else
#define ALLOC_NOFRAGMENT	  0x0
#endif
#define ALLOC_HIGHATOMIC	0x200 /* Allows access to MIGRATE_HIGHATOMIC */
#define ALLOC_TRYLOCK		0x400 /* Only use spin_trylock in allocation path */
#define ALLOC_KSWAPD		0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */

/* Flags that allow allocations below the min watermark. */
#define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM)
1306
enum ttu_flags;
struct tlbflush_unmap_batch;


/*
 * Workqueue reserved for MM-internal work items that depend neither on
 * allocations nor on locks which might themselves depend on allocations.
 */
extern struct workqueue_struct *mm_percpu_wq;
1316
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
void try_to_unmap_flush(void);
void try_to_unmap_flush_dirty(void);
void flush_tlb_batched_pending(struct mm_struct *mm);
#else
/* No batched unmap TLB flushing on this arch: nothing to flush. */
static inline void try_to_unmap_flush(void) { }
static inline void try_to_unmap_flush_dirty(void) { }
static inline void flush_tlb_batched_pending(struct mm_struct *mm) { }
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
1332
1333	extern const struct trace_print_flags pageflag_names[];
1334	extern const struct trace_print_flags vmaflag_names[];
1335	extern const struct trace_print_flags gfpflag_names[];
1336
1337	void setup_zone_pageset(struct zone *zone);
1338
1339	struct migration_target_control {
1340	int nid; / preferred node id /
1341	nodemask_t *nmask;
1342	gfp_t gfp_mask;
1343	enum migrate_reason reason;
1344	};
1345
1346	/*
1347	* mm/filemap.c
1348	*/
1349	size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
1350	struct folio *folio, loff_t fpos, size_t size);
1351
1352	/*
1353	* mm/vmalloc.c
1354	*/
1355	#ifdef CONFIG_MMU
1356	void __init vmalloc_init(void);
1357	int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
1358	pgprot_t prot, struct page *pages, unsigned* int page_shift);
1359	unsigned int get_vm_area_page_order(struct vm_struct *vm);
1360	#else
1361	static inline void vmalloc_init(void)
1362	{
1363	}
1364
1365	static inline
1366	int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
1367	pgprot_t prot, struct page *pages, unsigned* int page_shift)
1368	{
1369	return -EINVAL;
1370	}
1371	#endif
1372
1373	int __must_check __vmap_pages_range_noflush(unsigned long addr,
1374	unsigned long end, pgprot_t prot,
1375	struct page *pages, unsigned* int page_shift);
1376
1377	void vunmap_range_noflush(unsigned long start, unsigned long end);
1378
1379	void __vunmap_range_noflush(unsigned long start, unsigned long end);
1380
int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
		      unsigned long addr, int *flags, bool writable,
		      int *last_cpupid);

void free_zone_device_folio(struct folio *folio);
int migrate_device_coherent_folio(struct folio *folio);
1387
1388	struct vm_struct __get_vm_area_node(unsigned* long size,
1389	unsigned long align, unsigned long shift,
1390	unsigned long vm_flags, unsigned long start,
1391	unsigned long end, int node, gfp_t gfp_mask,
1392	const void *caller);
1393
1394	/*
1395	* mm/gup.c
1396	*/
1397	int __must_check try_grab_folio(struct folio folio, int* refs,
1398	unsigned int flags);
1399
1400	/*
1401	* mm/huge_memory.c
1402	*/
1403	void touch_pud(struct vm_area_struct vma, unsigned* long addr,
1404	pud_t *pud, bool write);
1405	void touch_pmd(struct vm_area_struct vma, unsigned* long addr,
1406	pmd_t *pmd, bool write);
1407
1408	/*
1409	* Parses a string with mem suffixes into its order. Useful to parse kernel
1410	* parameters.
1411	*/
/*
 * Returns the order for the size spelled out in @size_str, or -EINVAL when
 * the parsed size is not a power of two or the resulting order's bit is not
 * set in @valid_orders.
 */
static inline int get_order_from_str(const char *size_str,
				     unsigned long valid_orders)
{
	unsigned long size;
	char *endptr;
	int order;

	size = memparse(size_str, &endptr);

	if (!is_power_of_2(size))
		return -EINVAL;
	order = get_order(size);
	if (BIT(order) & ~valid_orders)
		return -EINVAL;

	return order;
}
1429
enum {
	/* mark page accessed */
	FOLL_TOUCH = 1 << 16,
	/* a retry, previous pass started an IO */
	FOLL_TRIED = 1 << 17,
	/* we are working on non-current tsk/mm */
	FOLL_REMOTE = 1 << 18,
	/* pages must be released via unpin_user_page */
	FOLL_PIN = 1 << 19,
	/* gup_fast: prevent fall-back to slow gup */
	FOLL_FAST_ONLY = 1 << 20,
	/* allow unlocking the mmap lock */
	FOLL_UNLOCKABLE = 1 << 21,
	/* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */
	FOLL_MADV_POPULATE = 1 << 22,
};

#define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \
			    FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \
			    FOLL_MADV_POPULATE)
1450
1451	/*
1452	* Indicates for which pages that are write-protected in the page table,
1453	* whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the
1454	* GUP pin will remain consistent with the pages mapped into the page tables
1455	* of the MM.
1456	*
1457	* Temporary unmapping of PageAnonExclusive() pages or clearing of
1458	* PageAnonExclusive() has to protect against concurrent GUP:
1459	* * Ordinary GUP: Using the PT lock
1460	* * GUP-fast and fork(): mm->write_protect_seq
1461	* * GUP-fast and KSM or temporary unmapping (swap, migration): see
1462	* folio_try_share_anon_rmap_*()
1463	*
1464	* Must be called with the (sub)page that's actually referenced via the
1465	* page table entry, which might not necessarily be the head page for a
1466	* PTE-mapped THP.
1467	*
1468	* If the vma is NULL, we're coming from the GUP-fast path and might have
1469	* to fallback to the slow path just to lookup the vma.
1470	*/
1471	static inline bool gup_must_unshare(struct vm_area_struct *vma,
1472	unsigned int flags, struct page *page)
1473	{
1474	/*
1475	* FOLL_WRITE is implicitly handled correctly as the page table entry
1476	* has to be writable -- and if it references (part of) an anonymous
1477	* folio, that part is required to be marked exclusive.
1478	*/
1479	if ((flags & (FOLL_WRITE \| FOLL_PIN)) != FOLL_PIN)
1480	return false;
1481	/*
1482	* Note: PageAnon(page) is stable until the page is actually getting
1483	* freed.
1484	*/
1485	if (!PageAnon(page)) {
1486	/*
1487	* We only care about R/O long-term pining: R/O short-term
1488	* pinning does not have the semantics to observe successive
1489	* changes through the process page tables.
1490	*/
1491	if (!(flags & FOLL_LONGTERM))
1492	return false;
1493
1494	/ We really need the vma ... /
1495	if (!vma)
1496	return true;
1497
1498	/*
1499	* ... because we only care about writable private ("COW")
1500	* mappings where we have to break COW early.
1501	*/
1502	return is_cow_mapping(flags: vma->vm_flags);
1503	}
1504
1505	/ Paired with a memory barrier in folio_try_share_anon_rmap_(). /*
1506	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
1507	smp_rmb();
1508
1509	/*
1510	* Note that KSM pages cannot be exclusive, and consequently,
1511	* cannot get pinned.
1512	*/
1513	return !PageAnonExclusive(page);
1514	}
1515
/* Mirrored-memory state and memblock helpers. */
extern bool mirrored_kernelcore;

bool memblock_has_mirror(void);
void memblock_free_all(void);
1519
1520	static __always_inline void vma_set_range(struct vm_area_struct *vma,
1521	unsigned long start, unsigned long end,
1522	pgoff_t pgoff)
1523	{
1524	vma->vm_start = start;
1525	vma->vm_end = end;
1526	vma->vm_pgoff = pgoff;
1527	}
1528
1529	static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
1530	{
1531	/*
1532	* NOTE: we must check this before VM_SOFTDIRTY on soft-dirty
1533	* enablements, because when without soft-dirty being compiled in,
1534	* VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY)
1535	* will be constantly true.
1536	*/
1537	if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
1538	return false;
1539
1540	/*
1541	* Soft-dirty is kind of special: its tracking is enabled when the
1542	* vma flags not set.
1543	*/
1544	return !(vma->vm_flags & VM_SOFTDIRTY);
1545	}
1546
1547	static inline bool pmd_needs_soft_dirty_wp(struct vm_area_struct *vma, pmd_t pmd)
1548	{
1549	return vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd);
1550	}
1551
1552	static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte)
1553	{
1554	return vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte);
1555	}
1556
1557	void __meminit __init_single_page(struct page page, unsigned* long pfn,
1558	unsigned long zone, int nid);
1559	void __meminit __init_page_from_nid(unsigned long pfn, int nid);
1560
1561	/ shrinker related functions /
1562	unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
1563	int priority);
1564
#ifdef CONFIG_SHRINKER_DEBUG
/*
 * Format the shrinker's name from @fmt/@ap into shrinker->name.
 * Returns 0 on success or -ENOMEM if the name could not be allocated.
 */
static inline __printf(2, 0) int shrinker_debugfs_name_alloc(
			struct shrinker *shrinker, const char *fmt, va_list ap)
{
	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);

	return shrinker->name ? 0 : -ENOMEM;
}

/* Release the name and clear the pointer so it cannot be freed twice. */
static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
{
	kfree_const(shrinker->name);
	shrinker->name = NULL;
}

extern int shrinker_debugfs_add(struct shrinker *shrinker);
extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
					      int *debugfs_id);
extern void shrinker_debugfs_remove(struct dentry *debugfs_entry,
				    int debugfs_id);
#else /* CONFIG_SHRINKER_DEBUG */
static inline int shrinker_debugfs_add(struct shrinker *shrinker)
{
	return 0;
}
static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker,
					      const char *fmt, va_list ap)
{
	return 0;
}
static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
{
}
/* No debugfs entry exists: report id -1 and no dentry. */
static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
						     int *debugfs_id)
{
	*debugfs_id = -1;
	return NULL;
}
static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
					   int debugfs_id)
{
}
#endif /* CONFIG_SHRINKER_DEBUG */
1609
/* Only track the nodes of mappings with shadow entries */
void workingset_update_node(struct xa_node *node);
extern struct list_lru shadow_nodes;
#define mapping_set_update(xas, mapping) do {			\
	if (!dax_mapping(mapping) && !shmem_mapping(mapping)) {	\
		xas_set_update(xas, workingset_update_node);	\
		xas_set_lru(xas, &shadow_nodes);		\
	}							\
} while (0)
1619
/* mremap.c */
unsigned long move_page_tables(struct pagetable_move_control *pmc);
1622
#ifdef CONFIG_UNACCEPTED_MEMORY
void accept_page(struct page *page);
#else /* CONFIG_UNACCEPTED_MEMORY */
/* Nothing to accept when unaccepted-memory support is compiled out. */
static inline void accept_page(struct page *page) { }
#endif /* CONFIG_UNACCEPTED_MEMORY */
1630
1631	/ pagewalk.c /
1632	int walk_page_range_mm(struct mm_struct mm, unsigned* long start,
1633	unsigned long end, const struct mm_walk_ops *ops,
1634	void *private);
1635	int walk_page_range_debug(struct mm_struct mm, unsigned* long start,
1636	unsigned long end, const struct mm_walk_ops *ops,
1637	pgd_t pgd, void* *private);
1638
1639	/ pt_reclaim.c /
1640	bool try_get_and_clear_pmd(struct mm_struct mm, pmd_t pmd, pmd_t *pmdval);
1641	void free_pte(struct mm_struct mm, unsigned* long addr, struct mmu_gather *tlb,
1642	pmd_t pmdval);
1643	void try_to_free_pte(struct mm_struct mm, pmd_t pmd, unsigned long addr,
1644	struct mmu_gather *tlb);
1645
#ifdef CONFIG_PT_RECLAIM
bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
			   struct zap_details *details);
#else
/* Page-table reclaim compiled out: never enabled for any range. */
static inline bool reclaim_pt_is_enabled(unsigned long start,
					 unsigned long end,
					 struct zap_details *details)
{
	return false;
}
#endif /* CONFIG_PT_RECLAIM */
1656
void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm);
int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm);
1659
1660	#endif /* __MM_INTERNAL_H */
1661

Browse the source code of Linux/mm/internal.h