util.c source code [Linux/mm/util.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	#include <linux/mm.h>
3	#include <linux/slab.h>
4	#include <linux/string.h>
5	#include <linux/compiler.h>
6	#include <linux/export.h>
7	#include <linux/err.h>
8	#include <linux/sched.h>
9	#include <linux/sched/mm.h>
10	#include <linux/sched/signal.h>
11	#include <linux/sched/task_stack.h>
12	#include <linux/security.h>
13	#include <linux/swap.h>
14	#include <linux/swapops.h>
15	#include <linux/sysctl.h>
16	#include <linux/mman.h>
17	#include <linux/hugetlb.h>
18	#include <linux/vmalloc.h>
19	#include <linux/userfaultfd_k.h>
20	#include <linux/elf.h>
21	#include <linux/elf-randomize.h>
22	#include <linux/personality.h>
23	#include <linux/random.h>
24	#include <linux/processor.h>
25	#include <linux/sizes.h>
26	#include <linux/compat.h>
27	#include <linux/fsnotify.h>
28	#include <linux/page_idle.h>
29
30	#include <linux/uaccess.h>
31
32	#include <kunit/visibility.h>
33
34	#include "internal.h"
35	#include "swap.h"
36
/**
 * kfree_const - conditionally free memory
 * @x: pointer to the memory
 *
 * Function calls kfree only if @x is not in .rodata section.
 */
void kfree_const(const void *x)
{
	/* Pointers that is_kernel_rodata() recognises are left alone. */
	if (!is_kernel_rodata((unsigned long)x))
		kfree(x);
}
EXPORT_SYMBOL(kfree_const);
49
50	/**
51	* __kmemdup_nul - Create a NUL-terminated string from @s, which might be unterminated.
52	* @s: The data to copy
53	* @len: The size of the data, not including the NUL terminator
54	* @gfp: the GFP mask used in the kmalloc() call when allocating memory
55	*
56	* Return: newly allocated copy of @s with NUL-termination or %NULL in
57	* case of error
58	*/
59	static __always_inline char __kmemdup_nul(const* char *s, size_t len, gfp_t gfp)
60	{
61	char *buf;
62
63	/ '+1' for the NUL terminator /
64	buf = kmalloc_track_caller(len + `1`, gfp);
65	if (!buf)
66	return NULL;
67
68	memcpy(to: buf, from: s, len);
69	/ Ensure the buf is always NUL-terminated, regardless of @s. /
70	buf[len] = `'\0'`;
71	return buf;
72	}
73
74	/**
75	* kstrdup - allocate space for and copy an existing string
76	* @s: the string to duplicate
77	* @gfp: the GFP mask used in the kmalloc() call when allocating memory
78	*
79	* Return: newly allocated copy of @s or %NULL in case of error
80	*/
81	noinline
82	char kstrdup(const* char *s, gfp_t gfp)
83	{
84	return s ? __kmemdup_nul(s, len: strlen(s), gfp) : NULL;
85	}
86	EXPORT_SYMBOL(kstrdup);
87
88	/**
89	* kstrdup_const - conditionally duplicate an existing const string
90	* @s: the string to duplicate
91	* @gfp: the GFP mask used in the kmalloc() call when allocating memory
92	*
93	* Note: Strings allocated by kstrdup_const should be freed by kfree_const and
94	* must not be passed to krealloc().
95	*
96	* Return: source string if it is in .rodata section otherwise
97	* fallback to kstrdup.
98	*/
99	const char kstrdup_const(const* char *s, gfp_t gfp)
100	{
101	if (is_kernel_rodata(addr: (unsigned long)s))
102	return s;
103
104	return kstrdup(s, gfp);
105	}
106	EXPORT_SYMBOL(kstrdup_const);
107
108	/**
109	* kstrndup - allocate space for and copy an existing string
110	* @s: the string to duplicate
111	* @max: read at most @max chars from @s
112	* @gfp: the GFP mask used in the kmalloc() call when allocating memory
113	*
114	* Note: Use kmemdup_nul() instead if the size is known exactly.
115	*
116	* Return: newly allocated copy of @s or %NULL in case of error
117	*/
118	char kstrndup(const* char *s, size_t max, gfp_t gfp)
119	{
120	return s ? __kmemdup_nul(s, len: strnlen(s, max), gfp) : NULL;
121	}
122	EXPORT_SYMBOL(kstrndup);
123
124	/**
125	* kmemdup - duplicate region of memory
126	*
127	* @src: memory region to duplicate
128	* @len: memory region length
129	* @gfp: GFP mask to use
130	*
131	* Return: newly allocated copy of @src or %NULL in case of error,
132	* result is physically contiguous. Use kfree() to free.
133	*/
134	void kmemdup_noprof(const* void *src, size_t len, gfp_t gfp)
135	{
136	void *p;
137
138	p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_);
139	if (p)
140	memcpy(to: p, from: src, len);
141	return p;
142	}
143	EXPORT_SYMBOL(kmemdup_noprof);
144
145	/**
146	* kmemdup_array - duplicate a given array.
147	*
148	* @src: array to duplicate.
149	* @count: number of elements to duplicate from array.
150	* @element_size: size of each element of array.
151	* @gfp: GFP mask to use.
152	*
153	* Return: duplicated array of @src or %NULL in case of error,
154	* result is physically contiguous. Use kfree() to free.
155	*/
156	void kmemdup_array(const* void *src, size_t count, size_t element_size, gfp_t gfp)
157	{
158	return kmemdup(src, size_mul(element_size, count), gfp);
159	}
160	EXPORT_SYMBOL(kmemdup_array);
161
162	/**
163	* kvmemdup - duplicate region of memory
164	*
165	* @src: memory region to duplicate
166	* @len: memory region length
167	* @gfp: GFP mask to use
168	*
169	* Return: newly allocated copy of @src or %NULL in case of error,
170	* result may be not physically contiguous. Use kvfree() to free.
171	*/
172	void kvmemdup(const* void *src, size_t len, gfp_t gfp)
173	{
174	void *p;
175
176	p = kvmalloc(len, gfp);
177	if (p)
178	memcpy(to: p, from: src, len);
179	return p;
180	}
181	EXPORT_SYMBOL(kvmemdup);
182
183	/**
184	* kmemdup_nul - Create a NUL-terminated string from unterminated data
185	* @s: The data to stringify
186	* @len: The size of the data
187	* @gfp: the GFP mask used in the kmalloc() call when allocating memory
188	*
189	* Return: newly allocated copy of @s with NUL-termination or %NULL in
190	* case of error
191	*/
192	char kmemdup_nul(const* char *s, size_t len, gfp_t gfp)
193	{
194	return s ? __kmemdup_nul(s, len, gfp) : NULL;
195	}
196	EXPORT_SYMBOL(kmemdup_nul);
197
198	static kmem_buckets *user_buckets __ro_after_init;
199
200	static int __init init_user_buckets(void)
201	{
202	user_buckets = kmem_buckets_create(name: "memdup_user", flags: `0`, useroffset: `0`, INT_MAX, NULL);
203
204	return `0`;
205	}
206	subsys_initcall(init_user_buckets);
207
208	/**
209	* memdup_user - duplicate memory region from user space
210	*
211	* @src: source address in user space
212	* @len: number of bytes to copy
213	*
214	* Return: an ERR_PTR() on failure. Result is physically
215	* contiguous, to be freed by kfree().
216	*/
217	void memdup_user(const* void __user *src, size_t len)
218	{
219	void *p;
220
221	p = kmem_buckets_alloc_track_caller(user_buckets, len, GFP_USER \| __GFP_NOWARN);
222	if (!p)
223	return ERR_PTR(error: -ENOMEM);
224
225	if (copy_from_user(to: p, from: src, n: len)) {
226	kfree(objp: p);
227	return ERR_PTR(error: -EFAULT);
228	}
229
230	return p;
231	}
232	EXPORT_SYMBOL(memdup_user);
233
234	/**
235	* vmemdup_user - duplicate memory region from user space
236	*
237	* @src: source address in user space
238	* @len: number of bytes to copy
239	*
240	* Return: an ERR_PTR() on failure. Result may be not
241	* physically contiguous. Use kvfree() to free.
242	*/
243	void vmemdup_user(const* void __user *src, size_t len)
244	{
245	void *p;
246
247	p = kmem_buckets_valloc(user_buckets, len, GFP_USER);
248	if (!p)
249	return ERR_PTR(error: -ENOMEM);
250
251	if (copy_from_user(to: p, from: src, n: len)) {
252	kvfree(addr: p);
253	return ERR_PTR(error: -EFAULT);
254	}
255
256	return p;
257	}
258	EXPORT_SYMBOL(vmemdup_user);
259
260	/**
261	* strndup_user - duplicate an existing string from user space
262	* @s: The string to duplicate
263	* @n: Maximum number of bytes to copy, including the trailing NUL.
264	*
265	* Return: newly allocated copy of @s or an ERR_PTR() in case of error
266	*/
267	char strndup_user(const* char __user s, long* n)
268	{
269	char *p;
270	long length;
271
272	length = strnlen_user(str: s, n);
273
274	if (!length)
275	return ERR_PTR(error: -EFAULT);
276
277	if (length > n)
278	return ERR_PTR(error: -EINVAL);
279
280	p = memdup_user(s, length);
281
282	if (IS_ERR(ptr: p))
283	return p;
284
285	p[length - `1`] = `'\0'`;
286
287	return p;
288	}
289	EXPORT_SYMBOL(strndup_user);
290
291	/**
292	* memdup_user_nul - duplicate memory region from user space and NUL-terminate
293	*
294	* @src: source address in user space
295	* @len: number of bytes to copy
296	*
297	* Return: an ERR_PTR() on failure.
298	*/
299	void memdup_user_nul(const* void __user *src, size_t len)
300	{
301	char *p;
302
303	p = kmem_buckets_alloc_track_caller(user_buckets, len + `1`, GFP_USER \| __GFP_NOWARN);
304	if (!p)
305	return ERR_PTR(error: -ENOMEM);
306
307	if (copy_from_user(to: p, from: src, n: len)) {
308	kfree(objp: p);
309	return ERR_PTR(error: -EFAULT);
310	}
311	p[len] = `'\0'`;
312
313	return p;
314	}
315	EXPORT_SYMBOL(memdup_user_nul);
316
317	/ Check if the vma is being used as a stack by this task /
318	int vma_is_stack_for_current(const struct vm_area_struct *vma)
319	{
320	struct task_struct * __maybe_unused t = current;
321
322	return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
323	}
324
325	/*
326	* Change backing file, only valid to use during initial VMA setup.
327	*/
328	void vma_set_file(struct vm_area_struct vma, struct* file *file)
329	{
330	/ Changing an anonymous vma with this is illegal /
331	get_file(f: file);
332	swap(vma->vm_file, file);
333	fput(file);
334	}
335	EXPORT_SYMBOL(vma_set_file);
336
337	#ifndef STACK_RND_MASK
338	#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
339	#endif
340
341	unsigned long randomize_stack_top(unsigned long stack_top)
342	{
343	unsigned long random_variable = `0`;
344
345	if (current->flags & PF_RANDOMIZE) {
346	random_variable = get_random_long();
347	random_variable &= STACK_RND_MASK;
348	random_variable <<= PAGE_SHIFT;
349	}
350	#ifdef CONFIG_STACK_GROWSUP
351	return PAGE_ALIGN(stack_top) + random_variable;
352	#else
353	return PAGE_ALIGN(stack_top) - random_variable;
354	#endif
355	}
356
357	/**
358	* randomize_page - Generate a random, page aligned address
359	* @start: The smallest acceptable address the caller will take.
360	* @range: The size of the area, starting at @start, within which the
361	* random address must fall.
362	*
363	* If @start + @range would overflow, @range is capped.
364	*
365	* NOTE: Historical use of randomize_range, which this replaces, presumed that
366	* @start was already page aligned. We now align it regardless.
367	*
368	* Return: A page aligned address within [start, start + range). On error,
369	* @start is returned.
370	*/
371	unsigned long randomize_page(unsigned long start, unsigned long range)
372	{
373	if (!PAGE_ALIGNED(start)) {
374	range -= PAGE_ALIGN(start) - start;
375	start = PAGE_ALIGN(start);
376	}
377
378	if (start > ULONG_MAX - range)
379	range = ULONG_MAX - start;
380
381	range >>= PAGE_SHIFT;
382
383	if (range == `0`)
384	return start;
385
386	return start + (get_random_long() % range << PAGE_SHIFT);
387	}
388
389	#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
390	unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
391	{
392	/ Is the current task 32bit ? /
393	if (!IS_ENABLED(CONFIG_64BIT) \|\| is_compat_task())
394	return randomize_page(mm->brk, SZ_32M);
395
396	return randomize_page(mm->brk, SZ_1G);
397	}
398
399	unsigned long arch_mmap_rnd(void)
400	{
401	unsigned long rnd;
402
403	#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
404	if (is_compat_task())
405	rnd = get_random_long() & ((`1UL` << mmap_rnd_compat_bits) - `1`);
406	else
407	#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
408	rnd = get_random_long() & ((`1UL` << mmap_rnd_bits) - `1`);
409
410	return rnd << PAGE_SHIFT;
411	}
412
413	static int mmap_is_legacy(const struct rlimit *rlim_stack)
414	{
415	if (current->personality & ADDR_COMPAT_LAYOUT)
416	return `1`;
417
418	/ On parisc the stack always grows up - so a unlimited stack should*
419	* not be an indicator to use the legacy memory layout. */
420	if (rlim_stack->rlim_cur == RLIM_INFINITY &&
421	!IS_ENABLED(CONFIG_STACK_GROWSUP))
422	return `1`;
423
424	return sysctl_legacy_va_layout;
425	}
426
427	/*
428	* Leave enough space between the mmap area and the stack to honour ulimit in
429	* the face of randomisation.
430	*/
431	#define MIN_GAP (SZ_128M)
432	#define MAX_GAP (STACK_TOP / 6 * 5)
433
434	static unsigned long mmap_base(const unsigned long rnd, const struct rlimit *rlim_stack)
435	{
436	#ifdef CONFIG_STACK_GROWSUP
437	/*
438	* For an upwards growing stack the calculation is much simpler.
439	* Memory for the maximum stack size is reserved at the top of the
440	* task. mmap_base starts directly below the stack and grows
441	* downwards.
442	*/
443	return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd);
444	#else
445	unsigned long gap = rlim_stack->rlim_cur;
446	unsigned long pad = stack_guard_gap;
447
448	/ Account for stack randomization if necessary /
449	if (current->flags & PF_RANDOMIZE)
450	pad += (STACK_RND_MASK << PAGE_SHIFT);
451
452	/ Values close to RLIM_INFINITY can overflow. /
453	if (gap + pad > gap)
454	gap += pad;
455
456	if (gap < MIN_GAP && MIN_GAP < MAX_GAP)
457	gap = MIN_GAP;
458	else if (gap > MAX_GAP)
459	gap = MAX_GAP;
460
461	return PAGE_ALIGN(STACK_TOP - gap - rnd);
462	#endif
463	}
464
465	void arch_pick_mmap_layout(struct mm_struct mm, const* struct rlimit *rlim_stack)
466	{
467	unsigned long random_factor = `0UL`;
468
469	if (current->flags & PF_RANDOMIZE)
470	random_factor = arch_mmap_rnd();
471
472	if (mmap_is_legacy(rlim_stack)) {
473	mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
474	mm_flags_clear(MMF_TOPDOWN, mm);
475	} else {
476	mm->mmap_base = mmap_base(random_factor, rlim_stack);
477	mm_flags_set(MMF_TOPDOWN, mm);
478	}
479	}
480	#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
481	void arch_pick_mmap_layout(struct mm_struct mm, const* struct rlimit *rlim_stack)
482	{
483	mm->mmap_base = TASK_UNMAPPED_BASE;
484	mm_flags_clear(MMF_TOPDOWN, mm);
485	}
486	#endif
487	#ifdef CONFIG_MMU
488	EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout);
489	#endif
490
491	/**
492	* __account_locked_vm - account locked pages to an mm's locked_vm
493	* @mm: mm to account against
494	* @pages: number of pages to account
495	* @inc: %true if @pages should be considered positive, %false if not
496	* @task: task used to check RLIMIT_MEMLOCK
497	* @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
498	*
499	* Assumes @task and @mm are valid (i.e. at least one reference on each), and
500	* that mmap_lock is held as writer.
501	*
502	* Return:
503	* * 0 on success
504	* * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
505	*/
506	int __account_locked_vm(struct mm_struct mm, unsigned* long pages, bool inc,
507	const struct task_struct *task, bool bypass_rlim)
508	{
509	unsigned long locked_vm, limit;
510	int ret = `0`;
511
512	mmap_assert_write_locked(mm);
513
514	locked_vm = mm->locked_vm;
515	if (inc) {
516	if (!bypass_rlim) {
517	limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
518	if (locked_vm + pages > limit)
519	ret = -ENOMEM;
520	}
521	if (!ret)
522	mm->locked_vm = locked_vm + pages;
523	} else {
524	WARN_ON_ONCE(pages > locked_vm);
525	mm->locked_vm = locked_vm - pages;
526	}
527
528	pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
529	(void *)_RET_IP_, (inc) ? `'+'` : `'-'`, pages << PAGE_SHIFT,
530	locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
531	ret ? " - exceeded" : "");
532
533	return ret;
534	}
535	EXPORT_SYMBOL_GPL(__account_locked_vm);
536
537	/**
538	* account_locked_vm - account locked pages to an mm's locked_vm
539	* @mm: mm to account against, may be NULL
540	* @pages: number of pages to account
541	* @inc: %true if @pages should be considered positive, %false if not
542	*
543	* Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
544	*
545	* Return:
546	* * 0 on success, or if mm is NULL
547	* * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
548	*/
549	int account_locked_vm(struct mm_struct mm, unsigned* long pages, bool inc)
550	{
551	int ret;
552
553	if (pages == `0` \|\| !mm)
554	return `0`;
555
556	mmap_write_lock(mm);
557	ret = __account_locked_vm(mm, pages, inc, current,
558	capable(CAP_IPC_LOCK));
559	mmap_write_unlock(mm);
560
561	return ret;
562	}
563	EXPORT_SYMBOL_GPL(account_locked_vm);
564
565	unsigned long vm_mmap_pgoff(struct file file, unsigned* long addr,
566	unsigned long len, unsigned long prot,
567	unsigned long flag, unsigned long pgoff)
568	{
569	loff_t off = (loff_t)pgoff << PAGE_SHIFT;
570	unsigned long ret;
571	struct mm_struct *mm = current->mm;
572	unsigned long populate;
573	LIST_HEAD(uf);
574
575	ret = security_mmap_file(file, prot, flags: flag);
576	if (!ret)
577	ret = fsnotify_mmap_perm(file, prot, off, len);
578	if (!ret) {
579	if (mmap_write_lock_killable(mm))
580	return -EINTR;
581	ret = do_mmap(file, addr, len, prot, flags: flag, vm_flags: `0`, pgoff, populate: &populate,
582	uf: &uf);
583	mmap_write_unlock(mm);
584	userfaultfd_unmap_complete(mm, uf: &uf);
585	if (populate)
586	mm_populate(addr: ret, len: populate);
587	}
588	return ret;
589	}
590
591	/*
592	* Perform a userland memory mapping into the current process address space. See
593	* the comment for do_mmap() for more details on this operation in general.
594	*
595	* This differs from do_mmap() in that:
596	*
597	* a. An offset parameter is provided rather than pgoff, which is both checked
598	* for overflow and page alignment.
599	* b. mmap locking is performed on the caller's behalf.
600	* c. Userfaultfd unmap events and memory population are handled.
601	*
602	* This means that this function performs essentially the same work as if
603	* userland were invoking mmap (2).
604	*
605	* Returns either an error, or the address at which the requested mapping has
606	* been performed.
607	*/
608	unsigned long vm_mmap(struct file file, unsigned* long addr,
609	unsigned long len, unsigned long prot,
610	unsigned long flag, unsigned long offset)
611	{
612	if (unlikely(offset + PAGE_ALIGN(len) < offset))
613	return -EINVAL;
614	if (unlikely(offset_in_page(offset)))
615	return -EINVAL;
616
617	return vm_mmap_pgoff(file, addr, len, prot, flag, pgoff: offset >> PAGE_SHIFT);
618	}
619	EXPORT_SYMBOL(vm_mmap);
620
621	/**
622	* __vmalloc_array - allocate memory for a virtually contiguous array.
623	* @n: number of elements.
624	* @size: element size.
625	* @flags: the type of memory to allocate (see kmalloc).
626	*/
627	void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
628	{
629	size_t bytes;
630
631	if (unlikely(check_mul_overflow(n, size, &bytes)))
632	return NULL;
633	return __vmalloc_noprof(size: bytes, gfp_mask: flags);
634	}
635	EXPORT_SYMBOL(__vmalloc_array_noprof);
636
637	/**
638	* vmalloc_array - allocate memory for a virtually contiguous array.
639	* @n: number of elements.
640	* @size: element size.
641	*/
642	void *vmalloc_array_noprof(size_t n, size_t size)
643	{
644	return __vmalloc_array_noprof(n, size, GFP_KERNEL);
645	}
646	EXPORT_SYMBOL(vmalloc_array_noprof);
647
648	/**
649	* __vcalloc - allocate and zero memory for a virtually contiguous array.
650	* @n: number of elements.
651	* @size: element size.
652	* @flags: the type of memory to allocate (see kmalloc).
653	*/
654	void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags)
655	{
656	return __vmalloc_array_noprof(n, size, flags \| __GFP_ZERO);
657	}
658	EXPORT_SYMBOL(__vcalloc_noprof);
659
660	/**
661	* vcalloc - allocate and zero memory for a virtually contiguous array.
662	* @n: number of elements.
663	* @size: element size.
664	*/
665	void *vcalloc_noprof(size_t n, size_t size)
666	{
667	return __vmalloc_array_noprof(n, size, GFP_KERNEL \| __GFP_ZERO);
668	}
669	EXPORT_SYMBOL(vcalloc_noprof);
670
671	struct anon_vma folio_anon_vma(const* struct folio *folio)
672	{
673	unsigned long mapping = (unsigned long)folio->mapping;
674
675	if ((mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
676	return NULL;
677	return (void *)(mapping - FOLIO_MAPPING_ANON);
678	}
679
680	/**
681	* folio_mapping - Find the mapping where this folio is stored.
682	* @folio: The folio.
683	*
684	* For folios which are in the page cache, return the mapping that this
685	* page belongs to. Folios in the swap cache return the swap mapping
686	* this page is stored in (which is different from the mapping for the
687	* swap file or swap device where the data is stored).
688	*
689	* You can call this for folios which aren't in the swap cache or page
690	* cache and it will return NULL.
691	*/
692	struct address_space folio_mapping(const* struct folio *folio)
693	{
694	struct address_space *mapping;
695
696	/ This happens if someone calls flush_dcache_page on slab page /
697	if (unlikely(folio_test_slab(folio)))
698	return NULL;
699
700	if (unlikely(folio_test_swapcache(folio)))
701	return swap_address_space(entry: folio->swap);
702
703	mapping = folio->mapping;
704	if ((unsigned long)mapping & FOLIO_MAPPING_FLAGS)
705	return NULL;
706
707	return mapping;
708	}
709	EXPORT_SYMBOL(folio_mapping);
710
/**
 * folio_copy - Copy the contents of one folio to another.
 * @dst: Folio to copy to.
 * @src: Folio to copy from.
 *
 * The bytes in the folio represented by @src are copied to @dst.
 * Assumes the caller has validated that @dst is at least as large as @src.
 * Can be called in atomic context for order-0 folios, but if the folio is
 * larger, it may sleep.
 */
void folio_copy(struct folio *dst, struct folio *src)
{
	long i = 0;
	long nr = folio_nr_pages(src);

	for (;;) {
		copy_highpage(folio_page(dst, i), folio_page(src, i));
		/* Exit before cond_resched() once the last page is copied. */
		if (++i == nr)
			break;
		cond_resched();
	}
}
EXPORT_SYMBOL(folio_copy);
734
735	int folio_mc_copy(struct folio dst, struct* folio *src)
736	{
737	long nr = folio_nr_pages(folio: src);
738	long i = `0`;
739
740	for (;;) {
741	if (copy_mc_highpage(folio_page(dst, i), folio_page(src, i)))
742	return -EHWPOISON;
743	if (++i == nr)
744	break;
745	cond_resched();
746	}
747
748	return `0`;
749	}
750	EXPORT_SYMBOL(folio_mc_copy);
751
752	int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
753	static int sysctl_overcommit_ratio __read_mostly = `50`;
754	static unsigned long sysctl_overcommit_kbytes __read_mostly;
755	int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
756	unsigned long sysctl_user_reserve_kbytes __read_mostly = `1UL` << `17`; / 128MB /
757	unsigned long sysctl_admin_reserve_kbytes __read_mostly = `1UL` << `13`; / 8MB /
758
759	#ifdef CONFIG_SYSCTL
760
761	static int overcommit_ratio_handler(const struct ctl_table table, int* write,
762	void buffer, size_t lenp, loff_t *ppos)
763	{
764	int ret;
765
766	ret = proc_dointvec(table, write, buffer, lenp, ppos);
767	if (ret == `0` && write)
768	sysctl_overcommit_kbytes = `0`;
769	return ret;
770	}
771
772	static void sync_overcommit_as(struct work_struct *dummy)
773	{
774	percpu_counter_sync(fbc: &vm_committed_as);
775	}
776
777	static int overcommit_policy_handler(const struct ctl_table table, int* write,
778	void buffer, size_t lenp, loff_t *ppos)
779	{
780	struct ctl_table t;
781	int new_policy = -`1`;
782	int ret;
783
784	/*
785	* The deviation of sync_overcommit_as could be big with loose policy
786	* like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
787	* strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
788	* with the strict "NEVER", and to avoid possible race condition (even
789	* though user usually won't too frequently do the switching to policy
790	* OVERCOMMIT_NEVER), the switch is done in the following order:
791	* 1. changing the batch
792	* 2. sync percpu count on each CPU
793	* 3. switch the policy
794	*/
795	if (write) {
796	t = *table;
797	t.data = &new_policy;
798	ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
799	if (ret \|\| new_policy == -`1`)
800	return ret;
801
802	mm_compute_batch(overcommit_policy: new_policy);
803	if (new_policy == OVERCOMMIT_NEVER)
804	schedule_on_each_cpu(func: sync_overcommit_as);
805	sysctl_overcommit_memory = new_policy;
806	} else {
807	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
808	}
809
810	return ret;
811	}
812
813	static int overcommit_kbytes_handler(const struct ctl_table table, int* write,
814	void buffer, size_t lenp, loff_t *ppos)
815	{
816	int ret;
817
818	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
819	if (ret == `0` && write)
820	sysctl_overcommit_ratio = `0`;
821	return ret;
822	}
823
824	static const struct ctl_table util_sysctl_table[] = {
825	{
826	.procname = "overcommit_memory",
827	.data = &sysctl_overcommit_memory,
828	.maxlen = sizeof(sysctl_overcommit_memory),
829	.mode = `0644`,
830	.proc_handler = overcommit_policy_handler,
831	.extra1 = SYSCTL_ZERO,
832	.extra2 = SYSCTL_TWO,
833	},
834	{
835	.procname = "overcommit_ratio",
836	.data = &sysctl_overcommit_ratio,
837	.maxlen = sizeof(sysctl_overcommit_ratio),
838	.mode = `0644`,
839	.proc_handler = overcommit_ratio_handler,
840	},
841	{
842	.procname = "overcommit_kbytes",
843	.data = &sysctl_overcommit_kbytes,
844	.maxlen = sizeof(sysctl_overcommit_kbytes),
845	.mode = `0644`,
846	.proc_handler = overcommit_kbytes_handler,
847	},
848	{
849	.procname = "user_reserve_kbytes",
850	.data = &sysctl_user_reserve_kbytes,
851	.maxlen = sizeof(sysctl_user_reserve_kbytes),
852	.mode = `0644`,
853	.proc_handler = proc_doulongvec_minmax,
854	},
855	{
856	.procname = "admin_reserve_kbytes",
857	.data = &sysctl_admin_reserve_kbytes,
858	.maxlen = sizeof(sysctl_admin_reserve_kbytes),
859	.mode = `0644`,
860	.proc_handler = proc_doulongvec_minmax,
861	},
862	};
863
864	static int __init init_vm_util_sysctls(void)
865	{
866	register_sysctl_init("vm", util_sysctl_table);
867	return `0`;
868	}
869	subsys_initcall(init_vm_util_sysctls);
870	#endif /* CONFIG_SYSCTL */
871
872	/*
873	* Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
874	*/
875	unsigned long vm_commit_limit(void)
876	{
877	unsigned long allowed;
878
879	if (sysctl_overcommit_kbytes)
880	allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - `10`);
881	else
882	allowed = ((totalram_pages() - hugetlb_total_pages())
883	* sysctl_overcommit_ratio / `100`);
884	allowed += total_swap_pages;
885
886	return allowed;
887	}
888
889	/*
890	* Make sure vm_committed_as in one cacheline and not cacheline shared with
891	* other variables. It can be updated by several CPUs frequently.
892	*/
893	struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
894
895	/*
896	* The global memory commitment made in the system can be a metric
897	* that can be used to drive ballooning decisions when Linux is hosted
898	* as a guest. On Hyper-V, the host implements a policy engine for dynamically
899	* balancing memory across competing virtual machines that are hosted.
900	* Several metrics drive this policy engine including the guest reported
901	* memory commitment.
902	*
903	* The time cost of this is very low for small platforms, and for big
904	* platform like a 2S/36C/72T Skylake server, in worst case where
905	* vm_committed_as's spinlock is under severe contention, the time cost
906	* could be about 30~40 microseconds.
907	*/
908	unsigned long vm_memory_committed(void)
909	{
910	return percpu_counter_sum_positive(fbc: &vm_committed_as);
911	}
912	EXPORT_SYMBOL_GPL(vm_memory_committed);
913
914	/*
915	* Check that a process has enough memory to allocate a new virtual
916	* mapping. 0 means there is enough memory for the allocation to
917	* succeed and -ENOMEM implies there is not.
918	*
919	* We currently support three overcommit policies, which are set via the
920	* vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst
921	*
922	* Strict overcommit modes added 2002 Feb 26 by Alan Cox.
923	* Additional code 2002 Jul 20 by Robert Love.
924	*
925	* cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
926	*
927	* Note this is a helper function intended to be used by LSMs which
928	* wish to use this logic.
929	*/
930	int __vm_enough_memory(const struct mm_struct mm, long* pages, int cap_sys_admin)
931	{
932	long allowed;
933	unsigned long bytes_failed;
934
935	vm_acct_memory(pages);
936
937	/*
938	* Sometimes we want to use more memory than we have
939	*/
940	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
941	return `0`;
942
943	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
944	if (pages > totalram_pages() + total_swap_pages)
945	goto error;
946	return `0`;
947	}
948
949	allowed = vm_commit_limit();
950	/*
951	* Reserve some for root
952	*/
953	if (!cap_sys_admin)
954	allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - `10`);
955
956	/*
957	* Don't let a single process grow so big a user can't recover
958	*/
959	if (mm) {
960	long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - `10`);
961
962	allowed -= min_t(long, mm->total_vm / `32`, reserve);
963	}
964
965	if (percpu_counter_read_positive(fbc: &vm_committed_as) < allowed)
966	return `0`;
967	error:
968	bytes_failed = pages << PAGE_SHIFT;
969	pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n",
970	__func__, current->pid, current->comm, bytes_failed);
971	vm_unacct_memory(pages);
972
973	return -ENOMEM;
974	}
975
976	/**
977	* get_cmdline() - copy the cmdline value to a buffer.
978	* @task: the task whose cmdline value to copy.
979	* @buffer: the buffer to copy to.
980	* @buflen: the length of the buffer. Larger cmdline values are truncated
981	* to this length.
982	*
983	* Return: the size of the cmdline field copied. Note that the copy does
984	* not guarantee an ending NULL byte.
985	*/
986	int get_cmdline(struct task_struct task, char* buffer, int* buflen)
987	{
988	int res = `0`;
989	unsigned int len;
990	struct mm_struct *mm = get_task_mm(task);
991	unsigned long arg_start, arg_end, env_start, env_end;
992	if (!mm)
993	goto out;
994	if (!mm->arg_end)
995	goto out_mm; / Shh! No looking before we're done /
996
997	spin_lock(lock: &mm->arg_lock);
998	arg_start = mm->arg_start;
999	arg_end = mm->arg_end;
1000	env_start = mm->env_start;
1001	env_end = mm->env_end;
1002	spin_unlock(lock: &mm->arg_lock);
1003
1004	len = arg_end - arg_start;
1005
1006	if (len > buflen)
1007	len = buflen;
1008
1009	res = access_process_vm(tsk: task, addr: arg_start, buf: buffer, len, gup_flags: FOLL_FORCE);
1010
1011	/*
1012	* If the nul at the end of args has been overwritten, then
1013	* assume application is using setproctitle(3).
1014	*/
1015	if (res > `0` && buffer[res-`1`] != `'\0'` && len < buflen) {
1016	len = strnlen(buffer, res);
1017	if (len < res) {
1018	res = len;
1019	} else {
1020	len = env_end - env_start;
1021	if (len > buflen - res)
1022	len = buflen - res;
1023	res += access_process_vm(tsk: task, addr: env_start,
1024	buf: buffer+res, len,
1025	gup_flags: FOLL_FORCE);
1026	res = strnlen(buffer, res);
1027	}
1028	}
1029	out_mm:
1030	mmput(mm);
1031	out:
1032	return res;
1033	}
1034
1035	int __weak memcmp_pages(struct page page1, struct* page *page2)
1036	{
1037	char addr1, addr2;
1038	int ret;
1039
1040	addr1 = kmap_local_page(page: page1);
1041	addr2 = kmap_local_page(page: page2);
1042	ret = memcmp(addr1, addr2, PAGE_SIZE);
1043	kunmap_local(addr2);
1044	kunmap_local(addr1);
1045	return ret;
1046	}
1047
#ifdef CONFIG_PRINTK
/**
 * mem_dump_obj - Print available provenance information
 * @object: object for which to find provenance information.
 *
 * This function uses pr_cont(), so that the caller is expected to have
 * printed out whatever preamble is appropriate.  The provenance information
 * depends on the type of object and on how much debugging is enabled.
 * For example, for a slab-cache object, the slab name is printed, and,
 * if available, the return address and stack trace from the allocation
 * and last free path of that object.
 */
void mem_dump_obj(void *object)
{
	const char *type;

	/* A true return from either helper ends the dump here. */
	if (kmem_dump_obj(object))
		return;

	if (vmalloc_dump_obj(object))
		return;

	/* Otherwise only a coarse classification of the address is printed. */
	if (is_vmalloc_addr(object))
		type = "vmalloc memory";
	else if (virt_addr_valid(object))
		type = "non-slab/vmalloc memory";
	else if (object == NULL)
		type = "NULL pointer";
	else if (object == ZERO_SIZE_PTR)
		type = "zero-size pointer";
	else
		type = "non-paged memory";

	pr_cont(" %s\n", type);
}
EXPORT_SYMBOL_GPL(mem_dump_obj);
#endif
1085
/*
 * A driver might set a page logically offline -- PageOffline() -- and
 * turn the page inaccessible in the hypervisor; after that, access to page
 * content can be fatal.
 *
 * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
 * pages after checking PageOffline(); however, these PFN walkers can race
 * with drivers that set PageOffline().
 *
 * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
 * synchronize with such drivers, achieving that a page cannot be set
 * PageOffline() while frozen.
 *
 * page_offline_begin()/page_offline_end() is used by drivers that care about
 * such races when setting a page PageOffline().
 */
static DECLARE_RWSEM(page_offline_rwsem);
1103
1104	void page_offline_freeze(void)
1105	{
1106	down_read(sem: &page_offline_rwsem);
1107	}
1108
1109	void page_offline_thaw(void)
1110	{
1111	up_read(sem: &page_offline_rwsem);
1112	}
1113
1114	void page_offline_begin(void)
1115	{
1116	down_write(sem: &page_offline_rwsem);
1117	}
1118	EXPORT_SYMBOL(page_offline_begin);
1119
1120	void page_offline_end(void)
1121	{
1122	up_write(sem: &page_offline_rwsem);
1123	}
1124	EXPORT_SYMBOL(page_offline_end);
1125
#ifndef flush_dcache_folio
/*
 * flush_dcache_folio - generic fallback used when no flush_dcache_folio
 * macro is defined.
 * @folio: the folio whose pages are flushed
 *
 * Calls flush_dcache_page() once for every page of @folio, in index order.
 */
void flush_dcache_folio(struct folio *folio)
{
	long i, nr = folio_nr_pages(folio);

	for (i = 0; i < nr; i++)
		flush_dcache_page(folio_page(folio, i));
}
EXPORT_SYMBOL(flush_dcache_folio);
#endif
1136
1137	/**
1138	* __compat_vma_mmap_prepare() - See description for compat_vma_mmap_prepare()
1139	* for details. This is the same operation, only with a specific file operations
1140	* struct which may or may not be the same as vma->vm_file->f_op.
1141	* @f_op: The file operations whose .mmap_prepare() hook is specified.
1142	* @file: The file which backs or will back the mapping.
1143	* @vma: The VMA to apply the .mmap_prepare() hook to.
1144	* Returns: 0 on success or error.
1145	*/
1146	int __compat_vma_mmap_prepare(const struct file_operations *f_op,
1147	struct file file, struct* vm_area_struct *vma)
1148	{
1149	struct vm_area_desc desc = {
1150	.mm = vma->vm_mm,
1151	.file = file,
1152	.start = vma->vm_start,
1153	.end = vma->vm_end,
1154
1155	.pgoff = vma->vm_pgoff,
1156	.vm_file = vma->vm_file,
1157	.vm_flags = vma->vm_flags,
1158	.page_prot = vma->vm_page_prot,
1159	};
1160	int err;
1161
1162	err = f_op->mmap_prepare(&desc);
1163	if (err)
1164	return err;
1165	set_vma_from_desc(vma, desc: &desc);
1166
1167	return `0`;
1168	}
1169	EXPORT_SYMBOL(__compat_vma_mmap_prepare);
1170
1171	/**
1172	* compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an
1173	* existing VMA.
1174	* @file: The file which possesss an f_op->mmap_prepare() hook.
1175	* @vma: The VMA to apply the .mmap_prepare() hook to.
1176	*
1177	* Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain
1178	* stacked filesystems invoke a nested mmap hook of an underlying file.
1179	*
1180	* Until all filesystems are converted to use .mmap_prepare(), we must be
1181	* conservative and continue to invoke these stacked filesystems using the
1182	* deprecated .mmap() hook.
1183	*
1184	* However we have a problem if the underlying file system possesses an
1185	* .mmap_prepare() hook, as we are in a different context when we invoke the
1186	* .mmap() hook, already having a VMA to deal with.
1187	*
1188	* compat_vma_mmap_prepare() is a compatibility function that takes VMA state,
1189	* establishes a struct vm_area_desc descriptor, passes to the underlying
1190	* .mmap_prepare() hook and applies any changes performed by it.
1191	*
1192	* Once the conversion of filesystems is complete this function will no longer
1193	* be required and will be removed.
1194	*
1195	* Returns: 0 on success or error.
1196	*/
1197	int compat_vma_mmap_prepare(struct file file, struct* vm_area_struct *vma)
1198	{
1199	return __compat_vma_mmap_prepare(file->f_op, file, vma);
1200	}
1201	EXPORT_SYMBOL(compat_vma_mmap_prepare);
1202
1203	static void set_ps_flags(struct page_snapshot ps, const* struct folio *folio,
1204	const struct page *page)
1205	{
1206	/*
1207	* Only the first page of a high-order buddy page has PageBuddy() set.
1208	* So we have to check manually whether this page is part of a high-
1209	* order buddy page.
1210	*/
1211	if (PageBuddy(page))
1212	ps->flags \|= PAGE_SNAPSHOT_PG_BUDDY;
1213	else if (page_count(page) == `0` && is_free_buddy_page(page))
1214	ps->flags \|= PAGE_SNAPSHOT_PG_BUDDY;
1215
1216	if (folio_test_idle(folio))
1217	ps->flags \|= PAGE_SNAPSHOT_PG_IDLE;
1218	}
1219
1220	/**
1221	* snapshot_page() - Create a snapshot of a struct page
1222	* @ps: Pointer to a struct page_snapshot to store the page snapshot
1223	* @page: The page to snapshot
1224	*
1225	* Create a snapshot of the page and store both its struct page and struct
1226	* folio representations in @ps.
1227	*
1228	* A snapshot is marked as "faithful" if the compound state of @page was
1229	* stable and allowed safe reconstruction of the folio representation. In
1230	* rare cases where this is not possible (e.g. due to folio splitting),
1231	* snapshot_page() falls back to treating @page as a single page and the
1232	* snapshot is marked as "unfaithful". The snapshot_page_is_faithful()
1233	* helper can be used to check for this condition.
1234	*/
1235	void snapshot_page(struct page_snapshot ps, const* struct page *page)
1236	{
1237	unsigned long head, nr_pages = `1`;
1238	struct folio *foliop;
1239	int loops = `5`;
1240
1241	ps->pfn = page_to_pfn(page);
1242	ps->flags = PAGE_SNAPSHOT_FAITHFUL;
1243
1244	again:
1245	memset(s: &ps->folio_snapshot, c: `0`, n: sizeof(struct folio));
1246	memcpy(to: &ps->page_snapshot, from: page, len: sizeof(*page));
1247	head = ps->page_snapshot.compound_head;
1248	if ((head & `1`) == `0`) {
1249	ps->idx = `0`;
1250	foliop = (struct folio *)&ps->page_snapshot;
1251	if (!folio_test_large(folio: foliop)) {
1252	set_ps_flags(ps, page_folio(page), page);
1253	memcpy(to: &ps->folio_snapshot, from: foliop,
1254	len: sizeof(struct page));
1255	return;
1256	}
1257	foliop = (struct folio *)page;
1258	} else {
1259	foliop = (struct folio *)(head - `1`);
1260	ps->idx = folio_page_idx(folio: foliop, page);
1261	}
1262
1263	if (ps->idx < MAX_FOLIO_NR_PAGES) {
1264	memcpy(to: &ps->folio_snapshot, from: foliop, len: `2` * sizeof(struct page));
1265	nr_pages = folio_nr_pages(folio: &ps->folio_snapshot);
1266	if (nr_pages > `1`)
1267	memcpy(to: &ps->folio_snapshot.__page_2, from: &foliop->__page_2,
1268	len: sizeof(struct page));
1269	set_ps_flags(ps, folio: foliop, page);
1270	}
1271
1272	if (ps->idx > nr_pages) {
1273	if (loops-- > `0`)
1274	goto again;
1275	clear_compound_head(page: &ps->page_snapshot);
1276	foliop = (struct folio *)&ps->page_snapshot;
1277	memcpy(to: &ps->folio_snapshot, from: foliop, len: sizeof(struct page));
1278	ps->flags = `0`;
1279	ps->idx = `0`;
1280	}
1281	}
1282
#ifdef CONFIG_MMU
/**
 * folio_pte_batch - detect a PTE batch for a large folio
 * @folio: The large folio to detect a PTE batch for.
 * @ptep: Page table pointer for the first entry.
 * @pte: Page table entry for the first page.
 * @max_nr: The maximum number of table entries to consider.
 *
 * This is a simplified variant of folio_pte_batch_flags().
 *
 * Detect a PTE batch: consecutive (present) PTEs that map consecutive
 * pages of the same large folio in a single VMA and a single page table.
 *
 * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
 * the accessed bit, writable bit, dirty bit and soft-dirty bit.
 *
 * ptep must map any page of the folio. max_nr must be at least one and
 * must be limited by the caller so scanning cannot exceed a single VMA and
 * a single page table.
 *
 * Return: the number of table entries in the batch.
 */
unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
		unsigned int max_nr)
{
	/* No VMA (NULL) and no flags (0) are passed to the full variant. */
	return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0);
}
#endif /* CONFIG_MMU */
1311
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
/**
 * page_range_contiguous - test whether the page range is contiguous
 * @page: the start of the page range.
 * @nr_pages: the number of pages in the range.
 *
 * Test whether the page range is contiguous, such that they can be iterated
 * naively, corresponding to iterating a contiguous PFN range.
 *
 * This function should primarily only be used for debug checks, or when
 * working with page ranges that are not naturally contiguous (e.g., pages
 * within a folio are).
 *
 * Returns true if contiguous, otherwise false.
 */
bool page_range_contiguous(const struct page *page, unsigned long nr_pages)
{
	const unsigned long start_pfn = page_to_pfn(page);
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn;

	/*
	 * The memmap is allocated per memory section, so no need to check
	 * within the first section. However, we need to check each other
	 * spanned memory section once, making sure the first page in a
	 * section could similarly be reached by just iterating pages.
	 */
	for (pfn = ALIGN(start_pfn, PAGES_PER_SECTION);
	     pfn < end_pfn; pfn += PAGES_PER_SECTION)
		if (unlikely(page + (pfn - start_pfn) != pfn_to_page(pfn)))
			return false;
	return true;
}
EXPORT_SYMBOL(page_range_contiguous);
#endif
1347

Browse the source code of Linux/mm/util.c