exec.c source code [Linux/fs/exec.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/fs/exec.c
4	*
5	* Copyright (C) 1991, 1992 Linus Torvalds
6	*/
7
8	/*
9	* #!-checking implemented by tytso.
10	*/
11	/*
12	* Demand-loading implemented 01.12.91 - no need to read anything but
13	* the header into memory. The inode of the executable is put into
14	* "current->executable", and page faults do the actual loading. Clean.
15	*
16	* Once more I can proudly say that linux stood up to being changed: it
17	* was less than 2 hours work to get demand-loading completely implemented.
18	*
19	* Demand loading changed July 1993 by Eric Youngdale. Use mmap instead,
20	* current->executable is only used by the procfs. This allows a dispatch
21	* table to check for several different types of binary formats. We keep
22	* trying until we recognize the file or we run out of supported binary
23	* formats.
24	*/
25
26	#include <linux/kernel_read_file.h>
27	#include <linux/slab.h>
28	#include <linux/file.h>
29	#include <linux/fdtable.h>
30	#include <linux/mm.h>
31	#include <linux/stat.h>
32	#include <linux/fcntl.h>
33	#include <linux/swap.h>
34	#include <linux/string.h>
35	#include <linux/init.h>
36	#include <linux/sched/mm.h>
37	#include <linux/sched/coredump.h>
38	#include <linux/sched/signal.h>
39	#include <linux/sched/numa_balancing.h>
40	#include <linux/sched/task.h>
41	#include <linux/pagemap.h>
42	#include <linux/perf_event.h>
43	#include <linux/highmem.h>
44	#include <linux/spinlock.h>
45	#include <linux/key.h>
46	#include <linux/personality.h>
47	#include <linux/binfmts.h>
48	#include <linux/utsname.h>
49	#include <linux/pid_namespace.h>
50	#include <linux/module.h>
51	#include <linux/namei.h>
52	#include <linux/mount.h>
53	#include <linux/security.h>
54	#include <linux/syscalls.h>
55	#include <linux/tsacct_kern.h>
56	#include <linux/cn_proc.h>
57	#include <linux/audit.h>
58	#include <linux/kmod.h>
59	#include <linux/fsnotify.h>
60	#include <linux/fs_struct.h>
61	#include <linux/oom.h>
62	#include <linux/compat.h>
63	#include <linux/vmalloc.h>
64	#include <linux/io_uring.h>
65	#include <linux/syscall_user_dispatch.h>
66	#include <linux/coredump.h>
67	#include <linux/time_namespace.h>
68	#include <linux/user_events.h>
69	#include <linux/rseq.h>
70	#include <linux/ksm.h>
71
72	#include <linux/uaccess.h>
73	#include <asm/mmu_context.h>
74	#include <asm/tlb.h>
75
76	#include <trace/events/task.h>
77	#include "internal.h"
78
79	#include <trace/events/sched.h>
80
/* For vma exec functions. */
82	#include "../mm/internal.h"
83
/* Forward declaration; called from begin_new_exec() further down. */
static int bprm_creds_from_file(struct linux_binprm *bprm);

int suid_dumpable = 0;

/*
 * List of registered binary format handlers. All list modifications in
 * this file are done with binfmt_lock held for writing.
 */
static LIST_HEAD(formats);
static DEFINE_RWLOCK(binfmt_lock);
90
91	void __register_binfmt(struct linux_binfmt * fmt, int insert)
92	{
93	write_lock(&binfmt_lock);
94	insert ? list_add(new: &fmt->lh, head: &formats) :
95	list_add_tail(new: &fmt->lh, head: &formats);
96	write_unlock(&binfmt_lock);
97	}
98
99	EXPORT_SYMBOL(__register_binfmt);
100
101	void unregister_binfmt(struct linux_binfmt * fmt)
102	{
103	write_lock(&binfmt_lock);
104	list_del(entry: &fmt->lh);
105	write_unlock(&binfmt_lock);
106	}
107
108	EXPORT_SYMBOL(unregister_binfmt);
109
110	static inline void put_binfmt(struct linux_binfmt * fmt)
111	{
112	module_put(module: fmt->module);
113	}
114
115	bool path_noexec(const struct path *path)
116	{
117	/ If it's an anonymous inode make sure that we catch any shenanigans. /
118	VFS_WARN_ON_ONCE(IS_ANON_FILE(d_inode(path->dentry)) &&
119	!(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC));
120	return (path->mnt->mnt_flags & MNT_NOEXEC) \|\|
121	(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
122	}
123
124	#ifdef CONFIG_MMU
125	/*
126	* The nascent bprm->mm is not visible until exec_mmap() but it can
127	* use a lot of memory, account these pages in current->mm temporary
128	* for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
129	* change the counter back via acct_arg_size(0).
130	*/
131	static void acct_arg_size(struct linux_binprm bprm, unsigned* long pages)
132	{
133	struct mm_struct *mm = current->mm;
134	long diff = (long)(pages - bprm->vma_pages);
135
136	if (!mm \|\| !diff)
137	return;
138
139	bprm->vma_pages = pages;
140	add_mm_counter(mm, member: MM_ANONPAGES, value: diff);
141	}
142
143	static struct page get_arg_page(struct* linux_binprm bprm, unsigned* long pos,
144	int write)
145	{
146	struct page *page;
147	struct vm_area_struct *vma = bprm->vma;
148	struct mm_struct *mm = bprm->mm;
149	int ret;
150
151	/*
152	* Avoid relying on expanding the stack down in GUP (which
153	* does not work for STACK_GROWSUP anyway), and just do it
154	* ahead of time.
155	*/
156	if (!mmap_read_lock_maybe_expand(mm, vma, addr: pos, write))
157	return NULL;
158
159	/*
160	* We are doing an exec(). 'current' is the process
161	* doing the exec and 'mm' is the new process's mm.
162	*/
163	ret = get_user_pages_remote(mm, start: pos, nr_pages: `1`,
164	gup_flags: write ? FOLL_WRITE : `0`,
165	pages: &page, NULL);
166	mmap_read_unlock(mm);
167	if (ret <= `0`)
168	return NULL;
169
170	if (write)
171	acct_arg_size(bprm, pages: vma_pages(vma));
172
173	return page;
174	}
175
/*
 * Release a page handed out by get_arg_page(). In the CONFIG_MMU build
 * this is a plain put_page().
 */
static void put_arg_page(struct page *page)
{
	put_page(page);
}
180
/*
 * CONFIG_MMU variant: intentionally empty. The !CONFIG_MMU variant frees
 * the pages kept in bprm->page[]; this build has no such array to release.
 */
static void free_arg_pages(struct linux_binprm *bprm)
{
}
184
185	static void flush_arg_page(struct linux_binprm bprm, unsigned* long pos,
186	struct page *page)
187	{
188	flush_cache_page(vma: bprm->vma, vmaddr: pos, page_to_pfn(page));
189	}
190
191	static bool valid_arg_len(struct linux_binprm bprm, long* len)
192	{
193	return len <= MAX_ARG_STRLEN;
194	}
195
196	#else
197
/* !CONFIG_MMU variant: argument pages are not accounted, so do nothing. */
static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
{
}
201
202	static struct page get_arg_page(struct* linux_binprm bprm, unsigned* long pos,
203	int write)
204	{
205	struct page *page;
206
207	page = bprm->page[pos / PAGE_SIZE];
208	if (!page && write) {
209	page = alloc_page(GFP_HIGHUSER\|__GFP_ZERO);
210	if (!page)
211	return NULL;
212	bprm->page[pos / PAGE_SIZE] = page;
213	}
214
215	return page;
216	}
217
/*
 * !CONFIG_MMU variant: nothing to release here. Pages stay in bprm->page[]
 * and are freed later by free_arg_pages().
 */
static void put_arg_page(struct page *page)
{
}
221
222	static void free_arg_page(struct linux_binprm bprm, int* i)
223	{
224	if (bprm->page[i]) {
225	__free_page(bprm->page[i]);
226	bprm->page[i] = NULL;
227	}
228	}
229
230	static void free_arg_pages(struct linux_binprm *bprm)
231	{
232	int i;
233
234	for (i = `0`; i < MAX_ARG_PAGES; i++)
235	free_arg_page(bprm, i);
236	}
237
/* !CONFIG_MMU variant: no cache flush is performed for argument pages. */
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
		struct page *page)
{
}
242
243	static bool valid_arg_len(struct linux_binprm bprm, long* len)
244	{
245	return len <= bprm->p;
246	}
247
248	#endif /* CONFIG_MMU */
249
250	/*
251	* Create a new mm_struct and populate it with a temporary stack
252	* vm_area_struct. We don't have enough context at this point to set the stack
253	* flags, permissions, and offset, so we use temporary values. We'll update
254	* them later in setup_arg_pages().
255	*/
256	static int bprm_mm_init(struct linux_binprm *bprm)
257	{
258	int err;
259	struct mm_struct *mm = NULL;
260
261	bprm->mm = mm = mm_alloc();
262	err = -ENOMEM;
263	if (!mm)
264	goto err;
265
266	/ Save current stack limit for all calculations made during exec. /
267	task_lock(current->group_leader);
268	bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
269	task_unlock(current->group_leader);
270
271	#ifndef CONFIG_MMU
272	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
273	#else
274	err = create_init_stack_vma(mm: bprm->mm, vmap: &bprm->vma, top_mem_p: &bprm->p);
275	if (err)
276	goto err;
277	#endif
278
279	return `0`;
280
281	err:
282	if (mm) {
283	bprm->mm = NULL;
284	mmdrop(mm);
285	}
286
287	return err;
288	}
289
290	struct user_arg_ptr {
291	#ifdef CONFIG_COMPAT
292	bool is_compat;
293	#endif
294	union {
295	const char __user *const __user *native;
296	#ifdef CONFIG_COMPAT
297	const compat_uptr_t __user *compat;
298	#endif
299	} ptr;
300	};
301
302	static const char __user get_user_arg_ptr(struct* user_arg_ptr argv, int nr)
303	{
304	const char __user *native;
305
306	#ifdef CONFIG_COMPAT
307	if (unlikely(argv.is_compat)) {
308	compat_uptr_t compat;
309
310	if (get_user(compat, argv.ptr.compat + nr))
311	return ERR_PTR(error: -EFAULT);
312
313	return compat_ptr(uptr: compat);
314	}
315	#endif
316
317	if (get_user(native, argv.ptr.native + nr))
318	return ERR_PTR(error: -EFAULT);
319
320	return native;
321	}
322
323	/*
324	* count() counts the number of strings in array ARGV.
325	*/
326	static int count(struct user_arg_ptr argv, int max)
327	{
328	int i = `0`;
329
330	if (argv.ptr.native != NULL) {
331	for (;;) {
332	const char __user *p = get_user_arg_ptr(argv, nr: i);
333
334	if (!p)
335	break;
336
337	if (IS_ERR(ptr: p))
338	return -EFAULT;
339
340	if (i >= max)
341	return -E2BIG;
342	++i;
343
344	if (fatal_signal_pending(current))
345	return -ERESTARTNOHAND;
346	cond_resched();
347	}
348	}
349	return i;
350	}
351
352	static int count_strings_kernel(const char *const *argv)
353	{
354	int i;
355
356	if (!argv)
357	return `0`;
358
359	for (i = `0`; argv[i]; ++i) {
360	if (i >= MAX_ARG_STRINGS)
361	return -E2BIG;
362	if (fatal_signal_pending(current))
363	return -ERESTARTNOHAND;
364	cond_resched();
365	}
366	return i;
367	}
368
/*
 * Record how far the argument area may grow: with CONFIG_MMU, set
 * bprm->argmin to @limit bytes below the current bprm->p.
 *
 * Returns 0 on success, or -E2BIG if bprm->p is smaller than @limit (the
 * subtraction would wrap). Without CONFIG_MMU this is a no-op returning 0.
 */
static inline int bprm_set_stack_limit(struct linux_binprm *bprm,
				       unsigned long limit)
{
#ifdef CONFIG_MMU
	/* Avoid a pathological bprm->p. */
	if (bprm->p < limit)
		return -E2BIG;
	bprm->argmin = bprm->p - limit;
#endif
	return 0;
}
380	static inline bool bprm_hit_stack_limit(struct linux_binprm *bprm)
381	{
382	#ifdef CONFIG_MMU
383	return bprm->p < bprm->argmin;
384	#else
385	return false;
386	#endif
387	}
388
389	/*
390	* Calculate bprm->argmin from:
391	* - _STK_LIM
392	* - ARG_MAX
393	* - bprm->rlim_stack.rlim_cur
394	* - bprm->argc
395	* - bprm->envc
396	* - bprm->p
397	*/
398	static int bprm_stack_limits(struct linux_binprm *bprm)
399	{
400	unsigned long limit, ptr_size;
401
402	/*
403	* Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
404	* (whichever is smaller) for the argv+env strings.
405	* This ensures that:
406	* - the remaining binfmt code will not run out of stack space,
407	* - the program will have a reasonable amount of stack left
408	* to work from.
409	*/
410	limit = _STK_LIM / `4` * `3`;
411	limit = min(limit, bprm->rlim_stack.rlim_cur / `4`);
412	/*
413	* We've historically supported up to 32 pages (ARG_MAX)
414	* of argument strings even with small stacks
415	*/
416	limit = max_t(unsigned long, limit, ARG_MAX);
417	/ Reject totally pathological counts. /
418	if (bprm->argc < `0` \|\| bprm->envc < `0`)
419	return -E2BIG;
420	/*
421	* We must account for the size of all the argv and envp pointers to
422	* the argv and envp strings, since they will also take up space in
423	* the stack. They aren't stored until much later when we can't
424	* signal to the parent that the child has run out of stack space.
425	* Instead, calculate it here so it's possible to fail gracefully.
426	*
427	* In the case of argc = 0, make sure there is space for adding a
428	* empty string (which will bump argc to 1), to ensure confused
429	* userspace programs don't start processing from argv[1], thinking
430	* argc can never be 0, to keep them from walking envp by accident.
431	* See do_execveat_common().
432	*/
433	if (check_add_overflow(max(bprm->argc, `1`), bprm->envc, &ptr_size) \|\|
434	check_mul_overflow(ptr_size, sizeof(void *), &ptr_size))
435	return -E2BIG;
436	if (limit <= ptr_size)
437	return -E2BIG;
438	limit -= ptr_size;
439
440	return bprm_set_stack_limit(bprm, limit);
441	}
442
443	/*
444	* 'copy_strings()' copies argument/environment strings from the old
445	* processes's memory to the new process's stack. The call to get_user_pages()
446	* ensures the destination page is created and not swapped out.
447	*/
448	static int copy_strings(int argc, struct user_arg_ptr argv,
449	struct linux_binprm *bprm)
450	{
451	struct page *kmapped_page = NULL;
452	char *kaddr = NULL;
453	unsigned long kpos = `0`;
454	int ret;
455
456	while (argc-- > `0`) {
457	const char __user *str;
458	int len;
459	unsigned long pos;
460
461	ret = -EFAULT;
462	str = get_user_arg_ptr(argv, nr: argc);
463	if (IS_ERR(ptr: str))
464	goto out;
465
466	len = strnlen_user(str, MAX_ARG_STRLEN);
467	if (!len)
468	goto out;
469
470	ret = -E2BIG;
471	if (!valid_arg_len(bprm, len))
472	goto out;
473
474	/ We're going to work our way backwards. /
475	pos = bprm->p;
476	str += len;
477	bprm->p -= len;
478	if (bprm_hit_stack_limit(bprm))
479	goto out;
480
481	while (len > `0`) {
482	int offset, bytes_to_copy;
483
484	if (fatal_signal_pending(current)) {
485	ret = -ERESTARTNOHAND;
486	goto out;
487	}
488	cond_resched();
489
490	offset = pos % PAGE_SIZE;
491	if (offset == `0`)
492	offset = PAGE_SIZE;
493
494	bytes_to_copy = offset;
495	if (bytes_to_copy > len)
496	bytes_to_copy = len;
497
498	offset -= bytes_to_copy;
499	pos -= bytes_to_copy;
500	str -= bytes_to_copy;
501	len -= bytes_to_copy;
502
503	if (!kmapped_page \|\| kpos != (pos & PAGE_MASK)) {
504	struct page *page;
505
506	page = get_arg_page(bprm, pos, write: `1`);
507	if (!page) {
508	ret = -E2BIG;
509	goto out;
510	}
511
512	if (kmapped_page) {
513	flush_dcache_page(page: kmapped_page);
514	kunmap_local(kaddr);
515	put_arg_page(page: kmapped_page);
516	}
517	kmapped_page = page;
518	kaddr = kmap_local_page(page: kmapped_page);
519	kpos = pos & PAGE_MASK;
520	flush_arg_page(bprm, pos: kpos, page: kmapped_page);
521	}
522	if (copy_from_user(to: kaddr+offset, from: str, n: bytes_to_copy)) {
523	ret = -EFAULT;
524	goto out;
525	}
526	}
527	}
528	ret = `0`;
529	out:
530	if (kmapped_page) {
531	flush_dcache_page(page: kmapped_page);
532	kunmap_local(kaddr);
533	put_arg_page(page: kmapped_page);
534	}
535	return ret;
536	}
537
538	/*
539	* Copy and argument/environment string from the kernel to the processes stack.
540	*/
541	int copy_string_kernel(const char arg, struct* linux_binprm *bprm)
542	{
543	int len = strnlen(arg, MAX_ARG_STRLEN) + `1` / terminating NUL /;
544	unsigned long pos = bprm->p;
545
546	if (len == `0`)
547	return -EFAULT;
548	if (!valid_arg_len(bprm, len))
549	return -E2BIG;
550
551	/ We're going to work our way backwards. /
552	arg += len;
553	bprm->p -= len;
554	if (bprm_hit_stack_limit(bprm))
555	return -E2BIG;
556
557	while (len > `0`) {
558	unsigned int bytes_to_copy = min_t(unsigned int, len,
559	min_not_zero(offset_in_page(pos), PAGE_SIZE));
560	struct page *page;
561
562	pos -= bytes_to_copy;
563	arg -= bytes_to_copy;
564	len -= bytes_to_copy;
565
566	page = get_arg_page(bprm, pos, write: `1`);
567	if (!page)
568	return -E2BIG;
569	flush_arg_page(bprm, pos: pos & PAGE_MASK, page);
570	memcpy_to_page(page, offset_in_page(pos), from: arg, len: bytes_to_copy);
571	put_arg_page(page);
572	}
573
574	return `0`;
575	}
576	EXPORT_SYMBOL(copy_string_kernel);
577
578	static int copy_strings_kernel(int argc, const char *const *argv,
579	struct linux_binprm *bprm)
580	{
581	while (argc-- > `0`) {
582	int ret = copy_string_kernel(argv[argc], bprm);
583	if (ret < `0`)
584	return ret;
585	if (fatal_signal_pending(current))
586	return -ERESTARTNOHAND;
587	cond_resched();
588	}
589	return `0`;
590	}
591
592	#ifdef CONFIG_MMU
593
594	/*
595	* Finalizes the stack vm_area_struct. The flags and permissions are updated,
596	* the stack is optionally relocated, and some extra space is added.
597	*/
598	int setup_arg_pages(struct linux_binprm *bprm,
599	unsigned long stack_top,
600	int executable_stack)
601	{
602	int ret;
603	unsigned long stack_shift;
604	struct mm_struct *mm = current->mm;
605	struct vm_area_struct *vma = bprm->vma;
606	struct vm_area_struct *prev = NULL;
607	vm_flags_t vm_flags;
608	unsigned long stack_base;
609	unsigned long stack_size;
610	unsigned long stack_expand;
611	unsigned long rlim_stack;
612	struct mmu_gather tlb;
613	struct vma_iterator vmi;
614
615	#ifdef CONFIG_STACK_GROWSUP
616	/ Limit stack size /
617	stack_base = bprm->rlim_stack.rlim_max;
618
619	stack_base = calc_max_stack_size(stack_base);
620
621	/ Add space for stack randomization. /
622	if (current->flags & PF_RANDOMIZE)
623	stack_base += (STACK_RND_MASK << PAGE_SHIFT);
624
625	/ Make sure we didn't let the argument array grow too large. /
626	if (vma->vm_end - vma->vm_start > stack_base)
627	return -ENOMEM;
628
629	stack_base = PAGE_ALIGN(stack_top - stack_base);
630
631	stack_shift = vma->vm_start - stack_base;
632	mm->arg_start = bprm->p - stack_shift;
633	bprm->p = vma->vm_end - stack_shift;
634	#else
635	stack_top = arch_align_stack(sp: stack_top);
636	stack_top = PAGE_ALIGN(stack_top);
637
638	if (unlikely(stack_top < mmap_min_addr) \|\|
639	unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
640	return -ENOMEM;
641
642	stack_shift = vma->vm_end - stack_top;
643
644	bprm->p -= stack_shift;
645	mm->arg_start = bprm->p;
646	#endif
647
648	bprm->exec -= stack_shift;
649
650	if (mmap_write_lock_killable(mm))
651	return -EINTR;
652
653	vm_flags = VM_STACK_FLAGS;
654
655	/*
656	* Adjust stack execute permissions; explicitly enable for
657	* EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
658	* (arch default) otherwise.
659	*/
660	if (unlikely(executable_stack == EXSTACK_ENABLE_X))
661	vm_flags \|= VM_EXEC;
662	else if (executable_stack == EXSTACK_DISABLE_X)
663	vm_flags &= ~VM_EXEC;
664	vm_flags \|= mm->def_flags;
665	vm_flags \|= VM_STACK_INCOMPLETE_SETUP;
666
667	vma_iter_init(vmi: &vmi, mm, addr: vma->vm_start);
668
669	tlb_gather_mmu(tlb: &tlb, mm);
670	ret = mprotect_fixup(vmi: &vmi, tlb: &tlb, vma, pprev: &prev, start: vma->vm_start, end: vma->vm_end,
671	newflags: vm_flags);
672	tlb_finish_mmu(tlb: &tlb);
673
674	if (ret)
675	goto out_unlock;
676	BUG_ON(prev != vma);
677
678	if (unlikely(vm_flags & VM_EXEC)) {
679	pr_warn_once("process '%pD4' started with executable stack\n",
680	bprm->file);
681	}
682
683	/ Move stack pages down in memory. /
684	if (stack_shift) {
685	/*
686	* During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once
687	* the binfmt code determines where the new stack should reside, we shift it to
688	* its final location.
689	*/
690	ret = relocate_vma_down(vma, shift: stack_shift);
691	if (ret)
692	goto out_unlock;
693	}
694
695	/ mprotect_fixup is overkill to remove the temporary stack flags /
696	vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP);
697
698	stack_expand = `131072UL`; / randomly 324k (or 264k) pages /
699	stack_size = vma->vm_end - vma->vm_start;
700	/*
701	* Align this down to a page boundary as expand_stack
702	* will align it up.
703	*/
704	rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;
705
706	stack_expand = min(rlim_stack, stack_size + stack_expand);
707
708	#ifdef CONFIG_STACK_GROWSUP
709	stack_base = vma->vm_start + stack_expand;
710	#else
711	stack_base = vma->vm_end - stack_expand;
712	#endif
713	current->mm->start_stack = bprm->p;
714	ret = expand_stack_locked(vma, address: stack_base);
715	if (ret)
716	ret = -EFAULT;
717
718	out_unlock:
719	mmap_write_unlock(mm);
720	return ret;
721	}
722	EXPORT_SYMBOL(setup_arg_pages);
723
724	#else
725
726	/*
727	* Transfer the program arguments and environment from the holding pages
728	* onto the stack. The provided stack pointer is adjusted accordingly.
729	*/
730	int transfer_args_to_stack(struct linux_binprm *bprm,
731	unsigned long *sp_location)
732	{
733	unsigned long index, stop, sp;
734	int ret = `0`;
735
736	stop = bprm->p >> PAGE_SHIFT;
737	sp = *sp_location;
738
739	for (index = MAX_ARG_PAGES - `1`; index >= stop; index--) {
740	unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : `0`;
741	char *src = kmap_local_page(bprm->page[index]) + offset;
742	sp -= PAGE_SIZE - offset;
743	if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != `0`)
744	ret = -EFAULT;
745	kunmap_local(src);
746	if (ret)
747	goto out;
748	}
749
750	bprm->exec += sp_location - MAX_ARG_PAGES PAGE_SIZE;
751	*sp_location = sp;
752
753	out:
754	return ret;
755	}
756	EXPORT_SYMBOL(transfer_args_to_stack);
757
758	#endif /* CONFIG_MMU */
759
760	/*
761	* On success, caller must call do_close_execat() on the returned
762	* struct file to close it.
763	*/
764	static struct file do_open_execat(int* fd, struct filename name, int* flags)
765	{
766	int err;
767	struct file *file __free(fput) = NULL;
768	struct open_flags open_exec_flags = {
769	.open_flag = O_LARGEFILE \| O_RDONLY \| __FMODE_EXEC,
770	.acc_mode = MAY_EXEC,
771	.intent = LOOKUP_OPEN,
772	.lookup_flags = LOOKUP_FOLLOW,
773	};
774
775	if ((flags &
776	~(AT_SYMLINK_NOFOLLOW \| AT_EMPTY_PATH \| AT_EXECVE_CHECK)) != `0`)
777	return ERR_PTR(error: -EINVAL);
778	if (flags & AT_SYMLINK_NOFOLLOW)
779	open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
780	if (flags & AT_EMPTY_PATH)
781	open_exec_flags.lookup_flags \|= LOOKUP_EMPTY;
782
783	file = do_filp_open(dfd: fd, pathname: name, op: &open_exec_flags);
784	if (IS_ERR(ptr: file))
785	return file;
786
787	if (path_noexec(path: &file->f_path))
788	return ERR_PTR(error: -EACCES);
789
790	/*
791	* In the past the regular type check was here. It moved to may_open() in
792	* 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is
793	* an invariant that all non-regular files error out before we get here.
794	*/
795	if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)))
796	return ERR_PTR(error: -EACCES);
797
798	err = exe_file_deny_write_access(exe_file: file);
799	if (err)
800	return ERR_PTR(error: err);
801
802	return no_free_ptr(file);
803	}
804
805	/**
806	* open_exec - Open a path name for execution
807	*
808	* @name: path name to open with the intent of executing it.
809	*
810	* Returns ERR_PTR on failure or allocated struct file on success.
811	*
812	* As this is a wrapper for the internal do_open_execat(), callers
813	* must call exe_file_allow_write_access() before fput() on release. Also see
814	* do_close_execat().
815	*/
816	struct file open_exec(const* char *name)
817	{
818	struct filename *filename = getname_kernel(name);
819	struct file *f = ERR_CAST(ptr: filename);
820
821	if (!IS_ERR(ptr: filename)) {
822	f = do_open_execat(AT_FDCWD, name: filename, flags: `0`);
823	putname(name: filename);
824	}
825	return f;
826	}
827	EXPORT_SYMBOL(open_exec);
828
829	#if defined(CONFIG_BINFMT_FLAT) \|\| defined(CONFIG_BINFMT_ELF_FDPIC)
830	ssize_t read_code(struct file file, unsigned* long addr, loff_t pos, size_t len)
831	{
832	ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
833	if (res > `0`)
834	flush_icache_user_range(addr, addr + len);
835	return res;
836	}
837	EXPORT_SYMBOL(read_code);
838	#endif
839
840	/*
841	* Maps the mm_struct mm into the current task struct.
842	* On success, this function returns with exec_update_lock
843	* held for writing.
844	*/
845	static int exec_mmap(struct mm_struct *mm)
846	{
847	struct task_struct *tsk;
848	struct mm_struct old_mm, active_mm;
849	int ret;
850
851	/ Notify parent that we're no longer interested in the old VM /
852	tsk = current;
853	old_mm = current->mm;
854	exec_mm_release(tsk, old_mm);
855
856	ret = down_write_killable(sem: &tsk->signal->exec_update_lock);
857	if (ret)
858	return ret;
859
860	if (old_mm) {
861	/*
862	* If there is a pending fatal signal perhaps a signal
863	* whose default action is to create a coredump get
864	* out and die instead of going through with the exec.
865	*/
866	ret = mmap_read_lock_killable(mm: old_mm);
867	if (ret) {
868	up_write(sem: &tsk->signal->exec_update_lock);
869	return ret;
870	}
871	}
872
873	task_lock(p: tsk);
874	membarrier_exec_mmap(mm);
875
876	local_irq_disable();
877	active_mm = tsk->active_mm;
878	tsk->active_mm = mm;
879	tsk->mm = mm;
880	mm_init_cid(mm, p: tsk);
881	/*
882	* This prevents preemption while active_mm is being loaded and
883	* it and mm are being updated, which could cause problems for
884	* lazy tlb mm refcounting when these are updated by context
885	* switches. Not all architectures can handle irqs off over
886	* activate_mm yet.
887	*/
888	if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
889	local_irq_enable();
890	activate_mm(active_mm, mm);
891	if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
892	local_irq_enable();
893	lru_gen_add_mm(mm);
894	task_unlock(p: tsk);
895	lru_gen_use_mm(mm);
896	if (old_mm) {
897	mmap_read_unlock(mm: old_mm);
898	BUG_ON(active_mm != old_mm);
899	setmax_mm_hiwater_rss(maxrss: &tsk->signal->maxrss, mm: old_mm);
900	mm_update_next_owner(mm: old_mm);
901	mmput(old_mm);
902	return `0`;
903	}
904	mmdrop_lazy_tlb(mm: active_mm);
905	return `0`;
906	}
907
908	static int de_thread(struct task_struct *tsk)
909	{
910	struct signal_struct *sig = tsk->signal;
911	struct sighand_struct *oldsighand = tsk->sighand;
912	spinlock_t *lock = &oldsighand->siglock;
913
914	if (thread_group_empty(p: tsk))
915	goto no_thread_group;
916
917	/*
918	* Kill all other threads in the thread group.
919	*/
920	spin_lock_irq(lock);
921	if ((sig->flags & SIGNAL_GROUP_EXIT) \|\| sig->group_exec_task) {
922	/*
923	* Another group action in progress, just
924	* return so that the signal is processed.
925	*/
926	spin_unlock_irq(lock);
927	return -EAGAIN;
928	}
929
930	sig->group_exec_task = tsk;
931	sig->notify_count = zap_other_threads(p: tsk);
932	if (!thread_group_leader(p: tsk))
933	sig->notify_count--;
934
935	while (sig->notify_count) {
936	__set_current_state(TASK_KILLABLE);
937	spin_unlock_irq(lock);
938	schedule();
939	if (__fatal_signal_pending(p: tsk))
940	goto killed;
941	spin_lock_irq(lock);
942	}
943	spin_unlock_irq(lock);
944
945	/*
946	* At this point all other threads have exited, all we have to
947	* do is to wait for the thread group leader to become inactive,
948	* and to assume its PID:
949	*/
950	if (!thread_group_leader(p: tsk)) {
951	struct task_struct *leader = tsk->group_leader;
952
953	for (;;) {
954	cgroup_threadgroup_change_begin(tsk);
955	write_lock_irq(&tasklist_lock);
956	/*
957	* Do this under tasklist_lock to ensure that
958	* exit_notify() can't miss ->group_exec_task
959	*/
960	sig->notify_count = -`1`;
961	if (likely(leader->exit_state))
962	break;
963	__set_current_state(TASK_KILLABLE);
964	write_unlock_irq(&tasklist_lock);
965	cgroup_threadgroup_change_end(tsk);
966	schedule();
967	if (__fatal_signal_pending(p: tsk))
968	goto killed;
969	}
970
971	/*
972	* The only record we have of the real-time age of a
973	* process, regardless of execs it's done, is start_time.
974	* All the past CPU time is accumulated in signal_struct
975	* from sister threads now dead. But in this non-leader
976	* exec, nothing survives from the original leader thread,
977	* whose birth marks the true age of this process now.
978	* When we take on its identity by switching to its PID, we
979	* also take its birthdate (always earlier than our own).
980	*/
981	tsk->start_time = leader->start_time;
982	tsk->start_boottime = leader->start_boottime;
983
984	BUG_ON(!same_thread_group(leader, tsk));
985	/*
986	* An exec() starts a new thread group with the
987	* TGID of the previous thread group. Rehash the
988	* two threads with a switched PID, and release
989	* the former thread group leader:
990	*/
991
992	/ Become a process group leader with the old leader's pid.*
993	* The old leader becomes a thread of the this thread group.
994	*/
995	exchange_tids(task: tsk, old: leader);
996	transfer_pid(old: leader, new: tsk, PIDTYPE_TGID);
997	transfer_pid(old: leader, new: tsk, PIDTYPE_PGID);
998	transfer_pid(old: leader, new: tsk, PIDTYPE_SID);
999
1000	list_replace_rcu(old: &leader->tasks, new: &tsk->tasks);
1001	list_replace_init(old: &leader->sibling, new: &tsk->sibling);
1002
1003	tsk->group_leader = tsk;
1004	leader->group_leader = tsk;
1005
1006	tsk->exit_signal = SIGCHLD;
1007	leader->exit_signal = -`1`;
1008
1009	BUG_ON(leader->exit_state != EXIT_ZOMBIE);
1010	leader->exit_state = EXIT_DEAD;
1011	/*
1012	* We are going to release_task()->ptrace_unlink() silently,
1013	* the tracer can sleep in do_wait(). EXIT_DEAD guarantees
1014	* the tracer won't block again waiting for this thread.
1015	*/
1016	if (unlikely(leader->ptrace))
1017	__wake_up_parent(p: leader, parent: leader->parent);
1018	write_unlock_irq(&tasklist_lock);
1019	cgroup_threadgroup_change_end(tsk);
1020
1021	release_task(p: leader);
1022	}
1023
1024	sig->group_exec_task = NULL;
1025	sig->notify_count = `0`;
1026
1027	no_thread_group:
1028	/ we have changed execution domain /
1029	tsk->exit_signal = SIGCHLD;
1030
1031	BUG_ON(!thread_group_leader(tsk));
1032	return `0`;
1033
1034	killed:
1035	/ protects against exit_notify() and __exit_signal() /
1036	read_lock(&tasklist_lock);
1037	sig->group_exec_task = NULL;
1038	sig->notify_count = `0`;
1039	read_unlock(&tasklist_lock);
1040	return -EAGAIN;
1041	}
1042
1043
1044	/*
1045	* This function makes sure the current process has its own signal table,
1046	* so that flush_signal_handlers can later reset the handlers without
1047	* disturbing other processes. (Other processes might share the signal
1048	* table via the CLONE_SIGHAND option to clone().)
1049	*/
1050	static int unshare_sighand(struct task_struct *me)
1051	{
1052	struct sighand_struct *oldsighand = me->sighand;
1053
1054	if (refcount_read(r: &oldsighand->count) != `1`) {
1055	struct sighand_struct *newsighand;
1056	/*
1057	* This ->sighand is shared with the CLONE_SIGHAND
1058	* but not CLONE_THREAD task, switch to the new one.
1059	*/
1060	newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
1061	if (!newsighand)
1062	return -ENOMEM;
1063
1064	refcount_set(r: &newsighand->count, n: `1`);
1065
1066	write_lock_irq(&tasklist_lock);
1067	spin_lock(lock: &oldsighand->siglock);
1068	memcpy(to: newsighand->action, from: oldsighand->action,
1069	len: sizeof(newsighand->action));
1070	rcu_assign_pointer(me->sighand, newsighand);
1071	spin_unlock(lock: &oldsighand->siglock);
1072	write_unlock_irq(&tasklist_lock);
1073
1074	__cleanup_sighand(oldsighand);
1075	}
1076	return `0`;
1077	}
1078
1079	/*
1080	* This is unlocked -- the string will always be NUL-terminated, but
1081	* may show overlapping contents if racing concurrent reads.
1082	*/
1083	void __set_task_comm(struct task_struct tsk, const* char *buf, bool exec)
1084	{
1085	size_t len = min(strlen(buf), sizeof(tsk->comm) - `1`);
1086
1087	trace_task_rename(task: tsk, comm: buf);
1088	memcpy(to: tsk->comm, from: buf, len);
1089	memset(s: &tsk->comm[len], c: `0`, n: sizeof(tsk->comm) - len);
1090	perf_event_comm(tsk, exec);
1091	}
1092
1093	/*
1094	* Calling this is the point of no return. None of the failures will be
1095	* seen by userspace since either the process is already taking a fatal
1096	* signal (via de_thread() or coredump), or will have SEGV raised
1097	* (after exec_mmap()) by search_binary_handler (see below).
1098	*/
1099	int begin_new_exec(struct linux_binprm * bprm)
1100	{
1101	struct task_struct *me = current;
1102	int retval;
1103
1104	/ Once we are committed compute the creds /
1105	retval = bprm_creds_from_file(bprm);
1106	if (retval)
1107	return retval;
1108
1109	/*
1110	* This tracepoint marks the point before flushing the old exec where
1111	* the current task is still unchanged, but errors are fatal (point of
1112	* no return). The later "sched_process_exec" tracepoint is called after
1113	* the current task has successfully switched to the new exec.
1114	*/
1115	trace_sched_prepare_exec(current, bprm);
1116
1117	/*
1118	* Ensure all future errors are fatal.
1119	*/
1120	bprm->point_of_no_return = true;
1121
1122	/ Make this the only thread in the thread group /
1123	retval = de_thread(tsk: me);
1124	if (retval)
1125	goto out;
1126	/ see the comment in check_unsafe_exec() /
1127	current->fs->in_exec = `0`;
1128	/*
1129	* Cancel any io_uring activity across execve
1130	*/
1131	io_uring_task_cancel();
1132
1133	/ Ensure the files table is not shared. /
1134	retval = unshare_files();
1135	if (retval)
1136	goto out;
1137
1138	/*
1139	* Must be called _before_ exec_mmap() as bprm->mm is
1140	* not visible until then. Doing it here also ensures
1141	* we don't race against replace_mm_exe_file().
1142	*/
1143	retval = set_mm_exe_file(mm: bprm->mm, new_exe_file: bprm->file);
1144	if (retval)
1145	goto out;
1146
1147	/ If the binary is not readable then enforce mm->dumpable=0 /
1148	would_dump(bprm, bprm->file);
1149	if (bprm->have_execfd)
1150	would_dump(bprm, bprm->executable);
1151
1152	/*
1153	* Release all of the old mmap stuff
1154	*/
1155	acct_arg_size(bprm, pages: `0`);
1156	retval = exec_mmap(mm: bprm->mm);
1157	if (retval)
1158	goto out;
1159
1160	bprm->mm = NULL;
1161
1162	retval = exec_task_namespaces();
1163	if (retval)
1164	goto out_unlock;
1165
1166	#ifdef CONFIG_POSIX_TIMERS
1167	spin_lock_irq(lock: &me->sighand->siglock);
1168	posix_cpu_timers_exit(task: me);
1169	spin_unlock_irq(lock: &me->sighand->siglock);
1170	exit_itimers(me);
1171	flush_itimer_signals();
1172	#endif
1173
1174	/*
1175	* Make the signal table private.
1176	*/
1177	retval = unshare_sighand(me);
1178	if (retval)
1179	goto out_unlock;
1180
1181	me->flags &= ~(PF_RANDOMIZE \| PF_FORKNOEXEC \|
1182	PF_NOFREEZE \| PF_NO_SETAFFINITY);
1183	flush_thread();
1184	me->personality &= ~bprm->per_clear;
1185
1186	clear_syscall_work_syscall_user_dispatch(me);
1187
1188	/*
1189	* We have to apply CLOEXEC before we change whether the process is
1190	* dumpable (in setup_new_exec) to avoid a race with a process in userspace
1191	* trying to access the should-be-closed file descriptors of a process
1192	* undergoing exec(2).
1193	*/
1194	do_close_on_exec(me->files);
1195
1196	if (bprm->secureexec) {
1197	/ Make sure parent cannot signal privileged process. /
1198	me->pdeath_signal = `0`;
1199
1200	/*
1201	* For secureexec, reset the stack limit to sane default to
1202	* avoid bad behavior from the prior rlimits. This has to
1203	* happen before arch_pick_mmap_layout(), which examines
1204	* RLIMIT_STACK, but after the point of no return to avoid
1205	* needing to clean up the change on failure.
1206	*/
1207	if (bprm->rlim_stack.rlim_cur > _STK_LIM)
1208	bprm->rlim_stack.rlim_cur = _STK_LIM;
1209	}
1210
1211	me->sas_ss_sp = me->sas_ss_size = `0`;
1212
1213	/*
1214	* Figure out dumpability. Note that this checking only of current
1215	* is wrong, but userspace depends on it. This should be testing
1216	* bprm->secureexec instead.
1217	*/
1218	if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP \|\|
1219	!(uid_eq(current_euid(), current_uid()) &&
1220	gid_eq(current_egid(), current_gid())))
1221	set_dumpable(current->mm, value: suid_dumpable);
1222	else
1223	set_dumpable(current->mm, SUID_DUMP_USER);
1224
1225	perf_event_exec();
1226
1227	/*
1228	* If the original filename was empty, alloc_bprm() made up a path
1229	* that will probably not be useful to admins running ps or similar.
1230	* Let's fix it up to be something reasonable.
1231	*/
1232	if (bprm->comm_from_dentry) {
1233	/*
1234	* Hold RCU lock to keep the name from being freed behind our back.
1235	* Use acquire semantics to make sure the terminating NUL from
1236	* __d_alloc() is seen.
1237	*
1238	* Note, we're deliberately sloppy here. We don't need to care about
1239	* detecting a concurrent rename and just want a terminated name.
1240	*/
1241	rcu_read_lock();
1242	__set_task_comm(tsk: me, smp_load_acquire(&bprm->file->f_path.dentry->d_name.name),
1243	exec: true);
1244	rcu_read_unlock();
1245	} else {
1246	__set_task_comm(tsk: me, buf: kbasename(path: bprm->filename), exec: true);
1247	}
1248
1249	/ An exec changes our domain. We are no longer part of the thread*
1250	group /*
1251	WRITE_ONCE(me->self_exec_id, me->self_exec_id + `1`);
1252	flush_signal_handlers(me, force_default: `0`);
1253
1254	retval = set_cred_ucounts(bprm->cred);
1255	if (retval < `0`)
1256	goto out_unlock;
1257
1258	/*
1259	* install the new credentials for this executable
1260	*/
1261	security_bprm_committing_creds(bprm);
1262
1263	commit_creds(bprm->cred);
1264	bprm->cred = NULL;
1265
1266	/*
1267	* Disable monitoring for regular users
1268	* when executing setuid binaries. Must
1269	* wait until new credentials are committed
1270	* by commit_creds() above
1271	*/
1272	if (get_dumpable(mm: me->mm) != SUID_DUMP_USER)
1273	perf_event_exit_task(child: me);
1274	/*
1275	* cred_guard_mutex must be held at least to this point to prevent
1276	* ptrace_attach() from altering our determination of the task's
1277	* credentials; any time after this it may be unlocked.
1278	*/
1279	security_bprm_committed_creds(bprm);
1280
1281	/ Pass the opened binary to the interpreter. /
1282	if (bprm->have_execfd) {
1283	retval = get_unused_fd_flags(flags: `0`);
1284	if (retval < `0`)
1285	goto out_unlock;
1286	fd_install(fd: retval, file: bprm->executable);
1287	bprm->executable = NULL;
1288	bprm->execfd = retval;
1289	}
1290	return `0`;
1291
1292	out_unlock:
1293	up_write(sem: &me->signal->exec_update_lock);
1294	if (!bprm->cred)
1295	mutex_unlock(lock: &me->signal->cred_guard_mutex);
1296
1297	out:
1298	return retval;
1299	}
1300	EXPORT_SYMBOL(begin_new_exec);
1301
1302	void would_dump(struct linux_binprm bprm, struct* file *file)
1303	{
1304	struct inode *inode = file_inode(f: file);
1305	struct mnt_idmap *idmap = file_mnt_idmap(file);
1306	if (inode_permission(idmap, inode, MAY_READ) < `0`) {
1307	struct user_namespace old, user_ns;
1308	bprm->interp_flags \|= BINPRM_FLAGS_ENFORCE_NONDUMP;
1309
1310	/ Ensure mm->user_ns contains the executable /
1311	user_ns = old = bprm->mm->user_ns;
1312	while ((user_ns != &init_user_ns) &&
1313	!privileged_wrt_inode_uidgid(ns: user_ns, idmap, inode))
1314	user_ns = user_ns->parent;
1315
1316	if (old != user_ns) {
1317	bprm->mm->user_ns = get_user_ns(ns: user_ns);
1318	put_user_ns(ns: old);
1319	}
1320	}
1321	}
1322	EXPORT_SYMBOL(would_dump);
1323
1324	void setup_new_exec(struct linux_binprm * bprm)
1325	{
1326	/ Setup things that can depend upon the personality /
1327	struct task_struct *me = current;
1328
1329	arch_pick_mmap_layout(mm: me->mm, rlim_stack: &bprm->rlim_stack);
1330
1331	arch_setup_new_exec();
1332
1333	/ Set the new mm task size. We have to do that late because it may*
1334	* depend on TIF_32BIT which is only updated in flush_thread() on
1335	* some architectures like powerpc
1336	*/
1337	me->mm->task_size = TASK_SIZE;
1338	up_write(sem: &me->signal->exec_update_lock);
1339	mutex_unlock(lock: &me->signal->cred_guard_mutex);
1340	}
1341	EXPORT_SYMBOL(setup_new_exec);
1342
1343	/ Runs immediately before start_thread() takes over. /
1344	void finalize_exec(struct linux_binprm *bprm)
1345	{
1346	/ Store any stack rlimit changes before starting thread. /
1347	task_lock(current->group_leader);
1348	current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack;
1349	task_unlock(current->group_leader);
1350	}
1351	EXPORT_SYMBOL(finalize_exec);
1352
1353	/*
1354	* Prepare credentials and lock ->cred_guard_mutex.
1355	* setup_new_exec() commits the new creds and drops the lock.
1356	* Or, if exec fails before, free_bprm() should release ->cred
1357	* and unlock.
1358	*/
1359	static int prepare_bprm_creds(struct linux_binprm *bprm)
1360	{
1361	if (mutex_lock_interruptible(lock: &current->signal->cred_guard_mutex))
1362	return -ERESTARTNOINTR;
1363
1364	bprm->cred = prepare_exec_creds();
1365	if (likely(bprm->cred))
1366	return `0`;
1367
1368	mutex_unlock(lock: &current->signal->cred_guard_mutex);
1369	return -ENOMEM;
1370	}
1371
/*
 * Matches do_open_execat(): re-allow write access to @file and drop the
 * reference. A NULL @file is ignored.
 */
static void do_close_execat(struct file *file)
{
	if (!file)
		return;
	exe_file_allow_write_access(file);
	fput(file);
}
1380
1381	static void free_bprm(struct linux_binprm *bprm)
1382	{
1383	if (bprm->mm) {
1384	acct_arg_size(bprm, pages: `0`);
1385	mmput(bprm->mm);
1386	}
1387	free_arg_pages(bprm);
1388	if (bprm->cred) {
1389	/ in case exec fails before de_thread() succeeds /
1390	current->fs->in_exec = `0`;
1391	mutex_unlock(lock: &current->signal->cred_guard_mutex);
1392	abort_creds(bprm->cred);
1393	}
1394	do_close_execat(file: bprm->file);
1395	if (bprm->executable)
1396	fput(bprm->executable);
1397	/ If a binfmt changed the interp, free it. /
1398	if (bprm->interp != bprm->filename)
1399	kfree(objp: bprm->interp);
1400	kfree(objp: bprm->fdpath);
1401	kfree(objp: bprm);
1402	}
1403
1404	static struct linux_binprm alloc_bprm(int* fd, struct filename filename, int* flags)
1405	{
1406	struct linux_binprm *bprm;
1407	struct file *file;
1408	int retval = -ENOMEM;
1409
1410	file = do_open_execat(fd, name: filename, flags);
1411	if (IS_ERR(ptr: file))
1412	return ERR_CAST(ptr: file);
1413
1414	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1415	if (!bprm) {
1416	do_close_execat(file);
1417	return ERR_PTR(error: -ENOMEM);
1418	}
1419
1420	bprm->file = file;
1421
1422	if (fd == AT_FDCWD \|\| filename->name[`0`] == `'/'`) {
1423	bprm->filename = filename->name;
1424	} else {
1425	if (filename->name[`0`] == `'\0'`) {
1426	bprm->fdpath = kasprintf(GFP_KERNEL, fmt: "/dev/fd/%d", fd);
1427	bprm->comm_from_dentry = `1`;
1428	} else {
1429	bprm->fdpath = kasprintf(GFP_KERNEL, fmt: "/dev/fd/%d/%s",
1430	fd, filename->name);
1431	}
1432	if (!bprm->fdpath)
1433	goto out_free;
1434
1435	/*
1436	* Record that a name derived from an O_CLOEXEC fd will be
1437	* inaccessible after exec. This allows the code in exec to
1438	* choose to fail when the executable is not mmaped into the
1439	* interpreter and an open file descriptor is not passed to
1440	* the interpreter. This makes for a better user experience
1441	* than having the interpreter start and then immediately fail
1442	* when it finds the executable is inaccessible.
1443	*/
1444	if (get_close_on_exec(fd))
1445	bprm->interp_flags \|= BINPRM_FLAGS_PATH_INACCESSIBLE;
1446
1447	bprm->filename = bprm->fdpath;
1448	}
1449	bprm->interp = bprm->filename;
1450
1451	/*
1452	* At this point, security_file_open() has already been called (with
1453	* __FMODE_EXEC) and access control checks for AT_EXECVE_CHECK will
1454	* stop just after the security_bprm_creds_for_exec() call in
1455	* bprm_execve(). Indeed, the kernel should not try to parse the
1456	* content of the file with exec_binprm() nor change the calling
1457	* thread, which means that the following security functions will not
1458	* be called:
1459	* - security_bprm_check()
1460	* - security_bprm_creds_from_file()
1461	* - security_bprm_committing_creds()
1462	* - security_bprm_committed_creds()
1463	*/
1464	bprm->is_check = !!(flags & AT_EXECVE_CHECK);
1465
1466	retval = bprm_mm_init(bprm);
1467	if (!retval)
1468	return bprm;
1469
1470	out_free:
1471	free_bprm(bprm);
1472	return ERR_PTR(error: retval);
1473	}
1474
1475	int bprm_change_interp(const char interp, struct* linux_binprm *bprm)
1476	{
1477	/ If a binfmt changed the interp, free it first. /
1478	if (bprm->interp != bprm->filename)
1479	kfree(objp: bprm->interp);
1480	bprm->interp = kstrdup(s: interp, GFP_KERNEL);
1481	if (!bprm->interp)
1482	return -ENOMEM;
1483	return `0`;
1484	}
1485	EXPORT_SYMBOL(bprm_change_interp);
1486
1487	/*
1488	* determine how safe it is to execute the proposed program
1489	* - the caller must hold ->cred_guard_mutex to protect against
1490	* PTRACE_ATTACH or seccomp thread-sync
1491	*/
1492	static void check_unsafe_exec(struct linux_binprm *bprm)
1493	{
1494	struct task_struct p = current, t;
1495	unsigned n_fs;
1496
1497	if (p->ptrace)
1498	bprm->unsafe \|= LSM_UNSAFE_PTRACE;
1499
1500	/*
1501	* This isn't strictly necessary, but it makes it harder for LSMs to
1502	* mess up.
1503	*/
1504	if (task_no_new_privs(current))
1505	bprm->unsafe \|= LSM_UNSAFE_NO_NEW_PRIVS;
1506
1507	/*
1508	* If another task is sharing our fs, we cannot safely
1509	* suid exec because the differently privileged task
1510	* will be able to manipulate the current directory, etc.
1511	* It would be nice to force an unshare instead...
1512	*
1513	* Otherwise we set fs->in_exec = 1 to deny clone(CLONE_FS)
1514	* from another sub-thread until de_thread() succeeds, this
1515	* state is protected by cred_guard_mutex we hold.
1516	*/
1517	n_fs = `1`;
1518	read_seqlock_excl(sl: &p->fs->seq);
1519	rcu_read_lock();
1520	for_other_threads(p, t) {
1521	if (t->fs == p->fs)
1522	n_fs++;
1523	}
1524	rcu_read_unlock();
1525
1526	/ "users" and "in_exec" locked for copy_fs() /
1527	if (p->fs->users > n_fs)
1528	bprm->unsafe \|= LSM_UNSAFE_SHARE;
1529	else
1530	p->fs->in_exec = `1`;
1531	read_sequnlock_excl(sl: &p->fs->seq);
1532	}
1533
1534	static void bprm_fill_uid(struct linux_binprm bprm, struct* file *file)
1535	{
1536	/ Handle suid and sgid on files /
1537	struct mnt_idmap *idmap;
1538	struct inode *inode = file_inode(f: file);
1539	unsigned int mode;
1540	vfsuid_t vfsuid;
1541	vfsgid_t vfsgid;
1542	int err;
1543
1544	if (!mnt_may_suid(mnt: file->f_path.mnt))
1545	return;
1546
1547	if (task_no_new_privs(current))
1548	return;
1549
1550	mode = READ_ONCE(inode->i_mode);
1551	if (!(mode & (S_ISUID\|S_ISGID)))
1552	return;
1553
1554	idmap = file_mnt_idmap(file);
1555
1556	/ Be careful if suid/sgid is set /
1557	inode_lock(inode);
1558
1559	/ Atomically reload and check mode/uid/gid now that lock held. /
1560	mode = inode->i_mode;
1561	vfsuid = i_uid_into_vfsuid(idmap, inode);
1562	vfsgid = i_gid_into_vfsgid(idmap, inode);
1563	err = inode_permission(idmap, inode, MAY_EXEC);
1564	inode_unlock(inode);
1565
1566	/ Did the exec bit vanish out from under us? Give up. /
1567	if (err)
1568	return;
1569
1570	/ We ignore suid/sgid if there are no mappings for them in the ns /
1571	if (!vfsuid_has_mapping(userns: bprm->cred->user_ns, vfsuid) \|\|
1572	!vfsgid_has_mapping(userns: bprm->cred->user_ns, vfsgid))
1573	return;
1574
1575	if (mode & S_ISUID) {
1576	bprm->per_clear \|= PER_CLEAR_ON_SETID;
1577	bprm->cred->euid = vfsuid_into_kuid(vfsuid);
1578	}
1579
1580	if ((mode & (S_ISGID \| S_IXGRP)) == (S_ISGID \| S_IXGRP)) {
1581	bprm->per_clear \|= PER_CLEAR_ON_SETID;
1582	bprm->cred->egid = vfsgid_into_kgid(vfsgid);
1583	}
1584	}
1585
1586	/*
1587	* Compute brpm->cred based upon the final binary.
1588	*/
1589	static int bprm_creds_from_file(struct linux_binprm *bprm)
1590	{
1591	/ Compute creds based on which file? /
1592	struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file;
1593
1594	bprm_fill_uid(bprm, file);
1595	return security_bprm_creds_from_file(bprm, file);
1596	}
1597
1598	/*
1599	* Fill the binprm structure from the inode.
1600	* Read the first BINPRM_BUF_SIZE bytes
1601	*
1602	* This may be called multiple times for binary chains (scripts for example).
1603	*/
1604	static int prepare_binprm(struct linux_binprm *bprm)
1605	{
1606	loff_t pos = `0`;
1607
1608	memset(s: bprm->buf, c: `0`, BINPRM_BUF_SIZE);
1609	return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos);
1610	}
1611
1612	/*
1613	* Arguments are '\0' separated strings found at the location bprm->p
1614	* points to; chop off the first by relocating brpm->p to right after
1615	* the first '\0' encountered.
1616	*/
1617	int remove_arg_zero(struct linux_binprm *bprm)
1618	{
1619	unsigned long offset;
1620	char *kaddr;
1621	struct page *page;
1622
1623	if (!bprm->argc)
1624	return `0`;
1625
1626	do {
1627	offset = bprm->p & ~PAGE_MASK;
1628	page = get_arg_page(bprm, pos: bprm->p, write: `0`);
1629	if (!page)
1630	return -EFAULT;
1631	kaddr = kmap_local_page(page);
1632
1633	for (; offset < PAGE_SIZE && kaddr[offset];
1634	offset++, bprm->p++)
1635	;
1636
1637	kunmap_local(kaddr);
1638	put_arg_page(page);
1639	} while (offset == PAGE_SIZE);
1640
1641	bprm->p++;
1642	bprm->argc--;
1643
1644	return `0`;
1645	}
1646	EXPORT_SYMBOL(remove_arg_zero);
1647
1648	/*
1649	* cycle the list of binary formats handler, until one recognizes the image
1650	*/
1651	static int search_binary_handler(struct linux_binprm *bprm)
1652	{
1653	struct linux_binfmt *fmt;
1654	int retval;
1655
1656	retval = prepare_binprm(bprm);
1657	if (retval < `0`)
1658	return retval;
1659
1660	retval = security_bprm_check(bprm);
1661	if (retval)
1662	return retval;
1663
1664	read_lock(&binfmt_lock);
1665	list_for_each_entry(fmt, &formats, lh) {
1666	if (!try_module_get(module: fmt->module))
1667	continue;
1668	read_unlock(&binfmt_lock);
1669
1670	retval = fmt->load_binary(bprm);
1671
1672	read_lock(&binfmt_lock);
1673	put_binfmt(fmt);
1674	if (bprm->point_of_no_return \|\| (retval != -ENOEXEC)) {
1675	read_unlock(&binfmt_lock);
1676	return retval;
1677	}
1678	}
1679	read_unlock(&binfmt_lock);
1680
1681	return -ENOEXEC;
1682	}
1683
1684	/ binfmt handlers will call back into begin_new_exec() on success. /
1685	static int exec_binprm(struct linux_binprm *bprm)
1686	{
1687	pid_t old_pid, old_vpid;
1688	int ret, depth;
1689
1690	/ Need to fetch pid before load_binary changes it /
1691	old_pid = current->pid;
1692	rcu_read_lock();
1693	old_vpid = task_pid_nr_ns(current, ns: task_active_pid_ns(current->parent));
1694	rcu_read_unlock();
1695
1696	/ This allows 4 levels of binfmt rewrites before failing hard. /
1697	for (depth = `0`;; depth++) {
1698	struct file *exec;
1699	if (depth > `5`)
1700	return -ELOOP;
1701
1702	ret = search_binary_handler(bprm);
1703	if (ret < `0`)
1704	return ret;
1705	if (!bprm->interpreter)
1706	break;
1707
1708	exec = bprm->file;
1709	bprm->file = bprm->interpreter;
1710	bprm->interpreter = NULL;
1711
1712	exe_file_allow_write_access(exe_file: exec);
1713	if (unlikely(bprm->have_execfd)) {
1714	if (bprm->executable) {
1715	fput(exec);
1716	return -ENOEXEC;
1717	}
1718	bprm->executable = exec;
1719	} else
1720	fput(exec);
1721	}
1722
1723	audit_bprm(bprm);
1724	trace_sched_process_exec(current, old_pid, bprm);
1725	ptrace_event(PTRACE_EVENT_EXEC, message: old_vpid);
1726	proc_exec_connector(current);
1727	return `0`;
1728	}
1729
1730	static int bprm_execve(struct linux_binprm *bprm)
1731	{
1732	int retval;
1733
1734	retval = prepare_bprm_creds(bprm);
1735	if (retval)
1736	return retval;
1737
1738	/*
1739	* Check for unsafe execution states before exec_binprm(), which
1740	* will call back into begin_new_exec(), into bprm_creds_from_file(),
1741	* where setuid-ness is evaluated.
1742	*/
1743	check_unsafe_exec(bprm);
1744	current->in_execve = `1`;
1745	sched_mm_cid_before_execve(current);
1746
1747	sched_exec();
1748
1749	/ Set the unchanging part of bprm->cred /
1750	retval = security_bprm_creds_for_exec(bprm);
1751	if (retval \|\| bprm->is_check)
1752	goto out;
1753
1754	retval = exec_binprm(bprm);
1755	if (retval < `0`)
1756	goto out;
1757
1758	sched_mm_cid_after_execve(current);
1759	rseq_execve(current);
1760	/ execve succeeded /
1761	current->in_execve = `0`;
1762	user_events_execve(current);
1763	acct_update_integrals(current);
1764	task_numa_free(current, final: false);
1765	return retval;
1766
1767	out:
1768	/*
1769	* If past the point of no return ensure the code never
1770	* returns to the userspace process. Use an existing fatal
1771	* signal if present otherwise terminate the process with
1772	* SIGSEGV.
1773	*/
1774	if (bprm->point_of_no_return && !fatal_signal_pending(current))
1775	force_fatal_sig(SIGSEGV);
1776
1777	sched_mm_cid_after_execve(current);
1778	rseq_set_notify_resume(current);
1779	current->in_execve = `0`;
1780
1781	return retval;
1782	}
1783
1784	static int do_execveat_common(int fd, struct filename *filename,
1785	struct user_arg_ptr argv,
1786	struct user_arg_ptr envp,
1787	int flags)
1788	{
1789	struct linux_binprm *bprm;
1790	int retval;
1791
1792	if (IS_ERR(ptr: filename))
1793	return PTR_ERR(ptr: filename);
1794
1795	/*
1796	* We move the actual failure in case of RLIMIT_NPROC excess from
1797	* set*uid() to execve() because too many poorly written programs
1798	* don't check setuid() return code. Here we additionally recheck
1799	* whether NPROC limit is still exceeded.
1800	*/
1801	if ((current->flags & PF_NPROC_EXCEEDED) &&
1802	is_rlimit_overlimit(current_ucounts(), type: UCOUNT_RLIMIT_NPROC, max: rlimit(RLIMIT_NPROC))) {
1803	retval = -EAGAIN;
1804	goto out_ret;
1805	}
1806
1807	/ We're below the limit (still or again), so we don't want to make*
1808	* further execve() calls fail. */
1809	current->flags &= ~PF_NPROC_EXCEEDED;
1810
1811	bprm = alloc_bprm(fd, filename, flags);
1812	if (IS_ERR(ptr: bprm)) {
1813	retval = PTR_ERR(ptr: bprm);
1814	goto out_ret;
1815	}
1816
1817	retval = count(argv, MAX_ARG_STRINGS);
1818	if (retval < `0`)
1819	goto out_free;
1820	bprm->argc = retval;
1821
1822	retval = count(argv: envp, MAX_ARG_STRINGS);
1823	if (retval < `0`)
1824	goto out_free;
1825	bprm->envc = retval;
1826
1827	retval = bprm_stack_limits(bprm);
1828	if (retval < `0`)
1829	goto out_free;
1830
1831	retval = copy_string_kernel(bprm->filename, bprm);
1832	if (retval < `0`)
1833	goto out_free;
1834	bprm->exec = bprm->p;
1835
1836	retval = copy_strings(argc: bprm->envc, argv: envp, bprm);
1837	if (retval < `0`)
1838	goto out_free;
1839
1840	retval = copy_strings(argc: bprm->argc, argv, bprm);
1841	if (retval < `0`)
1842	goto out_free;
1843
1844	/*
1845	* When argv is empty, add an empty string ("") as argv[0] to
1846	* ensure confused userspace programs that start processing
1847	* from argv[1] won't end up walking envp. See also
1848	* bprm_stack_limits().
1849	*/
1850	if (bprm->argc == `0`) {
1851	retval = copy_string_kernel("", bprm);
1852	if (retval < `0`)
1853	goto out_free;
1854	bprm->argc = `1`;
1855
1856	pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
1857	current->comm, bprm->filename);
1858	}
1859
1860	retval = bprm_execve(bprm);
1861	out_free:
1862	free_bprm(bprm);
1863
1864	out_ret:
1865	putname(name: filename);
1866	return retval;
1867	}
1868
1869	int kernel_execve(const char *kernel_filename,
1870	const char *const argv, const* char *const *envp)
1871	{
1872	struct filename *filename;
1873	struct linux_binprm *bprm;
1874	int fd = AT_FDCWD;
1875	int retval;
1876
1877	/ It is non-sense for kernel threads to call execve /
1878	if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
1879	return -EINVAL;
1880
1881	filename = getname_kernel(kernel_filename);
1882	if (IS_ERR(ptr: filename))
1883	return PTR_ERR(ptr: filename);
1884
1885	bprm = alloc_bprm(fd, filename, flags: `0`);
1886	if (IS_ERR(ptr: bprm)) {
1887	retval = PTR_ERR(ptr: bprm);
1888	goto out_ret;
1889	}
1890
1891	retval = count_strings_kernel(argv);
1892	if (WARN_ON_ONCE(retval == `0`))
1893	retval = -EINVAL;
1894	if (retval < `0`)
1895	goto out_free;
1896	bprm->argc = retval;
1897
1898	retval = count_strings_kernel(argv: envp);
1899	if (retval < `0`)
1900	goto out_free;
1901	bprm->envc = retval;
1902
1903	retval = bprm_stack_limits(bprm);
1904	if (retval < `0`)
1905	goto out_free;
1906
1907	retval = copy_string_kernel(bprm->filename, bprm);
1908	if (retval < `0`)
1909	goto out_free;
1910	bprm->exec = bprm->p;
1911
1912	retval = copy_strings_kernel(argc: bprm->envc, argv: envp, bprm);
1913	if (retval < `0`)
1914	goto out_free;
1915
1916	retval = copy_strings_kernel(argc: bprm->argc, argv, bprm);
1917	if (retval < `0`)
1918	goto out_free;
1919
1920	retval = bprm_execve(bprm);
1921	out_free:
1922	free_bprm(bprm);
1923	out_ret:
1924	putname(name: filename);
1925	return retval;
1926	}
1927
1928	static int do_execve(struct filename *filename,
1929	const char __user *const __user *__argv,
1930	const char __user *const __user *__envp)
1931	{
1932	struct user_arg_ptr argv = { .ptr.native = __argv };
1933	struct user_arg_ptr envp = { .ptr.native = __envp };
1934	return do_execveat_common(AT_FDCWD, filename, argv, envp, flags: `0`);
1935	}
1936
1937	static int do_execveat(int fd, struct filename *filename,
1938	const char __user *const __user *__argv,
1939	const char __user *const __user *__envp,
1940	int flags)
1941	{
1942	struct user_arg_ptr argv = { .ptr.native = __argv };
1943	struct user_arg_ptr envp = { .ptr.native = __envp };
1944
1945	return do_execveat_common(fd, filename, argv, envp, flags);
1946	}
1947
1948	#ifdef CONFIG_COMPAT
1949	static int compat_do_execve(struct filename *filename,
1950	const compat_uptr_t __user *__argv,
1951	const compat_uptr_t __user *__envp)
1952	{
1953	struct user_arg_ptr argv = {
1954	.is_compat = true,
1955	.ptr.compat = __argv,
1956	};
1957	struct user_arg_ptr envp = {
1958	.is_compat = true,
1959	.ptr.compat = __envp,
1960	};
1961	return do_execveat_common(AT_FDCWD, filename, argv, envp, flags: `0`);
1962	}
1963
1964	static int compat_do_execveat(int fd, struct filename *filename,
1965	const compat_uptr_t __user *__argv,
1966	const compat_uptr_t __user *__envp,
1967	int flags)
1968	{
1969	struct user_arg_ptr argv = {
1970	.is_compat = true,
1971	.ptr.compat = __argv,
1972	};
1973	struct user_arg_ptr envp = {
1974	.is_compat = true,
1975	.ptr.compat = __envp,
1976	};
1977	return do_execveat_common(fd, filename, argv, envp, flags);
1978	}
1979	#endif
1980
1981	void set_binfmt(struct linux_binfmt *new)
1982	{
1983	struct mm_struct *mm = current->mm;
1984
1985	if (mm->binfmt)
1986	module_put(module: mm->binfmt->module);
1987
1988	mm->binfmt = new;
1989	if (new)
1990	__module_get(module: new->module);
1991	}
1992	EXPORT_SYMBOL(set_binfmt);
1993
1994	/*
1995	* set_dumpable stores three-value SUID_DUMP_* into mm->flags.
1996	*/
1997	void set_dumpable(struct mm_struct mm, int* value)
1998	{
1999	if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
2000	return;
2001
2002	__mm_flags_set_mask_dumpable(mm, value);
2003	}
2004
2005	SYSCALL_DEFINE3(execve,
2006	const char __user *, filename,
2007	const char __user *const __user *, argv,
2008	const char __user *const __user *, envp)
2009	{
2010	return do_execve(filename: getname(name: filename), argv: argv, envp: envp);
2011	}
2012
2013	SYSCALL_DEFINE5(execveat,
2014	int, fd, const char __user *, filename,
2015	const char __user *const __user *, argv,
2016	const char __user *const __user *, envp,
2017	int, flags)
2018	{
2019	return do_execveat(fd,
2020	filename: getname_uflags(filename, flags),
2021	argv: argv, envp: envp, flags);
2022	}
2023
2024	#ifdef CONFIG_COMPAT
2025	COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
2026	const compat_uptr_t __user *, argv,
2027	const compat_uptr_t __user *, envp)
2028	{
2029	return compat_do_execve(filename: getname(name: filename), argv: argv, envp: envp);
2030	}
2031
2032	COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
2033	const char __user *, filename,
2034	const compat_uptr_t __user *, argv,
2035	const compat_uptr_t __user *, envp,
2036	int, flags)
2037	{
2038	return compat_do_execveat(fd,
2039	filename: getname_uflags(filename, flags),
2040	argv: argv, envp: envp, flags);
2041	}
2042	#endif
2043
2044	#ifdef CONFIG_SYSCTL
2045
2046	static int proc_dointvec_minmax_coredump(const struct ctl_table table, int* write,
2047	void buffer, size_t lenp, loff_t *ppos)
2048	{
2049	int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2050
2051	if (!error && !write)
2052	validate_coredump_safety();
2053	return error;
2054	}
2055
2056	static const struct ctl_table fs_exec_sysctls[] = {
2057	{
2058	.procname = "suid_dumpable",
2059	.data = &suid_dumpable,
2060	.maxlen = sizeof(int),
2061	.mode = `0644`,
2062	.proc_handler = proc_dointvec_minmax_coredump,
2063	.extra1 = SYSCTL_ZERO,
2064	.extra2 = SYSCTL_TWO,
2065	},
2066	};
2067
2068	static int __init init_fs_exec_sysctls(void)
2069	{
2070	register_sysctl_init("fs", fs_exec_sysctls);
2071	return `0`;
2072	}
2073
2074	fs_initcall(init_fs_exec_sysctls);
2075	#endif /* CONFIG_SYSCTL */
2076
2077	#ifdef CONFIG_EXEC_KUNIT_TEST
2078	#include "tests/exec_kunit.c"
2079	#endif
2080

Browse the source code of Linux/fs/exec.c