namespace.c source code [Linux/fs/namespace.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* linux/fs/namespace.c
4	*
5	* (C) Copyright Al Viro 2000, 2001
6	*
7	* Based on code from fs/super.c, copyright Linus Torvalds and others.
8	* Heavily rewritten.
9	*/
10
11	#include <linux/syscalls.h>
12	#include <linux/export.h>
13	#include <linux/capability.h>
14	#include <linux/mnt_namespace.h>
15	#include <linux/user_namespace.h>
16	#include <linux/namei.h>
17	#include <linux/security.h>
18	#include <linux/cred.h>
19	#include <linux/idr.h>
20	#include <linux/init.h> /* init_rootfs */
21	#include <linux/fs_struct.h> /* get_fs_root et.al. */
22	#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
23	#include <linux/file.h>
24	#include <linux/uaccess.h>
25	#include <linux/proc_ns.h>
26	#include <linux/magic.h>
27	#include <linux/memblock.h>
28	#include <linux/proc_fs.h>
29	#include <linux/task_work.h>
30	#include <linux/sched/task.h>
31	#include <uapi/linux/mount.h>
32	#include <linux/fs_context.h>
33	#include <linux/shmem_fs.h>
34	#include <linux/mnt_idmapping.h>
35	#include <linux/pidfs.h>
36	#include <linux/nstree.h>
37
38	#include "pnode.h"
39	#include "internal.h"
40
41	/ Maximum number of mounts in a mount namespace /
42	static unsigned int sysctl_mount_max __read_mostly = `100000`;
43
44	static unsigned int m_hash_mask __ro_after_init;
45	static unsigned int m_hash_shift __ro_after_init;
46	static unsigned int mp_hash_mask __ro_after_init;
47	static unsigned int mp_hash_shift __ro_after_init;
48
49	static __initdata unsigned long mhash_entries;
50	static int __init set_mhash_entries(char *str)
51	{
52	if (!str)
53	return `0`;
54	mhash_entries = simple_strtoul(str, &str, `0`);
55	return `1`;
56	}
57	__setup("mhash_entries=", set_mhash_entries);
58
59	static __initdata unsigned long mphash_entries;
60	static int __init set_mphash_entries(char *str)
61	{
62	if (!str)
63	return `0`;
64	mphash_entries = simple_strtoul(str, &str, `0`);
65	return `1`;
66	}
67	__setup("mphash_entries=", set_mphash_entries);
68
69	static char * __initdata initramfs_options;
70	static int __init initramfs_options_setup(char *str)
71	{
72	initramfs_options = str;
73	return `1`;
74	}
75
76	__setup("initramfs_options=", initramfs_options_setup);
77
78	static u64 event;
79	static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC);
80	static DEFINE_IDA(mnt_group_ida);
81
82	/ Don't allow confusion with old 32bit mount ID /
83	#define MNT_UNIQUE_ID_OFFSET (1ULL << 31)
84	static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET;
85
86	static struct hlist_head *mount_hashtable __ro_after_init;
87	static struct hlist_head *mountpoint_hashtable __ro_after_init;
88	static struct kmem_cache *mnt_cache __ro_after_init;
89	static DECLARE_RWSEM(namespace_sem);
90	static HLIST_HEAD(unmounted); / protected by namespace_sem /
91	static LIST_HEAD(ex_mountpoints); / protected by namespace_sem /
92	static struct mnt_namespace emptied_ns; /* protected by namespace_sem /
93
94	static inline void namespace_lock(void);
95	static void namespace_unlock(void);
96	DEFINE_LOCK_GUARD_0(namespace_excl, namespace_lock(), namespace_unlock())
97	DEFINE_LOCK_GUARD_0(namespace_shared, down_read(&namespace_sem),
98	up_read(&namespace_sem))
99
100	DEFINE_FREE(mntput, struct vfsmount *, if (!IS_ERR(_T)) mntput(_T))
101
#ifdef CONFIG_FSNOTIFY
LIST_HEAD(notify_list); /* protected by namespace_sem */
#endif
105
/* Bit flags stored in struct mount_kattr::kflags */
enum mount_kattr_flags_t {
	MOUNT_KATTR_RECURSE		= (1 << 0),
	MOUNT_KATTR_IDMAP_REPLACE	= (1 << 1),
};
110
111	struct mount_kattr {
112	unsigned int attr_set;
113	unsigned int attr_clr;
114	unsigned int propagation;
115	unsigned int lookup_flags;
116	enum mount_kattr_flags_t kflags;
117	struct user_namespace *mnt_userns;
118	struct mnt_idmap *mnt_idmap;
119	};
120
121	/ /sys/fs /
122	struct kobject *fs_kobj __ro_after_init;
123	EXPORT_SYMBOL_GPL(fs_kobj);
124
125	/*
126	* vfsmount lock may be taken for read to prevent changes to the
127	* vfsmount hash, ie. during mountpoint lookups or walking back
128	* up the tree.
129	*
130	* It should be taken for write in all cases where the vfsmount
131	* tree or hash is modified or when a vfsmount structure is modified.
132	*/
133	__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
134
135	static inline struct mnt_namespace node_to_mnt_ns(const* struct rb_node *node)
136	{
137	struct ns_common *ns;
138
139	if (!node)
140	return NULL;
141	ns = rb_entry(node, struct ns_common, ns_tree_node);
142	return container_of(ns, struct mnt_namespace, ns);
143	}
144
145	static void mnt_ns_release(struct mnt_namespace *ns)
146	{
147	/ keep alive for {list,stat}mount() /
148	if (ns && refcount_dec_and_test(r: &ns->passive)) {
149	fsnotify_mntns_delete(mntns: ns);
150	put_user_ns(ns: ns->user_ns);
151	kfree(objp: ns);
152	}
153	}
154	DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
155
156	static void mnt_ns_release_rcu(struct rcu_head *rcu)
157	{
158	mnt_ns_release(container_of(rcu, struct mnt_namespace, ns.ns_rcu));
159	}
160
161	static void mnt_ns_tree_remove(struct mnt_namespace *ns)
162	{
163	/ remove from global mount namespace list /
164	if (ns_tree_active(ns))
165	ns_tree_remove(ns);
166
167	call_rcu(head: &ns->ns.ns_rcu, func: mnt_ns_release_rcu);
168	}
169
170	/*
171	* Lookup a mount namespace by id and take a passive reference count. Taking a
172	* passive reference means the mount namespace can be emptied if e.g., the last
173	* task holding an active reference exits. To access the mounts of the
174	* namespace the @namespace_sem must first be acquired. If the namespace has
175	* already shut down before acquiring @namespace_sem, {list,stat}mount() will
176	* see that the mount rbtree of the namespace is empty.
177	*
178	* Note the lookup is lockless protected by a sequence counter. We only
179	* need to guard against false negatives as false positives aren't
180	* possible. So if we didn't find a mount namespace and the sequence
181	* counter has changed we need to retry. If the sequence counter is
182	* still the same we know the search actually failed.
183	*/
184	static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
185	{
186	struct mnt_namespace *mnt_ns;
187	struct ns_common *ns;
188
189	guard(rcu)();
190	ns = ns_tree_lookup_rcu(ns_id: mnt_ns_id, CLONE_NEWNS);
191	if (!ns)
192	return NULL;
193
194	/*
195	* The last reference count is put with RCU delay so we can
196	* unconditonally acquire a reference here.
197	*/
198	mnt_ns = container_of(ns, struct mnt_namespace, ns);
199	refcount_inc(r: &mnt_ns->passive);
200	return mnt_ns;
201	}
202
203	static inline void lock_mount_hash(void)
204	{
205	write_seqlock(sl: &mount_lock);
206	}
207
208	static inline void unlock_mount_hash(void)
209	{
210	write_sequnlock(sl: &mount_lock);
211	}
212
213	static inline struct hlist_head m_hash(struct* vfsmount mnt, struct* dentry *dentry)
214	{
215	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
216	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
217	tmp = tmp + (tmp >> m_hash_shift);
218	return &mount_hashtable[tmp & m_hash_mask];
219	}
220
221	static inline struct hlist_head mp_hash(struct* dentry *dentry)
222	{
223	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
224	tmp = tmp + (tmp >> mp_hash_shift);
225	return &mountpoint_hashtable[tmp & mp_hash_mask];
226	}
227
228	static int mnt_alloc_id(struct mount *mnt)
229	{
230	int res;
231
232	xa_lock(&mnt_id_xa);
233	res = __xa_alloc(&mnt_id_xa, id: &mnt->mnt_id, entry: mnt, XA_LIMIT(`1`, INT_MAX), GFP_KERNEL);
234	if (!res)
235	mnt->mnt_id_unique = ++mnt_id_ctr;
236	xa_unlock(&mnt_id_xa);
237	return res;
238	}
239
240	static void mnt_free_id(struct mount *mnt)
241	{
242	xa_erase(&mnt_id_xa, index: mnt->mnt_id);
243	}
244
245	/*
246	* Allocate a new peer group ID
247	*/
248	static int mnt_alloc_group_id(struct mount *mnt)
249	{
250	int res = ida_alloc_min(ida: &mnt_group_ida, min: `1`, GFP_KERNEL);
251
252	if (res < `0`)
253	return res;
254	mnt->mnt_group_id = res;
255	return `0`;
256	}
257
258	/*
259	* Release a peer group ID
260	*/
261	void mnt_release_group_id(struct mount *mnt)
262	{
263	ida_free(&mnt_group_ida, id: mnt->mnt_group_id);
264	mnt->mnt_group_id = `0`;
265	}
266
267	/*
268	* vfsmount lock must be held for read
269	*/
270	static inline void mnt_add_count(struct mount mnt, int* n)
271	{
272	#ifdef CONFIG_SMP
273	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
274	#else
275	preempt_disable();
276	mnt->mnt_count += n;
277	preempt_enable();
278	#endif
279	}
280
281	/*
282	* vfsmount lock must be held for write
283	*/
284	int mnt_get_count(struct mount *mnt)
285	{
286	#ifdef CONFIG_SMP
287	int count = `0`;
288	int cpu;
289
290	for_each_possible_cpu(cpu) {
291	count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
292	}
293
294	return count;
295	#else
296	return mnt->mnt_count;
297	#endif
298	}
299
300	static struct mount alloc_vfsmnt(const* char *name)
301	{
302	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
303	if (mnt) {
304	int err;
305
306	err = mnt_alloc_id(mnt);
307	if (err)
308	goto out_free_cache;
309
310	if (name)
311	mnt->mnt_devname = kstrdup_const(s: name,
312	GFP_KERNEL_ACCOUNT);
313	else
314	mnt->mnt_devname = "none";
315	if (!mnt->mnt_devname)
316	goto out_free_id;
317
318	#ifdef CONFIG_SMP
319	mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
320	if (!mnt->mnt_pcp)
321	goto out_free_devname;
322
323	this_cpu_add(mnt->mnt_pcp->mnt_count, `1`);
324	#else
325	mnt->mnt_count = `1`;
326	mnt->mnt_writers = `0`;
327	#endif
328
329	INIT_HLIST_NODE(h: &mnt->mnt_hash);
330	INIT_LIST_HEAD(list: &mnt->mnt_child);
331	INIT_LIST_HEAD(list: &mnt->mnt_mounts);
332	INIT_LIST_HEAD(list: &mnt->mnt_list);
333	INIT_LIST_HEAD(list: &mnt->mnt_expire);
334	INIT_LIST_HEAD(list: &mnt->mnt_share);
335	INIT_HLIST_HEAD(&mnt->mnt_slave_list);
336	INIT_HLIST_NODE(h: &mnt->mnt_slave);
337	INIT_HLIST_NODE(h: &mnt->mnt_mp_list);
338	INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
339	RB_CLEAR_NODE(&mnt->mnt_node);
340	mnt->mnt.mnt_idmap = &nop_mnt_idmap;
341	}
342	return mnt;
343
344	#ifdef CONFIG_SMP
345	out_free_devname:
346	kfree_const(x: mnt->mnt_devname);
347	#endif
348	out_free_id:
349	mnt_free_id(mnt);
350	out_free_cache:
351	kmem_cache_free(s: mnt_cache, objp: mnt);
352	return NULL;
353	}
354
355	/*
356	* Most r/o checks on a fs are for operations that take
357	* discrete amounts of time, like a write() or unlink().
358	* We must keep track of when those operations start
359	* (for permission checks) and when they end, so that
360	* we can determine when writes are able to occur to
361	* a filesystem.
362	*/
363	/*
364	* __mnt_is_readonly: check whether a mount is read-only
365	* @mnt: the mount to check for its write status
366	*
367	* This shouldn't be used directly ouside of the VFS.
368	* It does not guarantee that the filesystem will stay
369	* r/w, just that it is right now. This can not and
370	* should not be used in place of IS_RDONLY(inode).
371	* mnt_want/drop_write() will _keep_ the filesystem
372	* r/w.
373	*/
374	bool __mnt_is_readonly(const struct vfsmount *mnt)
375	{
376	return (mnt->mnt_flags & MNT_READONLY) \|\| sb_rdonly(sb: mnt->mnt_sb);
377	}
378	EXPORT_SYMBOL_GPL(__mnt_is_readonly);
379
380	static inline void mnt_inc_writers(struct mount *mnt)
381	{
382	#ifdef CONFIG_SMP
383	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
384	#else
385	mnt->mnt_writers++;
386	#endif
387	}
388
389	static inline void mnt_dec_writers(struct mount *mnt)
390	{
391	#ifdef CONFIG_SMP
392	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
393	#else
394	mnt->mnt_writers--;
395	#endif
396	}
397
398	static unsigned int mnt_get_writers(struct mount *mnt)
399	{
400	#ifdef CONFIG_SMP
401	unsigned int count = `0`;
402	int cpu;
403
404	for_each_possible_cpu(cpu) {
405	count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
406	}
407
408	return count;
409	#else
410	return mnt->mnt_writers;
411	#endif
412	}
413
414	static int mnt_is_readonly(const struct vfsmount *mnt)
415	{
416	if (READ_ONCE(mnt->mnt_sb->s_readonly_remount))
417	return `1`;
418	/*
419	* The barrier pairs with the barrier in sb_start_ro_state_change()
420	* making sure if we don't see s_readonly_remount set yet, we also will
421	* not see any superblock / mount flag changes done by remount.
422	* It also pairs with the barrier in sb_end_ro_state_change()
423	* assuring that if we see s_readonly_remount already cleared, we will
424	* see the values of superblock / mount flags updated by remount.
425	*/
426	smp_rmb();
427	return __mnt_is_readonly(mnt);
428	}
429
430	/*
431	* Most r/o & frozen checks on a fs are for operations that take discrete
432	* amounts of time, like a write() or unlink(). We must keep track of when
433	* those operations start (for permission checks) and when they end, so that we
434	* can determine when writes are able to occur to a filesystem.
435	*/
436	/**
437	* mnt_get_write_access - get write access to a mount without freeze protection
438	* @m: the mount on which to take a write
439	*
440	* This tells the low-level filesystem that a write is about to be performed to
441	* it, and makes sure that writes are allowed (mnt it read-write) before
442	* returning success. This operation does not protect against filesystem being
443	* frozen. When the write operation is finished, mnt_put_write_access() must be
444	* called. This is effectively a refcount.
445	*/
446	int mnt_get_write_access(struct vfsmount *m)
447	{
448	struct mount *mnt = real_mount(mnt: m);
449	int ret = `0`;
450
451	preempt_disable();
452	mnt_inc_writers(mnt);
453	/*
454	* The store to mnt_inc_writers must be visible before we pass
455	* WRITE_HOLD loop below, so that the slowpath can see our
456	* incremented count after it has set WRITE_HOLD.
457	*/
458	smp_mb();
459	might_lock(&mount_lock.lock);
460	while (__test_write_hold(READ_ONCE(mnt->mnt_pprev_for_sb))) {
461	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
462	cpu_relax();
463	} else {
464	/*
465	* This prevents priority inversion, if the task
466	* setting WRITE_HOLD got preempted on a remote
467	* CPU, and it prevents life lock if the task setting
468	* WRITE_HOLD has a lower priority and is bound to
469	* the same CPU as the task that is spinning here.
470	*/
471	preempt_enable();
472	read_seqlock_excl(sl: &mount_lock);
473	read_sequnlock_excl(sl: &mount_lock);
474	preempt_disable();
475	}
476	}
477	/*
478	* The barrier pairs with the barrier sb_start_ro_state_change() making
479	* sure that if we see WRITE_HOLD cleared, we will also see
480	* s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in
481	* mnt_is_readonly() and bail in case we are racing with remount
482	* read-only.
483	*/
484	smp_rmb();
485	if (mnt_is_readonly(mnt: m)) {
486	mnt_dec_writers(mnt);
487	ret = -EROFS;
488	}
489	preempt_enable();
490
491	return ret;
492	}
493	EXPORT_SYMBOL_GPL(mnt_get_write_access);
494
495	/**
496	* mnt_want_write - get write access to a mount
497	* @m: the mount on which to take a write
498	*
499	* This tells the low-level filesystem that a write is about to be performed to
500	* it, and makes sure that writes are allowed (mount is read-write, filesystem
501	* is not frozen) before returning success. When the write operation is
502	* finished, mnt_drop_write() must be called. This is effectively a refcount.
503	*/
504	int mnt_want_write(struct vfsmount *m)
505	{
506	int ret;
507
508	sb_start_write(sb: m->mnt_sb);
509	ret = mnt_get_write_access(m);
510	if (ret)
511	sb_end_write(sb: m->mnt_sb);
512	return ret;
513	}
514	EXPORT_SYMBOL_GPL(mnt_want_write);
515
516	/**
517	* mnt_get_write_access_file - get write access to a file's mount
518	* @file: the file who's mount on which to take a write
519	*
520	* This is like mnt_get_write_access, but if @file is already open for write it
521	* skips incrementing mnt_writers (since the open file already has a reference)
522	* and instead only does the check for emergency r/o remounts. This must be
523	* paired with mnt_put_write_access_file.
524	*/
525	int mnt_get_write_access_file(struct file *file)
526	{
527	if (file->f_mode & FMODE_WRITER) {
528	/*
529	* Superblock may have become readonly while there are still
530	* writable fd's, e.g. due to a fs error with errors=remount-ro
531	*/
532	if (__mnt_is_readonly(file->f_path.mnt))
533	return -EROFS;
534	return `0`;
535	}
536	return mnt_get_write_access(file->f_path.mnt);
537	}
538
539	/**
540	* mnt_want_write_file - get write access to a file's mount
541	* @file: the file who's mount on which to take a write
542	*
543	* This is like mnt_want_write, but if the file is already open for writing it
544	* skips incrementing mnt_writers (since the open file already has a reference)
545	* and instead only does the freeze protection and the check for emergency r/o
546	* remounts. This must be paired with mnt_drop_write_file.
547	*/
548	int mnt_want_write_file(struct file *file)
549	{
550	int ret;
551
552	sb_start_write(sb: file_inode(f: file)->i_sb);
553	ret = mnt_get_write_access_file(file);
554	if (ret)
555	sb_end_write(sb: file_inode(f: file)->i_sb);
556	return ret;
557	}
558	EXPORT_SYMBOL_GPL(mnt_want_write_file);
559
/**
 * mnt_put_write_access - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it.  Must be matched with
 * mnt_get_write_access() call above.
 *
 * The writer count is decremented with preemption disabled.
 */
void mnt_put_write_access(struct vfsmount *mnt)
{
	preempt_disable();
	mnt_dec_writers(real_mount(mnt));
	preempt_enable();
}
EXPORT_SYMBOL_GPL(mnt_put_write_access);
575
576	/**
577	* mnt_drop_write - give up write access to a mount
578	* @mnt: the mount on which to give up write access
579	*
580	* Tells the low-level filesystem that we are done performing writes to it and
581	* also allows filesystem to be frozen again. Must be matched with
582	* mnt_want_write() call above.
583	*/
584	void mnt_drop_write(struct vfsmount *mnt)
585	{
586	mnt_put_write_access(mnt);
587	sb_end_write(sb: mnt->mnt_sb);
588	}
589	EXPORT_SYMBOL_GPL(mnt_drop_write);
590
591	void mnt_put_write_access_file(struct file *file)
592	{
593	if (!(file->f_mode & FMODE_WRITER))
594	mnt_put_write_access(file->f_path.mnt);
595	}
596
597	void mnt_drop_write_file(struct file *file)
598	{
599	mnt_put_write_access_file(file);
600	sb_end_write(sb: file_inode(f: file)->i_sb);
601	}
602	EXPORT_SYMBOL(mnt_drop_write_file);
603
604	/**
605	* mnt_hold_writers - prevent write access to the given mount
606	* @mnt: mnt to prevent write access to
607	*
608	* Prevents write access to @mnt if there are no active writers for @mnt.
609	* This function needs to be called and return successfully before changing
610	* properties of @mnt that need to remain stable for callers with write access
611	* to @mnt.
612	*
613	* After this functions has been called successfully callers must pair it with
614	* a call to mnt_unhold_writers() in order to stop preventing write access to
615	* @mnt.
616	*
617	* Context: This function expects to be in mount_locked_reader scope serializing
618	* setting WRITE_HOLD.
619	* Return: On success 0 is returned.
620	* On error, -EBUSY is returned.
621	*/
622	static inline int mnt_hold_writers(struct mount *mnt)
623	{
624	set_write_hold(mnt);
625	/*
626	* After storing WRITE_HOLD, we'll read the counters. This store
627	* should be visible before we do.
628	*/
629	smp_mb();
630
631	/*
632	* With writers on hold, if this value is zero, then there are
633	* definitely no active writers (although held writers may subsequently
634	* increment the count, they'll have to wait, and decrement it after
635	* seeing MNT_READONLY).
636	*
637	* It is OK to have counter incremented on one CPU and decremented on
638	* another: the sum will add up correctly. The danger would be when we
639	* sum up each counter, if we read a counter before it is incremented,
640	* but then read another CPU's count which it has been subsequently
641	* decremented from -- we would see more decrements than we should.
642	* WRITE_HOLD protects against this scenario, because
643	* mnt_want_write first increments count, then smp_mb, then spins on
644	* WRITE_HOLD, so it can't be decremented by another CPU while
645	* we're counting up here.
646	*/
647	if (mnt_get_writers(mnt) > `0`)
648	return -EBUSY;
649
650	return `0`;
651	}
652
/**
 * mnt_unhold_writers - stop preventing write access to the given mount
 * @mnt: mnt to stop preventing write access to
 *
 * Stop preventing write access to @mnt allowing callers to gain write access
 * to @mnt again.
 *
 * This function can only be called after a call to mnt_hold_writers().
 * It does nothing if WRITE_HOLD is not currently set on @mnt.
 *
 * Context: This function expects to be in the same mount_locked_reader scope
 * as the matching mnt_hold_writers().
 */
static inline void mnt_unhold_writers(struct mount *mnt)
{
	if (!test_write_hold(mnt))
		return;
	/*
	 * MNT_READONLY must become visible before ~WRITE_HOLD, so writers
	 * that become unheld will see MNT_READONLY.
	 */
	smp_wmb();
	clear_write_hold(mnt);
}
676
677	static inline void mnt_del_instance(struct mount *m)
678	{
679	struct mount **p = m->mnt_pprev_for_sb;
680	struct mount *next = m->mnt_next_for_sb;
681
682	if (next)
683	next->mnt_pprev_for_sb = p;
684	*p = next;
685	}
686
687	static inline void mnt_add_instance(struct mount m, struct* super_block *s)
688	{
689	struct mount *first = s->s_mounts;
690
691	if (first)
692	first->mnt_pprev_for_sb = &m->mnt_next_for_sb;
693	m->mnt_next_for_sb = first;
694	m->mnt_pprev_for_sb = &s->s_mounts;
695	s->s_mounts = m;
696	}
697
698	static int mnt_make_readonly(struct mount *mnt)
699	{
700	int ret;
701
702	ret = mnt_hold_writers(mnt);
703	if (!ret)
704	mnt->mnt.mnt_flags \|= MNT_READONLY;
705	mnt_unhold_writers(mnt);
706	return ret;
707	}
708
709	int sb_prepare_remount_readonly(struct super_block *sb)
710	{
711	int err = `0`;
712
713	/ Racy optimization. Recheck the counter under WRITE_HOLD /
714	if (atomic_long_read(v: &sb->s_remove_count))
715	return -EBUSY;
716
717	guard(mount_locked_reader)();
718
719	for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) {
720	if (!(m->mnt.mnt_flags & MNT_READONLY)) {
721	err = mnt_hold_writers(mnt: m);
722	if (err)
723	break;
724	}
725	}
726	if (!err && atomic_long_read(v: &sb->s_remove_count))
727	err = -EBUSY;
728
729	if (!err)
730	sb_start_ro_state_change(sb);
731	for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) {
732	if (test_write_hold(m))
733	clear_write_hold(m);
734	}
735
736	return err;
737	}
738
739	static void free_vfsmnt(struct mount *mnt)
740	{
741	mnt_idmap_put(idmap: mnt_idmap(mnt: &mnt->mnt));
742	kfree_const(x: mnt->mnt_devname);
743	#ifdef CONFIG_SMP
744	free_percpu(pdata: mnt->mnt_pcp);
745	#endif
746	kmem_cache_free(s: mnt_cache, objp: mnt);
747	}
748
749	static void delayed_free_vfsmnt(struct rcu_head *head)
750	{
751	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
752	}
753
754	/ call under rcu_read_lock /
755	int __legitimize_mnt(struct vfsmount bastard, unsigned* seq)
756	{
757	struct mount *mnt;
758	if (read_seqretry(sl: &mount_lock, start: seq))
759	return `1`;
760	if (bastard == NULL)
761	return `0`;
762	mnt = real_mount(mnt: bastard);
763	mnt_add_count(mnt, n: `1`);
764	smp_mb(); // see mntput_no_expire() and do_umount()
765	if (likely(!read_seqretry(&mount_lock, seq)))
766	return `0`;
767	lock_mount_hash();
768	if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT \| MNT_DOOMED))) {
769	mnt_add_count(mnt, n: -`1`);
770	unlock_mount_hash();
771	return `1`;
772	}
773	unlock_mount_hash();
774	/ caller will mntput() /
775	return -`1`;
776	}
777
778	/ call under rcu_read_lock /
779	static bool legitimize_mnt(struct vfsmount bastard, unsigned* seq)
780	{
781	int res = __legitimize_mnt(bastard, seq);
782	if (likely(!res))
783	return true;
784	if (unlikely(res < `0`)) {
785	rcu_read_unlock();
786	mntput(mnt: bastard);
787	rcu_read_lock();
788	}
789	return false;
790	}
791
792	/**
793	* __lookup_mnt - mount hash lookup
794	* @mnt: parent mount
795	* @dentry: dentry of mountpoint
796	*
797	* If @mnt has a child mount @c mounted on @dentry find and return it.
798	* Caller must either hold the spinlock component of @mount_lock or
799	* hold rcu_read_lock(), sample the seqcount component before the call
800	* and recheck it afterwards.
801	*
802	* Return: The child of @mnt mounted on @dentry or %NULL.
803	*/
804	struct mount __lookup_mnt(struct* vfsmount mnt, struct* dentry *dentry)
805	{
806	struct hlist_head *head = m_hash(mnt, dentry);
807	struct mount *p;
808
809	hlist_for_each_entry_rcu(p, head, mnt_hash)
810	if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
811	return p;
812	return NULL;
813	}
814
815	/**
816	* lookup_mnt - Return the child mount mounted at given location
817	* @path: location in the namespace
818	*
819	* Acquires and returns a new reference to mount at given location
820	* or %NULL if nothing is mounted there.
821	*/
822	struct vfsmount lookup_mnt(const* struct path *path)
823	{
824	struct mount *child_mnt;
825	struct vfsmount *m;
826	unsigned seq;
827
828	rcu_read_lock();
829	do {
830	seq = read_seqbegin(sl: &mount_lock);
831	child_mnt = __lookup_mnt(mnt: path->mnt, dentry: path->dentry);
832	m = child_mnt ? &child_mnt->mnt : NULL;
833	} while (!legitimize_mnt(bastard: m, seq));
834	rcu_read_unlock();
835	return m;
836	}
837
838	/*
839	* __is_local_mountpoint - Test to see if dentry is a mountpoint in the
840	* current mount namespace.
841	*
842	* The common case is dentries are not mountpoints at all and that
843	* test is handled inline. For the slow case when we are actually
844	* dealing with a mountpoint of some kind, walk through all of the
845	* mounts in the current mount namespace and test to see if the dentry
846	* is a mountpoint.
847	*
848	* The mount_hashtable is not usable in the context because we
849	* need to identify all mounts that may be in the current mount
850	* namespace not just a mount that happens to have some specified
851	* parent mount.
852	*/
853	bool __is_local_mountpoint(const struct dentry *dentry)
854	{
855	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
856	struct mount mnt, n;
857
858	guard(namespace_shared)();
859
860	rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node)
861	if (mnt->mnt_mountpoint == dentry)
862	return true;
863
864	return false;
865	}
866
867	struct pinned_mountpoint {
868	struct hlist_node node;
869	struct mountpoint *mp;
870	struct mount *parent;
871	};
872
873	static bool lookup_mountpoint(struct dentry dentry, struct* pinned_mountpoint *m)
874	{
875	struct hlist_head *chain = mp_hash(dentry);
876	struct mountpoint *mp;
877
878	hlist_for_each_entry(mp, chain, m_hash) {
879	if (mp->m_dentry == dentry) {
880	hlist_add_head(n: &m->node, h: &mp->m_list);
881	m->mp = mp;
882	return true;
883	}
884	}
885	return false;
886	}
887
888	static int get_mountpoint(struct dentry dentry, struct* pinned_mountpoint *m)
889	{
890	struct mountpoint *mp __free(kfree) = NULL;
891	bool found;
892	int ret;
893
894	if (d_mountpoint(dentry)) {
895	/ might be worth a WARN_ON() /
896	if (d_unlinked(dentry))
897	return -ENOENT;
898	mountpoint:
899	read_seqlock_excl(sl: &mount_lock);
900	found = lookup_mountpoint(dentry, m);
901	read_sequnlock_excl(sl: &mount_lock);
902	if (found)
903	return `0`;
904	}
905
906	if (!mp)
907	mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
908	if (!mp)
909	return -ENOMEM;
910
911	/ Exactly one processes may set d_mounted /
912	ret = d_set_mounted(dentry);
913
914	/ Someone else set d_mounted? /
915	if (ret == -EBUSY)
916	goto mountpoint;
917
918	/ The dentry is not available as a mountpoint? /
919	if (ret)
920	return ret;
921
922	/ Add the new mountpoint to the hash table /
923	read_seqlock_excl(sl: &mount_lock);
924	mp->m_dentry = dget(dentry);
925	hlist_add_head(n: &mp->m_hash, h: mp_hash(dentry));
926	INIT_HLIST_HEAD(&mp->m_list);
927	hlist_add_head(n: &m->node, h: &mp->m_list);
928	m->mp = no_free_ptr(mp);
929	read_sequnlock_excl(sl: &mount_lock);
930	return `0`;
931	}
932
933	/*
934	* vfsmount lock must be held. Additionally, the caller is responsible
935	* for serializing calls for given disposal list.
936	*/
937	static void maybe_free_mountpoint(struct mountpoint mp, struct* list_head *list)
938	{
939	if (hlist_empty(h: &mp->m_list)) {
940	struct dentry *dentry = mp->m_dentry;
941	spin_lock(lock: &dentry->d_lock);
942	dentry->d_flags &= ~DCACHE_MOUNTED;
943	spin_unlock(lock: &dentry->d_lock);
944	dput_to_list(dentry, list);
945	hlist_del(n: &mp->m_hash);
946	kfree(objp: mp);
947	}
948	}
949
950	/*
951	* locks: mount_lock [read_seqlock_excl], namespace_sem [excl]
952	*/
953	static void unpin_mountpoint(struct pinned_mountpoint *m)
954	{
955	if (m->mp) {
956	hlist_del(n: &m->node);
957	maybe_free_mountpoint(mp: m->mp, list: &ex_mountpoints);
958	}
959	}
960
961	static inline int check_mnt(const struct mount *mnt)
962	{
963	return mnt->mnt_ns == current->nsproxy->mnt_ns;
964	}
965
966	static inline bool check_anonymous_mnt(struct mount *mnt)
967	{
968	u64 seq;
969
970	if (!is_anon_ns(ns: mnt->mnt_ns))
971	return false;
972
973	seq = mnt->mnt_ns->seq_origin;
974	return !seq \|\| (seq == current->nsproxy->mnt_ns->ns.ns_id);
975	}
976
977	/*
978	* vfsmount lock must be held for write
979	*/
980	static void touch_mnt_namespace(struct mnt_namespace *ns)
981	{
982	if (ns) {
983	ns->event = ++event;
984	wake_up_interruptible(&ns->poll);
985	}
986	}
987
988	/*
989	* vfsmount lock must be held for write
990	*/
991	static void __touch_mnt_namespace(struct mnt_namespace *ns)
992	{
993	if (ns && ns->event != event) {
994	ns->event = event;
995	wake_up_interruptible(&ns->poll);
996	}
997	}
998
999	/*
1000	* locks: mount_lock[write_seqlock]
1001	*/
1002	static void __umount_mnt(struct mount mnt, struct* list_head *shrink_list)
1003	{
1004	struct mountpoint *mp;
1005	struct mount *parent = mnt->mnt_parent;
1006	if (unlikely(parent->overmount == mnt))
1007	parent->overmount = NULL;
1008	mnt->mnt_parent = mnt;
1009	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
1010	list_del_init(entry: &mnt->mnt_child);
1011	hlist_del_init_rcu(n: &mnt->mnt_hash);
1012	hlist_del_init(n: &mnt->mnt_mp_list);
1013	mp = mnt->mnt_mp;
1014	mnt->mnt_mp = NULL;
1015	maybe_free_mountpoint(mp, list: shrink_list);
1016	}
1017
1018	/*
1019	* locks: mount_lock[write_seqlock], namespace_sem[excl] (for ex_mountpoints)
1020	*/
1021	static void umount_mnt(struct mount *mnt)
1022	{
1023	__umount_mnt(mnt, shrink_list: &ex_mountpoints);
1024	}
1025
1026	/*
1027	* vfsmount lock must be held for write
1028	*/
1029	void mnt_set_mountpoint(struct mount *mnt,
1030	struct mountpoint *mp,
1031	struct mount *child_mnt)
1032	{
1033	child_mnt->mnt_mountpoint = mp->m_dentry;
1034	child_mnt->mnt_parent = mnt;
1035	child_mnt->mnt_mp = mp;
1036	hlist_add_head(n: &child_mnt->mnt_mp_list, h: &mp->m_list);
1037	}
1038
1039	static void make_visible(struct mount *mnt)
1040	{
1041	struct mount *parent = mnt->mnt_parent;
1042	if (unlikely(mnt->mnt_mountpoint == parent->mnt.mnt_root))
1043	parent->overmount = mnt;
1044	hlist_add_head_rcu(n: &mnt->mnt_hash,
1045	h: m_hash(mnt: &parent->mnt, dentry: mnt->mnt_mountpoint));
1046	list_add_tail(new: &mnt->mnt_child, head: &parent->mnt_mounts);
1047	}
1048
/**
 * attach_mnt - mount a mount, attach to @mount_hashtable and parent's
 *              list of child mounts
 * @mnt:    the new mount
 * @parent: the parent
 * @mp:     the new mountpoint
 *
 * Mount @mnt at @mp on @parent. Then attach @mnt
 * to @parent's child mount list and to @mount_hashtable.
 *
 * Note, when make_visible() is called @mnt->mnt_parent already points
 * to the correct parent.
 *
 * Context: This function expects namespace_lock() and lock_mount_hash()
 *          to have been acquired in that order.
 */
static void attach_mnt(struct mount *mnt, struct mount *parent,
		       struct mountpoint *mp)
{
	mnt_set_mountpoint(parent, mp, mnt);
	make_visible(mnt);
}
1071
1072	void mnt_change_mountpoint(struct mount parent, struct* mountpoint mp, struct* mount *mnt)
1073	{
1074	struct mountpoint *old_mp = mnt->mnt_mp;
1075
1076	list_del_init(entry: &mnt->mnt_child);
1077	hlist_del_init(n: &mnt->mnt_mp_list);
1078	hlist_del_init_rcu(n: &mnt->mnt_hash);
1079
1080	attach_mnt(mnt, parent, mp);
1081
1082	maybe_free_mountpoint(mp: old_mp, list: &ex_mountpoints);
1083	}
1084
1085	static inline struct mount node_to_mount(struct* rb_node *node)
1086	{
1087	return node ? rb_entry(node, struct mount, mnt_node) : NULL;
1088	}
1089
1090	static void mnt_add_to_ns(struct mnt_namespace ns, struct* mount *mnt)
1091	{
1092	struct rb_node **link = &ns->mounts.rb_node;
1093	struct rb_node *parent = NULL;
1094	bool mnt_first_node = true, mnt_last_node = true;
1095
1096	WARN_ON(mnt_ns_attached(mnt));
1097	mnt->mnt_ns = ns;
1098	while (*link) {
1099	parent = *link;
1100	if (mnt->mnt_id_unique < node_to_mount(node: parent)->mnt_id_unique) {
1101	link = &parent->rb_left;
1102	mnt_last_node = false;
1103	} else {
1104	link = &parent->rb_right;
1105	mnt_first_node = false;
1106	}
1107	}
1108
1109	if (mnt_last_node)
1110	ns->mnt_last_node = &mnt->mnt_node;
1111	if (mnt_first_node)
1112	ns->mnt_first_node = &mnt->mnt_node;
1113	rb_link_node(node: &mnt->mnt_node, parent, rb_link: link);
1114	rb_insert_color(&mnt->mnt_node, &ns->mounts);
1115
1116	mnt_notify_add(m: mnt);
1117	}
1118
1119	static struct mount next_mnt(struct* mount p, struct* mount *root)
1120	{
1121	struct list_head *next = p->mnt_mounts.next;
1122	if (next == &p->mnt_mounts) {
1123	while (`1`) {
1124	if (p == root)
1125	return NULL;
1126	next = p->mnt_child.next;
1127	if (next != &p->mnt_parent->mnt_mounts)
1128	break;
1129	p = p->mnt_parent;
1130	}
1131	}
1132	return list_entry(next, struct mount, mnt_child);
1133	}
1134
1135	static struct mount skip_mnt_tree(struct* mount *p)
1136	{
1137	struct list_head *prev = p->mnt_mounts.prev;
1138	while (prev != &p->mnt_mounts) {
1139	p = list_entry(prev, struct mount, mnt_child);
1140	prev = p->mnt_mounts.prev;
1141	}
1142	return p;
1143	}
1144
1145	/*
1146	* vfsmount lock must be held for write
1147	*/
1148	static void commit_tree(struct mount *mnt)
1149	{
1150	struct mnt_namespace *n = mnt->mnt_parent->mnt_ns;
1151
1152	if (!mnt_ns_attached(mnt)) {
1153	for (struct mount *m = mnt; m; m = next_mnt(p: m, root: mnt))
1154	mnt_add_to_ns(ns: n, mnt: m);
1155	n->nr_mounts += n->pending_mounts;
1156	n->pending_mounts = `0`;
1157	}
1158
1159	make_visible(mnt);
1160	touch_mnt_namespace(ns: n);
1161	}
1162
1163	static void setup_mnt(struct mount m, struct* dentry *root)
1164	{
1165	struct super_block *s = root->d_sb;
1166
1167	atomic_inc(v: &s->s_active);
1168	m->mnt.mnt_sb = s;
1169	m->mnt.mnt_root = dget(dentry: root);
1170	m->mnt_mountpoint = m->mnt.mnt_root;
1171	m->mnt_parent = m;
1172
1173	guard(mount_locked_reader)();
1174	mnt_add_instance(m, s);
1175	}
1176
1177	/**
1178	* vfs_create_mount - Create a mount for a configured superblock
1179	* @fc: The configuration context with the superblock attached
1180	*
1181	* Create a mount to an already configured superblock. If necessary, the
1182	* caller should invoke vfs_get_tree() before calling this.
1183	*
1184	* Note that this does not attach the mount to anything.
1185	*/
1186	struct vfsmount vfs_create_mount(struct* fs_context *fc)
1187	{
1188	struct mount *mnt;
1189
1190	if (!fc->root)
1191	return ERR_PTR(error: -EINVAL);
1192
1193	mnt = alloc_vfsmnt(name: fc->source);
1194	if (!mnt)
1195	return ERR_PTR(error: -ENOMEM);
1196
1197	if (fc->sb_flags & SB_KERNMOUNT)
1198	mnt->mnt.mnt_flags = MNT_INTERNAL;
1199
1200	setup_mnt(m: mnt, root: fc->root);
1201
1202	return &mnt->mnt;
1203	}
1204	EXPORT_SYMBOL(vfs_create_mount);
1205
1206	struct vfsmount fc_mount(struct* fs_context *fc)
1207	{
1208	int err = vfs_get_tree(fc);
1209	if (!err) {
1210	up_write(sem: &fc->root->d_sb->s_umount);
1211	return vfs_create_mount(fc);
1212	}
1213	return ERR_PTR(error: err);
1214	}
1215	EXPORT_SYMBOL(fc_mount);
1216
1217	struct vfsmount fc_mount_longterm(struct* fs_context *fc)
1218	{
1219	struct vfsmount *mnt = fc_mount(fc);
1220	if (!IS_ERR(ptr: mnt))
1221	real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
1222	return mnt;
1223	}
1224	EXPORT_SYMBOL(fc_mount_longterm);
1225
1226	struct vfsmount vfs_kern_mount(struct* file_system_type *type,
1227	int flags, const char *name,
1228	void *data)
1229	{
1230	struct fs_context *fc;
1231	struct vfsmount *mnt;
1232	int ret = `0`;
1233
1234	if (!type)
1235	return ERR_PTR(error: -EINVAL);
1236
1237	fc = fs_context_for_mount(fs_type: type, sb_flags: flags);
1238	if (IS_ERR(ptr: fc))
1239	return ERR_CAST(ptr: fc);
1240
1241	if (name)
1242	ret = vfs_parse_fs_string(fc, key: "source", value: name);
1243	if (!ret)
1244	ret = parse_monolithic_mount_data(fc, data);
1245	if (!ret)
1246	mnt = fc_mount(fc);
1247	else
1248	mnt = ERR_PTR(error: ret);
1249
1250	put_fs_context(fc);
1251	return mnt;
1252	}
1253	EXPORT_SYMBOL_GPL(vfs_kern_mount);
1254
1255	static struct mount clone_mnt(struct* mount old, struct* dentry *root,
1256	int flag)
1257	{
1258	struct mount *mnt;
1259	int err;
1260
1261	mnt = alloc_vfsmnt(name: old->mnt_devname);
1262	if (!mnt)
1263	return ERR_PTR(error: -ENOMEM);
1264
1265	mnt->mnt.mnt_flags = READ_ONCE(old->mnt.mnt_flags) &
1266	~MNT_INTERNAL_FLAGS;
1267
1268	if (flag & (CL_SLAVE \| CL_PRIVATE))
1269	mnt->mnt_group_id = `0`; / not a peer of original /
1270	else
1271	mnt->mnt_group_id = old->mnt_group_id;
1272
1273	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
1274	err = mnt_alloc_group_id(mnt);
1275	if (err)
1276	goto out_free;
1277	}
1278
1279	if (mnt->mnt_group_id)
1280	set_mnt_shared(mnt);
1281
1282	mnt->mnt.mnt_idmap = mnt_idmap_get(idmap: mnt_idmap(mnt: &old->mnt));
1283
1284	setup_mnt(m: mnt, root);
1285
1286	if (flag & CL_PRIVATE) // we are done with it
1287	return mnt;
1288
1289	if (peers(m1: mnt, m2: old))
1290	list_add(new: &mnt->mnt_share, head: &old->mnt_share);
1291
1292	if ((flag & CL_SLAVE) && old->mnt_group_id) {
1293	hlist_add_head(n: &mnt->mnt_slave, h: &old->mnt_slave_list);
1294	mnt->mnt_master = old;
1295	} else if (IS_MNT_SLAVE(old)) {
1296	hlist_add_behind(n: &mnt->mnt_slave, prev: &old->mnt_slave);
1297	mnt->mnt_master = old->mnt_master;
1298	}
1299	return mnt;
1300
1301	out_free:
1302	mnt_free_id(mnt);
1303	free_vfsmnt(mnt);
1304	return ERR_PTR(error: err);
1305	}
1306
1307	static void cleanup_mnt(struct mount *mnt)
1308	{
1309	struct hlist_node *p;
1310	struct mount *m;
1311	/*
1312	* The warning here probably indicates that somebody messed
1313	* up a mnt_want/drop_write() pair. If this happens, the
1314	* filesystem was probably unable to make r/w->r/o transitions.
1315	* The locking used to deal with mnt_count decrement provides barriers,
1316	* so mnt_get_writers() below is safe.
1317	*/
1318	WARN_ON(mnt_get_writers(mnt));
1319	if (unlikely(mnt->mnt_pins.first))
1320	mnt_pin_kill(m: mnt);
1321	hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
1322	hlist_del(n: &m->mnt_umount);
1323	mntput(mnt: &m->mnt);
1324	}
1325	fsnotify_vfsmount_delete(mnt: &mnt->mnt);
1326	dput(mnt->mnt.mnt_root);
1327	deactivate_super(sb: mnt->mnt.mnt_sb);
1328	mnt_free_id(mnt);
1329	call_rcu(head: &mnt->mnt_rcu, func: delayed_free_vfsmnt);
1330	}
1331
1332	static void __cleanup_mnt(struct rcu_head *head)
1333	{
1334	cleanup_mnt(container_of(head, struct mount, mnt_rcu));
1335	}
1336
1337	static LLIST_HEAD(delayed_mntput_list);
1338	static void delayed_mntput(struct work_struct *unused)
1339	{
1340	struct llist_node *node = llist_del_all(head: &delayed_mntput_list);
1341	struct mount m, t;
1342
1343	llist_for_each_entry_safe(m, t, node, mnt_llist)
1344	cleanup_mnt(mnt: m);
1345	}
1346	static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
1347
1348	static void mntput_no_expire(struct mount *mnt)
1349	{
1350	LIST_HEAD(list);
1351	int count;
1352
1353	rcu_read_lock();
1354	if (likely(READ_ONCE(mnt->mnt_ns))) {
1355	/*
1356	* Since we don't do lock_mount_hash() here,
1357	* ->mnt_ns can change under us. However, if it's
1358	* non-NULL, then there's a reference that won't
1359	* be dropped until after an RCU delay done after
1360	* turning ->mnt_ns NULL. So if we observe it
1361	* non-NULL under rcu_read_lock(), the reference
1362	* we are dropping is not the final one.
1363	*/
1364	mnt_add_count(mnt, n: -`1`);
1365	rcu_read_unlock();
1366	return;
1367	}
1368	lock_mount_hash();
1369	/*
1370	* make sure that if __legitimize_mnt() has not seen us grab
1371	* mount_lock, we'll see their refcount increment here.
1372	*/
1373	smp_mb();
1374	mnt_add_count(mnt, n: -`1`);
1375	count = mnt_get_count(mnt);
1376	if (count != `0`) {
1377	WARN_ON(count < `0`);
1378	rcu_read_unlock();
1379	unlock_mount_hash();
1380	return;
1381	}
1382	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
1383	rcu_read_unlock();
1384	unlock_mount_hash();
1385	return;
1386	}
1387	mnt->mnt.mnt_flags \|= MNT_DOOMED;
1388	rcu_read_unlock();
1389
1390	mnt_del_instance(m: mnt);
1391	if (unlikely(!list_empty(&mnt->mnt_expire)))
1392	list_del(entry: &mnt->mnt_expire);
1393
1394	if (unlikely(!list_empty(&mnt->mnt_mounts))) {
1395	struct mount p, tmp;
1396	list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
1397	__umount_mnt(mnt: p, shrink_list: &list);
1398	hlist_add_head(n: &p->mnt_umount, h: &mnt->mnt_stuck_children);
1399	}
1400	}
1401	unlock_mount_hash();
1402	shrink_dentry_list(&list);
1403
1404	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
1405	struct task_struct *task = current;
1406	if (likely(!(task->flags & PF_KTHREAD))) {
1407	init_task_work(twork: &mnt->mnt_rcu, func: __cleanup_mnt);
1408	if (!task_work_add(task, twork: &mnt->mnt_rcu, mode: TWA_RESUME))
1409	return;
1410	}
1411	if (llist_add(new: &mnt->mnt_llist, head: &delayed_mntput_list))
1412	schedule_delayed_work(dwork: &delayed_mntput_work, delay: `1`);
1413	return;
1414	}
1415	cleanup_mnt(mnt);
1416	}
1417
1418	void mntput(struct vfsmount *mnt)
1419	{
1420	if (mnt) {
1421	struct mount *m = real_mount(mnt);
1422	/ avoid cacheline pingpong /
1423	if (unlikely(m->mnt_expiry_mark))
1424	WRITE_ONCE(m->mnt_expiry_mark, `0`);
1425	mntput_no_expire(mnt: m);
1426	}
1427	}
1428	EXPORT_SYMBOL(mntput);
1429
/*
 * Take an additional reference on @mnt (NULL is a no-op) and return
 * the same pointer for call chaining.
 */
struct vfsmount *mntget(struct vfsmount *mnt)
{
	if (mnt)
		mnt_add_count(real_mount(mnt), 1);
	return mnt;
}
EXPORT_SYMBOL(mntget);
1437
1438	/*
1439	* Make a mount point inaccessible to new lookups.
1440	* Because there may still be current users, the caller MUST WAIT
1441	* for an RCU grace period before destroying the mount point.
1442	*/
1443	void mnt_make_shortterm(struct vfsmount *mnt)
1444	{
1445	if (mnt)
1446	real_mount(mnt)->mnt_ns = NULL;
1447	}
1448
1449	/**
1450	* path_is_mountpoint() - Check if path is a mount in the current namespace.
1451	* @path: path to check
1452	*
1453	* d_mountpoint() can only be used reliably to establish if a dentry is
1454	* not mounted in any namespace and that common case is handled inline.
1455	* d_mountpoint() isn't aware of the possibility there may be multiple
1456	* mounts using a given dentry in a different namespace. This function
1457	* checks if the passed in path is a mountpoint rather than the dentry
1458	* alone.
1459	*/
1460	bool path_is_mountpoint(const struct path *path)
1461	{
1462	unsigned seq;
1463	bool res;
1464
1465	if (!d_mountpoint(dentry: path->dentry))
1466	return false;
1467
1468	rcu_read_lock();
1469	do {
1470	seq = read_seqbegin(sl: &mount_lock);
1471	res = __path_is_mountpoint(path);
1472	} while (read_seqretry(sl: &mount_lock, start: seq));
1473	rcu_read_unlock();
1474
1475	return res;
1476	}
1477	EXPORT_SYMBOL(path_is_mountpoint);
1478
1479	struct vfsmount mnt_clone_internal(const* struct path *path)
1480	{
1481	struct mount *p;
1482	p = clone_mnt(old: real_mount(mnt: path->mnt), root: path->dentry, CL_PRIVATE);
1483	if (IS_ERR(ptr: p))
1484	return ERR_CAST(ptr: p);
1485	p->mnt.mnt_flags \|= MNT_INTERNAL;
1486	return &p->mnt;
1487	}
1488
1489	/*
1490	* Returns the mount which either has the specified mnt_id, or has the next
1491	* smallest id afer the specified one.
1492	*/
1493	static struct mount mnt_find_id_at(struct* mnt_namespace *ns, u64 mnt_id)
1494	{
1495	struct rb_node *node = ns->mounts.rb_node;
1496	struct mount *ret = NULL;
1497
1498	while (node) {
1499	struct mount *m = node_to_mount(node);
1500
1501	if (mnt_id <= m->mnt_id_unique) {
1502	ret = node_to_mount(node);
1503	if (mnt_id == m->mnt_id_unique)
1504	break;
1505	node = node->rb_left;
1506	} else {
1507	node = node->rb_right;
1508	}
1509	}
1510	return ret;
1511	}
1512
1513	/*
1514	* Returns the mount which either has the specified mnt_id, or has the next
1515	* greater id before the specified one.
1516	*/
1517	static struct mount mnt_find_id_at_reverse(struct* mnt_namespace *ns, u64 mnt_id)
1518	{
1519	struct rb_node *node = ns->mounts.rb_node;
1520	struct mount *ret = NULL;
1521
1522	while (node) {
1523	struct mount *m = node_to_mount(node);
1524
1525	if (mnt_id >= m->mnt_id_unique) {
1526	ret = node_to_mount(node);
1527	if (mnt_id == m->mnt_id_unique)
1528	break;
1529	node = node->rb_right;
1530	} else {
1531	node = node->rb_left;
1532	}
1533	}
1534	return ret;
1535	}
1536
1537	#ifdef CONFIG_PROC_FS
1538
1539	/ iterator; we want it to have access to namespace_sem, thus here... /
1540	static void m_start(struct* seq_file m, loff_t pos)
1541	{
1542	struct proc_mounts *p = m->private;
1543
1544	down_read(sem: &namespace_sem);
1545
1546	return mnt_find_id_at(ns: p->ns, mnt_id: *pos);
1547	}
1548
1549	static void m_next(struct* seq_file m, void* v, loff_t pos)
1550	{
1551	struct mount next = NULL, mnt = v;
1552	struct rb_node *node = rb_next(&mnt->mnt_node);
1553
1554	++*pos;
1555	if (node) {
1556	next = node_to_mount(node);
1557	*pos = next->mnt_id_unique;
1558	}
1559	return next;
1560	}
1561
1562	static void m_stop(struct seq_file m, void* *v)
1563	{
1564	up_read(sem: &namespace_sem);
1565	}
1566
1567	static int m_show(struct seq_file m, void* *v)
1568	{
1569	struct proc_mounts *p = m->private;
1570	struct mount *r = v;
1571	return p->show(m, &r->mnt);
1572	}
1573
1574	const struct seq_operations mounts_op = {
1575	.start = m_start,
1576	.next = m_next,
1577	.stop = m_stop,
1578	.show = m_show,
1579	};
1580
1581	#endif /* CONFIG_PROC_FS */
1582
1583	/**
1584	* may_umount_tree - check if a mount tree is busy
1585	* @m: root of mount tree
1586	*
1587	* This is called to check if a tree of mounts has any
1588	* open files, pwds, chroots or sub mounts that are
1589	* busy.
1590	*/
1591	int may_umount_tree(struct vfsmount *m)
1592	{
1593	struct mount *mnt = real_mount(mnt: m);
1594	bool busy = false;
1595
1596	/ write lock needed for mnt_get_count /
1597	lock_mount_hash();
1598	for (struct mount *p = mnt; p; p = next_mnt(p, root: mnt)) {
1599	if (mnt_get_count(mnt: p) > (p == mnt ? `2` : `1`)) {
1600	busy = true;
1601	break;
1602	}
1603	}
1604	unlock_mount_hash();
1605
1606	return !busy;
1607	}
1608
1609	EXPORT_SYMBOL(may_umount_tree);
1610
1611	/**
1612	* may_umount - check if a mount point is busy
1613	* @mnt: root of mount
1614	*
1615	* This is called to check if a mount point has any
1616	* open files, pwds, chroots or sub mounts. If the
1617	* mount has sub mounts this will return busy
1618	* regardless of whether the sub mounts are busy.
1619	*
1620	* Doesn't take quota and stuff into account. IOW, in some cases it will
1621	* give false negatives. The main reason why it's here is that we need
1622	* a non-destructive way to look for easily umountable filesystems.
1623	*/
1624	int may_umount(struct vfsmount *mnt)
1625	{
1626	int ret = `1`;
1627	down_read(sem: &namespace_sem);
1628	lock_mount_hash();
1629	if (propagate_mount_busy(real_mount(mnt), `2`))
1630	ret = `0`;
1631	unlock_mount_hash();
1632	up_read(sem: &namespace_sem);
1633	return ret;
1634	}
1635
1636	EXPORT_SYMBOL(may_umount);
1637
1638	#ifdef CONFIG_FSNOTIFY
1639	static void mnt_notify(struct mount *p)
1640	{
1641	if (!p->prev_ns && p->mnt_ns) {
1642	fsnotify_mnt_attach(ns: p->mnt_ns, mnt: &p->mnt);
1643	} else if (p->prev_ns && !p->mnt_ns) {
1644	fsnotify_mnt_detach(ns: p->prev_ns, mnt: &p->mnt);
1645	} else if (p->prev_ns == p->mnt_ns) {
1646	fsnotify_mnt_move(ns: p->mnt_ns, mnt: &p->mnt);
1647	} else {
1648	fsnotify_mnt_detach(ns: p->prev_ns, mnt: &p->mnt);
1649	fsnotify_mnt_attach(ns: p->mnt_ns, mnt: &p->mnt);
1650	}
1651	p->prev_ns = p->mnt_ns;
1652	}
1653
1654	static void notify_mnt_list(void)
1655	{
1656	struct mount m, tmp;
1657	/*
1658	* Notify about mounts that were added/reparented/detached/remain
1659	* connected after unmount.
1660	*/
1661	list_for_each_entry_safe(m, tmp, &notify_list, to_notify) {
1662	mnt_notify(p: m);
1663	list_del_init(entry: &m->to_notify);
1664	}
1665	}
1666
1667	static bool need_notify_mnt_list(void)
1668	{
1669	return !list_empty(head: &notify_list);
1670	}
1671	#else
/* Stub for builds without CONFIG_FSNOTIFY: nothing to notify. */
static void notify_mnt_list(void)
{
}
1675
1676	static bool need_notify_mnt_list(void)
1677	{
1678	return false;
1679	}
1680	#endif
1681
1682	static void free_mnt_ns(struct mnt_namespace *);
1683	static void namespace_unlock(void)
1684	{
1685	struct hlist_head head;
1686	struct hlist_node *p;
1687	struct mount *m;
1688	struct mnt_namespace *ns = emptied_ns;
1689	LIST_HEAD(list);
1690
1691	hlist_move_list(old: &unmounted, new: &head);
1692	list_splice_init(list: &ex_mountpoints, head: &list);
1693	emptied_ns = NULL;
1694
1695	if (need_notify_mnt_list()) {
1696	/*
1697	* No point blocking out concurrent readers while notifications
1698	* are sent. This will also allow statmount()/listmount() to run
1699	* concurrently.
1700	*/
1701	downgrade_write(sem: &namespace_sem);
1702	notify_mnt_list();
1703	up_read(sem: &namespace_sem);
1704	} else {
1705	up_write(sem: &namespace_sem);
1706	}
1707	if (unlikely(ns)) {
1708	/ Make sure we notice when we leak mounts. /
1709	VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
1710	free_mnt_ns(ns);
1711	}
1712
1713	shrink_dentry_list(&list);
1714
1715	if (likely(hlist_empty(&head)))
1716	return;
1717
1718	synchronize_rcu_expedited();
1719
1720	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
1721	hlist_del(n: &m->mnt_umount);
1722	mntput(&m->mnt);
1723	}
1724	}
1725
1726	static inline void namespace_lock(void)
1727	{
1728	down_write(sem: &namespace_sem);
1729	}
1730
/* Bit flags selecting the behaviour of umount_tree(). */
enum umount_tree_flags {
	UMOUNT_SYNC = 1,	/* synchronous umount: always disconnect */
	UMOUNT_PROPAGATE = 2,	/* propagate the umount to other mounts */
	UMOUNT_CONNECTED = 4,	/* keep mounts connected to umounted parents */
};
1736
1737	static bool disconnect_mount(struct mount mnt, enum* umount_tree_flags how)
1738	{
1739	/ Leaving mounts connected is only valid for lazy umounts /
1740	if (how & UMOUNT_SYNC)
1741	return true;
1742
1743	/ A mount without a parent has nothing to be connected to /
1744	if (!mnt_has_parent(mnt))
1745	return true;
1746
1747	/ Because the reference counting rules change when mounts are*
1748	* unmounted and connected, umounted mounts may not be
1749	* connected to mounted mounts.
1750	*/
1751	if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
1752	return true;
1753
1754	/ Has it been requested that the mount remain connected? /
1755	if (how & UMOUNT_CONNECTED)
1756	return false;
1757
1758	/ Is the mount locked such that it needs to remain connected? /
1759	if (IS_MNT_LOCKED(mnt))
1760	return false;
1761
1762	/ By default disconnect the mount /
1763	return true;
1764	}
1765
1766	/*
1767	* mount_lock must be held
1768	* namespace_sem must be held for write
1769	*/
1770	static void umount_tree(struct mount mnt, enum* umount_tree_flags how)
1771	{
1772	LIST_HEAD(tmp_list);
1773	struct mount *p;
1774
1775	if (how & UMOUNT_PROPAGATE)
1776	propagate_mount_unlock(mnt);
1777
1778	/ Gather the mounts to umount /
1779	for (p = mnt; p; p = next_mnt(p, root: mnt)) {
1780	p->mnt.mnt_flags \|= MNT_UMOUNT;
1781	if (mnt_ns_attached(mnt: p))
1782	move_from_ns(mnt: p);
1783	list_add_tail(new: &p->mnt_list, head: &tmp_list);
1784	}
1785
1786	/ Hide the mounts from mnt_mounts /
1787	list_for_each_entry(p, &tmp_list, mnt_list) {
1788	list_del_init(entry: &p->mnt_child);
1789	}
1790
1791	/ Add propagated mounts to the tmp_list /
1792	if (how & UMOUNT_PROPAGATE)
1793	propagate_umount(&tmp_list);
1794
1795	bulk_make_private(&tmp_list);
1796
1797	while (!list_empty(head: &tmp_list)) {
1798	struct mnt_namespace *ns;
1799	bool disconnect;
1800	p = list_first_entry(&tmp_list, struct mount, mnt_list);
1801	list_del_init(entry: &p->mnt_expire);
1802	list_del_init(entry: &p->mnt_list);
1803	ns = p->mnt_ns;
1804	if (ns) {
1805	ns->nr_mounts--;
1806	__touch_mnt_namespace(ns);
1807	}
1808	p->mnt_ns = NULL;
1809	if (how & UMOUNT_SYNC)
1810	p->mnt.mnt_flags \|= MNT_SYNC_UMOUNT;
1811
1812	disconnect = disconnect_mount(mnt: p, how);
1813	if (mnt_has_parent(mnt: p)) {
1814	if (!disconnect) {
1815	/ Don't forget about p /
1816	list_add_tail(new: &p->mnt_child, head: &p->mnt_parent->mnt_mounts);
1817	} else {
1818	umount_mnt(mnt: p);
1819	}
1820	}
1821	if (disconnect)
1822	hlist_add_head(n: &p->mnt_umount, h: &unmounted);
1823
1824	/*
1825	* At this point p->mnt_ns is NULL, notification will be queued
1826	* only if
1827	*
1828	* - p->prev_ns is non-NULL and
1829	* - p->prev_ns->n_fsnotify_marks is non-NULL
1830	*
1831	* This will preclude queuing the mount if this is a cleanup
1832	* after a failed copy_tree() or destruction of an anonymous
1833	* namespace, etc.
1834	*/
1835	mnt_notify_add(m: p);
1836	}
1837	}
1838
1839	static void shrink_submounts(struct mount *mnt);
1840
1841	static int do_umount_root(struct super_block *sb)
1842	{
1843	int ret = `0`;
1844
1845	down_write(sem: &sb->s_umount);
1846	if (!sb_rdonly(sb)) {
1847	struct fs_context *fc;
1848
1849	fc = fs_context_for_reconfigure(dentry: sb->s_root, SB_RDONLY,
1850	SB_RDONLY);
1851	if (IS_ERR(ptr: fc)) {
1852	ret = PTR_ERR(ptr: fc);
1853	} else {
1854	ret = parse_monolithic_mount_data(fc, NULL);
1855	if (!ret)
1856	ret = reconfigure_super(fc);
1857	put_fs_context(fc);
1858	}
1859	}
1860	up_write(sem: &sb->s_umount);
1861	return ret;
1862	}
1863
1864	static int do_umount(struct mount mnt, int* flags)
1865	{
1866	struct super_block *sb = mnt->mnt.mnt_sb;
1867	int retval;
1868
1869	retval = security_sb_umount(mnt: &mnt->mnt, flags);
1870	if (retval)
1871	return retval;
1872
1873	/*
1874	* Allow userspace to request a mountpoint be expired rather than
1875	* unmounting unconditionally. Unmount only happens if:
1876	* (1) the mark is already set (the mark is cleared by mntput())
1877	* (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
1878	*/
1879	if (flags & MNT_EXPIRE) {
1880	if (&mnt->mnt == current->fs->root.mnt \|\|
1881	flags & (MNT_FORCE \| MNT_DETACH))
1882	return -EINVAL;
1883
1884	/*
1885	* probably don't strictly need the lock here if we examined
1886	* all race cases, but it's a slowpath.
1887	*/
1888	lock_mount_hash();
1889	if (!list_empty(head: &mnt->mnt_mounts) \|\| mnt_get_count(mnt) != `2`) {
1890	unlock_mount_hash();
1891	return -EBUSY;
1892	}
1893	unlock_mount_hash();
1894
1895	if (!xchg(&mnt->mnt_expiry_mark, `1`))
1896	return -EAGAIN;
1897	}
1898
1899	/*
1900	* If we may have to abort operations to get out of this
1901	* mount, and they will themselves hold resources we must
1902	* allow the fs to do things. In the Unix tradition of
1903	* 'Gee thats tricky lets do it in userspace' the umount_begin
1904	* might fail to complete on the first run through as other tasks
1905	* must return, and the like. Thats for the mount program to worry
1906	* about for the moment.
1907	*/
1908
1909	if (flags & MNT_FORCE && sb->s_op->umount_begin) {
1910	sb->s_op->umount_begin(sb);
1911	}
1912
1913	/*
1914	* No sense to grab the lock for this test, but test itself looks
1915	* somewhat bogus. Suggestions for better replacement?
1916	* Ho-hum... In principle, we might treat that as umount + switch
1917	* to rootfs. GC would eventually take care of the old vfsmount.
1918	* Actually it makes sense, especially if rootfs would contain a
1919	* /reboot - static binary that would close all descriptors and
1920	* call reboot(9). Then init(8) could umount root and exec /reboot.
1921	*/
1922	if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
1923	/*
1924	* Special case for "unmounting" root ...
1925	* we just try to remount it readonly.
1926	*/
1927	if (!ns_capable(ns: sb->s_user_ns, CAP_SYS_ADMIN))
1928	return -EPERM;
1929	return do_umount_root(sb);
1930	}
1931
1932	namespace_lock();
1933	lock_mount_hash();
1934
1935	/ Repeat the earlier racy checks, now that we are holding the locks /
1936	retval = -EINVAL;
1937	if (!check_mnt(mnt))
1938	goto out;
1939
1940	if (mnt->mnt.mnt_flags & MNT_LOCKED)
1941	goto out;
1942
1943	if (!mnt_has_parent(mnt)) / not the absolute root /
1944	goto out;
1945
1946	event++;
1947	if (flags & MNT_DETACH) {
1948	umount_tree(mnt, how: UMOUNT_PROPAGATE);
1949	retval = `0`;
1950	} else {
1951	smp_mb(); // paired with __legitimize_mnt()
1952	shrink_submounts(mnt);
1953	retval = -EBUSY;
1954	if (!propagate_mount_busy(mnt, `2`)) {
1955	umount_tree(mnt, how: UMOUNT_PROPAGATE\|UMOUNT_SYNC);
1956	retval = `0`;
1957	}
1958	}
1959	out:
1960	unlock_mount_hash();
1961	namespace_unlock();
1962	return retval;
1963	}
1964
1965	/*
1966	* __detach_mounts - lazily unmount all mounts on the specified dentry
1967	*
1968	* During unlink, rmdir, and d_drop it is possible to loose the path
1969	* to an existing mountpoint, and wind up leaking the mount.
1970	* detach_mounts allows lazily unmounting those mounts instead of
1971	* leaking them.
1972	*
1973	* The caller may hold dentry->d_inode->i_rwsem.
1974	*/
1975	void __detach_mounts(struct dentry *dentry)
1976	{
1977	struct pinned_mountpoint mp = {};
1978	struct mount *mnt;
1979
1980	guard(namespace_excl)();
1981	guard(mount_writer)();
1982
1983	if (!lookup_mountpoint(dentry, m: &mp))
1984	return;
1985
1986	event++;
1987	while (mp.node.next) {
1988	mnt = hlist_entry(mp.node.next, struct mount, mnt_mp_list);
1989	if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
1990	umount_mnt(mnt);
1991	hlist_add_head(n: &mnt->mnt_umount, h: &unmounted);
1992	}
1993	else umount_tree(mnt, how: UMOUNT_CONNECTED);
1994	}
1995	unpin_mountpoint(m: &mp);
1996	}
1997
1998	/*
1999	* Is the caller allowed to modify his namespace?
2000	*/
2001	bool may_mount(void)
2002	{
2003	return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
2004	}
2005
/*
 * Warn once that the deprecated "mand" mount option is ignored.
 * The message previously read "deprecated and and is ignored"; the
 * duplicated word has been dropped.
 */
static void warn_mandlock(void)
{
	pr_warn_once("=======================================================\n"
		     "WARNING: The mand mount option has been deprecated and\n"
		     " is ignored by this kernel. Remove the mand\n"
		     " option from the mount to silence this warning.\n"
		     "=======================================================\n");
}
2014
2015	static int can_umount(const struct path path, int* flags)
2016	{
2017	struct mount *mnt = real_mount(mnt: path->mnt);
2018	struct super_block *sb = path->dentry->d_sb;
2019
2020	if (!may_mount())
2021	return -EPERM;
2022	if (!path_mounted(path))
2023	return -EINVAL;
2024	if (!check_mnt(mnt))
2025	return -EINVAL;
2026	if (mnt->mnt.mnt_flags & MNT_LOCKED) / Check optimistically /
2027	return -EINVAL;
2028	if (flags & MNT_FORCE && !ns_capable(ns: sb->s_user_ns, CAP_SYS_ADMIN))
2029	return -EPERM;
2030	return `0`;
2031	}
2032
2033	// caller is responsible for flags being sane
2034	int path_umount(const struct path path, int* flags)
2035	{
2036	struct mount *mnt = real_mount(mnt: path->mnt);
2037	int ret;
2038
2039	ret = can_umount(path, flags);
2040	if (!ret)
2041	ret = do_umount(mnt, flags);
2042
2043	/ we mustn't call path_put() as that would clear mnt_expiry_mark /
2044	dput(path->dentry);
2045	mntput_no_expire(mnt);
2046	return ret;
2047	}
2048
2049	static int ksys_umount(char __user name, int* flags)
2050	{
2051	int lookup_flags = LOOKUP_MOUNTPOINT;
2052	struct path path;
2053	int ret;
2054
2055	// basic validity checks done first
2056	if (flags & ~(MNT_FORCE \| MNT_DETACH \| MNT_EXPIRE \| UMOUNT_NOFOLLOW))
2057	return -EINVAL;
2058
2059	if (!(flags & UMOUNT_NOFOLLOW))
2060	lookup_flags \|= LOOKUP_FOLLOW;
2061	ret = user_path_at(AT_FDCWD, name, lookup_flags, &path);
2062	if (ret)
2063	return ret;
2064	return path_umount(path: &path, flags);
2065	}
2066
2067	SYSCALL_DEFINE2(umount, char __user , name, int*, flags)
2068	{
2069	return ksys_umount(name, flags);
2070	}
2071
2072	#ifdef __ARCH_WANT_SYS_OLDUMOUNT
2073
2074	/*
2075	* The 2.0 compatible umount. No flags.
2076	*/
2077	SYSCALL_DEFINE1(oldumount, char __user *, name)
2078	{
2079	return ksys_umount(name, flags: `0`);
2080	}
2081
2082	#endif
2083
2084	static bool is_mnt_ns_file(struct dentry *dentry)
2085	{
2086	struct ns_common *ns;
2087
2088	/ Is this a proxy for a mount namespace? /
2089	if (dentry->d_op != &ns_dentry_operations)
2090	return false;
2091
2092	ns = d_inode(dentry)->i_private;
2093
2094	return ns->ops == &mntns_operations;
2095	}
2096
2097	struct ns_common from_mnt_ns(struct* mnt_namespace *mnt)
2098	{
2099	return &mnt->ns;
2100	}
2101
2102	struct mnt_namespace get_sequential_mnt_ns(struct* mnt_namespace *mntns, bool previous)
2103	{
2104	struct ns_common *ns;
2105
2106	guard(rcu)();
2107
2108	for (;;) {
2109	ns = ns_tree_adjoined_rcu(mntns, previous);
2110	if (IS_ERR(ptr: ns))
2111	return ERR_CAST(ptr: ns);
2112
2113	mntns = to_mnt_ns(ns);
2114
2115	/*
2116	* The last passive reference count is put with RCU
2117	* delay so accessing the mount namespace is not just
2118	* safe but all relevant members are still valid.
2119	*/
2120	if (!ns_capable_noaudit(ns: mntns->user_ns, CAP_SYS_ADMIN))
2121	continue;
2122
2123	/*
2124	* We need an active reference count as we're persisting
2125	* the mount namespace and it might already be on its
2126	* deathbed.
2127	*/
2128	if (!ns_ref_get(mntns))
2129	continue;
2130
2131	return mntns;
2132	}
2133	}
2134
2135	struct mnt_namespace mnt_ns_from_dentry(struct* dentry *dentry)
2136	{
2137	if (!is_mnt_ns_file(dentry))
2138	return NULL;
2139
2140	return to_mnt_ns(get_proc_ns(dentry->d_inode));
2141	}
2142
2143	static bool mnt_ns_loop(struct dentry *dentry)
2144	{
2145	/ Could bind mounting the mount namespace inode cause a*
2146	* mount namespace loop?
2147	*/
2148	struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry);
2149
2150	if (!mnt_ns)
2151	return false;
2152
2153	return current->nsproxy->mnt_ns->ns.ns_id >= mnt_ns->ns.ns_id;
2154	}
2155
2156	struct mount copy_tree(struct* mount src_root, struct* dentry *dentry,
2157	int flag)
2158	{
2159	struct mount res, src_parent, src_root_child, src_mnt,
2160	dst_parent, dst_mnt;
2161
2162	if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_root))
2163	return ERR_PTR(error: -EINVAL);
2164
2165	if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
2166	return ERR_PTR(error: -EINVAL);
2167
2168	res = dst_mnt = clone_mnt(old: src_root, root: dentry, flag);
2169	if (IS_ERR(ptr: dst_mnt))
2170	return dst_mnt;
2171
2172	src_parent = src_root;
2173
2174	list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) {
2175	if (!is_subdir(src_root_child->mnt_mountpoint, dentry))
2176	continue;
2177
2178	for (src_mnt = src_root_child; src_mnt;
2179	src_mnt = next_mnt(p: src_mnt, root: src_root_child)) {
2180	if (!(flag & CL_COPY_UNBINDABLE) &&
2181	IS_MNT_UNBINDABLE(src_mnt)) {
2182	if (src_mnt->mnt.mnt_flags & MNT_LOCKED) {
2183	/ Both unbindable and locked. /
2184	dst_mnt = ERR_PTR(error: -EPERM);
2185	goto out;
2186	} else {
2187	src_mnt = skip_mnt_tree(p: src_mnt);
2188	continue;
2189	}
2190	}
2191	if (!(flag & CL_COPY_MNT_NS_FILE) &&
2192	is_mnt_ns_file(dentry: src_mnt->mnt.mnt_root)) {
2193	src_mnt = skip_mnt_tree(p: src_mnt);
2194	continue;
2195	}
2196	while (src_parent != src_mnt->mnt_parent) {
2197	src_parent = src_parent->mnt_parent;
2198	dst_mnt = dst_mnt->mnt_parent;
2199	}
2200
2201	src_parent = src_mnt;
2202	dst_parent = dst_mnt;
2203	dst_mnt = clone_mnt(old: src_mnt, root: src_mnt->mnt.mnt_root, flag);
2204	if (IS_ERR(ptr: dst_mnt))
2205	goto out;
2206	lock_mount_hash();
2207	if (src_mnt->mnt.mnt_flags & MNT_LOCKED)
2208	dst_mnt->mnt.mnt_flags \|= MNT_LOCKED;
2209	if (unlikely(flag & CL_EXPIRE)) {
2210	/ stick the duplicate mount on the same expiry*
2211	* list as the original if that was on one */
2212	if (!list_empty(head: &src_mnt->mnt_expire))
2213	list_add(new: &dst_mnt->mnt_expire,
2214	head: &src_mnt->mnt_expire);
2215	}
2216	attach_mnt(mnt: dst_mnt, parent: dst_parent, mp: src_parent->mnt_mp);
2217	unlock_mount_hash();
2218	}
2219	}
2220	return res;
2221
2222	out:
2223	if (res) {
2224	lock_mount_hash();
2225	umount_tree(mnt: res, how: UMOUNT_SYNC);
2226	unlock_mount_hash();
2227	}
2228	return dst_mnt;
2229	}
2230
2231	static inline bool extend_array(struct path res, struct path to_free,
2232	unsigned n, unsigned count, unsigned* new_count)
2233	{
2234	struct path *p;
2235
2236	if (likely(n < *count))
2237	return true;
2238	p = kmalloc_array(new_count, sizeof(struct path), GFP_KERNEL);
2239	if (p && *count)
2240	memcpy(to: p, from: res, len: count * sizeof(struct path));
2241	*count = new_count;
2242	kfree(objp: *to_free);
2243	to_free = res = p;
2244	return p;
2245	}
2246
2247	const struct path collect_paths(const* struct path *path,
2248	struct path prealloc, unsigned* count)
2249	{
2250	struct mount *root = real_mount(mnt: path->mnt);
2251	struct mount *child;
2252	struct path res = prealloc, to_free = NULL;
2253	unsigned n = `0`;
2254
2255	guard(namespace_shared)();
2256
2257	if (!check_mnt(mnt: root))
2258	return ERR_PTR(error: -EINVAL);
2259	if (!extend_array(res: &res, to_free: &to_free, n: `0`, count: &count, new_count: `32`))
2260	return ERR_PTR(error: -ENOMEM);
2261	res[n++] = *path;
2262	list_for_each_entry(child, &root->mnt_mounts, mnt_child) {
2263	if (!is_subdir(child->mnt_mountpoint, path->dentry))
2264	continue;
2265	for (struct mount *m = child; m; m = next_mnt(p: m, root: child)) {
2266	if (!extend_array(res: &res, to_free: &to_free, n, count: &count, new_count: `2` * count))
2267	return ERR_PTR(error: -ENOMEM);
2268	res[n].mnt = &m->mnt;
2269	res[n].dentry = m->mnt.mnt_root;
2270	n++;
2271	}
2272	}
2273	if (!extend_array(res: &res, to_free: &to_free, n, count: &count, new_count: count + `1`))
2274	return ERR_PTR(error: -ENOMEM);
2275	memset(s: res + n, c: `0`, n: (count - n) * sizeof(struct path));
2276	for (struct path *p = res; p->mnt; p++)
2277	path_get(p);
2278	return res;
2279	}
2280
2281	void drop_collected_paths(const struct path paths, const* struct path *prealloc)
2282	{
2283	for (const struct path *p = paths; p->mnt; p++)
2284	path_put(p);
2285	if (paths != prealloc)
2286	kfree(objp: paths);
2287	}
2288
2289	static struct mnt_namespace alloc_mnt_ns(struct* user_namespace *, bool);
2290
2291	void dissolve_on_fput(struct vfsmount *mnt)
2292	{
2293	struct mount *m = real_mount(mnt);
2294
2295	/*
2296	* m used to be the root of anon namespace; if it still is one,
2297	* we need to dissolve the mount tree and free that namespace.
2298	* Let's try to avoid taking namespace_sem if we can determine
2299	* that there's nothing to do without it - rcu_read_lock() is
2300	* enough to make anon_ns_root() memory-safe and once m has
2301	* left its namespace, it's no longer our concern, since it will
2302	* never become a root of anon ns again.
2303	*/
2304
2305	scoped_guard(rcu) {
2306	if (!anon_ns_root(m))
2307	return;
2308	}
2309
2310	scoped_guard(namespace_excl) {
2311	if (!anon_ns_root(m))
2312	return;
2313
2314	emptied_ns = m->mnt_ns;
2315	lock_mount_hash();
2316	umount_tree(mnt: m, how: UMOUNT_CONNECTED);
2317	unlock_mount_hash();
2318	}
2319	}
2320
2321	/ locks: namespace_shared && pinned(mnt) \|\| mount_locked_reader /
2322	static bool __has_locked_children(struct mount mnt, struct* dentry *dentry)
2323	{
2324	struct mount *child;
2325
2326	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
2327	if (!is_subdir(child->mnt_mountpoint, dentry))
2328	continue;
2329
2330	if (child->mnt.mnt_flags & MNT_LOCKED)
2331	return true;
2332	}
2333	return false;
2334	}
2335
2336	bool has_locked_children(struct mount mnt, struct* dentry *dentry)
2337	{
2338	guard(mount_locked_reader)();
2339	return __has_locked_children(mnt, dentry);
2340	}
2341
2342	/*
2343	* Check that there aren't references to earlier/same mount namespaces in the
2344	* specified subtree. Such references can act as pins for mount namespaces
2345	* that aren't checked by the mount-cycle checking code, thereby allowing
2346	* cycles to be made.
2347	*
2348	* locks: mount_locked_reader \|\| namespace_shared && pinned(subtree)
2349	*/
2350	static bool check_for_nsfs_mounts(struct mount *subtree)
2351	{
2352	for (struct mount *p = subtree; p; p = next_mnt(p, root: subtree))
2353	if (mnt_ns_loop(dentry: p->mnt.mnt_root))
2354	return false;
2355	return true;
2356	}
2357
2358	/**
2359	* clone_private_mount - create a private clone of a path
2360	* @path: path to clone
2361	*
2362	* This creates a new vfsmount, which will be the clone of @path. The new mount
2363	* will not be attached anywhere in the namespace and will be private (i.e.
2364	* changes to the originating mount won't be propagated into this).
2365	*
2366	* This assumes caller has called or done the equivalent of may_mount().
2367	*
2368	* Release with mntput().
2369	*/
2370	struct vfsmount clone_private_mount(const* struct path *path)
2371	{
2372	struct mount *old_mnt = real_mount(mnt: path->mnt);
2373	struct mount *new_mnt;
2374
2375	guard(namespace_shared)();
2376
2377	if (IS_MNT_UNBINDABLE(old_mnt))
2378	return ERR_PTR(error: -EINVAL);
2379
2380	/*
2381	* Make sure the source mount is acceptable.
2382	* Anything mounted in our mount namespace is allowed.
2383	* Otherwise, it must be the root of an anonymous mount
2384	* namespace, and we need to make sure no namespace
2385	* loops get created.
2386	*/
2387	if (!check_mnt(mnt: old_mnt)) {
2388	if (!anon_ns_root(m: old_mnt))
2389	return ERR_PTR(error: -EINVAL);
2390
2391	if (!check_for_nsfs_mounts(subtree: old_mnt))
2392	return ERR_PTR(error: -EINVAL);
2393	}
2394
2395	if (!ns_capable(ns: old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
2396	return ERR_PTR(error: -EPERM);
2397
2398	if (__has_locked_children(mnt: old_mnt, dentry: path->dentry))
2399	return ERR_PTR(error: -EINVAL);
2400
2401	new_mnt = clone_mnt(old: old_mnt, root: path->dentry, CL_PRIVATE);
2402	if (IS_ERR(ptr: new_mnt))
2403	return ERR_PTR(error: -EINVAL);
2404
2405	/ Longterm mount to be removed by kern_unmount() /*
2406	new_mnt->mnt_ns = MNT_NS_INTERNAL;
2407	return &new_mnt->mnt;
2408	}
2409	EXPORT_SYMBOL_GPL(clone_private_mount);
2410
2411	static void lock_mnt_tree(struct mount *mnt)
2412	{
2413	struct mount *p;
2414
2415	for (p = mnt; p; p = next_mnt(p, root: mnt)) {
2416	int flags = p->mnt.mnt_flags;
2417	/ Don't allow unprivileged users to change mount flags /
2418	flags \|= MNT_LOCK_ATIME;
2419
2420	if (flags & MNT_READONLY)
2421	flags \|= MNT_LOCK_READONLY;
2422
2423	if (flags & MNT_NODEV)
2424	flags \|= MNT_LOCK_NODEV;
2425
2426	if (flags & MNT_NOSUID)
2427	flags \|= MNT_LOCK_NOSUID;
2428
2429	if (flags & MNT_NOEXEC)
2430	flags \|= MNT_LOCK_NOEXEC;
2431	/ Don't allow unprivileged users to reveal what is under a mount /
2432	if (list_empty(head: &p->mnt_expire) && p != mnt)
2433	flags \|= MNT_LOCKED;
2434	p->mnt.mnt_flags = flags;
2435	}
2436	}
2437
2438	static void cleanup_group_ids(struct mount mnt, struct* mount *end)
2439	{
2440	struct mount *p;
2441
2442	for (p = mnt; p != end; p = next_mnt(p, root: mnt)) {
2443	if (p->mnt_group_id && !IS_MNT_SHARED(p))
2444	mnt_release_group_id(mnt: p);
2445	}
2446	}
2447
2448	static int invent_group_ids(struct mount *mnt, bool recurse)
2449	{
2450	struct mount *p;
2451
2452	for (p = mnt; p; p = recurse ? next_mnt(p, root: mnt) : NULL) {
2453	if (!p->mnt_group_id) {
2454	int err = mnt_alloc_group_id(mnt: p);
2455	if (err) {
2456	cleanup_group_ids(mnt, end: p);
2457	return err;
2458	}
2459	}
2460	}
2461
2462	return `0`;
2463	}
2464
2465	int count_mounts(struct mnt_namespace ns, struct* mount *mnt)
2466	{
2467	unsigned int max = READ_ONCE(sysctl_mount_max);
2468	unsigned int mounts = `0`;
2469	struct mount *p;
2470
2471	if (ns->nr_mounts >= max)
2472	return -ENOSPC;
2473	max -= ns->nr_mounts;
2474	if (ns->pending_mounts >= max)
2475	return -ENOSPC;
2476	max -= ns->pending_mounts;
2477
2478	for (p = mnt; p; p = next_mnt(p, root: mnt))
2479	mounts++;
2480
2481	if (mounts > max)
2482	return -ENOSPC;
2483
2484	ns->pending_mounts += mounts;
2485	return `0`;
2486	}
2487
2488	enum mnt_tree_flags_t {
2489	MNT_TREE_BENEATH = BIT(`0`),
2490	MNT_TREE_PROPAGATION = BIT(`1`),
2491	};
2492
2493	/**
2494	* attach_recursive_mnt - attach a source mount tree
2495	* @source_mnt: mount tree to be attached
2496	* @dest: the context for mounting at the place where the tree should go
2497	*
2498	* NOTE: in the table below explains the semantics when a source mount
2499	* of a given type is attached to a destination mount of a given type.
2500	* ---------------------------------------------------------------------------
2501	* \| BIND MOUNT OPERATION \|
2502	* \|**************************************************************************
2503	* \| source-->\| shared \| private \| slave \| unbindable \|
2504	* \| dest \| \| \| \| \|
2505	* \| \| \| \| \| \| \|
2506	* \| v \| \| \| \| \|
2507	* \|**************************************************************************
2508	* \| shared \| shared (++) \| shared (+) \| shared(+++)\| invalid \|
2509	* \| \| \| \| \| \|
2510	* \|non-shared\| shared (+) \| private \| slave (*) \| invalid \|
2511	* ***************************************************************************
2512	* A bind operation clones the source mount and mounts the clone on the
2513	* destination mount.
2514	*
2515	* (++) the cloned mount is propagated to all the mounts in the propagation
2516	* tree of the destination mount and the cloned mount is added to
2517	* the peer group of the source mount.
2518	* (+) the cloned mount is created under the destination mount and is marked
2519	* as shared. The cloned mount is added to the peer group of the source
2520	* mount.
2521	* (+++) the mount is propagated to all the mounts in the propagation tree
2522	* of the destination mount and the cloned mount is made slave
2523	* of the same master as that of the source mount. The cloned mount
2524	* is marked as 'shared and slave'.
2525	* (*) the cloned mount is made a slave of the same master as that of the
2526	* source mount.
2527	*
2528	* ---------------------------------------------------------------------------
2529	* \| MOVE MOUNT OPERATION \|
2530	* \|**************************************************************************
2531	* \| source-->\| shared \| private \| slave \| unbindable \|
2532	* \| dest \| \| \| \| \|
2533	* \| \| \| \| \| \| \|
2534	* \| v \| \| \| \| \|
2535	* \|**************************************************************************
2536	* \| shared \| shared (+) \| shared (+) \| shared(+++) \| invalid \|
2537	* \| \| \| \| \| \|
2538	* \|non-shared\| shared (+) \| private \| slave () \| unbindable \|
2539	* ***************************************************************************
2540	*
2541	* (+) the mount is moved to the destination. And is then propagated to
2542	* all the mounts in the propagation tree of the destination mount.
2543	* (+*) the mount is moved to the destination.
2544	* (+++) the mount is moved to the destination and is then propagated to
2545	* all the mounts belonging to the destination mount's propagation tree.
2546	* the mount is marked as 'shared and slave'.
2547	* (*) the mount continues to be a slave at the new location.
2548	*
2549	* if the source mount is a tree, the operations explained above is
2550	* applied to each mount in the tree.
2551	* Must be called without spinlocks held, since this function can sleep
2552	* in allocations.
2553	*
2554	* Context: The function expects namespace_lock() to be held.
2555	* Return: If @source_mnt was successfully attached 0 is returned.
2556	* Otherwise a negative error code is returned.
2557	*/
2558	static int attach_recursive_mnt(struct mount *source_mnt,
2559	const struct pinned_mountpoint *dest)
2560	{
2561	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
2562	struct mount *dest_mnt = dest->parent;
2563	struct mountpoint *dest_mp = dest->mp;
2564	HLIST_HEAD(tree_list);
2565	struct mnt_namespace *ns = dest_mnt->mnt_ns;
2566	struct pinned_mountpoint root = {};
2567	struct mountpoint *shorter = NULL;
2568	struct mount child, p;
2569	struct mount *top;
2570	struct hlist_node *n;
2571	int err = `0`;
2572	bool moving = mnt_has_parent(mnt: source_mnt);
2573
2574	/*
2575	* Preallocate a mountpoint in case the new mounts need to be
2576	* mounted beneath mounts on the same mountpoint.
2577	*/
2578	for (top = source_mnt; unlikely(top->overmount); top = top->overmount) {
2579	if (!shorter && is_mnt_ns_file(dentry: top->mnt.mnt_root))
2580	shorter = top->mnt_mp;
2581	}
2582	err = get_mountpoint(dentry: top->mnt.mnt_root, m: &root);
2583	if (err)
2584	return err;
2585
2586	/ Is there space to add these mounts to the mount namespace? /
2587	if (!moving) {
2588	err = count_mounts(ns, mnt: source_mnt);
2589	if (err)
2590	goto out;
2591	}
2592
2593	if (IS_MNT_SHARED(dest_mnt)) {
2594	err = invent_group_ids(mnt: source_mnt, recurse: true);
2595	if (err)
2596	goto out;
2597	err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
2598	}
2599	lock_mount_hash();
2600	if (err)
2601	goto out_cleanup_ids;
2602
2603	if (IS_MNT_SHARED(dest_mnt)) {
2604	for (p = source_mnt; p; p = next_mnt(p, root: source_mnt))
2605	set_mnt_shared(p);
2606	}
2607
2608	if (moving) {
2609	umount_mnt(mnt: source_mnt);
2610	mnt_notify_add(m: source_mnt);
2611	/ if the mount is moved, it should no longer be expired*
2612	* automatically */
2613	list_del_init(entry: &source_mnt->mnt_expire);
2614	} else {
2615	if (source_mnt->mnt_ns) {
2616	/ move from anon - the caller will destroy /
2617	emptied_ns = source_mnt->mnt_ns;
2618	for (p = source_mnt; p; p = next_mnt(p, root: source_mnt))
2619	move_from_ns(mnt: p);
2620	}
2621	}
2622
2623	mnt_set_mountpoint(mnt: dest_mnt, mp: dest_mp, child_mnt: source_mnt);
2624	/*
2625	* Now the original copy is in the same state as the secondaries -
2626	* its root attached to mountpoint, but not hashed and all mounts
2627	* in it are either in our namespace or in no namespace at all.
2628	* Add the original to the list of copies and deal with the
2629	* rest of work for all of them uniformly.
2630	*/
2631	hlist_add_head(n: &source_mnt->mnt_hash, h: &tree_list);
2632
2633	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
2634	struct mount *q;
2635	hlist_del_init(n: &child->mnt_hash);
2636	/ Notice when we are propagating across user namespaces /
2637	if (child->mnt_parent->mnt_ns->user_ns != user_ns)
2638	lock_mnt_tree(mnt: child);
2639	q = __lookup_mnt(mnt: &child->mnt_parent->mnt,
2640	dentry: child->mnt_mountpoint);
2641	commit_tree(mnt: child);
2642	if (q) {
2643	struct mount *r = topmost_overmount(m: child);
2644	struct mountpoint *mp = root.mp;
2645
2646	if (unlikely(shorter) && child != source_mnt)
2647	mp = shorter;
2648	mnt_change_mountpoint(parent: r, mp, mnt: q);
2649	}
2650	}
2651	unpin_mountpoint(m: &root);
2652	unlock_mount_hash();
2653
2654	return `0`;
2655
2656	out_cleanup_ids:
2657	while (!hlist_empty(h: &tree_list)) {
2658	child = hlist_entry(tree_list.first, struct mount, mnt_hash);
2659	child->mnt_parent->mnt_ns->pending_mounts = `0`;
2660	umount_tree(mnt: child, how: UMOUNT_SYNC);
2661	}
2662	unlock_mount_hash();
2663	cleanup_group_ids(mnt: source_mnt, NULL);
2664	out:
2665	ns->pending_mounts = `0`;
2666
2667	read_seqlock_excl(sl: &mount_lock);
2668	unpin_mountpoint(m: &root);
2669	read_sequnlock_excl(sl: &mount_lock);
2670
2671	return err;
2672	}
2673
2674	static inline struct mount where_to_mount(const* struct path *path,
2675	struct dentry **dentry,
2676	bool beneath)
2677	{
2678	struct mount *m;
2679
2680	if (unlikely(beneath)) {
2681	m = topmost_overmount(m: real_mount(mnt: path->mnt));
2682	*dentry = m->mnt_mountpoint;
2683	return m->mnt_parent;
2684	}
2685	m = __lookup_mnt(mnt: path->mnt, dentry: path->dentry);
2686	if (unlikely(m)) {
2687	m = topmost_overmount(m);
2688	*dentry = m->mnt.mnt_root;
2689	return m;
2690	}
2691	*dentry = path->dentry;
2692	return real_mount(mnt: path->mnt);
2693	}
2694
2695	/**
2696	* do_lock_mount - acquire environment for mounting
2697	* @path: target path
2698	* @res: context to set up
2699	* @beneath: whether the intention is to mount beneath @path
2700	*
2701	* To mount something at given location, we need
2702	* namespace_sem locked exclusive
2703	* inode of dentry we are mounting on locked exclusive
2704	* struct mountpoint for that dentry
2705	* struct mount we are mounting on
2706	*
2707	* Results are stored in caller-supplied context (pinned_mountpoint);
2708	* on success we have res->parent and res->mp pointing to parent and
2709	* mountpoint respectively and res->node inserted into the ->m_list
2710	* of the mountpoint, making sure the mountpoint won't disappear.
2711	* On failure we have res->parent set to ERR_PTR(-E...), res->mp
2712	* left NULL, res->node - empty.
2713	* In case of success do_lock_mount returns with locks acquired (in
2714	* proper order - inode lock nests outside of namespace_sem).
2715	*
2716	* Request to mount on overmounted location is treated as "mount on
2717	* top of whatever's overmounting it"; request to mount beneath
2718	* a location - "mount immediately beneath the topmost mount at that
2719	* place".
2720	*
2721	* In all cases the location must not have been unmounted and the
2722	* chosen mountpoint must be allowed to be mounted on. For "beneath"
2723	* case we also require the location to be at the root of a mount
2724	* that has a parent (i.e. is not a root of some namespace).
2725	*/
2726	static void do_lock_mount(const struct path *path,
2727	struct pinned_mountpoint *res,
2728	bool beneath)
2729	{
2730	int err;
2731
2732	if (unlikely(beneath) && !path_mounted(path)) {
2733	res->parent = ERR_PTR(error: -EINVAL);
2734	return;
2735	}
2736
2737	do {
2738	struct dentry dentry, d;
2739	struct mount m, n;
2740
2741	scoped_guard(mount_locked_reader) {
2742	m = where_to_mount(path, dentry: &dentry, beneath);
2743	if (&m->mnt != path->mnt) {
2744	mntget(&m->mnt);
2745	dget(dentry);
2746	}
2747	}
2748
2749	inode_lock(inode: dentry->d_inode);
2750	namespace_lock();
2751
2752	// check if the chain of mounts (if any) has changed.
2753	scoped_guard(mount_locked_reader)
2754	n = where_to_mount(path, dentry: &d, beneath);
2755
2756	if (unlikely(n != m \|\| dentry != d))
2757	err = -EAGAIN; // something moved, retry
2758	else if (unlikely(cant_mount(dentry) \|\| !is_mounted(path->mnt)))
2759	err = -ENOENT; // not to be mounted on
2760	else if (beneath && &m->mnt == path->mnt && !m->overmount)
2761	err = -EINVAL;
2762	else
2763	err = get_mountpoint(dentry, m: res);
2764
2765	if (unlikely(err)) {
2766	res->parent = ERR_PTR(error: err);
2767	namespace_unlock();
2768	inode_unlock(inode: dentry->d_inode);
2769	} else {
2770	res->parent = m;
2771	}
2772	/*
2773	* Drop the temporary references. This is subtle - on success
2774	* we are doing that under namespace_sem, which would normally
2775	* be forbidden. However, in that case we are guaranteed that
2776	* refcounts won't reach zero, since we know that path->mnt
2777	* is mounted and thus all mounts reachable from it are pinned
2778	* and stable, along with their mountpoints and roots.
2779	*/
2780	if (&m->mnt != path->mnt) {
2781	dput(dentry);
2782	mntput(&m->mnt);
2783	}
2784	} while (err == -EAGAIN);
2785	}
2786
2787	static void __unlock_mount(struct pinned_mountpoint *m)
2788	{
2789	inode_unlock(inode: m->mp->m_dentry->d_inode);
2790	read_seqlock_excl(sl: &mount_lock);
2791	unpin_mountpoint(m);
2792	read_sequnlock_excl(sl: &mount_lock);
2793	namespace_unlock();
2794	}
2795
2796	static inline void unlock_mount(struct pinned_mountpoint *m)
2797	{
2798	if (!IS_ERR(ptr: m->parent))
2799	__unlock_mount(m);
2800	}
2801
/*
 * Declare a pinned_mountpoint named @mp in the current scope and lock the
 * mount environment for @path into it.  unlock_mount() is registered as
 * the variable's __cleanup handler, so it runs when @mp goes out of scope.
 * Each macro expands to a declaration followed by a call, so it cannot be
 * the sole body of an unbraced if/else.
 */
#define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \
	struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
	do_lock_mount((path), &mp, (beneath))
#define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false)
#define LOCK_MOUNT_EXACT(mp, path) \
	struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
	lock_mount_exact((path), &mp)
2809
2810	static int graft_tree(struct mount mnt, const* struct pinned_mountpoint *mp)
2811	{
2812	if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
2813	return -EINVAL;
2814
2815	if (d_is_dir(dentry: mp->mp->m_dentry) !=
2816	d_is_dir(dentry: mnt->mnt.mnt_root))
2817	return -ENOTDIR;
2818
2819	return attach_recursive_mnt(source_mnt: mnt, dest: mp);
2820	}
2821
2822	static int may_change_propagation(const struct mount *m)
2823	{
2824	struct mnt_namespace *ns = m->mnt_ns;
2825
2826	// it must be mounted in some namespace
2827	if (IS_ERR_OR_NULL(ptr: ns)) // is_mounted()
2828	return -EINVAL;
2829	// and the caller must be admin in userns of that namespace
2830	if (!ns_capable(ns: ns->user_ns, CAP_SYS_ADMIN))
2831	return -EPERM;
2832	return `0`;
2833	}
2834
2835	/*
2836	* Sanity check the flags to change_mnt_propagation.
2837	*/
2838
2839	static int flags_to_propagation_type(int ms_flags)
2840	{
2841	int type = ms_flags & ~(MS_REC \| MS_SILENT);
2842
2843	/ Fail if any non-propagation flags are set /
2844	if (type & ~(MS_SHARED \| MS_PRIVATE \| MS_SLAVE \| MS_UNBINDABLE))
2845	return `0`;
2846	/ Only one propagation flag should be set /
2847	if (!is_power_of_2(n: type))
2848	return `0`;
2849	return type;
2850	}
2851
2852	/*
2853	* recursively change the type of the mountpoint.
2854	*/
2855	static int do_change_type(const struct path path, int* ms_flags)
2856	{
2857	struct mount *m;
2858	struct mount *mnt = real_mount(mnt: path->mnt);
2859	int recurse = ms_flags & MS_REC;
2860	int type;
2861	int err;
2862
2863	if (!path_mounted(path))
2864	return -EINVAL;
2865
2866	type = flags_to_propagation_type(ms_flags);
2867	if (!type)
2868	return -EINVAL;
2869
2870	guard(namespace_excl)();
2871
2872	err = may_change_propagation(m: mnt);
2873	if (err)
2874	return err;
2875
2876	if (type == MS_SHARED) {
2877	err = invent_group_ids(mnt, recurse);
2878	if (err)
2879	return err;
2880	}
2881
2882	for (m = mnt; m; m = (recurse ? next_mnt(p: m, root: mnt) : NULL))
2883	change_mnt_propagation(m, type);
2884
2885	return `0`;
2886	}
2887
2888	/ may_copy_tree() - check if a mount tree can be copied*
2889	* @path: path to the mount tree to be copied
2890	*
2891	* This helper checks if the caller may copy the mount tree starting
2892	* from @path->mnt. The caller may copy the mount tree under the
2893	* following circumstances:
2894	*
2895	* (1) The caller is located in the mount namespace of the mount tree.
2896	* This also implies that the mount does not belong to an anonymous
2897	* mount namespace.
2898	* (2) The caller tries to copy an nfs mount referring to a mount
2899	* namespace, i.e., the caller is trying to copy a mount namespace
2900	* entry from nsfs.
2901	* (3) The caller tries to copy a pidfs mount referring to a pidfd.
2902	* (4) The caller is trying to copy a mount tree that belongs to an
2903	* anonymous mount namespace.
2904	*
2905	* For that to be safe, this helper enforces that the origin mount
2906	* namespace the anonymous mount namespace was created from is the
2907	* same as the caller's mount namespace by comparing the sequence
2908	* numbers.
2909	*
2910	* This is not strictly necessary. The current semantics of the new
2911	* mount api enforce that the caller must be located in the same
2912	* mount namespace as the mount tree it interacts with. Using the
2913	* origin sequence number preserves these semantics even for
2914	* anonymous mount namespaces. However, one could envision extending
2915	* the api to directly operate across mount namespace if needed.
2916	*
2917	* The ownership of a non-anonymous mount namespace such as the
2918	* caller's cannot change.
2919	* => We know that the caller's mount namespace is stable.
2920	*
2921	* If the origin sequence number of the anonymous mount namespace is
2922	* the same as the sequence number of the caller's mount namespace.
2923	* => The owning namespaces are the same.
2924	*
2925	* ==> The earlier capability check on the owning namespace of the
2926	* caller's mount namespace ensures that the caller has the
2927	* ability to copy the mount tree.
2928	*
2929	* Returns true if the mount tree can be copied, false otherwise.
2930	*/
2931	static inline bool may_copy_tree(const struct path *path)
2932	{
2933	struct mount *mnt = real_mount(mnt: path->mnt);
2934	const struct dentry_operations *d_op;
2935
2936	if (check_mnt(mnt))
2937	return true;
2938
2939	d_op = path->dentry->d_op;
2940	if (d_op == &ns_dentry_operations)
2941	return true;
2942
2943	if (d_op == &pidfs_dentry_operations)
2944	return true;
2945
2946	if (!is_mounted(mnt: path->mnt))
2947	return false;
2948
2949	return check_anonymous_mnt(mnt);
2950	}
2951
2952
2953	static struct mount __do_loopback(const* struct path old_path, int* recurse)
2954	{
2955	struct mount *old = real_mount(mnt: old_path->mnt);
2956
2957	if (IS_MNT_UNBINDABLE(old))
2958	return ERR_PTR(error: -EINVAL);
2959
2960	if (!may_copy_tree(path: old_path))
2961	return ERR_PTR(error: -EINVAL);
2962
2963	if (!recurse && __has_locked_children(mnt: old, dentry: old_path->dentry))
2964	return ERR_PTR(error: -EINVAL);
2965
2966	if (recurse)
2967	return copy_tree(src_root: old, dentry: old_path->dentry, CL_COPY_MNT_NS_FILE);
2968	else
2969	return clone_mnt(old, root: old_path->dentry, flag: `0`);
2970	}
2971
2972	/*
2973	* do loopback mount.
2974	*/
2975	static int do_loopback(const struct path path, const* char *old_name,
2976	int recurse)
2977	{
2978	struct path old_path __free(path_put) = {};
2979	struct mount *mnt = NULL;
2980	int err;
2981	if (!old_name \|\| !*old_name)
2982	return -EINVAL;
2983	err = kern_path(old_name, LOOKUP_FOLLOW\|LOOKUP_AUTOMOUNT, &old_path);
2984	if (err)
2985	return err;
2986
2987	if (mnt_ns_loop(dentry: old_path.dentry))
2988	return -EINVAL;
2989
2990	LOCK_MOUNT(mp, path);
2991	if (IS_ERR(ptr: mp.parent))
2992	return PTR_ERR(ptr: mp.parent);
2993
2994	if (!check_mnt(mnt: mp.parent))
2995	return -EINVAL;
2996
2997	mnt = __do_loopback(old_path: &old_path, recurse);
2998	if (IS_ERR(ptr: mnt))
2999	return PTR_ERR(ptr: mnt);
3000
3001	err = graft_tree(mnt, mp: &mp);
3002	if (err) {
3003	lock_mount_hash();
3004	umount_tree(mnt, how: UMOUNT_SYNC);
3005	unlock_mount_hash();
3006	}
3007	return err;
3008	}
3009
3010	static struct mnt_namespace get_detached_copy(const* struct path *path, bool recursive)
3011	{
3012	struct mnt_namespace ns, mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
3013	struct user_namespace *user_ns = mnt_ns->user_ns;
3014	struct mount mnt, p;
3015
3016	ns = alloc_mnt_ns(user_ns, true);
3017	if (IS_ERR(ptr: ns))
3018	return ns;
3019
3020	guard(namespace_excl)();
3021
3022	/*
3023	* Record the sequence number of the source mount namespace.
3024	* This needs to hold namespace_sem to ensure that the mount
3025	* doesn't get attached.
3026	*/
3027	if (is_mounted(mnt: path->mnt)) {
3028	src_mnt_ns = real_mount(mnt: path->mnt)->mnt_ns;
3029	if (is_anon_ns(ns: src_mnt_ns))
3030	ns->seq_origin = src_mnt_ns->seq_origin;
3031	else
3032	ns->seq_origin = src_mnt_ns->ns.ns_id;
3033	}
3034
3035	mnt = __do_loopback(old_path: path, recurse: recursive);
3036	if (IS_ERR(ptr: mnt)) {
3037	emptied_ns = ns;
3038	return ERR_CAST(ptr: mnt);
3039	}
3040
3041	for (p = mnt; p; p = next_mnt(p, root: mnt)) {
3042	mnt_add_to_ns(ns, mnt: p);
3043	ns->nr_mounts++;
3044	}
3045	ns->root = mnt;
3046	return ns;
3047	}
3048
3049	static struct file open_detached_copy(struct* path *path, bool recursive)
3050	{
3051	struct mnt_namespace *ns = get_detached_copy(path, recursive);
3052	struct file *file;
3053
3054	if (IS_ERR(ptr: ns))
3055	return ERR_CAST(ptr: ns);
3056
3057	mntput(path->mnt);
3058	path->mnt = mntget(&ns->root->mnt);
3059	file = dentry_open(path, O_PATH, current_cred());
3060	if (IS_ERR(ptr: file))
3061	dissolve_on_fput(mnt: path->mnt);
3062	else
3063	file->f_mode \|= FMODE_NEED_UNMOUNT;
3064	return file;
3065	}
3066
3067	static struct file vfs_open_tree(int* dfd, const char __user filename, unsigned* int flags)
3068	{
3069	int ret;
3070	struct path path __free(path_put) = {};
3071	int lookup_flags = LOOKUP_AUTOMOUNT \| LOOKUP_FOLLOW;
3072	bool detached = flags & OPEN_TREE_CLONE;
3073
3074	BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
3075
3076	if (flags & ~(AT_EMPTY_PATH \| AT_NO_AUTOMOUNT \| AT_RECURSIVE \|
3077	AT_SYMLINK_NOFOLLOW \| OPEN_TREE_CLONE \|
3078	OPEN_TREE_CLOEXEC))
3079	return ERR_PTR(error: -EINVAL);
3080
3081	if ((flags & (AT_RECURSIVE \| OPEN_TREE_CLONE)) == AT_RECURSIVE)
3082	return ERR_PTR(error: -EINVAL);
3083
3084	if (flags & AT_NO_AUTOMOUNT)
3085	lookup_flags &= ~LOOKUP_AUTOMOUNT;
3086	if (flags & AT_SYMLINK_NOFOLLOW)
3087	lookup_flags &= ~LOOKUP_FOLLOW;
3088	if (flags & AT_EMPTY_PATH)
3089	lookup_flags \|= LOOKUP_EMPTY;
3090
3091	if (detached && !may_mount())
3092	return ERR_PTR(error: -EPERM);
3093
3094	ret = user_path_at(dfd, filename, lookup_flags, &path);
3095	if (unlikely(ret))
3096	return ERR_PTR(error: ret);
3097
3098	if (detached)
3099	return open_detached_copy(path: &path, recursive: flags & AT_RECURSIVE);
3100
3101	return dentry_open(path: &path, O_PATH, current_cred());
3102	}
3103
3104	SYSCALL_DEFINE3(open_tree, int, dfd, const char __user , filename, unsigned*, flags)
3105	{
3106	int fd;
3107	struct file *file __free(fput) = NULL;
3108
3109	file = vfs_open_tree(dfd, filename, flags);
3110	if (IS_ERR(ptr: file))
3111	return PTR_ERR(ptr: file);
3112
3113	fd = get_unused_fd_flags(flags: flags & O_CLOEXEC);
3114	if (fd < `0`)
3115	return fd;
3116
3117	fd_install(fd, no_free_ptr(file));
3118	return fd;
3119	}
3120
3121	/*
3122	* Don't allow locked mount flags to be cleared.
3123	*
3124	* No locks need to be held here while testing the various MNT_LOCK
3125	* flags because those flags can never be cleared once they are set.
3126	*/
3127	static bool can_change_locked_flags(struct mount mnt, unsigned* int mnt_flags)
3128	{
3129	unsigned int fl = mnt->mnt.mnt_flags;
3130
3131	if ((fl & MNT_LOCK_READONLY) &&
3132	!(mnt_flags & MNT_READONLY))
3133	return false;
3134
3135	if ((fl & MNT_LOCK_NODEV) &&
3136	!(mnt_flags & MNT_NODEV))
3137	return false;
3138
3139	if ((fl & MNT_LOCK_NOSUID) &&
3140	!(mnt_flags & MNT_NOSUID))
3141	return false;
3142
3143	if ((fl & MNT_LOCK_NOEXEC) &&
3144	!(mnt_flags & MNT_NOEXEC))
3145	return false;
3146
3147	if ((fl & MNT_LOCK_ATIME) &&
3148	((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK)))
3149	return false;
3150
3151	return true;
3152	}
3153
3154	static int change_mount_ro_state(struct mount mnt, unsigned* int mnt_flags)
3155	{
3156	bool readonly_request = (mnt_flags & MNT_READONLY);
3157
3158	if (readonly_request == __mnt_is_readonly(&mnt->mnt))
3159	return `0`;
3160
3161	if (readonly_request)
3162	return mnt_make_readonly(mnt);
3163
3164	mnt->mnt.mnt_flags &= ~MNT_READONLY;
3165	return `0`;
3166	}
3167
3168	static void set_mount_attributes(struct mount mnt, unsigned* int mnt_flags)
3169	{
3170	mnt_flags \|= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
3171	mnt->mnt.mnt_flags = mnt_flags;
3172	touch_mnt_namespace(ns: mnt->mnt_ns);
3173	}
3174
3175	static void mnt_warn_timestamp_expiry(const struct path *mountpoint,
3176	struct vfsmount *mnt)
3177	{
3178	struct super_block *sb = mnt->mnt_sb;
3179
3180	if (!__mnt_is_readonly(mnt) &&
3181	(!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
3182	(ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
3183	char buf, mntpath;
3184
3185	buf = (char *)__get_free_page(GFP_KERNEL);
3186	if (buf)
3187	mntpath = d_path(mountpoint, buf, PAGE_SIZE);
3188	else
3189	mntpath = ERR_PTR(error: -ENOMEM);
3190	if (IS_ERR(ptr: mntpath))
3191	mntpath = "(unknown)";
3192
3193	pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n",
3194	sb->s_type->name,
3195	is_mounted(mnt) ? "remounted" : "mounted",
3196	mntpath, &sb->s_time_max,
3197	(unsigned long long)sb->s_time_max);
3198
3199	sb->s_iflags \|= SB_I_TS_EXPIRY_WARNED;
3200	if (buf)
3201	free_page((unsigned long)buf);
3202	}
3203	}
3204
3205	/*
3206	* Handle reconfiguration of the mountpoint only without alteration of the
3207	* superblock it refers to. This is triggered by specifying MS_REMOUNT\|MS_BIND
3208	* to mount(2).
3209	*/
3210	static int do_reconfigure_mnt(const struct path path, unsigned* int mnt_flags)
3211	{
3212	struct super_block *sb = path->mnt->mnt_sb;
3213	struct mount *mnt = real_mount(mnt: path->mnt);
3214	int ret;
3215
3216	if (!check_mnt(mnt))
3217	return -EINVAL;
3218
3219	if (!path_mounted(path))
3220	return -EINVAL;
3221
3222	if (!can_change_locked_flags(mnt, mnt_flags))
3223	return -EPERM;
3224
3225	/*
3226	* We're only checking whether the superblock is read-only not
3227	* changing it, so only take down_read(&sb->s_umount).
3228	*/
3229	down_read(sem: &sb->s_umount);
3230	lock_mount_hash();
3231	ret = change_mount_ro_state(mnt, mnt_flags);
3232	if (ret == `0`)
3233	set_mount_attributes(mnt, mnt_flags);
3234	unlock_mount_hash();
3235	up_read(sem: &sb->s_umount);
3236
3237	mnt_warn_timestamp_expiry(mountpoint: path, mnt: &mnt->mnt);
3238
3239	return ret;
3240	}
3241
3242	/*
3243	* change filesystem flags. dir should be a physical root of filesystem.
3244	* If you've mounted a non-root directory somewhere and want to do remount
3245	* on it - tough luck.
3246	*/
3247	static int do_remount(const struct path path, int* sb_flags,
3248	int mnt_flags, void *data)
3249	{
3250	int err;
3251	struct super_block *sb = path->mnt->mnt_sb;
3252	struct mount *mnt = real_mount(mnt: path->mnt);
3253	struct fs_context *fc;
3254
3255	if (!check_mnt(mnt))
3256	return -EINVAL;
3257
3258	if (!path_mounted(path))
3259	return -EINVAL;
3260
3261	if (!can_change_locked_flags(mnt, mnt_flags))
3262	return -EPERM;
3263
3264	fc = fs_context_for_reconfigure(dentry: path->dentry, sb_flags, MS_RMT_MASK);
3265	if (IS_ERR(ptr: fc))
3266	return PTR_ERR(ptr: fc);
3267
3268	/*
3269	* Indicate to the filesystem that the remount request is coming
3270	* from the legacy mount system call.
3271	*/
3272	fc->oldapi = true;
3273
3274	err = parse_monolithic_mount_data(fc, data);
3275	if (!err) {
3276	down_write(sem: &sb->s_umount);
3277	err = -EPERM;
3278	if (ns_capable(ns: sb->s_user_ns, CAP_SYS_ADMIN)) {
3279	err = reconfigure_super(fc);
3280	if (!err) {
3281	lock_mount_hash();
3282	set_mount_attributes(mnt, mnt_flags);
3283	unlock_mount_hash();
3284	}
3285	}
3286	up_write(sem: &sb->s_umount);
3287	}
3288
3289	mnt_warn_timestamp_expiry(mountpoint: path, mnt: &mnt->mnt);
3290
3291	put_fs_context(fc);
3292	return err;
3293	}
3294
/*
 * Walk the mount tree rooted at @mnt with next_mnt() and return 1 as soon
 * as an unbindable mount is found, 0 if there is none.
 */
static inline int tree_contains_unbindable(struct mount *mnt)
{
	struct mount *p;

	for (p = mnt; p; p = next_mnt(p, mnt)) {
		if (IS_MNT_UNBINDABLE(p))
			return 1;
	}
	return 0;
}
3304
3305	static int do_set_group(const struct path from_path, const* struct path *to_path)
3306	{
3307	struct mount *from = real_mount(mnt: from_path->mnt);
3308	struct mount *to = real_mount(mnt: to_path->mnt);
3309	int err;
3310
3311	guard(namespace_excl)();
3312
3313	err = may_change_propagation(m: from);
3314	if (err)
3315	return err;
3316	err = may_change_propagation(m: to);
3317	if (err)
3318	return err;
3319
3320	/ To and From paths should be mount roots /
3321	if (!path_mounted(path: from_path))
3322	return -EINVAL;
3323	if (!path_mounted(path: to_path))
3324	return -EINVAL;
3325
3326	/ Setting sharing groups is only allowed across same superblock /
3327	if (from->mnt.mnt_sb != to->mnt.mnt_sb)
3328	return -EINVAL;
3329
3330	/ From mount root should be wider than To mount root /
3331	if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
3332	return -EINVAL;
3333
3334	/ From mount should not have locked children in place of To's root /
3335	if (__has_locked_children(mnt: from, dentry: to->mnt.mnt_root))
3336	return -EINVAL;
3337
3338	/ Setting sharing groups is only allowed on private mounts /
3339	if (IS_MNT_SHARED(to) \|\| IS_MNT_SLAVE(to))
3340	return -EINVAL;
3341
3342	/ From should not be private /
3343	if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
3344	return -EINVAL;
3345
3346	if (IS_MNT_SLAVE(from)) {
3347	hlist_add_behind(n: &to->mnt_slave, prev: &from->mnt_slave);
3348	to->mnt_master = from->mnt_master;
3349	}
3350
3351	if (IS_MNT_SHARED(from)) {
3352	to->mnt_group_id = from->mnt_group_id;
3353	list_add(new: &to->mnt_share, head: &from->mnt_share);
3354	set_mnt_shared(to);
3355	}
3356	return `0`;
3357	}
3358
3359	/**
3360	* path_overmounted - check if path is overmounted
3361	* @path: path to check
3362	*
3363	* Check if path is overmounted, i.e., if there's a mount on top of
3364	* @path->mnt with @path->dentry as mountpoint.
3365	*
3366	* Context: namespace_sem must be held at least shared.
3367	* MUST NOT be called under lock_mount_hash() (there one should just
3368	* call __lookup_mnt() and check if it returns NULL).
3369	* Return: If path is overmounted true is returned, false if not.
3370	*/
3371	static inline bool path_overmounted(const struct path *path)
3372	{
3373	unsigned seq = read_seqbegin(sl: &mount_lock);
3374	bool no_child;
3375
3376	rcu_read_lock();
3377	no_child = !__lookup_mnt(mnt: path->mnt, dentry: path->dentry);
3378	rcu_read_unlock();
3379	if (need_seqretry(lock: &mount_lock, seq)) {
3380	read_seqlock_excl(sl: &mount_lock);
3381	no_child = !__lookup_mnt(mnt: path->mnt, dentry: path->dentry);
3382	read_sequnlock_excl(sl: &mount_lock);
3383	}
3384	return unlikely(!no_child);
3385	}
3386
3387	/*
3388	* Check if there is a possibly empty chain of descent from p1 to p2.
3389	* Locks: namespace_sem (shared) or mount_lock (read_seqlock_excl).
3390	*/
3391	static bool mount_is_ancestor(const struct mount p1, const* struct mount *p2)
3392	{
3393	while (p2 != p1 && mnt_has_parent(mnt: p2))
3394	p2 = p2->mnt_parent;
3395	return p2 == p1;
3396	}
3397
3398	/**
3399	* can_move_mount_beneath - check that we can mount beneath the top mount
3400	* @mnt_from: mount we are trying to move
3401	* @mnt_to: mount under which to mount
3402	* @mp: mountpoint of @mnt_to
3403	*
3404	* - Make sure that nothing can be mounted beneath the caller's current
3405	* root or the rootfs of the namespace.
3406	* - Make sure that the caller can unmount the topmost mount ensuring
3407	* that the caller could reveal the underlying mountpoint.
3408	* - Ensure that nothing has been mounted on top of @mnt_from before we
3409	* grabbed @namespace_sem to avoid creating pointless shadow mounts.
3410	* - Prevent mounting beneath a mount if the propagation relationship
3411	* between the source mount, parent mount, and top mount would lead to
3412	* nonsensical mount trees.
3413	*
3414	* Context: This function expects namespace_lock() to be held.
3415	* Return: On success 0, and on error a negative error code is returned.
3416	*/
3417	static int can_move_mount_beneath(const struct mount *mnt_from,
3418	const struct mount *mnt_to,
3419	const struct mountpoint *mp)
3420	{
3421	struct mount *parent_mnt_to = mnt_to->mnt_parent;
3422
3423	if (IS_MNT_LOCKED(mnt_to))
3424	return -EINVAL;
3425
3426	/ Avoid creating shadow mounts during mount propagation. /
3427	if (mnt_from->overmount)
3428	return -EINVAL;
3429
3430	/*
3431	* Mounting beneath the rootfs only makes sense when the
3432	* semantics of pivot_root(".", ".") are used.
3433	*/
3434	if (&mnt_to->mnt == current->fs->root.mnt)
3435	return -EINVAL;
3436	if (parent_mnt_to == current->nsproxy->mnt_ns->root)
3437	return -EINVAL;
3438
3439	if (mount_is_ancestor(p1: mnt_to, p2: mnt_from))
3440	return -EINVAL;
3441
3442	/*
3443	* If the parent mount propagates to the child mount this would
3444	* mean mounting @mnt_from on @mnt_to->mnt_parent and then
3445	* propagating a copy @c of @mnt_from on top of @mnt_to. This
3446	* defeats the whole purpose of mounting beneath another mount.
3447	*/
3448	if (propagation_would_overmount(from: parent_mnt_to, to: mnt_to, mp))
3449	return -EINVAL;
3450
3451	/*
3452	* If @mnt_to->mnt_parent propagates to @mnt_from this would
3453	* mean propagating a copy @c of @mnt_from on top of @mnt_from.
3454	* Afterwards @mnt_from would be mounted on top of
3455	* @mnt_to->mnt_parent and @mnt_to would be unmounted from
3456	* @mnt->mnt_parent and remounted on @mnt_from. But since @c is
3457	* already mounted on @mnt_from, @mnt_to would ultimately be
3458	* remounted on top of @c. Afterwards, @mnt_from would be
3459	* covered by a copy @c of @mnt_from and @c would be covered by
3460	* @mnt_from itself. This defeats the whole purpose of mounting
3461	* @mnt_from beneath @mnt_to.
3462	*/
3463	if (check_mnt(mnt: mnt_from) &&
3464	propagation_would_overmount(from: parent_mnt_to, to: mnt_from, mp))
3465	return -EINVAL;
3466
3467	return `0`;
3468	}
3469
3470	/ may_use_mount() - check if a mount tree can be used*
3471	* @mnt: vfsmount to be used
3472	*
3473	* This helper checks if the caller may use the mount tree starting
3474	* from @path->mnt. The caller may use the mount tree under the
3475	* following circumstances:
3476	*
3477	* (1) The caller is located in the mount namespace of the mount tree.
3478	* This also implies that the mount does not belong to an anonymous
3479	* mount namespace.
3480	* (2) The caller is trying to use a mount tree that belongs to an
3481	* anonymous mount namespace.
3482	*
3483	* For that to be safe, this helper enforces that the origin mount
3484	* namespace the anonymous mount namespace was created from is the
3485	* same as the caller's mount namespace by comparing the sequence
3486	* numbers.
3487	*
3488	* The ownership of a non-anonymous mount namespace such as the
3489	* caller's cannot change.
3490	* => We know that the caller's mount namespace is stable.
3491	*
3492	* If the origin sequence number of the anonymous mount namespace is
3493	* the same as the sequence number of the caller's mount namespace.
3494	* => The owning namespaces are the same.
3495	*
3496	* ==> The earlier capability check on the owning namespace of the
3497	* caller's mount namespace ensures that the caller has the
3498	* ability to use the mount tree.
3499	*
3500	* Returns true if the mount tree can be used, false otherwise.
3501	*/
3502	static inline bool may_use_mount(struct mount *mnt)
3503	{
3504	if (check_mnt(mnt))
3505	return true;
3506
3507	/*
3508	* Make sure that noone unmounted the target path or somehow
3509	* managed to get their hands on something purely kernel
3510	* internal.
3511	*/
3512	if (!is_mounted(mnt: &mnt->mnt))
3513	return false;
3514
3515	return check_anonymous_mnt(mnt);
3516	}
3517
3518	static int do_move_mount(const struct path *old_path,
3519	const struct path *new_path,
3520	enum mnt_tree_flags_t flags)
3521	{
3522	struct mount *old = real_mount(mnt: old_path->mnt);
3523	int err;
3524	bool beneath = flags & MNT_TREE_BENEATH;
3525
3526	if (!path_mounted(path: old_path))
3527	return -EINVAL;
3528
3529	if (d_is_dir(dentry: new_path->dentry) != d_is_dir(dentry: old_path->dentry))
3530	return -EINVAL;
3531
3532	LOCK_MOUNT_MAYBE_BENEATH(mp, new_path, beneath);
3533	if (IS_ERR(ptr: mp.parent))
3534	return PTR_ERR(ptr: mp.parent);
3535
3536	if (check_mnt(mnt: old)) {
3537	/ if the source is in our namespace... /
3538	/ ... it should be detachable from parent /
3539	if (!mnt_has_parent(mnt: old) \|\| IS_MNT_LOCKED(old))
3540	return -EINVAL;
3541	/ ... which should not be shared /
3542	if (IS_MNT_SHARED(old->mnt_parent))
3543	return -EINVAL;
3544	/ ... and the target should be in our namespace /
3545	if (!check_mnt(mnt: mp.parent))
3546	return -EINVAL;
3547	} else {
3548	/*
3549	* otherwise the source must be the root of some anon namespace.
3550	*/
3551	if (!anon_ns_root(m: old))
3552	return -EINVAL;
3553	/*
3554	* Bail out early if the target is within the same namespace -
3555	* subsequent checks would've rejected that, but they lose
3556	* some corner cases if we check it early.
3557	*/
3558	if (old->mnt_ns == mp.parent->mnt_ns)
3559	return -EINVAL;
3560	/*
3561	* Target should be either in our namespace or in an acceptable
3562	* anon namespace, sensu check_anonymous_mnt().
3563	*/
3564	if (!may_use_mount(mnt: mp.parent))
3565	return -EINVAL;
3566	}
3567
3568	if (beneath) {
3569	struct mount *over = real_mount(mnt: new_path->mnt);
3570
3571	if (mp.parent != over->mnt_parent)
3572	over = mp.parent->overmount;
3573	err = can_move_mount_beneath(mnt_from: old, mnt_to: over, mp: mp.mp);
3574	if (err)
3575	return err;
3576	}
3577
3578	/*
3579	* Don't move a mount tree containing unbindable mounts to a destination
3580	* mount which is shared.
3581	*/
3582	if (IS_MNT_SHARED(mp.parent) && tree_contains_unbindable(mnt: old))
3583	return -EINVAL;
3584	if (!check_for_nsfs_mounts(subtree: old))
3585	return -ELOOP;
3586	if (mount_is_ancestor(p1: old, p2: mp.parent))
3587	return -ELOOP;
3588
3589	return attach_recursive_mnt(source_mnt: old, dest: &mp);
3590	}
3591
3592	static int do_move_mount_old(const struct path path, const* char *old_name)
3593	{
3594	struct path old_path __free(path_put) = {};
3595	int err;
3596
3597	if (!old_name \|\| !*old_name)
3598	return -EINVAL;
3599
3600	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
3601	if (err)
3602	return err;
3603
3604	return do_move_mount(old_path: &old_path, new_path: path, flags: `0`);
3605	}
3606
3607	/*
3608	* add a mount into a namespace's mount tree
3609	*/
3610	static int do_add_mount(struct mount newmnt, const* struct pinned_mountpoint *mp,
3611	int mnt_flags)
3612	{
3613	struct mount *parent = mp->parent;
3614
3615	if (IS_ERR(ptr: parent))
3616	return PTR_ERR(ptr: parent);
3617
3618	mnt_flags &= ~MNT_INTERNAL_FLAGS;
3619
3620	if (unlikely(!check_mnt(parent))) {
3621	/ that's acceptable only for automounts done in private ns /
3622	if (!(mnt_flags & MNT_SHRINKABLE))
3623	return -EINVAL;
3624	/ ... and for those we'd better have mountpoint still alive /
3625	if (!parent->mnt_ns)
3626	return -EINVAL;
3627	}
3628
3629	/ Refuse the same filesystem on the same mount point /
3630	if (parent->mnt.mnt_sb == newmnt->mnt.mnt_sb &&
3631	parent->mnt.mnt_root == mp->mp->m_dentry)
3632	return -EBUSY;
3633
3634	if (d_is_symlink(dentry: newmnt->mnt.mnt_root))
3635	return -EINVAL;
3636
3637	newmnt->mnt.mnt_flags = mnt_flags;
3638	return graft_tree(mnt: newmnt, mp);
3639	}
3640
3641	static bool mount_too_revealing(const struct super_block sb, int* *new_mnt_flags);
3642
3643	/*
3644	* Create a new mount using a superblock configuration and request it
3645	* be added to the namespace tree.
3646	*/
3647	static int do_new_mount_fc(struct fs_context fc, const* struct path *mountpoint,
3648	unsigned int mnt_flags)
3649	{
3650	struct super_block *sb;
3651	struct vfsmount *mnt __free(mntput) = fc_mount(fc);
3652	int error;
3653
3654	if (IS_ERR(ptr: mnt))
3655	return PTR_ERR(ptr: mnt);
3656
3657	sb = fc->root->d_sb;
3658	error = security_sb_kern_mount(sb);
3659	if (unlikely(error))
3660	return error;
3661
3662	if (unlikely(mount_too_revealing(sb, &mnt_flags))) {
3663	errorfcp(fc, "VFS", "Mount too revealing");
3664	return -EPERM;
3665	}
3666
3667	mnt_warn_timestamp_expiry(mountpoint, mnt);
3668
3669	LOCK_MOUNT(mp, mountpoint);
3670	error = do_add_mount(newmnt: real_mount(mnt), mp: &mp, mnt_flags);
3671	if (!error)
3672	retain_and_null_ptr(mnt); // consumed on success
3673	return error;
3674	}
3675
3676	/*
3677	* create a new mount for userspace and request it to be added into the
3678	* namespace's tree
3679	*/
3680	static int do_new_mount(const struct path path, const* char *fstype,
3681	int sb_flags, int mnt_flags,
3682	const char name, void* *data)
3683	{
3684	struct file_system_type *type;
3685	struct fs_context *fc;
3686	const char *subtype = NULL;
3687	int err = `0`;
3688
3689	if (!fstype)
3690	return -EINVAL;
3691
3692	type = get_fs_type(name: fstype);
3693	if (!type)
3694	return -ENODEV;
3695
3696	if (type->fs_flags & FS_HAS_SUBTYPE) {
3697	subtype = strchr(fstype, `'.'`);
3698	if (subtype) {
3699	subtype++;
3700	if (!*subtype) {
3701	put_filesystem(fs: type);
3702	return -EINVAL;
3703	}
3704	}
3705	}
3706
3707	fc = fs_context_for_mount(fs_type: type, sb_flags);
3708	put_filesystem(fs: type);
3709	if (IS_ERR(ptr: fc))
3710	return PTR_ERR(ptr: fc);
3711
3712	/*
3713	* Indicate to the filesystem that the mount request is coming
3714	* from the legacy mount system call.
3715	*/
3716	fc->oldapi = true;
3717
3718	if (subtype)
3719	err = vfs_parse_fs_string(fc, key: "subtype", value: subtype);
3720	if (!err && name)
3721	err = vfs_parse_fs_string(fc, key: "source", value: name);
3722	if (!err)
3723	err = parse_monolithic_mount_data(fc, data);
3724	if (!err && !mount_capable(fc))
3725	err = -EPERM;
3726	if (!err)
3727	err = do_new_mount_fc(fc, mountpoint: path, mnt_flags);
3728
3729	put_fs_context(fc);
3730	return err;
3731	}
3732
3733	static void lock_mount_exact(const struct path *path,
3734	struct pinned_mountpoint *mp)
3735	{
3736	struct dentry *dentry = path->dentry;
3737	int err;
3738
3739	inode_lock(inode: dentry->d_inode);
3740	namespace_lock();
3741	if (unlikely(cant_mount(dentry)))
3742	err = -ENOENT;
3743	else if (path_overmounted(path))
3744	err = -EBUSY;
3745	else
3746	err = get_mountpoint(dentry, m: mp);
3747	if (unlikely(err)) {
3748	namespace_unlock();
3749	inode_unlock(inode: dentry->d_inode);
3750	mp->parent = ERR_PTR(error: err);
3751	} else {
3752	mp->parent = real_mount(mnt: path->mnt);
3753	}
3754	}
3755
3756	int finish_automount(struct vfsmount __m, const* struct path *path)
3757	{
3758	struct vfsmount *m __free(mntput) = __m;
3759	struct mount *mnt;
3760	int err;
3761
3762	if (!m)
3763	return `0`;
3764	if (IS_ERR(ptr: m))
3765	return PTR_ERR(ptr: m);
3766
3767	mnt = real_mount(mnt: m);
3768
3769	if (m->mnt_root == path->dentry)
3770	return -ELOOP;
3771
3772	/*
3773	* we don't want to use LOCK_MOUNT() - in this case finding something
3774	* that overmounts our mountpoint to be means "quitely drop what we've
3775	* got", not "try to mount it on top".
3776	*/
3777	LOCK_MOUNT_EXACT(mp, path);
3778	if (mp.parent == ERR_PTR(error: -EBUSY))
3779	return `0`;
3780
3781	err = do_add_mount(newmnt: mnt, mp: &mp, mnt_flags: path->mnt->mnt_flags \| MNT_SHRINKABLE);
3782	if (likely(!err))
3783	retain_and_null_ptr(m);
3784	return err;
3785	}
3786
3787	/**
3788	* mnt_set_expiry - Put a mount on an expiration list
3789	* @mnt: The mount to list.
3790	* @expiry_list: The list to add the mount to.
3791	*/
3792	void mnt_set_expiry(struct vfsmount mnt, struct* list_head *expiry_list)
3793	{
3794	guard(mount_locked_reader)();
3795	list_add_tail(new: &real_mount(mnt)->mnt_expire, head: expiry_list);
3796	}
3797	EXPORT_SYMBOL(mnt_set_expiry);
3798
3799	/*
3800	* process a list of expirable mountpoints with the intent of discarding any
3801	* mountpoints that aren't in use and haven't been touched since last we came
3802	* here
3803	*/
3804	void mark_mounts_for_expiry(struct list_head *mounts)
3805	{
3806	struct mount mnt, next;
3807	LIST_HEAD(graveyard);
3808
3809	if (list_empty(head: mounts))
3810	return;
3811
3812	guard(namespace_excl)();
3813	guard(mount_writer)();
3814
3815	/ extract from the expiration list every vfsmount that matches the*
3816	* following criteria:
3817	* - already mounted
3818	* - only referenced by its parent vfsmount
3819	* - still marked for expiry (marked on the last call here; marks are
3820	* cleared by mntput())
3821	*/
3822	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
3823	if (!is_mounted(mnt: &mnt->mnt))
3824	continue;
3825	if (!xchg(&mnt->mnt_expiry_mark, `1`) \|\|
3826	propagate_mount_busy(mnt, `1`))
3827	continue;
3828	list_move(list: &mnt->mnt_expire, head: &graveyard);
3829	}
3830	while (!list_empty(head: &graveyard)) {
3831	mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
3832	touch_mnt_namespace(ns: mnt->mnt_ns);
3833	umount_tree(mnt, how: UMOUNT_PROPAGATE\|UMOUNT_SYNC);
3834	}
3835	}
3836
3837	EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
3838
3839	/*
3840	* Ripoff of 'select_parent()'
3841	*
3842	* search the list of submounts for a given mountpoint, and move any
3843	* shrinkable submounts to the 'graveyard' list.
3844	*/
3845	static int select_submounts(struct mount parent, struct* list_head *graveyard)
3846	{
3847	struct mount *this_parent = parent;
3848	struct list_head *next;
3849	int found = `0`;
3850
3851	repeat:
3852	next = this_parent->mnt_mounts.next;
3853	resume:
3854	while (next != &this_parent->mnt_mounts) {
3855	struct list_head *tmp = next;
3856	struct mount mnt = list_entry(tmp, struct* mount, mnt_child);
3857
3858	next = tmp->next;
3859	if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
3860	continue;
3861	/*
3862	* Descend a level if the d_mounts list is non-empty.
3863	*/
3864	if (!list_empty(head: &mnt->mnt_mounts)) {
3865	this_parent = mnt;
3866	goto repeat;
3867	}
3868
3869	if (!propagate_mount_busy(mnt, `1`)) {
3870	list_move_tail(list: &mnt->mnt_expire, head: graveyard);
3871	found++;
3872	}
3873	}
3874	/*
3875	* All done at this level ... ascend and resume the search
3876	*/
3877	if (this_parent != parent) {
3878	next = this_parent->mnt_child.next;
3879	this_parent = this_parent->mnt_parent;
3880	goto resume;
3881	}
3882	return found;
3883	}
3884
3885	/*
3886	* process a list of expirable mountpoints with the intent of discarding any
3887	* submounts of a specific parent mountpoint
3888	*
3889	* mount_lock must be held for write
3890	*/
3891	static void shrink_submounts(struct mount *mnt)
3892	{
3893	LIST_HEAD(graveyard);
3894	struct mount *m;
3895
3896	/ extract submounts of 'mountpoint' from the expiration list /
3897	while (select_submounts(parent: mnt, graveyard: &graveyard)) {
3898	while (!list_empty(head: &graveyard)) {
3899	m = list_first_entry(&graveyard, struct mount,
3900	mnt_expire);
3901	touch_mnt_namespace(ns: m->mnt_ns);
3902	umount_tree(mnt: m, how: UMOUNT_PROPAGATE\|UMOUNT_SYNC);
3903	}
3904	}
3905	}
3906
3907	static void copy_mount_options(const* void __user * data)
3908	{
3909	char *copy;
3910	unsigned left, offset;
3911
3912	if (!data)
3913	return NULL;
3914
3915	copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
3916	if (!copy)
3917	return ERR_PTR(error: -ENOMEM);
3918
3919	left = copy_from_user(to: copy, from: data, PAGE_SIZE);
3920
3921	/*
3922	* Not all architectures have an exact copy_from_user(). Resort to
3923	* byte at a time.
3924	*/
3925	offset = PAGE_SIZE - left;
3926	while (left) {
3927	char c;
3928	if (get_user(c, (const char __user *)data + offset))
3929	break;
3930	copy[offset] = c;
3931	left--;
3932	offset++;
3933	}
3934
3935	if (left == PAGE_SIZE) {
3936	kfree(objp: copy);
3937	return ERR_PTR(error: -EFAULT);
3938	}
3939
3940	return copy;
3941	}
3942
3943	static char copy_mount_string(const* void __user *data)
3944	{
3945	return data ? strndup_user(data, PATH_MAX) : NULL;
3946	}
3947
3948	/*
3949	* Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
3950	* be given to the mount() call (ie: read-only, no-dev, no-suid etc).
3951	*
3952	* data is a (void *) that can point to any structure up to
3953	* PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
3954	* information (or be NULL).
3955	*
3956	* Pre-0.97 versions of mount() didn't have a flags word.
3957	* When the flags word was introduced its top half was required
3958	* to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
3959	* Therefore, if this magic number is present, it carries no information
3960	* and must be discarded.
3961	*/
3962	int path_mount(const char dev_name, const* struct path *path,
3963	const char type_page, unsigned* long flags, void *data_page)
3964	{
3965	unsigned int mnt_flags = `0`, sb_flags;
3966	int ret;
3967
3968	/ Discard magic /
3969	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
3970	flags &= ~MS_MGC_MSK;
3971
3972	/ Basic sanity checks /
3973	if (data_page)
3974	((char *)data_page)[PAGE_SIZE - `1`] = `0`;
3975
3976	if (flags & MS_NOUSER)
3977	return -EINVAL;
3978
3979	ret = security_sb_mount(dev_name, path, type: type_page, flags, data: data_page);
3980	if (ret)
3981	return ret;
3982	if (!may_mount())
3983	return -EPERM;
3984	if (flags & SB_MANDLOCK)
3985	warn_mandlock();
3986
3987	/ Default to relatime unless overriden /
3988	if (!(flags & MS_NOATIME))
3989	mnt_flags \|= MNT_RELATIME;
3990
3991	/ Separate the per-mountpoint flags /
3992	if (flags & MS_NOSUID)
3993	mnt_flags \|= MNT_NOSUID;
3994	if (flags & MS_NODEV)
3995	mnt_flags \|= MNT_NODEV;
3996	if (flags & MS_NOEXEC)
3997	mnt_flags \|= MNT_NOEXEC;
3998	if (flags & MS_NOATIME)
3999	mnt_flags \|= MNT_NOATIME;
4000	if (flags & MS_NODIRATIME)
4001	mnt_flags \|= MNT_NODIRATIME;
4002	if (flags & MS_STRICTATIME)
4003	mnt_flags &= ~(MNT_RELATIME \| MNT_NOATIME);
4004	if (flags & MS_RDONLY)
4005	mnt_flags \|= MNT_READONLY;
4006	if (flags & MS_NOSYMFOLLOW)
4007	mnt_flags \|= MNT_NOSYMFOLLOW;
4008
4009	/ The default atime for remount is preservation /
4010	if ((flags & MS_REMOUNT) &&
4011	((flags & (MS_NOATIME \| MS_NODIRATIME \| MS_RELATIME \|
4012	MS_STRICTATIME)) == `0`)) {
4013	mnt_flags &= ~MNT_ATIME_MASK;
4014	mnt_flags \|= path->mnt->mnt_flags & MNT_ATIME_MASK;
4015	}
4016
4017	sb_flags = flags & (SB_RDONLY \|
4018	SB_SYNCHRONOUS \|
4019	SB_MANDLOCK \|
4020	SB_DIRSYNC \|
4021	SB_SILENT \|
4022	SB_POSIXACL \|
4023	SB_LAZYTIME \|
4024	SB_I_VERSION);
4025
4026	if ((flags & (MS_REMOUNT \| MS_BIND)) == (MS_REMOUNT \| MS_BIND))
4027	return do_reconfigure_mnt(path, mnt_flags);
4028	if (flags & MS_REMOUNT)
4029	return do_remount(path, sb_flags, mnt_flags, data: data_page);
4030	if (flags & MS_BIND)
4031	return do_loopback(path, old_name: dev_name, recurse: flags & MS_REC);
4032	if (flags & (MS_SHARED \| MS_PRIVATE \| MS_SLAVE \| MS_UNBINDABLE))
4033	return do_change_type(path, ms_flags: flags);
4034	if (flags & MS_MOVE)
4035	return do_move_mount_old(path, old_name: dev_name);
4036
4037	return do_new_mount(path, fstype: type_page, sb_flags, mnt_flags, name: dev_name,
4038	data: data_page);
4039	}
4040
4041	int do_mount(const char dev_name, const* char __user *dir_name,
4042	const char type_page, unsigned* long flags, void *data_page)
4043	{
4044	struct path path __free(path_put) = {};
4045	int ret;
4046
4047	ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
4048	if (ret)
4049	return ret;
4050	return path_mount(dev_name, path: &path, type_page, flags, data_page);
4051	}
4052
4053	static struct ucounts inc_mnt_namespaces(struct* user_namespace *ns)
4054	{
4055	return inc_ucount(ns, current_euid(), type: UCOUNT_MNT_NAMESPACES);
4056	}
4057
4058	static void dec_mnt_namespaces(struct ucounts *ucounts)
4059	{
4060	dec_ucount(ucounts, type: UCOUNT_MNT_NAMESPACES);
4061	}
4062
4063	static void free_mnt_ns(struct mnt_namespace *ns)
4064	{
4065	if (!is_anon_ns(ns))
4066	ns_common_free(ns);
4067	dec_mnt_namespaces(ucounts: ns->ucounts);
4068	mnt_ns_tree_remove(ns);
4069	}
4070
4071	static struct mnt_namespace alloc_mnt_ns(struct* user_namespace *user_ns, bool anon)
4072	{
4073	struct mnt_namespace *new_ns;
4074	struct ucounts *ucounts;
4075	int ret;
4076
4077	ucounts = inc_mnt_namespaces(ns: user_ns);
4078	if (!ucounts)
4079	return ERR_PTR(error: -ENOSPC);
4080
4081	new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT);
4082	if (!new_ns) {
4083	dec_mnt_namespaces(ucounts);
4084	return ERR_PTR(error: -ENOMEM);
4085	}
4086
4087	if (anon)
4088	ret = ns_common_init_inum(new_ns, MNT_NS_ANON_INO);
4089	else
4090	ret = ns_common_init(new_ns);
4091	if (ret) {
4092	kfree(objp: new_ns);
4093	dec_mnt_namespaces(ucounts);
4094	return ERR_PTR(error: ret);
4095	}
4096	if (!anon)
4097	ns_tree_gen_id(ns: &new_ns->ns);
4098	refcount_set(r: &new_ns->passive, n: `1`);
4099	new_ns->mounts = RB_ROOT;
4100	init_waitqueue_head(&new_ns->poll);
4101	new_ns->user_ns = get_user_ns(ns: user_ns);
4102	new_ns->ucounts = ucounts;
4103	return new_ns;
4104	}
4105
4106	__latent_entropy
4107	struct mnt_namespace copy_mnt_ns(u64 flags, struct* mnt_namespace *ns,
4108	struct user_namespace user_ns, struct* fs_struct *new_fs)
4109	{
4110	struct mnt_namespace *new_ns;
4111	struct vfsmount *rootmnt __free(mntput) = NULL;
4112	struct vfsmount *pwdmnt __free(mntput) = NULL;
4113	struct mount p, q;
4114	struct mount *old;
4115	struct mount *new;
4116	int copy_flags;
4117
4118	BUG_ON(!ns);
4119
4120	if (likely(!(flags & CLONE_NEWNS))) {
4121	get_mnt_ns(ns);
4122	return ns;
4123	}
4124
4125	old = ns->root;
4126
4127	new_ns = alloc_mnt_ns(user_ns, anon: false);
4128	if (IS_ERR(ptr: new_ns))
4129	return new_ns;
4130
4131	guard(namespace_excl)();
4132	/ First pass: copy the tree topology /
4133	copy_flags = CL_COPY_UNBINDABLE \| CL_EXPIRE;
4134	if (user_ns != ns->user_ns)
4135	copy_flags \|= CL_SLAVE;
4136	new = copy_tree(src_root: old, dentry: old->mnt.mnt_root, flag: copy_flags);
4137	if (IS_ERR(ptr: new)) {
4138	emptied_ns = new_ns;
4139	return ERR_CAST(ptr: new);
4140	}
4141	if (user_ns != ns->user_ns) {
4142	guard(mount_writer)();
4143	lock_mnt_tree(mnt: new);
4144	}
4145	new_ns->root = new;
4146
4147	/*
4148	* Second pass: switch the tsk->fs->* elements and mark new vfsmounts
4149	* as belonging to new namespace. We have already acquired a private
4150	* fs_struct, so tsk->fs->lock is not needed.
4151	*/
4152	p = old;
4153	q = new;
4154	while (p) {
4155	mnt_add_to_ns(ns: new_ns, mnt: q);
4156	new_ns->nr_mounts++;
4157	if (new_fs) {
4158	if (&p->mnt == new_fs->root.mnt) {
4159	new_fs->root.mnt = mntget(&q->mnt);
4160	rootmnt = &p->mnt;
4161	}
4162	if (&p->mnt == new_fs->pwd.mnt) {
4163	new_fs->pwd.mnt = mntget(&q->mnt);
4164	pwdmnt = &p->mnt;
4165	}
4166	}
4167	p = next_mnt(p, root: old);
4168	q = next_mnt(p: q, root: new);
4169	if (!q)
4170	break;
4171	// an mntns binding we'd skipped?
4172	while (p->mnt.mnt_root != q->mnt.mnt_root)
4173	p = next_mnt(p: skip_mnt_tree(p), root: old);
4174	}
4175	ns_tree_add_raw(new_ns);
4176	return new_ns;
4177	}
4178
4179	struct dentry mount_subtree(struct* vfsmount m, const* char *name)
4180	{
4181	struct mount *mnt = real_mount(mnt: m);
4182	struct mnt_namespace *ns;
4183	struct super_block *s;
4184	struct path path;
4185	int err;
4186
4187	ns = alloc_mnt_ns(user_ns: &init_user_ns, anon: true);
4188	if (IS_ERR(ptr: ns)) {
4189	mntput(m);
4190	return ERR_CAST(ptr: ns);
4191	}
4192	ns->root = mnt;
4193	ns->nr_mounts++;
4194	mnt_add_to_ns(ns, mnt);
4195
4196	err = vfs_path_lookup(m->mnt_root, m,
4197	name, LOOKUP_FOLLOW\|LOOKUP_AUTOMOUNT, &path);
4198
4199	put_mnt_ns(ns);
4200
4201	if (err)
4202	return ERR_PTR(error: err);
4203
4204	/ trade a vfsmount reference for active sb one /
4205	s = path.mnt->mnt_sb;
4206	atomic_inc(v: &s->s_active);
4207	mntput(path.mnt);
4208	/ lock the sucker /
4209	down_write(sem: &s->s_umount);
4210	/ ... and return the root of (sub)tree on it /
4211	return path.dentry;
4212	}
4213	EXPORT_SYMBOL(mount_subtree);
4214
4215	SYSCALL_DEFINE5(mount, char __user , dev_name, char* __user *, dir_name,
4216	char __user , type, unsigned* long, flags, void __user *, data)
4217	{
4218	int ret;
4219	char *kernel_type;
4220	char *kernel_dev;
4221	void *options;
4222
4223	kernel_type = copy_mount_string(data: type);
4224	ret = PTR_ERR(ptr: kernel_type);
4225	if (IS_ERR(ptr: kernel_type))
4226	goto out_type;
4227
4228	kernel_dev = copy_mount_string(data: dev_name);
4229	ret = PTR_ERR(ptr: kernel_dev);
4230	if (IS_ERR(ptr: kernel_dev))
4231	goto out_dev;
4232
4233	options = copy_mount_options(data);
4234	ret = PTR_ERR(ptr: options);
4235	if (IS_ERR(ptr: options))
4236	goto out_data;
4237
4238	ret = do_mount(dev_name: kernel_dev, dir_name, type_page: kernel_type, flags, data_page: options);
4239
4240	kfree(objp: options);
4241	out_data:
4242	kfree(objp: kernel_dev);
4243	out_dev:
4244	kfree(objp: kernel_type);
4245	out_type:
4246	return ret;
4247	}
4248
/* MOUNT_ATTR_* bits that fsmount() accepts in attr_flags. */
#define FSMOUNT_VALID_FLAGS                                                    \
	(MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |            \
	 MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME |       \
	 MOUNT_ATTR_NOSYMFOLLOW)

/* The fsmount() set plus MOUNT_ATTR_IDMAP. */
#define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)

/* The four MS_* propagation type bits. */
#define MOUNT_SETATTR_PROPAGATION_FLAGS \
	(MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)
4258
4259	static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
4260	{
4261	unsigned int mnt_flags = `0`;
4262
4263	if (attr_flags & MOUNT_ATTR_RDONLY)
4264	mnt_flags \|= MNT_READONLY;
4265	if (attr_flags & MOUNT_ATTR_NOSUID)
4266	mnt_flags \|= MNT_NOSUID;
4267	if (attr_flags & MOUNT_ATTR_NODEV)
4268	mnt_flags \|= MNT_NODEV;
4269	if (attr_flags & MOUNT_ATTR_NOEXEC)
4270	mnt_flags \|= MNT_NOEXEC;
4271	if (attr_flags & MOUNT_ATTR_NODIRATIME)
4272	mnt_flags \|= MNT_NODIRATIME;
4273	if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW)
4274	mnt_flags \|= MNT_NOSYMFOLLOW;
4275
4276	return mnt_flags;
4277	}
4278
4279	/*
4280	* Create a kernel mount representation for a new, prepared superblock
4281	* (specified by fs_fd) and attach to an open_tree-like file descriptor.
4282	*/
4283	SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
4284	unsigned int, attr_flags)
4285	{
4286	struct mnt_namespace *ns;
4287	struct fs_context *fc;
4288	struct file *file;
4289	struct path newmount;
4290	struct mount *mnt;
4291	unsigned int mnt_flags = `0`;
4292	long ret;
4293
4294	if (!may_mount())
4295	return -EPERM;
4296
4297	if ((flags & ~(FSMOUNT_CLOEXEC)) != `0`)
4298	return -EINVAL;
4299
4300	if (attr_flags & ~FSMOUNT_VALID_FLAGS)
4301	return -EINVAL;
4302
4303	mnt_flags = attr_flags_to_mnt_flags(attr_flags);
4304
4305	switch (attr_flags & MOUNT_ATTR__ATIME) {
4306	case MOUNT_ATTR_STRICTATIME:
4307	break;
4308	case MOUNT_ATTR_NOATIME:
4309	mnt_flags \|= MNT_NOATIME;
4310	break;
4311	case MOUNT_ATTR_RELATIME:
4312	mnt_flags \|= MNT_RELATIME;
4313	break;
4314	default:
4315	return -EINVAL;
4316	}
4317
4318	CLASS(fd, f)(fd: fs_fd);
4319	if (fd_empty(f))
4320	return -EBADF;
4321
4322	if (fd_file(f)->f_op != &fscontext_fops)
4323	return -EINVAL;
4324
4325	fc = fd_file(f)->private_data;
4326
4327	ret = mutex_lock_interruptible(lock: &fc->uapi_mutex);
4328	if (ret < `0`)
4329	return ret;
4330
4331	/ There must be a valid superblock or we can't mount it /
4332	ret = -EINVAL;
4333	if (!fc->root)
4334	goto err_unlock;
4335
4336	ret = -EPERM;
4337	if (mount_too_revealing(sb: fc->root->d_sb, new_mnt_flags: &mnt_flags)) {
4338	errorfcp(fc, "VFS", "Mount too revealing");
4339	goto err_unlock;
4340	}
4341
4342	ret = -EBUSY;
4343	if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
4344	goto err_unlock;
4345
4346	if (fc->sb_flags & SB_MANDLOCK)
4347	warn_mandlock();
4348
4349	newmount.mnt = vfs_create_mount(fc);
4350	if (IS_ERR(ptr: newmount.mnt)) {
4351	ret = PTR_ERR(ptr: newmount.mnt);
4352	goto err_unlock;
4353	}
4354	newmount.dentry = dget(dentry: fc->root);
4355	newmount.mnt->mnt_flags = mnt_flags;
4356
4357	/ We've done the mount bit - now move the file context into more or*
4358	* less the same state as if we'd done an fspick(). We don't want to
4359	* do any memory allocation or anything like that at this point as we
4360	* don't want to have to handle any errors incurred.
4361	*/
4362	vfs_clean_context(fc);
4363
4364	ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, anon: true);
4365	if (IS_ERR(ptr: ns)) {
4366	ret = PTR_ERR(ptr: ns);
4367	goto err_path;
4368	}
4369	mnt = real_mount(mnt: newmount.mnt);
4370	ns->root = mnt;
4371	ns->nr_mounts = `1`;
4372	mnt_add_to_ns(ns, mnt);
4373	mntget(newmount.mnt);
4374
4375	/ Attach to an apparent O_PATH fd with a note that we need to unmount*
4376	* it, not just simply put it.
4377	*/
4378	file = dentry_open(path: &newmount, O_PATH, creds: fc->cred);
4379	if (IS_ERR(ptr: file)) {
4380	dissolve_on_fput(mnt: newmount.mnt);
4381	ret = PTR_ERR(ptr: file);
4382	goto err_path;
4383	}
4384	file->f_mode \|= FMODE_NEED_UNMOUNT;
4385
4386	ret = get_unused_fd_flags(flags: (flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : `0`);
4387	if (ret >= `0`)
4388	fd_install(fd: ret, file);
4389	else
4390	fput(file);
4391
4392	err_path:
4393	path_put(&newmount);
4394	err_unlock:
4395	mutex_unlock(lock: &fc->uapi_mutex);
4396	return ret;
4397	}
4398
4399	static inline int vfs_move_mount(const struct path *from_path,
4400	const struct path *to_path,
4401	enum mnt_tree_flags_t mflags)
4402	{
4403	int ret;
4404
4405	ret = security_move_mount(from_path, to_path);
4406	if (ret)
4407	return ret;
4408
4409	if (mflags & MNT_TREE_PROPAGATION)
4410	return do_set_group(from_path, to_path);
4411
4412	return do_move_mount(old_path: from_path, new_path: to_path, flags: mflags);
4413	}
4414
4415	/*
4416	* Move a mount from one place to another. In combination with
4417	* fsopen()/fsmount() this is used to install a new mount and in combination
4418	* with open_tree(OPEN_TREE_CLONE [\| AT_RECURSIVE]) it can be used to copy
4419	* a mount subtree.
4420	*
4421	* Note the flags value is a combination of MOVE_MOUNT_* flags.
4422	*/
4423	SYSCALL_DEFINE5(move_mount,
4424	int, from_dfd, const char __user *, from_pathname,
4425	int, to_dfd, const char __user *, to_pathname,
4426	unsigned int, flags)
4427	{
4428	struct path to_path __free(path_put) = {};
4429	struct path from_path __free(path_put) = {};
4430	struct filename *to_name __free(putname) = NULL;
4431	struct filename *from_name __free(putname) = NULL;
4432	unsigned int lflags, uflags;
4433	enum mnt_tree_flags_t mflags = `0`;
4434	int ret = `0`;
4435
4436	if (!may_mount())
4437	return -EPERM;
4438
4439	if (flags & ~MOVE_MOUNT__MASK)
4440	return -EINVAL;
4441
4442	if ((flags & (MOVE_MOUNT_BENEATH \| MOVE_MOUNT_SET_GROUP)) ==
4443	(MOVE_MOUNT_BENEATH \| MOVE_MOUNT_SET_GROUP))
4444	return -EINVAL;
4445
4446	if (flags & MOVE_MOUNT_SET_GROUP) mflags \|= MNT_TREE_PROPAGATION;
4447	if (flags & MOVE_MOUNT_BENEATH) mflags \|= MNT_TREE_BENEATH;
4448
4449	uflags = `0`;
4450	if (flags & MOVE_MOUNT_T_EMPTY_PATH)
4451	uflags = AT_EMPTY_PATH;
4452
4453	to_name = getname_maybe_null(name: to_pathname, flags: uflags);
4454	if (IS_ERR(ptr: to_name))
4455	return PTR_ERR(ptr: to_name);
4456
4457	if (!to_name && to_dfd >= `0`) {
4458	CLASS(fd_raw, f_to)(fd: to_dfd);
4459	if (fd_empty(f: f_to))
4460	return -EBADF;
4461
4462	to_path = fd_file(f_to)->f_path;
4463	path_get(&to_path);
4464	} else {
4465	lflags = `0`;
4466	if (flags & MOVE_MOUNT_T_SYMLINKS)
4467	lflags \|= LOOKUP_FOLLOW;
4468	if (flags & MOVE_MOUNT_T_AUTOMOUNTS)
4469	lflags \|= LOOKUP_AUTOMOUNT;
4470	ret = filename_lookup(dfd: to_dfd, name: to_name, flags: lflags, path: &to_path, NULL);
4471	if (ret)
4472	return ret;
4473	}
4474
4475	uflags = `0`;
4476	if (flags & MOVE_MOUNT_F_EMPTY_PATH)
4477	uflags = AT_EMPTY_PATH;
4478
4479	from_name = getname_maybe_null(name: from_pathname, flags: uflags);
4480	if (IS_ERR(ptr: from_name))
4481	return PTR_ERR(ptr: from_name);
4482
4483	if (!from_name && from_dfd >= `0`) {
4484	CLASS(fd_raw, f_from)(fd: from_dfd);
4485	if (fd_empty(f: f_from))
4486	return -EBADF;
4487
4488	return vfs_move_mount(from_path: &fd_file(f_from)->f_path, to_path: &to_path, mflags);
4489	}
4490
4491	lflags = `0`;
4492	if (flags & MOVE_MOUNT_F_SYMLINKS)
4493	lflags \|= LOOKUP_FOLLOW;
4494	if (flags & MOVE_MOUNT_F_AUTOMOUNTS)
4495	lflags \|= LOOKUP_AUTOMOUNT;
4496	ret = filename_lookup(dfd: from_dfd, name: from_name, flags: lflags, path: &from_path, NULL);
4497	if (ret)
4498	return ret;
4499
4500	return vfs_move_mount(from_path: &from_path, to_path: &to_path, mflags);
4501	}
4502
4503	/*
4504	* Return true if path is reachable from root
4505	*
4506	* locks: mount_locked_reader \|\| namespace_shared && is_mounted(mnt)
4507	*/
4508	bool is_path_reachable(struct mount mnt, struct* dentry *dentry,
4509	const struct path *root)
4510	{
4511	while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
4512	dentry = mnt->mnt_mountpoint;
4513	mnt = mnt->mnt_parent;
4514	}
4515	return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
4516	}
4517
4518	bool path_is_under(const struct path path1, const* struct path *path2)
4519	{
4520	guard(mount_locked_reader)();
4521	return is_path_reachable(mnt: real_mount(mnt: path1->mnt), dentry: path1->dentry, root: path2);
4522	}
4523	EXPORT_SYMBOL(path_is_under);
4524
4525	/*
4526	* pivot_root Semantics:
4527	* Moves the root file system of the current process to the directory put_old,
4528	* makes new_root as the new root file system of the current process, and sets
4529	* root/cwd of all processes which had them on the current root to new_root.
4530	*
4531	* Restrictions:
4532	* The new_root and put_old must be directories, and must not be on the
4533	* same file system as the current process root. The put_old must be
4534	* underneath new_root, i.e. adding a non-zero number of /.. to the string
4535	* pointed to by put_old must yield the same directory as new_root. No other
4536	* file system may be mounted on put_old. After all, new_root is a mountpoint.
4537	*
4538	* Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
4539	* See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
4540	* in this situation.
4541	*
4542	* Notes:
4543	* - we don't move root/cwd if they are not at the root (reason: if something
4544	* cared enough to change them, it's probably wrong to force them elsewhere)
4545	* - it's okay to pick a root that isn't the root of a file system, e.g.
4546	* /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
4547	* though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
4548	* first.
4549	*/
4550	SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
4551	const char __user *, put_old)
4552	{
4553	struct path new __free(path_put) = {};
4554	struct path old __free(path_put) = {};
4555	struct path root __free(path_put) = {};
4556	struct mount new_mnt, root_mnt, old_mnt, root_parent, *ex_parent;
4557	int error;
4558
4559	if (!may_mount())
4560	return -EPERM;
4561
4562	error = user_path_at(AT_FDCWD, new_root,
4563	LOOKUP_FOLLOW \| LOOKUP_DIRECTORY, &new);
4564	if (error)
4565	return error;
4566
4567	error = user_path_at(AT_FDCWD, put_old,
4568	LOOKUP_FOLLOW \| LOOKUP_DIRECTORY, &old);
4569	if (error)
4570	return error;
4571
4572	error = security_sb_pivotroot(old_path: &old, new_path: &new);
4573	if (error)
4574	return error;
4575
4576	get_fs_root(current->fs, root: &root);
4577
4578	LOCK_MOUNT(old_mp, &old);
4579	old_mnt = old_mp.parent;
4580	if (IS_ERR(ptr: old_mnt))
4581	return PTR_ERR(ptr: old_mnt);
4582
4583	new_mnt = real_mount(mnt: new.mnt);
4584	root_mnt = real_mount(mnt: root.mnt);
4585	ex_parent = new_mnt->mnt_parent;
4586	root_parent = root_mnt->mnt_parent;
4587	if (IS_MNT_SHARED(old_mnt) \|\|
4588	IS_MNT_SHARED(ex_parent) \|\|
4589	IS_MNT_SHARED(root_parent))
4590	return -EINVAL;
4591	if (!check_mnt(mnt: root_mnt) \|\| !check_mnt(mnt: new_mnt))
4592	return -EINVAL;
4593	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
4594	return -EINVAL;
4595	if (d_unlinked(dentry: new.dentry))
4596	return -ENOENT;
4597	if (new_mnt == root_mnt \|\| old_mnt == root_mnt)
4598	return -EBUSY; / loop, on the same file system /
4599	if (!path_mounted(path: &root))
4600	return -EINVAL; / not a mountpoint /
4601	if (!mnt_has_parent(mnt: root_mnt))
4602	return -EINVAL; / absolute root /
4603	if (!path_mounted(path: &new))
4604	return -EINVAL; / not a mountpoint /
4605	if (!mnt_has_parent(mnt: new_mnt))
4606	return -EINVAL; / absolute root /
4607	/ make sure we can reach put_old from new_root /
4608	if (!is_path_reachable(mnt: old_mnt, dentry: old_mp.mp->m_dentry, root: &new))
4609	return -EINVAL;
4610	/ make certain new is below the root /
4611	if (!is_path_reachable(mnt: new_mnt, dentry: new.dentry, root: &root))
4612	return -EINVAL;
4613	lock_mount_hash();
4614	umount_mnt(mnt: new_mnt);
4615	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
4616	new_mnt->mnt.mnt_flags \|= MNT_LOCKED;
4617	root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
4618	}
4619	/ mount new_root on / /
4620	attach_mnt(mnt: new_mnt, parent: root_parent, mp: root_mnt->mnt_mp);
4621	umount_mnt(mnt: root_mnt);
4622	/ mount old root on put_old /
4623	attach_mnt(mnt: root_mnt, parent: old_mnt, mp: old_mp.mp);
4624	touch_mnt_namespace(current->nsproxy->mnt_ns);
4625	/ A moved mount should not expire automatically /
4626	list_del_init(entry: &new_mnt->mnt_expire);
4627	unlock_mount_hash();
4628	mnt_notify_add(m: root_mnt);
4629	mnt_notify_add(m: new_mnt);
4630	chroot_fs_refs(&root, &new);
4631	return `0`;
4632	}
4633
4634	static unsigned int recalc_flags(struct mount_kattr kattr, struct* mount *mnt)
4635	{
4636	unsigned int flags = mnt->mnt.mnt_flags;
4637
4638	/ flags to clear /
4639	flags &= ~kattr->attr_clr;
4640	/ flags to raise /
4641	flags \|= kattr->attr_set;
4642
4643	return flags;
4644	}
4645
4646	static int can_idmap_mount(const struct mount_kattr kattr, struct* mount *mnt)
4647	{
4648	struct vfsmount *m = &mnt->mnt;
4649	struct user_namespace *fs_userns = m->mnt_sb->s_user_ns;
4650
4651	if (!kattr->mnt_idmap)
4652	return `0`;
4653
4654	/*
4655	* Creating an idmapped mount with the filesystem wide idmapping
4656	* doesn't make sense so block that. We don't allow mushy semantics.
4657	*/
4658	if (kattr->mnt_userns == m->mnt_sb->s_user_ns)
4659	return -EINVAL;
4660
4661	/*
4662	* We only allow an mount to change it's idmapping if it has
4663	* never been accessible to userspace.
4664	*/
4665	if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(mnt: m))
4666	return -EPERM;
4667
4668	/ The underlying filesystem doesn't support idmapped mounts yet. /
4669	if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
4670	return -EINVAL;
4671
4672	/ The filesystem has turned off idmapped mounts. /
4673	if (m->mnt_sb->s_iflags & SB_I_NOIDMAP)
4674	return -EINVAL;
4675
4676	/ We're not controlling the superblock. /
4677	if (!ns_capable(ns: fs_userns, CAP_SYS_ADMIN))
4678	return -EPERM;
4679
4680	/ Mount has already been visible in the filesystem hierarchy. /
4681	if (!is_anon_ns(ns: mnt->mnt_ns))
4682	return -EINVAL;
4683
4684	return `0`;
4685	}
4686
4687	/**
4688	* mnt_allow_writers() - check whether the attribute change allows writers
4689	* @kattr: the new mount attributes
4690	* @mnt: the mount to which @kattr will be applied
4691	*
4692	* Check whether thew new mount attributes in @kattr allow concurrent writers.
4693	*
4694	* Return: true if writers need to be held, false if not
4695	*/
4696	static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
4697	const struct mount *mnt)
4698	{
4699	return (!(kattr->attr_set & MNT_READONLY) \|\|
4700	(mnt->mnt.mnt_flags & MNT_READONLY)) &&
4701	!kattr->mnt_idmap;
4702	}
4703
4704	static int mount_setattr_prepare(struct mount_kattr kattr, struct* mount *mnt)
4705	{
4706	struct mount *m;
4707	int err;
4708
4709	for (m = mnt; m; m = next_mnt(p: m, root: mnt)) {
4710	if (!can_change_locked_flags(mnt: m, mnt_flags: recalc_flags(kattr, mnt: m))) {
4711	err = -EPERM;
4712	break;
4713	}
4714
4715	err = can_idmap_mount(kattr, mnt: m);
4716	if (err)
4717	break;
4718
4719	if (!mnt_allow_writers(kattr, mnt: m)) {
4720	err = mnt_hold_writers(mnt: m);
4721	if (err) {
4722	m = next_mnt(p: m, root: mnt);
4723	break;
4724	}
4725	}
4726
4727	if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
4728	return `0`;
4729	}
4730
4731	if (err) {
4732	/ undo all mnt_hold_writers() we'd done /
4733	for (struct mount *p = mnt; p != m; p = next_mnt(p, root: mnt))
4734	mnt_unhold_writers(mnt: p);
4735	}
4736	return err;
4737	}
4738
4739	static void do_idmap_mount(const struct mount_kattr kattr, struct* mount *mnt)
4740	{
4741	struct mnt_idmap *old_idmap;
4742
4743	if (!kattr->mnt_idmap)
4744	return;
4745
4746	old_idmap = mnt_idmap(mnt: &mnt->mnt);
4747
4748	/ Pairs with smp_load_acquire() in mnt_idmap(). /
4749	smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
4750	mnt_idmap_put(idmap: old_idmap);
4751	}
4752
4753	static void mount_setattr_commit(struct mount_kattr kattr, struct* mount *mnt)
4754	{
4755	struct mount *m;
4756
4757	for (m = mnt; m; m = next_mnt(p: m, root: mnt)) {
4758	unsigned int flags;
4759
4760	do_idmap_mount(kattr, mnt: m);
4761	flags = recalc_flags(kattr, mnt: m);
4762	WRITE_ONCE(m->mnt.mnt_flags, flags);
4763
4764	/ If we had to hold writers unblock them. /
4765	mnt_unhold_writers(mnt: m);
4766
4767	if (kattr->propagation)
4768	change_mnt_propagation(m, kattr->propagation);
4769	if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
4770	break;
4771	}
4772	touch_mnt_namespace(ns: mnt->mnt_ns);
4773	}
4774
4775	static int do_mount_setattr(const struct path path, struct* mount_kattr *kattr)
4776	{
4777	struct mount *mnt = real_mount(mnt: path->mnt);
4778	int err = `0`;
4779
4780	if (!path_mounted(path))
4781	return -EINVAL;
4782
4783	if (kattr->mnt_userns) {
4784	struct mnt_idmap *mnt_idmap;
4785
4786	mnt_idmap = alloc_mnt_idmap(mnt_userns: kattr->mnt_userns);
4787	if (IS_ERR(ptr: mnt_idmap))
4788	return PTR_ERR(ptr: mnt_idmap);
4789	kattr->mnt_idmap = mnt_idmap;
4790	}
4791
4792	if (kattr->propagation) {
4793	/*
4794	* Only take namespace_lock() if we're actually changing
4795	* propagation.
4796	*/
4797	namespace_lock();
4798	if (kattr->propagation == MS_SHARED) {
4799	err = invent_group_ids(mnt, recurse: kattr->kflags & MOUNT_KATTR_RECURSE);
4800	if (err) {
4801	namespace_unlock();
4802	return err;
4803	}
4804	}
4805	}
4806
4807	err = -EINVAL;
4808	lock_mount_hash();
4809
4810	if (!anon_ns_root(m: mnt) && !check_mnt(mnt))
4811	goto out;
4812
4813	/*
4814	* First, we get the mount tree in a shape where we can change mount
4815	* properties without failure. If we succeeded to do so we commit all
4816	* changes and if we failed we clean up.
4817	*/
4818	err = mount_setattr_prepare(kattr, mnt);
4819	if (!err)
4820	mount_setattr_commit(kattr, mnt);
4821
4822	out:
4823	unlock_mount_hash();
4824
4825	if (kattr->propagation) {
4826	if (err)
4827	cleanup_group_ids(mnt, NULL);
4828	namespace_unlock();
4829	}
4830
4831	return err;
4832	}
4833
4834	static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
4835	struct mount_kattr *kattr)
4836	{
4837	struct ns_common *ns;
4838	struct user_namespace *mnt_userns;
4839
4840	if (!((attr->attr_set \| attr->attr_clr) & MOUNT_ATTR_IDMAP))
4841	return `0`;
4842
4843	if (attr->attr_clr & MOUNT_ATTR_IDMAP) {
4844	/*
4845	* We can only remove an idmapping if it's never been
4846	* exposed to userspace.
4847	*/
4848	if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE))
4849	return -EINVAL;
4850
4851	/*
4852	* Removal of idmappings is equivalent to setting
4853	* nop_mnt_idmap.
4854	*/
4855	if (!(attr->attr_set & MOUNT_ATTR_IDMAP)) {
4856	kattr->mnt_idmap = &nop_mnt_idmap;
4857	return `0`;
4858	}
4859	}
4860
4861	if (attr->userns_fd > INT_MAX)
4862	return -EINVAL;
4863
4864	CLASS(fd, f)(fd: attr->userns_fd);
4865	if (fd_empty(f))
4866	return -EBADF;
4867
4868	if (!proc_ns_file(fd_file(f)))
4869	return -EINVAL;
4870
4871	ns = get_proc_ns(file_inode(fd_file(f)));
4872	if (ns->ns_type != CLONE_NEWUSER)
4873	return -EINVAL;
4874
4875	/*
4876	* The initial idmapping cannot be used to create an idmapped
4877	* mount. We use the initial idmapping as an indicator of a mount
4878	* that is not idmapped. It can simply be passed into helpers that
4879	* are aware of idmapped mounts as a convenient shortcut. A user
4880	* can just create a dedicated identity mapping to achieve the same
4881	* result.
4882	*/
4883	mnt_userns = container_of(ns, struct user_namespace, ns);
4884	if (mnt_userns == &init_user_ns)
4885	return -EPERM;
4886
4887	/ We're not controlling the target namespace. /
4888	if (!ns_capable(ns: mnt_userns, CAP_SYS_ADMIN))
4889	return -EPERM;
4890
4891	kattr->mnt_userns = get_user_ns(ns: mnt_userns);
4892	return `0`;
4893	}
4894
4895	static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
4896	struct mount_kattr *kattr)
4897	{
4898	if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
4899	return -EINVAL;
4900	if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > `1`)
4901	return -EINVAL;
4902	kattr->propagation = attr->propagation;
4903
4904	if ((attr->attr_set \| attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
4905	return -EINVAL;
4906
4907	kattr->attr_set = attr_flags_to_mnt_flags(attr_flags: attr->attr_set);
4908	kattr->attr_clr = attr_flags_to_mnt_flags(attr_flags: attr->attr_clr);
4909
4910	/*
4911	* Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap,
4912	* users wanting to transition to a different atime setting cannot
4913	* simply specify the atime setting in @attr_set, but must also
4914	* specify MOUNT_ATTR__ATIME in the @attr_clr field.
4915	* So ensure that MOUNT_ATTR__ATIME can't be partially set in
4916	* @attr_clr and that @attr_set can't have any atime bits set if
4917	* MOUNT_ATTR__ATIME isn't set in @attr_clr.
4918	*/
4919	if (attr->attr_clr & MOUNT_ATTR__ATIME) {
4920	if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME)
4921	return -EINVAL;
4922
4923	/*
4924	* Clear all previous time settings as they are mutually
4925	* exclusive.
4926	*/
4927	kattr->attr_clr \|= MNT_RELATIME \| MNT_NOATIME;
4928	switch (attr->attr_set & MOUNT_ATTR__ATIME) {
4929	case MOUNT_ATTR_RELATIME:
4930	kattr->attr_set \|= MNT_RELATIME;
4931	break;
4932	case MOUNT_ATTR_NOATIME:
4933	kattr->attr_set \|= MNT_NOATIME;
4934	break;
4935	case MOUNT_ATTR_STRICTATIME:
4936	break;
4937	default:
4938	return -EINVAL;
4939	}
4940	} else {
4941	if (attr->attr_set & MOUNT_ATTR__ATIME)
4942	return -EINVAL;
4943	}
4944
4945	return build_mount_idmapped(attr, usize, kattr);
4946	}
4947
4948	static void finish_mount_kattr(struct mount_kattr *kattr)
4949	{
4950	if (kattr->mnt_userns) {
4951	put_user_ns(ns: kattr->mnt_userns);
4952	kattr->mnt_userns = NULL;
4953	}
4954
4955	if (kattr->mnt_idmap)
4956	mnt_idmap_put(idmap: kattr->mnt_idmap);
4957	}
4958
4959	static int wants_mount_setattr(struct mount_attr __user *uattr, size_t usize,
4960	struct mount_kattr *kattr)
4961	{
4962	int ret;
4963	struct mount_attr attr;
4964
4965	BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);
4966
4967	if (unlikely(usize > PAGE_SIZE))
4968	return -E2BIG;
4969	if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
4970	return -EINVAL;
4971
4972	if (!may_mount())
4973	return -EPERM;
4974
4975	ret = copy_struct_from_user(dst: &attr, ksize: sizeof(attr), src: uattr, usize);
4976	if (ret)
4977	return ret;
4978
4979	/ Don't bother walking through the mounts if this is a nop. /
4980	if (attr.attr_set == `0` &&
4981	attr.attr_clr == `0` &&
4982	attr.propagation == `0`)
4983	return `0`; / Tell caller to not bother. /
4984
4985	ret = build_mount_kattr(attr: &attr, usize, kattr);
4986	if (ret < `0`)
4987	return ret;
4988
4989	return `1`;
4990	}
4991
4992	SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
4993	unsigned int, flags, struct mount_attr __user *, uattr,
4994	size_t, usize)
4995	{
4996	int err;
4997	struct path target;
4998	struct mount_kattr kattr;
4999	unsigned int lookup_flags = LOOKUP_AUTOMOUNT \| LOOKUP_FOLLOW;
5000
5001	if (flags & ~(AT_EMPTY_PATH \|
5002	AT_RECURSIVE \|
5003	AT_SYMLINK_NOFOLLOW \|
5004	AT_NO_AUTOMOUNT))
5005	return -EINVAL;
5006
5007	if (flags & AT_NO_AUTOMOUNT)
5008	lookup_flags &= ~LOOKUP_AUTOMOUNT;
5009	if (flags & AT_SYMLINK_NOFOLLOW)
5010	lookup_flags &= ~LOOKUP_FOLLOW;
5011	if (flags & AT_EMPTY_PATH)
5012	lookup_flags \|= LOOKUP_EMPTY;
5013
5014	kattr = (struct mount_kattr) {
5015	.lookup_flags = lookup_flags,
5016	};
5017
5018	if (flags & AT_RECURSIVE)
5019	kattr.kflags \|= MOUNT_KATTR_RECURSE;
5020
5021	err = wants_mount_setattr(uattr, usize, kattr: &kattr);
5022	if (err <= `0`)
5023	return err;
5024
5025	err = user_path_at(dfd, path, kattr.lookup_flags, &target);
5026	if (!err) {
5027	err = do_mount_setattr(path: &target, kattr: &kattr);
5028	path_put(&target);
5029	}
5030	finish_mount_kattr(kattr: &kattr);
5031	return err;
5032	}
5033
5034	SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
5035	unsigned, flags, struct mount_attr __user *, uattr,
5036	size_t, usize)
5037	{
5038	struct file __free(fput) *file = NULL;
5039	int fd;
5040
5041	if (!uattr && usize)
5042	return -EINVAL;
5043
5044	file = vfs_open_tree(dfd, filename, flags);
5045	if (IS_ERR(ptr: file))
5046	return PTR_ERR(ptr: file);
5047
5048	if (uattr) {
5049	int ret;
5050	struct mount_kattr kattr = {};
5051
5052	if (flags & OPEN_TREE_CLONE)
5053	kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE;
5054	if (flags & AT_RECURSIVE)
5055	kattr.kflags \|= MOUNT_KATTR_RECURSE;
5056
5057	ret = wants_mount_setattr(uattr, usize, kattr: &kattr);
5058	if (ret > `0`) {
5059	ret = do_mount_setattr(path: &file->f_path, kattr: &kattr);
5060	finish_mount_kattr(kattr: &kattr);
5061	}
5062	if (ret)
5063	return ret;
5064	}
5065
5066	fd = get_unused_fd_flags(flags: flags & O_CLOEXEC);
5067	if (fd < `0`)
5068	return fd;
5069
5070	fd_install(fd, no_free_ptr(file));
5071	return fd;
5072	}
5073
5074	int show_path(struct seq_file m, struct* dentry *root)
5075	{
5076	if (root->d_sb->s_op->show_path)
5077	return root->d_sb->s_op->show_path(m, root);
5078
5079	seq_dentry(m, root, " \t\n\\");
5080	return `0`;
5081	}
5082
5083	static struct vfsmount lookup_mnt_in_ns(u64 id, struct* mnt_namespace *ns)
5084	{
5085	struct mount *mnt = mnt_find_id_at(ns, mnt_id: id);
5086
5087	if (!mnt \|\| mnt->mnt_id_unique != id)
5088	return NULL;
5089
5090	return &mnt->mnt;
5091	}
5092
/*
 * Working state shared by the statmount_*() helpers while they fill in a
 * struct statmount.
 *
 * NOTE(review): the users of @buf, @bufsize, @mask and @seq are outside
 * this section of the file; confirm their exact roles there.
 */
struct kstatmount {
	struct statmount __user *buf;
	size_t bufsize;
	struct vfsmount *mnt;		/* the mount being described */
	struct mnt_idmap *idmap;	/* handed to statmount_mnt_idmap() */
	u64 mask;
	struct path root;		/* root used when printing the mount point */
	struct seq_file seq;

	/* Must be last --ends in a flexible-array member. */
	struct statmount sm;
};
5105
5106	static u64 mnt_to_attr_flags(struct vfsmount *mnt)
5107	{
5108	unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags);
5109	u64 attr_flags = `0`;
5110
5111	if (mnt_flags & MNT_READONLY)
5112	attr_flags \|= MOUNT_ATTR_RDONLY;
5113	if (mnt_flags & MNT_NOSUID)
5114	attr_flags \|= MOUNT_ATTR_NOSUID;
5115	if (mnt_flags & MNT_NODEV)
5116	attr_flags \|= MOUNT_ATTR_NODEV;
5117	if (mnt_flags & MNT_NOEXEC)
5118	attr_flags \|= MOUNT_ATTR_NOEXEC;
5119	if (mnt_flags & MNT_NODIRATIME)
5120	attr_flags \|= MOUNT_ATTR_NODIRATIME;
5121	if (mnt_flags & MNT_NOSYMFOLLOW)
5122	attr_flags \|= MOUNT_ATTR_NOSYMFOLLOW;
5123
5124	if (mnt_flags & MNT_NOATIME)
5125	attr_flags \|= MOUNT_ATTR_NOATIME;
5126	else if (mnt_flags & MNT_RELATIME)
5127	attr_flags \|= MOUNT_ATTR_RELATIME;
5128	else
5129	attr_flags \|= MOUNT_ATTR_STRICTATIME;
5130
5131	if (is_idmapped_mnt(mnt))
5132	attr_flags \|= MOUNT_ATTR_IDMAP;
5133
5134	return attr_flags;
5135	}
5136
5137	static u64 mnt_to_propagation_flags(struct mount *m)
5138	{
5139	u64 propagation = `0`;
5140
5141	if (IS_MNT_SHARED(m))
5142	propagation \|= MS_SHARED;
5143	if (IS_MNT_SLAVE(m))
5144	propagation \|= MS_SLAVE;
5145	if (IS_MNT_UNBINDABLE(m))
5146	propagation \|= MS_UNBINDABLE;
5147	if (!propagation)
5148	propagation \|= MS_PRIVATE;
5149
5150	return propagation;
5151	}
5152
5153	static void statmount_sb_basic(struct kstatmount *s)
5154	{
5155	struct super_block *sb = s->mnt->mnt_sb;
5156
5157	s->sm.mask \|= STATMOUNT_SB_BASIC;
5158	s->sm.sb_dev_major = MAJOR(sb->s_dev);
5159	s->sm.sb_dev_minor = MINOR(sb->s_dev);
5160	s->sm.sb_magic = sb->s_magic;
5161	s->sm.sb_flags = sb->s_flags & (SB_RDONLY\|SB_SYNCHRONOUS\|SB_DIRSYNC\|SB_LAZYTIME);
5162	}
5163
5164	static void statmount_mnt_basic(struct kstatmount *s)
5165	{
5166	struct mount *m = real_mount(mnt: s->mnt);
5167
5168	s->sm.mask \|= STATMOUNT_MNT_BASIC;
5169	s->sm.mnt_id = m->mnt_id_unique;
5170	s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
5171	s->sm.mnt_id_old = m->mnt_id;
5172	s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
5173	s->sm.mnt_attr = mnt_to_attr_flags(mnt: &m->mnt);
5174	s->sm.mnt_propagation = mnt_to_propagation_flags(m);
5175	s->sm.mnt_peer_group = m->mnt_group_id;
5176	s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : `0`;
5177	}
5178
5179	static void statmount_propagate_from(struct kstatmount *s)
5180	{
5181	struct mount *m = real_mount(mnt: s->mnt);
5182
5183	s->sm.mask \|= STATMOUNT_PROPAGATE_FROM;
5184	if (IS_MNT_SLAVE(m))
5185	s->sm.propagate_from = get_dominating_id(mnt: m, root: &current->fs->root);
5186	}
5187
5188	static int statmount_mnt_root(struct kstatmount s, struct* seq_file *seq)
5189	{
5190	int ret;
5191	size_t start = seq->count;
5192
5193	ret = show_path(m: seq, root: s->mnt->mnt_root);
5194	if (ret)
5195	return ret;
5196
5197	if (unlikely(seq_has_overflowed(seq)))
5198	return -EAGAIN;
5199
5200	/*
5201	* Unescape the result. It would be better if supplied string was not
5202	* escaped in the first place, but that's a pretty invasive change.
5203	*/
5204	seq->buf[seq->count] = `'\0'`;
5205	seq->count = start;
5206	seq_commit(m: seq, num: string_unescape_inplace(buf: seq->buf + start, UNESCAPE_OCTAL));
5207	return `0`;
5208	}
5209
5210	static int statmount_mnt_point(struct kstatmount s, struct* seq_file *seq)
5211	{
5212	struct vfsmount *mnt = s->mnt;
5213	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
5214	int err;
5215
5216	err = seq_path_root(m: seq, path: &mnt_path, root: &s->root, esc: "");
5217	return err == SEQ_SKIP ? `0` : err;
5218	}
5219
5220	static int statmount_fs_type(struct kstatmount s, struct* seq_file *seq)
5221	{
5222	struct super_block *sb = s->mnt->mnt_sb;
5223
5224	seq_puts(m: seq, s: sb->s_type->name);
5225	return `0`;
5226	}
5227
5228	static void statmount_fs_subtype(struct kstatmount s, struct* seq_file *seq)
5229	{
5230	struct super_block *sb = s->mnt->mnt_sb;
5231
5232	if (sb->s_subtype)
5233	seq_puts(m: seq, s: sb->s_subtype);
5234	}
5235
5236	static int statmount_sb_source(struct kstatmount s, struct* seq_file *seq)
5237	{
5238	struct super_block *sb = s->mnt->mnt_sb;
5239	struct mount *r = real_mount(mnt: s->mnt);
5240
5241	if (sb->s_op->show_devname) {
5242	size_t start = seq->count;
5243	int ret;
5244
5245	ret = sb->s_op->show_devname(seq, s->mnt->mnt_root);
5246	if (ret)
5247	return ret;
5248
5249	if (unlikely(seq_has_overflowed(seq)))
5250	return -EAGAIN;
5251
5252	/ Unescape the result /
5253	seq->buf[seq->count] = `'\0'`;
5254	seq->count = start;
5255	seq_commit(m: seq, num: string_unescape_inplace(buf: seq->buf + start, UNESCAPE_OCTAL));
5256	} else {
5257	seq_puts(m: seq, s: r->mnt_devname);
5258	}
5259	return `0`;
5260	}
5261
5262	static void statmount_mnt_ns_id(struct kstatmount s, struct* mnt_namespace *ns)
5263	{
5264	s->sm.mask \|= STATMOUNT_MNT_NS_ID;
5265	s->sm.mnt_ns_id = ns->ns.ns_id;
5266	}
5267
5268	static int statmount_mnt_opts(struct kstatmount s, struct* seq_file *seq)
5269	{
5270	struct vfsmount *mnt = s->mnt;
5271	struct super_block *sb = mnt->mnt_sb;
5272	size_t start = seq->count;
5273	int err;
5274
5275	err = security_sb_show_options(m: seq, sb);
5276	if (err)
5277	return err;
5278
5279	if (sb->s_op->show_options) {
5280	err = sb->s_op->show_options(seq, mnt->mnt_root);
5281	if (err)
5282	return err;
5283	}
5284
5285	if (unlikely(seq_has_overflowed(seq)))
5286	return -EAGAIN;
5287
5288	if (seq->count == start)
5289	return `0`;
5290
5291	/ skip leading comma /
5292	memmove(dest: seq->buf + start, src: seq->buf + start + `1`,
5293	count: seq->count - start - `1`);
5294	seq->count--;
5295
5296	return `0`;
5297	}
5298
5299	static inline int statmount_opt_process(struct seq_file *seq, size_t start)
5300	{
5301	char buf_end, opt_end, src, dst;
5302	int count = `0`;
5303
5304	if (unlikely(seq_has_overflowed(seq)))
5305	return -EAGAIN;
5306
5307	buf_end = seq->buf + seq->count;
5308	dst = seq->buf + start;
5309	src = dst + `1`; / skip initial comma /
5310
5311	if (src >= buf_end) {
5312	seq->count = start;
5313	return `0`;
5314	}
5315
5316	*buf_end = `'\0'`;
5317	for (; src < buf_end; src = opt_end + `1`) {
5318	opt_end = strchrnul(src, `','`);
5319	*opt_end = `'\0'`;
5320	dst += string_unescape(src, dst, size: `0`, UNESCAPE_OCTAL) + `1`;
5321	if (WARN_ON_ONCE(++count == INT_MAX))
5322	return -EOVERFLOW;
5323	}
5324	seq->count = dst - `1` - seq->buf;
5325	return count;
5326	}
5327
5328	static int statmount_opt_array(struct kstatmount s, struct* seq_file *seq)
5329	{
5330	struct vfsmount *mnt = s->mnt;
5331	struct super_block *sb = mnt->mnt_sb;
5332	size_t start = seq->count;
5333	int err;
5334
5335	if (!sb->s_op->show_options)
5336	return `0`;
5337
5338	err = sb->s_op->show_options(seq, mnt->mnt_root);
5339	if (err)
5340	return err;
5341
5342	err = statmount_opt_process(seq, start);
5343	if (err < `0`)
5344	return err;
5345
5346	s->sm.opt_num = err;
5347	return `0`;
5348	}
5349
5350	static int statmount_opt_sec_array(struct kstatmount s, struct* seq_file *seq)
5351	{
5352	struct vfsmount *mnt = s->mnt;
5353	struct super_block *sb = mnt->mnt_sb;
5354	size_t start = seq->count;
5355	int err;
5356
5357	err = security_sb_show_options(m: seq, sb);
5358	if (err)
5359	return err;
5360
5361	err = statmount_opt_process(seq, start);
5362	if (err < `0`)
5363	return err;
5364
5365	s->sm.opt_sec_num = err;
5366	return `0`;
5367	}
5368
5369	static inline int statmount_mnt_uidmap(struct kstatmount s, struct* seq_file *seq)
5370	{
5371	int ret;
5372
5373	ret = statmount_mnt_idmap(idmap: s->idmap, seq, uid_map: true);
5374	if (ret < `0`)
5375	return ret;
5376
5377	s->sm.mnt_uidmap_num = ret;
5378	/*
5379	* Always raise STATMOUNT_MNT_UIDMAP even if there are no valid
5380	* mappings. This allows userspace to distinguish between a
5381	* non-idmapped mount and an idmapped mount where none of the
5382	* individual mappings are valid in the caller's idmapping.
5383	*/
5384	if (is_valid_mnt_idmap(idmap: s->idmap))
5385	s->sm.mask \|= STATMOUNT_MNT_UIDMAP;
5386	return `0`;
5387	}
5388
5389	static inline int statmount_mnt_gidmap(struct kstatmount s, struct* seq_file *seq)
5390	{
5391	int ret;
5392
5393	ret = statmount_mnt_idmap(idmap: s->idmap, seq, uid_map: false);
5394	if (ret < `0`)
5395	return ret;
5396
5397	s->sm.mnt_gidmap_num = ret;
5398	/*
5399	* Always raise STATMOUNT_MNT_GIDMAP even if there are no valid
5400	* mappings. This allows userspace to distinguish between a
5401	* non-idmapped mount and an idmapped mount where none of the
5402	* individual mappings are valid in the caller's idmapping.
5403	*/
5404	if (is_valid_mnt_idmap(idmap: s->idmap))
5405	s->sm.mask \|= STATMOUNT_MNT_GIDMAP;
5406	return `0`;
5407	}
5408
5409	static int statmount_string(struct kstatmount *s, u64 flag)
5410	{
5411	int ret = `0`;
5412	size_t kbufsize;
5413	struct seq_file *seq = &s->seq;
5414	struct statmount *sm = &s->sm;
5415	u32 start, *offp;
5416
5417	/ Reserve an empty string at the beginning for any unset offsets /
5418	if (!seq->count)
5419	seq_putc(m: seq, c: `0`);
5420
5421	start = seq->count;
5422
5423	switch (flag) {
5424	case STATMOUNT_FS_TYPE:
5425	offp = &sm->fs_type;
5426	ret = statmount_fs_type(s, seq);
5427	break;
5428	case STATMOUNT_MNT_ROOT:
5429	offp = &sm->mnt_root;
5430	ret = statmount_mnt_root(s, seq);
5431	break;
5432	case STATMOUNT_MNT_POINT:
5433	offp = &sm->mnt_point;
5434	ret = statmount_mnt_point(s, seq);
5435	break;
5436	case STATMOUNT_MNT_OPTS:
5437	offp = &sm->mnt_opts;
5438	ret = statmount_mnt_opts(s, seq);
5439	break;
5440	case STATMOUNT_OPT_ARRAY:
5441	offp = &sm->opt_array;
5442	ret = statmount_opt_array(s, seq);
5443	break;
5444	case STATMOUNT_OPT_SEC_ARRAY:
5445	offp = &sm->opt_sec_array;
5446	ret = statmount_opt_sec_array(s, seq);
5447	break;
5448	case STATMOUNT_FS_SUBTYPE:
5449	offp = &sm->fs_subtype;
5450	statmount_fs_subtype(s, seq);
5451	break;
5452	case STATMOUNT_SB_SOURCE:
5453	offp = &sm->sb_source;
5454	ret = statmount_sb_source(s, seq);
5455	break;
5456	case STATMOUNT_MNT_UIDMAP:
5457	sm->mnt_uidmap = start;
5458	ret = statmount_mnt_uidmap(s, seq);
5459	break;
5460	case STATMOUNT_MNT_GIDMAP:
5461	sm->mnt_gidmap = start;
5462	ret = statmount_mnt_gidmap(s, seq);
5463	break;
5464	default:
5465	WARN_ON_ONCE(true);
5466	return -EINVAL;
5467	}
5468
5469	/*
5470	* If nothing was emitted, return to avoid setting the flag
5471	* and terminating the buffer.
5472	*/
5473	if (seq->count == start)
5474	return ret;
5475	if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize)))
5476	return -EOVERFLOW;
5477	if (kbufsize >= s->bufsize)
5478	return -EOVERFLOW;
5479
5480	/ signal a retry /
5481	if (unlikely(seq_has_overflowed(seq)))
5482	return -EAGAIN;
5483
5484	if (ret)
5485	return ret;
5486
5487	seq->buf[seq->count++] = `'\0'`;
5488	sm->mask \|= flag;
5489	*offp = start;
5490	return `0`;
5491	}
5492
5493	static int copy_statmount_to_user(struct kstatmount *s)
5494	{
5495	struct statmount *sm = &s->sm;
5496	struct seq_file *seq = &s->seq;
5497	char __user str = ((char* __user )s->buf) + sizeof(sm);
5498	size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm));
5499
5500	if (seq->count && copy_to_user(to: str, from: seq->buf, n: seq->count))
5501	return -EFAULT;
5502
5503	/ Return the number of bytes copied to the buffer /
5504	sm->size = copysize + seq->count;
5505	if (copy_to_user(to: s->buf, from: sm, n: copysize))
5506	return -EFAULT;
5507
5508	return `0`;
5509	}
5510
5511	static struct mount listmnt_next(struct* mount *curr, bool reverse)
5512	{
5513	struct rb_node *node;
5514
5515	if (reverse)
5516	node = rb_prev(&curr->mnt_node);
5517	else
5518	node = rb_next(&curr->mnt_node);
5519
5520	return node_to_mount(node);
5521	}
5522
5523	static int grab_requested_root(struct mnt_namespace ns, struct* path *root)
5524	{
5525	struct mount first, child;
5526
5527	rwsem_assert_held(sem: &namespace_sem);
5528
5529	/ We're looking at our own ns, just use get_fs_root. /
5530	if (ns == current->nsproxy->mnt_ns) {
5531	get_fs_root(current->fs, root);
5532	return `0`;
5533	}
5534
5535	/*
5536	* We have to find the first mount in our ns and use that, however it
5537	* may not exist, so handle that properly.
5538	*/
5539	if (mnt_ns_empty(ns))
5540	return -ENOENT;
5541
5542	first = child = ns->root;
5543	for (;;) {
5544	child = listmnt_next(curr: child, reverse: false);
5545	if (!child)
5546	return -ENOENT;
5547	if (child->mnt_parent == first)
5548	break;
5549	}
5550
5551	root->mnt = mntget(&child->mnt);
5552	root->dentry = dget(dentry: root->mnt->mnt_root);
5553	return `0`;
5554	}
5555
/*
 * Every STATMOUNT_* flag this implementation knows about.  It is reported
 * to userspace as supported_mask and do_statmount() warns if the returned
 * mask contains a bit missing here, so this must be updated whenever a new
 * flag is added.
 */
#define STATMOUNT_SUPPORTED				\
	(STATMOUNT_SB_BASIC			|	\
	 STATMOUNT_MNT_BASIC			|	\
	 STATMOUNT_PROPAGATE_FROM		|	\
	 STATMOUNT_MNT_NS_ID			|	\
	 STATMOUNT_SUPPORTED_MASK		|	\
	 STATMOUNT_FS_TYPE			|	\
	 STATMOUNT_FS_SUBTYPE			|	\
	 STATMOUNT_SB_SOURCE			|	\
	 STATMOUNT_MNT_ROOT			|	\
	 STATMOUNT_MNT_POINT			|	\
	 STATMOUNT_MNT_OPTS			|	\
	 STATMOUNT_OPT_ARRAY			|	\
	 STATMOUNT_OPT_SEC_ARRAY		|	\
	 STATMOUNT_MNT_UIDMAP			|	\
	 STATMOUNT_MNT_GIDMAP)
5572
5573	/ locks: namespace_shared /
5574	static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
5575	struct mnt_namespace *ns)
5576	{
5577	struct mount *m;
5578	int err;
5579
5580	/ Has the namespace already been emptied? /
5581	if (mnt_ns_id && mnt_ns_empty(ns))
5582	return -ENOENT;
5583
5584	s->mnt = lookup_mnt_in_ns(id: mnt_id, ns);
5585	if (!s->mnt)
5586	return -ENOENT;
5587
5588	err = grab_requested_root(ns, root: &s->root);
5589	if (err)
5590	return err;
5591
5592	/*
5593	* Don't trigger audit denials. We just want to determine what
5594	* mounts to show users.
5595	*/
5596	m = real_mount(mnt: s->mnt);
5597	if (!is_path_reachable(mnt: m, dentry: m->mnt.mnt_root, root: &s->root) &&
5598	!ns_capable_noaudit(ns: ns->user_ns, CAP_SYS_ADMIN))
5599	return -EPERM;
5600
5601	err = security_sb_statfs(dentry: s->mnt->mnt_root);
5602	if (err)
5603	return err;
5604
5605	/*
5606	* Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap
5607	* can change concurrently as we only hold the read-side of the
5608	* namespace semaphore and mount properties may change with only
5609	* the mount lock held.
5610	*
5611	* We could sample the mount lock sequence counter to detect
5612	* those changes and retry. But it's not worth it. Worst that
5613	* happens is that the mnt->mnt_idmap pointer is already changed
5614	* while mnt->mnt_flags isn't or vica versa. So what.
5615	*
5616	* Both mnt->mnt_flags and mnt->mnt_idmap are set and retrieved
5617	* via READ_ONCE()/WRITE_ONCE() and guard against theoretical
5618	* torn read/write. That's all we care about right now.
5619	*/
5620	s->idmap = mnt_idmap(mnt: s->mnt);
5621	if (s->mask & STATMOUNT_MNT_BASIC)
5622	statmount_mnt_basic(s);
5623
5624	if (s->mask & STATMOUNT_SB_BASIC)
5625	statmount_sb_basic(s);
5626
5627	if (s->mask & STATMOUNT_PROPAGATE_FROM)
5628	statmount_propagate_from(s);
5629
5630	if (s->mask & STATMOUNT_FS_TYPE)
5631	err = statmount_string(s, STATMOUNT_FS_TYPE);
5632
5633	if (!err && s->mask & STATMOUNT_MNT_ROOT)
5634	err = statmount_string(s, STATMOUNT_MNT_ROOT);
5635
5636	if (!err && s->mask & STATMOUNT_MNT_POINT)
5637	err = statmount_string(s, STATMOUNT_MNT_POINT);
5638
5639	if (!err && s->mask & STATMOUNT_MNT_OPTS)
5640	err = statmount_string(s, STATMOUNT_MNT_OPTS);
5641
5642	if (!err && s->mask & STATMOUNT_OPT_ARRAY)
5643	err = statmount_string(s, STATMOUNT_OPT_ARRAY);
5644
5645	if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY)
5646	err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY);
5647
5648	if (!err && s->mask & STATMOUNT_FS_SUBTYPE)
5649	err = statmount_string(s, STATMOUNT_FS_SUBTYPE);
5650
5651	if (!err && s->mask & STATMOUNT_SB_SOURCE)
5652	err = statmount_string(s, STATMOUNT_SB_SOURCE);
5653
5654	if (!err && s->mask & STATMOUNT_MNT_UIDMAP)
5655	err = statmount_string(s, STATMOUNT_MNT_UIDMAP);
5656
5657	if (!err && s->mask & STATMOUNT_MNT_GIDMAP)
5658	err = statmount_string(s, STATMOUNT_MNT_GIDMAP);
5659
5660	if (!err && s->mask & STATMOUNT_MNT_NS_ID)
5661	statmount_mnt_ns_id(s, ns);
5662
5663	if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) {
5664	s->sm.mask \|= STATMOUNT_SUPPORTED_MASK;
5665	s->sm.supported_mask = STATMOUNT_SUPPORTED;
5666	}
5667
5668	if (err)
5669	return err;
5670
5671	/ Are there bits in the return mask not present in STATMOUNT_SUPPORTED? /
5672	WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask);
5673
5674	return `0`;
5675	}
5676
5677	static inline bool retry_statmount(const long ret, size_t *seq_size)
5678	{
5679	if (likely(ret != -EAGAIN))
5680	return false;
5681	if (unlikely(check_mul_overflow(*seq_size, `2`, seq_size)))
5682	return false;
5683	if (unlikely(*seq_size > MAX_RW_COUNT))
5684	return false;
5685	return true;
5686	}
5687
/* Request flags for which prepare_kstatmount() allocates the string buffer. */
#define STATMOUNT_STRING_REQ			\
	(STATMOUNT_FS_TYPE		|	\
	 STATMOUNT_FS_SUBTYPE		|	\
	 STATMOUNT_SB_SOURCE		|	\
	 STATMOUNT_MNT_ROOT		|	\
	 STATMOUNT_MNT_POINT		|	\
	 STATMOUNT_MNT_OPTS		|	\
	 STATMOUNT_OPT_ARRAY		|	\
	 STATMOUNT_OPT_SEC_ARRAY	|	\
	 STATMOUNT_MNT_UIDMAP		|	\
	 STATMOUNT_MNT_GIDMAP)
5693
5694	static int prepare_kstatmount(struct kstatmount ks, struct* mnt_id_req *kreq,
5695	struct statmount __user *buf, size_t bufsize,
5696	size_t seq_size)
5697	{
5698	if (!access_ok(buf, bufsize))
5699	return -EFAULT;
5700
5701	memset(s: ks, c: `0`, n: sizeof(*ks));
5702	ks->mask = kreq->param;
5703	ks->buf = buf;
5704	ks->bufsize = bufsize;
5705
5706	if (ks->mask & STATMOUNT_STRING_REQ) {
5707	if (bufsize == sizeof(ks->sm))
5708	return -EOVERFLOW;
5709
5710	ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT);
5711	if (!ks->seq.buf)
5712	return -ENOMEM;
5713
5714	ks->seq.size = seq_size;
5715	}
5716
5717	return `0`;
5718	}
5719
5720	static int copy_mnt_id_req(const struct mnt_id_req __user *req,
5721	struct mnt_id_req *kreq)
5722	{
5723	int ret;
5724	size_t usize;
5725
5726	BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1);
5727
5728	ret = get_user(usize, &req->size);
5729	if (ret)
5730	return -EFAULT;
5731	if (unlikely(usize > PAGE_SIZE))
5732	return -E2BIG;
5733	if (unlikely(usize < MNT_ID_REQ_SIZE_VER0))
5734	return -EINVAL;
5735	memset(s: kreq, c: `0`, n: sizeof(*kreq));
5736	ret = copy_struct_from_user(dst: kreq, ksize: sizeof(*kreq), src: req, usize);
5737	if (ret)
5738	return ret;
5739	if (kreq->spare != `0`)
5740	return -EINVAL;
5741	/ The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. /
5742	if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET)
5743	return -EINVAL;
5744	return `0`;
5745	}
5746
5747	/*
5748	* If the user requested a specific mount namespace id, look that up and return
5749	* that, or if not simply grab a passive reference on our mount namespace and
5750	* return that.
5751	*/
5752	static struct mnt_namespace grab_requested_mnt_ns(const* struct mnt_id_req *kreq)
5753	{
5754	struct mnt_namespace *mnt_ns;
5755
5756	if (kreq->mnt_ns_id && kreq->spare)
5757	return ERR_PTR(error: -EINVAL);
5758
5759	if (kreq->mnt_ns_id)
5760	return lookup_mnt_ns(mnt_ns_id: kreq->mnt_ns_id);
5761
5762	if (kreq->spare) {
5763	struct ns_common *ns;
5764
5765	CLASS(fd, f)(fd: kreq->spare);
5766	if (fd_empty(f))
5767	return ERR_PTR(error: -EBADF);
5768
5769	if (!proc_ns_file(fd_file(f)))
5770	return ERR_PTR(error: -EINVAL);
5771
5772	ns = get_proc_ns(file_inode(fd_file(f)));
5773	if (ns->ns_type != CLONE_NEWNS)
5774	return ERR_PTR(error: -EINVAL);
5775
5776	mnt_ns = to_mnt_ns(ns);
5777	} else {
5778	mnt_ns = current->nsproxy->mnt_ns;
5779	}
5780
5781	refcount_inc(r: &mnt_ns->passive);
5782	return mnt_ns;
5783	}
5784
5785	SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
5786	struct statmount __user *, buf, size_t, bufsize,
5787	unsigned int, flags)
5788	{
5789	struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
5790	struct kstatmount *ks __free(kfree) = NULL;
5791	struct mnt_id_req kreq;
5792	/ We currently support retrieval of 3 strings. /
5793	size_t seq_size = `3` * PATH_MAX;
5794	int ret;
5795
5796	if (flags)
5797	return -EINVAL;
5798
5799	ret = copy_mnt_id_req(req, kreq: &kreq);
5800	if (ret)
5801	return ret;
5802
5803	ns = grab_requested_mnt_ns(kreq: &kreq);
5804	if (!ns)
5805	return -ENOENT;
5806
5807	if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
5808	!ns_capable_noaudit(ns: ns->user_ns, CAP_SYS_ADMIN))
5809	return -ENOENT;
5810
5811	ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT);
5812	if (!ks)
5813	return -ENOMEM;
5814
5815	retry:
5816	ret = prepare_kstatmount(ks, kreq: &kreq, buf, bufsize, seq_size);
5817	if (ret)
5818	return ret;
5819
5820	scoped_guard(namespace_shared)
5821	ret = do_statmount(s: ks, mnt_id: kreq.mnt_id, mnt_ns_id: kreq.mnt_ns_id, ns);
5822
5823	if (!ret)
5824	ret = copy_statmount_to_user(s: ks);
5825	kvfree(addr: ks->seq.buf);
5826	path_put(&ks->root);
5827	if (retry_statmount(ret, seq_size: &seq_size))
5828	goto retry;
5829	return ret;
5830	}
5831
5832	struct klistmount {
5833	u64 last_mnt_id;
5834	u64 mnt_parent_id;
5835	u64 *kmnt_ids;
5836	u32 nr_mnt_ids;
5837	struct mnt_namespace *ns;
5838	struct path root;
5839	};
5840
5841	/ locks: namespace_shared /
5842	static ssize_t do_listmount(struct klistmount *kls, bool reverse)
5843	{
5844	struct mnt_namespace *ns = kls->ns;
5845	u64 mnt_parent_id = kls->mnt_parent_id;
5846	u64 last_mnt_id = kls->last_mnt_id;
5847	u64 *mnt_ids = kls->kmnt_ids;
5848	size_t nr_mnt_ids = kls->nr_mnt_ids;
5849	struct path orig;
5850	struct mount r, first;
5851	ssize_t ret;
5852
5853	rwsem_assert_held(sem: &namespace_sem);
5854
5855	ret = grab_requested_root(ns, root: &kls->root);
5856	if (ret)
5857	return ret;
5858
5859	if (mnt_parent_id == LSMT_ROOT) {
5860	orig = kls->root;
5861	} else {
5862	orig.mnt = lookup_mnt_in_ns(id: mnt_parent_id, ns);
5863	if (!orig.mnt)
5864	return -ENOENT;
5865	orig.dentry = orig.mnt->mnt_root;
5866	}
5867
5868	/*
5869	* Don't trigger audit denials. We just want to determine what
5870	* mounts to show users.
5871	*/
5872	if (!is_path_reachable(mnt: real_mount(mnt: orig.mnt), dentry: orig.dentry, root: &kls->root) &&
5873	!ns_capable_noaudit(ns: ns->user_ns, CAP_SYS_ADMIN))
5874	return -EPERM;
5875
5876	ret = security_sb_statfs(dentry: orig.dentry);
5877	if (ret)
5878	return ret;
5879
5880	if (!last_mnt_id) {
5881	if (reverse)
5882	first = node_to_mount(node: ns->mnt_last_node);
5883	else
5884	first = node_to_mount(node: ns->mnt_first_node);
5885	} else {
5886	if (reverse)
5887	first = mnt_find_id_at_reverse(ns, mnt_id: last_mnt_id - `1`);
5888	else
5889	first = mnt_find_id_at(ns, mnt_id: last_mnt_id + `1`);
5890	}
5891
5892	for (ret = `0`, r = first; r && nr_mnt_ids; r = listmnt_next(curr: r, reverse)) {
5893	if (r->mnt_id_unique == mnt_parent_id)
5894	continue;
5895	if (!is_path_reachable(mnt: r, dentry: r->mnt.mnt_root, root: &orig))
5896	continue;
5897	*mnt_ids = r->mnt_id_unique;
5898	mnt_ids++;
5899	nr_mnt_ids--;
5900	ret++;
5901	}
5902	return ret;
5903	}
5904
5905	static void __free_klistmount_free(const struct klistmount *kls)
5906	{
5907	path_put(&kls->root);
5908	kvfree(addr: kls->kmnt_ids);
5909	mnt_ns_release(ns: kls->ns);
5910	}
5911
5912	static inline int prepare_klistmount(struct klistmount kls, struct* mnt_id_req *kreq,
5913	size_t nr_mnt_ids)
5914	{
5915
5916	u64 last_mnt_id = kreq->param;
5917
5918	/ The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. /
5919	if (last_mnt_id != `0` && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
5920	return -EINVAL;
5921
5922	kls->last_mnt_id = last_mnt_id;
5923
5924	kls->nr_mnt_ids = nr_mnt_ids;
5925	kls->kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kls->kmnt_ids),
5926	GFP_KERNEL_ACCOUNT);
5927	if (!kls->kmnt_ids)
5928	return -ENOMEM;
5929
5930	kls->ns = grab_requested_mnt_ns(kreq);
5931	if (!kls->ns)
5932	return -ENOENT;
5933
5934	kls->mnt_parent_id = kreq->mnt_id;
5935	return `0`;
5936	}
5937
5938	SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
5939	u64 __user , mnt_ids, size_t, nr_mnt_ids, unsigned* int, flags)
5940	{
5941	struct klistmount kls __free(klistmount_free) = {};
5942	const size_t maxcount = `1000000`;
5943	struct mnt_id_req kreq;
5944	ssize_t ret;
5945
5946	if (flags & ~LISTMOUNT_REVERSE)
5947	return -EINVAL;
5948
5949	/*
5950	* If the mount namespace really has more than 1 million mounts the
5951	* caller must iterate over the mount namespace (and reconsider their
5952	* system design...).
5953	*/
5954	if (unlikely(nr_mnt_ids > maxcount))
5955	return -EOVERFLOW;
5956
5957	if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids)))
5958	return -EFAULT;
5959
5960	ret = copy_mnt_id_req(req, kreq: &kreq);
5961	if (ret)
5962	return ret;
5963
5964	ret = prepare_klistmount(kls: &kls, kreq: &kreq, nr_mnt_ids);
5965	if (ret)
5966	return ret;
5967
5968	if (kreq.mnt_ns_id && (kls.ns != current->nsproxy->mnt_ns) &&
5969	!ns_capable_noaudit(ns: kls.ns->user_ns, CAP_SYS_ADMIN))
5970	return -ENOENT;
5971
5972	/*
5973	* We only need to guard against mount topology changes as
5974	* listmount() doesn't care about any mount properties.
5975	*/
5976	scoped_guard(namespace_shared)
5977	ret = do_listmount(kls: &kls, reverse: (flags & LISTMOUNT_REVERSE));
5978	if (ret <= `0`)
5979	return ret;
5980
5981	if (copy_to_user(to: mnt_ids, from: kls.kmnt_ids, n: ret * sizeof(*mnt_ids)))
5982	return -EFAULT;
5983
5984	return ret;
5985	}
5986
5987	struct mnt_namespace init_mnt_ns = {
5988	.ns.inum = ns_init_inum(&init_mnt_ns),
5989	.ns.ops = &mntns_operations,
5990	.user_ns = &init_user_ns,
5991	.ns.__ns_ref = REFCOUNT_INIT(`1`),
5992	.ns.ns_type = ns_common_type(&init_mnt_ns),
5993	.passive = REFCOUNT_INIT(`1`),
5994	.mounts = RB_ROOT,
5995	.poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll),
5996	};
5997
5998	static void __init init_mount_tree(void)
5999	{
6000	struct vfsmount *mnt;
6001	struct mount *m;
6002	struct path root;
6003
6004	mnt = vfs_kern_mount(&rootfs_fs_type, `0`, "rootfs", initramfs_options);
6005	if (IS_ERR(ptr: mnt))
6006	panic(fmt: "Can't create rootfs");
6007
6008	m = real_mount(mnt);
6009	init_mnt_ns.root = m;
6010	init_mnt_ns.nr_mounts = `1`;
6011	mnt_add_to_ns(ns: &init_mnt_ns, mnt: m);
6012	init_task.nsproxy->mnt_ns = &init_mnt_ns;
6013	get_mnt_ns(ns: &init_mnt_ns);
6014
6015	root.mnt = mnt;
6016	root.dentry = mnt->mnt_root;
6017
6018	set_fs_pwd(current->fs, &root);
6019	set_fs_root(current->fs, &root);
6020
6021	ns_tree_add(&init_mnt_ns);
6022	}
6023
6024	void __init mnt_init(void)
6025	{
6026	int err;
6027
6028	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
6029	`0`, SLAB_HWCACHE_ALIGN\|SLAB_PANIC\|SLAB_ACCOUNT, NULL);
6030
6031	mount_hashtable = alloc_large_system_hash(tablename: "Mount-cache",
6032	bucketsize: sizeof(struct hlist_head),
6033	numentries: mhash_entries, scale: `19`,
6034	HASH_ZERO,
6035	hash_shift: &m_hash_shift, hash_mask: &m_hash_mask, low_limit: `0`, high_limit: `0`);
6036	mountpoint_hashtable = alloc_large_system_hash(tablename: "Mountpoint-cache",
6037	bucketsize: sizeof(struct hlist_head),
6038	numentries: mphash_entries, scale: `19`,
6039	HASH_ZERO,
6040	hash_shift: &mp_hash_shift, hash_mask: &mp_hash_mask, low_limit: `0`, high_limit: `0`);
6041
6042	if (!mount_hashtable \|\| !mountpoint_hashtable)
6043	panic(fmt: "Failed to allocate mount hash table\n");
6044
6045	kernfs_init();
6046
6047	err = sysfs_init();
6048	if (err)
6049	printk(KERN_WARNING "%s: sysfs_init error: %d\n",
6050	__func__, err);
6051	fs_kobj = kobject_create_and_add(name: "fs", NULL);
6052	if (!fs_kobj)
6053	printk(KERN_WARNING "%s: kobj create error\n", __func__);
6054	shmem_init();
6055	init_rootfs();
6056	init_mount_tree();
6057	}
6058
6059	void put_mnt_ns(struct mnt_namespace *ns)
6060	{
6061	if (!ns_ref_put(ns))
6062	return;
6063	guard(namespace_excl)();
6064	emptied_ns = ns;
6065	guard(mount_writer)();
6066	umount_tree(mnt: ns->root, how: `0`);
6067	}
6068
6069	struct vfsmount kern_mount(struct* file_system_type *type)
6070	{
6071	struct vfsmount *mnt;
6072	mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
6073	if (!IS_ERR(ptr: mnt)) {
6074	/*
6075	* it is a longterm mount, don't release mnt until
6076	* we unmount before file sys is unregistered
6077	*/
6078	real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
6079	}
6080	return mnt;
6081	}
6082	EXPORT_SYMBOL_GPL(kern_mount);
6083
/*
 * Release a longterm kernel mount.  Error pointers are ignored, so the
 * result of a failed kern_mount() can be passed in unchanged.
 */
void kern_unmount(struct vfsmount *mnt)
{
	if (IS_ERR(mnt))
		return;

	/* Turn the mount into a shortterm one so that it can be released. */
	mnt_make_shortterm(mnt);
	synchronize_rcu();	/* yecchhh... */
	mntput(mnt);
}
EXPORT_SYMBOL(kern_unmount);
6094
/*
 * Release the @num longterm kernel mounts in @mnt[]: all of them are made
 * shortterm first, then a single expedited RCU synchronisation is done for
 * the whole batch before the references are dropped.
 */
void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
{
	unsigned int idx;

	for (idx = 0; idx < num; idx++)
		mnt_make_shortterm(mnt[idx]);

	synchronize_rcu_expedited();

	for (idx = 0; idx < num; idx++)
		mntput(mnt[idx]);
}
EXPORT_SYMBOL(kern_unmount_array);
6106
6107	bool our_mnt(struct vfsmount *mnt)
6108	{
6109	return check_mnt(mnt: real_mount(mnt));
6110	}
6111
6112	bool current_chrooted(void)
6113	{
6114	/ Does the current process have a non-standard root /
6115	struct path fs_root __free(path_put) = {};
6116	struct mount *root;
6117
6118	get_fs_root(current->fs, root: &fs_root);
6119
6120	/ Find the namespace root /
6121
6122	guard(mount_locked_reader)();
6123
6124	root = topmost_overmount(current->nsproxy->mnt_ns->root);
6125
6126	return fs_root.mnt != &root->mnt \|\| !path_mounted(path: &fs_root);
6127	}
6128
6129	static bool mnt_already_visible(struct mnt_namespace *ns,
6130	const struct super_block *sb,
6131	int *new_mnt_flags)
6132	{
6133	int new_flags = *new_mnt_flags;
6134	struct mount mnt, n;
6135
6136	guard(namespace_shared)();
6137	rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
6138	struct mount *child;
6139	int mnt_flags;
6140
6141	if (mnt->mnt.mnt_sb->s_type != sb->s_type)
6142	continue;
6143
6144	/ This mount is not fully visible if it's root directory*
6145	* is not the root directory of the filesystem.
6146	*/
6147	if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
6148	continue;
6149
6150	/ A local view of the mount flags /
6151	mnt_flags = mnt->mnt.mnt_flags;
6152
6153	/ Don't miss readonly hidden in the superblock flags /
6154	if (sb_rdonly(sb: mnt->mnt.mnt_sb))
6155	mnt_flags \|= MNT_LOCK_READONLY;
6156
6157	/ Verify the mount flags are equal to or more permissive*
6158	* than the proposed new mount.
6159	*/
6160	if ((mnt_flags & MNT_LOCK_READONLY) &&
6161	!(new_flags & MNT_READONLY))
6162	continue;
6163	if ((mnt_flags & MNT_LOCK_ATIME) &&
6164	((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
6165	continue;
6166
6167	/ This mount is not fully visible if there are any*
6168	* locked child mounts that cover anything except for
6169	* empty directories.
6170	*/
6171	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
6172	struct inode *inode = child->mnt_mountpoint->d_inode;
6173	/ Only worry about locked mounts /
6174	if (!(child->mnt.mnt_flags & MNT_LOCKED))
6175	continue;
6176	/ Is the directory permanently empty? /
6177	if (!is_empty_dir_inode(inode))
6178	goto next;
6179	}
6180	/ Preserve the locked attributes /
6181	*new_mnt_flags \|= mnt_flags & (MNT_LOCK_READONLY \| \
6182	MNT_LOCK_ATIME);
6183	return true;
6184	next: ;
6185	}
6186	return false;
6187	}
6188
6189	static bool mount_too_revealing(const struct super_block sb, int* *new_mnt_flags)
6190	{
6191	const unsigned long required_iflags = SB_I_NOEXEC \| SB_I_NODEV;
6192	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
6193	unsigned long s_iflags;
6194
6195	if (ns->user_ns == &init_user_ns)
6196	return false;
6197
6198	/ Can this filesystem be too revealing? /
6199	s_iflags = sb->s_iflags;
6200	if (!(s_iflags & SB_I_USERNS_VISIBLE))
6201	return false;
6202
6203	if ((s_iflags & required_iflags) != required_iflags) {
6204	WARN_ONCE(`1`, "Expected s_iflags to contain 0x%lx\n",
6205	required_iflags);
6206	return true;
6207	}
6208
6209	return !mnt_already_visible(ns, sb, new_mnt_flags);
6210	}
6211
6212	bool mnt_may_suid(struct vfsmount *mnt)
6213	{
6214	/*
6215	* Foreign mounts (accessed via fchdir or through /proc
6216	* symlinks) are always treated as if they are nosuid. This
6217	* prevents namespaces from trusting potentially unsafe
6218	* suid/sgid bits, file caps, or security labels that originate
6219	* in other namespaces.
6220	*/
6221	return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(mnt: real_mount(mnt)) &&
6222	current_in_userns(target_ns: mnt->mnt_sb->s_user_ns);
6223	}
6224
6225	static struct ns_common mntns_get(struct* task_struct *task)
6226	{
6227	struct ns_common *ns = NULL;
6228	struct nsproxy *nsproxy;
6229
6230	task_lock(p: task);
6231	nsproxy = task->nsproxy;
6232	if (nsproxy) {
6233	ns = &nsproxy->mnt_ns->ns;
6234	get_mnt_ns(ns: to_mnt_ns(ns));
6235	}
6236	task_unlock(p: task);
6237
6238	return ns;
6239	}
6240
/* proc_ns ->put(): drop the reference on the mount namespace embedding @ns. */
static void mntns_put(struct ns_common *ns)
{
	struct mnt_namespace *mnt_ns = to_mnt_ns(ns);

	put_mnt_ns(mnt_ns);
}
6245
6246	static int mntns_install(struct nsset nsset, struct* ns_common *ns)
6247	{
6248	struct nsproxy *nsproxy = nsset->nsproxy;
6249	struct fs_struct *fs = nsset->fs;
6250	struct mnt_namespace mnt_ns = to_mnt_ns(ns), old_mnt_ns;
6251	struct user_namespace *user_ns = nsset->cred->user_ns;
6252	struct path root;
6253	int err;
6254
6255	if (!ns_capable(ns: mnt_ns->user_ns, CAP_SYS_ADMIN) \|\|
6256	!ns_capable(ns: user_ns, CAP_SYS_CHROOT) \|\|
6257	!ns_capable(ns: user_ns, CAP_SYS_ADMIN))
6258	return -EPERM;
6259
6260	if (is_anon_ns(ns: mnt_ns))
6261	return -EINVAL;
6262
6263	if (fs->users != `1`)
6264	return -EINVAL;
6265
6266	get_mnt_ns(ns: mnt_ns);
6267	old_mnt_ns = nsproxy->mnt_ns;
6268	nsproxy->mnt_ns = mnt_ns;
6269
6270	/ Find the root /
6271	err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
6272	"/", LOOKUP_DOWN, &root);
6273	if (err) {
6274	/ revert to old namespace /
6275	nsproxy->mnt_ns = old_mnt_ns;
6276	put_mnt_ns(ns: mnt_ns);
6277	return err;
6278	}
6279
6280	put_mnt_ns(ns: old_mnt_ns);
6281
6282	/ Update the pwd and root /
6283	set_fs_pwd(fs, &root);
6284	set_fs_root(fs, &root);
6285
6286	path_put(&root);
6287	return `0`;
6288	}
6289
6290	static struct user_namespace mntns_owner(struct* ns_common *ns)
6291	{
6292	return to_mnt_ns(ns)->user_ns;
6293	}
6294
6295	const struct proc_ns_operations mntns_operations = {
6296	.name = "mnt",
6297	.get = mntns_get,
6298	.put = mntns_put,
6299	.install = mntns_install,
6300	.owner = mntns_owner,
6301	};
6302
#ifdef CONFIG_SYSCTL
/* The "mount-max" sysctl (registered under "fs"), backed by sysctl_mount_max. */
static const struct ctl_table fs_namespace_sysctls[] = {
	{
		.procname	= "mount-max",
		.mode		= 0644,
		.data		= &sysctl_mount_max,
		.maxlen		= sizeof(unsigned int),
		.extra1		= SYSCTL_ONE,	/* lower bound for proc_dointvec_minmax */
		.proc_handler	= proc_dointvec_minmax,
	},
};

/* Register the sysctl table above; always returns 0. */
static int __init init_fs_namespace_sysctls(void)
{
	register_sysctl_init("fs", fs_namespace_sysctls);
	return 0;
}
fs_initcall(init_fs_namespace_sysctls);

#endif /* CONFIG_SYSCTL */
6323

Browse the source code of Linux/fs/namespace.c