pidfs.c source code [Linux/fs/pidfs.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#include <linux/anon_inodes.h>
3	#include <linux/exportfs.h>
4	#include <linux/file.h>
5	#include <linux/fs.h>
6	#include <linux/cgroup.h>
7	#include <linux/magic.h>
8	#include <linux/mount.h>
9	#include <linux/pid.h>
10	#include <linux/pidfs.h>
11	#include <linux/pid_namespace.h>
12	#include <linux/poll.h>
13	#include <linux/proc_fs.h>
14	#include <linux/proc_ns.h>
15	#include <linux/pseudo_fs.h>
16	#include <linux/ptrace.h>
17	#include <linux/seq_file.h>
18	#include <uapi/linux/pidfd.h>
19	#include <linux/ipc_namespace.h>
20	#include <linux/time_namespace.h>
21	#include <linux/utsname.h>
22	#include <net/net_namespace.h>
23	#include <linux/coredump.h>
24	#include <linux/xattr.h>
25
26	#include "internal.h"
27	#include "mount.h"
28
29	#define PIDFS_PID_DEAD ERR_PTR(-ESRCH)
30
31	static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
32	static struct kmem_cache *pidfs_xattr_cachep __ro_after_init;
33
34	static struct path pidfs_root_path = {};
35
36	void pidfs_get_root(struct path *path)
37	{
38	*path = pidfs_root_path;
39	path_get(path);
40	}
41
42	/*
43	* Stashes information that userspace needs to access even after the
44	* process has been reaped.
45	*/
46	struct pidfs_exit_info {
47	__u64 cgroupid;
48	__s32 exit_code;
49	__u32 coredump_mask;
50	};
51
52	struct pidfs_attr {
53	struct simple_xattrs *xattrs;
54	struct pidfs_exit_info __pei;
55	struct pidfs_exit_info *exit_info;
56	};
57
58	static struct rb_root pidfs_ino_tree = RB_ROOT;
59
60	#if BITS_PER_LONG == 32
61	static inline unsigned long pidfs_ino(u64 ino)
62	{
63	return lower_32_bits(ino);
64	}
65
66	/ On 32 bit the generation number are the upper 32 bits. /
67	static inline u32 pidfs_gen(u64 ino)
68	{
69	return upper_32_bits(ino);
70	}
71
72	#else
73
74	/ On 64 bit simply return ino. /
75	static inline unsigned long pidfs_ino(u64 ino)
76	{
77	return ino;
78	}
79
80	/ On 64 bit the generation number is 0. /
81	static inline u32 pidfs_gen(u64 ino)
82	{
83	return `0`;
84	}
85	#endif
86
87	static int pidfs_ino_cmp(struct rb_node a, const* struct rb_node *b)
88	{
89	struct pid pid_a = rb_entry(a, struct* pid, pidfs_node);
90	struct pid pid_b = rb_entry(b, struct* pid, pidfs_node);
91	u64 pid_ino_a = pid_a->ino;
92	u64 pid_ino_b = pid_b->ino;
93
94	if (pid_ino_a < pid_ino_b)
95	return -`1`;
96	if (pid_ino_a > pid_ino_b)
97	return `1`;
98	return `0`;
99	}
100
101	void pidfs_add_pid(struct pid *pid)
102	{
103	static u64 pidfs_ino_nr = `2`;
104
105	/*
106	* On 64 bit nothing special happens. The 64bit number assigned
107	* to struct pid is the inode number.
108	*
109	* On 32 bit the 64 bit number assigned to struct pid is split
110	* into two 32 bit numbers. The lower 32 bits are used as the
111	* inode number and the upper 32 bits are used as the inode
112	* generation number.
113	*
114	* On 32 bit pidfs_ino() will return the lower 32 bit. When
115	* pidfs_ino() returns zero a wrap around happened. When a
116	* wraparound happens the 64 bit number will be incremented by 2
117	* so inode numbering starts at 2 again.
118	*
119	* On 64 bit comparing two pidfds is as simple as comparing
120	* inode numbers.
121	*
122	* When a wraparound happens on 32 bit multiple pidfds with the
123	* same inode number are likely to exist (This isn't a problem
124	* since before pidfs pidfds used the anonymous inode meaning
125	* all pidfds had the same inode number.). Userspace can
126	* reconstruct the 64 bit identifier by retrieving both the
127	* inode number and the inode generation number to compare or
128	* use file handles.
129	*/
130	if (pidfs_ino(ino: pidfs_ino_nr) == `0`)
131	pidfs_ino_nr += `2`;
132
133	pid->ino = pidfs_ino_nr;
134	pid->stashed = NULL;
135	pid->attr = NULL;
136	pidfs_ino_nr++;
137
138	write_seqcount_begin(&pidmap_lock_seq);
139	rb_find_add_rcu(node: &pid->pidfs_node, tree: &pidfs_ino_tree, cmp: pidfs_ino_cmp);
140	write_seqcount_end(&pidmap_lock_seq);
141	}
142
143	void pidfs_remove_pid(struct pid *pid)
144	{
145	write_seqcount_begin(&pidmap_lock_seq);
146	rb_erase(&pid->pidfs_node, &pidfs_ino_tree);
147	write_seqcount_end(&pidmap_lock_seq);
148	}
149
150	void pidfs_free_pid(struct pid *pid)
151	{
152	struct pidfs_attr *attr __free(kfree) = no_free_ptr(pid->attr);
153	struct simple_xattrs *xattrs __free(kfree) = NULL;
154
155	/*
156	* Any dentry must've been wiped from the pid by now.
157	* Otherwise there's a reference count bug.
158	*/
159	VFS_WARN_ON_ONCE(pid->stashed);
160
161	/*
162	* This if an error occurred during e.g., task creation that
163	* causes us to never go through the exit path.
164	*/
165	if (unlikely(!attr))
166	return;
167
168	/ This never had a pidfd created. /
169	if (IS_ERR(ptr: attr))
170	return;
171
172	xattrs = no_free_ptr(attr->xattrs);
173	if (xattrs)
174	simple_xattrs_free(xattrs, NULL);
175	}
176
#ifdef CONFIG_PROC_FS
/**
 * pidfd_show_fdinfo - print information about a pidfd
 * @m: proc fdinfo file
 * @f: file referencing a pidfd
 *
 * Pid:
 * This function will print the pid that a given pidfd refers to in the
 * pid namespace of the procfs instance.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its pid. This is
 * similar to calling getppid() on a process whose parent is outside of
 * its pid namespace.
 *
 * NSpid:
 * If pid namespaces are supported then this function will also print
 * the pid of a given pidfd refers to for all descendant pid namespaces
 * starting from the current pid namespace of the instance, i.e. the
 * Pid field and the first entry in the NSpid field will be identical.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its first NSpid
 * entry and no others will be shown.
 * Note that this differs from the Pid and NSpid fields in
 * /proc/<pid>/status where Pid and NSpid are always shown relative to
 * the pid namespace of the procfs instance. The difference becomes
 * obvious when sending around a pidfd between pid namespaces from a
 * different branch of the tree, i.e. where no ancestral relation is
 * present between the pid namespaces:
 * - create two new pid namespaces ns1 and ns2 in the initial pid
 *   namespace (also take care to create new mount namespaces in the
 *   new pid namespace and mount procfs)
 * - create a process with a pidfd in ns1
 * - send pidfd from ns1 to ns2
 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
 *   have exactly one entry, which is 0
 *
 * If the struct pid no longer has a task attached, -1 is printed for
 * both fields.
 */
static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct pid *pid = pidfd_pid(f);
	struct pid_namespace *ns;
	pid_t nr = -1;

	if (likely(pid_has_task(pid, PIDTYPE_PID))) {
		ns = proc_pid_ns(file_inode(m->file)->i_sb);
		nr = pid_nr_ns(pid, ns);
	}

	seq_put_decimal_ll(m, "Pid:\t", nr);

#ifdef CONFIG_PID_NS
	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
	if (nr > 0) {
		int i;

		/* If nr is non-zero it means that 'pid' is valid and that
		 * ns, i.e. the pid namespace associated with the procfs
		 * instance, is in the pid namespace hierarchy of pid.
		 * Start at one below the already printed level.
		 */
		for (i = ns->level + 1; i <= pid->level; i++)
			seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
	}
#endif
	seq_putc(m, '\n');
}
#endif
243
244	/*
245	* Poll support for process exit notification.
246	*/
247	static __poll_t pidfd_poll(struct file file, struct* poll_table_struct *pts)
248	{
249	struct pid *pid = pidfd_pid(file);
250	struct task_struct *task;
251	__poll_t poll_flags = `0`;
252
253	poll_wait(filp: file, wait_address: &pid->wait_pidfd, p: pts);
254	/*
255	* Don't wake waiters if the thread-group leader exited
256	* prematurely. They either get notified when the last subthread
257	* exits or not at all if one of the remaining subthreads execs
258	* and assumes the struct pid of the old thread-group leader.
259	*/
260	guard(rcu)();
261	task = pid_task(pid, PIDTYPE_PID);
262	if (!task)
263	poll_flags = EPOLLIN \| EPOLLRDNORM \| EPOLLHUP;
264	else if (task->exit_state && !delay_group_leader(task))
265	poll_flags = EPOLLIN \| EPOLLRDNORM;
266
267	return poll_flags;
268	}
269
270	static inline bool pid_in_current_pidns(const struct pid *pid)
271	{
272	const struct pid_namespace *ns = task_active_pid_ns(current);
273
274	if (ns->level <= pid->level)
275	return pid->numbers[ns->level].ns == ns;
276
277	return false;
278	}
279
280	static __u32 pidfs_coredump_mask(unsigned long mm_flags)
281	{
282	switch (__get_dumpable(mm_flags)) {
283	case SUID_DUMP_USER:
284	return PIDFD_COREDUMP_USER;
285	case SUID_DUMP_ROOT:
286	return PIDFD_COREDUMP_ROOT;
287	case SUID_DUMP_DISABLE:
288	return PIDFD_COREDUMP_SKIP;
289	default:
290	WARN_ON_ONCE(true);
291	}
292
293	return `0`;
294	}
295
296	static long pidfd_info(struct file file, unsigned* int cmd, unsigned long arg)
297	{
298	struct pidfd_info __user uinfo = (struct* pidfd_info __user *)arg;
299	struct task_struct *task __free(put_task) = NULL;
300	struct pid *pid = pidfd_pid(file);
301	size_t usize = _IOC_SIZE(cmd);
302	struct pidfd_info kinfo = {};
303	struct pidfs_exit_info *exit_info;
304	struct user_namespace *user_ns;
305	struct pidfs_attr *attr;
306	const struct cred *c;
307	__u64 mask;
308
309	if (!uinfo)
310	return -EINVAL;
311	if (usize < PIDFD_INFO_SIZE_VER0)
312	return -EINVAL; / First version, no smaller struct possible /
313
314	if (copy_from_user(to: &mask, from: &uinfo->mask, n: sizeof(mask)))
315	return -EFAULT;
316
317	/*
318	* Restrict information retrieval to tasks within the caller's pid
319	* namespace hierarchy.
320	*/
321	if (!pid_in_current_pidns(pid))
322	return -ESRCH;
323
324	attr = READ_ONCE(pid->attr);
325	if (mask & PIDFD_INFO_EXIT) {
326	exit_info = READ_ONCE(attr->exit_info);
327	if (exit_info) {
328	kinfo.mask \|= PIDFD_INFO_EXIT;
329	#ifdef CONFIG_CGROUPS
330	kinfo.cgroupid = exit_info->cgroupid;
331	kinfo.mask \|= PIDFD_INFO_CGROUPID;
332	#endif
333	kinfo.exit_code = exit_info->exit_code;
334	}
335	}
336
337	if (mask & PIDFD_INFO_COREDUMP) {
338	kinfo.mask \|= PIDFD_INFO_COREDUMP;
339	kinfo.coredump_mask = READ_ONCE(attr->__pei.coredump_mask);
340	}
341
342	task = get_pid_task(pid, PIDTYPE_PID);
343	if (!task) {
344	/*
345	* If the task has already been reaped, only exit
346	* information is available
347	*/
348	if (!(mask & PIDFD_INFO_EXIT))
349	return -ESRCH;
350
351	goto copy_out;
352	}
353
354	c = get_task_cred(task);
355	if (!c)
356	return -ESRCH;
357
358	if ((kinfo.mask & PIDFD_INFO_COREDUMP) && !(kinfo.coredump_mask)) {
359	task_lock(p: task);
360	if (task->mm) {
361	unsigned long flags = __mm_flags_get_dumpable(mm: task->mm);
362
363	kinfo.coredump_mask = pidfs_coredump_mask(mm_flags: flags);
364	}
365	task_unlock(p: task);
366	}
367
368	/ Unconditionally return identifiers and credentials, the rest only on request /
369
370	user_ns = current_user_ns();
371	kinfo.ruid = from_kuid_munged(to: user_ns, kuid: c->uid);
372	kinfo.rgid = from_kgid_munged(to: user_ns, kgid: c->gid);
373	kinfo.euid = from_kuid_munged(to: user_ns, kuid: c->euid);
374	kinfo.egid = from_kgid_munged(to: user_ns, kgid: c->egid);
375	kinfo.suid = from_kuid_munged(to: user_ns, kuid: c->suid);
376	kinfo.sgid = from_kgid_munged(to: user_ns, kgid: c->sgid);
377	kinfo.fsuid = from_kuid_munged(to: user_ns, kuid: c->fsuid);
378	kinfo.fsgid = from_kgid_munged(to: user_ns, kgid: c->fsgid);
379	kinfo.mask \|= PIDFD_INFO_CREDS;
380	put_cred(cred: c);
381
382	#ifdef CONFIG_CGROUPS
383	if (!kinfo.cgroupid) {
384	struct cgroup *cgrp;
385
386	rcu_read_lock();
387	cgrp = task_dfl_cgroup(task);
388	kinfo.cgroupid = cgroup_id(cgrp);
389	kinfo.mask \|= PIDFD_INFO_CGROUPID;
390	rcu_read_unlock();
391	}
392	#endif
393
394	/*
395	* Copy pid/tgid last, to reduce the chances the information might be
396	* stale. Note that it is not possible to ensure it will be valid as the
397	* task might return as soon as the copy_to_user finishes, but that's ok
398	* and userspace expects that might happen and can act accordingly, so
399	* this is just best-effort. What we can do however is checking that all
400	* the fields are set correctly, or return ESRCH to avoid providing
401	* incomplete information. */
402
403	kinfo.ppid = task_ppid_nr_ns(tsk: task, NULL);
404	kinfo.tgid = task_tgid_vnr(tsk: task);
405	kinfo.pid = task_pid_vnr(tsk: task);
406	kinfo.mask \|= PIDFD_INFO_PID;
407
408	if (kinfo.pid == `0` \|\| kinfo.tgid == `0`)
409	return -ESRCH;
410
411	copy_out:
412	/*
413	* If userspace and the kernel have the same struct size it can just
414	* be copied. If userspace provides an older struct, only the bits that
415	* userspace knows about will be copied. If userspace provides a new
416	* struct, only the bits that the kernel knows about will be copied.
417	*/
418	return copy_struct_to_user(dst: uinfo, usize, src: &kinfo, ksize: sizeof(kinfo), NULL);
419	}
420
421	static bool pidfs_ioctl_valid(unsigned int cmd)
422	{
423	switch (cmd) {
424	case FS_IOC_GETVERSION:
425	case PIDFD_GET_CGROUP_NAMESPACE:
426	case PIDFD_GET_IPC_NAMESPACE:
427	case PIDFD_GET_MNT_NAMESPACE:
428	case PIDFD_GET_NET_NAMESPACE:
429	case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
430	case PIDFD_GET_TIME_NAMESPACE:
431	case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
432	case PIDFD_GET_UTS_NAMESPACE:
433	case PIDFD_GET_USER_NAMESPACE:
434	case PIDFD_GET_PID_NAMESPACE:
435	return true;
436	}
437
438	/ Extensible ioctls require some more careful checks. /
439	switch (_IOC_NR(cmd)) {
440	case _IOC_NR(PIDFD_GET_INFO):
441	/*
442	* Try to prevent performing a pidfd ioctl when someone
443	* erronously mistook the file descriptor for a pidfd.
444	* This is not perfect but will catch most cases.
445	*/
446	return extensible_ioctl_valid(cmd_a: cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0);
447	}
448
449	return false;
450	}
451
452	static long pidfd_ioctl(struct file file, unsigned* int cmd, unsigned long arg)
453	{
454	struct task_struct *task __free(put_task) = NULL;
455	struct nsproxy *nsp __free(put_nsproxy) = NULL;
456	struct ns_common *ns_common = NULL;
457	struct pid_namespace *pid_ns;
458
459	if (!pidfs_ioctl_valid(cmd))
460	return -ENOIOCTLCMD;
461
462	if (cmd == FS_IOC_GETVERSION) {
463	if (!arg)
464	return -EINVAL;
465
466	__u32 __user argp = (__u32 __user )arg;
467	return put_user(file_inode(file)->i_generation, argp);
468	}
469
470	/ Extensible IOCTL that does not open namespace FDs, take a shortcut /
471	if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
472	return pidfd_info(file, cmd, arg);
473
474	task = get_pid_task(pid: pidfd_pid(file), PIDTYPE_PID);
475	if (!task)
476	return -ESRCH;
477
478	if (arg)
479	return -EINVAL;
480
481	scoped_guard(task_lock, task) {
482	nsp = task->nsproxy;
483	if (nsp)
484	get_nsproxy(ns: nsp);
485	}
486	if (!nsp)
487	return -ESRCH; / just pretend it didn't exist /
488
489	/*
490	* We're trying to open a file descriptor to the namespace so perform a
491	* filesystem cred ptrace check. Also, we mirror nsfs behavior.
492	*/
493	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
494	return -EACCES;
495
496	switch (cmd) {
497	/ Namespaces that hang of nsproxy. /
498	case PIDFD_GET_CGROUP_NAMESPACE:
499	if (IS_ENABLED(CONFIG_CGROUPS)) {
500	get_cgroup_ns(ns: nsp->cgroup_ns);
501	ns_common = to_ns_common(nsp->cgroup_ns);
502	}
503	break;
504	case PIDFD_GET_IPC_NAMESPACE:
505	if (IS_ENABLED(CONFIG_IPC_NS)) {
506	get_ipc_ns(ns: nsp->ipc_ns);
507	ns_common = to_ns_common(nsp->ipc_ns);
508	}
509	break;
510	case PIDFD_GET_MNT_NAMESPACE:
511	get_mnt_ns(ns: nsp->mnt_ns);
512	ns_common = to_ns_common(nsp->mnt_ns);
513	break;
514	case PIDFD_GET_NET_NAMESPACE:
515	if (IS_ENABLED(CONFIG_NET_NS)) {
516	ns_common = to_ns_common(nsp->net_ns);
517	get_net_ns(ns: ns_common);
518	}
519	break;
520	case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
521	if (IS_ENABLED(CONFIG_PID_NS)) {
522	get_pid_ns(ns: nsp->pid_ns_for_children);
523	ns_common = to_ns_common(nsp->pid_ns_for_children);
524	}
525	break;
526	case PIDFD_GET_TIME_NAMESPACE:
527	if (IS_ENABLED(CONFIG_TIME_NS)) {
528	get_time_ns(ns: nsp->time_ns);
529	ns_common = to_ns_common(nsp->time_ns);
530	}
531	break;
532	case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
533	if (IS_ENABLED(CONFIG_TIME_NS)) {
534	get_time_ns(ns: nsp->time_ns_for_children);
535	ns_common = to_ns_common(nsp->time_ns_for_children);
536	}
537	break;
538	case PIDFD_GET_UTS_NAMESPACE:
539	if (IS_ENABLED(CONFIG_UTS_NS)) {
540	get_uts_ns(ns: nsp->uts_ns);
541	ns_common = to_ns_common(nsp->uts_ns);
542	}
543	break;
544	/ Namespaces that don't hang of nsproxy. /
545	case PIDFD_GET_USER_NAMESPACE:
546	if (IS_ENABLED(CONFIG_USER_NS)) {
547	rcu_read_lock();
548	ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns)));
549	rcu_read_unlock();
550	}
551	break;
552	case PIDFD_GET_PID_NAMESPACE:
553	if (IS_ENABLED(CONFIG_PID_NS)) {
554	rcu_read_lock();
555	pid_ns = task_active_pid_ns(tsk: task);
556	if (pid_ns)
557	ns_common = to_ns_common(get_pid_ns(pid_ns));
558	rcu_read_unlock();
559	}
560	break;
561	default:
562	return -ENOIOCTLCMD;
563	}
564
565	if (!ns_common)
566	return -EOPNOTSUPP;
567
568	/ open_namespace() unconditionally consumes the reference /
569	return open_namespace(ns: ns_common);
570	}
571
572	static const struct file_operations pidfs_file_operations = {
573	.poll = pidfd_poll,
574	#ifdef CONFIG_PROC_FS
575	.show_fdinfo = pidfd_show_fdinfo,
576	#endif
577	.unlocked_ioctl = pidfd_ioctl,
578	.compat_ioctl = compat_ptr_ioctl,
579	};
580
581	struct pid pidfd_pid(const* struct file *file)
582	{
583	if (file->f_op != &pidfs_file_operations)
584	return ERR_PTR(error: -EBADF);
585	return file_inode(f: file)->i_private;
586	}
587
588	/*
589	* We're called from release_task(). We know there's at least one
590	* reference to struct pid being held that won't be released until the
591	* task has been reaped which cannot happen until we're out of
592	* release_task().
593	*
594	* If this struct pid has at least once been referred to by a pidfd then
595	* pid->attr will be allocated. If not we mark the struct pid as dead so
596	* anyone who is trying to register it with pidfs will fail to do so.
597	* Otherwise we would hand out pidfs for reaped tasks without having
598	* exit information available.
599	*
600	* Worst case is that we've filled in the info and the pid gets freed
601	* right away in free_pid() when no one holds a pidfd anymore. Since
602	* pidfs_exit() currently is placed after exit_task_work() we know that
603	* it cannot be us aka the exiting task holding a pidfd to itself.
604	*/
605	void pidfs_exit(struct task_struct *tsk)
606	{
607	struct pid *pid = task_pid(task: tsk);
608	struct pidfs_attr *attr;
609	struct pidfs_exit_info *exit_info;
610	#ifdef CONFIG_CGROUPS
611	struct cgroup *cgrp;
612	#endif
613
614	might_sleep();
615
616	guard(spinlock_irq)(l: &pid->wait_pidfd.lock);
617	attr = pid->attr;
618	if (!attr) {
619	/*
620	* No one ever held a pidfd for this struct pid.
621	* Mark it as dead so no one can add a pidfs
622	* entry anymore. We're about to be reaped and
623	* so no exit information would be available.
624	*/
625	pid->attr = PIDFS_PID_DEAD;
626	return;
627	}
628
629	/*
630	* If @pid->attr is set someone might still legitimately hold a
631	* pidfd to @pid or someone might concurrently still be getting
632	* a reference to an already stashed dentry from @pid->stashed.
633	* So defer cleaning @pid->attr until the last reference to @pid
634	* is put
635	*/
636
637	exit_info = &attr->__pei;
638
639	#ifdef CONFIG_CGROUPS
640	rcu_read_lock();
641	cgrp = task_dfl_cgroup(task: tsk);
642	exit_info->cgroupid = cgroup_id(cgrp);
643	rcu_read_unlock();
644	#endif
645	exit_info->exit_code = tsk->exit_code;
646
647	/ Ensure that PIDFD_GET_INFO sees either all or nothing. /
648	smp_store_release(&attr->exit_info, &attr->__pei);
649	}
650
#ifdef CONFIG_COREDUMP
/**
 * pidfs_coredump - record coredump information for a struct pid
 * @cprm: coredump parameters; @cprm->pid and @cprm->mm_flags are used
 *
 * Stores the PIDFD_COREDUMP* bits in the exit information embedded in
 * pid->attr. The attribute is expected to exist and to not be marked
 * dead; both conditions are only checked via VFS_WARN_ON_ONCE().
 */
void pidfs_coredump(const struct coredump_params *cprm)
{
	struct pid *pid = cprm->pid;
	struct pidfs_exit_info *exit_info;
	struct pidfs_attr *attr;
	__u32 coredump_mask = 0;

	attr = READ_ONCE(pid->attr);

	VFS_WARN_ON_ONCE(!attr);
	VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);

	exit_info = &attr->__pei;
	/* Note how we were coredumped. */
	coredump_mask = pidfs_coredump_mask(cprm->mm_flags);
	/* Note that we actually did coredump. */
	coredump_mask |= PIDFD_COREDUMPED;
	/* If coredumping is set to skip we should never end up here. */
	VFS_WARN_ON_ONCE(coredump_mask & PIDFD_COREDUMP_SKIP);
	smp_store_release(&exit_info->coredump_mask, coredump_mask);
}
#endif
674
675	static struct vfsmount *pidfs_mnt __ro_after_init;
676
/*
 * The vfs falls back to simple_setattr() if i_op->setattr() isn't
 * implemented. Let's reject it completely until we have a clean
 * permission concept for pidfds.
 *
 * The actual decision is delegated to anon_inode_setattr().
 */
static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			 struct iattr *attr)
{
	return anon_inode_setattr(idmap, dentry, attr);
}
687
688	static int pidfs_getattr(struct mnt_idmap idmap, const* struct path *path,
689	struct kstat *stat, u32 request_mask,
690	unsigned int query_flags)
691	{
692	return anon_inode_getattr(idmap, path, stat, request_mask, query_flags);
693	}
694
695	static ssize_t pidfs_listxattr(struct dentry dentry, char* *buf, size_t size)
696	{
697	struct inode *inode = d_inode(dentry);
698	struct pid *pid = inode->i_private;
699	struct pidfs_attr *attr = pid->attr;
700	struct simple_xattrs *xattrs;
701
702	xattrs = READ_ONCE(attr->xattrs);
703	if (!xattrs)
704	return `0`;
705
706	return simple_xattr_list(inode, xattrs, buffer: buf, size);
707	}
708
709	static const struct inode_operations pidfs_inode_operations = {
710	.getattr = pidfs_getattr,
711	.setattr = pidfs_setattr,
712	.listxattr = pidfs_listxattr,
713	};
714
715	static void pidfs_evict_inode(struct inode *inode)
716	{
717	struct pid *pid = inode->i_private;
718
719	clear_inode(inode);
720	put_pid(pid);
721	}
722
723	static const struct super_operations pidfs_sops = {
724	.drop_inode = inode_just_drop,
725	.evict_inode = pidfs_evict_inode,
726	.statfs = simple_statfs,
727	};
728
/*
 * 'lsof' has knowledge of our historical anon_inode use, and expects
 * the pidfs dentry name to start with 'anon_inode'.
 *
 * Every pidfs dentry therefore reports the fixed name
 * "anon_inode:[pidfd]", formatted into @buffer of size @buflen.
 */
static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]");
}
737
738	const struct dentry_operations pidfs_dentry_operations = {
739	.d_dname = pidfs_dname,
740	.d_prune = stashed_dentry_prune,
741	};
742
743	static int pidfs_encode_fh(struct inode inode, u32 fh, int *max_len,
744	struct inode *parent)
745	{
746	const struct pid *pid = inode->i_private;
747
748	if (*max_len < `2`) {
749	*max_len = `2`;
750	return FILEID_INVALID;
751	}
752
753	*max_len = `2`;
754	(u64 )fh = pid->ino;
755	return FILEID_KERNFS;
756	}
757
758	static int pidfs_ino_find(const void key, const* struct rb_node *node)
759	{
760	const u64 pid_ino = (u64 )key;
761	const struct pid pid = rb_entry(node, struct* pid, pidfs_node);
762
763	if (pid_ino < pid->ino)
764	return -`1`;
765	if (pid_ino > pid->ino)
766	return `1`;
767	return `0`;
768	}
769
770	/ Find a struct pid based on the inode number. /
771	static struct pid *pidfs_ino_get_pid(u64 ino)
772	{
773	struct pid *pid;
774	struct rb_node *node;
775	unsigned int seq;
776
777	guard(rcu)();
778	do {
779	seq = read_seqcount_begin(&pidmap_lock_seq);
780	node = rb_find_rcu(key: &ino, tree: &pidfs_ino_tree, cmp: pidfs_ino_find);
781	if (node)
782	break;
783	} while (read_seqcount_retry(&pidmap_lock_seq, seq));
784
785	if (!node)
786	return NULL;
787
788	pid = rb_entry(node, struct pid, pidfs_node);
789
790	/ Within our pid namespace hierarchy? /
791	if (pid_vnr(pid) == `0`)
792	return NULL;
793
794	return get_pid(pid);
795	}
796
797	static struct dentry pidfs_fh_to_dentry(struct* super_block *sb,
798	struct fid fid, int* fh_len,
799	int fh_type)
800	{
801	int ret;
802	u64 pid_ino;
803	struct path path;
804	struct pid *pid;
805
806	if (fh_len < `2`)
807	return NULL;
808
809	switch (fh_type) {
810	case FILEID_KERNFS:
811	pid_ino = (u64 )fid;
812	break;
813	default:
814	return NULL;
815	}
816
817	pid = pidfs_ino_get_pid(ino: pid_ino);
818	if (!pid)
819	return NULL;
820
821	ret = path_from_stashed(stashed: &pid->stashed, mnt: pidfs_mnt, data: pid, path: &path);
822	if (ret < `0`)
823	return ERR_PTR(error: ret);
824
825	VFS_WARN_ON_ONCE(!pid->attr);
826
827	mntput(mnt: path.mnt);
828	return path.dentry;
829	}
830
/*
 * Make sure that we reject any nonsensical flags that users pass via
 * open_by_handle_at(). Note that PIDFD_THREAD is defined as O_EXCL, and
 * PIDFD_NONBLOCK as O_NONBLOCK.
 */
#define VALID_FILE_HANDLE_OPEN_FLAGS \
	(O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL)
838
839	static int pidfs_export_permission(struct handle_to_path_ctx *ctx,
840	unsigned int oflags)
841	{
842	if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS \| O_LARGEFILE))
843	return -EINVAL;
844
845	/*
846	* pidfd_ino_get_pid() will verify that the struct pid is part
847	* of the caller's pid namespace hierarchy. No further
848	* permission checks are needed.
849	*/
850	return `0`;
851	}
852
853	static struct file pidfs_export_open(const* struct path path, unsigned* int oflags)
854	{
855	/*
856	* Clear O_LARGEFILE as open_by_handle_at() forces it and raise
857	* O_RDWR as pidfds always are.
858	*/
859	oflags &= ~O_LARGEFILE;
860	return dentry_open(path, flags: oflags \| O_RDWR, current_cred());
861	}
862
863	static const struct export_operations pidfs_export_operations = {
864	.encode_fh = pidfs_encode_fh,
865	.fh_to_dentry = pidfs_fh_to_dentry,
866	.open = pidfs_export_open,
867	.permission = pidfs_export_permission,
868	};
869
870	static int pidfs_init_inode(struct inode inode, void* *data)
871	{
872	const struct pid *pid = data;
873
874	inode->i_private = data;
875	inode->i_flags \|= S_PRIVATE \| S_ANON_INODE;
876	/ We allow to set xattrs. /
877	inode->i_flags &= ~S_IMMUTABLE;
878	inode->i_mode \|= S_IRWXU;
879	inode->i_op = &pidfs_inode_operations;
880	inode->i_fop = &pidfs_file_operations;
881	inode->i_ino = pidfs_ino(ino: pid->ino);
882	inode->i_generation = pidfs_gen(ino: pid->ino);
883	return `0`;
884	}
885
/*
 * ->put_data() callback of pidfs_stashed_ops: drop the struct pid
 * reference passed as @data.
 */
static void pidfs_put_data(void *data)
{
	struct pid *pid = data;

	put_pid(pid);
}
891
892	/**
893	* pidfs_register_pid - register a struct pid in pidfs
894	* @pid: pid to pin
895	*
896	* Register a struct pid in pidfs.
897	*
898	* Return: On success zero, on error a negative error code is returned.
899	*/
900	int pidfs_register_pid(struct pid *pid)
901	{
902	struct pidfs_attr *new_attr __free(kfree) = NULL;
903	struct pidfs_attr *attr;
904
905	might_sleep();
906
907	if (!pid)
908	return `0`;
909
910	attr = READ_ONCE(pid->attr);
911	if (unlikely(attr == PIDFS_PID_DEAD))
912	return PTR_ERR(PIDFS_PID_DEAD);
913	if (attr)
914	return `0`;
915
916	new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL);
917	if (!new_attr)
918	return -ENOMEM;
919
920	/ Synchronize with pidfs_exit(). /
921	guard(spinlock_irq)(l: &pid->wait_pidfd.lock);
922
923	attr = pid->attr;
924	if (unlikely(attr == PIDFS_PID_DEAD))
925	return PTR_ERR(PIDFS_PID_DEAD);
926	if (unlikely(attr))
927	return `0`;
928
929	pid->attr = no_free_ptr(new_attr);
930	return `0`;
931	}
932
933	static struct dentry pidfs_stash_dentry(struct* dentry **stashed,
934	struct dentry *dentry)
935	{
936	int ret;
937	struct pid *pid = d_inode(dentry)->i_private;
938
939	VFS_WARN_ON_ONCE(stashed != &pid->stashed);
940
941	ret = pidfs_register_pid(pid);
942	if (ret)
943	return ERR_PTR(error: ret);
944
945	return stash_dentry(stashed, dentry);
946	}
947
948	static const struct stashed_operations pidfs_stashed_ops = {
949	.stash_dentry = pidfs_stash_dentry,
950	.init_inode = pidfs_init_inode,
951	.put_data = pidfs_put_data,
952	};
953
954	static int pidfs_xattr_get(const struct xattr_handler *handler,
955	struct dentry unused, struct* inode *inode,
956	const char suffix, void* *value, size_t size)
957	{
958	struct pid *pid = inode->i_private;
959	struct pidfs_attr *attr = pid->attr;
960	const char *name;
961	struct simple_xattrs *xattrs;
962
963	xattrs = READ_ONCE(attr->xattrs);
964	if (!xattrs)
965	return `0`;
966
967	name = xattr_full_name(handler, suffix);
968	return simple_xattr_get(xattrs, name, buffer: value, size);
969	}
970
971	static int pidfs_xattr_set(const struct xattr_handler *handler,
972	struct mnt_idmap idmap, struct* dentry *unused,
973	struct inode inode, const* char *suffix,
974	const void value, size_t size, int* flags)
975	{
976	struct pid *pid = inode->i_private;
977	struct pidfs_attr *attr = pid->attr;
978	const char *name;
979	struct simple_xattrs *xattrs;
980	struct simple_xattr *old_xattr;
981
982	/ Ensure we're the only one to set @attr->xattrs. /
983	WARN_ON_ONCE(!inode_is_locked(inode));
984
985	xattrs = READ_ONCE(attr->xattrs);
986	if (!xattrs) {
987	xattrs = kmem_cache_zalloc(pidfs_xattr_cachep, GFP_KERNEL);
988	if (!xattrs)
989	return -ENOMEM;
990
991	simple_xattrs_init(xattrs);
992	smp_store_release(&pid->attr->xattrs, xattrs);
993	}
994
995	name = xattr_full_name(handler, suffix);
996	old_xattr = simple_xattr_set(xattrs, name, value, size, flags);
997	if (IS_ERR(ptr: old_xattr))
998	return PTR_ERR(ptr: old_xattr);
999
1000	simple_xattr_free(xattr: old_xattr);
1001	return `0`;
1002	}
1003
1004	static const struct xattr_handler pidfs_trusted_xattr_handler = {
1005	.prefix = XATTR_TRUSTED_PREFIX,
1006	.get = pidfs_xattr_get,
1007	.set = pidfs_xattr_set,
1008	};
1009
1010	static const struct xattr_handler *const pidfs_xattr_handlers[] = {
1011	&pidfs_trusted_xattr_handler,
1012	NULL
1013	};
1014
1015	static int pidfs_init_fs_context(struct fs_context *fc)
1016	{
1017	struct pseudo_fs_context *ctx;
1018
1019	ctx = init_pseudo(fc, PID_FS_MAGIC);
1020	if (!ctx)
1021	return -ENOMEM;
1022
1023	fc->s_iflags \|= SB_I_NOEXEC;
1024	fc->s_iflags \|= SB_I_NODEV;
1025	ctx->ops = &pidfs_sops;
1026	ctx->eops = &pidfs_export_operations;
1027	ctx->dops = &pidfs_dentry_operations;
1028	ctx->xattr = pidfs_xattr_handlers;
1029	fc->s_fs_info = (void *)&pidfs_stashed_ops;
1030	return `0`;
1031	}
1032
1033	static struct file_system_type pidfs_type = {
1034	.name = "pidfs",
1035	.init_fs_context = pidfs_init_fs_context,
1036	.kill_sb = kill_anon_super,
1037	};
1038
1039	struct file pidfs_alloc_file(struct* pid pid, unsigned* int flags)
1040	{
1041	struct file *pidfd_file;
1042	struct path path __free(path_put) = {};
1043	int ret;
1044
1045	/*
1046	* Ensure that PIDFD_STALE can be passed as a flag without
1047	* overloading other uapi pidfd flags.
1048	*/
1049	BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD);
1050	BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK);
1051
1052	ret = path_from_stashed(stashed: &pid->stashed, mnt: pidfs_mnt, data: get_pid(pid), path: &path);
1053	if (ret < `0`)
1054	return ERR_PTR(error: ret);
1055
1056	VFS_WARN_ON_ONCE(!pid->attr);
1057
1058	flags &= ~PIDFD_STALE;
1059	flags \|= O_RDWR;
1060	pidfd_file = dentry_open(path: &path, flags, current_cred());
1061	/ Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. /
1062	if (!IS_ERR(ptr: pidfd_file))
1063	pidfd_file->f_flags \|= (flags & PIDFD_THREAD);
1064
1065	return pidfd_file;
1066	}
1067
1068	void __init pidfs_init(void)
1069	{
1070	pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), `0`,
1071	(SLAB_HWCACHE_ALIGN \| SLAB_RECLAIM_ACCOUNT \|
1072	SLAB_ACCOUNT \| SLAB_PANIC), NULL);
1073
1074	pidfs_xattr_cachep = kmem_cache_create("pidfs_xattr_cache",
1075	sizeof(struct simple_xattrs), `0`,
1076	(SLAB_HWCACHE_ALIGN \| SLAB_RECLAIM_ACCOUNT \|
1077	SLAB_ACCOUNT \| SLAB_PANIC), NULL);
1078
1079	pidfs_mnt = kern_mount(&pidfs_type);
1080	if (IS_ERR(ptr: pidfs_mnt))
1081	panic(fmt: "Failed to mount pidfs pseudo filesystem");
1082
1083	pidfs_root_path.mnt = pidfs_mnt;
1084	pidfs_root_path.dentry = pidfs_mnt->mnt_root;
1085	}
1086

Browse the source code of Linux/fs/pidfs.c