1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * fs/eventpoll.c (Efficient event retrieval implementation)
4 * Copyright (C) 2001,...,2009 Davide Libenzi
5 *
6 * Davide Libenzi <davidel@xmailserver.org>
7 */
8
9#include <linux/init.h>
10#include <linux/kernel.h>
11#include <linux/sched/signal.h>
12#include <linux/fs.h>
13#include <linux/file.h>
14#include <linux/signal.h>
15#include <linux/errno.h>
16#include <linux/mm.h>
17#include <linux/slab.h>
18#include <linux/poll.h>
19#include <linux/string.h>
20#include <linux/list.h>
21#include <linux/hash.h>
22#include <linux/spinlock.h>
23#include <linux/syscalls.h>
24#include <linux/rbtree.h>
25#include <linux/wait.h>
26#include <linux/eventpoll.h>
27#include <linux/mount.h>
28#include <linux/bitops.h>
29#include <linux/mutex.h>
30#include <linux/anon_inodes.h>
31#include <linux/device.h>
32#include <linux/uaccess.h>
33#include <asm/io.h>
34#include <asm/mman.h>
35#include <linux/atomic.h>
36#include <linux/proc_fs.h>
37#include <linux/seq_file.h>
38#include <linux/compat.h>
39#include <linux/rculist.h>
40#include <linux/capability.h>
41#include <net/busy_poll.h>
42
43/*
44 * LOCKING:
45 * There are three levels of locking required by epoll:
46 *
47 * 1) epnested_mutex (mutex)
48 * 2) ep->mtx (mutex)
49 * 3) ep->lock (spinlock)
50 *
51 * The acquire order is the one listed above, from 1 to 3.
52 * We need a spinlock (ep->lock) because we manipulate objects
53 * from inside the poll callback, that might be triggered from
54 * a wake_up() that in turn might be called from IRQ context.
55 * So we can't sleep inside the poll callback and hence we need
56 * a spinlock. During the event transfer loop (from kernel to
57 * user space) we could end up sleeping due to a copy_to_user(), so
58 * we need a lock that will allow us to sleep. This lock is a
59 * mutex (ep->mtx). It is acquired during the event transfer loop,
60 * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
61 * The epnested_mutex is acquired when inserting an epoll fd onto another
62 * epoll fd. We do this so that we walk the epoll tree and ensure that this
63 * insertion does not create a cycle of epoll file descriptors, which
64 * could lead to deadlock. We need a global mutex to prevent two
65 * simultaneous inserts (A into B and B into A) from racing and
66 * constructing a cycle that neither insert, on its own, would
67 * detect before completing.
68 * It is necessary to acquire multiple "ep->mtx"es at once in the
69 * case when one epoll fd is added to another. In this case, we
70 * always acquire the locks in the order of nesting (i.e. after
71 * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
72 * before e2->mtx). Since we disallow cycles of epoll file
73 * descriptors, this ensures that the mutexes are well-ordered. In
74 * order to communicate this nesting to lockdep, when walking a tree
75 * of epoll file descriptors, we use the current recursion depth as
76 * the lockdep subkey.
77 * It is possible to drop the "ep->mtx" and to use the global
78 * mutex "epnested_mutex" (together with "ep->lock") to have it working,
79 * but having "ep->mtx" will make the interface more scalable.
80 * Events that require holding "epnested_mutex" are very rare, while for
81 * normal operations the epoll private "ep->mtx" will guarantee
82 * a better scalability.
83 */
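/*
 * As a rough userspace illustration of the nesting described above
 * (hypothetical fds): adding one epoll fd to another is what forces the
 * ordered acquisition of two "ep->mtx" mutexes, with the epnested_mutex
 * cycle check on top.
 *
 * int e1 = epoll_create1(0);
 * int e2 = epoll_create1(0);
 * struct epoll_event ev = { .events = EPOLLIN, .data.fd = e2 };
 * epoll_ctl(e1, EPOLL_CTL_ADD, e2, &ev);
 *
 * During that last call e1->mtx is taken before e2->mtx, matching the
 * nesting order explained above.
 */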
84
85/* Epoll private bits inside the event mask */
86#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
87
88#define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)
89
90#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
91 EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
92
93/* Maximum number of nesting allowed inside epoll sets */
94#define EP_MAX_NESTS 4
95
96#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
97
98#define EP_UNACTIVE_PTR ((void *) -1L)
99
100#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
101
102struct epoll_filefd {
103 struct file *file;
104 int fd;
105} __packed;
106
107/* Wait structure used by the poll hooks */
108struct eppoll_entry {
109 /* List header used to link this structure to the "struct epitem" */
110 struct eppoll_entry *next;
111
112 /* The "base" pointer is set to the container "struct epitem" */
113 struct epitem *base;
114
115 /*
116 * Wait queue item that will be linked to the target file wait
117 * queue head.
118 */
119 wait_queue_entry_t wait;
120
121 /* The wait queue head that linked the "wait" wait queue item */
122 wait_queue_head_t *whead;
123};
124
125/*
126 * Each file descriptor added to the eventpoll interface will
127 * have an entry of this type linked to the "rbr" RB tree.
128 * Avoid increasing the size of this struct, there can be many thousands
129 * of these on a server and we do not want this to take another cache line.
130 */
131struct epitem {
132 union {
133 /* RB tree node links this structure to the eventpoll RB tree */
134 struct rb_node rbn;
135 /* Used to free the struct epitem */
136 struct rcu_head rcu;
137 };
138
139 /* List header used to link this structure to the eventpoll ready list */
140 struct list_head rdllink;
141
142 /*
143 * Works together with "struct eventpoll"->ovflist in keeping the
144 * singly linked chain of items.
145 */
146 struct epitem *next;
147
148 /* The file descriptor information this item refers to */
149 struct epoll_filefd ffd;
150
151 /*
152 * Protected by file->f_lock, true for to-be-released epitem already
153 * removed from the "struct file" items list; together with
154 * eventpoll->refcount orchestrates "struct eventpoll" disposal
155 */
156 bool dying;
157
158 /* List containing poll wait queues */
159 struct eppoll_entry *pwqlist;
160
161 /* The "container" of this item */
162 struct eventpoll *ep;
163
164 /* List header used to link this item to the "struct file" items list */
165 struct hlist_node fllink;
166
167 /* wakeup_source used when EPOLLWAKEUP is set */
168 struct wakeup_source __rcu *ws;
169
170 /* The structure that describes the interested events and the source fd */
171 struct epoll_event event;
172};
173
174/*
175 * This structure is stored inside the "private_data" member of the file
176 * structure and represents the main data structure for the eventpoll
177 * interface.
178 */
179struct eventpoll {
180 /*
181 * This mutex is used to ensure that files are not removed
182 * while epoll is using them. This is held during the event
183 * collection loop, the file cleanup path, the epoll file exit
184 * code and the ctl operations.
185 */
186 struct mutex mtx;
187
188 /* Wait queue used by sys_epoll_wait() */
189 wait_queue_head_t wq;
190
191 /* Wait queue used by file->poll() */
192 wait_queue_head_t poll_wait;
193
194 /* List of ready file descriptors */
195 struct list_head rdllist;
196
197 /* Lock which protects rdllist and ovflist */
198 spinlock_t lock;
199
200 /* RB tree root used to store monitored fd structs */
201 struct rb_root_cached rbr;
202
203 /*
204 * This is a singly linked list that chains all the "struct epitem" that
205 * had events signalled while ready events were being transferred to
206 * userspace without holding ->lock.
207 */
208 struct epitem *ovflist;
209
210 /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */
211 struct wakeup_source *ws;
212
213 /* The user that created the eventpoll descriptor */
214 struct user_struct *user;
215
216 struct file *file;
217
218 /* used to optimize loop detection check */
219 u64 gen;
220 struct hlist_head refs;
221 u8 loop_check_depth;
222
223 /*
224 * usage count, used together with epitem->dying to
225 * orchestrate the disposal of this struct
226 */
227 refcount_t refcount;
228
229#ifdef CONFIG_NET_RX_BUSY_POLL
230 /* used to track busy poll napi_id */
231 unsigned int napi_id;
232 /* busy poll timeout */
233 u32 busy_poll_usecs;
234 /* busy poll packet budget */
235 u16 busy_poll_budget;
236 bool prefer_busy_poll;
237#endif
238
239#ifdef CONFIG_DEBUG_LOCK_ALLOC
240 /* tracks wakeup nests for lockdep validation */
241 u8 nests;
242#endif
243};
244
245/* Wrapper struct used by poll queueing */
246struct ep_pqueue {
247 poll_table pt;
248 struct epitem *epi;
249};
250
251/*
252 * Configuration options available inside /proc/sys/fs/epoll/
253 */
254/* Maximum number of epoll watched descriptors, per user */
255static long max_user_watches __read_mostly;
256
257/* Used for cycles detection */
258static DEFINE_MUTEX(epnested_mutex);
259
260static u64 loop_check_gen = 0;
261
262/* Used to check for epoll file descriptor inclusion loops */
263static struct eventpoll *inserting_into;
264
265/* Slab cache used to allocate "struct epitem" */
266static struct kmem_cache *epi_cache __ro_after_init;
267
268/* Slab cache used to allocate "struct eppoll_entry" */
269static struct kmem_cache *pwq_cache __ro_after_init;
270
271/*
272 * List of files with newly added links, where we may need to limit the number
273 * of emanating paths. Protected by the epnested_mutex.
274 */
275struct epitems_head {
276 struct hlist_head epitems;
277 struct epitems_head *next;
278};
279static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;
280
281static struct kmem_cache *ephead_cache __ro_after_init;
282
283static inline void free_ephead(struct epitems_head *head)
284{
285 if (head)
286 kmem_cache_free(ephead_cache, head);
287}
288
289static void list_file(struct file *file)
290{
291 struct epitems_head *head;
292
293 head = container_of(file->f_ep, struct epitems_head, epitems);
294 if (!head->next) {
295 head->next = tfile_check_list;
296 tfile_check_list = head;
297 }
298}
299
300static void unlist_file(struct epitems_head *head)
301{
302 struct epitems_head *to_free = head;
303 struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems));
304 if (p) {
305 struct epitem *epi = container_of(p, struct epitem, fllink);
306 spin_lock(&epi->ffd.file->f_lock);
307 if (!hlist_empty(&head->epitems))
308 to_free = NULL;
309 head->next = NULL;
310 spin_unlock(&epi->ffd.file->f_lock);
311 }
312 free_ephead(to_free);
313}
314
315#ifdef CONFIG_SYSCTL
316
317#include <linux/sysctl.h>
318
319static long long_zero;
320static long long_max = LONG_MAX;
321
322static const struct ctl_table epoll_table[] = {
323 {
324 .procname = "max_user_watches",
325 .data = &max_user_watches,
326 .maxlen = sizeof(max_user_watches),
327 .mode = 0644,
328 .proc_handler = proc_doulongvec_minmax,
329 .extra1 = &long_zero,
330 .extra2 = &long_max,
331 },
332};
333
334static void __init epoll_sysctls_init(void)
335{
336 register_sysctl("fs/epoll", epoll_table);
337}
338#else
339#define epoll_sysctls_init() do { } while (0)
340#endif /* CONFIG_SYSCTL */
341
342static const struct file_operations eventpoll_fops;
343
344static inline int is_file_epoll(struct file *f)
345{
346 return f->f_op == &eventpoll_fops;
347}
348
349/* Setup the structure that is used as key for the RB tree */
350static inline void ep_set_ffd(struct epoll_filefd *ffd,
351 struct file *file, int fd)
352{
353 ffd->file = file;
354 ffd->fd = fd;
355}
356
357/* Compare RB tree keys */
358static inline int ep_cmp_ffd(struct epoll_filefd *p1,
359 struct epoll_filefd *p2)
360{
361 return (p1->file > p2->file ? +1:
362 (p1->file < p2->file ? -1 : p1->fd - p2->fd));
363}
364
365/* Tells us if the item is currently linked */
366static inline int ep_is_linked(struct epitem *epi)
367{
368 return !list_empty(&epi->rdllink);
369}
370
371static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
372{
373 return container_of(p, struct eppoll_entry, wait);
374}
375
376/* Get the "struct epitem" from a wait queue pointer */
377static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
378{
379 return container_of(p, struct eppoll_entry, wait)->base;
380}
381
382/**
383 * ep_events_available - Checks if ready events might be available.
384 *
385 * @ep: Pointer to the eventpoll context.
386 *
387 * Return: a value different than %zero if ready events are available,
388 * or %zero otherwise.
389 */
390static inline int ep_events_available(struct eventpoll *ep)
391{
392 return !list_empty_careful(&ep->rdllist) ||
393 READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
394}
395
396#ifdef CONFIG_NET_RX_BUSY_POLL
397/**
398 * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value
399 * from the epoll instance ep is preferred, but if it is not set, fall back to
400 * the system-wide global via busy_loop_timeout.
401 *
402 * @start_time: The start time used to compute the remaining time until timeout.
403 * @ep: Pointer to the eventpoll context.
404 *
405 * Return: true if the timeout has expired, false otherwise.
406 */
407static bool busy_loop_ep_timeout(unsigned long start_time,
408 struct eventpoll *ep)
409{
410 unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs);
411
412 if (bp_usec) {
413 unsigned long end_time = start_time + bp_usec;
414 unsigned long now = busy_loop_current_time();
415
416 return time_after(now, end_time);
417 } else {
418 return busy_loop_timeout(start_time);
419 }
420}
421
422static bool ep_busy_loop_on(struct eventpoll *ep)
423{
424 return !!READ_ONCE(ep->busy_poll_usecs) ||
425 READ_ONCE(ep->prefer_busy_poll) ||
426 net_busy_loop_on();
427}
428
429static bool ep_busy_loop_end(void *p, unsigned long start_time)
430{
431 struct eventpoll *ep = p;
432
433 return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep);
434}
435
436/*
437 * Busy poll if globally on, supporting sockets are found, and there are no
438 * events; the busy loop returns if need_resched() or ep_events_available().
439 *
440 * We must do our busy polling with irqs enabled.
441 */
442static bool ep_busy_loop(struct eventpoll *ep)
443{
444 unsigned int napi_id = READ_ONCE(ep->napi_id);
445 u16 budget = READ_ONCE(ep->busy_poll_budget);
446 bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
447
448 if (!budget)
449 budget = BUSY_POLL_BUDGET;
450
451 if (napi_id_valid(napi_id) && ep_busy_loop_on(ep)) {
452 napi_busy_loop(napi_id, ep_busy_loop_end,
453 ep, prefer_busy_poll, budget);
454 if (ep_events_available(ep))
455 return true;
456 /*
457 * Busy poll timed out. Drop NAPI ID for now, we can add
458 * it back in when we have moved a socket with a valid NAPI
459 * ID onto the ready list.
460 */
461 if (prefer_busy_poll)
462 napi_resume_irqs(napi_id);
463 ep->napi_id = 0;
464 return false;
465 }
466 return false;
467}
468
469/*
470 * Set epoll busy poll NAPI ID from sk.
471 */
472static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
473{
474 struct eventpoll *ep = epi->ep;
475 unsigned int napi_id;
476 struct socket *sock;
477 struct sock *sk;
478
479 if (!ep_busy_loop_on(ep))
480 return;
481
482 sock = sock_from_file(epi->ffd.file);
483 if (!sock)
484 return;
485
486 sk = sock->sk;
487 if (!sk)
488 return;
489
490 napi_id = READ_ONCE(sk->sk_napi_id);
491
492 /*
493 * Reject non-NAPI IDs, and do nothing if we already
494 * have this ID recorded.
495 */
496 if (!napi_id_valid(napi_id) || napi_id == ep->napi_id)
497 return;
498
499 /* record NAPI ID for use in next busy poll */
500 ep->napi_id = napi_id;
501}
502
503static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
504 unsigned long arg)
505{
506 struct eventpoll *ep = file->private_data;
507 void __user *uarg = (void __user *)arg;
508 struct epoll_params epoll_params;
509
510 switch (cmd) {
511 case EPIOCSPARAMS:
512 if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params)))
513 return -EFAULT;
514
515 /* pad byte must be zero */
516 if (epoll_params.__pad)
517 return -EINVAL;
518
519 if (epoll_params.busy_poll_usecs > S32_MAX)
520 return -EINVAL;
521
522 if (epoll_params.prefer_busy_poll > 1)
523 return -EINVAL;
524
525 if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT &&
526 !capable(CAP_NET_ADMIN))
527 return -EPERM;
528
529 WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs);
530 WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget);
531 WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll);
532 return 0;
533 case EPIOCGPARAMS:
534 memset(&epoll_params, 0, sizeof(epoll_params));
535 epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs);
536 epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget);
537 epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
538 if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params)))
539 return -EFAULT;
540 return 0;
541 default:
542 return -ENOIOCTLCMD;
543 }
544}
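/*
 * A minimal userspace sketch of driving the ioctls above (hypothetical
 * values; EPIOCSPARAMS, EPIOCGPARAMS and struct epoll_params come from
 * the uapi <linux/eventpoll.h> header):
 *
 * struct epoll_params p = {
 * .busy_poll_usecs = 64,
 * .busy_poll_budget = 16,
 * .prefer_busy_poll = 1,
 * };
 * int epfd = epoll_create1(0);
 * if (ioctl(epfd, EPIOCSPARAMS, &p) < 0)
 * perror("EPIOCSPARAMS");
 *
 * A budget above NAPI_POLL_WEIGHT would additionally require
 * CAP_NET_ADMIN, as checked above.
 */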
545
546static void ep_suspend_napi_irqs(struct eventpoll *ep)
547{
548 unsigned int napi_id = READ_ONCE(ep->napi_id);
549
550 if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll))
551 napi_suspend_irqs(napi_id);
552}
553
554static void ep_resume_napi_irqs(struct eventpoll *ep)
555{
556 unsigned int napi_id = READ_ONCE(ep->napi_id);
557
558 if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll))
559 napi_resume_irqs(napi_id);
560}
561
562#else
563
564static inline bool ep_busy_loop(struct eventpoll *ep)
565{
566 return false;
567}
568
569static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
570{
571}
572
573static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
574 unsigned long arg)
575{
576 return -EOPNOTSUPP;
577}
578
579static void ep_suspend_napi_irqs(struct eventpoll *ep)
580{
581}
582
583static void ep_resume_napi_irqs(struct eventpoll *ep)
584{
585}
586
587#endif /* CONFIG_NET_RX_BUSY_POLL */
588
589/*
590 * As described in commit 0ccf831cb lockdep: annotate epoll
591 * the use of wait queues used by epoll is done in a very controlled
592 * manner. Wake ups can nest inside each other, but are never done
593 * with the same locking. For example:
594 *
595 * dfd = socket(...);
596 * efd1 = epoll_create();
597 * efd2 = epoll_create();
598 * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
599 * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
600 *
601 * When a packet arrives to the device underneath "dfd", the net code will
602 * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
603 * callback wakeup entry on that queue, and the wake_up() performed by the
604 * "dfd" net code will end up in ep_poll_callback(). At this point epoll
605 * (efd1) notices that it may have some event ready, so it needs to wake up
606 * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
607 * that ends up in another wake_up(), after having checked the
608 * recursion constraints. That is, no more than EP_MAX_NESTS, to avoid
609 * stack blasting.
610 *
611 * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
612 * this special case of epoll.
613 */
614#ifdef CONFIG_DEBUG_LOCK_ALLOC
615
616static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
617 unsigned pollflags)
618{
619 struct eventpoll *ep_src;
620 unsigned long flags;
621 u8 nests = 0;
622
623 /*
624 * To set the subclass or nesting level for spin_lock_irqsave_nested()
625 * it might be natural to create a per-cpu nest count. However, since
626 * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
627 * schedule() in the -rt kernel, the per-cpu variables are no longer
628 * protected. Thus, we are introducing a per-eventpoll nest field.
629 * If we are not being called from ep_poll_callback(), epi is NULL and
630 * we are at the first level of nesting, 0. Otherwise, we are being
631 * called from ep_poll_callback() and if a previous wakeup source is
632 * not an epoll file itself, we are at depth 1 since the wakeup source
633 * is depth 0. If the wakeup source is a previous epoll file in the
634 * wakeup chain then we use its nests value and record ours as
635 * nests + 1. The previous epoll file nests value is stable since its
636 * already holding its own poll_wait.lock.
637 */
638 if (epi) {
639 if ((is_file_epoll(epi->ffd.file))) {
640 ep_src = epi->ffd.file->private_data;
641 nests = ep_src->nests;
642 } else {
643 nests = 1;
644 }
645 }
646 spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
647 ep->nests = nests + 1;
648 wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags);
649 ep->nests = 0;
650 spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
651}
652
653#else
654
655static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
656 __poll_t pollflags)
657{
658 wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags);
659}
660
661#endif
662
663static void ep_remove_wait_queue(struct eppoll_entry *pwq)
664{
665 wait_queue_head_t *whead;
666
667 rcu_read_lock();
668 /*
669 * If it is cleared by POLLFREE, it should be rcu-safe.
670 * If we read NULL we need a barrier paired with
671 * smp_store_release() in ep_poll_callback(), otherwise
672 * we rely on whead->lock.
673 */
674 whead = smp_load_acquire(&pwq->whead);
675 if (whead)
676 remove_wait_queue(whead, &pwq->wait);
677 rcu_read_unlock();
678}
679
680/*
681 * This function unregisters poll callbacks from the associated file
682 * descriptor. Must be called with "mtx" held.
683 */
684static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
685{
686 struct eppoll_entry **p = &epi->pwqlist;
687 struct eppoll_entry *pwq;
688
689 while ((pwq = *p) != NULL) {
690 *p = pwq->next;
691 ep_remove_wait_queue(pwq);
692 kmem_cache_free(pwq_cache, pwq);
693 }
694}
695
696/* call only when ep->mtx is held */
697static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
698{
699 return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
700}
701
702/* call only when ep->mtx is held */
703static inline void ep_pm_stay_awake(struct epitem *epi)
704{
705 struct wakeup_source *ws = ep_wakeup_source(epi);
706
707 if (ws)
708 __pm_stay_awake(ws);
709}
710
711static inline bool ep_has_wakeup_source(struct epitem *epi)
712{
713 return rcu_access_pointer(epi->ws) ? true : false;
714}
715
716/* call when ep->mtx cannot be held (ep_poll_callback) */
717static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
718{
719 struct wakeup_source *ws;
720
721 rcu_read_lock();
722 ws = rcu_dereference(epi->ws);
723 if (ws)
724 __pm_stay_awake(ws);
725 rcu_read_unlock();
726}
727
728
729/*
730 * ep->mutex needs to be held because we could be hit by
731 * eventpoll_release_file() and epoll_ctl().
732 */
733static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
734{
735 /*
736 * Steal the ready list, and re-init the original one to the
737 * empty list. Also, set ep->ovflist to NULL so that events
738 * happening while looping w/out locks, are not lost. We cannot
739 * have the poll callback queue directly on ep->rdllist,
740 * because we want the "sproc" callback to be able to do it
741 * in a lockless way.
742 */
743 lockdep_assert_irqs_enabled();
744 spin_lock_irq(&ep->lock);
745 list_splice_init(&ep->rdllist, txlist);
746 WRITE_ONCE(ep->ovflist, NULL);
747 spin_unlock_irq(&ep->lock);
748}
749
750static void ep_done_scan(struct eventpoll *ep,
751 struct list_head *txlist)
752{
753 struct epitem *epi, *nepi;
754
755 spin_lock_irq(&ep->lock);
756 /*
757 * During the time we spent inside the "sproc" callback, some
758 * other events might have been queued by the poll callback.
759 * We re-insert them inside the main ready-list here.
760 */
761 for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
762 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
763 /*
764 * We need to check if the item is already in the list.
765 * During the "sproc" callback execution time, items are
766 * queued into ->ovflist but the "txlist" might already
767 * contain them, and the list_splice() below takes care of them.
768 */
769 if (!ep_is_linked(epi)) {
770 /*
771 * ->ovflist is LIFO, so we have to reverse it in order
772 * to keep it in FIFO order.
773 */
774 list_add(&epi->rdllink, &ep->rdllist);
775 ep_pm_stay_awake(epi);
776 }
777 }
778 /*
779 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
780 * releasing the lock, events will be queued in the normal way inside
781 * ep->rdllist.
782 */
783 WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
784
785 /*
786 * Quickly re-inject items left on "txlist".
787 */
788 list_splice(txlist, &ep->rdllist);
789 __pm_relax(ep->ws);
790
791 if (!list_empty(&ep->rdllist)) {
792 if (waitqueue_active(&ep->wq))
793 wake_up(&ep->wq);
794 }
795
796 spin_unlock_irq(&ep->lock);
797}
798
799static void ep_get(struct eventpoll *ep)
800{
801 refcount_inc(&ep->refcount);
802}
803
804/*
805 * Returns true if the event poll can be disposed
806 */
807static bool ep_refcount_dec_and_test(struct eventpoll *ep)
808{
809 if (!refcount_dec_and_test(&ep->refcount))
810 return false;
811
812 WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root));
813 return true;
814}
815
816static void ep_free(struct eventpoll *ep)
817{
818 ep_resume_napi_irqs(ep);
819 mutex_destroy(&ep->mtx);
820 free_uid(ep->user);
821 wakeup_source_unregister(ep->ws);
822 kfree(ep);
823}
824
825/*
826 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
827 * all the associated resources. Must be called with "mtx" held.
828 * If the dying flag is set, do the removal only if force is true.
829 * This prevents ep_clear_and_put() from dropping all the ep references
830 * while running concurrently with eventpoll_release_file().
831 * Returns true if the eventpoll can be disposed.
832 */
833static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
834{
835 struct file *file = epi->ffd.file;
836 struct epitems_head *to_free;
837 struct hlist_head *head;
838
839 lockdep_assert_irqs_enabled();
840
841 /*
842 * Removes poll wait queue hooks.
843 */
844 ep_unregister_pollwait(ep, epi);
845
846 /* Remove the current item from the list of epoll hooks */
847 spin_lock(&file->f_lock);
848 if (epi->dying && !force) {
849 spin_unlock(&file->f_lock);
850 return false;
851 }
852
853 to_free = NULL;
854 head = file->f_ep;
855 if (head->first == &epi->fllink && !epi->fllink.next) {
856 /* See eventpoll_release() for details. */
857 WRITE_ONCE(file->f_ep, NULL);
858 if (!is_file_epoll(file)) {
859 struct epitems_head *v;
860 v = container_of(head, struct epitems_head, epitems);
861 if (!smp_load_acquire(&v->next))
862 to_free = v;
863 }
864 }
865 hlist_del_rcu(&epi->fllink);
866 spin_unlock(&file->f_lock);
867 free_ephead(to_free);
868
869 rb_erase_cached(&epi->rbn, &ep->rbr);
870
871 spin_lock_irq(&ep->lock);
872 if (ep_is_linked(epi))
873 list_del_init(&epi->rdllink);
874 spin_unlock_irq(&ep->lock);
875
876 wakeup_source_unregister(ep_wakeup_source(epi));
877 /*
878 * At this point it is safe to free the eventpoll item. Use the union
879 * field epi->rcu, since we are trying to minimize the size of
880 * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
881 * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
882 * use of the rbn field.
883 */
884 kfree_rcu(epi, rcu);
885
886 percpu_counter_dec(&ep->user->epoll_watches);
887 return true;
888}
889
890/*
891 * ep_remove variant for callers owning an additional reference to the ep
892 */
893static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
894{
895 if (__ep_remove(ep, epi, false))
896 WARN_ON_ONCE(ep_refcount_dec_and_test(ep));
897}
898
899static void ep_clear_and_put(struct eventpoll *ep)
900{
901 struct rb_node *rbp, *next;
902 struct epitem *epi;
903
904 /* We need to release all tasks waiting for this file */
905 if (waitqueue_active(&ep->poll_wait))
906 ep_poll_safewake(ep, NULL, 0);
907
908 mutex_lock(&ep->mtx);
909
910 /*
911 * Walks through the whole tree by unregistering poll callbacks.
912 */
913 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
914 epi = rb_entry(rbp, struct epitem, rbn);
915
916 ep_unregister_pollwait(ep, epi);
917 cond_resched();
918 }
919
920 /*
921 * Walks through the whole tree and try to free each "struct epitem".
922 * Note that ep_remove_safe() will not remove the epitem in case of a
923 * racing eventpoll_release_file(); the latter will do the removal.
924 * At this point we are sure no poll callbacks will be lingering around.
925 * Since we still own a reference to the eventpoll struct, the loop can't
926 * dispose it.
927 */
928 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) {
929 next = rb_next(rbp);
930 epi = rb_entry(rbp, struct epitem, rbn);
931 ep_remove_safe(ep, epi);
932 cond_resched();
933 }
934
935 mutex_unlock(&ep->mtx);
936 if (ep_refcount_dec_and_test(ep))
937 ep_free(ep);
938}
939
940static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd,
941 unsigned long arg)
942{
943 int ret;
944
945 if (!is_file_epoll(file))
946 return -EINVAL;
947
948 switch (cmd) {
949 case EPIOCSPARAMS:
950 case EPIOCGPARAMS:
951 ret = ep_eventpoll_bp_ioctl(file, cmd, arg);
952 break;
953 default:
954 ret = -EINVAL;
955 break;
956 }
957
958 return ret;
959}
960
961static int ep_eventpoll_release(struct inode *inode, struct file *file)
962{
963 struct eventpoll *ep = file->private_data;
964
965 if (ep)
966 ep_clear_and_put(ep);
967
968 return 0;
969}
970
971static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth);
972
973static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth)
974{
975 struct eventpoll *ep = file->private_data;
976 LIST_HEAD(txlist);
977 struct epitem *epi, *tmp;
978 poll_table pt;
979 __poll_t res = 0;
980
981 init_poll_funcptr(&pt, NULL);
982
983 /* Insert inside our poll wait queue */
984 poll_wait(file, &ep->poll_wait, wait);
985
986 /*
987 * Proceed to find out if wanted events are really available inside
988 * the ready list.
989 */
990 mutex_lock_nested(&ep->mtx, depth);
991 ep_start_scan(ep, &txlist);
992 list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
993 if (ep_item_poll(epi, &pt, depth + 1)) {
994 res = EPOLLIN | EPOLLRDNORM;
995 break;
996 } else {
997 /*
998 * Item has been dropped into the ready list by the poll
999 * callback, but it's not actually ready, as far as the
1000 * caller-requested events go. We can remove it here.
1001 */
1002 __pm_relax(ep_wakeup_source(epi));
1003 list_del_init(&epi->rdllink);
1004 }
1005 }
1006 ep_done_scan(ep, &txlist);
1007 mutex_unlock(&ep->mtx);
1008 return res;
1009}
1010
1011/*
1012 * The ffd.file pointer may be in the process of being torn down due to
1013 * being closed, but we may not have finished eventpoll_release() yet.
1014 *
1015 * Normally, even with the atomic_long_inc_not_zero, the file may have
1016 * been free'd and then gotten re-allocated to something else (since
1017 * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU).
1018 *
1019 * But for epoll, users hold the ep->mtx mutex, and as such any file in
1020 * the process of being free'd will block in eventpoll_release_file()
1021 * and thus the underlying file allocation will not be free'd, and the
1022 * file re-use cannot happen.
1023 *
1024 * For the same reason we can avoid a rcu_read_lock() around the
1025 * operation - 'ffd.file' cannot go away even if the refcount has
1026 * reached zero (but we must still not call out to ->poll() functions
1027 * etc).
1028 */
1029static struct file *epi_fget(const struct epitem *epi)
1030{
1031 struct file *file;
1032
1033 file = epi->ffd.file;
1034 if (!file_ref_get(&file->f_ref))
1035 file = NULL;
1036 return file;
1037}
1038
1039/*
1040 * Differs from ep_eventpoll_poll() in that internal callers already have
1041 * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
1042 * is correctly annotated.
1043 */
1044static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
1045 int depth)
1046{
1047 struct file *file = epi_fget(epi);
1048 __poll_t res;
1049
1050 /*
1051 * We could return EPOLLERR | EPOLLHUP or something, but let's
1052 * treat this more as "file doesn't exist, poll didn't happen".
1053 */
1054 if (!file)
1055 return 0;
1056
1057 pt->_key = epi->event.events;
1058 if (!is_file_epoll(file))
1059 res = vfs_poll(file, pt);
1060 else
1061 res = __ep_eventpoll_poll(file, pt, depth);
1062 fput(file);
1063 return res & epi->event.events;
1064}
1065
1066static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
1067{
1068 return __ep_eventpoll_poll(file, wait, 0);
1069}
1070
1071#ifdef CONFIG_PROC_FS
1072static void ep_show_fdinfo(struct seq_file *m, struct file *f)
1073{
1074 struct eventpoll *ep = f->private_data;
1075 struct rb_node *rbp;
1076
1077 mutex_lock(&ep->mtx);
1078 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1079 struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
1080 struct inode *inode = file_inode(epi->ffd.file);
1081
1082 seq_printf(m, "tfd: %8d events: %8x data: %16llx "
1083 " pos:%lli ino:%lx sdev:%x\n",
1084 epi->ffd.fd, epi->event.events,
1085 (long long)epi->event.data,
1086 (long long)epi->ffd.file->f_pos,
1087 inode->i_ino, inode->i_sb->s_dev);
1088 if (seq_has_overflowed(m))
1089 break;
1090 }
1091 mutex_unlock(&ep->mtx);
1092}
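/*
 * An illustrative /proc/<pid>/fdinfo/<epfd> line produced by the format
 * above (hypothetical values, field padding trimmed):
 *
 * tfd: 9 events: 19 data: 200000009  pos:0 ino:b1e sdev:6
 */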
1093#endif
1094
1095/* File callbacks that implement the eventpoll file behaviour */
1096static const struct file_operations eventpoll_fops = {
1097#ifdef CONFIG_PROC_FS
1098 .show_fdinfo = ep_show_fdinfo,
1099#endif
1100 .release = ep_eventpoll_release,
1101 .poll = ep_eventpoll_poll,
1102 .llseek = noop_llseek,
1103 .unlocked_ioctl = ep_eventpoll_ioctl,
1104 .compat_ioctl = compat_ptr_ioctl,
1105};
1106
1107/*
1108 * This is called from eventpoll_release() to unlink files from the eventpoll
1109 * interface. We need to have this facility to cleanup correctly files that are
1110 * closed without being removed from the eventpoll interface.
1111 */
1112void eventpoll_release_file(struct file *file)
1113{
1114 struct eventpoll *ep;
1115 struct epitem *epi;
1116 bool dispose;
1117
1118 /*
1119 * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from
1120 * touching the epitems list before eventpoll_release_file() can access
1121 * the ep->mtx.
1122 */
1123again:
1124 spin_lock(&file->f_lock);
1125 if (file->f_ep && file->f_ep->first) {
1126 epi = hlist_entry(file->f_ep->first, struct epitem, fllink);
1127 epi->dying = true;
1128 spin_unlock(&file->f_lock);
1129
1130 /*
1131 * ep access is safe as we still own a reference to the ep
1132 * struct
1133 */
1134 ep = epi->ep;
1135 mutex_lock(&ep->mtx);
1136 dispose = __ep_remove(ep, epi, true);
1137 mutex_unlock(&ep->mtx);
1138
1139 if (dispose && ep_refcount_dec_and_test(ep))
1140 ep_free(ep);
1141 goto again;
1142 }
1143 spin_unlock(&file->f_lock);
1144}
1145
1146static int ep_alloc(struct eventpoll **pep)
1147{
1148 struct eventpoll *ep;
1149
1150 ep = kzalloc(sizeof(*ep), GFP_KERNEL);
1151 if (unlikely(!ep))
1152 return -ENOMEM;
1153
1154 mutex_init(&ep->mtx);
1155 spin_lock_init(&ep->lock);
1156 init_waitqueue_head(&ep->wq);
1157 init_waitqueue_head(&ep->poll_wait);
1158 INIT_LIST_HEAD(&ep->rdllist);
1159 ep->rbr = RB_ROOT_CACHED;
1160 ep->ovflist = EP_UNACTIVE_PTR;
1161 ep->user = get_current_user();
1162 refcount_set(&ep->refcount, 1);
1163
1164 *pep = ep;
1165
1166 return 0;
1167}
1168
1169/*
1170 * Search the file inside the eventpoll tree. The RB tree operations
1171 * are protected by the "mtx" mutex, and ep_find() must be called with
1172 * "mtx" held.
1173 */
1174static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
1175{
1176 int kcmp;
1177 struct rb_node *rbp;
1178 struct epitem *epi, *epir = NULL;
1179 struct epoll_filefd ffd;
1180
1181 ep_set_ffd(&ffd, file, fd);
1182 for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
1183 epi = rb_entry(rbp, struct epitem, rbn);
1184 kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
1185 if (kcmp > 0)
1186 rbp = rbp->rb_right;
1187 else if (kcmp < 0)
1188 rbp = rbp->rb_left;
1189 else {
1190 epir = epi;
1191 break;
1192 }
1193 }
1194
1195 return epir;
1196}
1197
1198#ifdef CONFIG_KCMP
1199static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
1200{
1201 struct rb_node *rbp;
1202 struct epitem *epi;
1203
1204 for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1205 epi = rb_entry(rbp, struct epitem, rbn);
1206 if (epi->ffd.fd == tfd) {
1207 if (toff == 0)
1208 return epi;
1209 else
1210 toff--;
1211 }
1212 cond_resched();
1213 }
1214
1215 return NULL;
1216}
1217
1218struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
1219 unsigned long toff)
1220{
1221 struct file *file_raw;
1222 struct eventpoll *ep;
1223 struct epitem *epi;
1224
1225 if (!is_file_epoll(file))
1226 return ERR_PTR(-EINVAL);
1227
1228 ep = file->private_data;
1229
1230 mutex_lock(&ep->mtx);
1231 epi = ep_find_tfd(ep, tfd, toff);
1232 if (epi)
1233 file_raw = epi->ffd.file;
1234 else
1235 file_raw = ERR_PTR(-ENOENT);
1236 mutex_unlock(&ep->mtx);
1237
1238 return file_raw;
1239}
1240#endif /* CONFIG_KCMP */
1241
1242/*
1243 * This is the callback that is passed to the wait queue wakeup
1244 * mechanism. It is called by the stored file descriptors when they
1245 * have events to report.
1246 */
1247static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
1248{
1249 int pwake = 0;
1250 struct epitem *epi = ep_item_from_wait(wait);
1251 struct eventpoll *ep = epi->ep;
1252 __poll_t pollflags = key_to_poll(key);
1253 unsigned long flags;
1254 int ewake = 0;
1255
1256 spin_lock_irqsave(&ep->lock, flags);
1257
1258 ep_set_busy_poll_napi_id(epi);
1259
1260 /*
1261 * If the event mask does not contain any poll(2) event, we consider the
1262 * descriptor to be disabled. This condition is likely the effect of the
1263 * EPOLLONESHOT bit that disables the descriptor when an event is received,
1264 * until the next EPOLL_CTL_MOD will be issued.
1265 */
1266 if (!(epi->event.events & ~EP_PRIVATE_BITS))
1267 goto out_unlock;
1268
1269 /*
1270 * Check the events coming with the callback. At this stage, not
1271 * every device reports the events in the "key" parameter of the
1272 * callback. We need to be able to handle both cases here, hence the
1273 * test for "key" != NULL before the event match test.
1274 */
1275 if (pollflags && !(pollflags & epi->event.events))
1276 goto out_unlock;
1277
1278 /*
1279 * If we are transferring events to userspace, we can hold no locks
1280 * (because we're accessing user memory, and because of linux f_op->poll()
1281 * semantics). All the events that happen during that period of time are
1282 * chained in ep->ovflist and requeued later on.
1283 */
1284 if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
1285 if (epi->next == EP_UNACTIVE_PTR) {
1286 epi->next = READ_ONCE(ep->ovflist);
1287 WRITE_ONCE(ep->ovflist, epi);
1288 ep_pm_stay_awake_rcu(epi);
1289 }
1290 } else if (!ep_is_linked(epi)) {
1291 /* In the usual case, add event to ready list. */
1292 list_add_tail(&epi->rdllink, &ep->rdllist);
1293 ep_pm_stay_awake_rcu(epi);
1294 }
1295
1296 /*
1297 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
1298 * wait list.
1299 */
1300 if (waitqueue_active(&ep->wq)) {
1301 if ((epi->event.events & EPOLLEXCLUSIVE) &&
1302 !(pollflags & POLLFREE)) {
1303 switch (pollflags & EPOLLINOUT_BITS) {
1304 case EPOLLIN:
1305 if (epi->event.events & EPOLLIN)
1306 ewake = 1;
1307 break;
1308 case EPOLLOUT:
1309 if (epi->event.events & EPOLLOUT)
1310 ewake = 1;
1311 break;
1312 case 0:
1313 ewake = 1;
1314 break;
1315 }
1316 }
1317 if (sync)
1318 wake_up_sync(&ep->wq);
1319 else
1320 wake_up(&ep->wq);
1321 }
1322 if (waitqueue_active(&ep->poll_wait))
1323 pwake++;
1324
1325out_unlock:
1326 spin_unlock_irqrestore(&ep->lock, flags);
1327
1328 /* We have to call this outside the lock */
1329 if (pwake)
1330 ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);
1331
1332 if (!(epi->event.events & EPOLLEXCLUSIVE))
1333 ewake = 1;
1334
1335 if (pollflags & POLLFREE) {
1336 /*
1337 * If we race with ep_remove_wait_queue() it can miss
1338 * ->whead = NULL and do another remove_wait_queue() after
1339 * us, so we can't use __remove_wait_queue().
1340 */
1341 list_del_init(&wait->entry);
1342 /*
1343 * ->whead != NULL protects us from the race with
1344 * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue()
1345 * takes whead->lock held by the caller. Once we nullify it,
1346 * nothing protects ep/epi or even wait.
1347 */
1348 smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
1349 }
1350
1351 return ewake;
1352}
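/*
 * Userspace sketch of the EPOLLEXCLUSIVE handling above (hypothetical
 * fds): several threads each add the same listening socket to their own
 * epoll instance with EPOLLEXCLUSIVE, so one incoming connection wakes
 * only one of them instead of all:
 *
 * struct epoll_event ev = { .events = EPOLLIN | EPOLLEXCLUSIVE,
 * .data.fd = listen_fd };
 * epoll_ctl(worker_epfd, EPOLL_CTL_ADD, listen_fd, &ev);
 */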
1353
1354/*
1355 * This is the callback that is used to add our wait queue to the
1356 * target file wakeup lists.
1357 */
1358static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
1359 poll_table *pt)
1360{
1361 struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt);
1362 struct epitem *epi = epq->epi;
1363 struct eppoll_entry *pwq;
1364
1365 if (unlikely(!epi)) // an earlier allocation has failed
1366 return;
1367
1368 pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
1369 if (unlikely(!pwq)) {
1370 epq->epi = NULL;
1371 return;
1372 }
1373
1374 init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
1375 pwq->whead = whead;
1376 pwq->base = epi;
1377 if (epi->event.events & EPOLLEXCLUSIVE)
1378 add_wait_queue_exclusive(whead, &pwq->wait);
1379 else
1380 add_wait_queue(whead, &pwq->wait);
1381 pwq->next = epi->pwqlist;
1382 epi->pwqlist = pwq;
1383}
1384
1385static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
1386{
1387 int kcmp;
1388 struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
1389 struct epitem *epic;
1390 bool leftmost = true;
1391
1392 while (*p) {
1393 parent = *p;
1394 epic = rb_entry(parent, struct epitem, rbn);
1395 kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
1396 if (kcmp > 0) {
1397 p = &parent->rb_right;
1398 leftmost = false;
1399 } else
1400 p = &parent->rb_left;
1401 }
1402 rb_link_node(&epi->rbn, parent, p);
1403 rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
1404}
1405
1406
1407
1408#define PATH_ARR_SIZE 5
1409/*
1410 * These are the numbers of paths of length 1 to 5 that we are allowing to emanate
1411 * from a single file of interest. For example, we allow 1000 paths of length
1412 * 1, to emanate from each file of interest. This essentially represents the
1413 * potential wakeup paths, which need to be limited in order to avoid massive
1414 * uncontrolled wakeup storms. The common use case should be a single ep which
1415 * is connected to n file sources. In this case each file source has 1 path
1416 * of length 1. Thus, the numbers below should be more than sufficient. These
1417 * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
1418 * and delete can't add additional paths. Protected by the epnested_mutex.
1419 */
1420static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
1421static int path_count[PATH_ARR_SIZE];
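/*
 * A rough worked example of these limits: a target file watched directly
 * by 600 epoll instances has 600 paths of length 1, which is within the
 * limit of 1000. If every one of those instances is itself watched by
 * another epoll instance, the target also gains 600 paths of length 2,
 * exceeding the limit of 500, so the EPOLL_CTL_ADD that crosses the
 * threshold fails.
 */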
1422
1423static int path_count_inc(int nests)
1424{
1425 /* Allow an arbitrary number of depth 1 paths */
1426 if (nests == 0)
1427 return 0;
1428
1429 if (++path_count[nests] > path_limits[nests])
1430 return -1;
1431 return 0;
1432}
1433
1434static void path_count_init(void)
1435{
1436 int i;
1437
1438 for (i = 0; i < PATH_ARR_SIZE; i++)
1439 path_count[i] = 0;
1440}
1441
1442static int reverse_path_check_proc(struct hlist_head *refs, int depth)
1443{
1444 int error = 0;
1445 struct epitem *epi;
1446
1447 if (depth > EP_MAX_NESTS) /* too deep nesting */
1448 return -1;
1449
1450 /* CTL_DEL can remove links here, but that can't increase our count */
1451 hlist_for_each_entry_rcu(epi, refs, fllink) {
1452 struct hlist_head *refs = &epi->ep->refs;
1453 if (hlist_empty(refs))
1454 error = path_count_inc(depth);
1455 else
1456 error = reverse_path_check_proc(refs, depth + 1);
1457 if (error != 0)
1458 break;
1459 }
1460 return error;
1461}
1462
1463/**
1464 * reverse_path_check - The tfile_check_list is a list of epitems_head, which have
1465 * links that are proposed to be newly added. We need to
1466 * make sure that those added links don't add too many
1467 * paths such that we will spend all our time waking up
1468 * eventpoll objects.
1469 *
1470 * Return: %zero if the proposed links don't create too many paths,
1471 * %-1 otherwise.
1472 */
1473static int reverse_path_check(void)
1474{
1475 struct epitems_head *p;
1476
1477 for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {
1478 int error;
1479 path_count_init();
1480 rcu_read_lock();
1481 error = reverse_path_check_proc(&p->epitems, 0);
1482 rcu_read_unlock();
1483 if (error)
1484 return error;
1485 }
1486 return 0;
1487}
1488
1489static int ep_create_wakeup_source(struct epitem *epi)
1490{
1491 struct name_snapshot n;
1492 struct wakeup_source *ws;
1493
1494 if (!epi->ep->ws) {
1495 epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
1496 if (!epi->ep->ws)
1497 return -ENOMEM;
1498 }
1499
1500 take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
1501 ws = wakeup_source_register(NULL, n.name.name);
1502 release_dentry_name_snapshot(&n);
1503
1504 if (!ws)
1505 return -ENOMEM;
1506 rcu_assign_pointer(epi->ws, ws);
1507
1508 return 0;
1509}
1510
1511/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
1512static noinline void ep_destroy_wakeup_source(struct epitem *epi)
1513{
1514 struct wakeup_source *ws = ep_wakeup_source(epi);
1515
1516 RCU_INIT_POINTER(epi->ws, NULL);
1517
1518 /*
1519 * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
1520 * used internally by wakeup_source_remove, too (called by
1521 * wakeup_source_unregister), so we cannot use call_rcu
1522 */
1523 synchronize_rcu();
1524 wakeup_source_unregister(ws);
1525}
1526
1527static int attach_epitem(struct file *file, struct epitem *epi)
1528{
1529 struct epitems_head *to_free = NULL;
1530 struct hlist_head *head = NULL;
1531 struct eventpoll *ep = NULL;
1532
1533 if (is_file_epoll(file))
1534 ep = file->private_data;
1535
1536 if (ep) {
1537 head = &ep->refs;
1538 } else if (!READ_ONCE(file->f_ep)) {
1539allocate:
1540 to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL);
1541 if (!to_free)
1542 return -ENOMEM;
1543 head = &to_free->epitems;
1544 }
1545 spin_lock(&file->f_lock);
1546 if (!file->f_ep) {
1547 if (unlikely(!head)) {
1548 spin_unlock(&file->f_lock);
1549 goto allocate;
1550 }
1551 /* See eventpoll_release() for details. */
1552 WRITE_ONCE(file->f_ep, head);
1553 to_free = NULL;
1554 }
1555 hlist_add_head_rcu(&epi->fllink, file->f_ep);
1556 spin_unlock(&file->f_lock);
1557 free_ephead(to_free);
1558 return 0;
1559}
1560
1561/*
1562 * Must be called with "mtx" held.
1563 */
1564static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
1565 struct file *tfile, int fd, int full_check)
1566{
1567 int error, pwake = 0;
1568 __poll_t revents;
1569 struct epitem *epi;
1570 struct ep_pqueue epq;
1571 struct eventpoll *tep = NULL;
1572
1573 if (is_file_epoll(tfile))
1574 tep = tfile->private_data;
1575
1576 lockdep_assert_irqs_enabled();
1577
1578 if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
1579 max_user_watches) >= 0))
1580 return -ENOSPC;
1581 percpu_counter_inc(&ep->user->epoll_watches);
1582
1583 if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
1584 percpu_counter_dec(&ep->user->epoll_watches);
1585 return -ENOMEM;
1586 }
1587
1588 /* Item initialization follows here ... */
1589 INIT_LIST_HEAD(&epi->rdllink);
1590 epi->ep = ep;
1591 ep_set_ffd(&epi->ffd, tfile, fd);
1592 epi->event = *event;
1593 epi->next = EP_UNACTIVE_PTR;
1594
1595 if (tep)
1596 mutex_lock_nested(&tep->mtx, 1);
1597 /* Add the current item to the list of active epoll hook for this file */
1598 if (unlikely(attach_epitem(tfile, epi) < 0)) {
1599 if (tep)
1600 mutex_unlock(&tep->mtx);
1601 kmem_cache_free(epi_cache, epi);
1602 percpu_counter_dec(&ep->user->epoll_watches);
1603 return -ENOMEM;
1604 }
1605
1606 if (full_check && !tep)
1607 list_file(tfile);
1608
1609 /*
1610 * Add the current item to the RB tree. All RB tree operations are
1611 * protected by "mtx", and ep_insert() is called with "mtx" held.
1612 */
1613 ep_rbtree_insert(ep, epi);
1614 if (tep)
1615 mutex_unlock(&tep->mtx);
1616
1617 /*
1618 * ep_remove_safe() calls in the later error paths can't lead to
1619 * ep_free() as the ep file itself still holds an ep reference.
1620 */
1621 ep_get(ep);
1622
1623 /* now check if we've created too many backpaths */
1624 if (unlikely(full_check && reverse_path_check())) {
1625 ep_remove_safe(ep, epi);
1626 return -EINVAL;
1627 }
1628
1629 if (epi->event.events & EPOLLWAKEUP) {
1630 error = ep_create_wakeup_source(epi);
1631 if (error) {
1632 ep_remove_safe(ep, epi);
1633 return error;
1634 }
1635 }
1636
1637 /* Initialize the poll table using the queue callback */
1638 epq.epi = epi;
1639 init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
1640
1641 /*
1642 * Attach the item to the poll hooks and get current event bits.
1643 * We can safely use the file* here because its usage count has
1644 * been increased by the caller of this function. Note that after
1645 * this operation completes, the poll callback can start hitting
1646 * the new item.
1647 */
1648 revents = ep_item_poll(epi, &epq.pt, 1);
1649
1650 /*
1651 * We have to check if something went wrong during the poll wait queue
1652 * install process. Namely, an allocation for a wait queue failed due
1653 * to high memory pressure.
1654 */
1655 if (unlikely(!epq.epi)) {
1656 ep_remove_safe(ep, epi);
1657 return -ENOMEM;
1658 }
1659
1660 /* We have to drop the new item inside our item list to keep track of it */
1661 spin_lock_irq(&ep->lock);
1662
1663 /* record NAPI ID of new item if present */
1664 ep_set_busy_poll_napi_id(epi);
1665
1666 /* If the file is already "ready" we drop it inside the ready list */
1667 if (revents && !ep_is_linked(epi)) {
1668 list_add_tail(&epi->rdllink, &ep->rdllist);
1669 ep_pm_stay_awake(epi);
1670
1671 /* Notify waiting tasks that events are available */
1672 if (waitqueue_active(&ep->wq))
1673 wake_up(&ep->wq);
1674 if (waitqueue_active(&ep->poll_wait))
1675 pwake++;
1676 }
1677
1678 spin_unlock_irq(&ep->lock);
1679
1680 /* We have to call this outside the lock */
1681 if (pwake)
1682 ep_poll_safewake(ep, NULL, 0);
1683
1684 return 0;
1685}
1686
1687/*
1688 * Modify the interest event mask by dropping an event if the new mask
1689 * has a match in the current file status. Must be called with "mtx" held.
1690 */
1691static int ep_modify(struct eventpoll *ep, struct epitem *epi,
1692 const struct epoll_event *event)
1693{
1694 int pwake = 0;
1695 poll_table pt;
1696
1697 lockdep_assert_irqs_enabled();
1698
1699 init_poll_funcptr(&pt, NULL);
1700
1701 /*
1702 * Set the new event interest mask before calling f_op->poll();
1703 * otherwise we might miss an event that happens between the
1704 * f_op->poll() call and the new event set registering.
1705 */
1706 epi->event.events = event->events; /* need barrier below */
1707 epi->event.data = event->data; /* protected by mtx */
1708 if (epi->event.events & EPOLLWAKEUP) {
1709 if (!ep_has_wakeup_source(epi))
1710 ep_create_wakeup_source(epi);
1711 } else if (ep_has_wakeup_source(epi)) {
1712 ep_destroy_wakeup_source(epi);
1713 }
1714
1715 /*
1716 * The following barrier has two effects:
1717 *
1718 * 1) Flush epi changes above to other CPUs. This ensures
1719 * we do not miss events from ep_poll_callback if an
1720 * event occurs immediately after we call f_op->poll().
1721 * We need this because we did not take ep->lock while
1722 * changing epi above (but ep_poll_callback does take
1723 * ep->lock).
1724 *
1725 * 2) We also need to ensure we do not miss _past_ events
1726 * when calling f_op->poll(). This barrier also
1727 * pairs with the barrier in wq_has_sleeper (see
1728 * comments for wq_has_sleeper).
1729 *
1730 * This barrier will now guarantee ep_poll_callback or f_op->poll
1731 * (or both) will notice the readiness of an item.
1732 */
1733 smp_mb();
1734
1735 /*
1736 * Get current event bits. We can safely use the file* here because
1737 * its usage count has been increased by the caller of this function.
1738 * If the item is "hot" and it is not registered inside the ready
1739 * list, push it inside.
1740 */
1741 if (ep_item_poll(epi, &pt, 1)) {
1742 spin_lock_irq(&ep->lock);
1743 if (!ep_is_linked(epi)) {
1744 list_add_tail(&epi->rdllink, &ep->rdllist);
1745 ep_pm_stay_awake(epi);
1746
1747 /* Notify waiting tasks that events are available */
1748 if (waitqueue_active(&ep->wq))
1749 wake_up(&ep->wq);
1750 if (waitqueue_active(&ep->poll_wait))
1751 pwake++;
1752 }
1753 spin_unlock_irq(&ep->lock);
1754 }
1755
1756 /* We have to call this outside the lock */
1757 if (pwake)
1758 ep_poll_safewake(ep, NULL, 0);
1759
1760 return 0;
1761}
1762
1763static int ep_send_events(struct eventpoll *ep,
1764 struct epoll_event __user *events, int maxevents)
1765{
1766 struct epitem *epi, *tmp;
1767 LIST_HEAD(txlist);
1768 poll_table pt;
1769 int res = 0;
1770
1771 /*
1772 * Always short-circuit for fatal signals to allow threads to make a
1773 * timely exit without the chance of finding more events available and
1774 * fetching repeatedly.
1775 */
1776 if (fatal_signal_pending(current))
1777 return -EINTR;
1778
1779 init_poll_funcptr(&pt, NULL);
1780
1781 mutex_lock(&ep->mtx);
1782 ep_start_scan(ep, &txlist);
1783
1784 /*
1785 * We can loop without lock because we are passed a task private list.
1786 * Items cannot vanish during the loop because we are holding ep->mtx.
1787 */
1788 list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
1789 struct wakeup_source *ws;
1790 __poll_t revents;
1791
1792 if (res >= maxevents)
1793 break;
1794
1795 /*
1796 * Activate ep->ws before deactivating epi->ws to prevent
1797 * triggering auto-suspend here (in case we reactivate epi->ws
1798 * below).
1799 *
1800 * This could be rearranged to delay the deactivation of epi->ws
1801 * instead, but then epi->ws would temporarily be out of sync
1802 * with ep_is_linked().
1803 */
1804 ws = ep_wakeup_source(epi);
1805 if (ws) {
1806 if (ws->active)
1807 __pm_stay_awake(ep->ws);
1808 __pm_relax(ws);
1809 }
1810
1811 list_del_init(&epi->rdllink);
1812
1813 /*
1814 * If the event mask intersect the caller-requested one,
1815 * deliver the event to userspace. Again, we are holding ep->mtx,
1816 * so no operations coming from userspace can change the item.
1817 */
1818 revents = ep_item_poll(epi, &pt, 1);
1819 if (!revents)
1820 continue;
1821
1822 events = epoll_put_uevent(revents, epi->event.data, events);
1823 if (!events) {
1824 list_add(&epi->rdllink, &txlist);
1825 ep_pm_stay_awake(epi);
1826 if (!res)
1827 res = -EFAULT;
1828 break;
1829 }
1830 res++;
1831 if (epi->event.events & EPOLLONESHOT)
1832 epi->event.events &= EP_PRIVATE_BITS;
1833 else if (!(epi->event.events & EPOLLET)) {
1834 /*
1835 * If this file has been added with Level
1836 * Trigger mode, we need to insert back inside
1837 * the ready list, so that the next call to
1838 * epoll_wait() will check again the events
1839 * availability. At this point, no one can insert
1840 * into ep->rdllist besides us. The epoll_ctl()
1841 * callers are locked out by
1842 * ep_send_events() holding "mtx" and the
1843 * poll callback will queue them in ep->ovflist.
1844 */
1845 list_add_tail(&epi->rdllink, &ep->rdllist);
1846 ep_pm_stay_awake(epi);
1847 }
1848 }
1849 ep_done_scan(ep, &txlist);
1850 mutex_unlock(&ep->mtx);
1851
1852 return res;
1853}
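/*
 * Userspace view of the level-triggered re-insertion above (sketch with
 * hypothetical fds): without EPOLLET an undrained socket is reported by
 * every epoll_wait() call, with EPOLLET only once per new arrival.
 *
 * struct epoll_event ev = { .events = EPOLLIN, .data.fd = sock_fd };
 * epoll_ctl(epfd, EPOLL_CTL_ADD, sock_fd, &ev);
 * n = epoll_wait(epfd, out, 64, 0);
 *
 * Repeated epoll_wait() calls keep returning sock_fd until the pending
 * data is read (or EPOLLET is set).
 */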
1854
1855static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
1856{
1857 struct timespec64 now;
1858
1859 if (ms < 0)
1860 return NULL;
1861
1862 if (!ms) {
1863 to->tv_sec = 0;
1864 to->tv_nsec = 0;
1865 return to;
1866 }
1867
1868 to->tv_sec = ms / MSEC_PER_SEC;
1869 to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);
1870
1871 ktime_get_ts64(&now);
1872 *to = timespec64_add_safe(now, *to);
1873 return to;
1874}
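/*
 * For example, a relative timeout of 1250 ms becomes tv_sec = 1 and
 * tv_nsec = 250 * NSEC_PER_MSEC, which is then added to the current
 * monotonic time to form the absolute expiry consumed by ep_poll().
 */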

/*
 * autoremove_wake_function, but remove even on failure to wake up, because we
 * know that default_wake_function/ttwu will only fail if the thread is already
 * woken, and in that case the ep_poll loop will remove the entry anyways, not
 * try to reuse it.
 */
static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
				       unsigned int mode, int sync, void *key)
{
	int ret = default_wake_function(wq_entry, mode, sync, key);

	/*
	 * Pairs with list_empty_careful in ep_poll, and ensures future loop
	 * iterations see the cause of this wakeup.
	 */
	list_del_init_careful(&wq_entry->entry);
	return ret;
}

static int ep_try_send_events(struct eventpoll *ep,
			      struct epoll_event __user *events, int maxevents)
{
	int res;

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	res = ep_send_events(ep, events, maxevents);
	if (res > 0)
		ep_suspend_napi_irqs(ep);
	return res;
}

static int ep_schedule_timeout(ktime_t *to)
{
	if (to)
		return ktime_after(*to, ktime_get());
	else
		return 1;
}

/**
 * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
 *           event buffer.
 *
 * @ep: Pointer to the eventpoll context.
 * @events: Pointer to the userspace buffer where the ready events should be
 *          stored.
 * @maxevents: Size (in terms of number of events) of the caller event buffer.
 * @timeout: Maximum timeout for the ready events fetch operation, in
 *           timespec. If the timeout is zero, the function will not block,
 *           while if the @timeout ptr is NULL, the function will block
 *           until at least one event has been retrieved (or an error
 *           occurred).
 *
 * Return: the number of ready events which have been fetched, or an
 *         error code, in case of error.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, struct timespec64 *timeout)
{
	int res, eavail, timed_out = 0;
	u64 slack = 0;
	wait_queue_entry_t wait;
	ktime_t expires, *to = NULL;

	lockdep_assert_irqs_enabled();

	if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
		slack = select_estimate_accuracy(timeout);
		to = &expires;
		*to = timespec64_to_ktime(*timeout);
	} else if (timeout) {
		/*
		 * Avoid the unnecessary trip to the wait queue loop, if the
		 * caller specified a non-blocking operation.
		 */
		timed_out = 1;
	}

	/*
	 * This call is racy: We may or may not see events that are being added
	 * to the ready list under the lock (e.g., in IRQ callbacks). For cases
	 * with a non-zero timeout, this thread will check the ready list under
	 * lock and will add to the wait queue. For cases with a zero
	 * timeout, the user by definition should not care and will have to
	 * recheck again.
	 */
	eavail = ep_events_available(ep);

	while (1) {
		if (eavail) {
			res = ep_try_send_events(ep, events, maxevents);
			if (res)
				return res;
		}

		if (timed_out)
			return 0;

		eavail = ep_busy_loop(ep);
		if (eavail)
			continue;

		if (signal_pending(current))
			return -EINTR;

		/*
		 * Internally init_wait() uses autoremove_wake_function(),
		 * thus wait entry is removed from the wait queue on each
		 * wakeup. Why is it important? In case of several waiters
		 * each new wakeup will hit the next waiter, giving it the
		 * chance to harvest new events. Otherwise wakeups can be
		 * lost. This is also good performance-wise, because on the
		 * normal wakeup path there is no need to call
		 * __remove_wait_queue() explicitly, so ep->lock is not
		 * taken, which would stall event delivery.
		 *
		 * In fact, we now use an even more aggressive function that
		 * unconditionally removes, because we don't reuse the wait
		 * entry between loop iterations. This lets us also avoid the
		 * performance issue if a process is killed, causing all of its
		 * threads to wake up without being removed normally.
		 */
		init_wait(&wait);
		wait.func = ep_autoremove_wake_function;

		spin_lock_irq(&ep->lock);
		/*
		 * Barrierless variant, waitqueue_active() is called under
		 * the same lock on wakeup ep_poll_callback() side, so it
		 * is safe to avoid an explicit barrier.
		 */
		__set_current_state(TASK_INTERRUPTIBLE);

		/*
		 * Do the final check under the lock. ep_start/done_scan()
		 * plays with two lists (->rdllist and ->ovflist) and there
		 * is always a race when both lists are empty for short
		 * period of time although events are pending, so lock is
		 * important.
		 */
		eavail = ep_events_available(ep);
		if (!eavail)
			__add_wait_queue_exclusive(&ep->wq, &wait);

		spin_unlock_irq(&ep->lock);

		if (!eavail)
			timed_out = !ep_schedule_timeout(to) ||
				!schedule_hrtimeout_range(to, slack,
							  HRTIMER_MODE_ABS);
		__set_current_state(TASK_RUNNING);

		/*
		 * We were woken up, thus go and try to harvest some events.
		 * If timed out and still on the wait queue, recheck eavail
		 * carefully under lock, below.
		 */
		eavail = 1;

		if (!list_empty_careful(&wait.entry)) {
			spin_lock_irq(&ep->lock);
			/*
			 * If the thread timed out and is not on the wait queue,
			 * it means that the thread was woken up after its
			 * timeout expired before it could reacquire the lock.
			 * Thus, when wait.entry is empty, it needs to harvest
			 * events.
			 */
			if (timed_out)
				eavail = list_empty(&wait.entry);
			__remove_wait_queue(&ep->wq, &wait);
			spin_unlock_irq(&ep->lock);
		}
	}
}

/**
 * ep_loop_check_proc - verify that adding an epoll file @ep inside another
 *                      epoll file does not create closed loops, and
 *                      determine the depth of the subtree starting at @ep
 *
 * @ep: the &struct eventpoll to be currently checked.
 * @depth: Current depth of the path being checked.
 *
 * Return: depth of the subtree, or INT_MAX if we found a loop or went too deep.
 */
static int ep_loop_check_proc(struct eventpoll *ep, int depth)
{
	int result = 0;
	struct rb_node *rbp;
	struct epitem *epi;

	if (ep->gen == loop_check_gen)
		return ep->loop_check_depth;

	mutex_lock_nested(&ep->mtx, depth + 1);
	ep->gen = loop_check_gen;
	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);
		if (unlikely(is_file_epoll(epi->ffd.file))) {
			struct eventpoll *ep_tovisit;
			ep_tovisit = epi->ffd.file->private_data;
			if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
				result = INT_MAX;
			else
				result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1);
			if (result > EP_MAX_NESTS)
				break;
		} else {
			/*
			 * If we've reached a file that is not associated with
			 * an ep, then we need to check if the newly added
			 * links are going to add too many wakeup paths. We do
			 * this by adding it to the tfile_check_list, if it's
			 * not already there, and calling reverse_path_check()
			 * during ep_insert().
			 */
			list_file(epi->ffd.file);
		}
	}
	ep->loop_check_depth = result;
	mutex_unlock(&ep->mtx);

	return result;
}

/* ep_get_upwards_depth_proc - determine depth of @ep when traversed upwards */
static int ep_get_upwards_depth_proc(struct eventpoll *ep, int depth)
{
	int result = 0;
	struct epitem *epi;

	if (ep->gen == loop_check_gen)
		return ep->loop_check_depth;
	hlist_for_each_entry_rcu(epi, &ep->refs, fllink)
		result = max(result, ep_get_upwards_depth_proc(epi->ep, depth + 1) + 1);
	ep->gen = loop_check_gen;
	ep->loop_check_depth = result;
	return result;
}

/**
 * ep_loop_check - Performs a check to verify that adding an epoll file (@to)
 *                 into another epoll file (represented by @ep) does not create
 *                 closed loops or too deep chains.
 *
 * @ep: Pointer to the epoll we are inserting into.
 * @to: Pointer to the epoll to be inserted.
 *
 * Return: %zero if adding the epoll @to inside the epoll @ep
 *         does not violate the constraints, or %-1 otherwise.
 */
static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
{
	int depth, upwards_depth;

	inserting_into = ep;
	/*
	 * Check how deep down we can get from @to, and whether it is possible
	 * to loop up to @ep.
	 */
	depth = ep_loop_check_proc(to, 0);
	if (depth > EP_MAX_NESTS)
		return -1;
	/* Check how far up we can go from @ep. */
	rcu_read_lock();
	upwards_depth = ep_get_upwards_depth_proc(ep, 0);
	rcu_read_unlock();

	return (depth+1+upwards_depth > EP_MAX_NESTS) ? -1 : 0;
}
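
/*
 * Illustrative userspace sketch (not part of this file) of the constraint
 * enforced above: adding two epoll descriptors to each other would form a
 * cycle, so the second EPOLL_CTL_ADD is rejected (do_epoll_ctl() below
 * returns -ELOOP). Exceeding the maximum nesting depth fails the same way.
 *
 *	int e1 = epoll_create1(0);
 *	int e2 = epoll_create1(0);
 *	struct epoll_event ev = { .events = EPOLLIN, .data.fd = e2 };
 *
 *	epoll_ctl(e1, EPOLL_CTL_ADD, e2, &ev);	// succeeds
 *	ev.data.fd = e1;
 *	epoll_ctl(e2, EPOLL_CTL_ADD, e1, &ev);	// fails, errno == ELOOP
 */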

static void clear_tfile_check_list(void)
{
	rcu_read_lock();
	while (tfile_check_list != EP_UNACTIVE_PTR) {
		struct epitems_head *head = tfile_check_list;
		tfile_check_list = head->next;
		unlist_file(head);
	}
	rcu_read_unlock();
}

/*
 * Open an eventpoll file descriptor.
 */
static int do_epoll_create(int flags)
{
	int error, fd;
	struct eventpoll *ep = NULL;
	struct file *file;

	/* Check the EPOLL_* constant for consistency. */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;
	/*
	 * Create the internal data structure ("struct eventpoll").
	 */
	error = ep_alloc(&ep);
	if (error < 0)
		return error;
	/*
	 * Creates all the items needed to setup an eventpoll file. That is,
	 * a file structure and a free file descriptor.
	 */
	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
	if (fd < 0) {
		error = fd;
		goto out_free_ep;
	}
	file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
				  O_RDWR | (flags & O_CLOEXEC));
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto out_free_fd;
	}
	ep->file = file;
	fd_install(fd, file);
	return fd;

out_free_fd:
	put_unused_fd(fd);
out_free_ep:
	ep_clear_and_put(ep);
	return error;
}

SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	return do_epoll_create(flags);
}

SYSCALL_DEFINE1(epoll_create, int, size)
{
	if (size <= 0)
		return -EINVAL;

	return do_epoll_create(0);
}
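
/*
 * Illustrative userspace sketch (not part of this file): the syscalls above
 * are normally reached through the epoll_create1()/epoll_create() wrappers
 * declared in <sys/epoll.h>. The size argument of epoll_create() only needs
 * to be positive; its value is otherwise ignored.
 *
 *	#include <sys/epoll.h>
 *
 *	int epfd = epoll_create1(EPOLL_CLOEXEC);
 *	if (epfd < 0)
 *		perror("epoll_create1");
 */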

#ifdef CONFIG_PM_SLEEP
static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
{
	if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
		epev->events &= ~EPOLLWAKEUP;
}
#else
static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
{
	epev->events &= ~EPOLLWAKEUP;
}
#endif

static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
				   bool nonblock)
{
	if (!nonblock) {
		mutex_lock_nested(mutex, depth);
		return 0;
	}
	if (mutex_trylock(mutex))
		return 0;
	return -EAGAIN;
}

int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
		 bool nonblock)
{
	int error;
	int full_check = 0;
	struct eventpoll *ep;
	struct epitem *epi;
	struct eventpoll *tep = NULL;

	CLASS(fd, f)(epfd);
	if (fd_empty(f))
		return -EBADF;

	/* Get the "struct file *" for the target file */
	CLASS(fd, tf)(fd);
	if (fd_empty(tf))
		return -EBADF;

	/* The target file descriptor must support poll */
	if (!file_can_poll(fd_file(tf)))
		return -EPERM;

	/* Check if EPOLLWAKEUP is allowed */
	if (ep_op_has_event(op))
		ep_take_care_of_epollwakeup(epds);

	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file. And also we do not permit
	 * adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
	if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f)))
		goto error_tgt_fput;

	/*
	 * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
	 * so EPOLLEXCLUSIVE is not allowed for an EPOLL_CTL_MOD operation.
	 * Also, we do not currently support nested exclusive wakeups.
	 */
	if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
		if (op == EPOLL_CTL_MOD)
			goto error_tgt_fput;
		if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) ||
				(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
			goto error_tgt_fput;
	}

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = fd_file(f)->private_data;

	/*
	 * When we insert an epoll file descriptor inside another epoll file
	 * descriptor, there is the chance of creating closed loops, which are
	 * better handled here than in more critical paths. While we are
	 * checking for loops we also determine the list of files reachable
	 * and hang them on the tfile_check_list, so we can check that we
	 * haven't created too many possible wakeup paths.
	 *
	 * We do not need to take the global 'epnested_mutex' on EPOLL_CTL_ADD
	 * when the epoll file descriptor is attaching directly to a wakeup
	 * source, unless the epoll file descriptor is nested. The purpose of
	 * taking the 'epnested_mutex' on add is to prevent complex topologies
	 * such as loops and deep wakeup paths from forming in parallel through
	 * multiple EPOLL_CTL_ADD operations.
	 */
	error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
	if (error)
		goto error_tgt_fput;
	if (op == EPOLL_CTL_ADD) {
		if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen ||
		    is_file_epoll(fd_file(tf))) {
			mutex_unlock(&ep->mtx);
			error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
			if (error)
				goto error_tgt_fput;
			loop_check_gen++;
			full_check = 1;
			if (is_file_epoll(fd_file(tf))) {
				tep = fd_file(tf)->private_data;
				error = -ELOOP;
				if (ep_loop_check(ep, tep) != 0)
					goto error_tgt_fput;
			}
			error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
			if (error)
				goto error_tgt_fput;
		}
	}

	/*
	 * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	epi = ep_find(ep, fd_file(tf), fd);

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			epds->events |= EPOLLERR | EPOLLHUP;
			error = ep_insert(ep, epds, fd_file(tf), fd, full_check);
		} else
			error = -EEXIST;
		break;
	case EPOLL_CTL_DEL:
		if (epi) {
			/*
			 * The eventpoll itself is still alive: the refcount
			 * can't go to zero here.
			 */
			ep_remove_safe(ep, epi);
			error = 0;
		} else {
			error = -ENOENT;
		}
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
				epds->events |= EPOLLERR | EPOLLHUP;
				error = ep_modify(ep, epi, epds);
			}
		} else
			error = -ENOENT;
		break;
	}
	mutex_unlock(&ep->mtx);

error_tgt_fput:
	if (full_check) {
		clear_tfile_check_list();
		loop_check_gen++;
		mutex_unlock(&epnested_mutex);
	}
	return error;
}

/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	struct epoll_event epds;

	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		return -EFAULT;

	return do_epoll_ctl(epfd, op, fd, &epds, false);
}
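
/*
 * Illustrative userspace sketch (not part of this file): registering interest
 * in readability of a hypothetical descriptor "sockfd" on an epoll instance
 * "epfd" through the syscall above. EPOLLERR and EPOLLHUP are reported even
 * if not requested, since do_epoll_ctl() ORs them in unconditionally on
 * ADD and MOD.
 *
 *	struct epoll_event ev = {
 *		.events = EPOLLIN,
 *		.data.fd = sockfd,
 *	};
 *	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev) < 0)
 *		perror("epoll_ctl(EPOLL_CTL_ADD)");
 */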

static int ep_check_params(struct file *file, struct epoll_event __user *evs,
			   int maxevents)
{
	/* The maximum number of events must be greater than zero */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
	if (!access_ok(evs, maxevents * sizeof(struct epoll_event)))
		return -EFAULT;

	/*
	 * We have to check that the file structure underneath the fd
	 * the user passed to us _is_ an eventpoll file.
	 */
	if (!is_file_epoll(file))
		return -EINVAL;

	return 0;
}

int epoll_sendevents(struct file *file, struct epoll_event __user *events,
		     int maxevents)
{
	struct eventpoll *ep;
	int ret;

	ret = ep_check_params(file, events, maxevents);
	if (unlikely(ret))
		return ret;

	ep = file->private_data;
	/*
	 * Racy call, but that's ok - it should get retried based on
	 * poll readiness anyway.
	 */
	if (ep_events_available(ep))
		return ep_try_send_events(ep, events, maxevents);
	return 0;
}

/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_wait(2).
 */
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
			 int maxevents, struct timespec64 *to)
{
	struct eventpoll *ep;
	int ret;

	/* Get the "struct file *" for the eventpoll file */
	CLASS(fd, f)(epfd);
	if (fd_empty(f))
		return -EBADF;

	ret = ep_check_params(fd_file(f), events, maxevents);
	if (unlikely(ret))
		return ret;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = fd_file(f)->private_data;

	/* Time to fish for events ... */
	return ep_poll(ep, events, maxevents, to);
}

SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	struct timespec64 to;

	return do_epoll_wait(epfd, events, maxevents,
			     ep_timeout_to_timespec(&to, timeout));
}
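
/*
 * Illustrative userspace sketch (not part of this file): a typical wait loop
 * over the syscall above. A timeout of -1 blocks until an event or a signal,
 * 0 performs a non-blocking poll, and a positive value is a timeout in
 * milliseconds (converted by ep_timeout_to_timespec()). "epfd" and
 * handle_event() are placeholders.
 *
 *	struct epoll_event evs[64];
 *	int n = epoll_wait(epfd, evs, 64, -1);
 *	for (int i = 0; i < n; i++)
 *		handle_event(evs[i].data.fd, evs[i].events);
 */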

/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_pwait(2).
 */
static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
			  int maxevents, struct timespec64 *to,
			  const sigset_t __user *sigmask, size_t sigsetsize)
{
	int error;

	/*
	 * If the caller wants a certain signal mask to be set during the wait,
	 * we apply it here.
	 */
	error = set_user_sigmask(sigmask, sigsetsize);
	if (error)
		return error;

	error = do_epoll_wait(epfd, events, maxevents, to);

	restore_saved_sigmask_unless(error == -EINTR);

	return error;
}

SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout, const sigset_t __user *, sigmask,
		size_t, sigsetsize)
{
	struct timespec64 to;

	return do_epoll_pwait(epfd, events, maxevents,
			      ep_timeout_to_timespec(&to, timeout),
			      sigmask, sigsetsize);
}
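
/*
 * Illustrative userspace sketch (not part of this file): like ppoll(2) and
 * pselect(2), epoll_pwait() atomically swaps in a signal mask for the
 * duration of the wait, closing the race between unblocking a signal and
 * calling epoll_wait(). "epfd" and "evs" are assumed to be set up as in the
 * sketch above.
 *
 *	sigset_t mask;
 *	sigemptyset(&mask);		// allow all signals while waiting
 *	int n = epoll_pwait(epfd, evs, 64, -1, &mask);
 */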

SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
		int, maxevents, const struct __kernel_timespec __user *, timeout,
		const sigset_t __user *, sigmask, size_t, sigsetsize)
{
	struct timespec64 ts, *to = NULL;

	if (timeout) {
		if (get_timespec64(&ts, timeout))
			return -EFAULT;
		to = &ts;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	return do_epoll_pwait(epfd, events, maxevents, to,
			      sigmask, sigsetsize);
}
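
/*
 * Illustrative userspace sketch (not part of this file): epoll_pwait2() takes
 * the timeout as a struct timespec instead of milliseconds, giving nanosecond
 * resolution, and a NULL timeout blocks indefinitely. The libc wrapper shown
 * here is an assumption; the raw syscall additionally carries the sigsetsize
 * argument seen above. "epfd" and "evs" are assumed as in the earlier sketch.
 *
 *	struct timespec ts = { .tv_sec = 0, .tv_nsec = 500 * 1000 * 1000 };
 *	int n = epoll_pwait2(epfd, evs, 64, &ts, NULL);
 */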

#ifdef CONFIG_COMPAT
static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
				 int maxevents, struct timespec64 *timeout,
				 const compat_sigset_t __user *sigmask,
				 compat_size_t sigsetsize)
{
	long err;

	/*
	 * If the caller wants a certain signal mask to be set during the wait,
	 * we apply it here.
	 */
	err = set_compat_user_sigmask(sigmask, sigsetsize);
	if (err)
		return err;

	err = do_epoll_wait(epfd, events, maxevents, timeout);

	restore_saved_sigmask_unless(err == -EINTR);

	return err;
}

COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
		       struct epoll_event __user *, events,
		       int, maxevents, int, timeout,
		       const compat_sigset_t __user *, sigmask,
		       compat_size_t, sigsetsize)
{
	struct timespec64 to;

	return do_compat_epoll_pwait(epfd, events, maxevents,
				     ep_timeout_to_timespec(&to, timeout),
				     sigmask, sigsetsize);
}

COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd,
		       struct epoll_event __user *, events,
		       int, maxevents,
		       const struct __kernel_timespec __user *, timeout,
		       const compat_sigset_t __user *, sigmask,
		       compat_size_t, sigsetsize)
{
	struct timespec64 ts, *to = NULL;

	if (timeout) {
		if (get_timespec64(&ts, timeout))
			return -EFAULT;
		to = &ts;
		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
			return -EINVAL;
	}

	return do_compat_epoll_pwait(epfd, events, maxevents, to,
				     sigmask, sigsetsize);
}

#endif

static int __init eventpoll_init(void)
{
	struct sysinfo si;

	si_meminfo(&si);
	/*
	 * Allows top 4% of lowmem to be allocated for epoll watches (per user).
	 */
	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
		EP_ITEM_COST;
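	/*
	 * Worked example for the max_user_watches computation above
	 * (illustrative only): with 1 GiB of low memory, 1 GiB / 25 ~= 41 MiB
	 * worth of pages is budgeted for epoll watches per user; dividing that
	 * byte budget by EP_ITEM_COST (the assumed per-watch bookkeeping
	 * overhead) gives the watch limit.
	 */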
	BUG_ON(max_user_watches < 0);

	/*
	 * We can have many thousands of epitems, so prevent this from
	 * using an extra cache line on 64-bit (and smaller) CPUs
	 */
	BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);

	/* Allocates slab cache used to allocate "struct epitem" items */
	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);

	/* Allocates slab cache used to allocate "struct eppoll_entry" */
	pwq_cache = kmem_cache_create("eventpoll_pwq",
		sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
	epoll_sysctls_init();

	ephead_cache = kmem_cache_create("ep_head",
		sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);

	return 0;
}
fs_initcall(eventpoll_init);