// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/hashtable.h>
#include <linux/io_uring.h>

#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "alloc_cache.h"
#include "refs.h"
#include "napi.h"
#include "opdef.h"
#include "kbuf.h"
#include "poll.h"
#include "cancel.h"

struct io_poll_update {
	struct file *file;
	u64 old_user_data;
	u64 new_user_data;
	__poll_t events;
	bool update_events;
	bool update_user_data;
};

struct io_poll_table {
	struct poll_table_struct pt;
	struct io_kiocb *req;
	int nr_entries;
	int error;
	bool owning;
	/* output value, set only if arm poll returns >0 */
	__poll_t result_mask;
};

#define IO_POLL_CANCEL_FLAG	BIT(31)
#define IO_POLL_RETRY_FLAG	BIT(30)
#define IO_POLL_REF_MASK	GENMASK(29, 0)

/*
 * We usually have 1-2 refs taken, 128 is more than enough and we want to
 * maximise the margin between this amount and the moment when it overflows.
 */
#define IO_POLL_REF_BIAS	128

#define IO_WQE_F_DOUBLE		1

static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
			void *key);

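/*
 * The request pointer is stashed in wait_queue_entry->private, with the
 * lowest bit (IO_WQE_F_DOUBLE) marking entries that belong to the second
 * (double) poll entry rather than the first one.
 */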
static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe)
{
	unsigned long priv = (unsigned long)wqe->private;

	return (struct io_kiocb *)(priv & ~IO_WQE_F_DOUBLE);
}

static inline bool wqe_is_double(struct wait_queue_entry *wqe)
{
	unsigned long priv = (unsigned long)wqe->private;

	return priv & IO_WQE_F_DOUBLE;
}

static bool io_poll_get_ownership_slowpath(struct io_kiocb *req)
{
	int v;

	/*
	 * poll_refs are already elevated and we don't have much hope for
	 * grabbing the ownership. Instead of incrementing, set a retry flag
	 * to notify the loop that there might have been some change.
	 */
	v = atomic_fetch_or(IO_POLL_RETRY_FLAG, &req->poll_refs);
	if (v & IO_POLL_REF_MASK)
		return false;
	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}

/*
 * If the refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, the request is
 * free and we can bump it to acquire ownership. Requests may only be modified
 * by their owner, which prevents races when enqueueing task_work and between
 * arming poll and wakeups.
 */
static inline bool io_poll_get_ownership(struct io_kiocb *req)
{
	if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
		return io_poll_get_ownership_slowpath(req);
	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}

static void io_poll_mark_cancelled(struct io_kiocb *req)
{
	atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
}

static struct io_poll *io_poll_get_double(struct io_kiocb *req)
{
	/* pure poll stashes this in ->async_data, poll driven retry elsewhere */
	if (req->opcode == IORING_OP_POLL_ADD)
		return req->async_data;
	return req->apoll->double_poll;
}

static struct io_poll *io_poll_get_single(struct io_kiocb *req)
{
	if (req->opcode == IORING_OP_POLL_ADD)
		return io_kiocb_to_cmd(req, struct io_poll);
	return &req->apoll->poll;
}

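/*
 * Hash the request by its user_data into the ring's cancellation table so
 * poll update/remove and cancel requests can find it later. The table is
 * protected by ->uring_lock.
 */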
static void io_poll_req_insert(struct io_kiocb *req)
{
	struct io_hash_table *table = &req->ctx->cancel_table;
	u32 index = hash_long(req->cqe.user_data, table->hash_bits);

	lockdep_assert_held(&req->ctx->uring_lock);

	hlist_add_head(&req->hash_node, &table->hbs[index].list);
}

static void io_init_poll_iocb(struct io_poll *poll, __poll_t events)
{
	poll->head = NULL;
#define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
	/* mask in events that we always want/need */
	poll->events = events | IO_POLL_UNMASK;
	INIT_LIST_HEAD(&poll->wait.entry);
	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
}

static inline void io_poll_remove_entry(struct io_poll *poll)
{
	struct wait_queue_head *head = smp_load_acquire(&poll->head);

	if (head) {
		spin_lock_irq(&head->lock);
		list_del_init(&poll->wait.entry);
		poll->head = NULL;
		spin_unlock_irq(&head->lock);
	}
}

static void io_poll_remove_entries(struct io_kiocb *req)
{
	/*
	 * Nothing to do if neither of those flags are set. Avoid dipping
	 * into the poll/apoll/double cachelines if we can.
	 */
	if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
		return;

	/*
	 * While we hold the waitqueue lock and the waitqueue is nonempty,
	 * wake_up_pollfree() will wait for us. However, taking the waitqueue
	 * lock in the first place can race with the waitqueue being freed.
	 *
	 * We solve this as eventpoll does: by taking advantage of the fact that
	 * all users of wake_up_pollfree() will RCU-delay the actual free. If
	 * we enter rcu_read_lock() and see that the pointer to the queue is
	 * non-NULL, we can then lock it without the memory being freed out from
	 * under us.
	 *
	 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
	 * case the caller deletes the entry from the queue, leaving it empty.
	 * In that case, only RCU prevents the queue memory from being freed.
	 */
	rcu_read_lock();
	if (req->flags & REQ_F_SINGLE_POLL)
		io_poll_remove_entry(io_poll_get_single(req));
	if (req->flags & REQ_F_DOUBLE_POLL)
		io_poll_remove_entry(io_poll_get_double(req));
	rcu_read_unlock();
}

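/*
 * Return codes for io_poll_check_events():
 * IOU_POLL_DONE - request is done, result in req->cqe.res
 * IOU_POLL_NO_ACTION - spurious wakeup, or multishot CQE already posted
 * IOU_POLL_REMOVE_POLL_USE_RES - drop the poll, complete with req->cqe
 * IOU_POLL_REISSUE - resubmit the underlying request
 * IOU_POLL_REQUEUE - queue up the poll task_work again
 */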
enum {
	IOU_POLL_DONE = 0,
	IOU_POLL_NO_ACTION = 1,
	IOU_POLL_REMOVE_POLL_USE_RES = 2,
	IOU_POLL_REISSUE = 3,
	IOU_POLL_REQUEUE = 4,
};

static void __io_poll_execute(struct io_kiocb *req, int mask)
{
	unsigned flags = 0;

	io_req_set_res(req, mask, 0);
	req->io_task_work.func = io_poll_task_func;

	trace_io_uring_task_add(req, mask);

	if (!(req->flags & REQ_F_POLL_NO_LAZY))
		flags = IOU_F_TWQ_LAZY_WAKE;
	__io_req_task_work_add(req, flags);
}

static inline void io_poll_execute(struct io_kiocb *req, int res)
{
	if (io_poll_get_ownership(req))
		__io_poll_execute(req, res);
}

/*
 * All poll tw should go through this. Checks for poll events, manages
 * references, does rewait, etc.
 *
 * Returns a negative error on failure. IOU_POLL_NO_ACTION when no action is
 * required, which means either a spurious wakeup or that a multishot CQE has
 * already been served. IOU_POLL_DONE when it's done with the request, then
 * the mask is stored in req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates
 * to remove the multishot poll and that the result is stored in req->cqe.
 */
static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw)
{
	int v;

	if (unlikely(io_should_terminate_tw(req->ctx)))
		return -ECANCELED;

	do {
		v = atomic_read(&req->poll_refs);

		if (unlikely(v != 1)) {
			/* tw should be the owner and so have some refs */
			if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
				return IOU_POLL_NO_ACTION;
			if (v & IO_POLL_CANCEL_FLAG)
				return -ECANCELED;
			/*
			 * cqe.res contains only events of the first wake up
			 * and all others are to be lost. Redo vfs_poll() to get
			 * up to date state.
			 */
			if ((v & IO_POLL_REF_MASK) != 1)
				req->cqe.res = 0;

			if (v & IO_POLL_RETRY_FLAG) {
				req->cqe.res = 0;
				/*
				 * We won't find new events that came in between
				 * vfs_poll and the ref put unless we clear the
				 * flag in advance.
				 */
				atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs);
				v &= ~IO_POLL_RETRY_FLAG;
			}
		}

		/* the mask was stashed in __io_poll_execute */
		if (!req->cqe.res) {
			struct poll_table_struct pt = { ._key = req->apoll_events };
			req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
			/*
			 * We got woken with a mask, but someone else got to
			 * it first. The above vfs_poll() doesn't add us back
			 * to the waitqueue, so if we get nothing back, we
			 * should be safe and attempt a reissue.
			 */
			if (unlikely(!req->cqe.res)) {
				/* Multishot armed need not reissue */
				if (!(req->apoll_events & EPOLLONESHOT))
					continue;
				return IOU_POLL_REISSUE;
			}
		}
		if (req->apoll_events & EPOLLONESHOT)
			return IOU_POLL_DONE;

		/* multishot, just fill a CQE and proceed */
		if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
			__poll_t mask = mangle_poll(req->cqe.res &
						    req->apoll_events);

			if (!io_req_post_cqe(req, mask, IORING_CQE_F_MORE)) {
				io_req_set_res(req, mask, 0);
				return IOU_POLL_REMOVE_POLL_USE_RES;
			}
		} else {
			int ret = io_poll_issue(req, tw);

			if (ret == IOU_COMPLETE)
				return IOU_POLL_REMOVE_POLL_USE_RES;
			else if (ret == IOU_REQUEUE)
				return IOU_POLL_REQUEUE;
			if (ret != IOU_RETRY && ret < 0)
				return ret;
		}

		/* force the next iteration to vfs_poll() */
		req->cqe.res = 0;

		/*
		 * Release all references, retry if someone tried to restart
		 * task_work while we were executing it.
		 */
		v &= IO_POLL_REF_MASK;
	} while (atomic_sub_return(v, &req->poll_refs) & IO_POLL_REF_MASK);

	io_napi_add(req);
	return IOU_POLL_NO_ACTION;
}

void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw)
{
	int ret;

	ret = io_poll_check_events(req, tw);
	if (ret == IOU_POLL_NO_ACTION) {
		return;
	} else if (ret == IOU_POLL_REQUEUE) {
		__io_poll_execute(req, 0);
		return;
	}
	io_poll_remove_entries(req);
	/* task_work always has ->uring_lock held */
	hash_del(&req->hash_node);

	if (req->opcode == IORING_OP_POLL_ADD) {
		if (ret == IOU_POLL_DONE) {
			struct io_poll *poll;

			poll = io_kiocb_to_cmd(req, struct io_poll);
			req->cqe.res = mangle_poll(req->cqe.res & poll->events);
		} else if (ret == IOU_POLL_REISSUE) {
			io_req_task_submit(req, tw);
			return;
		} else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) {
			req->cqe.res = ret;
			req_set_fail(req);
		}

		io_req_set_res(req, req->cqe.res, 0);
		io_req_task_complete(req, tw);
	} else {
		io_tw_lock(req->ctx, tw);

		if (ret == IOU_POLL_REMOVE_POLL_USE_RES)
			io_req_task_complete(req, tw);
		else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE)
			io_req_task_submit(req, tw);
		else
			io_req_defer_failed(req, ret);
	}
}

static void io_poll_cancel_req(struct io_kiocb *req)
{
	io_poll_mark_cancelled(req);
	/* kick tw, which should complete the request */
	io_poll_execute(req, 0);
}

#define IO_ASYNC_POLL_COMMON	(EPOLLONESHOT | EPOLLPRI)

static __cold int io_pollfree_wake(struct io_kiocb *req, struct io_poll *poll)
{
	io_poll_mark_cancelled(req);
	/* we have to kick tw in case it's not already */
	io_poll_execute(req, 0);

	/*
	 * If the waitqueue is being freed early but someone already holds
	 * ownership over it, we have to tear down the request as best we
	 * can. That means immediately removing the request from its
	 * waitqueue and preventing all further accesses to the waitqueue
	 * via the request.
	 */
	list_del_init(&poll->wait.entry);

	/*
	 * Careful: this *must* be the last step, since as soon
	 * as req->head is NULL'ed out, the request can be
	 * completed and freed, since aio_poll_complete_work()
	 * will no longer need to take the waitqueue lock.
	 */
	smp_store_release(&poll->head, NULL);
	return 1;
}

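/*
 * Waitqueue wake callback. Filters out events we didn't ask for, handles
 * POLLFREE teardown, and otherwise tries to grab ownership and punts the
 * rest of the processing to task_work via __io_poll_execute().
 */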
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
			void *key)
{
	struct io_kiocb *req = wqe_to_req(wait);
	struct io_poll *poll = container_of(wait, struct io_poll, wait);
	__poll_t mask = key_to_poll(key);

	if (unlikely(mask & POLLFREE))
		return io_pollfree_wake(req, poll);

	/* for instances that support it check for an event match first */
	if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON)))
		return 0;

	if (io_poll_get_ownership(req)) {
		/*
		 * If we trigger a multishot poll off our own wakeup path,
		 * disable multishot as there is a circular dependency between
		 * CQ posting and triggering the event.
		 */
		if (mask & EPOLL_URING_WAKE)
			poll->events |= EPOLLONESHOT;

		/* optional, saves extra locking for removal in tw handler */
		if (mask && poll->events & EPOLLONESHOT) {
			list_del_init(&poll->wait.entry);
			poll->head = NULL;
			if (wqe_is_double(wait))
				req->flags &= ~REQ_F_DOUBLE_POLL;
			else
				req->flags &= ~REQ_F_SINGLE_POLL;
		}
		__io_poll_execute(req, mask);
	}
	return 1;
}

/* fails only when polling is already completing by the first entry */
static bool io_poll_double_prepare(struct io_kiocb *req)
{
	struct wait_queue_head *head;
	struct io_poll *poll = io_poll_get_single(req);

	/* head is RCU protected, see io_poll_remove_entries() comments */
	rcu_read_lock();
	head = smp_load_acquire(&poll->head);
	/*
	 * poll arm might not hold ownership and so race for req->flags with
	 * io_poll_wake(). There is only one poll entry queued, serialise with
	 * it by taking its head lock. As we're still arming, the tw handler
	 * is not going to be run, so there are no races with it.
	 */
	if (head) {
		spin_lock_irq(&head->lock);
		req->flags |= REQ_F_DOUBLE_POLL;
		if (req->opcode == IORING_OP_POLL_ADD)
			req->flags |= REQ_F_ASYNC_DATA;
		spin_unlock_irq(&head->lock);
	}
	rcu_read_unlock();
	return !!head;
}

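/*
 * Called by vfs_poll() through the poll_table for each waitqueue the file
 * wants to add us to. The first entry uses the io_poll embedded in the
 * request, a second waitqueue gets a dynamically allocated entry, and a
 * third distinct waitqueue is rejected with -EINVAL.
 */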
static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
			    struct wait_queue_head *head,
			    struct io_poll **poll_ptr)
{
	struct io_kiocb *req = pt->req;
	unsigned long wqe_private = (unsigned long) req;

	/*
	 * The file being polled uses multiple waitqueues for poll handling
	 * (e.g. one for read, one for write). Setup a separate io_poll
	 * if this happens.
	 */
	if (unlikely(pt->nr_entries)) {
		struct io_poll *first = poll;

		/* double add on the same waitqueue head, ignore */
		if (first->head == head)
			return;
		/* already have a 2nd entry, fail a third attempt */
		if (*poll_ptr) {
			if ((*poll_ptr)->head == head)
				return;
			pt->error = -EINVAL;
			return;
		}

		poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
		if (!poll) {
			pt->error = -ENOMEM;
			return;
		}

		/* mark as double wq entry */
		wqe_private |= IO_WQE_F_DOUBLE;
		io_init_poll_iocb(poll, first->events);
		if (!io_poll_double_prepare(req)) {
			/* the request is completing, just back off */
			kfree(poll);
			return;
		}
		*poll_ptr = poll;
	} else {
		/* fine to modify, there is no poll queued to race with us */
		req->flags |= REQ_F_SINGLE_POLL;
	}

	pt->nr_entries++;
	poll->head = head;
	poll->wait.private = (void *) wqe_private;

	if (poll->events & EPOLLEXCLUSIVE)
		add_wait_queue_exclusive(head, &poll->wait);
	else
		add_wait_queue(head, &poll->wait);
}

static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
			       struct poll_table_struct *p)
{
	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
	struct io_poll *poll = io_kiocb_to_cmd(pt->req, struct io_poll);

	__io_queue_proc(poll, pt, head,
			(struct io_poll **) &pt->req->async_data);
}

static bool io_poll_can_finish_inline(struct io_kiocb *req,
				      struct io_poll_table *pt)
{
	return pt->owning || io_poll_get_ownership(req);
}

static void io_poll_add_hash(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;

	io_ring_submit_lock(ctx, issue_flags);
	io_poll_req_insert(req);
	io_ring_submit_unlock(ctx, issue_flags);
}

/*
 * Returns 0 when it's handed over for polling. The caller owns the request if
 * it returns non-zero, but otherwise should not touch it. Negative values
 * contain an error code. When the result is >0, the polling has completed
 * inline and ipt.result_mask is set to the mask.
 */
static int __io_arm_poll_handler(struct io_kiocb *req,
				 struct io_poll *poll,
				 struct io_poll_table *ipt, __poll_t mask,
				 unsigned issue_flags)
{
	INIT_HLIST_NODE(&req->hash_node);
	io_init_poll_iocb(poll, mask);
	poll->file = req->file;
	req->apoll_events = poll->events;

	ipt->pt._key = mask;
	ipt->req = req;
	ipt->error = 0;
	ipt->nr_entries = 0;
	/*
	 * Polling is either completed here or via task_work, so if we're in the
	 * task context we're naturally serialised with tw by merit of running
	 * the same task. When it's io-wq, take the ownership to prevent tw
	 * from running. However, when we're in the task context, skip taking
	 * it as an optimisation.
	 *
	 * Note: even though the request won't be completed/freed, without
	 * ownership we still can race with io_poll_wake().
	 * io_poll_can_finish_inline() tries to deal with that.
	 */
	ipt->owning = issue_flags & IO_URING_F_UNLOCKED;
	atomic_set(&req->poll_refs, (int)ipt->owning);

	/*
	 * Exclusive waits may only wake a limited amount of entries
	 * rather than all of them, this may interfere with lazy
	 * wake if someone does wait(events > 1). Ensure we don't do
	 * lazy wake for those, as we need to process each one as they
	 * come in.
	 */
	if (poll->events & EPOLLEXCLUSIVE)
		req->flags |= REQ_F_POLL_NO_LAZY;

	mask = vfs_poll(req->file, &ipt->pt) & poll->events;

	if (unlikely(ipt->error || !ipt->nr_entries)) {
		io_poll_remove_entries(req);

		if (!io_poll_can_finish_inline(req, ipt)) {
			io_poll_mark_cancelled(req);
			return 0;
		} else if (mask && (poll->events & EPOLLET)) {
			ipt->result_mask = mask;
			return 1;
		}
		return ipt->error ?: -EINVAL;
	}

	if (mask &&
	   ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) {
		if (!io_poll_can_finish_inline(req, ipt)) {
			io_poll_add_hash(req, issue_flags);
			return 0;
		}
		io_poll_remove_entries(req);
		ipt->result_mask = mask;
		/* no one else has access to the req, forget about the ref */
		return 1;
	}

	io_poll_add_hash(req, issue_flags);

	if (mask && (poll->events & EPOLLET) &&
	    io_poll_can_finish_inline(req, ipt)) {
		__io_poll_execute(req, mask);
		return 0;
	}
	io_napi_add(req);

	if (ipt->owning) {
		/*
		 * Try to release ownership. If we see a change of state, e.g.
		 * poll was woken up, queue up a tw, it'll deal with it.
		 */
		if (atomic_cmpxchg(&req->poll_refs, 1, 0) != 1)
			__io_poll_execute(req, 0);
	}
	return 0;
}

static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
				struct poll_table_struct *p)
{
	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
	struct async_poll *apoll = pt->req->apoll;

	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
}

/*
 * We can't reliably detect loops where poll triggers repeatedly and the issue
 * subsequently fails. But rather than fail these immediately, allow a certain
 * amount of retries before we give up. Given that this condition should
 * _rarely_ trigger even once, we should be fine with a larger value.
 */
#define APOLL_MAX_RETRY	128

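/*
 * Allocate the async_poll container, or reuse the one from a previous poll
 * attempt when REQ_F_POLLED is set. With the ring lock held, entries come
 * from the ctx apoll cache, otherwise fall back to an atomic kmalloc().
 * Returns NULL once the retry budget (APOLL_MAX_RETRY) is exhausted.
 */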
static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
					     unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct async_poll *apoll;

	if (req->flags & REQ_F_POLLED) {
		apoll = req->apoll;
		kfree(apoll->double_poll);
	} else {
		if (!(issue_flags & IO_URING_F_UNLOCKED))
			apoll = io_cache_alloc(&ctx->apoll_cache, GFP_ATOMIC);
		else
			apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
		if (!apoll)
			return NULL;
		apoll->poll.retries = APOLL_MAX_RETRY;
	}
	apoll->double_poll = NULL;
	req->apoll = apoll;
	if (unlikely(!--apoll->poll.retries))
		return NULL;
	return apoll;
}

int io_arm_apoll(struct io_kiocb *req, unsigned issue_flags, __poll_t mask)
{
	struct async_poll *apoll;
	struct io_poll_table ipt;
	int ret;

	mask |= EPOLLET;
	if (!io_file_can_poll(req))
		return IO_APOLL_ABORTED;
	if (!(req->flags & REQ_F_APOLL_MULTISHOT))
		mask |= EPOLLONESHOT;

	apoll = io_req_alloc_apoll(req, issue_flags);
	if (!apoll)
		return IO_APOLL_ABORTED;
	req->flags &= ~(REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL);
	req->flags |= REQ_F_POLLED;
	ipt.pt._qproc = io_async_queue_proc;

	ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags);
	if (ret)
		return ret > 0 ? IO_APOLL_READY : IO_APOLL_ABORTED;
	trace_io_uring_poll_arm(req, mask, apoll->poll.events);
	return IO_APOLL_OK;
}

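/*
 * Build the epoll mask from the opcode definition (pollin/pollout,
 * poll_exclusive) and hand off to io_arm_apoll() to wait for the file to
 * become ready.
 */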
int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
{
	const struct io_issue_def *def = &io_issue_defs[req->opcode];
	__poll_t mask = POLLPRI | POLLERR;

	if (!def->pollin && !def->pollout)
		return IO_APOLL_ABORTED;
	if (!io_file_can_poll(req))
		return IO_APOLL_ABORTED;

	if (def->pollin) {
		mask |= EPOLLIN | EPOLLRDNORM;

		/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
		if (req->flags & REQ_F_CLEAR_POLLIN)
			mask &= ~EPOLLIN;
	} else {
		mask |= EPOLLOUT | EPOLLWRNORM;
	}
	if (def->poll_exclusive)
		mask |= EPOLLEXCLUSIVE;

	return io_arm_apoll(req, issue_flags, mask);
}

/*
 * Returns true if we found and killed one or more poll requests
 */
__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx,
			       bool cancel_all)
{
	unsigned nr_buckets = 1U << ctx->cancel_table.hash_bits;
	struct hlist_node *tmp;
	struct io_kiocb *req;
	bool found = false;
	int i;

	lockdep_assert_held(&ctx->uring_lock);

	for (i = 0; i < nr_buckets; i++) {
		struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];

		hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) {
			if (io_match_task_safe(req, tctx, cancel_all)) {
				hlist_del_init(&req->hash_node);
				io_poll_cancel_req(req);
				found = true;
			}
		}
	}
	return found;
}

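/*
 * Look up a pending poll request by user_data in the cancellation hash
 * table. With IORING_ASYNC_CANCEL_ALL, requests that already matched this
 * cancel sequence are skipped.
 */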
static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
				     struct io_cancel_data *cd)
{
	struct io_kiocb *req;
	u32 index = hash_long(cd->data, ctx->cancel_table.hash_bits);
	struct io_hash_bucket *hb = &ctx->cancel_table.hbs[index];

	hlist_for_each_entry(req, &hb->list, hash_node) {
		if (cd->data != req->cqe.user_data)
			continue;
		if (poll_only && req->opcode != IORING_OP_POLL_ADD)
			continue;
		if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
			if (io_cancel_match_sequence(req, cd->seq))
				continue;
		}
		return req;
	}
	return NULL;
}

static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
					  struct io_cancel_data *cd)
{
	unsigned nr_buckets = 1U << ctx->cancel_table.hash_bits;
	struct io_kiocb *req;
	int i;

	for (i = 0; i < nr_buckets; i++) {
		struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];

		hlist_for_each_entry(req, &hb->list, hash_node) {
			if (io_cancel_req_match(req, cd))
				return req;
		}
	}
	return NULL;
}

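/*
 * Take ownership of a found poll request and remove it from its waitqueues
 * and the cancellation hash. Returns -EALREADY if someone else already owns
 * it, i.e. it is being completed or cancelled.
 */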
static int io_poll_disarm(struct io_kiocb *req)
{
	if (!req)
		return -ENOENT;
	if (!io_poll_get_ownership(req))
		return -EALREADY;
	io_poll_remove_entries(req);
	hash_del(&req->hash_node);
	return 0;
}

static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
{
	struct io_kiocb *req;

	if (cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP |
			 IORING_ASYNC_CANCEL_ANY))
		req = io_poll_file_find(ctx, cd);
	else
		req = io_poll_find(ctx, false, cd);

	if (req) {
		io_poll_cancel_req(req);
		return 0;
	}
	return -ENOENT;
}

int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
		   unsigned issue_flags)
{
	int ret;

	io_ring_submit_lock(ctx, issue_flags);
	ret = __io_poll_cancel(ctx, cd);
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}

static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
				     unsigned int flags)
{
	u32 events;

	events = READ_ONCE(sqe->poll32_events);
#ifdef __BIG_ENDIAN
	events = swahw32(events);
#endif
	if (!(flags & IORING_POLL_ADD_MULTI))
		events |= EPOLLONESHOT;
	if (!(flags & IORING_POLL_ADD_LEVEL))
		events |= EPOLLET;
	return demangle_poll(events) |
		(events & (EPOLLEXCLUSIVE|EPOLLONESHOT|EPOLLET));
}

int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_poll_update *upd = io_kiocb_to_cmd(req, struct io_poll_update);
	u32 flags;

	if (sqe->buf_index || sqe->splice_fd_in)
		return -EINVAL;
	flags = READ_ONCE(sqe->len);
	if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
		      IORING_POLL_ADD_MULTI))
		return -EINVAL;
	/* meaningless without update */
	if (flags == IORING_POLL_ADD_MULTI)
		return -EINVAL;

	upd->old_user_data = READ_ONCE(sqe->addr);
	upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
	upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;

	upd->new_user_data = READ_ONCE(sqe->off);
	if (!upd->update_user_data && upd->new_user_data)
		return -EINVAL;
	if (upd->update_events)
		upd->events = io_poll_parse_events(sqe, flags);
	else if (sqe->poll32_events)
		return -EINVAL;

	return 0;
}

int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll);
	u32 flags;

	if (sqe->buf_index || sqe->off || sqe->addr)
		return -EINVAL;
	flags = READ_ONCE(sqe->len);
	if (flags & ~IORING_POLL_ADD_MULTI)
		return -EINVAL;
	if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
		return -EINVAL;

	poll->events = io_poll_parse_events(sqe, flags);
	return 0;
}

int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll);
	struct io_poll_table ipt;
	int ret;

	ipt.pt._qproc = io_poll_queue_proc;

	ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags);
	if (ret > 0) {
		io_req_set_res(req, ipt.result_mask, 0);
		return IOU_COMPLETE;
	}
	return ret ?: IOU_ISSUE_SKIP_COMPLETE;
}

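/*
 * IORING_OP_POLL_REMOVE: find the target poll request by its old user_data,
 * disarm it, and either re-arm it with updated events/user_data or complete
 * it with -ECANCELED.
 */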
int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_cancel_data cd = { .ctx = ctx, .data = poll_update->old_user_data, };
	struct io_kiocb *preq;
	int ret2, ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	preq = io_poll_find(ctx, true, &cd);
	ret2 = io_poll_disarm(preq);
	if (ret2) {
		ret = ret2;
		goto out;
	}
	if (WARN_ON_ONCE(preq->opcode != IORING_OP_POLL_ADD)) {
		ret = -EFAULT;
		goto out;
	}

	if (poll_update->update_events || poll_update->update_user_data) {
		/* only update the event mask, keep the behavior flags */
		if (poll_update->update_events) {
			struct io_poll *poll = io_kiocb_to_cmd(preq, struct io_poll);

			poll->events &= ~0xffff;
			poll->events |= poll_update->events & 0xffff;
			poll->events |= IO_POLL_UNMASK;
		}
		if (poll_update->update_user_data)
			preq->cqe.user_data = poll_update->new_user_data;

		ret2 = io_poll_add(preq, issue_flags & ~IO_URING_F_UNLOCKED);
		/* successfully updated, don't complete poll request */
		if (!ret2 || ret2 == -EIOCBQUEUED)
			goto out;
	}

	req_set_fail(preq);
	io_req_set_res(preq, -ECANCELED, 0);
	preq->io_task_work.func = io_req_task_complete;
	io_req_task_work_add(preq);
out:
	io_ring_submit_unlock(ctx, issue_flags);
	if (ret < 0) {
		req_set_fail(req);
		return ret;
	}
	/* complete update request, we're done with it */
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}
| 957 | |