rw.c source code [Linux/io_uring/rw.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#include <linux/kernel.h>
3	#include <linux/errno.h>
4	#include <linux/fs.h>
5	#include <linux/file.h>
6	#include <linux/blk-mq.h>
7	#include <linux/mm.h>
8	#include <linux/slab.h>
9	#include <linux/fsnotify.h>
10	#include <linux/poll.h>
11	#include <linux/nospec.h>
12	#include <linux/compat.h>
13	#include <linux/io_uring/cmd.h>
14	#include <linux/indirect_call_wrapper.h>
15
16	#include <uapi/linux/io_uring.h>
17
18	#include "filetable.h"
19	#include "io_uring.h"
20	#include "opdef.h"
21	#include "kbuf.h"
22	#include "alloc_cache.h"
23	#include "rsrc.h"
24	#include "poll.h"
25	#include "rw.h"
26
/*
 * Forward declarations of the two ->ki_complete handlers; __io_prep_rw()
 * installs one of them depending on IORING_SETUP_IOPOLL.
 */
static void io_complete_rw(struct kiocb *kiocb, long res);
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res);
29
30	struct io_rw {
31	/ NOTE: kiocb has the file as the first member, so don't do it here /
32	struct kiocb kiocb;
33	u64 addr;
34	u32 len;
35	rwf_t flags;
36	};
37
38	static bool io_file_supports_nowait(struct io_kiocb *req, __poll_t mask)
39	{
40	/ If FMODE_NOWAIT is set for a file, we're golden /
41	if (req->flags & REQ_F_SUPPORT_NOWAIT)
42	return true;
43	/ No FMODE_NOWAIT, if we can poll, check the status /
44	if (io_file_can_poll(req)) {
45	struct poll_table_struct pt = { ._key = mask };
46
47	return vfs_poll(file: req->file, pt: &pt) & mask;
48	}
49	/ No FMODE_NOWAIT support, and file isn't pollable. Tough luck. /
50	return false;
51	}
52
53	static int io_iov_compat_buffer_select_prep(struct io_rw *rw)
54	{
55	struct compat_iovec __user *uiov = u64_to_user_ptr(rw->addr);
56	struct compat_iovec iov;
57
58	if (copy_from_user(to: &iov, from: uiov, n: sizeof(iov)))
59	return -EFAULT;
60	rw->len = iov.iov_len;
61	return `0`;
62	}
63
64	static int io_iov_buffer_select_prep(struct io_kiocb *req)
65	{
66	struct iovec __user *uiov;
67	struct iovec iov;
68	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
69
70	if (rw->len != `1`)
71	return -EINVAL;
72
73	if (io_is_compat(ctx: req->ctx))
74	return io_iov_compat_buffer_select_prep(rw);
75
76	uiov = u64_to_user_ptr(rw->addr);
77	if (copy_from_user(to: &iov, from: uiov, n: sizeof(*uiov)))
78	return -EFAULT;
79	rw->len = iov.iov_len;
80	return `0`;
81	}
82
83	static int io_import_vec(int ddir, struct io_kiocb *req,
84	struct io_async_rw *io,
85	const struct iovec __user *uvec,
86	size_t uvec_segs)
87	{
88	int ret, nr_segs;
89	struct iovec *iov;
90
91	if (io->vec.iovec) {
92	nr_segs = io->vec.nr;
93	iov = io->vec.iovec;
94	} else {
95	nr_segs = `1`;
96	iov = &io->fast_iov;
97	}
98
99	ret = __import_iovec(type: ddir, uvec, nr_segs: uvec_segs, fast_segs: nr_segs, iovp: &iov, i: &io->iter,
100	compat: io_is_compat(ctx: req->ctx));
101	if (unlikely(ret < `0`))
102	return ret;
103	if (iov) {
104	req->flags \|= REQ_F_NEED_CLEANUP;
105	io_vec_reset_iovec(iv: &io->vec, iovec: iov, nr: io->iter.nr_segs);
106	}
107	return `0`;
108	}
109
110	static int __io_import_rw_buffer(int ddir, struct io_kiocb *req,
111	struct io_async_rw io, struct* io_br_sel *sel,
112	unsigned int issue_flags)
113	{
114	const struct io_issue_def *def = &io_issue_defs[req->opcode];
115	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
116	size_t sqe_len = rw->len;
117
118	sel->addr = u64_to_user_ptr(rw->addr);
119	if (def->vectored && !(req->flags & REQ_F_BUFFER_SELECT))
120	return io_import_vec(ddir, req, io, uvec: sel->addr, uvec_segs: sqe_len);
121
122	if (io_do_buffer_select(req)) {
123	*sel = io_buffer_select(req, len: &sqe_len, buf_group: io->buf_group, issue_flags);
124	if (!sel->addr)
125	return -ENOBUFS;
126	rw->addr = (unsigned long) sel->addr;
127	rw->len = sqe_len;
128	}
129	return import_ubuf(type: ddir, buf: sel->addr, len: sqe_len, i: &io->iter);
130	}
131
132	static inline int io_import_rw_buffer(int rw, struct io_kiocb *req,
133	struct io_async_rw *io,
134	struct io_br_sel *sel,
135	unsigned int issue_flags)
136	{
137	int ret;
138
139	ret = __io_import_rw_buffer(ddir: rw, req, io, sel, issue_flags);
140	if (unlikely(ret < `0`))
141	return ret;
142
143	iov_iter_save_state(iter: &io->iter, state: &io->iter_state);
144	return `0`;
145	}
146
147	static void io_rw_recycle(struct io_kiocb req, unsigned* int issue_flags)
148	{
149	struct io_async_rw *rw = req->async_data;
150
151	if (unlikely(issue_flags & IO_URING_F_UNLOCKED))
152	return;
153
154	io_alloc_cache_vec_kasan(iv: &rw->vec);
155	if (rw->vec.nr > IO_VEC_CACHE_SOFT_CAP)
156	io_vec_free(iv: &rw->vec);
157
158	if (io_alloc_cache_put(cache: &req->ctx->rw_cache, entry: rw))
159	io_req_async_data_clear(req, extra_flags: `0`);
160	}
161
162	static void io_req_rw_cleanup(struct io_kiocb req, unsigned* int issue_flags)
163	{
164	/*
165	* Disable quick recycling for anything that's gone through io-wq.
166	* In theory, this should be fine to cleanup. However, some read or
167	* write iter handling touches the iovec AFTER having called into the
168	* handler, eg to reexpand or revert. This means we can have:
169	*
170	* task io-wq
171	* issue
172	* punt to io-wq
173	* issue
174	* blkdev_write_iter()
175	* ->ki_complete()
176	* io_complete_rw()
177	* queue tw complete
178	* run tw
179	* req_rw_cleanup
180	* iov_iter_count() <- look at iov_iter again
181	*
182	* which can lead to a UAF. This is only possible for io-wq offload
183	* as the cleanup can run in parallel. As io-wq is not the fast path,
184	* just leave cleanup to the end.
185	*
186	* This is really a bug in the core code that does this, any issue
187	* path should assume that a successful (or -EIOCBQUEUED) return can
188	* mean that the underlying data can be gone at any time. But that
189	* should be fixed seperately, and then this check could be killed.
190	*/
191	if (!(req->flags & (REQ_F_REISSUE \| REQ_F_REFCOUNT))) {
192	req->flags &= ~REQ_F_NEED_CLEANUP;
193	io_rw_recycle(req, issue_flags);
194	}
195	}
196
197	static int io_rw_alloc_async(struct io_kiocb *req)
198	{
199	struct io_ring_ctx *ctx = req->ctx;
200	struct io_async_rw *rw;
201
202	rw = io_uring_alloc_async_data(cache: &ctx->rw_cache, req);
203	if (!rw)
204	return -ENOMEM;
205	if (rw->vec.iovec)
206	req->flags \|= REQ_F_NEED_CLEANUP;
207	rw->bytes_done = `0`;
208	return `0`;
209	}
210
211	static inline void io_meta_save_state(struct io_async_rw *io)
212	{
213	io->meta_state.seed = io->meta.seed;
214	iov_iter_save_state(iter: &io->meta.iter, state: &io->meta_state.iter_meta);
215	}
216
217	static inline void io_meta_restore(struct io_async_rw io, struct* kiocb *kiocb)
218	{
219	if (kiocb->ki_flags & IOCB_HAS_METADATA) {
220	io->meta.seed = io->meta_state.seed;
221	iov_iter_restore(i: &io->meta.iter, state: &io->meta_state.iter_meta);
222	}
223	}
224
225	static int io_prep_rw_pi(struct io_kiocb req, struct* io_rw rw, int* ddir,
226	u64 attr_ptr, u64 attr_type_mask)
227	{
228	struct io_uring_attr_pi pi_attr;
229	struct io_async_rw *io;
230	int ret;
231
232	if (copy_from_user(to: &pi_attr, u64_to_user_ptr(attr_ptr),
233	n: sizeof(pi_attr)))
234	return -EFAULT;
235
236	if (pi_attr.rsvd)
237	return -EINVAL;
238
239	io = req->async_data;
240	io->meta.flags = pi_attr.flags;
241	io->meta.app_tag = pi_attr.app_tag;
242	io->meta.seed = pi_attr.seed;
243	ret = import_ubuf(type: ddir, u64_to_user_ptr(pi_attr.addr),
244	len: pi_attr.len, i: &io->meta.iter);
245	if (unlikely(ret < `0`))
246	return ret;
247	req->flags \|= REQ_F_HAS_METADATA;
248	io_meta_save_state(io);
249	return ret;
250	}
251
252	static int __io_prep_rw(struct io_kiocb req, const* struct io_uring_sqe *sqe,
253	int ddir)
254	{
255	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
256	struct io_async_rw *io;
257	unsigned ioprio;
258	u64 attr_type_mask;
259	int ret;
260
261	if (io_rw_alloc_async(req))
262	return -ENOMEM;
263	io = req->async_data;
264
265	rw->kiocb.ki_pos = READ_ONCE(sqe->off);
266	/ used for fixed read/write too - just read unconditionally /
267	req->buf_index = READ_ONCE(sqe->buf_index);
268	io->buf_group = req->buf_index;
269
270	ioprio = READ_ONCE(sqe->ioprio);
271	if (ioprio) {
272	ret = ioprio_check_cap(ioprio);
273	if (ret)
274	return ret;
275
276	rw->kiocb.ki_ioprio = ioprio;
277	} else {
278	rw->kiocb.ki_ioprio = get_current_ioprio();
279	}
280	rw->kiocb.dio_complete = NULL;
281	rw->kiocb.ki_flags = `0`;
282	rw->kiocb.ki_write_stream = READ_ONCE(sqe->write_stream);
283
284	if (req->ctx->flags & IORING_SETUP_IOPOLL)
285	rw->kiocb.ki_complete = io_complete_rw_iopoll;
286	else
287	rw->kiocb.ki_complete = io_complete_rw;
288
289	rw->addr = READ_ONCE(sqe->addr);
290	rw->len = READ_ONCE(sqe->len);
291	rw->flags = (__force rwf_t) READ_ONCE(sqe->rw_flags);
292
293	attr_type_mask = READ_ONCE(sqe->attr_type_mask);
294	if (attr_type_mask) {
295	u64 attr_ptr;
296
297	/ only PI attribute is supported currently /
298	if (attr_type_mask != IORING_RW_ATTR_FLAG_PI)
299	return -EINVAL;
300
301	attr_ptr = READ_ONCE(sqe->attr_ptr);
302	return io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask);
303	}
304	return `0`;
305	}
306
307	static int io_rw_do_import(struct io_kiocb req, int* ddir)
308	{
309	struct io_br_sel sel = { };
310
311	if (io_do_buffer_select(req))
312	return `0`;
313
314	return io_import_rw_buffer(rw: ddir, req, io: req->async_data, sel: &sel, issue_flags: `0`);
315	}
316
/*
 * Prep for non-fixed reads/writes: parse the SQE, then import the buffer.
 * Returns the first non-zero result of the two steps, or 0.
 */
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
		      int ddir)
{
	int ret;

	ret = __io_prep_rw(req, sqe, ddir);
	if (unlikely(ret))
		return ret;

	return io_rw_do_import(req, ddir);
}
328
329	int io_prep_read(struct io_kiocb req, const* struct io_uring_sqe *sqe)
330	{
331	return io_prep_rw(req, sqe, ITER_DEST);
332	}
333
334	int io_prep_write(struct io_kiocb req, const* struct io_uring_sqe *sqe)
335	{
336	return io_prep_rw(req, sqe, ITER_SOURCE);
337	}
338
339	static int io_prep_rwv(struct io_kiocb req, const* struct io_uring_sqe *sqe,
340	int ddir)
341	{
342	int ret;
343
344	ret = io_prep_rw(req, sqe, ddir);
345	if (unlikely(ret))
346	return ret;
347	if (!(req->flags & REQ_F_BUFFER_SELECT))
348	return `0`;
349
350	/*
351	* Have to do this validation here, as this is in io_read() rw->len
352	* might have chanaged due to buffer selection
353	*/
354	return io_iov_buffer_select_prep(req);
355	}
356
357	int io_prep_readv(struct io_kiocb req, const* struct io_uring_sqe *sqe)
358	{
359	return io_prep_rwv(req, sqe, ITER_DEST);
360	}
361
362	int io_prep_writev(struct io_kiocb req, const* struct io_uring_sqe *sqe)
363	{
364	return io_prep_rwv(req, sqe, ITER_SOURCE);
365	}
366
367	static int io_init_rw_fixed(struct io_kiocb req, unsigned* int issue_flags,
368	int ddir)
369	{
370	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
371	struct io_async_rw *io = req->async_data;
372	int ret;
373
374	if (io->bytes_done)
375	return `0`;
376
377	ret = io_import_reg_buf(req, iter: &io->iter, buf_addr: rw->addr, len: rw->len, ddir,
378	issue_flags);
379	iov_iter_save_state(iter: &io->iter, state: &io->iter_state);
380	return ret;
381	}
382
383	int io_prep_read_fixed(struct io_kiocb req, const* struct io_uring_sqe *sqe)
384	{
385	return __io_prep_rw(req, sqe, ITER_DEST);
386	}
387
388	int io_prep_write_fixed(struct io_kiocb req, const* struct io_uring_sqe *sqe)
389	{
390	return __io_prep_rw(req, sqe, ITER_SOURCE);
391	}
392
393	static int io_rw_import_reg_vec(struct io_kiocb *req,
394	struct io_async_rw *io,
395	int ddir, unsigned int issue_flags)
396	{
397	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
398	unsigned uvec_segs = rw->len;
399	int ret;
400
401	ret = io_import_reg_vec(ddir, iter: &io->iter, req, vec: &io->vec,
402	nr_iovs: uvec_segs, issue_flags);
403	if (unlikely(ret))
404	return ret;
405	iov_iter_save_state(iter: &io->iter, state: &io->iter_state);
406	req->flags &= ~REQ_F_IMPORT_BUFFER;
407	return `0`;
408	}
409
410	static int io_rw_prep_reg_vec(struct io_kiocb *req)
411	{
412	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
413	struct io_async_rw *io = req->async_data;
414	const struct iovec __user *uvec;
415
416	uvec = u64_to_user_ptr(rw->addr);
417	return io_prep_reg_iovec(req, iv: &io->vec, uvec, uvec_segs: rw->len);
418	}
419
420	int io_prep_readv_fixed(struct io_kiocb req, const* struct io_uring_sqe *sqe)
421	{
422	int ret;
423
424	ret = __io_prep_rw(req, sqe, ITER_DEST);
425	if (unlikely(ret))
426	return ret;
427	return io_rw_prep_reg_vec(req);
428	}
429
430	int io_prep_writev_fixed(struct io_kiocb req, const* struct io_uring_sqe *sqe)
431	{
432	int ret;
433
434	ret = __io_prep_rw(req, sqe, ITER_SOURCE);
435	if (unlikely(ret))
436	return ret;
437	return io_rw_prep_reg_vec(req);
438	}
439
440	/*
441	* Multishot read is prepared just like a normal read/write request, only
442	* difference is that we set the MULTISHOT flag.
443	*/
444	int io_read_mshot_prep(struct io_kiocb req, const* struct io_uring_sqe *sqe)
445	{
446	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
447	int ret;
448
449	/ must be used with provided buffers /
450	if (!(req->flags & REQ_F_BUFFER_SELECT))
451	return -EINVAL;
452
453	ret = __io_prep_rw(req, sqe, ITER_DEST);
454	if (unlikely(ret))
455	return ret;
456
457	if (rw->addr \|\| rw->len)
458	return -EINVAL;
459
460	req->flags \|= REQ_F_APOLL_MULTISHOT;
461	return `0`;
462	}
463
464	void io_readv_writev_cleanup(struct io_kiocb *req)
465	{
466	lockdep_assert_held(&req->ctx->uring_lock);
467	io_rw_recycle(req, issue_flags: `0`);
468	}
469
470	static inline loff_t io_kiocb_update_pos(struct* io_kiocb *req)
471	{
472	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
473
474	if (rw->kiocb.ki_pos != -`1`)
475	return &rw->kiocb.ki_pos;
476
477	if (!(req->file->f_mode & FMODE_STREAM)) {
478	req->flags \|= REQ_F_CUR_POS;
479	rw->kiocb.ki_pos = req->file->f_pos;
480	return &rw->kiocb.ki_pos;
481	}
482
483	rw->kiocb.ki_pos = `0`;
484	return NULL;
485	}
486
487	static bool io_rw_should_reissue(struct io_kiocb *req)
488	{
489	#ifdef CONFIG_BLOCK
490	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
491	umode_t mode = file_inode(f: req->file)->i_mode;
492	struct io_async_rw *io = req->async_data;
493	struct io_ring_ctx *ctx = req->ctx;
494
495	if (!S_ISBLK(mode) && !S_ISREG(mode))
496	return false;
497	if ((req->flags & REQ_F_NOWAIT) \|\| (io_wq_current_is_worker() &&
498	!(ctx->flags & IORING_SETUP_IOPOLL)))
499	return false;
500	/*
501	* If ref is dying, we might be running poll reap from the exit work.
502	* Don't attempt to reissue from that path, just let it fail with
503	* -EAGAIN.
504	*/
505	if (percpu_ref_is_dying(ref: &ctx->refs))
506	return false;
507
508	io_meta_restore(io, kiocb: &rw->kiocb);
509	iov_iter_restore(i: &io->iter, state: &io->iter_state);
510	return true;
511	#else
512	return false;
513	#endif
514	}
515
516	static void io_req_end_write(struct io_kiocb *req)
517	{
518	if (req->flags & REQ_F_ISREG) {
519	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
520
521	kiocb_end_write(iocb: &rw->kiocb);
522	}
523	}
524
525	/*
526	* Trigger the notifications after having done some IO, and finish the write
527	* accounting, if any.
528	*/
529	static void io_req_io_end(struct io_kiocb *req)
530	{
531	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
532
533	if (rw->kiocb.ki_flags & IOCB_WRITE) {
534	io_req_end_write(req);
535	fsnotify_modify(file: req->file);
536	} else {
537	fsnotify_access(file: req->file);
538	}
539	}
540
541	static void __io_complete_rw_common(struct io_kiocb req, long* res)
542	{
543	if (res == req->cqe.res)
544	return;
545	if (res == -EAGAIN && io_rw_should_reissue(req)) {
546	req->flags \|= REQ_F_REISSUE \| REQ_F_BL_NO_RECYCLE;
547	} else {
548	req_set_fail(req);
549	req->cqe.res = res;
550	}
551	}
552
553	static inline int io_fixup_rw_res(struct io_kiocb req, long* res)
554	{
555	struct io_async_rw *io = req->async_data;
556
557	/ add previously done IO, if any /
558	if (req_has_async_data(req) && io->bytes_done > `0`) {
559	if (res < `0`)
560	res = io->bytes_done;
561	else
562	res += io->bytes_done;
563	}
564	return res;
565	}
566
567	void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw)
568	{
569	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
570	struct kiocb *kiocb = &rw->kiocb;
571
572	if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) {
573	long res = kiocb->dio_complete(rw->kiocb.private);
574
575	io_req_set_res(req, res: io_fixup_rw_res(req, res), cflags: `0`);
576	}
577
578	io_req_io_end(req);
579
580	if (req->flags & (REQ_F_BUFFER_SELECTED\|REQ_F_BUFFER_RING))
581	req->cqe.flags \|= io_put_kbuf(req, len: req->cqe.res, NULL);
582
583	io_req_rw_cleanup(req, issue_flags: `0`);
584	io_req_task_complete(req, tw);
585	}
586
587	static void io_complete_rw(struct kiocb kiocb, long* res)
588	{
589	struct io_rw rw = container_of(kiocb, struct* io_rw, kiocb);
590	struct io_kiocb *req = cmd_to_io_kiocb(ptr: rw);
591
592	if (!kiocb->dio_complete \|\| !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) {
593	__io_complete_rw_common(req, res);
594	io_req_set_res(req, res: io_fixup_rw_res(req, res), cflags: `0`);
595	}
596	req->io_task_work.func = io_req_rw_complete;
597	__io_req_task_work_add(req, flags: IOU_F_TWQ_LAZY_WAKE);
598	}
599
600	static void io_complete_rw_iopoll(struct kiocb kiocb, long* res)
601	{
602	struct io_rw rw = container_of(kiocb, struct* io_rw, kiocb);
603	struct io_kiocb *req = cmd_to_io_kiocb(ptr: rw);
604
605	if (kiocb->ki_flags & IOCB_WRITE)
606	io_req_end_write(req);
607	if (unlikely(res != req->cqe.res)) {
608	if (res == -EAGAIN && io_rw_should_reissue(req))
609	req->flags \|= REQ_F_REISSUE \| REQ_F_BL_NO_RECYCLE;
610	else
611	req->cqe.res = res;
612	}
613
614	/ order with io_iopoll_complete() checking ->iopoll_completed /
615	smp_store_release(&req->iopoll_completed, `1`);
616	}
617
618	static inline void io_rw_done(struct io_kiocb *req, ssize_t ret)
619	{
620	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
621
622	/ IO was queued async, completion will happen later /
623	if (ret == -EIOCBQUEUED)
624	return;
625
626	/ transform internal restart error codes /
627	if (unlikely(ret < `0`)) {
628	switch (ret) {
629	case -ERESTARTSYS:
630	case -ERESTARTNOINTR:
631	case -ERESTARTNOHAND:
632	case -ERESTART_RESTARTBLOCK:
633	/*
634	* We can't just restart the syscall, since previously
635	* submitted sqes may already be in progress. Just fail
636	* this IO with EINTR.
637	*/
638	ret = -EINTR;
639	break;
640	}
641	}
642
643	if (req->ctx->flags & IORING_SETUP_IOPOLL)
644	io_complete_rw_iopoll(kiocb: &rw->kiocb, res: ret);
645	else
646	io_complete_rw(kiocb: &rw->kiocb, res: ret);
647	}
648
649	static int kiocb_done(struct io_kiocb *req, ssize_t ret,
650	struct io_br_sel sel, unsigned* int issue_flags)
651	{
652	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
653	unsigned final_ret = io_fixup_rw_res(req, res: ret);
654
655	if (ret >= `0` && req->flags & REQ_F_CUR_POS)
656	req->file->f_pos = rw->kiocb.ki_pos;
657	if (ret >= `0` && !(req->ctx->flags & IORING_SETUP_IOPOLL)) {
658	__io_complete_rw_common(req, res: ret);
659	/*
660	* Safe to call io_end from here as we're inline
661	* from the submission path.
662	*/
663	io_req_io_end(req);
664	io_req_set_res(req, res: final_ret, cflags: io_put_kbuf(req, len: ret, bl: sel->buf_list));
665	io_req_rw_cleanup(req, issue_flags);
666	return IOU_COMPLETE;
667	} else {
668	io_rw_done(req, ret);
669	}
670
671	return IOU_ISSUE_SKIP_COMPLETE;
672	}
673
674	static inline loff_t io_kiocb_ppos(struct* kiocb *kiocb)
675	{
676	return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
677	}
678
679	/*
680	* For files that don't have ->read_iter() and ->write_iter(), handle them
681	* by looping over ->read() or ->write() manually.
682	*/
683	static ssize_t loop_rw_iter(int ddir, struct io_rw rw, struct* iov_iter *iter)
684	{
685	struct io_kiocb *req = cmd_to_io_kiocb(ptr: rw);
686	struct kiocb *kiocb = &rw->kiocb;
687	struct file *file = kiocb->ki_filp;
688	ssize_t ret = `0`;
689	loff_t *ppos;
690
691	/*
692	* Don't support polled IO through this interface, and we can't
693	* support non-blocking either. For the latter, this just causes
694	* the kiocb to be handled from an async context.
695	*/
696	if (kiocb->ki_flags & IOCB_HIPRI)
697	return -EOPNOTSUPP;
698	if ((kiocb->ki_flags & IOCB_NOWAIT) &&
699	!(kiocb->ki_filp->f_flags & O_NONBLOCK))
700	return -EAGAIN;
701	if ((req->flags & REQ_F_BUF_NODE) && req->buf_node->buf->is_kbuf)
702	return -EFAULT;
703
704	ppos = io_kiocb_ppos(kiocb);
705
706	while (iov_iter_count(i: iter)) {
707	void __user *addr;
708	size_t len;
709	ssize_t nr;
710
711	if (iter_is_ubuf(i: iter)) {
712	addr = iter->ubuf + iter->iov_offset;
713	len = iov_iter_count(i: iter);
714	} else if (!iov_iter_is_bvec(i: iter)) {
715	addr = iter_iov_addr(iter);
716	len = iter_iov_len(i: iter);
717	} else {
718	addr = u64_to_user_ptr(rw->addr);
719	len = rw->len;
720	}
721
722	if (ddir == READ)
723	nr = file->f_op->read(file, addr, len, ppos);
724	else
725	nr = file->f_op->write(file, addr, len, ppos);
726
727	if (nr < `0`) {
728	if (!ret)
729	ret = nr;
730	break;
731	}
732	ret += nr;
733	if (!iov_iter_is_bvec(i: iter)) {
734	iov_iter_advance(i: iter, bytes: nr);
735	} else {
736	rw->addr += nr;
737	rw->len -= nr;
738	if (!rw->len)
739	break;
740	}
741	if (nr != len)
742	break;
743	}
744
745	return ret;
746	}
747
748	/*
749	* This is our waitqueue callback handler, registered through __folio_lock_async()
750	* when we initially tried to do the IO with the iocb armed our waitqueue.
751	* This gets called when the page is unlocked, and we generally expect that to
752	* happen when the page IO is completed and the page is now uptodate. This will
753	* queue a task_work based retry of the operation, attempting to copy the data
754	* again. If the latter fails because the page was NOT uptodate, then we will
755	* do a thread based blocking retry of the operation. That's the unexpected
756	* slow path.
757	*/
758	static int io_async_buf_func(struct wait_queue_entry wait, unsigned* mode,
759	int sync, void *arg)
760	{
761	struct wait_page_queue *wpq;
762	struct io_kiocb *req = wait->private;
763	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
764	struct wait_page_key *key = arg;
765
766	wpq = container_of(wait, struct wait_page_queue, wait);
767
768	if (!wake_page_match(wait_page: wpq, key))
769	return `0`;
770
771	rw->kiocb.ki_flags &= ~IOCB_WAITQ;
772	list_del_init(entry: &wait->entry);
773	io_req_task_queue(req);
774	return `1`;
775	}
776
777	/*
778	* This controls whether a given IO request should be armed for async page
779	* based retry. If we return false here, the request is handed to the async
780	* worker threads for retry. If we're doing buffered reads on a regular file,
781	* we prepare a private wait_page_queue entry and retry the operation. This
782	* will either succeed because the page is now uptodate and unlocked, or it
783	* will register a callback when the page is unlocked at IO completion. Through
784	* that callback, io_uring uses task_work to setup a retry of the operation.
785	* That retry will attempt the buffered read again. The retry will generally
786	* succeed, or in rare cases where it fails, we then fall back to using the
787	* async worker threads for a blocking retry.
788	*/
789	static bool io_rw_should_retry(struct io_kiocb *req)
790	{
791	struct io_async_rw *io = req->async_data;
792	struct wait_page_queue *wait = &io->wpq;
793	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
794	struct kiocb *kiocb = &rw->kiocb;
795
796	/*
797	* Never retry for NOWAIT or a request with metadata, we just complete
798	* with -EAGAIN.
799	*/
800	if (req->flags & (REQ_F_NOWAIT \| REQ_F_HAS_METADATA))
801	return false;
802
803	/ Only for buffered IO /
804	if (kiocb->ki_flags & (IOCB_DIRECT \| IOCB_HIPRI))
805	return false;
806
807	/*
808	* just use poll if we can, and don't attempt if the fs doesn't
809	* support callback based unlocks
810	*/
811	if (io_file_can_poll(req) \|\|
812	!(req->file->f_op->fop_flags & FOP_BUFFER_RASYNC))
813	return false;
814
815	wait->wait.func = io_async_buf_func;
816	wait->wait.private = req;
817	wait->wait.flags = `0`;
818	INIT_LIST_HEAD(list: &wait->wait.entry);
819	kiocb->ki_flags \|= IOCB_WAITQ;
820	kiocb->ki_flags &= ~IOCB_NOWAIT;
821	kiocb->ki_waitq = wait;
822	return true;
823	}
824
825	static inline int io_iter_do_read(struct io_rw rw, struct* iov_iter *iter)
826	{
827	struct file *file = rw->kiocb.ki_filp;
828
829	if (likely(file->f_op->read_iter))
830	return file->f_op->read_iter(&rw->kiocb, iter);
831	else if (file->f_op->read)
832	return loop_rw_iter(READ, rw, iter);
833	else
834	return -EINVAL;
835	}
836
837	static bool need_complete_io(struct io_kiocb *req)
838	{
839	return req->flags & REQ_F_ISREG \|\|
840	S_ISBLK(file_inode(req->file)->i_mode);
841	}
842
843	static int io_rw_init_file(struct io_kiocb req, fmode_t mode, int* rw_type)
844	{
845	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
846	struct kiocb *kiocb = &rw->kiocb;
847	struct io_ring_ctx *ctx = req->ctx;
848	struct file *file = req->file;
849	int ret;
850
851	if (unlikely(!(file->f_mode & mode)))
852	return -EBADF;
853
854	if (!(req->flags & REQ_F_FIXED_FILE))
855	req->flags \|= io_file_get_flags(file);
856
857	kiocb->ki_flags = file->f_iocb_flags;
858	ret = kiocb_set_rw_flags(ki: kiocb, flags: rw->flags, rw_type);
859	if (unlikely(ret))
860	return ret;
861	kiocb->ki_flags \|= IOCB_ALLOC_CACHE;
862
863	/*
864	* If the file is marked O_NONBLOCK, still allow retry for it if it
865	* supports async. Otherwise it's impossible to use O_NONBLOCK files
866	* reliably. If not, or it IOCB_NOWAIT is set, don't retry.
867	*/
868	if (kiocb->ki_flags & IOCB_NOWAIT \|\|
869	((file->f_flags & O_NONBLOCK && !(req->flags & REQ_F_SUPPORT_NOWAIT))))
870	req->flags \|= REQ_F_NOWAIT;
871
872	if (ctx->flags & IORING_SETUP_IOPOLL) {
873	if (!(kiocb->ki_flags & IOCB_DIRECT) \|\| !file->f_op->iopoll)
874	return -EOPNOTSUPP;
875	kiocb->private = NULL;
876	kiocb->ki_flags \|= IOCB_HIPRI;
877	req->iopoll_completed = `0`;
878	if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
879	/ make sure every req only blocks once/
880	req->flags &= ~REQ_F_IOPOLL_STATE;
881	req->iopoll_start = ktime_get_ns();
882	}
883	} else {
884	if (kiocb->ki_flags & IOCB_HIPRI)
885	return -EINVAL;
886	}
887
888	if (req->flags & REQ_F_HAS_METADATA) {
889	struct io_async_rw *io = req->async_data;
890
891	if (!(file->f_mode & FMODE_HAS_METADATA))
892	return -EINVAL;
893
894	/*
895	* We have a union of meta fields with wpq used for buffered-io
896	* in io_async_rw, so fail it here.
897	*/
898	if (!(req->file->f_flags & O_DIRECT))
899	return -EOPNOTSUPP;
900	kiocb->ki_flags \|= IOCB_HAS_METADATA;
901	kiocb->private = &io->meta;
902	}
903
904	return `0`;
905	}
906
907	static int __io_read(struct io_kiocb req, struct* io_br_sel *sel,
908	unsigned int issue_flags)
909	{
910	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
911	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
912	struct io_async_rw *io = req->async_data;
913	struct kiocb *kiocb = &rw->kiocb;
914	ssize_t ret;
915	loff_t *ppos;
916
917	if (req->flags & REQ_F_IMPORT_BUFFER) {
918	ret = io_rw_import_reg_vec(req, io, ITER_DEST, issue_flags);
919	if (unlikely(ret))
920	return ret;
921	} else if (io_do_buffer_select(req)) {
922	ret = io_import_rw_buffer(ITER_DEST, req, io, sel, issue_flags);
923	if (unlikely(ret < `0`))
924	return ret;
925	}
926	ret = io_rw_init_file(req, FMODE_READ, READ);
927	if (unlikely(ret))
928	return ret;
929	req->cqe.res = iov_iter_count(i: &io->iter);
930
931	if (force_nonblock) {
932	/ If the file doesn't support async, just async punt /
933	if (unlikely(!io_file_supports_nowait(req, EPOLLIN)))
934	return -EAGAIN;
935	kiocb->ki_flags \|= IOCB_NOWAIT;
936	} else {
937	/ Ensure we clear previously set non-block flag /
938	kiocb->ki_flags &= ~IOCB_NOWAIT;
939	}
940
941	ppos = io_kiocb_update_pos(req);
942
943	ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
944	if (unlikely(ret))
945	return ret;
946
947	ret = io_iter_do_read(rw, iter: &io->iter);
948
949	/*
950	* Some file systems like to return -EOPNOTSUPP for an IOCB_NOWAIT
951	* issue, even though they should be returning -EAGAIN. To be safe,
952	* retry from blocking context for either.
953	*/
954	if (ret == -EOPNOTSUPP && force_nonblock)
955	ret = -EAGAIN;
956
957	if (ret == -EAGAIN) {
958	/ If we can poll, just do that. /
959	if (io_file_can_poll(req))
960	return -EAGAIN;
961	/ IOPOLL retry should happen for io-wq threads /
962	if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
963	goto done;
964	/ no retry on NONBLOCK nor RWF_NOWAIT /
965	if (req->flags & REQ_F_NOWAIT)
966	goto done;
967	ret = `0`;
968	} else if (ret == -EIOCBQUEUED) {
969	return IOU_ISSUE_SKIP_COMPLETE;
970	} else if (ret == req->cqe.res \|\| ret <= `0` \|\| !force_nonblock \|\|
971	(req->flags & REQ_F_NOWAIT) \|\| !need_complete_io(req) \|\|
972	(issue_flags & IO_URING_F_MULTISHOT)) {
973	/ read all, failed, already did sync or don't want to retry /
974	goto done;
975	}
976
977	/*
978	* Don't depend on the iter state matching what was consumed, or being
979	* untouched in case of error. Restore it and we'll advance it
980	* manually if we need to.
981	*/
982	iov_iter_restore(i: &io->iter, state: &io->iter_state);
983	io_meta_restore(io, kiocb);
984
985	do {
986	/*
987	* We end up here because of a partial read, either from
988	* above or inside this loop. Advance the iter by the bytes
989	* that were consumed.
990	*/
991	iov_iter_advance(i: &io->iter, bytes: ret);
992	if (!iov_iter_count(i: &io->iter))
993	break;
994	io->bytes_done += ret;
995	iov_iter_save_state(iter: &io->iter, state: &io->iter_state);
996
997	/ if we can retry, do so with the callbacks armed /
998	if (!io_rw_should_retry(req)) {
999	kiocb->ki_flags &= ~IOCB_WAITQ;
1000	return -EAGAIN;
1001	}
1002
1003	req->cqe.res = iov_iter_count(i: &io->iter);
1004	/*
1005	* Now retry read with the IOCB_WAITQ parts set in the iocb. If
1006	* we get -EIOCBQUEUED, then we'll get a notification when the
1007	* desired page gets unlocked. We can also get a partial read
1008	* here, and if we do, then just retry at the new offset.
1009	*/
1010	ret = io_iter_do_read(rw, iter: &io->iter);
1011	if (ret == -EIOCBQUEUED)
1012	return IOU_ISSUE_SKIP_COMPLETE;
1013	/ we got some bytes, but not all. retry. /
1014	kiocb->ki_flags &= ~IOCB_WAITQ;
1015	iov_iter_restore(i: &io->iter, state: &io->iter_state);
1016	} while (ret > `0`);
1017	done:
1018	/ it's faster to check here then delegate to kfree /
1019	return ret;
1020	}
1021
1022	int io_read(struct io_kiocb req, unsigned* int issue_flags)
1023	{
1024	struct io_br_sel sel = { };
1025	int ret;
1026
1027	ret = __io_read(req, sel: &sel, issue_flags);
1028	if (ret >= `0`)
1029	return kiocb_done(req, ret, sel: &sel, issue_flags);
1030
1031	if (req->flags & REQ_F_BUFFERS_COMMIT)
1032	io_kbuf_recycle(req, bl: sel.buf_list, issue_flags);
1033	return ret;
1034	}
1035
1036	int io_read_mshot(struct io_kiocb req, unsigned* int issue_flags)
1037	{
1038	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
1039	struct io_br_sel sel = { };
1040	unsigned int cflags = `0`;
1041	int ret;
1042
1043	/*
1044	* Multishot MUST be used on a pollable file
1045	*/
1046	if (!io_file_can_poll(req))
1047	return -EBADFD;
1048
1049	/ make it sync, multishot doesn't support async execution /
1050	rw->kiocb.ki_complete = NULL;
1051	ret = __io_read(req, sel: &sel, issue_flags);
1052
1053	/*
1054	* If we get -EAGAIN, recycle our buffer and just let normal poll
1055	* handling arm it.
1056	*/
1057	if (ret == -EAGAIN) {
1058	/*
1059	* Reset rw->len to 0 again to avoid clamping future mshot
1060	* reads, in case the buffer size varies.
1061	*/
1062	if (io_kbuf_recycle(req, bl: sel.buf_list, issue_flags))
1063	rw->len = `0`;
1064	return IOU_RETRY;
1065	} else if (ret <= `0`) {
1066	io_kbuf_recycle(req, bl: sel.buf_list, issue_flags);
1067	if (ret < `0`)
1068	req_set_fail(req);
1069	} else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
1070	cflags = io_put_kbuf(req, len: ret, bl: sel.buf_list);
1071	} else {
1072	/*
1073	* Any successful return value will keep the multishot read
1074	* armed, if it's still set. Put our buffer and post a CQE. If
1075	* we fail to post a CQE, or multishot is no longer set, then
1076	* jump to the termination path. This request is then done.
1077	*/
1078	cflags = io_put_kbuf(req, len: ret, bl: sel.buf_list);
1079	rw->len = `0`; / similarly to above, reset len to 0 /
1080
1081	if (io_req_post_cqe(req, res: ret, cflags: cflags \| IORING_CQE_F_MORE)) {
1082	if (issue_flags & IO_URING_F_MULTISHOT)
1083	/*
1084	* Force retry, as we might have more data to
1085	* be read and otherwise it won't get retried
1086	* until (if ever) another poll is triggered.
1087	*/
1088	io_poll_multishot_retry(req);
1089
1090	return IOU_RETRY;
1091	}
1092	}
1093
1094	/*
1095	* Either an error, or we've hit overflow posting the CQE. For any
1096	* multishot request, hitting overflow will terminate it.
1097	*/
1098	io_req_set_res(req, res: ret, cflags);
1099	io_req_rw_cleanup(req, issue_flags);
1100	return IOU_COMPLETE;
1101	}
1102
1103	static bool io_kiocb_start_write(struct io_kiocb req, struct* kiocb *kiocb)
1104	{
1105	struct inode *inode;
1106	bool ret;
1107
1108	if (!(req->flags & REQ_F_ISREG))
1109	return true;
1110	if (!(kiocb->ki_flags & IOCB_NOWAIT)) {
1111	kiocb_start_write(iocb: kiocb);
1112	return true;
1113	}
1114
1115	inode = file_inode(f: kiocb->ki_filp);
1116	ret = sb_start_write_trylock(sb: inode->i_sb);
1117	if (ret)
1118	__sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
1119	return ret;
1120	}
1121
1122	int io_write(struct io_kiocb req, unsigned* int issue_flags)
1123	{
1124	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1125	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
1126	struct io_async_rw *io = req->async_data;
1127	struct kiocb *kiocb = &rw->kiocb;
1128	ssize_t ret, ret2;
1129	loff_t *ppos;
1130
1131	if (req->flags & REQ_F_IMPORT_BUFFER) {
1132	ret = io_rw_import_reg_vec(req, io, ITER_SOURCE, issue_flags);
1133	if (unlikely(ret))
1134	return ret;
1135	}
1136
1137	ret = io_rw_init_file(req, FMODE_WRITE, WRITE);
1138	if (unlikely(ret))
1139	return ret;
1140	req->cqe.res = iov_iter_count(i: &io->iter);
1141
1142	if (force_nonblock) {
1143	/ If the file doesn't support async, just async punt /
1144	if (unlikely(!io_file_supports_nowait(req, EPOLLOUT)))
1145	goto ret_eagain;
1146
1147	/ Check if we can support NOWAIT. /
1148	if (!(kiocb->ki_flags & IOCB_DIRECT) &&
1149	!(req->file->f_op->fop_flags & FOP_BUFFER_WASYNC) &&
1150	(req->flags & REQ_F_ISREG))
1151	goto ret_eagain;
1152
1153	kiocb->ki_flags \|= IOCB_NOWAIT;
1154	} else {
1155	/ Ensure we clear previously set non-block flag /
1156	kiocb->ki_flags &= ~IOCB_NOWAIT;
1157	}
1158
1159	ppos = io_kiocb_update_pos(req);
1160
1161	ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
1162	if (unlikely(ret))
1163	return ret;
1164
1165	if (unlikely(!io_kiocb_start_write(req, kiocb)))
1166	return -EAGAIN;
1167	kiocb->ki_flags \|= IOCB_WRITE;
1168
1169	if (likely(req->file->f_op->write_iter))
1170	ret2 = req->file->f_op->write_iter(kiocb, &io->iter);
1171	else if (req->file->f_op->write)
1172	ret2 = loop_rw_iter(WRITE, rw, iter: &io->iter);
1173	else
1174	ret2 = -EINVAL;
1175
1176	/*
1177	* Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
1178	* retry them without IOCB_NOWAIT.
1179	*/
1180	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
1181	ret2 = -EAGAIN;
1182	/ no retry on NONBLOCK nor RWF_NOWAIT /
1183	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
1184	goto done;
1185	if (!force_nonblock \|\| ret2 != -EAGAIN) {
1186	/ IOPOLL retry should happen for io-wq threads /
1187	if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
1188	goto ret_eagain;
1189
1190	if (ret2 != req->cqe.res && ret2 >= `0` && need_complete_io(req)) {
1191	trace_io_uring_short_write(ctx: req->ctx, fpos: kiocb->ki_pos - ret2,
1192	wanted: req->cqe.res, got: ret2);
1193
1194	/ This is a partial write. The file pos has already been*
1195	* updated, setup the async struct to complete the request
1196	* in the worker. Also update bytes_done to account for
1197	* the bytes already written.
1198	*/
1199	iov_iter_save_state(iter: &io->iter, state: &io->iter_state);
1200	io->bytes_done += ret2;
1201
1202	if (kiocb->ki_flags & IOCB_WRITE)
1203	io_req_end_write(req);
1204	return -EAGAIN;
1205	}
1206	done:
1207	return kiocb_done(req, ret: ret2, NULL, issue_flags);
1208	} else {
1209	ret_eagain:
1210	iov_iter_restore(i: &io->iter, state: &io->iter_state);
1211	io_meta_restore(io, kiocb);
1212	if (kiocb->ki_flags & IOCB_WRITE)
1213	io_req_end_write(req);
1214	return -EAGAIN;
1215	}
1216	}
1217
1218	int io_read_fixed(struct io_kiocb req, unsigned* int issue_flags)
1219	{
1220	int ret;
1221
1222	ret = io_init_rw_fixed(req, issue_flags, ITER_DEST);
1223	if (unlikely(ret))
1224	return ret;
1225
1226	return io_read(req, issue_flags);
1227	}
1228
1229	int io_write_fixed(struct io_kiocb req, unsigned* int issue_flags)
1230	{
1231	int ret;
1232
1233	ret = io_init_rw_fixed(req, issue_flags, ITER_SOURCE);
1234	if (unlikely(ret))
1235	return ret;
1236
1237	return io_write(req, issue_flags);
1238	}
1239
1240	void io_rw_fail(struct io_kiocb *req)
1241	{
1242	int res;
1243
1244	res = io_fixup_rw_res(req, res: req->cqe.res);
1245	io_req_set_res(req, res, cflags: req->cqe.flags);
1246	}
1247
1248	static int io_uring_classic_poll(struct io_kiocb req, struct* io_comp_batch *iob,
1249	unsigned int poll_flags)
1250	{
1251	struct file *file = req->file;
1252
1253	if (req->opcode == IORING_OP_URING_CMD) {
1254	struct io_uring_cmd *ioucmd;
1255
1256	ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
1257	return file->f_op->uring_cmd_iopoll(ioucmd, iob, poll_flags);
1258	} else {
1259	struct io_rw rw = io_kiocb_to_cmd(req, struct* io_rw);
1260
1261	return file->f_op->iopoll(&rw->kiocb, iob, poll_flags);
1262	}
1263	}
1264
1265	static u64 io_hybrid_iopoll_delay(struct io_ring_ctx ctx, struct* io_kiocb *req)
1266	{
1267	struct hrtimer_sleeper timer;
1268	enum hrtimer_mode mode;
1269	ktime_t kt;
1270	u64 sleep_time;
1271
1272	if (req->flags & REQ_F_IOPOLL_STATE)
1273	return `0`;
1274
1275	if (ctx->hybrid_poll_time == LLONG_MAX)
1276	return `0`;
1277
1278	/ Using half the running time to do schedule /
1279	sleep_time = ctx->hybrid_poll_time / `2`;
1280
1281	kt = ktime_set(secs: `0`, nsecs: sleep_time);
1282	req->flags \|= REQ_F_IOPOLL_STATE;
1283
1284	mode = HRTIMER_MODE_REL;
1285	hrtimer_setup_sleeper_on_stack(sl: &timer, CLOCK_MONOTONIC, mode);
1286	hrtimer_set_expires(timer: &timer.timer, time: kt);
1287	set_current_state(TASK_INTERRUPTIBLE);
1288	hrtimer_sleeper_start_expires(sl: &timer, mode);
1289
1290	if (timer.task)
1291	io_schedule();
1292
1293	hrtimer_cancel(timer: &timer.timer);
1294	__set_current_state(TASK_RUNNING);
1295	destroy_hrtimer_on_stack(timer: &timer.timer);
1296	return sleep_time;
1297	}
1298
1299	static int io_uring_hybrid_poll(struct io_kiocb *req,
1300	struct io_comp_batch iob, unsigned* int poll_flags)
1301	{
1302	struct io_ring_ctx *ctx = req->ctx;
1303	u64 runtime, sleep_time;
1304	int ret;
1305
1306	sleep_time = io_hybrid_iopoll_delay(ctx, req);
1307	ret = io_uring_classic_poll(req, iob, poll_flags);
1308	runtime = ktime_get_ns() - req->iopoll_start - sleep_time;
1309
1310	/*
1311	* Use minimum sleep time if we're polling devices with different
1312	* latencies. We could get more completions from the faster ones.
1313	*/
1314	if (ctx->hybrid_poll_time > runtime)
1315	ctx->hybrid_poll_time = runtime;
1316
1317	return ret;
1318	}
1319
1320	int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
1321	{
1322	struct io_wq_work_node pos, start, *prev;
1323	unsigned int poll_flags = `0`;
1324	DEFINE_IO_COMP_BATCH(iob);
1325	int nr_events = `0`;
1326
1327	/*
1328	* Only spin for completions if we don't have multiple devices hanging
1329	* off our complete list.
1330	*/
1331	if (ctx->poll_multi_queue \|\| force_nonspin)
1332	poll_flags \|= BLK_POLL_ONESHOT;
1333
1334	wq_list_for_each(pos, start, &ctx->iopoll_list) {
1335	struct io_kiocb req = container_of(pos, struct* io_kiocb, comp_list);
1336	int ret;
1337
1338	/*
1339	* Move completed and retryable entries to our local lists.
1340	* If we find a request that requires polling, break out
1341	* and complete those lists first, if we have entries there.
1342	*/
1343	if (READ_ONCE(req->iopoll_completed))
1344	break;
1345
1346	if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL)
1347	ret = io_uring_hybrid_poll(req, iob: &iob, poll_flags);
1348	else
1349	ret = io_uring_classic_poll(req, iob: &iob, poll_flags);
1350
1351	if (unlikely(ret < `0`))
1352	return ret;
1353	else if (ret)
1354	poll_flags \|= BLK_POLL_ONESHOT;
1355
1356	/ iopoll may have completed current req /
1357	if (!rq_list_empty(rl: &iob.req_list) \|\|
1358	READ_ONCE(req->iopoll_completed))
1359	break;
1360	}
1361
1362	if (!rq_list_empty(rl: &iob.req_list))
1363	iob.complete(&iob);
1364	else if (!pos)
1365	return `0`;
1366
1367	prev = start;
1368	wq_list_for_each_resume(pos, prev) {
1369	struct io_kiocb req = container_of(pos, struct* io_kiocb, comp_list);
1370
1371	/ order with io_complete_rw_iopoll(), e.g. ->result updates /
1372	if (!smp_load_acquire(&req->iopoll_completed))
1373	break;
1374	nr_events++;
1375	req->cqe.flags = io_put_kbuf(req, len: req->cqe.res, NULL);
1376	if (req->opcode != IORING_OP_URING_CMD)
1377	io_req_rw_cleanup(req, issue_flags: `0`);
1378	}
1379	if (unlikely(!nr_events))
1380	return `0`;
1381
1382	pos = start ? start->next : ctx->iopoll_list.first;
1383	wq_list_cut(list: &ctx->iopoll_list, last: prev, prev: start);
1384
1385	if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs)))
1386	return `0`;
1387	ctx->submit_state.compl_reqs.first = pos;
1388	__io_submit_flush_completions(ctx);
1389	return nr_events;
1390	}
1391
1392	void io_rw_cache_free(const void *entry)
1393	{
1394	struct io_async_rw rw = (struct* io_async_rw *) entry;
1395
1396	io_vec_free(iv: &rw->vec);
1397	kfree(objp: rw);
1398	}
1399

Browse the source code of Linux/io_uring/rw.c