// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blk-mq.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/fsnotify.h>
#include <linux/poll.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring/cmd.h>
#include <linux/indirect_call_wrapper.h>

#include <uapi/linux/io_uring.h>

#include "filetable.h"
#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"
#include "alloc_cache.h"
#include "rsrc.h"
#include "poll.h"
#include "rw.h"

static void io_complete_rw(struct kiocb *kiocb, long res);
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res);

struct io_rw {
	/* NOTE: kiocb has the file as the first member, so don't do it here */
	struct kiocb			kiocb;
	u64				addr;
	u32				len;
	rwf_t				flags;
};

static bool io_file_supports_nowait(struct io_kiocb *req, __poll_t mask)
{
	/* If FMODE_NOWAIT is set for a file, we're golden */
	if (req->flags & REQ_F_SUPPORT_NOWAIT)
		return true;
	/* No FMODE_NOWAIT, if we can poll, check the status */
	if (io_file_can_poll(req)) {
		struct poll_table_struct pt = { ._key = mask };

		return vfs_poll(req->file, &pt) & mask;
	}
	/* No FMODE_NOWAIT support, and file isn't pollable. Tough luck. */
	return false;
}

static int io_iov_compat_buffer_select_prep(struct io_rw *rw)
{
	struct compat_iovec __user *uiov = u64_to_user_ptr(rw->addr);
	struct compat_iovec iov;

	if (copy_from_user(&iov, uiov, sizeof(iov)))
		return -EFAULT;
	rw->len = iov.iov_len;
	return 0;
}

static int io_iov_buffer_select_prep(struct io_kiocb *req)
{
	struct iovec __user *uiov;
	struct iovec iov;
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

	if (rw->len != 1)
		return -EINVAL;

	if (io_is_compat(req->ctx))
		return io_iov_compat_buffer_select_prep(rw);

	uiov = u64_to_user_ptr(rw->addr);
	if (copy_from_user(&iov, uiov, sizeof(*uiov)))
		return -EFAULT;
	rw->len = iov.iov_len;
	return 0;
}

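/*
 * Import a user-supplied iovec array into the request's iov_iter, reusing a
 * previously cached iovec allocation if one is attached to the async data,
 * otherwise starting from the inline fast_iov.
 */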
static int io_import_vec(int ddir, struct io_kiocb *req,
			 struct io_async_rw *io,
			 const struct iovec __user *uvec,
			 size_t uvec_segs)
{
	int ret, nr_segs;
	struct iovec *iov;

	if (io->vec.iovec) {
		nr_segs = io->vec.nr;
		iov = io->vec.iovec;
	} else {
		nr_segs = 1;
		iov = &io->fast_iov;
	}

	ret = __import_iovec(ddir, uvec, uvec_segs, nr_segs, &iov, &io->iter,
			     io_is_compat(req->ctx));
	if (unlikely(ret < 0))
		return ret;
	if (iov) {
		req->flags |= REQ_F_NEED_CLEANUP;
		io_vec_reset_iovec(&io->vec, iov, io->iter.nr_segs);
	}
	return 0;
}

static int __io_import_rw_buffer(int ddir, struct io_kiocb *req,
				 struct io_async_rw *io, struct io_br_sel *sel,
				 unsigned int issue_flags)
{
	const struct io_issue_def *def = &io_issue_defs[req->opcode];
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	size_t sqe_len = rw->len;

	sel->addr = u64_to_user_ptr(rw->addr);
	if (def->vectored && !(req->flags & REQ_F_BUFFER_SELECT))
		return io_import_vec(ddir, req, io, sel->addr, sqe_len);

	if (io_do_buffer_select(req)) {
		*sel = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags);
		if (!sel->addr)
			return -ENOBUFS;
		rw->addr = (unsigned long) sel->addr;
		rw->len = sqe_len;
	}
	return import_ubuf(ddir, sel->addr, sqe_len, &io->iter);
}

static inline int io_import_rw_buffer(int rw, struct io_kiocb *req,
				      struct io_async_rw *io,
				      struct io_br_sel *sel,
				      unsigned int issue_flags)
{
	int ret;

	ret = __io_import_rw_buffer(rw, req, io, sel, issue_flags);
	if (unlikely(ret < 0))
		return ret;

	iov_iter_save_state(&io->iter, &io->iter_state);
	return 0;
}

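/*
 * Try to return the async rw state to the per-ctx cache. Only done when the
 * ring lock is held; unlocked (io-wq) completions skip recycling.
 */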
static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_async_rw *rw = req->async_data;

	if (unlikely(issue_flags & IO_URING_F_UNLOCKED))
		return;

	io_alloc_cache_vec_kasan(&rw->vec);
	if (rw->vec.nr > IO_VEC_CACHE_SOFT_CAP)
		io_vec_free(&rw->vec);

	if (io_alloc_cache_put(&req->ctx->rw_cache, rw))
		io_req_async_data_clear(req, 0);
}

static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags)
{
	/*
	 * Disable quick recycling for anything that's gone through io-wq.
	 * In theory, this should be fine to cleanup. However, some read or
	 * write iter handling touches the iovec AFTER having called into the
	 * handler, eg to reexpand or revert. This means we can have:
	 *
	 * task			io-wq
	 *   issue
	 *     punt to io-wq
	 *			issue
	 *			  blkdev_write_iter()
	 *			    ->ki_complete()
	 *			      io_complete_rw()
	 *			        queue tw complete
	 *  run tw
	 *    req_rw_cleanup
	 *			iov_iter_count() <- look at iov_iter again
	 *
	 * which can lead to a UAF. This is only possible for io-wq offload
	 * as the cleanup can run in parallel. As io-wq is not the fast path,
	 * just leave cleanup to the end.
	 *
	 * This is really a bug in the core code that does this, any issue
	 * path should assume that a successful (or -EIOCBQUEUED) return can
	 * mean that the underlying data can be gone at any time. But that
	 * should be fixed separately, and then this check could be killed.
	 */
	if (!(req->flags & (REQ_F_REISSUE | REQ_F_REFCOUNT))) {
		req->flags &= ~REQ_F_NEED_CLEANUP;
		io_rw_recycle(req, issue_flags);
	}
}

static int io_rw_alloc_async(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_async_rw *rw;

	rw = io_uring_alloc_async_data(&ctx->rw_cache, req);
	if (!rw)
		return -ENOMEM;
	if (rw->vec.iovec)
		req->flags |= REQ_F_NEED_CLEANUP;
	rw->bytes_done = 0;
	return 0;
}

static inline void io_meta_save_state(struct io_async_rw *io)
{
	io->meta_state.seed = io->meta.seed;
	iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta);
}

static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb)
{
	if (kiocb->ki_flags & IOCB_HAS_METADATA) {
		io->meta.seed = io->meta_state.seed;
		iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta);
	}
}

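/*
 * Copy the PI (protection information) attribute from userspace, import the
 * metadata buffer, and stash the initial metadata iterator state so it can be
 * restored on retry.
 */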
static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir,
			 u64 attr_ptr, u64 attr_type_mask)
{
	struct io_uring_attr_pi pi_attr;
	struct io_async_rw *io;
	int ret;

	if (copy_from_user(&pi_attr, u64_to_user_ptr(attr_ptr),
			   sizeof(pi_attr)))
		return -EFAULT;

	if (pi_attr.rsvd)
		return -EINVAL;

	io = req->async_data;
	io->meta.flags = pi_attr.flags;
	io->meta.app_tag = pi_attr.app_tag;
	io->meta.seed = pi_attr.seed;
	ret = import_ubuf(ddir, u64_to_user_ptr(pi_attr.addr),
			  pi_attr.len, &io->meta.iter);
	if (unlikely(ret < 0))
		return ret;
	req->flags |= REQ_F_HAS_METADATA;
	io_meta_save_state(io);
	return ret;
}

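/*
 * Common prep for all read/write variants: allocate the async data, read the
 * SQE fields into the kiocb, and parse any rw attributes.
 */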
static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
			int ddir)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct io_async_rw *io;
	unsigned ioprio;
	u64 attr_type_mask;
	int ret;

	if (io_rw_alloc_async(req))
		return -ENOMEM;
	io = req->async_data;

	rw->kiocb.ki_pos = READ_ONCE(sqe->off);
	/* used for fixed read/write too - just read unconditionally */
	req->buf_index = READ_ONCE(sqe->buf_index);
	io->buf_group = req->buf_index;

	ioprio = READ_ONCE(sqe->ioprio);
	if (ioprio) {
		ret = ioprio_check_cap(ioprio);
		if (ret)
			return ret;

		rw->kiocb.ki_ioprio = ioprio;
	} else {
		rw->kiocb.ki_ioprio = get_current_ioprio();
	}
	rw->kiocb.dio_complete = NULL;
	rw->kiocb.ki_flags = 0;
	rw->kiocb.ki_write_stream = READ_ONCE(sqe->write_stream);

	if (req->ctx->flags & IORING_SETUP_IOPOLL)
		rw->kiocb.ki_complete = io_complete_rw_iopoll;
	else
		rw->kiocb.ki_complete = io_complete_rw;

	rw->addr = READ_ONCE(sqe->addr);
	rw->len = READ_ONCE(sqe->len);
	rw->flags = (__force rwf_t) READ_ONCE(sqe->rw_flags);

	attr_type_mask = READ_ONCE(sqe->attr_type_mask);
	if (attr_type_mask) {
		u64 attr_ptr;

		/* only PI attribute is supported currently */
		if (attr_type_mask != IORING_RW_ATTR_FLAG_PI)
			return -EINVAL;

		attr_ptr = READ_ONCE(sqe->attr_ptr);
		return io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask);
	}
	return 0;
}

static int io_rw_do_import(struct io_kiocb *req, int ddir)
{
	struct io_br_sel sel = { };

	if (io_do_buffer_select(req))
		return 0;

	return io_import_rw_buffer(ddir, req, req->async_data, &sel, 0);
}

static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
		      int ddir)
{
	int ret;

	ret = __io_prep_rw(req, sqe, ddir);
	if (unlikely(ret))
		return ret;

	return io_rw_do_import(req, ddir);
}

int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	return io_prep_rw(req, sqe, ITER_DEST);
}

int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	return io_prep_rw(req, sqe, ITER_SOURCE);
}

static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe,
		       int ddir)
{
	int ret;

	ret = io_prep_rw(req, sqe, ddir);
	if (unlikely(ret))
		return ret;
	if (!(req->flags & REQ_F_BUFFER_SELECT))
		return 0;

	/*
	 * Have to do this validation here, as by the time this hits io_read()
	 * rw->len might have changed due to buffer selection.
	 */
	return io_iov_buffer_select_prep(req);
}

int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	return io_prep_rwv(req, sqe, ITER_DEST);
}

int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	return io_prep_rwv(req, sqe, ITER_SOURCE);
}

static int io_init_rw_fixed(struct io_kiocb *req, unsigned int issue_flags,
			    int ddir)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct io_async_rw *io = req->async_data;
	int ret;

	if (io->bytes_done)
		return 0;

	ret = io_import_reg_buf(req, &io->iter, rw->addr, rw->len, ddir,
				issue_flags);
	iov_iter_save_state(&io->iter, &io->iter_state);
	return ret;
}

int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	return __io_prep_rw(req, sqe, ITER_DEST);
}

int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	return __io_prep_rw(req, sqe, ITER_SOURCE);
}

static int io_rw_import_reg_vec(struct io_kiocb *req,
				struct io_async_rw *io,
				int ddir, unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	unsigned uvec_segs = rw->len;
	int ret;

	ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec,
				uvec_segs, issue_flags);
	if (unlikely(ret))
		return ret;
	iov_iter_save_state(&io->iter, &io->iter_state);
	req->flags &= ~REQ_F_IMPORT_BUFFER;
	return 0;
}

static int io_rw_prep_reg_vec(struct io_kiocb *req)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct io_async_rw *io = req->async_data;
	const struct iovec __user *uvec;

	uvec = u64_to_user_ptr(rw->addr);
	return io_prep_reg_iovec(req, &io->vec, uvec, rw->len);
}

int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	int ret;

	ret = __io_prep_rw(req, sqe, ITER_DEST);
	if (unlikely(ret))
		return ret;
	return io_rw_prep_reg_vec(req);
}

int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	int ret;

	ret = __io_prep_rw(req, sqe, ITER_SOURCE);
	if (unlikely(ret))
		return ret;
	return io_rw_prep_reg_vec(req);
}

/*
 * Multishot read is prepared just like a normal read/write request, the only
 * difference is that we set the MULTISHOT flag.
 */
int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	int ret;

	/* must be used with provided buffers */
	if (!(req->flags & REQ_F_BUFFER_SELECT))
		return -EINVAL;

	ret = __io_prep_rw(req, sqe, ITER_DEST);
	if (unlikely(ret))
		return ret;

	if (rw->addr || rw->len)
		return -EINVAL;

	req->flags |= REQ_F_APOLL_MULTISHOT;
	return 0;
}

void io_readv_writev_cleanup(struct io_kiocb *req)
{
	lockdep_assert_held(&req->ctx->uring_lock);
	io_rw_recycle(req, 0);
}

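/*
 * Resolve the file offset to use: an explicit offset from the SQE, the
 * current file position for non-stream files (flagged with REQ_F_CUR_POS so
 * it gets written back on completion), or NULL for stream files.
 */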
static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

	if (rw->kiocb.ki_pos != -1)
		return &rw->kiocb.ki_pos;

	if (!(req->file->f_mode & FMODE_STREAM)) {
		req->flags |= REQ_F_CUR_POS;
		rw->kiocb.ki_pos = req->file->f_pos;
		return &rw->kiocb.ki_pos;
	}

	rw->kiocb.ki_pos = 0;
	return NULL;
}

static bool io_rw_should_reissue(struct io_kiocb *req)
{
#ifdef CONFIG_BLOCK
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	umode_t mode = file_inode(req->file)->i_mode;
	struct io_async_rw *io = req->async_data;
	struct io_ring_ctx *ctx = req->ctx;

	if (!S_ISBLK(mode) && !S_ISREG(mode))
		return false;
	if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
	    !(ctx->flags & IORING_SETUP_IOPOLL)))
		return false;
	/*
	 * If ref is dying, we might be running poll reap from the exit work.
	 * Don't attempt to reissue from that path, just let it fail with
	 * -EAGAIN.
	 */
	if (percpu_ref_is_dying(&ctx->refs))
		return false;

	io_meta_restore(io, &rw->kiocb);
	iov_iter_restore(&io->iter, &io->iter_state);
	return true;
#else
	return false;
#endif
}

static void io_req_end_write(struct io_kiocb *req)
{
	if (req->flags & REQ_F_ISREG) {
		struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

		kiocb_end_write(&rw->kiocb);
	}
}

/*
 * Trigger the notifications after having done some IO, and finish the write
 * accounting, if any.
 */
static void io_req_io_end(struct io_kiocb *req)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

	if (rw->kiocb.ki_flags & IOCB_WRITE) {
		io_req_end_write(req);
		fsnotify_modify(req->file);
	} else {
		fsnotify_access(req->file);
	}
}

static void __io_complete_rw_common(struct io_kiocb *req, long res)
{
	if (res == req->cqe.res)
		return;
	if (res == -EAGAIN && io_rw_should_reissue(req)) {
		req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE;
	} else {
		req_set_fail(req);
		req->cqe.res = res;
	}
}

static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
{
	struct io_async_rw *io = req->async_data;

	/* add previously done IO, if any */
	if (req_has_async_data(req) && io->bytes_done > 0) {
		if (res < 0)
			res = io->bytes_done;
		else
			res += io->bytes_done;
	}
	return res;
}

void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct kiocb *kiocb = &rw->kiocb;

	if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) {
		long res = kiocb->dio_complete(rw->kiocb.private);

		io_req_set_res(req, io_fixup_rw_res(req, res), 0);
	}

	io_req_io_end(req);

	if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))
		req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL);

	io_req_rw_cleanup(req, 0);
	io_req_task_complete(req, tw);
}

static void io_complete_rw(struct kiocb *kiocb, long res)
{
	struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
	struct io_kiocb *req = cmd_to_io_kiocb(rw);

	if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) {
		__io_complete_rw_common(req, res);
		io_req_set_res(req, io_fixup_rw_res(req, res), 0);
	}
	req->io_task_work.func = io_req_rw_complete;
	__io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
}

static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
{
	struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
	struct io_kiocb *req = cmd_to_io_kiocb(rw);

	if (kiocb->ki_flags & IOCB_WRITE)
		io_req_end_write(req);
	if (unlikely(res != req->cqe.res)) {
		if (res == -EAGAIN && io_rw_should_reissue(req))
			req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE;
		else
			req->cqe.res = res;
	}

	/* order with io_iopoll_complete() checking ->iopoll_completed */
	smp_store_release(&req->iopoll_completed, 1);
}

static inline void io_rw_done(struct io_kiocb *req, ssize_t ret)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

	/* IO was queued async, completion will happen later */
	if (ret == -EIOCBQUEUED)
		return;

	/* transform internal restart error codes */
	if (unlikely(ret < 0)) {
		switch (ret) {
		case -ERESTARTSYS:
		case -ERESTARTNOINTR:
		case -ERESTARTNOHAND:
		case -ERESTART_RESTARTBLOCK:
			/*
			 * We can't just restart the syscall, since previously
			 * submitted sqes may already be in progress. Just fail
			 * this IO with EINTR.
			 */
			ret = -EINTR;
			break;
		}
	}

	if (req->ctx->flags & IORING_SETUP_IOPOLL)
		io_complete_rw_iopoll(&rw->kiocb, ret);
	else
		io_complete_rw(&rw->kiocb, ret);
}

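/*
 * Complete the request after issue. Successful non-IOPOLL completions are
 * finished inline here; everything else goes through io_rw_done().
 */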
static int kiocb_done(struct io_kiocb *req, ssize_t ret,
		      struct io_br_sel *sel, unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	unsigned final_ret = io_fixup_rw_res(req, ret);

	if (ret >= 0 && req->flags & REQ_F_CUR_POS)
		req->file->f_pos = rw->kiocb.ki_pos;
	if (ret >= 0 && !(req->ctx->flags & IORING_SETUP_IOPOLL)) {
		__io_complete_rw_common(req, ret);
		/*
		 * Safe to call io_end from here as we're inline
		 * from the submission path.
		 */
		io_req_io_end(req);
		io_req_set_res(req, final_ret, io_put_kbuf(req, ret, sel->buf_list));
		io_req_rw_cleanup(req, issue_flags);
		return IOU_COMPLETE;
	} else {
		io_rw_done(req, ret);
	}

	return IOU_ISSUE_SKIP_COMPLETE;
}

static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
{
	return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
}

/*
 * For files that don't have ->read_iter() and ->write_iter(), handle them
 * by looping over ->read() or ->write() manually.
 */
static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
{
	struct io_kiocb *req = cmd_to_io_kiocb(rw);
	struct kiocb *kiocb = &rw->kiocb;
	struct file *file = kiocb->ki_filp;
	ssize_t ret = 0;
	loff_t *ppos;

	/*
	 * Don't support polled IO through this interface, and we can't
	 * support non-blocking either. For the latter, this just causes
	 * the kiocb to be handled from an async context.
	 */
	if (kiocb->ki_flags & IOCB_HIPRI)
		return -EOPNOTSUPP;
	if ((kiocb->ki_flags & IOCB_NOWAIT) &&
	    !(kiocb->ki_filp->f_flags & O_NONBLOCK))
		return -EAGAIN;
	if ((req->flags & REQ_F_BUF_NODE) && req->buf_node->buf->is_kbuf)
		return -EFAULT;

	ppos = io_kiocb_ppos(kiocb);

	while (iov_iter_count(iter)) {
		void __user *addr;
		size_t len;
		ssize_t nr;

		if (iter_is_ubuf(iter)) {
			addr = iter->ubuf + iter->iov_offset;
			len = iov_iter_count(iter);
		} else if (!iov_iter_is_bvec(iter)) {
			addr = iter_iov_addr(iter);
			len = iter_iov_len(iter);
		} else {
			addr = u64_to_user_ptr(rw->addr);
			len = rw->len;
		}

		if (ddir == READ)
			nr = file->f_op->read(file, addr, len, ppos);
		else
			nr = file->f_op->write(file, addr, len, ppos);

		if (nr < 0) {
			if (!ret)
				ret = nr;
			break;
		}
		ret += nr;
		if (!iov_iter_is_bvec(iter)) {
			iov_iter_advance(iter, nr);
		} else {
			rw->addr += nr;
			rw->len -= nr;
			if (!rw->len)
				break;
		}
		if (nr != len)
			break;
	}

	return ret;
}

/*
 * This is our waitqueue callback handler, registered through __folio_lock_async()
 * when we initially tried to do the IO with the iocb and armed our waitqueue.
 * This gets called when the page is unlocked, and we generally expect that to
 * happen when the page IO is completed and the page is now uptodate. This will
 * queue a task_work based retry of the operation, attempting to copy the data
 * again. If the latter fails because the page was NOT uptodate, then we will
 * do a thread based blocking retry of the operation. That's the unexpected
 * slow path.
 */
static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
			     int sync, void *arg)
{
	struct wait_page_queue *wpq;
	struct io_kiocb *req = wait->private;
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct wait_page_key *key = arg;

	wpq = container_of(wait, struct wait_page_queue, wait);

	if (!wake_page_match(wpq, key))
		return 0;

	rw->kiocb.ki_flags &= ~IOCB_WAITQ;
	list_del_init(&wait->entry);
	io_req_task_queue(req);
	return 1;
}

/*
 * This controls whether a given IO request should be armed for async page
 * based retry. If we return false here, the request is handed to the async
 * worker threads for retry. If we're doing buffered reads on a regular file,
 * we prepare a private wait_page_queue entry and retry the operation. This
 * will either succeed because the page is now uptodate and unlocked, or it
 * will register a callback when the page is unlocked at IO completion. Through
 * that callback, io_uring uses task_work to setup a retry of the operation.
 * That retry will attempt the buffered read again. The retry will generally
 * succeed, or in rare cases where it fails, we then fall back to using the
 * async worker threads for a blocking retry.
 */
static bool io_rw_should_retry(struct io_kiocb *req)
{
	struct io_async_rw *io = req->async_data;
	struct wait_page_queue *wait = &io->wpq;
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct kiocb *kiocb = &rw->kiocb;

	/*
	 * Never retry for NOWAIT or a request with metadata, we just complete
	 * with -EAGAIN.
	 */
	if (req->flags & (REQ_F_NOWAIT | REQ_F_HAS_METADATA))
		return false;

	/* Only for buffered IO */
	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
		return false;

	/*
	 * just use poll if we can, and don't attempt if the fs doesn't
	 * support callback based unlocks
	 */
	if (io_file_can_poll(req) ||
	    !(req->file->f_op->fop_flags & FOP_BUFFER_RASYNC))
		return false;

	wait->wait.func = io_async_buf_func;
	wait->wait.private = req;
	wait->wait.flags = 0;
	INIT_LIST_HEAD(&wait->wait.entry);
	kiocb->ki_flags |= IOCB_WAITQ;
	kiocb->ki_flags &= ~IOCB_NOWAIT;
	kiocb->ki_waitq = wait;
	return true;
}

static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter)
{
	struct file *file = rw->kiocb.ki_filp;

	if (likely(file->f_op->read_iter))
		return file->f_op->read_iter(&rw->kiocb, iter);
	else if (file->f_op->read)
		return loop_rw_iter(READ, rw, iter);
	else
		return -EINVAL;
}

static bool need_complete_io(struct io_kiocb *req)
{
	return req->flags & REQ_F_ISREG ||
		S_ISBLK(file_inode(req->file)->i_mode);
}

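/*
 * Set up the kiocb from the file and SQE rw flags, and validate the request
 * against IOPOLL and metadata constraints.
 */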
static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct kiocb *kiocb = &rw->kiocb;
	struct io_ring_ctx *ctx = req->ctx;
	struct file *file = req->file;
	int ret;

	if (unlikely(!(file->f_mode & mode)))
		return -EBADF;

	if (!(req->flags & REQ_F_FIXED_FILE))
		req->flags |= io_file_get_flags(file);

	kiocb->ki_flags = file->f_iocb_flags;
	ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type);
	if (unlikely(ret))
		return ret;
	kiocb->ki_flags |= IOCB_ALLOC_CACHE;

	/*
	 * If the file is marked O_NONBLOCK, still allow retry for it if it
	 * supports async. Otherwise it's impossible to use O_NONBLOCK files
	 * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
	 */
	if (kiocb->ki_flags & IOCB_NOWAIT ||
	    ((file->f_flags & O_NONBLOCK && !(req->flags & REQ_F_SUPPORT_NOWAIT))))
		req->flags |= REQ_F_NOWAIT;

	if (ctx->flags & IORING_SETUP_IOPOLL) {
		if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
			return -EOPNOTSUPP;
		kiocb->private = NULL;
		kiocb->ki_flags |= IOCB_HIPRI;
		req->iopoll_completed = 0;
		if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
			/* make sure every req only blocks once */
			req->flags &= ~REQ_F_IOPOLL_STATE;
			req->iopoll_start = ktime_get_ns();
		}
	} else {
		if (kiocb->ki_flags & IOCB_HIPRI)
			return -EINVAL;
	}

	if (req->flags & REQ_F_HAS_METADATA) {
		struct io_async_rw *io = req->async_data;

		if (!(file->f_mode & FMODE_HAS_METADATA))
			return -EINVAL;

		/*
		 * We have a union of meta fields with wpq used for buffered-io
		 * in io_async_rw, so fail it here.
		 */
		if (!(req->file->f_flags & O_DIRECT))
			return -EOPNOTSUPP;
		kiocb->ki_flags |= IOCB_HAS_METADATA;
		kiocb->private = &io->meta;
	}

	return 0;
}

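/*
 * Core read path shared by io_read() and io_read_mshot(): import the buffer
 * if needed, issue the read, and handle -EAGAIN and short read retries.
 */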
static int __io_read(struct io_kiocb *req, struct io_br_sel *sel,
		     unsigned int issue_flags)
{
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct io_async_rw *io = req->async_data;
	struct kiocb *kiocb = &rw->kiocb;
	ssize_t ret;
	loff_t *ppos;

	if (req->flags & REQ_F_IMPORT_BUFFER) {
		ret = io_rw_import_reg_vec(req, io, ITER_DEST, issue_flags);
		if (unlikely(ret))
			return ret;
	} else if (io_do_buffer_select(req)) {
		ret = io_import_rw_buffer(ITER_DEST, req, io, sel, issue_flags);
		if (unlikely(ret < 0))
			return ret;
	}
	ret = io_rw_init_file(req, FMODE_READ, READ);
	if (unlikely(ret))
		return ret;
	req->cqe.res = iov_iter_count(&io->iter);

	if (force_nonblock) {
		/* If the file doesn't support async, just async punt */
		if (unlikely(!io_file_supports_nowait(req, EPOLLIN)))
			return -EAGAIN;
		kiocb->ki_flags |= IOCB_NOWAIT;
	} else {
		/* Ensure we clear previously set non-block flag */
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	}

	ppos = io_kiocb_update_pos(req);

	ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
	if (unlikely(ret))
		return ret;

	ret = io_iter_do_read(rw, &io->iter);

	/*
	 * Some file systems like to return -EOPNOTSUPP for an IOCB_NOWAIT
	 * issue, even though they should be returning -EAGAIN. To be safe,
	 * retry from blocking context for either.
	 */
	if (ret == -EOPNOTSUPP && force_nonblock)
		ret = -EAGAIN;

	if (ret == -EAGAIN) {
		/* If we can poll, just do that. */
		if (io_file_can_poll(req))
			return -EAGAIN;
		/* IOPOLL retry should happen for io-wq threads */
		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
			goto done;
		/* no retry on NONBLOCK nor RWF_NOWAIT */
		if (req->flags & REQ_F_NOWAIT)
			goto done;
		ret = 0;
	} else if (ret == -EIOCBQUEUED) {
		return IOU_ISSUE_SKIP_COMPLETE;
	} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
		   (req->flags & REQ_F_NOWAIT) || !need_complete_io(req) ||
		   (issue_flags & IO_URING_F_MULTISHOT)) {
		/* read all, failed, already did sync or don't want to retry */
		goto done;
	}

	/*
	 * Don't depend on the iter state matching what was consumed, or being
	 * untouched in case of error. Restore it and we'll advance it
	 * manually if we need to.
	 */
	iov_iter_restore(&io->iter, &io->iter_state);
	io_meta_restore(io, kiocb);

	do {
		/*
		 * We end up here because of a partial read, either from
		 * above or inside this loop. Advance the iter by the bytes
		 * that were consumed.
		 */
		iov_iter_advance(&io->iter, ret);
		if (!iov_iter_count(&io->iter))
			break;
		io->bytes_done += ret;
		iov_iter_save_state(&io->iter, &io->iter_state);

		/* if we can retry, do so with the callbacks armed */
		if (!io_rw_should_retry(req)) {
			kiocb->ki_flags &= ~IOCB_WAITQ;
			return -EAGAIN;
		}

		req->cqe.res = iov_iter_count(&io->iter);
		/*
		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
		 * we get -EIOCBQUEUED, then we'll get a notification when the
		 * desired page gets unlocked. We can also get a partial read
		 * here, and if we do, then just retry at the new offset.
		 */
		ret = io_iter_do_read(rw, &io->iter);
		if (ret == -EIOCBQUEUED)
			return IOU_ISSUE_SKIP_COMPLETE;
		/* we got some bytes, but not all. retry. */
		kiocb->ki_flags &= ~IOCB_WAITQ;
		iov_iter_restore(&io->iter, &io->iter_state);
	} while (ret > 0);
done:
	/* it's faster to check here than to delegate to kfree */
	return ret;
}

int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_br_sel sel = { };
	int ret;

	ret = __io_read(req, &sel, issue_flags);
	if (ret >= 0)
		return kiocb_done(req, ret, &sel, issue_flags);

	if (req->flags & REQ_F_BUFFERS_COMMIT)
		io_kbuf_recycle(req, sel.buf_list, issue_flags);
	return ret;
}

int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct io_br_sel sel = { };
	unsigned int cflags = 0;
	int ret;

	/*
	 * Multishot MUST be used on a pollable file
	 */
	if (!io_file_can_poll(req))
		return -EBADFD;

	/* make it sync, multishot doesn't support async execution */
	rw->kiocb.ki_complete = NULL;
	ret = __io_read(req, &sel, issue_flags);

	/*
	 * If we get -EAGAIN, recycle our buffer and just let normal poll
	 * handling arm it.
	 */
	if (ret == -EAGAIN) {
		/*
		 * Reset rw->len to 0 again to avoid clamping future mshot
		 * reads, in case the buffer size varies.
		 */
		if (io_kbuf_recycle(req, sel.buf_list, issue_flags))
			rw->len = 0;
		return IOU_RETRY;
	} else if (ret <= 0) {
		io_kbuf_recycle(req, sel.buf_list, issue_flags);
		if (ret < 0)
			req_set_fail(req);
	} else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
		cflags = io_put_kbuf(req, ret, sel.buf_list);
	} else {
		/*
		 * Any successful return value will keep the multishot read
		 * armed, if it's still set. Put our buffer and post a CQE. If
		 * we fail to post a CQE, or multishot is no longer set, then
		 * jump to the termination path. This request is then done.
		 */
		cflags = io_put_kbuf(req, ret, sel.buf_list);
		rw->len = 0; /* similarly to above, reset len to 0 */

		if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
			if (issue_flags & IO_URING_F_MULTISHOT)
				/*
				 * Force retry, as we might have more data to
				 * be read and otherwise it won't get retried
				 * until (if ever) another poll is triggered.
				 */
				io_poll_multishot_retry(req);

			return IOU_RETRY;
		}
	}

	/*
	 * Either an error, or we've hit overflow posting the CQE. For any
	 * multishot request, hitting overflow will terminate it.
	 */
	io_req_set_res(req, ret, cflags);
	io_req_rw_cleanup(req, issue_flags);
	return IOU_COMPLETE;
}

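/*
 * Start write accounting for regular files. For IOCB_NOWAIT, only a trylock
 * of the superblock freeze protection is attempted so we never block here.
 */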
static bool io_kiocb_start_write(struct io_kiocb *req, struct kiocb *kiocb)
{
	struct inode *inode;
	bool ret;

	if (!(req->flags & REQ_F_ISREG))
		return true;
	if (!(kiocb->ki_flags & IOCB_NOWAIT)) {
		kiocb_start_write(kiocb);
		return true;
	}

	inode = file_inode(kiocb->ki_filp);
	ret = sb_start_write_trylock(inode->i_sb);
	if (ret)
		__sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
	return ret;
}

int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct io_async_rw *io = req->async_data;
	struct kiocb *kiocb = &rw->kiocb;
	ssize_t ret, ret2;
	loff_t *ppos;

	if (req->flags & REQ_F_IMPORT_BUFFER) {
		ret = io_rw_import_reg_vec(req, io, ITER_SOURCE, issue_flags);
		if (unlikely(ret))
			return ret;
	}

	ret = io_rw_init_file(req, FMODE_WRITE, WRITE);
	if (unlikely(ret))
		return ret;
	req->cqe.res = iov_iter_count(&io->iter);

	if (force_nonblock) {
		/* If the file doesn't support async, just async punt */
		if (unlikely(!io_file_supports_nowait(req, EPOLLOUT)))
			goto ret_eagain;

		/* Check if we can support NOWAIT. */
		if (!(kiocb->ki_flags & IOCB_DIRECT) &&
		    !(req->file->f_op->fop_flags & FOP_BUFFER_WASYNC) &&
		    (req->flags & REQ_F_ISREG))
			goto ret_eagain;

		kiocb->ki_flags |= IOCB_NOWAIT;
	} else {
		/* Ensure we clear previously set non-block flag */
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	}

	ppos = io_kiocb_update_pos(req);

	ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
	if (unlikely(ret))
		return ret;

	if (unlikely(!io_kiocb_start_write(req, kiocb)))
		return -EAGAIN;
	kiocb->ki_flags |= IOCB_WRITE;

	if (likely(req->file->f_op->write_iter))
		ret2 = req->file->f_op->write_iter(kiocb, &io->iter);
	else if (req->file->f_op->write)
		ret2 = loop_rw_iter(WRITE, rw, &io->iter);
	else
		ret2 = -EINVAL;

	/*
	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
	 * retry them without IOCB_NOWAIT.
	 */
	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
		ret2 = -EAGAIN;
	/* no retry on NONBLOCK nor RWF_NOWAIT */
	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
		goto done;
	if (!force_nonblock || ret2 != -EAGAIN) {
		/* IOPOLL retry should happen for io-wq threads */
		if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
			goto ret_eagain;

		if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
			trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2,
						   req->cqe.res, ret2);

			/* This is a partial write. The file pos has already been
			 * updated, setup the async struct to complete the request
			 * in the worker. Also update bytes_done to account for
			 * the bytes already written.
			 */
			iov_iter_save_state(&io->iter, &io->iter_state);
			io->bytes_done += ret2;

			if (kiocb->ki_flags & IOCB_WRITE)
				io_req_end_write(req);
			return -EAGAIN;
		}
done:
		return kiocb_done(req, ret2, NULL, issue_flags);
	} else {
ret_eagain:
		iov_iter_restore(&io->iter, &io->iter_state);
		io_meta_restore(io, kiocb);
		if (kiocb->ki_flags & IOCB_WRITE)
			io_req_end_write(req);
		return -EAGAIN;
	}
}

int io_read_fixed(struct io_kiocb *req, unsigned int issue_flags)
{
	int ret;

	ret = io_init_rw_fixed(req, issue_flags, ITER_DEST);
	if (unlikely(ret))
		return ret;

	return io_read(req, issue_flags);
}

int io_write_fixed(struct io_kiocb *req, unsigned int issue_flags)
{
	int ret;

	ret = io_init_rw_fixed(req, issue_flags, ITER_SOURCE);
	if (unlikely(ret))
		return ret;

	return io_write(req, issue_flags);
}

void io_rw_fail(struct io_kiocb *req)
{
	int res;

	res = io_fixup_rw_res(req, req->cqe.res);
	io_req_set_res(req, res, req->cqe.flags);
}

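/*
 * Poll for completion of a single request, dispatching to the file's
 * ->uring_cmd_iopoll() or ->iopoll() handler as appropriate.
 */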
static int io_uring_classic_poll(struct io_kiocb *req, struct io_comp_batch *iob,
				 unsigned int poll_flags)
{
	struct file *file = req->file;

	if (req->opcode == IORING_OP_URING_CMD) {
		struct io_uring_cmd *ioucmd;

		ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
		return file->f_op->uring_cmd_iopoll(ioucmd, iob, poll_flags);
	} else {
		struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

		return file->f_op->iopoll(&rw->kiocb, iob, poll_flags);
	}
}

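/*
 * For hybrid iopoll, sleep for roughly half of the shortest completion time
 * seen so far before starting to poll. Returns the time actually slept.
 */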
static u64 io_hybrid_iopoll_delay(struct io_ring_ctx *ctx, struct io_kiocb *req)
{
	struct hrtimer_sleeper timer;
	enum hrtimer_mode mode;
	ktime_t kt;
	u64 sleep_time;

	if (req->flags & REQ_F_IOPOLL_STATE)
		return 0;

	if (ctx->hybrid_poll_time == LLONG_MAX)
		return 0;

	/* Sleep for half of the observed running time */
	sleep_time = ctx->hybrid_poll_time / 2;

	kt = ktime_set(0, sleep_time);
	req->flags |= REQ_F_IOPOLL_STATE;

	mode = HRTIMER_MODE_REL;
	hrtimer_setup_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode);
	hrtimer_set_expires(&timer.timer, kt);
	set_current_state(TASK_INTERRUPTIBLE);
	hrtimer_sleeper_start_expires(&timer, mode);

	if (timer.task)
		io_schedule();

	hrtimer_cancel(&timer.timer);
	__set_current_state(TASK_RUNNING);
	destroy_hrtimer_on_stack(&timer.timer);
	return sleep_time;
}

static int io_uring_hybrid_poll(struct io_kiocb *req,
				struct io_comp_batch *iob, unsigned int poll_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	u64 runtime, sleep_time;
	int ret;

	sleep_time = io_hybrid_iopoll_delay(ctx, req);
	ret = io_uring_classic_poll(req, iob, poll_flags);
	runtime = ktime_get_ns() - req->iopoll_start - sleep_time;

	/*
	 * Use minimum sleep time if we're polling devices with different
	 * latencies. We could get more completions from the faster ones.
	 */
	if (ctx->hybrid_poll_time > runtime)
		ctx->hybrid_poll_time = runtime;

	return ret;
}

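/*
 * Reap IOPOLL completions: poll each pending request, then flush the ones
 * that have completed. Returns the number of completion events posted, or
 * < 0 on a polling error.
 */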
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
	struct io_wq_work_node *pos, *start, *prev;
	unsigned int poll_flags = 0;
	DEFINE_IO_COMP_BATCH(iob);
	int nr_events = 0;

	/*
	 * Only spin for completions if we don't have multiple devices hanging
	 * off our complete list.
	 */
	if (ctx->poll_multi_queue || force_nonspin)
		poll_flags |= BLK_POLL_ONESHOT;

	wq_list_for_each(pos, start, &ctx->iopoll_list) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
		int ret;

		/*
		 * Move completed and retryable entries to our local lists.
		 * If we find a request that requires polling, break out
		 * and complete those lists first, if we have entries there.
		 */
		if (READ_ONCE(req->iopoll_completed))
			break;

		if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL)
			ret = io_uring_hybrid_poll(req, &iob, poll_flags);
		else
			ret = io_uring_classic_poll(req, &iob, poll_flags);

		if (unlikely(ret < 0))
			return ret;
		else if (ret)
			poll_flags |= BLK_POLL_ONESHOT;

		/* iopoll may have completed current req */
		if (!rq_list_empty(&iob.req_list) ||
		    READ_ONCE(req->iopoll_completed))
			break;
	}

	if (!rq_list_empty(&iob.req_list))
		iob.complete(&iob);
	else if (!pos)
		return 0;

	prev = start;
	wq_list_for_each_resume(pos, prev) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);

		/* order with io_complete_rw_iopoll(), e.g. ->result updates */
		if (!smp_load_acquire(&req->iopoll_completed))
			break;
		nr_events++;
		req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL);
		if (req->opcode != IORING_OP_URING_CMD)
			io_req_rw_cleanup(req, 0);
	}
	if (unlikely(!nr_events))
		return 0;

	pos = start ? start->next : ctx->iopoll_list.first;
	wq_list_cut(&ctx->iopoll_list, prev, start);

	if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs)))
		return 0;
	ctx->submit_state.compl_reqs.first = pos;
	__io_submit_flush_completions(ctx);
	return nr_events;
}

void io_rw_cache_free(const void *entry)
{
	struct io_async_rw *rw = (struct io_async_rw *) entry;

	io_vec_free(&rw->vec);
	kfree(rw);
}