#ifndef IOU_CORE_H
#define IOU_CORE_H

#include <linux/errno.h>
#include <linux/lockdep.h>
#include <linux/resume_user_mode.h>
#include <linux/kasan.h>
#include <linux/poll.h>
#include <linux/io_uring_types.h>
#include <uapi/linux/eventpoll.h>
#include "alloc_cache.h"
#include "io-wq.h"
#include "slist.h"
#include "opdef.h"

#ifndef CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
#endif

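/* IORING_FEAT_* bits this kernel supports, reported via io_uring_params->features */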
#define IORING_FEAT_FLAGS	(IORING_FEAT_SINGLE_MMAP |\
				 IORING_FEAT_NODROP |\
				 IORING_FEAT_SUBMIT_STABLE |\
				 IORING_FEAT_RW_CUR_POS |\
				 IORING_FEAT_CUR_PERSONALITY |\
				 IORING_FEAT_FAST_POLL |\
				 IORING_FEAT_POLL_32BITS |\
				 IORING_FEAT_SQPOLL_NONFIXED |\
				 IORING_FEAT_EXT_ARG |\
				 IORING_FEAT_NATIVE_WORKERS |\
				 IORING_FEAT_RSRC_TAGS |\
				 IORING_FEAT_CQE_SKIP |\
				 IORING_FEAT_LINKED_FILE |\
				 IORING_FEAT_REG_REG_RING |\
				 IORING_FEAT_RECVSEND_BUNDLE |\
				 IORING_FEAT_MIN_TIMEOUT |\
				 IORING_FEAT_RW_ATTR |\
				 IORING_FEAT_NO_IOWAIT)

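/* Ring setup flags accepted by io_uring_setup(2) */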
#define IORING_SETUP_FLAGS	(IORING_SETUP_IOPOLL |\
				 IORING_SETUP_SQPOLL |\
				 IORING_SETUP_SQ_AFF |\
				 IORING_SETUP_CQSIZE |\
				 IORING_SETUP_CLAMP |\
				 IORING_SETUP_ATTACH_WQ |\
				 IORING_SETUP_R_DISABLED |\
				 IORING_SETUP_SUBMIT_ALL |\
				 IORING_SETUP_COOP_TASKRUN |\
				 IORING_SETUP_TASKRUN_FLAG |\
				 IORING_SETUP_SQE128 |\
				 IORING_SETUP_CQE32 |\
				 IORING_SETUP_SINGLE_ISSUER |\
				 IORING_SETUP_DEFER_TASKRUN |\
				 IORING_SETUP_NO_MMAP |\
				 IORING_SETUP_REGISTERED_FD_ONLY |\
				 IORING_SETUP_NO_SQARRAY |\
				 IORING_SETUP_HYBRID_IOPOLL |\
				 IORING_SETUP_CQE_MIXED)

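/* Flags accepted by io_uring_enter(2) */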
#define IORING_ENTER_FLAGS	(IORING_ENTER_GETEVENTS |\
				 IORING_ENTER_SQ_WAKEUP |\
				 IORING_ENTER_SQ_WAIT |\
				 IORING_ENTER_EXT_ARG |\
				 IORING_ENTER_REGISTERED_RING |\
				 IORING_ENTER_ABS_TIMER |\
				 IORING_ENTER_EXT_ARG_REG |\
				 IORING_ENTER_NO_IOWAIT)


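/* Flags that may be set in sqe->flags for an individual submission */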
#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE |\
			 IOSQE_IO_DRAIN |\
			 IOSQE_IO_LINK |\
			 IOSQE_IO_HARDLINK |\
			 IOSQE_ASYNC |\
			 IOSQE_BUFFER_SELECT |\
			 IOSQE_CQE_SKIP_SUCCESS)

enum {
	IOU_COMPLETE		= 0,

	IOU_ISSUE_SKIP_COMPLETE	= -EIOCBQUEUED,

	/*
	 * The request has more work to do and should be retried. io_uring will
	 * attempt to wait on the file for eligible opcodes, but otherwise
	 * it'll be handed to iowq for blocking execution. It works for normal
	 * requests as well as for multishot mode.
	 */
	IOU_RETRY		= -EAGAIN,

	/*
	 * Requeue the task_work to restart operations on this request. The
	 * actual value isn't important, it should just not be an otherwise
	 * valid error code, yet less than -MAX_ERRNO and valid internally.
	 */
	IOU_REQUEUE		= -3072,
};

struct io_wait_queue {
	struct wait_queue_entry wq;
	struct io_ring_ctx *ctx;
	unsigned cq_tail;
	unsigned cq_min_tail;
	unsigned nr_timeouts;
	int hit_timeout;
	ktime_t min_timeout;
	ktime_t timeout;
	struct hrtimer t;

#ifdef CONFIG_NET_RX_BUSY_POLL
	ktime_t napi_busy_poll_dt;
	bool napi_prefer_busy_poll;
#endif
};

static inline bool io_should_wake(struct io_wait_queue *iowq)
{
	struct io_ring_ctx *ctx = iowq->ctx;
	int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;

	/*
	 * Wake up if we have enough events, or if a timeout occurred since we
	 * started waiting. For timeouts, we always want to return to userspace,
	 * regardless of event count.
	 */
	return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
}

#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)

unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
			 unsigned int cq_entries, size_t *sq_offset);
int io_uring_fill_params(unsigned entries, struct io_uring_params *p);
bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32);
int io_run_task_work_sig(struct io_ring_ctx *ctx);
void io_req_defer_failed(struct io_kiocb *req, s32 res);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags);
bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe src_cqe[2]);
void __io_commit_cqring_flush(struct io_ring_ctx *ctx);

void io_req_track_inflight(struct io_kiocb *req);
struct file *io_file_get_normal(struct io_kiocb *req, int fd);
struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
			       unsigned issue_flags);

void __io_req_task_work_add(struct io_kiocb *req, unsigned flags);
void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags);
void io_req_task_queue(struct io_kiocb *req);
void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw);
void io_req_task_queue_fail(struct io_kiocb *req, int ret);
void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw);
struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries);
struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count);
void tctx_task_work(struct callback_head *cb);
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);

int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
				int start, int end);
void io_req_queue_iowq(struct io_kiocb *req);

int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw);
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
void __io_submit_flush_completions(struct io_ring_ctx *ctx);

struct io_wq_work *io_wq_free_work(struct io_wq_work *work);
void io_wq_submit_work(struct io_wq_work *work);

void io_free_req(struct io_kiocb *req);
void io_queue_next(struct io_kiocb *req);
void io_task_refs_refill(struct io_uring_task *tctx);
bool __io_alloc_req_refill(struct io_ring_ctx *ctx);

bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx,
			bool cancel_all);

void io_activate_pollwq(struct io_ring_ctx *ctx);

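/*
 * Sanity-check that the right lock is held for posting CQEs: IOPOLL and
 * DEFER_TASKRUN rings post under ->uring_lock, rings that don't complete
 * in task context post under ->completion_lock, and task-complete rings
 * must be running from the submitter task (unless the ring is dying).
 */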
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
{
#if defined(CONFIG_PROVE_LOCKING)
	lockdep_assert(in_task());

	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
		lockdep_assert_held(&ctx->uring_lock);

	if (ctx->flags & IORING_SETUP_IOPOLL) {
		lockdep_assert_held(&ctx->uring_lock);
	} else if (!ctx->task_complete) {
		lockdep_assert_held(&ctx->completion_lock);
	} else if (ctx->submitter_task) {
		/*
		 * ->submitter_task may be NULL and we can still post a CQE,
		 * if the ring has been set up with IORING_SETUP_R_DISABLED.
		 * Not from an SQE, as those cannot be submitted, but via
		 * updating tagged resources.
		 */
		if (!percpu_ref_is_dying(&ctx->refs))
			lockdep_assert(current == ctx->submitter_task);
	}
#endif
}

static inline bool io_is_compat(struct io_ring_ctx *ctx)
{
	return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->compat);
}

static inline void io_req_task_work_add(struct io_kiocb *req)
{
	__io_req_task_work_add(req, 0);
}

static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
{
	if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
	    ctx->submit_state.cq_flush)
		__io_submit_flush_completions(ctx);
}

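/* Iterate a request and every request linked to it via ->link */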
#define io_for_each_link(pos, head) \
	for (pos = (head); pos; pos = pos->link)

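/*
 * CQEs are handed out from a cached window into the CQ ring,
 * [cqe_cached, cqe_sentinel), refilled by io_cqe_cache_refill(). A 32-byte
 * CQE consumes two entries of that window (and advances the CQ tail by
 * two on CQE_MIXED rings).
 */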
static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx,
				       struct io_uring_cqe **ret,
				       bool overflow, bool cqe32)
{
	io_lockdep_assert_cq_locked(ctx);

	if (unlikely(ctx->cqe_sentinel - ctx->cqe_cached < (cqe32 + 1))) {
		if (unlikely(!io_cqe_cache_refill(ctx, overflow, cqe32)))
			return false;
	}
	*ret = ctx->cqe_cached;
	ctx->cached_cq_tail++;
	ctx->cqe_cached++;
	if (ctx->flags & IORING_SETUP_CQE32) {
		ctx->cqe_cached++;
	} else if (cqe32 && ctx->flags & IORING_SETUP_CQE_MIXED) {
		ctx->cqe_cached++;
		ctx->cached_cq_tail++;
	}
	WARN_ON_ONCE(ctx->cqe_cached > ctx->cqe_sentinel);
	return true;
}

static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret,
			      bool cqe32)
{
	return io_get_cqe_overflow(ctx, ret, false, cqe32);
}

static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx,
					       struct io_uring_cqe **cqe_ret)
{
	io_lockdep_assert_cq_locked(ctx);

	ctx->submit_state.cq_flush = true;
	return io_get_cqe(ctx, cqe_ret, ctx->flags & IORING_SETUP_CQE_MIXED);
}

static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
					    struct io_kiocb *req)
{
	bool is_cqe32 = req->cqe.flags & IORING_CQE_F_32;
	struct io_uring_cqe *cqe;

	/*
	 * If we can't get a cq entry, userspace overflowed the submission
	 * (by quite a lot).
	 */
	if (unlikely(!io_get_cqe(ctx, &cqe, is_cqe32)))
		return false;

	memcpy(cqe, &req->cqe, sizeof(*cqe));
	if (ctx->flags & IORING_SETUP_CQE32 || is_cqe32) {
		memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe));
		memset(&req->big_cqe, 0, sizeof(req->big_cqe));
	}

	if (trace_io_uring_complete_enabled())
		trace_io_uring_complete(req->ctx, req, cqe);
	return true;
}

static inline void req_set_fail(struct io_kiocb *req)
{
	req->flags |= REQ_F_FAIL;
	if (req->flags & REQ_F_CQE_SKIP) {
		req->flags &= ~REQ_F_CQE_SKIP;
		req->flags |= REQ_F_SKIP_LINK_CQES;
	}
}

static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags)
{
	req->cqe.res = res;
	req->cqe.flags = cflags;
}

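/*
 * On CQE_MIXED rings, 32-byte completions must be tagged with
 * IORING_CQE_F_32 so userspace can tell how large each entry is; plain
 * CQE32 rings need no tag as every entry is 32 bytes.
 */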
static inline u32 ctx_cqe32_flags(struct io_ring_ctx *ctx)
{
	if (ctx->flags & IORING_SETUP_CQE_MIXED)
		return IORING_CQE_F_32;
	return 0;
}

static inline void io_req_set_res32(struct io_kiocb *req, s32 res, u32 cflags,
				    __u64 extra1, __u64 extra2)
{
	req->cqe.res = res;
	req->cqe.flags = cflags | ctx_cqe32_flags(req->ctx);
	req->big_cqe.extra1 = extra1;
	req->big_cqe.extra2 = extra2;
}

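/*
 * Allocate the per-opcode async data, from the supplied cache if one is
 * given, otherwise via kmalloc() of the opcode's async_size. On success
 * the request is marked with REQ_F_ASYNC_DATA.
 */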
static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache,
					      struct io_kiocb *req)
{
	if (cache) {
		req->async_data = io_cache_alloc(cache, GFP_KERNEL);
	} else {
		const struct io_issue_def *def = &io_issue_defs[req->opcode];

		WARN_ON_ONCE(!def->async_size);
		req->async_data = kmalloc(def->async_size, GFP_KERNEL);
	}
	if (req->async_data)
		req->flags |= REQ_F_ASYNC_DATA;
	return req->async_data;
}

static inline bool req_has_async_data(struct io_kiocb *req)
{
	return req->flags & REQ_F_ASYNC_DATA;
}

static inline void io_req_async_data_clear(struct io_kiocb *req,
					    io_req_flags_t extra_flags)
{
	req->flags &= ~(REQ_F_ASYNC_DATA|extra_flags);
	req->async_data = NULL;
}

static inline void io_req_async_data_free(struct io_kiocb *req)
{
	kfree(req->async_data);
	io_req_async_data_clear(req, 0);
}

static inline void io_put_file(struct io_kiocb *req)
{
	if (!(req->flags & REQ_F_FIXED_FILE) && req->file)
		fput(req->file);
}

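/*
 * Conditionally take/drop ctx->uring_lock based on the issue context:
 * inline submission, SQPOLL and task_work already hold the lock, while
 * io-wq workers issue with IO_URING_F_UNLOCKED set. An illustrative
 * pattern in an opcode handler:
 *
 *	io_ring_submit_lock(ctx, issue_flags);
 *	... touch state protected by ->uring_lock ...
 *	io_ring_submit_unlock(ctx, issue_flags);
 */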
static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx,
					 unsigned issue_flags)
{
	lockdep_assert_held(&ctx->uring_lock);
	if (unlikely(issue_flags & IO_URING_F_UNLOCKED))
		mutex_unlock(&ctx->uring_lock);
}

static inline void io_ring_submit_lock(struct io_ring_ctx *ctx,
				       unsigned issue_flags)
{
	/*
	 * "Normal" inline submissions always hold the uring_lock, since we
	 * grab it from the system call. Same is true for the SQPOLL offload.
	 * The only exception is when we've detached the request and issue it
	 * from an async worker thread; grab the lock in that case.
	 */
	if (unlikely(issue_flags & IO_URING_F_UNLOCKED))
		mutex_lock(&ctx->uring_lock);
	lockdep_assert_held(&ctx->uring_lock);
}

static inline void io_commit_cqring(struct io_ring_ctx *ctx)
{
	/* order cqe stores with ring update */
	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}

static inline void __io_wq_wake(struct wait_queue_head *wq)
{
	/*
	 * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. The latter
	 * is set in the mask so that if we recurse back into our own poll
	 * waitqueue handlers, we know we have a dependency between eventfd or
	 * epoll and should terminate multishot poll at that point.
	 */
	if (wq_has_sleeper(wq))
		__wake_up(wq, TASK_NORMAL, 0, poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
}

static inline void io_poll_wq_wake(struct io_ring_ctx *ctx)
{
	__io_wq_wake(&ctx->poll_wq);
}

static inline void io_cqring_wake(struct io_ring_ctx *ctx)
{
	/*
	 * Trigger waitqueue handler on all waiters on our waitqueue. This
	 * won't necessarily wake up all the tasks, io_should_wake() will make
	 * that decision.
	 */
	__io_wq_wake(&ctx->cq_wait);
}

static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
	struct io_rings *r = ctx->rings;

	/*
	 * SQPOLL must use the actual sqring head, as using the cached_sq_head
	 * is race prone if the SQPOLL thread has grabbed entries but not yet
	 * committed them to the ring. For !SQPOLL, this doesn't matter, but
	 * since this helper is just used for SQPOLL sqring waits (or POLLOUT),
	 * just read the actual sqring head unconditionally.
	 */
	return READ_ONCE(r->sq.tail) - READ_ONCE(r->sq.head) == ctx->sq_entries;
}

static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;
	unsigned int entries;

	/* make sure SQ entry isn't read before tail */
	entries = smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
	return min(entries, ctx->sq_entries);
}

static inline int io_run_task_work(void)
{
	bool ret = false;

	/*
	 * Always check-and-clear the task_work notification signal. With how
	 * signaling works for task_work, we can find it set with nothing to
	 * run. We need to clear it for that case, like get_signal() does.
	 */
	if (test_thread_flag(TIF_NOTIFY_SIGNAL))
		clear_notify_signal();
	/*
	 * PF_IO_WORKER never returns to userspace, so check here if we have
	 * notify work that needs processing.
	 */
	if (current->flags & PF_IO_WORKER) {
		if (test_thread_flag(TIF_NOTIFY_RESUME)) {
			__set_current_state(TASK_RUNNING);
			resume_user_mode_work(NULL);
		}
		if (current->io_uring) {
			unsigned int count = 0;

			__set_current_state(TASK_RUNNING);
			tctx_task_work_run(current->io_uring, UINT_MAX, &count);
			if (count)
				ret = true;
		}
	}
	if (task_work_pending(current)) {
		__set_current_state(TASK_RUNNING);
		task_work_run();
		ret = true;
	}

	return ret;
}

static inline bool io_local_work_pending(struct io_ring_ctx *ctx)
{
	return !llist_empty(&ctx->work_llist) || !llist_empty(&ctx->retry_llist);
}

static inline bool io_task_work_pending(struct io_ring_ctx *ctx)
{
	return task_work_pending(current) || io_local_work_pending(ctx);
}

static inline void io_tw_lock(struct io_ring_ctx *ctx, io_tw_token_t tw)
{
	lockdep_assert_held(&ctx->uring_lock);
}

/*
 * Don't complete immediately but use deferred completion infrastructure.
 * Protected by ->uring_lock and can only be used either with
 * IO_URING_F_COMPLETE_DEFER or inside a tw handler holding the mutex.
 */
static inline void io_req_complete_defer(struct io_kiocb *req)
	__must_hold(&req->ctx->uring_lock)
{
	struct io_submit_state *state = &req->ctx->submit_state;

	lockdep_assert_held(&req->ctx->uring_lock);

	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
}

static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
	if (unlikely(ctx->off_timeout_used ||
		     ctx->has_evfd || ctx->poll_activated))
		__io_commit_cqring_flush(ctx);
}

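/*
 * Charge nr task references for new requests against the per-task cache,
 * refilling the cache when it runs dry.
 */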
static inline void io_get_task_refs(int nr)
{
	struct io_uring_task *tctx = current->io_uring;

	tctx->cached_refs -= nr;
	if (unlikely(tctx->cached_refs < 0))
		io_task_refs_refill(tctx);
}

static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
{
	return !ctx->submit_state.free_list.next;
}

extern struct kmem_cache *req_cachep;

static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx)
{
	struct io_kiocb *req;

	req = container_of(ctx->submit_state.free_list.next, struct io_kiocb, comp_list);
	wq_stack_extract(&ctx->submit_state.free_list);
	return req;
}

static inline bool io_alloc_req(struct io_ring_ctx *ctx, struct io_kiocb **req)
{
	if (unlikely(io_req_cache_empty(ctx))) {
		if (!__io_alloc_req_refill(ctx))
			return false;
	}
	*req = io_extract_req(ctx);
	return true;
}

static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx)
{
	return likely(ctx->submitter_task == current);
}

static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx)
{
	return likely(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN) ||
		      ctx->submitter_task == current);
}

/*
 * Terminate the request if any of these conditions are true:
 *
 * 1) It's being executed by the original task, but that task is marked
 *    with PF_EXITING as it's exiting.
 * 2) PF_KTHREAD is set, in which case the invoker of the task_work is
 *    our fallback task_work.
 * 3) The ring itself is going away, i.e. its reference count has been
 *    marked as dying.
 */
static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx)
{
	return (current->flags & (PF_KTHREAD | PF_EXITING)) || percpu_ref_is_dying(&ctx->refs);
}

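/* Set the request's result and queue it for completion via task_work */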
static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res)
{
	io_req_set_res(req, res, 0);
	req->io_task_work.func = io_req_task_complete;
	io_req_task_work_add(req);
}

/*
 * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each
 * slot.
 */
static inline size_t uring_sqe_size(struct io_ring_ctx *ctx)
{
	if (ctx->flags & IORING_SETUP_SQE128)
		return 2 * sizeof(struct io_uring_sqe);
	return sizeof(struct io_uring_sqe);
}

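/*
 * Check whether the request's file supports polling, caching a positive
 * result in REQ_F_CAN_POLL so file_can_poll() is only consulted once.
 */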
static inline bool io_file_can_poll(struct io_kiocb *req)
{
	if (req->flags & REQ_F_CAN_POLL)
		return true;
	if (req->file && file_can_poll(req->file)) {
		req->flags |= REQ_F_CAN_POLL;
		return true;
	}
	return false;
}

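/* Read the current time from the clock source this ring is configured to use */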
static inline ktime_t io_get_time(struct io_ring_ctx *ctx)
{
	if (ctx->clockid == CLOCK_MONOTONIC)
		return ktime_get();

	return ktime_get_with_offset(ctx->clock_offset);
}

enum {
	IO_CHECK_CQ_OVERFLOW_BIT,
	IO_CHECK_CQ_DROPPED_BIT,
};

static inline bool io_has_work(struct io_ring_ctx *ctx)
{
	return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) ||
		io_local_work_pending(ctx);
}
#endif