// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
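/*
 * For orientation, a minimal userspace sketch of the syscall handled here
 * (illustrative only: raw syscall(2) usage, error handling elided, ring_fd
 * obtained from io_uring_setup(2); real applications typically go through
 * liburing instead):
 *
 *	struct io_uring_probe *probe;
 *	int ret;
 *
 *	probe = calloc(1, sizeof(*probe) +
 *			  256 * sizeof(struct io_uring_probe_op));
 *	ret = syscall(__NR_io_uring_register, ring_fd,
 *		      IORING_REGISTER_PROBE, probe, 256);
 *
 * On success, probe->ops[i].flags & IO_URING_OP_SUPPORTED tells whether
 * opcode i is implemented by the running kernel; see io_probe() below.
 */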
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "filetable.h"
#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"
#include "query.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = memdup_user(arg, size);
	if (IS_ERR(p))
		return PTR_ERR(p);
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			      XA_LIMIT(0, USHRT_MAX), &ctx->pers_next,
			      GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			goto err;
		}
	}

	ret = 0;

err:
	kfree(res);
	return ret;
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;
	return ret;
}
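
/*
 * Illustrative userspace flow for the restriction helpers above together
 * with io_register_enable_rings() below (a sketch: error handling elided,
 * ring_fd assumed to come from io_uring_setup(2) called with
 * IORING_SETUP_R_DISABLED set in the params flags):
 *
 *	struct io_uring_restriction res[2] = {
 *		{ .opcode = IORING_RESTRICTION_SQE_OP,
 *		  .sqe_op = IORING_OP_READ },
 *		{ .opcode = IORING_RESTRICTION_REGISTER_OP,
 *		  .register_op = IORING_REGISTER_BUFFERS },
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_RESTRICTIONS, res, 2);
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 *
 * Once the rings are enabled, only IORING_OP_READ submissions and
 * IORING_REGISTER_BUFFERS registrations are permitted on this ring.
 */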

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;
	struct io_uring_sqe *sq_sqes;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
};

static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_uring_params *p,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx, &r->sq_region);
	io_free_region(ctx, &r->ring_region);
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)
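
/*
 * For example, swap_old(ctx, o, n, ring_region) below expands to:
 *
 *	do {
 *		(o).ring_region = (ctx)->ring_region;
 *		(ctx)->ring_region = (n).ring_region;
 *	} while (0)
 *
 * i.e. the ctx's current value is parked in the old state 'o' (so it can be
 * freed after the swap) while the newly built value from 'n' is installed.
 */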

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
			 IORING_SETUP_CQE_MIXED)

static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	size_t size, sq_array_offset;
	unsigned i, tail, old_head;
	struct io_uring_params p;
	int ret;

	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(&p, arg, sizeof(p)))
		return -EFAULT;
	if (p.flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p.flags |= (ctx->flags & COPY_FLAGS);

	ret = io_uring_fill_params(p.sq_entries, &p);
	if (unlikely(ret))
		return ret;

	/* nothing to do, but copy params back */
	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
		if (copy_to_user(arg, &p, sizeof(p)))
			return -EFAULT;
		return 0;
	}

	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
			  &sq_array_offset);
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);

	if (copy_to_user(arg, &p, sizeof(p))) {
		io_register_free_rings(ctx, &p, &n);
		return -EFAULT;
	}

	if (p.flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
	if (size == SIZE_MAX) {
		io_register_free_rings(ctx, &p, &n);
		return -EOVERFLOW;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p.sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p.sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p.cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p.cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);

	ctx->sq_entries = p.sq_entries;
	ctx->cq_entries = p.cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	io_register_free_rings(ctx, &p, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}

static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd,
					 IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx, &ctx->param_region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
		ctx->cq_wait_size = rd.size;
	}
	return 0;
}

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx_ifq(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	case IORING_REGISTER_QUERY:
		ret = io_query(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_ZCRX_REFILL:
		ret = io_zcrx_return_bufs(ctx, arg, nr_args);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}
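
/*
 * Typical caller pattern (a sketch; see SYSCALL_DEFINE4(io_uring_register)
 * below for the in-tree user):
 *
 *	file = io_uring_register_get_file(fd, use_registered_ring);
 *	if (IS_ERR(file))
 *		return PTR_ERR(file);
 *	ctx = file->private_data;
 *	...
 *	fput(file);
 */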

static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
{
	struct io_uring_sqe sqe;

	if (!arg || nr_args != 1)
		return -EINVAL;
	if (copy_from_user(&sqe, arg, sizeof(sqe)))
		return -EFAULT;
	/* no flags supported */
	if (sqe.flags)
		return -EINVAL;
	if (sqe.opcode != IORING_OP_MSG_RING)
		return -EINVAL;

	return io_uring_sync_msg_ring(&sqe);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING:
		return io_uring_register_send_msg_ring(arg, nr_args);
	case IORING_REGISTER_QUERY:
		return io_query(NULL, arg, nr_args);
	}
	return -EINVAL;
}
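
/*
 * Illustrative userspace call for a "blind" opcode (a sketch, error handling
 * elided; target_ring_fd is assumed to be some other io_uring instance the
 * caller wants to signal without owning a ring itself):
 *
 *	struct io_uring_sqe sqe = {
 *		.opcode = IORING_OP_MSG_RING,
 *		.fd = target_ring_fd,
 *	};
 *
 *	syscall(__NR_io_uring_register, -1,
 *		IORING_REGISTER_SEND_MSG_RING, &sqe, 1);
 */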

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}