#ifndef IO_URING_TYPES_H
#define IO_URING_TYPES_H

#include <linux/blkdev.h>
#include <linux/hashtable.h>
#include <linux/task_work.h>
#include <linux/bitmap.h>
#include <linux/llist.h>
#include <uapi/linux/io_uring.h>

enum {
	/*
	 * A hint to not wake right away but delay until there are enough
	 * task_work items queued to match the number of CQEs the task is
	 * waiting for.
	 *
	 * Must not be used with requests generating more than one CQE.
	 * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set.
	 */
	IOU_F_TWQ_LAZY_WAKE			= 1,
};

enum io_uring_cmd_flags {
	IO_URING_F_COMPLETE_DEFER	= 1,
	IO_URING_F_UNLOCKED		= 2,
	/* the request is executed from poll, it should not be freed */
	IO_URING_F_MULTISHOT		= 4,
	/* executed by io-wq */
	IO_URING_F_IOWQ			= 8,
	/* executed inline from syscall */
	IO_URING_F_INLINE		= 16,
	/* int's last bit, sign checks are usually faster than a bit test */
	IO_URING_F_NONBLOCK		= INT_MIN,

	/* ctx state flags, for URING_CMD */
	IO_URING_F_SQE128		= (1 << 8),
	IO_URING_F_CQE32		= (1 << 9),
	IO_URING_F_IOPOLL		= (1 << 10),

	/* set when uring wants to cancel a previously issued command */
	IO_URING_F_CANCEL		= (1 << 11),
	IO_URING_F_COMPAT		= (1 << 12),
	IO_URING_F_TASK_DEAD		= (1 << 13),
};
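
/*
 * Illustrative sketch, not an in-tree handler: opcode issue functions receive
 * the flags above as "issue_flags" and typically key off IO_URING_F_NONBLOCK
 * to decide whether they may block. The function name is hypothetical.
 */
static inline int example_issue_would_block(unsigned int issue_flags)
{
	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;	/* punted and retried later, e.g. from io-wq */
	return 0;
}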

struct io_wq_work_node {
	struct io_wq_work_node *next;
};

struct io_wq_work_list {
	struct io_wq_work_node *first;
	struct io_wq_work_node *last;
};
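
/*
 * Illustrative sketch with a hypothetical helper name (the real list
 * primitives live in io_uring/slist.h): appending a node to the singly
 * linked, head/tail-tracked work list above.
 */
static inline void example_wq_list_append(struct io_wq_work_list *list,
					  struct io_wq_work_node *node)
{
	node->next = NULL;
	if (list->last)
		list->last->next = node;
	else
		list->first = node;
	list->last = node;
}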

struct io_wq_work {
	struct io_wq_work_node list;
	atomic_t flags;
	/* place it here instead of io_kiocb as it fills padding and saves 4B */
	int cancel_seq;
};

struct io_rsrc_data {
	unsigned int			nr;
	struct io_rsrc_node		**nodes;
};

struct io_file_table {
	struct io_rsrc_data data;
	unsigned long *bitmap;
	unsigned int alloc_hint;
};
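
/*
 * Illustrative sketch, hypothetical helper: @bitmap tracks which fixed file
 * slots are occupied and @alloc_hint is where the next search starts, so
 * finding a free slot is a plain bitmap scan over data.nr slots.
 */
static inline int example_find_free_file_slot(struct io_file_table *table)
{
	unsigned long slot = find_next_zero_bit(table->bitmap, table->data.nr,
						table->alloc_hint);

	if (slot >= table->data.nr)
		slot = find_first_zero_bit(table->bitmap, table->data.nr);
	return slot < table->data.nr ? (int)slot : -ENFILE;
}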

struct io_hash_bucket {
	struct hlist_head	list;
} ____cacheline_aligned_in_smp;

struct io_hash_table {
	struct io_hash_bucket	*hbs;
	unsigned		hash_bits;
};
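
/*
 * Illustrative sketch, hypothetical helper: hbs points at 1 << hash_bits
 * buckets, so a lookup key (e.g. a request's user_data) maps to a bucket via
 * an ordinary hash such as hash_long() from <linux/hash.h>.
 */
static inline struct io_hash_bucket *
example_hash_bucket(struct io_hash_table *table, unsigned long key)
{
	return &table->hbs[hash_long(key, table->hash_bits)];
}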

struct io_mapped_region {
	struct page		**pages;
	void			*ptr;
	unsigned		nr_pages;
	unsigned		flags;
};

/*
 * Return value from io_buffer_list selection, to avoid stashing it in
 * struct io_kiocb. For legacy/classic provided buffers, keeping a reference
 * across execution contexts is fine. But for ring provided buffers, the
 * list may go away as soon as ->uring_lock is dropped. As the io_kiocb
 * persists, it's better to just keep the buffer local for those cases.
 */
struct io_br_sel {
	struct io_buffer_list *buf_list;
	/*
	 * Some selection parts return the user address, others return an error.
	 */
	union {
		void __user *addr;
		ssize_t val;
	};
};

/*
 * Arbitrary limit, can be raised if need be
 */
#define IO_RINGFD_REG_MAX 16

struct io_uring_task {
	/* submission side */
	int				cached_refs;
	const struct io_ring_ctx	*last;
	struct task_struct		*task;
	struct io_wq			*io_wq;
	struct file			*registered_rings[IO_RINGFD_REG_MAX];

	struct xarray			xa;
	struct wait_queue_head		wait;
	atomic_t			in_cancel;
	atomic_t			inflight_tracked;
	struct percpu_counter		inflight;

	struct { /* task_work */
		struct llist_head	task_list;
		struct callback_head	task_work;
	} ____cacheline_aligned_in_smp;
};

struct iou_vec {
	union {
		struct iovec	*iovec;
		struct bio_vec	*bvec;
	};
	unsigned		nr; /* number of struct iovec it can hold */
};

struct io_uring {
	u32 head;
	u32 tail;
};

/*
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 */
struct io_rings {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The kernel controls the head of the sq ring and the tail of the
	 * cq ring, and the application controls the tail of the sq ring and
	 * the head of the cq ring.
	 */
	struct io_uring		sq, cq;
	/*
	 * Bitmasks to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32			sq_ring_mask, cq_ring_mask;
	/* Ring sizes (constant, power of 2) */
	u32			sq_ring_entries, cq_ring_entries;
	/*
	 * Number of invalid entries dropped by the kernel due to an
	 * invalid index stored in the array.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * After a new SQ head value was read by the application this
	 * counter includes all submissions that were dropped reaching
	 * the new SQ head (and possibly more).
	 */
	u32			sq_dropped;
	/*
	 * Runtime SQ flags
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application.
	 *
	 * The application needs a full memory barrier before checking
	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
	 */
	atomic_t		sq_flags;
	/*
	 * Runtime CQ flags
	 *
	 * Written by the application, shouldn't be modified by the
	 * kernel.
	 */
	u32			cq_flags;
	/*
	 * Number of completion events lost because the queue was full;
	 * this should be avoided by the application by making sure
	 * there are not more requests pending than there is space in
	 * the completion queue.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * As completion events come in out of order this counter is not
	 * ordered with any other data.
	 */
	u32			cq_overflow;
	/*
	 * Ring buffer of completion events.
	 *
	 * The kernel writes completion events fresh every time they are
	 * produced, so the application is allowed to modify pending
	 * entries.
	 */
	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
};
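
/*
 * Illustrative sketch of how the shared CQ ring above is consumed. Real
 * consumers live in userspace (e.g. liburing); this hypothetical helper only
 * shows the head/tail/mask arithmetic: the kernel advances cq.tail as it
 * posts CQEs, the application advances cq.head as it reads them.
 */
static inline unsigned example_drain_cq(struct io_rings *rings)
{
	unsigned head = READ_ONCE(rings->cq.head);
	unsigned seen = 0;

	/* acquire pairs with the kernel's release store of the new tail */
	while (head != smp_load_acquire(&rings->cq.tail)) {
		struct io_uring_cqe *cqe =
			&rings->cqes[head & rings->cq_ring_mask];

		/* cqe->user_data and cqe->res would be consumed here */
		(void)cqe;
		head++;
		seen++;
	}
	/* publish the new head so the kernel can reuse the slots */
	smp_store_release(&rings->cq.head, head);
	return seen;
}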

struct io_restriction {
	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
	u8 sqe_flags_allowed;
	u8 sqe_flags_required;
	bool registered;
};
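
/*
 * Illustrative sketch, hypothetical helper: once restrictions are registered,
 * an SQE is only accepted if its opcode bit is set and its flags satisfy the
 * allowed/required masks.
 */
static inline bool example_sqe_allowed(const struct io_restriction *r,
				       u8 opcode, u8 sqe_flags)
{
	if (!r->registered)
		return true;
	if (!test_bit(opcode, r->sqe_op))
		return false;
	if (sqe_flags & ~r->sqe_flags_allowed)
		return false;
	return (sqe_flags & r->sqe_flags_required) == r->sqe_flags_required;
}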

struct io_submit_link {
	struct io_kiocb		*head;
	struct io_kiocb		*last;
};

struct io_submit_state {
	/* inline/task_work completion list, under ->uring_lock */
	struct io_wq_work_node	free_list;
	/* batch completion logic */
	struct io_wq_work_list	compl_reqs;
	struct io_submit_link	link;

	bool			plug_started;
	bool			need_plug;
	bool			cq_flush;
	unsigned short		submit_nr;
	struct blk_plug		plug;
};

struct io_alloc_cache {
	void			**entries;
	unsigned int		nr_cached;
	unsigned int		max_cached;
	unsigned int		elem_size;
	unsigned int		init_clear;
};
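
/*
 * Illustrative sketch, hypothetical helper: the cache is a small LIFO array
 * of recycled objects of elem_size bytes; when it is empty the caller falls
 * back to a fresh allocation, and frees beyond max_cached are not kept.
 */
static inline void *example_cache_get(struct io_alloc_cache *cache)
{
	if (cache->nr_cached)
		return cache->entries[--cache->nr_cached];
	return NULL;	/* caller allocates a new object instead */
}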

struct io_ring_ctx {
	/* const or read-mostly hot data */
	struct {
		unsigned int		flags;
		unsigned int		drain_next: 1;
		unsigned int		restricted: 1;
		unsigned int		off_timeout_used: 1;
		unsigned int		drain_active: 1;
		unsigned int		has_evfd: 1;
		/* all CQEs should be posted only by the submitter task */
		unsigned int		task_complete: 1;
		unsigned int		lockless_cq: 1;
		unsigned int		syscall_iopoll: 1;
		unsigned int		poll_activated: 1;
		unsigned int		drain_disabled: 1;
		unsigned int		compat: 1;
		unsigned int		iowq_limits_set: 1;

		struct task_struct	*submitter_task;
		struct io_rings		*rings;
		struct percpu_ref	refs;

		clockid_t		clockid;
		enum tk_offsets		clock_offset;

		enum task_work_notify_mode	notify_method;
		unsigned			sq_thread_idle;
	} ____cacheline_aligned_in_smp;

	/* submission data */
	struct {
		struct mutex		uring_lock;

		/*
		 * Ring buffer of indices into array of io_uring_sqe, which is
		 * mmapped by the application using the IORING_OFF_SQES offset.
		 *
		 * This indirection could e.g. be used to assign fixed
		 * io_uring_sqe entries to operations and only submit them to
		 * the queue when needed.
		 *
		 * The kernel modifies neither the indices array nor the entries
		 * array.
		 */
		u32			*sq_array;
		struct io_uring_sqe	*sq_sqes;
		unsigned		cached_sq_head;
		unsigned		sq_entries;

		/*
		 * Fixed resources fast path, should be accessed only under
		 * uring_lock, and updated through io_uring_register(2)
		 */
		atomic_t		cancel_seq;

		/*
		 * ->iopoll_list is protected by the ctx->uring_lock for
		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
		 * For SQPOLL, only the single threaded io_sq_thread() will
		 * manipulate the list, hence no extra locking is needed there.
		 */
		bool			poll_multi_queue;
		struct io_wq_work_list	iopoll_list;

		struct io_file_table	file_table;
		struct io_rsrc_data	buf_table;
		struct io_alloc_cache	node_cache;
		struct io_alloc_cache	imu_cache;

		struct io_submit_state	submit_state;

		/*
		 * Modifications are protected by ->uring_lock and ->mmap_lock.
		 * The flags, buf_pages and buf_nr_pages fields should be stable
		 * once published.
		 */
		struct xarray		io_bl_xa;

		struct io_hash_table	cancel_table;
		struct io_alloc_cache	apoll_cache;
		struct io_alloc_cache	netmsg_cache;
		struct io_alloc_cache	rw_cache;
		struct io_alloc_cache	cmd_cache;

		/*
		 * Any cancelable uring_cmd is added to this list in
		 * ->uring_cmd() by io_uring_cmd_insert_cancelable()
		 */
		struct hlist_head	cancelable_uring_cmd;
		/*
		 * For hybrid IOPOLL: time spent actively polling, excluding
		 * scheduling time
		 */
		u64			hybrid_poll_time;
	} ____cacheline_aligned_in_smp;

	struct {
		/*
		 * We cache a range of free CQEs we can use, once exhausted it
		 * should go through a slower range setup, see __io_get_cqe()
		 */
		struct io_uring_cqe	*cqe_cached;
		struct io_uring_cqe	*cqe_sentinel;

		unsigned		cached_cq_tail;
		unsigned		cq_entries;
		struct io_ev_fd	__rcu	*io_ev_fd;

		void			*cq_wait_arg;
		size_t			cq_wait_size;
	} ____cacheline_aligned_in_smp;

	/*
	 * task_work and async notification delivery cacheline. Expected to
	 * regularly bounce b/w CPUs.
	 */
	struct {
		struct llist_head	work_llist;
		struct llist_head	retry_llist;
		unsigned long		check_cq;
		atomic_t		cq_wait_nr;
		atomic_t		cq_timeouts;
		struct wait_queue_head	cq_wait;
	} ____cacheline_aligned_in_smp;

	/* timeouts */
	struct {
		raw_spinlock_t		timeout_lock;
		struct list_head	timeout_list;
		struct list_head	ltimeout_list;
		unsigned		cq_last_tm_flush;
	} ____cacheline_aligned_in_smp;

	spinlock_t		completion_lock;

	struct list_head	cq_overflow_list;

	struct hlist_head	waitid_list;

#ifdef CONFIG_FUTEX
	struct hlist_head	futex_list;
	struct io_alloc_cache	futex_cache;
#endif

	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
	struct io_sq_data	*sq_data;	/* if using sq thread polling */

	struct wait_queue_head	sqo_sq_wait;
	struct list_head	sqd_list;

	unsigned int		file_alloc_start;
	unsigned int		file_alloc_end;

	/* Keep this last, we don't need it for the fast path */
	struct wait_queue_head		poll_wq;
	struct io_restriction		restrictions;

	/* Stores zcrx object pointers of type struct io_zcrx_ifq */
	struct xarray			zcrx_ctxs;

	u32			pers_next;
	struct xarray		personalities;

	/* hashed buffered write serialization */
	struct io_wq_hash		*hash_map;

	/* Only used for accounting purposes */
	struct user_struct		*user;
	struct mm_struct		*mm_account;

	/* ctx exit and cancelation */
	struct llist_head		fallback_llist;
	struct delayed_work		fallback_work;
	struct work_struct		exit_work;
	struct list_head		tctx_list;
	struct completion		ref_comp;

	/* io-wq management, e.g. thread count */
	u32				iowq_limits[2];

	struct callback_head		poll_wq_task_work;
	struct list_head		defer_list;
	unsigned			nr_drained;

#ifdef CONFIG_NET_RX_BUSY_POLL
	struct list_head	napi_list;	/* track busy poll napi_id */
	spinlock_t		napi_lock;	/* napi_list lock */

	/* napi busy poll default timeout */
	ktime_t			napi_busy_poll_dt;
	bool			napi_prefer_busy_poll;
	u8			napi_track_mode;

	DECLARE_HASHTABLE(napi_ht, 4);
#endif

	/* protected by ->completion_lock */
	unsigned			evfd_last_cq_tail;
	unsigned			nr_req_allocated;

	/*
	 * Protection for resize vs mmap races - both the mmap and resize
	 * side will need to grab this lock, to prevent either side from
	 * being run concurrently with the other.
	 */
	struct mutex			mmap_lock;

	struct io_mapped_region		sq_region;
	struct io_mapped_region		ring_region;
	/* used for optimised request parameter and wait argument passing */
	struct io_mapped_region		param_region;
};

/*
 * Token indicating function is called in task work context:
 * ctx->uring_lock is held and any completions generated will be flushed.
 * ONLY core io_uring.c should instantiate this struct.
 */
struct io_tw_state {
};
/* Alias to use in code that doesn't instantiate struct io_tw_state */
typedef struct io_tw_state io_tw_token_t;

enum {
	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,
	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT,
	REQ_F_LINK_BIT		= IOSQE_IO_LINK_BIT,
	REQ_F_HARDLINK_BIT	= IOSQE_IO_HARDLINK_BIT,
	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
	REQ_F_CQE_SKIP_BIT	= IOSQE_CQE_SKIP_SUCCESS_BIT,

	/* first byte is taken by user flags, shift it to not overlap */
	REQ_F_FAIL_BIT		= 8,
	REQ_F_INFLIGHT_BIT,
	REQ_F_CUR_POS_BIT,
	REQ_F_NOWAIT_BIT,
	REQ_F_LINK_TIMEOUT_BIT,
	REQ_F_NEED_CLEANUP_BIT,
	REQ_F_POLLED_BIT,
	REQ_F_HYBRID_IOPOLL_STATE_BIT,
	REQ_F_BUFFER_SELECTED_BIT,
	REQ_F_BUFFER_RING_BIT,
	REQ_F_REISSUE_BIT,
	REQ_F_CREDS_BIT,
	REQ_F_REFCOUNT_BIT,
	REQ_F_ARM_LTIMEOUT_BIT,
	REQ_F_ASYNC_DATA_BIT,
	REQ_F_SKIP_LINK_CQES_BIT,
	REQ_F_SINGLE_POLL_BIT,
	REQ_F_DOUBLE_POLL_BIT,
	REQ_F_MULTISHOT_BIT,
	REQ_F_APOLL_MULTISHOT_BIT,
	REQ_F_CLEAR_POLLIN_BIT,
	/* keep async read/write and isreg together and in order */
	REQ_F_SUPPORT_NOWAIT_BIT,
	REQ_F_ISREG_BIT,
	REQ_F_POLL_NO_LAZY_BIT,
	REQ_F_CAN_POLL_BIT,
	REQ_F_BL_EMPTY_BIT,
	REQ_F_BL_NO_RECYCLE_BIT,
	REQ_F_BUFFERS_COMMIT_BIT,
	REQ_F_BUF_NODE_BIT,
	REQ_F_HAS_METADATA_BIT,
	REQ_F_IMPORT_BUFFER_BIT,
	REQ_F_SQE_COPIED_BIT,

	/* not a real bit, just to check we're not overflowing the space */
	__REQ_F_LAST_BIT,
};

typedef u64 __bitwise io_req_flags_t;
#define IO_REQ_FLAG(bitno)	((__force io_req_flags_t) BIT_ULL((bitno)))
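
/*
 * Illustrative note: each REQ_F_* value below is just BIT_ULL() of its *_BIT
 * counterpart cast to io_req_flags_t, e.g. REQ_F_FAIL is bit 8, and the flags
 * are combined and tested with plain bitwise ops on io_kiocb->flags (declared
 * further down).
 */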

enum {
	/* ctx owns file */
	REQ_F_FIXED_FILE	= IO_REQ_FLAG(REQ_F_FIXED_FILE_BIT),
	/* drain existing IO first */
	REQ_F_IO_DRAIN		= IO_REQ_FLAG(REQ_F_IO_DRAIN_BIT),
	/* linked sqes */
	REQ_F_LINK		= IO_REQ_FLAG(REQ_F_LINK_BIT),
	/* doesn't sever on completion < 0 */
	REQ_F_HARDLINK		= IO_REQ_FLAG(REQ_F_HARDLINK_BIT),
	/* IOSQE_ASYNC */
	REQ_F_FORCE_ASYNC	= IO_REQ_FLAG(REQ_F_FORCE_ASYNC_BIT),
	/* IOSQE_BUFFER_SELECT */
	REQ_F_BUFFER_SELECT	= IO_REQ_FLAG(REQ_F_BUFFER_SELECT_BIT),
	/* IOSQE_CQE_SKIP_SUCCESS */
	REQ_F_CQE_SKIP		= IO_REQ_FLAG(REQ_F_CQE_SKIP_BIT),

	/* fail rest of links */
	REQ_F_FAIL		= IO_REQ_FLAG(REQ_F_FAIL_BIT),
	/* on inflight list, should be cancelled and waited on exit reliably */
	REQ_F_INFLIGHT		= IO_REQ_FLAG(REQ_F_INFLIGHT_BIT),
	/* read/write uses file position */
	REQ_F_CUR_POS		= IO_REQ_FLAG(REQ_F_CUR_POS_BIT),
	/* must not punt to workers */
	REQ_F_NOWAIT		= IO_REQ_FLAG(REQ_F_NOWAIT_BIT),
	/* has or had linked timeout */
	REQ_F_LINK_TIMEOUT	= IO_REQ_FLAG(REQ_F_LINK_TIMEOUT_BIT),
	/* needs cleanup */
	REQ_F_NEED_CLEANUP	= IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT),
	/* already went through poll handler */
	REQ_F_POLLED		= IO_REQ_FLAG(REQ_F_POLLED_BIT),
	/* every req only blocks once in hybrid poll */
	REQ_F_IOPOLL_STATE	= IO_REQ_FLAG(REQ_F_HYBRID_IOPOLL_STATE_BIT),
	/* buffer already selected */
	REQ_F_BUFFER_SELECTED	= IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT),
	/* buffer selected from ring, needs commit */
	REQ_F_BUFFER_RING	= IO_REQ_FLAG(REQ_F_BUFFER_RING_BIT),
	/* caller should reissue async */
	REQ_F_REISSUE		= IO_REQ_FLAG(REQ_F_REISSUE_BIT),
	/* supports async reads/writes */
	REQ_F_SUPPORT_NOWAIT	= IO_REQ_FLAG(REQ_F_SUPPORT_NOWAIT_BIT),
	/* regular file */
	REQ_F_ISREG		= IO_REQ_FLAG(REQ_F_ISREG_BIT),
	/* has creds assigned */
	REQ_F_CREDS		= IO_REQ_FLAG(REQ_F_CREDS_BIT),
	/* skip refcounting if not set */
	REQ_F_REFCOUNT		= IO_REQ_FLAG(REQ_F_REFCOUNT_BIT),
	/* there is a linked timeout that has to be armed */
	REQ_F_ARM_LTIMEOUT	= IO_REQ_FLAG(REQ_F_ARM_LTIMEOUT_BIT),
	/* ->async_data allocated */
	REQ_F_ASYNC_DATA	= IO_REQ_FLAG(REQ_F_ASYNC_DATA_BIT),
	/* don't post CQEs while failing linked requests */
	REQ_F_SKIP_LINK_CQES	= IO_REQ_FLAG(REQ_F_SKIP_LINK_CQES_BIT),
	/* single poll may be active */
	REQ_F_SINGLE_POLL	= IO_REQ_FLAG(REQ_F_SINGLE_POLL_BIT),
	/* double poll may be active */
	REQ_F_DOUBLE_POLL	= IO_REQ_FLAG(REQ_F_DOUBLE_POLL_BIT),
	/* request posts multiple completions, should be set at prep time */
	REQ_F_MULTISHOT		= IO_REQ_FLAG(REQ_F_MULTISHOT_BIT),
	/* fast poll multishot mode */
	REQ_F_APOLL_MULTISHOT	= IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT),
	/* recvmsg special flag, clear EPOLLIN */
	REQ_F_CLEAR_POLLIN	= IO_REQ_FLAG(REQ_F_CLEAR_POLLIN_BIT),
	/* don't use lazy poll wake for this request */
	REQ_F_POLL_NO_LAZY	= IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT),
	/* file is pollable */
	REQ_F_CAN_POLL		= IO_REQ_FLAG(REQ_F_CAN_POLL_BIT),
	/* buffer list was empty after selection of buffer */
	REQ_F_BL_EMPTY		= IO_REQ_FLAG(REQ_F_BL_EMPTY_BIT),
	/* don't recycle provided buffers for this request */
	REQ_F_BL_NO_RECYCLE	= IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT),
	/* buffer ring head needs incrementing on put */
	REQ_F_BUFFERS_COMMIT	= IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT),
	/* buf node is valid */
	REQ_F_BUF_NODE		= IO_REQ_FLAG(REQ_F_BUF_NODE_BIT),
	/* request has read/write metadata assigned */
	REQ_F_HAS_METADATA	= IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT),
	/*
	 * For vectored fixed buffers, resolve iovec to registered buffers.
	 * For SEND_ZC, whether to import buffers (i.e. the first issue).
	 */
	REQ_F_IMPORT_BUFFER	= IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT),
	/* ->sqe_copy() has been called, if necessary */
	REQ_F_SQE_COPIED	= IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT),
};

typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw);

struct io_task_work {
	struct llist_node		node;
	io_req_tw_func_t		func;
};

struct io_cqe {
	__u64	user_data;
	__s32	res;
	/* fd initially, then cflags for completion */
	union {
		__u32	flags;
		int	fd;
	};
};

/*
 * Each request type overlays its private data structure on top of this one.
 * They must not exceed this one in size.
 */
struct io_cmd_data {
	struct file		*file;
	/* each command gets 56 bytes of data */
	__u8			data[56];
};

static inline void io_kiocb_cmd_sz_check(size_t cmd_sz)
{
	BUILD_BUG_ON(cmd_sz > sizeof(struct io_cmd_data));
}
#define io_kiocb_to_cmd(req, cmd_type) ( \
	io_kiocb_cmd_sz_check(sizeof(cmd_type)) , \
	((cmd_type *)&(req)->cmd) \
)

static inline struct io_kiocb *cmd_to_io_kiocb(void *ptr)
{
	return ptr;
}
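
/*
 * Illustrative usage with a hypothetical per-opcode struct: an opcode handler
 * overlays its private state on req->cmd, and io_kiocb_cmd_sz_check() above
 * catches, at build time, any struct that outgrows io_cmd_data:
 *
 *	struct example_cmd {
 *		struct file	*file;	(must remain the first member)
 *		u32		op_flags;
 *	};
 *
 *	struct example_cmd *ec = io_kiocb_to_cmd(req, struct example_cmd);
 *	struct io_kiocb *owner = cmd_to_io_kiocb(ec);
 */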

struct io_kiocb {
	union {
		/*
		 * NOTE! Each of the io_kiocb union members has the file pointer
		 * as the first entry in their struct definition. So you can
		 * access the file pointer through any of the sub-structs,
		 * or directly as just 'file' in this struct.
		 */
		struct file		*file;
		struct io_cmd_data	cmd;
	};

	u8				opcode;
	/* polled IO has completed */
	u8				iopoll_completed;
	/*
	 * Can be either a fixed buffer index, or used with provided buffers.
	 * For the latter, it holds the selected buffer ID.
	 */
	u16				buf_index;

	unsigned			nr_tw;

	/* REQ_F_* flags */
	io_req_flags_t			flags;

	struct io_cqe			cqe;

	struct io_ring_ctx		*ctx;
	struct io_uring_task		*tctx;

	union {
		/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
		struct io_buffer	*kbuf;

		struct io_rsrc_node	*buf_node;
	};

	union {
		/* used by request caches, completion batching and iopoll */
		struct io_wq_work_node	comp_list;
		/* cache ->apoll->events */
		__poll_t apoll_events;
	};

	struct io_rsrc_node		*file_node;

	atomic_t			refs;
	bool				cancel_seq_set;
	struct io_task_work		io_task_work;
	union {
		/*
		 * for polled requests, i.e. IORING_OP_POLL_ADD and async armed
		 * poll
		 */
		struct hlist_node	hash_node;
		/* IOPOLL start time, for queues set up with hybrid polling */
		u64			iopoll_start;
		/* for private io_kiocb freeing */
		struct rcu_head		rcu_head;
	};
	/* internal polling, see IORING_FEAT_FAST_POLL */
	struct async_poll		*apoll;
	/* opcode allocated if it needs to store data for async defer */
	void				*async_data;
	/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
	atomic_t			poll_refs;
	struct io_kiocb			*link;
	/* custom credentials, valid IFF REQ_F_CREDS is set */
	const struct cred		*creds;
	struct io_wq_work		work;

	struct io_big_cqe {
		u64			extra1;
		u64			extra2;
	} big_cqe;
};

struct io_overflow_cqe {
	struct list_head list;
	struct io_uring_cqe cqe;
};
#endif