// SPDX-License-Identifier: GPL-2.0

#include "io_uring.h"
#include "napi.h"

#ifdef CONFIG_NET_RX_BUSY_POLL

/* Timeout for cleanout of stale entries. */
#define NAPI_TIMEOUT		(60 * SEC_CONVERSION)

struct io_napi_entry {
	unsigned int		napi_id;
	struct list_head	list;

	unsigned long		timeout;
	struct hlist_node	node;

	struct rcu_head		rcu;
};

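/*
 * io_napi_hash_find() - Look up a napi id in the hash table
 * @hash_list: hash bucket to search
 * @napi_id: napi id to look for
 *
 * RCU-safe lookup of @napi_id in @hash_list. Returns the matching entry or
 * NULL if the id is not tracked. Callers must hold rcu_read_lock() or
 * ctx->napi_lock.
 */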
static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list,
					       unsigned int napi_id)
{
	struct io_napi_entry *e;

	hlist_for_each_entry_rcu(e, hash_list, node) {
		if (e->napi_id != napi_id)
			continue;
		return e;
	}

	return NULL;
}

static inline ktime_t net_to_ktime(unsigned long t)
{
	/* napi approximating usecs, reverse busy_loop_current_time */
	return ns_to_ktime(t << 10);
}

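/*
 * __io_napi_add_id() - Add napi id to the busy poll list
 * @ctx: pointer to io-uring context structure
 * @napi_id: napi id
 *
 * Add the napi id to the napi busy poll list and hash table. If the id is
 * already tracked, only its timeout is refreshed and -EEXIST is returned.
 */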
int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id)
{
	struct hlist_head *hash_list;
	struct io_napi_entry *e;

	/* Non-NAPI IDs can be rejected. */
	if (!napi_id_valid(napi_id))
		return -EINVAL;

	hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];

	scoped_guard(rcu) {
		e = io_napi_hash_find(hash_list, napi_id);
		if (e) {
			WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT);
			return -EEXIST;
		}
	}

	e = kmalloc(sizeof(*e), GFP_NOWAIT);
	if (!e)
		return -ENOMEM;

	e->napi_id = napi_id;
	e->timeout = jiffies + NAPI_TIMEOUT;

	/*
	 * guard(spinlock) is not used here so the lock can be dropped
	 * manually before calling kfree().
	 */
	spin_lock(&ctx->napi_lock);
	if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
		spin_unlock(&ctx->napi_lock);
		kfree(e);
		return -EEXIST;
	}

	hlist_add_tail_rcu(&e->node, hash_list);
	list_add_tail_rcu(&e->list, &ctx->napi_list);
	spin_unlock(&ctx->napi_lock);
	return 0;
}

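/*
 * __io_napi_del_id() - Remove a napi id from the busy poll list
 * @ctx: pointer to io-uring context structure
 * @napi_id: napi id
 *
 * Remove the napi id from the busy poll list and hash table. Returns
 * -ENOENT if the id is not tracked.
 */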
static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id)
{
	struct hlist_head *hash_list;
	struct io_napi_entry *e;

	/* Non-NAPI IDs can be rejected. */
	if (!napi_id_valid(napi_id))
		return -EINVAL;

	hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
	guard(spinlock)(&ctx->napi_lock);
	e = io_napi_hash_find(hash_list, napi_id);
	if (!e)
		return -ENOENT;

	list_del_rcu(&e->list);
	hash_del_rcu(&e->node);
	kfree_rcu(e, rcu);
	return 0;
}

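/*
 * __io_napi_remove_stale() - Remove expired napi entries
 * @ctx: pointer to io-uring context structure
 *
 * Walk the napi list under ctx->napi_lock and remove every entry whose
 * timeout has elapsed.
 */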
static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
{
	struct io_napi_entry *e;

	guard(spinlock)(&ctx->napi_lock);
	/*
	 * list_for_each_entry_safe() is not required as long as:
	 * 1. list_del_rcu() does not reset the deleted node next pointer
	 * 2. kfree_rcu() delays the memory freeing until the next quiescent
	 *    state
	 */
	list_for_each_entry(e, &ctx->napi_list, list) {
		if (time_after(jiffies, READ_ONCE(e->timeout))) {
			list_del_rcu(&e->list);
			hash_del_rcu(&e->node);
			kfree_rcu(e, rcu);
		}
	}
}

static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
{
	if (is_stale)
		__io_napi_remove_stale(ctx);
}

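/*
 * io_napi_busy_loop_timeout() - Check if the busy poll budget is spent
 * @start_time: time the busy loop started
 * @bp: busy poll time budget, 0 means no budget
 *
 * Returns true when @bp is zero or when more than @bp has elapsed since
 * @start_time.
 */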
static inline bool io_napi_busy_loop_timeout(ktime_t start_time,
					     ktime_t bp)
{
	if (bp) {
		ktime_t end_time = ktime_add(start_time, bp);
		ktime_t now = net_to_ktime(busy_loop_current_time());

		return ktime_after(now, end_time);
	}

	return true;
}

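/*
 * io_napi_busy_loop_should_end() - Loop end callback for napi busy polling
 * @data: pointer to the io wait queue
 * @start_time: time the busy loop started
 *
 * End the loop when a signal is pending, when the ring has work to process,
 * or when the busy poll time budget has been exhausted.
 */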
static bool io_napi_busy_loop_should_end(void *data,
					 unsigned long start_time)
{
	struct io_wait_queue *iowq = data;

	if (signal_pending(current))
		return true;
	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
		return true;
	if (io_napi_busy_loop_timeout(net_to_ktime(start_time),
				      iowq->napi_busy_poll_dt))
		return true;

	return false;
}

/*
 * Static tracking never reports stale entries: ids stay registered until
 * they are explicitly deleted, so the timeout is ignored here.
 */
static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx,
					 bool (*loop_end)(void *, unsigned long),
					 void *loop_end_arg)
{
	struct io_napi_entry *e;

	list_for_each_entry_rcu(e, &ctx->napi_list, list)
		napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
				   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
	return false;
}

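/*
 * Dynamic tracking busy polls every tracked napi id and reports whether any
 * entry has outlived NAPI_TIMEOUT, so the caller can prune stale entries.
 */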
static bool
dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx,
			      bool (*loop_end)(void *, unsigned long),
			      void *loop_end_arg)
{
	struct io_napi_entry *e;
	bool is_stale = false;

	list_for_each_entry_rcu(e, &ctx->napi_list, list) {
		napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
				   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);

		if (time_after(jiffies, READ_ONCE(e->timeout)))
			is_stale = true;
	}

	return is_stale;
}

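/*
 * __io_napi_do_busy_loop() - Busy poll all tracked napi ids once
 * @ctx: pointer to io-uring context structure
 * @loop_end: loop end callback passed to napi_busy_loop_rcu(), may be NULL
 * @loop_end_arg: argument for @loop_end
 *
 * Dispatch to the static or dynamic tracking loop depending on the
 * configured tracking mode. Returns true if any stale entry was seen.
 */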
static inline bool
__io_napi_do_busy_loop(struct io_ring_ctx *ctx,
		       bool (*loop_end)(void *, unsigned long),
		       void *loop_end_arg)
{
	if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC)
		return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
	return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
}

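/*
 * io_napi_blocking_busy_loop() - Busy poll before blocking on completions
 * @ctx: pointer to io-uring context structure
 * @iowq: pointer to io wait queue
 *
 * Busy poll the tracked napi ids until work arrives, a signal is pending or
 * the busy poll time budget expires, then remove any stale entries.
 */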
static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
				       struct io_wait_queue *iowq)
{
	unsigned long start_time = busy_loop_current_time();
	bool (*loop_end)(void *, unsigned long) = NULL;
	void *loop_end_arg = NULL;
	bool is_stale = false;

	/* Singular lists use a different napi loop end check function and are
	 * only executed once.
	 */
	if (list_is_singular(&ctx->napi_list)) {
		loop_end = io_napi_busy_loop_should_end;
		loop_end_arg = iowq;
	}

	scoped_guard(rcu) {
		do {
			is_stale = __io_napi_do_busy_loop(ctx, loop_end,
							  loop_end_arg);
		} while (!io_napi_busy_loop_should_end(iowq, start_time) &&
			 !loop_end_arg);
	}

	io_napi_remove_stale(ctx, is_stale);
}

/*
 * io_napi_init() - Init napi settings
 * @ctx: pointer to io-uring context structure
 *
 * Init napi settings in the io-uring context.
 */
void io_napi_init(struct io_ring_ctx *ctx)
{
	u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC;

	INIT_LIST_HEAD(&ctx->napi_list);
	spin_lock_init(&ctx->napi_lock);
	ctx->napi_prefer_busy_poll = false;
	ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt);
	ctx->napi_track_mode = IO_URING_NAPI_TRACKING_INACTIVE;
}

/*
 * io_napi_free() - Deallocate napi
 * @ctx: pointer to io-uring context structure
 *
 * Free the napi list and the hash table in the io-uring context.
 */
void io_napi_free(struct io_ring_ctx *ctx)
{
	struct io_napi_entry *e;

	guard(spinlock)(&ctx->napi_lock);
	list_for_each_entry(e, &ctx->napi_list, list) {
		hash_del_rcu(&e->node);
		kfree_rcu(e, rcu);
	}
	INIT_LIST_HEAD_RCU(&ctx->napi_list);
}

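/*
 * io_napi_register_napi() - Enable napi busy poll tracking
 * @ctx: pointer to io-uring context structure
 * @napi: registration parameters copied from userspace
 *
 * Validate the requested tracking mode, drop any previously tracked napi
 * ids and apply the new busy poll settings.
 */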
static int io_napi_register_napi(struct io_ring_ctx *ctx,
				 struct io_uring_napi *napi)
{
	switch (napi->op_param) {
	case IO_URING_NAPI_TRACKING_DYNAMIC:
	case IO_URING_NAPI_TRACKING_STATIC:
		break;
	default:
		return -EINVAL;
	}
	/* clean the napi list for new settings */
	io_napi_free(ctx);
	WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
	WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC);
	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll);
	return 0;
}

/*
 * io_register_napi() - Register napi with io-uring
 * @ctx: pointer to io-uring context structure
 * @arg: pointer to io_uring_napi structure
 *
 * Register napi in the io-uring context.
 */
int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
{
	const struct io_uring_napi curr = {
		.busy_poll_to	  = ktime_to_us(ctx->napi_busy_poll_dt),
		.prefer_busy_poll = ctx->napi_prefer_busy_poll,
		.op_param	  = ctx->napi_track_mode
	};
	struct io_uring_napi napi;

	if (ctx->flags & IORING_SETUP_IOPOLL)
		return -EINVAL;
	if (copy_from_user(&napi, arg, sizeof(napi)))
		return -EFAULT;
	if (napi.pad[0] || napi.pad[1] || napi.resv)
		return -EINVAL;

	if (copy_to_user(arg, &curr, sizeof(curr)))
		return -EFAULT;

	switch (napi.opcode) {
	case IO_URING_NAPI_REGISTER_OP:
		return io_napi_register_napi(ctx, &napi);
	case IO_URING_NAPI_STATIC_ADD_ID:
		if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
			return -EINVAL;
		return __io_napi_add_id(ctx, napi.op_param);
	case IO_URING_NAPI_STATIC_DEL_ID:
		if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
			return -EINVAL;
		return __io_napi_del_id(ctx, napi.op_param);
	default:
		return -EINVAL;
	}
}

/*
 * io_unregister_napi() - Unregister napi with io-uring
 * @ctx: pointer to io-uring context structure
 * @arg: pointer to io_uring_napi structure
 *
 * Unregister napi. If arg has been specified copy the busy poll timeout and
 * prefer busy poll setting to the passed in structure.
 */
int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
{
	const struct io_uring_napi curr = {
		.busy_poll_to	  = ktime_to_us(ctx->napi_busy_poll_dt),
		.prefer_busy_poll = ctx->napi_prefer_busy_poll
	};

	if (arg && copy_to_user(arg, &curr, sizeof(curr)))
		return -EFAULT;

	WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
	WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
	WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
	return 0;
}

/*
 * __io_napi_busy_loop() - execute busy poll loop
 * @ctx: pointer to io-uring context structure
 * @iowq: pointer to io wait queue
 *
 * Execute the busy poll loop before the task blocks waiting for completions,
 * capping the busy poll time to the remaining wait timeout.
 */
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
{
	if (ctx->flags & IORING_SETUP_SQPOLL)
		return;

	iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
	if (iowq->timeout != KTIME_MAX) {
		ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx));

		iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt);
	}

	iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
	io_napi_blocking_busy_loop(ctx, iowq);
}

/*
 * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll
 * @ctx: pointer to io-uring context structure
 *
 * Execute the napi busy poll loop from the sqpoll thread. Returns 1 if the
 * loop was run, 0 if there was nothing to poll.
 */
int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
{
	bool is_stale = false;

	if (!READ_ONCE(ctx->napi_busy_poll_dt))
		return 0;
	if (list_empty_careful(&ctx->napi_list))
		return 0;

	scoped_guard(rcu) {
		is_stale = __io_napi_do_busy_loop(ctx, NULL, NULL);
	}

	io_napi_remove_stale(ctx, is_stale);
	return 1;
}

#endif