// SPDX-License-Identifier: GPL-2.0
#include <linux/memcontrol.h>
#include <linux/rwsem.h>
#include <linux/shrinker.h>
#include <linux/rculist.h>
#include <trace/events/vmscan.h>

#include "internal.h"

LIST_HEAD(shrinker_list);
DEFINE_MUTEX(shrinker_mutex);

#ifdef CONFIG_MEMCG
static int shrinker_nr_max;

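/*
 * The memcg-aware shrinker state is kept per (memcg, node) in a shrinker_info,
 * which is an array of pointers to shrinker_info_unit. Each unit covers
 * SHRINKER_UNIT_BITS shrinker ids and holds both the "has freeable objects"
 * bitmap and the per-shrinker deferred counters. shrinker_nr_max is the
 * current capacity (in shrinker ids) of the allocated shrinker_info.
 */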
static inline int shrinker_unit_size(int nr_items)
{
	return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *));
}

static inline void shrinker_unit_free(struct shrinker_info *info, int start)
{
	struct shrinker_info_unit **unit;
	int nr, i;

	if (!info)
		return;

	unit = info->unit;
	nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS);

	for (i = start; i < nr; i++) {
		if (!unit[i])
			break;

		kfree(unit[i]);
		unit[i] = NULL;
	}
}

static inline int shrinker_unit_alloc(struct shrinker_info *new,
				      struct shrinker_info *old, int nid)
{
	struct shrinker_info_unit *unit;
	int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS);
	int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0;
	int i;

	for (i = start; i < nr; i++) {
		unit = kzalloc_node(sizeof(*unit), GFP_KERNEL, nid);
		if (!unit) {
			shrinker_unit_free(new, start);
			return -ENOMEM;
		}

		new->unit[i] = unit;
	}

	return 0;
}

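/*
 * Free the per-node shrinker_info of @memcg, including every allocated
 * shrinker_info_unit. No RCU grace period is observed here, so the caller
 * must guarantee that there are no concurrent readers (e.g. the memcg is
 * being freed, or the info was never published).
 */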
void free_shrinker_info(struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *pn;
	struct shrinker_info *info;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		info = rcu_dereference_protected(pn->shrinker_info, true);
		shrinker_unit_free(info, 0);
		kvfree(info);
		rcu_assign_pointer(pn->shrinker_info, NULL);
	}
}

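/*
 * Allocate a shrinker_info (sized for the current shrinker_nr_max) for every
 * node of @memcg and publish it. Returns 0 on success; on failure any
 * partially allocated state is torn down again and -ENOMEM is returned.
 */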
int alloc_shrinker_info(struct mem_cgroup *memcg)
{
	int nid, ret = 0;
	int array_size = 0;

	mutex_lock(&shrinker_mutex);
	array_size = shrinker_unit_size(shrinker_nr_max);
	for_each_node(nid) {
		struct shrinker_info *info = kvzalloc_node(sizeof(*info) + array_size,
							   GFP_KERNEL, nid);
		if (!info)
			goto err;
		info->map_nr_max = shrinker_nr_max;
		if (shrinker_unit_alloc(info, NULL, nid)) {
			kvfree(info);
			goto err;
		}
		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
	}
	mutex_unlock(&shrinker_mutex);

	return ret;

err:
	mutex_unlock(&shrinker_mutex);
	free_shrinker_info(memcg);
	return -ENOMEM;
}

static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
						     int nid)
{
	return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
					 lockdep_is_held(&shrinker_mutex));
}

static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size,
				    int old_size, int new_nr_max)
{
	struct shrinker_info *new, *old;
	struct mem_cgroup_per_node *pn;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		old = shrinker_info_protected(memcg, nid);
		/* Not yet online memcg */
		if (!old)
			return 0;

		/* Already expanded this shrinker_info */
		if (new_nr_max <= old->map_nr_max)
			continue;

		new = kvzalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid);
		if (!new)
			return -ENOMEM;

		new->map_nr_max = new_nr_max;

		memcpy(new->unit, old->unit, old_size);
		if (shrinker_unit_alloc(new, old, nid)) {
			kvfree(new);
			return -ENOMEM;
		}

		rcu_assign_pointer(pn->shrinker_info, new);
		kvfree_rcu(old, rcu);
	}

	return 0;
}

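/*
 * Grow the shrinker_info of every memcg so that it can hold at least
 * @new_id + 1 shrinker ids, rounded up to whole shrinker_info_units. On
 * success the global shrinker_nr_max is raised to the new capacity.
 */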
static int expand_shrinker_info(int new_id)
{
	int ret = 0;
	int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS);
	int new_size, old_size = 0;
	struct mem_cgroup *memcg;

	if (!root_mem_cgroup)
		goto out;

	lockdep_assert_held(&shrinker_mutex);

	new_size = shrinker_unit_size(new_nr_max);
	old_size = shrinker_unit_size(shrinker_nr_max);

	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		ret = expand_one_shrinker_info(memcg, new_size, old_size,
					       new_nr_max);
		if (ret) {
			mem_cgroup_iter_break(NULL, memcg);
			goto out;
		}
	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
out:
	if (!ret)
		shrinker_nr_max = new_nr_max;

	return ret;
}

static inline int shrinker_id_to_index(int shrinker_id)
{
	return shrinker_id / SHRINKER_UNIT_BITS;
}

static inline int shrinker_id_to_offset(int shrinker_id)
{
	return shrinker_id % SHRINKER_UNIT_BITS;
}

static inline int calc_shrinker_id(int index, int offset)
{
	return index * SHRINKER_UNIT_BITS + offset;
}

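/*
 * Mark shrinker @shrinker_id as having freeable objects in (@memcg, @nid), so
 * that the next shrink_slab_memcg() pass will call it. Only the RCU read lock
 * is taken here, which is enough to keep the shrinker_info alive.
 */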
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
		struct shrinker_info *info;
		struct shrinker_info_unit *unit;

		rcu_read_lock();
		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
		unit = info->unit[shrinker_id_to_index(shrinker_id)];
		if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
			/* Pairs with smp mb in shrink_slab() */
			smp_mb__before_atomic();
			set_bit(shrinker_id_to_offset(shrinker_id), unit->map);
		}
		rcu_read_unlock();
	}
}

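/* Maps shrinker ids to their struct shrinker, for memcg-aware shrinkers. */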
static DEFINE_IDR(shrinker_idr);

static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
	int id, ret = -ENOMEM;

	if (mem_cgroup_disabled())
		return -ENOSYS;

	mutex_lock(&shrinker_mutex);
	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
	if (id < 0)
		goto unlock;

	if (id >= shrinker_nr_max) {
		if (expand_shrinker_info(id)) {
			idr_remove(&shrinker_idr, id);
			goto unlock;
		}
	}
	shrinker->id = id;
	ret = 0;
unlock:
	mutex_unlock(&shrinker_mutex);
	return ret;
}

static void shrinker_memcg_remove(struct shrinker *shrinker)
{
	int id = shrinker->id;

	BUG_ON(id < 0);

	lockdep_assert_held(&shrinker_mutex);

	idr_remove(&shrinker_idr, id);
}

static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	struct shrinker_info_unit *unit;
	long nr_deferred;

	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	unit = info->unit[shrinker_id_to_index(shrinker->id)];
	nr_deferred = atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0);
	rcu_read_unlock();

	return nr_deferred;
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	struct shrinker_info_unit *unit;
	long nr_deferred;

	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	unit = info->unit[shrinker_id_to_index(shrinker->id)];
	nr_deferred =
		atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]);
	rcu_read_unlock();

	return nr_deferred;
}

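/*
 * Move the deferred scan counts of a dying @memcg to its parent (or the root
 * memcg), so that the deferred work is not lost when the child goes away.
 */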
void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
	int nid, index, offset;
	long nr;
	struct mem_cgroup *parent;
	struct shrinker_info *child_info, *parent_info;
	struct shrinker_info_unit *child_unit, *parent_unit;

	parent = parent_mem_cgroup(memcg);
	if (!parent)
		parent = root_mem_cgroup;

	/* Prevent from concurrent shrinker_info expand */
	mutex_lock(&shrinker_mutex);
	for_each_node(nid) {
		child_info = shrinker_info_protected(memcg, nid);
		parent_info = shrinker_info_protected(parent, nid);
		for (index = 0; index < shrinker_id_to_index(child_info->map_nr_max); index++) {
			child_unit = child_info->unit[index];
			parent_unit = parent_info->unit[index];
			for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) {
				nr = atomic_long_read(&child_unit->nr_deferred[offset]);
				atomic_long_add(nr, &parent_unit->nr_deferred[offset]);
			}
		}
	}
	mutex_unlock(&shrinker_mutex);
}
#else
static int shrinker_memcg_alloc(struct shrinker *shrinker)
{
	return -ENOSYS;
}

static void shrinker_memcg_remove(struct shrinker *shrinker)
{
}

static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	return 0;
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	return 0;
}
#endif /* CONFIG_MEMCG */

static long xchg_nr_deferred(struct shrinker *shrinker,
			     struct shrink_control *sc)
{
	int nid = sc->nid;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	if (sc->memcg &&
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return xchg_nr_deferred_memcg(nid, shrinker,
					      sc->memcg);

	return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
}

static long add_nr_deferred(long nr, struct shrinker *shrinker,
			    struct shrink_control *sc)
{
	int nid = sc->nid;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	if (sc->memcg &&
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return add_nr_deferred_memcg(nr, nid, shrinker,
					     sc->memcg);

	return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
}

#define SHRINK_BATCH 128

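/*
 * Run one shrinker for one (node, memcg) pair: ask it how many objects are
 * freeable, scan them in batches of shrinker->batch (or SHRINK_BATCH), and
 * carry work that was not done over to the next invocation via the deferred
 * counters. Returns the number of objects freed, or SHRINK_EMPTY if
 * ->count_objects() reported SHRINK_EMPTY.
 */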
static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
				    struct shrinker *shrinker, int priority)
{
	unsigned long freed = 0;
	unsigned long long delta;
	long total_scan;
	long freeable;
	long nr;
	long new_nr;
	long batch_size = shrinker->batch ? shrinker->batch
					  : SHRINK_BATCH;
	long scanned = 0, next_deferred;

	freeable = shrinker->count_objects(shrinker, shrinkctl);
	if (freeable == 0 || freeable == SHRINK_EMPTY)
		return freeable;

	/*
	 * copy the current shrinker scan count into a local variable
	 * and zero it so that other concurrent shrinker invocations
	 * don't also do this scanning work.
	 */
	nr = xchg_nr_deferred(shrinker, shrinkctl);

	if (shrinker->seeks) {
		delta = freeable >> priority;
		delta *= 4;
		do_div(delta, shrinker->seeks);
	} else {
		/*
		 * These objects don't require any IO to create. Trim
		 * them aggressively under memory pressure to keep
		 * them from causing refetches in the IO caches.
		 */
		delta = freeable / 2;
	}

	total_scan = nr >> priority;
	total_scan += delta;
	total_scan = min(total_scan, (2 * freeable));

	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
				   freeable, delta, total_scan, priority);

	/*
	 * Normally, we should not scan less than batch_size objects in one
	 * pass to avoid too frequent shrinker calls, but if the slab has less
	 * than batch_size objects in total and we are really tight on memory,
	 * we will try to reclaim all available objects, otherwise we can end
	 * up failing allocations although there are plenty of reclaimable
	 * objects spread over several slabs with usage less than the
	 * batch_size.
	 *
	 * We detect the "tight on memory" situations by looking at the total
	 * number of objects we want to scan (total_scan). If it is greater
	 * than the total number of objects on slab (freeable), we must be
	 * scanning at high prio and therefore should try to reclaim as much as
	 * possible.
	 */
	while (total_scan >= batch_size ||
	       total_scan >= freeable) {
		unsigned long ret;
		unsigned long nr_to_scan = min(batch_size, total_scan);

		shrinkctl->nr_to_scan = nr_to_scan;
		shrinkctl->nr_scanned = nr_to_scan;
		ret = shrinker->scan_objects(shrinker, shrinkctl);
		if (ret == SHRINK_STOP)
			break;
		freed += ret;

		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
		total_scan -= shrinkctl->nr_scanned;
		scanned += shrinkctl->nr_scanned;

		cond_resched();
	}

	/*
	 * The deferred work is increased by any new work (delta) that wasn't
	 * done, decreased by old deferred work that was done now.
	 *
	 * And it is capped to two times of the freeable items.
	 */
	next_deferred = max_t(long, (nr + delta - scanned), 0);
	next_deferred = min(next_deferred, (2 * freeable));

	/*
	 * move the unused scan count back into the shrinker in a
	 * manner that handles concurrent updates.
	 */
	new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);

	trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
	return freed;
}

#ifdef CONFIG_MEMCG
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
				       struct mem_cgroup *memcg, int priority)
{
	struct shrinker_info *info;
	unsigned long ret, freed = 0;
	int offset, index = 0;

	if (!mem_cgroup_online(memcg))
		return 0;

	/*
	 * lockless algorithm of memcg shrink.
	 *
	 * The shrinker_info may be freed asynchronously via RCU in the
	 * expand_one_shrinker_info(), so the rcu_read_lock() needs to be used
	 * to ensure the existence of the shrinker_info.
	 *
	 * The shrinker_info_unit is never freed unless its corresponding memcg
	 * is destroyed. Here we already hold the refcount of memcg, so the
	 * memcg will not be destroyed, and of course shrinker_info_unit will
	 * not be freed.
	 *
	 * So in the memcg shrink:
	 * step 1: use rcu_read_lock() to guarantee existence of the
	 *         shrinker_info.
	 * step 2: after getting shrinker_info_unit we can safely release the
	 *         RCU lock.
	 * step 3: traverse the bitmap and calculate shrinker_id
	 * step 4: use rcu_read_lock() to guarantee existence of the shrinker.
	 * step 5: use shrinker_id to find the shrinker, then use
	 *         shrinker_try_get() to guarantee existence of the shrinker,
	 *         then we can release the RCU lock to do do_shrink_slab() that
	 *         may sleep.
	 * step 6: do shrinker_put() paired with step 5 to put the refcount,
	 *         if the refcount reaches 0, then wake up the waiter in
	 *         shrinker_free() by calling complete().
	 *         Note: here is different from the global shrink, we don't
	 *               need to acquire the RCU lock to guarantee existence of
	 *               the shrinker, because we don't need to use this
	 *               shrinker to traverse the next shrinker in the bitmap.
	 * step 7: we have already exited the read-side of rcu critical section
	 *         before calling do_shrink_slab(), the shrinker_info may be
	 *         released in expand_one_shrinker_info(), so go back to step 1
	 *         to reacquire the shrinker_info.
	 */
again:
	rcu_read_lock();
	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
	if (unlikely(!info))
		goto unlock;

	if (index < shrinker_id_to_index(info->map_nr_max)) {
		struct shrinker_info_unit *unit;

		unit = info->unit[index];

		rcu_read_unlock();

		for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) {
			struct shrink_control sc = {
				.gfp_mask = gfp_mask,
				.nid = nid,
				.memcg = memcg,
			};
			struct shrinker *shrinker;
			int shrinker_id = calc_shrinker_id(index, offset);

			rcu_read_lock();
			shrinker = idr_find(&shrinker_idr, shrinker_id);
			if (unlikely(!shrinker || !shrinker_try_get(shrinker))) {
				clear_bit(offset, unit->map);
				rcu_read_unlock();
				continue;
			}
			rcu_read_unlock();

			/* Call non-slab shrinkers even though kmem is disabled */
			if (!memcg_kmem_online() &&
			    !(shrinker->flags & SHRINKER_NONSLAB))
				continue;

			ret = do_shrink_slab(&sc, shrinker, priority);
			if (ret == SHRINK_EMPTY) {
				clear_bit(offset, unit->map);
				/*
				 * After the shrinker reported that it had no objects to
				 * free, but before we cleared the corresponding bit in
				 * the memcg shrinker map, a new object might have been
				 * added. To make sure, we have the bit set in this
				 * case, we invoke the shrinker one more time and reset
				 * the bit if it reports that it is not empty anymore.
				 * The memory barrier here pairs with the barrier in
				 * set_shrinker_bit():
				 *
				 * list_lru_add()        shrink_slab_memcg()
				 *   list_add_tail()       clear_bit()
				 *   <MB>                  <MB>
				 *   set_bit()             do_shrink_slab()
				 */
				smp_mb__after_atomic();
				ret = do_shrink_slab(&sc, shrinker, priority);
				if (ret == SHRINK_EMPTY)
					ret = 0;
				else
					set_shrinker_bit(memcg, nid, shrinker_id);
			}
			freed += ret;
			shrinker_put(shrinker);
		}

		index++;
		goto again;
	}
unlock:
	rcu_read_unlock();
	return freed;
}
#else /* !CONFIG_MEMCG */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
				       struct mem_cgroup *memcg, int priority)
{
	return 0;
}
#endif /* CONFIG_MEMCG */

/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority, we take the number of objects and >> by priority
 * in order to get the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
			  int priority)
{
	unsigned long ret, freed = 0;
	struct shrinker *shrinker;

	/*
	 * The root memcg might be allocated even though memcg is disabled
	 * via "cgroup_disable=memory" boot parameter. This could make
	 * mem_cgroup_is_root() return false, then just run memcg slab
	 * shrink, but skip global shrink. This may result in premature
	 * oom.
	 */
	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

	/*
	 * lockless algorithm of global shrink.
	 *
	 * In the unregistration step, the shrinker will be freed asynchronously
	 * via RCU after its refcount reaches 0. So both rcu_read_lock() and
	 * shrinker_try_get() can be used to ensure the existence of the shrinker.
	 *
	 * So in the global shrink:
	 * step 1: use rcu_read_lock() to guarantee existence of the shrinker
	 *         and the validity of the shrinker_list walk.
	 * step 2: use shrinker_try_get() to try to get the refcount, if successful,
	 *         then the existence of the shrinker can also be guaranteed,
	 *         so we can release the RCU lock to do do_shrink_slab() that
	 *         may sleep.
	 * step 3: *MUST* reacquire the RCU lock before calling shrinker_put(),
	 *         which ensures that neither this shrinker nor the next shrinker
	 *         will be freed in the next traversal operation.
	 * step 4: do shrinker_put() paired with step 2 to put the refcount,
	 *         if the refcount reaches 0, then wake up the waiter in
	 *         shrinker_free() by calling complete().
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};

		if (!shrinker_try_get(shrinker))
			continue;

		rcu_read_unlock();

		ret = do_shrink_slab(&sc, shrinker, priority);
		if (ret == SHRINK_EMPTY)
			ret = 0;
		freed += ret;

		rcu_read_lock();
		shrinker_put(shrinker);
	}

	rcu_read_unlock();
	cond_resched();
	return freed;
}

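/*
 * shrinker_alloc - allocate and name a shrinker
 * @flags: SHRINKER_* flags, e.g. SHRINKER_NUMA_AWARE or SHRINKER_MEMCG_AWARE
 * @fmt: printf-style name, used for the shrinker debugfs interface
 *
 * The caller fills in ->count_objects and ->scan_objects (and optionally
 * ->batch, ->seeks and ->private_data) and then calls shrinker_register().
 * A rough usage sketch, with hypothetical callback and cache names:
 *
 *	shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "my-cache");
 *	if (!shrinker)
 *		return -ENOMEM;
 *	shrinker->count_objects = my_cache_count;
 *	shrinker->scan_objects = my_cache_scan;
 *	shrinker_register(shrinker);
 *
 * Returns the new shrinker, or NULL on failure.
 */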
struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
{
	struct shrinker *shrinker;
	unsigned int size;
	va_list ap;
	int err;

	shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL);
	if (!shrinker)
		return NULL;

	va_start(ap, fmt);
	err = shrinker_debugfs_name_alloc(shrinker, fmt, ap);
	va_end(ap);
	if (err)
		goto err_name;

	shrinker->flags = flags | SHRINKER_ALLOCATED;
	shrinker->seeks = DEFAULT_SEEKS;

	if (flags & SHRINKER_MEMCG_AWARE) {
		err = shrinker_memcg_alloc(shrinker);
		if (err == -ENOSYS) {
			/* Memcg is not supported, fallback to non-memcg-aware shrinker. */
			shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
			goto non_memcg;
		}

		if (err)
			goto err_flags;

		return shrinker;
	}

non_memcg:
	/*
	 * The nr_deferred is available on per memcg level for memcg aware
	 * shrinkers, so only allocate nr_deferred in the following cases:
	 * - non-memcg-aware shrinkers
	 * - !CONFIG_MEMCG
	 * - memcg is disabled by kernel command line
	 */
	size = sizeof(*shrinker->nr_deferred);
	if (flags & SHRINKER_NUMA_AWARE)
		size *= nr_node_ids;

	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
	if (!shrinker->nr_deferred)
		goto err_flags;

	return shrinker;

err_flags:
	shrinker_debugfs_name_free(shrinker);
err_name:
	kfree(shrinker);
	return NULL;
}
EXPORT_SYMBOL_GPL(shrinker_alloc);

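/*
 * shrinker_register - make an allocated shrinker visible to reclaim
 *
 * Adds @shrinker to the global shrinker_list and takes the initial reference;
 * from this point on shrink_slab() may invoke it at any time, so the
 * ->count_objects and ->scan_objects callbacks are expected to be set up
 * before this is called.
 */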
void shrinker_register(struct shrinker *shrinker)
{
	if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) {
		pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker");
		return;
	}

	mutex_lock(&shrinker_mutex);
	list_add_tail_rcu(&shrinker->list, &shrinker_list);
	shrinker->flags |= SHRINKER_REGISTERED;
	shrinker_debugfs_add(shrinker);
	mutex_unlock(&shrinker_mutex);

	init_completion(&shrinker->done);
	/*
	 * Now the shrinker is fully set up, take the first reference to it to
	 * indicate that lookup operations are now allowed to use it via
	 * shrinker_try_get().
	 */
	refcount_set(&shrinker->refcount, 1);
}
EXPORT_SYMBOL_GPL(shrinker_register);

static void shrinker_free_rcu_cb(struct rcu_head *head)
{
	struct shrinker *shrinker = container_of(head, struct shrinker, rcu);

	kfree(shrinker->nr_deferred);
	kfree(shrinker);
}

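/*
 * shrinker_free - unregister and free a shrinker
 *
 * Undoes shrinker_register() and shrinker_alloc(): drops the initial
 * reference, waits until no lookups are using @shrinker any more, and then
 * frees it via RCU. Once this returns, the objects the shrinker operated on
 * can be freed safely.
 */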
void shrinker_free(struct shrinker *shrinker)
{
	struct dentry *debugfs_entry = NULL;
	int debugfs_id;

	if (!shrinker)
		return;

	if (shrinker->flags & SHRINKER_REGISTERED) {
		/* drop the initial refcount */
		shrinker_put(shrinker);
		/*
		 * Wait for all lookups of the shrinker to complete, after that,
		 * no shrinker is running or will run again, then we can safely
		 * free it asynchronously via RCU and safely free the structure
		 * where the shrinker is located, such as super_block etc.
		 */
		wait_for_completion(&shrinker->done);
	}

	mutex_lock(&shrinker_mutex);
	if (shrinker->flags & SHRINKER_REGISTERED) {
		/*
		 * Now we can safely remove it from the shrinker_list and then
		 * free it.
		 */
		list_del_rcu(&shrinker->list);
		debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id);
		shrinker->flags &= ~SHRINKER_REGISTERED;
	}

	shrinker_debugfs_name_free(shrinker);

	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		shrinker_memcg_remove(shrinker);
	mutex_unlock(&shrinker_mutex);

	if (debugfs_entry)
		shrinker_debugfs_remove(debugfs_entry, debugfs_id);

	call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
}
EXPORT_SYMBOL_GPL(shrinker_free);