1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Slab allocator functions that are independent of the allocator strategy
4 *
5 * (C) 2012 Christoph Lameter <cl@gentwo.org>
6 */
7#include <linux/slab.h>
8
9#include <linux/mm.h>
10#include <linux/poison.h>
11#include <linux/interrupt.h>
12#include <linux/memory.h>
13#include <linux/cache.h>
14#include <linux/compiler.h>
15#include <linux/kfence.h>
16#include <linux/module.h>
17#include <linux/cpu.h>
18#include <linux/uaccess.h>
19#include <linux/seq_file.h>
20#include <linux/dma-mapping.h>
21#include <linux/swiotlb.h>
22#include <linux/proc_fs.h>
23#include <linux/debugfs.h>
24#include <linux/kmemleak.h>
25#include <linux/kasan.h>
26#include <asm/cacheflush.h>
27#include <asm/tlbflush.h>
28#include <asm/page.h>
29#include <linux/memcontrol.h>
30#include <linux/stackdepot.h>
31#include <trace/events/rcu.h>
32
33#include "../kernel/rcu/rcu.h"
34#include "internal.h"
35#include "slab.h"
36
37#define CREATE_TRACE_POINTS
38#include <trace/events/kmem.h>
39
40enum slab_state slab_state;
41LIST_HEAD(slab_caches);
42DEFINE_MUTEX(slab_mutex);
43struct kmem_cache *kmem_cache;
44
45/*
46 * Set of flags that will prevent slab merging
47 */
48#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
49 SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
50 SLAB_FAILSLAB | SLAB_NO_MERGE)
51
52#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
53 SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
54
55/*
56 * Merge control. If this is set then no merging of slab caches will occur.
57 */
58static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT);
59
60static int __init setup_slab_nomerge(char *str)
61{
62 slab_nomerge = true;
63 return 1;
64}
65
66static int __init setup_slab_merge(char *str)
67{
68 slab_nomerge = false;
69 return 1;
70}
71
72__setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
73__setup_param("slub_merge", slub_merge, setup_slab_merge, 0);
74
75__setup("slab_nomerge", setup_slab_nomerge);
76__setup("slab_merge", setup_slab_merge);
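/*
 * Illustrative note (not part of the original source): merging can also be
 * toggled on the kernel command line, independently of
 * CONFIG_SLAB_MERGE_DEFAULT, e.g.:
 *
 *   slab_nomerge     keep every cache separate (easier debugging)
 *   slab_merge       force merging even if the config default disables it
 *
 * The "slub_nomerge"/"slub_merge" spellings above are accepted as legacy
 * aliases for the same handlers.
 */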
77
78/*
79 * Determine the size of a slab object
80 */
81unsigned int kmem_cache_size(struct kmem_cache *s)
82{
83 return s->object_size;
84}
85EXPORT_SYMBOL(kmem_cache_size);
86
87#ifdef CONFIG_DEBUG_VM
88
89static bool kmem_cache_is_duplicate_name(const char *name)
90{
91 struct kmem_cache *s;
92
93 list_for_each_entry(s, &slab_caches, list) {
94 if (!strcmp(s->name, name))
95 return true;
96 }
97
98 return false;
99}
100
101static int kmem_cache_sanity_check(const char *name, unsigned int size)
102{
103 if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) {
104 pr_err("kmem_cache_create(%s) integrity check failed\n", name);
105 return -EINVAL;
106 }
107
108 /* Duplicate names will confuse slabtop, et al */
109 WARN(kmem_cache_is_duplicate_name(name),
110 "kmem_cache of name '%s' already exists\n", name);
111
112 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
113 return 0;
114}
115#else
116static inline int kmem_cache_sanity_check(const char *name, unsigned int size)
117{
118 return 0;
119}
120#endif
121
122/*
123 * Figure out what the alignment of the objects will be given a set of
124 * flags, a user specified alignment and the size of the objects.
125 */
126static unsigned int calculate_alignment(slab_flags_t flags,
127 unsigned int align, unsigned int size)
128{
129 /*
130 * If the user wants hardware cache aligned objects then follow that
131 * suggestion if the object is sufficiently large.
132 *
133 * The hardware cache alignment cannot override the specified
134 * alignment, though. If that is greater, then use it.
135 */
136 if (flags & SLAB_HWCACHE_ALIGN) {
137 unsigned int ralign;
138
139 ralign = cache_line_size();
140 while (size <= ralign / 2)
141 ralign /= 2;
142 align = max(align, ralign);
143 }
144
145 align = max(align, arch_slab_minalign());
146
147 return ALIGN(align, sizeof(void *));
148}
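/*
 * Illustrative example (editor's sketch, not from the original file),
 * assuming a 64-byte cache line, sizeof(void *) == 8 and a small
 * arch_slab_minalign():
 *
 *   calculate_alignment(SLAB_HWCACHE_ALIGN, 8, 40)
 *     ralign = 64; 40 > 64/2, so ralign stays 64
 *     align  = max(8, 64) = 64 -> ALIGN(64, 8) = 64
 *
 *   calculate_alignment(SLAB_HWCACHE_ALIGN, 8, 20)
 *     ralign = 64 -> 32 (20 <= 32), then the loop stops (20 > 16)
 *     align  = max(8, 32) = 32 -> ALIGN(32, 8) = 32
 *
 * i.e. sufficiently small objects are packed two or more per cache line
 * instead of each being padded out to a full line.
 */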
149
150/*
151 * Find a mergeable slab cache
152 */
153int slab_unmergeable(struct kmem_cache *s)
154{
155 if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
156 return 1;
157
158 if (s->ctor)
159 return 1;
160
161#ifdef CONFIG_HARDENED_USERCOPY
162 if (s->usersize)
163 return 1;
164#endif
165
166 if (s->cpu_sheaves)
167 return 1;
168
169 /*
170 * We may have set a slab to be unmergeable during bootstrap.
171 */
172 if (s->refcount < 0)
173 return 1;
174
175 return 0;
176}
177
178struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
179 slab_flags_t flags, const char *name, void (*ctor)(void *))
180{
181 struct kmem_cache *s;
182
183 if (slab_nomerge)
184 return NULL;
185
186 if (ctor)
187 return NULL;
188
189 flags = kmem_cache_flags(flags, name);
190
191 if (flags & SLAB_NEVER_MERGE)
192 return NULL;
193
194 size = ALIGN(size, sizeof(void *));
195 align = calculate_alignment(flags, align, size);
196 size = ALIGN(size, align);
197
198 list_for_each_entry_reverse(s, &slab_caches, list) {
199 if (slab_unmergeable(s))
200 continue;
201
202 if (size > s->size)
203 continue;
204
205 if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
206 continue;
207 /*
208 * Check if alignment is compatible.
209 * Courtesy of Adrian Drzewiecki
210 */
211 if ((s->size & ~(align - 1)) != s->size)
212 continue;
213
214 if (s->size - size >= sizeof(void *))
215 continue;
216
217 return s;
218 }
219 return NULL;
220}
221
222static struct kmem_cache *create_cache(const char *name,
223 unsigned int object_size,
224 struct kmem_cache_args *args,
225 slab_flags_t flags)
226{
227 struct kmem_cache *s;
228 int err;
229
230 /* If a custom freelist pointer is requested make sure it's sane. */
231 err = -EINVAL;
232 if (args->use_freeptr_offset &&
233 (args->freeptr_offset >= object_size ||
234 !(flags & SLAB_TYPESAFE_BY_RCU) ||
235 !IS_ALIGNED(args->freeptr_offset, __alignof__(freeptr_t))))
236 goto out;
237
238 err = -ENOMEM;
239 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
240 if (!s)
241 goto out;
242 err = do_kmem_cache_create(s, name, object_size, args, flags);
243 if (err)
244 goto out_free_cache;
245
246 s->refcount = 1;
247 list_add(&s->list, &slab_caches);
248 return s;
249
250out_free_cache:
251 kmem_cache_free(kmem_cache, s);
252out:
253 return ERR_PTR(err);
254}
255
256/**
257 * __kmem_cache_create_args - Create a kmem cache.
258 * @name: A string which is used in /proc/slabinfo to identify this cache.
259 * @object_size: The size of objects to be created in this cache.
260 * @args: Additional arguments for the cache creation (see
261 * &struct kmem_cache_args).
262 * @flags: See the descriptions of individual flags. The common ones are listed
263 * in the description below.
264 *
265 * Not to be called directly, use the kmem_cache_create() wrapper with the same
266 * parameters.
267 *
268 * Commonly used @flags:
269 *
270 * &SLAB_ACCOUNT - Account allocations to memcg.
271 *
272 * &SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries.
273 *
274 * &SLAB_RECLAIM_ACCOUNT - Objects are reclaimable.
275 *
276 * &SLAB_TYPESAFE_BY_RCU - Slab page (not individual objects) freeing delayed
277 * by a grace period - see the full description before using.
278 *
279 * Context: Cannot be called within an interrupt, but can be interrupted.
280 *
281 * Return: a pointer to the cache on success, NULL on failure.
282 */
283struct kmem_cache *__kmem_cache_create_args(const char *name,
284 unsigned int object_size,
285 struct kmem_cache_args *args,
286 slab_flags_t flags)
287{
288 struct kmem_cache *s = NULL;
289 const char *cache_name;
290 int err;
291
292#ifdef CONFIG_SLUB_DEBUG
293 /*
294 * If no slab_debug was enabled globally, the static key is not yet
295 * enabled by setup_slub_debug(). Enable it if the cache is being
296 * created with any of the debugging flags passed explicitly.
297 * It's also possible that this is the first cache created with
298 * SLAB_STORE_USER and we should init stack_depot for it.
299 */
300 if (flags & SLAB_DEBUG_FLAGS)
301 static_branch_enable(&slub_debug_enabled);
302 if (flags & SLAB_STORE_USER)
303 stack_depot_init();
304#else
305 flags &= ~SLAB_DEBUG_FLAGS;
306#endif
307
308 mutex_lock(&slab_mutex);
309
310 err = kmem_cache_sanity_check(name, object_size);
311 if (err) {
312 goto out_unlock;
313 }
314
315 if (flags & ~SLAB_FLAGS_PERMITTED) {
316 err = -EINVAL;
317 goto out_unlock;
318 }
319
320 /* Fail closed on bad usersize or useroffset values. */
321 if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) ||
322 WARN_ON(!args->usersize && args->useroffset) ||
323 WARN_ON(object_size < args->usersize ||
324 object_size - args->usersize < args->useroffset))
325 args->usersize = args->useroffset = 0;
326
327 if (!args->usersize && !args->sheaf_capacity)
328 s = __kmem_cache_alias(name, object_size, args->align, flags,
329 args->ctor);
330 if (s)
331 goto out_unlock;
332
333 cache_name = kstrdup_const(name, GFP_KERNEL);
334 if (!cache_name) {
335 err = -ENOMEM;
336 goto out_unlock;
337 }
338
339 args->align = calculate_alignment(flags, args->align, object_size);
340 s = create_cache(cache_name, object_size, args, flags);
341 if (IS_ERR(s)) {
342 err = PTR_ERR(s);
343 kfree_const(cache_name);
344 }
345
346out_unlock:
347 mutex_unlock(&slab_mutex);
348
349 if (err) {
350 if (flags & SLAB_PANIC)
351 panic("%s: Failed to create slab '%s'. Error %d\n",
352 __func__, name, err);
353 else {
354 pr_warn("%s(%s) failed with error %d\n",
355 __func__, name, err);
356 dump_stack();
357 }
358 return NULL;
359 }
360 return s;
361}
362EXPORT_SYMBOL(__kmem_cache_create_args);
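/*
 * Illustrative usage sketch (editor's addition; "struct foo" and the cache
 * name are made-up). Callers normally go through the kmem_cache_create()
 * wrapper, which packs its parameters into a struct kmem_cache_args and
 * lands here:
 *
 *   static struct kmem_cache *foo_cachep;
 *
 *   foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
 *                                  SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
 *   if (!foo_cachep)
 *           return -ENOMEM;
 *
 *   obj = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *   ...
 *   kmem_cache_free(foo_cachep, obj);
 */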
363
364static struct kmem_cache *kmem_buckets_cache __ro_after_init;
365
366/**
367 * kmem_buckets_create - Create a set of caches that handle dynamic sized
368 * allocations via kmem_buckets_alloc()
369 * @name: A prefix string which is used in /proc/slabinfo to identify this
370 * cache. The individual caches will have their sizes as the suffix.
371 * @flags: SLAB flags (see kmem_cache_create() for details).
372 * @useroffset: Starting offset within an allocation that may be copied
373 * to/from userspace.
374 * @usersize: How many bytes, starting at @useroffset, may be copied
375 * to/from userspace.
376 * @ctor: A constructor for the objects, run when new allocations are made.
377 *
378 * Cannot be called within an interrupt, but can be interrupted.
379 *
380 * Return: a pointer to the cache on success, NULL on failure. When
381 * CONFIG_SLAB_BUCKETS is not enabled, ZERO_SIZE_PTR is returned, and
382 * subsequent calls to kmem_buckets_alloc() will fall back to kmalloc().
383 * (i.e. callers only need to check for NULL on failure.)
384 */
385kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags,
386 unsigned int useroffset,
387 unsigned int usersize,
388 void (*ctor)(void *))
389{
390 unsigned long mask = 0;
391 unsigned int idx;
392 kmem_buckets *b;
393
394 BUILD_BUG_ON(ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]) > BITS_PER_LONG);
395
396 /*
397 * When the separate buckets API is not built in, just return
398 * a non-NULL value for the kmem_buckets pointer, which will be
399 * unused when performing allocations.
400 */
401 if (!IS_ENABLED(CONFIG_SLAB_BUCKETS))
402 return ZERO_SIZE_PTR;
403
404 if (WARN_ON(!kmem_buckets_cache))
405 return NULL;
406
407 b = kmem_cache_alloc(kmem_buckets_cache, GFP_KERNEL|__GFP_ZERO);
408 if (WARN_ON(!b))
409 return NULL;
410
411 flags |= SLAB_NO_MERGE;
412
413 for (idx = 0; idx < ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]); idx++) {
414 char *short_size, *cache_name;
415 unsigned int cache_useroffset, cache_usersize;
416 unsigned int size, aligned_idx;
417
418 if (!kmalloc_caches[KMALLOC_NORMAL][idx])
419 continue;
420
421 size = kmalloc_caches[KMALLOC_NORMAL][idx]->object_size;
422 if (!size)
423 continue;
424
425 short_size = strchr(kmalloc_caches[KMALLOC_NORMAL][idx]->name, '-');
426 if (WARN_ON(!short_size))
427 goto fail;
428
429 if (useroffset >= size) {
430 cache_useroffset = 0;
431 cache_usersize = 0;
432 } else {
433 cache_useroffset = useroffset;
434 cache_usersize = min(size - cache_useroffset, usersize);
435 }
436
437 aligned_idx = __kmalloc_index(size, false);
438 if (!(*b)[aligned_idx]) {
439 cache_name = kasprintf(GFP_KERNEL, "%s-%s", name, short_size + 1);
440 if (WARN_ON(!cache_name))
441 goto fail;
442 (*b)[aligned_idx] = kmem_cache_create_usercopy(cache_name, size,
443 0, flags, cache_useroffset,
444 cache_usersize, ctor);
445 kfree(cache_name);
446 if (WARN_ON(!(*b)[aligned_idx]))
447 goto fail;
448 set_bit(aligned_idx, &mask);
449 }
450 if (idx != aligned_idx)
451 (*b)[idx] = (*b)[aligned_idx];
452 }
453
454 return b;
455
456fail:
457 for_each_set_bit(idx, &mask, ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]))
458 kmem_cache_destroy((*b)[idx]);
459 kmem_cache_free(kmem_buckets_cache, b);
460
461 return NULL;
462}
463EXPORT_SYMBOL(kmem_buckets_create);
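/*
 * Illustrative usage sketch (editor's addition; the bucket-set name is
 * made up). A subsystem that wants its variable-sized allocations kept
 * apart from the shared kmalloc caches would do roughly:
 *
 *   static kmem_buckets *foo_buckets;
 *
 *   foo_buckets = kmem_buckets_create("foo", 0, 0, 0, NULL);
 *   if (!foo_buckets)
 *           return -ENOMEM;
 *
 *   p = kmem_buckets_alloc(foo_buckets, len, GFP_KERNEL);
 *   ...
 *   kfree(p);
 *
 * With CONFIG_SLAB_BUCKETS=n this transparently degrades to plain
 * kmalloc(), as described in the kernel-doc above.
 */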
464
465/*
466 * For a given kmem_cache, kmem_cache_destroy() should only be called
467 * once or there will be a use-after-free problem. The actual deletion
468 * and release of the kobject does not need slab_mutex or cpu_hotplug_lock
469 * protection. So they are now done without holding those locks.
470 */
471static void kmem_cache_release(struct kmem_cache *s)
472{
473 kfence_shutdown_cache(s);
474 if (__is_defined(SLAB_SUPPORTS_SYSFS) && slab_state >= FULL)
475 sysfs_slab_release(s);
476 else
477 slab_kmem_cache_release(s);
478}
479
480void slab_kmem_cache_release(struct kmem_cache *s)
481{
482 __kmem_cache_release(s);
483 kfree_const(s->name);
484 kmem_cache_free(kmem_cache, s);
485}
486
487void kmem_cache_destroy(struct kmem_cache *s)
488{
489 int err;
490
491 if (unlikely(!s) || !kasan_check_byte(s))
492 return;
493
494 /* in-flight kfree_rcu()'s may include objects from our cache */
495 kvfree_rcu_barrier();
496
497 if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) &&
498 (s->flags & SLAB_TYPESAFE_BY_RCU)) {
499 /*
500 * Under CONFIG_SLUB_RCU_DEBUG, when objects in a
501 * SLAB_TYPESAFE_BY_RCU slab are freed, SLUB will internally
502 * defer their freeing with call_rcu().
503 * Wait for such call_rcu() invocations here before actually
504 * destroying the cache.
505 *
506 * It doesn't matter that we haven't looked at the slab refcount
507 * yet - slabs with SLAB_TYPESAFE_BY_RCU can't be merged, so
508 * the refcount should be 1 here.
509 */
510 rcu_barrier();
511 }
512
513 /* Wait for deferred work from kmalloc/kfree_nolock() */
514 defer_free_barrier();
515
516 cpus_read_lock();
517 mutex_lock(&slab_mutex);
518
519 s->refcount--;
520 if (s->refcount) {
521 mutex_unlock(&slab_mutex);
522 cpus_read_unlock();
523 return;
524 }
525
526 /* free asan quarantined objects */
527 kasan_cache_shutdown(s);
528
529 err = __kmem_cache_shutdown(s);
530 if (!slab_in_kunit_test())
531 WARN(err, "%s %s: Slab cache still has objects when called from %pS",
532 __func__, s->name, (void *)_RET_IP_);
533
534 list_del(&s->list);
535
536 mutex_unlock(&slab_mutex);
537 cpus_read_unlock();
538
539 if (slab_state >= FULL)
540 sysfs_slab_unlink(s);
541 debugfs_slab_release(s);
542
543 if (err)
544 return;
545
546 if (s->flags & SLAB_TYPESAFE_BY_RCU)
547 rcu_barrier();
548
549 kmem_cache_release(s);
550}
551EXPORT_SYMBOL(kmem_cache_destroy);
552
553/**
554 * kmem_cache_shrink - Shrink a cache.
555 * @cachep: The cache to shrink.
556 *
557 * Releases as many slabs as possible for a cache.
558 * To help debugging, a zero exit status indicates all slabs were released.
559 *
560 * Return: %0 if all slabs were released, non-zero otherwise
561 */
562int kmem_cache_shrink(struct kmem_cache *cachep)
563{
564 kasan_cache_shrink(cachep);
565
566 return __kmem_cache_shrink(cachep);
567}
568EXPORT_SYMBOL(kmem_cache_shrink);
569
570bool slab_is_available(void)
571{
572 return slab_state >= UP;
573}
574
575#ifdef CONFIG_PRINTK
576static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
577{
578 if (__kfence_obj_info(kpp, object, slab))
579 return;
580 __kmem_obj_info(kpp, object, slab);
581}
582
583/**
584 * kmem_dump_obj - Print available slab provenance information
585 * @object: slab object for which to find provenance information.
586 *
587 * This function uses pr_cont(), so that the caller is expected to have
588 * printed out whatever preamble is appropriate. The provenance information
589 * depends on the type of object and on how much debugging is enabled.
590 * For a slab-cache object, the fact that it is a slab object is printed,
591 * and, if available, the slab name, return address, and stack trace from
592 * the allocation and last free path of that object.
593 *
594 * Return: %true if the pointer is to a not-yet-freed object from
595 * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer
596 * is to an already-freed object, and %false otherwise.
597 */
598bool kmem_dump_obj(void *object)
599{
600 char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc";
601 int i;
602 struct slab *slab;
603 unsigned long ptroffset;
604 struct kmem_obj_info kp = { };
605
606 /* Some arches consider ZERO_SIZE_PTR to be a valid address. */
607 if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
608 return false;
609 slab = virt_to_slab(object);
610 if (!slab)
611 return false;
612
613 kmem_obj_info(&kp, object, slab);
614 if (kp.kp_slab_cache)
615 pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name);
616 else
617 pr_cont(" slab%s", cp);
618 if (is_kfence_address(object))
619 pr_cont(" (kfence)");
620 if (kp.kp_objp)
621 pr_cont(" start %px", kp.kp_objp);
622 if (kp.kp_data_offset)
623 pr_cont(" data offset %lu", kp.kp_data_offset);
624 if (kp.kp_objp) {
625 ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset;
626 pr_cont(" pointer offset %lu", ptroffset);
627 }
628 if (kp.kp_slab_cache && kp.kp_slab_cache->object_size)
629 pr_cont(" size %u", kp.kp_slab_cache->object_size);
630 if (kp.kp_ret)
631 pr_cont(" allocated at %pS\n", kp.kp_ret);
632 else
633 pr_cont("\n");
634 for (i = 0; i < ARRAY_SIZE(kp.kp_stack); i++) {
635 if (!kp.kp_stack[i])
636 break;
637 pr_info(" %pS\n", kp.kp_stack[i]);
638 }
639
640 if (kp.kp_free_stack[0])
641 pr_cont(" Free path:\n");
642
643 for (i = 0; i < ARRAY_SIZE(kp.kp_free_stack); i++) {
644 if (!kp.kp_free_stack[i])
645 break;
646 pr_info(" %pS\n", kp.kp_free_stack[i]);
647 }
648
649 return true;
650}
651EXPORT_SYMBOL_GPL(kmem_dump_obj);
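/*
 * Illustrative usage sketch (editor's addition): because kmem_dump_obj()
 * continues the current line with pr_cont(), the caller prints the
 * preamble first and handles the non-slab case itself, e.g.:
 *
 *   pr_info("suspect pointer:");
 *   if (!kmem_dump_obj(ptr))
 *           pr_cont(" (not a slab object)\n");
 */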
652#endif
653
654/* Create a cache during boot when no slab services are available yet */
655void __init create_boot_cache(struct kmem_cache *s, const char *name,
656 unsigned int size, slab_flags_t flags,
657 unsigned int useroffset, unsigned int usersize)
658{
659 int err;
660 unsigned int align = ARCH_KMALLOC_MINALIGN;
661 struct kmem_cache_args kmem_args = {};
662
663 /*
664 * kmalloc caches guarantee alignment of at least the largest
665 * power-of-two divisor of the size. For power-of-two sizes,
666 * it is the size itself.
667 */
668 if (flags & SLAB_KMALLOC)
669 align = max(align, 1U << (ffs(size) - 1));
670 kmem_args.align = calculate_alignment(flags, align, size);
671
672#ifdef CONFIG_HARDENED_USERCOPY
673 kmem_args.useroffset = useroffset;
674 kmem_args.usersize = usersize;
675#endif
676
677 err = do_kmem_cache_create(s, name, size, &kmem_args, flags);
678
679 if (err)
680 panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n",
681 name, size, err);
682
683 s->refcount = -1; /* Exempt from merging for now */
684}
685
686static struct kmem_cache *__init create_kmalloc_cache(const char *name,
687 unsigned int size,
688 slab_flags_t flags)
689{
690 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
691
692 if (!s)
693 panic("Out of memory when creating slab %s\n", name);
694
695 create_boot_cache(s, name, size, flags | SLAB_KMALLOC, 0, size);
696 list_add(&s->list, &slab_caches);
697 s->refcount = 1;
698 return s;
699}
700
701kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES] __ro_after_init =
702{ /* initialization for https://llvm.org/pr42570 */ };
703EXPORT_SYMBOL(kmalloc_caches);
704
705#ifdef CONFIG_RANDOM_KMALLOC_CACHES
706unsigned long random_kmalloc_seed __ro_after_init;
707EXPORT_SYMBOL(random_kmalloc_seed);
708#endif
709
710/*
711 * Conversion table for small slab sizes / 8 to the index in the
712 * kmalloc array. This is necessary for slabs < 192 since we have non power
713 * of two cache sizes there. The size of larger slabs can be determined using
714 * fls.
715 */
716u8 kmalloc_size_index[24] __ro_after_init = {
717 3, /* 8 */
718 4, /* 16 */
719 5, /* 24 */
720 5, /* 32 */
721 6, /* 40 */
722 6, /* 48 */
723 6, /* 56 */
724 6, /* 64 */
725 1, /* 72 */
726 1, /* 80 */
727 1, /* 88 */
728 1, /* 96 */
729 7, /* 104 */
730 7, /* 112 */
731 7, /* 120 */
732 7, /* 128 */
733 2, /* 136 */
734 2, /* 144 */
735 2, /* 152 */
736 2, /* 160 */
737 2, /* 168 */
738 2, /* 176 */
739 2, /* 184 */
740 2 /* 192 */
741};
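/*
 * Illustrative lookup (editor's note): a 72-byte request falls into slot
 * size_index_elem(72) == 8 of the table above, which holds 1, i.e. index 1
 * of the kmalloc array - the 96-byte cache. Requests above 192 bytes bypass
 * this table and are sized via fls() instead.
 */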
742
743size_t kmalloc_size_roundup(size_t size)
744{
745 if (size && size <= KMALLOC_MAX_CACHE_SIZE) {
746 /*
747 * The flags don't matter since size_index is common to all.
748 * Neither does the caller for just getting ->object_size.
749 */
750 return kmalloc_slab(size, NULL, GFP_KERNEL, 0)->object_size;
751 }
752
753 /* Above the smaller buckets, size is a multiple of page size. */
754 if (size && size <= KMALLOC_MAX_SIZE)
755 return PAGE_SIZE << get_order(size);
756
757 /*
758 * Return 'size' for 0 - kmalloc() returns ZERO_SIZE_PTR
759 * and very large size - kmalloc() may fail.
760 */
761 return size;
762
763}
764EXPORT_SYMBOL(kmalloc_size_roundup);
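/*
 * Illustrative usage sketch (editor's addition): a caller that wants to use
 * the whole rounded-up allocation - e.g. a growable buffer - should size
 * the request explicitly rather than probing with ksize() afterwards:
 *
 *   size_t want = 600;
 *   size_t got  = kmalloc_size_roundup(want);  // 1024 with default caches
 *   char *buf   = kmalloc(got, GFP_KERNEL);
 *
 * The extra bytes are then part of the requested size, so KASAN,
 * UBSAN_BOUNDS and FORTIFY_SOURCE treat accesses to them as in-bounds.
 */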
765
766#ifdef CONFIG_ZONE_DMA
767#define KMALLOC_DMA_NAME(sz) .name[KMALLOC_DMA] = "dma-kmalloc-" #sz,
768#else
769#define KMALLOC_DMA_NAME(sz)
770#endif
771
772#ifdef CONFIG_MEMCG
773#define KMALLOC_CGROUP_NAME(sz) .name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz,
774#else
775#define KMALLOC_CGROUP_NAME(sz)
776#endif
777
778#ifndef CONFIG_SLUB_TINY
779#define KMALLOC_RCL_NAME(sz) .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #sz,
780#else
781#define KMALLOC_RCL_NAME(sz)
782#endif
783
784#ifdef CONFIG_RANDOM_KMALLOC_CACHES
785#define __KMALLOC_RANDOM_CONCAT(a, b) a ## b
786#define KMALLOC_RANDOM_NAME(N, sz) __KMALLOC_RANDOM_CONCAT(KMA_RAND_, N)(sz)
787#define KMA_RAND_1(sz) .name[KMALLOC_RANDOM_START + 1] = "kmalloc-rnd-01-" #sz,
788#define KMA_RAND_2(sz) KMA_RAND_1(sz) .name[KMALLOC_RANDOM_START + 2] = "kmalloc-rnd-02-" #sz,
789#define KMA_RAND_3(sz) KMA_RAND_2(sz) .name[KMALLOC_RANDOM_START + 3] = "kmalloc-rnd-03-" #sz,
790#define KMA_RAND_4(sz) KMA_RAND_3(sz) .name[KMALLOC_RANDOM_START + 4] = "kmalloc-rnd-04-" #sz,
791#define KMA_RAND_5(sz) KMA_RAND_4(sz) .name[KMALLOC_RANDOM_START + 5] = "kmalloc-rnd-05-" #sz,
792#define KMA_RAND_6(sz) KMA_RAND_5(sz) .name[KMALLOC_RANDOM_START + 6] = "kmalloc-rnd-06-" #sz,
793#define KMA_RAND_7(sz) KMA_RAND_6(sz) .name[KMALLOC_RANDOM_START + 7] = "kmalloc-rnd-07-" #sz,
794#define KMA_RAND_8(sz) KMA_RAND_7(sz) .name[KMALLOC_RANDOM_START + 8] = "kmalloc-rnd-08-" #sz,
795#define KMA_RAND_9(sz) KMA_RAND_8(sz) .name[KMALLOC_RANDOM_START + 9] = "kmalloc-rnd-09-" #sz,
796#define KMA_RAND_10(sz) KMA_RAND_9(sz) .name[KMALLOC_RANDOM_START + 10] = "kmalloc-rnd-10-" #sz,
797#define KMA_RAND_11(sz) KMA_RAND_10(sz) .name[KMALLOC_RANDOM_START + 11] = "kmalloc-rnd-11-" #sz,
798#define KMA_RAND_12(sz) KMA_RAND_11(sz) .name[KMALLOC_RANDOM_START + 12] = "kmalloc-rnd-12-" #sz,
799#define KMA_RAND_13(sz) KMA_RAND_12(sz) .name[KMALLOC_RANDOM_START + 13] = "kmalloc-rnd-13-" #sz,
800#define KMA_RAND_14(sz) KMA_RAND_13(sz) .name[KMALLOC_RANDOM_START + 14] = "kmalloc-rnd-14-" #sz,
801#define KMA_RAND_15(sz) KMA_RAND_14(sz) .name[KMALLOC_RANDOM_START + 15] = "kmalloc-rnd-15-" #sz,
802#else // CONFIG_RANDOM_KMALLOC_CACHES
803#define KMALLOC_RANDOM_NAME(N, sz)
804#endif
805
806#define INIT_KMALLOC_INFO(__size, __short_size) \
807{ \
808 .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
809 KMALLOC_RCL_NAME(__short_size) \
810 KMALLOC_CGROUP_NAME(__short_size) \
811 KMALLOC_DMA_NAME(__short_size) \
812 KMALLOC_RANDOM_NAME(RANDOM_KMALLOC_CACHES_NR, __short_size) \
813 .size = __size, \
814}
815
816/*
817 * kmalloc_info[] is to make slab_debug=,kmalloc-xx option work at boot time.
818 * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is
819 * kmalloc-2M.
820 */
821const struct kmalloc_info_struct kmalloc_info[] __initconst = {
822 INIT_KMALLOC_INFO(0, 0),
823 INIT_KMALLOC_INFO(96, 96),
824 INIT_KMALLOC_INFO(192, 192),
825 INIT_KMALLOC_INFO(8, 8),
826 INIT_KMALLOC_INFO(16, 16),
827 INIT_KMALLOC_INFO(32, 32),
828 INIT_KMALLOC_INFO(64, 64),
829 INIT_KMALLOC_INFO(128, 128),
830 INIT_KMALLOC_INFO(256, 256),
831 INIT_KMALLOC_INFO(512, 512),
832 INIT_KMALLOC_INFO(1024, 1k),
833 INIT_KMALLOC_INFO(2048, 2k),
834 INIT_KMALLOC_INFO(4096, 4k),
835 INIT_KMALLOC_INFO(8192, 8k),
836 INIT_KMALLOC_INFO(16384, 16k),
837 INIT_KMALLOC_INFO(32768, 32k),
838 INIT_KMALLOC_INFO(65536, 64k),
839 INIT_KMALLOC_INFO(131072, 128k),
840 INIT_KMALLOC_INFO(262144, 256k),
841 INIT_KMALLOC_INFO(524288, 512k),
842 INIT_KMALLOC_INFO(1048576, 1M),
843 INIT_KMALLOC_INFO(2097152, 2M)
844};
845
846/*
847 * Patch up the size_index table if we have strange large alignment
848 * requirements for the kmalloc array. This is only the case for
849 * MIPS it seems. The standard arches will not generate any code here.
850 *
851 * Largest permitted alignment is 256 bytes due to the way we
852 * handle the index determination for the smaller caches.
853 *
854 * Make sure that nothing crazy happens if someone starts tinkering
855 * around with ARCH_KMALLOC_MINALIGN
856 */
857void __init setup_kmalloc_cache_index_table(void)
858{
859 unsigned int i;
860
861 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
862 !is_power_of_2(KMALLOC_MIN_SIZE));
863
864 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
865 unsigned int elem = size_index_elem(i);
866
867 if (elem >= ARRAY_SIZE(kmalloc_size_index))
868 break;
869 kmalloc_size_index[elem] = KMALLOC_SHIFT_LOW;
870 }
871
872 if (KMALLOC_MIN_SIZE >= 64) {
873 /*
874 * The 96 byte sized cache is not used if the alignment
875 * is 64 byte.
876 */
877 for (i = 64 + 8; i <= 96; i += 8)
878 kmalloc_size_index[size_index_elem(i)] = 7;
879
880 }
881
882 if (KMALLOC_MIN_SIZE >= 128) {
883 /*
884 * The 192 byte sized cache is not used if the alignment
885 * is 128 byte. Redirect kmalloc to use the 256 byte cache
886 * instead.
887 */
888 for (i = 128 + 8; i <= 192; i += 8)
889 kmalloc_size_index[size_index_elem(i)] = 8;
890 }
891}
892
893static unsigned int __kmalloc_minalign(void)
894{
895 unsigned int minalign = dma_get_cache_alignment();
896
897 if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) &&
898 is_swiotlb_allocated())
899 minalign = ARCH_KMALLOC_MINALIGN;
900
901 return max(minalign, arch_slab_minalign());
902}
903
904static void __init
905new_kmalloc_cache(int idx, enum kmalloc_cache_type type)
906{
907 slab_flags_t flags = 0;
908 unsigned int minalign = __kmalloc_minalign();
909 unsigned int aligned_size = kmalloc_info[idx].size;
910 int aligned_idx = idx;
911
912 if ((KMALLOC_RECLAIM != KMALLOC_NORMAL) && (type == KMALLOC_RECLAIM)) {
913 flags |= SLAB_RECLAIM_ACCOUNT;
914 } else if (IS_ENABLED(CONFIG_MEMCG) && (type == KMALLOC_CGROUP)) {
915 if (mem_cgroup_kmem_disabled()) {
916 kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx];
917 return;
918 }
919 flags |= SLAB_ACCOUNT;
920 } else if (IS_ENABLED(CONFIG_ZONE_DMA) && (type == KMALLOC_DMA)) {
921 flags |= SLAB_CACHE_DMA;
922 }
923
924#ifdef CONFIG_RANDOM_KMALLOC_CACHES
925 if (type >= KMALLOC_RANDOM_START && type <= KMALLOC_RANDOM_END)
926 flags |= SLAB_NO_MERGE;
927#endif
928
929 /*
930 * If CONFIG_MEMCG is enabled, disable cache merging for
931 * KMALLOC_NORMAL caches.
932 */
933 if (IS_ENABLED(CONFIG_MEMCG) && (type == KMALLOC_NORMAL))
934 flags |= SLAB_NO_MERGE;
935
936 if (minalign > ARCH_KMALLOC_MINALIGN) {
937 aligned_size = ALIGN(aligned_size, minalign);
938 aligned_idx = __kmalloc_index(aligned_size, false);
939 }
940
941 if (!kmalloc_caches[type][aligned_idx])
942 kmalloc_caches[type][aligned_idx] = create_kmalloc_cache(
943 kmalloc_info[aligned_idx].name[type],
944 aligned_size, flags);
945 if (idx != aligned_idx)
946 kmalloc_caches[type][idx] = kmalloc_caches[type][aligned_idx];
947}
948
949/*
950 * Create the kmalloc array. Some of the regular kmalloc arrays
951 * may already have been created because they were needed to
952 * enable allocations for slab creation.
953 */
954void __init create_kmalloc_caches(void)
955{
956 int i;
957 enum kmalloc_cache_type type;
958
959 /*
960 * Including KMALLOC_CGROUP if CONFIG_MEMCG defined
961 */
962 for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) {
963 /* Caches that are NOT of the two-to-the-power-of size. */
964 if (KMALLOC_MIN_SIZE <= 32)
965 new_kmalloc_cache(1, type);
966 if (KMALLOC_MIN_SIZE <= 64)
967 new_kmalloc_cache(2, type);
968
969 /* Caches that are of the two-to-the-power-of size. */
970 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
971 new_kmalloc_cache(i, type);
972 }
973#ifdef CONFIG_RANDOM_KMALLOC_CACHES
974 random_kmalloc_seed = get_random_u64();
975#endif
976
977 /* Kmalloc array is now usable */
978 slab_state = UP;
979
980 if (IS_ENABLED(CONFIG_SLAB_BUCKETS))
981 kmem_buckets_cache = kmem_cache_create("kmalloc_buckets",
982 sizeof(kmem_buckets),
983 0, SLAB_NO_MERGE, NULL);
984}
985
986/**
987 * __ksize -- Report full size of underlying allocation
988 * @object: pointer to the object
989 *
990 * This should only be used internally to query the true size of allocations.
991 * It is not meant to be a way to discover the usable size of an allocation
992 * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond
993 * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS,
994 * and/or FORTIFY_SOURCE.
995 *
996 * Return: size of the actual memory used by @object in bytes
997 */
998size_t __ksize(const void *object)
999{
1000 struct folio *folio;
1001
1002 if (unlikely(object == ZERO_SIZE_PTR))
1003 return 0;
1004
1005 folio = virt_to_folio(object);
1006
1007 if (unlikely(!folio_test_slab(folio))) {
1008 if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE))
1009 return 0;
1010 if (WARN_ON(object != folio_address(folio)))
1011 return 0;
1012 return folio_size(folio);
1013 }
1014
1015#ifdef CONFIG_SLUB_DEBUG
1016 skip_orig_size_check(folio_slab(folio)->slab_cache, object);
1017#endif
1018
1019 return slab_ksize(folio_slab(folio)->slab_cache);
1020}
1021
1022gfp_t kmalloc_fix_flags(gfp_t flags)
1023{
1024 gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
1025
1026 flags &= ~GFP_SLAB_BUG_MASK;
1027 pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
1028 invalid_mask, &invalid_mask, flags, &flags);
1029 dump_stack();
1030
1031 return flags;
1032}
1033
1034#ifdef CONFIG_SLAB_FREELIST_RANDOM
1035/* Randomize a generic freelist */
1036static void freelist_randomize(unsigned int *list,
1037 unsigned int count)
1038{
1039 unsigned int rand;
1040 unsigned int i;
1041
1042 for (i = 0; i < count; i++)
1043 list[i] = i;
1044
1045 /* Fisher-Yates shuffle */
1046 for (i = count - 1; i > 0; i--) {
1047 rand = get_random_u32_below(i + 1);
1048 swap(list[i], list[rand]);
1049 }
1050}
1051
1052/* Create a random sequence per cache */
1053int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
1054 gfp_t gfp)
1055{
1056
1057 if (count < 2 || cachep->random_seq)
1058 return 0;
1059
1060 cachep->random_seq = kcalloc(count, sizeof(unsigned int), gfp);
1061 if (!cachep->random_seq)
1062 return -ENOMEM;
1063
1064 freelist_randomize(cachep->random_seq, count);
1065 return 0;
1066}
1067
1068/* Destroy the per-cache random freelist sequence */
1069void cache_random_seq_destroy(struct kmem_cache *cachep)
1070{
1071 kfree(cachep->random_seq);
1072 cachep->random_seq = NULL;
1073}
1074#endif /* CONFIG_SLAB_FREELIST_RANDOM */
1075
1076#ifdef CONFIG_SLUB_DEBUG
1077#define SLABINFO_RIGHTS (0400)
1078
1079static void print_slabinfo_header(struct seq_file *m)
1080{
1081 /*
1082 * Output format version, so at least we can change it
1083 * without _too_ many complaints.
1084 */
1085 seq_puts(m, "slabinfo - version: 2.1\n");
1086 seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
1087 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
1088 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
1089 seq_putc(m, '\n');
1090}
1091
1092static void *slab_start(struct seq_file *m, loff_t *pos)
1093{
1094 mutex_lock(&slab_mutex);
1095 return seq_list_start(&slab_caches, *pos);
1096}
1097
1098static void *slab_next(struct seq_file *m, void *p, loff_t *pos)
1099{
1100 return seq_list_next(p, &slab_caches, pos);
1101}
1102
1103static void slab_stop(struct seq_file *m, void *p)
1104{
1105 mutex_unlock(&slab_mutex);
1106}
1107
1108static void cache_show(struct kmem_cache *s, struct seq_file *m)
1109{
1110 struct slabinfo sinfo;
1111
1112 memset(&sinfo, 0, sizeof(sinfo));
1113 get_slabinfo(s, &sinfo);
1114
1115 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
1116 s->name, sinfo.active_objs, sinfo.num_objs, s->size,
1117 sinfo.objects_per_slab, (1 << sinfo.cache_order));
1118
1119 seq_printf(m, " : tunables %4u %4u %4u",
1120 sinfo.limit, sinfo.batchcount, sinfo.shared);
1121 seq_printf(m, " : slabdata %6lu %6lu %6lu",
1122 sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
1123 seq_putc(m, '\n');
1124}
1125
1126static int slab_show(struct seq_file *m, void *p)
1127{
1128 struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
1129
1130 if (p == slab_caches.next)
1131 print_slabinfo_header(m);
1132 cache_show(s, m);
1133 return 0;
1134}
1135
1136void dump_unreclaimable_slab(void)
1137{
1138 struct kmem_cache *s;
1139 struct slabinfo sinfo;
1140
1141 /*
1142 * Acquiring slab_mutex here is risky since we don't want to
1143 * sleep in the OOM path. But without holding the mutex, the
1144 * list traversal may race with cache destruction and crash.
1145 * Use mutex_trylock to protect the traversal, and dump nothing
1146 * if the mutex cannot be acquired.
1147 */
1148 if (!mutex_trylock(&slab_mutex)) {
1149 pr_warn("excessive unreclaimable slab but cannot dump stats\n");
1150 return;
1151 }
1152
1153 pr_info("Unreclaimable slab info:\n");
1154 pr_info("Name Used Total\n");
1155
1156 list_for_each_entry(s, &slab_caches, list) {
1157 if (s->flags & SLAB_RECLAIM_ACCOUNT)
1158 continue;
1159
1160 get_slabinfo(s, &sinfo);
1161
1162 if (sinfo.num_objs > 0)
1163 pr_info("%-17s %10luKB %10luKB\n", s->name,
1164 (sinfo.active_objs * s->size) / 1024,
1165 (sinfo.num_objs * s->size) / 1024);
1166 }
1167 mutex_unlock(&slab_mutex);
1168}
1169
1170/*
1171 * slabinfo_op - iterator that generates /proc/slabinfo
1172 *
1173 * Output layout:
1174 * cache-name
1175 * num-active-objs
1176 * total-objs
1177 * object size
1178 * num-active-slabs
1179 * total-slabs
1180 * num-pages-per-slab
1181 * + further values on SMP and with statistics enabled
1182 */
1183static const struct seq_operations slabinfo_op = {
1184 .start = slab_start,
1185 .next = slab_next,
1186 .stop = slab_stop,
1187 .show = slab_show,
1188};
1189
1190static int slabinfo_open(struct inode *inode, struct file *file)
1191{
1192 return seq_open(file, &slabinfo_op);
1193}
1194
1195static const struct proc_ops slabinfo_proc_ops = {
1196 .proc_flags = PROC_ENTRY_PERMANENT,
1197 .proc_open = slabinfo_open,
1198 .proc_read = seq_read,
1199 .proc_lseek = seq_lseek,
1200 .proc_release = seq_release,
1201};
1202
1203static int __init slab_proc_init(void)
1204{
1205 proc_create("slabinfo", SLABINFO_RIGHTS, NULL, &slabinfo_proc_ops);
1206 return 0;
1207}
1208module_init(slab_proc_init);
1209
1210#endif /* CONFIG_SLUB_DEBUG */
1211
1212/**
1213 * kfree_sensitive - Clear sensitive information in memory before freeing
1214 * @p: object to free memory of
1215 *
1216 * The memory of the object @p points to is zeroed before freed.
1217 * If @p is %NULL, kfree_sensitive() does nothing.
1218 *
1219 * Note: this function zeroes the whole allocated buffer which can be a good
1220 * deal bigger than the requested buffer size passed to kmalloc(). So be
1221 * careful when using this function in performance sensitive code.
1222 */
1223void kfree_sensitive(const void *p)
1224{
1225 size_t ks;
1226 void *mem = (void *)p;
1227
1228 ks = ksize(mem);
1229 if (ks) {
1230 kasan_unpoison_range(mem, ks);
1231 memzero_explicit(mem, ks);
1232 }
1233 kfree(mem);
1234}
1235EXPORT_SYMBOL(kfree_sensitive);
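/*
 * Illustrative usage sketch (editor's addition; names are made up). The
 * typical pattern for key material or other secrets:
 *
 *   u8 *key = kmalloc(keylen, GFP_KERNEL);
 *   ...
 *   kfree_sensitive(key);  // zeroes the whole ksize(key) region, then frees
 *
 * kfree_sensitive(NULL) is a no-op, so error paths need no NULL check.
 */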
1236
1237size_t ksize(const void *objp)
1238{
1239 /*
1240 * We need to first check that the pointer to the object is valid.
1241 * The KASAN report printed from ksize() is more useful than when
1242 * it's printed later, when the behaviour could be undefined due to
1243 * a potential use-after-free or double-free.
1244 *
1245 * We use kasan_check_byte(), which is supported for the hardware
1246 * tag-based KASAN mode, unlike kasan_check_read/write().
1247 *
1248 * If the pointed to memory is invalid, we return 0 to avoid users of
1249 * ksize() writing to and potentially corrupting the memory region.
1250 *
1251 * We want to perform the check before __ksize(), to avoid potentially
1252 * crashing in __ksize() due to accessing invalid metadata.
1253 */
1254 if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp))
1255 return 0;
1256
1257 return kfence_ksize(objp) ?: __ksize(objp);
1258}
1259EXPORT_SYMBOL(ksize);
1260
1261#ifdef CONFIG_BPF_SYSCALL
1262#include <linux/btf.h>
1263
1264__bpf_kfunc_start_defs();
1265
1266__bpf_kfunc struct kmem_cache *bpf_get_kmem_cache(u64 addr)
1267{
1268 struct slab *slab;
1269
1270 if (!virt_addr_valid((void *)(long)addr))
1271 return NULL;
1272
1273 slab = virt_to_slab((void *)(long)addr);
1274 return slab ? slab->slab_cache : NULL;
1275}
1276
1277__bpf_kfunc_end_defs();
1278#endif /* CONFIG_BPF_SYSCALL */
1279
1280/* Tracepoints definitions. */
1281EXPORT_TRACEPOINT_SYMBOL(kmalloc);
1282EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
1283EXPORT_TRACEPOINT_SYMBOL(kfree);
1284EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
1285
1286#ifndef CONFIG_KVFREE_RCU_BATCHED
1287
1288void kvfree_call_rcu(struct rcu_head *head, void *ptr)
1289{
1290 if (head) {
1291 kasan_record_aux_stack(ptr);
1292 call_rcu(head, kvfree_rcu_cb);
1293 return;
1294 }
1295
1296 // kvfree_rcu(one_arg) call.
1297 might_sleep();
1298 synchronize_rcu();
1299 kvfree(ptr);
1300}
1301EXPORT_SYMBOL_GPL(kvfree_call_rcu);
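/*
 * Illustrative usage sketch (editor's addition; "struct foo" and its
 * rcu_head field are made-up names). Callers reach kvfree_call_rcu()
 * through the kfree_rcu()/kvfree_rcu() macros:
 *
 *   struct foo {
 *           struct rcu_head rcu;
 *           ...
 *   };
 *
 *   kfree_rcu(p, rcu);            // two-argument form, queued via p->rcu
 *   kvfree_rcu_mightsleep(q);     // single-argument form, may block (above)
 */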
1302
1303void __init kvfree_rcu_init(void)
1304{
1305}
1306
1307#else /* CONFIG_KVFREE_RCU_BATCHED */
1308
1309/*
1310 * This rcu parameter is runtime-read-only. It reflects
1311 * a minimum allowed number of objects which can be cached
1312 * per-CPU. Object size is equal to one page. This value
1313 * can be changed at boot time.
1314 */
1315static int rcu_min_cached_objs = 5;
1316module_param(rcu_min_cached_objs, int, 0444);
1317
1318// A page shrinker can ask for pages to be freed to make them
1319// available for other parts of the system. This usually happens
1320// under low memory conditions, and in that case we should also
1321// defer page-cache filling for a short time period.
1322//
1323// The default value is 5 seconds, which is long enough to reduce
1324// interference with the shrinker while it asks other systems to
1325// drain their caches.
1326static int rcu_delay_page_cache_fill_msec = 5000;
1327module_param(rcu_delay_page_cache_fill_msec, int, 0444);
1328
1329static struct workqueue_struct *rcu_reclaim_wq;
1330
1331/* Maximum number of jiffies to wait before draining a batch. */
1332#define KFREE_DRAIN_JIFFIES (5 * HZ)
1333#define KFREE_N_BATCHES 2
1334#define FREE_N_CHANNELS 2
1335
1336/**
1337 * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
1338 * @list: List node. All blocks are linked between each other
1339 * @gp_snap: Snapshot of RCU state for objects placed to this bulk
1340 * @nr_records: Number of active pointers in the array
1341 * @records: Array of the kvfree_rcu() pointers
1342 */
1343struct kvfree_rcu_bulk_data {
1344 struct list_head list;
1345 struct rcu_gp_oldstate gp_snap;
1346 unsigned long nr_records;
1347 void *records[] __counted_by(nr_records);
1348};
1349
1350/*
1351 * This macro defines how many entries the "records" array
1352 * will contain. It is based on the fact that the size of
1353 * kvfree_rcu_bulk_data structure becomes exactly one page.
1354 */
1355#define KVFREE_BULK_MAX_ENTR \
1356 ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
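/*
 * Illustrative arithmetic (editor's note): with 4 KiB pages and a typical
 * 64-bit layout, sizeof(struct kvfree_rcu_bulk_data) is 40 bytes (16 for
 * the list_head, 16 for the rcu_gp_oldstate snapshot, 8 for nr_records),
 * so KVFREE_BULK_MAX_ENTR works out to (4096 - 40) / 8 = 507 pointers per
 * block. The exact figure depends on the architecture and configuration.
 */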
1357
1358/**
1359 * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
1360 * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
1361 * @head_free: List of kfree_rcu() objects waiting for a grace period
1362 * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
1363 * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
1364 * @krcp: Pointer to @kfree_rcu_cpu structure
1365 */
1366
1367struct kfree_rcu_cpu_work {
1368 struct rcu_work rcu_work;
1369 struct rcu_head *head_free;
1370 struct rcu_gp_oldstate head_free_gp_snap;
1371 struct list_head bulk_head_free[FREE_N_CHANNELS];
1372 struct kfree_rcu_cpu *krcp;
1373};
1374
1375/**
1376 * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
1377 * @head: List of kfree_rcu() objects not yet waiting for a grace period
1378 * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
1379 * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
1380 * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
1381 * @lock: Synchronize access to this structure
1382 * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
1383 * @initialized: The @rcu_work fields have been initialized
1384 * @head_count: Number of objects in rcu_head singular list
1385 * @bulk_count: Number of objects in bulk-list
1386 * @bkvcache:
1387 * A simple cache list that contains objects for reuse purpose.
1388 * In order to save some per-cpu space the list is singular.
1389 * Even though it is lockless an access has to be protected by the
1390 * per-cpu lock.
1391 * @page_cache_work: A work to refill the cache when it is empty
1392 * @backoff_page_cache_fill: Delay cache refills
1393 * @work_in_progress: Indicates that page_cache_work is running
1394 * @hrtimer: A hrtimer for scheduling a page_cache_work
1395 * @nr_bkv_objs: number of allocated objects at @bkvcache.
1396 *
1397 * This is a per-CPU structure. The reason that it is not included in
1398 * the rcu_data structure is to permit this code to be extracted from
1399 * the RCU files. Such extraction could allow further optimization of
1400 * the interactions with the slab allocators.
1401 */
1402struct kfree_rcu_cpu {
1403 // Objects queued on a linked list
1404 // through their rcu_head structures.
1405 struct rcu_head *head;
1406 unsigned long head_gp_snap;
1407 atomic_t head_count;
1408
1409 // Objects queued on a bulk-list.
1410 struct list_head bulk_head[FREE_N_CHANNELS];
1411 atomic_t bulk_count[FREE_N_CHANNELS];
1412
1413 struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
1414 raw_spinlock_t lock;
1415 struct delayed_work monitor_work;
1416 bool initialized;
1417
1418 struct delayed_work page_cache_work;
1419 atomic_t backoff_page_cache_fill;
1420 atomic_t work_in_progress;
1421 struct hrtimer hrtimer;
1422
1423 struct llist_head bkvcache;
1424 int nr_bkv_objs;
1425};
1426
1427static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
1428 .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
1429};
1430
1431static __always_inline void
1432debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
1433{
1434#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
1435 int i;
1436
1437 for (i = 0; i < bhead->nr_records; i++)
1438 debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
1439#endif
1440}
1441
1442static inline struct kfree_rcu_cpu *
1443krc_this_cpu_lock(unsigned long *flags)
1444{
1445 struct kfree_rcu_cpu *krcp;
1446
1447 local_irq_save(*flags); // For safely calling this_cpu_ptr().
1448 krcp = this_cpu_ptr(&krc);
1449 raw_spin_lock(&krcp->lock);
1450
1451 return krcp;
1452}
1453
1454static inline void
1455krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
1456{
1457 raw_spin_unlock_irqrestore(&krcp->lock, flags);
1458}
1459
1460static inline struct kvfree_rcu_bulk_data *
1461get_cached_bnode(struct kfree_rcu_cpu *krcp)
1462{
1463 if (!krcp->nr_bkv_objs)
1464 return NULL;
1465
1466 WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
1467 return (struct kvfree_rcu_bulk_data *)
1468 llist_del_first(&krcp->bkvcache);
1469}
1470
1471static inline bool
1472put_cached_bnode(struct kfree_rcu_cpu *krcp,
1473 struct kvfree_rcu_bulk_data *bnode)
1474{
1475 // Check the limit.
1476 if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
1477 return false;
1478
1479 llist_add((struct llist_node *) bnode, &krcp->bkvcache);
1480 WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
1481 return true;
1482}
1483
1484static int
1485drain_page_cache(struct kfree_rcu_cpu *krcp)
1486{
1487 unsigned long flags;
1488 struct llist_node *page_list, *pos, *n;
1489 int freed = 0;
1490
1491 if (!rcu_min_cached_objs)
1492 return 0;
1493
1494 raw_spin_lock_irqsave(&krcp->lock, flags);
1495 page_list = llist_del_all(&krcp->bkvcache);
1496 WRITE_ONCE(krcp->nr_bkv_objs, 0);
1497 raw_spin_unlock_irqrestore(&krcp->lock, flags);
1498
1499 llist_for_each_safe(pos, n, page_list) {
1500 free_page((unsigned long)pos);
1501 freed++;
1502 }
1503
1504 return freed;
1505}
1506
1507static void
1508kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
1509 struct kvfree_rcu_bulk_data *bnode, int idx)
1510{
1511 unsigned long flags;
1512 int i;
1513
1514 if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
1515 debug_rcu_bhead_unqueue(bnode);
1516 rcu_lock_acquire(&rcu_callback_map);
1517 if (idx == 0) { // kmalloc() / kfree().
1518 trace_rcu_invoke_kfree_bulk_callback(
1519 "slab", bnode->nr_records,
1520 bnode->records);
1521
1522 kfree_bulk(bnode->nr_records, bnode->records);
1523 } else { // vmalloc() / vfree().
1524 for (i = 0; i < bnode->nr_records; i++) {
1525 trace_rcu_invoke_kvfree_callback(
1526 "slab", bnode->records[i], 0);
1527
1528 vfree(bnode->records[i]);
1529 }
1530 }
1531 rcu_lock_release(&rcu_callback_map);
1532 }
1533
1534 raw_spin_lock_irqsave(&krcp->lock, flags);
1535 if (put_cached_bnode(krcp, bnode))
1536 bnode = NULL;
1537 raw_spin_unlock_irqrestore(&krcp->lock, flags);
1538
1539 if (bnode)
1540 free_page((unsigned long) bnode);
1541
1542 cond_resched_tasks_rcu_qs();
1543}
1544
1545static void
1546kvfree_rcu_list(struct rcu_head *head)
1547{
1548 struct rcu_head *next;
1549
1550 for (; head; head = next) {
1551 void *ptr = (void *) head->func;
1552 unsigned long offset = (void *) head - ptr;
1553
1554 next = head->next;
1555 debug_rcu_head_unqueue((struct rcu_head *)ptr);
1556 rcu_lock_acquire(&rcu_callback_map);
1557 trace_rcu_invoke_kvfree_callback("slab", head, offset);
1558
1559 kvfree(ptr);
1560
1561 rcu_lock_release(&rcu_callback_map);
1562 cond_resched_tasks_rcu_qs();
1563 }
1564}
1565
1566/*
1567 * This function is invoked in workqueue context after a grace period.
1568 * It frees all the objects queued on ->bulk_head_free or ->head_free.
1569 */
1570static void kfree_rcu_work(struct work_struct *work)
1571{
1572 unsigned long flags;
1573 struct kvfree_rcu_bulk_data *bnode, *n;
1574 struct list_head bulk_head[FREE_N_CHANNELS];
1575 struct rcu_head *head;
1576 struct kfree_rcu_cpu *krcp;
1577 struct kfree_rcu_cpu_work *krwp;
1578 struct rcu_gp_oldstate head_gp_snap;
1579 int i;
1580
1581 krwp = container_of(to_rcu_work(work),
1582 struct kfree_rcu_cpu_work, rcu_work);
1583 krcp = krwp->krcp;
1584
1585 raw_spin_lock_irqsave(&krcp->lock, flags);
1586 // Channels 1 and 2.
1587 for (i = 0; i < FREE_N_CHANNELS; i++)
1588 list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
1589
1590 // Channel 3.
1591 head = krwp->head_free;
1592 krwp->head_free = NULL;
1593 head_gp_snap = krwp->head_free_gp_snap;
1594 raw_spin_unlock_irqrestore(&krcp->lock, flags);
1595
1596 // Handle the first two channels.
1597 for (i = 0; i < FREE_N_CHANNELS; i++) {
1598 // Start from the tail page, so a GP is likely passed for it.
1599 list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
1600 kvfree_rcu_bulk(krcp, bnode, i);
1601 }
1602
1603 /*
1604 * This is used when the "bulk" path can not be used for the
1605 * double-argument of kvfree_rcu(). This happens when the
1606 * page-cache is empty, which means that objects are instead
1607 * queued on a linked list through their rcu_head structures.
1608 * This list is named "Channel 3".
1609 */
1610 if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
1611 kvfree_rcu_list(head);
1612}
1613
1614static bool kfree_rcu_sheaf(void *obj)
1615{
1616 struct kmem_cache *s;
1617 struct folio *folio;
1618 struct slab *slab;
1619
1620 if (is_vmalloc_addr(obj))
1621 return false;
1622
1623 folio = virt_to_folio(obj);
1624 if (unlikely(!folio_test_slab(folio)))
1625 return false;
1626
1627 slab = folio_slab(folio);
1628 s = slab->slab_cache;
1629 if (s->cpu_sheaves) {
1630 if (likely(!IS_ENABLED(CONFIG_NUMA) ||
1631 slab_nid(slab) == numa_mem_id()))
1632 return __kfree_rcu_sheaf(s, obj);
1633 }
1634
1635 return false;
1636}
1637
1638static bool
1639need_offload_krc(struct kfree_rcu_cpu *krcp)
1640{
1641 int i;
1642
1643 for (i = 0; i < FREE_N_CHANNELS; i++)
1644 if (!list_empty(&krcp->bulk_head[i]))
1645 return true;
1646
1647 return !!READ_ONCE(krcp->head);
1648}
1649
1650static bool
1651need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
1652{
1653 int i;
1654
1655 for (i = 0; i < FREE_N_CHANNELS; i++)
1656 if (!list_empty(&krwp->bulk_head_free[i]))
1657 return true;
1658
1659 return !!krwp->head_free;
1660}
1661
1662static int krc_count(struct kfree_rcu_cpu *krcp)
1663{
1664 int sum = atomic_read(&krcp->head_count);
1665 int i;
1666
1667 for (i = 0; i < FREE_N_CHANNELS; i++)
1668 sum += atomic_read(&krcp->bulk_count[i]);
1669
1670 return sum;
1671}
1672
1673static void
1674__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
1675{
1676 long delay, delay_left;
1677
1678 delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1 : KFREE_DRAIN_JIFFIES;
1679 if (delayed_work_pending(&krcp->monitor_work)) {
1680 delay_left = krcp->monitor_work.timer.expires - jiffies;
1681 if (delay < delay_left)
1682 mod_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
1683 return;
1684 }
1685 queue_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
1686}
1687
1688static void
1689schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
1690{
1691 unsigned long flags;
1692
1693 raw_spin_lock_irqsave(&krcp->lock, flags);
1694 __schedule_delayed_monitor_work(krcp);
1695 raw_spin_unlock_irqrestore(&krcp->lock, flags);
1696}
1697
1698static void
1699kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
1700{
1701 struct list_head bulk_ready[FREE_N_CHANNELS];
1702 struct kvfree_rcu_bulk_data *bnode, *n;
1703 struct rcu_head *head_ready = NULL;
1704 unsigned long flags;
1705 int i;
1706
1707 raw_spin_lock_irqsave(&krcp->lock, flags);
1708 for (i = 0; i < FREE_N_CHANNELS; i++) {
1709 INIT_LIST_HEAD(&bulk_ready[i]);
1710
1711 list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
1712 if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
1713 break;
1714
1715 atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
1716 list_move(&bnode->list, &bulk_ready[i]);
1717 }
1718 }
1719
1720 if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
1721 head_ready = krcp->head;
1722 atomic_set(&krcp->head_count, 0);
1723 WRITE_ONCE(krcp->head, NULL);
1724 }
1725 raw_spin_unlock_irqrestore(&krcp->lock, flags);
1726
1727 for (i = 0; i < FREE_N_CHANNELS; i++) {
1728 list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
1729 kvfree_rcu_bulk(krcp, bnode, i);
1730 }
1731
1732 if (head_ready)
1733 kvfree_rcu_list(head_ready);
1734}
1735
1736/*
1737 * Return: %true if a work is queued, %false otherwise.
1738 */
1739static bool
1740kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
1741{
1742 unsigned long flags;
1743 bool queued = false;
1744 int i, j;
1745
1746 raw_spin_lock_irqsave(&krcp->lock, flags);
1747
1748 // Attempt to start a new batch.
1749 for (i = 0; i < KFREE_N_BATCHES; i++) {
1750 struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
1751
1752 // Try to detach bulk_head or head and attach it, only when
1753 // all channels are free. If any channel is not free, krwp
1754 // still has in-flight RCU work handling a previous batch.
1755 if (need_wait_for_krwp_work(krwp))
1756 continue;
1757
1758 // kvfree_rcu_drain_ready() might handle this krcp, if so give up.
1759 if (need_offload_krc(krcp)) {
1760 // Channel 1 corresponds to the SLAB-pointer bulk path.
1761 // Channel 2 corresponds to vmalloc-pointer bulk path.
1762 for (j = 0; j < FREE_N_CHANNELS; j++) {
1763 if (list_empty(&krwp->bulk_head_free[j])) {
1764 atomic_set(&krcp->bulk_count[j], 0);
1765 list_replace_init(&krcp->bulk_head[j],
1766 &krwp->bulk_head_free[j]);
1767 }
1768 }
1769
1770 // Channel 3 corresponds to both SLAB and vmalloc
1771 // objects queued on the linked list.
1772 if (!krwp->head_free) {
1773 krwp->head_free = krcp->head;
1774 get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
1775 atomic_set(&krcp->head_count, 0);
1776 WRITE_ONCE(krcp->head, NULL);
1777 }
1778
1779 // One work item is used per batch, and a batch can handle
1780 // all three "free channels". Break out of the loop since
1781 // this CPU is done; queuing the RCU work here always
1782 // succeeds.
1783 queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work);
1784 WARN_ON_ONCE(!queued);
1785 break;
1786 }
1787 }
1788
1789 raw_spin_unlock_irqrestore(&krcp->lock, flags);
1790 return queued;
1791}
1792
1793/*
1794 * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
1795 */
1796static void kfree_rcu_monitor(struct work_struct *work)
1797{
1798 struct kfree_rcu_cpu *krcp = container_of(work,
1799 struct kfree_rcu_cpu, monitor_work.work);
1800
1801 // Drain ready for reclaim.
1802 kvfree_rcu_drain_ready(krcp);
1803
1804 // Queue a batch for the rest.
1805 kvfree_rcu_queue_batch(krcp);
1806
1807 // If there is nothing to detach, our job here is successfully
1808 // done. If at least one of the channels is still busy, rearm
1809 // the work to repeat the attempt, because previous batches
1810 // are still in progress.
1812 if (need_offload_krc(krcp))
1813 schedule_delayed_monitor_work(krcp);
1814}
1815
1816static void fill_page_cache_func(struct work_struct *work)
1817{
1818 struct kvfree_rcu_bulk_data *bnode;
1819 struct kfree_rcu_cpu *krcp =
1820 container_of(work, struct kfree_rcu_cpu,
1821 page_cache_work.work);
1822 unsigned long flags;
1823 int nr_pages;
1824 bool pushed;
1825 int i;
1826
1827 nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
1828 1 : rcu_min_cached_objs;
1829
1830 for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
1831 bnode = (struct kvfree_rcu_bulk_data *)
1832 __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
1833
1834 if (!bnode)
1835 break;
1836
1837 raw_spin_lock_irqsave(&krcp->lock, flags);
1838 pushed = put_cached_bnode(krcp, bnode);
1839 raw_spin_unlock_irqrestore(&krcp->lock, flags);
1840
1841 if (!pushed) {
1842 free_page((unsigned long) bnode);
1843 break;
1844 }
1845 }
1846
1847 atomic_set(&krcp->work_in_progress, 0);
1848 atomic_set(&krcp->backoff_page_cache_fill, 0);
1849}
1850
1851// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
1852// state specified by flags. If can_alloc is true, the caller must
1853// be schedulable and not be holding any locks or mutexes that might be
1854// acquired by the memory allocator or anything that it might invoke.
1855// Returns true if ptr was successfully recorded, else the caller must
1856// use a fallback.
1857static inline bool
1858add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
1859 unsigned long *flags, void *ptr, bool can_alloc)
1860{
1861 struct kvfree_rcu_bulk_data *bnode;
1862 int idx;
1863
1864 *krcp = krc_this_cpu_lock(flags);
1865 if (unlikely(!(*krcp)->initialized))
1866 return false;
1867
1868 idx = !!is_vmalloc_addr(ptr);
1869 bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
1870 struct kvfree_rcu_bulk_data, list);
1871
1872 /* Check if a new block is required. */
1873 if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
1874 bnode = get_cached_bnode(*krcp);
1875 if (!bnode && can_alloc) {
1876 krc_this_cpu_unlock(*krcp, *flags);
1877
1878 // __GFP_NORETRY - permits only light-weight direct reclaim,
1879 // which is fine since it minimizes how often the fallback
1880 // path is hit. It also forbids invoking the OOM killer,
1881 // which is desirable since we are about to release memory soon.
1882 //
1883 // __GFP_NOMEMALLOC - prevents consuming all of the memory
1884 // reserves. Please note that a fallback path exists.
1885 //
1886 // __GFP_NOWARN - the allocation is expected to fail under
1887 // low-memory or high memory-pressure scenarios, so do not
1888 // warn about it.
1889 bnode = (struct kvfree_rcu_bulk_data *)
1890 __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
1891 raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
1892 }
1893
1894 if (!bnode)
1895 return false;
1896
1897 // Initialize the new block and attach it.
1898 bnode->nr_records = 0;
1899 list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
1900 }
1901
1902 // Finally, insert the pointer and record the GP snapshot for this page.
1903 bnode->nr_records++;
1904 bnode->records[bnode->nr_records - 1] = ptr;
1905 get_state_synchronize_rcu_full(&bnode->gp_snap);
1906 atomic_inc(&(*krcp)->bulk_count[idx]);
1907
1908 return true;
1909}
1910
1911static enum hrtimer_restart
1912schedule_page_work_fn(struct hrtimer *t)
1913{
1914 struct kfree_rcu_cpu *krcp =
1915 container_of(t, struct kfree_rcu_cpu, hrtimer);
1916
1917 queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
1918 return HRTIMER_NORESTART;
1919}
1920
1921static void
1922run_page_cache_worker(struct kfree_rcu_cpu *krcp)
1923{
1924 // If cache disabled, bail out.
1925 if (!rcu_min_cached_objs)
1926 return;
1927
1928 if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
1929 !atomic_xchg(&krcp->work_in_progress, 1)) {
1930 if (atomic_read(&krcp->backoff_page_cache_fill)) {
1931 queue_delayed_work(rcu_reclaim_wq,
1932 &krcp->page_cache_work,
1933 msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
1934 } else {
1935 hrtimer_setup(&krcp->hrtimer, schedule_page_work_fn, CLOCK_MONOTONIC,
1936 HRTIMER_MODE_REL);
1937 hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
1938 }
1939 }
1940}
1941
1942void __init kfree_rcu_scheduler_running(void)
1943{
1944 int cpu;
1945
1946 for_each_possible_cpu(cpu) {
1947 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
1948
1949 if (need_offload_krc(krcp))
1950 schedule_delayed_monitor_work(krcp);
1951 }
1952}
1953
1954/*
1955 * Queue a request for lazy invocation of the appropriate free routine
1956 * after a grace period. Please note that three paths are maintained,
1957 * two for the common case using arrays of pointers and a third one that
1958 * is used only when the main paths cannot be used, for example, due to
1959 * memory pressure.
1960 *
1961 * Each kvfree_call_rcu() request is added to a batch. The batch is drained
1962 * every KFREE_DRAIN_JIFFIES jiffies and all objects in the batch are freed
1963 * in workqueue context. This allows requests to be batched together, which
1964 * reduces the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
1965 */
1966void kvfree_call_rcu(struct rcu_head *head, void *ptr)
1967{
1968 unsigned long flags;
1969 struct kfree_rcu_cpu *krcp;
1970 bool success;
1971
1972 /*
1973 * The head-less variant has a limitation, hence the clear
1974 * rule for such objects: it may only be used from a context
1975 * in which sleeping is allowed (see the might_sleep() check
1976 * below). Everywhere else, embed an rcu_head in your data
1977 * and use the two-argument form instead.
1978 */
1979 if (!head)
1980 might_sleep();
1981
1982 if (!IS_ENABLED(CONFIG_PREEMPT_RT) && kfree_rcu_sheaf(ptr))
1983 return;
1984
1985 // Queue the object but don't yet schedule the batch.
1986 if (debug_rcu_head_queue(ptr)) {
1987 // Probable double kfree_rcu(), just leak.
1988 WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
1989 __func__, head);
1990
1991 // Nothing more can be done here; just return.
1992 return;
1993 }
1994
1995 kasan_record_aux_stack(ptr);
1996 success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
1997 if (!success) {
1998 run_page_cache_worker(krcp);
1999
2000 if (head == NULL)
2001 // Head-less kvfree_rcu(one_arg) call: free it inline below.
2002 goto unlock_return;
2003
2004 head->func = ptr;
2005 head->next = krcp->head;
2006 WRITE_ONCE(krcp->head, head);
2007 atomic_inc(&krcp->head_count);
2008
2009 // Take a snapshot for this krcp.
2010 krcp->head_gp_snap = get_state_synchronize_rcu();
2011 success = true;
2012 }
2013
2014 /*
2015 * The kvfree_rcu() caller considers the pointer freed at this point
2016 * and likely removes any references to it. Since the actual slab
2017 * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
2018 * this object (no scanning or false positives reporting).
2019 */
2020 kmemleak_ignore(ptr);
2021
2022 // Set timer to drain after KFREE_DRAIN_JIFFIES.
2023 if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
2024 __schedule_delayed_monitor_work(krcp);
2025
2026unlock_return:
2027 krc_this_cpu_unlock(krcp, flags);
2028
2029 /*
2030 * Fall back to inline kvfree() after synchronize_rcu(). This
2031 * is only reachable from a sleepable context, so the current
2032 * CPU can pass through a quiescent state while waiting.
2033 */
2034 if (!success) {
2035 debug_rcu_head_unqueue((struct rcu_head *) ptr);
2036 synchronize_rcu();
2037 kvfree(ptr);
2038 }
2039}
2040EXPORT_SYMBOL_GPL(kvfree_call_rcu);
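/*
 * Usage sketch (illustrative only, not part of this file): kvfree_call_rcu()
 * is normally reached through the kfree_rcu()/kvfree_rcu() macros. "struct foo",
 * "fp" and "vp" below are hypothetical caller-side names.
 *
 *	struct foo {
 *		struct rcu_head rcu;
 *		int data;
 *	};
 *
 *	// Two-argument form: usable from atomic context, rcu_head embedded.
 *	kfree_rcu(fp, rcu);
 *
 *	// Head-less form: may hit the inline synchronize_rcu() fallback,
 *	// so it must only be used where sleeping is allowed.
 *	kvfree_rcu_mightsleep(vp);
 */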
2041
2042/**
2043 * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
2044 *
2045 * Note that the single-argument form of kvfree_rcu() has a slow path that
2046 * invokes synchronize_rcu() followed by freeing the pointer, all before
2047 * returning. Therefore, for any single-argument call that results in a
2048 * kfree() into a cache that is to be destroyed during module exit, it is
2049 * the developer's responsibility to ensure that all such calls have
2050 * returned before kmem_cache_destroy() is invoked.
2051 */
2052void kvfree_rcu_barrier(void)
2053{
2054 struct kfree_rcu_cpu_work *krwp;
2055 struct kfree_rcu_cpu *krcp;
2056 bool queued;
2057 int i, cpu;
2058
2059 flush_all_rcu_sheaves();
2060
2061 /*
2062 * First, objects are detached and queued in an RCU batch for every
2063 * CPU. Then the queued works are flushed for each CPU.
2064 *
2065 * Please note: if there are outstanding batches for a particular
2066 * CPU, those have to finish first before a new one can be queued.
2067 */
2068 for_each_possible_cpu(cpu) {
2069 krcp = per_cpu_ptr(&krc, cpu);
2070
2071 /*
2072 * Check whether this CPU has any objects queued to wait for a new
2073 * GP completion. If not, there is nothing to detach and we are done
2074 * with this CPU. If any batch is pending or running for this "krcp",
2075 * the per-CPU flush_rcu_work() below waits for its completion (last step).
2076 */
2077 if (!need_offload_krc(krcp))
2078 continue;
2079
2080 while (1) {
2081 /*
2082 * If we are unable to queue a new RCU work it means either:
2083 * - batches for this CPU are still in flight; they have to be
2084 * flushed first, after which we repeat the attempt; or
2085 * - there are no objects left to detach, due to concurrency.
2086 */
2087 queued = kvfree_rcu_queue_batch(krcp);
2088
2089 /*
2090 * Bail out if there is no longer any need to offload this
2091 * "krcp". As noted earlier, this can happen concurrently.
2092 */
2093 if (queued || !need_offload_krc(krcp))
2094 break;
2095
2096 /* There are ongoing batches. */
2097 for (i = 0; i < KFREE_N_BATCHES; i++) {
2098 krwp = &(krcp->krw_arr[i]);
2099 flush_rcu_work(&krwp->rcu_work);
2100 }
2101 }
2102 }
2103
2104 /*
2105 * Now we guarantee that all objects are flushed.
2106 */
2107 for_each_possible_cpu(cpu) {
2108 krcp = per_cpu_ptr(&krc, cpu);
2109
2110 /*
2111 * A monitor work can drain ready-to-reclaim objects directly,
2112 * so wait for its completion if it is running or pending.
2113 */
2114 cancel_delayed_work_sync(&krcp->monitor_work);
2115
2116 for (i = 0; i < KFREE_N_BATCHES; i++) {
2117 krwp = &(krcp->krw_arr[i]);
2118 flush_rcu_work(&krwp->rcu_work);
2119 }
2120 }
2121}
2122EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
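/*
 * Illustrative teardown sketch (hypothetical module, not part of this file),
 * following the note above: one way a user of the API might flush in-flight
 * kvfree_rcu() batches before the backing cache goes away.
 *
 *	static void __exit foo_exit(void)
 *	{
 *		// All single-argument kvfree_rcu() calls must have returned
 *		// by this point; then flush the queued batches.
 *		kvfree_rcu_barrier();
 *		kmem_cache_destroy(foo_cachep);
 *	}
 */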
2123
2124static unsigned long
2125kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
2126{
2127 int cpu;
2128 unsigned long count = 0;
2129
2130 /* Snapshot count of all CPUs */
2131 for_each_possible_cpu(cpu) {
2132 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
2133
2134 count += krc_count(krcp);
2135 count += READ_ONCE(krcp->nr_bkv_objs);
2136 atomic_set(&krcp->backoff_page_cache_fill, 1);
2137 }
2138
2139 return count == 0 ? SHRINK_EMPTY : count;
2140}
2141
2142static unsigned long
2143kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
2144{
2145 int cpu, freed = 0;
2146
2147 for_each_possible_cpu(cpu) {
2148 int count;
2149 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
2150
2151 count = krc_count(krcp);
2152 count += drain_page_cache(krcp);
2153 kfree_rcu_monitor(&krcp->monitor_work.work);
2154
2155 sc->nr_to_scan -= count;
2156 freed += count;
2157
2158 if (sc->nr_to_scan <= 0)
2159 break;
2160 }
2161
2162 return freed == 0 ? SHRINK_STOP : freed;
2163}
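/*
 * For reference: ->count_objects may return SHRINK_EMPTY when there is
 * nothing reclaimable, and ->scan_objects returns the number of objects
 * freed, or SHRINK_STOP when no further progress can be made.
 */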
2164
2165void __init kvfree_rcu_init(void)
2166{
2167 int cpu;
2168 int i, j;
2169 struct shrinker *kfree_rcu_shrinker;
2170
2171 rcu_reclaim_wq = alloc_workqueue("kvfree_rcu_reclaim",
2172 WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
2173 WARN_ON(!rcu_reclaim_wq);
2174
2175 /* Clamp it to [0:100] seconds interval. */
2176 if (rcu_delay_page_cache_fill_msec < 0 ||
2177 rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
2178
2179 rcu_delay_page_cache_fill_msec =
2180 clamp(rcu_delay_page_cache_fill_msec, 0,
2181 (int) (100 * MSEC_PER_SEC));
2182
2183 pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
2184 rcu_delay_page_cache_fill_msec);
2185 }
2186
2187 for_each_possible_cpu(cpu) {
2188 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
2189
2190 for (i = 0; i < KFREE_N_BATCHES; i++) {
2191 INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
2192 krcp->krw_arr[i].krcp = krcp;
2193
2194 for (j = 0; j < FREE_N_CHANNELS; j++)
2195 INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
2196 }
2197
2198 for (i = 0; i < FREE_N_CHANNELS; i++)
2199 INIT_LIST_HEAD(&krcp->bulk_head[i]);
2200
2201 INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
2202 INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
2203 krcp->initialized = true;
2204 }
2205
2206 kfree_rcu_shrinker = shrinker_alloc(0, "slab-kvfree-rcu");
2207 if (!kfree_rcu_shrinker) {
2208 pr_err("Failed to allocate kfree_rcu() shrinker!\n");
2209 return;
2210 }
2211
2212 kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
2213 kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
2214
2215 shrinker_register(kfree_rcu_shrinker);
2216}
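/*
 * Tuning sketch (illustrative only): the page-cache behaviour initialized
 * above can be adjusted at boot time. The second parameter name is taken
 * from the pr_info() above; the first is assumed to share the same
 * "rcutree." prefix.
 *
 *	rcutree.rcu_min_cached_objs=8
 *	rcutree.rcu_delay_page_cache_fill_msec=5000
 */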
2217
2218#endif /* CONFIG_KVFREE_RCU_BATCHED */
2219
2220