1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Slab allocator functions that are independent of the allocator strategy
4 *
5 * (C) 2012 Christoph Lameter <cl@gentwo.org>
6 */
7#include <linux/slab.h>
8
9#include <linux/mm.h>
10#include <linux/poison.h>
11#include <linux/interrupt.h>
12#include <linux/memory.h>
13#include <linux/cache.h>
14#include <linux/compiler.h>
15#include <linux/kfence.h>
16#include <linux/module.h>
17#include <linux/cpu.h>
18#include <linux/uaccess.h>
19#include <linux/seq_file.h>
20#include <linux/dma-mapping.h>
21#include <linux/swiotlb.h>
22#include <linux/proc_fs.h>
23#include <linux/debugfs.h>
24#include <linux/kmemleak.h>
25#include <linux/kasan.h>
26#include <asm/cacheflush.h>
27#include <asm/tlbflush.h>
28#include <asm/page.h>
29#include <linux/memcontrol.h>
30#include <linux/stackdepot.h>
31#include <trace/events/rcu.h>
32
33#include "../kernel/rcu/rcu.h"
34#include "internal.h"
35#include "slab.h"
36
37#define CREATE_TRACE_POINTS
38#include <trace/events/kmem.h>
39
40enum slab_state slab_state;
41LIST_HEAD(slab_caches);
42DEFINE_MUTEX(slab_mutex);
43struct kmem_cache *kmem_cache;
44
45/*
46 * Set of flags that will prevent slab merging
47 */
48#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
49 SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
50 SLAB_FAILSLAB | SLAB_NO_MERGE)
51
52#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
53 SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
54
55/*
56 * Merge control. If this is set then no merging of slab caches will occur.
57 */
58static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT);
59
60static int __init setup_slab_nomerge(char *str)
61{
62 slab_nomerge = true;
63 return 1;
64}
65
66static int __init setup_slab_merge(char *str)
67{
68 slab_nomerge = false;
69 return 1;
70}
71
72__setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
73__setup_param("slub_merge", slub_merge, setup_slab_merge, 0);
74
75__setup("slab_nomerge", setup_slab_nomerge);
76__setup("slab_merge", setup_slab_merge);
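/*
 * Illustrative note (not part of the original source): merging can also be
 * toggled on the kernel command line, independently of
 * CONFIG_SLAB_MERGE_DEFAULT, e.g.:
 *
 *   slab_nomerge     keep every cache separate (easier debugging)
 *   slab_merge       force merging even if the config default disables it
 *
 * The "slub_nomerge"/"slub_merge" spellings above are accepted as legacy
 * aliases for the same handlers.
 */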
77
78/*
79 * Determine the size of a slab object
80 */
81unsigned int kmem_cache_size(struct kmem_cache *s)
82{
83 return s->object_size;
84}
85EXPORT_SYMBOL(kmem_cache_size);
86
87#ifdef CONFIG_DEBUG_VM
88
89static bool kmem_cache_is_duplicate_name(const char *name)
90{
91 struct kmem_cache *s;
92
93 list_for_each_entry(s, &slab_caches, list) {
94 if (!strcmp(s->name, name))
95 return true;
96 }
97
98 return false;
99}
100
101static int kmem_cache_sanity_check(const char *name, unsigned int size)
102{
103 if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) {
104 pr_err("kmem_cache_create(%s) integrity check failed\n", name);
105 return -EINVAL;
106 }
107
108 /* Duplicate names will confuse slabtop, et al */
109 WARN(kmem_cache_is_duplicate_name(name),
110 "kmem_cache of name '%s' already exists\n", name);
111
112 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
113 return 0;
114}
115#else
116static inline int kmem_cache_sanity_check(const char *name, unsigned int size)
117{
118 return 0;
119}
120#endif
121
122/*
123 * Figure out what the alignment of the objects will be given a set of
124 * flags, a user specified alignment and the size of the objects.
125 */
126static unsigned int calculate_alignment(slab_flags_t flags,
127 unsigned int align, unsigned int size)
128{
129 /*
130 * If the user wants hardware cache aligned objects then follow that
131 * suggestion if the object is sufficiently large.
132 *
133 * The hardware cache alignment cannot override the specified
134 * alignment, though. If that is greater, then use it.
135 */
136 if (flags & SLAB_HWCACHE_ALIGN) {
137 unsigned int ralign;
138
139 ralign = cache_line_size();
140 while (size <= ralign / 2)
141 ralign /= 2;
142 align = max(align, ralign);
143 }
144
145 align = max(align, arch_slab_minalign());
146
147 return ALIGN(align, sizeof(void *));
148}
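/*
 * Illustrative example (editor's sketch, not from the original file),
 * assuming a 64-byte cache line, sizeof(void *) == 8 and a small
 * arch_slab_minalign():
 *
 *   calculate_alignment(SLAB_HWCACHE_ALIGN, 8, 40)
 *     ralign = 64; 40 > 64/2, so ralign stays 64
 *     align  = max(8, 64) = 64 -> ALIGN(64, 8) = 64
 *
 *   calculate_alignment(SLAB_HWCACHE_ALIGN, 8, 20)
 *     ralign = 64 -> 32 (20 <= 32), then the loop stops (20 > 16)
 *     align  = max(8, 32) = 32 -> ALIGN(32, 8) = 32
 *
 * i.e. sufficiently small objects are packed two or more per cache line
 * instead of each being padded out to a full line.
 */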
149
150/*
151 * Find a mergeable slab cache
152 */
153int slab_unmergeable(struct kmem_cache *s)
154{
155 if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
156 return 1;
157
158 if (s->ctor)
159 return 1;
160
161#ifdef CONFIG_HARDENED_USERCOPY
162 if (s->usersize)
163 return 1;
164#endif
165
166 if (s->cpu_sheaves)
167 return 1;
168
169 /*
170 * We may have set a slab to be unmergeable during bootstrap.
171 */
172 if (s->refcount < 0)
173 return 1;
174
175 return 0;
176}
177
178struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
179 slab_flags_t flags, const char *name, void (*ctor)(void *))
180{
181 struct kmem_cache *s;
182
183 if (slab_nomerge)
184 return NULL;
185
186 if (ctor)
187 return NULL;
188
189 flags = kmem_cache_flags(flags, name);
190
191 if (flags & SLAB_NEVER_MERGE)
192 return NULL;
193
194 size = ALIGN(size, sizeof(void *));
195 align = calculate_alignment(flags, align, size);
196 size = ALIGN(size, align);
197
198 list_for_each_entry_reverse(s, &slab_caches, list) {
199 if (slab_unmergeable(s))
200 continue;
201
202 if (size > s->size)
203 continue;
204
205 if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
206 continue;
207 /*
208 * Check if alignment is compatible.
209 * Courtesy of Adrian Drzewiecki
210 */
211 if ((s->size & ~(align - 1)) != s->size)
212 continue;
213
214 if (s->size - size >= sizeof(void *))
215 continue;
216
217 return s;
218 }
219 return NULL;
220}
221
222static struct kmem_cache *create_cache(const char *name,
223 unsigned int object_size,
224 struct kmem_cache_args *args,
225 slab_flags_t flags)
226{
227 struct kmem_cache *s;
228 int err;
229
230 /* If a custom freelist pointer is requested make sure it's sane. */
231 err = -EINVAL;
232 if (args->use_freeptr_offset &&
233 (args->freeptr_offset >= object_size ||
234 !(flags & SLAB_TYPESAFE_BY_RCU) ||
235 !IS_ALIGNED(args->freeptr_offset, __alignof__(freeptr_t))))
236 goto out;
237
238 err = -ENOMEM;
239 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
240 if (!s)
241 goto out;
242 err = do_kmem_cache_create(s, name, object_size, args, flags);
243 if (err)
244 goto out_free_cache;
245
246 s->refcount = 1;
247 list_add(&s->list, &slab_caches);
248 return s;
249
250out_free_cache:
251 kmem_cache_free(kmem_cache, s);
252out:
253 return ERR_PTR(err);
254}
255
256/**
257 * __kmem_cache_create_args - Create a kmem cache.
258 * @name: A string which is used in /proc/slabinfo to identify this cache.
259 * @object_size: The size of objects to be created in this cache.
260 * @args: Additional arguments for the cache creation (see
261 * &struct kmem_cache_args).
262 * @flags: See the descriptions of individual flags. The common ones are listed
263 * in the description below.
264 *
265 * Not to be called directly, use the kmem_cache_create() wrapper with the same
266 * parameters.
267 *
268 * Commonly used @flags:
269 *
270 * &SLAB_ACCOUNT - Account allocations to memcg.
271 *
272 * &SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries.
273 *
274 * &SLAB_RECLAIM_ACCOUNT - Objects are reclaimable.
275 *
276 * &SLAB_TYPESAFE_BY_RCU - Slab page (not individual objects) freeing delayed
277 * by a grace period - see the full description before using.
278 *
279 * Context: Cannot be called within an interrupt, but can be interrupted.
280 *
281 * Return: a pointer to the cache on success, NULL on failure.
282 */
283struct kmem_cache *__kmem_cache_create_args(const char *name,
284 unsigned int object_size,
285 struct kmem_cache_args *args,
286 slab_flags_t flags)
287{
288 struct kmem_cache *s = NULL;
289 const char *cache_name;
290 int err;
291
292#ifdef CONFIG_SLUB_DEBUG
293 /*
294 * If no slab_debug was enabled globally, the static key is not yet
295 * enabled by setup_slub_debug(). Enable it if the cache is being
296 * created with any of the debugging flags passed explicitly.
297 * It's also possible that this is the first cache created with
298 * SLAB_STORE_USER and we should init stack_depot for it.
299 */
300 if (flags & SLAB_DEBUG_FLAGS)
301 static_branch_enable(&slub_debug_enabled);
302 if (flags & SLAB_STORE_USER)
303 stack_depot_init();
304#else
305 flags &= ~SLAB_DEBUG_FLAGS;
306#endif
307
308 mutex_lock(&slab_mutex);
309
310 err = kmem_cache_sanity_check(name, object_size);
311 if (err) {
312 goto out_unlock;
313 }
314
315 if (flags & ~SLAB_FLAGS_PERMITTED) {
316 err = -EINVAL;
317 goto out_unlock;
318 }
319
320 /* Fail closed on bad usersize or useroffset values. */
321 if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) ||
322 WARN_ON(!args->usersize && args->useroffset) ||
323 WARN_ON(object_size < args->usersize ||
324 object_size - args->usersize < args->useroffset))
325 args->usersize = args->useroffset = 0;
326
327 if (!args->usersize && !args->sheaf_capacity)
328 s = __kmem_cache_alias(name, object_size, args->align, flags,
329 args->ctor);
330 if (s)
331 goto out_unlock;
332
333 cache_name = kstrdup_const(name, GFP_KERNEL);
334 if (!cache_name) {
335 err = -ENOMEM;
336 goto out_unlock;
337 }
338
339 args->align = calculate_alignment(flags, args->align, object_size);
340 s = create_cache(cache_name, object_size, args, flags);
341 if (IS_ERR(s)) {
342 err = PTR_ERR(s);
343 kfree_const(cache_name);
344 }
345
346out_unlock:
347 mutex_unlock(&slab_mutex);
348
349 if (err) {
350 if (flags & SLAB_PANIC)
351 panic("%s: Failed to create slab '%s'. Error %d\n",
352 __func__, name, err);
353 else {
354 pr_warn("%s(%s) failed with error %d\n",
355 __func__, name, err);
356 dump_stack();
357 }
358 return NULL;
359 }
360 return s;
361}
362EXPORT_SYMBOL(__kmem_cache_create_args);
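/*
 * Illustrative usage sketch (editor's addition; "struct foo" and the cache
 * name are made-up). Callers normally go through the kmem_cache_create()
 * wrapper, which packs its parameters into a struct kmem_cache_args and
 * lands here:
 *
 *   static struct kmem_cache *foo_cachep;
 *
 *   foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
 *                                  SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
 *   if (!foo_cachep)
 *           return -ENOMEM;
 *
 *   obj = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *   ...
 *   kmem_cache_free(foo_cachep, obj);
 */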
363
364static struct kmem_cache *kmem_buckets_cache __ro_after_init;
365
366/**
367 * kmem_buckets_create - Create a set of caches that handle dynamic sized
368 * allocations via kmem_buckets_alloc()
369 * @name: A prefix string which is used in /proc/slabinfo to identify this
370 * cache. The individual caches will have their sizes as the suffix.
371 * @flags: SLAB flags (see kmem_cache_create() for details).
372 * @useroffset: Starting offset within an allocation that may be copied
373 * to/from userspace.
374 * @usersize: How many bytes, starting at @useroffset, may be copied
375 * to/from userspace.
376 * @ctor: A constructor for the objects, run when new allocations are made.
377 *
378 * Cannot be called within an interrupt, but can be interrupted.
379 *
380 * Return: a pointer to the cache on success, NULL on failure. When
381 * CONFIG_SLAB_BUCKETS is not enabled, ZERO_SIZE_PTR is returned, and
382 * subsequent calls to kmem_buckets_alloc() will fall back to kmalloc().
383 * (i.e. callers only need to check for NULL on failure.)
384 */
385kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags,
386 unsigned int useroffset,
387 unsigned int usersize,
388 void (*ctor)(void *))
389{
390 unsigned long mask = 0;
391 unsigned int idx;
392 kmem_buckets *b;
393
394 BUILD_BUG_ON(ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]) > BITS_PER_LONG);
395
396 /*
397 * When the separate buckets API is not built in, just return
398 * a non-NULL value for the kmem_buckets pointer, which will be
399 * unused when performing allocations.
400 */
401 if (!IS_ENABLED(CONFIG_SLAB_BUCKETS))
402 return ZERO_SIZE_PTR;
403
404 if (WARN_ON(!kmem_buckets_cache))
405 return NULL;
406
407 b = kmem_cache_alloc(kmem_buckets_cache, GFP_KERNEL|__GFP_ZERO);
408 if (WARN_ON(!b))
409 return NULL;
410
411 flags |= SLAB_NO_MERGE;
412
413 for (idx = 0; idx < ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]); idx++) {
414 char *short_size, *cache_name;
415 unsigned int cache_useroffset, cache_usersize;
416 unsigned int size, aligned_idx;
417
418 if (!kmalloc_caches[KMALLOC_NORMAL][idx])
419 continue;
420
421 size = kmalloc_caches[KMALLOC_NORMAL][idx]->object_size;
422 if (!size)
423 continue;
424
425 short_size = strchr(kmalloc_caches[KMALLOC_NORMAL][idx]->name, '-');
426 if (WARN_ON(!short_size))
427 goto fail;
428
429 if (useroffset >= size) {
430 cache_useroffset = 0;
431 cache_usersize = 0;
432 } else {
433 cache_useroffset = useroffset;
434 cache_usersize = min(size - cache_useroffset, usersize);
435 }
436
437 aligned_idx = __kmalloc_index(size, false);
438 if (!(*b)[aligned_idx]) {
439 cache_name = kasprintf(GFP_KERNEL, "%s-%s", name, short_size + 1);
440 if (WARN_ON(!cache_name))
441 goto fail;
442 (*b)[aligned_idx] = kmem_cache_create_usercopy(cache_name, size,
443 0, flags, cache_useroffset,
444 cache_usersize, ctor);
445 kfree(cache_name);
446 if (WARN_ON(!(*b)[aligned_idx]))
447 goto fail;
448 set_bit(aligned_idx, &mask);
449 }
450 if (idx != aligned_idx)
451 (*b)[idx] = (*b)[aligned_idx];
452 }
453
454 return b;
455
456fail:
457 for_each_set_bit(idx, &mask, ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]))
458 kmem_cache_destroy((*b)[idx]);
459 kmem_cache_free(kmem_buckets_cache, b);
460
461 return NULL;
462}
463EXPORT_SYMBOL(kmem_buckets_create);
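/*
 * Illustrative usage sketch (editor's addition; the bucket-set name is
 * made up). A subsystem that wants its variable-sized allocations kept
 * apart from the shared kmalloc caches would do roughly:
 *
 *   static kmem_buckets *foo_buckets;
 *
 *   foo_buckets = kmem_buckets_create("foo", 0, 0, 0, NULL);
 *   if (!foo_buckets)
 *           return -ENOMEM;
 *
 *   p = kmem_buckets_alloc(foo_buckets, len, GFP_KERNEL);
 *   ...
 *   kfree(p);
 *
 * With CONFIG_SLAB_BUCKETS=n this transparently degrades to plain
 * kmalloc(), as described in the kernel-doc above.
 */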
464
465/*
466 * For a given kmem_cache, kmem_cache_destroy() should only be called
467 * once or there will be a use-after-free problem. The actual deletion
468 * and release of the kobject does not need slab_mutex or cpu_hotplug_lock
469 * protection. So they are now done without holding those locks.
470 */
471static void kmem_cache_release(struct kmem_cache *s)
472{
473 kfence_shutdown_cache(s);
474 if (__is_defined(SLAB_SUPPORTS_SYSFS) && slab_state >= FULL)
475 sysfs_slab_release(s);
476 else
477 slab_kmem_cache_release(s);
478}
479
480void slab_kmem_cache_release(struct kmem_cache *s)
481{
482 __kmem_cache_release(s);
483 kfree_const(s->name);
484 kmem_cache_free(kmem_cache, s);
485}
486
487void kmem_cache_destroy(struct kmem_cache *s)
488{
489 int err;
490
491 if (unlikely(!s) || !kasan_check_byte(s))
492 return;
493
494 /* in-flight kfree_rcu()'s may include objects from our cache */
495 kvfree_rcu_barrier();
496
497 if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) &&
498 (s->flags & SLAB_TYPESAFE_BY_RCU)) {
499 /*
500 * Under CONFIG_SLUB_RCU_DEBUG, when objects in a
501 * SLAB_TYPESAFE_BY_RCU slab are freed, SLUB will internally
502 * defer their freeing with call_rcu().
503 * Wait for such call_rcu() invocations here before actually
504 * destroying the cache.
505 *
506 * It doesn't matter that we haven't looked at the slab refcount
507 * yet - slabs with SLAB_TYPESAFE_BY_RCU can't be merged, so
508 * the refcount should be 1 here.
509 */
510 rcu_barrier();
511 }
512
513 /* Wait for deferred work from kmalloc/kfree_nolock() */
514 defer_free_barrier();
515
516 cpus_read_lock();
517 mutex_lock(&slab_mutex);
518
519 s->refcount--;
520 if (s->refcount) {
521 mutex_unlock(&slab_mutex);
522 cpus_read_unlock();
523 return;
524 }
525
526 /* free asan quarantined objects */
527 kasan_cache_shutdown(s);
528
529 err = __kmem_cache_shutdown(s);
530 if (!slab_in_kunit_test())
531 WARN(err, "%s %s: Slab cache still has objects when called from %pS",
532 __func__, s->name, (void *)_RET_IP_);
533
534 list_del(&s->list);
535
536 mutex_unlock(&slab_mutex);
537 cpus_read_unlock();
538
539 if (slab_state >= FULL)
540 sysfs_slab_unlink(s);
541 debugfs_slab_release(s);
542
543 if (err)
544 return;
545
546 if (s->flags & SLAB_TYPESAFE_BY_RCU)
547 rcu_barrier();
548
549 kmem_cache_release(s);
550}
551EXPORT_SYMBOL(kmem_cache_destroy);
552
553/**
554 * kmem_cache_shrink - Shrink a cache.
555 * @cachep: The cache to shrink.
556 *
557 * Releases as many slabs as possible for a cache.
558 * To help debugging, a zero exit status indicates all slabs were released.
559 *
560 * Return: %0 if all slabs were released, non-zero otherwise
561 */
562int kmem_cache_shrink(struct kmem_cache *cachep)
563{
564 kasan_cache_shrink(cachep);
565
566 return __kmem_cache_shrink(cachep);
567}
568EXPORT_SYMBOL(kmem_cache_shrink);
569
570bool slab_is_available(void)
571{
572 return slab_state >= UP;
573}
574
575#ifdef CONFIG_PRINTK
576static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
577{
578 if (__kfence_obj_info(kpp, object, slab))
579 return;
580 __kmem_obj_info(kpp, object, slab);
581}
582
583/**
584 * kmem_dump_obj - Print available slab provenance information
585 * @object: slab object for which to find provenance information.
586 *
587 * This function uses pr_cont(), so that the caller is expected to have
588 * printed out whatever preamble is appropriate. The provenance information
589 * depends on the type of object and on how much debugging is enabled.
590 * For a slab-cache object, the fact that it is a slab object is printed,
591 * and, if available, the slab name, return address, and stack trace from
592 * the allocation and last free path of that object.
593 *
594 * Return: %true if the pointer is to a not-yet-freed object from
595 * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer
596 * is to an already-freed object, and %false otherwise.
597 */
598bool kmem_dump_obj(void *object)
599{
600 char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc";
601 int i;
602 struct slab *slab;
603 unsigned long ptroffset;
604 struct kmem_obj_info kp = { };
605
606 /* Some arches consider ZERO_SIZE_PTR to be a valid address. */
607 if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
608 return false;
609 slab = virt_to_slab(object);
610 if (!slab)
611 return false;
612
613 kmem_obj_info(&kp, object, slab);
614 if (kp.kp_slab_cache)
615 pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name);
616 else
617 pr_cont(" slab%s", cp);
618 if (is_kfence_address(object))
619 pr_cont(" (kfence)");
620 if (kp.kp_objp)
621 pr_cont(" start %px", kp.kp_objp);
622 if (kp.kp_data_offset)
623 pr_cont(" data offset %lu", kp.kp_data_offset);
624 if (kp.kp_objp) {
625 ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset;
626 pr_cont(" pointer offset %lu", ptroffset);
627 }
628 if (kp.kp_slab_cache && kp.kp_slab_cache->object_size)
629 pr_cont(" size %u", kp.kp_slab_cache->object_size);
630 if (kp.kp_ret)
631 pr_cont(" allocated at %pS\n", kp.kp_ret);
632 else
633 pr_cont("\n");
634 for (i = 0; i < ARRAY_SIZE(kp.kp_stack); i++) {
635 if (!kp.kp_stack[i])
636 break;
637 pr_info(" %pS\n", kp.kp_stack[i]);
638 }
639
640 if (kp.kp_free_stack[0])
641 pr_cont(" Free path:\n");
642
643 for (i = 0; i < ARRAY_SIZE(kp.kp_free_stack); i++) {
644 if (!kp.kp_free_stack[i])
645 break;
646 pr_info(" %pS\n", kp.kp_free_stack[i]);
647 }
648
649 return true;
650}
651EXPORT_SYMBOL_GPL(kmem_dump_obj);
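/*
 * Illustrative usage sketch (editor's addition): because kmem_dump_obj()
 * continues the current line with pr_cont(), the caller prints the
 * preamble first and handles the non-slab case itself, e.g.:
 *
 *   pr_info("suspect pointer:");
 *   if (!kmem_dump_obj(ptr))
 *           pr_cont(" (not a slab object)\n");
 */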
652#endif
653
654/* Create a cache during boot when no slab services are available yet */
655void __init create_boot_cache(struct kmem_cache *s, const char *name,
656 unsigned int size, slab_flags_t flags,
657 unsigned int useroffset, unsigned int usersize)
658{
659 int err;
660 unsigned int align = ARCH_KMALLOC_MINALIGN;
661 struct kmem_cache_args kmem_args = {};
662
663 /*
664 * kmalloc caches guarantee alignment of at least the largest
665 * power-of-two divisor of the size. For power-of-two sizes,
666 * it is the size itself.
667 */
668 if (flags & SLAB_KMALLOC)
669 align = max(align, 1U << (ffs(size) - 1));
670 kmem_args.align = calculate_alignment(flags, align, size);
671
672#ifdef CONFIG_HARDENED_USERCOPY
673 kmem_args.useroffset = useroffset;
674 kmem_args.usersize = usersize;
675#endif
676
677 err = do_kmem_cache_create(s, name, size, &kmem_args, flags);
678
679 if (err)
680 panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n",
681 name, size, err);
682
683 s->refcount = -1; /* Exempt from merging for now */
684}
685
686static struct kmem_cache *__init create_kmalloc_cache(const char *name,
687 unsigned int size,
688 slab_flags_t flags)
689{
690 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
691
692 if (!s)
693 panic("Out of memory when creating slab %s\n", name);
694
695 create_boot_cache(s, name, size, flags | SLAB_KMALLOC, 0, size);
696 list_add(&s->list, &slab_caches);
697 s->refcount = 1;
698 return s;
699}
700
701kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES] __ro_after_init =
702{ /* initialization for https://llvm.org/pr42570 */ };
703EXPORT_SYMBOL(kmalloc_caches);
704
705#ifdef CONFIG_RANDOM_KMALLOC_CACHES
706unsigned long random_kmalloc_seed __ro_after_init;
707EXPORT_SYMBOL(random_kmalloc_seed);
708#endif
709
710/*
711 * Conversion table for small slab sizes / 8 to the index in the
712 * kmalloc array. This is necessary for slabs < 192 since we have non power
713 * of two cache sizes there. The size of larger slabs can be determined using
714 * fls.
715 */
716u8 kmalloc_size_index[24] __ro_after_init = {
717 3, /* 8 */
718 4, /* 16 */
719 5, /* 24 */
720 5, /* 32 */
721 6, /* 40 */
722 6, /* 48 */
723 6, /* 56 */
724 6, /* 64 */
725 1, /* 72 */
726 1, /* 80 */
727 1, /* 88 */
728 1, /* 96 */
729 7, /* 104 */
730 7, /* 112 */
731 7, /* 120 */
732 7, /* 128 */
733 2, /* 136 */
734 2, /* 144 */
735 2, /* 152 */
736 2, /* 160 */
737 2, /* 168 */
738 2, /* 176 */
739 2, /* 184 */
740 2 /* 192 */
741};
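/*
 * Illustrative lookup (editor's note): a 72-byte request falls into slot
 * size_index_elem(72) == 8 of the table above, which holds 1, i.e. index 1
 * of the kmalloc array - the 96-byte cache. Requests above 192 bytes bypass
 * this table and are sized via fls() instead.
 */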
742
743size_t kmalloc_size_roundup(size_t size)
744{
745 if (size && size <= KMALLOC_MAX_CACHE_SIZE) {
746 /*
747 * The flags don't matter since size_index is common to all.
748 * Neither does the caller for just getting ->object_size.
749 */
750 return kmalloc_slab(size, NULL, GFP_KERNEL, 0)->object_size;
751 }
752
753 /* Above the smaller buckets, size is a multiple of page size. */
754 if (size && size <= KMALLOC_MAX_SIZE)
755 return PAGE_SIZE << get_order(size);
756
757 /*
758 * Return 'size' for 0 - kmalloc() returns ZERO_SIZE_PTR
759 * and very large size - kmalloc() may fail.
760 */
761 return size;
762
763}
764EXPORT_SYMBOL(kmalloc_size_roundup);
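/*
 * Illustrative usage sketch (editor's addition): a caller that wants to use
 * the whole rounded-up allocation - e.g. a growable buffer - should size
 * the request explicitly rather than probing with ksize() afterwards:
 *
 *   size_t want = 600;
 *   size_t got  = kmalloc_size_roundup(want);  // 1024 with default caches
 *   char *buf   = kmalloc(got, GFP_KERNEL);
 *
 * The extra bytes are then part of the requested size, so KASAN,
 * UBSAN_BOUNDS and FORTIFY_SOURCE treat accesses to them as in-bounds.
 */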
765
766#ifdef CONFIG_ZONE_DMA
767#define KMALLOC_DMA_NAME(sz) .name[KMALLOC_DMA] = "dma-kmalloc-" #sz,
768#else
769#define KMALLOC_DMA_NAME(sz)
770#endif
771
772#ifdef CONFIG_MEMCG
773#define KMALLOC_CGROUP_NAME(sz) .name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz,
774#else
775#define KMALLOC_CGROUP_NAME(sz)
776#endif
777
778#ifndef CONFIG_SLUB_TINY
779#define KMALLOC_RCL_NAME(sz) .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #sz,
780#else
781#define KMALLOC_RCL_NAME(sz)
782#endif
783
784#ifdef CONFIG_RANDOM_KMALLOC_CACHES
785#define __KMALLOC_RANDOM_CONCAT(a, b) a ## b
786#define KMALLOC_RANDOM_NAME(N, sz) __KMALLOC_RANDOM_CONCAT(KMA_RAND_, N)(sz)
787#define KMA_RAND_1(sz) .name[KMALLOC_RANDOM_START + 1] = "kmalloc-rnd-01-" #sz,
788#define KMA_RAND_2(sz) KMA_RAND_1(sz) .name[KMALLOC_RANDOM_START + 2] = "kmalloc-rnd-02-" #sz,
789#define KMA_RAND_3(sz) KMA_RAND_2(sz) .name[KMALLOC_RANDOM_START + 3] = "kmalloc-rnd-03-" #sz,
790#define KMA_RAND_4(sz) KMA_RAND_3(sz) .name[KMALLOC_RANDOM_START + 4] = "kmalloc-rnd-04-" #sz,
791#define KMA_RAND_5(sz) KMA_RAND_4(sz) .name[KMALLOC_RANDOM_START + 5] = "kmalloc-rnd-05-" #sz,
792#define KMA_RAND_6(sz) KMA_RAND_5(sz) .name[KMALLOC_RANDOM_START + 6] = "kmalloc-rnd-06-" #sz,
793#define KMA_RAND_7(sz) KMA_RAND_6(sz) .name[KMALLOC_RANDOM_START + 7] = "kmalloc-rnd-07-" #sz,
794#define KMA_RAND_8(sz) KMA_RAND_7(sz) .name[KMALLOC_RANDOM_START + 8] = "kmalloc-rnd-08-" #sz,
795#define KMA_RAND_9(sz) KMA_RAND_8(sz) .name[KMALLOC_RANDOM_START + 9] = "kmalloc-rnd-09-" #sz,
796#define KMA_RAND_10(sz) KMA_RAND_9(sz) .name[KMALLOC_RANDOM_START + 10] = "kmalloc-rnd-10-" #sz,
797#define KMA_RAND_11(sz) KMA_RAND_10(sz) .name[KMALLOC_RANDOM_START + 11] = "kmalloc-rnd-11-" #sz,
798#define KMA_RAND_12(sz) KMA_RAND_11(sz) .name[KMALLOC_RANDOM_START + 12] = "kmalloc-rnd-12-" #sz,
799#define KMA_RAND_13(sz) KMA_RAND_12(sz) .name[KMALLOC_RANDOM_START + 13] = "kmalloc-rnd-13-" #sz,
800#define KMA_RAND_14(sz) KMA_RAND_13(sz) .name[KMALLOC_RANDOM_START + 14] = "kmalloc-rnd-14-" #sz,
801#define KMA_RAND_15(sz) KMA_RAND_14(sz) .name[KMALLOC_RANDOM_START + 15] = "kmalloc-rnd-15-" #sz,
802#else // CONFIG_RANDOM_KMALLOC_CACHES
803#define KMALLOC_RANDOM_NAME(N, sz)
804#endif
805
806#define INIT_KMALLOC_INFO(__size, __short_size) \
807{ \
808 .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \
809 KMALLOC_RCL_NAME(__short_size) \
810 KMALLOC_CGROUP_NAME(__short_size) \
811 KMALLOC_DMA_NAME(__short_size) \
812 KMALLOC_RANDOM_NAME(RANDOM_KMALLOC_CACHES_NR, __short_size) \
813 .size = __size, \
814}
815
816/*
817 * kmalloc_info[] is to make slab_debug=,kmalloc-xx option work at boot time.
818 * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is
819 * kmalloc-2M.
820 */
821const struct kmalloc_info_struct kmalloc_info[] __initconst = {
822 INIT_KMALLOC_INFO(0, 0),
823 INIT_KMALLOC_INFO(96, 96),
824 INIT_KMALLOC_INFO(192, 192),
825 INIT_KMALLOC_INFO(8, 8),
826 INIT_KMALLOC_INFO(16, 16),
827 INIT_KMALLOC_INFO(32, 32),
828 INIT_KMALLOC_INFO(64, 64),
829 INIT_KMALLOC_INFO(128, 128),
830 INIT_KMALLOC_INFO(256, 256),
831 INIT_KMALLOC_INFO(512, 512),
832 INIT_KMALLOC_INFO(1024, 1k),
833 INIT_KMALLOC_INFO(2048, 2k),
834 INIT_KMALLOC_INFO(4096, 4k),
835 INIT_KMALLOC_INFO(8192, 8k),
836 INIT_KMALLOC_INFO(16384, 16k),
837 INIT_KMALLOC_INFO(32768, 32k),
838 INIT_KMALLOC_INFO(65536, 64k),
839 INIT_KMALLOC_INFO(131072, 128k),
840 INIT_KMALLOC_INFO(262144, 256k),
841 INIT_KMALLOC_INFO(524288, 512k),
842 INIT_KMALLOC_INFO(1048576, 1M),
843 INIT_KMALLOC_INFO(2097152, 2M)
844};
845
846/*
847 * Patch up the size_index table if we have strange large alignment
848 * requirements for the kmalloc array. This is only the case for
849 * MIPS it seems. The standard arches will not generate any code here.
850 *
851 * Largest permitted alignment is 256 bytes due to the way we
852 * handle the index determination for the smaller caches.
853 *
854 * Make sure that nothing crazy happens if someone starts tinkering
855 * around with ARCH_KMALLOC_MINALIGN
856 */
857void __init setup_kmalloc_cache_index_table(void)
858{
859 unsigned int i;
860
861 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
862 !is_power_of_2(KMALLOC_MIN_SIZE));
863
864 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
865 unsigned int elem = size_index_elem(i);
866
867 if (elem >= ARRAY_SIZE(kmalloc_size_index))
868 break;
869 kmalloc_size_index[elem] = KMALLOC_SHIFT_LOW;
870 }
871
872 if (KMALLOC_MIN_SIZE >= 64) {
873 /*
874 * The 96 byte sized cache is not used if the alignment
875 * is 64 byte.
876 */
877 for (i = 64 + 8; i <= 96; i += 8)
878 kmalloc_size_index[size_index_elem(i)] = 7;
879
880 }
881
882 if (KMALLOC_MIN_SIZE >= 128) {
883 /*
884 * The 192 byte sized cache is not used if the alignment
885 * is 128 byte. Redirect kmalloc to use the 256 byte cache
886 * instead.
887 */
888 for (i = 128 + 8; i <= 192; i += 8)
889 kmalloc_size_index[size_index_elem(i)] = 8;
890 }
891}
892
893static unsigned int __kmalloc_minalign(void)
894{
895 unsigned int minalign = dma_get_cache_alignment();
896
897 if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) &&
898 is_swiotlb_allocated())
899 minalign = ARCH_KMALLOC_MINALIGN;
900
901 return max(minalign, arch_slab_minalign());
902}
903
904static void __init
905new_kmalloc_cache(int idx, enum kmalloc_cache_type type)
906{
907 slab_flags_t flags = 0;
908 unsigned int minalign = __kmalloc_minalign();
909 unsigned int aligned_size = kmalloc_info[idx].size;
910 int aligned_idx = idx;
911
912 if ((KMALLOC_RECLAIM != KMALLOC_NORMAL) && (type == KMALLOC_RECLAIM)) {
913 flags |= SLAB_RECLAIM_ACCOUNT;
914 } else if (IS_ENABLED(CONFIG_MEMCG) && (type == KMALLOC_CGROUP)) {
915 if (mem_cgroup_kmem_disabled()) {
916 kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx];
917 return;
918 }
919 flags |= SLAB_ACCOUNT;
920 } else if (IS_ENABLED(CONFIG_ZONE_DMA) && (type == KMALLOC_DMA)) {
921 flags |= SLAB_CACHE_DMA;
922 }
923
924#ifdef CONFIG_RANDOM_KMALLOC_CACHES
925 if (type >= KMALLOC_RANDOM_START && type <= KMALLOC_RANDOM_END)
926 flags |= SLAB_NO_MERGE;
927#endif
928
929 /*
930 * If CONFIG_MEMCG is enabled, disable cache merging for
931 * KMALLOC_NORMAL caches.
932 */
933 if (IS_ENABLED(CONFIG_MEMCG) && (type == KMALLOC_NORMAL))
934 flags |= SLAB_NO_MERGE;
935
936 if (minalign > ARCH_KMALLOC_MINALIGN) {
937 aligned_size = ALIGN(aligned_size, minalign);
938 aligned_idx = __kmalloc_index(aligned_size, false);
939 }
940
941 if (!kmalloc_caches[type][aligned_idx])
942 kmalloc_caches[type][aligned_idx] = create_kmalloc_cache(
943 kmalloc_info[aligned_idx].name[type],
944 aligned_size, flags);
945 if (idx != aligned_idx)
946 kmalloc_caches[type][idx] = kmalloc_caches[type][aligned_idx];
947}
948
949/*
950 * Create the kmalloc array. Some of the regular kmalloc arrays
951 * may already have been created because they were needed to
952 * enable allocations for slab creation.
953 */
954void __init create_kmalloc_caches(void)
955{
956 int i;
957 enum kmalloc_cache_type type;
958
959 /*
960 * Including KMALLOC_CGROUP if CONFIG_MEMCG defined
961 */
962 for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) {
963 /* Caches that are NOT of the two-to-the-power-of size. */
964 if (KMALLOC_MIN_SIZE <= 32)
965 new_kmalloc_cache(1, type);
966 if (KMALLOC_MIN_SIZE <= 64)
967 new_kmalloc_cache(2, type);
968
969 /* Caches that are of the two-to-the-power-of size. */
970 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
971 new_kmalloc_cache(i, type);
972 }
973#ifdef CONFIG_RANDOM_KMALLOC_CACHES
974 random_kmalloc_seed = get_random_u64();
975#endif
976
977 /* Kmalloc array is now usable */
978 slab_state = UP;
979
980 if (IS_ENABLED(CONFIG_SLAB_BUCKETS))
981 kmem_buckets_cache = kmem_cache_create("kmalloc_buckets",
982 sizeof(kmem_buckets),
983 0, SLAB_NO_MERGE, NULL);
984}
985
986/**
987 * __ksize -- Report full size of underlying allocation
988 * @object: pointer to the object
989 *
990 * This should only be used internally to query the true size of allocations.
991 * It is not meant to be a way to discover the usable size of an allocation
992 * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond
993 * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS,
994 * and/or FORTIFY_SOURCE.
995 *
996 * Return: size of the actual memory used by @object in bytes
997 */
998size_t __ksize(const void *object)
999{
1000 struct folio *folio;
1001
1002 if (unlikely(object == ZERO_SIZE_PTR))
1003 return 0;
1004
1005 folio = virt_to_folio(object);
1006
1007 if (unlikely(!folio_test_slab(folio))) {
1008 if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE))
1009 return 0;
1010 if (WARN_ON(object != folio_address(folio)))
1011 return 0;
1012 return folio_size(folio);
1013 }
1014
1015#ifdef CONFIG_SLUB_DEBUG
1016 skip_orig_size_check(folio_slab(folio)->slab_cache, object);
1017#endif
1018
1019 return slab_ksize(folio_slab(folio)->slab_cache);
1020}
1021
1022gfp_t kmalloc_fix_flags(gfp_t flags)
1023{
1024 gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
1025
1026 flags &= ~GFP_SLAB_BUG_MASK;
1027 pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
1028 invalid_mask, &invalid_mask, flags, &flags);
1029 dump_stack();
1030
1031 return flags;
1032}
1033
1034#ifdef CONFIG_SLAB_FREELIST_RANDOM
1035/* Randomize a generic freelist */
1036static void freelist_randomize(unsigned int *list,
1037 unsigned int count)
1038{
1039 unsigned int rand;
1040 unsigned int i;
1041
1042 for (i = 0; i < count; i++)
1043 list[i] = i;
1044
1045 /* Fisher-Yates shuffle */
1046 for (i = count - 1; i > 0; i--) {
1047 rand = get_random_u32_below(i + 1);
1048 swap(list[i], list[rand]);
1049 }
1050}
1051
1052/* Create a random sequence per cache */
1053int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
1054 gfp_t gfp)
1055{
1056
1057 if (count < 2 || cachep->random_seq)
1058 return 0;
1059
1060 cachep->random_seq = kcalloc(count, sizeof(unsigned int), gfp);
1061 if (!cachep->random_seq)
1062 return -ENOMEM;
1063
1064 freelist_randomize(cachep->random_seq, count);
1065 return 0;
1066}
1067
1068/* Destroy the per-cache random freelist sequence */
1069void cache_random_seq_destroy(struct kmem_cache *cachep)
1070{
1071 kfree(cachep->random_seq);
1072 cachep->random_seq = NULL;
1073}
1074#endif /* CONFIG_SLAB_FREELIST_RANDOM */
1075
1076#ifdef CONFIG_SLUB_DEBUG
1077#define SLABINFO_RIGHTS (0400)
1078
1079static void print_slabinfo_header(struct seq_file *m)
1080{
1081 /*
1082 * Output format version, so at least we can change it
1083 * without _too_ many complaints.
1084 */
1085 seq_puts(m, "slabinfo - version: 2.1\n");
1086 seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
1087 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
1088 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
1089 seq_putc(m, '\n');
1090}
1091
1092static void *slab_start(struct seq_file *m, loff_t *pos)
1093{
1094 mutex_lock(&slab_mutex);
1095 return seq_list_start(&slab_caches, *pos);
1096}
1097
1098static void *slab_next(struct seq_file *m, void *p, loff_t *pos)
1099{
1100 return seq_list_next(p, &slab_caches, pos);
1101}
1102
1103static void slab_stop(struct seq_file *m, void *p)
1104{
1105 mutex_unlock(&slab_mutex);
1106}
1107
1108static void cache_show(struct kmem_cache *s, struct seq_file *m)
1109{
1110 struct slabinfo sinfo;
1111
1112 memset(&sinfo, 0, sizeof(sinfo));
1113 get_slabinfo(s, &sinfo);
1114
1115 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
1116 s->name, sinfo.active_objs, sinfo.num_objs, s->size,
1117 sinfo.objects_per_slab, (1 << sinfo.cache_order));
1118
1119 seq_printf(m, " : tunables %4u %4u %4u",
1120 sinfo.limit, sinfo.batchcount, sinfo.shared);
1121 seq_printf(m, " : slabdata %6lu %6lu %6lu",
1122 sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
1123 seq_putc(m, '\n');
1124}
1125
1126static int slab_show(struct seq_file *m, void *p)
1127{
1128 struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
1129
1130 if (p == slab_caches.next)
1131 print_slabinfo_header(m);
1132 cache_show(s, m);
1133 return 0;
1134}
1135
1136void dump_unreclaimable_slab(void)
1137{
1138 struct kmem_cache *s;
1139 struct slabinfo sinfo;
1140
1141 /*
1142 * Acquiring slab_mutex here is risky since we don't want to
1143 * sleep in the OOM path. But without holding the mutex, the
1144 * list traversal may race with cache destruction and crash.
1145 * Use mutex_trylock to protect the traversal, and dump nothing
1146 * if the mutex cannot be acquired.
1147 */
1148 if (!mutex_trylock(&slab_mutex)) {
1149 pr_warn("excessive unreclaimable slab but cannot dump stats\n");
1150 return;
1151 }
1152
1153 pr_info("Unreclaimable slab info:\n");
1154 pr_info("Name Used Total\n");
1155
1156 list_for_each_entry(s, &slab_caches, list) {
1157 if (s->flags & SLAB_RECLAIM_ACCOUNT)
1158 continue;
1159
1160 get_slabinfo(s, &sinfo);
1161
1162 if (sinfo.num_objs > 0)
1163 pr_info("%-17s %10luKB %10luKB\n", s->name,
1164 (sinfo.active_objs * s->size) / 1024,
1165 (sinfo.num_objs * s->size) / 1024);
1166 }
1167 mutex_unlock(&slab_mutex);
1168}
1169
1170/*
1171 * slabinfo_op - iterator that generates /proc/slabinfo
1172 *
1173 * Output layout:
1174 * cache-name
1175 * num-active-objs
1176 * total-objs
1177 * object size
1178 * num-active-slabs
1179 * total-slabs
1180 * num-pages-per-slab
1181 * + further values on SMP and with statistics enabled
1182 */
1183static const struct seq_operations slabinfo_op = {
1184 .start = slab_start,
1185 .next = slab_next,
1186 .stop = slab_stop,
1187 .show = slab_show,
1188};
1189
1190static int slabinfo_open(struct inode *inode, struct file *file)
1191{
1192 return seq_open(file, &slabinfo_op);
1193}
1194
1195static const struct proc_ops slabinfo_proc_ops = {
1196 .proc_flags = PROC_ENTRY_PERMANENT,
1197 .proc_open = slabinfo_open,
1198 .proc_read = seq_read,
1199 .proc_lseek = seq_lseek,
1200 .proc_release = seq_release,
1201};
1202
1203static int __init slab_proc_init(void)
1204{
1205 proc_create("slabinfo", SLABINFO_RIGHTS, NULL, &slabinfo_proc_ops);
1206 return 0;
1207}
1208module_init(slab_proc_init);
1209
1210#endif /* CONFIG_SLUB_DEBUG */
1211
1212/**
1213 * kfree_sensitive - Clear sensitive information in memory before freeing
1214 * @p: object to free memory of
1215 *
1216 * The memory of the object @p points to is zeroed before freed.
1217 * If @p is %NULL, kfree_sensitive() does nothing.
1218 *
1219 * Note: this function zeroes the whole allocated buffer which can be a good
1220 * deal bigger than the requested buffer size passed to kmalloc(). So be
1221 * careful when using this function in performance sensitive code.
1222 */
1223void kfree_sensitive(const void *p)
1224{
1225 size_t ks;
1226 void *mem = (void *)p;
1227
1228 ks = ksize(mem);
1229 if (ks) {
1230 kasan_unpoison_range(mem, ks);
1231 memzero_explicit(mem, ks);
1232 }
1233 kfree(mem);
1234}
1235EXPORT_SYMBOL(kfree_sensitive);
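/*
 * Illustrative usage sketch (editor's addition; names are made up). The
 * typical pattern for key material or other secrets:
 *
 *   u8 *key = kmalloc(keylen, GFP_KERNEL);
 *   ...
 *   kfree_sensitive(key);  // zeroes the whole ksize(key) region, then frees
 *
 * kfree_sensitive(NULL) is a no-op, so error paths need no NULL check.
 */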
1236
1237size_t ksize(const void *objp)
1238{
1239 /*
1240 * We need to first check that the pointer to the object is valid.
1241 * The KASAN report printed from ksize() is more useful than when
1242 * it's printed later, when the behaviour could be undefined due to
1243 * a potential use-after-free or double-free.
1244 *
1245 * We use kasan_check_byte(), which is supported for the hardware
1246 * tag-based KASAN mode, unlike kasan_check_read/write().
1247 *
1248 * If the pointed to memory is invalid, we return 0 to avoid users of
1249 * ksize() writing to and potentially corrupting the memory region.
1250 *
1251 * We want to perform the check before __ksize(), to avoid potentially
1252 * crashing in __ksize() due to accessing invalid metadata.
1253 */
1254 if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp))
1255 return 0;
1256
1257 return kfence_ksize(objp) ?: __ksize(objp);
1258}
1259EXPORT_SYMBOL(ksize);
1260
1261#ifdef CONFIG_BPF_SYSCALL
1262#include <linux/btf.h>
1263
1264__bpf_kfunc_start_defs();
1265
1266__bpf_kfunc struct kmem_cache *bpf_get_kmem_cache(u64 addr)
1267{
1268 struct slab *slab;
1269
1270 if (!virt_addr_valid((void *)(long)addr))
1271 return NULL;
1272
1273 slab = virt_to_slab((void *)(long)addr);
1274 return slab ? slab->slab_cache : NULL;
1275}
1276
1277__bpf_kfunc_end_defs();
1278#endif /* CONFIG_BPF_SYSCALL */
1279
1280/* Tracepoints definitions. */
1281EXPORT_TRACEPOINT_SYMBOL(kmalloc);
1282EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
1283EXPORT_TRACEPOINT_SYMBOL(kfree);
1284EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
1285
1286#ifndef CONFIG_KVFREE_RCU_BATCHED
1287
1288void kvfree_call_rcu(struct rcu_head *head, void *ptr)
1289{
1290 if (head) {
1291 kasan_record_aux_stack(ptr);
1292 call_rcu(head, kvfree_rcu_cb);
1293 return;
1294 }
1295
1296 // kvfree_rcu(one_arg) call.
1297 might_sleep();
1298 synchronize_rcu();
1299 kvfree(ptr);
1300}
1301EXPORT_SYMBOL_GPL(kvfree_call_rcu);
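/*
 * Illustrative usage sketch (editor's addition; "struct foo" and its
 * rcu_head field are made-up names). Callers reach kvfree_call_rcu()
 * through the kfree_rcu()/kvfree_rcu() macros:
 *
 *   struct foo {
 *           struct rcu_head rcu;
 *           ...
 *   };
 *
 *   kfree_rcu(p, rcu);            // two-argument form, queued via p->rcu
 *   kvfree_rcu_mightsleep(q);     // single-argument form, may block (above)
 */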
1302
1303void __init kvfree_rcu_init(void)
1304{
1305}
1306
1307#else /* CONFIG_KVFREE_RCU_BATCHED */
1308
1309/*
1310 * This rcu parameter is runtime-read-only. It reflects
1311 * a minimum allowed number of objects which can be cached
1312 * per-CPU. Object size is equal to one page. This value
1313 * can be changed at boot time.
1314 */
1315static int rcu_min_cached_objs = 5;
1316module_param(rcu_min_cached_objs, int, 0444);
1317
1318// A page shrinker can ask for pages to be freed to make them
1319// available for other parts of the system. This usually happens
1320// under low memory conditions, and in that case we should also
1321// defer page-cache filling for a short time period.
1322//
1323// The default value is 5 seconds, which is long enough to reduce
1324// interference with the shrinker while it asks other systems to
1325// drain their caches.
1326static int rcu_delay_page_cache_fill_msec = 5000;
1327module_param(rcu_delay_page_cache_fill_msec, int, 0444);
1328
1329static struct workqueue_struct *rcu_reclaim_wq;
1330
1331/* Maximum number of jiffies to wait before draining a batch. */
1332#define KFREE_DRAIN_JIFFIES (5 * HZ)
1333#define KFREE_N_BATCHES 2
1334#define FREE_N_CHANNELS 2
1335
1336/**
1337 * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
1338 * @list: List node. All blocks are linked between each other
1339 * @gp_snap: Snapshot of RCU state for objects placed to this bulk
1340 * @nr_records: Number of active pointers in the array
1341 * @records: Array of the kvfree_rcu() pointers
1342 */
1343struct kvfree_rcu_bulk_data {
1344 struct list_head list;
1345 struct rcu_gp_oldstate gp_snap;
1346 unsigned long nr_records;
1347 void *records[] __counted_by(nr_records);
1348};
1349
1350/*
1351 * This macro defines how many entries the "records" array
1352 * will contain. It is based on the fact that the size of
1353 * kvfree_rcu_bulk_data structure becomes exactly one page.
1354 */
1355#define KVFREE_BULK_MAX_ENTR \
1356 ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
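/*
 * Illustrative arithmetic (editor's note): with 4 KiB pages and a typical
 * 64-bit layout, sizeof(struct kvfree_rcu_bulk_data) is 40 bytes (16 for
 * the list_head, 16 for the rcu_gp_oldstate snapshot, 8 for nr_records),
 * so KVFREE_BULK_MAX_ENTR works out to (4096 - 40) / 8 = 507 pointers per
 * block. The exact figure depends on the architecture and configuration.
 */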
1357
1358/**
1359 * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
1360 * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
1361 * @head_free: List of kfree_rcu() objects waiting for a grace period
1362 * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
1363 * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
1364 * @krcp: Pointer to @kfree_rcu_cpu structure
1365 */
1366
1367struct kfree_rcu_cpu_work {
1368 struct rcu_work rcu_work;
1369 struct rcu_head *head_free;
1370 struct rcu_gp_oldstate head_free_gp_snap;
1371 struct list_head bulk_head_free[FREE_N_CHANNELS];
1372 struct kfree_rcu_cpu *krcp;
1373};
1374
1375/**
1376 * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
1377 * @head: List of kfree_rcu() objects not yet waiting for a grace period
1378 * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
1379 * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
1380 * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
1381 * @lock: Synchronize access to this structure
1382 * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
1383 * @initialized: The @rcu_work fields have been initialized
1384 * @head_count: Number of objects in rcu_head singular list
1385 * @bulk_count: Number of objects in bulk-list
1386 * @bkvcache:
1387 * A simple cache list that contains objects for reuse purpose.
1388 * In order to save some per-cpu space the list is singular.
1389 * Even though it is lockless an access has to be protected by the
1390 * per-cpu lock.
1391 * @page_cache_work: A work to refill the cache when it is empty
1392 * @backoff_page_cache_fill: Delay cache refills
1393 * @work_in_progress: Indicates that page_cache_work is running
1394 * @hrtimer: A hrtimer for scheduling a page_cache_work
1395 * @nr_bkv_objs: number of allocated objects at @bkvcache.
1396 *
1397 * This is a per-CPU structure. The reason that it is not included in
1398 * the rcu_data structure is to permit this code to be extracted from
1399 * the RCU files. Such extraction could allow further optimization of
1400 * the interactions with the slab allocators.
1401 */
1402struct kfree_rcu_cpu {
1403 // Objects queued on a linked list
1404 // through their rcu_head structures.
1405 struct rcu_head *head;
1406 unsigned long head_gp_snap;
1407 atomic_t head_count;
1408
1409 // Objects queued on a bulk-list.
1410 struct list_head bulk_head[FREE_N_CHANNELS];
1411 atomic_t bulk_count[FREE_N_CHANNELS];
1412
1413 struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
1414 raw_spinlock_t lock;
1415 struct delayed_work monitor_work;
1416 bool initialized;
1417
1418 struct delayed_work page_cache_work;
1419 atomic_t backoff_page_cache_fill;
1420 atomic_t work_in_progress;
1421 struct hrtimer hrtimer;
1422
1423 struct llist_head bkvcache;
1424 int nr_bkv_objs;
1425};
1426
1427static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
1428 .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
1429};
1430
1431static __always_inline void
1432debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
1433{
1434#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
1435 int i;
1436
1437 for (i = 0; i < bhead->nr_records; i++)
1438 debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
1439#endif
1440}
1441
1442static inline struct kfree_rcu_cpu *
1443krc_this_cpu_lock(unsigned long *flags)
1444{
1445 struct kfree_rcu_cpu *krcp;
1446
1447 local_irq_save(*flags); // For safely calling this_cpu_ptr().
1448 krcp = this_cpu_ptr(&krc);
1449 raw_spin_lock(&krcp->lock);
1450
1451 return krcp;
1452}
1453
1454static inline void
1455krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
1456{
1457 raw_spin_unlock_irqrestore(&krcp->lock, flags);
1458}
1459
1460static inline struct kvfree_rcu_bulk_data *
1461get_cached_bnode(struct kfree_rcu_cpu *krcp)
1462{
1463 if (!krcp->nr_bkv_objs)
1464 return NULL;
1465
1466 WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
1467 return (struct kvfree_rcu_bulk_data *)
1468 llist_del_first(&krcp->bkvcache);
1469}
1470
1471static inline bool
1472put_cached_bnode(struct kfree_rcu_cpu *krcp,
1473 struct kvfree_rcu_bulk_data *bnode)
1474{
1475 // Check the limit.
1476 if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
1477 return false;
1478
1479 llist_add((struct llist_node *) bnode, &krcp->bkvcache);
1480 WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
1481 return true;
1482}
1483
1484static int
1485drain_page_cache(struct kfree_rcu_cpu *krcp)
1486{
1487 unsigned long flags;
1488 struct llist_node *page_list, *pos, *n;
1489 int freed = 0;
1490
1491 if (!rcu_min_cached_objs)
1492 return 0;
1493
1494 raw_spin_lock_irqsave(&krcp->lock, flags);
1495 page_list = llist_del_all(&krcp->bkvcache);
1496 WRITE_ONCE(krcp->nr_bkv_objs, 0);
1497 raw_spin_unlock_irqrestore(&krcp->lock, flags);
1498
1499 llist_for_each_safe(pos, n, page_list) {
1500 free_page((unsigned long)pos);
1501 freed++;
1502 }
1503
1504 return freed;
1505}
1506
1507static void
1508kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
1509 struct kvfree_rcu_bulk_data *bnode, int idx)
1510{
1511 unsigned long flags;
1512 int i;
1513
1514 if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
1515 debug_rcu_bhead_unqueue(bnode);
1516 rcu_lock_acquire(&rcu_callback_map);
1517 if (idx == 0) { // kmalloc() / kfree().
1518 trace_rcu_invoke_kfree_bulk_callback(
1519 "slab", bnode->nr_records,
1520 bnode->records);
1521
1522 kfree_bulk(bnode->nr_records, bnode->records);
1523 } else { // vmalloc() / vfree().
1524 for (i = 0; i < bnode->nr_records; i++) {
1525 trace_rcu_invoke_kvfree_callback(
1526 "slab", bnode->records[i], 0);
1527
1528 vfree(bnode->records[i]);
1529 }
1530 }
1531 rcu_lock_release(&rcu_callback_map);
1532 }
1533
1534 raw_spin_lock_irqsave(&krcp->lock, flags);
1535 if (put_cached_bnode(krcp, bnode))
1536 bnode = NULL;
1537 raw_spin_unlock_irqrestore(&krcp->lock, flags);
1538
1539 if (bnode)
1540 free_page((unsigned long) bnode);
1541
1542 cond_resched_tasks_rcu_qs();
1543}
1544
1545static void
1546kvfree_rcu_list(struct rcu_head *head)
1547{
1548 struct rcu_head *next;
1549
1550 for (; head; head = next) {
1551 void *ptr = (void *) head->func;
1552 unsigned long offset = (void *) head - ptr;
1553
1554 next = head->next;
1555 debug_rcu_head_unqueue((struct rcu_head *)ptr);
1556 rcu_lock_acquire(&rcu_callback_map);
1557 trace_rcu_invoke_kvfree_callback("slab", head, offset);
1558
1559 kvfree(ptr);
1560
1561 rcu_lock_release(&rcu_callback_map);
1562 cond_resched_tasks_rcu_qs();
1563 }
1564}
1565
1566/*
1567 * This function is invoked in workqueue context after a grace period.
1568 * It frees all the objects queued on ->bulk_head_free or ->head_free.
1569 */
1570static void kfree_rcu_work(struct work_struct *work)
1571{
1572 unsigned long flags;
1573 struct kvfree_rcu_bulk_data *bnode, *n;
1574 struct list_head bulk_head[FREE_N_CHANNELS];
1575 struct rcu_head *head;
1576 struct kfree_rcu_cpu *krcp;
1577 struct kfree_rcu_cpu_work *krwp;
1578 struct rcu_gp_oldstate head_gp_snap;
1579 int i;
1580
1581 krwp = container_of(to_rcu_work(work),
1582 struct kfree_rcu_cpu_work, rcu_work);
1583 krcp = krwp->krcp;
1584
1585 raw_spin_lock_irqsave(&krcp->lock, flags);
1586 // Channels 1 and 2.
1587 for (i = 0; i < FREE_N_CHANNELS; i++)
1588 list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
1589
1590 // Channel 3.
1591 head = krwp->head_free;
1592 krwp->head_free = NULL;
1593 head_gp_snap = krwp->head_free_gp_snap;
1594 raw_spin_unlock_irqrestore(&krcp->lock, flags);
1595
1596 // Handle the first two channels.
1597 for (i = 0; i < FREE_N_CHANNELS; i++) {
1598 // Start from the tail page, so a GP is likely passed for it.
1599 list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
1600 kvfree_rcu_bulk(krcp, bnode, i);
1601 }
1602
1603 /*
1604 * This is used when the "bulk" path can not be used for the
1605 * double-argument of kvfree_rcu(). This happens when the
1606 * page-cache is empty, which means that objects are instead
1607 * queued on a linked list through their rcu_head structures.
1608 * This list is named "Channel 3".
1609 */
1610 if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
1611 kvfree_rcu_list(head);
1612}
1613
1614static bool kfree_rcu_sheaf(void *obj)
1615{
1616 struct kmem_cache *s;
1617 struct folio *folio;
1618 struct slab *slab;
1619
1620 if (is_vmalloc_addr(obj))
1621 return false;
1622
1623 folio = virt_to_folio(obj);
1624 if (unlikely(!folio_test_slab(folio)))
1625 return false;
1626
1627 slab = folio_slab(folio);
1628 s = slab->slab_cache;
1629 if (s->cpu_sheaves) {
1630 if (likely(!IS_ENABLED(CONFIG_NUMA) ||
1631 slab_nid(slab) == numa_mem_id()))
1632 return __kfree_rcu_sheaf(s, obj);
1633 }
1634
1635 return false;
1636}
1637
1638static bool
1639need_offload_krc(struct kfree_rcu_cpu *krcp)
1640{
1641 int i;
1642
1643 for (i = 0; i < FREE_N_CHANNELS; i++)
1644 if (!list_empty(&krcp->bulk_head[i]))
1645 return true;
1646
1647 return !!READ_ONCE(krcp->head);
1648}
1649
1650static bool
1651need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
1652{
1653 int i;
1654
1655 for (i = 0; i < FREE_N_CHANNELS; i++)
1656 if (!list_empty(&krwp->bulk_head_free[i]))
1657 return true;
1658
1659 return !!krwp->head_free;
1660}
1661
1662static int krc_count(struct kfree_rcu_cpu *krcp)
1663{
1664 int sum = atomic_read(&krcp->head_count);
1665 int i;
1666
1667 for (i = 0; i < FREE_N_CHANNELS; i++)
1668 sum += atomic_read(&krcp->bulk_count[i]);
1669
1670 return sum;
1671}
1672
1673static void
1674__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
1675{
1676 long delay, delay_left;
1677
1678 delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1 : KFREE_DRAIN_JIFFIES;
1679 if (delayed_work_pending(&krcp->monitor_work)) {
1680 delay_left = krcp->monitor_work.timer.expires - jiffies;
1681 if (delay < delay_left)
1682 mod_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
1683 return;
1684 }
1685 queue_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
1686}
1687
1688static void
1689schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
1690{
1691 unsigned long flags;
1692
1693 raw_spin_lock_irqsave(&krcp->lock, flags);
1694 __schedule_delayed_monitor_work(krcp);
1695 raw_spin_unlock_irqrestore(&krcp->lock, flags);
1696}
1697
1698static void
1699kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
1700{
1701 struct list_head bulk_ready[FREE_N_CHANNELS];
1702 struct kvfree_rcu_bulk_data *bnode, *n;
1703 struct rcu_head *head_ready = NULL;
1704 unsigned long flags;
1705 int i;
1706
1707 raw_spin_lock_irqsave(&krcp->lock, flags);
1708 for (i = 0; i < FREE_N_CHANNELS; i++) {
1709 INIT_LIST_HEAD(&bulk_ready[i]);
1710
1711 list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
1712 if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
1713 break;
1714
1715 atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
1716 list_move(&bnode->list, &bulk_ready[i]);
1717 }
1718 }
1719
1720 if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
1721 head_ready = krcp->head;
1722 atomic_set(&krcp->head_count, 0);
1723 WRITE_ONCE(krcp->head, NULL);
1724 }
1725 raw_spin_unlock_irqrestore(&krcp->lock, flags);
1726
1727 for (i = 0; i < FREE_N_CHANNELS; i++) {
1728 list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
1729 kvfree_rcu_bulk(krcp, bnode, i);
1730 }
1731
1732 if (head_ready)
1733 kvfree_rcu_list(head_ready);
1734}
1735
1736/*
1737 * Return: %true if a work is queued, %false otherwise.
1738 */
1739static bool
1740kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
1741{
1742 unsigned long flags;
1743 bool queued = false;
1744 int i, j;
1745
1746 raw_spin_lock_irqsave(&krcp->lock, flags);
1747
1748 // Attempt to start a new batch.
1749 for (i = 0; i < KFREE_N_BATCHES; i++) {
1750 struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
1751
1752 // Try to detach bulk_head or head and attach it, only when
1753 // all channels are free. If any channel is not free, krwp
1754 // still has in-flight RCU work handling a previous batch.
1755 if (need_wait_for_krwp_work(krwp))
1756 continue;
1757
1758 // kvfree_rcu_drain_ready() might handle this krcp, if so give up.
1759 if (need_offload_krc(krcp)) {
1760 // Channel 1 corresponds to the SLAB-pointer bulk path.
1761 // Channel 2 corresponds to vmalloc-pointer bulk path.
1762 for (j = 0; j < FREE_N_CHANNELS; j++) {
1763 if (list_empty(&krwp->bulk_head_free[j])) {
1764 atomic_set(&krcp->bulk_count[j], 0);
1765 list_replace_init(&krcp->bulk_head[j],
1766 &krwp->bulk_head_free[j]);
1767 }
1768 }
1769
1770 // Channel 3 corresponds to both SLAB and vmalloc
1771 // objects queued on the linked list.
1772 if (!krwp->head_free) {
1773 krwp->head_free = krcp->head;
1774 get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
1775 atomic_set(&krcp->head_count, 0);
1776 WRITE_ONCE(krcp->head, NULL);
1777 }
1778
1779 // One work item is used per batch, and a batch can handle
1780 // all three "free channels". Break out of the loop since
1781 // this CPU is done; queuing the RCU work here always
1782 // succeeds.
1783 queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work);
1784 WARN_ON_ONCE(!queued);
1785 break;
1786 }
1787 }
1788
1789 raw_spin_unlock_irqrestore(&krcp->lock, flags);
1790 return queued;
1791}
1792
1793/*
1794 * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
1795 */
1796static void kfree_rcu_monitor(struct work_struct *work)
1797{
1798 struct kfree_rcu_cpu *krcp = container_of(work,
1799 struct kfree_rcu_cpu, monitor_work.work);
1800
1801 // Drain ready for reclaim.
1802 kvfree_rcu_drain_ready(krcp);
1803
1804 // Queue a batch for the rest.
1805 kvfree_rcu_queue_batch(krcp);
1806
1807 // If there is nothing to detach, our job here is successfully
1808 // done. If at least one of the channels is still busy, rearm
1809 // the work to repeat the attempt, because previous batches
1810 // are still in progress.
1812 if (need_offload_krc(krcp))
1813 schedule_delayed_monitor_work(krcp);
1814}
1815
1816static void fill_page_cache_func(struct work_struct *work)
1817{
1818 struct kvfree_rcu_bulk_data *bnode;
1819 struct kfree_rcu_cpu *krcp =
1820 container_of(work, struct kfree_rcu_cpu,
1821 page_cache_work.work);
1822 unsigned long flags;
1823 int nr_pages;
1824 bool pushed;
1825 int i;
1826
1827 nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
1828 1 : rcu_min_cached_objs;
1829
1830 for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
1831 bnode = (struct kvfree_rcu_bulk_data *)
1832 __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
1833
1834 if (!bnode)
1835 break;
1836
1837 raw_spin_lock_irqsave(&krcp->lock, flags);
1838 pushed = put_cached_bnode(krcp, bnode);
1839 raw_spin_unlock_irqrestore(&krcp->lock, flags);
1840
1841 if (!pushed) {
1842 free_page((unsigned long) bnode);
1843 break;
1844 }
1845 }
1846
1847 atomic_set(&krcp->work_in_progress, 0);
1848 atomic_set(&krcp->backoff_page_cache_fill, 0);
1849}
1850
1851// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
1852// state specified by flags. If can_alloc is true, the caller must
1853// be schedulable and not be holding any locks or mutexes that might be
1854// acquired by the memory allocator or anything that it might invoke.
1855// Returns true if ptr was successfully recorded, else the caller must
1856// use a fallback.
1857static inline bool
1858add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
1859 unsigned long *flags, void *ptr, bool can_alloc)
1860{
1861 struct kvfree_rcu_bulk_data *bnode;
1862 int idx;
1863
1864 *krcp = krc_this_cpu_lock(flags);
1865 if (unlikely(!(*krcp)->initialized))
1866 return false;
1867
1868 idx = !!is_vmalloc_addr(ptr);
1869 bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
1870 struct kvfree_rcu_bulk_data, list);
1871
1872 /* Check if a new block is required. */
1873 if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
1874 bnode = get_cached_bnode(*krcp);
1875 if (!bnode && can_alloc) {
1876 krc_this_cpu_unlock(*krcp, *flags);
1877
1878 // __GFP_NORETRY - permits only light-weight direct reclaim,
1879 // which is fine since it minimizes how often the fallback
1880 // path is hit. It also forbids invoking the OOM killer,
1881 // which is desirable since we are about to release memory soon.
1882 //
1883 // __GFP_NOMEMALLOC - prevents consuming all of the memory
1884 // reserves. Please note that a fallback path exists.
1885 //
1886 // __GFP_NOWARN - the allocation is expected to fail under
1887 // low-memory or high memory-pressure scenarios, so do not
1888 // warn about it.
1889 bnode = (struct kvfree_rcu_bulk_data *)
1890 __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
1891 raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
1892 }
1893
1894 if (!bnode)
1895 return false;
1896
1897 // Initialize the new block and attach it.
1898 bnode->nr_records = 0;
1899 list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
1900 }
1901
1902 // Finally, insert the pointer and record the GP snapshot for this page.
1903 bnode->nr_records++;
1904 bnode->records[bnode->nr_records - 1] = ptr;
1905 get_state_synchronize_rcu_full(&bnode->gp_snap);
1906 atomic_inc(&(*krcp)->bulk_count[idx]);
1907
1908 return true;
1909}
1910
1911static enum hrtimer_restart
1912schedule_page_work_fn(struct hrtimer *t)
1913{
1914 struct kfree_rcu_cpu *krcp =
1915 container_of(t, struct kfree_rcu_cpu, hrtimer);
1916
1917 queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
1918 return HRTIMER_NORESTART;
1919}
1920
1921static void
1922run_page_cache_worker(struct kfree_rcu_cpu *krcp)
1923{
1924 // If cache disabled, bail out.
1925 if (!rcu_min_cached_objs)
1926 return;
1927
1928 if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
1929 !atomic_xchg(&krcp->work_in_progress, 1)) {
1930 if (atomic_read(&krcp->backoff_page_cache_fill)) {
1931 queue_delayed_work(rcu_reclaim_wq,
1932 &krcp->page_cache_work,
1933 msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
1934 } else {
1935 hrtimer_setup(&krcp->hrtimer, schedule_page_work_fn, CLOCK_MONOTONIC,
1936 HRTIMER_MODE_REL);
1937 hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
1938 }
1939 }
1940}
1941
1942void __init kfree_rcu_scheduler_running(void)
1943{
1944 int cpu;
1945
1946 for_each_possible_cpu(cpu) {
1947 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
1948
1949 if (need_offload_krc(krcp))
1950 schedule_delayed_monitor_work(krcp);
1951 }
1952}
1953
1954/*
1955 * Queue a request for lazy invocation of the appropriate free routine
1956 * after a grace period. Please note that three paths are maintained,
1957 * two for the common case using arrays of pointers and a third one that
1958 * is used only when the main paths cannot be used, for example, due to
1959 * memory pressure.
1960 *
1961 * Each kvfree_call_rcu() request is added to a batch. The batch is drained
1962 * every KFREE_DRAIN_JIFFIES jiffies and all objects in the batch are freed
1963 * in workqueue context. This allows requests to be batched together, which
1964 * reduces the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
1965 */
1966void kvfree_call_rcu(struct rcu_head *head, void *ptr)
1967{
1968 unsigned long flags;
1969 struct kfree_rcu_cpu *krcp;
1970 bool success;
1971
1972 /*
1973 * The head-less variant has a limitation, hence the clear
1974 * rule for such objects: it may only be used from a context
1975 * in which sleeping is allowed (see the might_sleep() check
1976 * below). Everywhere else, embed an rcu_head in your data
1977 * and use the two-argument form instead.
1978 */
1979 if (!head)
1980 might_sleep();
1981
1982 if (!IS_ENABLED(CONFIG_PREEMPT_RT) && kfree_rcu_sheaf(ptr))
1983 return;
1984
1985 // Queue the object but don't yet schedule the batch.
1986 if (debug_rcu_head_queue(ptr)) {
1987 // Probable double kfree_rcu(), just leak.
1988 WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
1989 __func__, head);
1990
1991 // Nothing more can be done here; just return.
1992 return;
1993 }
1994
1995 kasan_record_aux_stack(ptr);
1996 success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
1997 if (!success) {
1998 run_page_cache_worker(krcp);
1999
2000 if (head == NULL)
2001 // Head-less kvfree_rcu(one_arg) call: free it inline below.
2002 goto unlock_return;
2003
2004 head->func = ptr;
2005 head->next = krcp->head;
2006 WRITE_ONCE(krcp->head, head);
2007 atomic_inc(&krcp->head_count);
2008
2009 // Take a snapshot for this krcp.
2010 krcp->head_gp_snap = get_state_synchronize_rcu();
2011 success = true;
2012 }
2013
2014 /*
2015 * The kvfree_rcu() caller considers the pointer freed at this point
2016 * and likely removes any references to it. Since the actual slab
2017 * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
2018 * this object (no scanning or false positives reporting).
2019 */
2020 kmemleak_ignore(ptr);
2021
2022 // Set timer to drain after KFREE_DRAIN_JIFFIES.
2023 if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
2024 __schedule_delayed_monitor_work(krcp);
2025
2026unlock_return:
2027 krc_this_cpu_unlock(krcp, flags);
2028
2029 /*
2030 * Fall back to inline kvfree() after synchronize_rcu(). This
2031 * is only reachable from a sleepable context, so the current
2032 * CPU can pass through a quiescent state while waiting.
2033 */
2034 if (!success) {
2035 debug_rcu_head_unqueue((struct rcu_head *) ptr);
2036 synchronize_rcu();
2037 kvfree(ptr);
2038 }
2039}
2040EXPORT_SYMBOL_GPL(kvfree_call_rcu);
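/*
 * Usage sketch (illustrative only, not part of this file): kvfree_call_rcu()
 * is normally reached through the kfree_rcu()/kvfree_rcu() macros. "struct foo",
 * "fp" and "vp" below are hypothetical caller-side names.
 *
 *	struct foo {
 *		struct rcu_head rcu;
 *		int data;
 *	};
 *
 *	// Two-argument form: usable from atomic context, rcu_head embedded.
 *	kfree_rcu(fp, rcu);
 *
 *	// Head-less form: may hit the inline synchronize_rcu() fallback,
 *	// so it must only be used where sleeping is allowed.
 *	kvfree_rcu_mightsleep(vp);
 */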
2041
2042/**
2043 * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
2044 *
2045 * Note that the single-argument form of kvfree_rcu() has a slow path that
2046 * invokes synchronize_rcu() followed by freeing the pointer, all before
2047 * returning. Therefore, for any single-argument call that results in a
2048 * kfree() into a cache that is to be destroyed during module exit, it is
2049 * the developer's responsibility to ensure that all such calls have
2050 * returned before kmem_cache_destroy() is invoked.
2051 */
2052void kvfree_rcu_barrier(void)
2053{
2054 struct kfree_rcu_cpu_work *krwp;
2055 struct kfree_rcu_cpu *krcp;
2056 bool queued;
2057 int i, cpu;
2058
2059 flush_all_rcu_sheaves();
2060
2061 /*
2062 * First, objects are detached and queued in an RCU batch for every
2063 * CPU. Then the queued works are flushed for each CPU.
2064 *
2065 * Please note: if there are outstanding batches for a particular
2066 * CPU, those have to finish first before a new one can be queued.
2067 */
2068 for_each_possible_cpu(cpu) {
2069 krcp = per_cpu_ptr(&krc, cpu);
2070
2071 /*
2072 * Check whether this CPU has any objects queued to wait for a new
2073 * GP completion. If not, there is nothing to detach and we are done
2074 * with this CPU. If any batch is pending or running for this "krcp",
2075 * the per-CPU flush_rcu_work() below waits for its completion (last step).
2076 */
2077 if (!need_offload_krc(krcp))
2078 continue;
2079
2080 while (1) {
2081 /*
2082 * If we are unable to queue a new RCU work it means either:
2083 * - batches for this CPU are still in flight; they have to be
2084 * flushed first, after which we repeat the attempt; or
2085 * - there are no objects left to detach, due to concurrency.
2086 */
2087 queued = kvfree_rcu_queue_batch(krcp);
2088
2089 /*
2090 * Bail out if there is no longer any need to offload this
2091 * "krcp". As noted earlier, this can happen concurrently.
2092 */
2093 if (queued || !need_offload_krc(krcp))
2094 break;
2095
2096 /* There are ongoing batches. */
2097 for (i = 0; i < KFREE_N_BATCHES; i++) {
2098 krwp = &(krcp->krw_arr[i]);
2099 flush_rcu_work(&krwp->rcu_work);
2100 }
2101 }
2102 }
2103
2104 /*
2105 * Now we guarantee that all objects are flushed.
2106 */
2107 for_each_possible_cpu(cpu) {
2108 krcp = per_cpu_ptr(&krc, cpu);
2109
2110 /*
2111 * A monitor work can drain ready-to-reclaim objects directly,
2112 * so wait for its completion if it is running or pending.
2113 */
2114 cancel_delayed_work_sync(&krcp->monitor_work);
2115
2116 for (i = 0; i < KFREE_N_BATCHES; i++) {
2117 krwp = &(krcp->krw_arr[i]);
2118 flush_rcu_work(&krwp->rcu_work);
2119 }
2120 }
2121}
2122EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
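/*
 * Illustrative teardown sketch (hypothetical module, not part of this file),
 * following the note above: one way a user of the API might flush in-flight
 * kvfree_rcu() batches before the backing cache goes away.
 *
 *	static void __exit foo_exit(void)
 *	{
 *		// All single-argument kvfree_rcu() calls must have returned
 *		// by this point; then flush the queued batches.
 *		kvfree_rcu_barrier();
 *		kmem_cache_destroy(foo_cachep);
 *	}
 */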
2123
2124static unsigned long
2125kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
2126{
2127 int cpu;
2128 unsigned long count = 0;
2129
2130 /* Snapshot count of all CPUs */
2131 for_each_possible_cpu(cpu) {
2132 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
2133
2134 count += krc_count(krcp);
2135 count += READ_ONCE(krcp->nr_bkv_objs);
2136 atomic_set(&krcp->backoff_page_cache_fill, 1);
2137 }
2138
2139 return count == 0 ? SHRINK_EMPTY : count;
2140}
2141
2142static unsigned long
2143kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
2144{
2145 int cpu, freed = 0;
2146
2147 for_each_possible_cpu(cpu) {
2148 int count;
2149 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
2150
2151 count = krc_count(krcp);
2152 count += drain_page_cache(krcp);
2153 kfree_rcu_monitor(&krcp->monitor_work.work);
2154
2155 sc->nr_to_scan -= count;
2156 freed += count;
2157
2158 if (sc->nr_to_scan <= 0)
2159 break;
2160 }
2161
2162 return freed == 0 ? SHRINK_STOP : freed;
2163}
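/*
 * For reference: ->count_objects may return SHRINK_EMPTY when there is
 * nothing reclaimable, and ->scan_objects returns the number of objects
 * freed, or SHRINK_STOP when no further progress can be made.
 */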
2164
2165void __init kvfree_rcu_init(void)
2166{
2167 int cpu;
2168 int i, j;
2169 struct shrinker *kfree_rcu_shrinker;
2170
2171 rcu_reclaim_wq = alloc_workqueue("kvfree_rcu_reclaim",
2172 WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
2173 WARN_ON(!rcu_reclaim_wq);
2174
2175 /* Clamp it to [0:100] seconds interval. */
2176 if (rcu_delay_page_cache_fill_msec < 0 ||
2177 rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
2178
2179 rcu_delay_page_cache_fill_msec =
2180 clamp(rcu_delay_page_cache_fill_msec, 0,
2181 (int) (100 * MSEC_PER_SEC));
2182
2183 pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
2184 rcu_delay_page_cache_fill_msec);
2185 }
2186
2187 for_each_possible_cpu(cpu) {
2188 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
2189
2190 for (i = 0; i < KFREE_N_BATCHES; i++) {
2191 INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
2192 krcp->krw_arr[i].krcp = krcp;
2193
2194 for (j = 0; j < FREE_N_CHANNELS; j++)
2195 INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
2196 }
2197
2198 for (i = 0; i < FREE_N_CHANNELS; i++)
2199 INIT_LIST_HEAD(&krcp->bulk_head[i]);
2200
2201 INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
2202 INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
2203 krcp->initialized = true;
2204 }
2205
2206 kfree_rcu_shrinker = shrinker_alloc(0, "slab-kvfree-rcu");
2207 if (!kfree_rcu_shrinker) {
2208 pr_err("Failed to allocate kfree_rcu() shrinker!\n");
2209 return;
2210 }
2211
2212 kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
2213 kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
2214
2215 shrinker_register(kfree_rcu_shrinker);
2216}
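/*
 * Tuning sketch (illustrative only): the page-cache behaviour initialized
 * above can be adjusted at boot time. The second parameter name is taken
 * from the pr_info() above; the first is assumed to share the same
 * "rcutree." prefix.
 *
 *	rcutree.rcu_min_cached_objs=8
 *	rcutree.rcu_delay_page_cache_fill_msec=5000
 */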
2217
2218#endif /* CONFIG_KVFREE_RCU_BATCHED */
2219
2220