1// SPDX-License-Identifier: GPL-2.0
2/*
3 * SLUB: A slab allocator that limits cache line use instead of queuing
4 * objects in per cpu and per node lists.
5 *
6 * The allocator synchronizes using per slab locks or atomic operations
7 * and only uses a centralized lock to manage a pool of partial slabs.
8 *
9 * (C) 2007 SGI, Christoph Lameter
10 * (C) 2011 Linux Foundation, Christoph Lameter
11 */
12
13#include <linux/mm.h>
14#include <linux/swap.h> /* mm_account_reclaimed_pages() */
15#include <linux/module.h>
16#include <linux/bit_spinlock.h>
17#include <linux/interrupt.h>
18#include <linux/swab.h>
19#include <linux/bitops.h>
20#include <linux/slab.h>
21#include "slab.h"
22#include <linux/vmalloc.h>
23#include <linux/proc_fs.h>
24#include <linux/seq_file.h>
25#include <linux/kasan.h>
26#include <linux/node.h>
27#include <linux/kmsan.h>
28#include <linux/cpu.h>
29#include <linux/cpuset.h>
30#include <linux/mempolicy.h>
31#include <linux/ctype.h>
32#include <linux/stackdepot.h>
33#include <linux/debugobjects.h>
34#include <linux/kallsyms.h>
35#include <linux/kfence.h>
36#include <linux/memory.h>
37#include <linux/math64.h>
38#include <linux/fault-inject.h>
39#include <linux/kmemleak.h>
40#include <linux/stacktrace.h>
41#include <linux/prefetch.h>
42#include <linux/memcontrol.h>
43#include <linux/random.h>
44#include <kunit/test.h>
45#include <kunit/test-bug.h>
46#include <linux/sort.h>
47#include <linux/irq_work.h>
48#include <linux/kprobes.h>
49#include <linux/debugfs.h>
50#include <trace/events/kmem.h>
51
52#include "internal.h"
53
54/*
55 * Lock order:
56 * 1. slab_mutex (Global Mutex)
57 * 2. node->list_lock (Spinlock)
58 * 3. kmem_cache->cpu_slab->lock (Local lock)
59 * 4. slab_lock(slab) (Only on some arches)
60 * 5. object_map_lock (Only for debugging)
61 *
62 * slab_mutex
63 *
64 * The role of the slab_mutex is to protect the list of all the slabs
65 * and to synchronize major metadata changes to slab cache structures.
66 * Also synchronizes memory hotplug callbacks.
67 *
68 * slab_lock
69 *
70 * The slab_lock is a wrapper around the page lock, thus it is a bit
71 * spinlock.
72 *
73 * The slab_lock is only used on arches that do not have the ability
74 * to do a cmpxchg_double. It only protects:
75 *
76 * A. slab->freelist -> List of free objects in a slab
77 * B. slab->inuse -> Number of objects in use
78 * C. slab->objects -> Number of objects in slab
79 * D. slab->frozen -> frozen state
80 *
81 * Frozen slabs
82 *
83 * If a slab is frozen then it is exempt from list management. It is
84 * the cpu slab which is actively allocated from by the processor that
85 * froze it and it is not on any list. The processor that froze the
86 * slab is the one who can perform list operations on the slab. Other
87 * processors may put objects onto the freelist but the processor that
88 * froze the slab is the only one that can retrieve the objects from the
89 * slab's freelist.
90 *
91 * CPU partial slabs
92 *
 * Partially empty slabs are cached on the CPU partial list for
 * performance reasons, as taking a slab from there speeds up allocation.
95 * These slabs are not frozen, but are also exempt from list management,
96 * by clearing the SL_partial flag when moving out of the node
97 * partial list. Please see __slab_free() for more details.
98 *
99 * To sum up, the current scheme is:
100 * - node partial slab: SL_partial && !frozen
101 * - cpu partial slab: !SL_partial && !frozen
102 * - cpu slab: !SL_partial && frozen
103 * - full slab: !SL_partial && !frozen
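 *
 * As a purely illustrative sketch (not allocator code), a slab's role can
 * be read back from this state roughly as follows, assuming a hypothetical
 * helper that tests the SL_partial bit:
 *
 *	if (slab->frozen)
 *		role = cpu_slab;
 *	else if (slab_test_partial(slab))	/* hypothetical SL_partial test */
 *		role = node_partial_slab;
 *	else
 *		role = cpu_partial_or_full_slab;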
104 *
105 * list_lock
106 *
 * The list_lock protects the partial and full lists on each node and
 * the partial slab counter. While it is held, no slabs may be added to or
 * removed from those lists, nor may the number of partial slabs change.
110 * (Note that the total number of slabs is an atomic value that may be
111 * modified without taking the list lock).
112 *
113 * The list_lock is a centralized lock and thus we avoid taking it as
114 * much as possible. As long as SLUB does not have to handle partial
115 * slabs, operations can continue without any centralized lock. F.e.
116 * allocating a long series of objects that fill up slabs does not require
117 * the list lock.
118 *
119 * For debug caches, all allocations are forced to go through a list_lock
120 * protected region to serialize against concurrent validation.
121 *
122 * cpu_slab->lock local lock
123 *
 * This lock protects slowpath manipulation of all kmem_cache_cpu fields
125 * except the stat counters. This is a percpu structure manipulated only by
126 * the local cpu, so the lock protects against being preempted or interrupted
127 * by an irq. Fast path operations rely on lockless operations instead.
128 *
 * On PREEMPT_RT, the local lock neither disables interrupts nor preemption,
 * which means the lockless fastpath cannot be used as it might interfere with
 * an in-progress slow path operation. In this case the local lock is always
 * taken, but the freelist is still used for the common operations.
133 *
134 * lockless fastpaths
135 *
136 * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
137 * are fully lockless when satisfied from the percpu slab (and when
138 * cmpxchg_double is possible to use, otherwise slab_lock is taken).
139 * They also don't disable preemption or migration or irqs. They rely on
140 * the transaction id (tid) field to detect being preempted or moved to
141 * another cpu.
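 *
 * A simplified sketch of that pattern (pseudocode, not the actual code):
 *
 *	c = raw_cpu_ptr(s->cpu_slab);
 *	object = c->freelist;
 *	tid = c->tid;
 *	next = get_freepointer(s, object);
 *	if (!this_cpu_cmpxchg_double(s->cpu_slab->freelist, s->cpu_slab->tid,
 *				     object, tid, next, next_tid(tid)))
 *		goto redo;
 *
 * If this cpu was preempted, migrated, or interrupted by something that
 * touched the percpu slab in between, the tid no longer matches, the
 * cmpxchg fails and the whole operation is simply retried.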
142 *
143 * irq, preemption, migration considerations
144 *
145 * Interrupts are disabled as part of list_lock or local_lock operations, or
146 * around the slab_lock operation, in order to make the slab allocator safe
147 * to use in the context of an irq.
148 *
149 * In addition, preemption (or migration on PREEMPT_RT) is disabled in the
150 * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
151 * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
152 * doesn't have to be revalidated in each section protected by the local lock.
153 *
154 * SLUB assigns one slab for allocation to each processor.
155 * Allocations only occur from these slabs called cpu slabs.
156 *
157 * Slabs with free elements are kept on a partial list and during regular
158 * operations no list for full slabs is used. If an object in a full slab is
159 * freed then the slab will show up again on the partial lists.
160 * We track full slabs for debugging purposes though because otherwise we
161 * cannot scan all objects.
162 *
163 * Slabs are freed when they become empty. Teardown and setup is
164 * minimal so we rely on the page allocators per cpu caches for
165 * fast frees and allocs.
166 *
167 * slab->frozen The slab is frozen and exempt from list processing.
168 * This means that the slab is dedicated to a purpose
169 * such as satisfying allocations for a specific
170 * processor. Objects may be freed in the slab while
171 * it is frozen but slab_free will then skip the usual
172 * list operations. It is up to the processor holding
173 * the slab to integrate the slab into the slab lists
174 * when the slab is no longer needed.
175 *
176 * One use of this flag is to mark slabs that are
177 * used for allocations. Then such a slab becomes a cpu
178 * slab. The cpu slab may be equipped with an additional
179 * freelist that allows lockless access to
180 * free objects in addition to the regular freelist
181 * that requires the slab lock.
182 *
183 * SLAB_DEBUG_FLAGS Slab requires special handling due to debug
184 * options set. This moves slab handling out of
185 * the fast path and disables lockless freelists.
186 */
187
188/**
189 * enum slab_flags - How the slab flags bits are used.
190 * @SL_locked: Is locked with slab_lock()
191 * @SL_partial: On the per-node partial list
192 * @SL_pfmemalloc: Was allocated from PF_MEMALLOC reserves
193 *
194 * The slab flags share space with the page flags but some bits have
195 * different interpretations. The high bits are used for information
196 * like zone/node/section.
197 */
198enum slab_flags {
199 SL_locked = PG_locked,
200 SL_partial = PG_workingset, /* Historical reasons for this bit */
201 SL_pfmemalloc = PG_active, /* Historical reasons for this bit */
202};
203
204/*
 * We could simply use migrate_disable()/enable(), but as long as that remains
 * a function call even on !PREEMPT_RT, use the inline preempt_disable() there
 * instead.
207 */
208#ifndef CONFIG_PREEMPT_RT
209#define slub_get_cpu_ptr(var) get_cpu_ptr(var)
210#define slub_put_cpu_ptr(var) put_cpu_ptr(var)
211#define USE_LOCKLESS_FAST_PATH() (true)
212#else
213#define slub_get_cpu_ptr(var) \
214({ \
215 migrate_disable(); \
216 this_cpu_ptr(var); \
217})
218#define slub_put_cpu_ptr(var) \
219do { \
220 (void)(var); \
221 migrate_enable(); \
222} while (0)
223#define USE_LOCKLESS_FAST_PATH() (false)
224#endif
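
/*
 * Illustrative usage sketch of the wrappers above (the real users appear
 * later in this file):
 *
 *	struct kmem_cache_cpu *c = slub_get_cpu_ptr(s->cpu_slab);
 *
 *	... work on c while pinned to this cpu (or with migration disabled
 *	    on PREEMPT_RT) ...
 *
 *	slub_put_cpu_ptr(s->cpu_slab);
 */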
225
226#ifndef CONFIG_SLUB_TINY
227#define __fastpath_inline __always_inline
228#else
229#define __fastpath_inline
230#endif
231
232#ifdef CONFIG_SLUB_DEBUG
233#ifdef CONFIG_SLUB_DEBUG_ON
234DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
235#else
236DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
237#endif
238#endif /* CONFIG_SLUB_DEBUG */
239
240#ifdef CONFIG_NUMA
241static DEFINE_STATIC_KEY_FALSE(strict_numa);
242#endif
243
244/* Structure holding parameters for get_partial() call chain */
245struct partial_context {
246 gfp_t flags;
247 unsigned int orig_size;
248 void *object;
249};
250
251static inline bool kmem_cache_debug(struct kmem_cache *s)
252{
253 return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
254}
255
256void *fixup_red_left(struct kmem_cache *s, void *p)
257{
258 if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
259 p += s->red_left_pad;
260
261 return p;
262}
263
264static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
265{
266#ifdef CONFIG_SLUB_CPU_PARTIAL
267 return !kmem_cache_debug(s);
268#else
269 return false;
270#endif
271}
272
273/*
274 * Issues still to be resolved:
275 *
276 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
277 *
278 * - Variable sizing of the per node arrays
279 */
280
281/* Enable to log cmpxchg failures */
282#undef SLUB_DEBUG_CMPXCHG
283
284#ifndef CONFIG_SLUB_TINY
285/*
286 * Minimum number of partial slabs. These will be left on the partial
287 * lists even if they are empty. kmem_cache_shrink may reclaim them.
288 */
289#define MIN_PARTIAL 5
290
291/*
292 * Maximum number of desirable partial slabs.
293 * The existence of more partial slabs makes kmem_cache_shrink
294 * sort the partial list by the number of objects in use.
295 */
296#define MAX_PARTIAL 10
297#else
298#define MIN_PARTIAL 0
299#define MAX_PARTIAL 0
300#endif
301
302#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
303 SLAB_POISON | SLAB_STORE_USER)
304
305/*
306 * These debug flags cannot use CMPXCHG because there might be consistency
307 * issues when checking or reading debug information
308 */
309#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
310 SLAB_TRACE)
311
312
313/*
314 * Debugging flags that require metadata to be stored in the slab. These get
315 * disabled when slab_debug=O is used and a cache's min order increases with
316 * metadata.
317 */
318#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
319
320#define OO_SHIFT 16
321#define OO_MASK ((1 << OO_SHIFT) - 1)
322#define MAX_OBJS_PER_PAGE 32767 /* since slab.objects is u15 */
323
324/* Internal SLUB flags */
325/* Poison object */
326#define __OBJECT_POISON __SLAB_FLAG_BIT(_SLAB_OBJECT_POISON)
327/* Use cmpxchg_double */
328
329#ifdef system_has_freelist_aba
330#define __CMPXCHG_DOUBLE __SLAB_FLAG_BIT(_SLAB_CMPXCHG_DOUBLE)
331#else
332#define __CMPXCHG_DOUBLE __SLAB_FLAG_UNUSED
333#endif
334
335/*
336 * Tracking user of a slab.
337 */
338#define TRACK_ADDRS_COUNT 16
339struct track {
340 unsigned long addr; /* Called from address */
341#ifdef CONFIG_STACKDEPOT
342 depot_stack_handle_t handle;
343#endif
344 int cpu; /* Was running on cpu */
345 int pid; /* Pid context */
346 unsigned long when; /* When did the operation occur */
347};
348
349enum track_item { TRACK_ALLOC, TRACK_FREE };
350
351#ifdef SLAB_SUPPORTS_SYSFS
352static int sysfs_slab_add(struct kmem_cache *);
353static int sysfs_slab_alias(struct kmem_cache *, const char *);
354#else
355static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
356static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
357 { return 0; }
358#endif
359
360#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
361static void debugfs_slab_add(struct kmem_cache *);
362#else
363static inline void debugfs_slab_add(struct kmem_cache *s) { }
364#endif
365
366enum stat_item {
367 ALLOC_PCS, /* Allocation from percpu sheaf */
368 ALLOC_FASTPATH, /* Allocation from cpu slab */
369 ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */
370 FREE_PCS, /* Free to percpu sheaf */
371 FREE_RCU_SHEAF, /* Free to rcu_free sheaf */
372 FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */
373 FREE_FASTPATH, /* Free to cpu slab */
374 FREE_SLOWPATH, /* Freeing not to cpu slab */
375 FREE_FROZEN, /* Freeing to frozen slab */
376 FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */
377 FREE_REMOVE_PARTIAL, /* Freeing removes last object */
378 ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */
379 ALLOC_SLAB, /* Cpu slab acquired from page allocator */
380 ALLOC_REFILL, /* Refill cpu slab from slab freelist */
381 ALLOC_NODE_MISMATCH, /* Switching cpu slab */
382 FREE_SLAB, /* Slab freed to the page allocator */
383 CPUSLAB_FLUSH, /* Abandoning of the cpu slab */
384 DEACTIVATE_FULL, /* Cpu slab was full when deactivated */
385 DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */
386 DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */
387 DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */
388 DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */
389 DEACTIVATE_BYPASS, /* Implicit deactivation */
390 ORDER_FALLBACK, /* Number of times fallback was necessary */
391 CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */
392 CMPXCHG_DOUBLE_FAIL, /* Failures of slab freelist update */
393 CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */
394 CPU_PARTIAL_FREE, /* Refill cpu partial on free */
395 CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */
396 CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */
397 SHEAF_FLUSH, /* Objects flushed from a sheaf */
398 SHEAF_REFILL, /* Objects refilled to a sheaf */
399 SHEAF_ALLOC, /* Allocation of an empty sheaf */
400 SHEAF_FREE, /* Freeing of an empty sheaf */
401 BARN_GET, /* Got full sheaf from barn */
402 BARN_GET_FAIL, /* Failed to get full sheaf from barn */
403 BARN_PUT, /* Put full sheaf to barn */
404 BARN_PUT_FAIL, /* Failed to put full sheaf to barn */
405 SHEAF_PREFILL_FAST, /* Sheaf prefill grabbed the spare sheaf */
406 SHEAF_PREFILL_SLOW, /* Sheaf prefill found no spare sheaf */
407 SHEAF_PREFILL_OVERSIZE, /* Allocation of oversize sheaf for prefill */
408 SHEAF_RETURN_FAST, /* Sheaf return reattached spare sheaf */
409 SHEAF_RETURN_SLOW, /* Sheaf return could not reattach spare */
410 NR_SLUB_STAT_ITEMS
411};
412
413#ifndef CONFIG_SLUB_TINY
414/*
415 * When changing the layout, make sure freelist and tid are still compatible
416 * with this_cpu_cmpxchg_double() alignment requirements.
417 */
418struct kmem_cache_cpu {
419 union {
420 struct {
421 void **freelist; /* Pointer to next available object */
422 unsigned long tid; /* Globally unique transaction id */
423 };
424 freelist_aba_t freelist_tid;
425 };
426 struct slab *slab; /* The slab from which we are allocating */
427#ifdef CONFIG_SLUB_CPU_PARTIAL
428 struct slab *partial; /* Partially allocated slabs */
429#endif
430 local_trylock_t lock; /* Protects the fields above */
431#ifdef CONFIG_SLUB_STATS
432 unsigned int stat[NR_SLUB_STAT_ITEMS];
433#endif
434};
435#endif /* CONFIG_SLUB_TINY */
436
437static inline void stat(const struct kmem_cache *s, enum stat_item si)
438{
439#ifdef CONFIG_SLUB_STATS
440 /*
441 * The rmw is racy on a preemptible kernel but this is acceptable, so
442 * avoid this_cpu_add()'s irq-disable overhead.
443 */
444 raw_cpu_inc(s->cpu_slab->stat[si]);
445#endif
446}
447
448static inline
449void stat_add(const struct kmem_cache *s, enum stat_item si, int v)
450{
451#ifdef CONFIG_SLUB_STATS
452 raw_cpu_add(s->cpu_slab->stat[si], v);
453#endif
454}
455
456#define MAX_FULL_SHEAVES 10
457#define MAX_EMPTY_SHEAVES 10
458
459struct node_barn {
460 spinlock_t lock;
461 struct list_head sheaves_full;
462 struct list_head sheaves_empty;
463 unsigned int nr_full;
464 unsigned int nr_empty;
465};
466
467struct slab_sheaf {
468 union {
469 struct rcu_head rcu_head;
470 struct list_head barn_list;
		/* only used for prefilled sheaves */
472 unsigned int capacity;
473 };
474 struct kmem_cache *cache;
475 unsigned int size;
476 int node; /* only used for rcu_sheaf */
477 void *objects[];
478};
479
480struct slub_percpu_sheaves {
481 local_trylock_t lock;
482 struct slab_sheaf *main; /* never NULL when unlocked */
483 struct slab_sheaf *spare; /* empty or full, may be NULL */
484 struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */
485};
486
487/*
488 * The slab lists for all objects.
489 */
490struct kmem_cache_node {
491 spinlock_t list_lock;
492 unsigned long nr_partial;
493 struct list_head partial;
494#ifdef CONFIG_SLUB_DEBUG
495 atomic_long_t nr_slabs;
496 atomic_long_t total_objects;
497 struct list_head full;
498#endif
499 struct node_barn *barn;
500};
501
502static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
503{
504 return s->node[node];
505}
506
507/*
 * Get the barn of the current cpu's closest memory node. It may not exist
 * on systems that have memoryless nodes but lack CONFIG_HAVE_MEMORYLESS_NODES.
510 */
511static inline struct node_barn *get_barn(struct kmem_cache *s)
512{
	struct kmem_cache_node *n = get_node(s, numa_mem_id());
514
515 if (!n)
516 return NULL;
517
518 return n->barn;
519}
520
521/*
522 * Iterator over all nodes. The body will be executed for each node that has
523 * a kmem_cache_node structure allocated (which is true for all online nodes)
524 */
525#define for_each_kmem_cache_node(__s, __node, __n) \
526 for (__node = 0; __node < nr_node_ids; __node++) \
527 if ((__n = get_node(__s, __node)))
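
/*
 * Usage sketch (illustrative only):
 *
 *	struct kmem_cache_node *n;
 *	unsigned long total = 0;
 *	int node;
 *
 *	for_each_kmem_cache_node(s, node, n)
 *		total += n->nr_partial;
 */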
528
529/*
530 * Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
531 * Corresponds to node_state[N_MEMORY], but can temporarily
532 * differ during memory hotplug/hotremove operations.
533 * Protected by slab_mutex.
534 */
535static nodemask_t slab_nodes;
536
537/*
538 * Workqueue used for flush_cpu_slab().
539 */
540static struct workqueue_struct *flushwq;
541
542struct slub_flush_work {
543 struct work_struct work;
544 struct kmem_cache *s;
545 bool skip;
546};
547
548static DEFINE_MUTEX(flush_lock);
549static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
550
551/********************************************************************
552 * Core slab cache functions
553 *******************************************************************/
554
555/*
556 * Returns freelist pointer (ptr). With hardening, this is obfuscated
557 * with an XOR of the address where the pointer is held and a per-cache
558 * random number.
559 */
560static inline freeptr_t freelist_ptr_encode(const struct kmem_cache *s,
561 void *ptr, unsigned long ptr_addr)
562{
563 unsigned long encoded;
564
565#ifdef CONFIG_SLAB_FREELIST_HARDENED
566 encoded = (unsigned long)ptr ^ s->random ^ swab(ptr_addr);
567#else
568 encoded = (unsigned long)ptr;
569#endif
570 return (freeptr_t){.v = encoded};
571}
572
573static inline void *freelist_ptr_decode(const struct kmem_cache *s,
574 freeptr_t ptr, unsigned long ptr_addr)
575{
576 void *decoded;
577
578#ifdef CONFIG_SLAB_FREELIST_HARDENED
579 decoded = (void *)(ptr.v ^ s->random ^ swab(ptr_addr));
580#else
581 decoded = (void *)ptr.v;
582#endif
583 return decoded;
584}
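
/*
 * Round-trip sketch for the hardened case (values purely illustrative):
 * with per-cache random value R and the free pointer stored at address A,
 * an object pointer P is encoded as
 *
 *	stored  = P ^ R ^ swab(A);
 *
 * and decoded with the same two XORs, so stored ^ R ^ swab(A) == P again.
 * A leaked freelist entry therefore does not directly reveal a kernel
 * address, and a blind overwrite most likely decodes to a wild pointer.
 */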
585
586static inline void *get_freepointer(struct kmem_cache *s, void *object)
587{
588 unsigned long ptr_addr;
589 freeptr_t p;
590
	object = kasan_reset_tag(object);
	ptr_addr = (unsigned long)object + s->offset;
	p = *(freeptr_t *)(ptr_addr);
	return freelist_ptr_decode(s, p, ptr_addr);
595}
596
597#ifndef CONFIG_SLUB_TINY
598static void prefetch_freepointer(const struct kmem_cache *s, void *object)
599{
	prefetchw(object + s->offset);
601}
602#endif
603
604/*
605 * When running under KMSAN, get_freepointer_safe() may return an uninitialized
606 * pointer value in the case the current thread loses the race for the next
607 * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in
608 * slab_alloc_node() will fail, so the uninitialized value won't be used, but
609 * KMSAN will still check all arguments of cmpxchg because of imperfect
610 * handling of inline assembly.
611 * To work around this problem, we apply __no_kmsan_checks to ensure that
612 * get_freepointer_safe() returns initialized memory.
613 */
614__no_kmsan_checks
615static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
616{
617 unsigned long freepointer_addr;
618 freeptr_t p;
619
620 if (!debug_pagealloc_enabled_static())
621 return get_freepointer(s, object);
622
	object = kasan_reset_tag(object);
	freepointer_addr = (unsigned long)object + s->offset;
	copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p));
	return freelist_ptr_decode(s, p, freepointer_addr);
627}
628
629static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
630{
631 unsigned long freeptr_addr = (unsigned long)object + s->offset;
632
633#ifdef CONFIG_SLAB_FREELIST_HARDENED
634 BUG_ON(object == fp); /* naive detection of double free or corruption */
635#endif
636
	freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
	*(freeptr_t *)freeptr_addr = freelist_ptr_encode(s, fp, freeptr_addr);
639}
640
641/*
642 * See comment in calculate_sizes().
643 */
644static inline bool freeptr_outside_object(struct kmem_cache *s)
645{
646 return s->offset >= s->inuse;
647}
648
649/*
650 * Return offset of the end of info block which is inuse + free pointer if
651 * not overlapping with object.
652 */
653static inline unsigned int get_info_end(struct kmem_cache *s)
654{
655 if (freeptr_outside_object(s))
656 return s->inuse + sizeof(void *);
657 else
658 return s->inuse;
659}
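
/*
 * Example with made-up numbers: for a cache where s->inuse == 64 and the
 * free pointer lives outside the object (s->offset >= s->inuse),
 * get_info_end() returns 64 + sizeof(void *), i.e. 72 on 64-bit. That is
 * where the optional struct track pair and the original kmalloc request
 * size are placed; see set_orig_size() and check_pad_bytes() further down.
 */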
660
661/* Loop over all objects in a slab */
662#define for_each_object(__p, __s, __addr, __objects) \
663 for (__p = fixup_red_left(__s, __addr); \
664 __p < (__addr) + (__objects) * (__s)->size; \
665 __p += (__s)->size)
666
667static inline unsigned int order_objects(unsigned int order, unsigned int size)
668{
669 return ((unsigned int)PAGE_SIZE << order) / size;
670}
671
672static inline struct kmem_cache_order_objects oo_make(unsigned int order,
673 unsigned int size)
674{
675 struct kmem_cache_order_objects x = {
676 (order << OO_SHIFT) + order_objects(order, size)
677 };
678
679 return x;
680}
681
682static inline unsigned int oo_order(struct kmem_cache_order_objects x)
683{
684 return x.x >> OO_SHIFT;
685}
686
687static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
688{
689 return x.x & OO_MASK;
690}
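
/*
 * Worked example (illustrative numbers): with 4K pages, order 1 slabs of
 * 128-byte objects give order_objects(1, 128) == 8192 / 128 == 64, so
 * oo_make(1, 128) stores (1 << OO_SHIFT) + 64, from which oo_order() and
 * oo_objects() recover 1 and 64 respectively.
 */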
691
692#ifdef CONFIG_SLUB_CPU_PARTIAL
693static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
694{
695 unsigned int nr_slabs;
696
697 s->cpu_partial = nr_objects;
698
699 /*
700 * We take the number of objects but actually limit the number of
701 * slabs on the per cpu partial list, in order to limit excessive
702 * growth of the list. For simplicity we assume that the slabs will
703 * be half-full.
704 */
705 nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
706 s->cpu_partial_slabs = nr_slabs;
707}
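
/*
 * Example with made-up numbers: for nr_objects == 30 and
 * oo_objects(s->oo) == 16, the half-full assumption above gives
 * DIV_ROUND_UP(30 * 2, 16) == 4 slabs allowed on the per cpu partial list.
 */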
708
709static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
710{
711 return s->cpu_partial_slabs;
712}
713#else
714static inline void
715slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
716{
717}
718
719static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
720{
721 return 0;
722}
723#endif /* CONFIG_SLUB_CPU_PARTIAL */
724
725/*
 * If network-based swap is enabled, slub must keep track of whether memory
 * was allocated from pfmemalloc reserves.
728 */
729static inline bool slab_test_pfmemalloc(const struct slab *slab)
730{
731 return test_bit(SL_pfmemalloc, &slab->flags.f);
732}
733
734static inline void slab_set_pfmemalloc(struct slab *slab)
735{
	set_bit(SL_pfmemalloc, &slab->flags.f);
737}
738
739static inline void __slab_clear_pfmemalloc(struct slab *slab)
740{
741 __clear_bit(SL_pfmemalloc, &slab->flags.f);
742}
743
744/*
745 * Per slab locking using the pagelock
746 */
747static __always_inline void slab_lock(struct slab *slab)
748{
	bit_spin_lock(SL_locked, &slab->flags.f);
750}
751
752static __always_inline void slab_unlock(struct slab *slab)
753{
	bit_spin_unlock(SL_locked, &slab->flags.f);
755}
756
757static inline bool
758__update_freelist_fast(struct slab *slab,
759 void *freelist_old, unsigned long counters_old,
760 void *freelist_new, unsigned long counters_new)
761{
762#ifdef system_has_freelist_aba
763 freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old };
764 freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new };
765
766 return try_cmpxchg_freelist(&slab->freelist_counter.full, &old.full, new.full);
767#else
768 return false;
769#endif
770}
771
772static inline bool
773__update_freelist_slow(struct slab *slab,
774 void *freelist_old, unsigned long counters_old,
775 void *freelist_new, unsigned long counters_new)
776{
777 bool ret = false;
778
779 slab_lock(slab);
780 if (slab->freelist == freelist_old &&
781 slab->counters == counters_old) {
782 slab->freelist = freelist_new;
783 slab->counters = counters_new;
784 ret = true;
785 }
786 slab_unlock(slab);
787
788 return ret;
789}
790
791/*
792 * Interrupts must be disabled (for the fallback code to work right), typically
793 * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is
794 * part of bit_spin_lock(), is sufficient because the policy is not to allow any
 * allocation/free operation in hardirq context. Therefore nothing can
796 * interrupt the operation.
797 */
798static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab,
799 void *freelist_old, unsigned long counters_old,
800 void *freelist_new, unsigned long counters_new,
801 const char *n)
802{
803 bool ret;
804
805 if (USE_LOCKLESS_FAST_PATH())
806 lockdep_assert_irqs_disabled();
807
808 if (s->flags & __CMPXCHG_DOUBLE) {
809 ret = __update_freelist_fast(slab, freelist_old, counters_old,
810 freelist_new, counters_new);
811 } else {
812 ret = __update_freelist_slow(slab, freelist_old, counters_old,
813 freelist_new, counters_new);
814 }
815 if (likely(ret))
816 return true;
817
818 cpu_relax();
	stat(s, CMPXCHG_DOUBLE_FAIL);
820
821#ifdef SLUB_DEBUG_CMPXCHG
822 pr_info("%s %s: cmpxchg double redo ", n, s->name);
823#endif
824
825 return false;
826}
827
828static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
829 void *freelist_old, unsigned long counters_old,
830 void *freelist_new, unsigned long counters_new,
831 const char *n)
832{
833 bool ret;
834
835 if (s->flags & __CMPXCHG_DOUBLE) {
836 ret = __update_freelist_fast(slab, freelist_old, counters_old,
837 freelist_new, counters_new);
838 } else {
839 unsigned long flags;
840
841 local_irq_save(flags);
842 ret = __update_freelist_slow(slab, freelist_old, counters_old,
843 freelist_new, counters_new);
844 local_irq_restore(flags);
845 }
846 if (likely(ret))
847 return true;
848
849 cpu_relax();
	stat(s, CMPXCHG_DOUBLE_FAIL);
851
852#ifdef SLUB_DEBUG_CMPXCHG
853 pr_info("%s %s: cmpxchg double redo ", n, s->name);
854#endif
855
856 return false;
857}
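
/*
 * Simplified sketch of how the helpers above are used on the free side
 * (not the actual code; see __slab_free() for the real thing):
 *
 *	struct slab old, new;
 *
 *	do {
 *		old.freelist = READ_ONCE(slab->freelist);
 *		old.counters = READ_ONCE(slab->counters);
 *		new.counters = old.counters;
 *		new.inuse -= cnt;
 *		set_freepointer(s, tail, old.freelist);
 *	} while (!slab_update_freelist(s, slab,
 *					old.freelist, old.counters,
 *					head, new.counters,
 *					"__slab_free"));
 */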
858
859/*
 * kmalloc caches have fixed sizes (mostly powers of 2), and the kmalloc() API
 * family rounds the real request size up to these fixed ones, so the object
 * may contain more space than was requested. Save the original request size
 * in the metadata area, for better debugging and sanity checks.
864 */
865static inline void set_orig_size(struct kmem_cache *s,
866 void *object, unsigned int orig_size)
867{
	void *p = kasan_reset_tag(object);
869
870 if (!slub_debug_orig_size(s))
871 return;
872
873 p += get_info_end(s);
874 p += sizeof(struct track) * 2;
875
876 *(unsigned int *)p = orig_size;
877}
878
879static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
880{
	void *p = kasan_reset_tag(object);

	if (is_kfence_address(object))
		return kfence_ksize(object);
885
886 if (!slub_debug_orig_size(s))
887 return s->object_size;
888
889 p += get_info_end(s);
890 p += sizeof(struct track) * 2;
891
892 return *(unsigned int *)p;
893}
894
895#ifdef CONFIG_SLUB_DEBUG
896
897/*
 * For debugging contexts where we want to check whether a struct slab
 * pointer appears to be valid.
900 */
901static inline bool validate_slab_ptr(struct slab *slab)
902{
903 return PageSlab(slab_page(slab));
904}
905
906static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
907static DEFINE_SPINLOCK(object_map_lock);
908
909static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
910 struct slab *slab)
911{
912 void *addr = slab_address(slab);
913 void *p;
914
	bitmap_zero(obj_map, slab->objects);

	for (p = slab->freelist; p; p = get_freepointer(s, p))
		set_bit(__obj_to_index(s, addr, p), obj_map);
919}
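
/*
 * Illustrative use of the map (the real users take object_map_lock when
 * filling the shared object_map above; they appear later in this file).
 * A set bit means the object is currently on the freelist:
 *
 *	__fill_map(object_map, s, slab);
 *	for_each_object(p, s, slab_address(slab), slab->objects)
 *		if (test_bit(__obj_to_index(s, slab_address(slab), p),
 *			     object_map))
 *			;	/* p is free */
 */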
920
921#if IS_ENABLED(CONFIG_KUNIT)
922static bool slab_add_kunit_errors(void)
923{
924 struct kunit_resource *resource;
925
926 if (!kunit_get_current_test())
927 return false;
928
929 resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
930 if (!resource)
931 return false;
932
933 (*(int *)resource->data)++;
934 kunit_put_resource(resource);
935 return true;
936}
937
938bool slab_in_kunit_test(void)
939{
940 struct kunit_resource *resource;
941
942 if (!kunit_get_current_test())
943 return false;
944
945 resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
946 if (!resource)
947 return false;
948
949 kunit_put_resource(resource);
950 return true;
951}
952#else
953static inline bool slab_add_kunit_errors(void) { return false; }
954#endif
955
956static inline unsigned int size_from_object(struct kmem_cache *s)
957{
958 if (s->flags & SLAB_RED_ZONE)
959 return s->size - s->red_left_pad;
960
961 return s->size;
962}
963
964static inline void *restore_red_left(struct kmem_cache *s, void *p)
965{
966 if (s->flags & SLAB_RED_ZONE)
967 p -= s->red_left_pad;
968
969 return p;
970}
971
972/*
973 * Debug settings:
974 */
975#if defined(CONFIG_SLUB_DEBUG_ON)
976static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
977#else
978static slab_flags_t slub_debug;
979#endif
980
981static char *slub_debug_string;
982static int disable_higher_order_debug;
983
984/*
985 * slub is about to manipulate internal object metadata. This memory lies
986 * outside the range of the allocated object, so accessing it would normally
987 * be reported by kasan as a bounds error. metadata_access_enable() is used
988 * to tell kasan that these accesses are OK.
989 */
990static inline void metadata_access_enable(void)
991{
992 kasan_disable_current();
993 kmsan_disable_current();
994}
995
996static inline void metadata_access_disable(void)
997{
998 kmsan_enable_current();
999 kasan_enable_current();
1000}
1001
1002/*
1003 * Object debugging
1004 */
1005
1006/* Verify that a pointer has an address that is valid within a slab page */
1007static inline int check_valid_pointer(struct kmem_cache *s,
1008 struct slab *slab, void *object)
1009{
1010 void *base;
1011
1012 if (!object)
1013 return 1;
1014
1015 base = slab_address(slab);
	object = kasan_reset_tag(object);
	object = restore_red_left(s, object);
1018 if (object < base || object >= base + slab->objects * s->size ||
1019 (object - base) % s->size) {
1020 return 0;
1021 }
1022
1023 return 1;
1024}
1025
1026static void print_section(char *level, char *text, u8 *addr,
1027 unsigned int length)
1028{
1029 metadata_access_enable();
	print_hex_dump(level, text, DUMP_PREFIX_ADDRESS,
			16, 1, kasan_reset_tag((void *)addr), length, 1);
1032 metadata_access_disable();
1033}
1034
1035static struct track *get_track(struct kmem_cache *s, void *object,
1036 enum track_item alloc)
1037{
1038 struct track *p;
1039
1040 p = object + get_info_end(s);
1041
	return kasan_reset_tag(p + alloc);
1043}
1044
1045#ifdef CONFIG_STACKDEPOT
1046static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
1047{
1048 depot_stack_handle_t handle;
1049 unsigned long entries[TRACK_ADDRS_COUNT];
1050 unsigned int nr_entries;
1051
	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
	handle = stack_depot_save(entries, nr_entries, gfp_flags);
1054
1055 return handle;
1056}
1057#else
1058static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
1059{
1060 return 0;
1061}
1062#endif
1063
1064static void set_track_update(struct kmem_cache *s, void *object,
1065 enum track_item alloc, unsigned long addr,
1066 depot_stack_handle_t handle)
1067{
1068 struct track *p = get_track(s, object, alloc);
1069
1070#ifdef CONFIG_STACKDEPOT
1071 p->handle = handle;
1072#endif
1073 p->addr = addr;
1074 p->cpu = smp_processor_id();
1075 p->pid = current->pid;
1076 p->when = jiffies;
1077}
1078
1079static __always_inline void set_track(struct kmem_cache *s, void *object,
1080 enum track_item alloc, unsigned long addr, gfp_t gfp_flags)
1081{
1082 depot_stack_handle_t handle = set_track_prepare(gfp_flags);
1083
1084 set_track_update(s, object, alloc, addr, handle);
1085}
1086
1087static void init_tracking(struct kmem_cache *s, void *object)
1088{
1089 struct track *p;
1090
1091 if (!(s->flags & SLAB_STORE_USER))
1092 return;
1093
	p = get_track(s, object, TRACK_ALLOC);
	memset(p, 0, 2*sizeof(struct track));
1096}
1097
1098static void print_track(const char *s, struct track *t, unsigned long pr_time)
1099{
1100 depot_stack_handle_t handle __maybe_unused;
1101
1102 if (!t->addr)
1103 return;
1104
1105 pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
1106 s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
1107#ifdef CONFIG_STACKDEPOT
1108 handle = READ_ONCE(t->handle);
1109 if (handle)
		stack_depot_print(handle);
1111 else
1112 pr_err("object allocation/free stack trace missing\n");
1113#endif
1114}
1115
1116void print_tracking(struct kmem_cache *s, void *object)
1117{
1118 unsigned long pr_time = jiffies;
1119 if (!(s->flags & SLAB_STORE_USER))
1120 return;
1121
	print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time);
	print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
1124}
1125
1126static void print_slab_info(const struct slab *slab)
1127{
1128 pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
1129 slab, slab->objects, slab->inuse, slab->freelist,
1130 &slab->flags.f);
1131}
1132
1133void skip_orig_size_check(struct kmem_cache *s, const void *object)
1134{
	set_orig_size(s, (void *)object, s->object_size);
1136}
1137
1138static void __slab_bug(struct kmem_cache *s, const char *fmt, va_list argsp)
1139{
1140 struct va_format vaf;
1141 va_list args;
1142
1143 va_copy(args, argsp);
1144 vaf.fmt = fmt;
1145 vaf.va = &args;
1146 pr_err("=============================================================================\n");
1147 pr_err("BUG %s (%s): %pV\n", s ? s->name : "<unknown>", print_tainted(), &vaf);
1148 pr_err("-----------------------------------------------------------------------------\n\n");
1149 va_end(args);
1150}
1151
1152static void slab_bug(struct kmem_cache *s, const char *fmt, ...)
1153{
1154 va_list args;
1155
1156 va_start(args, fmt);
	__slab_bug(s, fmt, args);
1158 va_end(args);
1159}
1160
1161__printf(2, 3)
1162static void slab_fix(struct kmem_cache *s, const char *fmt, ...)
1163{
1164 struct va_format vaf;
1165 va_list args;
1166
1167 if (slab_add_kunit_errors())
1168 return;
1169
1170 va_start(args, fmt);
1171 vaf.fmt = fmt;
1172 vaf.va = &args;
1173 pr_err("FIX %s: %pV\n", s->name, &vaf);
1174 va_end(args);
1175}
1176
1177static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
1178{
1179 unsigned int off; /* Offset of last byte */
1180 u8 *addr = slab_address(slab);
1181
1182 print_tracking(s, object: p);
1183
1184 print_slab_info(slab);
1185
1186 pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n",
1187 p, p - addr, get_freepointer(s, p));
1188
1189 if (s->flags & SLAB_RED_ZONE)
1190 print_section(KERN_ERR, text: "Redzone ", addr: p - s->red_left_pad,
1191 length: s->red_left_pad);
1192 else if (p > addr + 16)
1193 print_section(KERN_ERR, text: "Bytes b4 ", addr: p - 16, length: 16);
1194
1195 print_section(KERN_ERR, text: "Object ", addr: p,
1196 min_t(unsigned int, s->object_size, PAGE_SIZE));
1197 if (s->flags & SLAB_RED_ZONE)
1198 print_section(KERN_ERR, text: "Redzone ", addr: p + s->object_size,
1199 length: s->inuse - s->object_size);
1200
1201 off = get_info_end(s);
1202
1203 if (s->flags & SLAB_STORE_USER)
1204 off += 2 * sizeof(struct track);
1205
1206 if (slub_debug_orig_size(s))
1207 off += sizeof(unsigned int);
1208
1209 off += kasan_metadata_size(cache: s, in_object: false);
1210
1211 if (off != size_from_object(s))
1212 /* Beginning of the filler is the free pointer */
1213 print_section(KERN_ERR, text: "Padding ", addr: p + off,
1214 length: size_from_object(s) - off);
1215}
1216
1217static void object_err(struct kmem_cache *s, struct slab *slab,
1218 u8 *object, const char *reason)
1219{
1220 if (slab_add_kunit_errors())
1221 return;
1222
1223 slab_bug(s, fmt: reason);
1224 if (!object || !check_valid_pointer(s, slab, object)) {
1225 print_slab_info(slab);
1226 pr_err("Invalid pointer 0x%p\n", object);
1227 } else {
1228 print_trailer(s, slab, p: object);
1229 }
1230 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
1231
1232 WARN_ON(1);
1233}
1234
1235static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
1236 void **freelist, void *nextfree)
1237{
1238 if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
1239 !check_valid_pointer(s, slab, object: nextfree) && freelist) {
1240 object_err(s, slab, object: *freelist, reason: "Freechain corrupt");
1241 *freelist = NULL;
1242 slab_fix(s, fmt: "Isolate corrupted freechain");
1243 return true;
1244 }
1245
1246 return false;
1247}
1248
1249static void __slab_err(struct slab *slab)
1250{
1251 if (slab_in_kunit_test())
1252 return;
1253
1254 print_slab_info(slab);
1255 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
1256
1257 WARN_ON(1);
1258}
1259
1260static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
1261 const char *fmt, ...)
1262{
1263 va_list args;
1264
1265 if (slab_add_kunit_errors())
1266 return;
1267
1268 va_start(args, fmt);
1269 __slab_bug(s, fmt, argsp: args);
1270 va_end(args);
1271
1272 __slab_err(slab);
1273}
1274
1275static void init_object(struct kmem_cache *s, void *object, u8 val)
1276{
1277 u8 *p = kasan_reset_tag(addr: object);
1278 unsigned int poison_size = s->object_size;
1279
1280 if (s->flags & SLAB_RED_ZONE) {
1281 /*
1282 * Here and below, avoid overwriting the KMSAN shadow. Keeping
1283 * the shadow makes it possible to distinguish uninit-value
1284 * from use-after-free.
1285 */
1286 memset_no_sanitize_memory(s: p - s->red_left_pad, c: val,
1287 n: s->red_left_pad);
1288
1289 if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
1290 /*
1291 * Redzone the extra allocated space by kmalloc than
1292 * requested, and the poison size will be limited to
1293 * the original request size accordingly.
1294 */
1295 poison_size = get_orig_size(s, object);
1296 }
1297 }
1298
1299 if (s->flags & __OBJECT_POISON) {
1300 memset_no_sanitize_memory(s: p, POISON_FREE, n: poison_size - 1);
1301 memset_no_sanitize_memory(s: p + poison_size - 1, POISON_END, n: 1);
1302 }
1303
1304 if (s->flags & SLAB_RED_ZONE)
1305 memset_no_sanitize_memory(s: p + poison_size, c: val,
1306 n: s->inuse - poison_size);
1307}
1308
1309static void restore_bytes(struct kmem_cache *s, const char *message, u8 data,
1310 void *from, void *to)
1311{
	slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
	memset(from, data, to - from);
1314}
1315
1316#ifdef CONFIG_KMSAN
1317#define pad_check_attributes noinline __no_kmsan_checks
1318#else
1319#define pad_check_attributes
1320#endif
1321
1322static pad_check_attributes int
1323check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
1324 u8 *object, const char *what, u8 *start, unsigned int value,
1325 unsigned int bytes, bool slab_obj_print)
1326{
1327 u8 *fault;
1328 u8 *end;
1329 u8 *addr = slab_address(slab);
1330
1331 metadata_access_enable();
1332 fault = memchr_inv(s: kasan_reset_tag(addr: start), c: value, n: bytes);
1333 metadata_access_disable();
1334 if (!fault)
1335 return 1;
1336
1337 end = start + bytes;
1338 while (end > fault && end[-1] == value)
1339 end--;
1340
1341 if (slab_add_kunit_errors())
1342 goto skip_bug_print;
1343
1344 pr_err("[%s overwritten] 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
1345 what, fault, end - 1, fault - addr, fault[0], value);
1346
1347 if (slab_obj_print)
1348 object_err(s, slab, object, reason: "Object corrupt");
1349
1350skip_bug_print:
1351 restore_bytes(s, message: what, data: value, from: fault, to: end);
1352 return 0;
1353}
1354
1355/*
1356 * Object layout:
1357 *
1358 * object address
1359 * Bytes of the object to be managed.
1360 * If the freepointer may overlay the object then the free
1361 * pointer is at the middle of the object.
1362 *
1363 * Poisoning uses 0x6b (POISON_FREE) and the last byte is
1364 * 0xa5 (POISON_END)
1365 *
1366 * object + s->object_size
1367 * Padding to reach word boundary. This is also used for Redzoning.
1368 * Padding is extended by another word if Redzoning is enabled and
1369 * object_size == inuse.
1370 *
1371 * We fill with 0xbb (SLUB_RED_INACTIVE) for inactive objects and with
1372 * 0xcc (SLUB_RED_ACTIVE) for objects in use.
1373 *
1374 * object + s->inuse
1375 * Meta data starts here.
1376 *
1377 * A. Free pointer (if we cannot overwrite object on free)
1378 * B. Tracking data for SLAB_STORE_USER
1379 * C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
1380 * D. Padding to reach required alignment boundary or at minimum
1381 * one word if debugging is on to be able to detect writes
1382 * before the word boundary.
1383 *
1384 * Padding is done using 0x5a (POISON_INUSE)
1385 *
1386 * object + s->size
1387 * Nothing is used beyond s->size.
1388 *
1389 * If slabcaches are merged then the object_size and inuse boundaries are mostly
1390 * ignored. And therefore no slab options that rely on these boundaries
1391 * may be used with merged slabcaches.
1392 */
1393
1394static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
1395{
1396 unsigned long off = get_info_end(s); /* The end of info */
1397
1398 if (s->flags & SLAB_STORE_USER) {
1399 /* We also have user information there */
1400 off += 2 * sizeof(struct track);
1401
1402 if (s->flags & SLAB_KMALLOC)
1403 off += sizeof(unsigned int);
1404 }
1405
1406 off += kasan_metadata_size(cache: s, in_object: false);
1407
1408 if (size_from_object(s) == off)
1409 return 1;
1410
1411 return check_bytes_and_report(s, slab, object: p, what: "Object padding",
1412 start: p + off, POISON_INUSE, bytes: size_from_object(s) - off, slab_obj_print: true);
1413}
1414
1415/* Check the pad bytes at the end of a slab page */
1416static pad_check_attributes void
1417slab_pad_check(struct kmem_cache *s, struct slab *slab)
1418{
1419 u8 *start;
1420 u8 *fault;
1421 u8 *end;
1422 u8 *pad;
1423 int length;
1424 int remainder;
1425
1426 if (!(s->flags & SLAB_POISON))
1427 return;
1428
1429 start = slab_address(slab);
1430 length = slab_size(slab);
1431 end = start + length;
1432 remainder = length % s->size;
1433 if (!remainder)
1434 return;
1435
1436 pad = end - remainder;
1437 metadata_access_enable();
1438 fault = memchr_inv(s: kasan_reset_tag(addr: pad), POISON_INUSE, n: remainder);
1439 metadata_access_disable();
1440 if (!fault)
1441 return;
1442 while (end > fault && end[-1] == POISON_INUSE)
1443 end--;
1444
1445 slab_bug(s, fmt: "Padding overwritten. 0x%p-0x%p @offset=%tu",
1446 fault, end - 1, fault - start);
1447 print_section(KERN_ERR, text: "Padding ", addr: pad, length: remainder);
1448 __slab_err(slab);
1449
1450 restore_bytes(s, message: "slab padding", POISON_INUSE, from: fault, to: end);
1451}
1452
1453static int check_object(struct kmem_cache *s, struct slab *slab,
1454 void *object, u8 val)
1455{
1456 u8 *p = object;
1457 u8 *endobject = object + s->object_size;
1458 unsigned int orig_size, kasan_meta_size;
1459 int ret = 1;
1460
1461 if (s->flags & SLAB_RED_ZONE) {
1462 if (!check_bytes_and_report(s, slab, object, what: "Left Redzone",
1463 start: object - s->red_left_pad, value: val, bytes: s->red_left_pad, slab_obj_print: ret))
1464 ret = 0;
1465
1466 if (!check_bytes_and_report(s, slab, object, what: "Right Redzone",
1467 start: endobject, value: val, bytes: s->inuse - s->object_size, slab_obj_print: ret))
1468 ret = 0;
1469
1470 if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
1471 orig_size = get_orig_size(s, object);
1472
1473 if (s->object_size > orig_size &&
1474 !check_bytes_and_report(s, slab, object,
1475 what: "kmalloc Redzone", start: p + orig_size,
1476 value: val, bytes: s->object_size - orig_size, slab_obj_print: ret)) {
1477 ret = 0;
1478 }
1479 }
1480 } else {
1481 if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
1482 if (!check_bytes_and_report(s, slab, object: p, what: "Alignment padding",
1483 start: endobject, POISON_INUSE,
1484 bytes: s->inuse - s->object_size, slab_obj_print: ret))
1485 ret = 0;
1486 }
1487 }
1488
1489 if (s->flags & SLAB_POISON) {
1490 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON)) {
1491 /*
1492 * KASAN can save its free meta data inside of the
1493 * object at offset 0. Thus, skip checking the part of
1494 * the redzone that overlaps with the meta data.
1495 */
1496 kasan_meta_size = kasan_metadata_size(cache: s, in_object: true);
1497 if (kasan_meta_size < s->object_size - 1 &&
1498 !check_bytes_and_report(s, slab, object: p, what: "Poison",
1499 start: p + kasan_meta_size, POISON_FREE,
1500 bytes: s->object_size - kasan_meta_size - 1, slab_obj_print: ret))
1501 ret = 0;
1502 if (kasan_meta_size < s->object_size &&
1503 !check_bytes_and_report(s, slab, object: p, what: "End Poison",
1504 start: p + s->object_size - 1, POISON_END, bytes: 1, slab_obj_print: ret))
1505 ret = 0;
1506 }
1507 /*
1508 * check_pad_bytes cleans up on its own.
1509 */
1510 if (!check_pad_bytes(s, slab, p))
1511 ret = 0;
1512 }
1513
1514 /*
1515 * Cannot check freepointer while object is allocated if
1516 * object and freepointer overlap.
1517 */
1518 if ((freeptr_outside_object(s) || val != SLUB_RED_ACTIVE) &&
1519 !check_valid_pointer(s, slab, object: get_freepointer(s, object: p))) {
1520 object_err(s, slab, object: p, reason: "Freepointer corrupt");
1521 /*
1522 * No choice but to zap it and thus lose the remainder
1523 * of the free objects in this slab. May cause
1524 * another error because the object count is now wrong.
1525 */
1526 set_freepointer(s, object: p, NULL);
1527 ret = 0;
1528 }
1529
1530 return ret;
1531}
1532
1533/*
1534 * Checks if the slab state looks sane. Assumes the struct slab pointer
1535 * was either obtained in a way that ensures it's valid, or validated
1536 * by validate_slab_ptr()
1537 */
1538static int check_slab(struct kmem_cache *s, struct slab *slab)
1539{
1540 int maxobj;
1541
1542 maxobj = order_objects(order: slab_order(slab), size: s->size);
1543 if (slab->objects > maxobj) {
1544 slab_err(s, slab, fmt: "objects %u > max %u",
1545 slab->objects, maxobj);
1546 return 0;
1547 }
1548 if (slab->inuse > slab->objects) {
1549 slab_err(s, slab, fmt: "inuse %u > max %u",
1550 slab->inuse, slab->objects);
1551 return 0;
1552 }
1553 if (slab->frozen) {
1554 slab_err(s, slab, fmt: "Slab disabled since SLUB metadata consistency check failed");
1555 return 0;
1556 }
1557
1558 /* Slab_pad_check fixes things up after itself */
1559 slab_pad_check(s, slab);
1560 return 1;
1561}
1562
1563/*
1564 * Determine if a certain object in a slab is on the freelist. Must hold the
1565 * slab lock to guarantee that the chains are in a consistent state.
1566 */
1567static bool on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
1568{
1569 int nr = 0;
1570 void *fp;
1571 void *object = NULL;
1572 int max_objects;
1573
1574 fp = slab->freelist;
1575 while (fp && nr <= slab->objects) {
1576 if (fp == search)
1577 return true;
1578 if (!check_valid_pointer(s, slab, object: fp)) {
1579 if (object) {
1580 object_err(s, slab, object,
1581 reason: "Freechain corrupt");
1582 set_freepointer(s, object, NULL);
1583 break;
1584 } else {
1585 slab_err(s, slab, fmt: "Freepointer corrupt");
1586 slab->freelist = NULL;
1587 slab->inuse = slab->objects;
1588 slab_fix(s, fmt: "Freelist cleared");
1589 return false;
1590 }
1591 }
1592 object = fp;
1593 fp = get_freepointer(s, object);
1594 nr++;
1595 }
1596
1597 if (nr > slab->objects) {
1598 slab_err(s, slab, fmt: "Freelist cycle detected");
1599 slab->freelist = NULL;
1600 slab->inuse = slab->objects;
1601 slab_fix(s, fmt: "Freelist cleared");
1602 return false;
1603 }
1604
1605 max_objects = order_objects(order: slab_order(slab), size: s->size);
1606 if (max_objects > MAX_OBJS_PER_PAGE)
1607 max_objects = MAX_OBJS_PER_PAGE;
1608
1609 if (slab->objects != max_objects) {
1610 slab_err(s, slab, fmt: "Wrong number of objects. Found %d but should be %d",
1611 slab->objects, max_objects);
1612 slab->objects = max_objects;
1613 slab_fix(s, fmt: "Number of objects adjusted");
1614 }
1615 if (slab->inuse != slab->objects - nr) {
1616 slab_err(s, slab, fmt: "Wrong object count. Counter is %d but counted were %d",
1617 slab->inuse, slab->objects - nr);
1618 slab->inuse = slab->objects - nr;
1619 slab_fix(s, fmt: "Object count adjusted");
1620 }
1621 return search == NULL;
1622}
1623
1624static void trace(struct kmem_cache *s, struct slab *slab, void *object,
1625 int alloc)
1626{
1627 if (s->flags & SLAB_TRACE) {
1628 pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
1629 s->name,
1630 alloc ? "alloc" : "free",
1631 object, slab->inuse,
1632 slab->freelist);
1633
1634 if (!alloc)
1635 print_section(KERN_INFO, text: "Object ", addr: (void *)object,
1636 length: s->object_size);
1637
1638 dump_stack();
1639 }
1640}
1641
1642/*
1643 * Tracking of fully allocated slabs for debugging purposes.
1644 */
1645static void add_full(struct kmem_cache *s,
1646 struct kmem_cache_node *n, struct slab *slab)
1647{
1648 if (!(s->flags & SLAB_STORE_USER))
1649 return;
1650
1651 lockdep_assert_held(&n->list_lock);
	list_add(&slab->slab_list, &n->full);
1653}
1654
1655static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab)
1656{
1657 if (!(s->flags & SLAB_STORE_USER))
1658 return;
1659
1660 lockdep_assert_held(&n->list_lock);
	list_del(&slab->slab_list);
1662}
1663
1664static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
1665{
	return atomic_long_read(&n->nr_slabs);
1667}
1668
1669static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
1670{
1671 struct kmem_cache_node *n = get_node(s, node);
1672
	atomic_long_inc(&n->nr_slabs);
	atomic_long_add(objects, &n->total_objects);
1675}
1676static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
1677{
1678 struct kmem_cache_node *n = get_node(s, node);
1679
	atomic_long_dec(&n->nr_slabs);
	atomic_long_sub(objects, &n->total_objects);
1682}
1683
1684/* Object debug checks for alloc/free paths */
1685static void setup_object_debug(struct kmem_cache *s, void *object)
1686{
1687 if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
1688 return;
1689
1690 init_object(s, object, SLUB_RED_INACTIVE);
1691 init_tracking(s, object);
1692}
1693
1694static
1695void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr)
1696{
1697 if (!kmem_cache_debug_flags(s, SLAB_POISON))
1698 return;
1699
1700 metadata_access_enable();
	memset(kasan_reset_tag(addr), POISON_INUSE, slab_size(slab));
1702 metadata_access_disable();
1703}
1704
1705static inline int alloc_consistency_checks(struct kmem_cache *s,
1706 struct slab *slab, void *object)
1707{
1708 if (!check_slab(s, slab))
1709 return 0;
1710
1711 if (!check_valid_pointer(s, slab, object)) {
1712 object_err(s, slab, object, reason: "Freelist Pointer check fails");
1713 return 0;
1714 }
1715
1716 if (!check_object(s, slab, object, SLUB_RED_INACTIVE))
1717 return 0;
1718
1719 return 1;
1720}
1721
1722static noinline bool alloc_debug_processing(struct kmem_cache *s,
1723 struct slab *slab, void *object, int orig_size)
1724{
1725 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
1726 if (!alloc_consistency_checks(s, slab, object))
1727 goto bad;
1728 }
1729
1730 /* Success. Perform special debug activities for allocs */
1731 trace(s, slab, object, alloc: 1);
1732 set_orig_size(s, object, orig_size);
1733 init_object(s, object, SLUB_RED_ACTIVE);
1734 return true;
1735
1736bad:
1737 /*
1738 * Let's do the best we can to avoid issues in the future. Marking all
1739 * objects as used avoids touching the remaining objects.
1740 */
1741 slab_fix(s, fmt: "Marking all objects used");
1742 slab->inuse = slab->objects;
1743 slab->freelist = NULL;
1744 slab->frozen = 1; /* mark consistency-failed slab as frozen */
1745
1746 return false;
1747}
1748
1749static inline int free_consistency_checks(struct kmem_cache *s,
1750 struct slab *slab, void *object, unsigned long addr)
1751{
1752 if (!check_valid_pointer(s, slab, object)) {
1753 slab_err(s, slab, fmt: "Invalid object pointer 0x%p", object);
1754 return 0;
1755 }
1756
1757 if (on_freelist(s, slab, search: object)) {
1758 object_err(s, slab, object, reason: "Object already free");
1759 return 0;
1760 }
1761
1762 if (!check_object(s, slab, object, SLUB_RED_ACTIVE))
1763 return 0;
1764
1765 if (unlikely(s != slab->slab_cache)) {
1766 if (!slab->slab_cache) {
1767 slab_err(NULL, slab, fmt: "No slab cache for object 0x%p",
1768 object);
1769 } else {
1770 object_err(s, slab, object,
1771 reason: "page slab pointer corrupt.");
1772 }
1773 return 0;
1774 }
1775 return 1;
1776}
1777
1778/*
1779 * Parse a block of slab_debug options. Blocks are delimited by ';'
1780 *
1781 * @str: start of block
1782 * @flags: returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
1783 * @slabs: return start of list of slabs, or NULL when there's no list
1784 * @init: assume this is initial parsing and not per-kmem-create parsing
1785 *
1786 * returns the start of next block if there's any, or NULL
1787 */
1788static char *
1789parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init)
1790{
1791 bool higher_order_disable = false;
1792
1793 /* Skip any completely empty blocks */
1794 while (*str && *str == ';')
1795 str++;
1796
1797 if (*str == ',') {
1798 /*
1799 * No options but restriction on slabs. This means full
1800 * debugging for slabs matching a pattern.
1801 */
1802 *flags = DEBUG_DEFAULT_FLAGS;
1803 goto check_slabs;
1804 }
1805 *flags = 0;
1806
1807 /* Determine which debug features should be switched on */
1808 for (; *str && *str != ',' && *str != ';'; str++) {
1809 switch (tolower(*str)) {
1810 case '-':
1811 *flags = 0;
1812 break;
1813 case 'f':
1814 *flags |= SLAB_CONSISTENCY_CHECKS;
1815 break;
1816 case 'z':
1817 *flags |= SLAB_RED_ZONE;
1818 break;
1819 case 'p':
1820 *flags |= SLAB_POISON;
1821 break;
1822 case 'u':
1823 *flags |= SLAB_STORE_USER;
1824 break;
1825 case 't':
1826 *flags |= SLAB_TRACE;
1827 break;
1828 case 'a':
1829 *flags |= SLAB_FAILSLAB;
1830 break;
1831 case 'o':
1832 /*
1833 * Avoid enabling debugging on caches if its minimum
1834 * order would increase as a result.
1835 */
1836 higher_order_disable = true;
1837 break;
1838 default:
1839 if (init)
1840 pr_err("slab_debug option '%c' unknown. skipped\n", *str);
1841 }
1842 }
1843check_slabs:
1844 if (*str == ',')
1845 *slabs = ++str;
1846 else
1847 *slabs = NULL;
1848
1849 /* Skip over the slab list */
1850 while (*str && *str != ';')
1851 str++;
1852
1853 /* Skip any completely empty blocks */
1854 while (*str && *str == ';')
1855 str++;
1856
1857 if (init && higher_order_disable)
1858 disable_higher_order_debug = 1;
1859
1860 if (*str)
1861 return str;
1862 else
1863 return NULL;
1864}
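/*
 * For illustration, a hedged example of how the parser above splits a
 * command line (the cache names are arbitrary examples):
 *
 *   slab_debug=FZ;P,kmalloc-64,dentry
 *
 * is handled as two blocks:
 *   "FZ"                  - no slab list, so SLAB_CONSISTENCY_CHECKS and
 *                           SLAB_RED_ZONE become the new global flags
 *   "P,kmalloc-64,dentry" - SLAB_POISON is applied only to the listed
 *                           caches, via kmem_cache_flags() below
 *
 * A lone '-' (slab_debug=-) yields *flags == 0 and thus switches debugging
 * off globally.
 */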
1865
1866static int __init setup_slub_debug(char *str)
1867{
1868 slab_flags_t flags;
1869 slab_flags_t global_flags;
1870 char *saved_str;
1871 char *slab_list;
1872 bool global_slub_debug_changed = false;
1873 bool slab_list_specified = false;
1874
1875 global_flags = DEBUG_DEFAULT_FLAGS;
1876 if (*str++ != '=' || !*str)
1877 /*
1878 * No options specified. Switch on full debugging.
1879 */
1880 goto out;
1881
1882 saved_str = str;
1883 while (str) {
1884		str = parse_slub_debug_flags(str, &flags, &slab_list, true);
1885
1886 if (!slab_list) {
1887 global_flags = flags;
1888 global_slub_debug_changed = true;
1889 } else {
1890 slab_list_specified = true;
1891 if (flags & SLAB_STORE_USER)
1892 stack_depot_request_early_init();
1893 }
1894 }
1895
1896 /*
1897 * For backwards compatibility, a single list of flags with list of
1898 * slabs means debugging is only changed for those slabs, so the global
1899 * slab_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
1900	 * on CONFIG_SLUB_DEBUG_ON). We can extend that to multiple lists as
1901 * long as there is no option specifying flags without a slab list.
1902 */
1903 if (slab_list_specified) {
1904 if (!global_slub_debug_changed)
1905 global_flags = slub_debug;
1906 slub_debug_string = saved_str;
1907 }
1908out:
1909 slub_debug = global_flags;
1910 if (slub_debug & SLAB_STORE_USER)
1911 stack_depot_request_early_init();
1912 if (slub_debug != 0 || slub_debug_string)
1913 static_branch_enable(&slub_debug_enabled);
1914 else
1915 static_branch_disable(&slub_debug_enabled);
1916 if ((static_branch_unlikely(&init_on_alloc) ||
1917 static_branch_unlikely(&init_on_free)) &&
1918 (slub_debug & SLAB_POISON))
1919 pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n");
1920 return 1;
1921}
1922
1923__setup("slab_debug", setup_slub_debug);
1924__setup_param("slub_debug", slub_debug, setup_slub_debug, 0);
1925
1926/*
1927 * kmem_cache_flags - apply debugging options to the cache
1928 * @flags: flags to set
1929 * @name: name of the cache
1930 *
1931 * Debug option(s) are applied to @flags. In addition to the debug
1932 * option(s), if a slab name (or multiple) is specified i.e.
1933 * slab_debug=<Debug-Options>,<slab name1>,<slab name2> ...
1934 * then only the select slabs will receive the debug option(s).
1935 */
1936slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name)
1937{
1938 char *iter;
1939 size_t len;
1940 char *next_block;
1941 slab_flags_t block_flags;
1942 slab_flags_t slub_debug_local = slub_debug;
1943
1944 if (flags & SLAB_NO_USER_FLAGS)
1945 return flags;
1946
1947 /*
1948 * If the slab cache is for debugging (e.g. kmemleak) then
1949 * don't store user (stack trace) information by default,
1950 * but let the user enable it via the command line below.
1951 */
1952 if (flags & SLAB_NOLEAKTRACE)
1953 slub_debug_local &= ~SLAB_STORE_USER;
1954
1955 len = strlen(name);
1956 next_block = slub_debug_string;
1957 /* Go through all blocks of debug options, see if any matches our slab's name */
1958 while (next_block) {
1959		next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false);
1960 if (!iter)
1961 continue;
1962 /* Found a block that has a slab list, search it */
1963 while (*iter) {
1964 char *end, *glob;
1965 size_t cmplen;
1966
1967 end = strchrnul(iter, ',');
1968 if (next_block && next_block < end)
1969 end = next_block - 1;
1970
1971 glob = strnchr(iter, end - iter, '*');
1972 if (glob)
1973 cmplen = glob - iter;
1974 else
1975 cmplen = max_t(size_t, len, (end - iter));
1976
1977 if (!strncmp(name, iter, cmplen)) {
1978 flags |= block_flags;
1979 return flags;
1980 }
1981
1982 if (!*end || *end == ';')
1983 break;
1984 iter = end + 1;
1985 }
1986 }
1987
1988 return flags | slub_debug_local;
1989}
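/*
 * A hedged example of the matching above: with slab_debug=U,kmalloc-* the
 * block's slab list is "kmalloc-*"; the '*' limits the comparison to the
 * "kmalloc-" prefix, so a cache named "kmalloc-64" gets SLAB_STORE_USER
 * while e.g. "dentry" does not. Without a '*' the comparison length is the
 * longer of the two names, i.e. an exact match is required.
 */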
1990#else /* !CONFIG_SLUB_DEBUG */
1991static inline void setup_object_debug(struct kmem_cache *s, void *object) {}
1992static inline
1993void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
1994
1995static inline bool alloc_debug_processing(struct kmem_cache *s,
1996 struct slab *slab, void *object, int orig_size) { return true; }
1997
1998static inline bool free_debug_processing(struct kmem_cache *s,
1999 struct slab *slab, void *head, void *tail, int *bulk_cnt,
2000 unsigned long addr, depot_stack_handle_t handle) { return true; }
2001
2002static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
2003static inline int check_object(struct kmem_cache *s, struct slab *slab,
2004 void *object, u8 val) { return 1; }
2005static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; }
2006static inline void set_track(struct kmem_cache *s, void *object,
2007 enum track_item alloc, unsigned long addr, gfp_t gfp_flags) {}
2008static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
2009 struct slab *slab) {}
2010static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
2011 struct slab *slab) {}
2012slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name)
2013{
2014 return flags;
2015}
2016#define slub_debug 0
2017
2018#define disable_higher_order_debug 0
2019
2020static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
2021 { return 0; }
2022static inline void inc_slabs_node(struct kmem_cache *s, int node,
2023 int objects) {}
2024static inline void dec_slabs_node(struct kmem_cache *s, int node,
2025 int objects) {}
2026#ifndef CONFIG_SLUB_TINY
2027static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
2028 void **freelist, void *nextfree)
2029{
2030 return false;
2031}
2032#endif
2033#endif /* CONFIG_SLUB_DEBUG */
2034
2035#ifdef CONFIG_SLAB_OBJ_EXT
2036
2037#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
2038
2039static inline void mark_objexts_empty(struct slabobj_ext *obj_exts)
2040{
2041 struct slabobj_ext *slab_exts;
2042 struct slab *obj_exts_slab;
2043
2044 obj_exts_slab = virt_to_slab(obj_exts);
2045 slab_exts = slab_obj_exts(obj_exts_slab);
2046 if (slab_exts) {
2047 unsigned int offs = obj_to_index(obj_exts_slab->slab_cache,
2048 obj_exts_slab, obj_exts);
2049 /* codetag should be NULL */
2050 WARN_ON(slab_exts[offs].ref.ct);
2051 set_codetag_empty(&slab_exts[offs].ref);
2052 }
2053}
2054
2055static inline void mark_failed_objexts_alloc(struct slab *slab)
2056{
2057 slab->obj_exts = OBJEXTS_ALLOC_FAIL;
2058}
2059
2060static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
2061 struct slabobj_ext *vec, unsigned int objects)
2062{
2063 /*
2064 * If vector previously failed to allocate then we have live
2065 * objects with no tag reference. Mark all references in this
2066 * vector as empty to avoid warnings later on.
2067 */
2068 if (obj_exts == OBJEXTS_ALLOC_FAIL) {
2069 unsigned int i;
2070
2071 for (i = 0; i < objects; i++)
2072 set_codetag_empty(&vec[i].ref);
2073 }
2074}
2075
2076#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
2077
2078static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) {}
2079static inline void mark_failed_objexts_alloc(struct slab *slab) {}
2080static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
2081 struct slabobj_ext *vec, unsigned int objects) {}
2082
2083#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
2084
2085/*
2086 * The allocated objcg pointers array is not accounted directly.
2087 * Moreover, it should not come from a DMA buffer and is not readily
2088 * reclaimable. So those GFP bits should be masked off.
2089 */
2090#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \
2091 __GFP_ACCOUNT | __GFP_NOFAIL)
2092
2093static inline void init_slab_obj_exts(struct slab *slab)
2094{
2095 slab->obj_exts = 0;
2096}
2097
2098int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
2099 gfp_t gfp, bool new_slab)
2100{
2101 bool allow_spin = gfpflags_allow_spinning(gfp);
2102 unsigned int objects = objs_per_slab(s, slab);
2103 unsigned long new_exts;
2104 unsigned long old_exts;
2105 struct slabobj_ext *vec;
2106
2107 gfp &= ~OBJCGS_CLEAR_MASK;
2108 /* Prevent recursive extension vector allocation */
2109 gfp |= __GFP_NO_OBJ_EXT;
2110
2111 /*
2112 * Note that allow_spin may be false during early boot and its
2113 * restricted GFP_BOOT_MASK. Due to kmalloc_nolock() only supporting
2114 * architectures with cmpxchg16b, early obj_exts will be missing for
2115 * very early allocations on those.
2116 */
2117 if (unlikely(!allow_spin)) {
2118 size_t sz = objects * sizeof(struct slabobj_ext);
2119
2120 vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT,
2121 slab_nid(slab));
2122 } else {
2123 vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
2124 slab_nid(slab));
2125 }
2126 if (!vec) {
2127 /* Mark vectors which failed to allocate */
2128 mark_failed_objexts_alloc(slab);
2129
2130 return -ENOMEM;
2131 }
2132
2133 new_exts = (unsigned long)vec;
2134 if (unlikely(!allow_spin))
2135 new_exts |= OBJEXTS_NOSPIN_ALLOC;
2136#ifdef CONFIG_MEMCG
2137 new_exts |= MEMCG_DATA_OBJEXTS;
2138#endif
2139 old_exts = READ_ONCE(slab->obj_exts);
2140 handle_failed_objexts_alloc(old_exts, vec, objects);
2141 if (new_slab) {
2142 /*
2143 * If the slab is brand new and nobody can yet access its
2144 * obj_exts, no synchronization is required and obj_exts can
2145 * be simply assigned.
2146 */
2147 slab->obj_exts = new_exts;
2148 } else if ((old_exts & ~OBJEXTS_FLAGS_MASK) ||
2149 cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) {
2150 /*
2151 * If the slab is already in use, somebody can allocate and
2152 * assign slabobj_exts in parallel. In this case the existing
2153 * objcg vector should be reused.
2154 */
2155 mark_objexts_empty(vec);
2156 if (unlikely(!allow_spin))
2157 kfree_nolock(vec);
2158 else
2159 kfree(vec);
2160 return 0;
2161 }
2162
2163 if (allow_spin)
2164 kmemleak_not_leak(vec);
2165 return 0;
2166}
2167
2168static inline void free_slab_obj_exts(struct slab *slab)
2169{
2170 struct slabobj_ext *obj_exts;
2171
2172 obj_exts = slab_obj_exts(slab);
2173 if (!obj_exts)
2174 return;
2175
2176 /*
2177 * obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its
2178 * corresponding extension will be NULL. alloc_tag_sub() will throw a
2179 * warning if slab has extensions but the extension of an object is
2180 * NULL, therefore replace NULL with CODETAG_EMPTY to indicate that
2181 * the extension for obj_exts is expected to be NULL.
2182 */
2183 mark_objexts_empty(obj_exts);
2184 if (unlikely(READ_ONCE(slab->obj_exts) & OBJEXTS_NOSPIN_ALLOC))
2185 kfree_nolock(obj_exts);
2186 else
2187 kfree(obj_exts);
2188 slab->obj_exts = 0;
2189}
2190
2191#else /* CONFIG_SLAB_OBJ_EXT */
2192
2193static inline void init_slab_obj_exts(struct slab *slab)
2194{
2195}
2196
2197static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
2198 gfp_t gfp, bool new_slab)
2199{
2200 return 0;
2201}
2202
2203static inline void free_slab_obj_exts(struct slab *slab)
2204{
2205}
2206
2207#endif /* CONFIG_SLAB_OBJ_EXT */
2208
2209#ifdef CONFIG_MEM_ALLOC_PROFILING
2210
2211static inline struct slabobj_ext *
2212prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
2213{
2214 struct slab *slab;
2215
2216 slab = virt_to_slab(p);
2217 if (!slab_obj_exts(slab) &&
2218 alloc_slab_obj_exts(slab, s, flags, false)) {
2219 pr_warn_once("%s, %s: Failed to create slab extension vector!\n",
2220 __func__, s->name);
2221 return NULL;
2222 }
2223
2224 return slab_obj_exts(slab) + obj_to_index(s, slab, p);
2225}
2226
2227/* Should be called only if mem_alloc_profiling_enabled() */
2228static noinline void
2229__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
2230{
2231 struct slabobj_ext *obj_exts;
2232
2233 if (!object)
2234 return;
2235
2236 if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
2237 return;
2238
2239 if (flags & __GFP_NO_OBJ_EXT)
2240 return;
2241
2242 obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
2243 /*
2244 * Currently obj_exts is used only for allocation profiling.
2245 * If other users appear then mem_alloc_profiling_enabled()
2246 * check should be added before alloc_tag_add().
2247 */
2248 if (likely(obj_exts))
2249 alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
2250 else
2251 alloc_tag_set_inaccurate(current->alloc_tag);
2252}
2253
2254static inline void
2255alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
2256{
2257 if (mem_alloc_profiling_enabled())
2258 __alloc_tagging_slab_alloc_hook(s, object, flags);
2259}
2260
2261/* Should be called only if mem_alloc_profiling_enabled() */
2262static noinline void
2263__alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
2264 int objects)
2265{
2266 struct slabobj_ext *obj_exts;
2267 int i;
2268
2269 /* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */
2270 if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
2271 return;
2272
2273 obj_exts = slab_obj_exts(slab);
2274 if (!obj_exts)
2275 return;
2276
2277 for (i = 0; i < objects; i++) {
2278 unsigned int off = obj_to_index(s, slab, p[i]);
2279
2280 alloc_tag_sub(&obj_exts[off].ref, s->size);
2281 }
2282}
2283
2284static inline void
2285alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
2286 int objects)
2287{
2288 if (mem_alloc_profiling_enabled())
2289 __alloc_tagging_slab_free_hook(s, slab, p, objects);
2290}
2291
2292#else /* CONFIG_MEM_ALLOC_PROFILING */
2293
2294static inline void
2295alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
2296{
2297}
2298
2299static inline void
2300alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
2301 int objects)
2302{
2303}
2304
2305#endif /* CONFIG_MEM_ALLOC_PROFILING */
2306
2307
2308#ifdef CONFIG_MEMCG
2309
2310static void memcg_alloc_abort_single(struct kmem_cache *s, void *object);
2311
2312static __fastpath_inline
2313bool memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
2314 gfp_t flags, size_t size, void **p)
2315{
2316 if (likely(!memcg_kmem_online()))
2317 return true;
2318
2319 if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)))
2320 return true;
2321
2322 if (likely(__memcg_slab_post_alloc_hook(s, lru, flags, size, p)))
2323 return true;
2324
2325 if (likely(size == 1)) {
2326 memcg_alloc_abort_single(s, *p);
2327 *p = NULL;
2328 } else {
2329 kmem_cache_free_bulk(s, size, p);
2330 }
2331
2332 return false;
2333}
2334
2335static __fastpath_inline
2336void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
2337 int objects)
2338{
2339 struct slabobj_ext *obj_exts;
2340
2341 if (!memcg_kmem_online())
2342 return;
2343
2344 obj_exts = slab_obj_exts(slab);
2345 if (likely(!obj_exts))
2346 return;
2347
2348 __memcg_slab_free_hook(s, slab, p, objects, obj_exts);
2349}
2350
2351static __fastpath_inline
2352bool memcg_slab_post_charge(void *p, gfp_t flags)
2353{
2354 struct slabobj_ext *slab_exts;
2355 struct kmem_cache *s;
2356 struct folio *folio;
2357 struct slab *slab;
2358 unsigned long off;
2359
2360 folio = virt_to_folio(p);
2361 if (!folio_test_slab(folio)) {
2362 int size;
2363
2364 if (folio_memcg_kmem(folio))
2365 return true;
2366
2367 if (__memcg_kmem_charge_page(folio_page(folio, 0), flags,
2368 folio_order(folio)))
2369 return false;
2370
2371 /*
2372 * This folio has already been accounted in the global stats but
2373 * not in the memcg stats. So, subtract from the global and use
2374 * the interface which adds to both global and memcg stats.
2375 */
2376 size = folio_size(folio);
2377 node_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -size);
2378 lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, size);
2379 return true;
2380 }
2381
2382 slab = folio_slab(folio);
2383 s = slab->slab_cache;
2384
2385 /*
2386 * Ignore KMALLOC_NORMAL cache to avoid possible circular dependency
2387 * of slab_obj_exts being allocated from the same slab and thus the slab
2388 * becoming effectively unfreeable.
2389 */
2390 if (is_kmalloc_normal(s))
2391 return true;
2392
2393 /* Ignore already charged objects. */
2394 slab_exts = slab_obj_exts(slab);
2395 if (slab_exts) {
2396 off = obj_to_index(s, slab, p);
2397 if (unlikely(slab_exts[off].objcg))
2398 return true;
2399 }
2400
2401 return __memcg_slab_post_alloc_hook(s, NULL, flags, 1, &p);
2402}
2403
2404#else /* CONFIG_MEMCG */
2405static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s,
2406 struct list_lru *lru,
2407 gfp_t flags, size_t size,
2408 void **p)
2409{
2410 return true;
2411}
2412
2413static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
2414 void **p, int objects)
2415{
2416}
2417
2418static inline bool memcg_slab_post_charge(void *p, gfp_t flags)
2419{
2420 return true;
2421}
2422#endif /* CONFIG_MEMCG */
2423
2424#ifdef CONFIG_SLUB_RCU_DEBUG
2425static void slab_free_after_rcu_debug(struct rcu_head *rcu_head);
2426
2427struct rcu_delayed_free {
2428 struct rcu_head head;
2429 void *object;
2430};
2431#endif
2432
2433/*
2434 * Hooks for other subsystems that check memory allocations. In a typical
2435 * production configuration these hooks all should produce no code at all.
2436 *
2437 * Returns true if freeing of the object can proceed, false if its reuse
2438 * was delayed by CONFIG_SLUB_RCU_DEBUG or KASAN quarantine, or it was returned
2439 * to KFENCE.
2440 */
2441static __always_inline
2442bool slab_free_hook(struct kmem_cache *s, void *x, bool init,
2443 bool after_rcu_delay)
2444{
2445 /* Are the object contents still accessible? */
2446 bool still_accessible = (s->flags & SLAB_TYPESAFE_BY_RCU) && !after_rcu_delay;
2447
2448	kmemleak_free_recursive(x, s->flags);
2449	kmsan_slab_free(s, x);
2450
2451	debug_check_no_locks_freed(x, s->object_size);
2452
2453	if (!(s->flags & SLAB_DEBUG_OBJECTS))
2454		debug_check_no_obj_freed(x, s->object_size);
2455
2456 /* Use KCSAN to help debug racy use-after-free. */
2457 if (!still_accessible)
2458		__kcsan_check_access(x, s->object_size,
2459 KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
2460
2461	if (kfence_free(x))
2462 return false;
2463
2464 /*
2465 * Give KASAN a chance to notice an invalid free operation before we
2466 * modify the object.
2467 */
2468	if (kasan_slab_pre_free(s, x))
2469 return false;
2470
2471#ifdef CONFIG_SLUB_RCU_DEBUG
2472 if (still_accessible) {
2473 struct rcu_delayed_free *delayed_free;
2474
2475 delayed_free = kmalloc(sizeof(*delayed_free), GFP_NOWAIT);
2476 if (delayed_free) {
2477 /*
2478 * Let KASAN track our call stack as a "related work
2479 * creation", just like if the object had been freed
2480 * normally via kfree_rcu().
2481 * We have to do this manually because the rcu_head is
2482 * not located inside the object.
2483 */
2484 kasan_record_aux_stack(x);
2485
2486 delayed_free->object = x;
2487 call_rcu(&delayed_free->head, slab_free_after_rcu_debug);
2488 return false;
2489 }
2490 }
2491#endif /* CONFIG_SLUB_RCU_DEBUG */
2492
2493 /*
2494 * As memory initialization might be integrated into KASAN,
2495 * kasan_slab_free and initialization memset's must be
2496 * kept together to avoid discrepancies in behavior.
2497 *
2498 * The initialization memset's clear the object and the metadata,
2499 * but don't touch the SLAB redzone.
2500 *
2501 * The object's freepointer is also avoided if stored outside the
2502 * object.
2503 */
2504 if (unlikely(init)) {
2505 int rsize;
2506 unsigned int inuse, orig_size;
2507
2508 inuse = get_info_end(s);
2509		orig_size = get_orig_size(s, x);
2510		if (!kasan_has_integrated_init())
2511			memset(kasan_reset_tag(x), 0, orig_size);
2512		rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0;
2513		memset((char *)kasan_reset_tag(x) + inuse, 0,
2514		       s->size - inuse - rsize);
2515		/*
2516		 * Restore orig_size, otherwise a kmalloc redzone overwrite
2517		 * would be reported.
2518		 */
2519		set_orig_size(s, x, orig_size);
2520
2521 }
2522 /* KASAN might put x into memory quarantine, delaying its reuse. */
2523	return !kasan_slab_free(s, x, init, still_accessible, false);
2524}
2525
2526static __fastpath_inline
2527bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail,
2528 int *cnt)
2529{
2530
2531 void *object;
2532 void *next = *head;
2533 void *old_tail = *tail;
2534 bool init;
2535
2536	if (is_kfence_address(next)) {
2537		slab_free_hook(s, next, false, false);
2538 return false;
2539 }
2540
2541 /* Head and tail of the reconstructed freelist */
2542 *head = NULL;
2543 *tail = NULL;
2544
2545	init = slab_want_init_on_free(s);
2546
2547 do {
2548 object = next;
2549 next = get_freepointer(s, object);
2550
2551 /* If object's reuse doesn't have to be delayed */
2552 if (likely(slab_free_hook(s, object, init, false))) {
2553 /* Move object to the new freelist */
2554			set_freepointer(s, object, *head);
2555 *head = object;
2556 if (!*tail)
2557 *tail = object;
2558 } else {
2559 /*
2560 * Adjust the reconstructed freelist depth
2561 * accordingly if object's reuse is delayed.
2562 */
2563 --(*cnt);
2564 }
2565 } while (object != old_tail);
2566
2567 return *head != NULL;
2568}
2569
2570static void *setup_object(struct kmem_cache *s, void *object)
2571{
2572 setup_object_debug(s, object);
2573	object = kasan_init_slab_obj(s, object);
2574	if (unlikely(s->ctor)) {
2575		kasan_unpoison_new_object(s, object);
2576		s->ctor(object);
2577		kasan_poison_new_object(s, object);
2578 }
2579 return object;
2580}
2581
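/*
 * Percpu sheaves. A sheaf is a plain array of object pointers with a fixed
 * capacity (s->sheaf_capacity), allocated in one go via
 * kzalloc(struct_size(...)). The helpers below only manage the sheaf
 * containers; filling and draining them goes through the bulk alloc/free
 * paths.
 */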
2582static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp)
2583{
2584 struct slab_sheaf *sheaf = kzalloc(struct_size(sheaf, objects,
2585 s->sheaf_capacity), gfp);
2586
2587 if (unlikely(!sheaf))
2588 return NULL;
2589
2590 sheaf->cache = s;
2591
2592	stat(s, SHEAF_ALLOC);
2593
2594 return sheaf;
2595}
2596
2597static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf)
2598{
2599	kfree(sheaf);
2600
2601	stat(s, SHEAF_FREE);
2602}
2603
2604static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
2605 size_t size, void **p);
2606
2607
2608static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
2609 gfp_t gfp)
2610{
2611 int to_fill = s->sheaf_capacity - sheaf->size;
2612 int filled;
2613
2614 if (!to_fill)
2615 return 0;
2616
2617	filled = __kmem_cache_alloc_bulk(s, gfp, to_fill,
2618					 &sheaf->objects[sheaf->size]);
2619
2620	sheaf->size += filled;
2621
2622	stat_add(s, SHEAF_REFILL, filled);
2623
2624 if (filled < to_fill)
2625 return -ENOMEM;
2626
2627 return 0;
2628}
2629
2630
2631static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp)
2632{
2633 struct slab_sheaf *sheaf = alloc_empty_sheaf(s, gfp);
2634
2635 if (!sheaf)
2636 return NULL;
2637
2638 if (refill_sheaf(s, sheaf, gfp)) {
2639 free_empty_sheaf(s, sheaf);
2640 return NULL;
2641 }
2642
2643 return sheaf;
2644}
2645
2646/*
2647 * Maximum number of objects freed during a single flush of main pcs sheaf.
2648 * Translates directly to an on-stack array size.
2649 */
2650#define PCS_BATCH_MAX 32U
2651
2652static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);
2653
2654/*
2655 * Free all objects from the main sheaf. In order to perform
2656 * __kmem_cache_free_bulk() outside of cpu_sheaves->lock, work in batches where
2657 * object pointers are moved to an on-stack array under the lock. To bound the
2658 * stack usage, limit each batch to PCS_BATCH_MAX.
2659 *
2660 * returns true if at least partially flushed
2661 */
2662static bool sheaf_flush_main(struct kmem_cache *s)
2663{
2664 struct slub_percpu_sheaves *pcs;
2665 unsigned int batch, remaining;
2666 void *objects[PCS_BATCH_MAX];
2667 struct slab_sheaf *sheaf;
2668 bool ret = false;
2669
2670next_batch:
2671 if (!local_trylock(&s->cpu_sheaves->lock))
2672 return ret;
2673
2674 pcs = this_cpu_ptr(s->cpu_sheaves);
2675 sheaf = pcs->main;
2676
2677 batch = min(PCS_BATCH_MAX, sheaf->size);
2678
2679 sheaf->size -= batch;
2680	memcpy(objects, sheaf->objects + sheaf->size, batch * sizeof(void *));
2681
2682 remaining = sheaf->size;
2683
2684 local_unlock(&s->cpu_sheaves->lock);
2685
2686	__kmem_cache_free_bulk(s, batch, &objects[0]);
2687
2688	stat_add(s, SHEAF_FLUSH, batch);
2689
2690 ret = true;
2691
2692 if (remaining)
2693 goto next_batch;
2694
2695 return ret;
2696}
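/*
 * A worked example of the batching above, assuming a main sheaf holding 70
 * objects and PCS_BATCH_MAX == 32: the loop takes the local lock three
 * times and frees batches of 32, 32 and 6 objects, each time copying the
 * tail of sheaf->objects[] to the on-stack array before unlocking.
 */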
2697
2698/*
2699 * Free all objects from a sheaf that's unused, i.e. not linked to any
2700 * cpu_sheaves, so no locking or batching is needed. Locking is also
2701 * unnecessary when flushing a cpu's sheaves (both spare and main) during cpu
2702 * hotremove, as that cpu is not executing anymore.
2703 */
2704static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf)
2705{
2706 if (!sheaf->size)
2707 return;
2708
2709	stat_add(s, SHEAF_FLUSH, sheaf->size);
2710
2711	__kmem_cache_free_bulk(s, sheaf->size, &sheaf->objects[0]);
2712
2713 sheaf->size = 0;
2714}
2715
2716static void __rcu_free_sheaf_prepare(struct kmem_cache *s,
2717 struct slab_sheaf *sheaf)
2718{
2719	bool init = slab_want_init_on_free(s);
2720 void **p = &sheaf->objects[0];
2721 unsigned int i = 0;
2722
2723 while (i < sheaf->size) {
2724		struct slab *slab = virt_to_slab(p[i]);
2725
2726		memcg_slab_free_hook(s, slab, p + i, 1);
2727		alloc_tagging_slab_free_hook(s, slab, p + i, 1);
2728
2729 if (unlikely(!slab_free_hook(s, p[i], init, true))) {
2730 p[i] = p[--sheaf->size];
2731 continue;
2732 }
2733
2734 i++;
2735 }
2736}
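/*
 * Note that the loop above compacts the sheaf in place: objects whose reuse
 * is still delayed (slab_free_hook() returned false, e.g. due to KFENCE or
 * KASAN quarantine) are dropped by overwriting their slot with the current
 * last entry, so only the remaining sheaf->size objects are bulk-freed by
 * the caller.
 */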
2737
2738static void rcu_free_sheaf_nobarn(struct rcu_head *head)
2739{
2740 struct slab_sheaf *sheaf;
2741 struct kmem_cache *s;
2742
2743 sheaf = container_of(head, struct slab_sheaf, rcu_head);
2744 s = sheaf->cache;
2745
2746 __rcu_free_sheaf_prepare(s, sheaf);
2747
2748 sheaf_flush_unused(s, sheaf);
2749
2750 free_empty_sheaf(s, sheaf);
2751}
2752
2753/*
2754 * The caller must ensure migration is disabled in order to fully flush a
2755 * single cpu's sheaves.
2756 *
2757 * Must not be called from an irq context.
2758 *
2759 * Flushing operations are rare, so keep it simple and flush to slabs
2760 * directly, skipping the barn.
2761 */
2762static void pcs_flush_all(struct kmem_cache *s)
2763{
2764 struct slub_percpu_sheaves *pcs;
2765 struct slab_sheaf *spare, *rcu_free;
2766
2767 local_lock(&s->cpu_sheaves->lock);
2768 pcs = this_cpu_ptr(s->cpu_sheaves);
2769
2770 spare = pcs->spare;
2771 pcs->spare = NULL;
2772
2773 rcu_free = pcs->rcu_free;
2774 pcs->rcu_free = NULL;
2775
2776 local_unlock(&s->cpu_sheaves->lock);
2777
2778 if (spare) {
2779		sheaf_flush_unused(s, spare);
2780		free_empty_sheaf(s, spare);
2781	}
2782
2783	if (rcu_free)
2784		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
2785
2786 sheaf_flush_main(s);
2787}
2788
2789static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu)
2790{
2791 struct slub_percpu_sheaves *pcs;
2792
2793 pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
2794
2795 /* The cpu is not executing anymore so we don't need pcs->lock */
2796	sheaf_flush_unused(s, pcs->main);
2797	if (pcs->spare) {
2798		sheaf_flush_unused(s, pcs->spare);
2799		free_empty_sheaf(s, pcs->spare);
2800 pcs->spare = NULL;
2801 }
2802
2803 if (pcs->rcu_free) {
2804		call_rcu(&pcs->rcu_free->rcu_head, rcu_free_sheaf_nobarn);
2805 pcs->rcu_free = NULL;
2806 }
2807}
2808
2809static void pcs_destroy(struct kmem_cache *s)
2810{
2811 int cpu;
2812
2813 for_each_possible_cpu(cpu) {
2814 struct slub_percpu_sheaves *pcs;
2815
2816 pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
2817
2818 /* can happen when unwinding failed create */
2819 if (!pcs->main)
2820 continue;
2821
2822 /*
2823 * We have already passed __kmem_cache_shutdown() so everything
2824 * was flushed and there should be no objects allocated from
2825 * slabs, otherwise kmem_cache_destroy() would have aborted.
2826 * Therefore something would have to be really wrong if the
2827 * warnings here trigger, and we should rather leave objects and
2828 * sheaves to leak in that case.
2829 */
2830
2831 WARN_ON(pcs->spare);
2832 WARN_ON(pcs->rcu_free);
2833
2834 if (!WARN_ON(pcs->main->size)) {
2835			free_empty_sheaf(s, pcs->main);
2836 pcs->main = NULL;
2837 }
2838 }
2839
2840	free_percpu(s->cpu_sheaves);
2841 s->cpu_sheaves = NULL;
2842}
2843
2844static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn)
2845{
2846 struct slab_sheaf *empty = NULL;
2847 unsigned long flags;
2848
2849 if (!data_race(barn->nr_empty))
2850 return NULL;
2851
2852 spin_lock_irqsave(&barn->lock, flags);
2853
2854 if (likely(barn->nr_empty)) {
2855 empty = list_first_entry(&barn->sheaves_empty,
2856 struct slab_sheaf, barn_list);
2857		list_del(&empty->barn_list);
2858		barn->nr_empty--;
2859	}
2860
2861	spin_unlock_irqrestore(&barn->lock, flags);
2862
2863 return empty;
2864}
2865
2866/*
2867 * The following two functions are used mainly in cases where we have to undo an
2868 * intended action due to a race or cpu migration. Thus they do not check the
2869 * empty or full sheaf limits for simplicity.
2870 */
2871
2872static void barn_put_empty_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf)
2873{
2874 unsigned long flags;
2875
2876 spin_lock_irqsave(&barn->lock, flags);
2877
2878	list_add(&sheaf->barn_list, &barn->sheaves_empty);
2879	barn->nr_empty++;
2880
2881	spin_unlock_irqrestore(&barn->lock, flags);
2882}
2883
2884static void barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf)
2885{
2886 unsigned long flags;
2887
2888 spin_lock_irqsave(&barn->lock, flags);
2889
2890	list_add(&sheaf->barn_list, &barn->sheaves_full);
2891	barn->nr_full++;
2892
2893	spin_unlock_irqrestore(&barn->lock, flags);
2894}
2895
2896static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn)
2897{
2898 struct slab_sheaf *sheaf = NULL;
2899 unsigned long flags;
2900
2901 if (!data_race(barn->nr_full) && !data_race(barn->nr_empty))
2902 return NULL;
2903
2904 spin_lock_irqsave(&barn->lock, flags);
2905
2906 if (barn->nr_full) {
2907 sheaf = list_first_entry(&barn->sheaves_full, struct slab_sheaf,
2908 barn_list);
2909		list_del(&sheaf->barn_list);
2910 barn->nr_full--;
2911 } else if (barn->nr_empty) {
2912 sheaf = list_first_entry(&barn->sheaves_empty,
2913 struct slab_sheaf, barn_list);
2914		list_del(&sheaf->barn_list);
2915 barn->nr_empty--;
2916 }
2917
2918	spin_unlock_irqrestore(&barn->lock, flags);
2919
2920 return sheaf;
2921}
2922
2923/*
2924 * If a full sheaf is available, return it and put the supplied empty one to
2925 * the barn. We ignore the limit on empty sheaves as the number of sheaves
2926 * doesn't change.
2927 */
2928static struct slab_sheaf *
2929barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty)
2930{
2931 struct slab_sheaf *full = NULL;
2932 unsigned long flags;
2933
2934 if (!data_race(barn->nr_full))
2935 return NULL;
2936
2937 spin_lock_irqsave(&barn->lock, flags);
2938
2939 if (likely(barn->nr_full)) {
2940 full = list_first_entry(&barn->sheaves_full, struct slab_sheaf,
2941 barn_list);
2942		list_del(&full->barn_list);
2943		list_add(&empty->barn_list, &barn->sheaves_empty);
2944 barn->nr_full--;
2945 barn->nr_empty++;
2946 }
2947
2948	spin_unlock_irqrestore(&barn->lock, flags);
2949
2950 return full;
2951}
2952
2953/*
2954 * If an empty sheaf is available, return it and put the supplied full one to
2955 * the barn. But if there are too many full sheaves, reject this with -E2BIG.
2956 */
2957static struct slab_sheaf *
2958barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full)
2959{
2960 struct slab_sheaf *empty;
2961 unsigned long flags;
2962
2963 /* we don't repeat this check under barn->lock as it's not critical */
2964 if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES)
2965		return ERR_PTR(-E2BIG);
2966	if (!data_race(barn->nr_empty))
2967		return ERR_PTR(-ENOMEM);
2968
2969 spin_lock_irqsave(&barn->lock, flags);
2970
2971 if (likely(barn->nr_empty)) {
2972 empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf,
2973 barn_list);
2974		list_del(&empty->barn_list);
2975		list_add(&full->barn_list, &barn->sheaves_full);
2976 barn->nr_empty--;
2977 barn->nr_full++;
2978 } else {
2979		empty = ERR_PTR(-ENOMEM);
2980 }
2981
2982	spin_unlock_irqrestore(&barn->lock, flags);
2983
2984 return empty;
2985}
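/*
 * A rough summary of how the barn exchange helpers above fit together (a
 * sketch based on the functions in this file, not a formal API contract):
 *
 *   allocation slow path: barn_replace_empty_sheaf() - hand in the exhausted
 *       main sheaf, get a full one back
 *   free slow path:       barn_replace_full_sheaf()  - hand in the full main
 *       sheaf, get an empty one back, bounded by MAX_FULL_SHEAVES (-E2BIG)
 *       or by the supply of empty sheaves (-ENOMEM)
 *   undo paths:           barn_put_empty_sheaf() / barn_put_full_sheaf(),
 *       unconditional and without limit checks (see the comment above them)
 */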
2986
2987static void barn_init(struct node_barn *barn)
2988{
2989 spin_lock_init(&barn->lock);
2990	INIT_LIST_HEAD(&barn->sheaves_full);
2991	INIT_LIST_HEAD(&barn->sheaves_empty);
2992 barn->nr_full = 0;
2993 barn->nr_empty = 0;
2994}
2995
2996static void barn_shrink(struct kmem_cache *s, struct node_barn *barn)
2997{
2998 struct list_head empty_list;
2999 struct list_head full_list;
3000 struct slab_sheaf *sheaf, *sheaf2;
3001 unsigned long flags;
3002
3003	INIT_LIST_HEAD(&empty_list);
3004	INIT_LIST_HEAD(&full_list);
3005
3006 spin_lock_irqsave(&barn->lock, flags);
3007
3008	list_splice_init(&barn->sheaves_full, &full_list);
3009	barn->nr_full = 0;
3010	list_splice_init(&barn->sheaves_empty, &empty_list);
3011 barn->nr_empty = 0;
3012
3013	spin_unlock_irqrestore(&barn->lock, flags);
3014
3015 list_for_each_entry_safe(sheaf, sheaf2, &full_list, barn_list) {
3016 sheaf_flush_unused(s, sheaf);
3017 free_empty_sheaf(s, sheaf);
3018 }
3019
3020 list_for_each_entry_safe(sheaf, sheaf2, &empty_list, barn_list)
3021 free_empty_sheaf(s, sheaf);
3022}
3023
3024/*
3025 * Slab allocation and freeing
3026 */
3027static inline struct slab *alloc_slab_page(gfp_t flags, int node,
3028 struct kmem_cache_order_objects oo,
3029 bool allow_spin)
3030{
3031 struct folio *folio;
3032 struct slab *slab;
3033	unsigned int order = oo_order(oo);
3034
3035 if (unlikely(!allow_spin))
3036 folio = (struct folio *)alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */,
3037 node, order);
3038 else if (node == NUMA_NO_NODE)
3039 folio = (struct folio *)alloc_frozen_pages(flags, order);
3040 else
3041 folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL);
3042
3043 if (!folio)
3044 return NULL;
3045
3046 slab = folio_slab(folio);
3047 __folio_set_slab(folio);
3048 if (folio_is_pfmemalloc(folio))
3049 slab_set_pfmemalloc(slab);
3050
3051 return slab;
3052}
3053
3054#ifdef CONFIG_SLAB_FREELIST_RANDOM
3055/* Pre-initialize the random sequence cache */
3056static int init_cache_random_seq(struct kmem_cache *s)
3057{
3058 unsigned int count = oo_objects(s->oo);
3059 int err;
3060
3061 /* Bailout if already initialised */
3062 if (s->random_seq)
3063 return 0;
3064
3065 err = cache_random_seq_create(s, count, GFP_KERNEL);
3066 if (err) {
3067 pr_err("SLUB: Unable to initialize free list for %s\n",
3068 s->name);
3069 return err;
3070 }
3071
3072 /* Transform to an offset on the set of pages */
3073 if (s->random_seq) {
3074 unsigned int i;
3075
3076 for (i = 0; i < count; i++)
3077 s->random_seq[i] *= s->size;
3078 }
3079 return 0;
3080}
3081
3082/* Initialize each random sequence freelist per cache */
3083static void __init init_freelist_randomization(void)
3084{
3085 struct kmem_cache *s;
3086
3087 mutex_lock(&slab_mutex);
3088
3089 list_for_each_entry(s, &slab_caches, list)
3090 init_cache_random_seq(s);
3091
3092 mutex_unlock(&slab_mutex);
3093}
3094
3095/* Get the next entry on the pre-computed freelist randomized */
3096static void *next_freelist_entry(struct kmem_cache *s,
3097 unsigned long *pos, void *start,
3098 unsigned long page_limit,
3099 unsigned long freelist_count)
3100{
3101 unsigned int idx;
3102
3103 /*
3104 * If the target page allocation failed, the number of objects on the
3105 * page might be smaller than the usual size defined by the cache.
3106 */
3107 do {
3108 idx = s->random_seq[*pos];
3109 *pos += 1;
3110 if (*pos >= freelist_count)
3111 *pos = 0;
3112 } while (unlikely(idx >= page_limit));
3113
3114 return (char *)start + idx;
3115}
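/*
 * A hedged example of the lookup above: init_cache_random_seq() has already
 * multiplied every random_seq[] entry by s->size, so idx is a byte offset
 * and "start + idx" points straight at an object. E.g. with s->size == 256
 * and a sequence { 3, 0, 2, 1 } pre-scaled to { 768, 0, 512, 256 }, the
 * freelist visits the objects at those byte offsets, starting from a random
 * position in the sequence and skipping entries beyond page_limit on
 * smaller-than-usual slabs.
 */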
3116
3117/* Shuffle the single linked freelist based on a random pre-computed sequence */
3118static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
3119{
3120 void *start;
3121 void *cur;
3122 void *next;
3123 unsigned long idx, pos, page_limit, freelist_count;
3124
3125 if (slab->objects < 2 || !s->random_seq)
3126 return false;
3127
3128 freelist_count = oo_objects(s->oo);
3129 pos = get_random_u32_below(freelist_count);
3130
3131 page_limit = slab->objects * s->size;
3132 start = fixup_red_left(s, slab_address(slab));
3133
3134 /* First entry is used as the base of the freelist */
3135 cur = next_freelist_entry(s, &pos, start, page_limit, freelist_count);
3136 cur = setup_object(s, cur);
3137 slab->freelist = cur;
3138
3139 for (idx = 1; idx < slab->objects; idx++) {
3140 next = next_freelist_entry(s, &pos, start, page_limit,
3141 freelist_count);
3142 next = setup_object(s, next);
3143 set_freepointer(s, cur, next);
3144 cur = next;
3145 }
3146 set_freepointer(s, cur, NULL);
3147
3148 return true;
3149}
3150#else
3151static inline int init_cache_random_seq(struct kmem_cache *s)
3152{
3153 return 0;
3154}
3155static inline void init_freelist_randomization(void) { }
3156static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
3157{
3158 return false;
3159}
3160#endif /* CONFIG_SLAB_FREELIST_RANDOM */
3161
3162static __always_inline void account_slab(struct slab *slab, int order,
3163 struct kmem_cache *s, gfp_t gfp)
3164{
3165 if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
3166		alloc_slab_obj_exts(slab, s, gfp, true);
3167
3168 mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
3169 PAGE_SIZE << order);
3170}
3171
3172static __always_inline void unaccount_slab(struct slab *slab, int order,
3173 struct kmem_cache *s)
3174{
3175 /*
3176	 * The slab object extensions should now be freed regardless of whether
3177	 * mem_alloc_profiling_enabled() is true, because profiling might have
3178	 * been disabled after slab->obj_exts was allocated.
3179 */
3180 free_slab_obj_exts(slab);
3181
3182 mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
3183 -(PAGE_SIZE << order));
3184}
3185
3186static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
3187{
3188	bool allow_spin = gfpflags_allow_spinning(flags);
3189 struct slab *slab;
3190 struct kmem_cache_order_objects oo = s->oo;
3191 gfp_t alloc_gfp;
3192 void *start, *p, *next;
3193 int idx;
3194 bool shuffle;
3195
3196 flags &= gfp_allowed_mask;
3197
3198 flags |= s->allocflags;
3199
3200 /*
3201 * Let the initial higher-order allocation fail under memory pressure
3202	 * so we fall back to the minimum order allocation.
3203	 */
3204	alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
3205	if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
3206		alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM;
3207
3208 /*
3209 * __GFP_RECLAIM could be cleared on the first allocation attempt,
3210 * so pass allow_spin flag directly.
3211 */
3212	slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin);
3213 if (unlikely(!slab)) {
3214 oo = s->min;
3215 alloc_gfp = flags;
3216 /*
3217 * Allocation may have failed due to fragmentation.
3218 * Try a lower order alloc if possible
3219 */
3220		slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin);
3221		if (unlikely(!slab))
3222			return NULL;
3223		stat(s, ORDER_FALLBACK);
3224 }
3225
3226	slab->objects = oo_objects(oo);
3227 slab->inuse = 0;
3228 slab->frozen = 0;
3229 init_slab_obj_exts(slab);
3230
3231	account_slab(slab, oo_order(oo), s, flags);
3232
3233 slab->slab_cache = s;
3234
3235 kasan_poison_slab(slab);
3236
3237 start = slab_address(slab);
3238
3239	setup_slab_debug(s, slab, start);
3240
3241 shuffle = shuffle_freelist(s, slab);
3242
3243 if (!shuffle) {
3244		start = fixup_red_left(s, start);
3245		start = setup_object(s, start);
3246		slab->freelist = start;
3247		for (idx = 0, p = start; idx < slab->objects - 1; idx++) {
3248			next = p + s->size;
3249			next = setup_object(s, next);
3250			set_freepointer(s, p, next);
3251			p = next;
3252		}
3253		set_freepointer(s, p, NULL);
3254 }
3255
3256 return slab;
3257}
3258
3259static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
3260{
3261 if (unlikely(flags & GFP_SLAB_BUG_MASK))
3262 flags = kmalloc_fix_flags(flags);
3263
3264 WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
3265
3266	return allocate_slab(s,
3267		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
3268}
3269
3270static void __free_slab(struct kmem_cache *s, struct slab *slab)
3271{
3272 struct folio *folio = slab_folio(slab);
3273 int order = folio_order(folio);
3274 int pages = 1 << order;
3275
3276 __slab_clear_pfmemalloc(slab);
3277 folio->mapping = NULL;
3278 __folio_clear_slab(folio);
3279 mm_account_reclaimed_pages(pages);
3280 unaccount_slab(slab, order, s);
3281	free_frozen_pages(&folio->page, order);
3282}
3283
3284static void rcu_free_slab(struct rcu_head *h)
3285{
3286 struct slab *slab = container_of(h, struct slab, rcu_head);
3287
3288	__free_slab(slab->slab_cache, slab);
3289}
3290
3291static void free_slab(struct kmem_cache *s, struct slab *slab)
3292{
3293 if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
3294 void *p;
3295
3296 slab_pad_check(s, slab);
3297 for_each_object(p, s, slab_address(slab), slab->objects)
3298			check_object(s, slab, p, SLUB_RED_INACTIVE);
3299 }
3300
3301 if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU))
3302		call_rcu(&slab->rcu_head, rcu_free_slab);
3303 else
3304 __free_slab(s, slab);
3305}
3306
3307static void discard_slab(struct kmem_cache *s, struct slab *slab)
3308{
3309	dec_slabs_node(s, slab_nid(slab), slab->objects);
3310 free_slab(s, slab);
3311}
3312
3313static inline bool slab_test_node_partial(const struct slab *slab)
3314{
3315 return test_bit(SL_partial, &slab->flags.f);
3316}
3317
3318static inline void slab_set_node_partial(struct slab *slab)
3319{
3320	set_bit(SL_partial, &slab->flags.f);
3321}
3322
3323static inline void slab_clear_node_partial(struct slab *slab)
3324{
3325	clear_bit(SL_partial, &slab->flags.f);
3326}
3327
3328/*
3329 * Management of partially allocated slabs.
3330 */
3331static inline void
3332__add_partial(struct kmem_cache_node *n, struct slab *slab, int tail)
3333{
3334 n->nr_partial++;
3335 if (tail == DEACTIVATE_TO_TAIL)
3336		list_add_tail(&slab->slab_list, &n->partial);
3337	else
3338		list_add(&slab->slab_list, &n->partial);
3339 slab_set_node_partial(slab);
3340}
3341
3342static inline void add_partial(struct kmem_cache_node *n,
3343 struct slab *slab, int tail)
3344{
3345 lockdep_assert_held(&n->list_lock);
3346 __add_partial(n, slab, tail);
3347}
3348
3349static inline void remove_partial(struct kmem_cache_node *n,
3350 struct slab *slab)
3351{
3352 lockdep_assert_held(&n->list_lock);
3353	list_del(&slab->slab_list);
3354 slab_clear_node_partial(slab);
3355 n->nr_partial--;
3356}
3357
3358/*
3359 * Called only for kmem_cache_debug() caches instead of remove_partial(), with a
3360 * slab from the n->partial list. Remove only a single object from the slab, do
3361 * the alloc_debug_processing() checks and leave the slab on the list, or move
3362 * it to full list if it was the last free object.
3363 */
3364static void *alloc_single_from_partial(struct kmem_cache *s,
3365 struct kmem_cache_node *n, struct slab *slab, int orig_size)
3366{
3367 void *object;
3368
3369 lockdep_assert_held(&n->list_lock);
3370
3371#ifdef CONFIG_SLUB_DEBUG
3372 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
3373 if (!validate_slab_ptr(slab)) {
3374			slab_err(s, slab, "Not a valid slab page");
3375 return NULL;
3376 }
3377 }
3378#endif
3379
3380 object = slab->freelist;
3381 slab->freelist = get_freepointer(s, object);
3382 slab->inuse++;
3383
3384 if (!alloc_debug_processing(s, slab, object, orig_size)) {
3385 remove_partial(n, slab);
3386 return NULL;
3387 }
3388
3389 if (slab->inuse == slab->objects) {
3390 remove_partial(n, slab);
3391 add_full(s, n, slab);
3392 }
3393
3394 return object;
3395}
3396
3397static void defer_deactivate_slab(struct slab *slab, void *flush_freelist);
3398
3399/*
3400 * Called only for kmem_cache_debug() caches to allocate from a freshly
3401 * allocated slab. Allocate a single object instead of whole freelist
3402 * and put the slab to the partial (or full) list.
3403 */
3404static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab,
3405 int orig_size, gfp_t gfpflags)
3406{
3407	bool allow_spin = gfpflags_allow_spinning(gfpflags);
3408	int nid = slab_nid(slab);
3409	struct kmem_cache_node *n = get_node(s, nid);
3410 unsigned long flags;
3411 void *object;
3412
3413 if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) {
3414 /* Unlucky, discard newly allocated slab */
3415 slab->frozen = 1;
3416 defer_deactivate_slab(slab, NULL);
3417 return NULL;
3418 }
3419
3420 object = slab->freelist;
3421 slab->freelist = get_freepointer(s, object);
3422 slab->inuse = 1;
3423
3424 if (!alloc_debug_processing(s, slab, object, orig_size)) {
3425 /*
3426 * It's not really expected that this would fail on a
3427 * freshly allocated slab, but a concurrent memory
3428 * corruption in theory could cause that.
3429 * Leak memory of allocated slab.
3430 */
3431 if (!allow_spin)
3432			spin_unlock_irqrestore(&n->list_lock, flags);
3433 return NULL;
3434 }
3435
3436 if (allow_spin)
3437 spin_lock_irqsave(&n->list_lock, flags);
3438
3439 if (slab->inuse == slab->objects)
3440 add_full(s, n, slab);
3441 else
3442		add_partial(n, slab, DEACTIVATE_TO_HEAD);
3443
3444	inc_slabs_node(s, nid, slab->objects);
3445	spin_unlock_irqrestore(&n->list_lock, flags);
3446
3447 return object;
3448}
3449
3450#ifdef CONFIG_SLUB_CPU_PARTIAL
3451static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain);
3452#else
3453static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
3454 int drain) { }
3455#endif
3456static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
3457
3458/*
3459 * Try to allocate a partial slab from a specific node.
3460 */
3461static struct slab *get_partial_node(struct kmem_cache *s,
3462 struct kmem_cache_node *n,
3463 struct partial_context *pc)
3464{
3465 struct slab *slab, *slab2, *partial = NULL;
3466 unsigned long flags;
3467 unsigned int partial_slabs = 0;
3468
3469 /*
3470 * Racy check. If we mistakenly see no partial slabs then we
3471 * just allocate an empty slab. If we mistakenly try to get a
3472 * partial slab and there is none available then get_partial()
3473 * will return NULL.
3474 */
3475 if (!n || !n->nr_partial)
3476 return NULL;
3477
3478	if (gfpflags_allow_spinning(pc->flags))
3479 spin_lock_irqsave(&n->list_lock, flags);
3480 else if (!spin_trylock_irqsave(&n->list_lock, flags))
3481 return NULL;
3482 list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
3483		if (!pfmemalloc_match(slab, pc->flags))
3484 continue;
3485
3486 if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
3487 void *object = alloc_single_from_partial(s, n, slab,
3488							pc->orig_size);
3489 if (object) {
3490 partial = slab;
3491 pc->object = object;
3492 break;
3493 }
3494 continue;
3495 }
3496
3497 remove_partial(n, slab);
3498
3499 if (!partial) {
3500 partial = slab;
3501			stat(s, ALLOC_FROM_PARTIAL);
3502
3503 if ((slub_get_cpu_partial(s) == 0)) {
3504 break;
3505 }
3506 } else {
3507			put_cpu_partial(s, slab, 0);
3508			stat(s, CPU_PARTIAL_NODE);
3509
3510 if (++partial_slabs > slub_get_cpu_partial(s) / 2) {
3511 break;
3512 }
3513 }
3514 }
3515	spin_unlock_irqrestore(&n->list_lock, flags);
3516 return partial;
3517}
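/*
 * Note on the return convention above: for normal caches the first suitable
 * slab is returned to become the new cpu slab and up to roughly half of
 * slub_get_cpu_partial() additional slabs are stashed on the cpu partial
 * list; for debug or SLUB_TINY caches a single object is taken instead,
 * returned via pc->object, and the slab stays on the node's lists.
 */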
3518
3519/*
3520 * Get a slab from somewhere. Search in increasing NUMA distances.
3521 */
3522static struct slab *get_any_partial(struct kmem_cache *s,
3523 struct partial_context *pc)
3524{
3525#ifdef CONFIG_NUMA
3526 struct zonelist *zonelist;
3527 struct zoneref *z;
3528 struct zone *zone;
3529	enum zone_type highest_zoneidx = gfp_zone(pc->flags);
3530 struct slab *slab;
3531 unsigned int cpuset_mems_cookie;
3532
3533 /*
3534 * The defrag ratio allows a configuration of the tradeoffs between
3535 * inter node defragmentation and node local allocations. A lower
3536 * defrag_ratio increases the tendency to do local allocations
3537 * instead of attempting to obtain partial slabs from other nodes.
3538 *
3539 * If the defrag_ratio is set to 0 then kmalloc() always
3540 * returns node local objects. If the ratio is higher then kmalloc()
3541 * may return off node objects because partial slabs are obtained
3542 * from other nodes and filled up.
3543 *
3544 * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
3545 * (which makes defrag_ratio = 1000) then every (well almost)
3546 * allocation will first attempt to defrag slab caches on other nodes.
3547 * This means scanning over all nodes to look for partial slabs which
3548 * may be expensive if we do it every time we are trying to find a slab
3549 * with available objects.
3550 */
3551 if (!s->remote_node_defrag_ratio ||
3552 get_cycles() % 1024 > s->remote_node_defrag_ratio)
3553 return NULL;
3554
3555 do {
3556 cpuset_mems_cookie = read_mems_allowed_begin();
3557		zonelist = node_zonelist(mempolicy_slab_node(), pc->flags);
3558 for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
3559 struct kmem_cache_node *n;
3560
3561			n = get_node(s, zone_to_nid(zone));
3562
3563			if (n && cpuset_zone_allowed(zone, pc->flags) &&
3564 n->nr_partial > s->min_partial) {
3565 slab = get_partial_node(s, n, pc);
3566 if (slab) {
3567 /*
3568 * Don't check read_mems_allowed_retry()
3569 * here - if mems_allowed was updated in
3570 * parallel, that was a harmless race
3571 * between allocation and the cpuset
3572 * update
3573 */
3574 return slab;
3575 }
3576 }
3577 }
3578	} while (read_mems_allowed_retry(cpuset_mems_cookie));
3579#endif /* CONFIG_NUMA */
3580 return NULL;
3581}
3582
3583/*
3584 * Get a partial slab, lock it and return it.
3585 */
3586static struct slab *get_partial(struct kmem_cache *s, int node,
3587 struct partial_context *pc)
3588{
3589 struct slab *slab;
3590 int searchnode = node;
3591
3592 if (node == NUMA_NO_NODE)
3593 searchnode = numa_mem_id();
3594
3595	slab = get_partial_node(s, get_node(s, searchnode), pc);
3596 if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE)))
3597 return slab;
3598
3599 return get_any_partial(s, pc);
3600}
3601
3602#ifndef CONFIG_SLUB_TINY
3603
3604#ifdef CONFIG_PREEMPTION
3605/*
3606 * Calculate the next globally unique transaction for disambiguation
3607 * during cmpxchg. The transactions start with the cpu number and are then
3608 * incremented by CONFIG_NR_CPUS.
3609 */
3610#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS)
3611#else
3612/*
3613 * No preemption supported therefore also no need to check for
3614 * different cpus.
3615 */
3616#define TID_STEP 1
3617#endif /* CONFIG_PREEMPTION */
3618
3619static inline unsigned long next_tid(unsigned long tid)
3620{
3621 return tid + TID_STEP;
3622}
3623
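/*
 * A hedged example of the tid encoding with preemption enabled: TID_STEP is
 * the possible cpu count rounded up to a power of two, so with
 * CONFIG_NR_CPUS == 6 we get TID_STEP == 8 and cpu 3 issues tids 3, 11,
 * 19, ... The low bits thus identify the cpu (tid % TID_STEP) and the high
 * bits count events on that cpu (tid / TID_STEP), which is what the debug
 * helpers below decode.
 */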
3624#ifdef SLUB_DEBUG_CMPXCHG
3625static inline unsigned int tid_to_cpu(unsigned long tid)
3626{
3627 return tid % TID_STEP;
3628}
3629
3630static inline unsigned long tid_to_event(unsigned long tid)
3631{
3632 return tid / TID_STEP;
3633}
3634#endif
3635
3636static inline unsigned int init_tid(int cpu)
3637{
3638 return cpu;
3639}
3640
3641static inline void note_cmpxchg_failure(const char *n,
3642 const struct kmem_cache *s, unsigned long tid)
3643{
3644#ifdef SLUB_DEBUG_CMPXCHG
3645 unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
3646
3647 pr_info("%s %s: cmpxchg redo ", n, s->name);
3648
3649 if (IS_ENABLED(CONFIG_PREEMPTION) &&
3650 tid_to_cpu(tid) != tid_to_cpu(actual_tid)) {
3651 pr_warn("due to cpu change %d -> %d\n",
3652 tid_to_cpu(tid), tid_to_cpu(actual_tid));
3653 } else if (tid_to_event(tid) != tid_to_event(actual_tid)) {
3654 pr_warn("due to cpu running other code. Event %ld->%ld\n",
3655 tid_to_event(tid), tid_to_event(actual_tid));
3656 } else {
3657 pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
3658 actual_tid, tid, next_tid(tid));
3659 }
3660#endif
3661	stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
3662}
3663
3664static void init_kmem_cache_cpus(struct kmem_cache *s)
3665{
3666#ifdef CONFIG_PREEMPT_RT
3667 /*
3668 * Register lockdep key for non-boot kmem caches to avoid
3669 * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key()
3670 */
3671 bool finegrain_lockdep = !init_section_contains(s, 1);
3672#else
3673 /*
3674 * Don't bother with different lockdep classes for each
3675 * kmem_cache, since we only use local_trylock_irqsave().
3676 */
3677 bool finegrain_lockdep = false;
3678#endif
3679 int cpu;
3680 struct kmem_cache_cpu *c;
3681
3682 if (finegrain_lockdep)
3683		lockdep_register_key(&s->lock_key);
3684 for_each_possible_cpu(cpu) {
3685 c = per_cpu_ptr(s->cpu_slab, cpu);
3686 local_trylock_init(&c->lock);
3687 if (finegrain_lockdep)
3688 lockdep_set_class(&c->lock, &s->lock_key);
3689 c->tid = init_tid(cpu);
3690 }
3691}
3692
3693/*
3694 * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist,
3695 * unfreezes the slab and puts it on the proper list.
3696 * Assumes the slab has been already safely taken away from kmem_cache_cpu
3697 * by the caller.
3698 */
3699static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
3700 void *freelist)
3701{
3702	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
3703 int free_delta = 0;
3704 void *nextfree, *freelist_iter, *freelist_tail;
3705 int tail = DEACTIVATE_TO_HEAD;
3706 unsigned long flags = 0;
3707 struct slab new;
3708 struct slab old;
3709
3710 if (READ_ONCE(slab->freelist)) {
3711		stat(s, DEACTIVATE_REMOTE_FREES);
3712 tail = DEACTIVATE_TO_TAIL;
3713 }
3714
3715 /*
3716 * Stage one: Count the objects on cpu's freelist as free_delta and
3717 * remember the last object in freelist_tail for later splicing.
3718 */
3719 freelist_tail = NULL;
3720 freelist_iter = freelist;
3721 while (freelist_iter) {
3722		nextfree = get_freepointer(s, freelist_iter);
3723
3724 /*
3725 * If 'nextfree' is invalid, it is possible that the object at
3726 * 'freelist_iter' is already corrupted. So isolate all objects
3727 * starting at 'freelist_iter' by skipping them.
3728 */
3729		if (freelist_corrupted(s, slab, &freelist_iter, nextfree))
3730 break;
3731
3732 freelist_tail = freelist_iter;
3733 free_delta++;
3734
3735 freelist_iter = nextfree;
3736 }
3737
3738 /*
3739 * Stage two: Unfreeze the slab while splicing the per-cpu
3740 * freelist to the head of slab's freelist.
3741 */
3742 do {
3743 old.freelist = READ_ONCE(slab->freelist);
3744 old.counters = READ_ONCE(slab->counters);
3745 VM_BUG_ON(!old.frozen);
3746
3747 /* Determine target state of the slab */
3748 new.counters = old.counters;
3749 new.frozen = 0;
3750 if (freelist_tail) {
3751 new.inuse -= free_delta;
3752			set_freepointer(s, freelist_tail, old.freelist);
3753 new.freelist = freelist;
3754 } else {
3755 new.freelist = old.freelist;
3756 }
3757	} while (!slab_update_freelist(s, slab,
3758				old.freelist, old.counters,
3759				new.freelist, new.counters,
3760				"unfreezing slab"));
3761
3762 /*
3763 * Stage three: Manipulate the slab list based on the updated state.
3764 */
3765 if (!new.inuse && n->nr_partial >= s->min_partial) {
3766		stat(s, DEACTIVATE_EMPTY);
3767		discard_slab(s, slab);
3768		stat(s, FREE_SLAB);
3769 } else if (new.freelist) {
3770 spin_lock_irqsave(&n->list_lock, flags);
3771 add_partial(n, slab, tail);
3772		spin_unlock_irqrestore(&n->list_lock, flags);
3773		stat(s, tail);
3774 } else {
3775		stat(s, DEACTIVATE_FULL);
3776 }
3777}
3778
3779/*
3780 * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock
3781 * can be acquired without a deadlock before invoking the function.
3782 *
3783 * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is
3784 * using local_lock_is_locked() properly before calling local_lock_cpu_slab(),
3785 * and kmalloc() is not used in an unsupported context.
3786 *
3787 * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave().
3788 * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but
3789 * lockdep_assert() will catch a bug in case:
3790 * #1
3791 * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock()
3792 * or
3793 * #2
3794 * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock()
3795 *
3796 * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt
3797 * disabled context. The lock will always be acquired and, if needed, it
3798 * will block and sleep until the lock is available.
3799 * #1 is possible in !PREEMPT_RT only.
3800 * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock:
3801 * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) ->
3802 * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B)
3803 *
3804 * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B
3805 */
3806#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP)
3807#define local_lock_cpu_slab(s, flags) \
3808 local_lock_irqsave(&(s)->cpu_slab->lock, flags)
3809#else
3810#define local_lock_cpu_slab(s, flags) \
3811 do { \
3812 bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \
3813 lockdep_assert(__l); \
3814 } while (0)
3815#endif
3816
3817#define local_unlock_cpu_slab(s, flags) \
3818 local_unlock_irqrestore(&(s)->cpu_slab->lock, flags)
3819
3820#ifdef CONFIG_SLUB_CPU_PARTIAL
3821static void __put_partials(struct kmem_cache *s, struct slab *partial_slab)
3822{
3823 struct kmem_cache_node *n = NULL, *n2 = NULL;
3824 struct slab *slab, *slab_to_discard = NULL;
3825 unsigned long flags = 0;
3826
3827 while (partial_slab) {
3828 slab = partial_slab;
3829 partial_slab = slab->next;
3830
3831 n2 = get_node(s, node: slab_nid(slab));
3832 if (n != n2) {
3833 if (n)
3834 spin_unlock_irqrestore(lock: &n->list_lock, flags);
3835
3836 n = n2;
3837 spin_lock_irqsave(&n->list_lock, flags);
3838 }
3839
3840 if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) {
3841 slab->next = slab_to_discard;
3842 slab_to_discard = slab;
3843 } else {
3844 add_partial(n, slab, tail: DEACTIVATE_TO_TAIL);
3845 stat(s, si: FREE_ADD_PARTIAL);
3846 }
3847 }
3848
3849 if (n)
3850 spin_unlock_irqrestore(lock: &n->list_lock, flags);
3851
3852 while (slab_to_discard) {
3853 slab = slab_to_discard;
3854 slab_to_discard = slab_to_discard->next;
3855
3856 stat(s, si: DEACTIVATE_EMPTY);
3857 discard_slab(s, slab);
3858 stat(s, si: FREE_SLAB);
3859 }
3860}
3861
3862/*
3863 * Put all the cpu partial slabs to the node partial list.
3864 */
3865static void put_partials(struct kmem_cache *s)
3866{
3867 struct slab *partial_slab;
3868 unsigned long flags;
3869
3870 local_lock_irqsave(&s->cpu_slab->lock, flags);
3871 partial_slab = this_cpu_read(s->cpu_slab->partial);
3872 this_cpu_write(s->cpu_slab->partial, NULL);
3873 local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3874
3875 if (partial_slab)
3876 __put_partials(s, partial_slab);
3877}
3878
3879static void put_partials_cpu(struct kmem_cache *s,
3880 struct kmem_cache_cpu *c)
3881{
3882 struct slab *partial_slab;
3883
3884 partial_slab = slub_percpu_partial(c);
3885 c->partial = NULL;
3886
3887 if (partial_slab)
3888 __put_partials(s, partial_slab);
3889}
3890
3891/*
3892 * Put a slab into a partial slab slot if available.
3893 *
3894 * If we did not find a slot then simply move all the partials to the
3895 * per node partial list.
3896 */
3897static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain)
3898{
3899 struct slab *oldslab;
3900 struct slab *slab_to_put = NULL;
3901 unsigned long flags;
3902 int slabs = 0;
3903
3904 local_lock_cpu_slab(s, flags);
3905
3906 oldslab = this_cpu_read(s->cpu_slab->partial);
3907
3908 if (oldslab) {
3909 if (drain && oldslab->slabs >= s->cpu_partial_slabs) {
3910 /*
3911 * Partial array is full. Move the existing set to the
3912 * per node partial list. Postpone the actual unfreezing
3913 * outside of the critical section.
3914 */
3915 slab_to_put = oldslab;
3916 oldslab = NULL;
3917 } else {
3918 slabs = oldslab->slabs;
3919 }
3920 }
3921
3922 slabs++;
3923
3924 slab->slabs = slabs;
3925 slab->next = oldslab;
3926
3927 this_cpu_write(s->cpu_slab->partial, slab);
3928
3929 local_unlock_cpu_slab(s, flags);
3930
3931 if (slab_to_put) {
3932 __put_partials(s, partial_slab: slab_to_put);
3933 stat(s, si: CPU_PARTIAL_DRAIN);
3934 }
3935}
3936
3937#else /* CONFIG_SLUB_CPU_PARTIAL */
3938
3939static inline void put_partials(struct kmem_cache *s) { }
3940static inline void put_partials_cpu(struct kmem_cache *s,
3941 struct kmem_cache_cpu *c) { }
3942
3943#endif /* CONFIG_SLUB_CPU_PARTIAL */
3944
3945static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
3946{
3947 unsigned long flags;
3948 struct slab *slab;
3949 void *freelist;
3950
3951 local_lock_irqsave(&s->cpu_slab->lock, flags);
3952
3953 slab = c->slab;
3954 freelist = c->freelist;
3955
3956 c->slab = NULL;
3957 c->freelist = NULL;
3958 c->tid = next_tid(tid: c->tid);
3959
3960 local_unlock_irqrestore(&s->cpu_slab->lock, flags);
3961
3962 if (slab) {
3963 deactivate_slab(s, slab, freelist);
3964 stat(s, si: CPUSLAB_FLUSH);
3965 }
3966}
3967
3968static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
3969{
3970 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
3971 void *freelist = c->freelist;
3972 struct slab *slab = c->slab;
3973
3974 c->slab = NULL;
3975 c->freelist = NULL;
3976 c->tid = next_tid(tid: c->tid);
3977
3978 if (slab) {
3979 deactivate_slab(s, slab, freelist);
3980 stat(s, si: CPUSLAB_FLUSH);
3981 }
3982
3983 put_partials_cpu(s, c);
3984}
3985
3986static inline void flush_this_cpu_slab(struct kmem_cache *s)
3987{
3988 struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
3989
3990 if (c->slab)
3991 flush_slab(s, c);
3992
3993 put_partials(s);
3994}
3995
3996static bool has_cpu_slab(int cpu, struct kmem_cache *s)
3997{
3998 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
3999
4000 return c->slab || slub_percpu_partial(c);
4001}
4002
4003#else /* CONFIG_SLUB_TINY */
4004static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { }
4005static inline bool has_cpu_slab(int cpu, struct kmem_cache *s) { return false; }
4006static inline void flush_this_cpu_slab(struct kmem_cache *s) { }
4007#endif /* CONFIG_SLUB_TINY */
4008
4009static bool has_pcs_used(int cpu, struct kmem_cache *s)
4010{
4011 struct slub_percpu_sheaves *pcs;
4012
4013 if (!s->cpu_sheaves)
4014 return false;
4015
4016 pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
4017
4018 return (pcs->spare || pcs->rcu_free || pcs->main->size);
4019}
4020
4021/*
4022 * Flush cpu slab.
4023 *
4024 * Called from CPU work handler with migration disabled.
4025 */
4026static void flush_cpu_slab(struct work_struct *w)
4027{
4028 struct kmem_cache *s;
4029 struct slub_flush_work *sfw;
4030
4031 sfw = container_of(w, struct slub_flush_work, work);
4032
4033 s = sfw->s;
4034
4035 if (s->cpu_sheaves)
4036 pcs_flush_all(s);
4037
4038 flush_this_cpu_slab(s);
4039}
4040
4041static void flush_all_cpus_locked(struct kmem_cache *s)
4042{
4043 struct slub_flush_work *sfw;
4044 unsigned int cpu;
4045
4046 lockdep_assert_cpus_held();
4047 mutex_lock(lock: &flush_lock);
4048
4049 for_each_online_cpu(cpu) {
4050 sfw = &per_cpu(slub_flush, cpu);
4051 if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) {
4052 sfw->skip = true;
4053 continue;
4054 }
4055 INIT_WORK(&sfw->work, flush_cpu_slab);
4056 sfw->skip = false;
4057 sfw->s = s;
4058 queue_work_on(cpu, wq: flushwq, work: &sfw->work);
4059 }
4060
4061 for_each_online_cpu(cpu) {
4062 sfw = &per_cpu(slub_flush, cpu);
4063 if (sfw->skip)
4064 continue;
4065 flush_work(work: &sfw->work);
4066 }
4067
4068 mutex_unlock(lock: &flush_lock);
4069}
4070
4071static void flush_all(struct kmem_cache *s)
4072{
4073 cpus_read_lock();
4074 flush_all_cpus_locked(s);
4075 cpus_read_unlock();
4076}
4077
4078static void flush_rcu_sheaf(struct work_struct *w)
4079{
4080 struct slub_percpu_sheaves *pcs;
4081 struct slab_sheaf *rcu_free;
4082 struct slub_flush_work *sfw;
4083 struct kmem_cache *s;
4084
4085 sfw = container_of(w, struct slub_flush_work, work);
4086 s = sfw->s;
4087
4088 local_lock(&s->cpu_sheaves->lock);
4089 pcs = this_cpu_ptr(s->cpu_sheaves);
4090
4091 rcu_free = pcs->rcu_free;
4092 pcs->rcu_free = NULL;
4093
4094 local_unlock(&s->cpu_sheaves->lock);
4095
4096 if (rcu_free)
4097 call_rcu(head: &rcu_free->rcu_head, func: rcu_free_sheaf_nobarn);
4098}
4099
4101/* needed for kvfree_rcu_barrier() */
4102void flush_all_rcu_sheaves(void)
4103{
4104 struct slub_flush_work *sfw;
4105 struct kmem_cache *s;
4106 unsigned int cpu;
4107
4108 cpus_read_lock();
4109 mutex_lock(lock: &slab_mutex);
4110
4111 list_for_each_entry(s, &slab_caches, list) {
4112 if (!s->cpu_sheaves)
4113 continue;
4114
4115 mutex_lock(lock: &flush_lock);
4116
4117 for_each_online_cpu(cpu) {
4118 sfw = &per_cpu(slub_flush, cpu);
4119
4120 /*
4121			 * We don't check if an rcu_free sheaf exists - a racing
4122			 * __kfree_rcu_sheaf() might have just removed it.
4123			 * By executing flush_rcu_sheaf() on the cpu we make
4124			 * sure the __kfree_rcu_sheaf() has finished its call_rcu().
4125 */
4126
4127 INIT_WORK(&sfw->work, flush_rcu_sheaf);
4128 sfw->s = s;
4129 queue_work_on(cpu, wq: flushwq, work: &sfw->work);
4130 }
4131
4132 for_each_online_cpu(cpu) {
4133 sfw = &per_cpu(slub_flush, cpu);
4134 flush_work(work: &sfw->work);
4135 }
4136
4137 mutex_unlock(lock: &flush_lock);
4138 }
4139
4140 mutex_unlock(lock: &slab_mutex);
4141 cpus_read_unlock();
4142
4143 rcu_barrier();
4144}
4145
4146/*
4147 * Use the cpu notifier to ensure that the cpu slabs are flushed when
4148 * necessary.
4149 */
4150static int slub_cpu_dead(unsigned int cpu)
4151{
4152 struct kmem_cache *s;
4153
4154 mutex_lock(lock: &slab_mutex);
4155 list_for_each_entry(s, &slab_caches, list) {
4156 __flush_cpu_slab(s, cpu);
4157 if (s->cpu_sheaves)
4158 __pcs_flush_all_cpu(s, cpu);
4159 }
4160 mutex_unlock(lock: &slab_mutex);
4161 return 0;
4162}
4163
4164/*
4165 * Check if the objects in a per cpu structure fit numa
4166 * locality expectations.
4167 */
4168static inline int node_match(struct slab *slab, int node)
4169{
4170#ifdef CONFIG_NUMA
4171 if (node != NUMA_NO_NODE && slab_nid(slab) != node)
4172 return 0;
4173#endif
4174 return 1;
4175}
4176
4177#ifdef CONFIG_SLUB_DEBUG
4178static int count_free(struct slab *slab)
4179{
4180 return slab->objects - slab->inuse;
4181}
4182
4183static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
4184{
4185 return atomic_long_read(v: &n->total_objects);
4186}
4187
4188/* Supports checking bulk free of a constructed freelist */
4189static inline bool free_debug_processing(struct kmem_cache *s,
4190 struct slab *slab, void *head, void *tail, int *bulk_cnt,
4191 unsigned long addr, depot_stack_handle_t handle)
4192{
4193 bool checks_ok = false;
4194 void *object = head;
4195 int cnt = 0;
4196
4197 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
4198 if (!check_slab(s, slab))
4199 goto out;
4200 }
4201
4202 if (slab->inuse < *bulk_cnt) {
4203 slab_err(s, slab, fmt: "Slab has %d allocated objects but %d are to be freed\n",
4204 slab->inuse, *bulk_cnt);
4205 goto out;
4206 }
4207
4208next_object:
4209
4210 if (++cnt > *bulk_cnt)
4211 goto out_cnt;
4212
4213 if (s->flags & SLAB_CONSISTENCY_CHECKS) {
4214 if (!free_consistency_checks(s, slab, object, addr))
4215 goto out;
4216 }
4217
4218 if (s->flags & SLAB_STORE_USER)
4219 set_track_update(s, object, alloc: TRACK_FREE, addr, handle);
4220 trace(s, slab, object, alloc: 0);
4221 /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
4222 init_object(s, object, SLUB_RED_INACTIVE);
4223
4224 /* Reached end of constructed freelist yet? */
4225 if (object != tail) {
4226 object = get_freepointer(s, object);
4227 goto next_object;
4228 }
4229 checks_ok = true;
4230
4231out_cnt:
4232 if (cnt != *bulk_cnt) {
4233 slab_err(s, slab, fmt: "Bulk free expected %d objects but found %d\n",
4234 *bulk_cnt, cnt);
4235 *bulk_cnt = cnt;
4236 }
4237
4238out:
4239
4240 if (!checks_ok)
4241 slab_fix(s, fmt: "Object at 0x%p not freed", object);
4242
4243 return checks_ok;
4244}
4245#endif /* CONFIG_SLUB_DEBUG */
4246
4247#if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS)
4248static unsigned long count_partial(struct kmem_cache_node *n,
4249 int (*get_count)(struct slab *))
4250{
4251 unsigned long flags;
4252 unsigned long x = 0;
4253 struct slab *slab;
4254
4255 spin_lock_irqsave(&n->list_lock, flags);
4256 list_for_each_entry(slab, &n->partial, slab_list)
4257 x += get_count(slab);
4258 spin_unlock_irqrestore(lock: &n->list_lock, flags);
4259 return x;
4260}
4261#endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */
4262
4263#ifdef CONFIG_SLUB_DEBUG
4264#define MAX_PARTIAL_TO_SCAN 10000
4265
4266static unsigned long count_partial_free_approx(struct kmem_cache_node *n)
4267{
4268 unsigned long flags;
4269 unsigned long x = 0;
4270 struct slab *slab;
4271
4272 spin_lock_irqsave(&n->list_lock, flags);
4273 if (n->nr_partial <= MAX_PARTIAL_TO_SCAN) {
4274 list_for_each_entry(slab, &n->partial, slab_list)
4275 x += slab->objects - slab->inuse;
4276 } else {
4277 /*
4278 * For a long list, approximate the total count of objects in
4279 * it to meet the limit on the number of slabs to scan.
4280 * Scan from both the list's head and tail for better accuracy.
4281 */
4282 unsigned long scanned = 0;
4283
4284 list_for_each_entry(slab, &n->partial, slab_list) {
4285 x += slab->objects - slab->inuse;
4286 if (++scanned == MAX_PARTIAL_TO_SCAN / 2)
4287 break;
4288 }
4289 list_for_each_entry_reverse(slab, &n->partial, slab_list) {
4290 x += slab->objects - slab->inuse;
4291 if (++scanned == MAX_PARTIAL_TO_SCAN)
4292 break;
4293 }
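		/*
		 * Extrapolate from the scanned sample: if 'scanned' slabs held
		 * 'x' free objects, the whole list is estimated to hold
		 * x * nr_partial / scanned free objects (e.g. 4000 free objects
		 * found in 10000 scanned slabs out of 50000 partial slabs
		 * extrapolates to ~20000), clamped below to the node's total
		 * object count.
		 */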
4294 x = mult_frac(x, n->nr_partial, scanned);
4295 x = min(x, node_nr_objs(n));
4296 }
4297 spin_unlock_irqrestore(lock: &n->list_lock, flags);
4298 return x;
4299}
4300
4301static noinline void
4302slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
4303{
4304 static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
4305 DEFAULT_RATELIMIT_BURST);
4306 int cpu = raw_smp_processor_id();
4307 int node;
4308 struct kmem_cache_node *n;
4309
4310 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
4311 return;
4312
4313 pr_warn("SLUB: Unable to allocate memory on CPU %u (of node %d) on node %d, gfp=%#x(%pGg)\n",
4314 cpu, cpu_to_node(cpu), nid, gfpflags, &gfpflags);
4315 pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n",
4316 s->name, s->object_size, s->size, oo_order(s->oo),
4317 oo_order(s->min));
4318
4319 if (oo_order(x: s->min) > get_order(size: s->object_size))
4320 pr_warn(" %s debugging increased min order, use slab_debug=O to disable.\n",
4321 s->name);
4322
4323 for_each_kmem_cache_node(s, node, n) {
4324 unsigned long nr_slabs;
4325 unsigned long nr_objs;
4326 unsigned long nr_free;
4327
4328 nr_free = count_partial_free_approx(n);
4329 nr_slabs = node_nr_slabs(n);
4330 nr_objs = node_nr_objs(n);
4331
4332 pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n",
4333 node, nr_slabs, nr_objs, nr_free);
4334 }
4335}
4336#else /* CONFIG_SLUB_DEBUG */
4337static inline void
4338slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { }
4339#endif
4340
4341static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
4342{
4343 if (unlikely(slab_test_pfmemalloc(slab)))
4344 return gfp_pfmemalloc_allowed(gfp_mask: gfpflags);
4345
4346 return true;
4347}
4348
4349#ifndef CONFIG_SLUB_TINY
4350static inline bool
4351__update_cpu_freelist_fast(struct kmem_cache *s,
4352 void *freelist_old, void *freelist_new,
4353 unsigned long tid)
4354{
4355 freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
4356 freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };
4357
4358 return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid.full,
4359 &old.full, new.full);
4360}
4361
4362/*
4363 * Check the slab->freelist and either transfer the freelist to the
4364 * per cpu freelist or deactivate the slab.
4365 *
4366 * The slab is still frozen if the return value is not NULL.
4367 *
4368 * If this function returns NULL then the slab has been unfrozen.
4369 */
4370static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
4371{
4372 struct slab new;
4373 unsigned long counters;
4374 void *freelist;
4375
4376 lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
4377
4378 do {
4379 freelist = slab->freelist;
4380 counters = slab->counters;
4381
4382 new.counters = counters;
4383
4384 new.inuse = slab->objects;
4385 new.frozen = freelist != NULL;
4386
4387 } while (!__slab_update_freelist(s, slab,
4388 freelist_old: freelist, counters_old: counters,
4389 NULL, counters_new: new.counters,
4390 n: "get_freelist"));
4391
4392 return freelist;
4393}
4394
4395/*
4396 * Freeze the partial slab and return the pointer to the freelist.
4397 */
4398static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab)
4399{
4400 struct slab new;
4401 unsigned long counters;
4402 void *freelist;
4403
4404 do {
4405 freelist = slab->freelist;
4406 counters = slab->counters;
4407
4408 new.counters = counters;
4409 VM_BUG_ON(new.frozen);
4410
4411 new.inuse = slab->objects;
4412 new.frozen = 1;
4413
4414 } while (!slab_update_freelist(s, slab,
4415 freelist_old: freelist, counters_old: counters,
4416 NULL, counters_new: new.counters,
4417 n: "freeze_slab"));
4418
4419 return freelist;
4420}
4421
4422/*
4423 * Slow path. The lockless freelist is empty or we need to perform
4424 * debugging duties.
4425 *
4426 * Processing is still very fast if new objects have been freed to the
4427 * regular freelist. In that case we simply take over the regular freelist
4428 * as the lockless freelist and zap the regular freelist.
4429 *
4430 * If that is not working then we fall back to the partial lists. We take the
4431 * first element of the freelist as the object to allocate now and move the
4432 * rest of the freelist to the lockless freelist.
4433 *
4434 * And if we were unable to get a new slab from the partial slab lists then
4435 * we need to allocate a new slab. This is the slowest path since it involves
4436 * a call to the page allocator and the setup of a new slab.
4437 *
4438 * Version of __slab_alloc to use when we know that preemption is
4439 * already disabled (which is the case for bulk allocation).
4440 */
4441static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
4442 unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
4443{
4444 bool allow_spin = gfpflags_allow_spinning(gfp_flags: gfpflags);
4445 void *freelist;
4446 struct slab *slab;
4447 unsigned long flags;
4448 struct partial_context pc;
4449 bool try_thisnode = true;
4450
4451 stat(s, si: ALLOC_SLOWPATH);
4452
4453reread_slab:
4454
4455 slab = READ_ONCE(c->slab);
4456 if (!slab) {
4457 /*
4458 * if the node is not online or has no normal memory, just
4459 * ignore the node constraint
4460 */
4461 if (unlikely(node != NUMA_NO_NODE &&
4462 !node_isset(node, slab_nodes)))
4463 node = NUMA_NO_NODE;
4464 goto new_slab;
4465 }
4466
4467 if (unlikely(!node_match(slab, node))) {
4468 /*
4469 * same as above but node_match() being false already
4470 * implies node != NUMA_NO_NODE.
4471 *
4472 * We don't strictly honor pfmemalloc and NUMA preferences
4473 * when !allow_spin because:
4474 *
4475 * 1. Most kmalloc() users allocate objects on the local node,
4476 * so kmalloc_nolock() tries not to interfere with them by
4477 * deactivating the cpu slab.
4478 *
4479 * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause
4480 * unnecessary slab allocations even when n->partial list
4481 * is not empty.
4482 */
4483 if (!node_isset(node, slab_nodes) ||
4484 !allow_spin) {
4485 node = NUMA_NO_NODE;
4486 } else {
4487 stat(s, si: ALLOC_NODE_MISMATCH);
4488 goto deactivate_slab;
4489 }
4490 }
4491
4492 /*
4493 * By rights, we should be searching for a slab page that was
4494 * PFMEMALLOC but right now, we are losing the pfmemalloc
4495 * information when the page leaves the per-cpu allocator
4496 */
4497 if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin))
4498 goto deactivate_slab;
4499
4500 /* must check again c->slab in case we got preempted and it changed */
4501 local_lock_cpu_slab(s, flags);
4502
4503 if (unlikely(slab != c->slab)) {
4504 local_unlock_cpu_slab(s, flags);
4505 goto reread_slab;
4506 }
4507 freelist = c->freelist;
4508 if (freelist)
4509 goto load_freelist;
4510
4511 freelist = get_freelist(s, slab);
4512
4513 if (!freelist) {
4514 c->slab = NULL;
4515 c->tid = next_tid(tid: c->tid);
4516 local_unlock_cpu_slab(s, flags);
4517 stat(s, si: DEACTIVATE_BYPASS);
4518 goto new_slab;
4519 }
4520
4521 stat(s, si: ALLOC_REFILL);
4522
4523load_freelist:
4524
4525 lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
4526
4527 /*
4528 * freelist is pointing to the list of objects to be used.
4529 * slab is pointing to the slab from which the objects are obtained.
4530 * That slab must be frozen for per cpu allocations to work.
4531 */
4532 VM_BUG_ON(!c->slab->frozen);
4533 c->freelist = get_freepointer(s, object: freelist);
4534 c->tid = next_tid(tid: c->tid);
4535 local_unlock_cpu_slab(s, flags);
4536 return freelist;
4537
4538deactivate_slab:
4539
4540 local_lock_cpu_slab(s, flags);
4541 if (slab != c->slab) {
4542 local_unlock_cpu_slab(s, flags);
4543 goto reread_slab;
4544 }
4545 freelist = c->freelist;
4546 c->slab = NULL;
4547 c->freelist = NULL;
4548 c->tid = next_tid(tid: c->tid);
4549 local_unlock_cpu_slab(s, flags);
4550 deactivate_slab(s, slab, freelist);
4551
4552new_slab:
4553
4554#ifdef CONFIG_SLUB_CPU_PARTIAL
4555 while (slub_percpu_partial(c)) {
4556 local_lock_cpu_slab(s, flags);
4557 if (unlikely(c->slab)) {
4558 local_unlock_cpu_slab(s, flags);
4559 goto reread_slab;
4560 }
4561 if (unlikely(!slub_percpu_partial(c))) {
4562 local_unlock_cpu_slab(s, flags);
4563 /* we were preempted and partial list got empty */
4564 goto new_objects;
4565 }
4566
4567 slab = slub_percpu_partial(c);
4568 slub_set_percpu_partial(c, slab);
4569
4570 if (likely(node_match(slab, node) &&
4571 pfmemalloc_match(slab, gfpflags)) ||
4572 !allow_spin) {
4573 c->slab = slab;
4574 freelist = get_freelist(s, slab);
4575 VM_BUG_ON(!freelist);
4576 stat(s, si: CPU_PARTIAL_ALLOC);
4577 goto load_freelist;
4578 }
4579
4580 local_unlock_cpu_slab(s, flags);
4581
4582 slab->next = NULL;
4583 __put_partials(s, partial_slab: slab);
4584 }
4585#endif
4586
4587new_objects:
4588
4589 pc.flags = gfpflags;
4590 /*
4591 * When a preferred node is indicated but no __GFP_THISNODE
4592 *
4593 * 1) try to get a partial slab from target node only by having
4594 * __GFP_THISNODE in pc.flags for get_partial()
4595 * 2) if 1) failed, try to allocate a new slab from the target node with
4596 * GFP_NOWAIT | __GFP_THISNODE opportunistically
4597 * 3) if 2) failed, retry with the original gfpflags, which will allow
4598 * get_partial() to try partial lists of other nodes before potentially
4599 * allocating a new page from other nodes
4600 */
4601 if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
4602 && try_thisnode)) {
4603 if (unlikely(!allow_spin))
4604 /* Do not upgrade gfp to NOWAIT from more restrictive mode */
4605 pc.flags = gfpflags | __GFP_THISNODE;
4606 else
4607 pc.flags = GFP_NOWAIT | __GFP_THISNODE;
4608 }
4609
4610 pc.orig_size = orig_size;
4611 slab = get_partial(s, node, pc: &pc);
4612 if (slab) {
4613 if (kmem_cache_debug(s)) {
4614 freelist = pc.object;
4615 /*
4616 * For debug caches here we had to go through
4617 * alloc_single_from_partial() so just store the
4618 * tracking info and return the object.
4619 *
4620 * Due to disabled preemption we need to disallow
4621 * blocking. The flags are further adjusted by
4622 * gfp_nested_mask() in stack_depot itself.
4623 */
4624 if (s->flags & SLAB_STORE_USER)
4625 set_track(s, object: freelist, alloc: TRACK_ALLOC, addr,
4626 gfp_flags: gfpflags & ~(__GFP_DIRECT_RECLAIM));
4627
4628 return freelist;
4629 }
4630
4631 freelist = freeze_slab(s, slab);
4632 goto retry_load_slab;
4633 }
4634
4635 slub_put_cpu_ptr(s->cpu_slab);
4636 slab = new_slab(s, flags: pc.flags, node);
4637 c = slub_get_cpu_ptr(s->cpu_slab);
4638
4639 if (unlikely(!slab)) {
4640 if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
4641 && try_thisnode) {
4642 try_thisnode = false;
4643 goto new_objects;
4644 }
4645 slab_out_of_memory(s, gfpflags, nid: node);
4646 return NULL;
4647 }
4648
4649 stat(s, si: ALLOC_SLAB);
4650
4651 if (kmem_cache_debug(s)) {
4652 freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
4653
4654 if (unlikely(!freelist))
4655 goto new_objects;
4656
4657 if (s->flags & SLAB_STORE_USER)
4658 set_track(s, object: freelist, alloc: TRACK_ALLOC, addr,
4659 gfp_flags: gfpflags & ~(__GFP_DIRECT_RECLAIM));
4660
4661 return freelist;
4662 }
4663
4664 /*
4665 * No other reference to the slab yet so we can
4666 * muck around with it freely without cmpxchg
4667 */
4668 freelist = slab->freelist;
4669 slab->freelist = NULL;
4670 slab->inuse = slab->objects;
4671 slab->frozen = 1;
4672
4673 inc_slabs_node(s, node: slab_nid(slab), objects: slab->objects);
4674
4675 if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) {
4676 /*
4677 * For !pfmemalloc_match() case we don't load freelist so that
4678 * we don't make further mismatched allocations easier.
4679 */
4680 deactivate_slab(s, slab, freelist: get_freepointer(s, object: freelist));
4681 return freelist;
4682 }
4683
4684retry_load_slab:
4685
4686 local_lock_cpu_slab(s, flags);
4687 if (unlikely(c->slab)) {
4688 void *flush_freelist = c->freelist;
4689 struct slab *flush_slab = c->slab;
4690
4691 c->slab = NULL;
4692 c->freelist = NULL;
4693 c->tid = next_tid(tid: c->tid);
4694
4695 local_unlock_cpu_slab(s, flags);
4696
4697 if (unlikely(!allow_spin)) {
4698 /* Reentrant slub cannot take locks, defer */
4699 defer_deactivate_slab(slab: flush_slab, flush_freelist);
4700 } else {
4701 deactivate_slab(s, slab: flush_slab, freelist: flush_freelist);
4702 }
4703
4704 stat(s, si: CPUSLAB_FLUSH);
4705
4706 goto retry_load_slab;
4707 }
4708 c->slab = slab;
4709
4710 goto load_freelist;
4711}
4712/*
4713 * We disallow kprobes in ___slab_alloc() to prevent reentrance
4714 *
4715 * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of
4716 * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf ->
4717 * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast()
4718 * manipulating c->freelist without lock.
4719 *
4720 * This does not prevent kprobe in functions called from ___slab_alloc() such as
4721 * local_lock_irqsave() itself, and that is fine, we only need to protect the
4722 * c->freelist manipulation in ___slab_alloc() itself.
4723 */
4724NOKPROBE_SYMBOL(___slab_alloc);
4725
4726/*
4727 * A wrapper for ___slab_alloc() for contexts where preemption is not yet
4728 * disabled. Compensates for possible cpu changes by refetching the per cpu area
4729 * pointer.
4730 */
4731static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
4732 unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
4733{
4734 void *p;
4735
4736#ifdef CONFIG_PREEMPT_COUNT
4737 /*
4738 * We may have been preempted and rescheduled on a different
4739 * cpu before disabling preemption. Need to reload cpu area
4740 * pointer.
4741 */
4742 c = slub_get_cpu_ptr(s->cpu_slab);
4743#endif
4744 if (unlikely(!gfpflags_allow_spinning(gfpflags))) {
4745 if (local_lock_is_locked(&s->cpu_slab->lock)) {
4746 /*
4747 * EBUSY is an internal signal to kmalloc_nolock() to
4748 * retry a different bucket. It's not propagated
4749 * to the caller.
4750 */
4751 p = ERR_PTR(error: -EBUSY);
4752 goto out;
4753 }
4754 }
4755 p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
4756out:
4757#ifdef CONFIG_PREEMPT_COUNT
4758 slub_put_cpu_ptr(s->cpu_slab);
4759#endif
4760 return p;
4761}
4762
4763static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
4764 gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
4765{
4766 struct kmem_cache_cpu *c;
4767 struct slab *slab;
4768 unsigned long tid;
4769 void *object;
4770
4771redo:
4772 /*
4773 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
4774 * enabled. We may switch back and forth between cpus while
4775 * reading from one cpu area. That does not matter as long
4776 * as we end up on the original cpu again when doing the cmpxchg.
4777 *
4778 * We must guarantee that tid and kmem_cache_cpu are retrieved on the
4779 * same cpu. We read first the kmem_cache_cpu pointer and use it to read
4780 * the tid. If we are preempted and switched to another cpu between the
4781 * two reads, it's OK as the two are still associated with the same cpu
4782 * and cmpxchg later will validate the cpu.
4783 */
4784 c = raw_cpu_ptr(s->cpu_slab);
4785 tid = READ_ONCE(c->tid);
4786
4787 /*
4788 * Irqless object alloc/free algorithm used here depends on sequence
4789 * of fetching cpu_slab's data. tid should be fetched before anything
4790 * on c to guarantee that object and slab associated with previous tid
4791 * won't be used with current tid. If we fetch tid first, object and
4792 * slab could be one associated with next tid and our alloc/free
4793 * request will fail. In this case we will retry, so there is no problem.
4794 */
4795 barrier();
4796
4797 /*
4798 * The transaction ids are globally unique per cpu and per operation on
4799 * a per cpu queue. Thus they guarantee that the cmpxchg_double
4800 * occurs on the right processor and that there was no operation on the
4801 * linked list in between.
4802 */
4803
4804 object = c->freelist;
4805 slab = c->slab;
4806
4807#ifdef CONFIG_NUMA
4808 if (static_branch_unlikely(&strict_numa) &&
4809 node == NUMA_NO_NODE) {
4810
4811 struct mempolicy *mpol = current->mempolicy;
4812
4813 if (mpol) {
4814 /*
4815 * Special BIND rule support. If existing slab
4816 * is in permitted set then do not redirect
4817 * to a particular node.
4818 * Otherwise we apply the memory policy to get
4819 * the node we need to allocate on.
4820 */
4821 if (mpol->mode != MPOL_BIND || !slab ||
4822 !node_isset(slab_nid(slab), mpol->nodes))
4823
4824 node = mempolicy_slab_node();
4825 }
4826 }
4827#endif
4828
4829 if (!USE_LOCKLESS_FAST_PATH() ||
4830 unlikely(!object || !slab || !node_match(slab, node))) {
4831 object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
4832 } else {
4833 void *next_object = get_freepointer_safe(s, object);
4834
4835 /*
4836 * The cmpxchg will only match if there was no additional
4837 * operation and if we are on the right processor.
4838 *
4839 * The cmpxchg does the following atomically (without lock
4840 * semantics!)
4841 * 1. Relocate first pointer to the current per cpu area.
4842 * 2. Verify that tid and freelist have not been changed
4843 * 3. If they were not changed replace tid and freelist
4844 *
4845 * Since this is without lock semantics the protection is only
4846 * against code executing on this cpu *not* from access by
4847 * other cpus.
4848 */
4849 if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
4850 note_cmpxchg_failure(n: "slab_alloc", s, tid);
4851 goto redo;
4852 }
4853 prefetch_freepointer(s, object: next_object);
4854 stat(s, si: ALLOC_FASTPATH);
4855 }
4856
4857 return object;
4858}
4859#else /* CONFIG_SLUB_TINY */
4860static void *__slab_alloc_node(struct kmem_cache *s,
4861 gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
4862{
4863 struct partial_context pc;
4864 struct slab *slab;
4865 void *object;
4866
4867 pc.flags = gfpflags;
4868 pc.orig_size = orig_size;
4869 slab = get_partial(s, node, &pc);
4870
4871 if (slab)
4872 return pc.object;
4873
4874 slab = new_slab(s, gfpflags, node);
4875 if (unlikely(!slab)) {
4876 slab_out_of_memory(s, gfpflags, node);
4877 return NULL;
4878 }
4879
4880 object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
4881
4882 return object;
4883}
4884#endif /* CONFIG_SLUB_TINY */
4885
4886/*
4887 * If the object has been wiped upon free, make sure it's fully initialized by
4888 * zeroing out the freelist pointer.
4889 *
4890 * Note that we also wipe custom freelist pointers.
4891 */
4892static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
4893 void *obj)
4894{
4895 if (unlikely(slab_want_init_on_free(s)) && obj &&
4896 !freeptr_outside_object(s))
4897 memset(s: (void *)((char *)kasan_reset_tag(addr: obj) + s->offset),
4898 c: 0, n: sizeof(void *));
4899}
4900
4901static __fastpath_inline
4902struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
4903{
4904 flags &= gfp_allowed_mask;
4905
4906 might_alloc(gfp_mask: flags);
4907
4908 if (unlikely(should_failslab(s, flags)))
4909 return NULL;
4910
4911 return s;
4912}
4913
4914static __fastpath_inline
4915bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
4916 gfp_t flags, size_t size, void **p, bool init,
4917 unsigned int orig_size)
4918{
4919 unsigned int zero_size = s->object_size;
4920 bool kasan_init = init;
4921 size_t i;
4922 gfp_t init_flags = flags & gfp_allowed_mask;
4923
4924 /*
4925 * For a kmalloc object, the allocated memory size (object_size) is likely
4926 * larger than the requested size (orig_size). If redzone check is
4927 * enabled for the extra space, don't zero it, as it will be redzoned
4928 * soon. The redzone operation for this extra space could be seen as a
4929 * replacement of current poisoning under certain debug option, and
4930 * won't break other sanity checks.
4931 */
4932 if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) &&
4933 (s->flags & SLAB_KMALLOC))
4934 zero_size = orig_size;
4935
4936 /*
4937 * When slab_debug is enabled, avoid memory initialization integrated
4938 * into KASAN and instead zero out the memory via the memset below with
4939 * the proper size. Otherwise, KASAN might overwrite SLUB redzones and
4940 * cause false-positive reports. This does not lead to a performance
4941 * penalty on production builds, as slab_debug is not intended to be
4942 * enabled there.
4943 */
4944 if (__slub_debug_enabled())
4945 kasan_init = false;
4946
4947 /*
4948 * As memory initialization might be integrated into KASAN,
4949 * kasan_slab_alloc and initialization memset must be
4950 * kept together to avoid discrepancies in behavior.
4951 *
4952 * As p[i] might get tagged, memset and kmemleak hook come after KASAN.
4953 */
4954 for (i = 0; i < size; i++) {
4955 p[i] = kasan_slab_alloc(s, object: p[i], flags: init_flags, init: kasan_init);
4956 if (p[i] && init && (!kasan_init ||
4957 !kasan_has_integrated_init()))
4958 memset(s: p[i], c: 0, n: zero_size);
4959 if (gfpflags_allow_spinning(gfp_flags: flags))
4960 kmemleak_alloc_recursive(ptr: p[i], size: s->object_size, min_count: 1,
4961 flags: s->flags, gfp: init_flags);
4962 kmsan_slab_alloc(s, object: p[i], flags: init_flags);
4963 alloc_tagging_slab_alloc_hook(s, object: p[i], flags);
4964 }
4965
4966 return memcg_slab_post_alloc_hook(s, lru, flags, size, p);
4967}
4968
4969/*
4970 * Replace the empty main sheaf with a (at least partially) full sheaf.
4971 *
4972 * Must be called with the cpu_sheaves local lock locked. If successful, returns
4973 * the pcs pointer with the local lock locked (possibly on a different cpu than
4974 * the one it was initially called on). If not successful, returns NULL with the
4975 * local lock unlocked.
4976 */
4977static struct slub_percpu_sheaves *
4978__pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, gfp_t gfp)
4979{
4980 struct slab_sheaf *empty = NULL;
4981 struct slab_sheaf *full;
4982 struct node_barn *barn;
4983 bool can_alloc;
4984
4985 lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
4986
4987 if (pcs->spare && pcs->spare->size > 0) {
4988 swap(pcs->main, pcs->spare);
4989 return pcs;
4990 }
4991
4992 barn = get_barn(s);
4993 if (!barn) {
4994 local_unlock(&s->cpu_sheaves->lock);
4995 return NULL;
4996 }
4997
4998 full = barn_replace_empty_sheaf(barn, empty: pcs->main);
4999
5000 if (full) {
5001 stat(s, si: BARN_GET);
5002 pcs->main = full;
5003 return pcs;
5004 }
5005
5006 stat(s, si: BARN_GET_FAIL);
5007
5008 can_alloc = gfpflags_allow_blocking(gfp_flags: gfp);
5009
5010 if (can_alloc) {
5011 if (pcs->spare) {
5012 empty = pcs->spare;
5013 pcs->spare = NULL;
5014 } else {
5015 empty = barn_get_empty_sheaf(barn);
5016 }
5017 }
5018
5019 local_unlock(&s->cpu_sheaves->lock);
5020
5021 if (!can_alloc)
5022 return NULL;
5023
5024 if (empty) {
5025 if (!refill_sheaf(s, sheaf: empty, gfp)) {
5026 full = empty;
5027 } else {
5028 /*
5029 * we must be very low on memory so don't bother
5030 * with the barn
5031 */
5032 free_empty_sheaf(s, sheaf: empty);
5033 }
5034 } else {
5035 full = alloc_full_sheaf(s, gfp);
5036 }
5037
5038 if (!full)
5039 return NULL;
5040
5041 /*
5042	 * We can only reach here when gfpflags_allow_blocking() is true,
5043	 * so this must not be irq context.
5044 */
5045 local_lock(&s->cpu_sheaves->lock);
5046 pcs = this_cpu_ptr(s->cpu_sheaves);
5047
5048 /*
5049	 * If we are returning an empty sheaf, we either got it from the
5050 * barn or had to allocate one. If we are returning a full
5051 * sheaf, it's due to racing or being migrated to a different
5052 * cpu. Breaching the barn's sheaf limits should be thus rare
5053 * enough so just ignore them to simplify the recovery.
5054 */
5055
5056 if (pcs->main->size == 0) {
5057 barn_put_empty_sheaf(barn, sheaf: pcs->main);
5058 pcs->main = full;
5059 return pcs;
5060 }
5061
5062 if (!pcs->spare) {
5063 pcs->spare = full;
5064 return pcs;
5065 }
5066
5067 if (pcs->spare->size == 0) {
5068 barn_put_empty_sheaf(barn, sheaf: pcs->spare);
5069 pcs->spare = full;
5070 return pcs;
5071 }
5072
5073 barn_put_full_sheaf(barn, sheaf: full);
5074 stat(s, si: BARN_PUT);
5075
5076 return pcs;
5077}
5078
5079static __fastpath_inline
5080void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node)
5081{
5082 struct slub_percpu_sheaves *pcs;
5083 bool node_requested;
5084 void *object;
5085
5086#ifdef CONFIG_NUMA
5087 if (static_branch_unlikely(&strict_numa) &&
5088 node == NUMA_NO_NODE) {
5089
5090 struct mempolicy *mpol = current->mempolicy;
5091
5092 if (mpol) {
5093 /*
5094 * Special BIND rule support. If the local node
5095 * is in permitted set then do not redirect
5096 * to a particular node.
5097 * Otherwise we apply the memory policy to get
5098 * the node we need to allocate on.
5099 */
5100 if (mpol->mode != MPOL_BIND ||
5101 !node_isset(numa_mem_id(), mpol->nodes))
5102
5103 node = mempolicy_slab_node();
5104 }
5105 }
5106#endif
5107
5108 node_requested = IS_ENABLED(CONFIG_NUMA) && node != NUMA_NO_NODE;
5109
5110 /*
5111 * We assume the percpu sheaves contain only local objects although it's
5112 * not completely guaranteed, so we verify later.
5113 */
5114 if (unlikely(node_requested && node != numa_mem_id()))
5115 return NULL;
5116
5117 if (!local_trylock(&s->cpu_sheaves->lock))
5118 return NULL;
5119
5120 pcs = this_cpu_ptr(s->cpu_sheaves);
5121
5122 if (unlikely(pcs->main->size == 0)) {
5123 pcs = __pcs_replace_empty_main(s, pcs, gfp);
5124 if (unlikely(!pcs))
5125 return NULL;
5126 }
5127
5128 object = pcs->main->objects[pcs->main->size - 1];
5129
5130 if (unlikely(node_requested)) {
5131 /*
5132 * Verify that the object was from the node we want. This could
5133 * be false because of cpu migration during an unlocked part of
5134 * the current allocation or previous freeing process.
5135 */
5136 if (folio_nid(folio: virt_to_folio(x: object)) != node) {
5137 local_unlock(&s->cpu_sheaves->lock);
5138 return NULL;
5139 }
5140 }
5141
5142 pcs->main->size--;
5143
5144 local_unlock(&s->cpu_sheaves->lock);
5145
5146 stat(s, si: ALLOC_PCS);
5147
5148 return object;
5149}
5150
5151static __fastpath_inline
5152unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
5153{
5154 struct slub_percpu_sheaves *pcs;
5155 struct slab_sheaf *main;
5156 unsigned int allocated = 0;
5157 unsigned int batch;
5158
5159next_batch:
5160 if (!local_trylock(&s->cpu_sheaves->lock))
5161 return allocated;
5162
5163 pcs = this_cpu_ptr(s->cpu_sheaves);
5164
5165 if (unlikely(pcs->main->size == 0)) {
5166
5167 struct slab_sheaf *full;
5168 struct node_barn *barn;
5169
5170 if (pcs->spare && pcs->spare->size > 0) {
5171 swap(pcs->main, pcs->spare);
5172 goto do_alloc;
5173 }
5174
5175 barn = get_barn(s);
5176 if (!barn) {
5177 local_unlock(&s->cpu_sheaves->lock);
5178 return allocated;
5179 }
5180
5181 full = barn_replace_empty_sheaf(barn, empty: pcs->main);
5182
5183 if (full) {
5184 stat(s, si: BARN_GET);
5185 pcs->main = full;
5186 goto do_alloc;
5187 }
5188
5189 stat(s, si: BARN_GET_FAIL);
5190
5191 local_unlock(&s->cpu_sheaves->lock);
5192
5193 /*
5194	 * Once full sheaves in the barn are depleted, let the bulk
5195 * allocation continue from slab pages, otherwise we would just
5196 * be copying arrays of pointers twice.
5197 */
5198 return allocated;
5199 }
5200
5201do_alloc:
5202
5203 main = pcs->main;
5204 batch = min(size, main->size);
5205
5206 main->size -= batch;
5207 memcpy(to: p, from: main->objects + main->size, len: batch * sizeof(void *));
5208
5209 local_unlock(&s->cpu_sheaves->lock);
5210
5211 stat_add(s, si: ALLOC_PCS, v: batch);
5212
5213 allocated += batch;
5214
5215 if (batch < size) {
5216 p += batch;
5217 size -= batch;
5218 goto next_batch;
5219 }
5220
5221 return allocated;
5222}
5223
5225/*
5226 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
5227 * have the fastpath folded into their functions. So no function call
5228 * overhead for requests that can be satisfied on the fastpath.
5229 *
5230 * The fastpath works by first checking if the lockless freelist can be used.
5231 * If not then __slab_alloc is called for slow processing.
5232 *
5233 * Otherwise we can simply pick the next object from the lockless free list.
5234 */
5235static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru,
5236 gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
5237{
5238 void *object;
5239 bool init = false;
5240
5241 s = slab_pre_alloc_hook(s, flags: gfpflags);
5242 if (unlikely(!s))
5243 return NULL;
5244
5245 object = kfence_alloc(s, size: orig_size, flags: gfpflags);
5246 if (unlikely(object))
5247 goto out;
5248
5249 if (s->cpu_sheaves)
5250 object = alloc_from_pcs(s, gfp: gfpflags, node);
5251
5252 if (!object)
5253 object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
5254
5255 maybe_wipe_obj_freeptr(s, obj: object);
5256 init = slab_want_init_on_alloc(flags: gfpflags, c: s);
5257
5258out:
5259 /*
5260	 * When init equals 'true', like for the kzalloc() family, only
5261	 * @orig_size bytes might be zeroed instead of s->object_size.
5262	 * In case this fails due to memcg_slab_post_alloc_hook(),
5263	 * object is set to NULL.
5264 */
5265 slab_post_alloc_hook(s, lru, flags: gfpflags, size: 1, p: &object, init, orig_size);
5266
5267 return object;
5268}
5269
5270void *kmem_cache_alloc_noprof(struct kmem_cache *s, gfp_t gfpflags)
5271{
5272 void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_,
5273 orig_size: s->object_size);
5274
5275 trace_kmem_cache_alloc(_RET_IP_, ptr: ret, s, gfp_flags: gfpflags, NUMA_NO_NODE);
5276
5277 return ret;
5278}
5279EXPORT_SYMBOL(kmem_cache_alloc_noprof);
5280
5281void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru,
5282 gfp_t gfpflags)
5283{
5284 void *ret = slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, _RET_IP_,
5285 orig_size: s->object_size);
5286
5287 trace_kmem_cache_alloc(_RET_IP_, ptr: ret, s, gfp_flags: gfpflags, NUMA_NO_NODE);
5288
5289 return ret;
5290}
5291EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof);
5292
5293bool kmem_cache_charge(void *objp, gfp_t gfpflags)
5294{
5295 if (!memcg_kmem_online())
5296 return true;
5297
5298 return memcg_slab_post_charge(p: objp, flags: gfpflags);
5299}
5300EXPORT_SYMBOL(kmem_cache_charge);
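
/*
 * Illustrative sketch (not part of the allocator): kmem_cache_charge() lets a
 * caller that allocated without __GFP_ACCOUNT charge the object to the current
 * memcg afterwards. 'my_cache' is a hypothetical cache:
 *
 *	void *obj = kmem_cache_alloc(my_cache, GFP_KERNEL);
 *
 *	if (obj && !kmem_cache_charge(obj, GFP_KERNEL)) {
 *		kmem_cache_free(my_cache, obj);
 *		obj = NULL;
 *	}
 */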
5301
5302/**
5303 * kmem_cache_alloc_node - Allocate an object on the specified node
5304 * @s: The cache to allocate from.
5305 * @gfpflags: See kmalloc().
5306 * @node: node number of the target node.
5307 *
5308 * Identical to kmem_cache_alloc but it will allocate memory on the given
5309 * node, which can improve the performance for cpu bound structures.
5310 *
5311 * Fallback to other node is possible if __GFP_THISNODE is not set.
5312 *
5313 * Return: pointer to the new object or %NULL in case of error
5314 */
5315void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int node)
5316{
5317 void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, orig_size: s->object_size);
5318
5319 trace_kmem_cache_alloc(_RET_IP_, ptr: ret, s, gfp_flags: gfpflags, node);
5320
5321 return ret;
5322}
5323EXPORT_SYMBOL(kmem_cache_alloc_node_noprof);
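
/*
 * Illustrative sketch (not part of the allocator): a caller keeping one
 * control structure per node would typically do the following, where
 * 'foo_cache' and 'struct foo' are hypothetical and kmem_cache_alloc_node()
 * is assumed to be the usual alloc_hooks() wrapper around the _noprof
 * variant above:
 *
 *	struct foo *f = kmem_cache_alloc_node(foo_cache, GFP_KERNEL, nid);
 *
 *	if (!f)
 *		return -ENOMEM;
 *	...
 *	kmem_cache_free(foo_cache, f);
 */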
5324
5325/*
5326 * Returns a sheaf that holds at least the requested number of objects;
5327 * when prefilling is needed, it is done with the given gfp flags.
5328 *
5329 * return NULL if sheaf allocation or prefilling failed
5330 */
5331struct slab_sheaf *
5332kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size)
5333{
5334 struct slub_percpu_sheaves *pcs;
5335 struct slab_sheaf *sheaf = NULL;
5336 struct node_barn *barn;
5337
5338 if (unlikely(size > s->sheaf_capacity)) {
5339
5340 /*
5341 * slab_debug disables cpu sheaves intentionally so all
5342 * prefilled sheaves become "oversize" and we give up on
5343 * performance for the debugging. Same with SLUB_TINY.
5344 * Creating a cache without sheaves and then requesting a
5345 * prefilled sheaf is however not expected, so warn.
5346 */
5347 WARN_ON_ONCE(s->sheaf_capacity == 0 &&
5348 !IS_ENABLED(CONFIG_SLUB_TINY) &&
5349 !(s->flags & SLAB_DEBUG_FLAGS));
5350
5351 sheaf = kzalloc(struct_size(sheaf, objects, size), gfp);
5352 if (!sheaf)
5353 return NULL;
5354
5355 stat(s, si: SHEAF_PREFILL_OVERSIZE);
5356 sheaf->cache = s;
5357 sheaf->capacity = size;
5358
5359 if (!__kmem_cache_alloc_bulk(s, flags: gfp, size,
5360 p: &sheaf->objects[0])) {
5361 kfree(objp: sheaf);
5362 return NULL;
5363 }
5364
5365 sheaf->size = size;
5366
5367 return sheaf;
5368 }
5369
5370 local_lock(&s->cpu_sheaves->lock);
5371 pcs = this_cpu_ptr(s->cpu_sheaves);
5372
5373 if (pcs->spare) {
5374 sheaf = pcs->spare;
5375 pcs->spare = NULL;
5376 stat(s, si: SHEAF_PREFILL_FAST);
5377 } else {
5378 barn = get_barn(s);
5379
5380 stat(s, si: SHEAF_PREFILL_SLOW);
5381 if (barn)
5382 sheaf = barn_get_full_or_empty_sheaf(barn);
5383 if (sheaf && sheaf->size)
5384 stat(s, si: BARN_GET);
5385 else
5386 stat(s, si: BARN_GET_FAIL);
5387 }
5388
5389 local_unlock(&s->cpu_sheaves->lock);
5390
5392 if (!sheaf)
5393 sheaf = alloc_empty_sheaf(s, gfp);
5394
5395 if (sheaf && sheaf->size < size) {
5396 if (refill_sheaf(s, sheaf, gfp)) {
5397 sheaf_flush_unused(s, sheaf);
5398 free_empty_sheaf(s, sheaf);
5399 sheaf = NULL;
5400 }
5401 }
5402
5403 if (sheaf)
5404 sheaf->capacity = s->sheaf_capacity;
5405
5406 return sheaf;
5407}
5408
5409/*
5410 * Use this to return a sheaf obtained by kmem_cache_prefill_sheaf()
5411 *
5412 * If the sheaf cannot simply become the percpu spare sheaf, but there's space
5413 * for a full sheaf in the barn, we try to refill the sheaf back to the cache's
5414 * sheaf_capacity to avoid handling partially full sheaves.
5415 *
5416 * If the refill fails because gfp is e.g. GFP_NOWAIT, or the barn is full, the
5417 * sheaf is instead flushed and freed.
5418 */
5419void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
5420 struct slab_sheaf *sheaf)
5421{
5422 struct slub_percpu_sheaves *pcs;
5423 struct node_barn *barn;
5424
5425 if (unlikely(sheaf->capacity != s->sheaf_capacity)) {
5426 sheaf_flush_unused(s, sheaf);
5427 kfree(objp: sheaf);
5428 return;
5429 }
5430
5431 local_lock(&s->cpu_sheaves->lock);
5432 pcs = this_cpu_ptr(s->cpu_sheaves);
5433 barn = get_barn(s);
5434
5435 if (!pcs->spare) {
5436 pcs->spare = sheaf;
5437 sheaf = NULL;
5438 stat(s, si: SHEAF_RETURN_FAST);
5439 }
5440
5441 local_unlock(&s->cpu_sheaves->lock);
5442
5443 if (!sheaf)
5444 return;
5445
5446 stat(s, si: SHEAF_RETURN_SLOW);
5447
5448 /*
5449 * If the barn has too many full sheaves or we fail to refill the sheaf,
5450 * simply flush and free it.
5451 */
5452 if (!barn || data_race(barn->nr_full) >= MAX_FULL_SHEAVES ||
5453 refill_sheaf(s, sheaf, gfp)) {
5454 sheaf_flush_unused(s, sheaf);
5455 free_empty_sheaf(s, sheaf);
5456 return;
5457 }
5458
5459 barn_put_full_sheaf(barn, sheaf);
5460 stat(s, si: BARN_PUT);
5461}
5462
5463/*
5464 * refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least
5465 * the given size
5466 *
5467 * the sheaf might be replaced by a new one when requesting more than
5468 * s->sheaf_capacity objects if such replacement is necessary; if the refill
5469 * fails (returning -ENOMEM), the existing sheaf is left intact
5470 *
5471 * In practice we always refill to full sheaf's capacity.
5472 */
5473int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
5474 struct slab_sheaf **sheafp, unsigned int size)
5475{
5476 struct slab_sheaf *sheaf;
5477
5478 /*
5479 * TODO: do we want to support *sheaf == NULL to be equivalent of
5480 * kmem_cache_prefill_sheaf() ?
5481 */
5482 if (!sheafp || !(*sheafp))
5483 return -EINVAL;
5484
5485 sheaf = *sheafp;
5486 if (sheaf->size >= size)
5487 return 0;
5488
5489 if (likely(sheaf->capacity >= size)) {
5490 if (likely(sheaf->capacity == s->sheaf_capacity))
5491 return refill_sheaf(s, sheaf, gfp);
5492
5493 if (!__kmem_cache_alloc_bulk(s, flags: gfp, size: sheaf->capacity - sheaf->size,
5494 p: &sheaf->objects[sheaf->size])) {
5495 return -ENOMEM;
5496 }
5497 sheaf->size = sheaf->capacity;
5498
5499 return 0;
5500 }
5501
5502 /*
5503 * We had a regular sized sheaf and need an oversize one, or we had an
5504 * oversize one already but need a larger one now.
5505 * This should be a very rare path so let's not complicate it.
5506 */
5507 sheaf = kmem_cache_prefill_sheaf(s, gfp, size);
5508 if (!sheaf)
5509 return -ENOMEM;
5510
5511 kmem_cache_return_sheaf(s, gfp, sheaf: *sheafp);
5512 *sheafp = sheaf;
5513 return 0;
5514}
5515
5516/*
5517 * Allocate from a sheaf obtained by kmem_cache_prefill_sheaf()
5518 *
5519 * Guaranteed not to fail for as many allocations as the requested size.
5520 * After the sheaf is emptied, it fails - no fallback to the slab cache itself.
5521 *
5522 * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT;
5523 * memcg charging is forced over the limit if necessary, to avoid failure.
5524 */
5525void *
5526kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp,
5527 struct slab_sheaf *sheaf)
5528{
5529 void *ret = NULL;
5530 bool init;
5531
5532 if (sheaf->size == 0)
5533 goto out;
5534
5535 ret = sheaf->objects[--sheaf->size];
5536
5537 init = slab_want_init_on_alloc(flags: gfp, c: s);
5538
5539 /* add __GFP_NOFAIL to force successful memcg charging */
5540 slab_post_alloc_hook(s, NULL, flags: gfp | __GFP_NOFAIL, size: 1, p: &ret, init, orig_size: s->object_size);
5541out:
5542 trace_kmem_cache_alloc(_RET_IP_, ptr: ret, s, gfp_flags: gfp, NUMA_NO_NODE);
5543
5544 return ret;
5545}
5546
5547unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf)
5548{
5549 return sheaf->size;
5550}
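
/*
 * Illustrative sketch (not part of the allocator): a typical
 * prefill/alloc/return cycle with the sheaf API above. 'my_cache' and the
 * prefill count are hypothetical, and the names without the _noprof suffix
 * are assumed to be the usual alloc_hooks() wrappers:
 *
 *	struct slab_sheaf *sheaf;
 *	void *obj;
 *
 *	sheaf = kmem_cache_prefill_sheaf(my_cache, GFP_KERNEL, 16);
 *	if (!sheaf)
 *		return -ENOMEM;
 *
 *	obj = kmem_cache_alloc_from_sheaf(my_cache, GFP_KERNEL, sheaf);
 *	(up to 16 such allocations are guaranteed not to fail)
 *	...
 *	kmem_cache_return_sheaf(my_cache, GFP_KERNEL, sheaf);
 *
 * kmem_cache_refill_sheaf() can be used in between to top the sheaf up again.
 */
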
5551/*
5552 * To avoid unnecessary overhead, we pass through large allocation requests
5553 * directly to the page allocator. We use __GFP_COMP, because we will need to
5554 * know the allocation order to free the pages properly in kfree.
5555 */
5556static void *___kmalloc_large_node(size_t size, gfp_t flags, int node)
5557{
5558 struct folio *folio;
5559 void *ptr = NULL;
5560 unsigned int order = get_order(size);
5561
5562 if (unlikely(flags & GFP_SLAB_BUG_MASK))
5563 flags = kmalloc_fix_flags(flags);
5564
5565 flags |= __GFP_COMP;
5566
5567 if (node == NUMA_NO_NODE)
5568 folio = (struct folio *)alloc_frozen_pages_noprof(flags, order);
5569 else
5570 folio = (struct folio *)__alloc_frozen_pages_noprof(flags, order, nid: node, NULL);
5571
5572 if (folio) {
5573 ptr = folio_address(folio);
5574 lruvec_stat_mod_folio(folio, idx: NR_SLAB_UNRECLAIMABLE_B,
5575 PAGE_SIZE << order);
5576 __folio_set_large_kmalloc(folio);
5577 }
5578
5579 ptr = kasan_kmalloc_large(ptr, size, flags);
5580 /* As ptr might get tagged, call kmemleak hook after KASAN. */
5581 kmemleak_alloc(ptr, size, min_count: 1, gfp: flags);
5582 kmsan_kmalloc_large(ptr, size, flags);
5583
5584 return ptr;
5585}
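
/*
 * For instance (assuming 4K pages), kmalloc(1 << 20, GFP_KERNEL) exceeds
 * KMALLOC_MAX_CACHE_SIZE and lands here with order = get_order(1 << 20) = 8,
 * i.e. a 256-page __GFP_COMP allocation that kfree() later returns via the
 * large-kmalloc folio path.
 */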
5586
5587void *__kmalloc_large_noprof(size_t size, gfp_t flags)
5588{
5589 void *ret = ___kmalloc_large_node(size, flags, NUMA_NO_NODE);
5590
5591 trace_kmalloc(_RET_IP_, ptr: ret, bytes_req: size, PAGE_SIZE << get_order(size),
5592 gfp_flags: flags, NUMA_NO_NODE);
5593 return ret;
5594}
5595EXPORT_SYMBOL(__kmalloc_large_noprof);
5596
5597void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node)
5598{
5599 void *ret = ___kmalloc_large_node(size, flags, node);
5600
5601 trace_kmalloc(_RET_IP_, ptr: ret, bytes_req: size, PAGE_SIZE << get_order(size),
5602 gfp_flags: flags, node);
5603 return ret;
5604}
5605EXPORT_SYMBOL(__kmalloc_large_node_noprof);
5606
5607static __always_inline
5608void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node,
5609 unsigned long caller)
5610{
5611 struct kmem_cache *s;
5612 void *ret;
5613
5614 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
5615 ret = __kmalloc_large_node_noprof(size, flags, node);
5616 trace_kmalloc(call_site: caller, ptr: ret, bytes_req: size,
5617 PAGE_SIZE << get_order(size), gfp_flags: flags, node);
5618 return ret;
5619 }
5620
5621 if (unlikely(!size))
5622 return ZERO_SIZE_PTR;
5623
5624 s = kmalloc_slab(size, b, flags, caller);
5625
5626 ret = slab_alloc_node(s, NULL, gfpflags: flags, node, addr: caller, orig_size: size);
5627 ret = kasan_kmalloc(s, object: ret, size, flags);
5628 trace_kmalloc(call_site: caller, ptr: ret, bytes_req: size, bytes_alloc: s->size, gfp_flags: flags, node);
5629 return ret;
5630}
5631void *__kmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
5632{
5633 return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, _RET_IP_);
5634}
5635EXPORT_SYMBOL(__kmalloc_node_noprof);
5636
5637void *__kmalloc_noprof(size_t size, gfp_t flags)
5638{
5639 return __do_kmalloc_node(size, NULL, flags, NUMA_NO_NODE, _RET_IP_);
5640}
5641EXPORT_SYMBOL(__kmalloc_noprof);
5642
5643/**
5644 * kmalloc_nolock - Allocate an object of given size from any context.
5645 * @size: size to allocate
5646 * @gfp_flags: GFP flags. Only __GFP_ACCOUNT, __GFP_ZERO, __GFP_NO_OBJ_EXT
5647 * allowed.
5648 * @node: node number of the target node.
5649 *
5650 * Return: pointer to the new object or NULL in case of error.
5651 * NULL does not mean EBUSY or EAGAIN. It means ENOMEM.
5652 * There is no reason to call it again and expect !NULL.
5653 */
5654void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
5655{
5656 gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags;
5657 struct kmem_cache *s;
5658 bool can_retry = true;
5659 void *ret = ERR_PTR(error: -EBUSY);
5660
5661 VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO |
5662 __GFP_NO_OBJ_EXT));
5663
5664 if (unlikely(!size))
5665 return ZERO_SIZE_PTR;
5666
5667 if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
5668 /* kmalloc_nolock() in PREEMPT_RT is not supported from irq */
5669 return NULL;
5670retry:
5671 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
5672 return NULL;
5673 s = kmalloc_slab(size, NULL, flags: alloc_gfp, _RET_IP_);
5674
5675 if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
5676 /*
5677 * kmalloc_nolock() is not supported on architectures that
5678 * don't implement cmpxchg16b, but debug caches don't use
5679 * per-cpu slab and per-cpu partial slabs. They rely on
5680 * kmem_cache_node->list_lock, so kmalloc_nolock() can
5681 * attempt to allocate from debug caches by
5682 * spin_trylock_irqsave(&n->list_lock, ...)
5683 */
5684 return NULL;
5685
5686 /*
5687 * Do not call slab_alloc_node(), since trylock mode isn't
5688 * compatible with slab_pre_alloc_hook/should_failslab and
5689 * kfence_alloc. Hence call __slab_alloc_node() (at most twice)
5690 * and slab_post_alloc_hook() directly.
5691 *
5692 * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair
5693 * in irq saved region. It assumes that the same cpu will not
5694 * __update_cpu_freelist_fast() into the same (freelist,tid) pair.
5695 * Therefore use in_nmi() to check whether particular bucket is in
5696 * irq protected section.
5697 *
5698 * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that
5699 * this cpu was interrupted somewhere inside ___slab_alloc() after
5700 * it did local_lock_irqsave(&s->cpu_slab->lock, flags).
5701 * In this case fast path with __update_cpu_freelist_fast() is not safe.
5702 */
5703#ifndef CONFIG_SLUB_TINY
5704 if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock))
5705#endif
5706		ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
5707
5708	if (PTR_ERR(ret) == -EBUSY) {
5709 if (can_retry) {
5710 /* pick the next kmalloc bucket */
5711 size = s->object_size + 1;
5712 /*
5713 * Another alternative is to
5714 * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT;
5715 * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT;
5716 * to retry from bucket of the same size.
5717 */
5718 can_retry = false;
5719 goto retry;
5720 }
5721 ret = NULL;
5722 }
5723
5724	maybe_wipe_obj_freeptr(s, ret);
5725	slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret,
5726			     slab_want_init_on_alloc(alloc_gfp, s), size);
5727
5728	ret = kasan_kmalloc(s, ret, size, alloc_gfp);
5729 return ret;
5730}
5731EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof);
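/*
 * Illustrative usage (not from the original source; 'struct my_event' and its
 * field are hypothetical): a caller running in NMI context or under a
 * raw_spinlock_t pairs kmalloc_nolock() with kfree_nolock() and treats NULL
 * as -ENOMEM rather than retrying:
 *
 *	struct my_event *e = kmalloc_nolock(sizeof(*e), __GFP_ZERO, NUMA_NO_NODE);
 *
 *	if (!e)
 *		return -ENOMEM;
 *	e->ts = ktime_get_mono_fast_ns();
 *	...
 *	kfree_nolock(e);
 */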
5732
5733void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags,
5734 int node, unsigned long caller)
5735{
5736 return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, caller);
5737
5738}
5739EXPORT_SYMBOL(__kmalloc_node_track_caller_noprof);
5740
5741void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size)
5742{
5743	void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE,
5744				    _RET_IP_, size);
5745
5746	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE);
5747
5748	ret = kasan_kmalloc(s, ret, size, gfpflags);
5749 return ret;
5750}
5751EXPORT_SYMBOL(__kmalloc_cache_noprof);
5752
5753void *__kmalloc_cache_node_noprof(struct kmem_cache *s, gfp_t gfpflags,
5754 int node, size_t size)
5755{
5756	void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size);
5757
5758	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node);
5759
5760	ret = kasan_kmalloc(s, ret, size, gfpflags);
5761 return ret;
5762}
5763EXPORT_SYMBOL(__kmalloc_cache_node_noprof);
5764
5765static noinline void free_to_partial_list(
5766 struct kmem_cache *s, struct slab *slab,
5767 void *head, void *tail, int bulk_cnt,
5768 unsigned long addr)
5769{
5770	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
5771 struct slab *slab_free = NULL;
5772 int cnt = bulk_cnt;
5773 unsigned long flags;
5774 depot_stack_handle_t handle = 0;
5775
5776 /*
5777 * We cannot use GFP_NOWAIT as there are callsites where waking up
5778 * kswapd could deadlock
5779 */
5780 if (s->flags & SLAB_STORE_USER)
5781 handle = set_track_prepare(__GFP_NOWARN);
5782
5783 spin_lock_irqsave(&n->list_lock, flags);
5784
5785	if (free_debug_processing(s, slab, head, tail, &cnt, addr, handle)) {
5786 void *prior = slab->freelist;
5787
5788 /* Perform the actual freeing while we still hold the locks */
5789 slab->inuse -= cnt;
5790		set_freepointer(s, tail, prior);
5791 slab->freelist = head;
5792
5793 /*
5794 * If the slab is empty, and node's partial list is full,
5795		 * it should be discarded anyway, no matter whether it's on
5796		 * the full or partial list.
5797 */
5798 if (slab->inuse == 0 && n->nr_partial >= s->min_partial)
5799 slab_free = slab;
5800
5801 if (!prior) {
5802 /* was on full list */
5803 remove_full(s, n, slab);
5804 if (!slab_free) {
5805				add_partial(n, slab, DEACTIVATE_TO_TAIL);
5806				stat(s, FREE_ADD_PARTIAL);
5807 }
5808 } else if (slab_free) {
5809 remove_partial(n, slab);
5810			stat(s, FREE_REMOVE_PARTIAL);
5811 }
5812 }
5813
5814 if (slab_free) {
5815 /*
5816 * Update the counters while still holding n->list_lock to
5817 * prevent spurious validation warnings
5818 */
5819		dec_slabs_node(s, slab_nid(slab_free), slab_free->objects);
5820 }
5821
5822	spin_unlock_irqrestore(&n->list_lock, flags);
5823
5824 if (slab_free) {
5825		stat(s, FREE_SLAB);
5826		free_slab(s, slab_free);
5827 }
5828}
5829
5830/*
5831 * Slow path handling. This may still be called frequently since objects
5832 * have a longer lifetime than the cpu slabs in most processing loads.
5833 *
5834 * So we still attempt to reduce cache line usage. Just take the slab
5835 * lock and free the item. If there is no additional partial slab
5836 * handling required then we can return immediately.
5837 */
5838static void __slab_free(struct kmem_cache *s, struct slab *slab,
5839 void *head, void *tail, int cnt,
5840 unsigned long addr)
5841
5842{
5843 void *prior;
5844 int was_frozen;
5845 struct slab new;
5846 unsigned long counters;
5847 struct kmem_cache_node *n = NULL;
5848 unsigned long flags;
5849 bool on_node_partial;
5850
5851	stat(s, FREE_SLOWPATH);
5852
5853 if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
5854		free_to_partial_list(s, slab, head, tail, cnt, addr);
5855 return;
5856 }
5857
5858 do {
5859 if (unlikely(n)) {
5860			spin_unlock_irqrestore(&n->list_lock, flags);
5861 n = NULL;
5862 }
5863 prior = slab->freelist;
5864 counters = slab->counters;
5865		set_freepointer(s, tail, prior);
5866 new.counters = counters;
5867 was_frozen = new.frozen;
5868 new.inuse -= cnt;
5869 if ((!new.inuse || !prior) && !was_frozen) {
5870 /* Needs to be taken off a list */
5871 if (!kmem_cache_has_cpu_partial(s) || prior) {
5872
5873				n = get_node(s, slab_nid(slab));
5874 /*
5875 * Speculatively acquire the list_lock.
5876 * If the cmpxchg does not succeed then we may
5877 * drop the list_lock without any processing.
5878 *
5879 * Otherwise the list_lock will synchronize with
5880 * other processors updating the list of slabs.
5881 */
5882 spin_lock_irqsave(&n->list_lock, flags);
5883
5884 on_node_partial = slab_test_node_partial(slab);
5885 }
5886 }
5887
5888	} while (!slab_update_freelist(s, slab,
5889		prior, counters,
5890		head, new.counters,
5891		"__slab_free"));
5892
5893 if (likely(!n)) {
5894
5895 if (likely(was_frozen)) {
5896 /*
5897 * The list lock was not taken therefore no list
5898 * activity can be necessary.
5899 */
5900			stat(s, FREE_FROZEN);
5901 } else if (kmem_cache_has_cpu_partial(s) && !prior) {
5902 /*
5903 * If we started with a full slab then put it onto the
5904 * per cpu partial list.
5905 */
5906			put_cpu_partial(s, slab, 1);
5907			stat(s, CPU_PARTIAL_FREE);
5908 }
5909
5910 return;
5911 }
5912
5913 /*
5914 * This slab was partially empty but not on the per-node partial list,
5915 * in which case we shouldn't manipulate its list, just return.
5916 */
5917 if (prior && !on_node_partial) {
5918		spin_unlock_irqrestore(&n->list_lock, flags);
5919 return;
5920 }
5921
5922 if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
5923 goto slab_empty;
5924
5925 /*
5926 * Objects left in the slab. If it was not on the partial list before
5927 * then add it.
5928 */
5929 if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
5930		add_partial(n, slab, DEACTIVATE_TO_TAIL);
5931		stat(s, FREE_ADD_PARTIAL);
5932 }
5933	spin_unlock_irqrestore(&n->list_lock, flags);
5934 return;
5935
5936slab_empty:
5937 if (prior) {
5938 /*
5939 * Slab on the partial list.
5940 */
5941 remove_partial(n, slab);
5942		stat(s, FREE_REMOVE_PARTIAL);
5943 }
5944
5945	spin_unlock_irqrestore(&n->list_lock, flags);
5946	stat(s, FREE_SLAB);
5947 discard_slab(s, slab);
5948}
5949
5950/*
5951 * pcs is locked. We should have gotten rid of the spare sheaf and obtained an
5952 * empty sheaf, while the main sheaf is full. We want to install the empty sheaf
5953 * as a main sheaf, and make the current main sheaf a spare sheaf.
5954 *
5955 * However due to having relinquished the cpu_sheaves lock when obtaining
5956 * the empty sheaf, we need to handle some unlikely but possible cases.
5957 *
5958 * If we put any sheaf to barn here, it's because we were interrupted or have
5959 * been migrated to a different cpu, which should be rare enough so just ignore
5960 * the barn's limits to simplify the handling.
5961 *
5962 * An alternative scenario that gets us here is when we fail
5963 * barn_replace_full_sheaf(), because there's no empty sheaf available in the
5964 * barn, so we had to allocate it by alloc_empty_sheaf(). But because we saw the
5965 * limit on full sheaves was not exceeded, we assume it didn't change and just
5966 * put the full sheaf there.
5967 */
5968static void __pcs_install_empty_sheaf(struct kmem_cache *s,
5969 struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty,
5970 struct node_barn *barn)
5971{
5972 lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
5973
5974 /* This is what we expect to find if nobody interrupted us. */
5975 if (likely(!pcs->spare)) {
5976 pcs->spare = pcs->main;
5977 pcs->main = empty;
5978 return;
5979 }
5980
5981 /*
5982 * Unlikely because if the main sheaf had space, we would have just
5983 * freed to it. Get rid of our empty sheaf.
5984 */
5985 if (pcs->main->size < s->sheaf_capacity) {
5986		barn_put_empty_sheaf(barn, empty);
5987 return;
5988 }
5989
5990 /* Also unlikely for the same reason */
5991 if (pcs->spare->size < s->sheaf_capacity) {
5992 swap(pcs->main, pcs->spare);
5993		barn_put_empty_sheaf(barn, empty);
5994 return;
5995 }
5996
5997 /*
5998 * We probably failed barn_replace_full_sheaf() due to no empty sheaf
5999 * available there, but we allocated one, so finish the job.
6000 */
6001	barn_put_full_sheaf(barn, pcs->main);
6002	stat(s, BARN_PUT);
6003 pcs->main = empty;
6004}
6005
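/*
 * Orientation note (a summary of the code below, not new behaviour): each CPU
 * keeps up to three sheaf slots. pcs->main is what the freeing fast path
 * fills, pcs->spare is swapped in when main becomes full, and pcs->rcu_free
 * gathers objects queued by __kfree_rcu_sheaf() until it reaches
 * sheaf_capacity and is handed to call_rcu(). Sheaves that cannot be kept
 * here are exchanged with the per-node barn.
 */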
6006/*
6007 * Replace the full main sheaf with a (at least partially) empty sheaf.
6008 *
6009 * Must be called with the cpu_sheaves local lock locked. If successful, returns
6010 * the pcs pointer and the local lock locked (possibly on a different cpu than
6011 * initially called). If not successful, returns NULL and the local lock
6012 * unlocked.
6013 */
6014static struct slub_percpu_sheaves *
6015__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
6016{
6017 struct slab_sheaf *empty;
6018 struct node_barn *barn;
6019 bool put_fail;
6020
6021restart:
6022 lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
6023
6024 barn = get_barn(s);
6025 if (!barn) {
6026 local_unlock(&s->cpu_sheaves->lock);
6027 return NULL;
6028 }
6029
6030 put_fail = false;
6031
6032 if (!pcs->spare) {
6033 empty = barn_get_empty_sheaf(barn);
6034 if (empty) {
6035 pcs->spare = pcs->main;
6036 pcs->main = empty;
6037 return pcs;
6038 }
6039 goto alloc_empty;
6040 }
6041
6042 if (pcs->spare->size < s->sheaf_capacity) {
6043 swap(pcs->main, pcs->spare);
6044 return pcs;
6045 }
6046
6047	empty = barn_replace_full_sheaf(barn, pcs->main);
6048
6049	if (!IS_ERR(empty)) {
6050		stat(s, BARN_PUT);
6051 pcs->main = empty;
6052 return pcs;
6053 }
6054
6055	if (PTR_ERR(empty) == -E2BIG) {
6056 /* Since we got here, spare exists and is full */
6057 struct slab_sheaf *to_flush = pcs->spare;
6058
6059		stat(s, BARN_PUT_FAIL);
6060
6061 pcs->spare = NULL;
6062 local_unlock(&s->cpu_sheaves->lock);
6063
6064		sheaf_flush_unused(s, to_flush);
6065 empty = to_flush;
6066 goto got_empty;
6067 }
6068
6069 /*
6070 * We could not replace full sheaf because barn had no empty
6071 * sheaves. We can still allocate it and put the full sheaf in
6072 * __pcs_install_empty_sheaf(), but if we fail to allocate it,
6073 * make sure to count the fail.
6074 */
6075 put_fail = true;
6076
6077alloc_empty:
6078 local_unlock(&s->cpu_sheaves->lock);
6079
6080 empty = alloc_empty_sheaf(s, GFP_NOWAIT);
6081 if (empty)
6082 goto got_empty;
6083
6084 if (put_fail)
6085		stat(s, BARN_PUT_FAIL);
6086
6087 if (!sheaf_flush_main(s))
6088 return NULL;
6089
6090 if (!local_trylock(&s->cpu_sheaves->lock))
6091 return NULL;
6092
6093 pcs = this_cpu_ptr(s->cpu_sheaves);
6094
6095 /*
6096 * we flushed the main sheaf so it should be empty now,
6097 * but in case we got preempted or migrated, we need to
6098 * check again
6099 */
6100 if (pcs->main->size == s->sheaf_capacity)
6101 goto restart;
6102
6103 return pcs;
6104
6105got_empty:
6106 if (!local_trylock(&s->cpu_sheaves->lock)) {
6107		barn_put_empty_sheaf(barn, empty);
6108 return NULL;
6109 }
6110
6111 pcs = this_cpu_ptr(s->cpu_sheaves);
6112 __pcs_install_empty_sheaf(s, pcs, empty, barn);
6113
6114 return pcs;
6115}
6116
6117/*
6118 * Free an object to the percpu sheaves.
6119 * The object is expected to have passed slab_free_hook() already.
6120 */
6121static __fastpath_inline
6122bool free_to_pcs(struct kmem_cache *s, void *object)
6123{
6124 struct slub_percpu_sheaves *pcs;
6125
6126 if (!local_trylock(&s->cpu_sheaves->lock))
6127 return false;
6128
6129 pcs = this_cpu_ptr(s->cpu_sheaves);
6130
6131 if (unlikely(pcs->main->size == s->sheaf_capacity)) {
6132
6133 pcs = __pcs_replace_full_main(s, pcs);
6134 if (unlikely(!pcs))
6135 return false;
6136 }
6137
6138 pcs->main->objects[pcs->main->size++] = object;
6139
6140 local_unlock(&s->cpu_sheaves->lock);
6141
6142	stat(s, FREE_PCS);
6143
6144 return true;
6145}
6146
6147static void rcu_free_sheaf(struct rcu_head *head)
6148{
6149 struct kmem_cache_node *n;
6150 struct slab_sheaf *sheaf;
6151 struct node_barn *barn = NULL;
6152 struct kmem_cache *s;
6153
6154 sheaf = container_of(head, struct slab_sheaf, rcu_head);
6155
6156 s = sheaf->cache;
6157
6158 /*
6159 * This may remove some objects due to slab_free_hook() returning false,
6160 * so that the sheaf might no longer be completely full. But it's easier
6161 * to handle it as full (unless it became completely empty), as the code
6162 * handles it fine. The only downside is that sheaf will serve fewer
6163 * allocations when reused. It only happens due to debugging, which is a
6164 * performance hit anyway.
6165 */
6166 __rcu_free_sheaf_prepare(s, sheaf);
6167
6168	n = get_node(s, sheaf->node);
6169 if (!n)
6170 goto flush;
6171
6172 barn = n->barn;
6173
6174 /* due to slab_free_hook() */
6175 if (unlikely(sheaf->size == 0))
6176 goto empty;
6177
6178 /*
6179 * Checking nr_full/nr_empty outside lock avoids contention in case the
6180 * barn is at the respective limit. Due to the race we might go over the
6181 * limit but that should be rare and harmless.
6182 */
6183
6184 if (data_race(barn->nr_full) < MAX_FULL_SHEAVES) {
6185		stat(s, BARN_PUT);
6186 barn_put_full_sheaf(barn, sheaf);
6187 return;
6188 }
6189
6190flush:
6191	stat(s, BARN_PUT_FAIL);
6192 sheaf_flush_unused(s, sheaf);
6193
6194empty:
6195 if (barn && data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) {
6196 barn_put_empty_sheaf(barn, sheaf);
6197 return;
6198 }
6199
6200 free_empty_sheaf(s, sheaf);
6201}
6202
6203bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
6204{
6205 struct slub_percpu_sheaves *pcs;
6206 struct slab_sheaf *rcu_sheaf;
6207
6208 if (!local_trylock(&s->cpu_sheaves->lock))
6209 goto fail;
6210
6211 pcs = this_cpu_ptr(s->cpu_sheaves);
6212
6213 if (unlikely(!pcs->rcu_free)) {
6214
6215 struct slab_sheaf *empty;
6216 struct node_barn *barn;
6217
6218 if (pcs->spare && pcs->spare->size == 0) {
6219 pcs->rcu_free = pcs->spare;
6220 pcs->spare = NULL;
6221 goto do_free;
6222 }
6223
6224 barn = get_barn(s);
6225 if (!barn) {
6226 local_unlock(&s->cpu_sheaves->lock);
6227 goto fail;
6228 }
6229
6230 empty = barn_get_empty_sheaf(barn);
6231
6232 if (empty) {
6233 pcs->rcu_free = empty;
6234 goto do_free;
6235 }
6236
6237 local_unlock(&s->cpu_sheaves->lock);
6238
6239 empty = alloc_empty_sheaf(s, GFP_NOWAIT);
6240
6241 if (!empty)
6242 goto fail;
6243
6244 if (!local_trylock(&s->cpu_sheaves->lock)) {
6245			barn_put_empty_sheaf(barn, empty);
6246 goto fail;
6247 }
6248
6249 pcs = this_cpu_ptr(s->cpu_sheaves);
6250
6251 if (unlikely(pcs->rcu_free))
6252			barn_put_empty_sheaf(barn, empty);
6253 else
6254 pcs->rcu_free = empty;
6255 }
6256
6257do_free:
6258
6259 rcu_sheaf = pcs->rcu_free;
6260
6261 /*
6262 * Since we flush immediately when size reaches capacity, we never reach
6263 * this with size already at capacity, so no OOB write is possible.
6264 */
6265 rcu_sheaf->objects[rcu_sheaf->size++] = obj;
6266
6267 if (likely(rcu_sheaf->size < s->sheaf_capacity)) {
6268 rcu_sheaf = NULL;
6269 } else {
6270 pcs->rcu_free = NULL;
6271 rcu_sheaf->node = numa_mem_id();
6272 }
6273
6274 /*
6275 * we flush before local_unlock to make sure a racing
6276 * flush_all_rcu_sheaves() doesn't miss this sheaf
6277 */
6278 if (rcu_sheaf)
6279		call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
6280
6281 local_unlock(&s->cpu_sheaves->lock);
6282
6283	stat(s, FREE_RCU_SHEAF);
6284 return true;
6285
6286fail:
6287	stat(s, FREE_RCU_SHEAF_FAIL);
6288 return false;
6289}
6290
6291/*
6292 * Bulk free objects to the percpu sheaves.
6293 * Unlike free_to_pcs() this includes the calls to all necessary hooks
6294 * and the fallback to freeing to slab pages.
6295 */
6296static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
6297{
6298 struct slub_percpu_sheaves *pcs;
6299 struct slab_sheaf *main, *empty;
6300	bool init = slab_want_init_on_free(s);
6301 unsigned int batch, i = 0;
6302 struct node_barn *barn;
6303 void *remote_objects[PCS_BATCH_MAX];
6304 unsigned int remote_nr = 0;
6305 int node = numa_mem_id();
6306
6307next_remote_batch:
6308 while (i < size) {
6309		struct slab *slab = virt_to_slab(p[i]);
6310
6311		memcg_slab_free_hook(s, slab, p + i, 1);
6312		alloc_tagging_slab_free_hook(s, slab, p + i, 1);
6313
6314 if (unlikely(!slab_free_hook(s, p[i], init, false))) {
6315 p[i] = p[--size];
6316 if (!size)
6317 goto flush_remote;
6318 continue;
6319 }
6320
6321 if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)) {
6322 remote_objects[remote_nr] = p[i];
6323 p[i] = p[--size];
6324 if (++remote_nr >= PCS_BATCH_MAX)
6325 goto flush_remote;
6326 continue;
6327 }
6328
6329 i++;
6330 }
6331
6332next_batch:
6333 if (!local_trylock(&s->cpu_sheaves->lock))
6334 goto fallback;
6335
6336 pcs = this_cpu_ptr(s->cpu_sheaves);
6337
6338 if (likely(pcs->main->size < s->sheaf_capacity))
6339 goto do_free;
6340
6341 barn = get_barn(s);
6342 if (!barn)
6343 goto no_empty;
6344
6345 if (!pcs->spare) {
6346 empty = barn_get_empty_sheaf(barn);
6347 if (!empty)
6348 goto no_empty;
6349
6350 pcs->spare = pcs->main;
6351 pcs->main = empty;
6352 goto do_free;
6353 }
6354
6355 if (pcs->spare->size < s->sheaf_capacity) {
6356 swap(pcs->main, pcs->spare);
6357 goto do_free;
6358 }
6359
6360	empty = barn_replace_full_sheaf(barn, pcs->main);
6361	if (IS_ERR(empty)) {
6362		stat(s, BARN_PUT_FAIL);
6363 goto no_empty;
6364 }
6365
6366	stat(s, BARN_PUT);
6367 pcs->main = empty;
6368
6369do_free:
6370 main = pcs->main;
6371 batch = min(size, s->sheaf_capacity - main->size);
6372
6373	memcpy(main->objects + main->size, p, batch * sizeof(void *));
6374 main->size += batch;
6375
6376 local_unlock(&s->cpu_sheaves->lock);
6377
6378	stat_add(s, FREE_PCS, batch);
6379
6380 if (batch < size) {
6381 p += batch;
6382 size -= batch;
6383 goto next_batch;
6384 }
6385
6386 return;
6387
6388no_empty:
6389 local_unlock(&s->cpu_sheaves->lock);
6390
6391 /*
6392 * if we depleted all empty sheaves in the barn or there are too
6393 * many full sheaves, free the rest to slab pages
6394 */
6395fallback:
6396 __kmem_cache_free_bulk(s, size, p);
6397
6398flush_remote:
6399 if (remote_nr) {
6400		__kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]);
6401 if (i < size) {
6402 remote_nr = 0;
6403 goto next_remote_batch;
6404 }
6405 }
6406}
6407
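/*
 * Per-cpu lists of objects and slabs whose freeing could not take the
 * required locks in the current context (e.g. kfree_nolock() from NMI or
 * with the local lock already held). They are drained later from irq_work.
 */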
6408struct defer_free {
6409 struct llist_head objects;
6410 struct llist_head slabs;
6411 struct irq_work work;
6412};
6413
6414static void free_deferred_objects(struct irq_work *work);
6415
6416static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = {
6417 .objects = LLIST_HEAD_INIT(objects),
6418 .slabs = LLIST_HEAD_INIT(slabs),
6419 .work = IRQ_WORK_INIT(free_deferred_objects),
6420};
6421
6422/*
6423 * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe
6424 * to take sleeping spin_locks from __slab_free() and deactivate_slab().
6425 * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore().
6426 */
6427static void free_deferred_objects(struct irq_work *work)
6428{
6429 struct defer_free *df = container_of(work, struct defer_free, work);
6430 struct llist_head *objs = &df->objects;
6431 struct llist_head *slabs = &df->slabs;
6432 struct llist_node *llnode, *pos, *t;
6433
6434	if (llist_empty(objs) && llist_empty(slabs))
6435 return;
6436
6437	llnode = llist_del_all(objs);
6438 llist_for_each_safe(pos, t, llnode) {
6439 struct kmem_cache *s;
6440 struct slab *slab;
6441 void *x = pos;
6442
6443		slab = virt_to_slab(x);
6444 s = slab->slab_cache;
6445
6446 /*
6447 * We used freepointer in 'x' to link 'x' into df->objects.
6448 * Clear it to NULL to avoid false positive detection
6449 * of "Freepointer corruption".
6450 */
6451 *(void **)x = NULL;
6452
6453 /* Point 'x' back to the beginning of allocated object */
6454 x -= s->offset;
6455		__slab_free(s, slab, x, x, 1, _THIS_IP_);
6456 }
6457
6458	llnode = llist_del_all(slabs);
6459 llist_for_each_safe(pos, t, llnode) {
6460 struct slab *slab = container_of(pos, struct slab, llnode);
6461
6462#ifdef CONFIG_SLUB_TINY
6463 discard_slab(slab->slab_cache, slab);
6464#else
6465		deactivate_slab(slab->slab_cache, slab, slab->flush_freelist);
6466#endif
6467 }
6468}
6469
6470static void defer_free(struct kmem_cache *s, void *head)
6471{
6472 struct defer_free *df;
6473
6474 guard(preempt)();
6475
6476 df = this_cpu_ptr(&defer_free_objects);
6477	if (llist_add(head + s->offset, &df->objects))
6478		irq_work_queue(&df->work);
6479}
6480
6481static void defer_deactivate_slab(struct slab *slab, void *flush_freelist)
6482{
6483 struct defer_free *df;
6484
6485 slab->flush_freelist = flush_freelist;
6486
6487 guard(preempt)();
6488
6489 df = this_cpu_ptr(&defer_free_objects);
6490	if (llist_add(&slab->llnode, &df->slabs))
6491		irq_work_queue(&df->work);
6492}
6493
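/*
 * Make sure any irq_work queued by defer_free() or defer_deactivate_slab()
 * has finished on all possible CPUs, so that previously deferred objects and
 * slabs are no longer in flight.
 */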
6494void defer_free_barrier(void)
6495{
6496 int cpu;
6497
6498 for_each_possible_cpu(cpu)
6499		irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work);
6500}
6501
6502#ifndef CONFIG_SLUB_TINY
6503/*
6504 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
6505 * can perform fastpath freeing without additional function calls.
6506 *
6507 * The fastpath is only possible if we are freeing to the current cpu slab
6508 * of this processor. This is typically the case if we have just allocated
6509 * the item before.
6510 *
6511 * If fastpath is not possible then fall back to __slab_free where we deal
6512 * with all sorts of special processing.
6513 *
6514 * Bulk free of a freelist with several objects (all pointing to the
6515 * same slab) is possible by specifying head and tail ptr, plus objects
6516 * count (cnt). Bulk free is indicated by the tail pointer being set.
6517 */
6518static __always_inline void do_slab_free(struct kmem_cache *s,
6519 struct slab *slab, void *head, void *tail,
6520 int cnt, unsigned long addr)
6521{
6522 /* cnt == 0 signals that it's called from kfree_nolock() */
6523 bool allow_spin = cnt;
6524 struct kmem_cache_cpu *c;
6525 unsigned long tid;
6526 void **freelist;
6527
6528redo:
6529 /*
6530	 * Determine the current cpu's per cpu slab.
6531 * The cpu may change afterward. However that does not matter since
6532 * data is retrieved via this pointer. If we are on the same cpu
6533 * during the cmpxchg then the free will succeed.
6534 */
6535 c = raw_cpu_ptr(s->cpu_slab);
6536 tid = READ_ONCE(c->tid);
6537
6538 /* Same with comment on barrier() in __slab_alloc_node() */
6539 barrier();
6540
6541 if (unlikely(slab != c->slab)) {
6542 if (unlikely(!allow_spin)) {
6543 /*
6544 * __slab_free() can locklessly cmpxchg16 into a slab,
6545 * but then it might need to take spin_lock or local_lock
6546 * in put_cpu_partial() for further processing.
6547 * Avoid the complexity and simply add to a deferred list.
6548 */
6549 defer_free(s, head);
6550 } else {
6551 __slab_free(s, slab, head, tail, cnt, addr);
6552 }
6553 return;
6554 }
6555
6556 if (unlikely(!allow_spin)) {
6557 if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) &&
6558 local_lock_is_locked(&s->cpu_slab->lock)) {
6559 defer_free(s, head);
6560 return;
6561 }
6562 cnt = 1; /* restore cnt. kfree_nolock() frees one object at a time */
6563 }
6564
6565 if (USE_LOCKLESS_FAST_PATH()) {
6566 freelist = READ_ONCE(c->freelist);
6567
6568		set_freepointer(s, tail, freelist);
6569
6570 if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
6571			note_cmpxchg_failure("slab_free", s, tid);
6572 goto redo;
6573 }
6574 } else {
6575 __maybe_unused unsigned long flags = 0;
6576
6577 /* Update the free list under the local lock */
6578 local_lock_cpu_slab(s, flags);
6579 c = this_cpu_ptr(s->cpu_slab);
6580 if (unlikely(slab != c->slab)) {
6581 local_unlock_cpu_slab(s, flags);
6582 goto redo;
6583 }
6584 tid = c->tid;
6585 freelist = c->freelist;
6586
6587		set_freepointer(s, tail, freelist);
6588 c->freelist = head;
6589 c->tid = next_tid(tid);
6590
6591 local_unlock_cpu_slab(s, flags);
6592 }
6593	stat_add(s, FREE_FASTPATH, cnt);
6594}
6595#else /* CONFIG_SLUB_TINY */
6596static void do_slab_free(struct kmem_cache *s,
6597 struct slab *slab, void *head, void *tail,
6598 int cnt, unsigned long addr)
6599{
6600 __slab_free(s, slab, head, tail, cnt, addr);
6601}
6602#endif /* CONFIG_SLUB_TINY */
6603
6604static __fastpath_inline
6605void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
6606 unsigned long addr)
6607{
6608	memcg_slab_free_hook(s, slab, &object, 1);
6609	alloc_tagging_slab_free_hook(s, slab, &object, 1);
6610
6611 if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false)))
6612 return;
6613
6614 if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) ||
6615 slab_nid(slab) == numa_mem_id())) {
6616 if (likely(free_to_pcs(s, object)))
6617 return;
6618 }
6619
6620	do_slab_free(s, slab, object, object, 1, addr);
6621}
6622
6623#ifdef CONFIG_MEMCG
6624/* Do not inline the rare memcg charging failed path into the allocation path */
6625static noinline
6626void memcg_alloc_abort_single(struct kmem_cache *s, void *object)
6627{
6628 if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false)))
6629 do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_);
6630}
6631#endif
6632
6633static __fastpath_inline
6634void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head,
6635 void *tail, void **p, int cnt, unsigned long addr)
6636{
6637	memcg_slab_free_hook(s, slab, p, cnt);
6638	alloc_tagging_slab_free_hook(s, slab, p, cnt);
6639 /*
6640 * With KASAN enabled slab_free_freelist_hook modifies the freelist
6641 * to remove objects, whose reuse must be delayed.
6642 */
6643 if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt)))
6644 do_slab_free(s, slab, head, tail, cnt, addr);
6645}
6646
6647#ifdef CONFIG_SLUB_RCU_DEBUG
6648static void slab_free_after_rcu_debug(struct rcu_head *rcu_head)
6649{
6650 struct rcu_delayed_free *delayed_free =
6651 container_of(rcu_head, struct rcu_delayed_free, head);
6652 void *object = delayed_free->object;
6653 struct slab *slab = virt_to_slab(object);
6654 struct kmem_cache *s;
6655
6656 kfree(delayed_free);
6657
6658 if (WARN_ON(is_kfence_address(object)))
6659 return;
6660
6661 /* find the object and the cache again */
6662 if (WARN_ON(!slab))
6663 return;
6664 s = slab->slab_cache;
6665 if (WARN_ON(!(s->flags & SLAB_TYPESAFE_BY_RCU)))
6666 return;
6667
6668 /* resume freeing */
6669 if (slab_free_hook(s, object, slab_want_init_on_free(s), true))
6670 do_slab_free(s, slab, object, object, 1, _THIS_IP_);
6671}
6672#endif /* CONFIG_SLUB_RCU_DEBUG */
6673
6674#ifdef CONFIG_KASAN_GENERIC
6675void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
6676{
6677 do_slab_free(cache, virt_to_slab(x), x, x, 1, addr);
6678}
6679#endif
6680
6681static inline struct kmem_cache *virt_to_cache(const void *obj)
6682{
6683 struct slab *slab;
6684
6685	slab = virt_to_slab(obj);
6686 if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__))
6687 return NULL;
6688 return slab->slab_cache;
6689}
6690
6691static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
6692{
6693 struct kmem_cache *cachep;
6694
6695 if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
6696 !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS))
6697 return s;
6698
6699	cachep = virt_to_cache(x);
6700 if (WARN(cachep && cachep != s,
6701 "%s: Wrong slab cache. %s but object is from %s\n",
6702 __func__, s->name, cachep->name))
6703		print_tracking(cachep, x);
6704 return cachep;
6705}
6706
6707/**
6708 * kmem_cache_free - Deallocate an object
6709 * @s: The cache the allocation was from.
6710 * @x: The previously allocated object.
6711 *
6712 * Free an object which was previously allocated from this
6713 * cache.
6714 */
6715void kmem_cache_free(struct kmem_cache *s, void *x)
6716{
6717 s = cache_from_obj(s, x);
6718 if (!s)
6719 return;
6720	trace_kmem_cache_free(_RET_IP_, x, s);
6721	slab_free(s, virt_to_slab(x), x, _RET_IP_);
6722}
6723EXPORT_SYMBOL(kmem_cache_free);
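/*
 * Illustrative example (the cache and struct are hypothetical): an object
 * must be returned to the same cache it was allocated from:
 *
 *	struct foo *f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *
 *	if (!f)
 *		return -ENOMEM;
 *	...
 *	kmem_cache_free(foo_cachep, f);
 */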
6724
6725static void free_large_kmalloc(struct folio *folio, void *object)
6726{
6727 unsigned int order = folio_order(folio);
6728
6729 if (WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) {
6730		dump_page(&folio->page, "Not a kmalloc allocation");
6731 return;
6732 }
6733
6734 if (WARN_ON_ONCE(order == 0))
6735 pr_warn_once("object pointer: 0x%p\n", object);
6736
6737	kmemleak_free(object);
6738	kasan_kfree_large(object);
6739	kmsan_kfree_large(object);
6740
6741	lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
6742			      -(PAGE_SIZE << order));
6743 __folio_clear_large_kmalloc(folio);
6744	free_frozen_pages(&folio->page, order);
6745}
6746
6747/*
6748 * Given an rcu_head embedded within an object obtained from kvmalloc at an
6749 * offset < 4k, free the object in question.
6750 */
6751void kvfree_rcu_cb(struct rcu_head *head)
6752{
6753 void *obj = head;
6754 struct folio *folio;
6755 struct slab *slab;
6756 struct kmem_cache *s;
6757 void *slab_addr;
6758
6759	if (is_vmalloc_addr(obj)) {
6760		obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
6761		vfree(obj);
6762 return;
6763 }
6764
6765	folio = virt_to_folio(obj);
6766 if (!folio_test_slab(folio)) {
6767 /*
6768 * rcu_head offset can be only less than page size so no need to
6769 * consider folio order
6770 */
6771 obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
6772		free_large_kmalloc(folio, obj);
6773 return;
6774 }
6775
6776 slab = folio_slab(folio);
6777 s = slab->slab_cache;
6778 slab_addr = folio_address(folio);
6779
6780	if (is_kfence_address(obj)) {
6781		obj = kfence_object_start(obj);
6782	} else {
6783		unsigned int idx = __obj_to_index(s, slab_addr, obj);
6784
6785		obj = slab_addr + s->size * idx;
6786		obj = fixup_red_left(s, obj);
6787 }
6788
6789	slab_free(s, slab, obj, _RET_IP_);
6790}
6791
6792/**
6793 * kfree - free previously allocated memory
6794 * @object: pointer returned by kmalloc() or kmem_cache_alloc()
6795 *
6796 * If @object is NULL, no operation is performed.
6797 */
6798void kfree(const void *object)
6799{
6800 struct folio *folio;
6801 struct slab *slab;
6802 struct kmem_cache *s;
6803 void *x = (void *)object;
6804
6805	trace_kfree(_RET_IP_, object);
6806
6807 if (unlikely(ZERO_OR_NULL_PTR(object)))
6808 return;
6809
6810	folio = virt_to_folio(object);
6811	if (unlikely(!folio_test_slab(folio))) {
6812		free_large_kmalloc(folio, (void *)object);
6813 return;
6814 }
6815
6816 slab = folio_slab(folio);
6817 s = slab->slab_cache;
6818	slab_free(s, slab, x, _RET_IP_);
6819}
6820EXPORT_SYMBOL(kfree);
6821
6822/*
6823 * Can be called while holding raw_spinlock_t or from IRQ and NMI,
6824 * but ONLY for objects allocated by kmalloc_nolock().
6825 * Debug checks (like kmemleak and kfence) were skipped on allocation,
6826 * hence
6827 * obj = kmalloc(); kfree_nolock(obj);
6828 * will miss kmemleak/kfence book keeping and will cause false positives.
6829 * large_kmalloc is not supported either.
6830 */
6831void kfree_nolock(const void *object)
6832{
6833 struct folio *folio;
6834 struct slab *slab;
6835 struct kmem_cache *s;
6836 void *x = (void *)object;
6837
6838 if (unlikely(ZERO_OR_NULL_PTR(object)))
6839 return;
6840
6841	folio = virt_to_folio(object);
6842 if (unlikely(!folio_test_slab(folio))) {
6843 WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()");
6844 return;
6845 }
6846
6847 slab = folio_slab(folio);
6848 s = slab->slab_cache;
6849
6850	memcg_slab_free_hook(s, slab, &x, 1);
6851	alloc_tagging_slab_free_hook(s, slab, &x, 1);
6852 /*
6853 * Unlike slab_free() do NOT call the following:
6854 * kmemleak_free_recursive(x, s->flags);
6855 * debug_check_no_locks_freed(x, s->object_size);
6856 * debug_check_no_obj_freed(x, s->object_size);
6857 * __kcsan_check_access(x, s->object_size, ..);
6858 * kfence_free(x);
6859 * since they take spinlocks or not safe from any context.
6860 */
6861	kmsan_slab_free(s, x);
6862 /*
6863 * If KASAN finds a kernel bug it will do kasan_report_invalid_free()
6864 * which will call raw_spin_lock_irqsave() which is technically
6865 * unsafe from NMI, but take chance and report kernel bug.
6866 * The sequence of
6867 * kasan_report_invalid_free() -> raw_spin_lock_irqsave() -> NMI
6868 * -> kfree_nolock() -> kasan_report_invalid_free() on the same CPU
6869 * is double buggy and deserves to deadlock.
6870 */
6871	if (kasan_slab_pre_free(s, x))
6872 return;
6873 /*
6874 * memcg, kasan_slab_pre_free are done for 'x'.
6875 * The only thing left is kasan_poison without quarantine,
6876 * since kasan quarantine takes locks and not supported from NMI.
6877 */
6878	kasan_slab_free(s, x, false, false, /* skip quarantine */true);
6879#ifndef CONFIG_SLUB_TINY
6880	do_slab_free(s, slab, x, x, 0, _RET_IP_);
6881#else
6882 defer_free(s, x);
6883#endif
6884}
6885EXPORT_SYMBOL_GPL(kfree_nolock);
6886
6887static __always_inline __realloc_size(2) void *
6888__do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, int nid)
6889{
6890 void *ret;
6891 size_t ks = 0;
6892 int orig_size = 0;
6893 struct kmem_cache *s = NULL;
6894
6895 if (unlikely(ZERO_OR_NULL_PTR(p)))
6896 goto alloc_new;
6897
6898 /* Check for double-free. */
6899	if (!kasan_check_byte(p))
6900 return NULL;
6901
6902 /*
6903 * If reallocation is not necessary (e. g. the new size is less
6904 * than the current allocated size), the current allocation will be
6905 * preserved unless __GFP_THISNODE is set. In the latter case a new
6906 * allocation on the requested node will be attempted.
6907 */
6908 if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
6909 nid != page_to_nid(virt_to_page(p)))
6910 goto alloc_new;
6911
6912	if (is_kfence_address(p)) {
6913		ks = orig_size = kfence_ksize(p);
6914 } else {
6915 struct folio *folio;
6916
6917		folio = virt_to_folio(p);
6918 if (unlikely(!folio_test_slab(folio))) {
6919 /* Big kmalloc object */
6920 WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE);
6921 WARN_ON(p != folio_address(folio));
6922 ks = folio_size(folio);
6923 } else {
6924 s = folio_slab(folio)->slab_cache;
6925			orig_size = get_orig_size(s, (void *)p);
6926 ks = s->object_size;
6927 }
6928 }
6929
6930 /* If the old object doesn't fit, allocate a bigger one */
6931 if (new_size > ks)
6932 goto alloc_new;
6933
6934 /* If the old object doesn't satisfy the new alignment, allocate a new one */
6935 if (!IS_ALIGNED((unsigned long)p, align))
6936 goto alloc_new;
6937
6938 /* Zero out spare memory. */
6939 if (want_init_on_alloc(flags)) {
6940 kasan_disable_current();
6941 if (orig_size && orig_size < new_size)
6942			memset(kasan_reset_tag(p) + orig_size, 0, new_size - orig_size);
6943		else
6944			memset(kasan_reset_tag(p) + new_size, 0, ks - new_size);
6945 kasan_enable_current();
6946 }
6947
6948 /* Setup kmalloc redzone when needed */
6949 if (s && slub_debug_orig_size(s)) {
6950		set_orig_size(s, (void *)p, new_size);
6951		if (s->flags & SLAB_RED_ZONE && new_size < ks)
6952			memset_no_sanitize_memory(kasan_reset_tag(p) + new_size,
6953						  SLUB_RED_ACTIVE, ks - new_size);
6954 }
6955
6956	p = kasan_krealloc(p, new_size, flags);
6957 return (void *)p;
6958
6959alloc_new:
6960 ret = kmalloc_node_track_caller_noprof(new_size, flags, nid, _RET_IP_);
6961 if (ret && p) {
6962 /* Disable KASAN checks as the object's redzone is accessed. */
6963 kasan_disable_current();
6964		memcpy(ret, kasan_reset_tag(p), orig_size ?: ks);
6965 kasan_enable_current();
6966 }
6967
6968 return ret;
6969}
6970
6971/**
6972 * krealloc_node_align - reallocate memory. The contents will remain unchanged.
6973 * @p: object to reallocate memory for.
6974 * @new_size: how many bytes of memory are required.
6975 * @align: desired alignment.
6976 * @flags: the type of memory to allocate.
6977 * @nid: NUMA node or NUMA_NO_NODE
6978 *
6979 * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size
6980 * is 0 and @p is not a %NULL pointer, the object pointed to is freed.
6981 *
6982 * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
6983 * Documentation/core-api/memory-allocation.rst for more details.
6984 *
6985 * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
6986 * initial memory allocation, every subsequent call to this API for the same
6987 * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
6988 * __GFP_ZERO is not fully honored by this API.
6989 *
6990 * When slub_debug_orig_size() is off, krealloc() only knows about the bucket
6991 * size of an allocation (but not the exact size it was allocated with) and
6992 * hence implements the following semantics for shrinking and growing buffers
6993 * with __GFP_ZERO::
6994 *
6995 * new bucket
6996 * 0 size size
6997 * |--------|----------------|
6998 * | keep | zero |
6999 *
7000 * Otherwise, the original allocation size 'orig_size' could be used to
7001 * precisely clear the requested size, and the new size will also be stored
7002 * as the new 'orig_size'.
7003 *
7004 * In any case, the contents of the object pointed to are preserved up to the
7005 * lesser of the new and old sizes.
7006 *
7007 * Return: pointer to the allocated memory or %NULL in case of error
7008 */
7009void *krealloc_node_align_noprof(const void *p, size_t new_size, unsigned long align,
7010 gfp_t flags, int nid)
7011{
7012 void *ret;
7013
7014 if (unlikely(!new_size)) {
7015 kfree(p);
7016 return ZERO_SIZE_PTR;
7017 }
7018
7019 ret = __do_krealloc(p, new_size, align, flags, nid);
7020	if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret))
7021 kfree(p);
7022
7023 return ret;
7024}
7025EXPORT_SYMBOL(krealloc_node_align_noprof);
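/*
 * Illustrative example of the __GFP_ZERO rule above (buffer and sizes are
 * hypothetical): every allocation in the lifetime of the buffer passes
 * __GFP_ZERO, and the failure case keeps or frees the old buffer instead of
 * leaking it:
 *
 *	buf = kzalloc(64, GFP_KERNEL);
 *	...
 *	new = krealloc(buf, 128, GFP_KERNEL | __GFP_ZERO);
 *	if (!new) {
 *		kfree(buf);
 *		return -ENOMEM;
 *	}
 *	buf = new;
 */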
7026
7027static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
7028{
7029 /*
7030 * We want to attempt a large physically contiguous block first because
7031 * it is less likely to fragment multiple larger blocks and therefore
7032 * contribute to a long term fragmentation less than vmalloc fallback.
7033	 * However, make sure that larger requests are not too disruptive - i.e.
7034	 * do not direct reclaim unless physically contiguous memory is preferred
7035	 * (__GFP_RETRY_MAYFAIL mode). We still kick in kswapd/kcompactd to
7036	 * start working in the background.
7037 */
7038 if (size > PAGE_SIZE) {
7039 flags |= __GFP_NOWARN;
7040
7041 if (!(flags & __GFP_RETRY_MAYFAIL))
7042 flags &= ~__GFP_DIRECT_RECLAIM;
7043
7044 /* nofail semantic is implemented by the vmalloc fallback */
7045 flags &= ~__GFP_NOFAIL;
7046 }
7047
7048 return flags;
7049}
7050
7051/**
7052 * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon
7053 * failure, fall back to non-contiguous (vmalloc) allocation.
7054 * @size: size of the request.
7055 * @b: which set of kmalloc buckets to allocate from.
7056 * @align: desired alignment.
7057 * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
7058 * @node: numa node to allocate from
7059 *
7060 * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
7061 * Documentation/core-api/memory-allocation.rst for more details.
7062 *
7063 * Uses kmalloc to get the memory but if the allocation fails then falls back
7064 * to the vmalloc allocator. Use kvfree for freeing the memory.
7065 *
7066 * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
7067 * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
7068 * preferable to the vmalloc fallback, due to visible performance drawbacks.
7069 *
7070 * Return: pointer to the allocated memory or %NULL in case of failure
7071 */
7072void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align,
7073 gfp_t flags, int node)
7074{
7075 void *ret;
7076
7077 /*
7078 * It doesn't really make sense to fallback to vmalloc for sub page
7079 * requests
7080 */
7081 ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b),
7082				kmalloc_gfp_adjust(flags, size),
7083 node, _RET_IP_);
7084 if (ret || size <= PAGE_SIZE)
7085 return ret;
7086
7087 /* non-sleeping allocations are not supported by vmalloc */
7088	if (!gfpflags_allow_blocking(flags))
7089 return NULL;
7090
7091 /* Don't even allow crazy sizes */
7092 if (unlikely(size > INT_MAX)) {
7093 WARN_ON_ONCE(!(flags & __GFP_NOWARN));
7094 return NULL;
7095 }
7096
7097 /*
7098 * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
7099 * since the callers already cannot assume anything
7100 * about the resulting pointer, and cannot play
7101 * protection games.
7102 */
7103	return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END,
7104			flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
7105			node, __builtin_return_address(0));
7106}
7107EXPORT_SYMBOL(__kvmalloc_node_noprof);
7108
7109/**
7110 * kvfree() - Free memory.
7111 * @addr: Pointer to allocated memory.
7112 *
7113 * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
7114 * It is slightly more efficient to use kfree() or vfree() if you are certain
7115 * that you know which one to use.
7116 *
7117 * Context: Either preemptible task context or not-NMI interrupt.
7118 */
7119void kvfree(const void *addr)
7120{
7121	if (is_vmalloc_addr(addr))
7122 vfree(addr);
7123 else
7124 kfree(addr);
7125}
7126EXPORT_SYMBOL(kvfree);
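/*
 * Illustrative example (sizes are hypothetical): a possibly large table is
 * allocated with the kv* pair so the caller does not need to know whether
 * kmalloc or vmalloc ended up backing it:
 *
 *	table = kvmalloc_array(nr_entries, sizeof(*table), GFP_KERNEL | __GFP_ZERO);
 *	if (!table)
 *		return -ENOMEM;
 *	...
 *	kvfree(table);
 */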
7127
7128/**
7129 * kvfree_sensitive - Free a data object containing sensitive information.
7130 * @addr: address of the data object to be freed.
7131 * @len: length of the data object.
7132 *
7133 * Use the special memzero_explicit() function to clear the content of a
7134 * kvmalloc'ed object containing sensitive data to make sure that the
7135 * compiler won't optimize out the data clearing.
7136 */
7137void kvfree_sensitive(const void *addr, size_t len)
7138{
7139 if (likely(!ZERO_OR_NULL_PTR(addr))) {
7140		memzero_explicit((void *)addr, len);
7141 kvfree(addr);
7142 }
7143}
7144EXPORT_SYMBOL(kvfree_sensitive);
7145
7146/**
7147 * kvrealloc_node_align - reallocate memory; contents remain unchanged
7148 * @p: object to reallocate memory for
7149 * @size: the size to reallocate
7150 * @align: desired alignment
7151 * @flags: the flags for the page level allocator
7152 * @nid: NUMA node id
7153 *
7154 * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
7155 * and @p is not a %NULL pointer, the object pointed to is freed.
7156 *
7157 * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
7158 * Documentation/core-api/memory-allocation.rst for more details.
7159 *
7160 * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
7161 * initial memory allocation, every subsequent call to this API for the same
7162 * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
7163 * __GFP_ZERO is not fully honored by this API.
7164 *
7165 * In any case, the contents of the object pointed to are preserved up to the
7166 * lesser of the new and old sizes.
7167 *
7168 * This function must not be called concurrently with itself or kvfree() for the
7169 * same memory allocation.
7170 *
7171 * Return: pointer to the allocated memory or %NULL in case of error
7172 */
7173void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
7174 gfp_t flags, int nid)
7175{
7176 void *n;
7177
7178	if (is_vmalloc_addr(p))
7179 return vrealloc_node_align_noprof(p, size, align, flags, nid);
7180
7181 n = krealloc_node_align_noprof(p, size, align, kmalloc_gfp_adjust(flags, size), nid);
7182 if (!n) {
7183 /* We failed to krealloc(), fall back to kvmalloc(). */
7184 n = kvmalloc_node_align_noprof(size, align, flags, nid);
7185 if (!n)
7186 return NULL;
7187
7188 if (p) {
7189 /* We already know that `p` is not a vmalloc address. */
7190 kasan_disable_current();
7191			memcpy(n, kasan_reset_tag(p), ksize(p));
7192 kasan_enable_current();
7193
7194 kfree(p);
7195 }
7196 }
7197
7198 return n;
7199}
7200EXPORT_SYMBOL(kvrealloc_node_align_noprof);
7201
7202struct detached_freelist {
7203 struct slab *slab;
7204 void *tail;
7205 void *freelist;
7206 int cnt;
7207 struct kmem_cache *s;
7208};
7209
7210/*
7211 * This function progressively scans the array with free objects (with
7212 * a limited look ahead) and extracts objects belonging to the same
7213 * slab. It builds a detached freelist directly within the given
7214 * slab/objects. This can happen without any need for
7215 * synchronization, because the objects are owned by the running process.
7216 * The freelist is built up as a singly linked list in the objects.
7217 * The idea is that this detached freelist can then be bulk
7218 * transferred to the real freelist(s), but only requiring a single
7219 * synchronization primitive. Look ahead in the array is limited due
7220 * to performance reasons.
7221 */
7222static inline
7223int build_detached_freelist(struct kmem_cache *s, size_t size,
7224 void **p, struct detached_freelist *df)
7225{
7226 int lookahead = 3;
7227 void *object;
7228 struct folio *folio;
7229 size_t same;
7230
7231 object = p[--size];
7232	folio = virt_to_folio(object);
7233	if (!s) {
7234		/* Handle kmalloc'ed objects */
7235 if (unlikely(!folio_test_slab(folio))) {
7236 free_large_kmalloc(folio, object);
7237 df->slab = NULL;
7238 return size;
7239 }
7240 /* Derive kmem_cache from object */
7241 df->slab = folio_slab(folio);
7242 df->s = df->slab->slab_cache;
7243 } else {
7244 df->slab = folio_slab(folio);
7245		df->s = cache_from_obj(s, object); /* Support for memcg */
7246 }
7247
7248 /* Start new detached freelist */
7249 df->tail = object;
7250 df->freelist = object;
7251 df->cnt = 1;
7252
7253	if (is_kfence_address(object))
7254		return size;
7255
7256	set_freepointer(df->s, object, NULL);
7257
7258 same = size;
7259 while (size) {
7260 object = p[--size];
7261 /* df->slab is always set at this point */
7262		if (df->slab == virt_to_slab(object)) {
7263			/* Opportunistically build the freelist */
7264			set_freepointer(df->s, object, df->freelist);
7265 df->freelist = object;
7266 df->cnt++;
7267 same--;
7268 if (size != same)
7269 swap(p[size], p[same]);
7270 continue;
7271 }
7272
7273 /* Limit look ahead search */
7274 if (!--lookahead)
7275 break;
7276 }
7277
7278 return same;
7279}
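/*
 * Worked example (illustrative): for p[] = { A1, B1, A2, A3 }, where An/Bn
 * are objects from slabs A and B, one call starts from A3 and links A3, A2
 * and A1 into a single detached freelist (cnt == 3), moves the unhandled B1
 * to the front of the array and returns 1; the next call then handles B1.
 */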
7280
7281/*
7282 * Internal bulk free of objects that were not initialised by the post alloc
7283 * hooks and thus should not be processed by the free hooks
7284 */
7285static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
7286{
7287 if (!size)
7288 return;
7289
7290 do {
7291 struct detached_freelist df;
7292
7293		size = build_detached_freelist(s, size, p, &df);
7294 if (!df.slab)
7295 continue;
7296
7297		if (kfence_free(df.freelist))
7298 continue;
7299
7300		do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt,
7301			     _RET_IP_);
7302 } while (likely(size));
7303}
7304
7305/* Note that interrupts must be enabled when calling this function. */
7306void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
7307{
7308 if (!size)
7309 return;
7310
7311	 * freeing to sheaves is incompatible with the detached freelist, so
7312	 * once we go that way, we have to do everything differently
7313 * once we go that way, we have to do everything differently
7314 */
7315 if (s && s->cpu_sheaves) {
7316 free_to_pcs_bulk(s, size, p);
7317 return;
7318 }
7319
7320 do {
7321 struct detached_freelist df;
7322
7323		size = build_detached_freelist(s, size, p, &df);
7324 if (!df.slab)
7325 continue;
7326
7327		slab_free_bulk(df.s, df.slab, df.freelist, df.tail, &p[size],
7328			       df.cnt, _RET_IP_);
7329 } while (likely(size));
7330}
7331EXPORT_SYMBOL(kmem_cache_free_bulk);
7332
7333#ifndef CONFIG_SLUB_TINY
7334static inline
7335int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
7336 void **p)
7337{
7338 struct kmem_cache_cpu *c;
7339 unsigned long irqflags;
7340 int i;
7341
7342 /*
7343 * Drain objects in the per cpu slab, while disabling local
7344	 * IRQs, which protects against PREEMPT and interrupt
7345	 * handlers invoking the normal fastpath.
7346 */
7347 c = slub_get_cpu_ptr(s->cpu_slab);
7348 local_lock_irqsave(&s->cpu_slab->lock, irqflags);
7349
7350 for (i = 0; i < size; i++) {
7351		void *object = kfence_alloc(s, s->object_size, flags);
7352
7353 if (unlikely(object)) {
7354 p[i] = object;
7355 continue;
7356 }
7357
7358 object = c->freelist;
7359 if (unlikely(!object)) {
7360 /*
7361 * We may have removed an object from c->freelist using
7362 * the fastpath in the previous iteration; in that case,
7363 * c->tid has not been bumped yet.
7364 * Since ___slab_alloc() may reenable interrupts while
7365 * allocating memory, we should bump c->tid now.
7366 */
7367			c->tid = next_tid(c->tid);
7368
7369 local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
7370
7371 /*
7372			 * Invoking the slow path likely has the side-effect
7373			 * of re-populating the per-CPU c->freelist
7374 */
7375			p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
7376					     _RET_IP_, c, s->object_size);
7377 if (unlikely(!p[i]))
7378 goto error;
7379
7380 c = this_cpu_ptr(s->cpu_slab);
7381			maybe_wipe_obj_freeptr(s, p[i]);
7382
7383 local_lock_irqsave(&s->cpu_slab->lock, irqflags);
7384
7385 continue; /* goto for-loop */
7386 }
7387 c->freelist = get_freepointer(s, object);
7388 p[i] = object;
7389		maybe_wipe_obj_freeptr(s, p[i]);
7390		stat(s, ALLOC_FASTPATH);
7391 }
7392	c->tid = next_tid(c->tid);
7393 local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
7394 slub_put_cpu_ptr(s->cpu_slab);
7395
7396 return i;
7397
7398error:
7399 slub_put_cpu_ptr(s->cpu_slab);
7400	__kmem_cache_free_bulk(s, i, p);
7401 return 0;
7402
7403}
7404#else /* CONFIG_SLUB_TINY */
7405static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
7406 size_t size, void **p)
7407{
7408 int i;
7409
7410 for (i = 0; i < size; i++) {
7411 void *object = kfence_alloc(s, s->object_size, flags);
7412
7413 if (unlikely(object)) {
7414 p[i] = object;
7415 continue;
7416 }
7417
7418 p[i] = __slab_alloc_node(s, flags, NUMA_NO_NODE,
7419 _RET_IP_, s->object_size);
7420 if (unlikely(!p[i]))
7421 goto error;
7422
7423 maybe_wipe_obj_freeptr(s, p[i]);
7424 }
7425
7426 return i;
7427
7428error:
7429 __kmem_cache_free_bulk(s, i, p);
7430 return 0;
7431}
7432#endif /* CONFIG_SLUB_TINY */
7433
7434/* Note that interrupts must be enabled when calling this function. */
7435int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
7436 void **p)
7437{
7438 unsigned int i = 0;
7439
7440 if (!size)
7441 return 0;
7442
7443 s = slab_pre_alloc_hook(s, flags);
7444 if (unlikely(!s))
7445 return 0;
7446
7447 if (s->cpu_sheaves)
7448 i = alloc_from_pcs_bulk(s, size, p);
7449
7450 if (i < size) {
7451 /*
7452 * If we ran out of memory, don't bother with freeing back to
7453 * the percpu sheaves, we have bigger problems.
7454 */
7455 if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) {
7456 if (i > 0)
7457				__kmem_cache_free_bulk(s, i, p);
7458 return 0;
7459 }
7460 }
7461
7462 /*
7463 * memcg and kmem_cache debug support and memory initialization.
7464 * Done outside of the IRQ disabled fastpath loop.
7465 */
7466 if (unlikely(!slab_post_alloc_hook(s, NULL, flags, size, p,
7467 slab_want_init_on_alloc(flags, s), s->object_size))) {
7468 return 0;
7469 }
7470
7471 return size;
7472}
7473EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof);
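/*
 * Illustrative example (cache and array size are hypothetical): the bulk API
 * either fills the whole array or returns 0, so the caller needs no partial
 * failure handling:
 *
 *	void *objs[16];
 *
 *	if (!kmem_cache_alloc_bulk(foo_cachep, GFP_KERNEL, ARRAY_SIZE(objs), objs))
 *		return -ENOMEM;
 *	...
 *	kmem_cache_free_bulk(foo_cachep, ARRAY_SIZE(objs), objs);
 */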
7474
7475/*
7476 * Object placement in a slab is made very easy because we always start at
7477 * offset 0. If we tune the size of the object to the alignment then we can
7478 * get the required alignment by putting one properly sized object after
7479 * another.
7480 *
7481 * Notice that the allocation order determines the sizes of the per cpu
7482 * caches. Each processor has always one slab available for allocations.
7483 * Increasing the allocation order reduces the number of times that slabs
7484 * must be moved on and off the partial lists and is therefore a factor in
7485 * locking overhead.
7486 */
7487
7488/*
7489 * Minimum / Maximum order of slab pages. This influences locking overhead
7490 * and slab fragmentation. A higher order reduces the number of partial slabs
7491 * and increases the number of allocations possible without having to
7492 * take the list_lock.
7493 */
7494static unsigned int slub_min_order;
7495static unsigned int slub_max_order =
7496 IS_ENABLED(CONFIG_SLUB_TINY) ? 1 : PAGE_ALLOC_COSTLY_ORDER;
7497static unsigned int slub_min_objects;
7498
7499/*
7500 * Calculate the order of allocation given an slab object size.
7501 *
7502 * The order of allocation has significant impact on performance and other
7503 * system components. Generally order 0 allocations should be preferred since
7504 * order 0 does not cause fragmentation in the page allocator. Larger objects
7505 * can be problematic to put into order 0 slabs because there may be too much
7506 * unused space left. We go to a higher order if more than 1/16th of the slab
7507 * would be wasted.
7508 *
7509 * In order to reach satisfactory performance we must ensure that a minimum
7510 * number of objects is in one slab. Otherwise we may generate too much
7511 * activity on the partial lists which requires taking the list_lock. This is
7512 * less a concern for large slabs though which are rarely used.
7513 *
7514 * slab_max_order specifies the order where we begin to stop considering the
7515 * number of objects in a slab as critical. If we reach slab_max_order then
7516 * we try to keep the page order as low as possible. So we accept more waste
7517 * of space in favor of a small page order.
7518 *
7519 * Higher order allocations also allow the placement of more objects in a
7520 * slab and thereby reduce object handling overhead. If the user has
7521 * requested a higher minimum order then we start with that one instead of
7522 * the smallest order which will fit the object.
7523 */
7524static inline unsigned int calc_slab_order(unsigned int size,
7525 unsigned int min_order, unsigned int max_order,
7526 unsigned int fract_leftover)
7527{
7528 unsigned int order;
7529
7530 for (order = min_order; order <= max_order; order++) {
7531
7532 unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
7533 unsigned int rem;
7534
7535 rem = slab_size % size;
7536
7537 if (rem <= slab_size / fract_leftover)
7538 break;
7539 }
7540
7541 return order;
7542}
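/*
 * Worked example (illustrative numbers): for a 700 byte object with the
 * initial 1/16 waste limit, order 0 leaves 4096 % 700 = 596 unused bytes,
 * more than 4096 / 16 = 256, so it is rejected; order 1 leaves
 * 8192 % 700 = 492 bytes, within 8192 / 16 = 512, so order 1 is returned
 * (assuming min_order <= 1 <= max_order).
 */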
7543
7544static inline int calculate_order(unsigned int size)
7545{
7546 unsigned int order;
7547 unsigned int min_objects;
7548 unsigned int max_objects;
7549 unsigned int min_order;
7550
7551 min_objects = slub_min_objects;
7552 if (!min_objects) {
7553 /*
7554 * Some architectures will only update present cpus when
7555 * onlining them, so don't trust the number if it's just 1. But
7556 * we also don't want to use nr_cpu_ids always, as on some other
7557 * architectures, there can be many possible cpus, but never
7558 * onlined. Here we compromise between trying to avoid too high
7559 * order on systems that appear larger than they are, and too
7560 * low order on systems that appear smaller than they are.
7561 */
7562 unsigned int nr_cpus = num_present_cpus();
7563 if (nr_cpus <= 1)
7564 nr_cpus = nr_cpu_ids;
7565 min_objects = 4 * (fls(nr_cpus) + 1);
7566 }
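 /*
 * Example (editor's illustration): with 16 present CPUs, fls(16) = 5,
 * so min_objects = 4 * (5 + 1) = 24, unless slab_min_objects= was
 * given on the command line.
 */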
7567 /* min_objects can't be 0 because get_order(0) is undefined */
7568 max_objects = max(order_objects(slub_max_order, size), 1U);
7569 min_objects = min(min_objects, max_objects);
7570
7571 min_order = max_t(unsigned int, slub_min_order,
7572 get_order(min_objects * size));
7573 if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
7574 return get_order(size * MAX_OBJS_PER_PAGE) - 1;
7575
7576 /*
7577 * Attempt to find best configuration for a slab. This works by first
7578 * attempting to generate a layout with the best possible configuration
7579 * and backing off gradually.
7580 *
7581 * We start with accepting at most 1/16 waste and try to find the
7582 * smallest order from min_objects-derived/slab_min_order up to
7583 * slab_max_order that will satisfy the constraint. Note that increasing
7584 * the order can only result in same or less fractional waste, not more.
7585 *
7586 * If that fails, we increase the acceptable fraction of waste and try
7587 * again. The last iteration with fraction of 1/2 would effectively
7588 * accept any waste and give us the order determined by min_objects, as
7589 * long as at least a single object fits within slab_max_order.
7590 */
7591 for (unsigned int fraction = 16; fraction > 1; fraction /= 2) {
7592 order = calc_slab_order(size, min_order, slub_max_order,
7593 fraction);
7594 if (order <= slub_max_order)
7595 return order;
7596 }
7597
7598 /*
7599 * Doh this slab cannot be placed using slab_max_order.
7600 */
7601 order = get_order(size);
7602 if (order <= MAX_PAGE_ORDER)
7603 return order;
7604 return -ENOSYS;
7605}
7606
7607static void
7608init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn)
7609{
7610 n->nr_partial = 0;
7611 spin_lock_init(&n->list_lock);
7612 INIT_LIST_HEAD(&n->partial);
7613#ifdef CONFIG_SLUB_DEBUG
7614 atomic_long_set(&n->nr_slabs, 0);
7615 atomic_long_set(&n->total_objects, 0);
7616 INIT_LIST_HEAD(&n->full);
7617#endif
7618 n->barn = barn;
7619 if (barn)
7620 barn_init(barn);
7621}
7622
7623#ifndef CONFIG_SLUB_TINY
7624static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
7625{
7626 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
7627 NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH *
7628 sizeof(struct kmem_cache_cpu));
7629
7630 /*
7631 * Must align to double word boundary for the double cmpxchg
7632 * instructions to work; see __pcpu_double_call_return_bool().
7633 */
7634 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
7635 2 * sizeof(void *));
7636
7637 if (!s->cpu_slab)
7638 return 0;
7639
7640 init_kmem_cache_cpus(s);
7641
7642 return 1;
7643}
7644#else
7645static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
7646{
7647 return 1;
7648}
7649#endif /* CONFIG_SLUB_TINY */
7650
7651static int init_percpu_sheaves(struct kmem_cache *s)
7652{
7653 int cpu;
7654
7655 for_each_possible_cpu(cpu) {
7656 struct slub_percpu_sheaves *pcs;
7657
7658 pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
7659
7660 local_trylock_init(&pcs->lock);
7661
7662 pcs->main = alloc_empty_sheaf(s, GFP_KERNEL);
7663
7664 if (!pcs->main)
7665 return -ENOMEM;
7666 }
7667
7668 return 0;
7669}
7670
7671static struct kmem_cache *kmem_cache_node;
7672
7673/*
7674 * No kmalloc_node yet so do it by hand. We know that this is the first
7675 * slab on the node for this slabcache. There are no concurrent accesses
7676 * possible.
7677 *
7678 * Note that this function only works on the kmem_cache_node
7679 * when allocating for the kmem_cache_node. This is used for bootstrapping
7680 * memory on a fresh node that has no slab structures yet.
7681 */
7682static void early_kmem_cache_node_alloc(int node)
7683{
7684 struct slab *slab;
7685 struct kmem_cache_node *n;
7686
7687 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
7688
7689 slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);
7690
7691 BUG_ON(!slab);
7692 if (slab_nid(slab) != node) {
7693 pr_err("SLUB: Unable to allocate memory from node %d\n", node);
7694 pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
7695 }
7696
7697 n = slab->freelist;
7698 BUG_ON(!n);
7699#ifdef CONFIG_SLUB_DEBUG
7700 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
7701#endif
7702 n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
7703 slab->freelist = get_freepointer(kmem_cache_node, n);
7704 slab->inuse = 1;
7705 kmem_cache_node->node[node] = n;
7706 init_kmem_cache_node(n, NULL);
7707 inc_slabs_node(kmem_cache_node, node, slab->objects);
7708
7709 /*
7710 * No locks need to be taken here as it has just been
7711 * initialized and there is no concurrent access.
7712 */
7713 __add_partial(n, slab, DEACTIVATE_TO_HEAD);
7714}
7715
7716static void free_kmem_cache_nodes(struct kmem_cache *s)
7717{
7718 int node;
7719 struct kmem_cache_node *n;
7720
7721 for_each_kmem_cache_node(s, node, n) {
7722 if (n->barn) {
7723 WARN_ON(n->barn->nr_full);
7724 WARN_ON(n->barn->nr_empty);
7725 kfree(n->barn);
7726 n->barn = NULL;
7727 }
7728
7729 s->node[node] = NULL;
7730 kmem_cache_free(kmem_cache_node, n);
7731 }
7732}
7733
7734void __kmem_cache_release(struct kmem_cache *s)
7735{
7736 cache_random_seq_destroy(s);
7737 if (s->cpu_sheaves)
7738 pcs_destroy(s);
7739#ifndef CONFIG_SLUB_TINY
7740#ifdef CONFIG_PREEMPT_RT
7741 if (s->cpu_slab)
7742 lockdep_unregister_key(&s->lock_key);
7743#endif
7744 free_percpu(s->cpu_slab);
7745#endif
7746 free_kmem_cache_nodes(s);
7747}
7748
7749static int init_kmem_cache_nodes(struct kmem_cache *s)
7750{
7751 int node;
7752
7753 for_each_node_mask(node, slab_nodes) {
7754 struct kmem_cache_node *n;
7755 struct node_barn *barn = NULL;
7756
7757 if (slab_state == DOWN) {
7758 early_kmem_cache_node_alloc(node);
7759 continue;
7760 }
7761
7762 if (s->cpu_sheaves) {
7763 barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node);
7764
7765 if (!barn)
7766 return 0;
7767 }
7768
7769 n = kmem_cache_alloc_node(kmem_cache_node,
7770 GFP_KERNEL, node);
7771 if (!n) {
7772 kfree(barn);
7773 return 0;
7774 }
7775
7776 init_kmem_cache_node(n, barn);
7777
7778 s->node[node] = n;
7779 }
7780 return 1;
7781}
7782
7783static void set_cpu_partial(struct kmem_cache *s)
7784{
7785#ifdef CONFIG_SLUB_CPU_PARTIAL
7786 unsigned int nr_objects;
7787
7788 /*
7789 * cpu_partial determines the maximum number of objects kept in the
7790 * per cpu partial lists of a processor.
7791 *
7792 * Per cpu partial lists mainly contain slabs that just have one
7793 * object freed. If they are used for allocation then they can be
7794 * filled up again with minimal effort. The slab will never hit the
7795 * per node partial lists and therefore no locking will be required.
7796 *
7797 * For backwards compatibility reasons, this is determined as a number
7798 * of objects, even though we now limit the maximum number of pages, see
7799 * slub_set_cpu_partial()
7800 */
7801 if (!kmem_cache_has_cpu_partial(s))
7802 nr_objects = 0;
7803 else if (s->size >= PAGE_SIZE)
7804 nr_objects = 6;
7805 else if (s->size >= 1024)
7806 nr_objects = 24;
7807 else if (s->size >= 256)
7808 nr_objects = 52;
7809 else
7810 nr_objects = 120;
7811
7812 slub_set_cpu_partial(s, nr_objects);
7813#endif
7814}
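/*
 * Example (editor's illustration): a cache with s->size = 512 falls into the
 * "s->size >= 256" bucket above, so up to roughly 52 objects worth of
 * partially filled slabs may be cached per CPU; slub_set_cpu_partial()
 * converts that object count into a limit on the number of slabs.
 */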
7815
7816/*
7817 * calculate_sizes() determines the order and the distribution of data within
7818 * a slab object.
7819 */
7820static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
7821{
7822 slab_flags_t flags = s->flags;
7823 unsigned int size = s->object_size;
7824 unsigned int order;
7825
7826 /*
7827 * Round up object size to the next word boundary. We can only
7828 * place the free pointer at word boundaries and this determines
7829 * the possible location of the free pointer.
7830 */
7831 size = ALIGN(size, sizeof(void *));
7832
7833#ifdef CONFIG_SLUB_DEBUG
7834 /*
7835 * Determine if we can poison the object itself. If the user of
7836 * the slab may touch the object after free or before allocation
7837 * then we should never poison the object itself.
7838 */
7839 if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) &&
7840 !s->ctor)
7841 s->flags |= __OBJECT_POISON;
7842 else
7843 s->flags &= ~__OBJECT_POISON;
7844
7845
7846 /*
7847 * If we are Redzoning then check if there is some space between the
7848 * end of the object and the free pointer. If not then add an
7849 * additional word to have some bytes to store Redzone information.
7850 */
7851 if ((flags & SLAB_RED_ZONE) && size == s->object_size)
7852 size += sizeof(void *);
7853#endif
7854
7855 /*
7856 * With that we have determined the number of bytes in actual use
7857 * by the object and redzoning.
7858 */
7859 s->inuse = size;
7860
7861 if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) ||
7862 (flags & SLAB_POISON) || s->ctor ||
7863 ((flags & SLAB_RED_ZONE) &&
7864 (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) {
7865 /*
7866 * Relocate free pointer after the object if it is not
7867 * permitted to overwrite the first word of the object on
7868 * kmem_cache_free.
7869 *
7870 * This is the case if we do RCU, have a constructor or
7871 * destructor, are poisoning the objects, or are
7872 * redzoning an object smaller than sizeof(void *) or are
7873 * redzoning an object with slub_debug_orig_size() enabled,
7874 * in which case the right redzone may be extended.
7875 *
7876 * The assumption that s->offset >= s->inuse means free
7877 * pointer is outside of the object is used in the
7878 * freeptr_outside_object() function. If that is no
7879 * longer true, the function needs to be modified.
7880 */
7881 s->offset = size;
7882 size += sizeof(void *);
7883 } else if ((flags & SLAB_TYPESAFE_BY_RCU) && args->use_freeptr_offset) {
7884 s->offset = args->freeptr_offset;
7885 } else {
7886 /*
7887 * Store freelist pointer near middle of object to keep
7888 * it away from the edges of the object to avoid small
7889 * sized over/underflows from neighboring allocations.
7890 */
7891 s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
7892 }
7893
7894#ifdef CONFIG_SLUB_DEBUG
7895 if (flags & SLAB_STORE_USER) {
7896 /*
7897 * Need to store information about allocs and frees after
7898 * the object.
7899 */
7900 size += 2 * sizeof(struct track);
7901
7902 /* Save the original kmalloc request size */
7903 if (flags & SLAB_KMALLOC)
7904 size += sizeof(unsigned int);
7905 }
7906#endif
7907
7908 kasan_cache_create(s, &size, &s->flags);
7909#ifdef CONFIG_SLUB_DEBUG
7910 if (flags & SLAB_RED_ZONE) {
7911 /*
7912 * Add some empty padding so that we can catch
7913 * overwrites from earlier objects rather than let
7914 * tracking information or the free pointer be
7915 * corrupted if a user writes before the start
7916 * of the object.
7917 */
7918 size += sizeof(void *);
7919
7920 s->red_left_pad = sizeof(void *);
7921 s->red_left_pad = ALIGN(s->red_left_pad, s->align);
7922 size += s->red_left_pad;
7923 }
7924#endif
7925
7926 /*
7927 * SLUB stores one object immediately after another beginning from
7928 * offset 0. In order to align the objects we have to simply size
7929 * each object to conform to the alignment.
7930 */
7931 size = ALIGN(size, s->align);
7932 s->size = size;
7933 s->reciprocal_size = reciprocal_value(size);
7934 order = calculate_order(size);
7935
7936 if ((int)order < 0)
7937 return 0;
7938
7939 s->allocflags = __GFP_COMP;
7940
7941 if (s->flags & SLAB_CACHE_DMA)
7942 s->allocflags |= GFP_DMA;
7943
7944 if (s->flags & SLAB_CACHE_DMA32)
7945 s->allocflags |= GFP_DMA32;
7946
7947 if (s->flags & SLAB_RECLAIM_ACCOUNT)
7948 s->allocflags |= __GFP_RECLAIMABLE;
7949
7950 /*
7951 * Determine the number of objects per slab
7952 */
7953 s->oo = oo_make(order, size);
7954 s->min = oo_make(get_order(size), size);
7955
7956 return !!oo_objects(s->oo);
7957}
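/*
 * Layout example (editor's illustration, assuming a hypothetical cache with
 * object_size = 24, no debug flags, no constructor, s->align = 8 and
 * ignoring any KASAN additions): the size is already a multiple of
 * sizeof(void *), the free pointer lands inside the object at offset
 * ALIGN_DOWN(24 / 2, 8) = 8, s->size stays 24, and an order-0 slab holds
 * 4096 / 24 = 170 objects.
 */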
7958
7959static void list_slab_objects(struct kmem_cache *s, struct slab *slab)
7960{
7961#ifdef CONFIG_SLUB_DEBUG
7962 void *addr = slab_address(slab);
7963 void *p;
7964
7965 if (!slab_add_kunit_errors())
7966 slab_bug(s, "Objects remaining on __kmem_cache_shutdown()");
7967
7968 spin_lock(&object_map_lock);
7969 __fill_map(object_map, s, slab);
7970
7971 for_each_object(p, s, addr, slab->objects) {
7972
7973 if (!test_bit(__obj_to_index(s, addr, p), object_map)) {
7974 if (slab_add_kunit_errors())
7975 continue;
7976 pr_err("Object 0x%p @offset=%tu\n", p, p - addr);
7977 print_tracking(s, p);
7978 }
7979 }
7980 spin_unlock(&object_map_lock);
7981
7982 __slab_err(slab);
7983#endif
7984}
7985
7986/*
7987 * Attempt to free all partial slabs on a node.
7988 * This is called from __kmem_cache_shutdown(). We must take list_lock
7989 * because sysfs files might still access the partial list after shutdown.
7990 */
7991static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
7992{
7993 LIST_HEAD(discard);
7994 struct slab *slab, *h;
7995
7996 BUG_ON(irqs_disabled());
7997 spin_lock_irq(&n->list_lock);
7998 list_for_each_entry_safe(slab, h, &n->partial, slab_list) {
7999 if (!slab->inuse) {
8000 remove_partial(n, slab);
8001 list_add(&slab->slab_list, &discard);
8002 } else {
8003 list_slab_objects(s, slab);
8004 }
8005 }
8006 spin_unlock_irq(&n->list_lock);
8007
8008 list_for_each_entry_safe(slab, h, &discard, slab_list)
8009 discard_slab(s, slab);
8010}
8011
8012bool __kmem_cache_empty(struct kmem_cache *s)
8013{
8014 int node;
8015 struct kmem_cache_node *n;
8016
8017 for_each_kmem_cache_node(s, node, n)
8018 if (n->nr_partial || node_nr_slabs(n))
8019 return false;
8020 return true;
8021}
8022
8023/*
8024 * Release all resources used by a slab cache.
8025 */
8026int __kmem_cache_shutdown(struct kmem_cache *s)
8027{
8028 int node;
8029 struct kmem_cache_node *n;
8030
8031 flush_all_cpus_locked(s);
8032
8033 /* we might have rcu sheaves in flight */
8034 if (s->cpu_sheaves)
8035 rcu_barrier();
8036
8037 /* Attempt to free all objects */
8038 for_each_kmem_cache_node(s, node, n) {
8039 if (n->barn)
8040 barn_shrink(s, n->barn);
8041 free_partial(s, n);
8042 if (n->nr_partial || node_nr_slabs(n))
8043 return 1;
8044 }
8045 return 0;
8046}
8047
8048#ifdef CONFIG_PRINTK
8049void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
8050{
8051 void *base;
8052 int __maybe_unused i;
8053 unsigned int objnr;
8054 void *objp;
8055 void *objp0;
8056 struct kmem_cache *s = slab->slab_cache;
8057 struct track __maybe_unused *trackp;
8058
8059 kpp->kp_ptr = object;
8060 kpp->kp_slab = slab;
8061 kpp->kp_slab_cache = s;
8062 base = slab_address(slab);
8063 objp0 = kasan_reset_tag(object);
8064#ifdef CONFIG_SLUB_DEBUG
8065 objp = restore_red_left(s, objp0);
8066#else
8067 objp = objp0;
8068#endif
8069 objnr = obj_to_index(s, slab, objp);
8070 kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp);
8071 objp = base + s->size * objnr;
8072 kpp->kp_objp = objp;
8073 if (WARN_ON_ONCE(objp < base || objp >= base + slab->objects * s->size
8074 || (objp - base) % s->size) ||
8075 !(s->flags & SLAB_STORE_USER))
8076 return;
8077#ifdef CONFIG_SLUB_DEBUG
8078 objp = fixup_red_left(s, objp);
8079 trackp = get_track(s, objp, TRACK_ALLOC);
8080 kpp->kp_ret = (void *)trackp->addr;
8081#ifdef CONFIG_STACKDEPOT
8082 {
8083 depot_stack_handle_t handle;
8084 unsigned long *entries;
8085 unsigned int nr_entries;
8086
8087 handle = READ_ONCE(trackp->handle);
8088 if (handle) {
8089 nr_entries = stack_depot_fetch(handle, &entries);
8090 for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
8091 kpp->kp_stack[i] = (void *)entries[i];
8092 }
8093
8094 trackp = get_track(s, objp, TRACK_FREE);
8095 handle = READ_ONCE(trackp->handle);
8096 if (handle) {
8097 nr_entries = stack_depot_fetch(handle, &entries);
8098 for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
8099 kpp->kp_free_stack[i] = (void *)entries[i];
8100 }
8101 }
8102#endif
8103#endif
8104}
8105#endif
8106
8107/********************************************************************
8108 * Kmalloc subsystem
8109 *******************************************************************/
8110
8111static int __init setup_slub_min_order(char *str)
8112{
8113 get_option(&str, (int *)&slub_min_order);
8114
8115 if (slub_min_order > slub_max_order)
8116 slub_max_order = slub_min_order;
8117
8118 return 1;
8119}
8120
8121__setup("slab_min_order=", setup_slub_min_order);
8122__setup_param("slub_min_order=", slub_min_order, setup_slub_min_order, 0);
8123
8124
8125static int __init setup_slub_max_order(char *str)
8126{
8127 get_option(&str, (int *)&slub_max_order);
8128 slub_max_order = min_t(unsigned int, slub_max_order, MAX_PAGE_ORDER);
8129
8130 if (slub_min_order > slub_max_order)
8131 slub_min_order = slub_max_order;
8132
8133 return 1;
8134}
8135
8136__setup("slab_max_order=", setup_slub_max_order);
8137__setup_param("slub_max_order=", slub_max_order, setup_slub_max_order, 0);
8138
8139static int __init setup_slub_min_objects(char *str)
8140{
8141 get_option(&str, (int *)&slub_min_objects);
8142
8143 return 1;
8144}
8145
8146__setup("slab_min_objects=", setup_slub_min_objects);
8147__setup_param("slub_min_objects=", slub_min_objects, setup_slub_min_objects, 0);
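/*
 * Usage example (editor's illustration): booting with "slab_min_order=3
 * slab_min_objects=16" asks for at least order-3 slab pages and at least 16
 * objects per slab where possible; the older "slub_"-prefixed parameter
 * names registered above remain accepted as aliases.
 */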
8148
8149#ifdef CONFIG_NUMA
8150static int __init setup_slab_strict_numa(char *str)
8151{
8152 if (nr_node_ids > 1) {
8153 static_branch_enable(&strict_numa);
8154 pr_info("SLUB: Strict NUMA enabled.\n");
8155 } else {
8156 pr_warn("slab_strict_numa parameter set on non NUMA system.\n");
8157 }
8158
8159 return 1;
8160}
8161
8162__setup("slab_strict_numa", setup_slab_strict_numa);
8163#endif
8164
8165
8166#ifdef CONFIG_HARDENED_USERCOPY
8167/*
8168 * Rejects incorrectly sized objects and objects that are to be copied
8169 * to/from userspace but do not fall entirely within the containing slab
8170 * cache's usercopy region.
8171 *
8172 * Returns NULL if check passes, otherwise const char * to name of cache
8173 * to indicate an error.
8174 */
8175void __check_heap_object(const void *ptr, unsigned long n,
8176 const struct slab *slab, bool to_user)
8177{
8178 struct kmem_cache *s;
8179 unsigned int offset;
8180 bool is_kfence = is_kfence_address(ptr);
8181
8182 ptr = kasan_reset_tag(ptr);
8183
8184 /* Find object and usable object size. */
8185 s = slab->slab_cache;
8186
8187 /* Reject impossible pointers. */
8188 if (ptr < slab_address(slab))
8189 usercopy_abort("SLUB object not in SLUB page?!", NULL,
8190 to_user, 0, n);
8191
8192 /* Find offset within object. */
8193 if (is_kfence)
8194 offset = ptr - kfence_object_start(ptr);
8195 else
8196 offset = (ptr - slab_address(slab)) % s->size;
8197
8198 /* Adjust for redzone and reject if within the redzone. */
8199 if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
8200 if (offset < s->red_left_pad)
8201 usercopy_abort("SLUB object in left red zone",
8202 s->name, to_user, offset, n);
8203 offset -= s->red_left_pad;
8204 }
8205
8206 /* Allow address range falling entirely within usercopy region. */
8207 if (offset >= s->useroffset &&
8208 offset - s->useroffset <= s->usersize &&
8209 n <= s->useroffset - offset + s->usersize)
8210 return;
8211
8212 usercopy_abort("SLUB object", s->name, to_user, offset, n);
8213}
8214#endif /* CONFIG_HARDENED_USERCOPY */
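/*
 * Example (editor's illustration with hypothetical values): for a cache with
 * s->size = 256, s->useroffset = 32 and s->usersize = 64, copying n = 48
 * bytes starting at offset 40 within the object is allowed (40 >= 32,
 * 40 - 32 <= 64 and 48 <= 32 - 40 + 64 = 56), while the same copy starting
 * at offset 56 is rejected because it would extend past the usercopy region.
 */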
8215
8216#define SHRINK_PROMOTE_MAX 32
8217
8218/*
8219 * kmem_cache_shrink discards empty slabs and promotes the slabs filled
8220 * up most to the head of the partial lists. New allocations will then
8221 * fill those up and thus they can be removed from the partial lists.
8222 *
8223 * The slabs with the least items are placed last. This results in them
8224 * being allocated from last, increasing the chance that the last objects
8225 * are freed in them.
8226 */
8227static int __kmem_cache_do_shrink(struct kmem_cache *s)
8228{
8229 int node;
8230 int i;
8231 struct kmem_cache_node *n;
8232 struct slab *slab;
8233 struct slab *t;
8234 struct list_head discard;
8235 struct list_head promote[SHRINK_PROMOTE_MAX];
8236 unsigned long flags;
8237 int ret = 0;
8238
8239 for_each_kmem_cache_node(s, node, n) {
8240 INIT_LIST_HEAD(&discard);
8241 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
8242 INIT_LIST_HEAD(promote + i);
8243
8244 if (n->barn)
8245 barn_shrink(s, n->barn);
8246
8247 spin_lock_irqsave(&n->list_lock, flags);
8248
8249 /*
8250 * Build lists of slabs to discard or promote.
8251 *
8252 * Note that concurrent frees may occur while we hold the
8253 * list_lock. slab->inuse here is the upper limit.
8254 */
8255 list_for_each_entry_safe(slab, t, &n->partial, slab_list) {
8256 int free = slab->objects - slab->inuse;
8257
8258 /* Do not reread slab->inuse */
8259 barrier();
8260
8261 /* We do not keep full slabs on the list */
8262 BUG_ON(free <= 0);
8263
8264 if (free == slab->objects) {
8265 list_move(&slab->slab_list, &discard);
8266 slab_clear_node_partial(slab);
8267 n->nr_partial--;
8268 dec_slabs_node(s, node, slab->objects);
8269 } else if (free <= SHRINK_PROMOTE_MAX)
8270 list_move(&slab->slab_list, promote + free - 1);
8271 }
8272
8273 /*
8274 * Promote the slabs filled up most to the head of the
8275 * partial list.
8276 */
8277 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
8278 list_splice(promote + i, &n->partial);
8279
8280 spin_unlock_irqrestore(&n->list_lock, flags);
8281
8282 /* Release empty slabs */
8283 list_for_each_entry_safe(slab, t, &discard, slab_list)
8284 free_slab(s, slab);
8285
8286 if (node_nr_slabs(n))
8287 ret = 1;
8288 }
8289
8290 return ret;
8291}
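/*
 * Example (editor's illustration): during shrink, a partial slab with
 * slab->objects = 32 and slab->inuse = 29 has free = 3 and is moved to
 * promote[2]; a slab with inuse = 0 (free == objects) goes to the discard
 * list and is freed; a slab with more than SHRINK_PROMOTE_MAX free objects
 * is left where it is on the partial list.
 */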
8292
8293int __kmem_cache_shrink(struct kmem_cache *s)
8294{
8295 flush_all(s);
8296 return __kmem_cache_do_shrink(s);
8297}
8298
8299static int slab_mem_going_offline_callback(void)
8300{
8301 struct kmem_cache *s;
8302
8303 mutex_lock(&slab_mutex);
8304 list_for_each_entry(s, &slab_caches, list) {
8305 flush_all_cpus_locked(s);
8306 __kmem_cache_do_shrink(s);
8307 }
8308 mutex_unlock(&slab_mutex);
8309
8310 return 0;
8311}
8312
8313static int slab_mem_going_online_callback(int nid)
8314{
8315 struct kmem_cache_node *n;
8316 struct kmem_cache *s;
8317 int ret = 0;
8318
8319 /*
8320 * We are bringing a node online. No memory is available yet. We must
8321 * allocate a kmem_cache_node structure in order to bring the node
8322 * online.
8323 */
8324 mutex_lock(&slab_mutex);
8325 list_for_each_entry(s, &slab_caches, list) {
8326 struct node_barn *barn = NULL;
8327
8328 /*
8329 * The structure may already exist if the node was previously
8330 * onlined and offlined.
8331 */
8332 if (get_node(s, nid))
8333 continue;
8334
8335 if (s->cpu_sheaves) {
8336 barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid);
8337
8338 if (!barn) {
8339 ret = -ENOMEM;
8340 goto out;
8341 }
8342 }
8343
8344 /*
8345 * XXX: kmem_cache_alloc_node will fallback to other nodes
8346 * since memory is not yet available from the node that
8347 * is brought up.
8348 */
8349 n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
8350 if (!n) {
8351 kfree(barn);
8352 ret = -ENOMEM;
8353 goto out;
8354 }
8355
8356 init_kmem_cache_node(n, barn);
8357
8358 s->node[nid] = n;
8359 }
8360 /*
8361 * Any cache created after this point will also have kmem_cache_node
8362 * initialized for the new node.
8363 */
8364 node_set(nid, slab_nodes);
8365out:
8366 mutex_unlock(&slab_mutex);
8367 return ret;
8368}
8369
8370static int slab_memory_callback(struct notifier_block *self,
8371 unsigned long action, void *arg)
8372{
8373 struct node_notify *nn = arg;
8374 int nid = nn->nid;
8375 int ret = 0;
8376
8377 switch (action) {
8378 case NODE_ADDING_FIRST_MEMORY:
8379 ret = slab_mem_going_online_callback(nid);
8380 break;
8381 case NODE_REMOVING_LAST_MEMORY:
8382 ret = slab_mem_going_offline_callback();
8383 break;
8384 }
8385 if (ret)
8386 ret = notifier_from_errno(ret);
8387 else
8388 ret = NOTIFY_OK;
8389 return ret;
8390}
8391
8392/********************************************************************
8393 * Basic setup of slabs
8394 *******************************************************************/
8395
8396/*
8397 * Used for early kmem_cache structures that were allocated using
8398 * the page allocator. Allocate them properly then fix up the pointers
8399 * that may be pointing to the wrong kmem_cache structure.
8400 */
8401
8402static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
8403{
8404 int node;
8405 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
8406 struct kmem_cache_node *n;
8407
8408 memcpy(s, static_cache, kmem_cache->object_size);
8409
8410 /*
8411 * This runs very early, and only the boot processor is supposed to be
8412 * up. Even if it weren't true, IRQs are not up so we couldn't fire
8413 * IPIs around.
8414 */
8415 __flush_cpu_slab(s, smp_processor_id());
8416 for_each_kmem_cache_node(s, node, n) {
8417 struct slab *p;
8418
8419 list_for_each_entry(p, &n->partial, slab_list)
8420 p->slab_cache = s;
8421
8422#ifdef CONFIG_SLUB_DEBUG
8423 list_for_each_entry(p, &n->full, slab_list)
8424 p->slab_cache = s;
8425#endif
8426 }
8427 list_add(&s->list, &slab_caches);
8428 return s;
8429}
8430
8431void __init kmem_cache_init(void)
8432{
8433 static __initdata struct kmem_cache boot_kmem_cache,
8434 boot_kmem_cache_node;
8435 int node;
8436
8437 if (debug_guardpage_minorder())
8438 slub_max_order = 0;
8439
8440 /* Inform pointer hashing choice about slub debugging state. */
8441 hash_pointers_finalize(__slub_debug_enabled());
8442
8443 kmem_cache_node = &boot_kmem_cache_node;
8444 kmem_cache = &boot_kmem_cache;
8445
8446 /*
8447 * Initialize the nodemask for which we will allocate per node
8448 * structures. Here we don't need taking slab_mutex yet.
8449 */
8450 for_each_node_state(node, N_MEMORY)
8451 node_set(node, slab_nodes);
8452
8453 create_boot_cache(kmem_cache_node, "kmem_cache_node",
8454 sizeof(struct kmem_cache_node),
8455 SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
8456
8457 hotplug_node_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
8458
8459 /* Able to allocate the per node structures */
8460 slab_state = PARTIAL;
8461
8462 create_boot_cache(kmem_cache, "kmem_cache",
8463 offsetof(struct kmem_cache, node) +
8464 nr_node_ids * sizeof(struct kmem_cache_node *),
8465 SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
8466
8467 kmem_cache = bootstrap(&boot_kmem_cache);
8468 kmem_cache_node = bootstrap(&boot_kmem_cache_node);
8469
8470 /* Now we can use the kmem_cache to allocate kmalloc slabs */
8471 setup_kmalloc_cache_index_table();
8472 create_kmalloc_caches();
8473
8474 /* Setup random freelists for each cache */
8475 init_freelist_randomization();
8476
8477 cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
8478 slub_cpu_dead);
8479
8480 pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
8481 cache_line_size(),
8482 slub_min_order, slub_max_order, slub_min_objects,
8483 nr_cpu_ids, nr_node_ids);
8484}
8485
8486void __init kmem_cache_init_late(void)
8487{
8488#ifndef CONFIG_SLUB_TINY
8489 flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0);
8490 WARN_ON(!flushwq);
8491#endif
8492}
8493
8494struct kmem_cache *
8495__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
8496 slab_flags_t flags, void (*ctor)(void *))
8497{
8498 struct kmem_cache *s;
8499
8500 s = find_mergeable(size, align, flags, name, ctor);
8501 if (s) {
8502 if (sysfs_slab_alias(s, name))
8503 pr_err("SLUB: Unable to add cache alias %s to sysfs\n",
8504 name);
8505
8506 s->refcount++;
8507
8508 /*
8509 * Adjust the object sizes so that we clear
8510 * the complete object on kzalloc.
8511 */
8512 s->object_size = max(s->object_size, size);
8513 s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
8514 }
8515
8516 return s;
8517}
8518
8519int do_kmem_cache_create(struct kmem_cache *s, const char *name,
8520 unsigned int size, struct kmem_cache_args *args,
8521 slab_flags_t flags)
8522{
8523 int err = -EINVAL;
8524
8525 s->name = name;
8526 s->size = s->object_size = size;
8527
8528 s->flags = kmem_cache_flags(flags, s->name);
8529#ifdef CONFIG_SLAB_FREELIST_HARDENED
8530 s->random = get_random_long();
8531#endif
8532 s->align = args->align;
8533 s->ctor = args->ctor;
8534#ifdef CONFIG_HARDENED_USERCOPY
8535 s->useroffset = args->useroffset;
8536 s->usersize = args->usersize;
8537#endif
8538
8539 if (!calculate_sizes(args, s))
8540 goto out;
8541 if (disable_higher_order_debug) {
8542 /*
8543 * Disable debugging flags that store metadata if the min slab
8544 * order increased.
8545 */
8546 if (get_order(s->size) > get_order(s->object_size)) {
8547 s->flags &= ~DEBUG_METADATA_FLAGS;
8548 s->offset = 0;
8549 if (!calculate_sizes(args, s))
8550 goto out;
8551 }
8552 }
8553
8554#ifdef system_has_freelist_aba
8555 if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) {
8556 /* Enable fast mode */
8557 s->flags |= __CMPXCHG_DOUBLE;
8558 }
8559#endif
8560
8561 /*
8562 * The larger the object size is, the more slabs we want on the partial
8563 * list to avoid pounding the page allocator excessively.
8564 */
8565 s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
8566 s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);
8567
8568 set_cpu_partial(s);
8569
8570 if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY)
8571 && !(s->flags & SLAB_DEBUG_FLAGS)) {
8572 s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves);
8573 if (!s->cpu_sheaves) {
8574 err = -ENOMEM;
8575 goto out;
8576 }
8577 // TODO: increase capacity to grow slab_sheaf up to next kmalloc size?
8578 s->sheaf_capacity = args->sheaf_capacity;
8579 }
8580
8581#ifdef CONFIG_NUMA
8582 s->remote_node_defrag_ratio = 1000;
8583#endif
8584
8585 /* Initialize the pre-computed randomized freelist if slab is up */
8586 if (slab_state >= UP) {
8587 if (init_cache_random_seq(s))
8588 goto out;
8589 }
8590
8591 if (!init_kmem_cache_nodes(s))
8592 goto out;
8593
8594 if (!alloc_kmem_cache_cpus(s))
8595 goto out;
8596
8597 if (s->cpu_sheaves) {
8598 err = init_percpu_sheaves(s);
8599 if (err)
8600 goto out;
8601 }
8602
8603 err = 0;
8604
8605 /* Mutex is not taken during early boot */
8606 if (slab_state <= UP)
8607 goto out;
8608
8609 /*
8610 * Failing to create sysfs files is not critical to SLUB functionality.
8611 * If it fails, proceed with cache creation without these files.
8612 */
8613 if (sysfs_slab_add(s))
8614 pr_err("SLUB: Unable to add cache %s to sysfs\n", s->name);
8615
8616 if (s->flags & SLAB_STORE_USER)
8617 debugfs_slab_add(s);
8618
8619out:
8620 if (err)
8621 __kmem_cache_release(s);
8622 return err;
8623}
8624
8625#ifdef SLAB_SUPPORTS_SYSFS
8626static int count_inuse(struct slab *slab)
8627{
8628 return slab->inuse;
8629}
8630
8631static int count_total(struct slab *slab)
8632{
8633 return slab->objects;
8634}
8635#endif
8636
8637#ifdef CONFIG_SLUB_DEBUG
8638static void validate_slab(struct kmem_cache *s, struct slab *slab,
8639 unsigned long *obj_map)
8640{
8641 void *p;
8642 void *addr = slab_address(slab);
8643
8644 if (!validate_slab_ptr(slab)) {
8645 slab_err(s, slab, "Not a valid slab page");
8646 return;
8647 }
8648
8649 if (!check_slab(s, slab) || !on_freelist(s, slab, NULL))
8650 return;
8651
8652 /* Now we know that a valid freelist exists */
8653 __fill_map(obj_map, s, slab);
8654 for_each_object(p, s, addr, slab->objects) {
8655 u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ?
8656 SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
8657
8658 if (!check_object(s, slab, p, val))
8659 break;
8660 }
8661}
8662
8663static int validate_slab_node(struct kmem_cache *s,
8664 struct kmem_cache_node *n, unsigned long *obj_map)
8665{
8666 unsigned long count = 0;
8667 struct slab *slab;
8668 unsigned long flags;
8669
8670 spin_lock_irqsave(&n->list_lock, flags);
8671
8672 list_for_each_entry(slab, &n->partial, slab_list) {
8673 validate_slab(s, slab, obj_map);
8674 count++;
8675 }
8676 if (count != n->nr_partial) {
8677 pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
8678 s->name, count, n->nr_partial);
8679 slab_add_kunit_errors();
8680 }
8681
8682 if (!(s->flags & SLAB_STORE_USER))
8683 goto out;
8684
8685 list_for_each_entry(slab, &n->full, slab_list) {
8686 validate_slab(s, slab, obj_map);
8687 count++;
8688 }
8689 if (count != node_nr_slabs(n)) {
8690 pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
8691 s->name, count, node_nr_slabs(n));
8692 slab_add_kunit_errors();
8693 }
8694
8695out:
8696 spin_unlock_irqrestore(&n->list_lock, flags);
8697 return count;
8698}
8699
8700long validate_slab_cache(struct kmem_cache *s)
8701{
8702 int node;
8703 unsigned long count = 0;
8704 struct kmem_cache_node *n;
8705 unsigned long *obj_map;
8706
8707 obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
8708 if (!obj_map)
8709 return -ENOMEM;
8710
8711 flush_all(s);
8712 for_each_kmem_cache_node(s, node, n)
8713 count += validate_slab_node(s, n, obj_map);
8714
8715 bitmap_free(obj_map);
8716
8717 return count;
8718}
8719EXPORT_SYMBOL(validate_slab_cache);
8720
8721#ifdef CONFIG_DEBUG_FS
8722/*
8723 * Generate lists of code addresses where slabcache objects are allocated
8724 * and freed.
8725 */
8726
8727struct location {
8728 depot_stack_handle_t handle;
8729 unsigned long count;
8730 unsigned long addr;
8731 unsigned long waste;
8732 long long sum_time;
8733 long min_time;
8734 long max_time;
8735 long min_pid;
8736 long max_pid;
8737 DECLARE_BITMAP(cpus, NR_CPUS);
8738 nodemask_t nodes;
8739};
8740
8741struct loc_track {
8742 unsigned long max;
8743 unsigned long count;
8744 struct location *loc;
8745 loff_t idx;
8746};
8747
8748static struct dentry *slab_debugfs_root;
8749
8750static void free_loc_track(struct loc_track *t)
8751{
8752 if (t->max)
8753 free_pages((unsigned long)t->loc,
8754 get_order(sizeof(struct location) * t->max));
8755}
8756
8757static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
8758{
8759 struct location *l;
8760 int order;
8761
8762 order = get_order(sizeof(struct location) * max);
8763
8764 l = (void *)__get_free_pages(flags, order);
8765 if (!l)
8766 return 0;
8767
8768 if (t->count) {
8769 memcpy(l, t->loc, sizeof(struct location) * t->count);
8770 free_loc_track(t);
8771 }
8772 t->max = max;
8773 t->loc = l;
8774 return 1;
8775}
8776
8777static int add_location(struct loc_track *t, struct kmem_cache *s,
8778 const struct track *track,
8779 unsigned int orig_size)
8780{
8781 long start, end, pos;
8782 struct location *l;
8783 unsigned long caddr, chandle, cwaste;
8784 unsigned long age = jiffies - track->when;
8785 depot_stack_handle_t handle = 0;
8786 unsigned int waste = s->object_size - orig_size;
8787
8788#ifdef CONFIG_STACKDEPOT
8789 handle = READ_ONCE(track->handle);
8790#endif
8791 start = -1;
8792 end = t->count;
8793
8794 for ( ; ; ) {
8795 pos = start + (end - start + 1) / 2;
8796
8797 /*
8798 * There is nothing at "end". If we end up there
8799 * we need to add something to before end.
8800 */
8801 if (pos == end)
8802 break;
8803
8804 l = &t->loc[pos];
8805 caddr = l->addr;
8806 chandle = l->handle;
8807 cwaste = l->waste;
8808 if ((track->addr == caddr) && (handle == chandle) &&
8809 (waste == cwaste)) {
8810
8811 l->count++;
8812 if (track->when) {
8813 l->sum_time += age;
8814 if (age < l->min_time)
8815 l->min_time = age;
8816 if (age > l->max_time)
8817 l->max_time = age;
8818
8819 if (track->pid < l->min_pid)
8820 l->min_pid = track->pid;
8821 if (track->pid > l->max_pid)
8822 l->max_pid = track->pid;
8823
8824 cpumask_set_cpu(track->cpu,
8825 to_cpumask(l->cpus));
8826 }
8827 node_set(page_to_nid(virt_to_page(track)), l->nodes);
8828 return 1;
8829 }
8830
8831 if (track->addr < caddr)
8832 end = pos;
8833 else if (track->addr == caddr && handle < chandle)
8834 end = pos;
8835 else if (track->addr == caddr && handle == chandle &&
8836 waste < cwaste)
8837 end = pos;
8838 else
8839 start = pos;
8840 }
8841
8842 /*
8843 * Not found. Insert new tracking element.
8844 */
8845 if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
8846 return 0;
8847
8848 l = t->loc + pos;
8849 if (pos < t->count)
8850 memmove(l + 1, l,
8851 (t->count - pos) * sizeof(struct location));
8852 t->count++;
8853 l->count = 1;
8854 l->addr = track->addr;
8855 l->sum_time = age;
8856 l->min_time = age;
8857 l->max_time = age;
8858 l->min_pid = track->pid;
8859 l->max_pid = track->pid;
8860 l->handle = handle;
8861 l->waste = waste;
8862 cpumask_clear(to_cpumask(l->cpus));
8863 cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
8864 nodes_clear(l->nodes);
8865 node_set(page_to_nid(virt_to_page(track)), l->nodes);
8866 return 1;
8867}
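/*
 * Note (editor's addition): the loop above is a binary search over t->loc[],
 * which is kept sorted by (addr, handle, waste); for example, two frees from
 * the same call site but with different stack depot handles end up as two
 * separate struct location entries rather than one merged counter.
 */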
8868
8869static void process_slab(struct loc_track *t, struct kmem_cache *s,
8870 struct slab *slab, enum track_item alloc,
8871 unsigned long *obj_map)
8872{
8873 void *addr = slab_address(slab);
8874 bool is_alloc = (alloc == TRACK_ALLOC);
8875 void *p;
8876
8877 __fill_map(obj_map, s, slab);
8878
8879 for_each_object(p, s, addr, slab->objects)
8880 if (!test_bit(__obj_to_index(s, addr, p), obj_map))
8881 add_location(t, s, get_track(s, p, alloc),
8882 is_alloc ? get_orig_size(s, p) :
8883 s->object_size);
8884}
8885#endif /* CONFIG_DEBUG_FS */
8886#endif /* CONFIG_SLUB_DEBUG */
8887
8888#ifdef SLAB_SUPPORTS_SYSFS
8889enum slab_stat_type {
8890 SL_ALL, /* All slabs */
8891 SL_PARTIAL, /* Only partially allocated slabs */
8892 SL_CPU, /* Only slabs used for cpu caches */
8893 SL_OBJECTS, /* Determine allocated objects not slabs */
8894 SL_TOTAL /* Determine object capacity not slabs */
8895};
8896
8897#define SO_ALL (1 << SL_ALL)
8898#define SO_PARTIAL (1 << SL_PARTIAL)
8899#define SO_CPU (1 << SL_CPU)
8900#define SO_OBJECTS (1 << SL_OBJECTS)
8901#define SO_TOTAL (1 << SL_TOTAL)
8902
8903static ssize_t show_slab_objects(struct kmem_cache *s,
8904 char *buf, unsigned long flags)
8905{
8906 unsigned long total = 0;
8907 int node;
8908 int x;
8909 unsigned long *nodes;
8910 int len = 0;
8911
8912 nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
8913 if (!nodes)
8914 return -ENOMEM;
8915
8916 if (flags & SO_CPU) {
8917 int cpu;
8918
8919 for_each_possible_cpu(cpu) {
8920 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
8921 cpu);
8922 int node;
8923 struct slab *slab;
8924
8925 slab = READ_ONCE(c->slab);
8926 if (!slab)
8927 continue;
8928
8929 node = slab_nid(slab);
8930 if (flags & SO_TOTAL)
8931 x = slab->objects;
8932 else if (flags & SO_OBJECTS)
8933 x = slab->inuse;
8934 else
8935 x = 1;
8936
8937 total += x;
8938 nodes[node] += x;
8939
8940#ifdef CONFIG_SLUB_CPU_PARTIAL
8941 slab = slub_percpu_partial_read_once(c);
8942 if (slab) {
8943 node = slab_nid(slab);
8944 if (flags & SO_TOTAL)
8945 WARN_ON_ONCE(1);
8946 else if (flags & SO_OBJECTS)
8947 WARN_ON_ONCE(1);
8948 else
8949 x = data_race(slab->slabs);
8950 total += x;
8951 nodes[node] += x;
8952 }
8953#endif
8954 }
8955 }
8956
8957 /*
8958 * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
8959 * already held which will conflict with an existing lock order:
8960 *
8961 * mem_hotplug_lock->slab_mutex->kernfs_mutex
8962 *
8963 * We don't really need mem_hotplug_lock (to hold off
8964 * slab_mem_going_offline_callback) here because slab's memory hot
8965 * unplug code doesn't destroy the kmem_cache->node[] data.
8966 */
8967
8968#ifdef CONFIG_SLUB_DEBUG
8969 if (flags & SO_ALL) {
8970 struct kmem_cache_node *n;
8971
8972 for_each_kmem_cache_node(s, node, n) {
8973
8974 if (flags & SO_TOTAL)
8975 x = node_nr_objs(n);
8976 else if (flags & SO_OBJECTS)
8977 x = node_nr_objs(n) - count_partial(n, count_free);
8978 else
8979 x = node_nr_slabs(n);
8980 total += x;
8981 nodes[node] += x;
8982 }
8983
8984 } else
8985#endif
8986 if (flags & SO_PARTIAL) {
8987 struct kmem_cache_node *n;
8988
8989 for_each_kmem_cache_node(s, node, n) {
8990 if (flags & SO_TOTAL)
8991 x = count_partial(n, count_total);
8992 else if (flags & SO_OBJECTS)
8993 x = count_partial(n, count_inuse);
8994 else
8995 x = n->nr_partial;
8996 total += x;
8997 nodes[node] += x;
8998 }
8999 }
9000
9001 len += sysfs_emit_at(buf, len, "%lu", total);
9002#ifdef CONFIG_NUMA
9003 for (node = 0; node < nr_node_ids; node++) {
9004 if (nodes[node])
9005 len += sysfs_emit_at(buf, len, " N%d=%lu",
9006 node, nodes[node]);
9007 }
9008#endif
9009 len += sysfs_emit_at(buf, len, "\n");
9010 kfree(nodes);
9011
9012 return len;
9013}
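/*
 * Example output (editor's illustration): on a two-node machine a file
 * backed by this helper, such as /sys/kernel/slab/<cache>/objects, might
 * read "4520 N0=3000 N1=1520", i.e. the total followed by one
 * " N<node>=<count>" entry for every node with a non-zero count.
 */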
9014
9015#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
9016#define to_slab(n) container_of(n, struct kmem_cache, kobj)
9017
9018struct slab_attribute {
9019 struct attribute attr;
9020 ssize_t (*show)(struct kmem_cache *s, char *buf);
9021 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
9022};
9023
9024#define SLAB_ATTR_RO(_name) \
9025 static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400)
9026
9027#define SLAB_ATTR(_name) \
9028 static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600)
9029
9030static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
9031{
9032 return sysfs_emit(buf, "%u\n", s->size);
9033}
9034SLAB_ATTR_RO(slab_size);
9035
9036static ssize_t align_show(struct kmem_cache *s, char *buf)
9037{
9038 return sysfs_emit(buf, "%u\n", s->align);
9039}
9040SLAB_ATTR_RO(align);
9041
9042static ssize_t object_size_show(struct kmem_cache *s, char *buf)
9043{
9044 return sysfs_emit(buf, "%u\n", s->object_size);
9045}
9046SLAB_ATTR_RO(object_size);
9047
9048static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
9049{
9050 return sysfs_emit(buf, "%u\n", oo_objects(s->oo));
9051}
9052SLAB_ATTR_RO(objs_per_slab);
9053
9054static ssize_t order_show(struct kmem_cache *s, char *buf)
9055{
9056 return sysfs_emit(buf, "%u\n", oo_order(s->oo));
9057}
9058SLAB_ATTR_RO(order);
9059
9060static ssize_t sheaf_capacity_show(struct kmem_cache *s, char *buf)
9061{
9062 return sysfs_emit(buf, "%u\n", s->sheaf_capacity);
9063}
9064SLAB_ATTR_RO(sheaf_capacity);
9065
9066static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
9067{
9068 return sysfs_emit(buf, "%lu\n", s->min_partial);
9069}
9070
9071static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
9072 size_t length)
9073{
9074 unsigned long min;
9075 int err;
9076
9077 err = kstrtoul(buf, 10, &min);
9078 if (err)
9079 return err;
9080
9081 s->min_partial = min;
9082 return length;
9083}
9084SLAB_ATTR(min_partial);
9085
9086static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
9087{
9088 unsigned int nr_partial = 0;
9089#ifdef CONFIG_SLUB_CPU_PARTIAL
9090 nr_partial = s->cpu_partial;
9091#endif
9092
9093 return sysfs_emit(buf, "%u\n", nr_partial);
9094}
9095
9096static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
9097 size_t length)
9098{
9099 unsigned int objects;
9100 int err;
9101
9102 err = kstrtouint(buf, 10, &objects);
9103 if (err)
9104 return err;
9105 if (objects && !kmem_cache_has_cpu_partial(s))
9106 return -EINVAL;
9107
9108 slub_set_cpu_partial(s, objects);
9109 flush_all(s);
9110 return length;
9111}
9112SLAB_ATTR(cpu_partial);
9113
9114static ssize_t ctor_show(struct kmem_cache *s, char *buf)
9115{
9116 if (!s->ctor)
9117 return 0;
9118 return sysfs_emit(buf, "%pS\n", s->ctor);
9119}
9120SLAB_ATTR_RO(ctor);
9121
9122static ssize_t aliases_show(struct kmem_cache *s, char *buf)
9123{
9124 return sysfs_emit(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
9125}
9126SLAB_ATTR_RO(aliases);
9127
9128static ssize_t partial_show(struct kmem_cache *s, char *buf)
9129{
9130 return show_slab_objects(s, buf, SO_PARTIAL);
9131}
9132SLAB_ATTR_RO(partial);
9133
9134static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
9135{
9136 return show_slab_objects(s, buf, SO_CPU);
9137}
9138SLAB_ATTR_RO(cpu_slabs);
9139
9140static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
9141{
9142 return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
9143}
9144SLAB_ATTR_RO(objects_partial);
9145
9146static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
9147{
9148 int objects = 0;
9149 int slabs = 0;
9150 int cpu __maybe_unused;
9151 int len = 0;
9152
9153#ifdef CONFIG_SLUB_CPU_PARTIAL
9154 for_each_online_cpu(cpu) {
9155 struct slab *slab;
9156
9157 slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
9158
9159 if (slab)
9160 slabs += data_race(slab->slabs);
9161 }
9162#endif
9163
9164 /* Approximate half-full slabs, see slub_set_cpu_partial() */
9165 objects = (slabs * oo_objects(s->oo)) / 2;
9166 len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs);
9167
9168#ifdef CONFIG_SLUB_CPU_PARTIAL
9169 for_each_online_cpu(cpu) {
9170 struct slab *slab;
9171
9172 slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
9173 if (slab) {
9174 slabs = data_race(slab->slabs);
9175 objects = (slabs * oo_objects(s->oo)) / 2;
9176 len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
9177 cpu, objects, slabs);
9178 }
9179 }
9180#endif
9181 len += sysfs_emit_at(buf, len, "\n");
9182
9183 return len;
9184}
9185SLAB_ATTR_RO(slabs_cpu_partial);
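/*
 * Example output (editor's illustration): "104(4) C0=52(2) C1=52(2)" means
 * roughly 104 objects in 4 per-cpu partial slabs overall, followed by the
 * per-cpu breakdown; the object counts are the half-full approximation
 * computed above, not exact numbers.
 */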
9186
9187static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
9188{
9189 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
9190}
9191SLAB_ATTR_RO(reclaim_account);
9192
9193static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
9194{
9195 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
9196}
9197SLAB_ATTR_RO(hwcache_align);
9198
9199#ifdef CONFIG_ZONE_DMA
9200static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
9201{
9202 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
9203}
9204SLAB_ATTR_RO(cache_dma);
9205#endif
9206
9207#ifdef CONFIG_HARDENED_USERCOPY
9208static ssize_t usersize_show(struct kmem_cache *s, char *buf)
9209{
9210 return sysfs_emit(buf, "%u\n", s->usersize);
9211}
9212SLAB_ATTR_RO(usersize);
9213#endif
9214
9215static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
9216{
9217 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
9218}
9219SLAB_ATTR_RO(destroy_by_rcu);
9220
9221#ifdef CONFIG_SLUB_DEBUG
9222static ssize_t slabs_show(struct kmem_cache *s, char *buf)
9223{
9224 return show_slab_objects(s, buf, SO_ALL);
9225}
9226SLAB_ATTR_RO(slabs);
9227
9228static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
9229{
9230 return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
9231}
9232SLAB_ATTR_RO(total_objects);
9233
9234static ssize_t objects_show(struct kmem_cache *s, char *buf)
9235{
9236 return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
9237}
9238SLAB_ATTR_RO(objects);
9239
9240static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
9241{
9242 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
9243}
9244SLAB_ATTR_RO(sanity_checks);
9245
9246static ssize_t trace_show(struct kmem_cache *s, char *buf)
9247{
9248 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TRACE));
9249}
9250SLAB_ATTR_RO(trace);
9251
9252static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
9253{
9254 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
9255}
9256
9257SLAB_ATTR_RO(red_zone);
9258
9259static ssize_t poison_show(struct kmem_cache *s, char *buf)
9260{
9261 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_POISON));
9262}
9263
9264SLAB_ATTR_RO(poison);
9265
9266static ssize_t store_user_show(struct kmem_cache *s, char *buf)
9267{
9268 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
9269}
9270
9271SLAB_ATTR_RO(store_user);
9272
9273static ssize_t validate_show(struct kmem_cache *s, char *buf)
9274{
9275 return 0;
9276}
9277
9278static ssize_t validate_store(struct kmem_cache *s,
9279 const char *buf, size_t length)
9280{
9281 int ret = -EINVAL;
9282
9283 if (buf[0] == '1' && kmem_cache_debug(s)) {
9284 ret = validate_slab_cache(s);
9285 if (ret >= 0)
9286 ret = length;
9287 }
9288 return ret;
9289}
9290SLAB_ATTR(validate);
9291
9292#endif /* CONFIG_SLUB_DEBUG */
9293
9294#ifdef CONFIG_FAILSLAB
9295static ssize_t failslab_show(struct kmem_cache *s, char *buf)
9296{
9297 return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
9298}
9299
9300static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
9301 size_t length)
9302{
9303 if (s->refcount > 1)
9304 return -EINVAL;
9305
9306 if (buf[0] == '1')
9307 WRITE_ONCE(s->flags, s->flags | SLAB_FAILSLAB);
9308 else
9309 WRITE_ONCE(s->flags, s->flags & ~SLAB_FAILSLAB);
9310
9311 return length;
9312}
9313SLAB_ATTR(failslab);
9314#endif
9315
9316static ssize_t shrink_show(struct kmem_cache *s, char *buf)
9317{
9318 return 0;
9319}
9320
9321static ssize_t shrink_store(struct kmem_cache *s,
9322 const char *buf, size_t length)
9323{
9324 if (buf[0] == '1')
9325 kmem_cache_shrink(s);
9326 else
9327 return -EINVAL;
9328 return length;
9329}
9330SLAB_ATTR(shrink);
9331
9332#ifdef CONFIG_NUMA
9333static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
9334{
9335 return sysfs_emit(buf, "%u\n", s->remote_node_defrag_ratio / 10);
9336}
9337
9338static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
9339 const char *buf, size_t length)
9340{
9341 unsigned int ratio;
9342 int err;
9343
9344 err = kstrtouint(buf, 10, &ratio);
9345 if (err)
9346 return err;
9347 if (ratio > 100)
9348 return -ERANGE;
9349
9350 s->remote_node_defrag_ratio = ratio * 10;
9351
9352 return length;
9353}
9354SLAB_ATTR(remote_node_defrag_ratio);
9355#endif
9356
9357#ifdef CONFIG_SLUB_STATS
9358static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
9359{
9360 unsigned long sum = 0;
9361 int cpu;
9362 int len = 0;
9363 int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL);
9364
9365 if (!data)
9366 return -ENOMEM;
9367
9368 for_each_online_cpu(cpu) {
9369 unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
9370
9371 data[cpu] = x;
9372 sum += x;
9373 }
9374
9375 len += sysfs_emit_at(buf, len, "%lu", sum);
9376
9377#ifdef CONFIG_SMP
9378 for_each_online_cpu(cpu) {
9379 if (data[cpu])
9380 len += sysfs_emit_at(buf, len, " C%d=%u",
9381 cpu, data[cpu]);
9382 }
9383#endif
9384 kfree(data);
9385 len += sysfs_emit_at(buf, len, "\n");
9386
9387 return len;
9388}
9389
9390static void clear_stat(struct kmem_cache *s, enum stat_item si)
9391{
9392 int cpu;
9393
9394 for_each_online_cpu(cpu)
9395 per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
9396}
9397
9398#define STAT_ATTR(si, text) \
9399static ssize_t text##_show(struct kmem_cache *s, char *buf) \
9400{ \
9401 return show_stat(s, buf, si); \
9402} \
9403static ssize_t text##_store(struct kmem_cache *s, \
9404 const char *buf, size_t length) \
9405{ \
9406 if (buf[0] != '0') \
9407 return -EINVAL; \
9408 clear_stat(s, si); \
9409 return length; \
9410} \
9411SLAB_ATTR(text); \
9412
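/*
 * Example (editor's note): STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath) below
 * expands to alloc_fastpath_show()/alloc_fastpath_store() wrappers around
 * show_stat()/clear_stat() plus the alloc_fastpath_attr attribute; writing
 * '0' to the corresponding sysfs file clears the per-cpu counters.
 */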
9413STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf);
9414STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
9415STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
9416STAT_ATTR(FREE_PCS, free_cpu_sheaf);
9417STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf);
9418STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail);
9419STAT_ATTR(FREE_FASTPATH, free_fastpath);
9420STAT_ATTR(FREE_SLOWPATH, free_slowpath);
9421STAT_ATTR(FREE_FROZEN, free_frozen);
9422STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
9423STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
9424STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
9425STAT_ATTR(ALLOC_SLAB, alloc_slab);
9426STAT_ATTR(ALLOC_REFILL, alloc_refill);
9427STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
9428STAT_ATTR(FREE_SLAB, free_slab);
9429STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
9430STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
9431STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
9432STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
9433STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
9434STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
9435STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
9436STAT_ATTR(ORDER_FALLBACK, order_fallback);
9437STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
STAT_ATTR(SHEAF_FLUSH, sheaf_flush);
STAT_ATTR(SHEAF_REFILL, sheaf_refill);
STAT_ATTR(SHEAF_ALLOC, sheaf_alloc);
STAT_ATTR(SHEAF_FREE, sheaf_free);
STAT_ATTR(BARN_GET, barn_get);
STAT_ATTR(BARN_GET_FAIL, barn_get_fail);
STAT_ATTR(BARN_PUT, barn_put);
STAT_ATTR(BARN_PUT_FAIL, barn_put_fail);
STAT_ATTR(SHEAF_PREFILL_FAST, sheaf_prefill_fast);
STAT_ATTR(SHEAF_PREFILL_SLOW, sheaf_prefill_slow);
STAT_ATTR(SHEAF_PREFILL_OVERSIZE, sheaf_prefill_oversize);
STAT_ATTR(SHEAF_RETURN_FAST, sheaf_return_fast);
STAT_ATTR(SHEAF_RETURN_SLOW, sheaf_return_slow);
#endif /* CONFIG_SLUB_STATS */

#ifdef CONFIG_KFENCE
static ssize_t skip_kfence_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_SKIP_KFENCE));
}

static ssize_t skip_kfence_store(struct kmem_cache *s,
			const char *buf, size_t length)
{
	int ret = length;

	if (buf[0] == '0')
		s->flags &= ~SLAB_SKIP_KFENCE;
	else if (buf[0] == '1')
		s->flags |= SLAB_SKIP_KFENCE;
	else
		ret = -EINVAL;

	return ret;
}
SLAB_ATTR(skip_kfence);
#endif

static struct attribute *slab_attrs[] = {
	&slab_size_attr.attr,
	&object_size_attr.attr,
	&objs_per_slab_attr.attr,
	&order_attr.attr,
	&sheaf_capacity_attr.attr,
	&min_partial_attr.attr,
	&cpu_partial_attr.attr,
	&objects_partial_attr.attr,
	&partial_attr.attr,
	&cpu_slabs_attr.attr,
	&ctor_attr.attr,
	&aliases_attr.attr,
	&align_attr.attr,
	&hwcache_align_attr.attr,
	&reclaim_account_attr.attr,
	&destroy_by_rcu_attr.attr,
	&shrink_attr.attr,
	&slabs_cpu_partial_attr.attr,
#ifdef CONFIG_SLUB_DEBUG
	&total_objects_attr.attr,
	&objects_attr.attr,
	&slabs_attr.attr,
	&sanity_checks_attr.attr,
	&trace_attr.attr,
	&red_zone_attr.attr,
	&poison_attr.attr,
	&store_user_attr.attr,
	&validate_attr.attr,
#endif
#ifdef CONFIG_ZONE_DMA
	&cache_dma_attr.attr,
#endif
#ifdef CONFIG_NUMA
	&remote_node_defrag_ratio_attr.attr,
#endif
#ifdef CONFIG_SLUB_STATS
	&alloc_cpu_sheaf_attr.attr,
	&alloc_fastpath_attr.attr,
	&alloc_slowpath_attr.attr,
	&free_cpu_sheaf_attr.attr,
	&free_rcu_sheaf_attr.attr,
	&free_rcu_sheaf_fail_attr.attr,
	&free_fastpath_attr.attr,
	&free_slowpath_attr.attr,
	&free_frozen_attr.attr,
	&free_add_partial_attr.attr,
	&free_remove_partial_attr.attr,
	&alloc_from_partial_attr.attr,
	&alloc_slab_attr.attr,
	&alloc_refill_attr.attr,
	&alloc_node_mismatch_attr.attr,
	&free_slab_attr.attr,
	&cpuslab_flush_attr.attr,
	&deactivate_full_attr.attr,
	&deactivate_empty_attr.attr,
	&deactivate_to_head_attr.attr,
	&deactivate_to_tail_attr.attr,
	&deactivate_remote_frees_attr.attr,
	&deactivate_bypass_attr.attr,
	&order_fallback_attr.attr,
	&cmpxchg_double_fail_attr.attr,
	&cmpxchg_double_cpu_fail_attr.attr,
	&cpu_partial_alloc_attr.attr,
	&cpu_partial_free_attr.attr,
	&cpu_partial_node_attr.attr,
	&cpu_partial_drain_attr.attr,
	&sheaf_flush_attr.attr,
	&sheaf_refill_attr.attr,
	&sheaf_alloc_attr.attr,
	&sheaf_free_attr.attr,
	&barn_get_attr.attr,
	&barn_get_fail_attr.attr,
	&barn_put_attr.attr,
	&barn_put_fail_attr.attr,
	&sheaf_prefill_fast_attr.attr,
	&sheaf_prefill_slow_attr.attr,
	&sheaf_prefill_oversize_attr.attr,
	&sheaf_return_fast_attr.attr,
	&sheaf_return_slow_attr.attr,
#endif
#ifdef CONFIG_FAILSLAB
	&failslab_attr.attr,
#endif
#ifdef CONFIG_HARDENED_USERCOPY
	&usersize_attr.attr,
#endif
#ifdef CONFIG_KFENCE
	&skip_kfence_attr.attr,
#endif

	NULL
};

static const struct attribute_group slab_attr_group = {
	.attrs = slab_attrs,
};

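/*
 * sysfs ->show() dispatch: look up the slab_attribute backing @attr and
 * forward to its ->show() handler for the cache that owns @kobj.
 */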
static ssize_t slab_attr_show(struct kobject *kobj,
				struct attribute *attr,
				char *buf)
{
	struct slab_attribute *attribute;
	struct kmem_cache *s;

	attribute = to_slab_attr(attr);
	s = to_slab(kobj);

	if (!attribute->show)
		return -EIO;

	return attribute->show(s, buf);
}

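/*
 * sysfs ->store() dispatch: forward the write to the attribute's ->store()
 * handler, or fail with -EIO if the attribute is read-only.
 */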
static ssize_t slab_attr_store(struct kobject *kobj,
				struct attribute *attr,
				const char *buf, size_t len)
{
	struct slab_attribute *attribute;
	struct kmem_cache *s;

	attribute = to_slab_attr(attr);
	s = to_slab(kobj);

	if (!attribute->store)
		return -EIO;

	return attribute->store(s, buf, len);
}

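/*
 * kobject release callback: runs once the last reference to the cache's
 * sysfs kobject has been dropped and hands the kmem_cache to
 * slab_kmem_cache_release() for final teardown.
 */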
static void kmem_cache_release(struct kobject *k)
{
	slab_kmem_cache_release(to_slab(k));
}

static const struct sysfs_ops slab_sysfs_ops = {
	.show = slab_attr_show,
	.store = slab_attr_store,
};

static const struct kobj_type slab_ktype = {
	.sysfs_ops = &slab_sysfs_ops,
	.release = kmem_cache_release,
};

static struct kset *slab_kset;

static inline struct kset *cache_kset(struct kmem_cache *s)
{
	return slab_kset;
}

#define ID_STR_LENGTH 32

/* Create a unique string id for a slab cache:
 *
 * Format	:[flags-]size
 */
static char *create_unique_id(struct kmem_cache *s)
{
	char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
	char *p = name;

	if (!name)
		return ERR_PTR(-ENOMEM);

	*p++ = ':';
	/*
	 * First flags affecting slabcache operations. We will only
	 * get here for aliasable slabs so we do not need to support
	 * too many flags. The flags here must cover all flags that
	 * are matched during merging to guarantee that the id is
	 * unique.
	 */
	if (s->flags & SLAB_CACHE_DMA)
		*p++ = 'd';
	if (s->flags & SLAB_CACHE_DMA32)
		*p++ = 'D';
	if (s->flags & SLAB_RECLAIM_ACCOUNT)
		*p++ = 'a';
	if (s->flags & SLAB_CONSISTENCY_CHECKS)
		*p++ = 'F';
	if (s->flags & SLAB_ACCOUNT)
		*p++ = 'A';
	if (p != name + 1)
		*p++ = '-';
	p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size);

	if (WARN_ON(p > name + ID_STR_LENGTH - 1)) {
		kfree(name);
		return ERR_PTR(-EINVAL);
	}
	kmsan_unpoison_memory(name, p - name);
	return name;
}

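/*
 * Create the sysfs directory and attribute group for a cache. Mergeable
 * caches are added under a unique ":[flags-]size" id and later get a
 * symlink alias under their proper name; unmergeable caches use the
 * cache name directly.
 */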
static int sysfs_slab_add(struct kmem_cache *s)
{
	int err;
	const char *name;
	struct kset *kset = cache_kset(s);
	int unmergeable = slab_unmergeable(s);

	if (!unmergeable && disable_higher_order_debug &&
	    (slub_debug & DEBUG_METADATA_FLAGS))
		unmergeable = 1;

	if (unmergeable) {
		/*
		 * Slabcache can never be merged so we can use the name proper.
		 * This is typically the case for debug situations. In that
		 * case we can catch duplicate names easily.
		 */
		sysfs_remove_link(&slab_kset->kobj, s->name);
		name = s->name;
	} else {
		/*
		 * Create a unique name for the slab as a target
		 * for the symlinks.
		 */
		name = create_unique_id(s);
		if (IS_ERR(name))
			return PTR_ERR(name);
	}

	s->kobj.kset = kset;
	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
	if (err)
		goto out;

	err = sysfs_create_group(&s->kobj, &slab_attr_group);
	if (err)
		goto out_del_kobj;

	if (!unmergeable) {
		/* Setup first alias */
		sysfs_slab_alias(s, s->name);
	}
out:
	if (!unmergeable)
		kfree(name);
	return err;
out_del_kobj:
	kobject_del(&s->kobj);
	goto out;
}

void sysfs_slab_unlink(struct kmem_cache *s)
{
	if (s->kobj.state_in_sysfs)
		kobject_del(&s->kobj);
}

void sysfs_slab_release(struct kmem_cache *s)
{
	kobject_put(&s->kobj);
}

/*
 * Need to buffer aliases during bootup until sysfs becomes
 * available lest we lose that information.
 */
struct saved_alias {
	struct kmem_cache *s;
	const char *name;
	struct saved_alias *next;
};

static struct saved_alias *alias_list;

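/*
 * Once sysfs is fully up (slab_state == FULL) create the alias symlink
 * immediately; during early boot, queue it on alias_list so that
 * slab_sysfs_init() can replay it later.
 */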
static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
{
	struct saved_alias *al;

	if (slab_state == FULL) {
		/*
		 * If we have a leftover link then remove it.
		 */
		sysfs_remove_link(&slab_kset->kobj, name);
		/*
		 * The original cache may have failed to create its sysfs
		 * directory. In that case, sysfs_create_link() returns
		 * -ENOENT and symbolic link creation is skipped.
		 */
		return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
	}

	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
	if (!al)
		return -ENOMEM;

	al->s = s;
	al->name = name;
	al->next = alias_list;
	alias_list = al;
	kmsan_unpoison_memory(al, sizeof(*al));
	return 0;
}

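/*
 * Register the /sys/kernel/slab kset, add every cache created during boot
 * and replay the alias symlinks that were queued before sysfs was ready.
 */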
static int __init slab_sysfs_init(void)
{
	struct kmem_cache *s;
	int err;

	mutex_lock(&slab_mutex);

	slab_kset = kset_create_and_add("slab", NULL, kernel_kobj);
	if (!slab_kset) {
		mutex_unlock(&slab_mutex);
		pr_err("Cannot register slab subsystem.\n");
		return -ENOMEM;
	}

	slab_state = FULL;

	list_for_each_entry(s, &slab_caches, list) {
		err = sysfs_slab_add(s);
		if (err)
			pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
			       s->name);
	}

	while (alias_list) {
		struct saved_alias *al = alias_list;

		alias_list = alias_list->next;
		err = sysfs_slab_alias(al->s, al->name);
		if (err)
			pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
			       al->name);
		kfree(al);
	}

	mutex_unlock(&slab_mutex);
	return 0;
}
late_initcall(slab_sysfs_init);
#endif /* SLAB_SUPPORTS_SYSFS */

#if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS)
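/*
 * Emit one location record: count, call site, wasted bytes, object age,
 * pid range, cpus/nodes and (with stackdepot) the saved call stack.
 */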
static int slab_debugfs_show(struct seq_file *seq, void *v)
{
	struct loc_track *t = seq->private;
	struct location *l;
	unsigned long idx;

	idx = (unsigned long) t->idx;
	if (idx < t->count) {
		l = &t->loc[idx];

		seq_printf(seq, "%7ld ", l->count);

		if (l->addr)
			seq_printf(seq, "%pS", (void *)l->addr);
		else
			seq_puts(seq, "<not-available>");

		if (l->waste)
			seq_printf(seq, " waste=%lu/%lu",
				l->count * l->waste, l->waste);

		if (l->sum_time != l->min_time) {
			seq_printf(seq, " age=%ld/%llu/%ld",
				l->min_time, div_u64(l->sum_time, l->count),
				l->max_time);
		} else
			seq_printf(seq, " age=%ld", l->min_time);

		if (l->min_pid != l->max_pid)
			seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid);
		else
			seq_printf(seq, " pid=%ld",
				l->min_pid);

		if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)))
			seq_printf(seq, " cpus=%*pbl",
				cpumask_pr_args(to_cpumask(l->cpus)));

		if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
			seq_printf(seq, " nodes=%*pbl",
				nodemask_pr_args(&l->nodes));

#ifdef CONFIG_STACKDEPOT
		{
			depot_stack_handle_t handle;
			unsigned long *entries;
			unsigned int nr_entries, j;

			handle = READ_ONCE(l->handle);
			if (handle) {
				nr_entries = stack_depot_fetch(handle, &entries);
				seq_puts(seq, "\n");
				for (j = 0; j < nr_entries; j++)
					seq_printf(seq, " %pS\n", (void *)entries[j]);
			}
		}
#endif
		seq_puts(seq, "\n");
	}

	if (!idx && !t->count)
		seq_puts(seq, "No data\n");

	return 0;
}

static void slab_debugfs_stop(struct seq_file *seq, void *v)
{
}

static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos)
{
	struct loc_track *t = seq->private;

	t->idx = ++(*ppos);
	if (*ppos <= t->count)
		return ppos;

	return NULL;
}

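/*
 * Comparator for sort(): order location records by descending count so
 * the most frequent call sites are listed first.
 */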
static int cmp_loc_by_count(const void *a, const void *b)
{
	struct location *loc1 = (struct location *)a;
	struct location *loc2 = (struct location *)b;

	return cmp_int(loc2->count, loc1->count);
}

static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
{
	struct loc_track *t = seq->private;

	t->idx = *ppos;
	return ppos;
}

static const struct seq_operations slab_debugfs_sops = {
	.start = slab_debugfs_start,
	.next = slab_debugfs_next,
	.stop = slab_debugfs_stop,
	.show = slab_debugfs_show,
};

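/*
 * Collect the recorded alloc or free tracks from all partial and full
 * slabs of every node into a loc_track table and sort it by count for
 * the seq_file iterator above.
 */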
static int slab_debug_trace_open(struct inode *inode, struct file *filep)
{
	struct kmem_cache_node *n;
	enum track_item alloc;
	int node;
	struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops,
						sizeof(struct loc_track));
	struct kmem_cache *s = file_inode(filep)->i_private;
	unsigned long *obj_map;

	if (!t)
		return -ENOMEM;

	obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
	if (!obj_map) {
		seq_release_private(inode, filep);
		return -ENOMEM;
	}

	alloc = debugfs_get_aux_num(filep);

	if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
		bitmap_free(obj_map);
		seq_release_private(inode, filep);
		return -ENOMEM;
	}

	for_each_kmem_cache_node(s, node, n) {
		unsigned long flags;
		struct slab *slab;

		if (!node_nr_slabs(n))
			continue;

		spin_lock_irqsave(&n->list_lock, flags);
		list_for_each_entry(slab, &n->partial, slab_list)
			process_slab(t, s, slab, alloc, obj_map);
		list_for_each_entry(slab, &n->full, slab_list)
			process_slab(t, s, slab, alloc, obj_map);
		spin_unlock_irqrestore(&n->list_lock, flags);
	}

	/* Sort locations by count */
	sort(t->loc, t->count, sizeof(struct location),
	     cmp_loc_by_count, NULL);

	bitmap_free(obj_map);
	return 0;
}

static int slab_debug_trace_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;
	struct loc_track *t = seq->private;

	free_loc_track(t);
	return seq_release_private(inode, file);
}

static const struct file_operations slab_debugfs_fops = {
	.open = slab_debug_trace_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = slab_debug_trace_release,
};

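/*
 * Create <debugfs>/slab/<cache name>/ with the alloc_traces and
 * free_traces files served by slab_debugfs_fops.
 */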
static void debugfs_slab_add(struct kmem_cache *s)
{
	struct dentry *slab_cache_dir;

	if (unlikely(!slab_debugfs_root))
		return;

	slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);

	debugfs_create_file_aux_num("alloc_traces", 0400, slab_cache_dir, s,
					TRACK_ALLOC, &slab_debugfs_fops);

	debugfs_create_file_aux_num("free_traces", 0400, slab_cache_dir, s,
					TRACK_FREE, &slab_debugfs_fops);
}

void debugfs_slab_release(struct kmem_cache *s)
{
	debugfs_lookup_and_remove(s->name, slab_debugfs_root);
}

static int __init slab_debugfs_init(void)
{
	struct kmem_cache *s;

	slab_debugfs_root = debugfs_create_dir("slab", NULL);

	list_for_each_entry(s, &slab_caches, list)
		if (s->flags & SLAB_STORE_USER)
			debugfs_slab_add(s);

	return 0;
}
__initcall(slab_debugfs_init);
#endif

/*
 * The /proc/slabinfo ABI
 */
#ifdef CONFIG_SLUB_DEBUG
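/*
 * Fill in a slabinfo record from the per-node counters. The number of
 * free objects is an approximation taken from the partial lists, so
 * active_objs is approximate as well.
 */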
void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
{
	unsigned long nr_slabs = 0;
	unsigned long nr_objs = 0;
	unsigned long nr_free = 0;
	int node;
	struct kmem_cache_node *n;

	for_each_kmem_cache_node(s, node, n) {
		nr_slabs += node_nr_slabs(n);
		nr_objs += node_nr_objs(n);
		nr_free += count_partial_free_approx(n);
	}

	sinfo->active_objs = nr_objs - nr_free;
	sinfo->num_objs = nr_objs;
	sinfo->active_slabs = nr_slabs;
	sinfo->num_slabs = nr_slabs;
	sinfo->objects_per_slab = oo_objects(s->oo);
	sinfo->cache_order = oo_order(s->oo);
}
#endif /* CONFIG_SLUB_DEBUG */