/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _LINUX_OBJPOOL_H
#define _LINUX_OBJPOOL_H

#include <linux/types.h>
#include <linux/refcount.h>
#include <linux/atomic.h>
#include <linux/cpumask.h>
#include <linux/irqflags.h>
#include <linux/smp.h>

/*
 * objpool: ring-array based lockless MPMC queue
 *
 * Copyright: wuqiang.matt@bytedance.com, mhiramat@kernel.org
 *
 * objpool is a scalable implementation of a high-performance queue for
 * object allocation and reclamation, such as kretprobe instances.
 *
 * By leveraging per-CPU ring arrays to mitigate hot spots of memory
 * contention, it delivers near-linear scalability for highly parallel
 * scenarios. The objpool is best suited for the following cases:
 * 1) Memory allocation or reclamation is prohibited or too expensive
 * 2) Consumers may run at different priorities, such as irqs and threads
 *
 * Limitations:
 * 1) The maximum number of objects (capacity) is fixed after objpool creation
 * 2) All pre-allocated objects are managed in per-CPU ring arrays,
 *    which consume more memory than linked lists
 */

/**
 * struct objpool_slot - percpu ring array of objpool
 * @head: head sequence of the local ring array (to retrieve at)
 * @tail: tail sequence of the local ring array (to append at)
 * @last: the last sequence number marked as ready for retrieval
 * @mask: bit mask for modulo capacity, used to compute array indexes
 * @entries: object entries on this slot
 *
 * Represents a cpu-local, array-based ring buffer whose size is specified
 * during initialization of the object pool. On NUMA systems the percpu
 * slot is allocated from CPU-local memory and kept compact in contiguous
 * memory: the objects assigned to a CPU are stored just after the body of
 * its objpool_slot.
 *
 * The real size of the ring array is far smaller than the value range of
 * head and tail, typed as uint32_t: [0, 2^32), so only the lower bits
 * (mask) of head and tail are used as the actual position in the ring
 * array. In general the ring array acts like a small sliding window that
 * always moves forward within the loop of [0, 2^32).
 */
struct objpool_slot {
	uint32_t            head;
	uint32_t            tail;
	uint32_t            last;
	uint32_t            mask;
	void               *entries[];
} __packed;
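
/*
 * Illustrative sketch (not part of the code above): a worked example of
 * the sliding-window arithmetic, assuming a hypothetical slot of
 * capacity 8, i.e. mask == 7.
 *
 *	head = 0xfffffffe	->  entry index = head & mask = 6
 *	tail = 0x00000001	->  entry index = tail & mask = 1
 *	tail - head = 3		->  still the correct count of queued
 *				    objects, because the subtraction wraps
 *				    in unsigned 32-bit arithmetic
 */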

struct objpool_head;

/*
 * caller-specified callback for initial object setup; it is called only
 * once per object, just after the object's memory is allocated
 */
typedef int (*objpool_init_obj_cb)(void *obj, void *context);

/* caller-specified cleanup callback for objpool destruction */
typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);

/**
 * struct objpool_head - object pooling metadata
 * @obj_size:   object size, aligned to sizeof(void *)
 * @nr_objs:    total objs (to be pre-allocated with objpool)
 * @nr_possible_cpus: cached value of num_possible_cpus()
 * @capacity:   max objs that can be managed by one objpool_slot
 * @gfp:        gfp flags for kmalloc & vmalloc
 * @ref:        refcount of objpool
 * @flags:      flags for objpool management
 * @cpu_slots:  pointer to the array of objpool_slot
 * @release:    resource cleanup callback
 * @context:    caller-provided context
 */
struct objpool_head {
	int                     obj_size;
	int                     nr_objs;
	int                     nr_possible_cpus;
	int                     capacity;
	gfp_t                   gfp;
	refcount_t              ref;
	unsigned long           flags;
	struct objpool_slot   **cpu_slots;
	objpool_fini_cb         release;
	void                   *context;
};

#define OBJPOOL_NR_OBJECT_MAX	(1UL << 24) /* maximum number of total objects */
#define OBJPOOL_OBJECT_SIZE_MAX	(1UL << 16) /* maximum size of an object */

/**
 * objpool_init() - initialize objpool and pre-allocated objects
 * @pool:    the object pool to be initialized, declared by caller
 * @nr_objs: total objects to be pre-allocated by this object pool
 * @object_size: size of an object (should be > 0)
 * @gfp:     flags for memory allocation (via kmalloc or vmalloc)
 * @context: user context for object initialization callback
 * @objinit: object initialization callback for extra setup
 * @release: cleanup callback for extra cleanup task
 *
 * return value: 0 for success, otherwise error code
 *
 * All pre-allocated objects are zeroed after memory allocation. The
 * caller can do extra initialization in the objinit callback. objinit()
 * is called just after slot allocation and only once for each object.
 * After that the objpool won't touch any content of the objects. It is
 * the caller's duty to perform reinitialization after each pop (object
 * allocation) or to do cleanup before each push (object reclamation).
 */
int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
		 gfp_t gfp, void *context, objpool_init_obj_cb objinit,
		 objpool_fini_cb release);
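
/*
 * A minimal usage sketch (illustration only, not part of this header):
 * the structure and callback names below are hypothetical. The objinit
 * callback runs once per object, right after the pool memory is set up.
 *
 *	struct my_node {
 *		struct list_head list;
 *		unsigned long    data;
 *	};
 *
 *	static int my_objinit(void *obj, void *context)
 *	{
 *		struct my_node *node = obj;
 *
 *		INIT_LIST_HEAD(&node->list);
 *		return 0;
 *	}
 *
 *	struct objpool_head my_pool;
 *	int rc = objpool_init(&my_pool, 512, sizeof(struct my_node),
 *			      GFP_KERNEL, NULL, my_objinit, NULL);
 */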

/* try to retrieve an object from the slot */
static inline void *__objpool_try_get_slot(struct objpool_head *pool, int cpu)
{
	struct objpool_slot *slot = pool->cpu_slots[cpu];
	/* load head snapshot, other cpus may change it */
	uint32_t head = smp_load_acquire(&slot->head);

	while (head != READ_ONCE(slot->last)) {
		void *obj;

		/*
		 * data visibility of 'last' and 'head' could be out of
		 * order since 'last' and 'head' are updated independently
		 * by push() and pop()
		 *
		 * before any retrieval attempt, pop() must guarantee
		 * 'head' is behind 'last', that is to say, there must
		 * be available objects in the slot, which can be ensured
		 * by the condition 'last != head && last - head <= nr_objs',
		 * equivalent to 'last - head - 1 < nr_objs' since 'last'
		 * and 'head' are both unsigned int32
		 */
		if (READ_ONCE(slot->last) - head - 1 >= pool->nr_objs) {
			head = READ_ONCE(slot->head);
			continue;
		}

		/* obj must be retrieved before moving head forward */
		obj = READ_ONCE(slot->entries[head & slot->mask]);

		/* move head forward to mark the object's consumption */
		if (try_cmpxchg_release(&slot->head, &head, head + 1))
			return obj;
	}

	return NULL;
}

/**
 * objpool_pop() - allocate an object from objpool
 * @pool: object pool
 *
 * return value: object ptr or NULL if failed
 */
static inline void *objpool_pop(struct objpool_head *pool)
{
	void *obj = NULL;
	unsigned long flags;
	int start, cpu;

	/* disable local irq to avoid preemption & interruption */
	raw_local_irq_save(flags);

	start = raw_smp_processor_id();
	for_each_possible_cpu_wrap(cpu, start) {
		obj = __objpool_try_get_slot(pool, cpu);
		if (obj)
			break;
	}
	raw_local_irq_restore(flags);

	return obj;
}

/*
 * add an object to the slot; in correct usage the slot always has room,
 * so a full slot indicates misuse and only triggers the warning below
 */
static inline int
__objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu)
{
	struct objpool_slot *slot = pool->cpu_slots[cpu];
	uint32_t head, tail;

	/* loading tail and head as a local snapshot, tail first */
	tail = READ_ONCE(slot->tail);

	do {
		head = READ_ONCE(slot->head);
		/* fault caught: something must be wrong */
		WARN_ON_ONCE(tail - head > pool->nr_objs);
	} while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1));

	/* now the tail position is reserved for the given obj */
	WRITE_ONCE(slot->entries[tail & slot->mask], obj);
	/* update sequence to make this obj available for pop() */
	smp_store_release(&slot->last, tail + 1);

	return 0;
}

/**
 * objpool_push() - reclaim the object and return it to the objpool
 * @obj:  object ptr to be pushed to objpool
 * @pool: object pool
 *
 * return: 0 or error code (it fails only when the user tries to push
 * the same object multiple times or pushes "objects" that do not
 * belong to the objpool)
 */
static inline int objpool_push(void *obj, struct objpool_head *pool)
{
	unsigned long flags;
	int rc;

	/* disable local irq to avoid preemption & interruption */
	raw_local_irq_save(flags);
	rc = __objpool_try_add_slot(obj, pool, raw_smp_processor_id());
	raw_local_irq_restore(flags);

	return rc;
}
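
/*
 * A minimal pop/push sketch (illustration only, hypothetical names):
 * pop() hands back an object with whatever content it last had, so the
 * caller reinitializes it before use, as objpool_init() requires. The
 * lockless pop/push pair is meant to be usable from different contexts,
 * such as irqs and threads (see the overview above).
 *
 *	struct my_node *node = objpool_pop(&my_pool);
 *
 *	if (node) {
 *		node->data = 0;
 *		do_something(node);
 *		objpool_push(node, &my_pool);
 *	}
 */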

/**
 * objpool_drop() - discard the object and deref objpool
 * @obj:  object ptr to be discarded
 * @pool: object pool
 *
 * return: 0 if objpool was released; -EAGAIN if there are still
 *         outstanding objects
 *
 * objpool_drop() is normally used to release outstanding objects after
 * objpool cleanup (objpool_fini()). Consider this example: a kretprobe
 * is unregistered and objpool_fini() is called to release all remaining
 * objects, but some objects are still in use by unfinished kretprobes
 * (e.g. blocked in a blockable function such as sys_accept). Only when
 * the last outstanding object is dropped via objpool_drop() can the
 * whole objpool be released.
 */
int objpool_drop(void *obj, struct objpool_head *pool);

/**
 * objpool_free() - release objpool forcibly (all objects to be freed)
 * @pool: object pool to be released
 */
void objpool_free(struct objpool_head *pool);

/**
 * objpool_fini() - deref object pool (also releasing unused objects)
 * @pool: object pool to be dereferenced
 *
 * objpool_fini() tries to release all remaining free objects and then
 * drops an extra reference of the objpool. If all objects have already
 * been returned to the objpool (so-called synchronous use cases), the
 * objpool itself is freed together. But if there are still outstanding
 * objects (so-called asynchronous use cases, such as a blockable
 * kretprobe), the objpool won't be released until all the outstanding
 * objects are dropped, and the caller must ensure there are no
 * concurrent objpool_push() calls in flight. Normally RCU is required
 * to make sure all ongoing objpool_push() calls have finished before
 * objpool_fini() is called, as test_objpool, kretprobe and rethook do.
 */
void objpool_fini(struct objpool_head *pool);
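
/*
 * A teardown sketch for the asynchronous case (illustration only, with
 * hypothetical names): the RCU grace period ensures no objpool_push()
 * is still in flight, as required above; objpool_fini() reclaims the
 * free objects and drops the pool's own reference, then every
 * outstanding object is handed to objpool_drop(), and the final drop
 * releases the pool and invokes the release callback passed to
 * objpool_init(), if any.
 *
 *	synchronize_rcu();
 *	objpool_fini(&my_pool);
 *
 *	objpool_drop(outstanding_obj, &my_pool);
 */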

#endif /* _LINUX_OBJPOOL_H */