/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _LINUX_OBJPOOL_H
#define _LINUX_OBJPOOL_H

#include <linux/types.h>
#include <linux/refcount.h>
#include <linux/atomic.h>
#include <linux/cpumask.h>
#include <linux/irqflags.h>
#include <linux/smp.h>

/*
 * objpool: ring-array based lockless MPMC queue
 *
 * Copyright: wuqiang.matt@bytedance.com, mhiramat@kernel.org
 *
 * objpool is a scalable, high-performance queue implementation for
 * object allocation and reclamation, such as kretprobe instances.
 *
 * By leveraging percpu ring arrays to mitigate hot spots of memory
 * contention, it delivers near-linear scalability in highly parallel
 * scenarios. The objpool is best suited for the following cases:
 * 1) Memory allocation or reclamation is prohibited or too expensive
 * 2) Consumers are of different priorities, such as irqs and threads
 *
 * Limitations:
 * 1) The maximum number of objects (capacity) is fixed after objpool
 *    creation
 * 2) All pre-allocated objects are managed in percpu ring arrays,
 *    which consume more memory than linked lists
 */
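
/*
 * Typical lifecycle (an illustrative sketch only, not part of this API;
 * "struct my_obj", my_pool and the sizes used here are hypothetical):
 *
 *	struct my_obj {
 *		int id;
 *	};
 *
 *	static struct objpool_head my_pool;
 *
 *	static int my_pool_setup(void)
 *	{
 *		// pre-allocate 128 zeroed objects; no per-object init or
 *		// release callbacks are needed for this simple case
 *		return objpool_init(&my_pool, 128, sizeof(struct my_obj),
 *				    GFP_KERNEL, NULL, NULL, NULL);
 *	}
 *
 *	static void my_pool_teardown(void)
 *	{
 *		// drops the initial reference; the pool is freed once all
 *		// outstanding objects have been returned or dropped
 *		objpool_fini(&my_pool);
 *	}
 */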

/**
 * struct objpool_slot - percpu ring array of objpool
 * @head: head sequence of the local ring array (to retrieve at)
 * @tail: tail sequence of the local ring array (to append at)
 * @last: the last sequence number marked as ready for retrieval
 * @mask: bit mask (capacity - 1) used to compute array indexes
 * @entries: object entries on this slot
 *
 * Represents a cpu-local, array-based ring buffer whose size is fixed
 * during initialization of the object pool. On NUMA systems the percpu
 * slot is allocated from node-local memory and kept compact in
 * contiguous memory: the objects assigned to a CPU are stored right
 * after the body of its objpool_slot.
 *
 * The real size of the ring array is far smaller than the value range
 * of head and tail, which are uint32_t: [0, 2^32). Therefore only the
 * lower bits (mask) of head and tail are used as the actual position
 * in the ring array. In effect the ring array acts like a small sliding
 * window that keeps moving forward within [0, 2^32).
 */
struct objpool_slot {
	uint32_t head;
	uint32_t tail;
	uint32_t last;
	uint32_t mask;
	void *entries[];
} __packed;
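
/*
 * Worked example of the sliding-window indexing above (numbers are
 * illustrative only): with a capacity of 8, mask is 7 (0x7). If head
 * has advanced to 0xfffffffe, the next object lives at
 * entries[0xfffffffe & 0x7] = entries[6]; two pops later head wraps to
 * 0x00000000 and indexing simply continues at entries[0]. The unsigned
 * difference tail - head still yields the number of objects in the slot
 * across the wrap, which is why head and tail are never reset.
 */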

struct objpool_head;

/*
 * caller-specified callback for initial object setup; it is called only
 * once per object, just after the object's memory is allocated
 */
typedef int (*objpool_init_obj_cb)(void *obj, void *context);

/* caller-specified cleanup callback for objpool destruction */
typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
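
/*
 * Example callbacks (an illustrative sketch; "struct my_obj" and the
 * helper names are hypothetical, not part of this API):
 *
 *	static int my_objinit(void *obj, void *context)
 *	{
 *		struct my_obj *o = obj;
 *		int *next_id = context;
 *
 *		// objects arrive zeroed; only extra setup is done here
 *		o->id = (*next_id)++;
 *		return 0;
 *	}
 *
 *	static int my_release(struct objpool_head *head, void *context)
 *	{
 *		// release caller-owned resources tied to the pool, e.g.
 *		// the context itself if it was dynamically allocated
 *		kfree(context);
 *		return 0;
 *	}
 */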

/**
 * struct objpool_head - object pooling metadata
 * @obj_size: object size, aligned to sizeof(void *)
 * @nr_objs: total objects to be pre-allocated with this objpool
 * @nr_possible_cpus: cached value of num_possible_cpus()
 * @capacity: max number of objects that one objpool_slot can manage
 * @gfp: gfp flags for kmalloc & vmalloc
 * @ref: refcount of objpool
 * @flags: flags for objpool management
 * @cpu_slots: pointer to the array of objpool_slot
 * @release: resource cleanup callback
 * @context: caller-provided context
 */
struct objpool_head {
	int obj_size;
	int nr_objs;
	int nr_possible_cpus;
	int capacity;
	gfp_t gfp;
	refcount_t ref;
	unsigned long flags;
	struct objpool_slot **cpu_slots;
	objpool_fini_cb release;
	void *context;
};

#define OBJPOOL_NR_OBJECT_MAX	(1UL << 24) /* maximum number of total objects */
#define OBJPOOL_OBJECT_SIZE_MAX	(1UL << 16) /* maximum size of an object */

/**
 * objpool_init() - initialize objpool and pre-allocate objects
 * @pool: the object pool to be initialized, declared by caller
 * @nr_objs: total objects to be pre-allocated by this object pool
 * @object_size: size of an object (should be > 0)
 * @gfp: flags for memory allocation (via kmalloc or vmalloc)
 * @context: user context for object initialization callback
 * @objinit: object initialization callback for extra setup
 * @release: cleanup callback for extra cleanup task
 *
 * return value: 0 for success, otherwise an error code
 *
 * All pre-allocated objects are zeroed after memory allocation. The
 * caller can do extra initialization in the objinit callback, which
 * is invoked just after slot allocation and only once for each object.
 * After that the objpool never touches the content of the objects.
 * It is the caller's responsibility to reinitialize an object after
 * each pop (object allocation) and to clear it before each push
 * (object reclamation).
 */
int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
		 gfp_t gfp, void *context, objpool_init_obj_cb objinit,
		 objpool_fini_cb release);
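
/*
 * Example call (illustrative sketch; my_pool, my_objinit, my_release
 * and the sizes reuse the hypothetical names from the sketches above):
 *
 *	int *next_id = kzalloc(sizeof(*next_id), GFP_KERNEL);
 *	int rc;
 *
 *	if (!next_id)
 *		return -ENOMEM;
 *	rc = objpool_init(&my_pool, 256, sizeof(struct my_obj),
 *			  GFP_KERNEL, next_id, my_objinit, my_release);
 *	if (rc)
 *		kfree(next_id);
 *	return rc;
 *
 * As documented above, after every objpool_pop() the caller should
 * re-initialize the object before use, and clear any stale state
 * before handing it back with objpool_push().
 */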

/* try to retrieve object from slot */
static inline void *__objpool_try_get_slot(struct objpool_head *pool, int cpu)
{
	struct objpool_slot *slot = pool->cpu_slots[cpu];
	/* load head snapshot, other cpus may change it */
	uint32_t head = smp_load_acquire(&slot->head);

	while (head != READ_ONCE(slot->last)) {
		void *obj;

		/*
		 * data visibility of 'last' and 'head' could be out of
		 * order since updates of 'last' and 'head' are performed
		 * in push() and pop() independently
		 *
		 * before any retrieval attempt, pop() must guarantee
		 * 'last' is ahead of 'head', that is to say, there must
		 * be available objects in the slot, which is ensured by
		 * the condition 'last != head && last - head <= nr_objs',
		 * equivalent to 'last - head - 1 < nr_objs' since 'last'
		 * and 'head' are both unsigned int32
		 */
		if (READ_ONCE(slot->last) - head - 1 >= pool->nr_objs) {
			head = READ_ONCE(slot->head);
			continue;
		}

		/* obj must be retrieved before moving head forward */
		obj = READ_ONCE(slot->entries[head & slot->mask]);

		/* move head forward to mark the object as consumed */
		if (try_cmpxchg_release(&slot->head, &head, head + 1))
			return obj;
	}

	return NULL;
}
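
/*
 * Worked example of the consistency check above (values are illustrative
 * only): assume nr_objs = 64. If a stale snapshot yields head == last,
 * then last - head - 1 underflows to 0xffffffff, which is >= 64, so the
 * snapshot is discarded and head is reloaded. If last = 0x00000002 after
 * wrapping while head = 0xffffffff, then last - head - 1 = 2, which is
 * < 64, so the window is valid even across the 2^32 boundary.
 */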

/**
 * objpool_pop() - allocate an object from objpool
 * @pool: object pool
 *
 * return value: object ptr or NULL if failed
 */
static inline void *objpool_pop(struct objpool_head *pool)
{
	void *obj = NULL;
	unsigned long flags;
	int start, cpu;

	/* disable local irq to avoid preemption & interruption */
	raw_local_irq_save(flags);

	start = raw_smp_processor_id();
	for_each_possible_cpu_wrap(cpu, start) {
		obj = __objpool_try_get_slot(pool, cpu);
		if (obj)
			break;
	}
	raw_local_irq_restore(flags);

	return obj;
}

/* adding object to slot, abort if the slot was already full */
static inline int
__objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu)
{
	struct objpool_slot *slot = pool->cpu_slots[cpu];
	uint32_t head, tail;

	/* loading tail and head as a local snapshot, tail first */
	tail = READ_ONCE(slot->tail);

	do {
		head = READ_ONCE(slot->head);
		/* fault caught: something must be wrong */
		WARN_ON_ONCE(tail - head > pool->nr_objs);
	} while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1));

	/* now the tail position is reserved for the given obj */
	WRITE_ONCE(slot->entries[tail & slot->mask], obj);
	/* update sequence to make this obj available for pop() */
	smp_store_release(&slot->last, tail + 1);

	return 0;
}


/**
 * objpool_push() - reclaim the object and return it to objpool
 * @obj: object ptr to be pushed to objpool
 * @pool: object pool
 *
 * return: 0 or error code (it fails only when the user pushes the
 * same object multiple times or pushes foreign "objects" into the
 * objpool)
 */
static inline int objpool_push(void *obj, struct objpool_head *pool)
{
	unsigned long flags;
	int rc;

	/* disable local irq to avoid preemption & interruption */
	raw_local_irq_save(flags);
	rc = __objpool_try_add_slot(obj, pool, raw_smp_processor_id());
	raw_local_irq_restore(flags);

	return rc;
}

/**
 * objpool_drop() - discard the object and deref objpool
 * @obj: object ptr to be discarded
 * @pool: object pool
 *
 * return: 0 if objpool was released; -EAGAIN if there are still
 * outstanding objects
 *
 * objpool_drop is normally used to release outstanding objects after
 * objpool cleanup (objpool_fini). Consider this example: a kretprobe
 * is unregistered and objpool_fini() is called to release all
 * remaining objects, but some objects are still in use by unfinished
 * kretprobes (e.g. in a blockable function such as sys_accept). The
 * whole objpool is released only when the last outstanding object is
 * dropped with objpool_drop().
 */
int objpool_drop(void *obj, struct objpool_head *pool);
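
/*
 * Illustrative sketch of the push-vs-drop decision (my_pool and the
 * "pool_dead" flag are hypothetical; how a user tracks pool shutdown
 * is entirely up to the caller):
 *
 *	static void my_obj_done(void *obj)
 *	{
 *		if (!READ_ONCE(pool_dead))
 *			objpool_push(obj, &my_pool);	// pool live: recycle
 *		else
 *			objpool_drop(obj, &my_pool);	// pool finalized: drop
 *	}
 */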

/**
 * objpool_free() - forcibly release the objpool (all objects are freed)
 * @pool: object pool to be released
 */
void objpool_free(struct objpool_head *pool);

/**
 * objpool_fini() - deref object pool (also releasing unused objects)
 * @pool: object pool to be dereferenced
 *
 * objpool_fini() tries to release all remaining free objects and then
 * drops an extra reference on the objpool. If all objects have already
 * been returned to the objpool (so-called synchronous use cases), the
 * objpool itself is freed as well. But if there are still outstanding
 * objects (so-called asynchronous use cases, such as a blockable
 * kretprobe), the objpool won't be released until all the outstanding
 * objects are dropped, and the caller must ensure there are no
 * concurrent objpool_push() calls in flight. Normally RCU is required
 * to make sure all ongoing objpool_push() calls have finished before
 * objpool_fini() is invoked, as test_objpool, kretprobe and rethook do.
 */
void objpool_fini(struct objpool_head *pool);
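
/*
 * Teardown ordering sketch (illustrative only; assumes producers run
 * under rcu_read_lock() as kretprobe/rethook do, and that my_pool and
 * pool_dead come from the hypothetical sketches above):
 *
 *	static void my_pool_shutdown(void)
 *	{
 *		// stop new users from popping/pushing
 *		WRITE_ONCE(pool_dead, true);
 *
 *		// wait for in-flight objpool_push()/objpool_pop() callers
 *		// to leave their RCU read-side critical sections
 *		synchronize_rcu();
 *
 *		// now safe: frees the pool immediately if no objects are
 *		// outstanding, otherwise the last objpool_drop() frees it
 *		objpool_fini(&my_pool);
 *	}
 */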

#endif /* _LINUX_OBJPOOL_H */