/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Resilient Queued Spin Lock
 *
 * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
 *
 * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com>
 */
#ifndef __ASM_GENERIC_RQSPINLOCK_H
#define __ASM_GENERIC_RQSPINLOCK_H

#include <linux/types.h>
#include <vdso/time64.h>
#include <linux/percpu.h>
#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm/qspinlock.h>
#endif

struct rqspinlock {
	union {
		atomic_t val;
		u32 locked;
	};
};

/* Even though this is the same as struct rqspinlock, we need to emit a
 * distinct type in BTF for BPF programs.
 */
struct bpf_res_spin_lock {
	u32 val;
};

struct qspinlock;
#ifdef CONFIG_QUEUED_SPINLOCKS
typedef struct qspinlock rqspinlock_t;
#else
typedef struct rqspinlock rqspinlock_t;
#endif

extern int resilient_tas_spin_lock(rqspinlock_t *lock);
#ifdef CONFIG_QUEUED_SPINLOCKS
extern int resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val);
#endif

#ifndef resilient_virt_spin_lock_enabled
static __always_inline bool resilient_virt_spin_lock_enabled(void)
{
	return false;
}
#endif

#ifndef resilient_virt_spin_lock
static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock)
{
	return 0;
}
#endif

/*
 * Default timeout for waiting loops is 0.25 seconds
 */
#define RES_DEF_TIMEOUT (NSEC_PER_SEC / 4)

/*
 * Choose 31 as it makes rqspinlock_held cacheline-aligned: with 8-byte
 * pointers, the int cnt plus padding and 31 entries add up to 256 bytes,
 * a whole number of 64-byte cache lines.
 */
#define RES_NR_HELD 31

struct rqspinlock_held {
	int cnt;
	void *locks[RES_NR_HELD];
};

DECLARE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);

static __always_inline void grab_held_lock_entry(void *lock)
{
	int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt);

	if (unlikely(cnt > RES_NR_HELD)) {
		/* Still keep the inc so we decrement later. */
		return;
	}

	/*
	 * Implied compiler barrier in per-CPU operations; otherwise we can have
	 * the compiler reorder inc with write to table, allowing interrupts to
	 * overwrite and erase our write to the table (as on interrupt exit it
	 * will be reset to NULL).
	 *
	 * It is fine for cnt inc to be reordered wrt remote readers though,
	 * they won't observe our entry until the cnt update is visible, that's
	 * all.
	 */
	this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock);
}
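
/*
 * Illustrative sketch (not part of this header's API): the table populated
 * above is what lets the slow path diagnose deadlocks. For instance, an AA
 * deadlock check can scan the current CPU's held-locks table for the lock it
 * is about to wait on. A minimal, hypothetical version of such a scan could
 * look like:
 *
 *	static bool example_lock_held_on_this_cpu(void *lock)
 *	{
 *		struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
 *		int i, cnt = min(READ_ONCE(rqh->cnt), RES_NR_HELD);
 *
 *		for (i = 0; i < cnt; i++) {
 *			if (rqh->locks[i] == lock)
 *				return true;
 *		}
 *		return false;
 *	}
 *
 * The real detection logic lives in the slow path implementations declared
 * above; this sketch only shows how the recorded entries are meant to be
 * consumed.
 */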

/*
 * We simply don't support out-of-order unlocks, and keep the logic simple here.
 * The verifier prevents BPF programs from unlocking out-of-order, and the same
 * holds for in-kernel users.
 *
 * It is possible to run into misdetection scenarios of AA deadlocks on the same
 * CPU, and missed ABBA deadlocks on remote CPUs, if this function pops entries
 * out of order (due to a lock A, lock B, unlock A, unlock B pattern). The
 * correct logic to preserve the right entries in the table would be to walk the
 * array of held locks and swap and clear out-of-order entries, but that's too
 * complicated and we don't have a compelling use case for out-of-order
 * unlocking.
 */
static __always_inline void release_held_lock_entry(void)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

	if (unlikely(rqh->cnt > RES_NR_HELD))
		goto dec;
	WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
dec:
	/*
	 * Reordering of the clearing above with the inc and its write in
	 * grab_held_lock_entry that came before us (in the same acquisition
	 * attempt) is ok, we either see a valid entry or NULL when it's
	 * visible.
	 *
	 * But this helper is invoked when we unwind upon failing to acquire the
	 * lock. Unlike the unlock path which constitutes a release store after
	 * we clear the entry, we need to emit a write barrier here. Otherwise,
	 * we may have a situation as follows:
	 *
	 * <error> for lock B
	 * release_held_lock_entry
	 *
	 * try_cmpxchg_acquire for lock A
	 * grab_held_lock_entry
	 *
	 * Lack of any ordering means reordering may occur such that dec, inc
	 * are done before the entry is overwritten. This permits a remote lock
	 * holder of lock B (which this CPU failed to acquire) to now observe it
	 * as being attempted on this CPU, and may lead to misdetection (if this
	 * CPU holds a lock it is attempting to acquire, leading to false ABBA
	 * diagnosis).
	 *
	 * In case of unlock, we will always do a release on the lock word after
	 * releasing the entry, ensuring that other CPUs cannot hold the lock
	 * (and make conclusions about deadlocks) until the entry has been
	 * cleared on the local CPU, preventing any anomalies. Reordering is
	 * still possible there, but a remote CPU cannot observe a lock in our
	 * table which it is already holding, since visibility entails our
	 * release store for the said lock has not retired.
	 *
	 * In theory we don't have a problem if the dec and WRITE_ONCE above get
	 * reordered with each other, we either notice an empty NULL entry on
	 * top (if dec succeeds WRITE_ONCE), or a potentially stale entry which
	 * cannot be observed (if dec precedes WRITE_ONCE).
	 *
	 * Emit the write barrier _before_ the dec, this permits dec-inc
	 * reordering but that is harmless as we'd have the new entry set to
	 * NULL already, i.e. they cannot precede the NULL store above.
	 */
	smp_wmb();
	this_cpu_dec(rqspinlock_held_locks.cnt);
}

#ifdef CONFIG_QUEUED_SPINLOCKS

/**
 * res_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * Return:
 * * 0		- Lock was acquired successfully.
 * * -EDEADLK	- Lock acquisition failed because of AA/ABBA deadlock.
 * * -ETIMEDOUT - Lock acquisition failed because of timeout.
 */
static __always_inline int res_spin_lock(rqspinlock_t *lock)
{
	int val = 0;

	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) {
		grab_held_lock_entry(lock);
		return 0;
	}
	return resilient_queued_spin_lock_slowpath(lock, val);
}

#else

#define res_spin_lock(lock) resilient_tas_spin_lock(lock)

#endif /* CONFIG_QUEUED_SPINLOCKS */

static __always_inline void res_spin_unlock(rqspinlock_t *lock)
{
	struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

	if (unlikely(rqh->cnt > RES_NR_HELD))
		goto unlock;
	WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
unlock:
	/*
	 * Release barrier, ensures correct ordering. See release_held_lock_entry
	 * for details. Perform a release store instead of queued_spin_unlock,
	 * since we use this function for the test-and-set fallback as well. When
	 * we have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword.
	 *
	 * Like release_held_lock_entry, we can do the release before the dec.
	 * We simply care about not seeing the 'lock' in our table from a remote
	 * CPU once the lock has been released, which doesn't rely on the dec.
	 *
	 * Unlike smp_wmb(), release is not a two-way fence, hence it is
	 * possible for an inc to move up and reorder with our clearing of the
	 * entry. This isn't a problem however, as for a misdiagnosis of ABBA,
	 * the remote CPU needs to hold this lock, which won't be released until
	 * the store below is done, which would ensure the entry is overwritten
	 * to NULL, etc.
	 */
	smp_store_release(&lock->locked, 0);
	this_cpu_dec(rqspinlock_held_locks.cnt);
}

#ifdef CONFIG_QUEUED_SPINLOCKS
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; })
#else
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t){0}; })
#endif
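
/*
 * Example (illustrative only, using a hypothetical embedding structure): a
 * resilient lock is embedded like any other spinlock and initialized with
 * raw_res_spin_lock_init() before first use:
 *
 *	struct example_bucket {
 *		rqspinlock_t lock;
 *	};
 *
 *	static void example_bucket_init(struct example_bucket *b)
 *	{
 *		raw_res_spin_lock_init(&b->lock);
 *	}
 */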

#define raw_res_spin_lock(lock)                    \
	({                                         \
		int __ret;                         \
		preempt_disable();                 \
		__ret = res_spin_lock(lock);       \
		if (__ret)                         \
			preempt_enable();          \
		__ret;                             \
	})

#define raw_res_spin_unlock(lock) ({ res_spin_unlock(lock); preempt_enable(); })

#define raw_res_spin_lock_irqsave(lock, flags)    \
	({                                        \
		int __ret;                        \
		local_irq_save(flags);            \
		__ret = raw_res_spin_lock(lock);  \
		if (__ret)                        \
			local_irq_restore(flags); \
		__ret;                            \
	})

#define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); })
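
/*
 * Example usage (illustrative only): unlike a plain spinlock, acquisition can
 * fail with -EDEADLK or -ETIMEDOUT, so callers must check the return value and
 * only unlock when the lock was actually taken:
 *
 *	static int example_critical_section(rqspinlock_t *lock)
 *	{
 *		unsigned long flags;
 *		int ret;
 *
 *		ret = raw_res_spin_lock_irqsave(lock, flags);
 *		if (ret)
 *			return ret;	// lock not held, do not unlock
 *
 *		// ... critical section ...
 *
 *		raw_res_spin_unlock_irqrestore(lock, flags);
 *		return 0;
 *	}
 */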

#endif /* __ASM_GENERIC_RQSPINLOCK_H */