// SPDX-License-Identifier: GPL-2.0-only
#include <linux/atomic.h>
#include <linux/percpu.h>
#include <linux/wait.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/debug.h>
#include <linux/errno.h>
#include <trace/events/lock.h>

int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
                        const char *name, struct lock_class_key *key)
{
        sem->read_count = alloc_percpu(int);
        if (unlikely(!sem->read_count))
                return -ENOMEM;

        rcu_sync_init(&sem->rss);
        rcuwait_init(&sem->writer);
        init_waitqueue_head(&sem->waiters);
        atomic_set(&sem->block, 0);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        debug_check_no_locks_freed((void *)sem, sizeof(*sem));
        lockdep_init_map(&sem->dep_map, name, key, 0);
#endif
        return 0;
}
EXPORT_SYMBOL_GPL(__percpu_init_rwsem);

void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
{
        /*
         * XXX: temporary kludge. The error path in alloc_super()
         * assumes that percpu_free_rwsem() is safe after kzalloc().
         */
        if (!sem->read_count)
                return;

        rcu_sync_dtor(&sem->rss);
        free_percpu(sem->read_count);
        sem->read_count = NULL; /* catch use after free bugs */
}
EXPORT_SYMBOL_GPL(percpu_free_rwsem);

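/*
 * Minimal usage sketch (illustrative only; "struct foo_dev" and the two
 * helpers below are hypothetical names, not part of this file): a caller
 * embedding the semaphore in its own object pairs percpu_init_rwsem()
 * with percpu_free_rwsem().
 */
struct foo_dev {
        struct percpu_rw_semaphore rwsem;
};

static inline int foo_dev_init(struct foo_dev *d)
{
        /* Allocates the per-CPU read_count; returns -ENOMEM on failure. */
        return percpu_init_rwsem(&d->rwsem);
}

static inline void foo_dev_release(struct foo_dev *d)
{
        /* Also safe if the object was zero-initialized and never set up. */
        percpu_free_rwsem(&d->rwsem);
}
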
static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
        this_cpu_inc(*sem->read_count);

        /*
         * Due to having preemption disabled the decrement happens on
         * the same CPU as the increment, avoiding the
         * increment-on-one-CPU-and-decrement-on-another problem.
         *
         * If the reader misses the writer's assignment of sem->block, then the
         * writer is guaranteed to see the reader's increment.
         *
         * Conversely, any readers that increment their sem->read_count after
         * the writer looks are guaranteed to see the sem->block value, which
         * in turn means that they are guaranteed to immediately decrement
         * their sem->read_count, so that it doesn't matter that the writer
         * missed them.
         */

        smp_mb(); /* A matches D */

        /*
         * If !sem->block the critical section starts here, matched by the
         * release in percpu_up_write().
         */
        if (likely(!atomic_read_acquire(&sem->block)))
                return true;

        this_cpu_dec(*sem->read_count);

        /* Prod writer to re-evaluate readers_active_check() */
        rcuwait_wake_up(&sem->writer);

        return false;
}

static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem)
{
        if (atomic_read(&sem->block))
                return false;

        return atomic_xchg(&sem->block, 1) == 0;
}

static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader)
{
        if (reader) {
                bool ret;

                preempt_disable();
                ret = __percpu_down_read_trylock(sem);
                preempt_enable();

                return ret;
        }
        return __percpu_down_write_trylock(sem);
}

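/*
 * A sketch of a non-blocking reader built on the trylock above
 * (hypothetical caller; percpu_down_read_trylock() and percpu_up_read()
 * are the public wrappers from <linux/percpu-rwsem.h>):
 */
static inline bool foo_try_read_work(struct percpu_rw_semaphore *sem)
{
        if (!percpu_down_read_trylock(sem))
                return false;   /* a writer holds or is acquiring the lock */

        /* ... read-side critical section ... */

        percpu_up_read(sem);
        return true;
}
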
/*
 * The return value of wait_queue_entry::func means:
 *
 *  <0 - error, wakeup is terminated and the error is returned
 *   0 - no wakeup, a next waiter is tried
 *  >0 - woken, if EXCLUSIVE, counted towards @nr_exclusive.
 *
 * We use EXCLUSIVE for both readers and writers to preserve FIFO order,
 * and play games with the return value to allow waking multiple readers.
 *
 * Specifically, we wake readers until we've woken a single writer, or until a
 * trylock fails.
 */
static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
                                      unsigned int mode, int wake_flags,
                                      void *key)
{
        bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
        struct percpu_rw_semaphore *sem = key;
        struct task_struct *p;

        /* concurrent against percpu_down_write(), can get stolen */
        if (!__percpu_rwsem_trylock(sem, reader))
                return 1;

        p = get_task_struct(wq_entry->private);
        list_del_init(&wq_entry->entry);
        smp_store_release(&wq_entry->private, NULL);

        wake_up_process(p);
        put_task_struct(p);

        return !reader; /* wake (readers until) 1 writer */
}

static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader,
                              bool freeze)
{
        DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);
        bool wait;

        spin_lock_irq(&sem->waiters.lock);
        /*
         * Serialize against the wakeup in percpu_up_write(); if we fail
         * the trylock, the wakeup must see us on the list.
         */
        wait = !__percpu_rwsem_trylock(sem, reader);
        if (wait) {
                wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM;
                __add_wait_queue_entry_tail(&sem->waiters, &wq_entry);
        }
        spin_unlock_irq(&sem->waiters.lock);

        while (wait) {
                set_current_state(TASK_UNINTERRUPTIBLE |
                                  (freeze ? TASK_FREEZABLE : 0));
                if (!smp_load_acquire(&wq_entry.private))
                        break;
                schedule();
        }
        __set_current_state(TASK_RUNNING);
}

bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try,
                                bool freeze)
{
        if (__percpu_down_read_trylock(sem))
                return true;

        if (try)
                return false;

        trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ);
        preempt_enable();
        percpu_rwsem_wait(sem, /* .reader = */ true, freeze);
        preempt_disable();
        trace_contention_end(sem, 0);

        return true;
}
EXPORT_SYMBOL_GPL(__percpu_down_read);

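/*
 * Reader-side usage sketch (hypothetical caller): the public wrappers
 * percpu_down_read()/percpu_up_read() from <linux/percpu-rwsem.h> take the
 * per-CPU fast path when no writer is around and fall back to
 * __percpu_down_read() above otherwise.  May sleep, so process context only.
 */
static inline void foo_read_side(struct percpu_rw_semaphore *sem)
{
        percpu_down_read(sem);
        /* ... read-side critical section; the writer is excluded here ... */
        percpu_up_read(sem);
}
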
#define per_cpu_sum(var)						\
({									\
	TYPEOF_UNQUAL(var) __sum = 0;					\
	int cpu;							\
	compiletime_assert_atomic_type(__sum);				\
	for_each_possible_cpu(cpu)					\
		__sum += per_cpu(var, cpu);				\
	__sum;								\
})

bool percpu_is_read_locked(struct percpu_rw_semaphore *sem)
{
        return per_cpu_sum(*sem->read_count) != 0 && !atomic_read(&sem->block);
}
EXPORT_SYMBOL_GPL(percpu_is_read_locked);

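/*
 * A small debugging sketch (hypothetical helper, not part of this file):
 * since the per-CPU counts are only meaningful as a sum, a caller wanting a
 * sanity check for "some reader currently holds this lock" could do:
 */
static inline void foo_assert_read_locked(struct percpu_rw_semaphore *sem)
{
        WARN_ON_ONCE(!percpu_is_read_locked(sem));
}
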
/*
 * Return true if the modular sum of the sem->read_count per-CPU variable is
 * zero.  If this sum is zero, then it is stable due to the fact that if any
 * newly arriving readers increment a given counter, they will immediately
 * decrement that same counter.
 *
 * Assumes sem->block is set.
 */
static bool readers_active_check(struct percpu_rw_semaphore *sem)
{
        if (per_cpu_sum(*sem->read_count) != 0)
                return false;

        /*
         * If we observed the decrement, ensure we see the entire critical
         * section.
         */

        smp_mb(); /* C matches B */

        return true;
}

void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
{
        bool contended = false;

        might_sleep();
        rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);

        /* Notify readers to take the slow path. */
        rcu_sync_enter(&sem->rss);

        /*
         * Try to set sem->block; this provides writer-writer exclusion.
         * Having sem->block set makes new readers block.
         */
        if (!__percpu_down_write_trylock(sem)) {
                trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE);
                percpu_rwsem_wait(sem, /* .reader = */ false, false);
                contended = true;
        }

        /* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */

        /*
         * If they don't see our store of sem->block, then we are guaranteed to
         * see their sem->read_count increment, and therefore will wait for
         * them.
         */

        /* Wait for all active readers to complete. */
        rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE);
        if (contended)
                trace_contention_end(sem, 0);
}
EXPORT_SYMBOL_GPL(percpu_down_write);

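/*
 * Writer-side usage sketch (hypothetical caller): percpu_down_write() may
 * block for at least an RCU grace period while readers are flushed off the
 * fast path, so it is intended for rare, slow-path state changes.
 */
static inline void foo_change_state(struct percpu_rw_semaphore *sem)
{
        percpu_down_write(sem);
        /* ... all readers are excluded; update the protected state ... */
        percpu_up_write(sem);
}
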
void percpu_up_write(struct percpu_rw_semaphore *sem)
{
        rwsem_release(&sem->dep_map, _RET_IP_);

        /*
         * Signal the writer is done, no fast path yet.
         *
         * One reason that we cannot just immediately flip to readers_fast is
         * that new readers might fail to see the results of this writer's
         * critical section.
         *
         * Therefore we force it through the slow path which guarantees an
         * acquire and thereby guarantees the critical section's consistency.
         */
        atomic_set_release(&sem->block, 0);

        /*
         * Prod any pending reader/writer to make progress.
         */
        __wake_up(&sem->waiters, TASK_NORMAL, 1, sem);

        /*
         * Once this completes (at least one RCU-sched grace period hence) the
         * reader fast path will be available again. Safe to use outside the
         * exclusive write lock because it's counting.
         */
        rcu_sync_exit(&sem->rss);
}
EXPORT_SYMBOL_GPL(percpu_up_write);