| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | 
|---|
| 2 |  | 
|---|
| 3 | #include <linux/plist.h> | 
|---|
| 4 | #include <linux/sched/signal.h> | 
|---|
| 5 |  | 
|---|
| 6 | #include "futex.h" | 
|---|
| 7 | #include "../locking/rtmutex_common.h" | 
|---|
| 8 |  | 
|---|
/*
 * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an
 * underlying rtmutex. The task which is about to be requeued could have
 * just woken up (timeout, signal). After the wake up the task has to
 * acquire hash bucket lock, which is held by the requeue code.  As a task
 * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking
 * and the hash bucket lock blocking would collide and corrupt state.
 *
 * On !PREEMPT_RT this is not a problem and everything could be serialized
 * on hash bucket lock, but aside of having the benefit of common code,
 * this allows to avoid doing the requeue when the task is already on the
 * way out and taking the hash bucket lock of the original uaddr1 when the
 * requeue has been completed.
 *
 * The following state transitions are valid:
 *
 * On the waiter side:
 *   Q_REQUEUE_PI_NONE		-> Q_REQUEUE_PI_IGNORE
 *   Q_REQUEUE_PI_IN_PROGRESS	-> Q_REQUEUE_PI_WAIT
 *
 * On the requeue side:
 *   Q_REQUEUE_PI_NONE		-> Q_REQUEUE_PI_IN_PROGRESS
 *   Q_REQUEUE_PI_IN_PROGRESS	-> Q_REQUEUE_PI_DONE/LOCKED
 *   Q_REQUEUE_PI_IN_PROGRESS	-> Q_REQUEUE_PI_NONE (requeue failed)
 *   Q_REQUEUE_PI_WAIT		-> Q_REQUEUE_PI_DONE/LOCKED
 *   Q_REQUEUE_PI_WAIT		-> Q_REQUEUE_PI_IGNORE (requeue failed)
 *
 * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this
 * signals that the waiter is already on the way out. It also means that
 * the waiter is still on the 'wait' futex, i.e. uaddr1.
 *
 * The waiter side signals early wakeup to the requeue side either through
 * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending
 * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately
 * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT,
 * which means the wakeup is interleaving with a requeue in progress it has
 * to wait for the requeue side to change the state. Either to DONE/LOCKED
 * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex
 * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by
 * the requeue side when the requeue attempt failed via deadlock detection
 * and therefore the waiter q is still on the uaddr1 futex.
 *
 * The numeric order of the states is relied upon by the code in this file:
 * Q_REQUEUE_PI_LOCKED must be Q_REQUEUE_PI_DONE + 1 (the completion code
 * computes "Q_REQUEUE_PI_DONE + locked"), and both must compare greater
 * than every other state (the waiter tests "state >= Q_REQUEUE_PI_DONE").
 * Do not reorder the enumerators.
 */
enum {
	Q_REQUEUE_PI_NONE		=  0,
	Q_REQUEUE_PI_IGNORE,
	Q_REQUEUE_PI_IN_PROGRESS,
	Q_REQUEUE_PI_WAIT,
	Q_REQUEUE_PI_DONE,
	Q_REQUEUE_PI_LOCKED,
};
|---|
| 59 |  | 
|---|
| 60 | const struct futex_q futex_q_init = { | 
|---|
| 61 | /* list gets initialized in futex_queue()*/ | 
|---|
| 62 | .wake		= futex_wake_mark, | 
|---|
| 63 | .key		= FUTEX_KEY_INIT, | 
|---|
| 64 | .bitset		= FUTEX_BITSET_MATCH_ANY, | 
|---|
| 65 | .requeue_state	= ATOMIC_INIT(Q_REQUEUE_PI_NONE), | 
|---|
| 66 | }; | 
|---|
| 67 |  | 
|---|
| 68 | /** | 
|---|
| 69 | * requeue_futex() - Requeue a futex_q from one hb to another | 
|---|
| 70 | * @q:		the futex_q to requeue | 
|---|
| 71 | * @hb1:	the source hash_bucket | 
|---|
| 72 | * @hb2:	the target hash_bucket | 
|---|
| 73 | * @key2:	the new key for the requeued futex_q | 
|---|
| 74 | */ | 
|---|
| 75 | static inline | 
|---|
| 76 | void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, | 
|---|
| 77 | struct futex_hash_bucket *hb2, union futex_key *key2) | 
|---|
| 78 | { | 
|---|
| 79 |  | 
|---|
| 80 | /* | 
|---|
| 81 | * If key1 and key2 hash to the same bucket, no need to | 
|---|
| 82 | * requeue. | 
|---|
| 83 | */ | 
|---|
| 84 | if (likely(&hb1->chain != &hb2->chain)) { | 
|---|
| 85 | plist_del(node: &q->list, head: &hb1->chain); | 
|---|
| 86 | futex_hb_waiters_dec(hb: hb1); | 
|---|
| 87 | futex_hb_waiters_inc(hb: hb2); | 
|---|
| 88 | plist_add(node: &q->list, head: &hb2->chain); | 
|---|
| 89 | q->lock_ptr = &hb2->lock; | 
|---|
| 90 | /* | 
|---|
| 91 | * hb1 and hb2 belong to the same futex_hash_bucket_private | 
|---|
| 92 | * because if we managed get a reference on hb1 then it can't be | 
|---|
| 93 | * replaced. Therefore we avoid put(hb1)+get(hb2) here. | 
|---|
| 94 | */ | 
|---|
| 95 | } | 
|---|
| 96 | q->key = *key2; | 
|---|
| 97 | } | 
|---|
| 98 |  | 
|---|
| 99 | static inline bool futex_requeue_pi_prepare(struct futex_q *q, | 
|---|
| 100 | struct futex_pi_state *pi_state) | 
|---|
| 101 | { | 
|---|
| 102 | int old, new; | 
|---|
| 103 |  | 
|---|
| 104 | /* | 
|---|
| 105 | * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has | 
|---|
| 106 | * already set Q_REQUEUE_PI_IGNORE to signal that requeue should | 
|---|
| 107 | * ignore the waiter. | 
|---|
| 108 | */ | 
|---|
| 109 | old = atomic_read_acquire(v: &q->requeue_state); | 
|---|
| 110 | do { | 
|---|
| 111 | if (old == Q_REQUEUE_PI_IGNORE) | 
|---|
| 112 | return false; | 
|---|
| 113 |  | 
|---|
| 114 | /* | 
|---|
| 115 | * futex_proxy_trylock_atomic() might have set it to | 
|---|
| 116 | * IN_PROGRESS and a interleaved early wake to WAIT. | 
|---|
| 117 | * | 
|---|
| 118 | * It was considered to have an extra state for that | 
|---|
| 119 | * trylock, but that would just add more conditionals | 
|---|
| 120 | * all over the place for a dubious value. | 
|---|
| 121 | */ | 
|---|
| 122 | if (old != Q_REQUEUE_PI_NONE) | 
|---|
| 123 | break; | 
|---|
| 124 |  | 
|---|
| 125 | new = Q_REQUEUE_PI_IN_PROGRESS; | 
|---|
| 126 | } while (!atomic_try_cmpxchg(v: &q->requeue_state, old: &old, new)); | 
|---|
| 127 |  | 
|---|
| 128 | q->pi_state = pi_state; | 
|---|
| 129 | return true; | 
|---|
| 130 | } | 
|---|
| 131 |  | 
|---|
| 132 | static inline void futex_requeue_pi_complete(struct futex_q *q, int locked) | 
|---|
| 133 | { | 
|---|
| 134 | int old, new; | 
|---|
| 135 |  | 
|---|
| 136 | old = atomic_read_acquire(v: &q->requeue_state); | 
|---|
| 137 | do { | 
|---|
| 138 | if (old == Q_REQUEUE_PI_IGNORE) | 
|---|
| 139 | return; | 
|---|
| 140 |  | 
|---|
| 141 | if (locked >= 0) { | 
|---|
| 142 | /* Requeue succeeded. Set DONE or LOCKED */ | 
|---|
| 143 | WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS && | 
|---|
| 144 | old != Q_REQUEUE_PI_WAIT); | 
|---|
| 145 | new = Q_REQUEUE_PI_DONE + locked; | 
|---|
| 146 | } else if (old == Q_REQUEUE_PI_IN_PROGRESS) { | 
|---|
| 147 | /* Deadlock, no early wakeup interleave */ | 
|---|
| 148 | new = Q_REQUEUE_PI_NONE; | 
|---|
| 149 | } else { | 
|---|
| 150 | /* Deadlock, early wakeup interleave. */ | 
|---|
| 151 | WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT); | 
|---|
| 152 | new = Q_REQUEUE_PI_IGNORE; | 
|---|
| 153 | } | 
|---|
| 154 | } while (!atomic_try_cmpxchg(v: &q->requeue_state, old: &old, new)); | 
|---|
| 155 |  | 
|---|
| 156 | #ifdef CONFIG_PREEMPT_RT | 
|---|
| 157 | /* If the waiter interleaved with the requeue let it know */ | 
|---|
| 158 | if (unlikely(old == Q_REQUEUE_PI_WAIT)) | 
|---|
| 159 | rcuwait_wake_up(&q->requeue_wait); | 
|---|
| 160 | #endif | 
|---|
| 161 | } | 
|---|
| 162 |  | 
|---|
| 163 | static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q) | 
|---|
| 164 | { | 
|---|
| 165 | int old, new; | 
|---|
| 166 |  | 
|---|
| 167 | old = atomic_read_acquire(v: &q->requeue_state); | 
|---|
| 168 | do { | 
|---|
| 169 | /* Is requeue done already? */ | 
|---|
| 170 | if (old >= Q_REQUEUE_PI_DONE) | 
|---|
| 171 | return old; | 
|---|
| 172 |  | 
|---|
| 173 | /* | 
|---|
| 174 | * If not done, then tell the requeue code to either ignore | 
|---|
| 175 | * the waiter or to wake it up once the requeue is done. | 
|---|
| 176 | */ | 
|---|
| 177 | new = Q_REQUEUE_PI_WAIT; | 
|---|
| 178 | if (old == Q_REQUEUE_PI_NONE) | 
|---|
| 179 | new = Q_REQUEUE_PI_IGNORE; | 
|---|
| 180 | } while (!atomic_try_cmpxchg(v: &q->requeue_state, old: &old, new)); | 
|---|
| 181 |  | 
|---|
| 182 | /* If the requeue was in progress, wait for it to complete */ | 
|---|
| 183 | if (old == Q_REQUEUE_PI_IN_PROGRESS) { | 
|---|
| 184 | #ifdef CONFIG_PREEMPT_RT | 
|---|
| 185 | rcuwait_wait_event(&q->requeue_wait, | 
|---|
| 186 | atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT, | 
|---|
| 187 | TASK_UNINTERRUPTIBLE); | 
|---|
| 188 | #else | 
|---|
| 189 | (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT); | 
|---|
| 190 | #endif | 
|---|
| 191 | } | 
|---|
| 192 |  | 
|---|
| 193 | /* | 
|---|
| 194 | * Requeue is now either prohibited or complete. Reread state | 
|---|
| 195 | * because during the wait above it might have changed. Nothing | 
|---|
| 196 | * will modify q->requeue_state after this point. | 
|---|
| 197 | */ | 
|---|
| 198 | return atomic_read(v: &q->requeue_state); | 
|---|
| 199 | } | 
|---|
| 200 |  | 
|---|
| 201 | /** | 
|---|
| 202 | * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue | 
|---|
| 203 | * @q:		the futex_q | 
|---|
| 204 | * @key:	the key of the requeue target futex | 
|---|
| 205 | * @hb:		the hash_bucket of the requeue target futex | 
|---|
| 206 | * | 
|---|
| 207 | * During futex_requeue, with requeue_pi=1, it is possible to acquire the | 
|---|
| 208 | * target futex if it is uncontended or via a lock steal. | 
|---|
| 209 | * | 
|---|
| 210 | * 1) Set @q::key to the requeue target futex key so the waiter can detect | 
|---|
| 211 | *    the wakeup on the right futex. | 
|---|
| 212 | * | 
|---|
| 213 | * 2) Dequeue @q from the hash bucket. | 
|---|
| 214 | * | 
|---|
| 215 | * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock | 
|---|
| 216 | *    acquisition. | 
|---|
| 217 | * | 
|---|
| 218 | * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that | 
|---|
| 219 | *    the waiter has to fixup the pi state. | 
|---|
| 220 | * | 
|---|
| 221 | * 5) Complete the requeue state so the waiter can make progress. After | 
|---|
| 222 | *    this point the waiter task can return from the syscall immediately in | 
|---|
| 223 | *    case that the pi state does not have to be fixed up. | 
|---|
| 224 | * | 
|---|
| 225 | * 6) Wake the waiter task. | 
|---|
| 226 | * | 
|---|
| 227 | * Must be called with both q->lock_ptr and hb->lock held. | 
|---|
| 228 | */ | 
|---|
| 229 | static inline | 
|---|
| 230 | void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, | 
|---|
| 231 | struct futex_hash_bucket *hb) | 
|---|
| 232 | { | 
|---|
| 233 | struct task_struct *task; | 
|---|
| 234 |  | 
|---|
| 235 | q->key = *key; | 
|---|
| 236 | __futex_unqueue(q); | 
|---|
| 237 |  | 
|---|
| 238 | WARN_ON(!q->rt_waiter); | 
|---|
| 239 | q->rt_waiter = NULL; | 
|---|
| 240 | /* | 
|---|
| 241 | * Acquire a reference for the waiter to ensure valid | 
|---|
| 242 | * futex_q::lock_ptr. | 
|---|
| 243 | */ | 
|---|
| 244 | futex_hash_get(hb); | 
|---|
| 245 | q->drop_hb_ref = true; | 
|---|
| 246 | q->lock_ptr = &hb->lock; | 
|---|
| 247 | task = READ_ONCE(q->task); | 
|---|
| 248 |  | 
|---|
| 249 | /* Signal locked state to the waiter */ | 
|---|
| 250 | futex_requeue_pi_complete(q, locked: 1); | 
|---|
| 251 | wake_up_state(tsk: task, TASK_NORMAL); | 
|---|
| 252 | } | 
|---|
| 253 |  | 
|---|
| 254 | /** | 
|---|
| 255 | * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter | 
|---|
| 256 | * @pifutex:		the user address of the to futex | 
|---|
| 257 | * @hb1:		the from futex hash bucket, must be locked by the caller | 
|---|
| 258 | * @hb2:		the to futex hash bucket, must be locked by the caller | 
|---|
| 259 | * @key1:		the from futex key | 
|---|
| 260 | * @key2:		the to futex key | 
|---|
| 261 | * @ps:			address to store the pi_state pointer | 
|---|
| 262 | * @exiting:		Pointer to store the task pointer of the owner task | 
|---|
| 263 | *			which is in the middle of exiting | 
|---|
| 264 | * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0) | 
|---|
| 265 | * | 
|---|
| 266 | * Try and get the lock on behalf of the top waiter if we can do it atomically. | 
|---|
| 267 | * Wake the top waiter if we succeed.  If the caller specified set_waiters, | 
|---|
| 268 | * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. | 
|---|
| 269 | * hb1 and hb2 must be held by the caller. | 
|---|
| 270 | * | 
|---|
| 271 | * @exiting is only set when the return value is -EBUSY. If so, this holds | 
|---|
| 272 | * a refcount on the exiting task on return and the caller needs to drop it | 
|---|
| 273 | * after waiting for the exit to complete. | 
|---|
| 274 | * | 
|---|
| 275 | * Return: | 
|---|
| 276 | *  -  0 - failed to acquire the lock atomically; | 
|---|
| 277 | *  - >0 - acquired the lock, return value is vpid of the top_waiter | 
|---|
| 278 | *  - <0 - error | 
|---|
| 279 | */ | 
|---|
| 280 | static int | 
|---|
| 281 | futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, | 
|---|
| 282 | struct futex_hash_bucket *hb2, union futex_key *key1, | 
|---|
| 283 | union futex_key *key2, struct futex_pi_state **ps, | 
|---|
| 284 | struct task_struct **exiting, int set_waiters) | 
|---|
| 285 | { | 
|---|
| 286 | struct futex_q *top_waiter; | 
|---|
| 287 | u32 curval; | 
|---|
| 288 | int ret; | 
|---|
| 289 |  | 
|---|
| 290 | if (futex_get_value_locked(dest: &curval, from: pifutex)) | 
|---|
| 291 | return -EFAULT; | 
|---|
| 292 |  | 
|---|
| 293 | if (unlikely(should_fail_futex(true))) | 
|---|
| 294 | return -EFAULT; | 
|---|
| 295 |  | 
|---|
| 296 | /* | 
|---|
| 297 | * Find the top_waiter and determine if there are additional waiters. | 
|---|
| 298 | * If the caller intends to requeue more than 1 waiter to pifutex, | 
|---|
| 299 | * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, | 
|---|
| 300 | * as we have means to handle the possible fault.  If not, don't set | 
|---|
| 301 | * the bit unnecessarily as it will force the subsequent unlock to enter | 
|---|
| 302 | * the kernel. | 
|---|
| 303 | */ | 
|---|
| 304 | top_waiter = futex_top_waiter(hb: hb1, key: key1); | 
|---|
| 305 |  | 
|---|
| 306 | /* There are no waiters, nothing for us to do. */ | 
|---|
| 307 | if (!top_waiter) | 
|---|
| 308 | return 0; | 
|---|
| 309 |  | 
|---|
| 310 | /* | 
|---|
| 311 | * Ensure that this is a waiter sitting in futex_wait_requeue_pi() | 
|---|
| 312 | * and waiting on the 'waitqueue' futex which is always !PI. | 
|---|
| 313 | */ | 
|---|
| 314 | if (!top_waiter->rt_waiter || top_waiter->pi_state) | 
|---|
| 315 | return -EINVAL; | 
|---|
| 316 |  | 
|---|
| 317 | /* Ensure we requeue to the expected futex. */ | 
|---|
| 318 | if (!futex_match(key1: top_waiter->requeue_pi_key, key2)) | 
|---|
| 319 | return -EINVAL; | 
|---|
| 320 |  | 
|---|
| 321 | /* Ensure that this does not race against an early wakeup */ | 
|---|
| 322 | if (!futex_requeue_pi_prepare(q: top_waiter, NULL)) | 
|---|
| 323 | return -EAGAIN; | 
|---|
| 324 |  | 
|---|
| 325 | /* | 
|---|
| 326 | * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit | 
|---|
| 327 | * in the contended case or if @set_waiters is true. | 
|---|
| 328 | * | 
|---|
| 329 | * In the contended case PI state is attached to the lock owner. If | 
|---|
| 330 | * the user space lock can be acquired then PI state is attached to | 
|---|
| 331 | * the new owner (@top_waiter->task) when @set_waiters is true. | 
|---|
| 332 | */ | 
|---|
| 333 | ret = futex_lock_pi_atomic(uaddr: pifutex, hb: hb2, key: key2, ps, task: top_waiter->task, | 
|---|
| 334 | exiting, set_waiters); | 
|---|
| 335 | if (ret == 1) { | 
|---|
| 336 | /* | 
|---|
| 337 | * Lock was acquired in user space and PI state was | 
|---|
| 338 | * attached to @top_waiter->task. That means state is fully | 
|---|
| 339 | * consistent and the waiter can return to user space | 
|---|
| 340 | * immediately after the wakeup. | 
|---|
| 341 | */ | 
|---|
| 342 | requeue_pi_wake_futex(q: top_waiter, key: key2, hb: hb2); | 
|---|
| 343 | } else if (ret < 0) { | 
|---|
| 344 | /* Rewind top_waiter::requeue_state */ | 
|---|
| 345 | futex_requeue_pi_complete(q: top_waiter, locked: ret); | 
|---|
| 346 | } else { | 
|---|
| 347 | /* | 
|---|
| 348 | * futex_lock_pi_atomic() did not acquire the user space | 
|---|
| 349 | * futex, but managed to establish the proxy lock and pi | 
|---|
| 350 | * state. top_waiter::requeue_state cannot be fixed up here | 
|---|
| 351 | * because the waiter is not enqueued on the rtmutex | 
|---|
| 352 | * yet. This is handled at the callsite depending on the | 
|---|
| 353 | * result of rt_mutex_start_proxy_lock() which is | 
|---|
| 354 | * guaranteed to be reached with this function returning 0. | 
|---|
| 355 | */ | 
|---|
| 356 | } | 
|---|
| 357 | return ret; | 
|---|
| 358 | } | 
|---|
| 359 |  | 
|---|
| 360 | /** | 
|---|
| 361 | * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 | 
|---|
| 362 | * @uaddr1:	source futex user address | 
|---|
| 363 | * @flags1:	futex flags (FLAGS_SHARED, etc.) | 
|---|
| 364 | * @uaddr2:	target futex user address | 
|---|
| 365 | * @flags2:	futex flags (FLAGS_SHARED, etc.) | 
|---|
| 366 | * @nr_wake:	number of waiters to wake (must be 1 for requeue_pi) | 
|---|
| 367 | * @nr_requeue:	number of waiters to requeue (0-INT_MAX) | 
|---|
| 368 | * @cmpval:	@uaddr1 expected value (or %NULL) | 
|---|
| 369 | * @requeue_pi:	if we are attempting to requeue from a non-pi futex to a | 
|---|
| 370 | *		pi futex (pi to pi requeue is not supported) | 
|---|
| 371 | * | 
|---|
| 372 | * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire | 
|---|
| 373 | * uaddr2 atomically on behalf of the top waiter. | 
|---|
| 374 | * | 
|---|
| 375 | * Return: | 
|---|
| 376 | *  - >=0 - on success, the number of tasks requeued or woken; | 
|---|
| 377 | *  -  <0 - on error | 
|---|
| 378 | */ | 
|---|
| 379 | int futex_requeue(u32 __user *uaddr1, unsigned int flags1, | 
|---|
| 380 | u32 __user *uaddr2, unsigned int flags2, | 
|---|
| 381 | int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi) | 
|---|
| 382 | { | 
|---|
| 383 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; | 
|---|
| 384 | int task_count = 0, ret; | 
|---|
| 385 | struct futex_pi_state *pi_state = NULL; | 
|---|
| 386 | struct futex_q *this, *next; | 
|---|
| 387 | DEFINE_WAKE_Q(wake_q); | 
|---|
| 388 |  | 
|---|
| 389 | if (nr_wake < 0 || nr_requeue < 0) | 
|---|
| 390 | return -EINVAL; | 
|---|
| 391 |  | 
|---|
| 392 | /* | 
|---|
| 393 | * When PI not supported: return -ENOSYS if requeue_pi is true, | 
|---|
| 394 | * consequently the compiler knows requeue_pi is always false past | 
|---|
| 395 | * this point which will optimize away all the conditional code | 
|---|
| 396 | * further down. | 
|---|
| 397 | */ | 
|---|
| 398 | if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) | 
|---|
| 399 | return -ENOSYS; | 
|---|
| 400 |  | 
|---|
| 401 | if (requeue_pi) { | 
|---|
| 402 | /* | 
|---|
| 403 | * Requeue PI only works on two distinct uaddrs. This | 
|---|
| 404 | * check is only valid for private futexes. See below. | 
|---|
| 405 | */ | 
|---|
| 406 | if (uaddr1 == uaddr2) | 
|---|
| 407 | return -EINVAL; | 
|---|
| 408 |  | 
|---|
| 409 | /* | 
|---|
| 410 | * futex_requeue() allows the caller to define the number | 
|---|
| 411 | * of waiters to wake up via the @nr_wake argument. With | 
|---|
| 412 | * REQUEUE_PI, waking up more than one waiter is creating | 
|---|
| 413 | * more problems than it solves. Waking up a waiter makes | 
|---|
| 414 | * only sense if the PI futex @uaddr2 is uncontended as | 
|---|
| 415 | * this allows the requeue code to acquire the futex | 
|---|
| 416 | * @uaddr2 before waking the waiter. The waiter can then | 
|---|
| 417 | * return to user space without further action. A secondary | 
|---|
| 418 | * wakeup would just make the futex_wait_requeue_pi() | 
|---|
| 419 | * handling more complex, because that code would have to | 
|---|
| 420 | * look up pi_state and do more or less all the handling | 
|---|
| 421 | * which the requeue code has to do for the to be requeued | 
|---|
| 422 | * waiters. So restrict the number of waiters to wake to | 
|---|
| 423 | * one, and only wake it up when the PI futex is | 
|---|
| 424 | * uncontended. Otherwise requeue it and let the unlock of | 
|---|
| 425 | * the PI futex handle the wakeup. | 
|---|
| 426 | * | 
|---|
| 427 | * All REQUEUE_PI users, e.g. pthread_cond_signal() and | 
|---|
| 428 | * pthread_cond_broadcast() must use nr_wake=1. | 
|---|
| 429 | */ | 
|---|
| 430 | if (nr_wake != 1) | 
|---|
| 431 | return -EINVAL; | 
|---|
| 432 |  | 
|---|
| 433 | /* | 
|---|
| 434 | * requeue_pi requires a pi_state, try to allocate it now | 
|---|
| 435 | * without any locks in case it fails. | 
|---|
| 436 | */ | 
|---|
| 437 | if (refill_pi_state_cache()) | 
|---|
| 438 | return -ENOMEM; | 
|---|
| 439 | } | 
|---|
| 440 |  | 
|---|
| 441 | retry: | 
|---|
| 442 | ret = get_futex_key(uaddr: uaddr1, flags: flags1, key: &key1, rw: FUTEX_READ); | 
|---|
| 443 | if (unlikely(ret != 0)) | 
|---|
| 444 | return ret; | 
|---|
| 445 | ret = get_futex_key(uaddr: uaddr2, flags: flags2, key: &key2, | 
|---|
| 446 | rw: requeue_pi ? FUTEX_WRITE : FUTEX_READ); | 
|---|
| 447 | if (unlikely(ret != 0)) | 
|---|
| 448 | return ret; | 
|---|
| 449 |  | 
|---|
| 450 | /* | 
|---|
| 451 | * The check above which compares uaddrs is not sufficient for | 
|---|
| 452 | * shared futexes. We need to compare the keys: | 
|---|
| 453 | */ | 
|---|
| 454 | if (requeue_pi && futex_match(key1: &key1, key2: &key2)) | 
|---|
| 455 | return -EINVAL; | 
|---|
| 456 |  | 
|---|
| 457 | retry_private: | 
|---|
| 458 | if (1) { | 
|---|
| 459 | CLASS(hb, hb1)(key: &key1); | 
|---|
| 460 | CLASS(hb, hb2)(key: &key2); | 
|---|
| 461 |  | 
|---|
| 462 | futex_hb_waiters_inc(hb: hb2); | 
|---|
| 463 | double_lock_hb(hb1, hb2); | 
|---|
| 464 |  | 
|---|
| 465 | if (likely(cmpval != NULL)) { | 
|---|
| 466 | u32 curval; | 
|---|
| 467 |  | 
|---|
| 468 | ret = futex_get_value_locked(dest: &curval, from: uaddr1); | 
|---|
| 469 |  | 
|---|
| 470 | if (unlikely(ret)) { | 
|---|
| 471 | futex_hb_waiters_dec(hb: hb2); | 
|---|
| 472 | double_unlock_hb(hb1, hb2); | 
|---|
| 473 |  | 
|---|
| 474 | ret = get_user(curval, uaddr1); | 
|---|
| 475 | if (ret) | 
|---|
| 476 | return ret; | 
|---|
| 477 |  | 
|---|
| 478 | if (!(flags1 & FLAGS_SHARED)) | 
|---|
| 479 | goto retry_private; | 
|---|
| 480 |  | 
|---|
| 481 | goto retry; | 
|---|
| 482 | } | 
|---|
| 483 | if (curval != *cmpval) { | 
|---|
| 484 | ret = -EAGAIN; | 
|---|
| 485 | goto out_unlock; | 
|---|
| 486 | } | 
|---|
| 487 | } | 
|---|
| 488 |  | 
|---|
| 489 | if (requeue_pi) { | 
|---|
| 490 | struct task_struct *exiting = NULL; | 
|---|
| 491 |  | 
|---|
| 492 | /* | 
|---|
| 493 | * Attempt to acquire uaddr2 and wake the top waiter. If we | 
|---|
| 494 | * intend to requeue waiters, force setting the FUTEX_WAITERS | 
|---|
| 495 | * bit.  We force this here where we are able to easily handle | 
|---|
| 496 | * faults rather in the requeue loop below. | 
|---|
| 497 | * | 
|---|
| 498 | * Updates topwaiter::requeue_state if a top waiter exists. | 
|---|
| 499 | */ | 
|---|
| 500 | ret = futex_proxy_trylock_atomic(pifutex: uaddr2, hb1, hb2, key1: &key1, | 
|---|
| 501 | key2: &key2, ps: &pi_state, | 
|---|
| 502 | exiting: &exiting, set_waiters: nr_requeue); | 
|---|
| 503 |  | 
|---|
| 504 | /* | 
|---|
| 505 | * At this point the top_waiter has either taken uaddr2 or | 
|---|
| 506 | * is waiting on it. In both cases pi_state has been | 
|---|
| 507 | * established and an initial refcount on it. In case of an | 
|---|
| 508 | * error there's nothing. | 
|---|
| 509 | * | 
|---|
| 510 | * The top waiter's requeue_state is up to date: | 
|---|
| 511 | * | 
|---|
| 512 | *  - If the lock was acquired atomically (ret == 1), then | 
|---|
| 513 | *    the state is Q_REQUEUE_PI_LOCKED. | 
|---|
| 514 | * | 
|---|
| 515 | *    The top waiter has been dequeued and woken up and can | 
|---|
| 516 | *    return to user space immediately. The kernel/user | 
|---|
| 517 | *    space state is consistent. In case that there must be | 
|---|
| 518 | *    more waiters requeued the WAITERS bit in the user | 
|---|
| 519 | *    space futex is set so the top waiter task has to go | 
|---|
| 520 | *    into the syscall slowpath to unlock the futex. This | 
|---|
| 521 | *    will block until this requeue operation has been | 
|---|
| 522 | *    completed and the hash bucket locks have been | 
|---|
| 523 | *    dropped. | 
|---|
| 524 | * | 
|---|
| 525 | *  - If the trylock failed with an error (ret < 0) then | 
|---|
| 526 | *    the state is either Q_REQUEUE_PI_NONE, i.e. "nothing | 
|---|
| 527 | *    happened", or Q_REQUEUE_PI_IGNORE when there was an | 
|---|
| 528 | *    interleaved early wakeup. | 
|---|
| 529 | * | 
|---|
| 530 | *  - If the trylock did not succeed (ret == 0) then the | 
|---|
| 531 | *    state is either Q_REQUEUE_PI_IN_PROGRESS or | 
|---|
| 532 | *    Q_REQUEUE_PI_WAIT if an early wakeup interleaved. | 
|---|
| 533 | *    This will be cleaned up in the loop below, which | 
|---|
| 534 | *    cannot fail because futex_proxy_trylock_atomic() did | 
|---|
| 535 | *    the same sanity checks for requeue_pi as the loop | 
|---|
| 536 | *    below does. | 
|---|
| 537 | */ | 
|---|
| 538 | switch (ret) { | 
|---|
| 539 | case 0: | 
|---|
| 540 | /* We hold a reference on the pi state. */ | 
|---|
| 541 | break; | 
|---|
| 542 |  | 
|---|
| 543 | case 1: | 
|---|
| 544 | /* | 
|---|
| 545 | * futex_proxy_trylock_atomic() acquired the user space | 
|---|
| 546 | * futex. Adjust task_count. | 
|---|
| 547 | */ | 
|---|
| 548 | task_count++; | 
|---|
| 549 | ret = 0; | 
|---|
| 550 | break; | 
|---|
| 551 |  | 
|---|
| 552 | /* | 
|---|
| 553 | * If the above failed, then pi_state is NULL and | 
|---|
| 554 | * waiter::requeue_state is correct. | 
|---|
| 555 | */ | 
|---|
| 556 | case -EFAULT: | 
|---|
| 557 | futex_hb_waiters_dec(hb: hb2); | 
|---|
| 558 | double_unlock_hb(hb1, hb2); | 
|---|
| 559 | ret = fault_in_user_writeable(uaddr: uaddr2); | 
|---|
| 560 | if (!ret) | 
|---|
| 561 | goto retry; | 
|---|
| 562 | return ret; | 
|---|
| 563 | case -EBUSY: | 
|---|
| 564 | case -EAGAIN: | 
|---|
| 565 | /* | 
|---|
| 566 | * Two reasons for this: | 
|---|
| 567 | * - EBUSY: Owner is exiting and we just wait for the | 
|---|
| 568 | *   exit to complete. | 
|---|
| 569 | * - EAGAIN: The user space value changed. | 
|---|
| 570 | */ | 
|---|
| 571 | futex_hb_waiters_dec(hb: hb2); | 
|---|
| 572 | double_unlock_hb(hb1, hb2); | 
|---|
| 573 | /* | 
|---|
| 574 | * Handle the case where the owner is in the middle of | 
|---|
| 575 | * exiting. Wait for the exit to complete otherwise | 
|---|
| 576 | * this task might loop forever, aka. live lock. | 
|---|
| 577 | */ | 
|---|
| 578 | wait_for_owner_exiting(ret, exiting); | 
|---|
| 579 | cond_resched(); | 
|---|
| 580 | goto retry; | 
|---|
| 581 | default: | 
|---|
| 582 | goto out_unlock; | 
|---|
| 583 | } | 
|---|
| 584 | } | 
|---|
| 585 |  | 
|---|
| 586 | plist_for_each_entry_safe(this, next, &hb1->chain, list) { | 
|---|
| 587 | if (task_count - nr_wake >= nr_requeue) | 
|---|
| 588 | break; | 
|---|
| 589 |  | 
|---|
| 590 | if (!futex_match(key1: &this->key, key2: &key1)) | 
|---|
| 591 | continue; | 
|---|
| 592 |  | 
|---|
| 593 | /* | 
|---|
| 594 | * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always | 
|---|
| 595 | * be paired with each other and no other futex ops. | 
|---|
| 596 | * | 
|---|
| 597 | * We should never be requeueing a futex_q with a pi_state, | 
|---|
| 598 | * which is awaiting a futex_unlock_pi(). | 
|---|
| 599 | */ | 
|---|
| 600 | if ((requeue_pi && !this->rt_waiter) || | 
|---|
| 601 | (!requeue_pi && this->rt_waiter) || | 
|---|
| 602 | this->pi_state) { | 
|---|
| 603 | ret = -EINVAL; | 
|---|
| 604 | break; | 
|---|
| 605 | } | 
|---|
| 606 |  | 
|---|
| 607 | /* Plain futexes just wake or requeue and are done */ | 
|---|
| 608 | if (!requeue_pi) { | 
|---|
| 609 | if (++task_count <= nr_wake) | 
|---|
| 610 | this->wake(&wake_q, this); | 
|---|
| 611 | else | 
|---|
| 612 | requeue_futex(q: this, hb1, hb2, key2: &key2); | 
|---|
| 613 | continue; | 
|---|
| 614 | } | 
|---|
| 615 |  | 
|---|
| 616 | /* Ensure we requeue to the expected futex for requeue_pi. */ | 
|---|
| 617 | if (!futex_match(key1: this->requeue_pi_key, key2: &key2)) { | 
|---|
| 618 | ret = -EINVAL; | 
|---|
| 619 | break; | 
|---|
| 620 | } | 
|---|
| 621 |  | 
|---|
| 622 | /* | 
|---|
| 623 | * Requeue nr_requeue waiters and possibly one more in the case | 
|---|
| 624 | * of requeue_pi if we couldn't acquire the lock atomically. | 
|---|
| 625 | * | 
|---|
| 626 | * Prepare the waiter to take the rt_mutex. Take a refcount | 
|---|
| 627 | * on the pi_state and store the pointer in the futex_q | 
|---|
| 628 | * object of the waiter. | 
|---|
| 629 | */ | 
|---|
| 630 | get_pi_state(pi_state); | 
|---|
| 631 |  | 
|---|
| 632 | /* Don't requeue when the waiter is already on the way out. */ | 
|---|
| 633 | if (!futex_requeue_pi_prepare(q: this, pi_state)) { | 
|---|
| 634 | /* | 
|---|
| 635 | * Early woken waiter signaled that it is on the | 
|---|
| 636 | * way out. Drop the pi_state reference and try the | 
|---|
| 637 | * next waiter. @this->pi_state is still NULL. | 
|---|
| 638 | */ | 
|---|
| 639 | put_pi_state(pi_state); | 
|---|
| 640 | continue; | 
|---|
| 641 | } | 
|---|
| 642 |  | 
|---|
| 643 | ret = rt_mutex_start_proxy_lock(lock: &pi_state->pi_mutex, | 
|---|
| 644 | waiter: this->rt_waiter, | 
|---|
| 645 | task: this->task); | 
|---|
| 646 |  | 
|---|
| 647 | if (ret == 1) { | 
|---|
| 648 | /* | 
|---|
| 649 | * We got the lock. We do neither drop the refcount | 
|---|
| 650 | * on pi_state nor clear this->pi_state because the | 
|---|
| 651 | * waiter needs the pi_state for cleaning up the | 
|---|
| 652 | * user space value. It will drop the refcount | 
|---|
| 653 | * after doing so. this::requeue_state is updated | 
|---|
| 654 | * in the wakeup as well. | 
|---|
| 655 | */ | 
|---|
| 656 | requeue_pi_wake_futex(q: this, key: &key2, hb: hb2); | 
|---|
| 657 | task_count++; | 
|---|
| 658 | } else if (!ret) { | 
|---|
| 659 | /* Waiter is queued, move it to hb2 */ | 
|---|
| 660 | requeue_futex(q: this, hb1, hb2, key2: &key2); | 
|---|
| 661 | futex_requeue_pi_complete(q: this, locked: 0); | 
|---|
| 662 | task_count++; | 
|---|
| 663 | } else { | 
|---|
| 664 | /* | 
|---|
| 665 | * rt_mutex_start_proxy_lock() detected a potential | 
|---|
| 666 | * deadlock when we tried to queue that waiter. | 
|---|
| 667 | * Drop the pi_state reference which we took above | 
|---|
| 668 | * and remove the pointer to the state from the | 
|---|
| 669 | * waiters futex_q object. | 
|---|
| 670 | */ | 
|---|
| 671 | this->pi_state = NULL; | 
|---|
| 672 | put_pi_state(pi_state); | 
|---|
| 673 | futex_requeue_pi_complete(q: this, locked: ret); | 
|---|
| 674 | /* | 
|---|
| 675 | * We stop queueing more waiters and let user space | 
|---|
| 676 | * deal with the mess. | 
|---|
| 677 | */ | 
|---|
| 678 | break; | 
|---|
| 679 | } | 
|---|
| 680 | } | 
|---|
| 681 |  | 
|---|
| 682 | /* | 
|---|
| 683 | * We took an extra initial reference to the pi_state in | 
|---|
| 684 | * futex_proxy_trylock_atomic(). We need to drop it here again. | 
|---|
| 685 | */ | 
|---|
| 686 | put_pi_state(pi_state); | 
|---|
| 687 |  | 
|---|
| 688 | out_unlock: | 
|---|
| 689 | futex_hb_waiters_dec(hb: hb2); | 
|---|
| 690 | double_unlock_hb(hb1, hb2); | 
|---|
| 691 | } | 
|---|
| 692 | wake_up_q(head: &wake_q); | 
|---|
| 693 | return ret ? ret : task_count; | 
|---|
| 694 | } | 
|---|
| 695 |  | 
|---|
| 696 | /** | 
|---|
| 697 | * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex | 
|---|
| 698 | * @hb:		the hash_bucket futex_q was original enqueued on | 
|---|
| 699 | * @q:		the futex_q woken while waiting to be requeued | 
|---|
| 700 | * @timeout:	the timeout associated with the wait (NULL if none) | 
|---|
| 701 | * | 
|---|
| 702 | * Determine the cause for the early wakeup. | 
|---|
| 703 | * | 
|---|
| 704 | * Return: | 
|---|
| 705 | *  -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR | 
|---|
| 706 | */ | 
|---|
| 707 | static inline | 
|---|
| 708 | int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | 
|---|
| 709 | struct futex_q *q, | 
|---|
| 710 | struct hrtimer_sleeper *timeout) | 
|---|
| 711 | { | 
|---|
| 712 | int ret; | 
|---|
| 713 |  | 
|---|
| 714 | /* | 
|---|
| 715 | * With the hb lock held, we avoid races while we process the wakeup. | 
|---|
| 716 | * We only need to hold hb (and not hb2) to ensure atomicity as the | 
|---|
| 717 | * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. | 
|---|
| 718 | * It can't be requeued from uaddr2 to something else since we don't | 
|---|
| 719 | * support a PI aware source futex for requeue. | 
|---|
| 720 | */ | 
|---|
| 721 | WARN_ON_ONCE(&hb->lock != q->lock_ptr); | 
|---|
| 722 |  | 
|---|
| 723 | /* | 
|---|
| 724 | * We were woken prior to requeue by a timeout or a signal. | 
|---|
| 725 | * Unqueue the futex_q and determine which it was. | 
|---|
| 726 | */ | 
|---|
| 727 | plist_del(node: &q->list, head: &hb->chain); | 
|---|
| 728 | futex_hb_waiters_dec(hb); | 
|---|
| 729 |  | 
|---|
| 730 | /* Handle spurious wakeups gracefully */ | 
|---|
| 731 | ret = -EWOULDBLOCK; | 
|---|
| 732 | if (timeout && !timeout->task) | 
|---|
| 733 | ret = -ETIMEDOUT; | 
|---|
| 734 | else if (signal_pending(current)) | 
|---|
| 735 | ret = -ERESTARTNOINTR; | 
|---|
| 736 | return ret; | 
|---|
| 737 | } | 
|---|
| 738 |  | 
|---|
| 739 | /** | 
|---|
| 740 | * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 | 
|---|
| 741 | * @uaddr:	the futex we initially wait on (non-pi) | 
|---|
| 742 | * @flags:	futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be | 
|---|
| 743 | *		the same type, no requeueing from private to shared, etc. | 
|---|
| 744 | * @val:	the expected value of uaddr | 
|---|
| 745 | * @abs_time:	absolute timeout | 
|---|
| 746 | * @bitset:	32 bit wakeup bitset set by userspace, defaults to all | 
|---|
| 747 | * @uaddr2:	the pi futex we will take prior to returning to user-space | 
|---|
| 748 | * | 
|---|
| 749 | * The caller will wait on uaddr and will be requeued by futex_requeue() to | 
|---|
| 750 | * uaddr2 which must be PI aware and unique from uaddr.  Normal wakeup will wake | 
|---|
| 751 | * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to | 
|---|
| 752 | * userspace.  This ensures the rt_mutex maintains an owner when it has waiters; | 
|---|
| 753 | * without one, the pi logic would not know which task to boost/deboost, if | 
|---|
| 754 | * there was a need to. | 
|---|
| 755 | * | 
|---|
| 756 | * We call schedule in futex_wait_queue() when we enqueue and return there | 
|---|
| 757 | * via the following-- | 
|---|
| 758 | * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() | 
|---|
| 759 | * 2) wakeup on uaddr2 after a requeue | 
|---|
| 760 | * 3) signal | 
|---|
| 761 | * 4) timeout | 
|---|
| 762 | * | 
|---|
| 763 | * If 3, cleanup and return -ERESTARTNOINTR. | 
|---|
| 764 | * | 
|---|
| 765 | * If 2, we may then block on trying to take the rt_mutex and return via: | 
|---|
| 766 | * 5) successful lock | 
|---|
| 767 | * 6) signal | 
|---|
| 768 | * 7) timeout | 
|---|
| 769 | * 8) other lock acquisition failure | 
|---|
| 770 | * | 
|---|
| 771 | * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). | 
|---|
| 772 | * | 
|---|
| 773 | * If 4 or 7, we cleanup and return with -ETIMEDOUT. | 
|---|
| 774 | * | 
|---|
| 775 | * Return: | 
|---|
| 776 | *  -  0 - On success; | 
|---|
| 777 | *  - <0 - On error | 
|---|
| 778 | */ | 
|---|
| 779 | int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | 
|---|
| 780 | u32 val, ktime_t *abs_time, u32 bitset, | 
|---|
| 781 | u32 __user *uaddr2) | 
|---|
| 782 | { | 
|---|
| 783 | struct hrtimer_sleeper timeout, *to; | 
|---|
| 784 | struct rt_mutex_waiter rt_waiter; | 
|---|
| 785 | union futex_key key2 = FUTEX_KEY_INIT; | 
|---|
| 786 | struct futex_q q = futex_q_init; | 
|---|
| 787 | struct rt_mutex_base *pi_mutex; | 
|---|
| 788 | int res, ret; | 
|---|
| 789 |  | 
|---|
| 790 | if (!IS_ENABLED(CONFIG_FUTEX_PI)) | 
|---|
| 791 | return -ENOSYS; | 
|---|
| 792 |  | 
|---|
| 793 | if (uaddr == uaddr2) | 
|---|
| 794 | return -EINVAL; | 
|---|
| 795 |  | 
|---|
| 796 | if (!bitset) | 
|---|
| 797 | return -EINVAL; | 
|---|
| 798 |  | 
|---|
| 799 | to = futex_setup_timer(time: abs_time, timeout: &timeout, flags, | 
|---|
| 800 | current->timer_slack_ns); | 
|---|
| 801 |  | 
|---|
| 802 | /* | 
|---|
| 803 | * The waiter is allocated on our stack, manipulated by the requeue | 
|---|
| 804 | * code while we sleep on uaddr. | 
|---|
| 805 | */ | 
|---|
| 806 | rt_mutex_init_waiter(waiter: &rt_waiter); | 
|---|
| 807 |  | 
|---|
| 808 | ret = get_futex_key(uaddr: uaddr2, flags, key: &key2, rw: FUTEX_WRITE); | 
|---|
| 809 | if (unlikely(ret != 0)) | 
|---|
| 810 | goto out; | 
|---|
| 811 |  | 
|---|
| 812 | q.bitset = bitset; | 
|---|
| 813 | q.rt_waiter = &rt_waiter; | 
|---|
| 814 | q.requeue_pi_key = &key2; | 
|---|
| 815 |  | 
|---|
| 816 | /* | 
|---|
| 817 | * Prepare to wait on uaddr. On success, it holds hb->lock and q | 
|---|
| 818 | * is initialized. | 
|---|
| 819 | */ | 
|---|
| 820 | ret = futex_wait_setup(uaddr, val, flags, q: &q, key2: &key2, current); | 
|---|
| 821 | if (ret) | 
|---|
| 822 | goto out; | 
|---|
| 823 |  | 
|---|
| 824 | /* Queue the futex_q, drop the hb lock, wait for wakeup. */ | 
|---|
| 825 | futex_do_wait(q: &q, timeout: to); | 
|---|
| 826 |  | 
|---|
| 827 | switch (futex_requeue_pi_wakeup_sync(q: &q)) { | 
|---|
| 828 | case Q_REQUEUE_PI_IGNORE: | 
|---|
| 829 | { | 
|---|
| 830 | CLASS(hb, hb)(key: &q.key); | 
|---|
| 831 | /* The waiter is still on uaddr1 */ | 
|---|
| 832 | spin_lock(lock: &hb->lock); | 
|---|
| 833 | ret = handle_early_requeue_pi_wakeup(hb, q: &q, timeout: to); | 
|---|
| 834 | spin_unlock(lock: &hb->lock); | 
|---|
| 835 | } | 
|---|
| 836 | break; | 
|---|
| 837 |  | 
|---|
| 838 | case Q_REQUEUE_PI_LOCKED: | 
|---|
| 839 | /* The requeue acquired the lock */ | 
|---|
| 840 | if (q.pi_state && (q.pi_state->owner != current)) { | 
|---|
| 841 | futex_q_lockptr_lock(q: &q); | 
|---|
| 842 | ret = fixup_pi_owner(uaddr: uaddr2, q: &q, locked: true); | 
|---|
| 843 | /* | 
|---|
| 844 | * Drop the reference to the pi state which the | 
|---|
| 845 | * requeue_pi() code acquired for us. | 
|---|
| 846 | */ | 
|---|
| 847 | put_pi_state(pi_state: q.pi_state); | 
|---|
| 848 | spin_unlock(lock: q.lock_ptr); | 
|---|
| 849 | /* | 
|---|
| 850 | * Adjust the return value. It's either -EFAULT or | 
|---|
| 851 | * success (1) but the caller expects 0 for success. | 
|---|
| 852 | */ | 
|---|
| 853 | ret = ret < 0 ? ret : 0; | 
|---|
| 854 | } | 
|---|
| 855 | break; | 
|---|
| 856 |  | 
|---|
| 857 | case Q_REQUEUE_PI_DONE: | 
|---|
| 858 | /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */ | 
|---|
| 859 | pi_mutex = &q.pi_state->pi_mutex; | 
|---|
| 860 | ret = rt_mutex_wait_proxy_lock(lock: pi_mutex, to, waiter: &rt_waiter); | 
|---|
| 861 |  | 
|---|
| 862 | /* | 
|---|
| 863 | * See futex_unlock_pi()'s cleanup: comment. | 
|---|
| 864 | */ | 
|---|
| 865 | if (ret && !rt_mutex_cleanup_proxy_lock(lock: pi_mutex, waiter: &rt_waiter)) | 
|---|
| 866 | ret = 0; | 
|---|
| 867 |  | 
|---|
| 868 | futex_q_lockptr_lock(q: &q); | 
|---|
| 869 | debug_rt_mutex_free_waiter(waiter: &rt_waiter); | 
|---|
| 870 | /* | 
|---|
| 871 | * Fixup the pi_state owner and possibly acquire the lock if we | 
|---|
| 872 | * haven't already. | 
|---|
| 873 | */ | 
|---|
| 874 | res = fixup_pi_owner(uaddr: uaddr2, q: &q, locked: !ret); | 
|---|
| 875 | /* | 
|---|
| 876 | * If fixup_pi_owner() returned an error, propagate that.  If it | 
|---|
| 877 | * acquired the lock, clear -ETIMEDOUT or -EINTR. | 
|---|
| 878 | */ | 
|---|
| 879 | if (res) | 
|---|
| 880 | ret = (res < 0) ? res : 0; | 
|---|
| 881 |  | 
|---|
| 882 | futex_unqueue_pi(q: &q); | 
|---|
| 883 | spin_unlock(lock: q.lock_ptr); | 
|---|
| 884 |  | 
|---|
| 885 | if (ret == -EINTR) { | 
|---|
| 886 | /* | 
|---|
| 887 | * We've already been requeued, but cannot restart | 
|---|
| 888 | * by calling futex_lock_pi() directly. We could | 
|---|
| 889 | * restart this syscall, but it would detect that | 
|---|
| 890 | * the user space "val" changed and return | 
|---|
| 891 | * -EWOULDBLOCK.  Save the overhead of the restart | 
|---|
| 892 | * and return -EWOULDBLOCK directly. | 
|---|
| 893 | */ | 
|---|
| 894 | ret = -EWOULDBLOCK; | 
|---|
| 895 | } | 
|---|
| 896 | break; | 
|---|
| 897 | default: | 
|---|
| 898 | BUG(); | 
|---|
| 899 | } | 
|---|
| 900 | if (q.drop_hb_ref) { | 
|---|
| 901 | CLASS(hb, hb)(key: &q.key); | 
|---|
| 902 | /* Additional reference from requeue_pi_wake_futex() */ | 
|---|
| 903 | futex_hash_put(hb); | 
|---|
| 904 | } | 
|---|
| 905 |  | 
|---|
| 906 | out: | 
|---|
| 907 | if (to) { | 
|---|
| 908 | hrtimer_cancel(timer: &to->timer); | 
|---|
| 909 | destroy_hrtimer_on_stack(timer: &to->timer); | 
|---|
| 910 | } | 
|---|
| 911 | return ret; | 
|---|
| 912 | } | 
|---|
| 913 |  | 
|---|
| 914 |  | 
|---|