// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/slab.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * PI code:
 */
int refill_pi_state_cache(void)
{
        struct futex_pi_state *pi_state;

        if (likely(current->pi_state_cache))
                return 0;

        pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

        if (!pi_state)
                return -ENOMEM;

        INIT_LIST_HEAD(&pi_state->list);
        /* pi_mutex gets initialized later */
        pi_state->owner = NULL;
        refcount_set(&pi_state->refcount, 1);
        pi_state->key = FUTEX_KEY_INIT;

        current->pi_state_cache = pi_state;

        return 0;
}

static struct futex_pi_state *alloc_pi_state(void)
{
        struct futex_pi_state *pi_state = current->pi_state_cache;

        WARN_ON(!pi_state);
        current->pi_state_cache = NULL;

        return pi_state;
}
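
/*
 * Note: alloc_pi_state() does not allocate; it only hands out the
 * one-deep per-task cache that refill_pi_state_cache() filled. Callers
 * such as futex_lock_pi() refill the cache before taking any locks, so
 * the WARN_ON() above should never fire unless that contract is broken.
 */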

static void pi_state_update_owner(struct futex_pi_state *pi_state,
                                  struct task_struct *new_owner)
{
        struct task_struct *old_owner = pi_state->owner;

        lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

        if (old_owner) {
                raw_spin_lock(&old_owner->pi_lock);
                WARN_ON(list_empty(&pi_state->list));
                list_del_init(&pi_state->list);
                raw_spin_unlock(&old_owner->pi_lock);
        }

        if (new_owner) {
                raw_spin_lock(&new_owner->pi_lock);
                WARN_ON(!list_empty(&pi_state->list));
                list_add(&pi_state->list, &new_owner->pi_state_list);
                pi_state->owner = new_owner;
                raw_spin_unlock(&new_owner->pi_lock);
        }
}

void get_pi_state(struct futex_pi_state *pi_state)
{
        WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}

/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
void put_pi_state(struct futex_pi_state *pi_state)
{
        if (!pi_state)
                return;

        if (!refcount_dec_and_test(&pi_state->refcount))
                return;

        /*
         * If pi_state->owner is NULL, the owner is most probably dying
         * and has cleaned up the pi_state already
         */
        if (pi_state->owner) {
                unsigned long flags;

                raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
                pi_state_update_owner(pi_state, NULL);
                rt_mutex_proxy_unlock(&pi_state->pi_mutex);
                raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
        }

        if (current->pi_state_cache) {
                kfree(pi_state);
        } else {
                /*
                 * pi_state->list is already empty.
                 * clear pi_state->owner.
                 * refcount is at 0 - put it back to 1.
                 */
                pi_state->owner = NULL;
                refcount_set(&pi_state->refcount, 1);
                current->pi_state_cache = pi_state;
        }
}

/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]  Indicates that the kernel can acquire the futex atomically. We
 *      came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]  Valid, if TID does not belong to a kernel thread. If no matching
 *      thread is found then it indicates that the owner TID has died.
 *
 * [3]  Invalid. The waiter is queued on a non PI futex
 *
 * [4]  Valid state after exit_robust_list(), which sets the user space
 *      value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]  The user space value got manipulated between exit_robust_list()
 *      and exit_pi_state_list()
 *
 * [6]  Valid state after exit_pi_state_list() which sets the new owner in
 *      the pi_state but cannot access the user space value.
 *
 * [7]  pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]  Owner and user space value match
 *
 * [9]  There is no transient state which sets the user space TID to 0
 *      except exit_robust_list(), but this is indicated by the
 *      FUTEX_OWNER_DIED bit. See [4]
 *
 * [10] There is no transient state which leaves owner and user space
 *      TID out of sync. Except one error case where the kernel is denied
 *      write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *      hb -> futex_q, relation
 *      futex_q -> pi_state, relation
 *
 *      (cannot be raw because hb can contain an arbitrary number
 *       of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *      {uval, pi_state}
 *
 *      (and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *      p->pi_state_list -> pi_state->list, relation
 *      pi_mutex->owner -> pi_state->owner, relation
 *
 * pi_state->refcount:
 *
 *      pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */
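
/*
 * For orientation, a rough sketch of the user space side of the PI
 * protocol that the code below completes (illustrative only, not the
 * exact glibc implementation): the fast path acquires the futex with an
 * atomic 0 -> TID transition and only enters the kernel on contention:
 *
 *      if (atomic_cmpxchg(uaddr, 0, gettid()) != 0)
 *              sys_futex(uaddr, FUTEX_LOCK_PI, ...);
 *
 * Everything in this file deals with the contended cases of that
 * transition and of its TID -> 0 unlock counterpart.
 */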

/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
                              struct futex_pi_state *pi_state,
                              struct futex_pi_state **ps)
{
        pid_t pid = uval & FUTEX_TID_MASK;
        u32 uval2;
        int ret;

        /*
         * Userspace might have messed up non-PI and PI futexes [3]
         */
        if (unlikely(!pi_state))
                return -EINVAL;

        /*
         * We get here with hb->lock held, and having found a
         * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
         * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
         * which in turn means that futex_lock_pi() still has a reference on
         * our pi_state.
         *
         * The waiter holding a reference on @pi_state also protects against
         * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
         * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
         * free pi_state before we can take a reference ourselves.
         */
        WARN_ON(!refcount_read(&pi_state->refcount));

        /*
         * Now that we have a pi_state, we can acquire wait_lock
         * and do the state validation.
         */
        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

        /*
         * Since {uval, pi_state} is serialized by wait_lock, and our current
         * uval was read without holding it, it can have changed. Verify it
         * still is what we expect it to be, otherwise retry the entire
         * operation.
         */
        if (futex_get_value_locked(&uval2, uaddr))
                goto out_efault;

        if (uval != uval2)
                goto out_eagain;

        /*
         * Handle the owner died case:
         */
        if (uval & FUTEX_OWNER_DIED) {
                /*
                 * exit_pi_state_list sets owner to NULL and wakes the
                 * topmost waiter. The task which acquires the
                 * pi_state->rt_mutex will fixup owner.
                 */
                if (!pi_state->owner) {
                        /*
                         * No pi state owner, but the user space TID
                         * is not 0. Inconsistent state. [5]
                         */
                        if (pid)
                                goto out_einval;
                        /*
                         * Take a ref on the state and return success. [4]
                         */
                        goto out_attach;
                }

                /*
                 * If TID is 0, then either the dying owner has not
                 * yet executed exit_pi_state_list() or some waiter
                 * acquired the rtmutex in the pi state, but did not
                 * yet fixup the TID in user space.
                 *
                 * Take a ref on the state and return success. [6]
                 */
                if (!pid)
                        goto out_attach;
        } else {
                /*
                 * If the owner died bit is not set, then the pi_state
                 * must have an owner. [7]
                 */
                if (!pi_state->owner)
                        goto out_einval;
        }

        /*
         * Bail out if user space manipulated the futex value. If pi
         * state exists then the owner TID must be the same as the
         * user space TID. [9/10]
         */
        if (pid != task_pid_vnr(pi_state->owner))
                goto out_einval;

out_attach:
        get_pi_state(pi_state);
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        *ps = pi_state;
        return 0;

out_einval:
        ret = -EINVAL;
        goto out_error;

out_eagain:
        ret = -EAGAIN;
        goto out_error;

out_efault:
        ret = -EFAULT;
        goto out_error;

out_error:
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        return ret;
}

static int handle_exit_race(u32 __user *uaddr, u32 uval,
                            struct task_struct *tsk)
{
        u32 uval2;

        /*
         * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
         * caller that the alleged owner is busy.
         */
        if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
                return -EBUSY;

        /*
         * Reread the user space value to handle the following situation:
         *
         * CPU0                         CPU1
         *
         * sys_exit()                   sys_futex()
         *  do_exit()                    futex_lock_pi()
         *                                futex_lock_pi_atomic()
         *   exit_signals(tsk)             No waiters:
         *    tsk->flags |= PF_EXITING;    *uaddr == 0x00000PID
         *  mm_release(tsk)                Set waiter bit
         *   exit_robust_list(tsk) {       *uaddr = 0x80000PID;
         *      Set owner died             attach_to_pi_owner() {
         *    *uaddr = 0xC0000000;          tsk = get_task(PID);
         *   }                              if (!tsk->flags & PF_EXITING) {
         *  ...                               attach();
         *  tsk->futex_state =              } else {
         *      FUTEX_STATE_DEAD;             if (tsk->futex_state !=
         *                                        FUTEX_STATE_DEAD)
         *                                      return -EAGAIN;
         *                                    return -ESRCH; <--- FAIL
         *                                  }
         *
         * Returning ESRCH unconditionally is wrong here because the
         * user space value has been changed by the exiting task.
         *
         * The same logic applies to the case where the exiting task is
         * already gone.
         */
        if (futex_get_value_locked(&uval2, uaddr))
                return -EFAULT;

        /* If the user space value has changed, try again. */
        if (uval2 != uval)
                return -EAGAIN;

        /*
         * The exiting task did not have a robust list, the robust list was
         * corrupted or the user space value in *uaddr is simply bogus.
         * Give up and tell user space.
         */
        return -ESRCH;
}

static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
                                 struct futex_pi_state **ps)
{
        /*
         * No existing pi state. First waiter. [2]
         *
         * This creates pi_state, we have hb->lock held, this means nothing can
         * observe this state, wait_lock is irrelevant.
         */
        struct futex_pi_state *pi_state = alloc_pi_state();

        /*
         * Initialize the pi_mutex in locked state and make @p
         * the owner of it:
         */
        rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

        /* Store the key for possible exit cleanups: */
        pi_state->key = *key;

        WARN_ON(!list_empty(&pi_state->list));
        list_add(&pi_state->list, &p->pi_state_list);
        /*
         * Assignment without holding pi_state->pi_mutex.wait_lock is safe
         * because there is no concurrency as the object is not published yet.
         */
        pi_state->owner = p;

        *ps = pi_state;
}

/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
                              struct futex_pi_state **ps,
                              struct task_struct **exiting)
{
        pid_t pid = uval & FUTEX_TID_MASK;
        struct task_struct *p;

        /*
         * We are the first waiter - try to look up the real owner and attach
         * the new pi_state to it, but bail out when TID = 0 [1]
         *
         * The !pid check is paranoid. None of the call sites should end up
         * with pid == 0, but better safe than sorry. Let the caller retry
         */
        if (!pid)
                return -EAGAIN;
        p = find_get_task_by_vpid(pid);
        if (!p)
                return handle_exit_race(uaddr, uval, NULL);

        if (unlikely(p->flags & PF_KTHREAD)) {
                put_task_struct(p);
                return -EPERM;
        }

        /*
         * We need to look at the task state to figure out whether the
         * task is exiting. To protect against the change of the task state
         * in futex_exit_release(), we do this protected by p->pi_lock:
         */
        raw_spin_lock_irq(&p->pi_lock);
        if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
                /*
                 * The task is on the way out. When the futex state is
                 * FUTEX_STATE_DEAD, we know that the task has finished
                 * the cleanup:
                 */
                int ret = handle_exit_race(uaddr, uval, p);

                raw_spin_unlock_irq(&p->pi_lock);
                /*
                 * If the owner task is between FUTEX_STATE_EXITING and
                 * FUTEX_STATE_DEAD then store the task pointer and keep
                 * the reference on the task struct. The calling code will
                 * drop all locks, wait for the task to reach
                 * FUTEX_STATE_DEAD and then drop the refcount. This is
                 * required to prevent a live lock when the current task
                 * preempted the exiting task between the two states.
                 */
                if (ret == -EBUSY)
                        *exiting = p;
                else
                        put_task_struct(p);
                return ret;
        }

        __attach_to_pi_owner(p, key, ps);
        raw_spin_unlock_irq(&p->pi_lock);

        put_task_struct(p);

        return 0;
}

static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
        int err;
        u32 curval;

        if (unlikely(should_fail_futex(true)))
                return -EFAULT;

        err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
        if (unlikely(err))
                return err;

        /* If user space value changed, let the caller retry */
        return curval != uval ? -EAGAIN : 0;
}
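
/*
 * Note: an -EAGAIN from lock_pi_update_atomic() means the user space
 * value changed between the earlier futex_get_value_locked() and the
 * cmpxchg above. Rather than looping here, callers propagate it out to
 * the retry paths in futex_lock_pi() and futex_unlock_pi(), which
 * re-read the user space value before trying again.
 */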

/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for.  This will
 *			be "current" except in the case of requeue pi.
 * @exiting:		Pointer to store the task pointer of the owner task
 *			which is in the middle of exiting
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error
 *
 * The hb->lock must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 */
int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
                         union futex_key *key,
                         struct futex_pi_state **ps,
                         struct task_struct *task,
                         struct task_struct **exiting,
                         int set_waiters)
{
        u32 uval, newval, vpid = task_pid_vnr(task);
        struct futex_q *top_waiter;
        int ret;

        /*
         * Read the user space value first so we can validate a few
         * things before proceeding further.
         */
        if (futex_get_value_locked(&uval, uaddr))
                return -EFAULT;

        if (unlikely(should_fail_futex(true)))
                return -EFAULT;

        /*
         * Detect deadlocks.
         */
        if (unlikely((uval & FUTEX_TID_MASK) == vpid))
                return -EDEADLK;

        if (unlikely(should_fail_futex(true)))
                return -EDEADLK;

        /*
         * Lookup existing state first. If it exists, try to attach to
         * its pi_state.
         */
        top_waiter = futex_top_waiter(hb, key);
        if (top_waiter)
                return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

        /*
         * No waiter and user TID is 0. We are here because the waiters
         * or the owner died bit is set, or we were called from
         * requeue_cmp_pi, or for whatever other reason something took
         * the syscall.
         */
        if (!(uval & FUTEX_TID_MASK)) {
                /*
                 * We take over the futex. No other waiters and the user space
                 * TID is 0. We preserve the owner died bit.
                 */
                newval = uval & FUTEX_OWNER_DIED;
                newval |= vpid;

                /* The futex requeue_pi code can enforce the waiters bit */
                if (set_waiters)
                        newval |= FUTEX_WAITERS;

                ret = lock_pi_update_atomic(uaddr, uval, newval);
                if (ret)
                        return ret;

                /*
                 * If the waiter bit was requested the caller also needs PI
                 * state attached to the new owner of the user space futex.
                 *
                 * @task is guaranteed to be alive and it cannot be exiting
                 * because it is either sleeping or waiting in
                 * futex_requeue_pi_wakeup_sync().
                 *
                 * No need to do the full attach_to_pi_owner() exercise
                 * because @task is known and valid.
                 */
                if (set_waiters) {
                        raw_spin_lock_irq(&task->pi_lock);
                        __attach_to_pi_owner(task, key, ps);
                        raw_spin_unlock_irq(&task->pi_lock);
                }
                return 1;
        }

        /*
         * First waiter. Set the waiters bit before attaching ourselves to
         * the owner. If owner tries to unlock, it will be forced into
         * the kernel and blocked on hb->lock.
         */
        newval = uval | FUTEX_WAITERS;
        ret = lock_pi_update_atomic(uaddr, uval, newval);
        if (ret)
                return ret;
        /*
         * If the update of the user space value succeeded, we try to
         * attach to the owner. If that fails, no harm done, we only
         * set the FUTEX_WAITERS bit in the user space variable.
         */
        return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}

/*
 * Caller must hold a reference on @pi_state.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval,
                         struct futex_pi_state *pi_state,
                         struct rt_mutex_waiter *top_waiter)
{
        struct task_struct *new_owner;
        bool postunlock = false;
        DEFINE_RT_WAKE_Q(wqh);
        u32 curval, newval;
        int ret = 0;

        new_owner = top_waiter->task;

        /*
         * We pass it to the next owner. The WAITERS bit is always kept
         * enabled while there is PI state around. We cleanup the owner
         * died bit, because we are the owner.
         */
        newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

        if (unlikely(should_fail_futex(true))) {
                ret = -EFAULT;
                goto out_unlock;
        }

        ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
        if (!ret && (curval != uval)) {
                /*
                 * If an unconditional UNLOCK_PI operation (user space did not
                 * try the TID->0 transition) raced with a waiter setting the
                 * FUTEX_WAITERS flag between get_user() and locking the hash
                 * bucket lock, retry the operation.
                 */
                if ((FUTEX_TID_MASK & curval) == uval)
                        ret = -EAGAIN;
                else
                        ret = -EINVAL;
        }

        if (!ret) {
                /*
                 * This is a point of no return; once we modified the uval
                 * there is no going back and subsequent operations must
                 * not fail.
                 */
                pi_state_update_owner(pi_state, new_owner);
                postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
        }

out_unlock:
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

        if (postunlock)
                rt_mutex_postunlock(&wqh);

        return ret;
}
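
/*
 * Note: the actual wakeup of the new owner happens in
 * rt_mutex_postunlock() above, after wait_lock has been dropped. Using
 * the deferred rt wake queue instead of a direct wakeup avoids waking a
 * task while still holding the lock it will immediately contend on.
 */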

static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
                                  struct task_struct *argowner)
{
        struct futex_pi_state *pi_state = q->pi_state;
        struct task_struct *oldowner, *newowner;
        u32 uval, curval, newval, newtid;
        int err = 0;

        oldowner = pi_state->owner;

        /*
         * We are here because either:
         *
         *  - we stole the lock and pi_state->owner needs updating to reflect
         *    that (@argowner == current),
         *
         * or:
         *
         *  - someone stole our lock and we need to fix things to point to the
         *    new owner (@argowner == NULL).
         *
         * Either way, we have to replace the TID in the user space variable.
         * This must be atomic as we have to preserve the owner died bit here.
         *
         * Note: We write the user space value _before_ changing the pi_state
         * because we can fault here. Imagine swapped out pages or a fork
         * that marked all the anonymous memory readonly for cow.
         *
         * Modifying pi_state _before_ the user space value would leave the
         * pi_state in an inconsistent state when we fault here, because we
         * need to drop the locks to handle the fault. This might be observed
         * in the PID checks when attaching to PI state.
         */
retry:
        if (!argowner) {
                if (oldowner != current) {
                        /*
                         * We raced against a concurrent self; things are
                         * already fixed up. Nothing to do.
                         */
                        return 0;
                }

                if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
                        /* We got the lock. pi_state is correct. Tell caller. */
                        return 1;
                }

                /*
                 * The trylock just failed, so either there is an owner or
                 * there is a higher priority waiter than this one.
                 */
                newowner = rt_mutex_owner(&pi_state->pi_mutex);
                /*
                 * If the higher priority waiter has not yet taken over the
                 * rtmutex then newowner is NULL. We can't return here with
                 * that state because it's inconsistent vs. the user space
                 * state. So drop the locks and try again. It's a valid
                 * situation and not any different from the other retry
                 * conditions.
                 */
                if (unlikely(!newowner)) {
                        err = -EAGAIN;
                        goto handle_err;
                }
        } else {
                WARN_ON_ONCE(argowner != current);
                if (oldowner == current) {
                        /*
                         * We raced against a concurrent self; things are
                         * already fixed up. Nothing to do.
                         */
                        return 1;
                }
                newowner = argowner;
        }

        newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
        /* Owner died? */
        if (!pi_state->owner)
                newtid |= FUTEX_OWNER_DIED;

        err = futex_get_value_locked(&uval, uaddr);
        if (err)
                goto handle_err;

        for (;;) {
                newval = (uval & FUTEX_OWNER_DIED) | newtid;

                err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
                if (err)
                        goto handle_err;

                if (curval == uval)
                        break;
                uval = curval;
        }

        /*
         * We fixed up user space. Now we need to fix the pi_state
         * itself.
         */
        pi_state_update_owner(pi_state, newowner);

        return argowner == current;

        /*
         * In order to reschedule or handle a page fault, we need to drop the
         * locks here. In the case of a fault, this gives the other task
         * (either the highest priority waiter itself or the task which stole
         * the rtmutex) the chance to try the fixup of the pi_state. So once we
         * are back from handling the fault we need to check the pi_state after
         * reacquiring the locks and before trying to do another fixup. When
         * the fixup has been done already we simply return.
         *
         * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
         * drop hb->lock since the caller owns the hb -> futex_q relation.
         * Dropping the pi_mutex->wait_lock requires the state revalidate.
         */
handle_err:
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        spin_unlock(q->lock_ptr);

        switch (err) {
        case -EFAULT:
                err = fault_in_user_writeable(uaddr);
                break;

        case -EAGAIN:
                cond_resched();
                err = 0;
                break;

        default:
                WARN_ON_ONCE(1);
                break;
        }

        futex_q_lockptr_lock(q);
        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

        /*
         * Check if someone else fixed it for us:
         */
        if (pi_state->owner != oldowner)
                return argowner == current;

        /* Retry if err was -EAGAIN or the fault-in succeeded */
        if (!err)
                goto retry;

        /*
         * fault_in_user_writeable() failed so user state is immutable. At
         * best we can make the kernel state consistent but user state will
         * be most likely hosed and any subsequent unlock operation will be
         * rejected due to PI futex rule [10].
         *
         * Ensure that the rtmutex owner is also the pi_state owner despite
         * the user space value claiming something different. There is no
         * point in unlocking the rtmutex if current is the owner as it
         * would need to wait until the next waiter has taken the rtmutex
         * to guarantee consistent state. Keep it simple. Userspace asked
         * for this wrecked state.
         *
         * The rtmutex has an owner - either current or some other
         * task. See the EAGAIN loop above.
         */
        pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));

        return err;
}

static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
                                struct task_struct *argowner)
{
        struct futex_pi_state *pi_state = q->pi_state;
        int ret;

        lockdep_assert_held(q->lock_ptr);

        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
        ret = __fixup_pi_state_owner(uaddr, q, argowner);
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        return ret;
}

/**
 * fixup_pi_owner() - Post lock pi_state and corner case management
 * @uaddr:	user address of the futex
 * @q:		futex_q (contains pi_state and access to the rt_mutex)
 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
 * After attempting to lock an rt_mutex, this function is called to cleanup
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
 * Return:
 *  -  1 - success, lock taken;
 *  -  0 - success, lock not taken;
 *  - <0 - on error (-EFAULT)
 */
int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
        if (locked) {
                /*
                 * Got the lock. We might not be the anticipated owner if we
                 * did a lock-steal - fix up the PI-state in that case:
                 *
                 * Speculative pi_state->owner read (we don't hold wait_lock);
                 * since we own the lock pi_state->owner == current is the
                 * stable state, anything else needs more attention.
                 */
                if (q->pi_state->owner != current)
                        return fixup_pi_state_owner(uaddr, q, current);
                return 1;
        }

        /*
         * If we didn't get the lock; check if anybody stole it from us. In
         * that case, we need to fix up the uval to point to them instead of
         * us, otherwise bad things happen. [10]
         *
         * Another speculative read; pi_state->owner == current is unstable
         * but needs our attention.
         */
        if (q->pi_state->owner == current)
                return fixup_pi_state_owner(uaddr, q, NULL);

        /*
         * Paranoia check. If we did not take the lock, then we should not be
         * the owner of the rt_mutex. Warn and establish consistent state.
         */
        if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
                return fixup_pi_state_owner(uaddr, q, current);

        return 0;
}
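
/*
 * Note: futex_lock_pi() below invokes this as fixup_pi_owner(uaddr, &q,
 * !ret), i.e. @locked mirrors whether the rt_mutex was actually acquired
 * on whichever path was taken (trylock, the proxy-start fast path, or
 * rt_mutex_wait_proxy_lock()).
 */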

/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block as a consequence of relying
 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
 * a 0 value of the futex too.)
 *
 * Also serves as the futex trylock_pi() implementation, with the
 * corresponding semantics.
 */
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
{
        struct hrtimer_sleeper timeout, *to;
        struct task_struct *exiting = NULL;
        struct rt_mutex_waiter rt_waiter;
        struct futex_q q = futex_q_init;
        DEFINE_WAKE_Q(wake_q);
        int res, ret;

        if (!IS_ENABLED(CONFIG_FUTEX_PI))
                return -ENOSYS;

        if (refill_pi_state_cache())
                return -ENOMEM;

        to = futex_setup_timer(time, &timeout, flags, 0);

retry:
        ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
        if (unlikely(ret != 0))
                goto out;

retry_private:
        if (1) {
                CLASS(hb, hb)(&q.key);

                futex_q_lock(&q, hb);

                ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
                                           &exiting, 0);
                if (unlikely(ret)) {
                        /*
                         * Atomic work succeeded and we got the lock,
                         * or failed. Either way, we do _not_ block.
                         */
                        switch (ret) {
                        case 1:
                                /* We got the lock. */
                                ret = 0;
                                goto out_unlock_put_key;
                        case -EFAULT:
                                goto uaddr_faulted;
                        case -EBUSY:
                        case -EAGAIN:
                                /*
                                 * Two reasons for this:
                                 * - EBUSY: Task is exiting and we just wait for the
                                 *   exit to complete.
                                 * - EAGAIN: The user space value changed.
                                 */
                                futex_q_unlock(hb);
                                /*
                                 * Handle the case where the owner is in the middle of
                                 * exiting. Wait for the exit to complete otherwise
                                 * this task might loop forever, aka. live lock.
                                 */
                                wait_for_owner_exiting(ret, exiting);
                                cond_resched();
                                goto retry;
                        default:
                                goto out_unlock_put_key;
                        }
                }

                WARN_ON(!q.pi_state);

                /*
                 * Only actually queue now that the atomic ops are done:
                 */
                __futex_queue(&q, hb, current);

                if (trylock) {
                        ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
                        /* Fixup the trylock return value: */
                        ret = ret ? 0 : -EWOULDBLOCK;
                        goto no_block;
                }

                /*
                 * Caution; releasing @hb in-scope. The hb->lock is still locked
                 * while the reference is dropped. The reference can not be dropped
                 * after the unlock because if a user initiated resize is in progress
                 * then we might need to wake it. This can not be done after the
                 * rt_mutex_pre_schedule() invocation. The hb will remain valid because
                 * the thread, performing resize, will block on hb->lock during
                 * the requeue.
                 */
                futex_hash_put(no_free_ptr(hb));
                /*
                 * Must be done before we enqueue the waiter, here is unfortunately
                 * under the hb lock, but that *should* work because it does nothing.
                 */
                rt_mutex_pre_schedule();

                rt_mutex_init_waiter(&rt_waiter);

                /*
                 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
                 * hold it while doing rt_mutex_start_proxy(), because then it will
                 * include hb->lock in the blocking chain, even though we'll not in
                 * fact hold it while blocking. This will lead it to report -EDEADLK
                 * and BUG when futex_unlock_pi() interleaves with this.
                 *
                 * Therefore acquire wait_lock while holding hb->lock, but drop the
                 * latter before calling __rt_mutex_start_proxy_lock(). This
                 * interleaves with futex_unlock_pi() -- which does a similar lock
                 * handoff -- such that the latter can observe the futex_q::pi_state
                 * before __rt_mutex_start_proxy_lock() is done.
                 */
                raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
                spin_unlock(q.lock_ptr);
                /*
                 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
                 * such that futex_unlock_pi() is guaranteed to observe the waiter when
                 * it sees the futex_q::pi_state.
                 */
                ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q);
                raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q);

                if (ret) {
                        if (ret == 1)
                                ret = 0;
                        goto cleanup;
                }

                if (unlikely(to))
                        hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

                ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);

cleanup:
                /*
                 * If we failed to acquire the lock (deadlock/signal/timeout), we must
                 * unwind the above, however we cannot lock hb->lock because
                 * rt_mutex already has a waiter enqueued and hb->lock can itself try
                 * and enqueue an rt_waiter through rtlock.
                 *
                 * Doing the cleanup without holding hb->lock can cause inconsistent
                 * state between hb and pi_state, but only in the direction of not
                 * seeing a waiter that is leaving.
                 *
                 * See futex_unlock_pi(), it deals with this inconsistency.
                 *
                 * There be dragons here, since we must deal with the inconsistency on
                 * the way out (here), it is impossible to detect/warn about the race
                 * the other way around (missing an incoming waiter).
                 *
                 * What could possibly go wrong...
                 */
                if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
                        ret = 0;

                /*
                 * Now that the rt_waiter has been dequeued, it is safe to use
                 * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
                 * the pi_state.
                 */
                futex_q_lockptr_lock(&q);
                /*
                 * Waiter is unqueued.
                 */
                rt_mutex_post_schedule();
no_block:
                /*
                 * Fixup the pi_state owner and possibly acquire the lock if we
                 * haven't already.
                 */
                res = fixup_pi_owner(uaddr, &q, !ret);
                /*
                 * If fixup_pi_owner() returned an error, propagate that. If it
                 * acquired the lock, clear our -ETIMEDOUT or -EINTR.
                 */
                if (res)
                        ret = (res < 0) ? res : 0;

                futex_unqueue_pi(&q);
                spin_unlock(q.lock_ptr);
                if (q.drop_hb_ref) {
                        CLASS(hb, hb)(&q.key);
                        /* Additional reference from futex_unlock_pi() */
                        futex_hash_put(hb);
                }
                goto out;

out_unlock_put_key:
                futex_q_unlock(hb);
                goto out;

uaddr_faulted:
                futex_q_unlock(hb);

                ret = fault_in_user_writeable(uaddr);
                if (ret)
                        goto out;

                if (!(flags & FLAGS_SHARED))
                        goto retry_private;

                goto retry;
        }

out:
        if (to) {
                hrtimer_cancel(&to->timer);
                destroy_hrtimer_on_stack(&to->timer);
        }
        return ret != -EINTR ? ret : -ERESTARTNOINTR;
}

/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 */
int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
        u32 curval, uval, vpid = task_pid_vnr(current);
        union futex_key key = FUTEX_KEY_INIT;
        struct futex_q *top_waiter;
        int ret;

        if (!IS_ENABLED(CONFIG_FUTEX_PI))
                return -ENOSYS;

retry:
        if (get_user(uval, uaddr))
                return -EFAULT;
        /*
         * We release only a lock we actually own:
         */
        if ((uval & FUTEX_TID_MASK) != vpid)
                return -EPERM;

        ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
        if (ret)
                return ret;

        CLASS(hb, hb)(&key);
        spin_lock(&hb->lock);
retry_hb:

        /*
         * Check waiters first. We do not trust user space values at
         * all and we at least want to know if user space fiddled
         * with the futex value instead of blindly unlocking.
         */
        top_waiter = futex_top_waiter(hb, &key);
        if (top_waiter) {
                struct futex_pi_state *pi_state = top_waiter->pi_state;
                struct rt_mutex_waiter *rt_waiter;

                ret = -EINVAL;
                if (!pi_state)
                        goto out_unlock;

                /*
                 * If current does not own the pi_state then the futex is
                 * inconsistent and user space fiddled with the futex value.
                 */
                if (pi_state->owner != current)
                        goto out_unlock;

                /*
                 * By taking wait_lock while still holding hb->lock, we ensure
                 * there is no point where we hold neither; and thereby
                 * wake_futex_pi() must observe any new waiters.
                 *
                 * Since the cleanup: case in futex_lock_pi() removes the
                 * rt_waiter without holding hb->lock, it is possible for
                 * wake_futex_pi() to not find a waiter while the above does,
                 * in this case the waiter is on the way out and it can be
                 * ignored.
                 *
                 * In particular; this forces __rt_mutex_start_proxy() to
                 * complete such that we're guaranteed to observe the
                 * rt_waiter.
                 */
                raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

                /*
                 * Futex vs rt_mutex waiter state -- if there are no rt_mutex
                 * waiters even though futex thinks there are, then the waiter
                 * is leaving. The entry needs to be removed from the list so a
                 * new futex_lock_pi() is not using this stale PI-state while
                 * the futex is available in user space again.
                 * There can be more than one task on its way out so it needs
                 * to retry.
                 */
                rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
                if (!rt_waiter) {
                        /*
                         * Acquire a reference for the leaving waiter to ensure
                         * valid futex_q::lock_ptr.
                         */
                        futex_hash_get(hb);
                        top_waiter->drop_hb_ref = true;
                        __futex_unqueue(top_waiter);
                        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
                        goto retry_hb;
                }

                get_pi_state(pi_state);
                spin_unlock(&hb->lock);

                /* drops pi_state->pi_mutex.wait_lock */
                ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);

                put_pi_state(pi_state);

                /*
                 * Success, we're done! No tricky corner cases.
                 */
                if (!ret)
                        return ret;
                /*
                 * The atomic access to the futex value generated a
                 * pagefault, so retry the user-access and the wakeup:
                 */
                if (ret == -EFAULT)
                        goto pi_faulted;
                /*
                 * An unconditional UNLOCK_PI op raced against a waiter
                 * setting the FUTEX_WAITERS bit. Try again.
                 */
                if (ret == -EAGAIN)
                        goto pi_retry;
                /*
                 * wake_futex_pi has detected invalid state. Tell user
                 * space.
                 */
                return ret;
        }

        /*
         * We have no kernel internal state, i.e. no waiters in the
         * kernel. Waiters which are about to queue themselves are stuck
         * on hb->lock. So we can safely ignore them. We preserve neither
         * the WAITERS bit nor the OWNER_DIED one. We are the owner.
         */
        if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
                spin_unlock(&hb->lock);
                switch (ret) {
                case -EFAULT:
                        goto pi_faulted;

                case -EAGAIN:
                        goto pi_retry;

                default:
                        WARN_ON_ONCE(1);
                        return ret;
                }
        }

        /*
         * If uval has changed, let user space handle it.
         */
        ret = (curval == uval) ? 0 : -EAGAIN;

out_unlock:
        spin_unlock(&hb->lock);
        return ret;

pi_retry:
        cond_resched();
        goto retry;

pi_faulted:

        ret = fault_in_user_writeable(uaddr);
        if (!ret)
                goto retry;

        return ret;
}