| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | 
|---|
| 2 |  | 
|---|
| 3 | #include <linux/plist.h> | 
|---|
| 4 | #include <linux/sched/task.h> | 
|---|
| 5 | #include <linux/sched/signal.h> | 
|---|
| 6 | #include <linux/freezer.h> | 
|---|
| 7 |  | 
|---|
| 8 | #include "futex.h" | 
|---|
| 9 |  | 
|---|
| 10 | /* | 
|---|
| 11 | * READ this before attempting to hack on futexes! | 
|---|
| 12 | * | 
|---|
| 13 | * Basic futex operation and ordering guarantees | 
|---|
| 14 | * ============================================= | 
|---|
| 15 | * | 
|---|
| 16 | * The waiter reads the futex value in user space and calls | 
|---|
| 17 | * futex_wait(). This function computes the hash bucket and acquires | 
|---|
| 18 | * the hash bucket lock. After that it reads the futex user space value | 
|---|
| 19 | * again and verifies that the data has not changed. If it has not changed | 
|---|
| 20 | * it enqueues itself into the hash bucket, releases the hash bucket lock | 
|---|
| 21 | * and schedules. | 
|---|
| 22 | * | 
|---|
| 23 | * The waker side modifies the user space value of the futex and calls | 
|---|
| 24 | * futex_wake(). This function computes the hash bucket and acquires the | 
|---|
| 25 | * hash bucket lock. Then it looks for waiters on that futex in the hash | 
|---|
| 26 | * bucket and wakes them. | 
|---|
| 27 | * | 
|---|
| 28 | * In futex wake up scenarios where no tasks are blocked on a futex, taking | 
|---|
| 29 | * the hb spinlock can be avoided and simply return. In order for this | 
|---|
| 30 | * optimization to work, ordering guarantees must exist so that the waiter | 
|---|
| 31 | * being added to the list is acknowledged when the list is concurrently being | 
|---|
| 32 | * checked by the waker, avoiding scenarios like the following: | 
|---|
| 33 | * | 
|---|
| 34 | * CPU 0                               CPU 1 | 
|---|
| 35 | * val = *futex; | 
|---|
| 36 | * sys_futex(WAIT, futex, val); | 
|---|
| 37 | *   futex_wait(futex, val); | 
|---|
| 38 | *   uval = *futex; | 
|---|
| 39 | *                                     *futex = newval; | 
|---|
| 40 | *                                     sys_futex(WAKE, futex); | 
|---|
| 41 | *                                       futex_wake(futex); | 
|---|
| 42 | *                                       if (queue_empty()) | 
|---|
| 43 | *                                         return; | 
|---|
| 44 | *   if (uval == val) | 
|---|
| 45 | *      lock(hash_bucket(futex)); | 
|---|
| 46 | *      queue(); | 
|---|
| 47 | *     unlock(hash_bucket(futex)); | 
|---|
| 48 | *     schedule(); | 
|---|
| 49 | * | 
|---|
| 50 | * This would cause the waiter on CPU 0 to wait forever because it | 
|---|
| 51 | * missed the transition of the user space value from val to newval | 
|---|
| 52 | * and the waker did not find the waiter in the hash bucket queue. | 
|---|
| 53 | * | 
|---|
| 54 | * The correct serialization ensures that a waiter either observes | 
|---|
| 55 | * the changed user space value before blocking or is woken by a | 
|---|
| 56 | * concurrent waker: | 
|---|
| 57 | * | 
|---|
| 58 | * CPU 0                                 CPU 1 | 
|---|
| 59 | * val = *futex; | 
|---|
| 60 | * sys_futex(WAIT, futex, val); | 
|---|
| 61 | *   futex_wait(futex, val); | 
|---|
| 62 | * | 
|---|
| 63 | *   waiters++; (a) | 
|---|
| 64 | *   smp_mb(); (A) <-- paired with -. | 
|---|
| 65 | *                                  | | 
|---|
| 66 | *   lock(hash_bucket(futex));      | | 
|---|
| 67 | *                                  | | 
|---|
| 68 | *   uval = *futex;                 | | 
|---|
| 69 | *                                  |        *futex = newval; | 
|---|
| 70 | *                                  |        sys_futex(WAKE, futex); | 
|---|
| 71 | *                                  |          futex_wake(futex); | 
|---|
| 72 | *                                  | | 
|---|
| 73 | *                                  `--------> smp_mb(); (B) | 
|---|
| 74 | *   if (uval == val) | 
|---|
| 75 | *     queue(); | 
|---|
| 76 | *     unlock(hash_bucket(futex)); | 
|---|
| 77 | *     schedule();                         if (waiters) | 
|---|
| 78 | *                                           lock(hash_bucket(futex)); | 
|---|
| 79 | *   else                                    wake_waiters(futex); | 
|---|
| 80 | *     waiters--; (b)                        unlock(hash_bucket(futex)); | 
|---|
| 81 | * | 
|---|
| 82 | * Where (A) orders the waiters increment and the futex value read through | 
|---|
| 83 | * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write | 
|---|
| 84 | * to futex and the waiters read (see futex_hb_waiters_pending()). | 
|---|
| 85 | * | 
|---|
| 86 | * This yields the following case (where X:=waiters, Y:=futex): | 
|---|
| 87 | * | 
|---|
| 88 | *	X = Y = 0 | 
|---|
| 89 | * | 
|---|
| 90 | *	w[X]=1		w[Y]=1 | 
|---|
| 91 | *	MB		MB | 
|---|
| 92 | *	r[Y]=y		r[X]=x | 
|---|
| 93 | * | 
|---|
| 94 | * Which guarantees that x==0 && y==0 is impossible; which translates back into | 
|---|
| 95 | * the guarantee that we cannot both miss the futex variable change and the | 
|---|
| 96 | * enqueue. | 
|---|
| 97 | * | 
|---|
| 98 | * Note that a new waiter is accounted for in (a) even when it is possible that | 
|---|
| 99 | * the wait call can return error, in which case we backtrack from it in (b). | 
|---|
| 100 | * Refer to the comment in futex_q_lock(). | 
|---|
| 101 | * | 
|---|
| 102 | * Similarly, in order to account for waiters being requeued on another | 
|---|
| 103 | * address we always increment the waiters for the destination bucket before | 
|---|
| 104 | * acquiring the lock. It then decrements them again  after releasing it - | 
|---|
| 105 | * the code that actually moves the futex(es) between hash buckets (requeue_futex) | 
|---|
| 106 | * will do the additional required waiter count housekeeping. This is done for | 
|---|
| 107 | * double_lock_hb() and double_unlock_hb(), respectively. | 
|---|
| 108 | */ | 
|---|
| 109 |  | 
|---|
| 110 | bool __futex_wake_mark(struct futex_q *q) | 
|---|
| 111 | { | 
|---|
| 112 | if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) | 
|---|
| 113 | return false; | 
|---|
| 114 |  | 
|---|
| 115 | __futex_unqueue(q); | 
|---|
| 116 | /* | 
|---|
| 117 | * The waiting task can free the futex_q as soon as q->lock_ptr = NULL | 
|---|
| 118 | * is written, without taking any locks. This is possible in the event | 
|---|
| 119 | * of a spurious wakeup, for example. A memory barrier is required here | 
|---|
| 120 | * to prevent the following store to lock_ptr from getting ahead of the | 
|---|
| 121 | * plist_del in __futex_unqueue(). | 
|---|
| 122 | */ | 
|---|
| 123 | smp_store_release(&q->lock_ptr, NULL); | 
|---|
| 124 |  | 
|---|
| 125 | return true; | 
|---|
| 126 | } | 
|---|
| 127 |  | 
|---|
| 128 | /* | 
|---|
| 129 | * The hash bucket lock must be held when this is called. | 
|---|
| 130 | * Afterwards, the futex_q must not be accessed. Callers | 
|---|
| 131 | * must ensure to later call wake_up_q() for the actual | 
|---|
| 132 | * wakeups to occur. | 
|---|
| 133 | */ | 
|---|
| 134 | void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) | 
|---|
| 135 | { | 
|---|
| 136 | struct task_struct *p = q->task; | 
|---|
| 137 |  | 
|---|
| 138 | get_task_struct(t: p); | 
|---|
| 139 |  | 
|---|
| 140 | if (!__futex_wake_mark(q)) { | 
|---|
| 141 | put_task_struct(t: p); | 
|---|
| 142 | return; | 
|---|
| 143 | } | 
|---|
| 144 |  | 
|---|
| 145 | /* | 
|---|
| 146 | * Queue the task for later wakeup for after we've released | 
|---|
| 147 | * the hb->lock. | 
|---|
| 148 | */ | 
|---|
| 149 | wake_q_add_safe(head: wake_q, task: p); | 
|---|
| 150 | } | 
|---|
| 151 |  | 
|---|
| 152 | /* | 
|---|
| 153 | * Wake up waiters matching bitset queued on this futex (uaddr). | 
|---|
| 154 | */ | 
|---|
| 155 | int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) | 
|---|
| 156 | { | 
|---|
| 157 | struct futex_q *this, *next; | 
|---|
| 158 | union futex_key key = FUTEX_KEY_INIT; | 
|---|
| 159 | DEFINE_WAKE_Q(wake_q); | 
|---|
| 160 | int ret; | 
|---|
| 161 |  | 
|---|
| 162 | if (!bitset) | 
|---|
| 163 | return -EINVAL; | 
|---|
| 164 |  | 
|---|
| 165 | ret = get_futex_key(uaddr, flags, key: &key, rw: FUTEX_READ); | 
|---|
| 166 | if (unlikely(ret != 0)) | 
|---|
| 167 | return ret; | 
|---|
| 168 |  | 
|---|
| 169 | if ((flags & FLAGS_STRICT) && !nr_wake) | 
|---|
| 170 | return 0; | 
|---|
| 171 |  | 
|---|
| 172 | CLASS(hb, hb)(key: &key); | 
|---|
| 173 |  | 
|---|
| 174 | /* Make sure we really have tasks to wakeup */ | 
|---|
| 175 | if (!futex_hb_waiters_pending(hb)) | 
|---|
| 176 | return ret; | 
|---|
| 177 |  | 
|---|
| 178 | spin_lock(lock: &hb->lock); | 
|---|
| 179 |  | 
|---|
| 180 | plist_for_each_entry_safe(this, next, &hb->chain, list) { | 
|---|
| 181 | if (futex_match (key1: &this->key, key2: &key)) { | 
|---|
| 182 | if (this->pi_state || this->rt_waiter) { | 
|---|
| 183 | ret = -EINVAL; | 
|---|
| 184 | break; | 
|---|
| 185 | } | 
|---|
| 186 |  | 
|---|
| 187 | /* Check if one of the bits is set in both bitsets */ | 
|---|
| 188 | if (!(this->bitset & bitset)) | 
|---|
| 189 | continue; | 
|---|
| 190 |  | 
|---|
| 191 | this->wake(&wake_q, this); | 
|---|
| 192 | if (++ret >= nr_wake) | 
|---|
| 193 | break; | 
|---|
| 194 | } | 
|---|
| 195 | } | 
|---|
| 196 |  | 
|---|
| 197 | spin_unlock(lock: &hb->lock); | 
|---|
| 198 | wake_up_q(head: &wake_q); | 
|---|
| 199 | return ret; | 
|---|
| 200 | } | 
|---|
| 201 |  | 
|---|
| 202 | static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) | 
|---|
| 203 | { | 
|---|
| 204 | unsigned int op =	  (encoded_op & 0x70000000) >> 28; | 
|---|
| 205 | unsigned int cmp =	  (encoded_op & 0x0f000000) >> 24; | 
|---|
| 206 | int oparg = sign_extend32(value: (encoded_op & 0x00fff000) >> 12, index: 11); | 
|---|
| 207 | int cmparg = sign_extend32(value: encoded_op & 0x00000fff, index: 11); | 
|---|
| 208 | int oldval, ret; | 
|---|
| 209 |  | 
|---|
| 210 | if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { | 
|---|
| 211 | if (oparg < 0 || oparg > 31) { | 
|---|
| 212 | /* | 
|---|
| 213 | * kill this print and return -EINVAL when userspace | 
|---|
| 214 | * is sane again | 
|---|
| 215 | */ | 
|---|
| 216 | pr_info_ratelimited( "futex_wake_op: %s tries to shift op by %d; fix this program\n", | 
|---|
| 217 | current->comm, oparg); | 
|---|
| 218 | oparg &= 31; | 
|---|
| 219 | } | 
|---|
| 220 | oparg = 1 << oparg; | 
|---|
| 221 | } | 
|---|
| 222 |  | 
|---|
| 223 | pagefault_disable(); | 
|---|
| 224 | ret = arch_futex_atomic_op_inuser(op, oparg, oval: &oldval, uaddr); | 
|---|
| 225 | pagefault_enable(); | 
|---|
| 226 | if (ret) | 
|---|
| 227 | return ret; | 
|---|
| 228 |  | 
|---|
| 229 | switch (cmp) { | 
|---|
| 230 | case FUTEX_OP_CMP_EQ: | 
|---|
| 231 | return oldval == cmparg; | 
|---|
| 232 | case FUTEX_OP_CMP_NE: | 
|---|
| 233 | return oldval != cmparg; | 
|---|
| 234 | case FUTEX_OP_CMP_LT: | 
|---|
| 235 | return oldval < cmparg; | 
|---|
| 236 | case FUTEX_OP_CMP_GE: | 
|---|
| 237 | return oldval >= cmparg; | 
|---|
| 238 | case FUTEX_OP_CMP_LE: | 
|---|
| 239 | return oldval <= cmparg; | 
|---|
| 240 | case FUTEX_OP_CMP_GT: | 
|---|
| 241 | return oldval > cmparg; | 
|---|
| 242 | default: | 
|---|
| 243 | return -ENOSYS; | 
|---|
| 244 | } | 
|---|
| 245 | } | 
|---|
| 246 |  | 
|---|
| 247 | /* | 
|---|
| 248 | * Wake up all waiters hashed on the physical page that is mapped | 
|---|
| 249 | * to this virtual address: | 
|---|
| 250 | */ | 
|---|
| 251 | int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, | 
|---|
| 252 | int nr_wake, int nr_wake2, int op) | 
|---|
| 253 | { | 
|---|
| 254 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; | 
|---|
| 255 | struct futex_q *this, *next; | 
|---|
| 256 | int ret, op_ret; | 
|---|
| 257 | DEFINE_WAKE_Q(wake_q); | 
|---|
| 258 |  | 
|---|
| 259 | retry: | 
|---|
| 260 | ret = get_futex_key(uaddr: uaddr1, flags, key: &key1, rw: FUTEX_READ); | 
|---|
| 261 | if (unlikely(ret != 0)) | 
|---|
| 262 | return ret; | 
|---|
| 263 | ret = get_futex_key(uaddr: uaddr2, flags, key: &key2, rw: FUTEX_WRITE); | 
|---|
| 264 | if (unlikely(ret != 0)) | 
|---|
| 265 | return ret; | 
|---|
| 266 |  | 
|---|
| 267 | retry_private: | 
|---|
| 268 | if (1) { | 
|---|
| 269 | CLASS(hb, hb1)(key: &key1); | 
|---|
| 270 | CLASS(hb, hb2)(key: &key2); | 
|---|
| 271 |  | 
|---|
| 272 | double_lock_hb(hb1, hb2); | 
|---|
| 273 | op_ret = futex_atomic_op_inuser(encoded_op: op, uaddr: uaddr2); | 
|---|
| 274 | if (unlikely(op_ret < 0)) { | 
|---|
| 275 | double_unlock_hb(hb1, hb2); | 
|---|
| 276 |  | 
|---|
| 277 | if (!IS_ENABLED(CONFIG_MMU) || | 
|---|
| 278 | unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { | 
|---|
| 279 | /* | 
|---|
| 280 | * we don't get EFAULT from MMU faults if we don't have | 
|---|
| 281 | * an MMU, but we might get them from range checking | 
|---|
| 282 | */ | 
|---|
| 283 | ret = op_ret; | 
|---|
| 284 | return ret; | 
|---|
| 285 | } | 
|---|
| 286 |  | 
|---|
| 287 | if (op_ret == -EFAULT) { | 
|---|
| 288 | ret = fault_in_user_writeable(uaddr: uaddr2); | 
|---|
| 289 | if (ret) | 
|---|
| 290 | return ret; | 
|---|
| 291 | } | 
|---|
| 292 |  | 
|---|
| 293 | cond_resched(); | 
|---|
| 294 | if (!(flags & FLAGS_SHARED)) | 
|---|
| 295 | goto retry_private; | 
|---|
| 296 | goto retry; | 
|---|
| 297 | } | 
|---|
| 298 |  | 
|---|
| 299 | plist_for_each_entry_safe(this, next, &hb1->chain, list) { | 
|---|
| 300 | if (futex_match(key1: &this->key, key2: &key1)) { | 
|---|
| 301 | if (this->pi_state || this->rt_waiter) { | 
|---|
| 302 | ret = -EINVAL; | 
|---|
| 303 | goto out_unlock; | 
|---|
| 304 | } | 
|---|
| 305 | this->wake(&wake_q, this); | 
|---|
| 306 | if (++ret >= nr_wake) | 
|---|
| 307 | break; | 
|---|
| 308 | } | 
|---|
| 309 | } | 
|---|
| 310 |  | 
|---|
| 311 | if (op_ret > 0) { | 
|---|
| 312 | op_ret = 0; | 
|---|
| 313 | plist_for_each_entry_safe(this, next, &hb2->chain, list) { | 
|---|
| 314 | if (futex_match(key1: &this->key, key2: &key2)) { | 
|---|
| 315 | if (this->pi_state || this->rt_waiter) { | 
|---|
| 316 | ret = -EINVAL; | 
|---|
| 317 | goto out_unlock; | 
|---|
| 318 | } | 
|---|
| 319 | this->wake(&wake_q, this); | 
|---|
| 320 | if (++op_ret >= nr_wake2) | 
|---|
| 321 | break; | 
|---|
| 322 | } | 
|---|
| 323 | } | 
|---|
| 324 | ret += op_ret; | 
|---|
| 325 | } | 
|---|
| 326 |  | 
|---|
| 327 | out_unlock: | 
|---|
| 328 | double_unlock_hb(hb1, hb2); | 
|---|
| 329 | } | 
|---|
| 330 | wake_up_q(head: &wake_q); | 
|---|
| 331 | return ret; | 
|---|
| 332 | } | 
|---|
| 333 |  | 
|---|
| 334 | static long futex_wait_restart(struct restart_block *restart); | 
|---|
| 335 |  | 
|---|
| 336 | /** | 
|---|
| 337 | * futex_do_wait() - wait for wakeup, timeout, or signal | 
|---|
| 338 | * @q:		the futex_q to queue up on | 
|---|
| 339 | * @timeout:	the prepared hrtimer_sleeper, or null for no timeout | 
|---|
| 340 | */ | 
|---|
| 341 | void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout) | 
|---|
| 342 | { | 
|---|
| 343 | /* Arm the timer */ | 
|---|
| 344 | if (timeout) | 
|---|
| 345 | hrtimer_sleeper_start_expires(sl: timeout, mode: HRTIMER_MODE_ABS); | 
|---|
| 346 |  | 
|---|
| 347 | /* | 
|---|
| 348 | * If we have been removed from the hash list, then another task | 
|---|
| 349 | * has tried to wake us, and we can skip the call to schedule(). | 
|---|
| 350 | */ | 
|---|
| 351 | if (likely(!plist_node_empty(&q->list))) { | 
|---|
| 352 | /* | 
|---|
| 353 | * If the timer has already expired, current will already be | 
|---|
| 354 | * flagged for rescheduling. Only call schedule if there | 
|---|
| 355 | * is no timeout, or if it has yet to expire. | 
|---|
| 356 | */ | 
|---|
| 357 | if (!timeout || timeout->task) | 
|---|
| 358 | schedule(); | 
|---|
| 359 | } | 
|---|
| 360 | __set_current_state(TASK_RUNNING); | 
|---|
| 361 | } | 
|---|
| 362 |  | 
|---|
| 363 | /** | 
|---|
| 364 | * futex_unqueue_multiple - Remove various futexes from their hash bucket | 
|---|
| 365 | * @v:	   The list of futexes to unqueue | 
|---|
| 366 | * @count: Number of futexes in the list | 
|---|
| 367 | * | 
|---|
| 368 | * Helper to unqueue a list of futexes. This can't fail. | 
|---|
| 369 | * | 
|---|
| 370 | * Return: | 
|---|
| 371 | *  - >=0 - Index of the last futex that was awoken; | 
|---|
| 372 | *  - -1  - No futex was awoken | 
|---|
| 373 | */ | 
|---|
| 374 | int futex_unqueue_multiple(struct futex_vector *v, int count) | 
|---|
| 375 | { | 
|---|
| 376 | int ret = -1, i; | 
|---|
| 377 |  | 
|---|
| 378 | for (i = 0; i < count; i++) { | 
|---|
| 379 | if (!futex_unqueue(q: &v[i].q)) | 
|---|
| 380 | ret = i; | 
|---|
| 381 | } | 
|---|
| 382 |  | 
|---|
| 383 | return ret; | 
|---|
| 384 | } | 
|---|
| 385 |  | 
|---|
| 386 | /** | 
|---|
| 387 | * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes | 
|---|
| 388 | * @vs:		The futex list to wait on | 
|---|
| 389 | * @count:	The size of the list | 
|---|
| 390 | * @woken:	Index of the last woken futex, if any. Used to notify the | 
|---|
| 391 | *		caller that it can return this index to userspace (return parameter) | 
|---|
| 392 | * | 
|---|
| 393 | * Prepare multiple futexes in a single step and enqueue them. This may fail if | 
|---|
| 394 | * the futex list is invalid or if any futex was already awoken. On success the | 
|---|
| 395 | * task is ready to interruptible sleep. | 
|---|
| 396 | * | 
|---|
| 397 | * Return: | 
|---|
| 398 | *  -  1 - One of the futexes was woken by another thread | 
|---|
| 399 | *  -  0 - Success | 
|---|
| 400 | *  - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL | 
|---|
| 401 | */ | 
|---|
| 402 | int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) | 
|---|
| 403 | { | 
|---|
| 404 | bool retry = false; | 
|---|
| 405 | int ret, i; | 
|---|
| 406 | u32 uval; | 
|---|
| 407 |  | 
|---|
| 408 | /* | 
|---|
| 409 | * Make sure to have a reference on the private_hash such that we | 
|---|
| 410 | * don't block on rehash after changing the task state below. | 
|---|
| 411 | */ | 
|---|
| 412 | guard(private_hash)(); | 
|---|
| 413 |  | 
|---|
| 414 | /* | 
|---|
| 415 | * Enqueuing multiple futexes is tricky, because we need to enqueue | 
|---|
| 416 | * each futex on the list before dealing with the next one to avoid | 
|---|
| 417 | * deadlocking on the hash bucket. But, before enqueuing, we need to | 
|---|
| 418 | * make sure that current->state is TASK_INTERRUPTIBLE, so we don't | 
|---|
| 419 | * lose any wake events, which cannot be done before the get_futex_key | 
|---|
| 420 | * of the next key, because it calls get_user_pages, which can sleep. | 
|---|
| 421 | * Thus, we fetch the list of futexes keys in two steps, by first | 
|---|
| 422 | * pinning all the memory keys in the futex key, and only then we read | 
|---|
| 423 | * each key and queue the corresponding futex. | 
|---|
| 424 | * | 
|---|
| 425 | * Private futexes doesn't need to recalculate hash in retry, so skip | 
|---|
| 426 | * get_futex_key() when retrying. | 
|---|
| 427 | */ | 
|---|
| 428 | retry: | 
|---|
| 429 | for (i = 0; i < count; i++) { | 
|---|
| 430 | if (!(vs[i].w.flags & FLAGS_SHARED) && retry) | 
|---|
| 431 | continue; | 
|---|
| 432 |  | 
|---|
| 433 | ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), | 
|---|
| 434 | flags: vs[i].w.flags, | 
|---|
| 435 | key: &vs[i].q.key, rw: FUTEX_READ); | 
|---|
| 436 |  | 
|---|
| 437 | if (unlikely(ret)) | 
|---|
| 438 | return ret; | 
|---|
| 439 | } | 
|---|
| 440 |  | 
|---|
| 441 | set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); | 
|---|
| 442 |  | 
|---|
| 443 | for (i = 0; i < count; i++) { | 
|---|
| 444 | u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; | 
|---|
| 445 | struct futex_q *q = &vs[i].q; | 
|---|
| 446 | u32 val = vs[i].w.val; | 
|---|
| 447 |  | 
|---|
| 448 | if (1) { | 
|---|
| 449 | CLASS(hb, hb)(key: &q->key); | 
|---|
| 450 |  | 
|---|
| 451 | futex_q_lock(q, hb); | 
|---|
| 452 | ret = futex_get_value_locked(dest: &uval, from: uaddr); | 
|---|
| 453 |  | 
|---|
| 454 | if (!ret && uval == val) { | 
|---|
| 455 | /* | 
|---|
| 456 | * The bucket lock can't be held while dealing with the | 
|---|
| 457 | * next futex. Queue each futex at this moment so hb can | 
|---|
| 458 | * be unlocked. | 
|---|
| 459 | */ | 
|---|
| 460 | futex_queue(q, hb, current); | 
|---|
| 461 | continue; | 
|---|
| 462 | } | 
|---|
| 463 |  | 
|---|
| 464 | futex_q_unlock(hb); | 
|---|
| 465 | } | 
|---|
| 466 | __set_current_state(TASK_RUNNING); | 
|---|
| 467 |  | 
|---|
| 468 | /* | 
|---|
| 469 | * Even if something went wrong, if we find out that a futex | 
|---|
| 470 | * was woken, we don't return error and return this index to | 
|---|
| 471 | * userspace | 
|---|
| 472 | */ | 
|---|
| 473 | *woken = futex_unqueue_multiple(v: vs, count: i); | 
|---|
| 474 | if (*woken >= 0) | 
|---|
| 475 | return 1; | 
|---|
| 476 |  | 
|---|
| 477 | if (ret) { | 
|---|
| 478 | /* | 
|---|
| 479 | * If we need to handle a page fault, we need to do so | 
|---|
| 480 | * without any lock and any enqueued futex (otherwise | 
|---|
| 481 | * we could lose some wakeup). So we do it here, after | 
|---|
| 482 | * undoing all the work done so far. In success, we | 
|---|
| 483 | * retry all the work. | 
|---|
| 484 | */ | 
|---|
| 485 | if (get_user(uval, uaddr)) | 
|---|
| 486 | return -EFAULT; | 
|---|
| 487 |  | 
|---|
| 488 | retry = true; | 
|---|
| 489 | goto retry; | 
|---|
| 490 | } | 
|---|
| 491 |  | 
|---|
| 492 | if (uval != val) | 
|---|
| 493 | return -EWOULDBLOCK; | 
|---|
| 494 | } | 
|---|
| 495 |  | 
|---|
| 496 | return 0; | 
|---|
| 497 | } | 
|---|
| 498 |  | 
|---|
| 499 | /** | 
|---|
| 500 | * futex_sleep_multiple - Check sleeping conditions and sleep | 
|---|
| 501 | * @vs:    List of futexes to wait for | 
|---|
| 502 | * @count: Length of vs | 
|---|
| 503 | * @to:    Timeout | 
|---|
| 504 | * | 
|---|
| 505 | * Sleep if and only if the timeout hasn't expired and no futex on the list has | 
|---|
| 506 | * been woken up. | 
|---|
| 507 | */ | 
|---|
| 508 | static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count, | 
|---|
| 509 | struct hrtimer_sleeper *to) | 
|---|
| 510 | { | 
|---|
| 511 | if (to && !to->task) | 
|---|
| 512 | return; | 
|---|
| 513 |  | 
|---|
| 514 | for (; count; count--, vs++) { | 
|---|
| 515 | if (!READ_ONCE(vs->q.lock_ptr)) | 
|---|
| 516 | return; | 
|---|
| 517 | } | 
|---|
| 518 |  | 
|---|
| 519 | schedule(); | 
|---|
| 520 | } | 
|---|
| 521 |  | 
|---|
| 522 | /** | 
|---|
| 523 | * futex_wait_multiple - Prepare to wait on and enqueue several futexes | 
|---|
| 524 | * @vs:		The list of futexes to wait on | 
|---|
| 525 | * @count:	The number of objects | 
|---|
| 526 | * @to:		Timeout before giving up and returning to userspace | 
|---|
| 527 | * | 
|---|
| 528 | * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function | 
|---|
| 529 | * sleeps on a group of futexes and returns on the first futex that is | 
|---|
| 530 | * wake, or after the timeout has elapsed. | 
|---|
| 531 | * | 
|---|
| 532 | * Return: | 
|---|
| 533 | *  - >=0 - Hint to the futex that was awoken | 
|---|
| 534 | *  - <0  - On error | 
|---|
| 535 | */ | 
|---|
| 536 | int futex_wait_multiple(struct futex_vector *vs, unsigned int count, | 
|---|
| 537 | struct hrtimer_sleeper *to) | 
|---|
| 538 | { | 
|---|
| 539 | int ret, hint = 0; | 
|---|
| 540 |  | 
|---|
| 541 | if (to) | 
|---|
| 542 | hrtimer_sleeper_start_expires(sl: to, mode: HRTIMER_MODE_ABS); | 
|---|
| 543 |  | 
|---|
| 544 | while (1) { | 
|---|
| 545 | ret = futex_wait_multiple_setup(vs, count, woken: &hint); | 
|---|
| 546 | if (ret) { | 
|---|
| 547 | if (ret > 0) { | 
|---|
| 548 | /* A futex was woken during setup */ | 
|---|
| 549 | ret = hint; | 
|---|
| 550 | } | 
|---|
| 551 | return ret; | 
|---|
| 552 | } | 
|---|
| 553 |  | 
|---|
| 554 | futex_sleep_multiple(vs, count, to); | 
|---|
| 555 |  | 
|---|
| 556 | __set_current_state(TASK_RUNNING); | 
|---|
| 557 |  | 
|---|
| 558 | ret = futex_unqueue_multiple(v: vs, count); | 
|---|
| 559 | if (ret >= 0) | 
|---|
| 560 | return ret; | 
|---|
| 561 |  | 
|---|
| 562 | if (to && !to->task) | 
|---|
| 563 | return -ETIMEDOUT; | 
|---|
| 564 | else if (signal_pending(current)) | 
|---|
| 565 | return -ERESTARTSYS; | 
|---|
| 566 | /* | 
|---|
| 567 | * The final case is a spurious wakeup, for | 
|---|
| 568 | * which just retry. | 
|---|
| 569 | */ | 
|---|
| 570 | } | 
|---|
| 571 | } | 
|---|
| 572 |  | 
|---|
| 573 | /** | 
|---|
| 574 | * futex_wait_setup() - Prepare to wait on a futex | 
|---|
| 575 | * @uaddr:	the futex userspace address | 
|---|
| 576 | * @val:	the expected value | 
|---|
| 577 | * @flags:	futex flags (FLAGS_SHARED, etc.) | 
|---|
| 578 | * @q:		the associated futex_q | 
|---|
| 579 | * @key2:	the second futex_key if used for requeue PI | 
|---|
| 580 | * @task:	Task queueing this futex | 
|---|
| 581 | * | 
|---|
| 582 | * Setup the futex_q and locate the hash_bucket.  Get the futex value and | 
|---|
| 583 | * compare it with the expected value.  Handle atomic faults internally. | 
|---|
| 584 | * Return with the hb lock held on success, and unlocked on failure. | 
|---|
| 585 | * | 
|---|
| 586 | * Return: | 
|---|
| 587 | *  -  0 - uaddr contains val and hb has been locked; | 
|---|
| 588 | *  - <0 - On error and the hb is unlocked. A possible reason: the uaddr can not | 
|---|
| 589 | *	   be read, does not contain the expected value or is not properly aligned. | 
|---|
| 590 | */ | 
|---|
| 591 | int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, | 
|---|
| 592 | struct futex_q *q, union futex_key *key2, | 
|---|
| 593 | struct task_struct *task) | 
|---|
| 594 | { | 
|---|
| 595 | u32 uval; | 
|---|
| 596 | int ret; | 
|---|
| 597 |  | 
|---|
| 598 | /* | 
|---|
| 599 | * Access the page AFTER the hash-bucket is locked. | 
|---|
| 600 | * Order is important: | 
|---|
| 601 | * | 
|---|
| 602 | *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); | 
|---|
| 603 | *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); } | 
|---|
| 604 | * | 
|---|
| 605 | * The basic logical guarantee of a futex is that it blocks ONLY | 
|---|
| 606 | * if cond(var) is known to be true at the time of blocking, for | 
|---|
| 607 | * any cond.  If we locked the hash-bucket after testing *uaddr, that | 
|---|
| 608 | * would open a race condition where we could block indefinitely with | 
|---|
| 609 | * cond(var) false, which would violate the guarantee. | 
|---|
| 610 | * | 
|---|
| 611 | * On the other hand, we insert q and release the hash-bucket only | 
|---|
| 612 | * after testing *uaddr.  This guarantees that futex_wait() will NOT | 
|---|
| 613 | * absorb a wakeup if *uaddr does not match the desired values | 
|---|
| 614 | * while the syscall executes. | 
|---|
| 615 | */ | 
|---|
| 616 | retry: | 
|---|
| 617 | ret = get_futex_key(uaddr, flags, key: &q->key, rw: FUTEX_READ); | 
|---|
| 618 | if (unlikely(ret != 0)) | 
|---|
| 619 | return ret; | 
|---|
| 620 |  | 
|---|
| 621 | retry_private: | 
|---|
| 622 | if (1) { | 
|---|
| 623 | CLASS(hb, hb)(key: &q->key); | 
|---|
| 624 |  | 
|---|
| 625 | futex_q_lock(q, hb); | 
|---|
| 626 |  | 
|---|
| 627 | ret = futex_get_value_locked(dest: &uval, from: uaddr); | 
|---|
| 628 |  | 
|---|
| 629 | if (ret) { | 
|---|
| 630 | futex_q_unlock(hb); | 
|---|
| 631 |  | 
|---|
| 632 | ret = get_user(uval, uaddr); | 
|---|
| 633 | if (ret) | 
|---|
| 634 | return ret; | 
|---|
| 635 |  | 
|---|
| 636 | if (!(flags & FLAGS_SHARED)) | 
|---|
| 637 | goto retry_private; | 
|---|
| 638 |  | 
|---|
| 639 | goto retry; | 
|---|
| 640 | } | 
|---|
| 641 |  | 
|---|
| 642 | if (uval != val) { | 
|---|
| 643 | futex_q_unlock(hb); | 
|---|
| 644 | return -EWOULDBLOCK; | 
|---|
| 645 | } | 
|---|
| 646 |  | 
|---|
| 647 | if (key2 && futex_match(key1: &q->key, key2)) { | 
|---|
| 648 | futex_q_unlock(hb); | 
|---|
| 649 | return -EINVAL; | 
|---|
| 650 | } | 
|---|
| 651 |  | 
|---|
| 652 | /* | 
|---|
| 653 | * The task state is guaranteed to be set before another task can | 
|---|
| 654 | * wake it. set_current_state() is implemented using smp_store_mb() and | 
|---|
| 655 | * futex_queue() calls spin_unlock() upon completion, both serializing | 
|---|
| 656 | * access to the hash list and forcing another memory barrier. | 
|---|
| 657 | */ | 
|---|
| 658 | if (task == current) | 
|---|
| 659 | set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); | 
|---|
| 660 | futex_queue(q, hb, task); | 
|---|
| 661 | } | 
|---|
| 662 |  | 
|---|
| 663 | return ret; | 
|---|
| 664 | } | 
|---|
| 665 |  | 
|---|
| 666 | int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, | 
|---|
| 667 | struct hrtimer_sleeper *to, u32 bitset) | 
|---|
| 668 | { | 
|---|
| 669 | struct futex_q q = futex_q_init; | 
|---|
| 670 | int ret; | 
|---|
| 671 |  | 
|---|
| 672 | if (!bitset) | 
|---|
| 673 | return -EINVAL; | 
|---|
| 674 |  | 
|---|
| 675 | q.bitset = bitset; | 
|---|
| 676 |  | 
|---|
| 677 | retry: | 
|---|
| 678 | /* | 
|---|
| 679 | * Prepare to wait on uaddr. On success, it holds hb->lock and q | 
|---|
| 680 | * is initialized. | 
|---|
| 681 | */ | 
|---|
| 682 | ret = futex_wait_setup(uaddr, val, flags, q: &q, NULL, current); | 
|---|
| 683 | if (ret) | 
|---|
| 684 | return ret; | 
|---|
| 685 |  | 
|---|
| 686 | /* futex_queue and wait for wakeup, timeout, or a signal. */ | 
|---|
| 687 | futex_do_wait(q: &q, timeout: to); | 
|---|
| 688 |  | 
|---|
| 689 | /* If we were woken (and unqueued), we succeeded, whatever. */ | 
|---|
| 690 | if (!futex_unqueue(q: &q)) | 
|---|
| 691 | return 0; | 
|---|
| 692 |  | 
|---|
| 693 | if (to && !to->task) | 
|---|
| 694 | return -ETIMEDOUT; | 
|---|
| 695 |  | 
|---|
| 696 | /* | 
|---|
| 697 | * We expect signal_pending(current), but we might be the | 
|---|
| 698 | * victim of a spurious wakeup as well. | 
|---|
| 699 | */ | 
|---|
| 700 | if (!signal_pending(current)) | 
|---|
| 701 | goto retry; | 
|---|
| 702 |  | 
|---|
| 703 | return -ERESTARTSYS; | 
|---|
| 704 | } | 
|---|
| 705 |  | 
|---|
| 706 | int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) | 
|---|
| 707 | { | 
|---|
| 708 | struct hrtimer_sleeper timeout, *to; | 
|---|
| 709 | struct restart_block *restart; | 
|---|
| 710 | int ret; | 
|---|
| 711 |  | 
|---|
| 712 | to = futex_setup_timer(time: abs_time, timeout: &timeout, flags, | 
|---|
| 713 | current->timer_slack_ns); | 
|---|
| 714 |  | 
|---|
| 715 | ret = __futex_wait(uaddr, flags, val, to, bitset); | 
|---|
| 716 |  | 
|---|
| 717 | /* No timeout, nothing to clean up. */ | 
|---|
| 718 | if (!to) | 
|---|
| 719 | return ret; | 
|---|
| 720 |  | 
|---|
| 721 | hrtimer_cancel(timer: &to->timer); | 
|---|
| 722 | destroy_hrtimer_on_stack(timer: &to->timer); | 
|---|
| 723 |  | 
|---|
| 724 | if (ret == -ERESTARTSYS) { | 
|---|
| 725 | restart = ¤t->restart_block; | 
|---|
| 726 | restart->futex.uaddr = uaddr; | 
|---|
| 727 | restart->futex.val = val; | 
|---|
| 728 | restart->futex.time = *abs_time; | 
|---|
| 729 | restart->futex.bitset = bitset; | 
|---|
| 730 | restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; | 
|---|
| 731 |  | 
|---|
| 732 | return set_restart_fn(restart, fn: futex_wait_restart); | 
|---|
| 733 | } | 
|---|
| 734 |  | 
|---|
| 735 | return ret; | 
|---|
| 736 | } | 
|---|
| 737 |  | 
|---|
| 738 | static long futex_wait_restart(struct restart_block *restart) | 
|---|
| 739 | { | 
|---|
| 740 | u32 __user *uaddr = restart->futex.uaddr; | 
|---|
| 741 | ktime_t t, *tp = NULL; | 
|---|
| 742 |  | 
|---|
| 743 | if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { | 
|---|
| 744 | t = restart->futex.time; | 
|---|
| 745 | tp = &t; | 
|---|
| 746 | } | 
|---|
| 747 | restart->fn = do_no_restart_syscall; | 
|---|
| 748 |  | 
|---|
| 749 | return (long)futex_wait(uaddr, flags: restart->futex.flags, | 
|---|
| 750 | val: restart->futex.val, abs_time: tp, bitset: restart->futex.bitset); | 
|---|
| 751 | } | 
|---|
| 752 |  | 
|---|
| 753 |  | 
|---|