// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);

void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
					   bool success)
{
	trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);

void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */

#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK
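/*
 * Put the vma into the write-locked state: add VMA_LOCK_OFFSET to vm_refcnt
 * and wait for all readers to drop their references. Returns false without
 * waiting if the vma is already detached (vm_refcnt was zero). The target
 * refcount is VMA_LOCK_OFFSET when detaching, one higher otherwise because
 * the attach reference is still held.
 */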
static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
{
	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;

	/* Additional refcnt if the vma is attached. */
	if (!detaching)
		tgt_refcnt++;

	/*
	 * If vma is detached then only vma_mark_attached() can raise the
	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
	 */
	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
		return false;

	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
			   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
			   TASK_UNINTERRUPTIBLE);
	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);

	return true;
}

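/*
 * Leave the write-locked state: drop the VMA_LOCK_OFFSET bias added by
 * __vma_enter_locked() and report through @detached whether vm_refcnt
 * dropped to zero, i.e. whether the vma is now detached.
 */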
static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
{
	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}

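/*
 * Write-lock the vma: wait for any pre-existing readers to drain, then
 * record the current mm_lock_seq in vm_lock_seq so that vma_start_read()
 * refuses to read-lock this vma until vma_end_write_all() advances the
 * sequence again.
 */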
void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
{
	bool locked;

	/*
	 * __vma_enter_locked() returns false immediately if the vma is not
	 * attached, otherwise it waits until the refcount indicates that the
	 * vma is attached with no readers.
	 */
	locked = __vma_enter_locked(vma, false);

	/*
	 * We should use WRITE_ONCE() here because we can have concurrent reads
	 * from the early lockless pessimistic check in vma_start_read().
	 * We don't really care about the correctness of that early check, but
	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
	 */
	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

	if (locked) {
		bool detached;

		__vma_exit_locked(vma, &detached);
		WARN_ON_ONCE(detached); /* vma should remain attached */
	}
}
EXPORT_SYMBOL_GPL(__vma_start_write);

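/*
 * Mark a write-locked, attached vma as detached by dropping its attach
 * reference. If a reader happens to hold a transient reference, wait for
 * it to drain via __vma_enter_locked()/__vma_exit_locked().
 */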
void vma_mark_detached(struct vm_area_struct *vma)
{
	vma_assert_write_locked(vma);
	vma_assert_attached(vma);

	/*
	 * We are the only writer, so no need to use vma_refcount_put().
	 * The condition below is unlikely because the vma has been already
	 * write-locked and readers can increment vm_refcnt only temporarily
	 * before they check vm_lock_seq, realize the vma is locked and drop
	 * back the vm_refcnt. That is a narrow window for observing a raised
	 * vm_refcnt.
	 */
	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
		/* Wait until vma is detached with no readers. */
		if (__vma_enter_locked(vma, true)) {
			bool detached;

			__vma_exit_locked(vma, &detached);
			WARN_ON_ONCE(!detached);
		}
	}
}

/*
 * Try to read-lock a vma. The function is allowed to occasionally yield a
 * false locked result to avoid performance overhead, in which case we fall
 * back to using mmap_lock. The function should never yield a false unlocked
 * result. A false locked result is possible if mm_lock_seq overflows or if
 * the vma gets reused and attached to a different mm before we lock it.
 * Returns the vma on success, NULL on failure to lock, and ERR_PTR(-EAGAIN)
 * if the vma got detached.
 *
 * IMPORTANT: RCU lock must be held upon entering the function, but upon error
 * IT IS RELEASED. The caller must handle this correctly.
 */
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
						    struct vm_area_struct *vma)
{
	struct mm_struct *other_mm;
	int oldcnt;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
	/*
	 * Check before locking. A race might cause a false locked result.
	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need
	 * ACQUIRE semantics, because this is just a lockless check whose result
	 * we don't rely on for anything - the mm_lock_seq read against which we
	 * need ordering is below.
	 */
	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
		vma = NULL;
		goto err;
	}

	/*
	 * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
	 * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
	 * Acquire fence is required here to avoid reordering against later
	 * vm_lock_seq check and checks inside lock_vma_under_rcu().
	 */
	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
							      VMA_REF_LIMIT))) {
		/* return EAGAIN if vma got detached from under us */
		vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
		goto err;
	}

	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);

	if (unlikely(vma->vm_mm != mm))
		goto err_unstable;

	/*
	 * Overflow of vm_lock_seq/mm_lock_seq might produce a false locked
	 * result. A false unlocked result is impossible because we modify and
	 * check vma->vm_lock_seq under vma->vm_refcnt protection and
	 * mm->mm_lock_seq modification invalidates all existing locks.
	 *
	 * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
	 * racing with vma_end_write_all(), we only start reading from the VMA
	 * after it has been unlocked.
	 * This pairs with RELEASE semantics in vma_end_write_all().
	 */
	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
		vma_refcount_put(vma);
		vma = NULL;
		goto err;
	}

	return vma;
err:
	rcu_read_unlock();

	return vma;
err_unstable:
	/*
	 * If vma got attached to another mm from under us, that mm is not
	 * stable and can be freed in the narrow window after vma->vm_refcnt
	 * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
	 * releasing vma->vm_refcnt.
	 */
	other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */

	/* __mmdrop() is a heavy operation, do it after dropping RCU lock. */
	rcu_read_unlock();
	mmgrab(other_mm);
	vma_refcount_put(vma);
	mmdrop(other_mm);

	return NULL;
}

/*
 * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
 * stable and not isolated. If the VMA is not found or is being modified the
 * function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
					  unsigned long address)
{
	MA_STATE(mas, &mm->mm_mt, address, address);
	struct vm_area_struct *vma;

retry:
	rcu_read_lock();
	vma = mas_walk(&mas);
	if (!vma) {
		rcu_read_unlock();
		goto inval;
	}

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/* Check if the VMA got isolated after we found it */
		if (PTR_ERR(vma) == -EAGAIN) {
			count_vm_vma_lock_event(VMA_LOCK_MISS);
			/* The area was replaced with another one */
			goto retry;
		}

		/* Failed to lock the VMA */
		goto inval;
	}
	/*
	 * At this point, we have a stable reference to a VMA: The VMA is
	 * locked and we know it hasn't already been isolated.
	 * From here on, we can access the VMA without worrying about which
	 * fields are accessible for RCU readers.
	 */
	rcu_read_unlock();

	/* Check if the vma we locked is the right one. */
	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
		vma_end_read(vma);
		goto inval;
	}

	return vma;

inval:
	count_vm_vma_lock_event(VMA_LOCK_ABORT);
	return NULL;
}

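/*
 * Fallback path for lock_next_vma(): redo the lookup from @from_addr under
 * mmap_read_lock and read-lock the vma found there. Returns NULL if there is
 * no next vma, or an ERR_PTR() if the lock could not be acquired.
 */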
static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
							     struct vma_iterator *vmi,
							     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	int ret;

	ret = mmap_read_lock_killable(mm);
	if (ret)
		return ERR_PTR(ret);

	/* Lookup the vma at the last position again under mmap_read_lock */
	vma_iter_set(vmi, from_addr);
	vma = vma_next(vmi);
	if (vma) {
		/* Very unlikely vma->vm_refcnt overflow case */
		if (unlikely(!vma_start_read_locked(vma)))
			vma = ERR_PTR(-EAGAIN);
	}

	mmap_read_unlock(mm);

	return vma;
}

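/*
 * Find and read-lock the next vma after @from_addr for lockless vma
 * iteration. Returns the locked vma, NULL when no further vma exists, or an
 * ERR_PTR() on failure. Must be called with the RCU read lock held; the lock
 * may be dropped and re-taken internally, but is held again on return.
 */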
struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
				     struct vma_iterator *vmi,
				     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	unsigned int mm_wr_seq;
	bool mmap_unlocked;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
retry:
	/* Start mmap_lock speculation in case we need to verify the vma later */
	mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
	vma = vma_next(vmi);
	if (!vma)
		return NULL;

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/*
		 * Retry immediately if the vma gets detached from under us.
		 * An infinite loop should not happen because the vma we find
		 * would have to be constantly knocked out from under us.
		 */
		if (PTR_ERR(vma) == -EAGAIN) {
			/* reset to search from the last address */
			rcu_read_lock();
			vma_iter_set(vmi, from_addr);
			goto retry;
		}

		goto fallback;
	}

	/* Verify the vma is not behind the last search position. */
	if (unlikely(from_addr >= vma->vm_end))
		goto fallback_unlock;

	/*
	 * The vma can be ahead of the last search position, but we need to
	 * verify that it was not shrunk after we found it and that another
	 * vma has not been installed ahead of it. Otherwise we might observe
	 * a gap that should not be there.
	 */
	if (from_addr < vma->vm_start) {
		/* Verify only if the address space might have changed since vma lookup. */
		if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
			vma_iter_set(vmi, from_addr);
			if (vma != vma_next(vmi))
				goto fallback_unlock;
		}
	}

	return vma;

fallback_unlock:
	rcu_read_unlock();
	vma_end_read(vma);
fallback:
	vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
	rcu_read_lock();
	/* Reinitialize the iterator after re-entering rcu read section */
	vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);

	return vma;
}
#endif /* CONFIG_PER_VMA_LOCK */

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

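/*
 * Take mmap_lock for reading during a page fault. Try a trylock first so a
 * fault caused by buggy kernel code that already holds the lock fails instead
 * of deadlocking; for kernel-mode faults, only fall back to the sleeping lock
 * if the faulting instruction has an exception table fixup.
 */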
static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	if (likely(mmap_read_trylock(mm)))
		return true;

	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}

	return !mmap_read_lock_killable(mm);
}

static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
	/*
	 * We don't have this operation yet.
	 *
	 * It should be easy enough to do: it's basically an
	 * atomic_long_try_cmpxchg_acquire()
	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
	 * it also needs the proper lockdep magic etc.
	 */
	return false;
}

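/*
 * Upgrade from a read lock to a write lock the slow way: drop the read lock
 * and take the write lock, applying the same exception-table check for
 * kernel-mode faults as get_mmap_lock_carefully().
 */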
static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	mmap_read_unlock(mm);
	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}
	return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do all this for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	if (!get_mmap_lock_carefully(mm, regs))
		return NULL;

	vma = find_vma(mm, addr);
	if (likely(vma && (vma->vm_start <= addr)))
		return vma;

	/*
	 * Well, dang. We might still be successful, but only
	 * if we can extend a vma to do so.
	 */
	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
		mmap_read_unlock(mm);
		return NULL;
	}

	/*
	 * We can try to upgrade the mmap lock atomically,
	 * in which case we can continue to use the vma
	 * we already looked up.
	 *
	 * Otherwise we'll have to drop the mmap lock and
	 * re-take it, and also look up the vma again,
	 * re-checking it.
	 */
	if (!mmap_upgrade_trylock(mm)) {
		if (!upgrade_mmap_lock_carefully(mm, regs))
			return NULL;

		vma = find_vma(mm, addr);
		if (!vma)
			goto fail;
		if (vma->vm_start <= addr)
			goto success;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto fail;
	}

	if (expand_stack_locked(vma, addr))
		goto fail;

success:
	mmap_write_downgrade(mm);
	return vma;

fail:
	mmap_write_unlock(mm);
	return NULL;
}
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */

#else /* CONFIG_MMU */

/*
 * At least xtensa ends up having protection faults even with no
 * MMU. No stack expansion, at least.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		mmap_read_unlock(mm);
	return vma;
}

#endif /* CONFIG_MMU */