mmap_lock.c source code [Linux/mm/mmap_lock.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#define CREATE_TRACE_POINTS
3	#include <trace/events/mmap_lock.h>
4
5	#include <linux/mm.h>
6	#include <linux/cgroup.h>
7	#include <linux/memcontrol.h>
8	#include <linux/mmap_lock.h>
9	#include <linux/mutex.h>
10	#include <linux/percpu.h>
11	#include <linux/rcupdate.h>
12	#include <linux/smp.h>
13	#include <linux/trace_events.h>
14	#include <linux/local_lock.h>
15
16	EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
17	EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
18	EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
19
20	#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */
25
26	void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
27	{
28	trace_mmap_lock_start_locking(mm, write);
29	}
30	EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
31
32	void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
33	bool success)
34	{
35	trace_mmap_lock_acquire_returned(mm, write, success);
36	}
37	EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
38
39	void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
40	{
41	trace_mmap_lock_released(mm, write);
42	}
43	EXPORT_SYMBOL(__mmap_lock_do_trace_released);
44	#endif /* CONFIG_TRACING */
45
46	#ifdef CONFIG_MMU
47	#ifdef CONFIG_PER_VMA_LOCK
48	static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
49	{
50	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
51
52	/ Additional refcnt if the vma is attached. /
53	if (!detaching)
54	tgt_refcnt++;
55
56	/*
57	* If vma is detached then only vma_mark_attached() can raise the
58	* vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
59	*/
60	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, r: &vma->vm_refcnt))
61	return false;
62
63	rwsem_acquire(&vma->vmlock_dep_map, `0`, `0`, _RET_IP_);
64	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
65	refcount_read(&vma->vm_refcnt) == tgt_refcnt,
66	TASK_UNINTERRUPTIBLE);
67	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
68
69	return true;
70	}
71
72	static inline void __vma_exit_locked(struct vm_area_struct vma, bool detached)
73	{
74	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, r: &vma->vm_refcnt);
75	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
76	}
77
78	void __vma_start_write(struct vm_area_struct vma, unsigned* int mm_lock_seq)
79	{
80	bool locked;
81
82	/*
83	* __vma_enter_locked() returns false immediately if the vma is not
84	* attached, otherwise it waits until refcnt is indicating that vma
85	* is attached with no readers.
86	*/
87	locked = __vma_enter_locked(vma, detaching: false);
88
89	/*
90	* We should use WRITE_ONCE() here because we can have concurrent reads
91	* from the early lockless pessimistic check in vma_start_read().
92	* We don't really care about the correctness of that early check, but
93	* we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
94	*/
95	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
96
97	if (locked) {
98	bool detached;
99
100	__vma_exit_locked(vma, detached: &detached);
101	WARN_ON_ONCE(detached); / vma should remain attached /
102	}
103	}
104	EXPORT_SYMBOL_GPL(__vma_start_write);
105
106	void vma_mark_detached(struct vm_area_struct *vma)
107	{
108	vma_assert_write_locked(vma);
109	vma_assert_attached(vma);
110
111	/*
112	* We are the only writer, so no need to use vma_refcount_put().
113	* The condition below is unlikely because the vma has been already
114	* write-locked and readers can increment vm_refcnt only temporarily
115	* before they check vm_lock_seq, realize the vma is locked and drop
116	* back the vm_refcnt. That is a narrow window for observing a raised
117	* vm_refcnt.
118	*/
119	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
120	/ Wait until vma is detached with no readers. /
121	if (__vma_enter_locked(vma, detaching: true)) {
122	bool detached;
123
124	__vma_exit_locked(vma, detached: &detached);
125	WARN_ON_ONCE(!detached);
126	}
127	}
128	}
129
130	/*
131	* Try to read-lock a vma. The function is allowed to occasionally yield false
132	* locked result to avoid performance overhead, in which case we fall back to
133	* using mmap_lock. The function should never yield false unlocked result.
134	* False locked result is possible if mm_lock_seq overflows or if vma gets
135	* reused and attached to a different mm before we lock it.
136	* Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
137	* detached.
138	*
139	* IMPORTANT: RCU lock must be held upon entering the function, but upon error
140	* IT IS RELEASED. The caller must handle this correctly.
141	*/
142	static inline struct vm_area_struct vma_start_read(struct* mm_struct *mm,
143	struct vm_area_struct *vma)
144	{
145	struct mm_struct *other_mm;
146	int oldcnt;
147
148	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
149	/*
150	* Check before locking. A race might cause false locked result.
151	* We can use READ_ONCE() for the mm_lock_seq here, and don't need
152	* ACQUIRE semantics, because this is just a lockless check whose result
153	* we don't rely on for anything - the mm_lock_seq read against which we
154	* need ordering is below.
155	*/
156	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
157	vma = NULL;
158	goto err;
159	}
160
161	/*
162	* If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
163	* will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
164	* Acquire fence is required here to avoid reordering against later
165	* vm_lock_seq check and checks inside lock_vma_under_rcu().
166	*/
167	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
168	VMA_REF_LIMIT))) {
169	/ return EAGAIN if vma got detached from under us /
170	vma = oldcnt ? NULL : ERR_PTR(error: -EAGAIN);
171	goto err;
172	}
173
174	rwsem_acquire_read(&vma->vmlock_dep_map, `0`, `1`, _RET_IP_);
175
176	if (unlikely(vma->vm_mm != mm))
177	goto err_unstable;
178
179	/*
180	* Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
181	* False unlocked result is impossible because we modify and check
182	* vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
183	* modification invalidates all existing locks.
184	*
185	* We must use ACQUIRE semantics for the mm_lock_seq so that if we are
186	* racing with vma_end_write_all(), we only start reading from the VMA
187	* after it has been unlocked.
188	* This pairs with RELEASE semantics in vma_end_write_all().
189	*/
190	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
191	vma_refcount_put(vma);
192	vma = NULL;
193	goto err;
194	}
195
196	return vma;
197	err:
198	rcu_read_unlock();
199
200	return vma;
201	err_unstable:
202	/*
203	* If vma got attached to another mm from under us, that mm is not
204	* stable and can be freed in the narrow window after vma->vm_refcnt
205	* is dropped and before rcuwait_wake_up(mm) is called. Grab it before
206	* releasing vma->vm_refcnt.
207	*/
208	other_mm = vma->vm_mm; / use a copy as vma can be freed after we drop vm_refcnt /
209
210	/ __mmdrop() is a heavy operation, do it after dropping RCU lock. /
211	rcu_read_unlock();
212	mmgrab(mm: other_mm);
213	vma_refcount_put(vma);
214	mmdrop(mm: other_mm);
215
216	return NULL;
217	}
218
219	/*
220	* Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
221	* stable and not isolated. If the VMA is not found or is being modified the
222	* function returns NULL.
223	*/
224	struct vm_area_struct lock_vma_under_rcu(struct* mm_struct *mm,
225	unsigned long address)
226	{
227	MA_STATE(mas, &mm->mm_mt, address, address);
228	struct vm_area_struct *vma;
229
230	retry:
231	rcu_read_lock();
232	vma = mas_walk(mas: &mas);
233	if (!vma) {
234	rcu_read_unlock();
235	goto inval;
236	}
237
238	vma = vma_start_read(mm, vma);
239	if (IS_ERR_OR_NULL(ptr: vma)) {
240	/ Check if the VMA got isolated after we found it /
241	if (PTR_ERR(ptr: vma) == -EAGAIN) {
242	count_vm_vma_lock_event(VMA_LOCK_MISS);
243	/ The area was replaced with another one /
244	goto retry;
245	}
246
247	/ Failed to lock the VMA /
248	goto inval;
249	}
250	/*
251	* At this point, we have a stable reference to a VMA: The VMA is
252	* locked and we know it hasn't already been isolated.
253	* From here on, we can access the VMA without worrying about which
254	* fields are accessible for RCU readers.
255	*/
256	rcu_read_unlock();
257
258	/ Check if the vma we locked is the right one. /
259	if (unlikely(address < vma->vm_start \|\| address >= vma->vm_end)) {
260	vma_end_read(vma);
261	goto inval;
262	}
263
264	return vma;
265
266	inval:
267	count_vm_vma_lock_event(VMA_LOCK_ABORT);
268	return NULL;
269	}
270
271	static struct vm_area_struct lock_next_vma_under_mmap_lock(struct* mm_struct *mm,
272	struct vma_iterator *vmi,
273	unsigned long from_addr)
274	{
275	struct vm_area_struct *vma;
276	int ret;
277
278	ret = mmap_read_lock_killable(mm);
279	if (ret)
280	return ERR_PTR(error: ret);
281
282	/ Lookup the vma at the last position again under mmap_read_lock /
283	vma_iter_set(vmi, addr: from_addr);
284	vma = vma_next(vmi);
285	if (vma) {
286	/ Very unlikely vma->vm_refcnt overflow case /
287	if (unlikely(!vma_start_read_locked(vma)))
288	vma = ERR_PTR(error: -EAGAIN);
289	}
290
291	mmap_read_unlock(mm);
292
293	return vma;
294	}
295
296	struct vm_area_struct lock_next_vma(struct* mm_struct *mm,
297	struct vma_iterator *vmi,
298	unsigned long from_addr)
299	{
300	struct vm_area_struct *vma;
301	unsigned int mm_wr_seq;
302	bool mmap_unlocked;
303
304	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
305	retry:
306	/ Start mmap_lock speculation in case we need to verify the vma later /
307	mmap_unlocked = mmap_lock_speculate_try_begin(mm, seq: &mm_wr_seq);
308	vma = vma_next(vmi);
309	if (!vma)
310	return NULL;
311
312	vma = vma_start_read(mm, vma);
313	if (IS_ERR_OR_NULL(ptr: vma)) {
314	/*
315	* Retry immediately if the vma gets detached from under us.
316	* Infinite loop should not happen because the vma we find will
317	* have to be constantly knocked out from under us.
318	*/
319	if (PTR_ERR(ptr: vma) == -EAGAIN) {
320	/ reset to search from the last address /
321	rcu_read_lock();
322	vma_iter_set(vmi, addr: from_addr);
323	goto retry;
324	}
325
326	goto fallback;
327	}
328
329	/ Verify the vma is not behind the last search position. /
330	if (unlikely(from_addr >= vma->vm_end))
331	goto fallback_unlock;
332
333	/*
334	* vma can be ahead of the last search position but we need to verify
335	* it was not shrunk after we found it and another vma has not been
336	* installed ahead of it. Otherwise we might observe a gap that should
337	* not be there.
338	*/
339	if (from_addr < vma->vm_start) {
340	/ Verify only if the address space might have changed since vma lookup. /
341	if (!mmap_unlocked \|\| mmap_lock_speculate_retry(mm, seq: mm_wr_seq)) {
342	vma_iter_set(vmi, addr: from_addr);
343	if (vma != vma_next(vmi))
344	goto fallback_unlock;
345	}
346	}
347
348	return vma;
349
350	fallback_unlock:
351	rcu_read_unlock();
352	vma_end_read(vma);
353	fallback:
354	vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
355	rcu_read_lock();
356	/ Reinitialize the iterator after re-entering rcu read section /
357	vma_iter_set(vmi, addr: IS_ERR_OR_NULL(ptr: vma) ? from_addr : vma->vm_end);
358
359	return vma;
360	}
361	#endif /* CONFIG_PER_VMA_LOCK */
362
363	#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
364	#include <linux/extable.h>
365
366	static inline bool get_mmap_lock_carefully(struct mm_struct mm, struct* pt_regs *regs)
367	{
368	if (likely(mmap_read_trylock(mm)))
369	return true;
370
371	if (regs && !user_mode(regs)) {
372	unsigned long ip = exception_ip(regs);
373	if (!search_exception_tables(add: ip))
374	return false;
375	}
376
377	return !mmap_read_lock_killable(mm);
378	}
379
380	static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
381	{
382	/*
383	* We don't have this operation yet.
384	*
385	* It should be easy enough to do: it's basically a
386	* atomic_long_try_cmpxchg_acquire()
387	* from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
388	* it also needs the proper lockdep magic etc.
389	*/
390	return false;
391	}
392
393	static inline bool upgrade_mmap_lock_carefully(struct mm_struct mm, struct* pt_regs *regs)
394	{
395	mmap_read_unlock(mm);
396	if (regs && !user_mode(regs)) {
397	unsigned long ip = exception_ip(regs);
398	if (!search_exception_tables(add: ip))
399	return false;
400	}
401	return !mmap_write_lock_killable(mm);
402	}
403
404	/*
405	* Helper for page fault handling.
406	*
407	* This is kind of equivalent to "mmap_read_lock()" followed
408	* by "find_extend_vma()", except it's a lot more careful about
409	* the locking (and will drop the lock on failure).
410	*
411	* For example, if we have a kernel bug that causes a page
412	* fault, we don't want to just use mmap_read_lock() to get
413	* the mm lock, because that would deadlock if the bug were
414	* to happen while we're holding the mm lock for writing.
415	*
416	* So this checks the exception tables on kernel faults in
417	* order to only do this all for instructions that are actually
418	* expected to fault.
419	*
420	* We can also actually take the mm lock for writing if we
421	* need to extend the vma, which helps the VM layer a lot.
422	*/
423	struct vm_area_struct lock_mm_and_find_vma(struct* mm_struct *mm,
424	unsigned long addr, struct pt_regs *regs)
425	{
426	struct vm_area_struct *vma;
427
428	if (!get_mmap_lock_carefully(mm, regs))
429	return NULL;
430
431	vma = find_vma(mm, addr);
432	if (likely(vma && (vma->vm_start <= addr)))
433	return vma;
434
435	/*
436	* Well, dang. We might still be successful, but only
437	* if we can extend a vma to do so.
438	*/
439	if (!vma \|\| !(vma->vm_flags & VM_GROWSDOWN)) {
440	mmap_read_unlock(mm);
441	return NULL;
442	}
443
444	/*
445	* We can try to upgrade the mmap lock atomically,
446	* in which case we can continue to use the vma
447	* we already looked up.
448	*
449	* Otherwise we'll have to drop the mmap lock and
450	* re-take it, and also look up the vma again,
451	* re-checking it.
452	*/
453	if (!mmap_upgrade_trylock(mm)) {
454	if (!upgrade_mmap_lock_carefully(mm, regs))
455	return NULL;
456
457	vma = find_vma(mm, addr);
458	if (!vma)
459	goto fail;
460	if (vma->vm_start <= addr)
461	goto success;
462	if (!(vma->vm_flags & VM_GROWSDOWN))
463	goto fail;
464	}
465
466	if (expand_stack_locked(vma, address: addr))
467	goto fail;
468
469	success:
470	mmap_write_downgrade(mm);
471	return vma;
472
473	fail:
474	mmap_write_unlock(mm);
475	return NULL;
476	}
477	#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */
478
479	#else /* CONFIG_MMU */
480
/*
 * At least xtensa ends up having protection faults even with no
 * MMU.. No stack expansion, at least.
 *
 * Takes mmap_lock for read and looks up the vma for @addr. Returns the vma
 * with mmap_lock still held for read, or NULL with the lock released.
 * @regs is unused in this variant.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		mmap_read_unlock(mm);
	return vma;
}
496
497	#endif /* CONFIG_MMU */
498

Browse the source code of Linux/mm/mmap_lock.c